src/hg/makeDb/doc/petMar1.txt 1.21
1.21 2009/07/21 21:01:45 markd
added transmap update blurb
Index: src/hg/makeDb/doc/petMar1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/petMar1.txt,v
retrieving revision 1.20
retrieving revision 1.21
diff -b -B -U 1000000 -r1.20 -r1.21
--- src/hg/makeDb/doc/petMar1.txt 25 Nov 2008 20:22:45 -0000 1.20
+++ src/hg/makeDb/doc/petMar1.txt 21 Jul 2009 21:01:45 -0000 1.21
@@ -1,1737 +1,1746 @@
# for emacs: -*- mode: sh; -*-
# This file describes browser build for the Sea Lamprey
# genome, Petromyzon marinus, March 2007, v.3.0 from WUSTL
#
# "$Id$"
#
##########################################################################
### Fetch sequence (DONE - 2008-01-14 - Hiram)
# determine estimated disk usage, from example genome like this one,
# braFlo1 which used about 3.5 Gb, do not need much space, store04
# has 42 Gb, plenty of space on store9
ssh kkstore04
mkdir /cluster/store9/petMar1
ln -s /cluster/store9/petMar1 /cluster/data/petMar1
mkdir /cluster/data/petMar1/downloads
cd /cluster/data/petMar1/downloads
wget --timestamping \
'ftp://genome.wustl.edu/pub/organism/Other_Vertebrates/Petromyzon_marinus/assembly/Petromyzon_marinus-3.0/output/*'
##########################################################################
# Fixup the agp file to make a chrUn agp file, add a 1000 base gap
# between the supercontigs
##########################################################################
# Run the makeGenomeDb.pl script (DONE - 2008-01-27 - Hiram)
ssh kkstore04
cd /cluster/data/petMar1/downloads
# eliminate zero-length fragments from the AGP file
zcat supercontigs.agp.gz | awk '
{
size=$3-$2
if (size >= 0) print
}
' | gzip -c > supers.agp.gz
cd /cluster/data/petMar1
cat << '_EOF_' > petMar1.config.ra
# Config parameters for makeGenomeDb.pl:
db petMar1
clade vertebrate
genomeCladePriority 130
scientificName Petromyzon marinus
commonName Lamprey
assemblyDate Mar. 2007
assemblyLabel WUSTL v.3.0 (March 2007)
orderKey 480
mitoAcc 5835065
fastaFiles /cluster/data/petMar1/downloads/supercontigs.fa.gz
agpFiles /cluster/data/petMar1/downloads/supers.agp.gz
# qualFiles none
dbDbSpeciesDir lamprey
'_EOF_'
# can stop at db since trackDb and dbDb were created in first
# attempt
makeGenomeDb.pl -stop=db petMar1.config.ra > makeGenome.log 2>&1 &
twoBitToFa petMar1.unmasked.2bit stdout | faSize stdin
# 1135497967 bases (303801529 N's 831696438 real 831696438
# upper 0 lower) in 2 sequences in 1 files
# chrM has a gi:5835065 fragment name, I would rather have NC_001626
hgsql -e 'update gold set frag="NC_001626" where chrom="chrM";' petMar1
##########################################################################
# Pick up an image from the EPA, usage rights are free
# http://www.epa.gov/glnpo/image/restrictions.htm
mkdir /cluster/data/petMar1/photo
cd /cluster/data/petMar1/photo
wget --timestamping 'http://www.epa.gov/glnpo/image/vbig/229.jpg' \
-O petromyzon_marinus.epa.jpg
##########################################################################
# Running WindowMasker (DONE - 2008-01-26 - Hiram)
ssh kolossus
mkdir /cluster/data/petMar1/bed/WindowMasker
cd /cluster/data/petMar1/bed/WindowMasker
/cluster/bin/scripts/doWindowMasker.pl \
-buildDir /cluster/data/petMar1/bed/WindowMasker \
-workhorse=kolossus -verbose=2 petMar1 > wm.log 2>&1 &
twoBitToFa petMar1.wmsk.sdust.2bit stdout | faSize stdin
# 1027258967 bases (195562529 N's 831696438 real 494032631 upper
# 337663807 lower) in 108241 sequences in 1 files
# %32.87 masked total, %40.60 masked real
ssh hgwdev
cd /cluster/data/petMar1/bed/WindowMasker
hgLoadBed petMar1 windowmaskerSdust windowmasker.sdust.bed.gz
# Loaded 4938211 elements of size 3
featureBits petMar1 windowmaskerSdust
# 533224658 bases of 831696438 (64.113%) in intersection
# WM has a bug where it masks the gaps too, don't like that,
# fetch the WM track without the gaps, can do this in the
# table browser with an intersection of WMask AND ! gap
# or via featureBits:
featureBits petMar1 -not gap -bed=notGap.bed
# hint: also works with the gold table instead of notGap
time nice -n +19 featureBits petMar1 windowmaskerSdust notGap.bed \
-bed=stdout | gzip -c > cleanWMask.bed.gz
# 337663807 bases of 831696438 (40.599%) in intersection
# real 182m45.469s
hgLoadBed petMar1 windowmaskerSdust cleanWMask.bed.gz
# Loaded 4919722 elements of size 4
featureBits petMar1 windowmaskerSdust > fb.petMar1.cleanWMask.txt 2>&1
# 337663807 bases of 831696438 (40.599%) in intersection
##########################################################################
# Running TRF simple repeats (DONE - 2008-01-26 - Hiram)
ssh kkstore04
mkdir /cluster/data/petMar1/bed/simpleRepeat
cd /cluster/data/petMar1/bed/simpleRepeat
doSimpleRepeat.pl petMar1 -smallClusterHub=memk \
-workhorse=kolossus \
-buildDir=/cluster/data/petMar1/bed/simpleRepeat > do.log 2>&1 &
# ran into a bug with the nice difficulty on the featureBits
featureBits petMar1 simpleRepeat > fb.petMar1.simpleRepeat.txt 2>&1
# 75844864 bases of 831696438 (9.119%) in intersection
doSimpleRepeat.pl petMar1 -smallClusterHub=memk \
-continue=cleanup -workhorse=kolossus \
-buildDir=/cluster/data/petMar1/bed/simpleRepeat > cleanup.log 2>&1 &
##########################################################################
# Repeat Masker (DONE - 2008-01-26 - Hiram)
ssh kkstore04
mkdir /cluster/data/petMar1/bed/RepeatMasker
cd /cluster/data/petMar1/bed/RepeatMasker
doRepeatMasker.pl petMar1 -bigClusterHub=kk -workhorse=kolossus \
-buildDir=/cluster/data/petMar1/bed/RepeatMasker \
-verbose=2 > do.log 2>&1
##########################################################################
# Add WindowMasker and trfMask masking to the 2bit sequenct
# (DONE - 2008-01-26 - Hiram)
ssh kkstore04
cd /cluster/data/petMar1
gzip bed/simpleRepeat/trfMask.bed
zcat bed/WindowMasker/cleanWMask.bed.gz bed/simpleRepeat/trfMask.bed.gz \
| twoBitMask -type=.bed petMar1.unmasked.2bit stdin petMar1.2bit
# ignore the warning about bed file > 13 fields, that is OK
twoBitToFa petMar1.2bit stdout | faSize stdin
# 1027258967 bases (195562529 N's 831696438 real 493479824 upper
# 338216614 lower) in 108241 sequences in 1 files
# %32.92 masked total, %40.67 masked real
############################################################################
# BLATSERVERS ENTRY (DONE - 2008-01-15 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("petMar1", "blat15", "17788", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("petMar1", "blat15", "17789", "0", "1");' \
hgcentraltest
# test it with some sequence
############################################################################
## Reset default position to
# Where RHO1 protein is found via blat:
# GEMSLWSLVV LAIERYIVIC KPMGNFRFGS THAYMGVAFT WFMALSCAAP PLVGWSR
ssh hgwdev
hgsql -e 'update dbDb set defaultPos="Contig11034:8429-10197"
where name="petMar1";' hgcentraltest
#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2008-01-29 - Hiram)
# Use -repMatch=200 (based on size -- for human we use 1024, and
# medaka size is ~20% of human judging by gapless oryLat1 vs. hg18
# genome sizes from featureBits.
ssh kkstore04
cd /cluster/data/petMar1
blat petMar1.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=petMar1.11.ooc -repMatch=200
# Wrote 49155 overused 11-mers to petMar1.11.ooc
##########################################################################
# GENBANK AUTO UPDATE (DONE - 2008-01-15 - Hiram)
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add petMar1 just after gasAcu1
# petMar1 (Branchiostoma floridae - Lancelet)
petMar1.serverGenome = /cluster/data/petMar1/petMar1.2bit
petMar1.clusterGenome = /cluster/bluearc/scratch/data/petMar1/petMar1.2bit
petMar1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
petMar1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
petMar1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
petMar1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
petMar1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
petMar1.ooc = /cluster/data/petMar1/petMar1.11.ooc
petMar1.lift = /cluster/data/petMar1/jkStuff/petMar1.nonBridged.lft
petMar1.refseq.mrna.native.load = yes
petMar1.refseq.mrna.xeno.load = yes
petMar1.genbank.mrna.xeno.load = yes
petMar1.genbank.est.native.load = yes
petMar1.downloadDir = petMar1
petMar1.perChromTables = no
cvs ci -m "Added petMar1." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
# Edit src/lib/gbGenome.c to add new species.
cvs ci -m "Added Petromyzon marinus (Sea Lamprey)." src/lib/gbGenome.c
make install-server
ssh genbank
screen # control this with a screen so you can get back to it
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial petMar1 &
# logFile: var/build/logs/2008.01.29-11:27:10.petMar1.initalign.log
# real 332m36.025s
# There was some kind of problem with this alignment, although the
# kluster run actually finished:
# Completed: 366520 of 366520 jobs
# CPU time in finished jobs: 6854057s 114234.28m 1903.90h 79.33d 0.217 y
# IO & Wait Time: 2300432s 38340.53m 639.01h 26.63d 0.073 y
# Average job time: 25s 0.42m 0.01h 0.00d
# Longest finished job: 1292s 21.53m 0.36h 0.01d
# Submission to last job: 21121s 352.02m 5.87h 0.24d
# So, continuing:
time nice -n +19 bin/gbAlignStep -continue=finish -initial petMar1 &
# logFile: var/build/logs/2008.01.30-09:18:56.petMar1.initalign.log
# real 70m20.669s
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad petMar1
# logFile: var/dbload/hgwdev/logs/2008.01.15-16:21:10.dbload.log
# real 19m34.853s
# enable daily alignment and update of hgwdev (2008-01-30 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add petMar1 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added petMar1 - Petromyzon marinus (lamprey)" \
etc/align.dbs etc/hgwdev.dbs
make etc-update
#########################################################################
# BLASTZ/CHAIN/NET Medaka oryLat1 (DONE - 2008-01-15 - Hiram)
# Try 1 failed, run again with contigs
ssh kkstore04
screen # use screen to control this job
mkdir /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
cd /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
cat << '_EOF_' > DEF
# Medaka vs. Lamprey
# using the "close" genome alignment parameters
# see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
# TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
# chrUn in Scaffolds for this alignment run
SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LIMIT=30
SEQ1_LAP=10000
# QUERY: Lamprey petMar1
SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=memk > do.log 2>&1 &
# abandon this run, ended up using the swap from oryLat1 in:
# /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
# real 1053m9.027s
# netChains failed, during chainStitchId:
# q end mismatch 1073742062 vs 1111312799 line 510 of stdin
############################################################################
# Create contigs-only masked 2bit file for non-bridged gap free blastz runs
# (DONE - 2008-01-15 - Hiram)
ssh kkstore04
mkdir /cluster/data/petMar1/contigSequence
cd /cluster/data/petMar1/contigSequence
cp -p /cluster/data/priPac1/jkStuff/lft2BitToFa.pl ../jkStuff
../jkStuff/lft2BitToFa.pl ../petMar1.2bit \
../jkStuff/petMar1.nonBridged.lft > supercontigs.fa
# this took quite a long time, about 6 hours or so
# this doesn't have chrM yet
faSize supercontigs.fa ../M/chrM.fa
# 1027258967 bases (195562529 N's 831696438 real 493405059 upper
# 338291379 lower) in 108241 sequences in 2 files
# %32.93 masked total, %40.67 masked real
cat supercontigs.fa ../M/chrM.fa | faToTwoBit stdin petMar1.supers.2bit
twoBitToFa petMar1.supers.2bit stdout | faSize stdin
# 1027258967 bases (195562529 N's 831696438 real 493405059 upper
# 338291379 lower) in 108241 sequences in 1 files
# %32.93 masked total, %40.67 masked real
twoBitInfo petMar1.supers.2bit stdout | sort -k2nr > super.sizes
cp -p petMar1.supers.2bit /cluster/bluearc/scratch/data/petMar1
cd ..
ln -s contigSequence/super.sizes .
#########################################################################
# BLASTZ/CHAIN/NET Medaka oryLat1 (DONE - 2008-01-16 - 2008-01-30 - Hiram)
# Try 2 with contigs for Lamprey
ssh kkstore04
screen # use screen to control this job
mkdir /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
cd /cluster/data/oryLat1/bed/blastz.petMar1.2008-01-15
cat << '_EOF_' > DEF
# Medaka vs. Lamprey
# using the "close" genome alignment parameters
# see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=9400
BLASTZ_L=3000
BLASTZ_K=3000
BLASTZ_M=50
# TARGET: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
# chrUn in Scaffolds for this alignment run
SEQ1_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ1_LEN=/cluster/data/oryLat1/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LIMIT=30
SEQ1_LAP=10000
# QUERY: Lamprey petMar1
SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
SEQ2_CTGDIR=/cluster/bluearc/scratch/data/petMar1/petMar1.supers.2bit
SEQ2_CTGLEN=/cluster/data/petMar1/super.sizes
SEQ2_LIFT=/cluster/data/petMar1/jkStuff/petMar1.nonBridged.lft
SEQ2_CHUNK=10000000
SEQ2_LIMIT=150
SEQ2_LAP=0
BASE=/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=kk > do.log 2>&1 &
# real 218m14.787s
# Completed: 69774 of 70854 jobs
# Crashed: 1080 jobs
# CPU time in finished jobs: 3950133s 65835.55m 1097.26h 45.72d 0.125 y
# IO & Wait Time: 1218716s 20311.93m 338.53h 14.11d 0.039 y
# Average job time: 74s 1.23m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1629s 27.15m 0.45h 0.02d
# Submission to last job: 12951s 215.85m 3.60h 0.15d
# had some parasol bookeeping problems, it actually finished
# the kluster run but got confused. Checked manually with
# para recover -> nothing in new job list, and check files in
# psl result directory finding the correct number of files for the
# the job count. Make the run.time file and continuing:
time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-continue=cat -chainMinScore=3000 -chainLinearGap=medium \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=kk > cat.log 2>&1 &
# real 7m47.963s
# something failed
# had the specification to the lft file incorrect, failed during
# chain run. Fix that, finish the run, now continuing:
time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-continue=chainMerge -chainMinScore=3000 -chainLinearGap=medium \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=kk > chainMerge.log 2>&1 &
# had the same failure as the first time test of this alignment.
# something is wrong with netChainSubset, needs to be debugged.
# ignore the step to create the "over.chain" file, finish the net
# step manually, then continuing:
time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-continue=load -chainMinScore=3000 -chainLinearGap=medium \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=kk > load.log 2>&1 &
# real 3m53.686s
cat fb.oryLat1.chainPetMar1Link.txt
# 24990733 bases of 700386597 (3.568%) in intersection
# And, for the swap:
mkdir /cluster/data/petMar1/bed/blastz.oryLat1.swap
cd /cluster/data/petMar1/bed/blastz.oryLat1.swap
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=kk > swap.log 2>&1 &
# real 1m3.572s
# same problem in netChainSubset trying to make the "over.chain" file.
# skip that, finish the chains manually:
# real 14m35.340s
# then continuing:
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/oryLat1/bed/blastz.petMar1.2008-01-16/DEF \
-continue=load -chainMinScore=3000 -chainLinearGap=medium \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=kk > load.log 2>&1 &
cat fb.petMar1.chainOryLat1Link.txt
# 23358156 bases of 831696438 (2.808%) in intersection
# create reciprocal best chains/nets for 9-way maf alignments
ssh hgwdev
cd /cluster/data/petMar1/bed/blastz.oryLat1.swap
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 oryLat1 \
> rbest.log 2>&1 &
# real 22m20.900s
#############################################################################
# BLASTZ SWAP Lamprey - Lanclet (DONE - 2008-04-12 - Hiram)
ssh kkstore05
screen # control this job with a screen
# the original
cd /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
cat fb.braFlo1.chainPetMar1Link.txt
# 27418217 bases of 923355587 (2.969%) in intersection
# and the swap
mkdir /cluster/data/petMar1/bed/blastz.braFlo1.swap
cd /cluster/data/petMar1/bed/blastz.braFlo1.swap
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
# real 194m53.514s
cat fb.petMar1.chainBraFlo1Link.txt
# 27373264 bases of 831696438 (3.291%) in intersection
# create reciprocal best chains/nets for 9-way maf alignments
ssh hgwdev
cd /cluster/data/petMar1/bed/blastz.braFlo1.swap
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 braFlo1 \
> rbest.log 2>&1 &
# real 110m25.613s
#############################################################################
# BLASTZ SWAP Chicken GalGal3 (DONE - 2008-04-15 - Hiram)
# the original
ssh kkstore03
cd /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11
cat fb.galGal3.chainPetMar1Link.txt
# 20134896 bases of 1042591351 (1.931%) in intersection
# and the swap
mkdir /cluster/data/petMar1/bed/blastz.galGal3.swap
cd /cluster/data/petMar1/bed/blastz.galGal3.swap
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=memk -verbose=2 > swap.log 2>&1 &
# real 33m41.587s
cat fb.petMar1.chainGalGal3Link.txt
# 22308118 bases of 831696438 (2.682%) in intersection
# create reciprocal best chains/nets for 9-way maf alignments
ssh hgwdev
cd /cluster/data/petMar1/bed/blastz.galGal3.swap
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 galGal3 \
> rbest.log 2>&1 &
# real 13m33.139s
#############################################################################
# BLASTZ SWAP Human hg18 (DONE - 2008-01-30 - Hiram)
ssh kkstore02
screen # use screen to control this job
# the original
cd /cluster/data/hg18/bed/blastzPetMar1.2008-01-29
cat fb.hg18.chainPetMar1Link.txt
# 36042598 bases of 2881515245 (1.251%) in intersection
# That is OK, now for the swap:
mkdir /cluster/data/petMar1/bed/blastz.hg18.swap
cd /cluster/data/petMar1/bed/blastz.hg18.swap
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/hg18/bed/blastzPetMar1.2008-01-29/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk > swap.log 2>&1 &
# real 60m1.928s
cat fb.petMar1.chainHg18Link.txt
# 26751073 bases of 831696438 (3.216%) in intersection
# create reciprocal best chains/nets for 9-way maf alignments
ssh hgwdev
cd /cluster/data/petMar1/bed/blastz.hg18.swap
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 hg18 \
> rbest.log 2>&1 &
# real 22m34.274s
#############################################################################
# BLASTZ SWAP Mouse mm9 (DONE - 2008-04-15 - Hiram)
ssh kkstore06
screen # use screen to control this job
# the original
cd /cluster/data/mm9/bed/blastzPetMar1.2008-04-14
cat fb.mm9.chainPetMar1Link.txt
# 29113438 bases of 2620346127 (1.111%) in intersection
# That is OK, now for the swap:
mkdir /cluster/data/petMar1/bed/blastz.mm9.swap
cd /cluster/data/petMar1/bed/blastz.mm9.swap
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/mm9/bed/blastzPetMar1.2008-04-14/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust -bigClusterHub=pk > swap.log 2>&1 &
# real 33m29.076s
cat fb.petMar1.chainMm9Link.txt
# 26052507 bases of 831696438 (3.132%) in intersection
# create reciprocal best chains/nets for 9-way maf alignments
ssh hgwdev
cd /cluster/data/petMar1/bed/blastz.mm9.swap
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl petMar1 mm9 \
> rbest.log 2>&1 &
# real 21m9.320s
#############################################################################
## 6-Way Multiz (DONE - 2008-04-17 - Hiram)
##
ssh hgwdev
mkdir /cluster/data/petMar1/bed/multiz6way
cd /cluster/data/petMar1/bed/multiz6way
# take the 30-way tree from mm9 and eliminate genomes not in
# this alignment. Manually add in petMar1 and braFlo1
# arbitrarily somewhere outside of the fishes
# rearrange to get petMar1 on the top of the graph
# paste this tree into the on-line phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to create the image for the tree diagram
# select the 6 organisms from the 30-way recently done on mouse mm9
/cluster/bin/phast/tree_doctor \
--prune-all-but Human_hg18,Mouse_mm9,Chicken_galGal3,Medaka_oryLat1 \
/cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \
> 6-way.fullNames.nh
# looks something like this:
(((Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757,
Chicken_galGal3:0.488824):0.325954,Medaka_oryLat1:1.077873);
# Adding Lamprey and Lanclet:
((Lamprey_petMar1:1.6,
(((Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757,
Chicken_galGal3:0.488824):0.325954,Medaka_oryLat1:1.077873):0.4):0.4,
Lanclet_braFlo1:2.0);
((Lamprey_petMar1:1.6,
(Medaka_oryLat1:1.077873,((Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757,
Chicken_galGal3:0.488824):0.325954):0.4):0.4,
Lanclet_braFlo1:2.0);
((Lamprey_petMar1:1.6,
(Medaka_oryLat1:1.077873,(Chicken_galGal3:0.488824,
(Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.325954):0.4):0.4,
Lanclet_braFlo1:2.0);
# rearrange to get Marmoset at the top:
# this leaves us with:
cat << '_EOF_' > petMar1.6-way.nh
((Lamprey_petMar1:1.6,
(Medaka_oryLat1:1.077873,(Chicken_galGal3:0.488824,
(Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.325954):0.4):0.4,
Lanclet_braFlo1:2.0);
'_EOF_'
# << happy emacs
# create a species list from that file:
sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' petMar1.6-way.nh \
| sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
| sed -e "s/.*_//; s/:.*//" | sort > species.list
# verify that has 6 db names in it
# create a stripped down nh file for use in autoMZ run
echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:[012].[0-9]*//g; s/[,;]/ /g' petMar1.6-way.nh \
| sed -e "s/ / /g"` > tree.6.nh
# that looks like, as a single line:
# ((petMar1 (oryLat1 (galGal3 (mm9 hg18)))) braFlo1)
# verify all blastz's exists
cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/petMar1/bed/multiz6way
foreach db (`grep -v petMar1 species.list`)
set bdir = /cluster/data/petMar1/bed/blastz.$db
if (-e $bdir/mafRBestNet/petMar1.$db.rbest.maf.gz) then
echo "$db mafRBestNet"
else if (-e $bdir/mafSynNet/petMar1.$db.net.maf.gz) then
echo "$db mafSynNet"
else if (-e $bdir/mafNet/petMar1.$db.net.maf.gz) then
echo "$db mafNet"
else
echo "$db mafs not found"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./listMafs.csh
# see what it says, the "mafs not found" should only show up on petMar1
./listMafs.csh
# braFlo1 mafRBestNet
# galGal3 mafRBestNet
# hg18 mafRBestNet
# mm9 mafRBestNet
# oryLat1 mafRBestNet
/cluster/bin/phast/all_dists petMar1.6-way.nh > 6way.distances.txt
grep -i petmar 6way.distances.txt | sort -k3,3n
Lamprey_petMar1 Chicken_galGal3 2.814778
Lamprey_petMar1 Human_hg18 2.923612
Lamprey_petMar1 Medaka_oryLat1 3.077873
Lamprey_petMar1 Mouse_mm9 3.122529
Lamprey_petMar1 Lanclet_braFlo1 4.000000
# use the calculated
# distances in the table below to order the organisms and check
# the button order on the browser. Zebrafish ends up before
# tetraodon and fugu on the browser despite its distance.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
#
# featureBits chainLink measures
# chainCalJac1Link chain linearGap
# distance on PetMar1 on other minScore
# 1 Chicken_galGal3 2.814778 (% 2.682) (% 1.931) 5000 loose
# 2 Human_hg18 2.923612 (% 3.216) (% 1.251) 5000 loose
# 3 Medaka_oryLat1 3.077873 (% 2.808) (% 3.568) 3000 medium
# 4 Mouse_mm9 3.122529 (% 3.132) (% 1.111) 5000 loose
# 5 Lanclet_braFlo1 4.000000 (% 3.291) (% 2.969) 5000 loose
# copy net mafs to cluster-friendly storage, splitting chroms
mkdir mafLinks
cd mafLinks
ln -s ../../blastz.*.swap/mafRBestNet/*.rbest.maf.gz .
# need to split these things up by Contig number for efficient kluster run
ssh kkstore04
mkdir -p /san/sanvol1/scratch/petMar1/multiz6way/contigMaf
cd /scratch/tmp
# the 16201 is from petMar/chrom.sizes
echo "chrM 0 16201" > chrM.bed
for D in `grep -v petMar1 /cluster/data/petMar1/bed/multiz6way/species.list`
do
echo ${D}
zcat \
/cluster/data/petMar1/bed/multiz6way/mafLinks/petMar1.${D}.rbest.maf.gz \
> ${D}.maf
mkdir /scratch/tmp/${D}
cd /scratch/tmp/${D}
mafSplit -verbose=2 /dev/null -byTarget -useSequenceName Contig \
../${D}.maf -outDirDepth=2
mafsInRegion ../chrM.bed 0/0/chrM.maf ../${D}.maf
rsync -a --progress ./ \
/san/sanvol1/scratch/petMar1/multiz6way/contigMaf/${D}
cd /scratch/tmp
rm -fr ${D} ${D}.maf
done
# create a run-time list of contigs to operate on, not all contigs
# exist in all alignments, but we want all contig names used in any
# alignment:
cd /san/sanvol1/scratch/petMar1/multiz6way/contigMaf
for D in *
do
cd "${D}"
find . -type f
cd ..
done | sort -u > /tmp/6-way.contig.list
wc -l /tmp/6-way.contig.list
mkdir /cluster/data/petMar1/bed/multiz6way/splitRun
cp -p /tmp/6-way.contig.list \
/cluster/data/petMar1/bed/multiz6way/splitRun
# 31488 /tmp/6-way.contig.list
# ready for the multiz run
ssh pk
cd /cluster/data/petMar1/bed/multiz6way/splitRun
mkdir -p maf run
cd run
mkdir penn
# use latest penn utilities
P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $P/{autoMZ,multiz,maf_project} penn
# set the db and pairs directories here
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = petMar1
set subdir = $1
set c = $2
set result = $3
set resultDir = $result:h
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz6way/contigMaf
rm -fr $tmp
mkdir -p $tmp
mkdir -p $resultDir
cp ../../tree.6.nh ../../species.list $tmp
pushd $tmp
foreach s (`grep -v $db species.list`)
set in = $pairs/$s/$subdir/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.6.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $result
rm -fr $tmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/petMar1/bed/multiz6way/splitRun/maf/$(dir1)/$(root1).maf}
#ENDLOOP
'_EOF_'
# << emacs
sed -e "s/^\.\///" ../6-way.contig.list \
| gensub2 stdin single template jobList
para create jobList
para try ... check ... push ... etc
# Completed: 31488 of 31488 jobs
# CPU time in finished jobs: 5723s 95.38m 1.59h 0.07d 0.000 y
# IO & Wait Time: 84949s 1415.82m 23.60h 0.98d 0.003 y
# Average job time: 3s 0.05m 0.00h 0.00d
# Longest finished job: 12s 0.20m 0.00h 0.00d
# Submission to last job: 900s 15.00m 0.25h 0.01d
# Estimated complete: 0s 0.00m 0.00h 0.00d
# put the split maf results back together into a single maf file
# eliminate duplicate comments
ssh kkstore04
cd /cluster/data/petMar1/bed/multiz6way
mkdir togetherMaf
grep "^##maf version" splitRun/maf/0/0/Contig00000.maf \
| sort -u > togetherMaf/petMar1.6way.maf
for F in `find ./splitRun/maf -type f -depth`
do
grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \
| sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g"
done | sort -u >> togetherMaf/petMar1.6way.maf
for F in `find ./splitRun/maf -type f -depth`
do
grep -v -h "^#" "${F}"
done >> togetherMaf/petMar1.6way.maf
grep "^##eof maf" splitRun/maf/0/0/Contig00000.maf \
| sort -u >> togetherMaf/petMar1.6way.maf
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/petMar1/multiz6way/maf
ln -s /cluster/data/petMar1/bed/multiz6way/togetherMaf/*.maf \
/gbdb/petMar1/multiz6way/maf/multiz6way.maf
# this generates an immense multiz6way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /scratch/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/petMar1/multiz6way/maf petMar1 multiz6way
# real 0m6.011s
# Loaded 232205 mafs in 1 files from /gbdb/petMar1/multiz6way/maf
# load summary table *NOT* - no Contigs are long enough to create
# anything for the summary table. This command produces an empty
# table.
time nice -n +19 cat /gbdb/petMar1/multiz6way/maf/*.maf \
| hgLoadMafSummary petMar1 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz6waySummary stdin
# real 5m58.150s
# Gap Annotation
# prepare bed files with gap info
ssh kkstore04
mkdir /cluster/data/petMar1/bed/multiz6way/anno
cd /cluster/data/petMar1/bed/multiz6way/anno
mkdir maf run
# these actually already all exist from previous multiple alignments
# if there are done you will see an ls of them
# or else the twoBitInfo command will be echoed, run it if you want
for DB in `cat ../species.list`
do
CDIR="/cluster/data/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `grep -v petMar1 ../../species.list`
do
echo "${DB} "
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
ssh memk
# temporarily copy the petMar1.6way.maf file onto the memk
# nodes /scratch/data/petMar1/maf/ directory
for R in 0 1 2 3 4 5 6 7
do
ssh mkr0u${R} mkdir /scratch/data/petMar1/maf
ssh mkr0u${R} rsync -a --progress \
/cluster/data/petMar1/bed/multiz6way/togetherMaf/petMar1.6way.maf \
/scratch/data/petMar1/maf/
done
mkdir /cluster/data/petMar1/bed/multiz6way/anno/splitMaf
# need to split up the single maf file into individual
# per-scaffold maf files to run annotation on
cd /cluster/data/petMar1/bed/multiz6way/anno/splitMaf
# create bed files to list approximately 1553 scaffolds in
# a single list, approximately 33 lists
cat << '_EOF_' > mkBedLists.pl
#!/usr/bin/env perl
use strict;
use warnings;
my $bedCount = 0;
my $i = 0;
my $bedFile = sprintf("file_%d.bed", $bedCount);
open (BF,">$bedFile") or die "can not write to $bedFile $!";
open (FH,"</cluster/data/petMar1/chrom.sizes") or
die "can not read /cluster/data/petMar1/chrom.sizes $!";
while (my $line = <FH>) {
chomp $line;
if ( (($i + 1) % 1553) == 0 ) {
printf "%s\n", $line;
close (BF);
++$bedCount;
$bedFile = sprintf("file_%d.bed", $bedCount);
open (BF,">$bedFile") or die "can not write to $bedFile $!";
}
++$i;
my ($chr, $size) = split('\s+',$line);
printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr;
}
close (FH);
close (BF);
'_EOF_'
# << happy emacs
chmod +x mkBedLists.pl
./mkBedLists.pl
# now, run a mafsInRegion on each one of those lists
cat << '_EOF_' > runOne
#!/bin/csh -fe
set runDir = "/cluster/data/petMar1/bed/multiz6way/anno/splitMaf"
set resultDir = $1
set bedFile = $resultDir.bed
mkdir -p $resultDir
mkdir -p /scratch/tmp/petMar1/$resultDir
pushd /scratch/tmp/petMar1/$resultDir
mafsInRegion $runDir/$bedFile -outDir . \
/scratch/data/petMar1/maf/petMar1.6way.maf
popd
rsync -q -a /scratch/tmp/petMar1/$resultDir/ ./$resultDir/
rm -fr /scratch/tmp/petMar1/$resultDir
rmdir --ignore-fail-on-non-empty /scratch/tmp/petMar1
'_EOF_'
# << happy emacs
chmod +x runOne
cat << '_EOF_' > template
#LOOP
./runOne $(root1)
#ENDLOOP
'_EOF_'
# << happy emacs
ls file*.bed > runList
gensub2 runList single template jobList
para create jobList
para try ... check ... push ... etc
# Completed: 70 of 70 jobs
# CPU time in finished jobs: 255s 4.24m 0.07h 0.00d 0.000 y
# IO & Wait Time: 1904s 31.74m 0.53h 0.02d 0.000 y
# Average job time: 31s 0.51m 0.01h 0.00d
# Longest finished job: 72s 1.20m 0.02h 0.00d
# Submission to last job: 143s 2.38m 0.04h 0.00d
cd /cluster/data/petMar1/bed/multiz6way/anno/run
cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set outDir = ../maf/$2
set result = $3
set input = $1
mkdir -p $outDir
cat $input | \
nice mafAddIRows -nBeds=nBeds stdin /scratch/data/petMar1/petMar1.2bit $result
'_EOF_'
# << happy emacs
chmod +x doAnno.csh
cat << '_EOF_' > template
#LOOP
./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
find ../splitMaf -type f -name "*.maf" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 31488 of 31488 jobs
# CPU time in finished jobs: 10711s 178.52m 2.98h 0.12d 0.000 y
# IO & Wait Time: 79430s 1323.83m 22.06h 0.92d 0.003 y
# Average job time: 3s 0.05m 0.00h 0.00d
# Longest finished job: 12s 0.20m 0.00h 0.00d
# Submission to last job: 3237s 53.95m 0.90h 0.04d
ssh kkstore04
# put them all back together into a single file
cd /cluster/data/petMar1/bed/multiz6way/anno
grep "^##maf version" maf/file_0/Contig0.maf \
| sort -u > petMar1.anno.6way.maf
find ./maf -type f -depth -name "*.maf" | while read F
do
grep -v -h "^#" "${F}"
done >> petMar1.anno.6way.maf
echo "##eof maf" >> petMar1.anno.6way.maf
XXX - running 2008-04-23 09:58
ls -og *.maf
# -rw-rw-r-- 1 285783792 Apr 23 09:58 petMar1.anno.6way.maf
ssh hgwdev
cd /cluster/data/petMar1/bed/multiz6way/anno
mkdir -p /gbdb/petMar1/multiz6way/anno
ln -s `pwd`/petMar1.anno.6way.maf \
/gbdb/petMar1/multiz6way/anno/multiz6way.maf
# by loading this into the table multiz6way, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /scratch/tmp
time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/petMar1/multiz6way/anno \
petMar1 multiz6way
# Loaded 296783 mafs in 1 files from /gbdb/petMar1/multiz6way/anno
# real 0m6.690s
# normally filter this for chrom size > 1,000,000 and only load
# those chroms. But this is a scaffold assembly, load everything.
# *NOT* Does not work on petMar1, all contigs are too small,
# this makes an empty table.
hgLoadMafSummary petMar1 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz6waySummary \
/gbdb/petMar1/multiz6way/anno/multiz6way.maf
# Created 121083 summary blocks from 3410157 components and 749940 mafs
# from /gbdb/petMar1/multiz6way/anno/multiz6way.maf
# by loading this into the table multiz6waySummary, it will replace
# the previously loaded table with the unannotated mafs
# remove the multiz6way*.tab files in this /scratch/tmp directory
rm multiz6way*.tab
# And, you can remove the previously loaded non-annotated maf file link:
rm /gbdb/petMar1/multiz6way/maf/multiz6way.maf
rmdir /gbdb/petMar1/multiz6way/maf
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-04-19)
# (auto) establish native host of /cluster/data/<assembly>
ssh kkstore04
# bash if not using bash shell already
mkdir /cluster/data/petMar1/blastDb
cd /cluster/data/petMar1
awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
if test -s 1meg.lst; then
twoBitToFa -seqList=1meg.lst petMar1.unmasked.2bit temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
rm temp.fa
fi
rm 1meg.lst
awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
if test -s less1meg.lst; then
twoBitToFa -seqList=less1meg.lst petMar1.unmasked.2bit temp.fa
faSplit about temp.fa 1000000 blastDb/y
rm temp.fa
fi
rm less1meg.lst
cd blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
ls *.nsq | wc -l
# 1018
mkdir -p /san/sanvol1/scratch/petMar1/blastDb
cd /cluster/data/petMar1/blastDb
for i in nhr nin nsq;
do
echo $i
cp *.$i /san/sanvol1/scratch/petMar1/blastDb
done
mkdir -p /cluster/data/petMar1/bed/tblastn.hg18KG
cd /cluster/data/petMar1/bed/tblastn.hg18KG
echo /san/sanvol1/scratch/petMar1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 1018 query.lst
# we want around 100,000 jobs per gig (0.0001 jobs per base)
numJobs=`awk '{sum += $2} END {print sum * 0.0001 }' /cluster/data/petMar1/chrom.sizes`
# we want around numJobs jobs
numGenes=`wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`
numQueries=`wc query.lst | awk '{print $1}'`
numKGFiles=`awk -v ng=$numGenes -v nq=$numQueries -v nj=$numJobs 'BEGIN {printf "%d", ng/(nj/nq);exit}' `
mkdir -p /cluster/bluearc/petMar1/bed/tblastn.hg18KG/kgfa
split -l $numKGFiles /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/petMar1/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/petMar1/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/petMar1/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
if test -s /cluster/data/petMar1/blastDb.lft
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/petMar1/blastDb.lft carry $f.2
else
mv $f.2 $f.3
fi
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
exit
ssh pk
cd /cluster/data/petMar1/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 103836 of 103836 jobs
# CPU time in finished jobs: 9418283s 156971.39m 2616.19h 109.01d 0.299 y
# IO & Wait Time: 867856s 14464.26m 241.07h 10.04d 0.028 y
# Average job time: 99s 1.65m 0.03h 0.00d
# Longest finished job: 683s 11.38m 0.19h 0.01d
# Submission to last job: 38789s 646.48m 10.77h 0.45d
ssh kkstore05
cd /cluster/data/petMar1/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh memk
cd /cluster/data/petMar1/bed/tblastn.hg18KG/chainRun
para create chainSpec
para try, check, push, check etc.
# Completed: 102 of 102 jobs
# CPU time in finished jobs: 4354s 72.57m 1.21h 0.05d 0.000 y
# IO & Wait Time: 70795s 1179.91m 19.67h 0.82d 0.002 y
# Average job time: 737s 12.28m 0.20h 0.01d
# Longest finished job: 1308s 21.80m 0.36h 0.02d
# Submission to last job: 2927s 48.78m 0.81h 0.03d
ssh kkstore04
cd /cluster/data/petMar1/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/petMar1/bed/tblastn.hg18KG/blastHg18KG.psl
cd ..
pslCheck blastHg18KG.psl
# load table
ssh hgwdev
cd /cluster/data/petMar1/bed/tblastn.hg18KG
hgLoadPsl petMar1 blastHg18KG.psl
# check coverage
featureBits petMar1 blastHg18KG
# 7171010 bases of 831696438 (0.862%) in intersection
featureBits petMar1 xenoRefGene:cds blastHg18KG -enrichment
# xenoRefGene:cds 0.676%, blastHg18KG 0.862%, both 0.362%, cover 53.51%, enrich 62.06x
ssh kkstore05
rm -rf /cluster/data/petMar1/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/petMar1/bed/tblastn.hg18KG/blastOut
#end tblastn
###########################################################################
## Annotate 6-way multiple alignment with gene annotations
## (DONE - 2008-01-08 - Hiram)
# Gene frames
## given previous survey done for 8-way alignment on Orangutan,
## try using the following tables for this gene annotation:
# hg18 is in a bit of transition at the moment, so use ensGene
# mm9 can use knownGene
# xenoMrna for braFlo1, petMar1
# ensGene v49 for galGal3, oryLat1
ssh hgwdev
mkdir /cluster/data/petMar1/bed/multiz6way/frames
cd /cluster/data/petMar1/bed/multiz6way/frames
mkdir genes
# knownGene
for DB in mm9
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# ensGene
for DB in hg18 galGal3 oryLat1
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# use xenoMrna for braFlo1, petMar1
for DB in braFlo1 petMar1
do
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
tmpMrna=${DB}.mrna.${tmpExt}
tmpCds=${DB}.cds.${tmpExt}
hgsql -N -e 'select xenoMrna.qName,cds.name,xenoMrna.* \
from xenoMrna,gbCdnaInfo,cds \
where (xenoMrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$DB > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
rm -f $tmpExt
echo "${DB} done"
done
ls -og genes
# -rw-rw-r-- 1 1040642 Apr 28 14:30 braFlo1.gp.gz
# -rw-rw-r-- 1 1557911 Apr 28 14:29 galGal3.gp.gz
# -rw-rw-r-- 1 2151412 Apr 28 14:29 hg18.gp.gz
# -rw-rw-r-- 1 1965274 Apr 28 14:28 mm9.gp.gz
# -rw-rw-r-- 1 1713692 Apr 28 14:29 oryLat1.gp.gz
# -rw-rw-r-- 1 596014 Apr 28 14:31 petMar1.gp.gz
ssh kkstore04
cd /cluster/data/petMar1/bed/multiz6way/frames
# hg18 is in a bit of transition at the moment, so use ensGene
# mm9 can use knownGene
# xenoMrna for braFlo1, petMar1
# ensGene v49 for galGal3, oryLat1
# anything to annotate is in a pair, e.g.: petMar1 genes/petMar1.gp.gz
time (cat ../anno/petMar1.anno.6way.maf | nice -n +19 genePredToMafFrames petMar1 stdin stdout petMar1 genes/petMar1.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz oryLat1 genes/oryLat1.gp.gz braFlo1 genes/braFlo1.gp.gz | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multiz6way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
# 37711 braFlo1
# 55189 petMar1
# 75566 oryLat1
# 95715 galGal3
# 107885 mm9
# 110835 hg18
# load the resulting file
ssh hgwdev
cd /cluster/data/petMar1/bed/multiz6way/frames
time nice -n +19 hgLoadMafFrames petMar1 multiz6wayFrames \
multiz6way.mafFrames.gz
# real 0m7.652s
# enable the trackDb entries:
# frames multiz6wayFrames
# irows on
#############################################################################
# phastCons 6-way (DONE - 2008-04-28 - Hiram)
# split 6way mafs into 10M chunks and generate sufficient statistics
# files for # phastCons
ssh memk
mkdir /cluster/data/petMar1/bed/multiz6way/msa.split
cd /cluster/data/petMar1/bed/multiz6way/msa.split
mkdir -p /san/sanvol1/scratch/petMar1/multiz6way/cons/ss
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/petMar1/bed/multiz6way/anno/maf
set WINDOWS = /san/sanvol1/scratch/petMar1/multiz6way/cons/ss
pushd $WINDOWS
set resultDir = $1
set c = $2
rm -fr $resultDir/$c
mkdir -p $resultDir
twoBitToFa -seq=$c /scratch/data/petMar1/petMar1.2bit /scratch/tmp/petMar1.$c.fa
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$resultDir/$c.maf -i MAF \
-M /scratch/tmp/petMar1.$c.fa \
-o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000
rm -f /scratch/tmp/petMar1.$c.fa
popd
mkdir -p $resultDir
date > $resultDir/$c.out
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out}
#ENDLOOP
'_EOF_'
# << happy emacs
# create list of maf files:
(cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... etc
# Completed: 31488 of 31488 jobs
# CPU time in finished jobs: 2289s 38.15m 0.64h 0.03d 0.000 y
# IO & Wait Time: 82557s 1375.95m 22.93h 0.96d 0.003 y
# Average job time: 3s 0.04m 0.00h 0.00d
# Longest finished job: 40s 0.67m 0.01h 0.00d
# Submission to last job: 3052s 50.87m 0.85h 0.04d
# take the cons and noncons trees from the mouse 30-way
# Estimates are not easy to make, probably more correctly,
# take the 30-way .mod file, and re-use it here.
ssh hgwdev
cd /cluster/data/petMar1/bed/multiz6way
cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh memk
mkdir -p /cluster/data/petMar1/bed/multiz6way/cons/run.cons
cd /cluster/data/petMar1/bed/multiz6way/cons/run.cons
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working directory
# $cwd:t which is the "grp" in this script. It is one of:
# all gliers placentals
# Well, that's what it was when used in the Mm9 30-way,
# in this instance, there is only the directory "all"
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin
set subDir = $1
set f = $2
set c = $2:r
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/petMar1/bed/multiz6way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/petMar1/multiz6way/cons
if (-s $cons/$grp/$grp.non-inf) then
cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
cp -p $cons/$grp/$grp.mod $tmp
cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir
sleep 4
touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir
rm -f $san/$grp/pp/$subDir/$f.pp
rm -f $san/$grp/bed/$subDir/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$subDir
mv $tmp/$f.bed $san/$grp/bed/$subDir
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/petMar1/multiz6way/cons/all/pp/$(root1)/$(file1).pp}
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
pushd /san/sanvol1/scratch/petMar1/multiz6way/cons
find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \
> /cluster/data/petMar1/bed/multiz6way/cons/ss.list
popd
# run for all species
cd ..
mkdir -p all run.cons/all
cd all
/cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin/tree_doctor \
../../mm9.30way.mod \
--prune-all-but=petMar1,hg18,galGal3,mm9,oryLat1,braFlo1 \
> all.mod
# Actually, this doesn't work because petMar1 and braFlo1 are not
# in that business. So, manually place them in the TREE: line in all.mod,
# from the manufactured tree above, and the numbers from mm9.30way.mod:
# TREE: ((petMar1:1.6,(oryLat1:1.077873,(galGal3:0.488824,(mm9:0.325818,hg18:0.136019):0.470757):0.325954):0.4):0.4,braFlo1:2.0);
cd ../run.cons/all
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create template file for "all" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/petMar1/multiz6way/cons/all/pp/$(lastDir1)/$(file1).pp}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 10471 of 10471 jobs
# CPU time in finished jobs: 991s 16.52m 0.28h 0.01d 0.000 y
# IO & Wait Time: 68810s 1146.83m 19.11h 0.80d 0.002 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest finished job: 10s 0.17m 0.00h 0.00d
# Submission to last job: 4767s 79.45m 1.32h 0.06d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/petMar1/multiz6way/cons/all
find ./bed -type f -name "Contig*.bed" | xargs cat \
| sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 3 minutes
cp -p mostConserved.bed /cluster/data/petMar1/bed/multiz6way/cons/all
# load into database
ssh hgwdev
cd /cluster/data/petMar1/bed/multiz6way/cons/all
time nice -n +19 hgLoadBed petMar1 phastConsElements6way mostConserved.bed
# Loaded 76016 elements of size 5
# Try for 5% overall cov, and 70% CDS cov
# We don't have any gene tracks to compare CDS coverage
# --rho .31 --expected-length 45 --target-coverage .3
featureBits petMar1 phastConsElements6way
# 23909522 bases of 831696438 (2.875%) in intersection
# Create merged posterier probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/petMar1/multiz6way/cons/all
mkdir -p phastCons6wayScores
for D in `ls -1d pp/file* | sort -t_ -k2n`
do
F=${D/pp\/}
out=phastCons6wayScores/${F}.data.gz
echo "${D} > ${F}.data.gz"
ls -S ${D}/*.pp | xargs cat | gzip > ${out}
done
# copy the phastCons6wayScores to:
# /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way/phastConsScores
# for hgdownload downloads
# Create merged posterier probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
cd /san/sanvol1/scratch/petMar1/multiz6way/cons/all
ls -1 phastCons6wayScores/*.data.gz | sort -t_ -k2n | xargs zcat \
| wigEncode stdin phastCons6way.wig phastCons6way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
time nice -n +19 cp -p *.wi? /cluster/data/petMar1/bed/multiz6way/cons/all
# real 0m1.729s
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/petMar1/bed/multiz6way/cons/all
ln -s `pwd`/phastCons6way.wib /gbdb/petMar1/multiz6way/phastCons6way.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/petMar1/multiz6way petMar1 \
phastCons6way phastCons6way.wig
# real 0m2.068s
# remove garbage
rm wiggle.tab
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/petMar1/bed/multiz6way/cons/all
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=petMar1 phastCons6way > histogram.data 2>&1
# real 5m0.608s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Lamprey PetMar1 Histogram phastCons6way track"
set xlabel " phastCons6way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
# These trackDb entries turn on the wiggle phastCons data track:
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# wiggle phastCons6way
# spanList 1
# autoScaleDefault Off
# windowingFunction mean
# pairwiseHeight 12
# yLineOnOff Off
#############################################################################
# Downloads (DONE - 2008-04-29 - Hiram)
# Let's see if the downloads will work
ssh hgwdev
cd /cluster/data/petMar1
# expecting to find repeat masker .out file here:
ln -s bed/RepeatMasker/petMar1.fa.out .
gunzip bed/simpleRepeat/trfMask.bed.gz
time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \
-workhorse=hgwdev petMar1 > jkStuff/downloads.log 2>&1
# real 7m13.080s
# failed making upstream sequences:
# featureBits calJac1 mgcGenes:upstream:1000 -fa=stdout
# setpriority: Permission denied.
# the 'nice' from my bash shell causes trouble inside the csh
# script which uses nice. Finish off the install step manually
# with the mgcGenes upstreams ...
#############################################################################
# PushQ entries (DONE - 2008-04-29 - Hiram)
ssh hgwdev
/cluster/data/petMar1
/cluster/bin/scripts/makePushQSql.pl petMar1 > jkStuff/pushQ.sql
# output warnings:
# petMar1 does not have seq
# Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
# and genbank tables) which tracks to assign these tables to:
# gbWarn
# multiz6wayFrames
# phastCons6way
scp -p jkStuff/pushQ.sql hgwbeta:/tmp/petMar1.sql
ssh hgwbeta
cd /tmp
hgsql qapushq < petMar1.sql
#############################################################################
# Create 6-way downloads (DONE - 2008-05-01 - Hiram)
ssh hgwdev
mkdir -p /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way
cd /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way
# if these haven't been copied here from before:
cp -p \
/san/sanvol1/scratch/petMar1/multiz6way/cons/all/phastCons6wayScores/* .
ln -s ../../cons/all/all.mod ./6way.mod
cp /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way/README.txt .
# edit that README.txt to be correct for this 6-way alignment
cd ..
mkdir multiz6way
cd multiz6way
cp -p /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way/README.txt .
# edit that README.txt to be correct for this 6-way alignment
ssh kkstore04
cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way
ln -s ../../petMar1.6-way.nh ./6way.nh
time nice -n +19 gzip -c ../../anno/petMar1.anno.6way.maf \
> petMar1.6way.maf.gz
# real 0m49.861s
ssh hgwdev
cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way
# creating upstream files from xenoRefGene, bash script:
cat << '_EOF_' > mkUpstream.sh
#!/bin/bash
DB=petMar1
GENE=xenoRefGene
NWAY=multiz6way
export DB GENE
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
stdin stdout \
-orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
| gzip -c > upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
'_EOF_'
# << happy emacs
chmod +x ./mkUpstream.sh
time nice -n +19 ./mkUpstream.sh
# real 2m26.254s
# -rw-rw-r-- 1 61861 May 1 16:10 upstream1000.maf.gz
# -rw-rw-r-- 1 104203 May 1 16:11 upstream2000.maf.gz
# -rw-rw-r-- 1 195382 May 1 16:12 upstream5000.maf.gz
# check the names in these upstream files to ensure sanity:
zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
| sort | uniq -c | sort -rn | less
# should be a list of the other 4 species with a high count,
# then xenoRefGene names, e.g.:
# 206 oryLat1
# 206 mm9
# 206 hg18
# 206 galGal3
# 206 braFlo1
# 8 NM_054045
# 7 NM_013550
# 5 NM_069752
ssh kkstore04
cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way
md5sum *.maf.gz > md5sum.txt
cd ../phastCons6way
md5sum *.data.gz *.mod > md5sum.txt
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/petMar1/multiz6way
mkdir /usr/local/apache/htdocs/goldenPath/petMar1/phastCons6way
cd /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/petMar1/multiz6way
rm /usr/local/apache/htdocs/goldenPath/petMar1/multiz6way/mkUpstream.sh
cd ../phastCons6way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/petMar1/phastCons6way
# if your ln -s `pwd`/* made extra links to files you don't want there,
# check the goldenPath locations and remove those extra links
#############################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
# SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-27 - Hiram
ssh kkstore01 # not too important since everything moved to hive
screen # use screen to control this job
cd /cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26
cat fb.oryLat2.chainPetMar1Link.txt
# 41568923 bases of 700386597 (5.935%) in intersection
# And, for the swap:
mkdir /cluster/data/petMar1/bed/blastz.oryLat2.swap
cd /cluster/data/petMar1/bed/blastz.oryLat2.swap
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/oryLat2/bed/blastz.petMar1.2008-08-26/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-smallClusterHub=pk -bigClusterHub=kk > swap.log 2>&1 &
# real 52m27.942s
cat fb.petMar1.chainOryLat2Link.txt
# 39181422 bases of 831696438 (4.711%) in intersection
############################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
petMar1.upstreamGeneTbl = xenoRefGene
petMar1.upstreamMaf = multiz6way /hive/data/genomes/petMar1/bed/multiz6way/species.list
############################################################################
# QUALITY TRACK (DONE - 2008-11-25 - Hiram)
cd /hive/data/genomes/petMar1/downloads
qaToQac contigs.fa.quals.gz assembly.qac
qacAgpLift ../petMar1.agp assembly.qac scaffolds.qac
mkdir /hive/data/genomes/petMar1/bed/qual
cd /hive/data/genomes/petMar1/bed/qual
qacToWig -fixed ../../downloads/scaffolds.qac stdout \
| wigEncode stdin qual.wig qual.wib
ln -s `pwd`/qual.wib /gbdb/petMar1/wib
hgLoadWiggle -pathPrefix=/gbdb/petMar1/wib petMar1 quality qual.wig
+############################################################################
+# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
+
+vertebrate-wide transMap alignments were built Tracks are created and loaded
+by a single Makefile. This is available from:
+ svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
+
+see doc/builds.txt for specific details.
+############################################################################