src/hg/makeDb/doc/mm5.txt 1.4
1.4 2009/11/25 21:48:41 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/mm5.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/mm5.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/makeDb/doc/mm5.txt 14 Jan 2008 23:06:14 -0000 1.3
+++ src/hg/makeDb/doc/mm5.txt 25 Nov 2009 21:48:41 -0000 1.4
@@ -1,7901 +1,7901 @@
# This file describes how we made the browser database on the mouse
# genome, June 2004 build. - Mm5
#
#
# NOTE: There is a new chrMT sequence in the build 32
# >gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion
#
# Will have to beware of this NC_ contig in the processing since
# all previous builds had only NT_ contigs
#
# NOTE: The README_PREBUILD file for this assembly mentions several
# differences from the previous release (build 30):
# 1. seq_contig.md - new first line is a comment containing column name
# Also, last two columns (group label and weight, have been swapped)
# Also, some lines have id with CONTIG: prepended, and upper-case
# feature type (CONTIG)
# 2. contig.idmap - has an additional column "contig label"
# This required changing the jkStuff ncbi* utilities (7/1/03 KRR)
#
# DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2004-06-27 - Fan)
ssh kksilo
mkdir -p /cluster/store6/mm5/ncbi
ln -s /cluster/store6/mm5 /cluster/data
cd /cluster/data/mm5/ncbi
mkdir chrfasta contigfasta
ftp ftp.ncbi.nih.gov
# user hgpguest, password from /cse/faculty/kent/buildHg6.doc
cd mouse_33
prompt
bin
mget *
quit
gunzip *.agp.gz
# compress chrY.fa (at the NCBI site, this one file somehow was not compressed)
cd chrfasta
gzip chrY.fa
cd ..
#use chrMT.fa.gz from mm4 instead because its first line format is correct
cp -p /cluster/store6/mm4/ncbi/chrfasta/chrMT.fa.gz chrfasta
cp -p /cluster/store6/mm4/ncbi/contigfasta/chrMT.fa.gz contigfasta
# Fix the troubles caused by chrMT released later separately
# Fixed allcontig.agp
# add the last line of .../mm4/ncbi/allcontig.agp to allcontig.agp
# Fixed allrefcontig.chr.agp
# add the last line of .../mm4/ncbi/allrefcontig.chr.agp to allrefcontig.chr.agp
# Fix contig.idmap
cat contig.idmap chrMT/contig.idmap >new.idmap
mv new.idmap contig.idmap
# Fix seq_contig.md
# Edit seq_contig.md to add 3 lines (from mm4) in its middle before Un|...
10090 MT 0 0 + start -1 CONTIG C57BL/6J 10
10090 MT 1 16299 + NC_005089 GI:34538597 CONTIG C57BL/6J na
10090 MT 16299 16299 + end -2 CONTIG C57BL/6J 10
# ctg_coords, contig_overlaps.agp and sequence.inf not fixed.
# Check chromosome files (DONE - 2004-06-27 - Fan)
cd chrfasta
foreach f (*.fa.gz)
echo $f:r >> faSize.out
gunzip $f
/cluster/bin/i386/faSize $f:r >> faSize.out
echo $f:r done
end
/cluster/bin/i386/faSize *.fa >> faSize.out
grep "^>" *.fa > ../chrfasta.all.fa.headers
gzip *.fa
cd ../contigfasta
gunzip *.fa.gz
grep "^>" *.fa > ../contigfasta.all.fa.headers
gzip *.fa
# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS
# (DONE - 2004-06-27 - Fan)
ssh kksilo
cd /cluster/data/mm5
gunzip ncbi/allrefcontig.chr.agp.gz
# splitFaIntoContigs doesn't do right with agp lines arriving in a
# different order than fasta chrom sequences. so split up the agp
# into one per chrom.
foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y MT Un)
mkdir $c
perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
./ncbi/allrefcontig.chr.agp \
> $c/chr$c.agp
gunzip -c ./ncbi/chrfasta/chr$c.fa.gz \
| perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' \
| splitFaIntoContigs $c/chr$c.agp \
stdin /cluster/data/mm5 -nSize=5000000
end
# gzip ncbi/chrfasta/chr*.fa
# CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2004-06-27 - Fan)
ssh kksilo
cd /cluster/data/mm5/ncbi
gunzip seq_contig.md.gz
# reorder random contigs in allrefcontig agp file to match seq_contig.md
# this is required by the ncbiToRandomAgps scripts
# had to fixup ncbiToRandomAgps from previous use to match the
# lines better, and to do the MT/NC_ mitochondrion thing
mkdir /cluster/store6/mm5/jkStuff
# copy scripts used from previous trial mm5 build
cd /cluster/data/mm5
cp -p ~/mm50/jkStuff/* jkStuff
cd /cluster/data/mm5/ncbi
../jkStuff/ncbiFixAgp allrefcontig.chr.agp > \
allrefcontig.chr.ordered.agp
#Edit MANUALLY ../jkStuff/ncbiToRandomAgps, to change build 32 to build 33.
../jkStuff/ncbiToRandomAgps seq_contig.md allrefcontig.chr.ordered.agp \
contig.idmap ..
# creating ../mm5/1/chr1_random.agp...
# ... creating ../mm5/Un/chrUn_random.agp...
# The chrUn_random.agp created by this is too large with the 5000
# gaps. it will work with 1000 gaps, so fixup the chrUn_random agp:
../jkStuff/ncbiToRandomAgps -gapLen 1000 -chrom Un \
seq_contig.md allrefcontig.chr.ordered.agp contig.idmap ..
ssh kksilo
cd /cluster/data/mm5
foreach c (?{,?})
if (-e $c/chr${c}_random.ctg.agp) then
echo building $c/chr${c}_random.fa
gunzip -c ./ncbi/contigfasta/chr$c.fa.gz \
| perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' \
> ./tmp.fa
agpToFa -simpleMulti $c/chr${c}_random.ctg.agp chr${c}_random \
$c/chr${c}_random.fa ./tmp.fa
rm tmp.fa
endif
end
# building 1/chr1_random.fa
# ... etc ...
# building Un/chrUn_random.fa
# Writing 102265694 bases to Un/chrUn_random.fa
# Clean these up to avoid confusion later... they're easily rebuilt
# with the ncbiToRandomAgps script above
rm ?/*.ctg.agp ??/*.ctg.agp
# BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS (DONE 2004-06-27 - Fan)
ssh kksilo
cd /cluster/data/mm5
foreach c (?{,?})
if (-e $c/chr${c}_random.agp) then
splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . \
-nSize=5000000
mkdir -p $c/lift
mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst
mv ${c}_random/lift/ordered.lft $c/lift/random.lft
mv ${c}_random/lift/ordered.lst $c/lift/random.lst
rmdir ${c}_random/lift
rm ${c}_random/chr${c}_random.{agp,fa}
mv ${c}_random/* $c
rmdir ${c}_random
endif
end
# This has a lot of output. It is difficult to see if anything
# goes wrong.
# Fixup chrMT name to be chrM (DONE - 2004-06-27 - Fan)
ssh kksilo
cd /cluster/data/mm5
mv MT MT.ncbi
mkdir M
mkdir M/chrM_1
mkdir M/lift
cd MT.ncbi
bash
find . -type f | while read FN
do
NF=`echo $FN | sed -e "s/MT/M/g"`
sed -e "s/chrMT/chrM/g" $FN > ../M/$NF
done
# MAKE LIFTALL.LFT (DONE - 2004-06-27 - Fan)
cd /cluster/data/mm5
cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft
# 7:40 PM 6/27/04, used dark blue color above.
# Now changed to use dark pink color for things done.
# CREATING DATABASE (DONE 2004-06-27 - Fan)
# First, clean out mm5 tables built by previous trial build.
# Rename all mm5.* tables to mm5_old4.*,
# then drop database mm5
o - Create the database.
ssh hgwdev
hgsql -e 'create database mm5;' ''
# if you need to delete this database: !!! WILL DELETE EVERYTHING !!!
# hgsql -e "drop database mm5;" mm5
o - Use df to make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
Filesystem Size Used Avail Use% Mounted on
/dev/sdc1 1.8T 383G 1.3T 24% /var/lib/mysql
# CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2004-06-27 - Fan)
# Use any of the newest databases to ensure that the organization
# of the grp table is up to date
ssh hgwdev
hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" mm5
# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE - 2004-06-27 - Fan)
# Create (unmasked) nib files
ssh kksilo
cd /cluster/data/mm5
mkdir -p unmaskedNib
foreach f (?{,?}/chr?{,?}{,_random}.fa)
echo $f:t:r
faToNib $f unmaskedNib/$f:t:r.nib
end
# Create symbolic links from /gbdb/mm5/nib to real nib files
# These unmasked Nib files are temporary just to get the browser
# up and running immediately. After the masking is done and masked
# sequence is created, these nibs will be replaced with the masked
# nibs
ssh hgwdev
mkdir -p /gbdb/mm5/nib
cd /gbdb/mm5/nib
ln -s /cluster/data/mm5/unmaskedNib/chr*.nib .
# Load /gbdb nib paths into database and save size info.
ssh hgwdev
cd /cluster/data/mm5
hgsql mm5 < ~/kent/src/hg/lib/chromInfo.sql
hgNibSeq -preMadeNib mm5 /gbdb/mm5/nib ?{,?}/chr?{,?}{,_random}.fa
# 3164952073 total bases
# NOTE: mm4 was 2952612207, an increase of 212 Mb (~7.2%)
hgsql -N -e "select chrom,size from chromInfo;" mm5 > chrom.sizes
# check the resulting file chrom.sizes
# Store o+o info in database.
cd /cluster/data/mm5/ncbi
gunzip sequence.inf
cd /cluster/data/mm5
ln -s ncbi ffa
# remove so as not to confuse hgGoldGap -- they are easily regenerated
rm */chr*.ctg.agp
# to undo/redo:
# jkStuff/dropSplitTable.csh gap
# jkStuff/dropSplitTable.csh gold
/cluster/bin/i386/hgGoldGapGl mm5 /cluster/data/mm5 .
featureBits mm5 gold
# 2615483787 bases of 2615483787 (100.000%) in intersection
featureBits mm4 gold
# 2627444668 bases of 2627444668 (100.000%) in intersection
featureBits mm5 gap
# 549468286 bases of 2615483787 (21.008%) in intersection
featureBits mm4 gap
# 325167539 bases of 2627444668 (12.376%) in intersection
featureBits mm3 gap
# 202319873 bases of 2505900260 (8.074%) in intersection
# Make and load GC percent table (DONE - 2004-06-27 - Fan)
# NOT REQUIRED, been replaced by gc5Base procedure below
ssh hgwdev
mkdir -p /cluster/data/mm5/bed/gcPercent
cd /cluster/data/mm5/bed/gcPercent
hgsql mm5 < ~/kent/src/hg/lib/gcPercent.sql
hgGcPercent mm5 ../../unmaskedNib
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR MM5 (DONE - 2004-06-27 - Fan)
# using the Mm3 position blatted onto Mm5:
# Enter mm5 into hgcentraltest.dbDb so test browser knows about it:
hgsql -e 'INSERT INTO dbDb \
(name, description, nibPath, organism, defaultPos, \
active, orderKey, genome, scientificName, htmlPath, \
hgNearOk, hgPbOk, sourceName) \
VALUES("mm5", "May 2004", "/gbdb/mm5/nib", "Mouse", \
"chr6:121658238-121674165", \
1, 20, "Mouse", "Mus musculus", "/gbdb/mm5/html/description.html",\
0, 0, "NCBI Build 33");' \
-h genome-testdb hgcentraltest
# If you need to delete that entry:
hgsql -e 'delete from dbDb where name="mm5";' -h genome-testdb hgcentraltest
# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd ~kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add mm5 in all the right places and do
make update
make alpha
cvs commit makefile
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR MM5 (DONE - 2004-07-14 Fan)
ssh hgwdev
# Make one big 2bit file as well, and make a link to it in
# /gbdb/mm5/nib because hgBlat looks there:
cd /cluster/data/mm5
faToTwoBit */chr*.fa mm5.2bit
ln -s /cluster/data/mm5/mm5.2bit /gbdb/mm5/nib/
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("mm5", "snort", "17778", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("mm5", "snort", "17779", "0", "1");' \
-h genome-testdb hgcentraltest
# REPEAT MASKING (Working on 2004-06-27 Fan)
# TRF simpleRepeat below can be run at the same time
# Split contigs, run RepeatMasker, lift results
# * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make
# RepeatMasker runs manageable on the cluster ==> results need lifting.
# * For the NCBI assembly we repeat mask on the sensitive mode setting
# (RepeatMasker -m -s -ali)
#- Split contigs into 500kb chunks:
ssh kksilo
cd /cluster/data/mm5
foreach d ( */chr?{,?}{,_random}_?{,?} )
cd $d
set contig = $d:t
faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-maxN=500000
cd ../..
end
# ...
# 11 pieces of 11 written
# 1 pieces of 1 written
# ...
#- Make the run directory and job list:
cd /cluster/data/mm5
cat << '_EOF_' > jkStuff/RMMouse
#!/bin/csh -fe
# RepeatMasker wrapper for a single cluster job.
#   $1 - directory that holds the sequence file
#   $2 - name of the sequence file inside $1
# The file is copied into a scratch directory under /tmp/mm5, masked there,
# and the result files are copied back into $1.
cd $1
pushd .
# scratch directory is named after the sequence file itself
/bin/mkdir -p /tmp/mm5/$2
/bin/cp $2 /tmp/mm5/$2
cd /tmp/mm5/$2
# run with the -ali -s settings and -species mus
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -species mus $2
# back to $1 (saved by the pushd above)
popd
# the .out file is copied unconditionally; the other outputs only if present
/bin/cp /tmp/mm5/$2/$2.out ./
if (-e /tmp/mm5/$2/$2.align) /bin/cp /tmp/mm5/$2/$2.align ./
if (-e /tmp/mm5/$2/$2.tbl) /bin/cp /tmp/mm5/$2/$2.tbl ./
if (-e /tmp/mm5/$2/$2.cat) /bin/cp /tmp/mm5/$2/$2.cat ./
# clean up the scratch area; the rmdir calls do not fail on non-empty
# directories (e.g. /tmp/mm5 still in use by another job)
/bin/rm -fr /tmp/mm5/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/mm5/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/mm5
'_EOF_'
chmod +x jkStuff/RMMouse
mkdir -p RMRun
rm -f RMRun/RMJobs
foreach d ( ?{,?}/chr*_?{,?} )
foreach f ( $d/chr*_?{,?}_?{,?}.fa )
set f = $f:t
echo /cluster/data/mm5/jkStuff/RMMouse \
/cluster/data/mm5/$d $f \
'{'check out line+ /cluster/data/mm5/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
#- Do the run
ssh kk
cd /cluster/data/mm5/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
[kk:RMRun> para check
6885 jobs in batch
8 jobs (including everybody's) in Parasol queue.
Checking finished jobs.
ranOk: 6885
total jobs in batch: 6885
[kk:RMRun> para time
6885 jobs in batch
8 jobs (including everybody's) in Parasol queue.
Checking finished jobs
Completed: 6885 of 6885 jobs
CPU time in finished jobs: 40084305s 668071.74m 11134.53h 463.94d 1.271 y
IO & Wait Time: 122589s 2043.16m 34.05h 1.42d 0.004 y
Average job time: 5840s 97.33m 1.62h 0.07d
Longest job: 9804s 163.40m 2.72h 0.11d
Submission to last job: 46771s 779.52m 12.99h 0.54d
# Done 11:57 AM 6/28/04
#- Lift up the split-contig .out's to contig-level .out's
ssh kksilo
cd /cluster/data/mm5
foreach d ( ?{,?}/chr*_?{,?} )
cd $d
set contig = $d:t
liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
cd ../..
end
#- Lift up the contig-level .out's to chr-level
ssh kksilo
cd /cluster/data/mm5
./jkStuff/liftOut5.csh
# This one error is OK
# Can not find Un/lift/ordered.lft .
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/mm5
# to redo:
# ./jkStuff/dropSplitTable.csh rmsk
# make sure there's no chrUn -- rm Un/chrUn.fa.out
hgLoadOut mm5 ?/*.fa.out ??/*.fa.out
# VERIFY REPEATMASKER RESULTS (DONE - 2004-06-28 Fan)
# Run featureBits on mm5 and on a comparable genome build, and compare:
ssh hgwdev
featureBits mm5 rmsk
#1137310280 bases of 2615483787 (43.484%) in intersection
featureBits mm4 rmsk
# 1130883581 bases of 2627444668 (43.041%) in intersection
featureBits mm3 rmsk
# 1080265553 bases of 2505900260 (43.109%) in intersection
#cd /cluster/data/mm5
#awk '{print $1}' chrom.sizes | sed -e "s/chr//" | grep -v random > chrom.lst
# SIMPLE REPEAT TRACK (DONE - 2004-06-29 Fan)
# TRF can be run in parallel with RepeatMasker on the file server
# since it doesn't require masked input sequence.
ssh kksilo
mkdir /cluster/data/mm5/bed/simpleRepeat
cd /cluster/data/mm5/bed/simpleRepeat
mkdir trf
rm -f jobs.csh
echo '#\!/bin/csh -fe' > jobs.csh
# create job list of 5MB chunks
foreach f \
(/cluster/data/mm5/?{,?}/chr?{,?}_[0-9]*/chr?{,?}_?{,?}.fa \
/cluster/data/mm5/?{,?}/chr*_random_?{,?}/chr*_random_?{,?}.fa)
set fout = $f:t:r.bed
echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
>> jobs.csh
end
chmod +x jobs.csh
wc jobs.csh
# 640 3836 90839 jobs.csh
./jobs.csh >&! jobs.log &
# in bash: ./jobs.csh > jobs.log 2>&1 &
tail -f jobs.log
# Done 3:07 PM 6/29/04, took about 6 hours.
# When job is done lift output files
liftUp simpleRepeat.bed /cluster/data/mm5/jkStuff/liftAll.lft warn trf/*.bed
# Load into the database
ssh hgwdev
cd /cluster/data/mm5/bed/simpleRepeat
hgLoadBed mm5 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
# Loaded 1150615 elements of size 16
featureBits mm5 simpleRepeat
# 81414259 bases of 2615483787 (3.113%) in intersection
featureBits mm4 simpleRepeat
# 82600648 bases of 2627444668 (3.144%) in intersection
featureBits mm3 simpleRepeat
# 75457193 bases of 2505900260 (3.011%) in intersection
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2004-06-29 - Fan)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kksilo
cd /cluster/data/mm5/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/chr*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords
cd /cluster/data/mm5
mkdir -p bed/simpleRepeat/trfMaskChrom
foreach c (?{,?})
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
else
echo "WARNING NO FILE: $c/lift/ordered.lst"
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# NOTE: ignore warning about non-existent Un/lift/ordered.lst
# since there is no chrUn
# MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF
# (Working on - 2004-06-29 Fan)
ssh kksilo
cd /cluster/data/mm5
#- Soft-mask (lower-case) the contig and chr .fa's
./jkStuff/makeFaMasked.csh >&! maskFa.out &
# bash: ./jkStuff/makeFaMasked.csh > maskFa.out 2>&1 &
tail -100f maskFa.out
#- Make hard-masked .fa.masked files as well:
./jkStuff/makeHardMasked.csh
Edited ./jkStuff/makeNib.csh to comment out "if ..." and "endif" as below:
#!/bin/csh -fe
mkdir -p nib mixedNib maskedNib
foreach i (?{,?})
cd $i
# foreach j (chr$i{,_random}.fa)
foreach j (*.fa)
# if (-e "${j}")
set r = $j:r
/cluster/bin/i386/faToNib $j ../nib/$r.nib
/cluster/bin/i386/faToNib -softMask $j ../mixedNib/$r.nib
/cluster/bin/i386/faToNib -hardMask $j ../maskedNib/$r.nib
# endif
echo done $j
end
cd ..
end
#- Rebuild the nib, mixedNib, maskedNib files:
./jkStuff/makeNib.csh
# ignore complaints about missing chrUn
# Redo symbolic links from /gbdb/mm5/nib to
# mixed (RM and TRF) soft-masked nib files
ssh hgwdev
rm -fr /gbdb/mm5/nib/*
ln -s /cluster/data/mm5/mixedNib/chr*.nib /gbdb/mm5/nib
# Copy data to /cluster/bluearc for cluster runs
ssh kksilo
# masked contigs
rm -fr /cluster/bluearc/scratch/mus/mm5/trfFa
mkdir -p /cluster/bluearc/scratch/mus/mm5/trfFa
cp -p /cluster/data/mm5/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa \
/cluster/bluearc/scratch/mus/mm5/trfFa
# masked chrom nibs
cd /cluster/data/mm5
rm -fr /cluster/bluearc/scratch/mus/mm5/softNib
mkdir -p /cluster/bluearc/scratch/mus/mm5/softNib
cp -p mixedNib/chr*.nib /cluster/bluearc/scratch/mus/mm5/softNib
rm -fr /cluster/bluearc/scratch/mus/mm5/hardNib
mkdir -p /cluster/bluearc/scratch/mus/mm5/hardNib
cp -p maskedNib/chr*.nib /cluster/bluearc/scratch/mus/mm5/hardNib
# fasta files
rm -fr /cluster/bluearc/scratch/mus/mm5/fasta
mkdir -p /cluster/bluearc/scratch/mus/mm5/fasta
cp -p ?/*.fa ??/*.fa /cluster/bluearc/scratch/mus/mm5/fasta
# RepeatMasker *.out files
rm -rf /cluster/bluearc/scratch/mus/mm5/rmsk
mkdir -p /cluster/bluearc/scratch/mus/mm5/rmsk
cp -p ?{,?}/chr?{,?}{,_random}.fa.out /cluster/bluearc/scratch/mus/mm5/rmsk
# lift file, for mrna processing
cp -p jkStuff/liftAll.lft /cluster/bluearc/scratch/mus/mm5
#above was done 6/29/04 4:50PM
# also copy to iservers
ssh kkr1u00
#cd ~/mm5
cd /cluster/bluearc/scratch/mus/mm5
mkdir /iscratch/i/mus/mm5
cp -p liftAll.lft /iscratch/i/mus/mm5
mkdir -p /iscratch/i/mus/mm5/softNib
cp -p /cluster/bluearc/scratch/mus/mm5/softNib/chr*.nib /iscratch/i/mus/mm5/softNib
mkdir -p /iscratch/i/mus/mm5/trfFa
cd /cluster/store6/mm5
cp ?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa /cluster/bluearc/scratch/mus/mm5/trfFa
/cluster/bin/scripts/iSync
ssh kkr1u00
mkdir /iscratch/i/mus/mm5
cd /iscratch/i/mus
rsync -arlv /cluster/bluearc/scratch/mus/mm5 .
#wrote 8660800915 bytes read 15380 bytes 17729409.00 bytes/sec
#total size is 10242205742 speedup is 1.18
cd /iserver/kkr1u00/i/mus/mm5
mv trfFa maskedContigs
cd /cluster/bluearc/scratch/mus/mm5
mv trfFa maskedContigs
# PREPARE CLUSTER FOR BLASTZ RUN (DONE - 2004-06-29 - Fan)
ssh kksilo
mkdir -p /cluster/bluearc/scratch/mus/mm5/rmsk.spec
cd /cluster/bluearc/scratch/mus/mm5/rmsk.spec
ln -s ../rmsk/*.out .
# NOTE: DON'T leave indentations in the script below.
cat << '_EOF_' > runArian.sh
#!/bin/sh
for FN in *.out
do
echo ${FN}
/cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \
${FN} -query mouse -comp human -comp rat
done
'_EOF_'
chmod +x runArian.sh
./runArian.sh
cd /cluster/bluearc/scratch/mus/mm5
mkdir linSpecRep.notInHuman
mkdir linSpecRep.notInRat
foreach f (rmsk.spec/*.out_hum_rat)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInHuman/$base.out.spec
end
foreach f (rmsk.spec/*.out_hum_rat)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 2 $f > \
linSpecRep.notInRat/$base.out.spec
end
cp rmsk.spec /iscratch/i/mus/mm5 -Rp
cp linSpecRep.notInRat /iscratch/i/mus/mm5 -Rp
cp linSpecRep.notInHuman /iscratch/i/mus/mm5 -Rp
/cluster/bin/scripts/iSync
# Request rsync /cluster/bluearc/scratch/mus/mm5 to the KiloKluster
# GC5BASE WIGGLE TRACK (DONE - 2004-06-24 - Hiram)
# This previously was a script that ran through each nib.
# Recently transformed into a mini cluster run.
ssh kki
mkdir /cluster/data/mm5/bed/gc5Base
cd /cluster/data/mm5/bed/gc5Base
mkdir wigData5 dataLimits5 wigData5_1K dataLimits5_1K
cat << '_EOF_' > kkRun.sh
#!/bin/sh
# Cluster job: build the 5-base GC wiggle data for one chromosome.
#   $1 - nib file name; the .nib suffix is stripped to get the chromosome
NIB=$1
# POSIX suffix removal; the ${var/pattern/} form is a bash extension and is
# not guaranteed to work under #!/bin/sh
chr=${NIB%.nib}
# chromosome name without its leading "chr", used in the output names
chrom=${chr#chr}
# keep the GC rows that span at least 5 bases and print column 2 plus one and
# column 5 divided by 10; wigAsciiToBinary's stderr goes to dataLimits5/<chr>
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \
/cluster/data/mm5/mixedNib | \
grep -w GC | \
awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
wigAsciiToBinary -dataSpan=5 -chrom=${chr} \
-wibFile=wigData5/gc5Base_${chrom} \
-name=${chrom} stdin 2> dataLimits5/${chr}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x kkRun.sh
ls /cluster/data/mm5/mixedNib > nibList
cat << '_EOF_' > gsub
#LOOP
./kkRun.sh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 nibList single gsub jobList
para create jobList
para try, check, ... etc
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 4969s 82.81m 1.38h 0.06d 0.000 y
# IO & Wait Time: 611s 10.19m 0.17h 0.01d 0.000 y
# Average job time: 130s 2.16m 0.04h 0.00d
# Longest job: 370s 6.17m 0.10h 0.00d
# Submission to last job: 598s 9.97m 0.17h 0.01d
# load the .wig files back on hgwdev:
ssh hgwdev
cd /cluster/data/mm5/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base mm5 gc5Base wigData5/*.wig
# and symlink the .wib files into /gbdb
# (-p also creates the parent /gbdb/mm5/wib when it does not exist yet;
# a plain mkdir fails in that case)
mkdir -p /gbdb/mm5/wib/gc5Base
ln -s `pwd`/wigData5/*.wib /gbdb/mm5/wib/gc5Base
# And then the zoomed data view
ssh kki
cd /cluster/data/mm5/bed/gc5Base
# -p: both directories were already created together with wigData5 earlier in
# this section, so a plain mkdir would stop here with "File exists"
mkdir -p wigData5_1K dataLimits5_1K
cat << '_EOF_' > kkRunZoom.sh
#!/bin/sh
# Cluster job: build the 1000-base zoomed GC wiggle data for one chromosome.
#   $1 - nib file name; the .nib suffix is stripped to get the chromosome
NIB=$1
# POSIX suffix removal; the ${var/pattern/} form is a bash extension and is
# not guaranteed to work under #!/bin/sh
chr=${NIB%.nib}
# chromosome name without its leading "chr", used in the output names
chrom=${chr#chr}
# same GC extraction as kkRun.sh, then passed through wigZoom with
# -dataSpan=1000; wigAsciiToBinary's stderr goes to dataLimits5_1K/<chr>
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \
/cluster/data/mm5/mixedNib | \
grep -w GC | \
awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \
-chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \
-name=${chrom} stdin 2> dataLimits5_1K/${chr}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x kkRunZoom.sh
cat << '_EOF_' > gsubZoom
#LOOP
./kkRunZoom.sh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 nibList single gsubZoom jobListZoom
para create jobListZoom
para try ... check ... etc ...
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 4878s 81.29m 1.35h 0.06d 0.000 y
# IO & Wait Time: 488s 8.14m 0.14h 0.01d 0.000 y
# Average job time: 125s 2.08m 0.03h 0.00d
# Longest job: 378s 6.30m 0.10h 0.00d
# Submission to last job: 665s 11.08m 0.18h 0.01d
# Then load these .wig files into the same database as above
ssh hgwdev
cd /cluster/data/mm5/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base \
-oldTable mm5 gc5Base wigData5_1K/*.wig
# and symlink these .wib files into /gbdb
ln -s `pwd`/wigData5_1K/*.wib /gbdb/mm5/wib/gc5Base
# GC5BASE WIGGLE TRACK (DONE - 2004-07-01 - Hiram)
# This previously was a script that ran through each nib.
# Recently transformed into a mini cluster run.
ssh kki
mkdir /cluster/data/mm5/bed/gc5Base
cd /cluster/data/mm5/bed/gc5Base
mkdir wigData5 dataLimits5 wigData5_1K dataLimits5_1K
cat << '_EOF_' > kkRun.sh
#!/bin/sh
# Cluster job: build the 5-base GC wiggle data for one chromosome.
#   $1 - nib file name; the .nib suffix is stripped to get the chromosome
NIB=$1
# POSIX suffix removal; the ${var/pattern/} form is a bash extension and is
# not guaranteed to work under #!/bin/sh
chr=${NIB%.nib}
# chromosome name without its leading "chr", used in the output names
chrom=${chr#chr}
# keep the GC rows that span at least 5 bases and print column 2 plus one and
# column 5 divided by 10; wigAsciiToBinary's stderr goes to dataLimits5/<chr>
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \
/cluster/data/mm5/mixedNib | \
grep -w GC | \
awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
wigAsciiToBinary -dataSpan=5 -chrom=${chr} \
-wibFile=wigData5/gc5Base_${chrom} \
-name=${chrom} stdin 2> dataLimits5/${chr}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x kkRun.sh
ls /cluster/data/mm5/mixedNib > nibList
cat << '_EOF_' > gsub
#LOOP
./kkRun.sh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 nibList single gsub jobList
para create jobList
para try, check, ... etc
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 4857s 80.94m 1.35h 0.06d 0.000 y
# IO & Wait Time: 121s 2.02m 0.03h 0.00d 0.000 y
# Average job time: 116s 1.93m 0.03h 0.00d
# Longest job: 335s 5.58m 0.09h 0.00d
# Submission to last job: 516s 8.60m 0.14h 0.01d
# load the .wig files back on hgwdev:
ssh hgwdev
cd /cluster/data/mm5/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base mm5 gc5Base wigData5/*.wig
# and symlink the .wib files into /gbdb
mkdir /gbdb/mm5/wib
mkdir /gbdb/mm5/wib/gc5Base
ln -s `pwd`/wigData5/*.wib /gbdb/mm5/wib/gc5Base
# And then the zoomed data view
ssh kki
cd /cluster/data/mm5/bed/gc5Base
# -p: both directories were already created together with wigData5 earlier in
# this section, so a plain mkdir would stop here with "File exists"
mkdir -p wigData5_1K dataLimits5_1K
cat << '_EOF_' > kkRunZoom.sh
#!/bin/sh
# Cluster job: build the 1000-base zoomed GC wiggle data for one chromosome.
#   $1 - nib file name; the .nib suffix is stripped to get the chromosome
NIB=$1
# POSIX suffix removal; the ${var/pattern/} form is a bash extension and is
# not guaranteed to work under #!/bin/sh
chr=${NIB%.nib}
# chromosome name without its leading "chr", used in the output names
chrom=${chr#chr}
# same GC extraction as kkRun.sh, then passed through wigZoom with
# -dataSpan=1000; wigAsciiToBinary's stderr goes to dataLimits5_1K/<chr>
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \
/cluster/data/mm5/mixedNib | \
grep -w GC | \
awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \
-chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \
-name=${chrom} stdin 2> dataLimits5_1K/${chr}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x kkRunZoom.sh
cat << '_EOF_' > gsubZoom
#LOOP
./kkRunZoom.sh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 nibList single gsubZoom jobListZoom
para create jobListZoom
para try ... check ... etc ...
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 4819s 80.31m 1.34h 0.06d 0.000 y
# IO & Wait Time: 82s 1.37m 0.02h 0.00d 0.000 y
# Average job time: 114s 1.90m 0.03h 0.00d
# Longest job: 336s 5.60m 0.09h 0.00d
# Submission to last job: 500s 8.33m 0.14h 0.01d
# Then load these .wig files into the same database as above
ssh hgwdev
cd /cluster/data/mm5/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base \
-oldTable mm5 gc5Base wigData5_1K/*.wig
# and symlink these .wib files into /gbdb
ln -s `pwd`/wigData5_1K/*.wib /gbdb/mm5/wib/gc5Base
# BLASTZ HG17 (WORKING - 2004-07-06 - Hiram)
ssh kk
mkdir -p /cluster/data/mm5/bed/blastz.hg17.2004-07-06
cd /cluster/data/mm5/bed
ln -s blastz.hg17.2004-07-06 blastz.hg17
cd blastz.hg17
cat << '_EOF_' > DEF
# mouse vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
# not used
SEQ1_RMSK=/scratch/mus/mm5/rmsk
# not used
SEQ1_FLAG=-rodent
SEQ1_SMSK=/scratch/mus/mm5/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Human
SEQ2_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# RMSK not currently used
SEQ2_RMSK=
# FLAG not currently used
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.hg17
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/mm5/bed/blastz.hg17
# OK to use this script here, it is generic, works anywhere
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# Completed: 46717 of 46717 jobs
# CPU time in finished jobs: 16171136s 269518.93m 4491.98h 187.17d 0.513 y
# IO & Wait Time: 534501s 8908.35m 148.47h 6.19d 0.017 y
# Average job time: 358s 5.96m 0.10h 0.00d
# Longest job: 5263s 87.72m 1.46h 0.06d
# Submission to last job: 30066s 501.10m 8.35h 0.35d
# Second cluster run. Do NOT run this on the big cluster: it brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/mm5/bed/blastz.hg17
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 2186s 36.43m 0.61h 0.03d 0.000 y
# IO & Wait Time: 1804s 30.07m 0.50h 0.02d 0.000 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest job: 82s 1.37m 0.02h 0.00d
# Submission to last job: 3895s 64.92m 1.08h 0.05d
# Third cluster run to convert lav's to axt's
# Does not work on kki since /scratch on the iservers is not the
# same as /scratch on the other clusters.
ssh kk
cd /cluster/data/mm5/bed/blastz.hg17
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 2099s 34.98m 0.58h 0.02d 0.000 y
# IO & Wait Time: 6862s 114.37m 1.91h 0.08d 0.000 y
# Average job time: 208s 3.47m 0.06h 0.00d
# Longest job: 1276s 21.27m 0.35h 0.01d
# Submission to last job: 1291s 21.52m 0.36h 0.01d
# translate sorted axt files into psl
# (run on the file server, from the blastz.hg17 work directory)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.hg17
# -p must be given as a flag: without the dash, mkdir creates a stray
# directory literally named "p" next to pslChrom
mkdir -p pslChrom
set tbl = "blastzHg17"
# one psl per chromosome axt, written as pslChrom/<chrom>_blastzHg17.psl
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# This takes more than an hour. You can shorten this by changing
# that command to a simple echo, put the results into a file,
# split the file into four parts and run the four files as shell
# scripts on kksilo to have four processes running at the same
# time. Load on kksilo gets up to about 20 which is reasonable.
# Load database tables
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.hg17/pslChrom
bash # for tcsh users
for F in chr*_blastzHg17.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${F}
echo "${F} done"
done
# this is a 40 minute job
# exit bash if you are tcsh
# featureBits on blastzMm3 or 4 will not work on hgwdev, runs out of
# memory. But if you reset your ~/.hg.conf to use the read-only
# user and contact the hgwdev host, then use the x86_64 featureBits
# featureBits mm5 blastzHg17
# 1057836001 bases of 2615483787 (40.445%) in intersection
# featureBits mm4 blastzHg16
# 1068995521 bases of 2627444668 (40.686%) in intersection
# CHAIN MM5 BLASTZ (DONE - 2004-07-02 - Hiram)
# The axtChain is best run on the small kluster, or the kk9 kluster
ssh kki
mkdir -p /cluster/data/mm5/bed/blastz.hg17/axtChain/run1
cd /cluster/data/mm5/bed/blastz.hg17/axtChain/run1
mkdir out chain
ls -1S /cluster/data/mm5/bed/blastz.hg17/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# May need -minScore=5000 for all chroms if chr19 won't finish on kolossus
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/mus/mm5/softNib \
/iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 46 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 5354s 89.23m 1.49h 0.06d 0.000 y
# IO & Wait Time: 10543s 175.72m 2.93h 0.12d 0.000 y
# Average job time: 370s 6.16m 0.10h 0.00d
# Longest job: 1694s 28.23m 0.47h 0.02d
# Submission to last job: 1694s 28.23m 0.47h 0.02d
# now on the file server, sort chains
ssh kksilo
cd /cluster/data/mm5/bed/blastz.hg17/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# real 4m53.428s
# user 4m3.040s
# sys 0m29.440s
time chainSplit chain all.chain
# real 4m34.674s
# user 3m38.370s
# sys 0m29.990s
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.hg17/axtChain/chain
bash # for tcsh users
for I in *.chain
do
c=${I/.chain/}
hgLoadChain mm5 ${c}_chainHg17 $I
echo done $c
done
# exit bash if you are tcsh
# This is a 50 minute job
# featureBits mm5 chainHg17
# 2507720521 bases of 2615483787 (95.880%) in intersection
# featureBits mm4 chainHg16
# 2558968088 bases of 2627444668 (97.394%) in intersection
# NET MM5 (WORKING - 2004-07-02 - Hiram)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.hg17/axtChain
mkdir preNet
cd chain
bash # for tcsh users
for I in *.chain
do
echo preNetting $I
/cluster/bin/i386/chainPreNet $I /cluster/data/mm5/chrom.sizes \
/cluster/data/hg17/chrom.sizes ../preNet/$I
done
# exit bash if you are tcsh
# 7 minute job
cd ..
mkdir n1
cd preNet
bash # for tcsh users
for I in *.chain
do
n=${I/.chain/}.net
echo primary netting $I $n
/cluster/bin/i386/chainNet $I -minSpace=1 /cluster/data/mm5/chrom.sizes \
/cluster/data/hg17/chrom.sizes ../n1/$n /dev/null
done
# exit bash if you are tcsh
# 5 minute job
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2546110464, utime 16327 s/100, stime 3546
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.hg17/axtChain
time netClass hNoClass.net mm5 hg17 human.net \
-tNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInHuman \
-qNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInMouse
# real 9m45.271s
# user 6m47.170s
# sys 1m20.440s
# If things look good do
ssh kksilo
cd /cluster/data/mm5/bed/blastz.hg17/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
time netFilter -syn human.net > humanSyn.net
# real 12m3.701s
# user 8m44.180s
# sys 1m1.610s
# Load the nets into database
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.hg17/axtChain
netFilter -minGap=10 human.net | hgLoadNet mm5 netHg17 stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet mm5 syntenyNetHg17 stdin
# check results
# featureBits mm5 netHg17
# 2504056038 bases of 2615483787 (95.740%) in intersection
# featureBits mm4 netHg16
# 2553137690 bases of 2627444668 (97.172%) in intersection
# featureBits mm5 syntenyNetHg17
# 2460442823 bases of 2615483787 (94.072%) in intersection
# featureBits mm4 syntenyNetHg16
# 2495783103 bases of 2627444668 (94.989%) in intersection
# Add entries for net and chain to mouse/mm5 trackDb
# make net
ssh kksilo
cd /cluster/data/mm5/bed/blastz.hg17/axtChain
mkdir humanNet
time netSplit human.net humanNet
# real 4m46.190s
# user 3m27.740s
# sys 0m38.900s
# extract axt's from net, and convert to maf's
ssh kksilo
cd /cluster/data/mm5/bed/blastz.hg17/axtChain
mkdir ../axtNet ../mafNet
cat > makeMaf.csh << '_EOF_'
#!/bin/csh -ef
foreach f (humanNet/chr*.net)
set c = $f:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt humanNet/$c.net chain/$c.chain \
/cluster/data/mm5/nib /cluster/data/hg17/nib stdout | \
axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/mm5/chrom.sizes /cluster/data/hg17/chrom.sizes \
../mafNet/$c.maf -tPrefix=mm5. -qPrefix=hg17.
echo "Complete: $c.net -> axtNet/$c.axt -> mafNet/$c.maf"
end
'_EOF_'
# << for emacs
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
# real 39m53.316s
# user 20m2.530s
# sys 4m40.120s
ssh hgwdev
mkdir /cluster/data/mm5/bed/blastz.hg17/axtBest
cd /cluster/data/mm5/bed/blastz.hg17/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.hg17/axtNet
mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtNet
cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtNet
cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtNet
gzip *.axt
# XXX - running 2004-07-13 14:18
# add README.txt file to dir (use previous assembly's copy as template)
# 32 minute gzip
# Convert those axt files to psl
ssh kksilo
cd /cluster/data/mm5/bed/blastz.hg17
mkdir pslBest
# Convert every axtBest/<chrom>.axt to pslBest/<chrom>_blastzBestHg17.psl.
# The progress message names the same file that axtToPsl writes.
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo -n "processing $c.axt -> ${c}_blastzBestHg17.psl ..."
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestHg17.psl
echo "Done"
end
# Load tables
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.hg17/pslBest
for I in chr*BestHg17.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I}
echo "done ${I}"
done
# check results
# featureBits mm5 blastzBestHg17
# 1020692679 bases of 2615483787 (39.025%) in intersection
# featureBits mm4 blastzBestHg16
# 1030510540 bases of 2627444668 (39.221%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/mm5/axtBest/Hg17
cd /gbdb/mm5/axtBest/Hg17
ln -s /cluster/data/mm5/bed/blastz.hg17/axtNet/chr*.axt .
cd /cluster/data/mm5/bed/blastz.hg17/axtNet
rm -f axtInfoInserts.sql
foreach f (/gbdb/mm5/axtBest/Hg17/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('hg17','Blastz Best in Genome','$chr','$f');" \
>>! axtInfoInserts.sql
end
hgsql mm5 < ~/kent/src/hg/lib/axtInfo.sql
# table axtInfo may already exist, ignore create error.
hgsql mm5 < axtInfoInserts.sql
# MM5 TO CANFAM1 LIFTOVER CHAIN (DONE 1/7/05 Andy)
ssh kolossus
cd /cluster/data/mm5/bed/blastz.canFam1/axtChain
mkdir net
netSplit dog.net net
mkdir over
for file in chain/*.chain; do
chrom=`basename $file .chain`
netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
cat over/$chrom.over >> /cluster/data/mm5/bed/liftOver/mm5ToCanFam1.chain
done
rm -rf over/
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/mm5/liftOver
cp /cluster/data/mm5/bed/liftOver/mm5ToCanFam1.chain .
gzip mm5ToCanFam1.chain
mkdir -p /gbdb/mm5/liftOver
ln -s /cluster/data/mm5/bed/liftOver/mm5ToCanFam1.chain /gbdb/mm5/liftOver/mm5ToCanFam1.over.chain
hgAddLiftOverChain -multiple mm5 canFam1
# ADD CHAIN AND NET TO VSHG17 DOWNLOAD AREAS (DONE Sept. 8th, 2004, Heather)
ssh hgwdev
cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/all.chain.gz \
/usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.chain.gz
cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/human.net.gz \
/usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.net.gz
cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17
md5sum *.gz */*.gz > md5sum.txt
# Update the README.txt
# LIFTOVER CHAIN TO MM6 (DONE 4/20/2005 Andy)
ssh kkstore
cd /cluster/data/mm6
mkdir liftSplits/
cat << _EOF_ > split.csh
#!/bin/tcsh
set liftDir = /cluster/data/mm6/liftSplits
cd /cluster/data/mm6
foreach n (\`ls ?{,?}/*.fa\`)
set d = \$n:h
set c = \$n:t:r
echo \$c
faSplit -lift=\$liftDir/lift/\$c.lft size /cluster/data/mm6/\$d/\$c.fa -oneFile 3000 \$liftDir/split/\$c
end
_EOF_
chmod +x split.csh
./split.csh
# kkstore not mounting /panasas ... weird.
ssh hgwdev
cd /cluster/data/mm6
cp -r liftSplits/ /panasas/store/mm6
ssh kk
cd /cluster/data/mm5
makeLoChain-align mm5 /scratch/mus/mm5/softNib \
mm6 /panasas/store/mm6/liftSplits/split
# Created parasol job in bed/blat.mm6.2005-04-20/run
cd bed/blat.mm6.2005-04-20/run/
para create spec
para push
# para time was complicated by the fact I redid some hippos (mostly chrUn_random
# alignments) on kk9. Basically, it took about a day.
# In the end, the chrUn_random vs. chrUn_random just took wayyyyyy too long.
# Later, if a more rigorous chain file is desired, it can be made after rerunning
# that blat.
# Lifting
ssh kksilo
cd /cluster/data/mm5/bed/blat.mm6
makeLoChain-lift mm5 mm6 /panasas/store/mm6/liftSplits/lift \
> lift.log &
tail -f lift.log
# OK so I remember this problem with makeLoChain-lift: it always stops with chr1.
# I'll just do it manually.
cd raw/
for nib in `ls /cluster/data/mm6/nib`; do
chrom=${nib%.nib}
echo $chrom
liftUp -pslQ ../psl/${chrom}.psl /panasas/store/mm6/liftSplits/lift/${chrom}.lft warn chr*_${chrom}.psl
echo done $chrom
done
# Chain the lifted mm5 -> mm6 blat alignments on the kk9 cluster.
ssh kk9
cd /cluster/data/mm5/bed
# Make the 2005-04-20 blat directory reachable under a 2005-04-22 name as well
# (presumably so makeLoChain-chain, run on a later day, finds it under the
# current date -- confirm against the makeLoChain-chain script).
ln -s blat.mm6.2005-04-20 blat.mm6.2005-04-22
makeLoChain-chain mm5 /cluster/data/mm5/nib mm6 /cluster/data/mm6/nib
# NOTE(review): the directory below is named blat.mm5.2005-02-08, which does
# not match the blat.mm6.2005-04-2x directories used in the steps above --
# confirm the actual chainRun location before re-running.
cd /cluster/data/mm5/bed/blat.mm5.2005-02-08/chainRun
para try
para check
para push
para time
#Completed: 40 of 40 jobs
#CPU time in finished jobs: 27315s 455.25m 7.59h 0.32d 0.001 y
#IO & Wait Time: 67093s 1118.22m 18.64h 0.78d 0.002 y
#Average job time: 2360s 39.34m 0.66h 0.03d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 11656s 194.27m 3.24h 0.13d
#Submission to last job: 31329s 522.15m 8.70h 0.36d
# That looks weird but I think it was because 8 jobs crashed because there was no disk space.
# I freed up some space but then there wasn't much room for the netting stage.
# It crashed twice when I tried it using the script makeLoChain-net after the
# chainMergeSort/split. I figured out that it needed more memory. So I ran it manually on
# kolossus
ssh kolossus
mkdir -p /tmp/andy
cd /tmp/andy
cp -r /cluster/data/mm5/bed/blat.mm6/chainRaw .
rm -rf /cluster/data/mm5/bed/blat.mm6/chainRaw
mkdir chain
chainMergeSort chainRaw/*.chain | chainSplit chain stdin
mkdir net over
cd chain
for c in *.chain; do
echo ${c%.chain};
chainNet $c /cluster/data/mm5/chrom.sizes \
/cluster/data/mm6/chrom.sizes ../net/${c%.chain}.net /dev/null
echo done $c
done
for chain in *; do
c=${chain%.chain}
netChainSubset ../net/$c.net $chain ../over/$c.over
done
cd ../over/
cat * >> ../mm5ToMm6.chain
cd ../
cp mm5ToMm6.chain /cluster/data/mm5/bed/liftOver/
cd /cluster/data/mm5/bed/liftOver
mv mm5ToMm6.chain mm5ToMm6.over.chain
ssh hgwdev
ln -s /cluster/data/mm5/bed/liftOver/mm5ToMm6.over.chain /gbdb/mm5/liftOver/mm5ToMm6.over.chain
hgAddLiftOverChain mm5 mm6 /gbdb/mm5/liftOver/mm5ToMm6.over.chain
cd /usr/local/apache/htdocs/goldenPath/mm5/liftOver
cp /gbdb/mm5/liftOver/mm5ToMm6.over.chain .
gzip mm5ToMm6.over.chain
# MAKING HUMAN SYNTENY (DONE - 2004-07-13 - Hiram)
ssh hgwdev
mkdir /cluster/data/mm5/bed/syntenyHg17
cd /cluster/data/mm5/bed/syntenyHg17
# Copy all the needed scripts from /cluster/data/hg17/bed/syntenyRn3
cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl .
./syntenicBest.pl -db=mm5 -table=blastzBestHg17 > synBest.out 2>&1
./smooth.pl > smooth.out 2>&1
./joinsmallgaps.pl > joingaps.out 2>&1
./fillgap.pl -db=mm5 -table=blastzBestHg17 > fillgap.out 2>&1
./synteny2bed.pl > syn2bed.out 2>&1
# The five commands above
# real 168m43.627s
# user 0m18.680s
# sys 0m4.990s
# Used to load this in syntenyHg17, but that type is misleading to
# the table browser and fails the checkTableCoords check.
# Better to use this ensRatMusHom type:
# Need a new name here for the Hg17 to not conflict with the
# others
sed -e 's/ensPhusionBlast/ensRatMusHg17/g' \
$HOME/kent/src/hg/lib/ensPhusionBlast.sql \
> ensRatMusHg17.sql
hgLoadBed mm5 ensRatMusHg17 ucsc100k.bed -sqlTable=ensRatMusHg17.sql
# featureBits mm5 ensRatMusHg17
# 2366463967 bases of 2615483787 (90.479%) in intersection
# featureBits mm4 syntenyHg16
# 2299774191 bases of 2627444668 (87.529%) in intersection
# MAKING MOUSE AXTTIGHT FROM AXTBEST (DONE - 2004-07-13 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh kksilo
cd /cluster/data/mm5/bed/blastz.hg17/axtNet
mkdir -p ../axtTight
bash # for tcsh users
for I in *.axt
do
echo "axtNet/$I -> ../axtTight/$I"
subsetAxt $I ../axtTight/$I \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
done
# exit bash if you are tcsh
# An 8 minute job
# translate to psl
cd ../axtTight
mkdir ../pslTight
bash # for tcsh users
for I in *.axt
do
C=${I/.axt/}
axtToPsl $I ../S1.len ../S2.len ../pslTight/${C}_blastzTightHg17.psl
echo "Done: $I -> ${C}_blastzTightHg17.psl"
done
# exit bash if you are tcsh
# Load tables into database
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.hg17/pslTight
for I in chr*TightHg17.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I}
echo "done ${I}"
done
# Compare results with previous assembly:
# featureBits mm5 blastzTightHg17
# 168148800 bases of 2615483787 (6.429%) in intersection
# featureBits mm4 blastzTightHg16
# 170163839 bases of 2627444668 (6.476%) in intersection
# copy axt's to download area
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.hg17/axtTight
mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtTight
cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtTight
cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtTight
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)
# 4 minute gzip
#### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-07-13 - Fan)
# PLEASE NOTE THAT THE ENSEMBLXREF3 TABLE IS BUILT USING ENSMART DATA OF MOUSE BUILD 32.
# THIS TABLE IS NEEDED TO SUPPORT SUPERFAMILY TRACK OF THE PROTEOME BROWSER.
# WHEN ENSEMBL FINISHES THEIR MOUSE BUILD 33 RELEASE, WE NEED TO REBUILD THIS
# TABLE.
# Get the ensembl gene/protein cross-reference data from
# http://www.ensembl.org/Multi/martview?species=Mus_musculus
# Follow this sequence through the pages:
# Page 1) Make sure that the Mus musculus choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Feature" box, select Ensembl gene, transcript, and peptide IDs,
# SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC
# Page 4) Choose "Text, tab separated". choose gzip compression. hit export.
# Save as ensXref
sed ensXref.tsv -e 's/\./\t/g' > ensemblXref3.tab
hgsql mm5 -e "drop table ensemblXref3"
hgsql mm5 < ~/src/hg/lib/ensemblXref3.sql
hgsql mm5 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines'
# CPGISLANDS (DONE - 2004-07-13 - Fan)
ssh hgwdev
mkdir -p /cluster/data/mm5/bed/cpgIsland
cd /cluster/data/mm5/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
# gcc readseq.c cpg_lh.c -o cpglh.exe
mv cpglh.exe /cluster/data/mm5/bed/cpgIsland/
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
ssh kksilo
cd /cluster/data/mm5/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
set fout=$f:t:r:r.cpg
echo running cpglh on $f to $fout
./cpglh.exe $f > $fout
end
# the warnings:
# Bad char 0x52 = 'R' at line 117472, base 5873535, sequence chr14
# Bad char 0x53 = 'S' at line 120651, base 6032462, sequence chr14
# Bad char 0x53 = 'S' at line 120652, base 6032546, sequence chr14
# real 21m47.823s
# user 18m30.810s
# sys 1m13.420s
# Transform cpglh output to bed +
# filter.awk rewrites every input line into ten tab-separated columns:
#   - field 2 is decremented by one, then width = $3 - $2 is computed from
#     the decremented value (presumably a 1-based to 0-based start
#     conversion -- confirm against the cpglh output format)
#   - column 4 is "$5 $6" joined by a single space
#   - column 5 is width and column 6 repeats $6
#   - column 7 is width*$7*0.01 printed with no decimals
#   - column 8 is 100*2*$6/width printed with one decimal
#   - columns 9 and 10 are $7 and $9
# NOTE(review): the column order presumably matches cpgIslandExt.sql, which
# is used when the bed file is loaded -- confirm.
cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
# Run the filter over every per-chromosome .cpg file into a single bed file.
awk -f filter.awk chr*.cpg > cpgIsland.bed
ssh hgwdev
cd /cluster/data/mm5/bed/cpgIsland
hgLoadBed mm5 cpgIslandExt -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 16238 elements of size 10
# Sorted
# Saving bed.tab
# Loading mm5
# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 2004-07-14 Fan)
ssh kksilo
cd /cluster/data/mm5
# Build the .zip files
cp /cluster/data/rn3/jkStuff/zipAll.sh jkStuff
# edit this zipAll.sh to produce output to /cluster/data/mm5/bigZips
jkStuff/zipAll.sh > zipAll.log
# bash: ./jkStuff/zipAll.sh > zipAll.log 2>&1 &
tail -f zipAll.log
mkdir zip
mv *.zip zip
cd zip
# Look at zipAll.log to make sure all file lists look reasonable.
# Check zip file integrity:
foreach f (*.zip)
unzip -t $f > $f.test
tail -1 $f.test
end
wc -l *.zip.test
# 46 chromAgp.zip.test
# 45 chromFa.zip.test
# 45 chromFaMasked.zip.test
# 45 chromOut.zip.test
# 45 chromTrf.zip.test
# 641 contigAgp.zip.test
# 641 contigFa.zip.test
# 641 contigFaMasked.zip.test
# 641 contigOut.zip.test
# 641 contigTrf.zip.test
#3431 total
ssh hgwdev
cd /cluster/data/mm5/jkStuff
# create generic copy program
cat << '_EOF_' > cpToWeb.sh
#!/bin/sh
# cpToWeb.sh - copy download files into the goldenPath web area.
#   $1 - goldenPath download directory name (e.g. mm5); files go under
#        /usr/local/apache/htdocs/goldenPath/$1
# Run from a directory that holds the *.zip bundles and whose parent holds
# the per-chromosome directories (../?/*.fa and ../??/*.fa).
# Prints a usage message and exits 255 unless exactly one argument is given.
if [ $# -ne 1 ]; then
echo "usage: cpToWeb.sh <goldenPath download directory>"
printf '\texample: cpToWeb.sh mm5\n'
exit 255
fi
GP="/usr/local/apache/htdocs/goldenPath/$1"
mkdir -p "${GP}"
mkdir -p "${GP}/chromosomes"
# One zip per chromosome fasta; -j stores the file without its directory path.
for f in ../?/*.fa ../??/*.fa
do
# sh leaves an unmatched pattern as literal text; skip it instead of zipping it
[ -e "${f}" ] || continue
BN=`basename "${f}"`
zip -j "${GP}/chromosomes/${BN}.zip" "${f}"
echo "zipped: ${BN}"
done
mkdir -p "${GP}/bigZips"
# Copy the prebuilt zip bundles, preserving timestamps and modes.
for Z in *.zip
do
[ -e "${Z}" ] || continue
cp -p "${Z}" "${GP}/bigZips"
echo "copied: ${Z}"
done
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x cpToWeb.sh
cd /cluster/data/mm5/zip
../jkStuff/cpToWeb.sh mm5
cd /usr/local/apache/htdocs/goldenPath/mm5
# Take a look at bigZips/* and chromosomes/*, update their README.txt's
# Make the upstream sequence files.
# NOTE: must be redone due to bad gap track
cd bigZips
featureBits mm5 refGene:upstream:1000 -fa=upstream1000.fa
zip upstream1000.zip upstream1000.fa
rm upstream1000.fa
featureBits mm5 refGene:upstream:2000 -fa=upstream2000.fa
zip upstream2000.zip upstream2000.fa
rm upstream2000.fa
featureBits mm5 refGene:upstream:5000 -fa=upstream5000.fa
zip upstream5000.zip upstream5000.fa
rm upstream5000.fa
# mrna zips -- auto dump process takes care of this
# MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN (DONE 7/15/04 angie)
# In an email 2/13/04, Arian said we could treat all human repeats as
# lineage-specific for human-chicken blastz. Do the same for mouse.
# Scripts expect *.out.spec filenames, so set that up:
ssh kkr1u00
cd /cluster/data/mm5
mkdir /iscratch/i/mus/mm5/linSpecRep.notInChicken
foreach f (/iscratch/i/mus/mm5/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/mus/mm5/linSpecRep.notInChicken/$f:t:r:r.out.spec
end
iSync
# Use these the next time we run mouse-chicken blastz.
# BLASTZ CHICKEN (GALGAL2) (DONE 7/19/04 angie)
ssh kk
mkdir /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
ln -s blastz.galGal2.2004-07-15 /cluster/data/mm5/bed/blastz.galGal2
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
# Use human-chicken params: set L=10000 (higher threshold on blastz's
# outer loop) and abridge repeats.
cat << '_EOF_' > DEF
# mouse vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInChicken
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken
SEQ2_DIR=/iscratch/i/galGal2/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.galGal2.2004-07-15
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# first cluster run: raw blastz alignments
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, check, push, check, ....
#Completed: 51491 of 51491 jobs
#Average job time: 357s 5.95m 0.10h 0.00d
#Longest job: 1015s 16.92m 0.28h 0.01d
#Submission to last job: 89841s 1497.35m 24.96h 1.04d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 341 of 341 jobs
#Average job time: 11s 0.18m 0.00h 0.00d
#Longest job: 55s 0.92m 0.02h 0.00d
#Submission to last job: 245s 4.08m 0.07h 0.00d
# third run: lav -> axt
# NOTE: use axtRescore here because we used a non-default BLASTZ_Q matrix
# and abridged repeats (Penn State's restore_rpts program rescores with
# default matrix, oops).
ssh kki
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
set path = (/cluster/bin/x86_64 $path)
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin \
/iscratch/i/mus/mm5/softNib /iscratch/i/galGal2/nib stdout \
| axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q stdin stdout \
| axtSort stdin ../../axtChrom/$chr.axt
axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 43 of 43 jobs
#Average job time: 38s 0.63m 0.01h 0.00d
#Longest job: 160s 2.67m 0.04h 0.00d
#Submission to last job: 233s 3.88m 0.06h 0.00d
# CHAIN CHICKEN BLASTZ (DONE 7/19/04 angie)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=/cluster/data/blastz/chickenHumanTuned.gap \
-minScore=5000 $1 \
/iscratch/i/mus/mm5/softNib \
/iscratch/i/galGal2/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 43 of 43 jobs
#Average job time: 60s 1.00m 0.02h 0.00d
#Longest job: 355s 5.92m 0.10h 0.00d
#Submission to last job: 355s 5.92m 0.10h 0.00d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=5000 /tmp/score.$f:t:r
echo ""
end
# Load chains into database
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain mm5 ${c}_chainGalGal2 $i
end
featureBits mm5 chainGalGal2Link
#78951466 bases of 2615483787 (3.019%) in intersection
featureBits hg17 chainGalGal2Link
#103882699 bases of 2866216770 (3.624%) in intersection
# NET CHICKEN BLASTZ (DONE 7/19/04 angie)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
netClass -noAr noClass.net mm5 galGal2 chicken.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn chicken.net > chickenSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
netFilter -minGap=10 chicken.net | hgLoadNet mm5 netGalGal2 stdin
netFilter -minGap=10 chickenSyn.net | hgLoadNet mm5 syntenyNetGalGal2 stdin
# Add entries for chainGalGal2, netGalGal2, syntenyNetGalGal2 to
# mouse/mm5 trackDb
# GENERATE GALGAL2 MAF FOR MULTIZ FROM NET (DONE 7/19/04 angie)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
netSplit chicken.net net
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
mkdir axtNet
foreach f (axtChain/net/*)
set chr = $f:t:r
netToAxt $f axtChain/chain/$chr.chain /cluster/data/mm5/mixedNib \
/cluster/data/galGal2/nib stdout \
| axtSort stdin axtNet/$chr.axt
end
mkdir mafNet
foreach f (axtNet/chr*.axt)
set maf = mafNet/$f:t:r.maf
axtToMaf $f \
/cluster/data/mm5/chrom.sizes /cluster/data/galGal2/chrom.sizes \
$maf -tPrefix=mm5. -qPrefix=galGal2.
end
# XENOPUS BLASTZ/CHAIN/NET (DONE 9/24/04 jk)
# see makeXenTro1.doc and search for zb.mm5
# The results of this are also symlinked under mm5/bed
# MAKE VSGALGAL2 DOWNLOADABLES (DONE 7/19/04 angie)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
gzip axtNet/*.axt
cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
ln all.chain chicken.chain
zip /cluster/data/mm5/zip/chicken.chain.zip chicken.chain
rm chicken.chain
zip /cluster/data/mm5/zip/chicken.net.zip chicken.net
zip /cluster/data/mm5/zip/chickenSyn.net.zip chickenSyn.net
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsGalGal2
cd /usr/local/apache/htdocs/goldenPath/mm5/vsGalGal2
mv /cluster/data/mm5/zip/chicken*.zip .
cp -pR /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtNet .
md5sum *.zip axtNet/* > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 7/15/04 angie)
ssh kkr1u00
cd /cluster/bluearc/scratch/mus/mm5/rmsk
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Even though we already have the mouse-human linSpecReps,
# extractLinSpecReps requires two columns of DateRepsinRMoutput.pl
# additions. So add human, then ignore it.
# Dog in extra column 1, Human in extra column 2
foreach outfl ( *.out )
echo "$outfl"
/cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \
${outfl} -query mouse -comp dog -comp human
end
# Now extract dog (extra column 1), ignore human.
cd /iscratch/i/mus/mm5
mkdir linSpecRep.notInDog
foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/*.out_dog_hum)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInDog/$base.out.spec
end
# Clean up.
rm /cluster/bluearc/scratch/mus/mm5/rmsk/*.out_dog_hum
iSync
# BLASTZ DOG (CANFAM1) (DONE 7/16/04 angie)
ssh kk
mkdir /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
ln -s blastz.canFam1.2004-07-15 /cluster/data/mm5/bed/blastz.canFam1
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
# Use default (Human-Mouse) settings for starters.
cat << '_EOF_' > DEF
# mouse vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog
SEQ2_DIR=/scratch/hg/canFam1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.canFam1.2004-07-15
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# first cluster run: raw blastz alignments
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, check, push, check, ....
# cluster was mobbed...
#Completed: 93775 of 93775 jobs
#Average job time: 187s 3.11m 0.05h 0.00d
#Longest job: 3907s 65.12m 1.09h 0.05d
#Submission to last job: 76763s 1279.38m 21.32h 0.89d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 341 of 341 jobs
#Average job time: 98s 1.63m 0.03h 0.00d
#Longest job: 281s 4.68m 0.08h 0.00d
#Submission to last job: 2102s 35.03m 0.58h 0.02d
# third run: lav -> axt
# (if non-default BLASTZ_Q is used in the future, put axtRescore in
# the pipe after lavToAxt)
ssh kki
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
/iscratch/i/mus/mm5/softNib /iscratch/i/canFam1/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 43 of 43 jobs
#Average job time: 671s 11.18m 0.19h 0.01d
#Longest job: 2398s 39.97m 0.67h 0.03d
#Submission to last job: 2417s 40.28m 0.67h 0.03d
# CHAIN DOG BLASTZ (DONE 7/16/04 angie)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
mkdir -p axtChain/run1
cd axtChain/run1
# out/ and chain/ receive the per-chromosome job outputs named in the gsub
# template (out/$(root1).out and chain/$(root1).chain).
mkdir out chain
# List the axt inputs largest first (ls -S sorts by size).
ls -1S /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
/iscratch/i/mus/mm5/softNib \
/iscratch/i/canFam1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 43 of 43 jobs
#Average job time: 537s 8.96m 0.15h 0.01d
#Longest job: 2071s 34.52m 0.58h 0.02d
#Submission to last job: 2071s 34.52m 0.58h 0.02d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=5000 /tmp/score.$f:t:r
echo ""
end
# Lots of chaff with scores in the 3000's. Many very-high-scoring
# chains. So filter the chain down somewhat...
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
rm chain/*
chainSplit chain all.chain
gzip all.chain.unfiltered
# Load chains into database
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain mm5 ${c}_chainCanFam1 $i
end
# mouse-dog gets significantly less coverage than human-dog:
featureBits mm5 -chrom=chr1 chainCanFam1Link
#63386139 bases of 185739816 (34.126%) in intersection
featureBits hg17 -chrom=chr1 chainCanFam1Link
#123999291 bases of 222827847 (55.648%) in intersection
# mouse-dog isn't a whole lot less than mouse-human though:
featureBits mm5 -chrom=chr1 chainHg17Link
#75492250 bases of 185739816 (40.644%) in intersection
# NET DOG BLASTZ (DONE 7/16/04 angie)
ssh kolossus
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
netClass -noAr noClass.net mm5 canFam1 dog.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with netFilter -syn:
netFilter -syn dog.net > dogSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
netFilter -minGap=10 dog.net | hgLoadNet mm5 netCanFam1 stdin
netFilter -minGap=10 dogSyn.net | hgLoadNet mm5 syntenyNetCanFam1 stdin
# Add entries for chainCanFam1, netCanFam1 to mouse/mm5 trackDb
# MAKE VSCANFAM1 DOWNLOADABLES (DONE 7/19/04 angie)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
gzip axtNet/chr*.axt
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
ln all.chain dog.chain
zip /cluster/data/mm5/zip/dog.chain.zip dog.chain
rm dog.chain
zip /cluster/data/mm5/zip/dog.net.zip dog.net
zip /cluster/data/mm5/zip/dogSyn.net.zip dogSyn.net
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsCanFam1
cd /usr/local/apache/htdocs/goldenPath/mm5/vsCanFam1
mv /cluster/data/mm5/zip/dog*.zip .
cp -pR /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtNet .
md5sum *.zip axtNet/* > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# GENERATE CANFAM1 MAF FOR MULTIZ FROM NET (DONE 7/19/04 angie)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
netSplit dog.net net
cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
mkdir axtNet
foreach f (axtChain/net/*)
set chr = $f:t:r
netToAxt $f axtChain/chain/$chr.chain /cluster/data/mm5/nib \
/cluster/data/canFam1/nib stdout \
| axtSort stdin axtNet/$chr.axt
end
mkdir mafNet
foreach f (axtNet/chr*.axt)
set maf = mafNet/$f:t:r.maf
axtToMaf $f \
/cluster/data/mm5/chrom.sizes /cluster/data/canFam1/chrom.sizes \
$maf -tPrefix=mm5. -qPrefix=canFam1.
end
### MAKE THE affyU74 TRACK - needed for the Gene Sorter
# (DONE - 2004-07-16 - Fan)
# MAKE THE affyU74 TRACK using Affy consensus sequences instead of
# target sequences. Recalculate alignments and load data
# ----------------------------------
# Load up semi-local disk with consensus sequences for Affy mouse U74 chips.
ssh kkr1u00
mkdir -p /iscratch/i/affy
# This /projects filesystem is not available on kkr1u00
# but it is on kk
ssh kk
cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy
ssh kkr1u00
iSync
# Run cluster job to do alignments
ssh kk
mkdir /cluster/data/mm5/bed/affyU74.2004-07-16
cd /cluster/data/mm5/bed/affyU74.2004-07-16
mkdir run
cd run
mkdir psl
echo /scratch/mus/mm5/maskedContigs/*.fa | wordLine stdin > genome.lst
ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst affy.lst gsub jobList
para create jobList
para try
# do usual para check/para push etc. until the job is done.
# Completed: 1917 of 1917 jobs
# CPU time in finished jobs: 14240s 237.34m 3.96h 0.16d 0.000 y
# IO & Wait Time: 7946s 132.43m 2.21h 0.09d 0.000 y
# Average job time: 12s 0.19m 0.00h 0.00d
# Longest job: 40s 0.67m 0.01h 0.00d
# Submission to last job: 307s 5.12m 0.09h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU74.psl.
ssh kksilo
cd /cluster/data/mm5/bed/affyU74.2004-07-16/run
pslSort dirs raw.psl tmp psl
# change filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least minAli = 0.95.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
# Processed 44630 alignments
liftUp ../all_affyU74.psl ../../../jkStuff/liftAll.lft warn contig.psl
# Sort by chromosome and load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/affyU74.2004-07-16
pslSortAcc nohead chrom temp all_affyU74.psl
cat chrom/*.psl > affyU74.psl
# shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
# and reload data into table
hgLoadPsl mm5 affyU74.psl
rm -fr chrom temp run
## MAKE THE affyGnfU74 TRACKs (DONE - 2004-07-18 - Fan)
# Make bed files and load consensus sequences for Affy U74 chip set.
# Fix broken symlinks to microarray data after directory structure changed
# (DONE, 2005-05-03, hartera)
# ----------------------------------
#This needs to be done after affyU74 is already made.
ssh hgwdev
mkdir -p /cluster/data/mm5/bed/affyGnf.2004-07-16
cd /cluster/data/mm5/bed/affyGnf.2004-07-16
# may need to build this command in src/hg/affyGnf
affyPslAndAtlasToBed ../affyU74.2004-07-16/affyU74.psl \
/projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \
affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2
affyPslAndAtlasToBed ../affyU74.2004-07-16/affyU74.psl \
/projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \
affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2
affyPslAndAtlasToBed ../affyU74.2004-07-16/affyU74.psl \
/projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \
affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2
# edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
# and reload data into table
hgLoadBed mm5 affyGnfU74A affyGnfU74A.bed
hgLoadBed mm5 affyGnfU74B affyGnfU74B.bed
hgLoadBed mm5 affyGnfU74C affyGnfU74C.bed
# Add in sequence data for U74 tracks.
# Copy consensus sequence to /gbdb if it isn't already
# [THE SYM LINKS WERE ALREADY DONE.]
mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
# fix broken symlinks after directory structure changed
# /projects/compbiodata ----> /projects/compbio/data
rm U74*
# make correct symlinks (hartera, 2005-05-03)
ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa .
ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa .
ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa .
# used perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
# ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4.
# reload sequences with prefix removed so acc matches name used in
# other dependent tables
hgLoadSeq -abbr=U74Av2: mm5 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa
hgLoadSeq -abbr=U74Bv2: mm5 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa
hgLoadSeq -abbr=U74Cv2: mm5 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa
### GNF ATLAS 2 [DONE Fan 7/18/2004]
# Align probes from GNF1M chip.
ssh kk
cd /cluster/data/mm5/bed
mkdir -p geneAtlas2/run/psl
cd geneAtlas2/run
mkdir -p /cluster/bluearc/geneAtlas2
cp /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /cluster/bluearc/geneAtlas2
ls -1 /scratch/mus/mm5/maskedContigs/ > genome.lst
ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst
echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc /scratch/mus/mm5/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub
gensub2 genome.lst mrna.lst gsub spec
para create spec
para try
para check
para push
para time
# Completed: 639 of 639 jobs
# CPU time in finished jobs: 58174s 969.57m 16.16h 0.67d 0.002 y
# IO & Wait Time: 4833s 80.55m 1.34h 0.06d 0.000 y
# Average job time: 99s 1.64m 0.03h 0.00d
# Longest job: 189s 3.15m 0.05h 0.00d
# Submission to last job: 1749s 29.15m 0.49h 0.02d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyGnf1m.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp ../affyGnf1m.psl ../../../jkStuff/liftAll.lft warn contig.psl
rm -r contig.psl raw.psl psl
# Load probes and alignments from GNF1M into database.
ssh hgwdev
cd /cluster/data/mm5/bed/geneAtlas2
ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes
hgLoadPsl mm5 affyGnf1m.psl
hgLoadSeq mm5 /gbdb/hgFixed/affyProbes/gnf1m.fa
# Load up track
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \
affyGnf1m.psl
# Note that the unmapped 5000 records are from all-N sequences.
hgLoadBed mm5 gnfAtlas2 gnfAtlas2.bed
# MOUSE AFFYMETRIX MOE430 TRACK (DONE, 2004-07-19, Fan)
mkdir -p /projects/compbio/data/microarray/affyMouse
# Download MOE430A and MOE430B consensus sequences from Affymetrix web site
# http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430
unzip MOE430*_consensus.zip
# check for duplicate probes: there are none, all have unique names
# check for duplicate probes: 100 from 136745_at to 1367551_a_at
# remove "consensus:" and ";" from FASTA headers to shorten probeset
# names for database
sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa
sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa
cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
/cluster/bluearc/affy/
# THE ABOVE WAS ALREADY DONE BY RACHEL 4/16/04.
# Set up cluster job to align MOE430 consensus sequences to mm5
ssh kkr1u00
cd /cluster/data/mm5/bed
mkdir -p affyMOE430
cd affyMOE430
mkdir -p /iscratch/i/affy
cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy
iSync
ssh kk
cd /cluster/data/mm5/bed/affyMOE430
ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst
ls -1 /scratch/mus/mm5/maskedContigs/ > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc /scratch/mus/mm5/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
# Actually do the job with usual para try/check/push/time etc.
# para time
# Completed: 639 of 639 jobs
# CPU time in finished jobs: 24369s 406.14m 6.77h 0.28d 0.001 y
# IO & Wait Time: 2263s 37.72m 0.63h 0.03d 0.000 y
# Average job time: 42s 0.69m 0.01h 0.00d
# Longest job: 63s 1.05m 0.02h 0.00d
# Submission to last job: 671s 11.18m 0.19h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyMOE430.psl
pslSort dirs raw.psl tmp psl
# only use alignments that cover 30% of sequence and have at least
# 95% identity in aligned region.
# low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl \
contig.psl /dev/null
liftUp affyMOE430.psl ../../jkStuff/liftAll.lft warn contig.psl
# Load alignments and sequences into database
ssh hgwdev
cd /cluster/data/mm5/bed/affyMOE430
# shorten names in psl file
sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
mv affyMOE430.psl.bak affyMOE430.psl
# load track into database
hgLoadPsl mm5 affyMOE430.psl
# 1 warning on loading: Blat error so that 1449824_at has a
# negative entry (-195) in the qBaseInsert field.
# Loading into the database forces this to 0.
# Add consensus sequences for MOE430
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
/gbdb/hgFixed/affyProbes
hgLoadSeq -abbr=MOE430 mm5 /gbdb/hgFixed/affyProbes/MOE430_all.fa
# Clean up
rm batch.bak contig.psl raw.psl
# BELOW TWO THINGS WERE DONE BY RACHEL ALREADY FOR MM4
# add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/
# add affyMOE430.html file and then do make alpha to add to trackDb table
######## MAKING GENE SORTER TABLES ####### (STARTED - 2004-07-15 - Hiram)
# These are instructions for building the
# Gene Sorter. Don't start these until
# there is a knownGene track and the affy tracks.
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
cd /tmp
hgClusterGenes mm5 knownGene knownIsoforms knownCanonical
# You may need to build this binary in src/hg/near/hgClusterGenes
# Got 24603 clusters, from 41208 genes in 43 chromosomes
# featureBits mm5 knownCanonical
# 853516995 bases of 2615483787 (32.633%) in intersection
# featureBits mm4 knownCanonical
# 840021165 bases of 2627444668 (31.971%) in intersection
# featureBits mm3 knownCanonical
# 825943052 bases of 2505900260 (32.960%) in intersection
# ! ! ! Can not do featureBits on knownIsoforms
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
ssh hgwdev
mkdir -p /cluster/data/mm5/bed/geneSorter/blastp
cd /cluster/data/mm5/bed/geneSorter/blastp
pepPredToFa mm5 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/cluster/bluearc/blast229/formatdb -i known.faa -t known -n known
# Copy over database to bluearc scratch
mkdir /cluster/bluearc/scratch/mus/mm5/blastp
cp -p /cluster/data/mm5/bed/geneSorter/blastp/known.* \
/cluster/bluearc/scratch/mus/mm5/blastp
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/mm5/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# Make parasol run directory
ssh kk
mkdir /cluster/data/mm5/bed/geneSorter/blastp/self
cd /cluster/data/mm5/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/scratch/mus/mm5/blastp/known \
-i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# 'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 120685s 2011.42m 33.52h 1.40d 0.004 y
# IO & Wait Time: 22722s 378.69m 6.31h 0.26d 0.001 y
# Average job time: 19s 0.31m 0.01h 0.00d
# Longest job: 147s 2.45m 0.04h 0.00d
# Submission to last job: 705s 11.75m 0.20h 0.01d
# Load into database. This takes about an hour.
ssh hgwdev
cd /cluster/data/mm5/bed/geneSorter/blastp/self/run/out
hgLoadBlastTab mm5 knownBlastTab *.tab
# Scanning through 7739 files
# Loading database with 8017562 rows
# real 17m9.104s
# user 3m8.980s
# sys 0m28.800s
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes an hour.)
# DONE (04-07-18 Fan)
hgMapToGene mm5 affyGnf1m knownGene knownToGnf1m
hgExpDistance mm5 hgFixed.gnfMouseAtlas2MedianRatio \
hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnf1m
# Create table that maps between known genes and RefSeq
hgMapToGene mm5 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene mm5 affyU74 knownGene knownToU74
hgMapToGene mm5 affyMOE430 knownGene knownToMOE430
hgMapToGene mm5 affyMOE430 -prefix=A: knownGene knownToMOE430A
# Format and load Rinn et al sex expression data
mkdir /cluster/data/mm5/bed/rinnSex
cd !$
hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \
../affyMOE430/affyMOE430.psl
hgLoadBed mm5 rinnSex rinnSex.bed
# Format and load the GNF data
mkdir /cluster/data/mm5/bed/affyGnf95
cd /cluster/data/mm5/bed/affyGnf95
affyPslAndAtlasToBed -newType ../affyU95.psl \
/projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \
affyGnfU95.tab affyGnfU95Exps.tab -shortOut
# this .sql load was in preceding instructions, but this .sql file
# appears to not exist and it doesn't seem to be needed anyway.
# Everything below this seems to create tables OK.
# hgsql mm5 < ~/kent/src/hg/affyGnf/affyGnfU95.sql
# Create table that gives distance in expression space between
# GNF genes. These commands take about 15 minutes each
# The affyGnfU74?Exps arguments appear to be unused in hgExpDistance
hgExpDistance mm5 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \
-lookup=knownToU74
# Got 13593 unique elements in affyGnfU74A
hgExpDistance mm5 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \
-lookup=knownToU74
# Got 8512 unique elements in affyGnfU74B
hgExpDistance mm5 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \
-lookup=knownToU74
# Got 2318 unique elements in affyGnfU74C
# C.ELEGANS BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to iscratch/i
# if it doesn't exist already:
ssh eieio
mkdir /cluster/data/ce2/bed/blastp
cd /cluster/data/ce2/bed/blastp
# Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
# to find out the latest version. Then use that in place of 128 below.
wget -O wormPep128.faa \
ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep128/wormpep128
formatdb -i wormPep128.faa -t wormPep128 -n wormPep128
ssh kkr1u00
if (-e /iscratch/i/ce2/blastp) then
rm -r /iscratch/i/ce2/blastp
endif
mkdir -p /iscratch/i/ce2/blastp
cp /cluster/data/ce2/bed/blastp/wormPep128.p?? /iscratch/i/ce2/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/ce2/run/out
cd /cluster/data/mm5/bed/blastp/ce2/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep128 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/store6/mm5/bed/geneSorter/blastp/split >split.lst
#ls -1S ../../split/*.fa > split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 54871s 914.51m 15.24h 0.64d 0.002 y
# IO & Wait Time: 26157s 435.95m 7.27h 0.30d 0.001 y
# Average job time: 10s 0.17m 0.00h 0.00d
# Longest job: 41s 0.68m 0.01h 0.00d
# Submission to last job: 210s 3.50m 0.06h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/ce2/run/out
hgLoadBlastTab mm5 ceBlastTab -maxPer=1 *.tab
# HUMAN BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
# Make human ortholog column using blastp on human known genes.
# First make human protein database and copy it to iscratch/i
# if it doesn't exist already:
mkdir /cluster/data/hg17/bed/blastp
cd /cluster/data/hg17/bed/blastp
pepPredToFa hg17 knownGenePep known.faa
formatdb -i known.faa -t known -n known
ssh kkr1u00
if (-e /iscratch/i/hg17/blastp) then
rm -r /iscratch/i/hg17/blastp
endif
mkdir -p /iscratch/i/hg17/blastp
cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/hg17/run/out
cd /cluster/data/mm5/bed/blastp/hg17/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/hg17/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/store6/mm5/bed/geneSorter/blastp/split >split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 125830s 2097.17m 34.95h 1.46d 0.004 y
# IO & Wait Time: 22740s 379.00m 6.32h 0.26d 0.001 y
# Average job time: 19s 0.32m 0.01h 0.00d
# Longest job: 137s 2.28m 0.04h 0.00d
# Submission to last job: 301s 5.02m 0.08h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/hg17/run/out
hgLoadBlastTab mm5 hgBlastTab -maxPer=1 *.tab
# ZEBRAFISH BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to iscratch/i
# if it doesn't exist already:
ssh kkstore
mkdir /cluster/data/danRer1/bed/blastp
cd /cluster/data/danRer1/bed/blastp
wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH3.may.pep.fa.gz
zcat Dan*.pep.fa.gz > ensembl.faa
formatdb -i ensembl.faa -t ensembl -n ensembl
ssh kkr1u00
if (-e /iscratch/i/danRer1/blastp) then
rm -r /iscratch/i/danRer1/blastp
endif
mkdir -p /iscratch/i/danRer1/blastp
cp /cluster/data/danRer1/bed/blastp/ensembl.p?? /iscratch/i/danRer1/blastp
iSync
# THE ABOVE IS ALREADY DONE BY ANGIE
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/danRer1/run/out
cd /cluster/data/mm5/bed/blastp/danRer1/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer1/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/store6/mm5/bed/geneSorter/blastp/split >split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 96773s 1612.89m 26.88h 1.12d 0.003 y
# IO & Wait Time: 29356s 489.26m 8.15h 0.34d 0.001 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest job: 73s 1.22m 0.02h 0.00d
# Submission to last job: 282s 4.70m 0.08h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/danRer1/run/out
hgLoadBlastTab mm5 drBlastTab -maxPer=1 *.tab
# YEAST BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on
# RefSeq. First make protein database and copy it to iscratch/i
# if it doesn't exist already:
mkdir /cluster/data/sacCer1/bed/blastp
cd /cluster/data/sacCer1/bed/blastp
wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
zcat orf_trans.fasta.gz > sgdPep.faa
formatdb -i sgdPep.faa -t sgdPep -n sgdPep
#ABOVE WAS ALREADY DONE BY JIM
ssh kkr1u00
# Note: sacCer1 is a name conflict with SARS coronavirus... oh well,
# fortunately we won't be looking for homologs there. :)
if (-e /iscratch/i/sacCer1/blastp) then
rm -r /iscratch/i/sacCer1/blastp
endif
mkdir -p /iscratch/i/sacCer1/blastp
cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/sacCer1/run/out
cd /cluster/data/mm5/bed/blastp/sacCer1/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/sacCer1/blastp/sgdPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/store6/mm5/bed/geneSorter/blastp/split >split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 16348s 272.46m 4.54h 0.19d 0.001 y
# IO & Wait Time: 23063s 384.39m 6.41h 0.27d 0.001 y
# Average job time: 5s 0.08m 0.00h 0.00d
# Longest job: 14s 0.23m 0.00h 0.00d
# Submission to last job: 203s 3.38m 0.06h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/sacCer1/run/out
hgLoadBlastTab mm5 scBlastTab -maxPer=1 *.tab
# DM1 BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make protein database and copy it to iscratch/i
# if it doesn't exist already:
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dm1/blastp should have data
ssh kkr1u00
if (-e /iscratch/i/dm1/blastp) then
rm -r /iscratch/i/dm1/blastp
endif
mkdir -p /iscratch/i/dm1/blastp
cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp
iSync
# THE ABOVE IS ALREADY DONE BY ANGIE
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/dm1/run/out
cd /cluster/data/mm5/bed/blastp/dm1/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/dm1/blastp/bdgp -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/store6/mm5/bed/geneSorter/blastp/split >split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 64033s 1067.22m 17.79h 0.74d 0.002 y
# IO & Wait Time: 20868s 347.79m 5.80h 0.24d 0.001 y
# Average job time: 11s 0.18m 0.00h 0.00d
# Longest job: 45s 0.75m 0.01h 0.00d
# Submission to last job: 351s 5.85m 0.10h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/dm1/run/out
hgLoadBlastTab mm5 dmBlastTab -maxPer=1 *.tab
# Create table that maps between known genes and LocusLink (DONE 7/20/04 Fan)
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm5 \
> refToLl.txt
hgMapToGene mm5 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# row count is 30303
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt mm5 knownGene name proteinID Pfam knownToPfam
# row count is 29069
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene mm5 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# Create table that maps between known genes and genePix database (DONE 3/15/05 JK)
knownToGenePix mm5
# ENABLE GENE SORTER FOR mm5 IN HGCENTRALTEST (DONE 7/20/04 Fan)
echo "update dbDb set hgNearOk = 1 where name = 'mm5';" \
| hgsql -h genome-testdb hgcentraltest
# RAT BLASTP FOR GENE SORTER (DONE 4/20/05 Fan)
# Make RAT ortholog column using blastp on RAT known genes.
# First make RAT protein database and copy it to iscratch/i
# if it doesn't exist already:
mkdir /cluster/data/rn3/bed/blastp
cd /cluster/data/rn3/bed/blastp
pepPredToFa rn3 knownGenePep known.faa
formatdb -i known.faa -t known -n known
ssh kkr1u00
if (-e /iscratch/i/rn3/blastp) then
rm -r /iscratch/i/rn3/blastp
endif
mkdir -p /iscratch/i/rn3/blastp
cp /cluster/data/rn3/bed/blastp/known.p?? /iscratch/i/rn3/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/rn3/run/out
cd /cluster/data/mm5/bed/blastp/rn3/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/rn3/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/mm5/bed/geneSorter/blastp/split >split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 24369s 406.14m 6.77h 0.28d 0.001 y
# IO & Wait Time: 21867s 364.46m 6.07h 0.25d 0.001 y
# Average job time: 6s 0.10m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 25s 0.42m 0.01h 0.00d
# Submission to last job: 276s 4.60m 0.08h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/rn3/run/out
hgLoadBlastTab mm5 rnBlastTab -maxPer=1 *.tab
# END OF GENE SORTER STUFF
#############################################################################
# BLASTZ RAT RN3 (DONE - 2004-07-15 - Fan)
# NOTE: THIS IS RE-DONE. SEE THE SAME SECTION OF 2004-08-30. Fan.
ssh kk
mkdir -p /cluster/data/mm5/bed/blastz.rn3.2004-07-14
cd /cluster/data/mm5/bed
ln -s blastz.rn3.2004-07-14 blastz.rn3
cd blastz.rn3
cat << '_EOF_' > DEF
# rat vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Rat
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.rn3
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/mm5/bed/blastz.rn3
bash
source ./DEF
# script copied over from /cluster/data/hg17/jkStuff/BlastZ_run0.sh
# it is a generic script and works for any assembly
cp /cluster/data/hg17/jkStuff/BlastZ_run0.sh \
/cluster/data/mm5/jkStuff/BlastZ_run0.sh
/cluster/data/mm5/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# Completed: 41943 of 41943 jobs
# CPU time in finished jobs: 16854319s 280905.31m 4681.76h 195.07d 0.534 y
# IO & Wait Time: 448464s 7474.41m 124.57h 5.19d 0.014 y
# Average job time: 413s 6.88m 0.11h 0.00d
# Longest job: 9358s 155.97m 2.60h 0.11d
# Submission to last job: 73416s 1223.60m 20.39h 0.85d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/mm5/bed/blastz.rn3
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh
# fixup machine check, should be kki, not kk
cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh \
/cluster/data/mm5/jkStuff/BlastZ_run1.sh
vi /cluster/data/mm5/jkStuff/BlastZ_run1.sh
/cluster/data/mm5/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 7859s 130.98m 2.18h 0.09d 0.000 y
# IO & Wait Time: 104771s 1746.19m 29.10h 1.21d 0.003 y
# Average job time: 330s 5.50m 0.09h 0.00d
# Longest job: 1625s 27.08m 0.45h 0.02d
# Submission to last job: 8535s 142.25m 2.37h 0.10d
# Third cluster run to convert lav's to axt's
ssh kki
cd /cluster/data/mm5/bed/blastz.rn3
bash
source ./DEF
# The copy of this in mm4 was broken, use the hg17 one instead
cp /cluster/data/hg17/jkStuff/BlastZ_run2.sh \
/cluster/data/mm5/jkStuff/BlastZ_run2.sh
# vi /cluster/data/mm5/jkStuff/BlastZ_run2.sh
/cluster/data/mm5/jkStuff/BlastZ_run2.sh
cd run.2
#edited gsub to change /scratch/mus/mm5 to /cluster/bluearc/scratch/mus/mm5
# and recreated jobList by:
gensub2 chrom.list single gsub jobList
para create jobList
para try, check, push, etc ...
# Completed: 42 of 43 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 2050s 34.17m 0.57h 0.02d 0.000 y
# IO & Wait Time: 143135s 2385.58m 39.76h 1.66d 0.005 y
# Average job time: 3457s 57.61m 0.96h 0.04d
# Longest job: 14460s 241.00m 4.02h 0.17d
# Submission to last job: 14849s 247.48m 4.12h 0.17d
# Note: previous numbers were:
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 426s 7.09m 0.12h 0.00d 0.000 y
# IO & Wait Time: 7283s 121.39m 2.02h 0.08d 0.000 y
# Average job time: 168s 2.79m 0.05h 0.00d
# Longest job: 642s 10.70m 0.18h 0.01d
# Submission to last job: 642s 10.70m 0.18h 0.01d
# probably due to data on bluearc instead of on kki nodes.
# One job (chr2) failed consistently with an out-of-memory error,
# so went to kkr4u00 to run that job by hand.
# Per Angie's advice, created /cluster/bin/scripts/blastz-chromlav2axtLargeMem
# by copying /cluster/bin/scripts/blastz-chromlav2axt and changing /cluster/bin/i386
# to /cluster/bin/x86_64, and then ran (a single command, continued with "\"):
/cluster/bin/scripts/blastz-chromlav2axtLargeMem \
/cluster/data/mm5/bed/blastz.rn3/lav/chr2 \
/cluster/data/mm5/bed/blastz.rn3/axtChrom/chr2.axt \
/cluster/bluearc/scratch/mus/mm5/softNib /iscratch/i/rn3/bothMaskedNibs
# It worked!
# translate sorted axt files into psl
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3
mkdir pslChrom
set tbl = "blastzRn3"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 2 hours
# Load database tables
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.rn3/pslChrom
bash
for I in *.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I}
echo "done: ${I}"
done
# Check results
# featureBits hg16 blastzRn3
# 1013603401 bases of 2865248791 (35.376%) in intersection
# featureBits mm5 blastzRn3 ran out of memory.
# So check a few specific chromosomes
# featureBits mm5 blastzRn3 -chrom=chr17
# 61029084 bases of 86658738 (70.425%) in intersection
# featureBits mm4 blastzRn3 -chrom=chr17
# 62824556 bases of 89616841 (70.104%) in intersection
# featureBits mm5 blastzRn3 -chrom=chr18
# 61442155 bases of 86685738 (70.879%) in intersection
# featureBits mm4 blastzRn3 -chrom=chr18
# 57158006 bases of 81388777 (70.228%) in intersection
# CHAIN RN3 BLASTZ (DONE - 2004-07-22 - Fan)
# NOTE: THIS IS RE-DONE. SEE THE SAME SECTION OF 2004-08-30. Fan.
# The axtChain is best run on the small kluster, or the kk9 kluster
ssh kki
mkdir -p /cluster/data/mm5/bed/blastz.rn3/axtChain/run1
cd /cluster/data/mm5/bed/blastz.rn3/axtChain/run1
mkdir out chain
ls -1S /cluster/data/mm5/bed/blastz.rn3/axtChrom/*.axt > input.lst
# gensub2 template for the chain run: one doChain job per axt file listed in
# input.lst.  The job entry is kept on ONE line so that all three doChain
# arguments (input axt, chain output, log output) land in the same job.
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# doChain: csh wrapper executed once per cluster job (see the gsub template).
#   $1 - input axt file (an entry of input.lst)
#   $2 - chain file to write (chain/<root>.chain in the template)
#   $3 - file that receives axtChain's stdout (out/<root>.out in the template)
# Target sequence is read from the mm5 softNib directory and query sequence
# from the rn3 bothMaskedNibs directory, both on /iscratch.
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
/iscratch/i/mus/mm5/softNib \
/iscratch/i/rn3/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 46 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 18318s 305.30m 5.09h 0.21d 0.001 y
# IO & Wait Time: 41906s 698.44m 11.64h 0.49d 0.001 y
# Average job time: 1401s 23.34m 0.39h 0.02d
# Longest job: 5598s 93.30m 1.55h 0.06d
# Submission to last job: 5635s 93.92m 1.57h 0.07d
# now on the file server, sort chains
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3/axtChain
time chainMergeSort run1/chain/*.chain > all.chain &
# real 26m14.694s
# user 16m16.190s
# sys 2m19.520s
time chainSplit chain all.chain &
# real 26m29.801s
# user 15m40.780s
# sys 2m40.610s
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.rn3/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain mm5 ${c}_chainRn3 $i
echo done $c
end
# featureBits mm4 chainRn3Link -chrom=chr16
# 67474802 bases of 95076222 (70.969%) in intersection
# featureBits mm5 chainRn3Link -chrom=chr16
# 66703715 bases of 92679592 (71.972%) in intersection
# featureBits mm4 chainRn3Link -chrom=chr17
# 61932430 bases of 89616841 (69.108%) in intersection
# featureBits mm5 chainRn3Link -chrom=chr17
# 60676019 bases of 86658738 (70.017%) in intersection
# NET RN3 (DONE - 2004-07-23 - Fan)
# NOTE: THIS IS RE-DONE. SEE THE SAME SECTION OF 2004-08-31. Fan.
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/mm5/chrom.sizes \
/cluster/data/rn3/chrom.sizes ../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/mm5/chrom.sizes \
/cluster/data/rn3/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 1850904576, utime 9294 s/100, stime 2079
# The netClass operations requires an "ancientRepeat" table to exist
# in either mm5 or rn3. So, create the table:
ssh hgwdev
mkdir -p /cluster/data/mm5/bed/ancientRepeat
cd /cluster/data/mm5/bed/ancientRepeat
# mysqldump needs write permission to this directory
# and you need to use your read/write enabled user with password
chmod 777 .
hgsqldump --all --tab=. mm4 ancientRepeat
chmod 775 .
hgsql mm5 < ancientRepeat.sql
mysqlimport -u<r/w user> -p<r/w pass> mm5 ancientRepeat.txt
# This is a hand curated table obtained from Arian.
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.rn3/axtChain
time netClass hNoClass.net mm5 rn3 rat.net \
-tNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInRat \
-qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
# 508.060u 89.340s 12:10.36 81.7% 0+0k 0+0io 201pf+0w
# If things look good do
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
time netFilter -syn rat.net > ratSyn.net
# real 5m5.494s
# user 3m52.710s
# sys 0m32.670s
# Load the nets into database
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.rn3/axtChain
netFilter -minGap=10 rat.net | hgLoadNet mm5 netRn3 stdin
netFilter -minGap=10 ratSyn.net | hgLoadNet mm5 syntenyNetRn3 stdin
# real 8m50.781s
# user 4m59.660s
# sys 0m52.840s
# check results
# featureBits mm4 netRn3
# 96806381 bases of 95076222 (101.820%) in intersection
# featureBits mm5 netRn3
# 2638255333 bases of 2615483787 (100.871%) in intersection
# featureBits mm4 syntenyNetRn3
# 96760405 bases of 95076222 (101.771%) in intersection
# featureBits mm5 syntenyNetRn3
# 2600849289 bases of 2615483787 (99.440%) in intersection
# Add entries for net and chain to mouse/mm5 trackDb
# make net
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3/axtChain
mkdir ratNet
time netSplit rat.net ratNet
# real 5m28.037s
# user 3m58.150s
# sys 0m37.870s
# extract axts from net
mkdir ../axtNet
foreach n (ratNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt ratNet/$c.net chain/$c.chain \
/cluster/data/mm5/nib \
/cluster/data/rn3/nib ../axtNet/$c.axt
echo "Complete: $c.net -> axtNet/$c.axt"
end
# sort axt's and convert to maf format
mkdir ../mafNet
cat << 'EOF' > makeMaf.csh
foreach f (../axtNet/chr*.axt)
set c=$f:t:r
echo $c.axt
mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt
axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt
rm ../axtNet/$c.unsorted.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/mm5/chrom.sizes /cluster/data/rn3/chrom.sizes \
../mafNet/$c.maf -tPrefix=mm5. -qPrefix=rn3.
end
'EOF'
#csh makeMaf.csh >&! makeMaf.log &
csh makeMaf.csh > makeMaf.log &
tail -100f makeMaf.log
# THE ABOVE DID NOT WORK. TRIED THE FOLLOWING:
foreach f (../axtNet/chr*.axt)
set c=$f:t:r
echo $c.axt
mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt
axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt
rm ../axtNet/$c.unsorted.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/mm5/chrom.sizes /cluster/data/rn3/chrom.sizes \
../mafNet/$c.maf -tPrefix=mm5. -qPrefix=rn3.
end
ssh hgwdev
mkdir -p /cluster/data/mm5/bed/blastz.rn3/axtBest
cd /cluster/data/mm5/bed/blastz.rn3/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.rn3/axtNet
mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet
cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet
cd /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)
# Convert those axt files to psl
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestRn3.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestRn3.psl
echo "Done: ${c}_blastzBestRn3.psl"
end
# Load tables
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.rn3/pslBest
bash
for I in chr*BestRn3.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I}
echo "done ${I}"
done
# check results
# featureBits mm5 blastzBestRn3
# 1778653886 bases of 2615483787 (68.005%) in intersection
# featureBits mm4 blastzBestRn3
# 1780774716 bases of 2627444668 (67.776%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/mm5/axtBest/Rn3
cd /gbdb/mm5/axtBest/Rn3
ln -s /cluster/data/mm5/bed/blastz.rn3/axtNet/chr*.axt .
cd /cluster/data/mm5/bed/blastz.rn3/axtNet
rm -f axtInfoInserts.sql
foreach f (/gbdb/mm5/axtBest/Rn3/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
hgsql mm5 < ~/kent/src/hg/lib/axtInfo.sql
# table axtInfo may already exist, ignore create error.
hgsql mm5 < axtInfoInserts.sql
# BLASTZ RN3 CLEAN UP (DONE - 2004-07-26 - Fan)
# NOTE: THIS IS RE-DONE. SEE THE SAME SECTION OF 2004-08-31. Fan.
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3
nice rm -rf raw &
nice rm axtChain/run1/chain/* &
nice gzip {axt,psl}Chrom/* lav/*/* axtChain/{all.chain,*.net} &
# MAKE VSRN3 DOWNLOADABLES (DONE 9/14/04 Fan)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3/axtChain
ln all.chain rat.chain
foreach f (rat.chain rat.net)
gzip -c $f > $f.gz
end
rm rat.chain
# Make chain-format of raw alignments
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3
mkdir blastzECF
foreach f (axtChrom/chr*.axt)
set chr = $f:t:r
axtToChain $f S1.len S2.len stdout \
| gzip -c - > blastzECF/$chr.ecf.gz
end
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsRn3
cd /usr/local/apache/htdocs/goldenPath/mm5/vsRn3
mv /cluster/data/mm5/bed/blastz.rn3/axtChain/rat*.gz .
cp -p /cluster/data/mm5/bed/blastz.rn3/axtChain/all.chain.gz \
/usr/local/apache/htdocs/goldenPath/mm5/vsRn3/rat.chain.gz
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# Not for pushing -- handle separately.
mv /cluster/data/mm5/bed/blastz.rn3/blastzECF .
cd blastzECF
md5sum *.gz > md5sum.txt
# BLASTZ ZEBRAFISH (DANRER1) (DONE, 2004-07-29, hartera)
ssh kkr1u00
# blastz requires lineage-specific repeats
# Treat all repeats as lineage-specific.
mkdir -p /iscratch/i/mm5/linSpecRep.notInZebrafish
foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/mm5/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
end
mkdir -p /iscratch/i/danRer1/linSpecRep.notInMouse
foreach f (/iscratch/i/danRer1/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/danRer1/linSpecRep.notInMouse/$f:t:r:r.out.spec
end
iSync
ssh kk
mkdir -p /cluster/data/mm5/bed/blastz.danRer1.2004-07-27
ln -s /cluster/data/mm5/bed/blastz.danRer1.2004-07-27 \
/cluster/data/mm5/bed/blastz.danRer1
cd /cluster/data/mm5/bed/blastz.danRer1
# use same parameters as for danRer1-hg17
cat << '_EOF_' > DEF
# mouse (mm5) vs zebrafish (danRer1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from hg16-fr1 and danRer1-hg17.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse (mm5)
SEQ1_DIR=/cluster/bluearc/scratch/mus/mm5/softNib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/mm5/linSpecRep.notInZebrafish
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer1)
SEQ2_DIR=/iscratch/i/danRer1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/danRer1/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.danRer1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
# Save the DEF file in the current standard place
chmod +x DEF
cp DEF ~angie/hummus/DEF.mm5-danRer1.2004-07-27
# setup cluster run
# copy shell scripts for blastz runs if not there already
cp -p /cluster/data/danRer1/jkStuff/BlastZ* /cluster/data/mm5/jkStuff/
# edit BlastZ_run0.sh
# replace line 22: /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/
# this is the directory for the latest version of blastz-run
# source the DEF file
bash
. ./DEF
/cluster/data/mm5/jkStuff/BlastZ_run0.sh
cd run.0
# check batch looks ok then
para try, check, push, check, ....
# para time
# Completed: 57970 of 57970 jobs
# CPU time in finished jobs: 18228826s 303813.77m 5063.56h 210.98d 0.578 y
# IO & Wait Time: 1019215s 16986.92m 283.12h 11.80d 0.032 y
# Average job time: 332s 5.53m 0.09h 0.00d
# Longest job: 2211s 36.85m 0.61h 0.03d
# Submission to last job: 45422s 757.03m 12.62h 0.53d
# Took about 12 hours to run and output is 1.7G
# second cluster run to convert the .out's to .lav's
cd /cluster/data/mm5/bed/blastz.danRer1
bash # if a csh/tcsh user
. ./DEF
/cluster/data/mm5/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# para time
# Checking finished jobs
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 4536s 75.60m 1.26h 0.05d 0.000 y
# IO & Wait Time: 65931s 1098.85m 18.31h 0.76d 0.002 y
# Average job time: 207s 3.44m 0.06h 0.00d
# Longest job: 636s 10.60m 0.18h 0.01d
# Submission to last job: 1282s 21.37m 0.36h 0.01d
# Third cluster run to convert lav's to axt's
ssh kki
cd /cluster/data/mm5/bed/blastz.danRer1
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
# do.csh: csh wrapper run once per chromosome by the run.2 batch.
#   $1 - directory holding the .lav files for one chromosome (an entry of
#        chrom.list, which is built from ../lav/chr* below)
#   $2 - axt file to write (axtChrom/<root>.axt in the gsub template)
# The .lav files are concatenated in "sort -g" (numeric) order, converted to
# axt with lavToAxt against the mm5 softNib and danRer1 nib directories, and
# passed through axtSort before being written to $2.
cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /cluster/bluearc/scratch/mus/mm5/softNib \
/iscratch/i/danRer1/nib stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/mm5/bed/blastz.danRer1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
\ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
para try, check, push, check,...
# para time
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 246s 4.10m 0.07h 0.00d 0.000 y
# IO & Wait Time: 4985s 83.08m 1.38h 0.06d 0.000 y
# Average job time: 122s 2.03m 0.03h 0.00d
# Longest job: 446s 7.43m 0.12h 0.01d
# Submission to last job: 653s 10.88m 0.18h 0.01d
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/mm5/bed/blastz.danRer1
mkdir -p pslChrom
set tbl = "blastzDanRer1"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.danRer1/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl mm5 $f
end
# featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1 -enrichment
#refGene:cds 0.763%,blastzDanRer1 2.918%,both 0.512%,cover 67.12%,enrich 23.00x
# featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L4000 -enrichment
# refGene:cds 0.763%, blastzDanRer1L4000 17.878%, both 0.581%, cover 76.18%,
# enrich 4.26x
# featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L5000 -enrichment
# refGene:cds 0.763%,blastzDanRer1L5000 6.013%,both 0.540%,cover 70.81%,
# enrich 11.78x
# featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L6500 -enrichment
# refGene:cds 0.763%, blastzDanRer1L6500 2.386%, both 0.495%, cover 64.91%,
# enrich 27.20x
# featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L7000 -enrichment
# refGene:cds 0.763%, blastzDanRer1L7000 2.062%, both 0.480%, cover 62.87%,
# enrich 30.50x
# featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1HumanParams -enrichment
# refGene:cds 0.763%,blastzDanRer1HumanParams 1.661%,both 0.502%, cover 65.82%,
# enrich 39.64x
# row counts: 172167 blastzDanRer1,
# 2288714 blastzDanRer1HumanParams,
# 3373525 blastzDanRer1L4000
# 700927 blastzDanRer1L5000
# 13719318 blastzDanRer1L3000
# 103190 blastzDanRer1L6500
# 76758 blastzDanRer1L7000
# Do test runs - repeat above using L=4000 and then try the mm5-hg17 parameters
# also L=2000, L=3000 and L=5000. Use only mm5 chr1 for tests.
# L=2000 and L=3000 lavToAxt crashed so re-do on kolossus. L2000 crashed again
# probably ran out of memory.
# The original blastzDanRer1 with L=6000 looks best: good coverage and
# enrichment without too many alignments in the database table.
# RESCORE DANRER1 BLASTZ ALIGNMENTS (DONE, 2004-08-02, hartera)
# Low scores can occur with repeats abridged and using the
# HoxD55.q matrix. PSU's restore_rpts program rescored alignments
# with the default matrix instead of the BLASTZ_Q matrix.
# Rescore them here so the chainer sees the higher scores:
ssh kolossus
cd /cluster/data/mm5/bed/blastz.danRer1
mkdir axtChrom.rescore
foreach f (axtChrom/chr*.axt)
axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
$f axtChrom.rescore/$f:t
end
mv axtChrom axtChrom.orig
mv axtChrom.rescore axtChrom
# psl files and blastz tables will be the same regardless of score so
# no need to reload
# CHAIN DANRER1 BLASTZ (DONE, 2004-08-03, hartera)
# FILTERED WITH A HIGHER MINSCORE THRESHOLD (DONE, 2004-08-04, hartera)
# RELOADED TABLES (DONE, 2004-08-18, hartera)
# removed all chainDanRer1 and chainDanRer1Link tables, some extra tables had
# been accidentally loaded with this name from a different genome so there
# were duplicate chain ids causing joinerCheck to complain.
# Re do chains with rescored blastz danRer1
# Run axtChain on little cluster
ssh kki
cd /cluster/data/mm5/bed/blastz.danRer1
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/mm5/bed/blastz.danRer1/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Make our own linear gap file with reduced gap penalties,
# in hopes of getting longer chains:
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize^V 11
smallSize^V 111
position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111
qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000
'_EOF_'
# << this line makes emacs coloring happy
# doChain: csh wrapper executed once per cluster job (see the gsub template).
#   $1 - input axt file (an entry of input.lst)
#   $2 - chain file to write (chain/<root>.chain)
#   $3 - log file; ">&" sends both stdout and stderr of axtChain there
# Chains are scored with the HoxD55.q matrix, use the reduced-penalty gap file
# written to ../../chickenHumanTuned.gap above, and chains scoring below 5000
# are dropped (-minScore=5000).
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../chickenHumanTuned.gap \
-minScore=5000 $1 \
/cluster/bluearc/scratch/mus/mm5/softNib \
/iscratch/i/danRer1/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 2260s 37.67m 0.63h 0.03d 0.000 y
# IO & Wait Time: 863s 14.38m 0.24h 0.01d 0.000 y
# Average job time: 73s 1.21m 0.02h 0.00d
# Longest job: 342s 5.70m 0.10h 0.00d
# Submission to last job: 36951s 615.85m 10.26h 0.43d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/mm5/bed/blastz.danRer1/axtChain
chainMergeSort run1/chain/*.chain > all.chain
# filter again to use minScore of 7500 (see featureBits below) (2004-08-04)
# The merged chains (already limited to minScore=5000 by doChain) are kept
# as all.chain.filt5k and are the input of the stricter filter.
mv all.chain all.chain.filt5k
chainFilter -minScore=7500 all.chain.filt5k > all.chain
# remove old chains
rm -r chain
chainSplit chain all.chain
gzip all.chain.filt5k
# take a look at score distr's,try also with smaller bin size.
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist.out
textHistogram -binSize=10000 /tmp/score.$f:t:r >> hist.out
echo ""
end
# also hist5000.out has bin size 5000. looks good so load into database
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.danRer1/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain mm5 ${c}_chainDanRer1 $i
echo done $c
end
# featureBits still shows good coverage and enrichment
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Link -enrichment
# refGene:cds 0.763%, chainDanRer1Link 2.246%, both 0.508%, cover 66.61%,
# enrich 29.65x
# Human Parameters Blastz Chain with minScore = 5,000 filter:
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1HPLink -enrichment
# refGene:cds 0.763%, chainDanRer1HPLink 1.208%, both 0.484%, cover 63.43%,
# enrich 52.49x
# L=5000 Blastz Chain with minScore = 5,000 filter:
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1L5kLink -enrichment
# refGene:cds 0.763%, chainDanRer1L5kLink 4.137%, both 0.534%, cover 69.96%,
# enrich 16.91x
# L=5000 Blastz Chain with minScore =10,000 filter:
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1L5k10kLink -enrichment
# refGene:cds 0.763%, chainDanRer1L5k10kLink 1.038%, both 0.448%, cover 58.69%,
# enrich 56.54x
# filter too stringent, coverage has dropped a lot
# with less filtering of blastzDanRer1 where minScore =3000
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Filt3kLink -enrichment
# refGene:cds 0.763%, chainDanRer1Filt3kLink 2.487%, both 0.509%, cover 66.78%,
# enrich 26.86x
# with more filtering, minScore = 6000
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Filt6kLink -enrichment
# refGene:cds 0.763%, chainDanRer1Filt6kLink 2.172%, both 0.508%, cover 66.54%,
# enrich 30.64x
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Filt7500Link -enrichment
# refGene:cds 0.763%, chainDanRer1Filt75kLink 2.022%, both 0.504%, cover 66.10%,
# enrich 32.70x
# rows in database table:
# chr1_blastzDanRer1Link: 515119
# chr1_chainDanRer1L5kLink: 1241480
# chr1_chainDanRer1L5k10kLink: 74963
# chr1_chainDanRer1HPLink: 309740
# chr1_chainDanRer1Filt3k: 594057
# chr1_chainDanRer1Filt6kLink: 479368
# chr1_chainDanRer1Filt7500Link: 378954
# Using the original parameters is a good compromise between high coverage
# and high enrichment but a filter of 7500 on the score produces only a tiny
# reduction in coverage with higher enrichment as there are a lot less
# alignments of low score of the same regions or other low scoring alignments.
# NET DANRER1 BLASTZ (DONE, 2004-08-04, hartera)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.danRer1/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 103493632, utime 668 s/100, stime 127
# Add classification info using db tables:
cd /cluster/data/mm5/bed/blastz.danRer1/axtChain
# netClass looks for ancient repeats in one of the databases
# hg17 has this table - hand-curated by Arian but this is for
# human-rodent comparisons so do not use here, use -noAr option
mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish
mkdir -p /cluster/bluearc/danRer1/linSpecRep.notInMouse
cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \
/cluster/bluearc/mm5/linSpecRep.notInZebrafish
cp /iscratch/i/danRer1/linSpecRep.notInMouse/* \
/cluster/bluearc/danRer1/linSpecRep.notInMouse
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.danRer1/axtChain
time netClass noClass.net mm5 danRer1 danRer1.net \
-tNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish \
-qNewR=/cluster/bluearc/danRer1/linSpecRep.notInMouse -noAr
# 77.700u 46.610s 3:05.75 66.9% 0+0k 0+0io 215pf+0w
netFilter -minGap=10 danRer1.net | hgLoadNet mm5 netDanRer1 stdin
# EXTRACT AXTs AND MAFs FROM ZEBRAFISH (danRer1) NET
# (DONE, 2004-08-04, hartera)
ssh eieio
# create axts
cd /cluster/data/mm5/bed/blastz.danRer1/axtChain
netSplit danRer1.net danRer1Net
mkdir -p ../axtNet
cat > axtNet.csh << 'EOF'
foreach f (danRer1Net/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt danRer1Net/$c.net chain/$c.chain \
/cluster/data/mm5/mixedNib \
/cluster/data/danRer1/nib ../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
'EOF'
chmod +x axtNet.csh
csh axtNet.csh >&! axtNet.log &
tail -100f axtNet.log
# sort axts before making mafs - must be sorted for multiz
cd /cluster/data/mm5/bed/blastz.danRer1
mv axtNet axtNet.unsorted
mkdir axtNet
foreach f (axtNet.unsorted/*.axt)
set c = $f:t:r
echo "Sorting $c"
axtSort $f axtNet/$c.axt
end
# create maf
ssh eieio
cd /cluster/data/mm5/bed/blastz.danRer1
cd axtNet
mkdir ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (chr*.axt)
set maf = $f:t:r.danRer1.maf
echo translating $f to $maf
axtToMaf $f \
/cluster/data/mm5/chrom.sizes /cluster/data/danRer1/chrom.sizes \
../mafNet/$maf -tPrefix=mm5. -qPrefix=danRer1.
end
'EOF'
chmod +x makeMaf.csh
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
# BLASTZ DANRER1 CLEAN UP (DONE, 2004-08-04, hartera)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.danRer1
nice rm -rf raw &
nice rm -rf lav &
nice rm -rf axtChrom.orig &
nice rm axtChain/run1/chain/* &
nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} &
# unzip all.chain.gz and danRer1.net.gz to make vsDanRer1 downloadables
# then zip these again (hartera, 2004-09-10)
# UPDATE BACEND SEQUENCES (DONE - 2004-07-20 - Fan)
# Download new files
ssh kksilo
# NOTE: the option must be an ASCII hyphen ("-p"); a typographic dash is
# taken by mkdir as a directory name instead of an option.
mkdir -p /cluster/data/mm5/bed/bacends/ncbi
cd /cluster/data/mm5/bed/bacends/ncbi
wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/AllBACends.mfa.gz
wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/cl_acc_gi_len.gz
wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/README
gunzip AllBACends.mfa.gz
gunzip cl_acc_gi_len.gz
# Convert fa file
# convert.pl: filter that rewrites the FASTA header lines of AllBACends.mfa.
#   - Lines not starting with ">" (sequence data) are printed unchanged.
#   - A header line is split on "|"; the field following the literal "gb"
#     field is taken, everything from its first "." on (the version) is
#     dropped, and the header is replaced by ">" plus what remains.
#   - A header without a "gb" field aborts the run ("Failed for ...").
# Reads stdin, writes stdout.
cat << '_EOF_' > convert.pl
#!/usr/local/bin/perl -w
use strict;
while (my $line = <>) {
if (substr($line,0,1) ne ">") {
print $line;
} else {
my @fields = split(/\|/, $line);
my $printed = 0;
for (my $i = 0; $i < $#fields; $i++) {
if ($fields[$i] eq "gb") {
(my $name, my $vers) = split(/\./,$fields[$i+1]);
print ">$name\n";
$i= $#fields;
$printed = 1;
}
}
if (!$printed) {
die("Failed for $line\n");
}
}
}
'_EOF_'
chmod +x convert.pl
./convert.pl < AllBACends.mfa > BACends.fa
# Create new pairs files
convertBacEndPairInfo cl_acc_gi_len
# Split file into pieces and copy to cluster to propagate
ssh kksilo
cd /cluster/data/mm5/bed/bacends/ncbi
/cluster/bin/i386/faSplit sequence BACends.fa 100 BACends
rm -rf /cluster/bluearc/scratch/mus/mm5/bacEnds
mkdir /cluster/bluearc/scratch/mus/mm5/bacEnds
mv BACends???.fa /cluster/bluearc/scratch/mus/mm5/bacEnds
cp -p BACends.fa /cluster/bluearc/scratch/mus/mm5/bacEnds
# Ask for propagation from sysadmin
# Load the sequences (change bacends.# to match correct location)
ssh hgwdev
mkdir /gbdb/mm5/bacends
cd /gbdb/mm5/bacends
ln -s /cluster/data/mm5/bed/bacends/ncbi/BACends.fa .
cd /tmp
hgLoadSeq mm5 /gbdb/mm5/bacends/BACends.fa
#Adding /gbdb/mm5/bacends/BACends.fa
#452237 sequences
#Updating seq table
# One additional step 9/10/04 Fan.
# Create a composite index to speed up hgTracks display when BAC Ends track selected.
hgsql mm5 -e 'create index bacIndex2 on all_bacends(bin, qName(8));'
# This will take hours.
#All done
# BACEND SEQUENCE ALIGNMENTS (DONE - 2004-07-23 - Fan)
# (alignments done without RepeatMasking)
# We need an ooc file for this genome
ssh kksilo
mkdir /cluster/data/mm5/ooc
cd /cluster/data/mm5/ooc
ls ../unmaskedNib/chr*.nib > nib.list
blat -makeOoc=11.ooc -repMatch=1024 nib.list nib.list output.psl
# Wrote 26077 overused 11-mers to 11.ooc
# Did not end up using this. Used an old one instead.
# Create full sequence alignments
ssh kk
cd /cluster/data/mm5/bed/bacends
# Single command, continued with "\" (the path and the trailing 1 are
# arguments of splitContigList, not commands of their own).
# NOTE(review): presumably this writes the contig.lst that gensub2 reads
# further down -- confirm against the script.
/cluster/bin/scripts/splitContigList -scratch \
/iscratch/i/mus/mm5/maskedContigs 1
# allow blat to run politely in /tmp while it writes output, then
# copy results to results file:
# runBlat.sh: per-job wrapper called from the gensub2 template below.
#   $1 (path1)  - first blat sequence argument (an entry of contig.lst)
#   $2 (path2)  - second blat sequence argument (an entry of bacEnds.lst)
#   $3 (root1)  - root name of path1, used to build scratch/result names
#   $4 (root2)  - root name of path2, used to build scratch/result names
#   $5 (result) - final location of the psl output
# blat writes into a private /tmp/<root1>_<root2> directory; the psl is then
# moved to $5 (after removing any stale copy) and the scratch directory is
# deleted.
cat << '_EOF_' > runBlat.sh
#!/bin/sh
path1=$1
path2=$2
root1=$3
root2=$4
result=$5
rm -fr /tmp/${root1}_${root2}
mkdir /tmp/${root1}_${root2}
pushd /tmp/${root1}_${root2}
/cluster/bin/i386/blat ${path1} ${path2} -ooc=/scratch/hg/h/mouse11.ooc \
${root1}.${root2}.psl
popd
rm -f ${result}
mv /tmp/${root1}_${root2}/${root1}.${root2}.psl ${result}
rm -fr /tmp/${root1}_${root2}
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x runBlat.sh
# gensub2 template: one runBlat.sh job per (contig, BACends piece) pair.
# The job entry is kept on ONE line so that all five runBlat.sh arguments
# (path1 path2 root1 root2 result) end up in the same job.
cat << '_EOF_' > template
#LOOP
./runBlat.sh {check in exists $(path1)} {check in exists $(path2)} $(root1) $(root2) {check out line+ bacEnds.out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
#ls -1S /iscratch/i/mm5/bacEnds/BACends???.fa > bacEnds.lst
ls -1S /scratch/mus/mm5/bacEnds/BACends???.fa > bacEnds.lst
mkdir bacEnds.out
# create results directories for each to avoid the all result files in
# one directory problem
foreach f (`cat bacEnds.lst`)
set b = $f:t:r
echo $b
mkdir bacEnds.out/$b
end
gensub2 contig.lst bacEnds.lst template jobList
para create jobList
# 62622 jobs written to batch
para try, check, push, etc ...
# Completed: 62622 of 62622 jobs
# CPU time in finished jobs: 3760354s 62672.57m 1044.54h 43.52d 0.119 y
# IO & Wait Time: 3216480s 53608.00m 893.47h 37.23d 0.102 y
# Average job time: 111s 1.86m 0.03h 0.00d
# Longest job: 2841s 47.35m 0.79h 0.03d
# Submission to last job: 9395s 156.58m 2.61h 0.11d
# Compile alignments and lift the files.
# First attempt failed due to /cluster/store6 ran out of space.
# Redoing it 7/22/04.
ssh kksilo
cd /cluster/data/mm5/bed/bacends
mkdir /cluster/store8/fanTemp
time pslSort dirs raw.psl /cluster/store8/fanTemp bacEnds.out/* \
> time.out &
# This may take over 14 hours!
ssh kolossus
cd /cluster/data/mm5/bed/bacends
# Single command, continued with "\": raw.psl is the input, bacEnds.psl the
# filtered output, and the last output argument is discarded via /dev/null.
time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons raw.psl \
bacEnds.psl /dev/null
# Processed 562840490 alignments
rmdir temp
# You will want to keep this file around until later processing is
# proven correct
rm raw.psl # 72 Gb ! It takes a while even to remove it.
ssh kksilo
cd /cluster/data/mm5/bed/bacends
time /cluster/bin/scripts/lifter -psl -mouse /cluster/data/mm5 bacEnds.psl
# real 130m36.149s
# user 82m38.180s
# sys 10m59.580s
cp -p ~booch/clusterJobs/bacends/split.pl .
cp -p ~booch/clusterJobs/bacends/header .
time ./split.pl header < bacEnds.psl.lifted
# real 2m16.354s
# user 0m36.390s
# sys 0m42.290s
cp -p bacEnds.psl.lifted bacEnds.psl.lifted.save
time pslSort dirs bacEnds.psl.lifted temp split
# real 17m2.353s
# user 14m17.040s
# sys 1m38.560s
rmdir temp
rm -r split
# Copy files to final destination and remove
mkdir /cluster/data/mm5/bacends
cp -p bacEnds.psl.lifted /cluster/data/mm5/bacends
# BACEND PAIRS TRACK (DONE 2004-07-27 - Fan)
ssh kolossus
cd /cluster/data/mm5/bacends
bash
time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl.lifted \
../bed/bacends/ncbi/bacEndPairs.txt all_bacends bacEnds
# create header required by "rdb" tools
# (two lines: column names, then one column-definition entry per column)
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
# edit header to make sure \t is/become tab character
# Good pairs: keep rows with score >= 300, sort by chrom/start, drop header.
cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairs.bed
# Problem pairs (slop, short, long, mismatch, orphan): same filtering.
cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
bacEnds.orphan \
| row score ge 300 | sorttbl chr start | headchg -del > bacEndPairsBad.bed
# The following took too long, break it into 3 steps.
# extractPslLoad -noBin bacEnds.psl.lifted bacEndPairs.bed \
# bacEndPairsBad.bed | sorttbl tname tstart | headchg -del > bacEnds.load.psl
extractPslLoad -noBin bacEnds.psl.lifted bacEndPairs.bed \
bacEndPairsBad.bed >j1.out
cat j1.out| sorttbl tname tstart >j2.out
cat j2.out | headchg -del > bacEnds.load.psl
rm j1.out j2.out
# load into database
ssh hgwdev
cd /cluster/data/mm5/bacends
# edit bacEndPairs.bed to fix one ID that has a blank character in it.
hgLoadBed mm5 bacEndPairs bacEndPairs.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql
# Loaded 168535
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed mm5 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 43182
#hgLoadPsl mm5 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl mm5 -table=all_bacends bacEnds.load.psl
# load of all_bacends did not go as planned: 14426473 record(s), 0 row(s)
# skipped, 4519 warning(s) loading psl.tab
# featureBits mm5 all_bacends
# 268502414 bases of 2615483787 (10.266%) in intersection
# featureBits mm4 all_bacends
# 243096171 bases of 2627444668 (9.252%) in intersection
# featureBits mm5 bacEndPairs
# 2567958504 bases of 2615483787 (98.183%) in intersection
# featureBits mm4 bacEndPairs
# 2549945356 bases of 2627444668 (97.050%) in intersection
# featureBits mm5 bacEndPairsBad
# 541027882 bases of 2615483787 (20.686%) in intersection
# featureBits mm4 bacEndPairsBad
# 1074505863 bases of 2627444668 (40.895%) in intersection
# BLASTZ FUGU (FR1) (WORKING 7/28/04 kate)
# Using Angie's hg17/fugu as a model
# Treat all mouse repeats as lineage-specific (same as chicken, so just
# reuse linSpecRep.notInChicken).
ssh kkr1u00
ln -s /iscratch/i/mus/mm5/linSpecRep.notInChicken \
/iscratch/i/mus/mm5/linSpecRep.notInFugu
iSync
ssh kk
cd /cluster/data/mm5/bed
mkdir blastz.fr1.2004-07-28
ln -s blastz.fr1.2004-07-28 blastz.fr1
cd blastz.fr1
cat << '_EOF_' > DEF
# mouse vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from human-chicken, except L=6000 (more relaxed)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse
SEQ1_DIR=/iscratch/i/mus/mm5/softNib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInFugu
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Fugu
SEQ2_DIR=/iscratch/i/fr1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/fr1/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.fr1.2004-07-28
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# first cluster run: raw blastz alignments
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, check, push, check, ....
# GOT HERE
#Completed: 93775 of 93775 jobs
#Average job time: 187s 3.11m 0.05h 0.00d
#Longest job: 3907s 65.12m 1.09h 0.05d
#Submission to last job: 76763s 1279.38m 21.32h 0.89d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 341 of 341 jobs
#Average job time: 98s 1.63m 0.03h 0.00d
#Longest job: 281s 4.68m 0.08h 0.00d
#Submission to last job: 2102s 35.03m 0.58h 0.02d
# third run: lav -> axt
# (if non-default BLASTZ_Q is used in the future, put axtRescore in
# the pipe after lavToAxt)
ssh kki
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
/iscratch/i/mus/mm5/softNib /iscratch/i/fr1/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 43 of 43 jobs
#Average job time: 671s 11.18m 0.19h 0.01d
#Longest job: 2398s 39.97m 0.67h 0.03d
#Submission to last job: 2417s 40.28m 0.67h 0.03d
# CHAIN FUGU BLASTZ (WORKING 7/16/04 kate)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
/iscratch/i/mus/mm5/softNib \
/iscratch/i/fr1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 43 of 43 jobs
#Average job time: 537s 8.96m 0.15h 0.01d
#Longest job: 2071s 34.52m 0.58h 0.02d
#Submission to last job: 2071s 34.52m 0.58h 0.02d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=5000 /tmp/score.$f:t:r
echo ""
end
# Lots of chaff with scores in the 3000's. Many very-high-scoring
# chains. So filter the chain down somewhat...
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
rm chain/*
chainSplit chain all.chain
gzip all.chain.unfiltered
# Load chains into database
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain mm5 ${c}_chainFr1 $i
end
# mouse-fugu gets significantly less coverage than human-fugu:
featureBits mm5 -chrom=chr1 chainFr1Link
#63386139 bases of 185739816 (34.126%) in intersection
featureBits hg17 -chrom=chr1 chainFr1Link
#123999291 bases of 222827847 (55.648%) in intersection
# mouse-fugu isn't a whole lot less than mouse-human though:
featureBits mm5 -chrom=chr1 chainHg17Link
#75492250 bases of 185739816 (40.644%) in intersection
featureBits mm5 -chrom=chr1 chainCanFam1Link
#63386139 bases of 185739816 (34.126%) in intersection
# NET FUGU BLASTZ (WORKING 7/16/04 kate)
ssh kolossus
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
netClass -noAr noClass.net mm5 fr1 fugu.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with netFilter -syn:
netFilter -syn fugu.net > fuguSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
netFilter -minGap=10 fugu.net | hgLoadNet mm5 netFr1 stdin
netFilter -minGap=10 fuguSyn.net | hgLoadNet mm5 syntenyNetFr1 stdin
# Add entries for chainFr1, netFr1 to mouse/mm5 trackDb
# MAKE VSFR1 DOWNLOADABLES (WORKING 7/19/04 kate)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28
gzip axtNet/chr*.axt
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
ln all.chain fugu.chain
zip /cluster/data/mm5/zip/fugu.chain.zip fugu.chain
rm fugu.chain
zip /cluster/data/mm5/zip/fugu.net.zip fugu.net
zip /cluster/data/mm5/zip/fuguSyn.net.zip fuguSyn.net
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsFr1
cd /usr/local/apache/htdocs/goldenPath/mm5/vsFr1
mv /cluster/data/mm5/zip/fugu*.zip .
cp -pR /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtNet .
md5sum *.zip axtNet/* > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# GENERATE FR1 MAF FOR MULTIZ FROM NET (WORKING 7/19/04 kate)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
netSplit fugu.net net
cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28
mkdir axtNet
foreach f (axtChain/net/*)
set chr = $f:t:r
netToAxt $f axtChain/chain/$chr.chain /cluster/data/mm5/nib \
/cluster/data/fr1/nib stdout \
| axtSort stdin axtNet/$chr.axt
end
mkdir mafNet
foreach f (axtNet/chr*.axt)
set maf = mafNet/$f:t:r.mc.maf
axtToMaf $f \
/cluster/data/mm5/chrom.sizes /cluster/data/fr1/chrom.sizes \
$maf -tPrefix=mm5. -qPrefix=fr1.
end
# BLASTZ FR1 CLEAN UP (WORKING - 2004-07-28 - kate)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.fr1
nice rm -rf raw &
nice rm axtChain/run1/chain/* &
nice gzip {axt,psl}Chrom/* lav/*/* axtChain/{all.chain,*.net} &
# CONSERVATION TRACK - MULTIZ AND PHASTCONS (WORKING 2004-07-29 kate)
ssh kksilo
set multizDir = multiz.2004-07-29
set workingDir = /cluster/bluearc/mm5/$multizDir
ln -s $workingDir /cluster/bluearc/mm5/multiz5way
mkdir -p $workingDir
mkdir -p /cluster/data/mm5/bed/$multizDir
cd /cluster/data/mm5/bed/$multizDir
# wrapper script for multiz
# NOTE: first arg is pairwise, 2nd arg is multiple (to add to)
# NOTE: next time, modify script so it only needs one arg -- saves the
# multiple dirname in a file for use by the next run
cat << 'EOF' > doMultiz.csh
#!/bin/csh -fe
mkdir -p $3:h
/cluster/bin/penn/multiz $1 $2 - > $3
'EOF'
# << for emacs
cat << 'EOF' > gsub
#LOOP
../doMultiz.csh {check in line /cluster/bluearc/mm5/multiz.2004-07-29/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/mm5/multiz.2004-07-29/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/mm5/multiz.2004-07-29/$(root1)$(dir1)/$(root2).maf}
#ENDLOOP
'EOF'
# << for emacs
chmod +x doMultiz.csh
# copy mafs to bluearc -- rat
ssh kksilo
set workingDir = /cluster/bluearc/mm5/multiz.2004-07-29
mkdir $workingDir/rn3
cp /cluster/data/mm5/bed/blastz.rn3/mafNet/chr*.maf $workingDir/rn3
ls $workingDir/rn3/*.maf > chrom.lst
# human
mkdir $workingDir/hg17
cp /cluster/data/mm5/bed/blastz.hg17/mafNet/chr*.maf $workingDir/hg17
# dog
mkdir $workingDir/canFam1
cp /cluster/data/mm5/bed/blastz.canFam1/mafNet/chr*.maf $workingDir/canFam1
# chicken
mkdir $workingDir/galGal2
cp /cluster/data/mm5/bed/blastz.galGal2/mafNet/chr*.maf $workingDir/galGal2
# first multiz - add in human to mouse/rat
#
ssh kki
set multizDir = multiz.2004-07-29
set workingDir = /cluster/bluearc/mm5/$multizDir
cd /cluster/data/mm5/bed/$multizDir
mkdir run.hg17
cd run.hg17
echo "hg17/rn3" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 43 jobs
para try, check, push, check
cd ..
# dog
mkdir run.canFam1
cd run.canFam1
echo "canFam1/rn3hg17" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
para try, check, push, check
cd ..
# chicken
mkdir run.galGal2
cd run.galGal2
echo "galGal2/rn3hg17canFam1" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
# no alignment file for chr18_random -- create one so we can create jobList
para create jobList
para try, check, push, check
cd ..
# copy 5-way mafs to build directory
ssh kksilo
set multizDir = multiz.2004-07-29
set workingDir = /cluster/bluearc/mm5/$multizDir
ln -s $workingDir/rn3hg17canFam1galGal2 $workingDir/maf
cd /cluster/data/mm5/bed/multiz.2004-07-29
mkdir maf
cp $workingDir/maf/*.maf maf
# PHYLO-HMM CONSERVATION FOR 5-WAY MULTIZ (DONE 2004-07-29 kate)
# updated 09-13-04 acs
ssh kksilo
set path = ($path /cluster/bin/phast)
cd /cluster/data/mm5/bed/multiz.2004-07-29
mkdir cons
cd cons
#break up the genome-wide MAFs into pieces
mkdir /cluster/bluearc/mm5/chrom
cd /cluster/data/mm5
foreach f (?{,?}/*.fa)
echo $f
cp $f /cluster/bluearc/mm5/chrom
end
ssh kki
cd /cluster/data/mm5/bed/multiz.2004-07-29/cons
mkdir run.split
cd run.split
set WINDOWS = /cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.sh
#!/bin/sh
PHAST=/cluster/bin/phast
FA_SRC=/cluster/bluearc/mm5/chrom
WINDOWS=/cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS
maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O mm5,rn3,hg17,canFam1,galGal2 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -B 5000
[ $? -eq 0 ] || exit 1
echo "Copying..."
cd /scratch/msa_split
for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
[ $? -eq 0 ] || exit 1
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
echo "Done" >> ${WINDOWS}/$c.done
'EOF'
# << for emacs
chmod +x doSplit.sh
rm -f jobList
foreach file (/cluster/bluearc/mm5/multiz.2004-07-29/maf/*.maf)
set c = $file:t:r
echo "doSplit.sh $file {check out line+ $WINDOWS/$c.done}" >> jobList
end
para create jobList
# 43 jobs
para try
para check
para push
#CPU time in finished jobs: 4354s 72.57m 1.21h 0.05d 0.000 y
#IO & Wait Time: 6102s 101.70m 1.70h 0.07d 0.000 y
#Average job time: 243s 4.05m 0.07h 0.00d
#Longest job: 728s 12.13m 0.20h 0.01d
#Submission to last job: 1300s 21.67m 0.36h 0.02d
cd ..
# generate conservation scoring using phastCons
ssh kk
cd /cluster/data/mm5/bed/multiz.2004-07-29/cons
mkdir run.cons
cd run.cons
# skip parameter estimation step: use parameters already estimated for
# hg17 (see makeHg17.doc)
cp /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements/ave.cons.mod /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements/ave.noncons.mod .
cat << 'EOF' > doPhastCons.sh
#!/bin/sh
mkdir -p /cluster/bluearc/mm5/phastCons/POSTPROBS /cluster/bluearc/mm5/phastCons/ELEMENTS
pref=`basename $1 .ss.gz`
chr=`echo $pref | awk -F\. '{print $1}'`
tmpfile=/scratch/phastCons.$$
zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 12 --target-coverage 0.15 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/mm5/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/mm5/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
EOF
chmod u+x doPhastCons.sh
rm -fr /cluster/bluearc/mm5/phastCons/POSTPROBS /cluster/bluearc/mm5/phastCons/ELEMENTS
rm -f jobs.lst
for f in /cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs.lst ; done
# run cluster job
para create, ...
# took about 10 minutes
# combine predictions and transform scores to be in 0-1000 interval
# do in a way that avoids limits on numbers of args
find /cluster/bluearc/mm5/phastCons/ELEMENTS -name "*.bed" > files
rm -f splitfiles* all.raw.bed
split files splitfiles
for s in splitfiles* ; do awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' `cat $s` >> all.raw.bed ; done
/cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
rm files splitfiles*
hgLoadBed mm5 phastConsElements all.bed
# check coverage
featureBits mm5 phastConsElements
#135605549 bases of 2615483787 (5.185%) in intersection
# This should be close enough. If necessary, you can rerun the
# steps above with a different target coverage. When hitting the
# target is important, you may want to perform several iterations
# using a representative subset of the entire dataset (in human, chr1
# seems to work pretty well)
# set up wiggle
mkdir -p /cluster/bluearc/mm5/phastCons/wib
cat << 'EOF' > doWigAsciiToBinary.sh
#!/bin/sh
chr=$1
zcat `ls /cluster/bluearc/mm5/phastCons/POSTPROBS/$chr.*.pp.gz | sort -t\. -k2,2n` | wigAsciiToBinary -chrom=$chr -wibFile=/cluster/bluearc/mm5/phastCons/wib/${chr}_phastCons stdin
EOF
chmod u+x doWigAsciiToBinary.sh
rm -f jobs2.lst
for chr in `ls /cluster/bluearc/mm5/phastCons/POSTPROBS | awk -F\. '{print $1}' | sort -u` ; do echo doWigAsciiToBinary.sh $chr >> jobs2.lst ; done
# run a little wigAsciiToBinary cluster job
ssh kk, etc.
# copy wibs and wigs from bluearc
rsync -av /cluster/bluearc/mm5/phastCons/wib .
# load track
hgLoadWiggle mm5 phastCons -pathPrefix=/gbdb/mm5/phastCons/wib \
wib/chr*_phastCons.wig
mkdir -p /gbdb/mm5/phastCons/wib
rm -f /gbdb/mm5/phastCons/wib/chr*phastCons.wib
ln -s /cluster/data/mm5/bed/multiz.2004-07-29/cons/run.cons/wib/*.wib /gbdb/mm5/phastCons/wib
chmod 775 . wib /gbdb/mm5/phastCons /gbdb/mm5/phastCons/wib
chmod 664 wib/*.wib
# move postprobs over and clean up bluearc
rsync -av /cluster/bluearc/mm5/phastCons/POSTPROBS .
# (people sometimes want the raw scores)
rm -r /cluster/bluearc/mm5/phastCons/ELEMENTS /cluster/bluearc/mm5/phastCons/POSTPROBS /cluster/bluearc/mm5/phastCons/wib
# load data for track name "multiz5way"
# load multiz maf tables
ssh hgwdev
cd /cluster/data/mm5/bed/multiz.2004-07-29
set mafDir = /gbdb/mm5/multiz5way/maf
set table = multiz5way
mkdir -p $mafDir/$table
ln -s `pwd`/maf/*.maf $mafDir/$table
cd maf
hgLoadMaf mm5 -warn multiz5way -pathPrefix=$mafDir/$table
# load blastz maf tables
# TODO: change mafWiggle to use db names instead of species names
# in speciesOrder
ssh hgwdev
cd /cluster/data/mm5/bed
ln -s multiz.2004-07-29 multiz5way
cat > multiz5way/loadMaf.csh << 'EOF'
set mafDir = /gbdb/mm5/multiz5way/maf
foreach s (rn3 hg17 canFam1 galGal2)
set O = `echo "select genome from dbDb where name='$s'" | \
hgsql -s -h genome-testdb hgcentraltest`
set o = $O:l
set table = ${o}_netBlastz
mkdir -p $mafDir/$table
ln -s `pwd`/blastz.$s/mafNet/*.maf $mafDir/$table
echo $o
hgLoadMaf mm5 -warn ${o}_netBlastz -pathPrefix=$mafDir/$table
end
'EOF'
# <<EOF for emacs
csh multiz5way/loadMaf.csh >&! multiz5way/loadMaf.log &
# track multiz5way
# shortLabel Conservation
# longLabel Rat/Human/Dog/Chicken Multiz Alignments & PhyloHMM Cons
# group compGeno
# priority 149
# visibility pack
#color 0, 10, 100
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# wiggle phastCons
# yLineOnOff Off
-# autoScaleDefault Off
+# autoScale Off
# pairwise netBlastz
# speciesOrder rat human dog chicken
# MULTIZ DOWNLOAD FILES (DONE kate 2004-08-03)
ssh kksilo
cd /cluster/data/mm5/bed/multiz5way
# multiz
mkdir gzMaf
foreach f (maf/*.maf)
gzip -c $f > gzMaf/$f:t.gz
echo $f
end
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/multiz5way
cd /usr/local/apache/htdocs/goldenPath/mm5/multiz5way
mv /cluster/data/mm5/bed/multiz5way/gzMaf/* .
rmdir /cluster/data/mm5/bed/multiz5way/gzMaf
md5sum *.gz > md5sum.txt
# make a README.txt file
# PHASTCONS SCORES DOWNLOADABLES (DONE 10/11/04 angie)
ssh kksilo
mkdir /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2
cd /cluster/data/mm5/bed/multiz5way/cons/run.cons/POSTPROBS
foreach chr (`awk '{print $1;}' /cluster/data/mm5/chrom.sizes`)
echo $chr
zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` \
| gzip -c \
> /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2/$chr.gz
end
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/mm5/phastCons
# Doh! /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2 is 8.6G now -- too much
# to dump on hgwdev's / which is at 94%. Instead of doing this:
#mv /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2 .
# make symbolic links:
mkdir /usr/local/apache/htdocs/goldenPath/mm5/phastCons/mzRn3Hg17Cf1Gg2
cd /usr/local/apache/htdocs/goldenPath/mm5/phastCons/mzRn3Hg17Cf1Gg2
ln -s /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2/* .
md5sum *.gz > md5sum.txt
# make a README.txt.
# PREP FOR LIFTOVER CHAINS TO THIS ASSEMBLY (2004-08-02 kate)
# split into 3K chunks
ssh kksilo
cd /cluster/data/mm5
set liftDir = /iscratch/i/mm5/liftOver/liftSplit
mkdir -p $liftDir
cd $liftDir
mkdir -p split lift
cat > split.csh << 'EOF'
set liftDir = /iscratch/i/mm5/liftOver/liftSplit
cd /cluster/data/mm5
foreach n (`ls ?{,?}/*.fa`)
set d = $n:h
set c = $n:t:r
echo $c
faSplit -lift=$liftDir/lift/$c.lft size \
/cluster/data/mm5/$d/$c.fa -oneFile 3000 $liftDir/split/$c
end
'EOF'
# << for emacs
csh split.csh >&! split.log &
tail -100f split.log
ssh kkr1u00
iSync
# LOAD GENEID GENES (DONE 8/2/04 Fan)
# reloaded 3/16/04 with -gtf instead of -exon=CDS (nec. now! for stop_codon)
mkdir -p /cluster/data/mm5/bed/geneid/download
cd /cluster/data/mm5/bed/geneid/download
foreach f (/cluster/data/mm5/*/chr*.fa)
set chr = $f:t:r
wget \
http://genome.imim.es/genepredictions/M.musculus/mmMay2004/geneid_v1.2/$chr.gtf
wget \
http://genome.imim.es/genepredictions/M.musculus/mmMay2004/geneid_v1.2/$chr.prot
end
# Add missing .1 to protein id's
foreach f (*.prot)
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
end
cd ..
ldHgGene -genePredExt -gtf mm5 geneid download/*.gtf
hgPepPred mm5 generic geneidPep download/*-fixed.prot
# PRODUCING GENSCAN PREDICTIONS (DONE 08-03-04 Fan)
ssh hgwdev
mkdir /cluster/data/mm5/bed/genscan
cd /cluster/data/mm5/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh kki
cd /cluster/data/mm5/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
rm -f genome.list
touch genome.list
foreach f ( `ls -1S /cluster/data/mm5/*/chr*_*/chr*_?{,?}.fa.masked` )
egrep '[ACGT]' $f > /dev/null
if ($status == 0) echo $f >> genome.list
end
wc -l genome.list
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para create jobList
para try, check, push, check, ...
# Completed: 638 of 639 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 386282s 6438.03m 107.30h 4.47d 0.012 y
# IO & Wait Time: 3735s 62.25m 1.04h 0.04d 0.000 y
# Average job time: 611s 10.19m 0.17h 0.01d
# Longest job: 22687s 378.12m 6.30h 0.26d
# Submission to last job: 33710s 561.83m 9.36h 0.39d
# If there are crashes, diagnose with "para problems".
# If a job crashes due to genscan running out of memory, re-run it
# manually with "-window=1200000" instead of "-window=2400000".
/cluster/bin/i386/gsBig /cluster/data/mm5/19/chr19_1/chr19_1.fa.masked \
gtf/chr19_1.fa.gtf -trans=pep/chr19_1.fa.pep -subopt=subopt/chr19_1.fa.bed \
-exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat \
-tmp=/tmp -window=1200000
# Convert these to chromosome level files as so:
ssh kksilo
cd /cluster/data/mm5/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/mm5/bed/genscan
# Reloaded without -genePredExt 1/6/05:
ldHgGene mm5 -gtf genscan genscan.gtf
hgPepPred mm5 generic genscanPep genscan.pep
hgLoadBed mm5 genscanSubopt genscanSubopt.bed
# MITOPRED DATA FOR HGGENE (DONE 8/10/04 angie)
ssh hgwdev
mkdir /cluster/data/mm5/bed/mitopred
cd /cluster/data/mm5/bed/mitopred
wget http://mitopred.sdsc.edu/data/mus_30.out
perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' mus_30.out > mitopred.tab
cat > mitopred.sql << '_EOF_'
# Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
CREATE TABLE mitopred (
name varchar(10) not null, # SwissProt ID
confidence varchar(8) not null, # Confidence level
#Indices
PRIMARY KEY(name(6))
);
'_EOF_'
# << this line makes emacs coloring happy
hgsql mm5 < mitopred.sql
hgsql mm5 -e 'load data local infile "mitopred.tab" into table mitopred'
# STS MARKERS TRACK (RE-BUILT - 2004-08-24- Fan)
ssh kksilo
mkdir -p /cluster/data/mm5/bed/STSmarkers/downloads
cd /cluster/data/mm5/bed/STSmarkers/downloads
# these files appear to be new almost every day
wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts
wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
# these map files appear to be old, 2002 Data
wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/*
# Picks up files:
# 345184 Feb 20 2002 10090.MGD.txt
# 173294 Jun 27 2002 10090.WI_Mouse_Genetic.txt
# 240637 Jun 27 2002 10090.WI_Mouse_YAC.txt
# 390088 Jun 27 2002 10090.Whitehead-MRC_RH.txt
# If these files have not been changing, then no need to worry about
# them. We are just picking them up to see if they have changed
# since the last time we worked on this.
# these reports from jax.org appear to be changing daily
wget --timestamping \
ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt
wget --timestamping \
ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt
wget --timestamping \
ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt
# compare them with previous versions. Before this these were
# in /cluster/store5/mouseMarker/orig
# these newly picked up files:
sum -r 10090*
# 48882 338 10090.MGD.txt
# 24176 381 10090.Whitehead-MRC_RH.txt
# 62367 170 10090.WI_Mouse_Genetic.txt
# 50616 235 10090.WI_Mouse_YAC.txt
sum -r *.rpt
# 21267 4442 MRK_Dump2.rpt
# 51274 3743 MRK_Sequence.rpt
# 35293 2315 PRB_PrimerSeq.rpt
sum -r UniSTS*
# 40884 10502 UniSTS.aliases
# 14407 2931 UniSTS_mouse.sts
# the previous copies
cd /cluster/store5/mouseMarker/orig
sum -r 10090*
# 48882 338 10090.MGD.txt
# 24176 381 10090.Whitehead-MRC_RH.txt
# 62367 170 10090.WI_Mouse_Genetic.txt
# 50616 235 10090.WI_Mouse_YAC.txt
sum -r *.rpt
# 36880 4160 MRK_Dump2.rpt
# 02447 3132 MRK_Sequence.rpt
# 57914 2220 PRB_PrimerSeq.rpt
sum -r UniSTS*
# 36201 8843 UniSTS.aliases
# 58524 970 UniSTS_mouse.alias
# 42464 2291 UniSTS_mouse.sts
# back to our work area, update the bed file
# to do this we need a new UniSTS_mouse.alias file
# it is created by a combination of information from several
# of the above files ! AND ! the previous stsInfoMouse.bed file
cp /cluster/data/mm4/bed/STSmarkers/downloads/*.sh . -p
cp /cluster/data/mm4/bed/STSmarkers/downloads/*.pl . -p
# This process has been captured in the script:
# /cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh
# which uses a couple of perl scripts in that same directory.
# briefly it is:
# cd /cluster/data/mm5/bed/STSmarkers/downloads
# ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0
# grep MGI: UniSTS.aliases > MGI.aliases
# ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \
# stsInfoAliases.txt
# ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases
# cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \
# | sort -n > UniSTS_mouse.alias
/cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh
# with that, we can create a new stsInfoMouse.bed file:
bash
cd /cluster/data/mm5/bed/STSmarkers
/cluster/store5/mouseMarker/code/updateBed.pl \
/cluster/store5/mouseMarker/stsInfoMouse.bed \
downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \
downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \
downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile
# Yontao updated /cluster/store5/mouseMarker/code/cleanInfo.pl 8/10/04
/cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed
# copy the stsInfoMouse.bed file from working dir to the marker info storage folder.
# added 2 new steps by Yontao
mv /cluster/store5/mouseMarker/stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed_mm3
cp -p stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed
# comparing to Mm4, this file was used there:
# /cluster/store6/mm4/bed/STSmarkers
# a wc of it shows:
# 56406 786036 6425721 stsInfoMouse.bed
# Now we have:
# 58488 790056 6602318 stsInfoMouse.bed
# and from that, create new primer fa, epcr, etc:
/cluster/store5/mouseMarker/code/luConvertPrimerToFa \
stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info
# the mouseC.fa file will be empty
wc mouse?.*
# 0 0 0 mouseC.fa
# 286740 286686 6474893 mouseP.fa
# 32232 161234 2044810 mouseP.info
# 318972 447920 8519703 total
# the equivalent Mm4 versions:
# 0 0 0 mouseC.fa
# 258307 258245 5815248 mouseP.fa
# 29906 149545 1890926 mouseP.info
# copy the primers over to the bluearc for the kluster run
cp -p mouseP.fa /cluster/bluearc/scratch/mus/mm5
cp -p mouseP.info /cluster/bluearc/scratch/mus/mm5
# CLUSTER RUN FOR THE STS PRIMERS
ssh kk
mkdir -p /cluster/data/mm5/bed/STSmarkers/primer
mkdir -p /cluster/data/mm5/bed/STSmarkers/ePCR
cd /cluster/data/mm5/bed/STSmarkers/primer
# the mouseP.fa comes from above
echo "/cluster/bluearc/scratch/mus/mm5/mouseP.fa" > primers.lst
# PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE.
cat << '_EOF_' > template
#LOOP
/cluster/bin/i386/blat.2 $(path1) $(path2) -ooc=/scratch/hg/h/mouse11.ooc -minMatch=1 -minScore=0 -minIdentity=80 -oneOff {check out line+ primers.out/$(root1).psl}
#ENDLOOP
'_EOF_'
mkdir primers.out
/cluster/bin/scripts/splitContigList -mouse -scratch \
/cluster/bluearc/scratch/mus/mm5/maskedContigs 1
/cluster/bin/i386/gensub2 contig.lst primers.lst template jobList
para create jobList
para try
para check
para push
... etc ...
# Completed: 639 of 639 jobs
# CPU time in finished jobs: 334066s 5567.76m 92.80h 3.87d 0.011 y
# IO & Wait Time: 72565s 1209.42m 20.16h 0.84d 0.002 y
# Average job time: 636s 10.61m 0.18h 0.01d
# Longest job: 800s 13.33m 0.22h 0.01d
# Submission to last job: 1090s 18.17m 0.30h 0.01d
# on the file server
ssh kksilo
cd /cluster/data/mm5/bed/STSmarkers/primer
/cluster/bin/i386/pslSort dirs primers.psl temp primers.out
rmdir temp
# comparing results to Mm4:
wc primers.psl
# 5719969 120119288 590806241 primers.psl
# Mm4 wc primers.psl /cluster/data/mm4/bed/STSmarkers/primer/primers.psl
# 5745617 120657896 592135728 primers.psl
# another kluster run
ssh kk
cd /cluster/data/mm5/bed/STSmarkers/ePCR
ls -1S /cluster/bluearc/scratch/mus/mm5/maskedContigs > contig.lst
# Edit this list to get full path names!
mkdir epcr.out
cat << '_EOF_' > template
#LOOP
/cluster/bin/scripts/luRunEpcr $(path1) $(path2) epcr.out/$(num2).epcr
#ENDLOOP
'_EOF_'
# the mouseP.info was created above
echo "/cluster/bluearc/scratch/mus/mm5/mouseP.info" > epcr.lst
gensub2 epcr.lst contig.lst template jobList
para create jobList
para try
para check
para push
... etc ...
# Completed: 639 of 639 jobs
# CPU time in finished jobs: 146365s 2439.41m 40.66h 1.69d 0.005 y
# IO & Wait Time: 67691s 1128.19m 18.80h 0.78d 0.002 y
# Average job time: 335s 5.58m 0.09h 0.00d
# Longest job: 427s 7.12m 0.12h 0.00d
# Submission to last job: 485s 8.08m 0.13h 0.01d
ssh hgwdev
cd /cluster/data/mm5/bed/STSmarkers/ePCR
# all those results become all.epcr
cat epcr.out/*.epcr > all.epcr
# comparing results to Mm4:
wc *.epcr
# 55677 222708 2945623 all.epcr
wc /cluster/store6/mm4/bed/STSmarkers/ePCR/*.epcr
# 74705 298820 3971712 /cluster/store6/mm4/bed/STSmarkers/ePCR/all.epcr
cd /cluster/data/mm5/bed/STSmarkers/primer
/cluster/bin/scripts/filterSTSPrimers \
-mouse ../stsInfoMouse.bed primers.psl \
../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat
# The output should show an increasing count:
# Reading name info
# Reading primer info
# Processing file
# 100000
# 200000
# 300000
# ...
# 5700000
# Determining ePCR not found
#
wc primers.psl.filter.blat
# 33476 702996 3442402 primers.psl.filter.blat
# Mm4: wc primers.psl.filter.blat
# 32729 687309 3331894 primers.psl.filter.blat
# create accession_info.rdb (chrM added to Terry's script for mouse)
touch empty_sequence.inf
/cluster/bin/scripts/compileAccInfo -mouse \
/cluster/data/mm5 empty_sequence.inf
# works, despite two apparent errors:
# cat: /cluster/data/mm5/11/chr11_random.agp: No such file or directory
# cat: /cluster/data/mm5/M/chrM_random.agp: No such file or directory
mv accession_info.rdb accession_info.rdb.tmp
/cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > \
accession_info.rdb
rm accession_info.rdb.tmp
# comparing results to Mm4:
# Mm5 wc accession_info.rdb
# 131845 1450299 9681940 accession_info.rdb
# Mm4 wc accession_info.rdb
# 86935 956289 6374930 accession_info.rdb
#
# 219652 1885501 11875772 total
# wc /cluster/data/mm5/?/*.agp /cluster/data/mm5/??/*.agp
# 252515 2152346 13568720 total
# creates epcr.not.found.nomatch and epcr.not.found.psl
/cluster/bin/scripts/epcrToPsl -mouse \
epcr.not.found ../mouseP.info \
accession_info.rdb /cluster/data/mm5
# Comparing results to Mm4:
# Mm5 wc epcr*
# 463 1852 17080 epcr.not.found
# 61 732 5845 epcr.not.found.nomatch
# 402 8442 39011 epcr.not.found.psl
# Mm4 wc epcr*
# 328 1312 12011 epcr.not.found
# 57 684 5474 epcr.not.found.nomatch
# 266 5586 25711 epcr.not.found.psl
# there is a single error being propagated here from the file
# /cluster/store5/mouseMarker/stsInfoMouse.bed which has an error
# at line 53958:
62943 D2J3 91947 D2J3 CAACCAGCTCAC
CAACCAGCTCAC 1825, 1025BP 0 MUS MUSCULUS
# The value '1825,' is incorrect. Should be a small integer here.
# To work around this problem, I'm manually removing its effects
# from the epcr.not.found.psl file, where it has now become four bad
# lines:
# 24 0 0 0 1 1801 1 1789 + 27119
1825 0 1825chr11_16 0 1115413 1117226 2 12,12, 0,1813,
1115413,1117214,
# 24 0 0 0 1 1801 1 1789 + 27119
1825 0 1825chr11_16 0 1115413 1117226 2 12,12, 0,1813,
1115413,1117214,
216a219,220
# 24 0 0 0 1 1801 1 1789 + 62943
1825, 0 1825,chr11_16 0 1115413 1117226 2 12,12,
0,1813, 1115413,1117214,
# 24 0 0 0 1 1801 1 1789 + 62943
1825, 0 1825,chr11_16 0 1115413 1117226 2 12,12,
0,1813, 1115413,1117214,
# taking those four lines out.
cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
# lift those primers (added chrM to this lifter script for mouse)
# creates primers.psl.filter.lifted
/cluster/bin/scripts/lifter -mouse -psl \
/cluster/data/mm5 primers.psl.filter
# wc primers.psl.filter.lifted
# 33691 707511 3601164 primers.psl.filter.lifted
# create primers.psl.filter.lifted.initial
bash
PATH=/cluster/bin/scripts:$PATH
/cluster/bin/scripts/extractPslInfo primers.psl.filter.lifted
# wc primers.psl.filter.lifted.initial
# 33689 202134 1799016 primers.psl.filter.lifted.initial
# create primers.psl.filter.lifted.initial.acc
/cluster/bin/scripts/findAccession -agp \
-mouse primers.psl.filter.lifted.initial /cluster/data/mm5
# wc primers.psl.filter.lifted.initial.acc
# 33689 235823 2158029 primers.psl.filter.lifted.initial.acc
# this needs to be -rat as that specifies how to scan the
# stsInfoMouse.bed file and it does not work if you use -mouse
/cluster/bin/scripts/getStsId -rat \
../stsInfoMouse.bed primers.psl.filter.lifted.initial.acc \
> primers.initial.acc.trans
# wc primers.initial.acc.trans
# 33689 235823 1834889 primers.initial.acc.trans
sort -k 4n primers.initial.acc.trans > primers.final
rm primers.psl.filter.lifted.initial.acc primers.initial.acc.trans
# comparing results to Mm4:
# Mm5 wc primers.final
# 33689 235823 1834889 primers.final
# Mm4 wc primers.final
# 32983 230881 1771293 primers.final
cd /cluster/data/mm5/bed/STSmarkers
# stsMarkers.final is empty for mouse
touch stsMarkers.final dummy
bash
PATH=/cluster/bin/scripts:$PATH \
/cluster/bin/scripts/combineSeqPrimerPos \
stsMarkers.final primer/primers.final > stsMarkers_pos.rdb
# Comparing results to Mm4
# Mm5 wc stsMarkers_pos.rdb
# 32085 224595 1862816 stsMarkers_pos.rdb
# Mm4 wc stsMarkers_pos.rdb
# 31270 218890 1869417 stsMarkers_pos.rdb
/projects/cc/hg/ytlu/bin/script/perl/createStsBed \
stsInfoMouse.bed stsMarkers_pos.rdb 500 > stsMapMouse.bed
# wc stsMapMouse.bed
# 29069 301535 2123622 stsMapMouse.bed
# loading STS markers tables
ssh hgwdev
cd /cluster/data/mm5/bed/STSmarkers
cp -p /cluster/store6/mm4/bed/STSmarkers/ucscAlias.pl .
bash
./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
# wc ucscStsAlias.tab
# 126624 379859 3037850 ucscStsAlias.tab
hgsql -e "drop table stsAlias;" mm5
hgsql mm5 < ~/kent/src/hg/lib/stsAlias.sql
hgsql -e \
'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm5
hgsql -e "drop table stsMapMouseNew;" mm5
hgsql mm5 < ~/kent/src/hg/lib/stsMapMouseNew.sql
hgsql -e \
'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm5
hgsql -e "drop table stsInfoMouseNew;" mm5
hgsql mm5 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
hgsql -e \
'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm5
hgLoadPsl -nobin -table=all_sts_primer mm5 primer/primers.psl.filter.lifted
# load primer sequences
mkdir /gbdb/mm5/stsMarker
ln -s /cluster/data/mm5/bed/STSmarkers/mouseP.fa \
/gbdb/mm5/stsMarker/mouseP.fa
# PLEASE NOTE THAT THE -replace option is used because this is a rebuild,
# otherwise there will be a problem that the seq and extFile tables
# will be out of sync.
hgLoadSeq -replace mm5 /gbdb/mm5/stsMarker/mouseP.fa
# Adding /gbdb/mm5/stsMarker/mouseP.fa
# 32232 sequences
# DONE - 2004-08-24 17:02
# QA repush 2006-02-08 seq table to remove old STS sequences with no extFile reference (Jen)
# Heather found the problem on rr. RR table matched dev and beta was correct, so no
# joinerCheck errors for the mismatch were flagged for review.
# BLASTZ RAT RN3 (RE-DONE - 2004-08-30 - Fan)
# !!! PLEASE NOTE AS OF 9/2/04, THE 8/30/04-8/31/04 REBUILD OF BLASTZ, CHAIN, AND NET
# FOR MM5-RN3 IS NO LONGER USED FOR MM5. THE OLD MM5-RN3 CHAIN AND NET BUILD OF 7/14/04
# IS REVERSE PUSHED FROM RR BACK TO HGWDEV.
# Reason for rebuild is to use more stringent blastz parameters to reduce size
# of output files.
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=50000
# scoring matrix
# BLASTZ_Q=/cluster/data/blastz/mus_rat.q
# MAKE SURE TO INCLUDE THE RESCORE STEP TO CORRECT A BLASTZ PROBLEM.
# (axtRescore -scoreScheme=/cluster/data/blastz/mus_rat.q ...)
ssh kk
mkdir -p /cluster/data/mm5/bed/blastz.rn3.2004-08-29
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
cat << '_EOF_' > DEF
# rat vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=50000
BLASTZ_T=2
# scoring matrix
BLASTZ_Q=/cluster/data/blastz/mus_rat.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Rat
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.rn3.2004-08-29
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
bash
source ./DEF
# script copied over from /cluster/data/hg17/jkStuff/BlastZ_run0.sh
# it is a generic script and works for any assembly
cp -p /cluster/data/hg17/jkStuff/BlastZ_run0.sh \
/cluster/data/mm5/jkStuff/BlastZ_run0.sh
/cluster/data/mm5/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# Completed: 41943 of 41943 jobs
# CPU time in finished jobs: 4656727s 77612.11m 1293.54h 53.90d 0.148 y
# IO & Wait Time: 460782s 7679.70m 128.00h 5.33d 0.015 y
# Average job time: 122s 2.03m 0.03h 0.00d
# Longest job: 2042s 34.03m 0.57h 0.02d
# Submission to last job: 8307s 138.45m 2.31h 0.10d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh
# fixup machine check, should be kki, not kk
cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh \
/cluster/data/mm5/jkStuff/BlastZ_run1.sh
vi /cluster/data/mm5/jkStuff/BlastZ_run1.sh
/cluster/data/mm5/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 1293s 21.54m 0.36h 0.01d 0.000 y
# IO & Wait Time: 2113s 35.22m 0.59h 0.02d 0.000 y
# Average job time: 10s 0.17m 0.00h 0.00d
# Longest job: 54s 0.90m 0.01h 0.00d
# Submission to last job: 719s 11.98m 0.20h 0.01d
# NOTE: BlastZ_run2.sh is not used here. Instead Angie's approach
# (using Rescore) is adopted here.
# third run: lav -> axt
# NOTE: use axtRescore here because we used a non-default BLASTZ_Q matrix
# and abridged repeats (Penn State's restore_rpts program rescores with
# default matrix, oops).
ssh kki
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
# mv old subdirectories
mv axtChrom axtChrom.old
mv run.2 run.2.old
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
set path = (/cluster/bin/x86_64 $path)
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin \
/iscratch/i/mus/mm5/softNib /iscratch/i/rn3/bothMaskedNibs stdout \
| axtRescore -scoreScheme=/cluster/data/blastz/mus_rat.q stdin stdout \
| axtSort stdin ../../axtChrom/$chr.axt
axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 498s 8.31m 0.14h 0.01d 0.000 y
# IO & Wait Time: 3367s 56.11m 0.94h 0.04d 0.000 y
# Average job time: 90s 1.50m 0.02h 0.00d
# Longest job: 299s 4.98m 0.08h 0.00d
# Submission to last job: 685s 11.42m 0.19h 0.01d
# CHAIN RAT BLASTZ (RE-DONE 8/30/04 Fan)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/mus_rat.q \
-minScore=5000 $1 \
/iscratch/i/mus/mm5/softNib \
/iscratch/i/rn3/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 3145s 52.42m 0.87h 0.04d 0.000 y
# IO & Wait Time: 989s 16.48m 0.27h 0.01d 0.000 y
# Average job time: 96s 1.60m 0.03h 0.00d
# Longest job: 280s 4.67m 0.08h 0.00d
# Submission to last job: 1219s 20.32m 0.34h 0.01d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=5000 /tmp/score.$f:t:r
echo ""
end
# Load chains into database
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain mm5 ${c}_chainRn3 $i
end
featureBits mm5 chainRn3Link
# 1677291680 bases of 2615483787 (64.129%) in intersection
nice featureBits hg17 chainRn3Link
# 982059013 bases of 2866216770 (34.263%) in intersection
# NET RAT BLASTZ (RE-DONE 8/31/04 Fan)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin hNoClass.net
# memory usage 1710399488, utime 7360 s/100, stime 1891
# The above adapted from Angie's approach
# The netClass operation requires an "ancientRepeat" table to exist
# in either mm5 or rn3. So, create the table:
ssh hgwdev
mkdir -p /cluster/data/mm5/bed/ancientRepeat
cd /cluster/data/mm5/bed/ancientRepeat
# mysqldump needs write permission to this directory
# and you need to use your read/write enabled user with password
chmod 777 .
hgsqldump --all --tab=. mm4 ancientRepeat
chmod 775 .
hgsql mm5 < ancientRepeat.sql
mysqlimport -u<r/w user> -p<r/w pass> mm5 ancientRepeat.txt
# This is a hand curated table obtained from Arian.
# The ancientRepeat table was loaded during the first build of NET RAT BLASTZ.
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain
time netClass hNoClass.net mm5 rn3 rat.net \
-tNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInRat \
-qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
# 491.210u 96.250s 12:27.37 78.6% 0+0k 0+0io 249pf+0w
# If things look good do
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain
rm -r hNoClass.net
# Make a 'syntenic' subset of these with
time netFilter -syn rat.net > ratSyn.net
# 216.290u 34.220s 4:27.60 93.6% 0+0k 0+0io 119pf+0w
# Load the nets into database
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain
netFilter -minGap=10 rat.net | hgLoadNet mm5 netRn3 stdin
netFilter -minGap=10 ratSyn.net | hgLoadNet mm5 syntenyNetRn3 stdin
# check results
# featureBits mm4 netRn3
# 96806381 bases of 95076222 (101.820%) in intersection
# featureBits mm5 netRn3
# 2601384082 bases of 2615483787 (99.461%) in intersection
# featureBits mm4 syntenyNetRn3
# 96760405 bases of 95076222 (101.771%) in intersection
# featureBits mm5 syntenyNetRn3
# 2575035774 bases of 2615483787 (98.454%) in intersection
# Add entries for net and chain to mouse/mm5 trackDb
# make net
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain
mkdir ratNet
time netSplit rat.net ratNet
# 218.990u 29.290s 4:27.86 92.6% 0+0k 0+0io 190pf+0w
# extract axts from net
mkdir ../axtNet
foreach n (ratNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt ratNet/$c.net chain/$c.chain \
/cluster/data/mm5/nib \
/cluster/data/rn3/nib ../axtNet/$c.axt
echo "Complete: $c.net -> axtNet/$c.axt"
end
# sort axt's and convert to maf format
mkdir ../mafNet
foreach f (../axtNet/chr*.axt)
set c=$f:t:r
echo $c.axt
mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt
axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt
rm ../axtNet/$c.unsorted.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/mm5/chrom.sizes /cluster/data/rn3/chrom.sizes \
../mafNet/$c.maf -tPrefix=mm5. -qPrefix=rn3.
end
ssh hgwdev
mkdir -p /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtBest
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtNet
mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet
cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet
cd /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet
nice gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)
# Convert those axt files to psl
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestRn3.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestRn3.psl
echo "Done: ${c}_blastzBestRn3.psl"
end
# Load tables
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/pslBest
bash
for I in chr*BestRn3.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I}
echo "done ${I}"
done
# check results
# featureBits mm5 blastzBestRn3
# 1674716868 bases of 2615483787 (64.031%) in intersection
# featureBits mm4 blastzBestRn3
# 1780774716 bases of 2627444668 (67.776%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/mm5/axtBest/Rn3
cd /gbdb/mm5/axtBest/Rn3
rm *
ln -s /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtNet/chr*.axt .
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtNet
rm -f axtInfoInserts.sql
foreach f (/gbdb/mm5/axtBest/Rn3/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
# these axtInfo file entries should be appended to the table,
# not replacing it. The previous hg17 entries are needed -- bob kuhn
hgsql mm5 -e 'drop table mm5.axtInfo;'
hgsql mm5 < ~/kent/src/hg/lib/axtInfo.sql
hgsql mm5 < axtInfoInserts.sql
cd /cluster/data/mm5/bed
rm blastz.rn3
ln -s blastz.rn3.2004-08-29 blastz.rn3
# BLASTZ RN3 CLEAN UP (RE-DONE - 2004-08-31 - Fan)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
nice rm -rf raw
nice rm axtChain/run1/chain/*
# do the following later, after rn3-mm5 net and chain done.
nice gzip {axt,psl}Chrom/* lav/*/* axtChain/{all.chain,*.net} &
# The above line done on 9/7/04. Fan.
:
# CREATE CYTOBAND TRACK (DONE - 2004-09-7 - Fan)
# Should be done after NCBI updated their MapViewer to the latest release.
ssh hgwdev
cd /cluster/data/mm5
mkdir cytoBand
cd cytoBand
# Get file from NCBI
wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/maps/mapview/BUILD.33/ideogram.gz
gunzip ideogram
# Create bed file
/cluster/bin/scripts/createNcbiCytoBand ideogram
# Load the bed file
hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm5 cytoBand cytoBand.bed
# Make cytoBandIdeo track for ideogram gif on hgTracks page.
# For mouse cytoBandIdeo is just a replicate of the cytoBand track.
# Make the cytoBand track (above) and then:
echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" | hgsql mm5
# REBUILD CYTOBAND TRACK (DONE - 2004-09-15 - Fan)
# NCBI updated the ideogram.gz file and also changed its format,
# adding a new density field after stain.
ssh hgwdev
cd /cluster/data/mm5
mv cytoBand cytoBand.old
mkdir cytoBand
cd cytoBand
# Get file from NCBI
wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/maps/mapview/BUILD.33/ideogram.gz
gunzip ideogram
# Create bed file
/cluster/bin/scripts/createNcbiCytoBand ideogram
# Load the bed file
hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm5 cytoBand cytoBand.bed
# Make cytoBandIdeo track for ideogram gif on hgTracks page.
# For mouse cytoBandIdeo is just a replicate of the cytoBand track.
# First, drop the cytoBandIdeo table in mm5.
# Make the cytoBand track (above) and then:
echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;"|hgsql mm5
# ADD MAP CONTIGS TRACK (DONE - 2004-09-07 - Fan)
ssh hgwdev
mkdir -p /cluster/data/mm5/bed/ctgPos
cd /cluster/data/mm5/bed/ctgPos
# hgCtgPos uses the lift files... but mouse lift files are for the
# 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs
# from the assembly. (In the future, we should go with the NT's!)
# So... just for this release, go straight from the seq_contig.md
# to the table def'n: contig, size, chrom, chromStart, chromEnd
cat << '_EOF_' > parseSeqContig.pl
#!/usr/local/bin/perl -w
use strict;
while (<>) {
if (/^\d+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(N[TC]_\d+)\s+(\S+)\s+contig\s+\S+\s+\S+\s*$/i) {
my $chr=$1; my $start=$2; $start -= 1; my $end=$3; my $ctg=$5;
if ($chr !~ /N/ ) {
print "$ctg\t" . ($end-$start) . "\tchr$chr\t$start\t$end\n";
}
}
}
'_EOF_'
chmod +x parseSeqContig.pl
./parseSeqContig.pl ../../ncbi/seq_contig.md > ctgPos.tab
hgsql mm5 < ~/kent/src/hg/lib/ctgPos.sql
echo "load data local infile 'ctgPos.tab' into table ctgPos" | hgsql mm5
# Note: the info is there in seq_contig.md to also do the _random's,
# but we'd have to do some more work: duplicate the gaps of 50000 between
# contigs for all _random's except chrUn_random (1000 between).
# featureBits mm5 ctgPos
# 2557516950 bases of 2615483787 (97.784%) in intersection
# featureBits mm4 ctgPos
# 2554101163 bases of 2627444668 (97.209%) in intersection
# featureBits mm3 ctgPos
# 2500661074 bases of 2505900260 (99.791%) in intersection
# RELOAD MAP CONTIGS TRACK (DONE - 2005-Mar-03 - Heather)
# /cluster/data/mm5/ncbi/seq_contig.md contains more than just C57BL/6J.
# Filter those out.
ssh hgwdev
cd /cluster/data/mm5/bed/ctgPos
cp /cluster/data/mm5/ncbi/seq_contig.md .
grep C57BL seq_contig.md > contig.C57BL
# contig.C57BL has 41061 lines (252 lines fewer than seq_contig.md)
./parseSeqContig.pl contig.C57BL > ctgPosFiltered.tab
# ctgPosFiltered.tab has 302 rows (227 fewer than ctgPos.tab)
echo "delete from ctgPos" | hgsql mm5
echo "load data local infile 'ctgPosFiltered.tab' into table ctgPos" | hgsql mm5
# echo "update ctgPos set chrom = 'chrM' where chrom = 'chrMT'" | hgsql mm5
# featureBits mm5 ctgPos
# 2557064874 bases of 2615483787 (97.766%) in intersection
# FUGU BLAT ALIGNMENTS (DONE 2004-09-08 Fan)
ssh kk
mkdir /cluster/data/mm5/bed/blatFr1
cd /cluster/data/mm5/bed/blatFr1
ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
ls -1S /scratch/mus/mm5/softNib/*.nib > mouse.lst
cat << '_EOF_' > gsub
#LOOP
blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
mkdir psl
gensub2 mouse.lst fugu.lst gsub spec
para create spec
para try, check, push, check, ...
# Completed: 24854 of 24854 jobs
# CPU time in finished jobs: 8215774s 136929.56m 2282.16h 95.09d 0.261 y
# IO & Wait Time: 1415723s 23595.39m 393.26h 16.39d 0.045 y
# Average job time: 388s 6.46m 0.11h 0.00d
# Longest job: 46761s 779.35m 12.99h 0.54d
# Submission to last job: 46761s 779.35m 12.99h 0.54d
# Sort alignments:
ssh kksilo
cd /cluster/data/mm5/bed/blatFr1
pslCat -dir psl | pslSortAcc nohead chrom temp stdin
# Processed 1116383 lines into 5 temp files
# lift query side to Fugu browser chrUn coordinates
liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl
# load into database:
ssh hgwdev
cd /cluster/data/mm5/bed/blatFr1
hgLoadPsl -fastLoad -table=blatFr1 mm5 all.psl
# Processing all.psl
# load of blatFr1 did not go as planned: 1116383 record(s), 0 row(s) skipped, 1 warning(s) loading psl.tab
# a record is already in trackDb as type xeno psl fr1, with colorChromDefault off
# BLASTZ TETRAODON (tetNig1) (DONE, 2004-09-08, hartera)
ssh kkr1u00
# blastz requires lineage-specific repeats
# Treat all repeats as lineage-specific.
mkdir -p /iscratch/i/mm5/linSpecRep.notInTetraodon
foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/mm5/linSpecRep.notInTetraodon/$f:t:r:r.out.spec
end
mkdir -p /iscratch/i/tetNig1/linSpecRep.notInMouse
foreach f (/iscratch/i/tetNig1/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/tetNig1/linSpecRep.notInMouse/$f:t:r:r.out.spec
end
iSync
ssh kksilo
# more space on store8 than store6
mkdir -p /cluster/store8/mm5/blastz.tetNig1.2004-09-02
ln -s /cluster/store8/mm5/blastz.tetNig1.2004-09-02 \
/cluster/data/mm5/bed
ln -s /cluster/data/mm5/bed/blastz.tetNig1.2004-09-02 \
/cluster/data/mm5/bed/blastz.tetNig1
ssh kk
cd /cluster/data/mm5/bed/blastz.tetNig1
# use same parameters as for danRer1-mm5
cat << '_EOF_' > DEF
# mouse (mm5) vs Tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from hg16-fr1 and danRer1-hg17.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse (mm5)
SEQ1_DIR=/iscratch/i/mus/mm5/test
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/mm5/linSpecRep.notInTetraodon
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tetraodon (tetNig1)
SEQ2_DIR=/iscratch/i/tetNig1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/tetNig1/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.tetNig1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
# Save the DEF file in the current standard place
chmod +x DEF
cp DEF ~angie/hummus/DEF.mm5-tetNig1.2004-09-02
# setup cluster run
# copy shell scripts for blastz runs if not there already
cp -p /cluster/data/danRer1/jkStuff/BlastZ* /cluster/data/mm5/jkStuff/
# edit BlastZ_run0.sh
# replace line 22: /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/
# this is the directory for the latest version of blastz-run
# source the DEF file
bash
. ./DEF
/cluster/data/mm5/jkStuff/BlastZ_run0.sh
cd run.0
# check batch looks ok then
para try, check, push, check, ....
# para time
# Completed: 19437 of 19437 jobs
# CPU time in finished jobs: 4681483s 78024.71m 1300.41h 54.18d 0.148 y
# IO & Wait Time: 176260s 2937.67m 48.96h 2.04d 0.006 y
# Average job time: 250s 4.17m 0.07h 0.00d
# Longest job: 790s 13.17m 0.22h 0.01d
# Submission to last job: 5475s 91.25m 1.52h 0.06d
# second cluster run to convert the .out's to .lav's
ssh kki
cd /cluster/data/mm5/bed/blastz.tetNig1
bash # if a csh/tcsh user
. ./DEF
/cluster/data/mm5/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# para time
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 262s 4.37m 0.07h 0.00d 0.000 y
# IO & Wait Time: 981s 16.35m 0.27h 0.01d 0.000 y
# Average job time: 4s 0.06m 0.00h 0.00d
# Longest job: 9s 0.15m 0.00h 0.00d
# Submission to last job: 108s 1.80m 0.03h 0.00d
# Third cluster run to convert lav's to axt's
ssh kki
cd /cluster/data/mm5/bed/blastz.tetNig1
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /iscratch/i/mus/mm5/softNib \
/iscratch/i/tetNig1/nib stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/mm5/bed/blastz.tetNig1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
\ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
para try, check, push, check,...
# para time
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 41s 0.68m 0.01h 0.00d 0.000 y
# IO & Wait Time: 414s 6.90m 0.12h 0.00d 0.000 y
# Average job time: 11s 0.18m 0.00h 0.00d
# Longest job: 28s 0.47m 0.01h 0.00d
# Submission to last job: 396s 6.60m 0.11h 0.00d
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/mm5/bed/blastz.tetNig1
mkdir -p pslChrom
set tbl = "blastzTetNig1"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.tetNig1/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl mm5 $f
end
# featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1 -enrichment
# refGene:cds 0.765%, blastzTetNig1 1.709%, both 0.519%, cover 67.80%,
# enrich 39.67x
# default with H=2000
# featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1H2000 -enrichment
# refGene:cds 0.765%, blastzTetNig1H2000 1.239%, both 0.502%, cover 65.59%,
# enrich 52.92x
# blastzDanRer1 with L=8000
# featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1L8k -enrichment
# refGene:cds 0.765%, blastzTetNig1L8k 1.333%, both 0.444%, cover 58.05%,
# enrich 43.56x
# too much drop in coverage
# H=2000, L=4000
# featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1H2kL4k -enrichment
# refGene:cds 0.765%, blastzTetNig1H2kL4k 1.166%, both 0.489%, cover 63.91%,
# enrich 54.81x
# H=2000, L=6000
# featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1H2kL6k -enrichment
# refGene:cds 0.765%, blastzTetNig1H2kL6k 1.014%, both 0.437%, cover 57.15%,
# enrich 56.36x
# too much drop in coverage
# number of rows in table
# blastzTetNig1 38196
# blastzTetNig1H2000 38314
# blastzTetNig1L8k 24749
# blastzTetNig1H2kL4k 31433
# blastzTetNig1H2kL6k 21389
# use blastzTetNig1 as this has the best coverage. enrich is quite high too.
# featureBits -chrom=chr1 hg17 refGene:cds blastzFr1 -enrichment
# refGene:cds 1.246%, blastzFr1 2.319%, both 0.833%, cover 66.87%, enrich 28.83x
# similar coverage to blastzFr1 for hg17
# RESCORE TETNIG1 BLASTZ (DONE, 2004-09-08, hartera)
# Low scores can occur with repeats abridged and using the
# HoxD55.q matrix. PSU's restore_rpts program rescored alignments
# with the default matrix instead of the BLASTZ_Q matrix.
# Rescore them here so the chainer sees the higher scores:
ssh kolossus
cd /cluster/data/mm5/bed/blastz.tetNig1
mkdir axtChrom.rescore
foreach f (axtChrom/chr*.axt)
axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
$f axtChrom.rescore/$f:t
end
mv axtChrom axtChrom.orig
mv axtChrom.rescore axtChrom
# CHAIN TETRAODON (TETNIG1) BLASTZ (DONE, 2004-09-08, hartera)
# Re do chains with rescored blastz Hg17
# Run axtChain on little cluster
ssh kki
cd /cluster/data/mm5/bed/blastz.tetNig1
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/mm5/bed/blastz.tetNig1/axtChrom/*.axt \
> input.lst
# Reuse gap penalties from hg16 vs chicken run.
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize^V 11
smallSize^V 111
position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V
72111^V 152111^V 252111
qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V
31600^V 56600
bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V
16000^V 32000^V 57000
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -linearGap=../../chickenHumanTuned.gap $1 \
/iscratch/i/mus/mm5/softNib \
/iscratch/i/tetNig1/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 524s 8.74m 0.15h 0.01d 0.000 y
# IO & Wait Time: 140s 2.33m 0.04h 0.00d 0.000 y
# Average job time: 15s 0.26m 0.00h 0.00d
# Longest job: 25s 0.42m 0.01h 0.00d
# Submission to last job: 632s 10.53m 0.18h 0.01d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's,try also with larger bin size.
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist5000.out
textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
echo ""
end
# not a large amount of changes with score < 5000
# load chr1 into database to check
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain/chain
hgLoadChain mm5 chr1_chainTetNig1 chr1.chain
# featureBits -chrom=chr1 mm5 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.765%, chainTetNig1Link 1.563%, both 0.512%, cover 66.84%,
# enrich 42.76x
# try filtering with minScore=5000
ssh kksilo
cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
chainSplit chainFilt5k all.chain
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain/chainFilt5k
hgLoadChain mm5 chr1_chainTetNig1Filt5k chr1.chain
# featureBits -chrom=chr1 mm5 refGene:cds chainTetNig1Filt5kLink -enrichment
# refGene:cds 0.765%, chainTetNig1Filt5kLink 1.398%, both 0.504%, cover 65.91%,
# enrich 47.13x
# chr1_chainTetNig1 21782
# chr1_chainTetNig1Filt5k 9670
# loses very little in coverage so use filtering with minScore=5000
# Replace the unfiltered per-chromosome chains with the minScore=5000 set.
# The paths below are relative to axtChain, so go back there first: the
# previous commands left the shell in axtChain/chainFilt5k on hgwdev.
# NOTE(review): kksilo is assumed as the host, matching the earlier
# chainFilter step -- confirm.
ssh kksilo
cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
rm -r chain
mv chainFilt5k chain
rm all.chain.unfiltered
ssh hgwdev
# remove test tables
hgsql -e "drop table chr1_chainTetNig1Filt5k;" mm5
hgsql -e "drop table chr1_chainTetNig1Filt5kLink;" mm5
# load chains into database
cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain mm5 ${c}_chainTetNig1 $i
echo done $c
end
# NET TETRAODON (tetNig1) BLASTZ (DONE, 2004-09-08, hartera)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 69083136, utime 402 s/100, stime 37
# Add classification info using db tables:
cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
# netClass looks for ancient repeats in one of the databases
# hg17 has this table - hand-curated by Arian but this is for
# human-rodent comparisons so do not use here, use -noAr option
mkdir -p /cluster/bluearc/mm5/linSpecRep.notInTetraodon
mkdir -p /cluster/bluearc/tetNig1/linSpecRep.notInMouse
cp /iscratch/i/mm5/linSpecRep.notInTetraodon/* \
/cluster/bluearc/mm5/linSpecRep.notInTetraodon
cp /iscratch/i/tetNig1/linSpecRep.notInMouse/* \
/cluster/bluearc/tetNig1/linSpecRep.notInMouse
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
# there is no ancient repeats table for rodent vs fish so use -noAr flag
time netClass noClass.net mm5 tetNig1 tetNig1.net \
-tNewR=/cluster/bluearc/mm5/linSpecRep.notInTetraodon \
-qNewR=/cluster/bluearc/tetNig1/linSpecRep.notInMouse -noAr
# 59.490u 37.630s 2:41.82 60.0% 0+0k 0+0io 216pf+0w
netFilter -minGap=10 tetNig1.net | hgLoadNet mm5 netTetNig1 stdin
# featureBits mm5 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.921%, netTetNig1 23.633%, both 0.725%, cover 78.70%,
# enrich 3.33x
# MAKE VSTETNIG1 DOWNLOADABLES (DONE, 2004-09-10, hartera)
ssh kksilo
# zip chains and nets
cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
cp all.chain tetNig1.chain
zip -j /cluster/data/mm5/zip/tetNig1.chain.zip tetNig1.chain
rm tetNig1.chain
zip -j /cluster/data/mm5/zip/tetNig1.net.zip tetNig1.net
ssh hgwdev
# copy chains and nets to downloads area
set gp = /usr/local/apache/htdocs/goldenPath/mm5
mkdir -p $gp/vsTetNig1
cd $gp/vsTetNig1
mv /cluster/data/mm5/zip/tetNig1*.zip .
md5sum *.zip > md5sum.txt
# move axt files to downloads area and zip
cd /cluster/data/mm5/bed/blastz.tetNig1/axtChrom
mkdir -p $gp/vsTetNig1/axtChrom
cp -p *.axt $gp/vsTetNig1/axtChrom
cd $gp/vsTetNig1/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# MAKE VSDANRER1 DOWNLOADABLES (DONE, 2004-09-10, hartera)
ssh kksilo
# zip chains and nets
cd /cluster/data/mm5/bed/blastz.danRer1/axtChain
gunzip all.chain.gz
cp all.chain danRer1.chain
zip -j /cluster/data/mm5/zip/danRer1.chain.zip danRer1.chain
rm danRer1.chain
gunzip danRer1.net.gz
zip -j /cluster/data/mm5/zip/danRer1.net.zip danRer1.net
ssh hgwdev
# copy chains and nets to downloads area
set gp = /usr/local/apache/htdocs/goldenPath/mm5
mkdir -p $gp/vsDanRer1
cd $gp/vsDanRer1
mv /cluster/data/mm5/zip/danRer1*.zip .
md5sum *.zip > md5sum.txt
# move axt files to downloads area and zip
cd /cluster/data/mm5/bed/blastz.danRer1/axtChrom
mkdir -p $gp/vsDanRer1/axtChrom
cp -p *.axt $gp/vsDanRer1/axtChrom
cd $gp/vsDanRer1/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# add the axtNet *.axt in blastz.danRer1/axtNet
cd /cluster/data/mm5/bed/blastz.danRer1/axtNet
set gp = /usr/local/apache/htdocs/goldenPath/mm5
mkdir -p $gp/vsDanRer1/axtNet
nice cp -p *.axt $gp/vsDanRer1/axtNet
cd $gp/vsDanRer1/axtNet
nice gzip *.axt
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# BLASTZ TETNIG1 CLEAN UP (DONE, 2004-09-10, hartera)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.tetNig1
nice rm -rf raw &
nice rm -rf lav &
nice rm -rf axtChrom.orig &
nice rm axtChain/run1/chain/* &
nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} &
# SGP GENES (REDONE 5/24/05 angie)
# Originally loaded 9/17/04; user noticed chrX was missing; IMIM folks
# regenerated & we reloaded.
ssh kksilo
mkdir /cluster/data/mm5/bed/sgp
cd /cluster/data/mm5/bed/sgp
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
wget http://genome.imim.es/genepredictions/M.musculus/mmMay2004/SGP/humangp200405/$chr.gtf
wget http://genome.imim.es/genepredictions/M.musculus/mmMay2004/SGP/humangp200405/$chr.prot
end
# Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
cp /dev/null sgpPep.fa
foreach f (chr*.prot)
nice perl -wpe 's/^(>chr\S+)/$1.1/' $f >> sgpPep.fa
end
ssh hgwdev
cd /cluster/data/mm5/bed/sgp
ldHgGene -gtf -genePredExt mm5 sgpGene chr*.gtf
hgPepPred mm5 generic sgpPep sgpPep.fa
# SGP GENES (UPDATE 1/18/2006)
# sgpPep table dropped, replaced by hgc generated protein seq in browser
# MAKE mm5-hg17 OVER.CHAIN FOR LIFTOVER (DONE 2004-09-24 braney)
ssh kolossus
mkdir -p /cluster/data/mm5/bed/bedOver/mm5Tohg17
cd /cluster/data/mm5/bed/bedOver/mm5Tohg17
set chainDir = /cluster/data/mm5/bed/blastz.hg17/axtChain
netSplit $chainDir/human.net net
mkdir subset
foreach f ($chainDir/chain/*.chain)
echo subsetting $f:t:r
netChainSubset net/$f:t:r.net $f subset/$f:t
end
cat subset/*.chain > /cluster/data/mm5/bed/bedOver/mm5Tohg17.chain
hgAddLiftOverChain -multiple mm5 hg17
# miRNA track (DONE - 2004-09-30 - Fan)
# data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
# and Michel.Weber@ibcg.biotoul.fr
# notify them when done.
cd /cluster/data/mm5/bed
mkdir miRNA
cd miRNA
wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/mmu.bed
grep -v "tion" mmu.bed | sed -e "s/ /\t/g" > mm5.bed
# check previous release track before update
nice featureBits mm4 miRNA
# 17782 bases of 2627444668 (0.001%) in intersection
hgLoadBed mm5 miRNA mm5.bed
# entry in trackDb/trackDb.ra already there
# and verify similar numbers after:
nice featureBits mm5 miRNA
# 17957 bases of 2615483787 (0.001%) in intersection
# BLASTZSELF Done (Tue Oct 19 18:06:45 PDT 2004) sugnet
# blastzSelf run for mm5. This took about a week due to
# being busy with other things and some crashed jobs in a
# few places. Think all of the instructions ended up here.
# based off of Hiram's instructions for blastzSelf in hg16 & hg17
mkdir -p /cluster/store6/mm5/bed/blastzSelf
cd /cluster/store6/mm5/bed/blastzSelf
# Create the definitions file.
cat << '_EOF_' > DEF
# mouse vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
# RMSK not currently used
SEQ1_RMSK=/scratch/mus/mm5/rmsk
# FLAG not currently used
SEQ1_FLAG=-rodent
SEQ1_SMSK=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm5/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm5/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastzSelf
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
ssh kk
cd /cluster/store6/mm5/bed/blastzSelf
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, push, check
# on mini-cluster, otherwise I/O gets very demanding....
ssh kki
cd /cluster/store6/mm5/bed/blastzSelf
mkdir -p run.1
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > run.1/jobList
cd run.1
wc -l jobList
# 341 jobList
head jobList
para create jobList
para try
# Third cluster run to convert lav's to axt's
mkdir run.2
cd run.2
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-chromlav2axt /cluster/data/mm5/bed/blastzSelf/lav/$(root1) {check out line+ /cluster/data/mm5/bed/blastzSelf/axtChrom/$(root1).axt} /scratch/mus/mm5/softNib /scratch/mus/mm5/softNib
#ENDLOOP
'_EOF_'
# List the per-chromosome lav directories (ls -S: sorted by size) and expand
# the gsub template into one lav -> axt job per entry.
ls -1S /cluster/data/mm5/bed/blastzSelf/lav > chrom.list
gensub2 chrom.list single gsub jobList
# para create takes the job list file as its argument, as in the other
# cluster runs in this document.
para create jobList
para push
# This seems to beat up on the file server a little, load up to 56 on kksilo
# Number of jobs died, unsure why. Try them on kksilo:
ssh kksilo
# Script to redo the lav -> axt conversion by hand for the chromosomes whose
# cluster jobs died.  For each listed chromosome it converts every lav chunk
# to axt, drops self alignments, sorts each chunk, and concatenates the
# chunks (in numeric order) into $base/axtChrom/<chr>.axt.
# base must be the mm5 blastzSelf directory; it had been left pointing at
# the hg16 build these instructions were adapted from, which would have
# written the mm5 output into hg16.
# NOTE(review): this record does not show the script being made executable
# or run -- presumably "chmod a+x doStragglers.csh; ./doStragglers.csh".
cat << '_EOF_' > doStragglers.csh
#!/bin/tcsh
cd /cluster/store6/mm5/bed/blastzSelf
set base=/cluster/data/mm5/bed/blastzSelf
set seq1_dir=/cluster/data/mm5/nib
set seq2_dir=/cluster/data/mm5/nib
foreach c (lav/chr17 lav/chr2 lav/chr3 lav/chr7 lav/chrUn_random lav/chrX lav/chrY)
    echo "Doing $c"
    pushd $c
    set chr=$c:t
    set out=axtChrom/$chr.axt
    echo "Translating $chr lav to $out"
    foreach d (*.lav)
        set smallout=$d.axt
        lavToAxt $d $seq1_dir $seq2_dir stdout \
        | axtDropSelf stdin stdout \
        | axtSort stdin $smallout
    end
    cat `ls -1 *.lav.axt | sort -g` > $base/$out
    popd
end
'_EOF_'
# Need to drop overlaps to eliminate diagonals
# Run from the blastzSelf directory.  Each axtChrom/<chr>.axt is filtered
# into axtChromDropped/<chr>.axt, and the results are then gzipped.
# NOTE(review): chromSizes.tab is not created anywhere in this section --
# presumably a chrom/size table for mm5; confirm its origin.
# Make sure the output directory exists before writing into it.
mkdir -p /cluster/store6/mm5/bed/blastzSelf/axtChromDropped
foreach f (axtChrom/chr*.axt)
    set c=$f:t:r
    echo "doing $c"
    /cluster/bin/i386/axtDropOverlap axtChrom/$c.axt chromSizes.tab chromSizes.tab \
    /cluster/store6/mm5/bed/blastzSelf/axtChromDropped/$c.axt
    echo "Done: $c"
end
cd axtChromDropped
gzip *.axt
# Translate to psls
cd /cluster/data/mm5/bed/blastzSelf
mkdir pslChrom
set tbl = "blastzSelf"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
zcat /cluster/data/mm5/bed/blastzSelf/axtChromDropped/${c}.axt.gz | \
/cluster/bin/i386/axtToPsl stdin S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load files into the database
# The psl files were written to pslChrom/ by the loop above, while the shell
# is still in the blastzSelf directory, so the glob has to include that
# directory.
/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 pslChrom/*_blastzSelf.psl
# end BLASTZSELF
# CREATE kgSpAlias TABLE FOR PB (Done 10/20/04)
# Collect (kgID, spID, alias) rows by joining kgXref against both kgAlias
# and kgProtAlias, then de-duplicate them.  The grep removes every line
# containing "kgID" (presumably the column-header lines hgsql prints).
hgsql mm5 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql mm5 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >mm5.kgSpAlias.tab
rm j.tmp
# Recreate the table from its schema file (kgSpAlias.sql; the file name had
# been misspelled as kgSpAlis.sql) and load the rows.
hgsql mm5 -e 'drop table kgSpAlias';
hgsql mm5 < ~/src/hg/lib/kgSpAlias.sql
hgsql mm5 -e 'load data local infile "mm5.kgSpAlias.tab" into table kgSpAlias'
# ECGENE TRACK (DONE, 2004-10-29, hartera)
ssh kksilo
mkdir -p /cluster/data/mm5/bed/ECgene.2004-10-29
ln -s /cluster/data/mm5/bed/ECgene.2004-10-29 \
/cluster/data/mm5/bed/ECgene
cd /cluster/data/mm5/bed/ECgene
wget \
"http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_mm5_low_gene.txt.gz"
wget \
"http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_mm5_low_pep.txt.gz"
gunzip *.gz
# load database
ssh hgwdev
cd /cluster/data/mm5/bed/ECgene
ldHgGene -predTab mm5 ECgene v1.2_mm5_low_gene.txt
# 343337 gene predictions
hgPepPred mm5 tab ECgenePep v1.2_mm5_low_pep.txt
rm *.tab
nice gzip *.txt
## NIA Mouse Gene Index - (DONE - 2004-11-16 Fan)
# requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
# pick up data
ssh hgwdev
mkdir -p /cluster/data/mm5/bed/NIAGene
cd /cluster/data/mm5/bed/NIAGene
wget --timestamp http://lgsun.grc.nia.nih.gov/temp/NIA-Mouse-GeneIndex4-Transcript-to-Genome.psl
wget --timestamping \
http://lgsun.grc.nia.nih.gov/temp/NIA-Mouse-GeneIndex4-Transcripts.fasta
hgLoadPsl mm5 -table=NIAGene NIA-Mouse-GeneIndex4-Transcript-to-Genome.psl
mkdir /gbdb/mm5/NIAGene
ln -s /cluster/data/mm5/bed/NIAGene/NIA-Mouse-GeneIndex4-Transcripts.fasta \
/gbdb/mm5/NIAGene/NIA-Mouse-GeneIndex4-Transcripts.fasta
hgLoadSeq mm5 /gbdb/mm5/NIAGene/NIA-Mouse-GeneIndex4-Transcripts.fasta
# Added and edited NIAGene.html and trackDb.ra under
# kent/src/hg/makeDb/trackDb/mouse/mm5
# CREATE jaxQTL3 (MOUSE QTL) TRACK (DONE - 2004-11-18 Fan)
cd /cluster/data/mm5/bed
mkdir qtl.2004-11-08
ln -s qtl.2004-11-08 qtl
cd qtl
# Get the raw data file, mouse_qtl_100804.txt, sent by Carol Bult [cjb@informatics.jax.org].
hgsql mm5 -e 'drop table jaxQtlRaw'
hgsql mm5 < ~/src/hg/lib/jaxQtlRaw.sql
hgsql mm5 -e 'load data local infile "mouse_qtl_100804.txt" into table jaxQtlRaw ignore 1 lines'
# Make sure hgJaxQtl binary executable exist. hgJaxQtl is under ~/src/hg/hgJaxQtl
hgJaxQtl mm5
wc jaxQTL3.tab
# 981 15310 105164 jaxQTL3.tab
hgLoadBed -nobin -tab -sqlTable=$HOME/src/hg/lib/jaxQTL3.sql mm5 jaxQTL3 jaxQTL3.tab
# TWINSCAN (DONE 11/29/04 angie)
ssh kksilo
mkdir /cluster/data/mm5/bed/twinscan
cd /cluster/data/mm5/bed/twinscan
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
wget http://genes.cs.wustl.edu/predictions/mouse/mm5_11-24-04/chr_gtf/$chr.gtf
wget http://genes.cs.wustl.edu/predictions/mouse/mm5_11-24-04/chr_ptx/$chr.ptx
end
# Add '.a' to end of protein fasta id's, to match gtf transcript_id's:
perl -wpe 's/^(>\S+).*/$1.a/' *.ptx > twinscanPep.fa
# load.
ssh hgwdev
cd /cluster/data/mm5/bed/twinscan
ldHgGene -gtf -genePredExt mm5 twinscan chr*.gtf
hgPepPred mm5 generic twinscanPep twinscanPep.fa
featureBits -enrichment mm5 refGene twinscan
#refGene 1.551%, twinscan 1.245%, both 0.783%, cover 50.46%, enrich 40.52x
# Create mm5GeneList.html (to be used by Google).
# This step was done 12/08/04.
cd /cluster/data/mm5/bed
mkdir geneList
cd geneList
wget -O mm5GeneList.html "http://hgwdev-fanhsu.cse.ucsc.edu/cgi-bin/hgGeneList?db=mm5"
cp -p mm5GeneList.html /usr/local/apache/htdocs/goldenPath
# Check this html file into CVS.
# BLASTZ ZEBRAFISH (danRer2) (DONE, 2004-12-12, hartera)
ssh kkr1u00
# blastz requires lineage-specific repeats
# Treat all repeats as lineage-specific.
# this directory of mouse repeats exists already
mkdir -p /iscratch/i/mm5/linSpecRep.notInZebrafish
foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/mm5/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
end
mkdir -p /iscratch/i/danRer2/linSpecRep.notInMouse
foreach f (/iscratch/i/danRer2/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/danRer2/linSpecRep.notInMouse/$f:t:r:r.out.spec
end
iSync
ssh kk
mkdir -p /cluster/data/mm5/bed/blastz.danRer2.2004-12-10
ln -s /cluster/data/mm5/bed/blastz.danRer2.2004-12-10 \
/cluster/data/mm5/bed/blastz.danRer2
cd /cluster/data/mm5/bed/blastz.danRer2
# use same parameters as for danRer[1|2]-hg17 and for hg16-fr1 and mm5-danRer1
# and similar to those originally used for hg17-galGal2
cat << '_EOF_' > DEF
# mouse (mm5) vs zebrafish (danRer2)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer1
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse (mm5)
SEQ1_DIR=/cluster/bluearc/scratch/mus/mm5/softNib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/mm5/linSpecRep.notInZebrafish
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer2)
SEQ2_DIR=/iscratch/i/danRer2/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/danRer2/linSpecRep.notInMouse
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm5/bed/blastz.danRer2
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
# Save the DEF file in the current standard place
chmod +x DEF
cp DEF ~angie/hummus/DEF.mm5-danRer2.2004-12-10
# setup cluster run
# copy shell scripts for blastz runs if not there already
cp -p /cluster/data/danRer1/jkStuff/BlastZ* /cluster/data/mm5/jkStuff/
# edit BlastZ_run0.sh
# replace line 22: /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/
# this is the directory for the latest version of blastz-run
# source the DEF file
bash
. ./DEF
/cluster/data/mm5/jkStuff/BlastZ_run0.sh
cd run.0
# check batch looks ok then
para try, check, push, check, ....
# para time
# Completed: 58993 of 58993 jobs
# CPU time in finished jobs: 17513361s 291889.35m 4864.82h 202.70d 0.555 y
# IO & Wait Time: 1506128s 25102.13m 418.37h 17.43d 0.048 y
# Average job time: 322s 5.37m 0.09h 0.00d
# Longest job: 2552s 42.53m 0.71h 0.03d
# Submission to last job: 50001s 833.35m 13.89h 0.58d
# output is 864M
# second cluster run to convert the .out's to .lav's
ssh kki
cd /cluster/data/mm5/bed/blastz.danRer2
bash # if a csh/tcsh user
. ./DEF
/cluster/data/mm5/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# para time
# Checking finished jobs
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 689s 11.48m 0.19h 0.01d 0.000 y
# IO & Wait Time: 1305s 21.76m 0.36h 0.02d 0.000 y
# Average job time: 6s 0.10m 0.00h 0.00d
# Longest job: 14s 0.23m 0.00h 0.00d
# Submission to last job: 250s 4.17m 0.07h 0.00d
# Third cluster run to convert lav's to axt's
ssh kki
cd /cluster/data/mm5/bed/blastz.danRer2
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /cluster/bluearc/scratch/mus/mm5/softNib \
/iscratch/i/danRer2/nib stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/mm5/bed/blastz.danRer2/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
\ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
para try, check, push, check,...
# para time
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 82s 1.37m 0.02h 0.00d 0.000 y
# IO & Wait Time: 1429s 23.82m 0.40h 0.02d 0.000 y
# Average job time: 35s 0.59m 0.01h 0.00d
# Longest job: 91s 1.52m 0.03h 0.00d
# Submission to last job: 1421s 23.68m 0.39h 0.02d
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/mm5/bed/blastz.danRer2
mkdir -p pslChrom
set tbl = "blastzDanRer2"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.danRer2/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl mm5 $f
end
# featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1 -enrichment
#refGene:cds 0.763%,blastzDanRer1 2.918%,both 0.512%,cover 67.12%,enrich 23.00x
# featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer2 -enrichment
# refGene:cds 0.780%, blastzDanRer2 2.816%, both 0.529%, cover 67.89%,
# enrich 24.11x
# RESCORE DANRER2 BLASTZ ALIGNMENTS (DONE, 2004-12-12, hartera)
# Low scores can occur with repeats abridged and using the
# HoxD55.q matrix. PSU's restore_rpts program rescored alignments
# with the default matrix instead of the BLASTZ_Q matrix.
# Rescore them here so the chainer sees the higher scores:
ssh kolossus
cd /cluster/data/mm5/bed/blastz.danRer2
mkdir axtChrom.rescore
foreach f (axtChrom/chr*.axt)
axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
$f axtChrom.rescore/$f:t
end
mv axtChrom axtChrom.orig
mv axtChrom.rescore axtChrom
# psl files and blastz tables will be the same regardless of score so
# no need to reload
# CHAIN ZEBRAFISH (danRer2) BLASTZ (DONE, 2004-12-13, hartera)
# APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE PRIMARILY THE RESULTS OF
# REPEATS AND DEGENERATE DNA (DONE, 2004-12-22, hartera)
# Make chains with rescored blastz danRer2
# Run axtChain on little cluster
ssh kki
cd /cluster/data/mm5/bed/blastz.danRer2
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/mm5/bed/blastz.danRer2/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Make our own linear gap file with reduced gap penalties,
# in hopes of getting longer chains:
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize^V 11
smallSize^V 111
position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111
qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000
'_EOF_'
# << this line makes emacs coloring happy
# Per-chromosome chaining job, called from the gsub template as
#   doChain <input axt> <output chain> <log file>
# ($3 receives both stdout and stderr of axtChain).
# The query nib directory must be danRer2 to match the danRer2 alignments
# being chained; it had been left pointing at the danRer1 nibs.
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../chickenHumanTuned.gap $1 \
/cluster/bluearc/scratch/mus/mm5/softNib \
/iscratch/i/danRer2/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 1797s 29.95m 0.50h 0.02d 0.000 y
# IO & Wait Time: 575s 9.59m 0.16h 0.01d 0.000 y
# Average job time: 55s 0.92m 0.02h 0.00d
# Longest job: 133s 2.22m 0.04h 0.00d
# Submission to last job: 514s 8.57m 0.14h 0.01d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's,try also with smaller bin size.
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist5000.out
textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
echo ""
end
# filter on minScore = 5000
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain.filt5k
# remove old chains
rm -r chain
chainSplit chain all.chain.filt5k
# remove repeats from chains and reload into database
# (2004-12-22, hartera)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
mv chain chainRaw
mkdir chain
cd chainRaw
foreach f (*.chain)
set c = $f:r
echo $c
nice chainAntiRepeat /cluster/bluearc/scratch/mus/mm5/softNib \
/cluster/bluearc/danRer2/nib $f \
../chain/$c.chain
end
cd ..
chainMergeSort ./chain/*.chain > all.chain.antirepeat
chainSplit chainAR all.chain.antirepeat
# load filtered chains with chains removed that are mostly due to repeats
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.danRer2/axtChain/chainAR
foreach i (*.chain)
set c = $i:r
hgLoadChain mm5 ${c}_chainDanRer2 $i
echo done $c
end
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2 -enrichment
# refGene:cds 0.780%, chainDanRer2 22.478%, both 0.604%, cover 77.48%,
# enrich 3.45x
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 0.780%, chainDanRer2Link 2.164%, both 0.526%, cover 67.43%,
# enrich 31.17x
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1 -enrichment
# refGene:cds 0.780%, chainDanRer1 20.053%, both 0.593%, cover 75.99%,
# enrich 3.79x
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Link -enrichment
# refGene:cds 0.780%, chainDanRer1Link 2.022%, both 0.512%, cover 65.64%,
# enrich 32.47x
# after chainAntiRepeat:
# featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 0.785%, chainDanRer2Link 2.058%, both 0.530%, cover 67.53%,
# enrich 32.81x
# NET ZEBRAFISH (danRer2) BLASTZ (DONE, 2004-12-13, hartera)
# RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-22,hartera)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
rm -r preNet
mkdir preNet
cd chainAR
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 105357312, utime 632 s/100, stime 117
# Add classification info using db tables:
cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
# netClass looks for ancient repeats in one of the databases
# hg17 has this table - hand-curated by Arian but this is for
# human-rodent comparisons so do not use here, use -noAr option
mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish
mkdir -p /cluster/bluearc/danRer2/linSpecRep.notInMouse
cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \
/cluster/bluearc/mm5/linSpecRep.notInZebrafish
cp /iscratch/i/danRer2/linSpecRep.notInMouse/* \
/cluster/bluearc/danRer2/linSpecRep.notInMouse
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
time netClass noClass.net mm5 danRer2 zfishdanRer2.net \
-tNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish \
-qNewR=/cluster/bluearc/danRer2/linSpecRep.notInMouse -noAr
# 87.010u 56.100s 5:15.16 45.4% 0+0k 0+0io 207pf+0w
netFilter -minGap=10 zfishdanRer2.net | hgLoadNet mm5 netDanRer2 stdin
# featureBits mm5 refGene:cds netDanRer2 -enrichment
# refGene:cds 0.938%, netDanRer2 21.447%, both 0.714%, cover 76.17%,
# enrich 3.55x
# featureBits mm5 refGene:cds netDanRer1 -enrichment
# refGene:cds 0.938%, netDanRer1 19.993%, both 0.702%, cover 74.87%,
# enrich 3.74x
# after chainAntiRepeat:
# featureBits mm5 refGene:cds netDanRer2 -enrichment
# refGene:cds 0.942%, netDanRer2 21.161%, both 0.717%, cover 76.14%,
# enrich 3.60x
# add trackDb.ra entries and html for details pages
# TIGR GENE INDEX (DONE 2004-12-13 Fan)
mkdir -p /cluster/data/mm5/bed/tigr
cd /cluster/data/mm5/bed/tigr
wget ftp://ftp.tigr.org/pub/data/tgi/Mus_musculus/TGI_track_MouseGenome_mm5_05-2004.tgz
tar xvzf TGI*.tgz
foreach f (*cattle*)
set f1 = `echo $f | sed -e 's/cattle/cow/g'`
mv $f $f1
end
# For each source organism, turn the unpacked TIGR files into gff: skip the
# first line of each file, rename the first THC on a line to TC, and prefix
# the TC id with the organism name (handed to perl through the environment
# variable O).
foreach o (mouse cow human pig rat)
    echo $o
    setenv O $o
    foreach f (chr*_$o*s)
        # "tail +2" is the obsolete option syntax; "-n +2" is the POSIX form
        tail -n +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
    end
end
ssh hgwdev
cd /cluster/data/mm5/bed/tigr
hgsql mm5 -e "drop table tigrGeneIndex"
hgsql mm5 < ~/kent/src/hg/lib/tigrGeneIndex.sql
foreach f (*.gff)
echo Processing $f ...
/cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC mm5 tigrGeneIndex $f
hgsql mm5 -e "select count(*) from tigrGeneIndex"
end
# Total of 354491 entries created in tigrGeneIndex table.
hgsql mm5 -e "update tigrGeneIndex set cdsStart = txStart;"
hgsql mm5 -e "update tigrGeneIndex set cdsEnd = txEnd;"
checkTableCoords mm5 tigrGeneIndex
gzip *.gff *TCs
# TIGR GENE INDEX (RE-DONE 2004-12-21 Fan)
# This track is re-done due to an error (no strand info) in the original files provided by TIGR.
cd /cluster/data/mm5/bed
mv tigr tigr_old_wrong
mkdir -p /cluster/data/mm5/bed/tigr
cd /cluster/data/mm5/bed/tigr
wget --timestamp ftp://ftp.tigr.org/pub/data/tgi/Mus_musculus/TGI_track_MouseGenome_mm5_12-2004.tgz
tar xvzf TGI*.tgz
foreach f (*cattle*)
set f1 = `echo $f | sed -e 's/cattle/cow/g'`
mv $f $f1
end
# For each source organism, turn the unpacked TIGR files into gff: skip the
# first line of each file, rename the first THC on a line to TC, and prefix
# the TC id with the organism name (handed to perl through the environment
# variable O).
foreach o (mouse cow human pig rat)
    echo $o
    setenv O $o
    foreach f (chr*_$o*s)
        # "tail +2" is the obsolete option syntax; "-n +2" is the POSIX form
        tail -n +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
    end
end
ssh hgwdev
cd /cluster/data/mm5/bed/tigr
hgsql mm5 -e "drop table tigrGeneIndex"
hgsql mm5 < ~/kent/src/hg/lib/tigrGeneIndex.sql
foreach f (*.gff)
echo Processing $f ...
/cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC mm5 tigrGeneIndex $f
hgsql mm5 -e "select count(*) from tigrGeneIndex"
end
# Total of 385814 entries created in tigrGeneIndex table.
hgsql mm5 -e "update tigrGeneIndex set cdsStart = txStart;"
hgsql mm5 -e "update tigrGeneIndex set cdsEnd = txEnd;"
checkTableCoords mm5 tigrGeneIndex
gzip *.gff *TCs
#### LOAD ENSEMBL GENES (DONE - 2004-12-17 Fan)
# ADDED STABLE URL TO TRACKDB BLOCK (V27, DEC 2004) (2008-01-11, rhead)
# needed for Gene Sorter procedure below
# Ensembl released Mouse build 33 the week of Dec 4 2004
mkdir /cluster/data/mm5/bed/ensembl
cd /cluster/data/mm5/bed/ensembl
# Get the ensembl gene data from http://www.ensembl.org/
# Go to the EnsMart link
# Choose Mus musculus as the organism
# Follow this sequence through the pages:
# Page 1) Choose the Ensembl Genes choice. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" tab.
# Page 4) Choose GTF as the output, choose gzip compression, name the
# output file ensGeneMm5.gtf.gz and then hit Export
# Ensembl handles random chromosomes differently than us, so we
# strip this data. Fortunately it just loses a couple of genes.
# Read the EnsMart export under the name it was saved as
# (ensGeneMm5.gtf.gz); ensGene.gtf.gz is only created further down, when
# the chr-prefixed ensGene.gtf is gzipped.
zcat ensGeneMm5.gtf.gz | grep -v ^6_DR51 | grep -v _NT_ > unrandom.gtf
# Let's see how much it loses:
# None.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with ldHgGene
sed -e "s/^/chr/" unrandom.gtf > ensGene.gtf
# (should also fixup chrMT name here too - 2005-02-28 - Hiram)
# sed -e "s/^/chr/" unrandom.gtf | sed -e "s/chrMT/chrM/" > ensGene.gtf
ldHgGene mm5 ensGene ensGene.gtf
# Read 31035 transcripts in 551352 lines in 1 files
# 31035 groups 22 seqs 1 sources 4 feature types
# 31035 gene predictions
# save space, gzip them:
gzip unrandom.gtf
gzip ensGene.gtf
# The name on chrM was incorrect, fixed (2005-02-28 - Hiram)
hgsql mm5 -e 'update ensGene set chrom="chrM" where chrom="chrMT";'
# Load Ensembl peptides:
# Get the ensembl protein data from http://www.ensembl.org/
# Go to the EnsMart link
# Choose Mus musculus as the organism
# Follow this sequence through the pages:
# Page 1) Choose the Ensembl Genes choice. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Sequences" tab.
# Page 4) Choose Transcripts/Proteins and peptide Only as the output,
# choose text/fasta and gzip compression,
# name the file ensGeneMm5.pep.gz and then hit export.
#delete * at end of each protein
bash
zcat ensGeneMm5.pep.gz | sed "s/\*$//" > ensembl.pep
~matt/bin/fixPep.pl ensembl.pep fixPep_ensembl.pep
hgPepPred mm5 generic ensPep fixPep_ensembl.pep
#
# The chrMT (chrM) peptides as obtained via EnsMart have only
# aa's of: X (2005-02-28 - Hiram)
# These 13 peptides were fixed up manually by fetching each
# one individually by following the 13 links from our browser
# to the Ensembl protein, asking it to dump the protein
# sequence, cut and paste that answer to a local file.
# The 13 peptides were dropped from ensPep table via:
hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082392.1";'
hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082396.1";'
hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082402.1";'
hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082405.1";'
hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082407.1";'
hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082408.1";'
hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082409.1";'
hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082411.1";'
hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082413.1";'
hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082414.1";'
hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082418.1";'
hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082419.1";'
hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082421.1";'
# Then explicitly reloaded with SQL statements such as:
INSERT into ensPep (name, seq) VALUES ('ENSMUST00000082407.1', 'MPQLDTSTWFITIISSMITLFILFQLKVSSQTFPLAPSPKSLTTMKVKTPWELKWTKIYLPHSLPQQ');
# The 13 SQL statements were left in the file:
# /cluster/data/mm5/bed/ensembl/chrMPep.sql
# loaded via:
hgsql mm5 < chrMPep.sql
# The following files were "touched" on the RR/MGC after the chrMT/M
# change to prevent false errors with joinerCheck. J.Jackson 2005-03-01
# mm5.superfamily.name
# mm5.ensGtp.transcript
# mm5.ensPep.name
# mm5.knownToEnsembl.value
# mm5.sfDescription.name
# Load ensGtp table.
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" tab. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format, gzip.
# Result name file as ensGtpMm5.tab.gz
gunzip ensGtpMm5.tab.gz
hgsql mm5 < ~/kent/src/hg/lib/ensGtp.sql
hgsql -N -e 'load data local infile "ensGtpMm5.tab" into table ensGtp ignore 1 lines;' mm5
# Create knownToEnsembl column
hgMapToGene mm5 ensGene knownGene knownToEnsembl
# Compress everything to save space
gzip *.tab
gzip *.pep
#### RE-BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-11-17 - Fan)
# PLEASE NOTE THAT THE ENSEMBLXREF3 TABLE IS RE-BUILT USING ENSMART DATA OF MOUSE BUILD 33.
# THIS TABLE IS NEEDED TO SUPPORT SUPERFAMILY TRACK OF THE PROTEOME BROWSER.
# Get the ensembl gene/protein cross-reference data from
# http://www.ensembl.org/Multi/martview?species=Mus_musculus
# Follow this sequence through the pages:
# Page 1) Make sure that the Mus musculus choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Feature" box, select Ensembl gene, transcript, and peptide IDs,
# SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC
# Page 4) Choose "Text, tab separated". choose gzip compression. hit export.
# Save as ensXref.tsv.gz
zcat ensXref.tsv.gz|sed -e 's/\./\t/g' > ensemblXref3.tab
hgsql mm5 -e "drop table ensemblXref3"
hgsql mm5 < ~/src/hg/lib/ensemblXref3.sql
hgsql mm5 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines'
# CREATE SUPERFAMILY TRACK (DONE 2004-12-17 - Fan)
mkdir /cluster/data/mm5/bed/superfamily
cd /cluster/data/mm5/bed/superfamily
hgSuperfam mm5 superfam041128 > sf.log
wc *
# It is normal that many proteins do not have corresponding Superfamily entries.
# Load the sfDescription table.
hgsql mm5 < ~/src/hg/lib/sfDescription.sql
hgsql mm5 -e 'LOAD DATA local INFILE "sfDescription.tab" into table mm5.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed mm5 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
cat /cluster/data/superfamily/041128/ass_28-Nov-2004.tab | hgKnownToSuper mm5 mm stdin
# created 21899 records output
# MAKE VSDANRER2 DOWNLOADABLES (DONE, 2004-12-14, hartera)
# REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat
# (DONE, 2004-12-22, hartera)
ssh hgwdev
cd /cluster/data/mm5/bed/blastz.danRer2/axtChrom
set gp = /usr/local/apache/htdocs/goldenPath/mm5
mkdir -p $gp/vsDanRer2/axtChrom
cp -p *.axt $gp/vsDanRer2/axtChrom
cd $gp/vsDanRer2/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# copy chains and nets to downloads area
# re-make chains and net downloadables (2004-12-22, hartera)
rm $gp/vsDanRer2/zebrafish*.gz $gp/vsDanRer2/md5sum.txt
cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
gzip -c all.chain.antirepeat > \
/cluster/data/mm5/zip/zebrafishDanRer2.chain.gz
gzip -c zfishdanRer2.net > /cluster/data/mm5/zip/zebrafishDanRer2.net.gz
cd $gp/vsDanRer2
mv /cluster/data/mm5/zip/zebrafish*.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# BLASTZ DANRER2 CLEANUP (DONE, 2004-12-14, hartera)
# RE-DONE (DONE, 2004-12-22, hartera)
ssh kksilo
cd /cluster/data/mm5/bed/blastz.danRer2
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/noClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net &
nice gzip axtChain/all.chain.antirepeat axtChain/all.chain.filt5k axtChain/chainAR/*.chain &
nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet &
# MOUSE PHOTOGRAPH added to gateway page
# Obtained from Jackson Labs press office via email:
#
# Subject: Re: mouse press photographs
# Date: Wed, 29 Dec 2004 14:26:15 -0500
# From: Joyce Peterson <joyce@jax.org>
# To: Hiram Clawson <hiram@soe.ucsc.edu>
# References: <41D2FF0B.3090207@soe.ucsc.edu>
# Hi, Hiram. You may use the attached photo, noting credit to "The
# Jackson Laboratory."
#
# Cheers,
# --Joyce
#
# Joyce Peterson
# Public Information Manager
# The Jackson Laboratory
# 610 Main Street, Mailbox 664
# Bar Harbor, ME 04609-1526
# Tel. 207-288-6058
# Mobile 207-266-5745
# E-mail joyce@jax.org
# http://www.jax.org/news
#
# Original from this email placed into /cluster/data/mm5/html/C57BL_6J.JPG
ssh hgwdev
cd /cluster/data/mm5/html
# view that image in 'display' to determine crop edges, then:
convert -crop 890x690+330+70 -quality 80 -sharpen 0 \
-normalize C57BL_6J.JPG mm.jpg
convert -geometry 300x200 -quality 80 mm.jpg Mus_musculus.jpg
rm -f mm.jpg
cp -p Mus_musculus.jpg /usr/local/apache/htdocs/images
# add links to this image in the description.html page, request push
# ANDY LAW CPGISLANDS (DONE 1/14/05 angie)
# See notes about this in makeGalGal2.doc.
# Running only on masked sequence.
ssh kksilo
mkdir /cluster/data/mm5/bed/cpgIslandGgfAndy
cd /cluster/data/mm5/bed/cpgIslandGgfAndy
cp /dev/null cpgIslandGgfAndyMasked.bed
foreach f (../../?{,?}/chr*.fa.masked)
set chr = $f:t:r:r
echo preproc masked $chr
/cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.masked.preproc
echo running on $chr masked
/cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndyMasked.bed
end
# load into database:
ssh hgwdev
cd /cluster/data/mm5/bed/cpgIslandGgfAndy
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
hgLoadBed mm5 cpgIslandGgfAndyMasked -tab -noBin \
-sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
featureBits mm5 cpgIslandExt
#10422989 bases of 2615483787 (0.399%) in intersection
featureBits mm5 cpgIslandGgfAndyMasked
#38305840 bases of 2615483787 (1.465%) in intersection
wc -l ../cpgIsland/cpgIsland.bed cpgIslandGgfAndyMasked.bed
# 16238 ../cpgIsland/cpgIsland.bed
# 67737 cpgIslandGgfAndyMasked.bed
# 1/26/05: Make better island names in cpgIslandGgfAndyMasked,
# for Dave Burt's cross-species island comparisons.
ssh kksilo
cd /cluster/data/mm5/bed/cpgIslandGgfAndy
mv cpgIslandGgfAndyMasked.bed cpgIslandGgfAndyMasked.bed.orig
perl -wpe '@w=split("\t"); $w[3] = "mm5.$w[0]." . ($w[1]+1) . ".$w[2]"; \
$_ = join("\t", @w);' \
cpgIslandGgfAndyMasked.bed.orig \
> cpgIslandGgfAndyMasked.bed
ssh hgwdev
cd /cluster/data/mm5/bed/cpgIslandGgfAndy
hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \
mm5 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed
# MAKE MM5-RN3 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie)
ssh kolossus
set chainDir = /cluster/data/mm5/bed/blastz.rn3/axtChain
mkdir -p /cluster/data/mm5/bed/bedOver
mkdir /tmp/mm5ToRn3
foreach f ($chainDir/ratNet/chr*.net.gz)
set chr = $f:t:r:r
echo $chr
netChainSubset $f $chainDir/chain/$chr.chain.gz \
/tmp/mm5ToRn3/$chr.chain
end
cat /tmp/mm5ToRn3/*.chain \
> /cluster/data/mm5/bed/bedOver/mm5ToRn3.over.chain
rm -r /tmp/mm5ToRn3
# MAKE MM5-GALGAL2 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie)
ssh kolossus
set chainDir = /cluster/data/mm5/bed/blastz.galGal2/axtChain
mkdir -p /cluster/data/mm5/bed/bedOver
netChainSubset $chainDir/chicken.net $chainDir/all.chain \
/cluster/data/mm5/bed/bedOver/mm5ToGalGal2.over.chain
# UPDATE kgSpAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan)
# Add new mm5 protein display IDs to the alias table to support user search
ssh hgwdev
mkdir -p /cluster/data/mm5/bed/pb/newDisplayId
cd /cluster/data/mm5/bed/pb/newDisplayId
hgsql proteome -e 'select mm5.kgSpAlias.kgID, mm5.kgSpAlias.SpID, spOldNew.newDisplayId from spOldNew, mm5.kgSpAlias where spOldNew.acc=mm5.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >mm5.tab
# get rid of the header line at the end of the file
vi mm5.tab
hgsql mm5 -e 'load data local infile "mm5.tab" into table mm5.kgSpAlias'
# UPDATE kgProtAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan)
# Add new mm5 protein display IDs to the alias table to support user search
ssh hgwdev
cd /cluster/data/mm5/bed/pb/newDisplayId
hgsql proteome -e 'select mm5.kgSpAlias.kgID,spOldNew.oldDisplayId,spOldNew.newDisplayId from spOldNew, mm5.kgSpAlias where spOldNew.acc=mm5.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >mm5.kgProtAlias.tab
# get rid of the header line at the end of the file
vi mm5.kgProtAlias.tab
hgsql mm5 -e 'load data local infile "mm5.kgProtAlias.tab" into table mm5.kgProtAlias'
# BLASTZ/CHAIN/NET BOSTAU1 (DONE 2/21/05 angie)
ssh kksilo
mkdir /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19
cd /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19
cat << '_EOF_' > DEF
# mouse vs. cow
# TARGET
# Mouse
SEQ1_DIR=/scratch/mus/mm5/softNib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/mm5/chrom.sizes
# QUERY
# Cow
SEQ2_DIR=/iscratch/i/bosTau1/nib/bosTau1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=0
SEQ2_LEN=/iscratch/i/bosTau1/chrom.sizes
BASE=/cluster/data/mm5/bed/blastz.bosTau1.2005-02-19
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /cluster/bluearc/mouseVsCow >& do.log &
tail -f do.log
# kksilo was rebooted so original invocation of doBlastzChainNet.pl
# was killed in the middle of the cluster run. I watched the job
# progress and restarted 70 failed jobs like this:
ssh kk
cd /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19/run.blastz
para check
para push
para check ...
# When the batch was complete:
para time > run.time
# (doBlastzChainNet.pl uses run.time as a checkpoint)
# Then to continue the run:
ssh kksilo
cd /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19
doBlastzChainNet.pl -continue=cat DEF \
-blastzOutRoot /cluster/bluearc/mouseVsCow >>& do.log &
tail -f do.log
# For some reason the script got hung waiting for tty input; I
# foregrounded it, hit return a few times, and it eventually completed.
# That should be fixed in a future version of doBlastzChainNet.pl.
ln -s blastz.bosTau1.2005-02-19 /cluster/data/mm5/bed/blastz.bosTau1
# Add chainBosTau1 and netBosTau1 to mm5/trackDb.ra
# Add /usr/local/apache/htdocs/goldenPath/mm5/vsBosTau1/README.txt
# LOAD SNPS (Done; March 3, 2005; Heather)
# directory structure
ssh hgwdev
cd /cluster/bluearc/snp
mkdir mm5.heather
cd mm5.heather
mkdir det loc seq str xml
# get data
ftp ftp.ncbi.nih.gov
cd snp/mouse/XML
prompt
mget ds_ch*.xml.gz
# make sure script is current (should add makefile so general build does this)
cp -f /cluster/home/heather/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts
# build jobList for parsing
touch jobList
foreach file ( ds_ch*.xml.gz )
set out = $file:t:r
echo /cluster/bin/scripts/parseDbSnpXML $file /cluster/bluearc/snp/mm5.heather $out.contig >> jobList
end
# do the parsing
ssh kk
cd /cluster/bluearc/snp/mm5.heather
para create jobList
para try
para check
para push
# output goes to det, loc, seq, str and xml directories
# concatenate details
ssh hgwdev
zcat det/ds_ch*.xml.contig.det.gz > in.bed
# couldn't find contig-based lift file from mm5
# generate from ctgPos
echo "select chromStart, chrom, contig, size, chrom from ctgPos;" > ctgPos.sql
hgsql mm5 < ctgPos.sql > ctgPos.out
# edit ctgPos.out to put in proper format -- next time write script for this
# lift
# expect warnings from non-reference assemblies (limited to first 10)
liftUp out.bed ctgPos.out warn in.bed
# load (exception column will be empty for all rows)
hgLoadBed mm5 snp out.bed -sqlTable=/cluster/home/heather/kent/src/hg/lib/snp.sql
# generate exceptions 1-20; drop 7 and 9 as they will be changing
cd /usr/local/apache/htdocs/qa/test-results/snpException
mkdir mm5
cd mm5
snpException mm5 0 mm5snpException
# Invariant 1 has 0 exceptions, written to this file: mm5snpException.01.bed
# Invariant 2 has 0 exceptions, written to this file: mm5snpException.02.bed
# Invariant 3 has 0 exceptions, written to this file: mm5snpException.03.bed
# Invariant 4 has 0 exceptions, written to this file: mm5snpException.04.bed
# Invariant 5 has 0 exceptions, written to this file: mm5snpException.05.bed
# Invariant 6 has 3 exceptions, written to this file: mm5snpException.06.bed
# Invariant 7 has 1 exceptions, written to this file: mm5snpException.07.bed
# Invariant 8 has 0 exceptions, written to this file: mm5snpException.08.bed
# Invariant 9 has 22 exceptions, written to this file: mm5snpException.09.bed
# Invariant 10 has 0 exceptions, written to this file: mm5snpException.10.bed
# Invariant 11 has 0 exceptions, written to this file: mm5snpException.11.bed
# Invariant 12 has 0 exceptions, written to this file: mm5snpException.12.bed
# Invariant 13 has 0 exceptions, written to this file: mm5snpException.13.bed
# Invariant 14 has 0 exceptions, written to this file: mm5snpException.14.bed
# Invariant 15 has 0 exceptions, written to this file: mm5snpException.15.bed
# Invariant 16 has 0 exceptions, written to this file: mm5snpException.16.bed
# Invariant 17 has 0 exceptions, written to this file: mm5snpException.17.bed
# Invariant 18 has 3634 exceptions, written to this file: mm5snpException.18.bed
# Invariant 19 has 0 exceptions, written to this file: mm5snpException.19.bed
# Invariant 20 has 0 exceptions, written to this file: mm5snpException.20.bed
# Invariant 21 has no query string
# Invariant 22 has no query string
# Invariant 23 has no query string
# Invariant 24 has no query string
mv mm5snpException.07.bed mm5snpException.07.bed.notused
mv mm5snpException.09.bed mm5snpException.09.bed.notused
# snpValid
cd /cluster/bluearc/snp/mm5.heather/seq
nice snpValid mm5 . > & snpValid.out &
tail -20 snpValid.out
# Grand Totals:
# matches: 494545
# mismatches: 246 (exceptionId #22)
# missing from flanks: 0 (exceptionId #23)
# rev compl matches: 56285
# not rptd strand : 1 (exceptionId #24)
# assembly = -: 0
# nib in gap : 0 (must be 0)
# Total rows in snp: 494791
# no dna found for : 0
# Total goodExact: 493886
# Total badExact: 534 (exceptionId #21)
# copy 21-24 exceptions to location of 1-20
cp *bed /usr/local/apache/htdocs/qa/test-results/snpException/mm5
# add exception data to snp table
cp ../build124/updateExceptionList.pl .
tail +3 mm5snpException.* | awk '/rs/ {printf "%s\t%d\t%d\n",$4,$2,$5}' | sort -k1,2n > exceptionList.txt
updateExceptionList.pl < exceptionList.txt > updateExceptionList.sql
hgsql mm5 < updateExceptionList.sql
# HUMAN BLASTP FOR GENE SORTER (RE-DONE 7/28/05 Fan)
# Make human ortholog column using blastp on human known genes.
# First make human protein database and copy it to iscratch/i
# if it doesn't exist already:
# NOTE: THE SECTION BELOW WAS ALREADY DONE.
cd /cluster/data/hg17/bed/blastp
pepPredToFa hg17 knownGenePep known.faa
formatdb -i known.faa -t known -n known
ssh kkr1u00
if (-e /iscratch/i/hg17/blastp) then
rm -r /iscratch/i/hg17/blastp
endif
mkdir -p /iscratch/i/hg17/blastp
cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp
iSync
# THE SECTION ABOVE WAS ALREADY DONE PREVIOUSLY.
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/mm5/bed/blastp/hg17/run/out
cd /cluster/data/mm5/bed/blastp/hg17/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/hg17/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/mm5/bed/geneSorter/blastp/split >split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
Completed: 7739 of 7739 jobs
CPU time in finished jobs: 113019s 1883.65m 31.39h 1.31d 0.004 y
IO & Wait Time: 22145s 369.08m 6.15h 0.26d 0.001 y
Average job time: 17s 0.29m 0.00h 0.00d
Longest running job: 0s 0.00m 0.00h 0.00d
Longest finished job: 124s 2.07m 0.03h 0.00d
Submission to last job: 495s 8.25m 0.14h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/mm5/bed/blastp/hg17/run/out
hgLoadBlastTab mm5 hgBlastTab -maxPer=1 *.tab
# KNOWN GENES
# This was built using ~/kent/src/hg/protein/KGprocess.sh
# and it was not documented.
# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProtAlias tables. The hgKgGetText takes
# about 5 minutes when the database is not too busy. The rest
# is real quick.
ssh hgwdev
cd /cluster/data/mm5/bed/
mkdir -p kgMm5/index
cd kgMm5/index
hgKgGetText mm5 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/data/mm5/bed/kgMm5/index/knownGene.ix /gbdb/mm5/knownGene.ix
ln -s /cluster/data/mm5/bed/kgMm5/index/knownGene.ixx /gbdb/mm5/knownGene.ixx
# RE-BUILD cgapAlias TABLE
# ORIGINALLY TABLE WAS BUILT BY THE KNOWN GENES PROCESS
# cgapAlias table has replicate rows so remove (DONE, 2005-07-26, hartera)
# RELOADED cgapAlias AGAIN AS TOO MANY ROWS REMOVED BEFORE (hartera, 2005-10-06)
ssh hgwdev
cd /cluster/store6/kgDB/bed/kgMm5B
# DO TABLE RELOAD AGAIN AS sort -nu REMOVES MORE ROWS THAN sort -u
# OR sort -n | uniq.
# USE sort -n then uniq TO SORT ON THE IDs AND THEN UNIQ
# (hartera, 2005-10-06)
sort -n cgapAlias.tab | uniq > cgapAliasSorted.tab
hgsql mm5 -e "drop table cgapAlias"
hgsql mm5 < ~/kent/src/hg/lib/cgapAlias.sql
hgsql mm5 -e 'load data local infile "cgapAliasSorted.tab" \
into table cgapAlias'
# Create table that maps between known genes and visiGene database (DONE 2005-10-10 galt)
knownToVisiGene mm5
#Made hashes of image: geneImageHash 2117, locusLinkImageHash 780, refSeqImageHash 780,
#genbankImageHash 1301
#knownToLocusLink 30303, knownToRefSeq 30291, knownToGene 266841
# RIKEN CAGE STUFF (DONE 11-16-2005 Andy)
# Make download area.
ssh hgwdev
cd /cluster/data/mm5/bed
mkdir rikenCageCtss
cd rikenCageCtss/
wget -r http://fantom31p.gsc.riken.jp/cage_analysis/export/mm5/
# stupid thing didn't work. Tried tinkering with wget almost every way possible.
# Finally just did it the hard way.
# Scrape the export directory's index page for the links that mention .sql or
# .bz2 and save the bare href targets, one per matching line, in files.lst.
# The trailing backslash is required: without it the pipeline is cut in two,
# the page is never piped anywhere, and the shell rejects the line that
# starts with "|".
wget -O /dev/stdout http://fantom31p.gsc.riken.jp/cage_analysis/export/mm5/ 2> /dev/null \
| egrep ".sql|.bz2" | grep href | sed 's/^.*href=\"//;s/\".*$//' > files.lst
# Clear out anything named fantom* in this directory (presumably the tree
# left behind by the earlier "wget -r" attempt).
rm -rf fantom*
# Fetch each listed file individually (bash loop syntax).
for f in `cat files.lst`; do
wget http://fantom31p.gsc.riken.jp/cage_analysis/export/mm5/$f;
done
bunzip2 *.bz2
# Make the simple table of the CAGE-related TSSs.
awk 'BEGIN{FS="\t"};{printf("%s\t%s\t%s\t%s\t%s\t1000\t%s\n",$9,$4,$7,$8,$1,($6 == "F") ? "+" : "-")}' \
tss_summary.tsv | grep "^CAGE" | cut -f2- > basicCAGE.bed
# Make CAGE wiggle tracks for plus and minus strands
awk 'BEGIN{FS="\t"}; {if ($4=="F") printf("%s\t%s\t%d\t%s\n", $2, $5, $5+1, $6)}' \
ctss_summary.tsv | wigEncode stdin ctssForward.wig ctssForward.wib
awk 'BEGIN{FS="\t"}; {if ($4=="R") printf("%s\t%s\t%d\t%s\n", $2, $5-1, $5, $6)}' \
ctss_summary.tsv | wigEncode stdin ctssReverse.wig ctssReverse.wib
mkdir wiggle
mv ctss*.wi{g,b} wiggle/
# Load stuff up:
hgLoadBed mm5 rikenCageTc basicCAGE.bed
ln -s /cluster/data/mm5/bed/rikenCAGE/wiggle /gbdb/mm5/wib/ctssForward
ln -s /cluster/data/mm5/bed/rikenCAGE/wiggle /gbdb/mm5/wib/ctssReverse
hgLoadWiggle mm5 ctssForward ctssForward.wig
hgLoadWiggle mm5 ctssReverse ctssReverse.wig
# OK make them bedGraphs instead.
cd ../
rm -rf wiggle/
rm /gbdb/mm5/wib/ctss*
hgsql mm5 -e 'drop table ctssForward'
hgsql mm5 -e 'drop table ctssReverse'
awk 'BEGIN{FS="\t"}; {if ($4=="F") printf("%s\t%s\t%d\t%s\n", $2, $5, $5+1, $6)}' \
ctss_summary.tsv | hgLoadBed -strict -bedGraph=4 mm5 rikenCageCtssPlus stdin
awk 'BEGIN{FS="\t"}; {if ($4=="R") printf("%s\t%s\t%d\t%s\n", $2, $5-1, $5, $6)}' \
ctss_summary.tsv | hgLoadBed -strict -bedGraph=4 mm5 rikenCageCtssMinus stdin
# track html:
cp rikenCageCtss.html ~/kent/src/hg/makeDb/trackDb/mouse/
# trackDb:
track rikenCageTc
shortLabel Riken CAGE TC
longLabel Riken CAGE - Associated Transcript Clusters
group genes
priority 47.5
visibility hide
type bed 6 .
track rikenCageCtss
compositeTrack on
shortLabel Riken CAGE
longLabel Riken CAGE - Predicted Gene Start Sites
group genes
priority 47.51
visibility hide
type bedGraph 4
maxHeightPixels 128:16:16
minLimit 1
maxLimit 4316
viewLimits 1.0:10.0
windowingFunction mean
autoScale Off
origAssembly hg16
track rikenCageCtssPlus
subTrack rikenCageCtss
shortLabel Riken CAGE +
longLabel Riken CAGE Plus Strand - Predicted Gene Start Sites
priority 1
color 109,51,43
track rikenCageCtssMinus
subTrack rikenCageCtss
shortLabel Riken CAGE -
longLabel Riken CAGE Minus Strand - Predicted Gene Start Sites
priority 2
color 43,51,109
# MYTOUCH FIX - jen - 2006-01-24
sudo mytouch mm5 geneidPep 0408071900.00
sudo mytouch mm5 genscanPep 0501071300.00
sudo mytouch mm5 superfamily 0503011100.00
sudo mytouch mm5 ensGtp 0503011100.00
sudo mytouch mm5 knownToEnsembl 0503011100.00
sudo mytouch mm5 sfDescription 0503011100.00
############################################################################
# Mm7 to Mm5 liftOver creation (DONE - 2006-02-22 - 2006-02-24 - Hiram)
# instructions lifted from Andy's sequence in makeMm7.doc
######## LIFTOVER PREPARATION
# Split up mm5
ssh kkr1u00
cd /iscratch/i/mm5
mkdir liftSplits
mkdir liftSplits/split
mkdir liftSplits/lift
for fa in /cluster/data/mm5/?/*.fa /cluster/data/mm5/??/*.fa
do
c=`basename $fa .fa`
echo $c
faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 \
liftSplits/split/$c
done
mkdir biggerSplits
mkdir biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chrX.fa 10 chrX_
rm chr{1,X}.fa
for R in 2 3 4 5 6 7 8
do
rsync -a --progress /iscratch/i/mm5/ kkr${R}u00:/iscratch/i/mm5/
done
######## LIFTOVER BLATING
# MM7
ssh kk
cd /cluster/data/mm7
/cluster/bin/scripts/makeLoChain-align mm7 /scratch/hg/mm7/nib mm5 \
/iscratch/i/mm5/biggerSplits/split
cd bed/blat.mm5.2006-02-22/run
# target is Mm7
# query is Mm5
# Write blat.csh, the per-job wrapper that the jobList entries invoke.
# Arguments: $1 target, $2 query, $3 final output path, $4 chain name
# (used only to build the temporary directory name).
# The wrapper runs blat into /scratch/tmp/<chain>.<targetRoot>_<queryRoot>/,
# then creates the output's directory, copies the result there, and removes
# the temporary file and (if it is empty) the temporary directory.
# The quoted '_EOF_' keeps the $variables below from being expanded while
# the here-document is written.
cat << '_EOF_' > blat.csh
#!/bin/csh -fe
set target=$1
set query=$2
set output=$3
set chain=$4
set tPart=$target:t:r
set qPart=$query:t:r
set tmpDir=/scratch/tmp/${chain}.${tPart}_${qPart}
set tmpOutput=$tmpDir/$output:t
mkdir -p $tmpDir
sleep 2
/cluster/bin/$MACHTYPE/blat $target $query $tmpOutput \
-tileSize=11 -minScore=100 -minIdentity=98 -fastMap \
-ooc=/iscratch/i/mm5/11.ooc
mkdir -p `dirname $output`
cp $tmpOutput $output
rm $tmpOutput
rmdir --ignore-fail-on-non-empty $tmpDir
'_EOF_'
# happy emacs
chmod +x blat.csh
sed 's#^blat#./blat.csh#; s/\}.*$/}/; s/$/ mm7ToMm5/' spec > jobList
para create jobList
para -maxNode=200 -priority=25 push
para time
# Completed: 2451 of 2451 jobs
# CPU time in finished jobs: 1266001s 21100.02m 351.67h 14.65d 0.040 y
# IO & Wait Time: 13972s 232.87m 3.88h 0.16d 0.000 y
# Average job time: 522s 8.70m 0.15h 0.01d
# Longest finished job: 6769s 112.82m 1.88h 0.08d
# Submission to last job: 26506s 441.77m 7.36h 0.31d
######## LIFTOVER CHAINING
# LIFTING
ssh kki
cd /cluster/data/mm7/bed/blat.mm5.2006-02-22
cat << '_EOF_' > mm5SplitLift.sh
#!/bin/bash
# Run from the directory that holds the raw blat output (*.psl).
# 1) For chr1 and chrX, whose query files carry a numbered piece suffix,
#    join the per-piece psl files into one psl per target/query pair and
#    put a single psl header back on top.
# 2) liftUp the query coordinates (-pslQ) of each chromosome's psl files
#    into ../psl/<chrom>.psl.
for C in chr1 chrX
do
echo joining $C
# P is the part of each file name in front of the first "_chr".
for P in `ls *_${C}_[0-9]*.psl | sed -e "s/_chr.*//" | sort -u`
do
echo "${P}_${C}.psl"
# The glob must stay outside the quotes; inside them bash would hand tail
# the literal pattern as a file name.  --lines=+6 starts output at line 6,
# i.e. skips the header lines of every piece.
tail --lines=+6 -q "${P}_${C}"_[0-9]*.psl > ${P}_${C}.psl
done
# Prepend one copy of the saved psl header to each joined file.
for f in *_${C}.psl; do
cat /san/sanvol1/scratch/andy/psl.header $f > tmp
mv tmp $f
done
done
echo Lifting...
for C in `awk '{print $1}' /cluster/data/mm5/chrom.sizes`; do
echo "lifting $C ... "
liftUp -pslQ ../psl/${C}.psl \
/iscratch/i/mm5/biggerSplits/lift/${C}.lft error chr*_${C}.psl
echo done $C
done
'_EOF_'
# happy emacs
chmod +x mm5SplitLift.sh
# Write mm5ChainMergeSplit.sh: copies chainRaw/ to /scratch/andy/mm5Lifts,
# merge-sorts the raw chains there and splits them into chain/, copies
# chain/ back to the directory the script was started from (`dirs +1`
# after the pushd), then removes the scratch chain and chainRaw copies.
# NOTE(review): no invocation of this script appears in the steps that
# follow; the merge/split is later run directly with
# chainMergeSort | chainSplit -- confirm whether this script was used.
cat << "EOF" > mm5ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/andy/mm5Lifts
pushd /scratch/andy/mm5Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
EOF
chmod +x mm5ChainMergeSplit.sh
cd /cluster/data/mm7/bed/blat.mm5.2006-02-22/raw
../mm5SplitLift.sh
cd ../
mkdir chainRun chainRaw
cd chainRun
cat << '_EOF_' > template
#LOOP
axtChain -linearGap=medium -verbose=0 -psl $(path1) /scratch/hg/mm7/nib /cluster/data/mm5/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'_EOF_'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single template jobList
para create jobList
para push
para time
# Completed: 43 of 43 jobs
# CPU time in finished jobs: 7259s 120.98m 2.02h 0.08d 0.000 y
# IO & Wait Time: 1086s 18.10m 0.30h 0.01d 0.000 y
# Average job time: 194s 3.23m 0.05h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1088s 18.13m 0.30h 0.01d
# Submission to last job: 2289s 38.15m 0.64h 0.03d
ssh kkstore02
cd /cluster/data/mm7/bed/blat.mm5.2006-02-22
mkdir chain
time chainMergeSort chainRaw/* | chainSplit chain stdin
# real 29m42.365s
mkdir net over
cd chain
for c in *.chain
do
echo ${c%.chain};
nice chainNet $c /cluster/data/mm7/chrom.sizes \
/cluster/data/mm5/chrom.sizes ../net/${c%.chain}.net /dev/null
echo done $c
done
# real 15m33.593s
for chain in *.chain
do
c=${chain%.chain}
nice netChainSubset ../net/$c.net $chain ../over/$c.over
done
# real 10m48.898s
########## FINISHING
ssh kkstore02
cd /cluster/data/mm7/bed/blat.mm5.2006-02-22/over
cat * > ../mm7ToMm5.over.chain
cd ..
gzip mm7ToMm5.over.chain
rm -rf psl net chain chainRaw over
ssh hgwdev
cd /cluster/data/mm7/bed
ln -s blat.mm5.2006-02-22 blat.mm5
ln -s `pwd`/blat.mm5/mm7ToMm5.over.chain.gz liftOver/mm7ToMm5.over.chain.gz
ln -s `pwd`/liftOver/mm7ToMm5.over.chain.gz \
/gbdb/mm7/liftOver/mm7ToMm5.over.chain.gz
ln -s `pwd`/liftOver/mm7ToMm5.over.chain.gz \
/usr/local/apache/htdocs/goldenPath/mm7/liftOver/mm7ToMm5.over.chain.gz
hgAddLiftOverChain mm7 mm5 /gbdb/mm7/liftOver/mm7ToMm5.over.chain.gz
############################################################################
# UPDATED mm5.knownToVisiGene (2006-03-21 galt)
ssh hgwdev
knownToVisiGene mm5
#######################################################################
## LIFTOVER To Mm8 (DONE - 2006-05-15 - 2006-06-05 - Hiram)
ssh kkr1u00
# do not need to run this command since /cluster/data/mm8/split10k
# already exists from previous liftOver jobs (mm7 to mm8)
# $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
# mm8 /cluster/data/mm8/nib
# as it says, DO THIS NEXT:
ssh kk
# if bin/scripts is not in your PATH, add it for this command:
PATH=$PATH:/cluster/bin/scripts \
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
mm5 /cluster/data/mm5/nib mm8 /iscratch/i/mm8/split10k \
/cluster/data/mm8/11.ooc
# as it says, DO THIS NEXT:
cd /cluster/data/mm5/bed/blat.mm8.2006-05-15/run
para try, check, push, check, ...
# Completed: 1462 of 1462 jobs
# CPU time in finished jobs: 3990246s 66504.10m 1108.40h 46.18d 0.127 y
# IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y
# Average job time: 2371s 39.51m 0.66h 0.03d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 24307s 405.12m 6.75h 0.28d
# Submission to last job: 1474509s 24575.15m 409.59h 17.07d
# as it says, DO THIS NEXT:
# this does the liftUp and makes the psl files
# kkr1u00 is down these days
ssh kkr3u00
cd /cluster/data/mm5/bed
ln -s blat.mm8.2006-05-15 blat.mm8
# edit this script to allow use on kkr3u00
time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh mm5 mm8
# real 16m5.091s
# as it says, DO THIS NEXT:
# this prepares the batch to run for the chaining
ssh kki
time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
mm5 /cluster/data/mm5/nib mm8 /cluster/data/mm8/nib
# as it says, DO THIS NEXT:
# running the chain batch
cd /cluster/data/mm5/bed/blat.mm8.2006-05-15/chainRun
para try, check, push, check, ...
# Completed: 34 of 34 jobs
# CPU time in finished jobs: 6893s 114.88m 1.91h 0.08d 0.000 y
# IO & Wait Time: 7183s 119.72m 2.00h 0.08d 0.000 y
# Average job time: 414s 6.90m 0.12h 0.00d
# Longest finished job: 1130s 18.83m 0.31h 0.01d
# Submission to last job: 1130s 18.83m 0.31h 0.01d
ssh kkstore03
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh mm5 mm8
# Created /cluster/data/mm5/bed/liftOver/mm5ToMm8.over.chain.gz
# as it says, DO THIS NEXT:
ssh hgwdev
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh mm5 mm8
# It says this:
# Now, add link for
# /usr/local/apache/htdocs/goldenPath/mm5/liftOver/mm5ToMm8.over.chain
# to hgLiftOver
# But I believe that link was already done:
cd /gbdb/mm5/liftOver
ls -og mm5ToMm8*
# lrwxrwxrwx 1 53 Jun 5 16:10 mm5ToMm8.over.chain.gz ->
# /cluster/data/mm5/bed/liftOver/mm5ToMm8.over.chain.gz
#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie)
# File emailed from Xinwei She <xws@u.washington.edu>
mkdir /cluster/data/mm5/bed/genomicSuperDups
cd /cluster/data/mm5/bed/genomicSuperDups
sed -e 's/\t_\t/\t-\t/' mm5_genomicSuperDup.tab \
| awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
| hgLoadBed mm5 genomicSuperDups stdin \
-sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna mm5