src/hg/makeDb/doc/mm8.txt 1.76
1.76 2010/01/07 20:47:49 rhead
Added note about renaming mm8 jaxQTL table to jaxQtl.
Index: src/hg/makeDb/doc/mm8.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/mm8.txt,v
retrieving revision 1.75
retrieving revision 1.76
diff -b -B -U 1000000 -r1.75 -r1.76
--- src/hg/makeDb/doc/mm8.txt 20 Sep 2009 17:16:45 -0000 1.75
+++ src/hg/makeDb/doc/mm8.txt 7 Jan 2010 20:47:49 -0000 1.76
@@ -1,9647 +1,9651 @@
# for emacs: -*- mode: sh; -*-
# This file describes browser build for the mouse
# genome, February 2006, ncbi mouse_36 - Mm8
#
# "$Id$"
#
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+-------------------------------------+
# | tableName | type |
# +-------------+-------------------------------------+
# | knownGene | genePred knownGenePep knownGeneMrna |
# | refGene | genePred refPep refMrna |
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | mgcGenes | genePred |
# | ensGene | genePred ensPep |
# | genscan | genePred genscanPep |
# +-------------+-------------------------------------+
#######################################################################
# DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2006-02-14 - Hiram)
#
# Examine disk space issues, find some goodly amount of space
ssh kkstore01
mkdir /cluster/store9/mm8
ln -s /cluster/store9/mm8 /cluster/data/mm8
cd /cluster/data/mm8
mkdir ncbi
cd ncbi
cp -p /cluster/data/mm7/ncbi/.wgetrc .
WGETRC=`pwd`/.wgetrc
export WGETRC
wget --timestamping --force-directories --directory-prefix=. \
--dont-remove-listing --recursive --level=4 --no-parent \
--no-host-directories --cut-dirs=1 \
ftp://ftp-private.ncbi.nih.gov/mouse_36
# Downloaded: 2,201,934,141 bytes in 50 files
# real 44m48.975s
# The pre-release sequence, Feb 27th:
mkdir /cluster/data/mm8/pre_release
cd /cluster/data/mm8/pre_release
# The .wgetrc is the anonymous user
WGETRC=`pwd`/.wgetrc
export WGETRC
wget --timestamping --force-directories --directory-prefix=. \
--dont-remove-listing --recursive --level=4 --no-parent \
--no-host-directories --cut-dirs=3 \
ftp://ftp.ncbi.nih.gov/genomes/M_musculus/pre_release
# Fixup the agp and seq_contig.md files to add chrM
# No chrM or chrMT was delivered. Copy from previous assembly
ssh kkstore01
cd /cluster/data/mm8/ncbi/chrfasta
cp -p /cluster/data/mm7/ncbi/chrfasta/chrM.fa.gz .
cd ../contigfasta
cp -p /cluster/data/mm7/ncbi/contigfasta/chrM.fa.gz .
# with a fixed up header line to be like all the others:
# >lcl|chrM.fa gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion, complete genome
cd /cluster/data/mm8
zcat ncbi/allrefcontig.chr.agp.gz > allrefcontig.chr.agp
echo -e "chrM\t1\t16299\t1\tF\tAY172335.1\t1\t16299\t+" >> \
allrefcontig.chr.agp
gzip allrefcontig.chr.agp
# I don't see allcontig.agp being used anywhere else ?
# zcat ncbi/allcontig.agp.gz > allcontig.agp
# echo -e "NC_005089\t1\t16299\t1\tF\tAY172335\t\t1\t16299\t+" >> \
# allcontig.agp
# gzip allcontig.agp
zcat ncbi/seq_contig.md.gz | egrep -v "Celera|129_substrain" \
| sed -e "238i\
10090\tM\t0\t0\t+\tstart\t-1\tCONTIG\tC57BL/6J\t10\n\
10090\tM\t1\t16299\t+\tNC_005089\tGI:34538597\tCONTIG\tC57BL/6J\tna\n\
10090\tM\t16299\t16299\t+\tend\t-2\tCONTIG\tC57BL/6J\t10" > seq_contig.md
# (curiously, this sed command would not work on hgwdev,
# only when logged into kkstore01 ?)
# The line number 238 was found by checking the contents of
# ncbi/seq_contig.md.gz (after the egrep filter) and it was
# the line starting with:
# 10090 Un|NT_039877 1 35798
# Wanted this chrM information before that line.
# summarize sequence counts
mkdir faCounts
time faCount ncbi/chrfasta/chr*.fa.gz > faCounts/chrfasta.faCount 2>&1 &
# about 1.5 minutes
time faCount ncbi/contigfasta/chr*.fa.gz > \
faCounts/contigfasta.faCount 2>&1 &
# about 3 minutes
time zcat ncbi/chrfasta/chr*.fa.gz | grep "^>" > \
faCounts/chrfasta.headers 2>&1 &
time zcat ncbi/contigfasta/chr*.fa.gz | grep "^>" > \
faCounts/contigfasta.headers 2>&1 &
# about 2 minutes each for the above two zcat/greps
#############################################################################
# BREAK UP SEQUENCE INTO 5 MB CHUNKS at NON-BRIDGED CONTIGS
# (DONE - 2006-02-14 - Hiram)
######### Are these necessary ? They may no longer be needed.
######### TRF can run on full chroms on the kki kluster
# It would be better to use . in place of the /cluster/data/mm8
# for the outputDir argument to splitFaIntoContigs so this script
# is independent of specific locations, thus it works in .
ssh kkstore01
cd /cluster/data/mm8
for F in ncbi/chrfasta/chr*.fa.gz
do
CHR=`basename ${F} | sed -e "s/.fa.gz//; s/chr//"`
echo ${CHR} ${F}
mkdir -p "${CHR}"
zcat allrefcontig.chr.agp.gz | \
perl -we "while(<>){if (/^chr${CHR}\t/) {print;}}" > \
${CHR}/chr${CHR}.agp
zcat ncbi/chrfasta/chr${CHR}.fa.gz | \
perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' | \
splitFaIntoContigs ${CHR}/chr${CHR}.agp \
stdin /cluster/data/mm8 -nSize=5000000
done
# The above loop takes about 5 minutes
# Some of these in the chr1 directory got overwritten on 2006-02-27
# during an attempt to verify that the pre-release directory at
# NCBI was the same as what we worked with here.
#############################################################################
# CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2006-02-14 - Hiram)
ssh kkstore01
mkdir /cluster/data/mm8/jkStuff
cd /cluster/data/mm8
mkdir Un tmp
cp -p /cluster/data/mm7/jkStuff/ncbiFixAgp ./jkStuff
zcat allrefcontig.chr.agp.gz | ./jkStuff/ncbiFixAgp /dev/stdin > \
allrefcontig.chr.ordered.agp
# Set the appropriate release number here: the mm7 script has 35, this build is 36
# Fetch the script from the previous assembly
sed -e "s/buildNum = 35/buildNum = 36/" \
/cluster/data/mm7/jkStuff/ncbiToRandomAgps > \
jkStuff/ncbiToRandomAgps
chmod +x jkStuff/ncbiToRandomAgps
# NOTE ! * ! This mm8 contig.idmap now includes the celera assembly
# Filter that out for use here.
# There were two broken lines that began _36 - they were removed
# after I reported them and the contig.idmap.gz file here was
# updated later.
zcat ncbi/contig.idmap.gz | grep ref_strain | grep -v "^_36" \
| ./jkStuff/ncbiToRandomAgps seq_contig.md \
allrefcontig.chr.ordered.agp \
/dev/stdin . 2> dbg
for C in ? ??
do
if [ -s ${C}/chr${C}_random.ctg.agp ]; then
echo "building ${C}/chr${C}_random.fa"
rm -f ./tmp.fa
zcat ncbi/contigfasta/chr${C}.fa.gz | \
perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' > ./tmp.fa
agpToFa -verbose=2 -simpleMulti \
${C}/chr${C}_random.ctg.agp chr${C}_random \
${C}/chr${C}_random.fa ./tmp.fa
rm -f ./tmp.fa
fi
done > tmp/agpToFa.out 2>&1
# the above loop takes about 3 minutes, examine the tmp/agpToFa.out
# record for any errors
# We need the lift information from these random.ctg.agp files
cp -p /cluster/data/mm7/jkStuff/agpToLift.pl ./jkStuff
for AGP in ?/*_random.ctg.agp ??/*_random.ctg.agp
do
CHR=`dirname ${AGP}`
echo ${CHR}
mkdir -p ${CHR}/lift
./jkStuff/agpToLift.pl ${AGP} > ${CHR}/lift/ctg_random.lft
done
# Clean these up to avoid confusion later... they're easily rebuilt
# with the ncbiToRandomAgps script above
rm ?/*_random.ctg.agp ??/*_random.ctg.agp
gzip seq_contig.md allrefcontig.chr.ordered.agp
#############################################################################
# BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS
# (DONE 2006-02-14 - Hiram)
ssh kkstore01
cd /cluster/data/mm8
for C in ? ??
do
if [ -s ${C}/chr${C}_random.fa ]; then
splitFaIntoContigs -nSize=5000000 ${C}/chr${C}_random.agp \
${C}/chr${C}_random.fa .
mkdir -p ${C}/lift
rm -f ${C}/lift/rOut.lst ${C}/lift/random.lft ${C}/lift/random.lst
mv ${C}_random/lift/oOut.lst ${C}/lift/rOut.lst
mv ${C}_random/lift/ordered.lft ${C}/lift/random.lft
mv ${C}_random/lift/ordered.lst ${C}/lift/random.lst
rmdir ${C}_random/lift
rm ${C}_random/chr${C}_random.agp ${C}_random/chr${C}_random.fa
rm -rf ${C}/chr${C}_random_*
mv ${C}_random/chr${C}_random_* ${C}
rmdir ${C}_random
fi
done > tmp/split.out 2>&1
# the above loop takes less than a minute
# scan the tmp/split.out file for possible errors
#############################################################################
# MAKE LIFTALL.LFT (DONE - 2006-02-14 - Hiram)
ssh kkstore01
cd /cluster/data/mm8
cat ?/lift/*.lft ??/lift/*.lft > jkStuff/liftAll.lft
#############################################################################
# CREATING DATABASE (DONE - 2006-02-14 - Hiram)
# Build the initial (unmasked) 2bit file from the per-chromosome fasta and
# derive chrom.sizes, chrom.lst and chromInfo.tab from it.
ssh kkstore01
cd /cluster/data/mm8
faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
mm8.2bit
# Reverse numeric sort keyed from the second column.  -k2 replaces the
# obsolete "+1" key syntax, which current sort implementations reject.
twoBitInfo mm8.2bit stdout | sort -rn -k2 > chrom.sizes
# chrom.lst: non-random names with the "chr" prefix stripped
grep -v random chrom.sizes | cut -f1 | sed -e "s/chr//" > chrom.lst
twoBitInfo mm8.2bit stdout |
awk '{printf "%s\t%s\t/gbdb/mm8/mm8.2bit\n", $1,$2}' > chromInfo.tab
ssh hgwdev
cd /cluster/data/mm8
hgsql -e "create database mm8;" mysql
# Make sure we have enough room (eventually ~ 70Gb) for mysql tables:
df -h | grep mysql
# /dev/sda1 472G 225G 223G 51% /var/lib/mysql2
# /dev/sdc1 1.8T 1.5T 190G 89% /var/lib/mysql
# CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2006-02-14 - Hiram)
# Use any of the newest databases to ensure that the organization
# of the grp table is up to date
ssh hgwdev
hgsql mm8 -e "create table grp (PRIMARY KEY(NAME)) select * from hg18.grp"
hgsql mm8 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql mm8 -e 'load data local infile "chromInfo.tab" into table chromInfo;'
# Enter mm8 into dbDb and defaultDb so test browser knows about it:
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
VALUES("mm8", "Feb 2006", "/gbdb/mm8", "Mouse", \
"chr6:28912411-28925620", 1, 22, "Mouse", \
"Mus musculus", "/gbdb/mm8/html/description.html", 0, 0, \
"NCBI Build 36");' -h localhost hgcentraltest
# Reset default position to be like Mm7, 2006-03-09 - Hiram
hgsql -e \
'update dbDb set defaultPos="chr12:50258170-50263946" where name="mm8";' \
hgcentraltest
# Do *NOT* set default genome on genome-test until ready for release
# hgsql hgcentraltest \
# -e 'update defaultDb set name="mm8" where genome="Mouse";'
# start a new entry in the trackDb hierarchy
cd $HOME/kent/src/hg/makeDb/trackDb/mouse
mkdir mm8
cvs add mm8
cd mm8
cp ../mm7/description.html .
vi description.html - fixup text for this assembly
cvs add description.html
cvs commit
cd ../..
vi makefile - add mm8 to the list
mkdir /cluster/data/mm8/html
mkdir /gbdb/mm8
ln -s /cluster/data/mm8/html /gbdb/mm8/html
ln -s /cluster/data/mm8/mm8.2bit /gbdb/mm8/mm8.2bit
cp -p mouse/mm8/description.html /gbdb/mm8/html
make DBS=mm8
#############################################################################
# GOLD GAP tracks (DONE - 2006-02-14 - Hiram)
ssh hgwdev
cd /cluster/data/mm8
# make sure these tmp contig agp files are gone, easily generated
# as above with jkStuff/ncbiToRandomAgps
mkdir ffa
zcat ncbi/sequence.inf.gz > ffa/sequence.inf
hgGoldGapGl -chromLst=chrom.lst mm8 /cluster/data/mm8 .
featureBits mm8 gold
# 2567283971 bases of 2567283971 (100.000%) in intersection
featureBits mm7 gold
# 2583394090 bases of 2583394090 (100.000%) in intersection
featureBits mm6 gold
# 2597150411 bases of 2597150411 (100.000%) in intersection
featureBits mm5 gold
# 2615483787 bases of 2615483787 (100.000%) in intersection
featureBits mm4 gold
# 2627444668 bases of 2627444668 (100.000%) in intersection
featureBits mm8 gap
# 97171117 bases of 2567283971 (3.785%) in intersection
featureBits mm7 gap
# 264323239 bases of 2583394090 (10.232%) in intersection
featureBits mm6 gap
# 482483041 bases of 2597150411 (18.577%) in intersection
featureBits mm5 gap
# 549468286 bases of 2615483787 (21.008%) in intersection
featureBits mm4 gap
# 325167539 bases of 2627444668 (12.376%) in intersection
#############################################################################
# GC5BASE (DONE - 2006-02-14 - Hiram)
ssh kkstore01
mkdir -p /cluster/data/mm8/bed/gc5Base
cd /cluster/data/mm8/bed/gc5Base
time hgGcPercent -wigOut -doGaps -file=stdout -win=5 mm8 \
/cluster/data/mm8 | wigEncode stdin gc5Base.wig gc5Base.wib
# Calculating gcPercent with window size 5
# Using twoBit: /cluster/data/mm8/mm8.2bit
# File stdout created
# Converted stdin, upper limit 100.00, lower limit 0.00
# runs for about 14 minutes
# load database
ssh hgwdev
cd /cluster/data/mm8/bed/gc5Base
mkdir /gbdb/mm8/wib
ln -s `pwd`/gc5Base.wib /gbdb/mm8/wib
time hgLoadWiggle -pathPrefix=/gbdb/mm8/wib mm8 gc5Base gc5Base.wig
# 29 second load time
# verify index is correct:
hgsql mm8 -e "show index from gc5Base;"
# should see good numbers in Cardinality column
#############################################################################
# DISTRIBUTE SEQUENCE TO INTERMEDIATE SERVERS FOR KLUSTER RUNS
# (DONE - 2006-02-14 - Hiram)
ssh kkstore01
cd /cluster/data/mm8
# break up into 500,000 sized chunks for repeat masker runs
TOP=`pwd`
export TOP
for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
??/chr??_random_[0-9]*
do
ctg=`basename ${CTG_DIR}`
cd ${CTG_DIR}
faSplit size ${ctg}.fa 500000 ${ctg}_ -lift=${ctg}.lft -maxN=500000
cd ${TOP}
done > tmp/ctg_split.out 2>&1
# about 3 minutes, check the tmp/ctg_split.out for anything unusual
# make a list of the contigs
TOP=`pwd`
export TOP
for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
??/chr??_random_[0-9]*
do
ctg=`basename ${CTG_DIR}`
cd ${CTG_DIR}
ls ${ctg}_* | while read F
do
echo ${CTG_DIR}/${F}
done
cd ${TOP}
done > contig500K.lst
# count 'em
wc -l contig500K.lst
# 5772 contig500K.lst
mkdir /cluster/bluearc/scratch/hg/mm8
mkdir /cluster/bluearc/scratch/hg/mm8/contigs
rsync -a --progress --files-from=contig500K.lst . \
/cluster/bluearc/scratch/hg/mm8/contigs/
# verify the contig copy above functioned OK
cd /cluster/bluearc/scratch/hg/mm8
find ./contigs -type f | wc -l
# 5772
#############################################################################
# SIMPLE REPEAT TRACK (DONE - 2006-02-14 Hiram)
# TRF can be run in parallel with RepeatMasker
# since it doesn't require masked input sequence.
ssh kkr1u00
mkdir /iscratch/i/mm8
cd /iscratch/i/mm8
mkdir fa
cd fa
cp -p /cluster/data/mm8/?/*.fa .
cp -p /cluster/data/mm8/??/*.fa .
for R in 2 3 4 5 6 7 8
do
rsync -a --progress /iscratch/i/mm8/ kkr${R}u00:/iscratch/i/mm8/
done
ssh kki
mkdir /cluster/data/mm8/bed/simpleRepeat
cd /cluster/data/mm8/bed/simpleRepeat
mkdir trf
# Write the csh wrapper that runs trfBig on one sequence.
#   $1 - fasta file name, looked up under /iscratch/i/mm8/fa
#   $2 - path of the bed file to produce
# The work is done in /scratch/tmp/<basename of $2> and the result copied back.
# The here-document is closed by a bare _EOF_ line: sh/bash strip the quotes
# from the delimiter word, so a quoted '_EOF_' line would never terminate it.
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = /iscratch/i/mm8/fa/$1
set inputFN = $1
set outpath = $2
set outputFN = $2:t
mkdir -p /scratch/tmp/$outputFN
cp $path1 /scratch/tmp/$outputFN
pushd .
cd /scratch/tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/scratch/tmp
popd
rm -f $outpath
cp -p /scratch/tmp/$outputFN/$outputFN $outpath
rm -fr /scratch/tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /scratch/tmp/$outputFN
_EOF_
# << happy emacs
chmod +x runTrf
# gensub2 template: one runTrf job per input, output checked at trf/<root>.bed.
# Closed by a bare _EOF_ line; sh/bash never match a quoted '_EOF_' terminator.
cat << '_EOF_' > template
#LOOP
./runTrf $(path1) {check out line trf/$(root1).bed}
#ENDLOOP
_EOF_
# << keep emacs coloring happy
ls -1S /iscratch/i/mm8/fa > genome.lst
gensub2 genome.lst single template jobList
para create jobList
para try ... check ... push ... etc
para time
# Completed: 34 of 34 jobs
# CPU time in finished jobs: 14385s 239.75m 4.00h 0.17d 0.000 y
# IO & Wait Time: 794s 13.24m 0.22h 0.01d 0.000 y
# Average job time: 446s 7.44m 0.12h 0.01d
# Longest finished job: 1437s 23.95m 0.40h 0.02d
# Submission to last job: 1685s 28.08m 0.47h 0.02d
# Load into the database
ssh hgwdev
cd /cluster/data/mm8/bed/simpleRepeat
# Concatenate the per-chromosome TRF results and load them as one table.
cat trf/chr*.bed > simpleRepeat.bed
# The table definition lives in the kent source tree, as for the other
# .sql files referenced in this doc ($HOME/kent/src/hg/lib).
hgLoadBed -strict mm8 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 1141941 elements of size 16
featureBits mm8 simpleRepeat
# 77752377 bases of 2567283971 (3.029%) in intersection
featureBits mm7 simpleRepeat
# 77021175 bases of 2583394090 (2.981%) in intersection
featureBits mm6 simpleRepeat
# 83220723 bases of 2597150411 (3.204%) in intersection
featureBits mm5 simpleRepeat
# 81414259 bases of 2615483787 (3.113%) in intersection
featureBits mm4 simpleRepeat
# 82600648 bases of 2627444668 (3.144%) in intersection
featureBits mm3 simpleRepeat
# 75457193 bases of 2505900260 (3.011%) in intersection
###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
ssh hgwdev
cd /cluster/data/mm8/bed
mkdir microsat
cd microsat
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
/cluster/bin/i386/hgLoadBed mm8 microsat microsat.bed
#############################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2006-02-14 - Hiram)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kkstore01
cd /cluster/data/mm8/bed/simpleRepeat
mkdir trfMask
for F in trf/chr*.bed
do
echo "${F} -> ${F/trf\//}"
awk '{if ($5 <= 12) print;}' ${F} > trfMask/${F/trf\//}
done
#############################################################################
# REPEATMASKER RUN (after contigs have been distributed to bluearc FS)
# (DONE - 2006-02-14 - 2006-02-15 - Hiram)
# Record RM version used:
cat /cluster/bluearc/RepeatMasker060120/Libraries/version
# RM database version 20060120
ssh pk
#- Make the run directory and job list:
mkdir /cluster/data/mm8/RMRun
cd /cluster/data/mm8/RMRun
# Write the csh wrapper that runs RepeatMasker on one 500K contig piece.
#   $1 - contig path relative to /cluster/bluearc/scratch/hg/mm8/contigs
# The piece is copied to /scratch/tmp/mm8/<root>, masked there, and the
# .out (plus .align/.tbl/.cat when present) copied back to /cluster/data/mm8.
# Closed by a bare _EOF_ line; sh/bash never match a quoted '_EOF_' terminator.
cat << '_EOF_' > ../jkStuff/RMMouse
#!/bin/csh -fe
set C = $1:h
set F = $1:t
set R = $F:r
cd /cluster/data/mm8/$C
/bin/mkdir -p /scratch/tmp/mm8/$R
/bin/cp /cluster/bluearc/scratch/hg/mm8/contigs/$1 /scratch/tmp/mm8/$R
pushd /scratch/tmp/mm8/$R
/cluster/bluearc/RepeatMasker060120/RepeatMasker -ali -s -species mus $F
popd
/bin/cp /scratch/tmp/mm8/$R/$R.fa.out ./
if (-e /scratch/tmp/mm8/$R/$R.fa.align) /bin/cp /scratch/tmp/mm8/$R/$R.fa.align ./
if (-e /scratch/tmp/mm8/$R/$R.fa.tbl) /bin/cp /scratch/tmp/mm8/$R/$R.fa.tbl ./
if (-e /scratch/tmp/mm8/$R/$R.fa.cat) /bin/cp /scratch/tmp/mm8/$R/$R.fa.cat ./
/bin/rm -fr /scratch/tmp/mm8/$R/*
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/mm8/$R
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/mm8
_EOF_
# << happy emacs
chmod +x ../jkStuff/RMMouse
# gensub2 template: one RMMouse job per contig piece, output checked at
# ../<dir>/<root>.fa.out.
# Closed by a bare _EOF_ line; sh/bash never match a quoted '_EOF_' terminator.
cat << '_EOF_' > template
#LOOP
../jkStuff/RMMouse $(path1) {check out line ../$(dir1)/$(root1).fa.out}
#ENDLOOP
_EOF_
# << happy emacs
gensub2 ../contig500K.lst single template jobList
para create jobList
wc -l jobList
# 5772 jobList
para try ... check ... push ... etc
# Completed: 6172 of 6172 jobs
# CPU time in finished jobs: 26381042s 439684.03m 7328.07h 305.34d 0.837 y
# IO & Wait Time: 46088s 768.13m 12.80h 0.53d 0.001 y
# Average job time: 4282s 71.36m 1.19h 0.05d
# Longest finished job: 6370s 106.17m 1.77h 0.07d
# Submission to last job: 127318s 2121.97m 35.37h 1.47d
#- Lift up the split-contig .out's to contig-level .out's
ssh kkstore01
cd /cluster/data/mm8
for D in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
??/chr??_random_[0-9]*
do
CONTIG=`basename ${D}`
liftUp ${D}/${CONTIG}.fa.out ${D}/${CONTIG}.lft error \
${D}/${CONTIG}_[0-9]*.fa.out
done > tmp/RM.lift.outs 2>&1
# real 2m32.275s
# scan tmp/RM.lift.outs for unusual errors or difficulties
# Write the script that lifts contig-level RepeatMasker .out files up to
# chromosome level.  Run it from the assembly top directory.
# Closed by a bare _EOF_ line; sh/bash never match a quoted '_EOF_' terminator.
cat << '_EOF_' > jkStuff/liftRM_out_to_chr.sh
#!/bin/sh
for C in ? ??
do
    # the ? ?? globs can also match plain files; only descend into directories
    [ -d "${C}" ] || continue
    echo "lifting ${C}"
    # never reach the "cd .." below unless this cd succeeded
    cd "${C}" || continue
    if [ -s lift/ordered.lft ]; then
        # the list file holds many names: word splitting is intended here
        liftUp chr${C}.fa.out lift/ordered.lft error `cat lift/oOut.lst`
    else
        echo "WARNING: Can not find ${C}/lift/ordered.lft"
    fi
    if [ -s lift/random.lft ]; then
        liftUp chr${C}_random.fa.out lift/random.lft error `cat lift/rOut.lst`
    fi
    cd ..
done
_EOF_
# << happy emacs
chmod +x jkStuff/liftRM_out_to_chr.sh
./jkStuff/liftRM_out_to_chr.sh > tmp/liftRM_out_to_chr.out 2>&1
# real 0m24.873s
# scan the results tmp/liftRM_out_to_chr.out
# there is a single: WARNING: Can not find Un/lift/ordered.lft
# which is OK
# List the final .out files, nothing should be size 0:
ls -og */*.fa.out | sort -k3,3nr
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/mm8
hgLoadOut mm8 ?/chr?.fa.out ??/chr??.fa.out ?/chr?_random.fa.out \
??/chr??_random.fa.out > tmp/hgLoadOut.out 2>&1
# about 7 minutes, there are always a few of these errors:
# verify everything seems normal compared with previous builds
featureBits mm8 rmsk
# 1087735582 bases of 2567283971 (42.369%) in intersection
featureBits mm7 rmsk
# 1092611581 bases of 2583394090 (42.294%) in intersection
featureBits mm6 rmsk
# 1110222842 bases of 2597150411 (42.748%) in intersection
featureBits mm5 rmsk
# 1137310280 bases of 2615483787 (43.484%) in intersection
featureBits mm4 rmsk
# 1130883581 bases of 2627444668 (43.041%) in intersection
featureBits mm3 rmsk
# 1080265553 bases of 2505900260 (43.109%) in intersection
#############################################################################
# PROCESS REPEAT MASKER AND SIMPLE REPEATS INTO MASKED SEQUENCE
# (DONE - 2006-02-16 - Hiram)
ssh kkstore01
cd /cluster/data/mm8
time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
do
FA=${CHR#*\/}
C=${FA%.fa}
echo -n "repeat masking ${C} ... "
/cluster/bin/i386/maskOutFa -soft ${CHR} ${CHR}.out ${CHR}
echo -n "adding simpleRepeats ... "
/cluster/bin/i386/maskOutFa -softAdd ${CHR} \
bed/simpleRepeat/trfMask/${C}.bed ${CHR}
echo "done - ${CHR}"
done > tmp/addRM_and_Simple.out 2>&1
# about 4 minutes
# you will note the usual warnings about troublesome coordinates
# in the repeat masker outputs - even more than when they were lifted.
# and make the hard masked sequences from these soft masked sequences
time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
do
echo "maskOutFa ${CHR} hard ${CHR}.masked"
/cluster/bin/i386/maskOutFa ${CHR} hard ${CHR}.masked
done > tmp/hardMask.out 2>&1
# about 2 minutes
# rebuild the nib file
time faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
mm8Soft.2bit
# 2 minutes
# verify the sequence is still the same size as before: the sorted size
# listing must checksum identically to chrom.sizes.
# (-k2 replaces the obsolete "+1" sort key syntax, which current sort
# implementations reject; the ordering produced is the same.)
twoBitInfo mm8Soft.2bit stdout | sort -rn -k2 | sum -r
# 20673 1
sum -r chrom.sizes
# 20673 1
# Let's see how much is masked:
time twoBitToFa mm8Soft.2bit stdout | faSize stdin
# 2664455088 bases (97171400 N's 2567283688 real 1477933003 upper
# 1089350685 lower) in 34 sequences in 1 files
# and bc says:
# 1089350685/2664455088 = .408845
# 1089350685/2567283688 = .424320
# replace the former unmasked 2bit file with this new one:
rm mm8.2bit; mv mm8Soft.2bit mm8.2bit
# check the browser, make sure it is functioning OK
# Generate fasta file for random contigs
# THIS IS OPTIONAL STUFF, not really needed, well, it is used in
# genscan to make the gene names there look pretty. This script
# has been checked into the source tree in hg/utils/lft2BitToFa.pl
# use it from there next time
cp -p /cluster/data/mm7/jkStuff/lft2BitToFa.pl ./jkStuff
mkdir randomContigs
for L in ?/lift/ctg_random.lft ??/lift/ctg_random.lft
do
D=${L/\/lift*}
echo $L $D
./jkStuff/lft2BitToFa.pl mm8.2bit ${L} \
> randomContigs/chr${D}_random.ctg.fa
done
#
# Verify these *.ctg.fa files have the same bases as the ordinary
# chr*_random.fa files:
faSize ?/chr?_random.fa ??/chr??_random.fa
# 20361100 bases (3250000 N's 17111100 real 7094373 upper 10016727 lower)
# in 12 sequences in 12 files
faSize randomContigs/*.ctg.fa
# 17111100 bases (0 N's 17111100 real 7094373 upper 10016727 lower)
# in 77 sequences in 12 files
# Note the number of real, upper and lower bases are the same
# This random contig business isn't actually needed
# Create a 2bit file with the full chrom sequences and these
# random contigs for use in blastz:
# faToTwoBit ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \
# mm8Chroms_RandomContigs.2bit
# Copy to bluearc unit for kluster runs
# cp -p mm8.2bit /cluster/bluearc/mm8
# cp -p mm8Chroms_RandomContigs.2bit /cluster/bluearc/mm8
# And the lift file to go with it
# cat ?/lift/ctg_random.lft ??/lift/ctg_random.lft \
# > jkStuff/Chroms_RandomContigs.lft
# cp -p jkStuff/Chroms_RandomContigs.lft /cluster/bluearc/mm8
# create full chrom nibs for blastz SEQ1 target with Lin Spec Repeats
mkdir nib
for FA in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
do
B=${FA/*\/}
B=${B/.fa/}
echo faToNib -softMask ${FA} nib/${B}.nib
rm -f nib/${B}.nib
faToNib -softMask ${FA} nib/${B}.nib
done
mkdir /cluster/bluearc/scratch/hg/mm8/nib
cp -p nib/*.nib /cluster/bluearc/scratch/hg/mm8/nib
cp -p chrom.sizes /cluster/bluearc/scratch/hg/mm8
cp -p mm8.2bit /cluster/bluearc/scratch/hg/mm8
# The contigs over there are no longer needed
rm -fr /cluster/bluearc/scratch/hg/mm8/contigs
# after lineage specific repeats are created below, this business
# can be pushed to the kluster kk nodes and over to the Iservers
#############################################################################
# PREPARE "bigZips" files for public release
# (DONE 2006-02-16 - Hiram)
ssh kkstore01
mkdir /cluster/data/mm8/downloads
mkdir /cluster/data/mm8/downloads/bigZips
mkdir /cluster/data/mm8/downloads/chromosomes
cd /cluster/data/mm8/downloads/chromosomes
# Copy every chromosome and _random fasta into this directory; the final
# "." is the destination and must be a separate argument.
cp -p ../../?/chr?.fa ../../??/chr??.fa \
../../?/chr?_random.fa ../../??/chr??_random.fa .
gzip chr*.fa
# 12 minutes
# copy previous release (mm7) README.txt; the mm8 goldenPath directory does
# not exist yet and will only be symlinks back to these download directories
scp hgwdev:/usr/local/apache/htdocs/goldenPath/mm7/chromosomes/README.txt .
# edit it to bring it up to date
cd /cluster/data/mm8/downloads/bigZips
# copy previous release (mm7) README.txt
scp hgwdev:/usr/local/apache/htdocs/goldenPath/mm7/bigZips/README.txt .
# edit README.txt to indicate proper version of sequence and
# RepeatMasker
cd /cluster/data/mm8
cp -p ?/chr*.fa ??/chr*.fa downloads/chromosomes
tar cvzf downloads/bigZips/chromAgp.tar.gz ?/chr*.agp ??/chr*.agp
tar cvzf downloads/bigZips/chromFa.tar.gz ?/chr*.fa ??/chr*.fa
# 12 minutes
tar cvzf downloads/bigZips/chromFaMasked.tar.gz ?/chr*.fa.masked \
??/chr*.fa.masked
tar cvzf downloads/bigZips/chromOut.tar.gz ?/chr*.fa.out ??/chr*.fa.out
cd /cluster/data/mm8/bed/simpleRepeat
tar cvzf ../../downloads/bigZips/chromTrf.tar.gz ./trfMask
# get GenBank native mRNAs and refGene (DONE 2006-02-23)
# after the genbank run was complete
ssh hgwdev
cd /cluster/data/genbank
time ./bin/i386/gbGetSeqs -db=mm8 -native GenBank mrna \
/cluster/data/mm8/downloads/bigZips/mrna.fa
# 2 minutes
cd /cluster/data/mm8/downloads/bigZips
gzip mrna.fa
cd /cluster/data/mm8/downloads/bigZips
for I in 1000 2000 5000
do
echo "upstream${I} working ... "
featureBits mm8 refGene:upstream:${I} -fa=stdout \
| gzip -c > upstream${I}.fa.gz
echo "upstream${I} done"
done
# real 11m25.493s
ssh kkstore01
cd /cluster/data/mm8/downloads/bigZips
cp -p ../../mm8.2bit .
md5sum *.gz *.2bit README.txt > md5sum.txt
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/mm8
ln -s /cluster/data/mm8/downloads/bigZips \
/usr/local/apache/htdocs/goldenPath/mm8/bigZips
ln -s /cluster/data/mm8/downloads/chromosomes \
/usr/local/apache/htdocs/goldenPath/mm8/chromosomes
#############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2006-02-16 - Hiram)
ssh kkr1u00
mkdir /iscratch/i/mm8/rmsk
cd /cluster/data/mm8
cp -p */chr*.fa.out /iscratch/i/mm8/rmsk
cd /iscratch/i/mm8
for R in 2 3 4 5 6 7 8
do
rsync -a --progress /iscratch/i/mm8/ kkr${R}u00:/iscratch/i/mm8/
done
cd rmsk
ssh kki
mkdir /cluster/data/mm8/linSpecRep
cd /cluster/data/mm8/linSpecRep
ls -1S /iscratch/i/mm8/rmsk > fa.list
# Write the csh wrapper that runs DateRepeats on one RepeatMasker .out file.
#   $1 - .out file name under /iscratch/i/mm8/rmsk
# Comparison order is human, rat, dog, cow, rabbit; the result file is
# copied back into the current directory.
# Closed by a bare _EOF_ line; sh/bash never match a quoted '_EOF_' terminator.
cat << '_EOF_' > mkLSR.csh
#!/bin/csh -fe
pushd /iscratch/i/mm8/rmsk
/cluster/bluearc/RepeatMasker060120/DateRepeats \
$1 -query mouse -comp human -comp rat -comp dog -comp cow \
-comp rabbit
popd
/bin/cp -p /iscratch/i/mm8/rmsk/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus_oryctolagus-cuniculus .
_EOF_
# << happy emacs
chmod +x mkLSR.csh
# gensub2 template: one mkLSR.csh job per .out file, output checked for the
# DateRepeats result file.
# Closed by a bare _EOF_ line; sh/bash never match a quoted '_EOF_' terminator.
cat << '_EOF_' > template
#LOOP
./mkLSR.csh $(path1) {check out line+ $(path1)_homo-sapiens_rattus_canis-familiaris_bos-taurus_oryctolagus-cuniculus}
#ENDLOOP
_EOF_
# << happy emacs
gensub2 fa.list single template jobList
para try ... check ... push ... etc...
para time
# Completed: 34 of 34 jobs
# CPU time in finished jobs: 1338s 22.29m 0.37h 0.02d 0.000 y
# IO & Wait Time: 112s 1.87m 0.03h 0.00d 0.000 y
# Average job time: 43s 0.71m 0.01h 0.00d
# Longest finished job: 92s 1.53m 0.03h 0.00d
# Submission to last job: 181s 3.02m 0.05h 0.00d
ssh kkstore01
cd /cluster/data/mm8/linSpecRep
mkdir notInHuman notInRat notInDog notInCow notInRabbit
# Split each DateRepeats result into one .out.spec file per comparison
# species.  The extractRepeats column number follows the -comp order given
# to DateRepeats in mkLSR.csh: 1=human 2=rat 3=dog 4=cow 5=rabbit.
for F in chr*.out_homo-sapiens*
do
B=${F/.fa.out*/}
echo $B
COL=0
for SPECIES in Human Rat Dog Cow Rabbit
do
COL=$((COL + 1))
/cluster/bin/scripts/extractRepeats ${COL} ${F} > \
notIn${SPECIES}/${B}.out.spec
done
done
# NOTE: the original run mistakenly extracted column 4 (cow) for rabbit;
# that is corrected to column 5 above.
# It was not a problem for this build, as rabbit is not used (see below).
# In that original run notInHuman, notInDog, notInCow and notInRabbit
# ended up being identical. Only the notInRat was different than them
# To check identical
find . -name "*.out.spec" | \
while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
| sort -k1,1n | sort -t"/" -k3,3
# Copy to scratch/hg for use in kluster runs
mkdir /cluster/bluearc/scratch/hg/mm8/linSpecRep
mkdir /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInRat
mkdir /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
cp -p notInHuman/* /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
cp -p notInRat/* /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInRat
# Request this scratch/hg/mm8 directory push to the kk nodes
# and we can do the Iservers simply:
ssh kkr1u00
cd /iscratch/i/mm8
# no longer need these two directories
rm -fr fa rmsk
rsync -a --progress /cluster/bluearc/scratch/hg/mm8/ .
for R in 2 3 4 5 6 7 8
do
rsync -a --progress /iscratch/i/mm8/ kkr${R}u00:/iscratch/i/mm8/
done
############################################################################
# BLATSERVERS ENTRY (DONE - 2006-02-16 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("mm8", "blat17", "17784", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("mm8", "blat17", "17785", "0", "1");' \
hgcentraltest
# test it with some sequence
#########################################################################
# CPGISLANDS (DONE - 2006-02-16 - Hiram)
ssh hgwdev
mkdir /cluster/data/mm8/bed/cpgIsland
cd /cluster/data/mm8/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
# gcc readseq.c cpg_lh.c -o cpglh.exe
cd ../..
ln -s hg3rdParty/cpgIslands/cpglh.exe .
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
ssh kkstore01
cd /cluster/data/mm8/bed/cpgIsland
for F in ../../*/chr*.fa.masked
do
FA=${F/*\/}
C=${FA/.fa.masked/}
echo "./cpglh.exe ${FA} > ${C}.cpg"
./cpglh.exe ${F} > ${C}.cpg
done > cpglh.out 2>&1 &
# about 3 minutes 20 seconds
# Several chroms have 0 results:
# -rw-rw-r-- 1 0 Feb 16 15:19 chr10_random.cpg
# -rw-rw-r-- 1 0 Feb 16 15:20 chr15_random.cpg
# -rw-rw-r-- 1 0 Feb 16 15:22 chr8_random.cpg
# -rw-rw-r-- 1 0 Feb 16 15:22 chr9_random.cpg
# -rw-rw-r-- 1 0 Feb 16 15:22 chrM.cpg
# -rw-rw-r-- 1 0 Feb 16 15:22 chrX_random.cpg
# -rw-rw-r-- 1 0 Feb 16 15:22 chrY.cpg
# Transform cpglh output to bed +
# Write the awk filter that reshapes one cpglh.exe output record into a
# tab-separated bed+ line: the start ($2) is decremented by one, and the
# island width, a width*$7*0.01 count and a 100*2*$6/width percentage are
# derived from the input columns.
# Closed by a bare _EOF_ line; sh/bash never match a quoted '_EOF_' terminator.
cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
_EOF_
# << happy emacs
awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed
ssh hgwdev
cd /cluster/data/mm8/bed/cpgIsland
hgLoadBed -strict mm8 cpgIslandExt -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 15963 elements of size 10
featureBits mm8 cpgIslandExt
# 10456823 bases of 2567283971 (0.407%) in intersection
featureBits mm7 cpgIslandExt
# 10439328 bases of 2583394090 (0.404%) in intersection
featureBits mm6 cpgIslandExt
# 10432360 bases of 2597150411 (0.402%) in intersection
featureBits mm5 cpgIslandExt
# 10422989 bases of 2615483787 (0.399%) in intersection
featureBits mm4 cpgIsland
# 11109692 bases of 2627444668 (0.423%) in intersection
featureBits mm3 cpgIsland
# 10102968 bases of 2505900260 (0.403%) in intersection
#########################################################################
# ANDY LAW CPGISLANDS (DONE - 2006-02-16 - Hiram)
# See notes in makeGalGal2.doc and makeCanFam2.doc
ssh kkstore01
mkdir /cluster/data/mm8/bed/cpgIslandGgfAndy
cd /cluster/data/mm8/bed/cpgIslandGgfAndy
# Build the preProcGgfAndy program in
# kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE
# Use masked sequence since this is a mammal...
for F in ../../*/chr*.fa.masked
do
FA=${F/*\/}
C=${FA/.fa.masked/}
echo preproc and run on masked "${C} ${F}" 1>/dev/stderr
~/bin/$MACHTYPE/preProcGgfAndy ${F} \
| /cluster/home/angie/ggf-andy-cpg-island.pl \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g1,$oE) = split("\t"); $s--;
$gc=$c+$g1; $pCpG=(100.0 * 2 * $cpg / $n);
$pGc=(100.0 * $gc / $n);
$_="'${C}'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" .
"$pCpG\t$pGc\t$oE\n";'
done | sort -k1,1 -k2,2n > cpgIslandGgfAndyMasked.bed
# load into database:
ssh hgwdev
cd /cluster/data/mm8/bed/cpgIslandGgfAndy
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
hgLoadBed -strict mm8 cpgIslandGgfAndyMasked -tab -noBin \
-sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
# Loaded 67442 elements of size 10
featureBits mm8 cpgIslandExt
# 10456823 bases of 2567283971 (0.407%) in intersection
featureBits mm7 cpgIslandExt
# 10439328 bases of 2583394090 (0.404%) in intersection
featureBits mm8 cpgIslandGgfAndyMasked
# 38850121 bases of 2567283971 (1.513%) in intersection
featureBits mm7 cpgIslandGgfAndyMasked
# 38774242 bases of 2583394090 (1.501%) in intersection
wc -l ../cpgIsland/cpgIsland.bed *bed
# 15963 ../cpgIsland/cpgIsland.bed
# 67442 cpgIslandGgfAndyMasked.bed
#########################################################################
# BLASTZ HUMAN Hg18 (DONE - 2006-02-16 - 2006-02-18 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastzHg18.2006-02-16
cd /cluster/data/mm8/bed
ln -s blastzHg18.2006-02-16 blastz.hg18
cd blastzHg18.2006-02-16
# Started this before the rsync to /scratch/hg/mm8/ had completed,
# hence the /cluster/bluearc/scratch/hg/mm8/ location is used
# here.
# Write the DEF parameter file passed to doBlastzChainNet.pl: mouse mm8 is
# the target (SEQ1), human hg18 the query (SEQ2).
# Closed by a bare _EOF_ line; sh/bash never match a quoted '_EOF_' terminator.
cat << '_EOF_' > DEF
# mouse vs human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm8
SEQ1_DIR=/cluster/bluearc/scratch/hg/mm8/nib
SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_LEN=/cluster/bluearc/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Human Hg18 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/scratch/hg/hg18/nib
SEQ2_SMSK=/scratch/hg/hg18/linSpecRep/notInMouse
SEQ2_LEN=/scratch/hg/hg18/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzHg18.2006-02-16
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
# establish a screen to control this job
screen
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
# Started 2006-02-16 16:15
# failed due to pk node difficulties, finish the run.blastz
# manually
# Completed: 3724 of 3724 jobs
# CPU time in finished jobs: 5190293s 86504.89m 1441.75h 60.07d 0.165 y
# IO & Wait Time: 259150s 4319.16m 71.99h 3.00d 0.008 y
# Average job time: 1463s 24.39m 0.41h 0.02d
# Longest finished job: 10621s 177.02m 2.95h 0.12d
# Submission to last job: 74153s 1235.88m 20.60h 0.86d
# continuing
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=cat `pwd`/DEF > cat.out 2>&1 &
# Done 2006-02-17 15:02
# Then to swap over to Hg18
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > swap.out 2>&1 &
# Started 2006-02-17 15:30
ssh hgwdev
time nice -n +19 featureBits mm8 chainHg18Link
# 984380268 bases of 2567283971 (38.343%) in intersection
time nice -n +19 featureBits hg18 chainMm8Link
# 994530182 bases of 2881515245 (34.514%) in intersection
#########################################################################
# BLASTZ RAT Rn4 (DONE - 2006-02-16 - 2006-02-18 - Hiram)
ssh kkr1u00
cd /iscratch/i/rn4
rsync -a --progress /cluster/data/rn4/linSpecRep.notInMouse/ \
./linSpecRep.notInMouse
rsync -a --progress /cluster/data/rn4/nib/ ./nib/
cp -p /cluster/data/rn4/chrom.sizes .
for R in 2 3 4 5 6 7 8
do
rsync -a --progress /iscratch/i/rn4/ kkr${R}u00:/iscratch/i/rn4/
done
ssh kk
mkdir /cluster/data/mm8/bed/blastzRn4.2006-02-16
cd /cluster/data/mm8/bed
ln -s blastzRn4.2006-02-16 blastz.rn4
cd blastzRn4.2006-02-16
# Started this before the rsync to /scratch/hg/mm8/ had completed,
# hence the /cluster/bluearc/scratch/hg/mm8/ location is used
# here.
cat << '_EOF_' > DEF
# mouse vs rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm8
SEQ1_DIR=/cluster/bluearc/scratch/hg/mm8/nib
SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInRat
SEQ1_LEN=/cluster/bluearc/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/iscratch/i/rn4/nib
SEQ2_SMSK=/iscratch/i/rn4/linSpecRep.notInMouse
SEQ2_LEN=/iscratch/i/rn4/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzRn4.2006-02-16
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
# Started 2006-02-16 16:15
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
-continue=cat `pwd`/DEF > cat.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
-swap `pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits mm8 chainRn4Link
# 1770319811 bases of 2567283971 (68.957%) in intersection
time nice -n +19 featureBits rn4 chainMm8Link
# 1791093685 bases of 2571531505 (69.651%) in intersection
##############################################################################
# CLONE ENDS - BACEND TRACK (DONE - 2006-02-17 - Hiram)
ssh kkstore01
cd /cluster/data/mm8
# check disk space: 73Gb free
df -h .
# Filesystem Size Used Avail Use% Mounted on
# /export/cluster/store5
# 1.5T 1.3T 73G 95% /cluster/store5
mkdir -p bed/cloneend/ncbi
cd bed/cloneend/ncbi
wget --timestamping \
ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/mus_musculus/*
cd /cluster/data/mm8/bed/cloneend
# seems like the *.mfa files were split just for convenience
# concatenate
# decompress every ncbi/*.mfa.gz in glob order and recompress the
# concatenation as a single stream
zcat ncbi/*.mfa.gz | gzip > all.mfa.gz
# Convert the title line of the all.mfa file
cat << '_EOF_' > convert.pl
#!/usr/bin/env perl
# Rewrite fasta title lines so they carry only the un-versioned accession
# found right after a "gb" or "dbj" field; all other lines pass through.
use strict;
use warnings;

while (my $line = <>) {
    # not a title line: copy unchanged
    if ($line !~ m/^>/) {
        print $line;
        next;
    }
    my @fields = split('\|', $line);
    my $found = 0;
    foreach my $i (0 .. $#fields) {
        next if ($fields[$i] ne "gb" && $fields[$i] ne "dbj");
        # accession.version -> accession
        my ($name) = split(/\./, $fields[$i+1]);
        print ">$name\n";
        $found = 1;
        last;
    }
    # a title line without a gb/dbj field is a fatal error
    die("Failed for $line\n") if (!$found);
}
'_EOF_'
# << happy emacs
chmod +x convert.pl
zcat all.mfa.gz | ./convert.pl | gzip > cloneEnds.fa.gz
# make sure nothing got broken:
faSize all.mfa.gz
# 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
# lower) in 789466 sequences in 1 files
faSize cloneEnds.fa.gz
# 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
# lower) in 789466 sequences in 1 files
# identical numbers.  Curiously, these are exactly the same numbers
# as were seen during the build of Mm7.  Do these things not
# change with time?
# concatenate the text files, too
# decompress every ncbi/*.txt.gz in glob order and recompress the
# concatenation as a single stream
zcat ncbi/*.txt.gz | gzip > all.txt.gz
# generate cloneEndPairs.txt and cloneEndSingles.txt
cp -p /cluster/data/mm7/bed/cloneend/convertTxt.pl .
zcat all.txt.gz | ./convertTxt.pl stdin
# Reading in end info
# Writing out pair info
# Writing out singleton info
# 354485 pairs and 78423 singles
# faSplit does not function correctly if given a .gz source file
# AND, we need the unzipped file for sequence loading below
gunzip cloneEnds.fa.gz
# split
mkdir splitdir
cd splitdir
faSplit sequence ../cloneEnds.fa 100 cloneEnds
# Check to ensure no breakage:
cat *.fa | faSize stdin
# 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
# lower) in 789466 sequences in 1 files
# same numbers as before
# Copy to san for cluster runs
mkdir /san/sanvol1/scratch/mm8/cloneEnds
cp -p *.fa /san/sanvol1/scratch/mm8/cloneEnds
rm *
cd ..
rmdir splitdir
# may as well remove the previous assembly copy:
rm -fr /san/sanvol1/scratch/mm7/cloneEnds
# load sequences
ssh hgwdev
mkdir /gbdb/mm8/cloneend
cd /gbdb/mm8/cloneend
ln -s /cluster/data/mm8/bed/cloneend/cloneEnds.fa .
cd /tmp
hgLoadSeq mm8 /gbdb/mm8/cloneend/cloneEnds.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/mm8/cloneend/cloneEnds.fa
# 789466 sequences
# Updating seq table
# Advisory lock has been released
# All done
############################################################################
# BACEND SEQUENCE ALIGNMENTS (DONE - 2006-02-17 - 2006-02-22 - Hiram)
ssh kkstore01
mkdir /cluster/data/mm8/noMask
cd /cluster/data/mm8/
# Need an unmasked sequence for this work
# Write an all-uppercase copy of every chromosome fasta into noMask/.
for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
do
    C=$(basename "${CHR}")
    echo -n "working ${C} ... "
    # the first line (fasta header) is copied untouched; only the
    # remaining lines are uppercased
    head -n 1 "${CHR}" > "noMask/${C}"
    # "tail -n +2" is the supported spelling of the obsolete "tail +2";
    # the character classes are quoted so the shell cannot glob them
    tail -n +2 "${CHR}" | tr '[:lower:]' '[:upper:]' >> "noMask/${C}"
    echo "done"
done
mkdir /san/sanvol1/scratch/mm8/noMask
time cp --verbose -p noMask/chr*.fa /san/sanvol1/scratch/mm8/noMask
# Size of mouse non-gap genome: 2567283971
# Size of Hg18 non-gap genome: 2881515245
# Adjusting the 1024 number from typical human ooc generation:
# 1024 * (2567283971 / 2881515245) = 912
time blat mm8.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=912
# Wrote 29643 overused 11-mers to 11.ooc
# real 2m13.206
# Copy over to the san
cp -p 11.ooc /san/sanvol1/scratch/mm8
# and for the kluster run
ssh pk
mkdir /cluster/data/mm8/bed/bacends
cd /cluster/data/mm8/bed/bacends
mkdir out
# allow blat to run politely in /tmp while it writes output, then
# copy results to results file:
cat << '_EOF_' > runBlat.sh
#!/bin/sh
# usage: runBlat.sh <root1> <root2> <result>
#   root1  - base name of a target fasta in .../mm8/noMask
#   root2  - base name of a query fasta in .../mm8/cloneEnds
#   result - path where the finished psl is moved to
# blat writes into a private /scratch/tmp directory; the psl is moved to
# <result> afterwards and the scratch directory is removed.
root1=$1
root2=$2
result=$3
work=/scratch/tmp/${root1}_${root2}
rm -fr "${work}"
# stop here rather than let blat write into the batch directory
mkdir "${work}" || exit 1
# a subshell keeps the directory change local; pushd/popd are bash
# builtins and are not available in a plain /bin/sh
( cd "${work}" && \
/cluster/bin/x86_64/blat "/san/sanvol1/scratch/mm8/noMask/${root1}.fa" \
"/san/sanvol1/scratch/mm8/cloneEnds/${root2}.fa" \
-ooc=/san/sanvol1/scratch/mm8/11.ooc "${root1}.${root2}.psl" )
mkdir -p "out/${root2}"
rm -f "${result}"
mv "${work}/${root1}.${root2}.psl" "${result}"
rm -fr "${work}"
'_EOF_'
# << happy emacs
chmod +x runBlat.sh
cat << '_EOF_' > template
#LOOP
./runBlat.sh $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
# << emacs happy
ls -1S /san/sanvol1/scratch/mm8/cloneEnds/cloneEnds???.fa > bacEnds.lst
ls -1S /san/sanvol1/scratch/mm8/noMask/chr*.fa > contig.lst
gensub2 contig.lst bacEnds.lst template jobList
para create jobList
# 3332 jobs written to batch
para try, check, push, etc ...
# Completed: 3332 of 3332 jobs
# CPU time in finished jobs: 649465s 10824.42m 180.41h 7.52d 0.021 y
# IO & Wait Time: 11633s 193.88m 3.23h 0.13d 0.000 y
# Average job time: 198s 3.31m 0.06h 0.00d
# Longest finished job: 1326s 22.10m 0.37h 0.02d
# Submission to last job: 429201s 7153.35m 119.22h 4.97d
ssh kkstore01
cd /cluster/data/mm8/bed/bacends
screen
mkdir temp
time pslSort dirs raw.psl temp out/* > pslSort.out 2>&1 &
# real 22m4.019s
# -rw-rw-r-- 1 8422362557 Feb 22 15:35 raw.psl
time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons \
raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
# real 6m15.981s
# -rw-rw-r-- 1 197029888 Feb 22 15:37 bacEnds.psl
# utilize the scripts from the previous build
cp -p /cluster/data/mm7/bed/bacends/split.pl .
cp -p /cluster/data/mm7/bed/bacends/header .
time ./split.pl header < bacEnds.psl
# real 0m26.983s
mv bacEnds.psl bacEnds.psl.save
time pslSort dirs bacEnds.psl temp split
# real 2m19.131s
# -rw-rw-r-- 1 1227866614 Feb 22 15:48 bacEnds.psl
# Copy files to final destination and remove
mkdir /cluster/data/mm8/bacends
cp -p bacEnds.psl /cluster/data/mm8/bacends
############################################################################
# BACEND PAIRS TRACK (DONE - 2006-02-22 - Hiram)
ssh kolossus
cd /cluster/data/mm8/bacends
time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds
# real 0m47.401s
# create header required by "rdb" tools
echo -e \
"chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header
echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header
cat header bacEnds.pairs | \
/cluster/bin/scripts/row score ge 300 | \
/cluster/bin/scripts/sorttbl chr start | \
/cluster/bin/scripts/headchg -del > bacEndPairs.bed
# -rw-rw-r-- 1 23816801 Feb 22 15:52 bacEndPairs.bed
cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \
/cluster/bin/scripts/sorttbl chr start | \
/cluster/bin/scripts/headchg -del > bacEndPairsBad.bed
# -rw-rw-r-- 1 6843775 Feb 22 15:54 bacEndPairsBad.bed
# pull out the psl rows for the good and bad pairs, sort them by target
# name and start, then drop the header lines
/cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
    bacEndPairsBad.bed > j1.out
/cluster/bin/scripts/sorttbl tname tstart < j1.out > j2.out
/cluster/bin/scripts/headchg -del < j2.out > bacEnds.load.psl
# -rw-rw-r-- 1 983668200 Feb 22 16:04 bacEnds.load.psl
rm j1.out j2.out
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $5}' bacEndPairs.bed | sort -u
# result should be the scores, no extraneous strings:
# 1000
# 300
# 375
# 500
# 750
# edit the file and fix it if it has a bad name.
# load into database
ssh hgwdev
cd /cluster/data/mm8/bacends
hgLoadBed -strict -notItemRgb mm8 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 235440 elements of size 11
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed -strict -notItemRgb mm8 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 95099 elements of size 11
# NOTE: truncates file to 0 if -nobin is used
time hgLoadPsl mm8 -table=all_bacends bacEnds.load.psl
# load of all_bacends did not go as planned: 8132116 record(s), 0 row(s)
# skipped, 1 warning(s) loading psl.tab
# real 20m45.055s
featureBits mm8 all_bacends
# 327086559 bases of 2567283971 (12.741%) in intersection
featureBits mm7 all_bacends
# 334161740 bases of 2583394090 (12.935%) in intersection
featureBits mm6 all_bacends
# 336981828 bases of 2597150411 (12.975%) in intersection
featureBits mm5 all_bacends
# 268502414 bases of 2615483787 (10.266%) in intersection
featureBits mm4 all_bacends
# 243096171 bases of 2627444668 (9.252%) in intersection
featureBits mm8 bacEndPairs
# 2572527283 bases of 2567283971 (100.204%) in intersection
featureBits mm7 bacEndPairs
# 2578837424 bases of 2583394090 (99.824%) in intersection
featureBits mm6 bacEndPairs
# 2570768812 bases of 2597150411 (98.984%) in intersection
featureBits mm5 bacEndPairs
# 2567958504 bases of 2615483787 (98.183%) in intersection
featureBits mm4 bacEndPairs
# 2549945356 bases of 2627444668 (97.050%) in intersection
featureBits mm8 bacEndPairsBad
# 879222026 bases of 2567283971 (34.247%) in intersection
featureBits mm7 bacEndPairsBad
# 954662115 bases of 2583394090 (36.954%) in intersection
featureBits mm6 bacEndPairsBad
# 1006314997 bases of 2597150411 (38.747%) in intersection
featureBits mm5 bacEndPairsBad
# 541027882 bases of 2615483787 (20.686%) in intersection
featureBits mm4 bacEndPairsBad
# 1074505863 bases of 2627444668 (40.895%) in intersection
#########################################################################
# GENBANK auto update (DONE - 2006-02-17 - 2006-02-23 - Hiram)
# align with revised genbank process. drop xeno ESTs.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvs update -d -P etc
# edit etc/genbank.conf to add mm8, it is a copy of mm7 with changes:
# mm8
mm8.serverGenome = /cluster/data/mm8/mm8.2bit
mm8.clusterGenome = /scratch/hg/mm8/mm8.2bit
mm8.ooc = /cluster/data/mm8/11.ooc
mm8.align.unplacedChroms = chrUn_random
mm8.lift = /cluster/data/mm8/jkStuff/liftAll.lft
mm8.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
mm8.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
mm8.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
mm8.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
mm8.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
mm8.downloadDir = mm8
mm8.refseq.mrna.xeno.load = yes
mm8.refseq.mrna.xeno.loadDesc = yes
mm8.mgcTables.default = full
mm8.mgcTables.mgc = all
# check that into CVS, then
# update /cluster/data/genbank/
make etc-update
ssh kkstore04
cd /cluster/data/genbank
nice bin/gbAlignStep -initial mm8 &
# var/build/logs/2006.02.17-16:10:17.mm8.initalign.log
# the parasol batch job on kk broke down in:
# /cluster/bluearc/genbank/work/initial.mm8/align
# go to kk and this directory and get the batch finished
nice bin/gbAlignStep -continue=finish -initial mm8 &
# var/build/logs/2006.02.22-20:26:54.mm8.initalign.log
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad mm8 &
# var/dbload/hgwdev/logs/2006.02.23-10:21:36.dbload.log
# real 228m59.734s
#########################################################################
# BLASTZ rheMac2 (DONE - 2006-02-17 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.rheMac2.2006-02-17
cd /cluster/data/mm8/bed
ln -s blastz.rheMac2.2006-02-17 blastz.rheMac2
cd blastz.rheMac2
cat << '_EOF_' > DEF
# mouse vs macaca mulatta
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
# TARGET - mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY - macaca mulatta - big enough chunk to do whole chroms at once
SEQ2_DIR=/san/sanvol1/scratch/rheMac2/rheMac2.2bit
SEQ2_LEN=/san/sanvol1/scratch/rheMac2/rheMac2.sizes
SEQ2_CHUNK=250000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastz.rheMac2.2006-02-17
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
# Started 2006-02-17 16:42
# crashed due to no copies of mm8 in /scratch/hg/mm8/ on the
# Iservers. Fix that up and get the chain run done. Continuing.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap `pwd`/DEF > swap.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=net `pwd`/DEF > swap.net.out 2>&1 &
# failed during a san hiccup, finish that off, then:
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=load `pwd`/DEF > swap.load.out 2>&1 &
time nice -n +19 featureBits mm8 chainRheMac2Link
# 891310108 bases of 2567283971 (34.718%) in intersection
time nice -n +19 featureBits rheMac2 chainMm8Link
# 877906099 bases of 2646704109 (33.170%) in intersection
#########################################################################
# BLASTZ canFam2 (DONE - 2006-02-18 - Hiram)
# On pk: create the dated build directory, point the blastz.canFam2
# symlink at it, and work from there.
# NOTE(review): BASE in the DEF written below is
# /cluster/data/mm8/bed/blastzCanFam2.2006-02-18, which is not the
# blastz.canFam2.2006-02-18 directory created here -- confirm which
# directory the run actually used.
ssh pk
mkdir /cluster/data/mm8/bed/blastz.canFam2.2006-02-18
cd /cluster/data/mm8/bed
ln -s blastz.canFam2.2006-02-18 blastz.canFam2
cd blastz.canFam2
cat << '_EOF_' > DEF
# mouse vs dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Dog CanFam2 - chunk big enough to do all chroms in single whole pieces
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse
SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzCanFam2.2006-02-18
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap `pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits mm8 chainCanFam2Link
# 828741604 bases of 2567283971 (32.281%) in intersection
time nice -n +19 featureBits canFam2 chainMm8Link
# 816262344 bases of 2384996543 (34.225%) in intersection
#########################################################################
# BLASTZ bosTau2 (DONE - 2006-02-18 - Hiram)
# On pk: create the dated build directory, point the blastz.bosTau2
# symlink at it, and work from there.
# NOTE(review): BASE in the DEF written below is
# /cluster/data/mm8/bed/blastzBosTau.2006-02-18 (no "2" after BosTau),
# which is not the blastz.bosTau2.2006-02-18 directory created here --
# confirm which directory the run actually used.
ssh pk
mkdir /cluster/data/mm8/bed/blastz.bosTau2.2006-02-18
cd /cluster/data/mm8/bed
ln -s blastz.bosTau2.2006-02-18 blastz.bosTau2
cd blastz.bosTau2
cat << '_EOF_' > DEF
# mouse vs cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Cow (bosTau2)
# large enough chunk to do chroms in one piece
SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit
SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes
SEQ2_CHUNK=150000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzBosTau.2006-02-18
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap `pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits mm8 chainBosTau2Link
# 688859641 bases of 2567283971 (26.832%) in intersection
time nice -n +19 featureBits bosTau2 chainMm8Link
# 683178156 bases of 2812203870 (24.293%) in intersection
#########################################################################
# BLASTZ galGal2 (DONE - 2006-02-18 - Hiram)
# On kk: create the dated build directory, point the blastz.galGal2
# symlink at it, and work from there.
# NOTE(review): BASE in the DEF written below is
# /cluster/data/mm8/bed/blastzGalGal2.2006-02-18, which is not the
# blastz.galGal2.2006-02-18 directory created here -- confirm which
# directory the run actually used.
ssh kk
mkdir /cluster/data/mm8/bed/blastz.galGal2.2006-02-18
cd /cluster/data/mm8/bed
ln -s blastz.galGal2.2006-02-18 blastz.galGal2
cd blastz.galGal2
cat << '_EOF_' > DEF
# mouse vs chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Chicken galGal2 - single chunk big enough for whole chroms at once
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes
SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzGalGal2.2006-02-18
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=cat `pwd`/DEF > cat.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net `pwd`/DEF > net.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits mm8 chainGalGal2Link
# 65517358 bases of 2567283971 (2.552%) in intersection
time nice -n +19 featureBits galGal2 chainMm8Link
# 57074100 bases of 1054197620 (5.414%) in intersection
#########################################################################
# BLASTZ dasNov1 (DONE - 2006-02-19 - Hiram)
# On pk: create the dated build directory, point the blastz.dasNov1
# symlink at it, and work from there.
# NOTE(review): BASE in the DEF written below is
# /cluster/data/mm8/bed/blastzDasNov1.2006-02-19, which is not the
# blastz.dasNov1.2006-02-19 directory created here -- confirm which
# directory the run actually used.
ssh pk
mkdir /cluster/data/mm8/bed/blastz.dasNov1.2006-02-19
cd /cluster/data/mm8/bed
ln -s blastz.dasNov1.2006-02-19 blastz.dasNov1
cd blastz.dasNov1
cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY - Armadillo dasNov1
SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit
SEQ2_LEN=/scratch/hg/dasNov1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzDasNov1.2006-02-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=cat `pwd`/DEF > cat.out 2>&1 &
time nice -n +19 featureBits mm8 chainDasNov1Link
# 431944142 bases of 2567283971 (16.825%) in intersection
#########################################################################
# BLASTZ echTel1 (DONE - 2006-02-19 - Hiram)
# On pk: create the dated build directory, point the blastz.echTel1
# symlink at it, and work from there.
# NOTE(review): BASE in the DEF written below is
# /cluster/data/mm8/bed/blastzEchTel1.2006-02-19, which is not the
# blastz.echTel1.2006-02-19 directory created here -- confirm which
# directory the run actually used.
ssh pk
mkdir /cluster/data/mm8/bed/blastz.echTel1.2006-02-19
cd /cluster/data/mm8/bed
ln -s blastz.echTel1.2006-02-19 blastz.echTel1
cd blastz.echTel1
cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY - Tenrec echTel1
SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit
SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzEchTel1.2006-02-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=cat `pwd`/DEF > cat.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=chainRun `pwd`/DEF > chain.out 2>&1 &
time nice -n +19 featureBits mm8 chainEchTel1Link
# 292970406 bases of 2567283971 (11.412%) in intersection
#########################################################################
# BLASTZ fr1 (DONE - 2006-02-19 - Hiram)
# On pk: create the dated build directory, point the blastz.fr1
# symlink at it, and work from there.
# NOTE(review): BASE in the DEF written below is
# /cluster/data/mm8/bed/blastzFr1.2006-02-19, which is not the
# blastz.fr1.2006-02-19 directory created here -- confirm which
# directory the run actually used.
ssh pk
mkdir /cluster/data/mm8/bed/blastz.fr1.2006-02-19
cd /cluster/data/mm8/bed
ln -s blastz.fr1.2006-02-19 blastz.fr1
cd blastz.fr1
cat << '_EOF_' > DEF
# mouse vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7
# Reuse parameters from human-chicken, except L=6000 (more relaxed)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Fugu - chunk big enough to run the whole chrom at once
SEQ2_DIR=/san/sanvol1/scratch/fr1/nib
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CHUNK=400000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzFr1.2006-02-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF > swap.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=net `pwd`/DEF > swap.net.out 2>&1 &
time nice -n +19 featureBits mm8 chainFr1Link
# 48949500 bases of 2567283971 (1.907%) in intersection
time nice -n +19 featureBits fr1 chainMm8Link
# 42671288 bases of 315518167 (13.524%) in intersection
#########################################################################
# BLASTZ loxAfr1 (DONE - 2006-02-19 - Hiram)
# On kk: create the dated build directory, point the blastz.loxAfr1
# symlink at it, and work from there.
# NOTE(review): BASE in the DEF written below is
# /cluster/data/mm8/bed/blastzLoxAfr1.2006-02-19, which is not the
# blastz.loxAfr1.2006-02-19 directory created here -- confirm which
# directory the run actually used.
ssh kk
mkdir /cluster/data/mm8/bed/blastz.loxAfr1.2006-02-19
cd /cluster/data/mm8/bed
ln -s blastz.loxAfr1.2006-02-19 blastz.loxAfr1
cd blastz.loxAfr1
cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=30000000
SEQ1_LAP=10000
# QUERY - Elephant loxAfr1
SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit
SEQ2_LEN=/scratch/hg/loxAfr1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzLoxAfr1.2006-02-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
# failed during the cat, fixed the script
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
-continue=chainRun `pwd`/DEF > chain.out 2>&1 &
time nice -n +19 featureBits mm8 chainLoxAfr1Link
# 472168702 bases of 2567283971 (18.392%) in intersection
#########################################################################
# BLASTZ tetNig1 (DONE - 2006-02-19 - Hiram)
# On kk: create the dated build directory, point the blastz.tetNig1
# symlink at it, and work from there.
# NOTE(review): BASE in the DEF written below is
# /cluster/data/mm8/bed/blastzTetNig1.2006-02-19, which is not the
# blastz.tetNig1.2006-02-19 directory created here -- confirm which
# directory the run actually used.
ssh kk
mkdir /cluster/data/mm8/bed/blastz.tetNig1.2006-02-19
cd /cluster/data/mm8/bed
ln -s blastz.tetNig1.2006-02-19 blastz.tetNig1
cd blastz.tetNig1
cat << '_EOF_' > DEF
# Mouse vs tetraodon
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Tetraodon TetNig1 - single chunk big enough to run whole chroms
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzTetNig1.2006-02-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net `pwd`/DEF > net.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF > swap.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=net `pwd`/DEF > swap-net.out 2>&1 &
time nice -n +19 featureBits mm8 chainTetNig1Link
# 50358792 bases of 2567283971 (1.962%) in intersection
time nice -n +19 featureBits tetNig1 chainMm8Link
# 47024263 bases of 342403326 (13.734%) in intersection
#########################################################################
# BLASTZ oryCun1 (DONE - 2006-02-21 - Hiram)
# On pk: create the dated build directory, point the blastz.oryCun1
# symlink at it, and work from there.
# NOTE(review): BASE in the DEF written below is
# /cluster/data/mm8/bed/blastzOryCun1.2006-02-21, which is not the
# blastz.oryCun1.2006-02-21 directory created here -- confirm which
# directory the run actually used.
ssh pk
mkdir /cluster/data/mm8/bed/blastz.oryCun1.2006-02-21
cd /cluster/data/mm8/bed
ln -s blastz.oryCun1.2006-02-21 blastz.oryCun1
cd blastz.oryCun1
cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY - Rabbit oryCun1
SEQ2_DIR=/scratch/hg/oryCun1/oryCun1.2bit
SEQ2_LEN=/scratch/hg/oryCun1/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzOryCun1.2006-02-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
time nice -n +19 featureBits mm8 chainOryCun1Link
# 496060619 bases of 2567283971 (19.322%) in intersection
#########################################################################
# BLASTZ xenTro1 (DONE - 2006-02-21 - Hiram)
# On kk: create the dated build directory, point the blastz.xenTro1
# symlink at it, and work from there.
# NOTE(review): BASE in the DEF written below is
# /cluster/data/mm8/bed/blastzXenTro1.2006-02-21, which is not the
# blastz.xenTro1.2006-02-21 directory created here -- confirm which
# directory the run actually used.
ssh kk
mkdir /cluster/data/mm8/bed/blastz.xenTro1.2006-02-21
cd /cluster/data/mm8/bed
ln -s blastz.xenTro1.2006-02-21 blastz.xenTro1
cd blastz.xenTro1
cat << '_EOF_' > DEF
# mouse vs. frog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7
# Specific settings for chicken (per Webb email to Brian Raney), reused here for frog
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Frog xenTro1 - single chunk big enough to run two of the
# largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit
SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/mm8/bed/blastzXenTro1.2006-02-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=cat `pwd`/DEF > cat.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits mm8 chainXenTro1Link
# 62015601 bases of 2567283971 (2.416%) in intersection
time nice -n +19 featureBits xenTro1 chainMm8Link
# 59307185 bases of 1381238994 (4.294%) in intersection
#########################################################################
# BLASTZ monDom4 (DONE - 2006-02-23 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.monDom4.2006-02-23
cd /cluster/data/mm8/bed
ln -s blastz.monDom4.2006-02-23 blastz.monDom4
cd blastz.monDom4
cat << '_EOF_' > DEF
# Mouse vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=20
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse (mm8)
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
SEQ2_DIR=/cluster/bluearc/scratch/hg/monDom4/monDom4.2bit
SEQ2_LEN=/cluster/bluearc/scratch/hg/monDom4/chrom.sizes
SEQ2_CHUNK=50000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzMonDom4.2006-02-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits mm8 chainMonDom4Link
# 211663336 bases of 2567283971 (8.245%) in intersection
time nice -n +19 featureBits monDom4 chainMm8Link
# 210933035 bases of 3501643220 (6.024%) in intersection
# Something caused the loaded chains and nets on Mm8 to disappear.
# to reload them (DONE - Hiram - 2006-07-18)
# recover the individual chain files
ssh kkstore04
cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23/axtChain
nice chainSplit chain mm8.monDom4.all.chain.gz
ssh hgwdev
cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23/axtChain/chain
# csh loop: load each per-chromosome chain file (written into the chain/
# directory by the chainSplit step above) into its own mm8 table named
# <chrom>_chainMonDom4.  $f:r is the file name with the .chain extension
# removed.  Each command is echoed before it is run.
foreach f (*.chain)
set c = $f:r
echo hgLoadChain mm8 ${c}_chainMonDom4 $f
hgLoadChain mm8 ${c}_chainMonDom4 $f
end
# Reload the net table: filter the net with netFilter -minGap=10 and pipe
# the result into hgLoadNet as table netMonDom4.
# NOTE(review): the net file is named relative to the current directory,
# which at this point is axtChain/chain; presumably the file lives one
# level up in axtChain -- confirm before re-running.
time netFilter -minGap=10 mm8.monDom4.net.gz \
| hgLoadNet -verbose=0 mm8 netMonDom4 stdin
# clean up
ssh kkstore04
cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23/axtChain
rm -fr chain
#########################################################################
# BLASTZ panTro1 (DONE - 2006-02-23 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.panTro1.2006-02-23
cd /cluster/data/mm8/bed
ln -s blastz.panTro1.2006-02-23 blastz.panTro1
cd blastz.panTro1
cat << '_EOF_' > DEF
# mouse vs chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Chimp PanTro1
SEQ2_DIR=/scratch/hg/panTro1/nib
SEQ2_LEN=/scratch/hg/panTro1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzPanTro1.2006-02-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap `pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits mm8 chainPanTro1Link
# 901276629 bases of 2567283971 (35.106%) in intersection
time nice -n +19 featureBits panTro1 chainMm8Link
# 901976621 bases of 2733948177 (32.992%) in intersection
#########################################################################
# BLASTZ danRer4 (DONE - 2006-04-26 - 2006-04-28 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastzDanRer4.2006-04-26
cd /cluster/data/mm8/bed
ln -s blastzDanRer4.2006-04-26 blastz.danRer4
cd blastz.danRer4
cat << '_EOF_' > DEF
# mouse vs zebrafish
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer4)
# large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer4/chromNib
SEQ2_LEN=/san/sanvol1/scratch/danRer4/chromNib.sizes
SEQ2_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers
SEQ2_CHUNK=100000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzDanRer4.2006-04-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
cd /cluster/data/mm8/bed/blastzDanRer4.2006-04-26
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net `pwd`/DEF > net.out 2>&1 &
# swap, see also makeDanRer4.doc
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits mm8 chainDanRer4Link \
> fb.mm8.chainDanRer4Link 2>&1 &
cat fb.mm8.chainDanRer4Link
# 54036008 bases of 2567283971 (2.105%) in intersection
time nice -n +19 featureBits danRer4 chainMm8Link \
> fb.danRer4.chainDanRer4Link 2>&1 &
cat fb.danRer4.chainDanRer4Link
# 58145856 bases of 1626093931 (3.576%) in intersection
#########################################################################
# BLASTZ danRer4 (DONE - 2006-04-26 - 2006-04-28 - Hiram)
# REMAKE THIS USING ALL CHROMS FOR danRer4 (2006-05-22 - ).
ssh pk
mkdir /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
# ln -s blastzDanRer4.2006-04-26 blastz.danRer4
cat << '_EOF_' > DEF
# mouse vs zebrafish
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer4)
# large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer4/nib
SEQ2_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
SEQ2_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers
SEQ2_CHUNK=300000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzDanRer4.2006-05-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
chmod +x DEF
cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF >& blastz.out &
# 0.118u 0.107s 4:05:08.71 0.0% 0+0k 0+0io 0pf+0w
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net `pwd`/DEF >& net.out &
# 0.121u 0.072s 4:48.04 0.0% 0+0k 0+0io 0pf+0w
cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
# swap, see also makeDanRer4.doc
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& swap.out &
# 0.129u 0.109s 5:02.55 0.0% 0+0k 0+0io 0pf+0w
ssh hgwdev
cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
featureBits mm8 chainDanRer4Link >& fb.mm8.chainDanRer4Link &
cat fb.mm8.chainDanRer4Link
# 55147954 bases of 2567283971 (2.148%) in intersection
featureBits danRer4 chainMm8Link >& fb.danRer4.chainDanRer4Link &
cat fb.danRer4.chainDanRer4Link
# 60721886 bases of 1626093931 (3.734%) in intersection
featureBits -chrom=chr1 mm8 refGene:cds chainDanRer4Link -enrichment
# refGene:cds 0.856%, chainDanRer4Link 1.867%, both 0.584%,
# cover 68.16%, enrich 36.51x
featureBits -chrom=chr1 mm8 refGene:cds chainDanRer3Link -enrichment
# refGene:cds 0.856%, chainDanRer3Link 1.760%, both 0.492%, cover 57.49%,
# enrich 32.67x
featureBits -chrom=chr1 danRer4 refGene:cds chainMm8Link -enrichment
# refGene:cds 0.746%, chainMm8Link 3.807%, both 0.566%, cover 75.86%,
# enrich 19.93x
featureBits -chrom=chr1 danRer3 refGene:cds chainMm8Link -enrichment
# refGene:cds 0.786%, chainMm8Link 4.581%, both 0.612%, cover 77.88%,
# enrich 17.00x
# Higher coverage than for danRer3 chains on mm8 and similar coverage
# for mm8 chains on danRer4 as on danRer3 so that is good.
#########################################################################
# BLASTZ danRer3 (DONE - 2006-02-28 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.danRer3.2006-02-28
cd /cluster/data/mm8/bed
ln -s blastz.danRer3.2006-02-28 blastz.danRer3
cd blastz.danRer3
cat << '_EOF_' > DEF
# mouse vs zebrafish
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer3)
# large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes
SEQ2_CHUNK=100000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzDanRer3.2006-02-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
# real 216m23.425s
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits mm8 chainDanRer3Link
# 53125783 bases of 2567283971 (2.069%) in intersection
time nice -n +19 featureBits danRer3 chainMm8Link
# 54831876 bases of 1630323462 (3.363%) in intersection
#############################################################################
# STS MARKERS DATA DOWNLOAD (DONE - 2006-02-23 - 2006-02-28 - Hiram)
### *** PLEASE NOTE - STS markers redone 2006-08-29 - look for section:
## redoing STS markers track to get them more correct
### later in this file
ssh kkstore01
mkdir -p /cluster/data/mm8/bed/STSmarkers/downloads
cd /cluster/data/mm8/bed/STSmarkers/downloads
# these files appear to be new almost every day
wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts
wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
# The new feature in the .aliases file this time are names with
# spaces in them ! This changes our parsing business below,
# hopefully the spaces in the names won't cause trouble elsewhere.
wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/*
# these reports from jax.org appear to be changing daily
wget --timestamping \
ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt
wget --timestamping \
ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt
wget --timestamping \
ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt
ls -ogrt
# -rw-rw-r-- 1 676 Mar 11 2004 README
# -rw-rw-r-- 1 396858 Jan 28 2005 10090.MGI.txt
# -rw-rw-r-- 1 390139 Mar 16 2005 10090.WI_MRC_RH.txt
# -rw-rw-r-- 1 240688 Mar 16 2005 10090.WI-YAC.txt
# -rw-rw-r-- 1 173344 Mar 16 2005 10090.WI-Genetic.txt
# -rw-rw-r-- 1 25691253 Jan 13 16:42 UniSTS.aliases
# -rw-rw-r-- 1 4140920 Feb 22 18:43 UniSTS_mouse.sts
# -rw-rw-r-- 1 4576611 Feb 23 02:22 MRK_Dump2.rpt
# -rw-rw-r-- 1 2549974 Feb 23 02:23 PRB_PrimerSeq.rpt
# -rw-rw-r-- 1 4531489 Feb 23 02:23 MRK_Sequence.rpt
# I note the UniSTS.aliases file is over twice as big as was in
# Mm7 build. I wonder what got into it ...
# What got into it was that it was completely broken. It appeared
# to have a vast section of itself duplicated again in the file.
# It was cleaned up via:
echo -e "#Unique ID\tAliases" > uniqueSTS.aliases
grep -v "^#" UniSTS.aliases | sort -n | uniq >> uniqueSTS.aliases
mv UniSTS.aliases UniSTS.aliases.broken
mv uniqueSTS.aliases UniSTS.aliases
# back to our work area, update the bed file
# to do this we need a new UniSTS_mouse.alias file
# it is created by a combination of information from several
# of the above files ! AND ! the previous stsInfoMouse.bed file
cd /cluster/data/mm8/bed/STSmarkers/downloads
cp -p /cluster/data/mm7/bed/STSmarkers/downloads/*.sh .
cp -p /cluster/data/mm7/bed/STSmarkers/downloads/*.pl .
# There is a line in the fetchAllAliases.sh script that needs to
# be updated, it must point to the previous bed file:
# BEDFile=/cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed
# Next time, this should read:
# BEDFile=/cluster/data/mm8/bed/STSmarkers/stsInfoMouse.bed
# This process has been captured in the script:
# /cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh
# which uses a couple of perl scripts in that same directory.
# briefly it is:
# ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0
# grep MGI: UniSTS.aliases > MGI.aliases
# ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \
# stsInfoAliases.txt
# ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases
# cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \
# | sort -n > UniSTS_mouse.alias
time ./fetchAllAliases.sh > fetchAllAliases.out 2>&1
# Here is a normal set of errors:
# processing UniSTS_mouse.sts to find aliases
# # ERROR: KNOWN(==OK) duplicate ID: '108991' encountered at line
# # 2384
# processing MGI.aliases
# fetching existing aliases from previous stsInfoMouse.bed file
# found 27648 potential errors in
# /cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed
# to see the errors: grep ERROR stsInfoAliases.txt
# verify those stsInfoMouse.bed aliases with UniSTS.aliases
# those errors in the previous stsInfoMouse.bed file are an
# accumulation of errors from a long long time ago in this chain
# of processing. Some day it might be nice to fix them, but they
# don't seem to bother anything, so they continue to be carried
# forward, and a couple of new ones are added with each assembly.
# with that, we can create a new stsInfoMouse.bed file:
# Update the m m 7 directory name here to m m 8
# for the next build of m m 9
cd /cluster/data/mm8/bed/STSmarkers
/cluster/store5/mouseMarker/code/updateBed.pl \
/cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed \
downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \
downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \
downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile
# Yontao updated /cluster/store5/mouseMarker/code/cleanInfo.pl 8/10/04
/cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed
# copy the stsInfoMouse.bed file from the working dir to the marker
# info storage folder. (2 new steps added by Yontao)
# be wary of the archive name here, check the directory and get
# the name right here.
mv /cluster/store5/mouseMarker/stsInfoMouse.bed \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm7
cp -p stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed
# comparing to previous, numbers increase slightly each time
wc /cluster/store5/mouseMarker/stsInfoMouse.bed \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
# 60440 801181 6871232 /cluster/store5/mouseMarker/stsInfoMouse.bed
# 59843 794642 6802825 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7
# 58980 784786 6690105 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6
# 58493 778055 6524821 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
# and from that, create new primer fa, epcr, etc:
/cluster/store5/mouseMarker/code/luConvertPrimerToFa \
stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info
# the mouseC.fa file will be empty; the mouseP.* counts should be more than last time
wc mouse?.*
# 0 0 0 mouseC.fa
# 305991 305937 6910111 mouseP.fa
# 34475 172467 2195057 mouseP.info
# 340466 478404 9105168 total
# the equivalent Mm7 files:
# 0 0 0 mouseC.fa
# 300968 300914 6798466 mouseP.fa
# 33838 169275 2153113 mouseP.info
# 334806 470189 8951579 total
# the equivalent Mm6 files:
# 0 0 0 mouseC.fa
# 293305 293251 6624638 mouseP.fa
# 32890 164528 2087271 mouseP.info
# 326195 457779 8711909 total
# the equivalent Mm5 files:
# 0 0 0 mouseC.fa
# 286740 286686 6474893 mouseP.fa
# 32232 161234 2044810 mouseP.info
# 318972 447920 8519703 total
# copy the primers over to some filesystem close to the klusters
# and split them up to have a small number of sequences in one file
mkdir /cluster/bluearc/mm8/stsMarkers
cp -p mouseP.fa /cluster/bluearc/mm8/stsMarkers
cd /cluster/bluearc/mm8/stsMarkers
cp -p /cluster/data/mm8/11.ooc .
mkdir split
# 400 files for 34,475 sequences, == about 80 sequences per file
faSplit sequence mouseP.fa 400 split/mm_
# PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE.
# This process could convert to a modern version of blat with the
# filters as described, for example, in the STS markers build in Hg18
# CLUSTER RUN FOR THE STS PRIMERS
ssh kk
mkdir /cluster/data/mm8/bed/STSmarkers/primer
mkdir /cluster/data/mm8/bed/STSmarkers/ePCR
cd /cluster/data/mm8/bed/STSmarkers/primer
mkdir out
# interestingly, this blat2.2 binary did not function correctly
# when given nib files. It has only about 1/4th of the number of
# alignments as it gets when it used fa files for the target
# sequence.
ls -1S /cluster/bluearc/mm8/stsMarkers/split > primers.list
ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list
# runBlat2.csh: per-job wrapper for the primer blat cluster run.
#   $1 = primer file name under /cluster/bluearc/mm8/stsMarkers/split
#   $2 = chromosome fa file name under /cluster/bluearc/mm8/stsMarkers/chroms
#   $3 = output psl path
# It makes sure out/<$2 without its extension>/ exists, then runs blat.2
# with the chromosome fa as target and the primer file as query, using the
# 11.ooc file copied to the stsMarkers directory earlier.
cat << '_EOF_' > runBlat2.csh
#!/bin/csh -fe
set primer = /cluster/bluearc/mm8/stsMarkers/split/$1
set fa = /cluster/bluearc/mm8/stsMarkers/chroms/$2
set ooc = /cluster/bluearc/mm8/stsMarkers/11.ooc
set root2 = $2:r
mkdir -p out/${root2}
set out = $3
/cluster/bin/i386/blat.2 ${fa} ${primer} -ooc=${ooc} \
-minMatch=1 -minScore=0 -minIdentity=80 -oneOff ${out}
'_EOF_'
# << happy emacs
chmod +x runBlat2.csh
# gensub2 template: one runBlat2.csh job per (primer file, chromosome file)
# pair; each job writes out/<chromosome root>/<primer root>.psl
cat << '_EOF_' > template
#LOOP
./runBlat2.csh $(path1) $(path2) {check out line+ out/$(root2)/$(root1).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
# expand the template over primers.list x chr.list into jobList and
# hand the job list to parasol
gensub2 primers.list chr.list template jobList
para create jobList
para try ... check ... push ... etc ...
# Completed: 12104 of 12104 jobs
# CPU time in finished jobs: 1075037s 17917.28m 298.62h 12.44d 0.034 y
# IO & Wait Time: 7444257s 124070.95m 2067.85h 86.16d 0.236 y
# Average job time: 704s 11.73m 0.20h 0.01d
# Longest finished job: 61869s 1031.15m 17.19h 0.72d
# Submission to last job: 168538s 2808.97m 46.82h 1.95d
# some of the jobs got stuck for unknown reasons. Had to find
# them and kill them on their nodes. Their blat.2 process was
# stuck and would not kill. Don't know what happened there.
# on the file server
ssh kkstore01
cd /cluster/data/mm8/bed/STSmarkers/primer
time pslSort dirs primers.raw.psl temp out/chr*
# -rw-rw-r-- 1 586124177 Feb 26 21:28 primers.raw.psl
# filter alignments for (qEnd-qStart) vs. (tEnd-tStart)
# should not be more than 100 bases different.
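# For this Mm8 run this filters out 944,839 alignments, or
# 17.4% = 100.0 * 944839 / 5445367 (see the wc -l counts below).
# (The Mm7 run filtered out about 1,028,202 alignments, or
# 17.4% = 100.0 * 1028202 / 5921712)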
# Re-sort the per-chromosome blat output and keep only the lines whose
# query span (column 13 - column 12) and target span (column 17 - column 16)
# differ by strictly less than 100 in either direction.
time pslSort dirs stdout temp out/chr* | awk -F"\t" '
{
  spanDiff = ($13 - $12) - ($17 - $16)
  if ((spanDiff > -100) && (spanDiff < 100)) {
    print
  }
}
' > primers.psl.100
rmdir temp
wc -l *.100 *.psl
# 5445367 primers.raw.psl
# 4500528 primers.psl.100
# 944839 difference
# a rough comparison with previous results:
wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.100
# 4893510 102763628 510563575 primers.psl.100
wc primers.psl (unfiltered, Mm7)
# 5921712 124355891 636898117 primers.psl
wc /cluster/data/mm7/bed/STSmarkers/primer/primers.psl
# 5724127 120206606 615248041
wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl
# 5719969 120119288 590806241
wc /cluster/data/mm4/bed/STSmarkers/primer/primers.psl
# 5745617 120657896 592135728
# another kluster run for the ePCR
ssh pk
cd /cluster/data/mm8/bed/STSmarkers/ePCR
ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list
# pick up e-PCR source from
# ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/
# version 2.3.1 11 Feb 2005
# Had to add the following to both re-PCR_main.cpp and
# e-PCR_main.cpp to get them to compile on kolossus:
// max and min Copied from /usr/include/mysql/my_global.h
#define max(a, b) ((a) >? (b))
#define min(a, b) ((a) <? (b))
# directory that will receive the per-chromosome e-PCR results
mkdir out
# runPCR: per-job wrapper for the e-PCR cluster run.
#   $1 = chromosome fa file name under /cluster/bluearc/mm8/stsMarkers/chroms
#   $2 = output file; e-PCR's stdout is redirected into it
# e-PCR is given mouseP.info and the chromosome fa, with N=1 M=50 W=5.
cat << '_EOF_' > runPCR
#!/bin/csh -fe
/cluster/bin/x86_64/e-PCR /cluster/data/mm8/bed/STSmarkers/mouseP.info \
/cluster/bluearc/mm8/stsMarkers/chroms/$1 N=1 M=50 W=5 > $2
'_EOF_'
# << happy emacs
chmod +x runPCR
# gensub2 template: one runPCR job per chromosome file; outputs are
# numbered via $(num1), giving names of the form out/<number>.epcr
cat << '_EOF_' > template
#LOOP
./runPCR $(path1) {check out line+ out/$(num1).epcr}
#ENDLOOP
'_EOF_'
# the mouseP.info was created above
# only one input list here, hence "single" in place of a second list
gensub2 chr.list single template jobList
para create jobList
para try
para check
para push
... etc ...
# STARTED 2006-02-27 16:24
# There is a single job that produces no output:
./runPCR chrX_random.fa out/30.epcr
# WARNING: 96 STSs have primer shorter than W
# WARNING: 21 STSs have ambiguities within W of 3' end
# Not sure what's up with that
# Completed: 33 of 34 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 67601s 1126.69m 18.78h 0.78d 0.002 y
# IO & Wait Time: 1028s 17.13m 0.29h 0.01d 0.000 y
# Average job time: 2080s 34.66m 0.58h 0.02d
# Longest finished job: 5134s 85.57m 1.43h 0.06d
# Submission to last job: 5134s 85.57m 1.43h 0.06d
ssh kkstore01
cd /cluster/data/mm8/bed/STSmarkers/ePCR
# all those results become all.epcr
cat out/*.epcr > all.epcr
# comparing to previous results:
wc -l all.epcr
# 58088 all.epcr
wc -l /cluster/data/mm7/bed/STSmarkers/ePCR/all.epcr
# 57709 /cluster/data/mm7/bed/STSmarkers/ePCR/all.epcr
wc -l /cluster/data/mm6/bed/STSmarkers/ePCR/all.epcr
# 55871 /cluster/data/mm6/bed/STSmarkers/ePCR/all.epcr
wc /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr
# 55677 222708 2945623 /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr
wc /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr
# 74705 298820 3971712 /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr
# Mm4 seems to be out of whack
cd /cluster/data/mm8/bed/STSmarkers/primer
/cluster/bin/scripts/filterSTSPrimers \
-mouse ../stsInfoMouse.bed primers.psl.100 \
../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat
# The output should show an increasing count:
# Reading name info
# Reading primer info
# Processing file
# 100000
# 200000
# 300000
# ...
# 4500000
# Determining ePCR not found
#
wc -l primers.psl.filter.blat
# 34026 primers.psl.filter.blat
wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.blat
# 33986 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.blat
wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.blat
# 33128 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.blat
wc -l /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.blat
# 33476 /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.blat
# create accession_info.rdb
touch empty_sequence.inf
/cluster/bin/scripts/compileAccInfo -mouse \
/cluster/data/mm8 empty_sequence.inf
# works with errors on missing randoms, etc...:
# cat: /cluster/data/mm5/11/chr11_random.agp: No such file or directory
# cat: /cluster/data/mm5/M/chrM_random.agp: No such file or directory
mv accession_info.rdb accession_info.rdb.tmp
/cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > \
accession_info.rdb
rm accession_info.rdb.tmp
# comparing results to previous
# Continuing the trend that began with Mm7, the numbers in
# accession_info.rdb continue to decrease. Mm8 has far fewer
# fragments than even mm7 did:
# e.g.:
[hiram@kkstore01 /cluster/data] wc -l mm8/*/chr*.agp | tail -1
# 21910 total
[hiram@kkstore01 /cluster/data] wc -l mm7/*/chr*.agp | tail -1
# 70125 total
[hiram@kkstore01 /cluster/data] wc -l mm6/*/chr*.agp | tail -1
# 170812 total
wc -l accession_info.rdb
# 20385 accession_info.rdb
wc -l /cluster/data/mm7/bed/STSmarkers/primer/accession_info.rdb
# 44046 484510 3112816 accession_info.rdb
wc /cluster/data/mm7/bed/STSmarkers/primer/accession_info.rdb
# 93052 1023576 6824900 accession_info.rdb
wc /cluster/data/mm5/bed/STSmarkers/primer/accession_info.rdb
# 131845 1450299 9681940
wc /cluster/data/mm4/bed/STSmarkers/primer/accession_info.rdb
# 86935 956289 6374930
# creates epcr.not.found.nomatch and epcr.not.found.psl
# /cluster/bin/scripts/epcrToPsl
# Fixed this script (in mm7) to make it not look for contigs in the usual
# manner, we don't have those for this assembly
sed -e "s/mm7/mm8/g" /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl \
> ./epcrToPsl
chmod +x epcrToPsl
./epcrToPsl -mouse \
epcr.not.found ../mouseP.info \
accession_info.rdb /cluster/data/mm8
# Comparing results to previous:
wc -l epcr*
# 501 epcr.not.found
# 0 epcr.not.found.nomatch
# 501 epcr.not.found.psl
# 158 epcrToPsl
# 1160 total
# Mm7 wc epcr*
wc -l /cluster/data/mm7/bed/STSmarkers/primer/epcr*
# 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found
# 0 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.nomatch
# 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.psl
# 158 /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl
# 1106 total
# Mm6 wc epcr*
wc -l /cluster/data/mm6/bed/STSmarkers/primer/epcr*
# 472 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found
# 63 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.nomatch
# 404 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.psl
# 158 /cluster/data/mm6/bed/STSmarkers/primer/epcrToPsl
# 1097 total
cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
wc -l primers.psl.filter
# 34527 primers.psl.filter
wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter
# 34460 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter
wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter
# 33532 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter
wc -l /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted
# 33691 /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted
# create primers.psl.filter.initial
# (this was primers.psl.filter.lifted.initial in the Mm5 build)
# if you do not run with scripts in your path, add the PATH business
PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/extractPslInfo \
primers.psl.filter
wc -l primers.psl.filter.initial
# 34513 primers.psl.filter.initial
wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial
# 34443 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial
wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial
# 33514 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial
wc -l \
/cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial
# 33689
# create primers.psl.filter.initial.acc
/cluster/bin/scripts/findAccession -agp \
-mouse primers.psl.filter.initial /cluster/data/mm8
# it complains about missing _random items, it is OK
wc -l primers.psl.filter.initial.acc
# 34513 primers.psl.filter.initial.acc
wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial.acc
# 34443
# this needs to be -rat as that specifies how to scan the
# stsInfoMouse.bed file and it does not work if you use -mouse
/cluster/bin/scripts/getStsId -rat \
../stsInfoMouse.bed primers.psl.filter.initial.acc \
| sort -k4,4n > primers.final
wc -l primers.final
# 34513 primers.final
wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.final
# 34443 /cluster/data/mm7/bed/STSmarkers/primer/primers.final
cd /cluster/data/mm8/bed/STSmarkers
# stsMarkers.final is empty for mouse
touch stsMarkers.final dummy
# if you do not run with scripts in your path, add the PATH business
PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/combineSeqPrimerPos \
stsMarkers.final primer/primers.final > stsMarkers_pos.rdb
wc -l stsMarkers_pos.rdb
# 33075 stsMarkers_pos.rdb
wc -l /cluster/data/mm7/bed/STSmarkers/stsMarkers_pos.rdb
# 32869 /cluster/data/mm7/bed/STSmarkers/stsMarkers_pos.rdb
wc -l /cluster/data/mm6/bed/STSmarkers/stsMarkers_pos.rdb
# 31889 /cluster/data/mm6/bed/STSmarkers/stsMarkers_pos.rdb
wc -l /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb
# 32085 /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb
wc -l /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb
# 31270 /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb
/projects/cc/hg/ytlu/bin/script/perl/createStsBed \
stsInfoMouse.bed stsMarkers_pos.rdb 500 \
| sort -k1,1 -k2,2n > stsMapMouse.bed
# Fixup --- 2006-04-12 - Hiram - it was found that column 12 had blanks
# as the first character of the field. This isn't what is needed
# here. Let's take those blanks out, turns out these were the
# only blanks in the file:
mv stsMapMouse.bed stsMapMouse_withBlanks.bed
sed -e "s/ //" stsMapMouse_withBlanks.bed > stsMapMouse.bed
wc stsMapMouse.bed
# 29888 308263 2087726 stsMapMouse.bed
wc /cluster/data/mm7/bed/STSmarkers/stsMapMouse.bed
# 29079 301678 2097544 stsMapMouse.bed
wc /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed
# 29069 301535 2123622 /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed
# loading STS markers tables
ssh hgwdev
cd /cluster/data/mm8/bed/STSmarkers
cp -p /cluster/data/mm7/bed/STSmarkers/ucscAlias.pl .
./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
# this does leave messages in ucscStsAlias.warnings but they seem
# to be very similar to Mm6 with just a few new ones
wc ucscStsAlias.tab (after applying filter to primers.psl above)
# 144570 433667 3366815 ucscStsAlias.tab
wc ucscStsAlias.tab (before applying filter to primers.psl above)
# 144570 433667 3366815 ucscStsAlias.tab
wc /cluster/data/mm7/bed/STSmarkers/ucscStsAlias.tab
# 141585 424725 3284106 ucscStsAlias.tab
wc /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab
# 126624 379859 3037850 /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab
# Use the drop tables if reloading
# hgsql -e "drop table stsAlias;" mm8
hgsql mm8 < ~/kent/src/hg/lib/stsAlias.sql
hgsql -e \
'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm8
# reloaded stsMapMouseNew 2006-04-12 to remove blanks in col 12 - Hiram
# hgsql -e "drop table stsMapMouseNew;" mm8
hgsql mm8 < ~/kent/src/hg/lib/stsMapMouseNew.sql
hgsql -e \
'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm8
# hgsql -e "drop table stsInfoMouseNew;" mm8
hgsql mm8 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
hgsql -e \
'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm8
hgLoadPsl -nobin -table=all_sts_primer mm8 primer/primers.psl.filter
# load of all_sts_primer did not go as planned: 34527 record(s), 0
# row(s) skipped, 19 warning(s) loading primer/primers.psl.filter
# load primer sequences
mkdir /gbdb/mm8/stsMarker
ln -s /cluster/data/mm8/bed/STSmarkers/mouseP.fa \
/gbdb/mm8/stsMarker/mouseP.fa
# PLEASE NOTE: if you are going to reload this business, use the
# -replace option on this hgLoadSeq
# hgLoadSeq -replace mm8 /gbdb/mm8/stsMarker/mouseP.fa
# otherwise there will be a problem that the seq and extFile tables
# will be out of sync.
hgLoadSeq mm8 /gbdb/mm8/stsMarker/mouseP.fa
# Adding /gbdb/mm8/stsMarker/mouseP.fa
# 33838 sequences
featureBits mm8 all_sts_primer
# 3746196 bases of 2567283971 (0.146%) in intersection
featureBits mm7 all_sts_primer
# 3757119 bases of 2583394090 (0.145%) in intersection
featureBits mm6 all_sts_primer
# 3677372 bases of 2597150411 (0.142%) in intersection
featureBits mm8 stsMapMouseNew
# 4801964 bases of 2567283971 (0.187%) in intersection
featureBits mm7 stsMapMouseNew
# 4805958 bases of 2583394090 (0.186%) in intersection
featureBits mm6 stsMapMouseNew
# 4638338 bases of 2597150411 (0.179%) in intersection
# Row counts of the stsAlias table, newest assembly to oldest.
hgsql -N mm8 -e "select count(*) from stsAlias;"
# 141981
hgsql -N mm7 -e "select count(*) from stsAlias;"
# 140649
# The log originally recorded this query against mm7 a second time; the
# same query cannot return two different counts, so it is corrected to mm6
# to match the mm8/mm7/mm6/mm5 sequence used for stsInfoMouseNew below.
hgsql -N mm6 -e "select count(*) from stsAlias;"
# 137738
hgsql -N mm5 -e "select count(*) from stsAlias;"
# 122944
# Row counts of the stsInfoMouseNew table, newest assembly to oldest.
# These agree with the line counts of the stsInfoMouse.bed archive copies
# listed earlier in this section.
hgsql -N mm8 -e "select count(*) from stsInfoMouseNew;"
# 60440
hgsql -N mm7 -e "select count(*) from stsInfoMouseNew;"
# 59843
# Originally recorded as a second mm7 query; 58980 is the line count of
# stsInfoMouse.bed_mm6, so this is the mm6 table.
hgsql -N mm6 -e "select count(*) from stsInfoMouseNew;"
# 58980
hgsql -N mm5 -e "select count(*) from stsInfoMouseNew;"
# 58493
# compare old and new name lists:
awk '{print $4}' stsMapMouse.bed | sort -u > mm8.nameList
awk '{print $4}' /cluster/data/mm7/bed/STSmarkers/stsMapMouse.bed | \
sort -u > mm7.nameList
comm -12 mm?.nameList | wc -l
# 28253 <- 28,253 names in common
comm -23 mm7.nameList mm8.nameList | wc -l
# 174 <- 174 unique to mm7 list
comm -13 mm7.nameList mm8.nameList | wc -l
# 445 <- 445 unique to mm8 list
# previously, Mm6 vs Mm7:
# 27320 <- 27,320 names in common
# 188 <- 188 unique to mm6 list
# 1107 <- 1,107 unique to mm7 list
####################################################################################
# BUILD KNOWN GENES TABLES (STARTED 2/25/06, PART I DONE 2/27/06 Fan)
# First build protein databases, sp060115 and proteins060115
# See makeProteins060115.doc for details.
# Create working subdirectories and temporary databases (kgMm8A)
ssh hgwdev
mkdir /cluster/store9/kg
cd /cluster/store9/kg
mkdir kgMm8A
ln -s /cluster/store9/kg/kgMm8A /cluster/store6/kgDB/bed/kgMm8A
ln -s /cluster/store9/kg/kgMm8A /cluster/data/mm8/bed/kgMm8A
hgsql mm8 -e "create database kgMm8A"
hgsql mm8 -e "create database kgMm8ATemp"
mkdir /cluster/bluearc/kgDB/kgMm8A
mkdir /cluster/bluearc/kgDB/kgMm8A/protBlat
ln -s /cluster/bluearc/kgDB/kgMm8A/protBlat /cluster/store9/kg/kgMm8A/protBlat
cd /cluster/store9/kg/kgMm8A/protBlat
# Get all mouse protein sequences
hgsql -N sp060115 -e \
'select p.acc, p.val from protein p, accToTaxon x where x.taxon=10090 and p.acc=x.acc'\
|awk '{print ">" $1;print $2}' >mouseProt.fa
hgsql -N sp060115 -e \
'select v.varAcc, p.val from varAcc v, protein p, accToTaxon x where v.parAcc = p.acc and x.taxon=10090 and v.parAcc=x.acc'\
|awk '{print ">" $1;print $2}' \
>mouseVarProt.fa
# append var proteins to mouseProt.fa
cat mouseVarProt.fa >>mouseProt.fa
# Prepare and perform cluster run for protein/genome alignment
ssh pk
cd /cluster/data/mm8/bed/kgMm8A/protBlat
mkdir prot
faSplit sequence mouseProt.fa 2000 prot/prot
ls /cluster/bluearc/kgDB/kgMm8A/protBlat/prot/* > prot.lis
ssh hgwdev
cd /cluster/data/mm8/bed/kgMm8A/protBlat
hgsql mm8 -N -e 'select chrom from chromInfo' > chrom.lis
exit
# gensub2 template for the protein/genome alignment cluster run: one blat
# job per (chromosome name from chrom.lis, protein chunk from prot.lis)
# pair.  Each job runs blat with -t=dnax -q=prot on the chromosome's nib
# file and the protein chunk, writing
# /cluster/bluearc/kgDB/kgMm8A/protBlat/result/<chrom>_<protein chunk>.psl
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -t=dnax -q=prot /cluster/data/mm8/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm8A/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# output directory for the jobs; the protBlat working directory is a
# symlink to /cluster/bluearc/kgDB/kgMm8A/protBlat (set up above), so this
# is the result/ directory named in the template
mkdir result
# expand the template over chrom.lis x prot.lis and hand the job list to parasol
gensub2 chrom.lis prot.lis gsub jobList
para create jobList
para try
para check
para push
para check ...
# started 8:15 AM 2/25/06, done 3:12 AM 2/26/06.
# Two jobs crashed due to empty result, push again and finished OK in a few minutes.
# Completed: 67354 of 67354 jobs
# CPU time in finished jobs: 12580047s 209667.46m 3494.46h 145.60d 0.399 y
# IO & Wait Time: 237270s 3954.49m 65.91h 2.75d 0.008 y
# Average job time: 190s 3.17m 0.05h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 19991s 333.18m 5.55h 0.23d
# Submission to last job: 68128s 1135.47m 18.92h 0.79d
# collect BLAT results
pslSort -nohead dirs raw.psl temp result
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 raw.psl protBlat.psl /dev/null
ssh hgwdev
cd /cluster/bluearc/kgDB/kgMm8A/protBlat
hgLoadPsl mm8 protBlat.psl
# create all_mrna.psl and tight_mrna.psl
hgsql mm8 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl
pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \
all_mrna.psl tight_mrna.psl /dev/null
# Save a copy of the following mm8 tables, to be used later to construct
# kgMore and kgEvenmore:
#   all_mrna
#   gbCdnaInfo
#   gbExtFile
#   gbLoaded
#   gbSeq
#   gbStatus
#   refFlat
#   refGene
#   refLink
#   refSeqAli
#   refSeqStatus
#   refSeqSummary
#   xenoMrna
#   xenoRefFlat
#   xenoRefGene
#   xenoRefSeqAli
# Use overlapSelect to get protein and mRNA alignment overlaps
overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \
-selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.stat
overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
-inFmt=psl tight_mrna.psl protBlat.psl protMrna.out
# Create protein/mRNA pair and protein lists
cut -f 10,31 protMrna.out|sort -u >spMrna.tab
cut -f 10 protMrna.out|sort -u >protein.lis
cp -p protein.lis /cluster/data/mm8/bed/kgMm8A
# Load spMrna.tab into spMrna table in temp DB.
hgsql kgMm8ATemp < ~/src/hg/lib/spMrna.sql
hgsql kgMm8ATemp -e 'load data local infile "spMrna.tab" into table spMrna'
hgsql kgMm8ATemp -e 'create index mrnaID on spMrna(mrnaID)'
# Prepare and perform cluster run of protein/mRNA alignment
# Get mRNA fa file.
cd /cluster/data/mm8/bed/kgMm8A
/cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm8 \
-gbRoot=/cluster/data/genbank genbank mrna mrna.fa
# Create mrnaSeq table in kgMm8ATemp DB.
faToTab mrna.fa mrnaSeq.tab
hgsql kgMm8ATemp -e 'drop table mrnaSeq'
hgsql kgMm8ATemp <~/src/hg/lib/mrnaSeq.sql
hgsql kgMm8ATemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'
# Prepare files for cluster run
cd /cluster/bluearc/kgDB/kgMm8A
~/src/hg/protein/KG2.sh kgMm8A mm8 060115
# Perform cluster run of protein/mRNA alignment
~/src/hg/protein/KG3.sh kgMm8A mm8 060115
# Collect cluster run results
cd kgBestMrna
# Build a driver script: every occurrence of "prot" in the listing of out/
# is rewritten to "do1 prot", so each line of doall becomes a call to do1
# (presumably one subdirectory named prot* per line -- confirm).
ls out | sed -e 's/prot/do1 prot/g' >doall
# create do1 with the following 2 lines:
# NOTE(review): the quoted terminator line below follows csh/tcsh
# here-document rules; under bash the terminator would be the bare word.
# do1 APPENDS to protMrnaRaw.psl, so remove that file before any re-run.
cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protMrnaRaw.psl
'_EOF_'
# the glob makes both do1 and doall executable
chmod +x do*
# doall (and the do1 calls inside it) are invoked by bare name, so they are
# only found if the current directory is on the command search path
doall
# Filter out low quality alignments
pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
wc protMrna.lis
# Load BLAT results into temp DB.
ssh hgwdev
cd /cluster/store9/kg/kgMm8A/kgBestMrna
hgsql kgMm8ATemp < ~/src/hg/lib/protMrnaBlat.sql
hgsql kgMm8ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
hgsql kgMm8ATemp -e 'create index tName on protMrnaBlat(tName)'
# Create CDS files from protein/mRNA alignment results.
# The query emits literal "_" and ":" columns as placeholders; the two sed
# passes then collapse <TAB>_<TAB> to "_" and <TAB>:<TAB> to "..", giving
# lines of the form:  qName_tName <TAB> (tStart+1)..(tEnd+3)
hgsql kgMm8ATemp -N -e \
'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
|sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds
# Create protMrna.psl with proteinID_mrnaID as query ID.
# Columns 22-30 and 32-42 of protMrna.out are kept as-is; the column-31 slot
# between them is filled with columns 10 and 31 joined by "_"
# (the protein/mRNA pair, as in spMrna.tab).
cut -f 22-30 ../protBlat/protMrna.out > j1.tmp
cut -f 32-42 ../protBlat/protMrna.out > j2.tmp
cut -f 10,31 ../protBlat/protMrna.out|sed -e 's/\t/_/g' >j3.tmp
# paste order j1, j3, j2 puts the joined ID back in the middle
paste j1.tmp j3.tmp j2.tmp >protMrna.psl
rm j1.tmp j2.tmp j3.tmp
# Run mrnaToGene to create protMrna.gp
bash
mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
exit
# move kgBestMrna to /san/sanvol1 to save space on store9
mv /cluster/store9/kg/kgMm8A/kgBestMrna/clusterRun /san/sanvol1/scratch/fan/mm8/kgMm8A/kgBestMrna
ln -s /san/sanvol1/scratch/fan/mm8/kgMm8A/kgBestMrna/clusterRun \
/cluster/store9/kg/kgMm8A/kgBestMrna/clusterRun
# Prepare refGene and all_mrna gp files.
cd ..
cp -p base/refGene.tab ref.gp
hgsql mm8 -N -e \
'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \
|sort -u > all_mrna.cds
cat base/all_mrna.tab |cut -f 2-22 >all_mrna.psl
bash
mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
exit
# Align proteins to RefSeq.
overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
protBlat/protBlat.psl ref.gp ref.stat
overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
protBlat/protBlat.psl ref.gp protRef.gp
overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\
-selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out
cut -f 10,22 protRef.out | sort -u >spRef.tab
cut -f 10 protRef.out | sort -u >protRef.lis
hgsql kgMm8ATemp -e 'drop table spRef'
hgsql kgMm8ATemp <~/src/hg/lib/spRef.sql
hgsql kgMm8ATemp -e 'load data local infile "spRef.tab" into table spRef'
# Prepare and perform cluster runs for protein/RefSeq alignments
~/src/hg/protein/KGRef2.sh kgMm8A mm8 060115
~/src/hg/protein/KGRef3.sh kgMm8A mm8 060115
cd kgBestRef
ls out | sed -e 's/prot/do1 prot/g' >doall
cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protRefRaw.psl
'_EOF_'
chmod +x do*
doall
# Filter out low quality alignments.
pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
wc protRef.lis
hgsql kgMm8ATemp -e 'drop table protRefBlat'
hgsql kgMm8ATemp < ~/src/hg/lib/protRefBlat.sql
hgsql kgMm8ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
hgsql kgMm8ATemp -e 'create index tName on protRefBlat(tName)'
# Run gene-check to filter out invalid gp entries
cd /cluster/data/mm8/bed/kgMm8A
cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp
gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/data/mm8/nib kgCandidate0.gp kgCandidate0.check
hgsql kgMm8ATemp -e 'drop table kgCandidate0'
hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidate0.sql
hgsql kgMm8ATemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0'
hgsql kgMm8ATemp -e 'drop table geneCheck'
hgsql kgMm8ATemp < ~/src/hg/lib/geneCheck.sql
hgsql kgMm8ATemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'
# Run kgCheck to get all KG candidates that pass the KG gene check criteria
kgCheck kgMm8ATemp mm8 kgCandidate0 geneCheck kgCandidate.tab
hgsql kgMm8ATemp -e 'drop table kgCandidate'
hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidate.sql
hgsql kgMm8ATemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate'
hgsql kgMm8ATemp -e 'create index alignID on kgCandidate(alignID)'
# Construct the kgCandidateX table that has alignID in the name field.
# Column 1 of kgCandidate.tab is dropped and column 11 (presumably the
# alignID -- confirm against kgCandidate.sql) takes its place, followed by
# the original columns 2-10.
cut -f 2-10 kgCandidate.tab >j2.tmp
cut -f 11 kgCandidate.tab >j1.tmp
paste j1.tmp j2.tmp >kgCandidateX.tab
# j1.tmp and j2.tmp are not removed in this step
hgsql kgMm8ATemp -e 'drop table kgCandidateX'
hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidateX.sql
hgsql kgMm8ATemp -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX'
# Score protein/mRna and protein/RefSeq alignments
kgResultBestMrna2 060115 kgMm8ATemp mm8 protMrnaBlat|sort -u >protMrnaBlatScore.tab
kgResultBestRef2 060115 kgMm8ATemp mm8 protRefBlat|sort -u >protRefScore.tab
# Combine scoring results and load them into temp DB.
cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
hgsql kgMm8ATemp -e 'drop table protMrnaScore'
hgsql kgMm8ATemp < ~/src/hg/lib/protMrnaScore.sql
hgsql kgMm8ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
hgsql kgMm8ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'
# Run kgGetCds to get CDS structure of each gene
kgGetCds kgMm8ATemp 060115 kgCandidateX jY.tmp
# G171564 does not have cds.
# G171565 does not have cds.
cat jY.tmp |sort -u >kgCandidateY.tab
rm jY.tmp
hgsql kgMm8ATemp -e 'drop table kgCandidateY'
hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidateY.sql
hgsql kgMm8ATemp -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY'
# Run kgPickPrep to replace long cds structure string with cdsId.
kgPickPrep kgMm8ATemp kgCandidateZ.tab
hgsql kgMm8ATemp -e 'drop table kgCandidateZ'
hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidateZ.sql
hgsql kgMm8ATemp -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
hgsql kgMm8ATemp -e 'create index cdsId on kgCandidateZ(cdsId)'
# Run kgPick to pick the representative mrna/protein pair for each unique CDS structure.
kgPick kgMm8ATemp mm8 sp060115 kg3.tmp dupSpMrna.tmp
sort -u dupSpMrna.tmp >dupSpMrna.tab
# Create put back list
# gbGetSeqs2, a modified version of gbGetSeqs, outputs the RefSeq IDs at the beginning of each output line.
gbGetSeqs2 -gbRoot=/cluster/data/genbank db=mm8 -get=ra RefSeq mrna ref.ra
cat ref.ra | sed -e 's/ /\t/' | sort -u >refRa.tab
hgsql mm8 -e 'drop table refRa'
hgsql mm8 < ~/src/hg/lib/refRa.sql
hgsql mm8 -e 'load data local infile "refRa.tab" into table refRa ignore 1 lines'
# Collect the refRa records to put back into kgPutBack2.tab.
# Every query joins refRa to itself three times on acc: r supplies the
# attribute of interest, r2 requires rss="rev" and r3 requires
# org="Mus musculus".  The first query creates the file, the others append.
# (A duplicated 'r2.val="rev"' predicate that had been copied into the last
# four queries is removed; the rows selected are unchanged.)
# attr = selenocysteine
hgsql mm8 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="selenocysteine" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
>kgPutBack2.tab
# attr = cno, value mentions a ribosomal frameshift
hgsql mm8 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%ribosomal frameshift%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
>>kgPutBack2.tab
# attr = cno, value mentions non-AUG
hgsql mm8 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%non-AUG%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
>>kgPutBack2.tab
# attr = translExcept
hgsql mm8 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="translExcept" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
>>kgPutBack2.tab
# attr = exception
hgsql mm8 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="exception" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
>>kgPutBack2.tab
hgsql kgMm8ATemp -e 'drop table kgPutBack2'
hgsql kgMm8ATemp < ~/src/hg/lib/kgPutBack2.sql
hgsql kgMm8ATemp -e 'load data local infile "kgPutBack2.tab" into table kgPutBack2'
kgPutBack kgMm8ATemp mm8 sp060115 kgPutBack2 kgPutBack2.gp
# No matching protein found for NM_008523.
# No matching protein found for NM_194444.
# No matching protein found for NM_206941.
# Sort KG genes (kg4.tmp) to make the knownGene.tab table file.
cat kgPutBack2.gp kg3.tmp > kg4.tmp
~/kent/src/hg/protein/sortKg.pl kg4.tmp >knownGene.tab
hgsql kgMm8ATemp -e 'drop table knownGene'
hgsql kgMm8ATemp < ~/src/hg/lib/knownGene.sql
hgsql kgMm8ATemp -e 'load data local infile "knownGene.tab" into table knownGene'
# Load data into mm8 knownGene table.
hgsql mm8 -e 'drop table knownGene'
hgsql mm8 < ~/src/hg/lib/knownGene.sql
hgsql mm8 -e 'load data local infile "knownGene.tab" into table knownGene'
# Load dupSpMrna table after knownGene table is loaded so that joinerCheck does not complain.
hgsql mm8 -e 'drop table dupSpMrna'
hgsql mm8 < ~/src/hg/lib/dupSpMrna.sql
hgsql mm8 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna'
# Perform analysis on KG
nice featureBits mm8 knownGene
# 54684224 bases of 2567283971 (2.130%) in intersection
nice featureBits mm8 knownGene:cds
# 28459053 bases of 2567283971 (1.109%) in intersection
nice featureBits mm8 refGene
# 46256526 bases of 2567283971 (1.802%) in intersection
nice featureBits mm8 refGene:cds
# 27221018 bases of 2567283971 (1.060%) in intersection
nice featureBits mm8 refGene knownGene
# 43441486 bases of 2567283971 (1.692%) in intersection
nice featureBits mm8 refGene:cds knownGene:cds
# 25164531 bases of 2567283971 (0.980%) in intersection
nice featureBits mm7 knownGene
# 53165921 bases of 2583394090 (2.058%) in intersection
nice featureBits mm7 knownGene:cds
# 27531524 bases of 2583394090 (1.066%) in intersection
nice featureBits mm7 refGene
# 46425940 bases of 2583394090 (1.797%) in intersection
nice featureBits mm7 refGene:cds
# 27319308 bases of 2583394090 (1.057%) in intersection
nice featureBits mm7 refGene knownGene
# 41777202 bases of 2583394090 (1.617%) in intersection
nice featureBits mm7 refGene:cds knownGene:cds
# 24297646 bases of 2583394090 (0.941%) in intersection
# Build knownGeneMrna and knownGenePep tables.
kgPepMrna kgMm8ATemp mm8 060115
hgsql mm8 -e 'drop table knownGeneMrna'
hgsql mm8 < ~/src/hg/lib/knownGeneMrna.sql
hgsql mm8 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
hgsql mm8 -e 'drop table knownGenePep'
hgsql mm8 < ~/src/hg/lib/knownGenePep.sql
hgsql mm8 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'
# Build kgXref table
kgXref2 kgMm8ATemp 060115 mm8
hgsql mm8 -e 'drop table kgXref'
hgsql mm8 < ~/src/hg/lib/kgXref.sql
hgsql mm8 -e 'load data local infile "kgXref.tab" into table kgXref'
# Build spMrna table
hgsql mm8 -N -e 'select proteinID, name from knownGene' >kgSpMrna.tab
hgsql mm8 -e 'drop table spMrna'
hgsql mm8 <~/src/hg/lib/spMrna.sql
hgsql mm8 -e 'load data local infile "kgSpMrna.tab" into table spMrna'
# Build kgProtMap table
ssh hgwdev
cd /cluster/store9/kg/kgMm8A
ln -s protBlat/tight_mrna.psl .
~/src/hg/protein/kgProtMap2.sh kgMm8A mm8 060115
#####################################
# Build alias tables. (DONE 2/28/06, Fan)
ssh hgwdev
cd /cluster/store9/kg/kgMm8A
mkdir alias
cd alias
kgAliasM mm8 proteins060115
# kgAliasKgXref reads from mm8.knownGene.proteinID,
# mm8.knownGene.name, mm8.kgXref.geneSymbol
# to create kgAliasKgXref.tab
kgAliasKgXref mm8
# kgAliasRefseq reads from mm8.knownGene.name,
# mm8.knownGene.proteinID, mm8.kgXref.refseq
# to create kgAliasRefseq.tab
kgAliasRefseq mm8
hgsql sp060115 -N -e 'select name,gene.val from mm8.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
| sort -u > kgAliasP.tab
hgsql mm8 -N -e 'select name, name from knownGene' >kgAliasDup.tab
hgsql mm8 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
sort |uniq > kgAlias.tab
hgsql -e "drop table kgAlias;" mm8
hgsql mm8 < ~/kent/src/hg/lib/kgAlias.sql
hgsql mm8 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias'
# kgProtAlias reads from mm8.knownGene.name,
# mm8.knownGene.proteinID, mm8.knownGene.alignID,
# proteins060115.spXref3.accession, proteins060115.spSecondaryID, proteins060115.pdbSP.pdb
# to create kgProtAlias.tab
kgProtAlias mm8 060115
hgsql mm8 -N -e \
'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
| sort -u >kgProtAliasNCBI.tab
# include variant splice protein IDs
hgsql mm8 -N -e \
'select name, proteinID, parAcc from knownGene,sp060115.varAcc where varAcc=proteinID'\
|sort -u >kgProtAliasDup.tab
# include duplicate protein IDs from dupSpMrna table
hgsql mm8 -N -e \
'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
|sort -u >>kgProtAliasDup.tab
# catch parent acc from dupProteinID too
hgsql mm8 -N -e\
'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp060115.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
|sort -u >>kgProtAliasDup.tab
cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab
echo "`date` creating table kgProtAlias"
hgsql mm8 -e "drop table kgProtAlias;"
hgsql mm8 <~/src/hg/lib/kgProtAlias.sql;
hgsql mm8 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'
# Build kgSpAlias table
# Both queries run with -N so no column-header row is written.  This
# replaces the earlier header removal via "grep -v kgID", which would also
# have discarded any data row that happened to contain the string "kgID".
hgsql mm8 -N -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql mm8 -N -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
# merge the two result sets and drop duplicate rows
sort -u j.tmp >mm8.kgSpAlias.tab
rm j.tmp
hgsql mm8 -e 'drop table kgSpAlias';
hgsql mm8 < ~/src/hg/lib/kgSpAlias.sql
hgsql mm8 -e 'load data local infile "mm8.kgSpAlias.tab" into table kgSpAlias'
#############################################################################
# 17-WAY VAR_MULTIZ - ALIGNMENTS (DONE - 2006-02-28 - 2006-03-02 - Hiram)
# Re-DONE with panTro2 in place of panTro1 - 2006-04-19 - Hiram)
# And again with xenTro2 in place of xenTro1 - 2006-04-24
# And again with danRer4 in place of danRer3 - 2006-05-02
ssh kkstore04
mkdir /cluster/data/mm8/bed/multiz17way
cd /cluster/data/mm8/bed/multiz17way
# create tree diagram to guide work below.
# This tree was constructed from one that Adam is using for
# ENCODE work and a 27-way alignment. Took that file and
# removed some of the entries, adding together the appropriate
# distances.
cat << '_EOF_' > 17way.nh
(((((((((
(human_hg18:0.006690,chimp_panTro2:0.007571):0.024272,
macaque_rheMac2:0.0592):0.023960,
((rat_rn4:0.081728,mouse_mm8:0.077017):0.229273,
rabbit_oryCun1:0.206767):0.1065):0.023026,
(cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505,
armadillo_dasNov1:0.149862):0.015994,
(elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400,
monodelphis_monDom4:0.371073):0.189124,
chicken_galGal2:0.454691):0.123297,
xenopus_xenTro2:0.782453):0.156067,
((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961,
zebrafish_danRer4:0.782561):0.156067);
'_EOF_'
# << happy emacs
# draw_tree output goes to 17way.ps, all_dists output to
# 17way.distances.txt; both read the tree file written above.
/cluster/bin/phast/draw_tree 17way.nh > 17way.ps
/cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt
# List the lines mentioning mm8, sorted numerically on the third column.
# -i is used for the case-insensitive match; -y is an obsolete synonym.
grep -i mm8 17way.distances.txt | sort -k3,3n
# Print out that file for reference, and use the calculated
# distances in the table below to order the organisms and check
# the button order on the browser. Zebrafish ends up before
# tetraodon and fugu on the browser despite its distance.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
#
# featureBits chainLink measures
# chainMm8Link chain linearGap
# distance on Mm8 on other minScore
# 1 0.1587 - rat rn4 (% 68.957) (% 69.651) 3000 medium
# 2 0.4677 - human hg18 (% 38.343) (% 34.514) 3000 medium
# 3 0.4686 - chimp panTro2 (% 37.549) (% 33.614) 3000 medium
# 4 0.4960 - macaque rheMac2 (% 34.718) (% 33.170) 3000 medium
# 5 0.5131 - rabbit oryCun1 (% 19.322) (no swap ) 3000 medium
# 6 0.6142 - armadillo dasNov1 (% 16.825) (no swap ) 3000 medium
# 7 0.6230 - dog canFam2 (% 32.281) (% 34.255) 3000 medium
# 8 0.6256 - elephant loxAfr1 (% 18.392) (no swap ) 3000 medium
# 9 0.6344 - cow bosTau2 (% 26.832) (% 24.293) 3000 medium
# 10 0.7805 - tenrec echTel1 (% 11.412) (no swap ) 5000 loose
# 11 1.0698 - opossum monDom4 (% 8.245) (% 6.024) 5000 loose
# 12 1.3425 - chicken galGal2 (% 2.552) (% 5.414) 5000 loose
# 13 1.7936 - frog xenTro2 (% 2.651) (% 5.358) 5000 loose
# 14 2.0157 - tetraodon tetNig1 (% 1.962) (% 13.734) 5000 loose
# 15 2.0562 - fugu fr1 (% 1.907) (% 13.524) 5000 loose
# 16 2.1059 - zebrafish danRer4 (% 2.105) (% 3.576) 5000 loose
cd /cluster/data/mm8/bed/multiz17way
# bash shell syntax here ...
# Symlink each species' blastz.<db>/mafNet/*.maf.gz files into mafLinks/<db>.
# The source directory is checked before anything is created for that
# species, so a missing alignment does not leave an empty mafLinks/<db>
# directory behind; mkdir -p tolerates directories that already exist.
export H=/cluster/data/mm8/bed
mkdir -p mafLinks
for G in rn4 hg18 panTro2 rheMac2 oryCun1 dasNov1 canFam2 \
    loxAfr1 bosTau2 echTel1 monDom4 galGal2 xenTro2 tetNig1 fr1 danRer4
do
    if [ ! -d "${H}/blastz.${G}/mafNet" ]; then
        echo "missing directory blastz.${G}/mafNet"
        exit 255
    fi
    mkdir -p "mafLinks/${G}"
    # the glob stays outside the quotes so it still expands
    ln -s "${H}/blastz.${G}"/mafNet/*.maf.gz "./mafLinks/${G}"
done
# Copy MAFs to some appropriate NFS server for kluster run
ssh kkstore04
mkdir /san/sanvol1/scratch/mm8/multiz17way
cd /san/sanvol1/scratch/mm8/multiz17way
time rsync -a --copy-links --progress \
/cluster/data/mm8/bed/multiz17way/mafLinks/ .
# We have about 5.9 Gb of data here, takes ~ 10 minutes to copy
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
# the autoMultiz cluster run
ssh pk
cd /cluster/data/mm8/bed/multiz17way/
# create species list and stripped down tree for autoMZ
# Step 1 (sed on 17way.nh): drop the lowercase "name_" prefixes
# (e.g. "human_"), drop the ":<digits/dots>" branch lengths, drop the ";",
# and delete blank lines.
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
17way.nh > tmp.nh
# Step 2: the unquoted backtick expansion joins all lines into one line
echo `cat tmp.nh` > tree-commas.nh
# Step 3: remove every space, then turn commas into spaces
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
# Step 4: strip the parentheses to leave a space-separated list of db names,
# in the order they appear in the tree
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
# the maf directory here is a symlink to a /cluster/store8
# directory to even out the data load on store9 and store8 on kkstore04
mkdir /cluster/store8/mm8/bed/multiz17way/maf
ln -s /cluster/store8/mm8/bed/multiz17way/maf ./maf
mkdir run
cd run
# NOTE: you need to set the db properly in this script
# The here-document below is written verbatim to ./autoMultiz (csh).
# What the generated script does:
#   $1 = name selecting the <name>.maf pairwise inputs (a chromosome name,
#        per the job template that follows), $2 = destination maf path
#   - recreates an empty work directory /scratch/tmp/$db/multiz.$1 and
#     copies ../tree.nh and ../species.lst into it
#   - for every entry of species.lst other than $db, writes
#     $db.<species>.sing.maf from $pairs/<species>/$1.maf.gz (zcat) or
#     $pairs/<species>/$1.maf (cp); when neither exists it writes a file
#     holding only a "##maf" header line
#   - puts $binDir first on the path and runs autoMZ with the contents of
#     tree.nh and all $db.*.sing.maf files, producing $1.maf
#   - copies $1.maf to $2 and removes the work directory
cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = mm8
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/$db/multiz17way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz17way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
# make the generated script executable
chmod +x autoMultiz
cat << '_EOF_' > template
#LOOP
autoMultiz $(root1) {check out line+ /cluster/store8/mm8/bed/multiz17way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
awk '{print $1}' /cluster/data/mm8/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
# 34 jobs
para try ... check ... push ... etc ...
# Completed: 34 of 34 jobs
# CPU time in finished jobs: 210573s 3509.55m 58.49h 2.44d 0.007 y
# IO & Wait Time: 4870s 81.17m 1.35h 0.06d 0.000 y
# Average job time: 6337s 105.61m 1.76h 0.07d
# Longest finished job: 17786s 296.43m 4.94h 0.21d
# Submission to last job: 41755s 695.92m 11.60h 0.48d
# combine results into a single file for loading and gbdb reference
ssh kkstore04
cd /cluster/data/mm8/bed/multiz17way
# There used to be a mafFilter here with a minScore of 500, but it
# turns out that the scores in these maf files are pretty much
# useless. They range from very large negatives to very large
# positives.
time catDir maf > multiz17way.maf
# real 10m17.400s
# makes a 17 Gb file:
# -rw-rw-r-- 1 17334936245 Apr 20 10:31 multiz17way.maf
# Create per-chrom individual maf files for downloads
# These are actually done after the annotation mafs are made
## re-done with corrected annotated mafs 2007-03-28 - Hiram
ssh kkstore04
cd /cluster/data/mm8/bed/multiz17way
mkdir mafDownloads
# Copy each annotated per-chromosome maf into mafDownloads/ (keeping its
# attributes with cp -p) and compress the copy in place, at lowest priority.
time for M in anno/maf/chr*.maf
do
    # file name without its leading directories
    B=${M##*/}
    nice -n +19 cp -p "${M}" "mafDownloads/${B}"
    nice -n +19 gzip "mafDownloads/${B}"
    echo "${B}" done
done
# real 59m16.415s
cd mafDownloads
md5sum *.gz > md5sum.txt
# deliver to downloads
ssh hgwdev
ln -s /cluster/data/mm8/bed/multiz17way/mafDownloads \
/usr/local/apache/htdocs/goldenPath/mm8/multiz17way
# Load into database, actually annotation mafs are loaded later
ssh hgwdev
cd /cluster/data/mm8/bed/multiz17way
mkdir /gbdb/mm8/multiz17way
ln -s /cluster/data/mm8/bed/multiz17way/multiz17way.maf \
/gbdb/mm8/multiz17way
time nice -n +19 hgLoadMaf mm8 multiz17way
# Loaded 11601035 mafs in 1 files from /gbdb/mm8/multiz17way
# real 27m29.960s
time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
-maxSize=50000 mm8 multiz17waySummary multiz17way.maf
# Created 5782229 summary blocks from 65123362 components and
# 11601035 mafs from multiz17way.maf
# real 32m34.791s
# Dropped unused indexes (2006-05-09 kate)
# NOTE: this is not required in the future, as the loader
# has been fixed to not generate these indexes
hgsql mm8 -e "alter table multiz17waySummary drop index chrom_2"
hgsql mm8 -e "alter table multiz17waySummary drop index chrom_3"
# This was done for Mm7, same image can be reused
# create tree image:
# cat << '_EOF_' > species.nh
# ((((((human,(mouse,rat)),(dog,cow)),opossum),chicken),frog),(tetraodon,zebrafish))
# '_EOF_'
# /cluster/bin/phast/draw_tree -b -s species.nh > species10.ps
# photoshop to enhance, reduce the amount of whitespace to make it
# smaller, then save as jpg
# cp species10.jpg /usr/local/apache/htdocs/images/phylo/Mm7_17way.jpg
# creating upstream mafs (DONE - 2006-07-31 - Hiram)
ssh hgwdev
# data load balancing in the kkstore04 filesystems
mkdir /cluster/store8/mm8/bed/multiz17way/upstreamMafs
cd /cluster/data/mm8/bed/multiz17way
ln -s /cluster/store8/mm8/bed/multiz17way/upstreamMafs ./upstreamMafs
# rebuilt 2007-12-21 to fix difficulty in mafFrags when species.lst
# did not have mm8 as the first one
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 mm8 \
refGene:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags mm8 multiz17way \
stdin stdout -orgs=species.lst \
| gzip -c > upstreamMafs/upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/mm8/multiz17way
ln -s /cluster/data/mm8/bed/multiz17way/upstreamMafs/upstream*.maf.gz .
############################################################################
# ANNOTATE MULTIZ17WAY MAF AND LOAD TABLES (DONE - 2006-04-24 - Hiram)
# RE-DONE 2006-05-03 with danRer4 in place of danRer3
## Redone to correct usage of nBeds and sizes file (2007-03-28 - Hiram)
ssh kolossus
mkdir /cluster/data/mm8/bed/multiz17way/anno
cd /cluster/data/mm8/bed/multiz17way/anno
mkdir maf run
cd run
rm -f sizes nBeds
twoBitInfo -nBed /cluster/data/mm8/mm8.{2bit,N.bed}
for DB in `cat /cluster/data/mm8/bed/multiz17way/species.lst`
do
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
echo ${DB}.len >> sizes
echo $DB
done
echo '#!/bin/csh -ef' > jobs.csh
echo date >> jobs.csh
# do smaller jobs first so you can see some progress immediately:
for F in `ls -1rS ../../maf/*.maf`
do
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $F \
/cluster/data/mm8/mm8.2bit ../maf/`basename $F` >> jobs.csh
echo "echo $F" >> jobs.csh
done
echo date >> jobs.csh
chmod +x jobs.csh
time ./jobs.csh > jobs.log 2>&1 &
# to watch progress;
tail -f jobs.log
# real 218m16.272s
# Load anno/maf
ssh hgwdev
cd /cluster/data/mm8/bed/multiz17way/anno/maf
mkdir -p /gbdb/mm8/multiz17way/anno/maf
ln -s /cluster/data/mm8/bed/multiz17way/anno/maf/*.maf \
/gbdb/mm8/multiz17way/anno/maf
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/mm8/multiz17way/anno/maf mm8 multiz17way
# Loaded 12484442 mafs in 34 files from /gbdb/mm8/multiz17way/anno/maf
# real 8m14.757s
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh hgwdev64
cd /cluster/data/mm8/bed/multiz17way/anno/maf
time cat *.maf | \
nice -n +19 hgLoadMafSummary mm8 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 -test multiz17waySummary stdin
# Created 3153839 summary blocks from 65123362 components
# and 12484442 mafs from stdin
# real 13m25.961s
ssh hgwdev
cd /cluster/data/mm8/bed/multiz17way/anno/maf
time nice -n +19 hgLoadSqlTab mm8 multiz17waySummary \
~/kent/src/hg/lib/mafSummary.sql multiz17waySummary.tab
# real 0m53.525s
rm *.tab
#######################################################################
# MULTIZ17WAY MAF FRAMES (DONE - 2006-04-24 - 2006-04-25 - Hiram)
# RE-DONE 2006-05-03 to replace danRer3 with danRer4
ssh hgwdev
mkdir /cluster/data/mm8/bed/multiz17way/frames
cd /cluster/data/mm8/bed/multiz17way/frames
# The following is adapted from MarkD's Makefile used for mm7...
#------------------------------------------------------------------------
# get the genes for all genomes
# mRNAs with CDS. single select to get cds+psl, then split that up and
# create genePred
# using mrna table as genes:
mkdir genes
# Build genes/<db>.gp.gz from the all_mrna alignments of each listed db.
# Only one loop header may be live: two consecutive "for" lines with no
# "do" between them are a shell syntax error.  The full list is kept active;
# the single danRer4 re-run of 2006-05-03 (danRer3 removed) used this
# header instead:
#   for qDB in danRer4
for qDB in oryCun1 panTro2 rheMac2 canFam2 bosTau2 danRer4 loxAfr1 \
tetNig1 fr1
do
# mktemp creates a uniquely named placeholder file in the current directory;
# its name is reused as the suffix of this iteration's scratch files
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${qDB}.mrna-cds.${tmpExt}
tmpMrna=${qDB}.mrna.${tmpExt}
tmpCds=${qDB}.cds.${tmpExt}
echo $qDB
# one select returns qName, the cds string, then every all_mrna column
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
${qDB} > ${tmpMrnaCds}
# columns 1-2 form the cds file; columns 4 onward are used as the psl
# (column 3, the first all_mrna column, is skipped)
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
stdout \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$qDB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
# written to /scratch/tmp first, then moved into genes/
mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
# remove the mktemp placeholder file
rm -f $tmpExt
done
# tried to use monDom4 in the above loop, but got this error:
# (450211944 450214274) out of range (0 400000000) in binKeeperAdd
# Which is interesting. This should be looked into to see why
# this is here.
# using knownGene for rn4 mm8 hg18
# using refGene for galGal2
# using mgcGenes for xenTro2
# no genes for monDom4 dasNov1 echTel1
# genePreds; (must keep only the first 10 columns for knownGene)
for qDB in rn4 mm8 hg18 galGal2 xenTro2
do
if [ $qDB = "xenTro2" ]; then
geneTbl=mgcGenes
elif [ $qDB = "galGal2" ]; then
geneTbl=refGene
else
geneTbl=knownGene
fi
echo hgsql -N -e 'select * from '"$geneTbl ${qDB}"
hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$qDB.tmp.gz
mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
rm -f $tmpExt
done
#------------------------------------------------------------------------
# create frames
# beware, BASH syntax here ...
# rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
clusterDir=/cluster/bluearc/mm8/multiz17wayFrames
multizDir=/cluster/data/mm8/bed/multiz17way
mafDir=$multizDir/mafDownloads
geneDir=$multizDir/frames/genes
clusterMafDir=${clusterDir}/maf
clusterGeneDir=${clusterDir}/genes
clusterFramesDir=${clusterDir}/mafFrames.kki
# copy mafs to cluster storage
mkdir $clusterDir
ssh -x kkstore04 "rsync -av $mafDir/*.maf.gz $clusterMafDir/"
# copy genes to cluster storage
ssh -x kkstore04 "rsync -av $geneDir/*.gp.gz $clusterGeneDir/"
# run cluster jobs
tmpExt=`mktemp temp.XXXXXX`
paraDir=$multizDir/frames/para.${tmpExt}
cd /cluster/data/mm8/bed/multiz17way/frames
mkdir mafFrames $paraDir
mkdir ${clusterFramesDir}
for qDB in `cat /cluster/data/mm8/bed/multiz17way/species.lst`
do
mkdir ${clusterFramesDir}/${qDB}
for C in `awk '{print $1;}' /cluster/data/mm8/chrom.sizes`
do
if [ -e ${clusterGeneDir}/${qDB}.gp.gz ]; then
echo /cluster/bin/scripts/mkMafFrames.pl ${qDB} mm8 \
${clusterGeneDir}/${qDB}.gp.gz ${clusterMafDir}/$C.maf.gz \
${clusterFramesDir}/${qDB}/$C.mafFrames \
>> $paraDir/jobList
fi
done
done
rm -f $tmpExt
ssh -x kki "cd ${paraDir} && para make jobList && para time"
# Completed: 476 of 476 jobs
# CPU time in finished jobs: 6235s 103.91m 1.73h 0.07d 0.000 y
# IO & Wait Time: 13538s 225.64m 3.76h 0.16d 0.000 y
# Average job time: 42s 0.69m 0.01h 0.00d
# Longest finished job: 237s 3.95m 0.07h 0.00d
# Submission to last job: 1242s 20.70m 0.34h 0.01d
# combine results from cluster
for qDB in \
`sed -e "s/ dasNov1//; s/ echTel1//; s/ monDom4//;" ../species.lst`
do
ssh -x kolossus "cat ${clusterFramesDir}/${qDB}/*.mafFrames | gzip -2c > ${multizDir}/frames/mafFrames/${qDB}.mafFrames.gz"
echo "${qDB}"
done
#------------------------------------------------------------------------
# load the database
ssh hgwdev
cd /cluster/data/mm8/bed/multiz17way/frames
time nice -n +19 hgLoadMafFrames mm8 multiz17wayFrames \
mafFrames/*.mafFrames.gz
# real 1m11.457s
#------------------------------------------------------------------------
# clean up
rm -rf ${clusterDir}
###
# rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
ssh kkstore04
cd /cluster/data/mm8/bed/multiz17way/frames
mv mafFrames/ mafFrames.old
nice tcsh # easy way to get process niced
(cat ../maf/*.maf | time genePredToMafFrames mm8 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer4 genes/danRer4.gp.gz fr1 genes/fr1.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz loxAfr1 genes/loxAfr1.gp.gz mm8 genes/mm8.gp.gz oryCun1 genes/oryCun1.gp.gz panTro2 genes/panTro2.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz tetNig1 genes/tetNig1.gp.gz xenTro2 genes/xenTro2.gp.gz bosTau2 genes/bosTau2.gp.gz | gzip >multiz17way.mafFrames.gz)>&log&
ssh hgwdev
cd /cluster/data/mm8/bed/multiz17way/frames
hgLoadMafFrames mm8 multiz17wayFrames multiz17way.mafFrames.gz |&mail markd&
############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS
# (DONE - 2006-03-02 - Hiram)
# (RE-DONE - 2006-04-25 with panTro2 and xenTro2 - Hiram)
# (RE-DONE - 2006-05-03 with danRer4 instead of danRer3 - Hiram)
# Will skip this estimate for Mm8 since it was well done in Mm7
# and in Hg17, skip to the creation of the SS files
# Estimate phastCons parameters
ssh kkstore01
mkdir /cluster/data/mm8/bed/multiz17way/cons
cd /cluster/data/mm8/bed/multiz17way/cons
# Create a starting-tree.mod based on chr2 (the largest one)
/cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr2.maf \
--refseq ../../../2/chr2.fa --in-format MAF \
--windows 100000000,1000 --out-format SS \
--between-blocks 5000 --out-root s1
# 10 minutes
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "((((((((((hg18,panTro2),rheMac2),((rn4,mm8),oryCun1)),(bosTau2,canFam2)),dasNov1),(loxAfr1,echTel1)),monDom4),galGal2),xenTro2),((tetNig1,fr1),danRer4))" \
--out-root starting-tree
# real 840m53.157s
# That is 14 hours !
rm s1.*.ss
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.407
# This 0.407 is used in the --gc argument below
# CONTINUE HERE, no estimation required
# Create big bad bloated SS files on san filesystem (takes ~ 2h 20m)
# Increasing their size this time from 1,000,000 to 10,000,000 to
# slow down the phastCons pk jobs
ssh kkstore04
mkdir -p /san/sanvol1/scratch/mm8/cons/ss
cd /san/sanvol1/scratch/mm8/cons/ss
time for C in `awk '{print $1}' /cluster/data/mm8/chrom.sizes`
do
if [ -s /cluster/data/mm8/bed/multiz17way/maf/${C}.maf ]; then
mkdir ${C}
echo msa_split $C
chrN=${C/chr/}
chrN=${chrN/_random/}
/cluster/bin/phast/$MACHTYPE/msa_split \
/cluster/data/mm8/bed/multiz17way/maf/${C}.maf \
--refseq /cluster/data/mm8/${chrN}/${C}.fa \
--in-format MAF --windows 4000000,0 --between-blocks 5000 \
--out-format SS --out-root ${C}/${C}
fi
done &
# real 94m49.273s
# Again, going to SKIP this tuning business this time and use the
# previous numbers.
# Create a random list of 50 1 mb regions (do not use the _randoms)
cd /san/sanvol1/scratch/mm8/cons/ss
ls -1l chr*/chr*.ss | grep -v random | \
awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
# Set up parasol directory to calculate trees on these 50 regions
ssh pk
mkdir /san/sanvol1/scratch/mm8/cons/treeRun1
cd /san/sanvol1/scratch/mm8/cons/treeRun1
mkdir tree log
# Tuning this loop should come back to here to recalculate
# Create little script that calls phastCons with right arguments
# --target-coverage of 0.20 is about right for mouse, will be
# tuned exactly below
cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
/cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
/cluster/data/mm8/bed/multiz17way/cons/starting-tree.mod \
--gc 0.407 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-lengths 12 --target-coverage 0.17 \
--quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
# << happy emacs
chmod a+x makeTree.csh
# Create gensub file
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Make cluster job and run it
gensub2 ../randomSs.list single template jobList
para create jobList
para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs: 354644s 5910.74m 98.51h 4.10d 0.011 y
# IO & Wait Time: 352s 5.86m 0.10h 0.00d 0.000 y
# Average job time: 7100s 118.33m 1.97h 0.08d
# Longest finished job: 29358s 489.30m 8.15h 0.34d
# Submission to last job: 29446s 490.77m 8.18h 0.34d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ssh kkstore01
cd /san/sanvol1/scratch/mm8/cons/treeRun1
ls -1 tree/chr*/*.cons.mod > cons.list
time /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
--output-average ../ave.cons.mod > cons_summary.txt 2>&1 &
ls -1 tree/chr*/*.noncons.mod > noncons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
--output-average ../ave.noncons.mod > noncons_summary.txt
cd ..
cp -p ave.*.mod /cluster/data/mm8/bed/multiz17way/cons
# measuring entropy
# consEntropy <target coverage> <expected lengths>
# ave.cons.mod ave.noncons.mod --NH 9.78
# never stops with the --NH argument
/cluster/bin/phast/$MACHTYPE/consEntropy .17 12 \
ave.cons.mod ave.noncons.mod
# XXXX - does not work: 2005-11-28
# [hiram@kkstore01 /san/sanvol1/scratch/mm8/cons] /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 ave.cons.mod ave.noncons.mod
# ERROR: with no separate source alignment, ss_from_msas expects sequences of positive length and no SS object.
#Transition parameters:gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.454874 bits/site
# Required length: N=7.596943 sites
# Total entropy: NH=11.052595 bits
# consEntropy .20 12 ave.cons.mod.1 ave.noncons.mod.1
# Transition params: gamma=0.200000, omega=12.000000, mu=0.083333, nu=0.020833
# Relative entropy: H=1.454874 bits/site
# Required length: N=6.629337 sites
# Total entropy: NH=9.644850 bits
# consEntropy .10 12 ave.cons.mod.2 ave.noncons.mod.2
# Transition params: gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.527815 bits/site
# Required length: N=7.205526 sites
# Total entropy: NH=11.008713 bits
# consEntropy .20 8 ave.cons.mod.3 ave.noncons.mod.3
# Transition params: gamma=0.200000, omega=8.000000, mu=0.125000, nu=0.031250
# Relative entropy: H=1.654878 bits/site
# Required length: N=5.146793 sites
# Total entropy: NH=8.517313 bits
### !!! *** This one with .17 and 12 is the one that was finally used
# consEntropy .17 12 ave.cons.mod.4 ave.noncons.mod.4
# Transition params: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=1.478838 bits/site
# Required length: N=6.753382 sites
# Total entropy: NH=9.987159 bits
# SKIP to here passing by the tuning numbers
ssh pk
# Create cluster dir to do main phastCons run
mkdir /san/sanvol1/scratch/mm8/cons/consRun3
cd /san/sanvol1/scratch/mm8/cons
cp /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod .
# edit, change monDom2 to monDom4, hg17 to hg18, rheMac1 to
# rheMac2, rn3 to rn4, mm7 to mm8
# danRer3 to danRer4
# It looks like:
ALPHABET: A C G T
ORDER: 0
SUBST_MOD: REV
TRAINING_LNL: -988246.132962
BACKGROUND: 0.295 0.205 0.205 0.295
RATE_MAT:
-1.165221 0.315494 0.589884 0.259843
0.189778 -0.878194 0.208718 0.479698
0.444622 0.261535 -0.885604 0.179447
0.234867 0.720815 0.215191 -1.170872
TREE: (((((((((((((hg18:0.006690,panTro2:0.007571):0.024272,(colobus_monkey:0.015404,(baboon:0.008258,rheMac2:0.028617):0.008519):0.022120):0.023960,(dusky_titi:0.025662,(owl_monkey:0.012151,marmoset:0.029549):0.008236):0.027158):0.066101,(mouse_lemur:0.059024,galago:0.121375):0.032386):0.017073,((rn4:0.081728,mm8:0.077017):0.229273,oryCun1:0.206767):0.023340):0.023026,(((bosTau2:0.159182,canFam2:0.147731):0.004946,rfbat:0.138877):0.010150,(hedgehog:0.193396,shrew:0.261724):0.054246):0.024354):0.028505,dasNov1:0.149862):0.015994,(loxAfr1:0.104891,echTel1:0.259797):0.040371):0.218400,monDom4:0.371073):0.065268,platypus:0.468116):0.123856,galGal2:0.454691):0.123297,xenTro2:0.782453):0.156067,((tetNig1:0.199381,fr1:0.239894):0.492961,danRer4:0.782561):0.156067);
cd /san/sanvol1/scratch/mm8/cons/consRun3
mkdir ppRaw bed
# Create script to run phastCons with right parameters
# These parameters:
# --rho 0.28 --expected-length 14 --target-coverage 0.008 --quiet \
# were taken from Kate's 17-way in Hg17, removing the
# --not-informative panTro2 since that isn't relevant here, nor
# would be --not-informative rn4 - Jim says rn4 is far enough away
# from mm8 that it is informative.
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
cat > doPhast << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../elliotsEncode.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss elliotsEncode.mod \
--rho 0.28 --expected-length 14 --target-coverage 0.008 --quiet \
--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/elliotsEncode.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
# << happy emacs
chmod a+x doPhast
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast $(root1) $(file1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
gensub2 in.list single template jobList
para create jobList
para try/check/push/etc.
# These jobs are very fast and very I/O intensive, even on the san
# they will hang it up as they work at full tilt.
# Completed: 689 of 689 jobs
# CPU time in finished jobs: 12806s 213.44m 3.56h 0.15d 0.000 y
# IO & Wait Time: 16079s 267.98m 4.47h 0.19d 0.001 y
# Average job time: 42s 0.70m 0.01h 0.00d
# Longest finished job: 94s 1.57m 0.03h 0.00d
# Submission to last job: 350s 5.83m 0.10h 0.00d
# combine predictions and transform scores to be in 0-1000 interval
# it uses a lot of memory, so on kolossus:
ssh kolossus
cd /san/sanvol1/scratch/mm8/cons/consRun3
# The sed's and the sort get the file names in chrom,start order
# You might like to verify it is correct by first looking at the
# list it produces:
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less
# if that looks right, then let it run:
# FOR NEXT TIME - the result file should be named:
# phastConsElements17way.bed since that is the name of the DB
# table that it is loaded into. (instead of mostConserved.bed)
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/mm8/bed/multiz17way
# Figure out how much is actually covered by the bed file as so:
# Get the non-n genome size from faSize on all chroms:
ssh kkstore01
cd /cluster/data/mm8
faSize ?{,?}/chr*.fa
# 2664455088 bases (97171400 N's 2567283688 real 1477933003 upper
# 1089350685 lower) in 34 sequences in 34 files
cd /san/sanvol1/scratch/mm8/cons/consRun3
# The 2567283688 comes from the non-n genome as counted above.
awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/2567283688\n",100.0*sum/2567283688,sum}' \
mostConserved.bed
# --rho 0.28 --expected-length 14 --target-coverage 0.008
# % 5.40 = 100.0*138575691/2567283688 danRer4 instead of danRer3
# % 5.43 = 100.0*139309333/2567283688 panTro2 and xenTro2
# % 5.39 = 100.0*138300407/2567283688 panTro1 and xenTro1
# Aiming for 70% coverage in
# the following featureBits measurement on CDS:
# Beware of negative scores when too high. The lodToBedScore
# will output an error on any negative scores.
HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits mm8 \
-enrichment refGene:cds mostConserved.bed
# --rho 0.28 --expected-length 14 --target-coverage 0.008
# with danRer4 instead of danRer3:
# refGene:cds 1.062%, mostConserved.bed 5.398%, both 0.743%, cover
# 69.99%, enrich 12.97x
# with panTro2 and xenTro2:
# refGene:cds 1.060%, mostConserved.bed 5.426%, both 0.740%, cover
# 69.85%, enrich 12.87x
# with panTro1 and xenTro1:
# refGene:cds 1.060%, mostConserved.bed 5.387%, both 0.739%, cover
# 69.71%, enrich 12.94x
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/mm8/bed/multiz17way
# the copy was already done above
# cp -p /san/sanvol1/scratch/mm8/cons/consRun3/mostConserved.bed .
time nice -n +19 hgLoadBed -strict mm8 phastConsElements17way \
mostConserved.bed
# Loaded 1883370 elements of size 5
# real 2m54.033s
# should measure the same as above
time nice -n +19 featureBits mm8 -enrichment refGene:cds \
phastConsElements17way
# with danRer4 in place of danRer3:
# refGene:cds 1.062%, phastConsElements17way 5.398%, both 0.743%,
# cover 69.99%, enrich 12.97x
# with panTro2 and xenTro2:
# refGene:cds 1.060%, phastConsElements 5.426%, both 0.740%, cover
# 69.85%, enrich 12.87x
# with panTro1 and xenTro1:
# refGene:cds 1.060%, phastConsElements 5.387%, both 0.739%, cover
# 69.71%, enrich 12.94x
# Create merged posterior probability file and wiggle track data files
ssh kkstore04
cd /san/sanvol1/scratch/mm8/cons/consRun3
# the sed business gets the names sorted by chromName, chromStart
# so that everything goes in numerical order into wigEncode
# This was verified above to be correct
time nice -n +19 find ./ppRaw -type f \
| sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \
phastCons17.wig phastCons17.wib
# real 15m59.846s
# -rw-rw-r-- 1 1961998053 May 3 12:22 phastCons17.wib
# -rw-rw-r-- 1 237229239 May 3 12:22 phastCons17.wig
time nice -n +19 cp -p phastCons17.wi? /cluster/data/mm8/bed/multiz17way/
# real 1m21.329s
# prepare compressed copy of ascii data values for downloads
ssh pk
cd /san/sanvol1/scratch/mm8/cons/consRun3
# Write gzipAscii.sh: for every ppRaw/chr* directory, concatenate its
# files in chrom,start order and gzip the result into
# phastCons17Scores/<chrom>.data.gz
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons17Scores
for D in ppRaw/chr*
do
# drop the leading "ppRaw/" to get the chrom name; the # prefix-removal
# form is POSIX, so it also works when /bin/sh is not bash
C=${D#ppRaw/}
out=phastCons17Scores/${C}.data.gz
echo "========================== ${C} ${D}"
# the first sed spreads '/', '.' and '-' into separate fields so sort can
# key on field 7 and, numerically, on field 9; the second sed restores
# the file names before they are handed to cat
find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
gzip > ${out}
done
'_EOF_'
# << happy emacs
chmod +x gzipAscii.sh
time nice -n +19 ./gzipAscii.sh
# real 18m15.212s
# copy them for downloads
ssh kkstore04
# this directory is actually a symlink from store9 to store8 to
# avoid the data full problem on store9
mkdir /cluster/data/mm8/bed/multiz17way/phastCons17Scores
cd /cluster/data/mm8/bed/multiz17way/phastCons17Scores
cp -p /san/sanvol1/scratch/mm8/cons/consRun3/phastCons17Scores/* .
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/mm8
ln -s /cluster/data/mm8/bed/multiz17way/phastCons17Scores .
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/mm8/bed/multiz17way
ln -s `pwd`/phastCons17.wib /gbdb/mm8/wib/phastCons17.wib
time nice -n +19 hgLoadWiggle mm8 phastCons17 phastCons17.wig
# real 2m55.836s
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/mm8/bed/multiz17way
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=mm8 phastCons17 > histogram.data 2>&1
# real 28m24.388s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm8 Histogram phastCons17 track"
set xlabel " phastCons17 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
# QA NOTE: (ASZ: 5/1/2006) changed name of phastConsElements table to phastConsElements17way
# QA NOTE: (ASZ: 5/1/2006) changed name of phastCons17 table to phastCons17way
# Hiram Note: phastCons17 never changed to phastCons17way at any time
#########################################################################
# MAKE FOLDUTR TABLES (DONE 2006-02-28, Fan)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
cd /cluster/data/mm8/bed
rm rnaStruct
mkdir /san/sanvol1/scratch/mm8/rnaStruct.2006-02-28
ln -s /san/sanvol1/scratch/mm8/rnaStruct.2006-02-28 rnaStruct
cd rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa mm8 knownGene utr3 utr3/utr.fa
utrFa mm8 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh pk
cd /cluster/data/mm8/bed/rnaStruct
faSplit sequence utr3/utr.fa 4000 utr3/split/s
faSplit sequence utr5/utr.fa 4000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
cp gsub ../utr5
# Do cluster run for 3' UTRs
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 3897 of 3897 jobs
# CPU time in finished jobs: 227530s 3792.17m 63.20h 2.63d 0.007 y
# IO & Wait Time: 44046s 734.10m 12.23h 0.51d 0.001 y
# Average job time: 70s 1.16m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1337s 22.28m 0.37h 0.02d
# Submission to last job: 1886s 31.43m 0.52h 0.02d
# Do cluster run for 5' UTRs
cd ../utr5
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 3762 of 3762 jobs
# CPU time in finished jobs: 42244s 704.07m 11.73h 0.49d 0.001 y
# IO & Wait Time: 10250s 170.83m 2.85h 0.12d 0.000 y
# Average job time: 14s 0.23m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2014s 33.57m 0.56h 0.02d
# Submission to last job: 2083s 34.72m 0.58h 0.02d
# Load database
ssh hgwdev
cd /cluster/data/mm8/bed/rnaStruct/utr5
hgLoadRnaFold mm8 foldUtr5 fold
cd ../utr3
hgLoadRnaFold mm8 foldUtr3 fold
# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak
# Build KEGG pathway tables. (DONE 3/8/06. Fan)
# Run KGpath.sh in a fresh kegg/ directory, then recreate the keggMapDesc
# and keggPathway tables in mm8 and load them from keggMapDesc.tab and
# keggPathway.tab.
ssh hgwdev
cd /cluster/store9/kg/kgMm8A
# plain mkdir: "md" is not a standard command
mkdir kegg
cd kegg
~/src/hg/protein/KGpath.sh kgMm8A mm8 060115
# drop any previous copies so the tables are rebuilt from their .sql
# definitions before loading
hgsql mm8 -e "drop table keggMapDesc"
hgsql mm8 -e "drop table keggPathway"
hgsql mm8 <~/src/hg/lib/keggMapDesc.sql
hgsql mm8 <~/src/hg/lib/keggPathway.sql
hgsql mm8 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
hgsql mm8 -e 'load data local infile "keggPathway.tab" into table keggPathway'
# Build CGAP pathway tables
cd ..
~/src/hg/protein/KGcgap.sh kgMm8A mm8 060115
cat cgapBIOCARTAdesc.tab |sort -u > cgapBIOCARTAdescSorted.tab
hgsql mm8 -e "drop table cgapAlias"
hgsql mm8 -e "drop table cgapBiocDesc"
hgsql mm8 -e "drop table cgapBiocPathway"
hgsql mm8 <~/src/hg/lib/cgapAlias.sql
hgsql mm8 <~/src/hg/lib/cgapBiocDesc.sql
hgsql mm8 <~/src/hg/lib/cgapBiocPathway.sql
hgsql mm8 -e 'load data local infile "cgapAlias.tab" into table cgapAlias'
hgsql mm8 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
hgsql mm8 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'
####################################################################################
# BUILD PROTEOME BROWSER TABLES FOR mm8 (DONE 3/8/06, Fan)
# These are instructions for building tables needed for the Proteome Browser.
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.
# This build is based on proteins DBs dated 060115.
# Create the working directory
ssh hgwdev
mkdir /cluster/store9/kg/kgMm8A/pb-2006-03-08
cd /cluster/data/mm8/bed
rm pb
ln -s /cluster/store9/kg/kgMm8A/pb-2006-03-08 pb
cd pb
# Define pep* tables in mm8 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql mm8 < pepAll.sql
# Build the pepMwAa table
hgsql proteins060115 -N -e \
"select info.acc, molWeight, aaSize from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql mm8 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'
# Build the pepPi table
hgsql proteins060115 -e \
"select info.acc from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis
hgsql mm8 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis
pbCalPi protAcc.lis sp060115 pepPi.tab
hgsql mm8 -e 'delete from pepPi'
hgsql mm8 -e 'load data local infile "pepPi.tab" into table mm8.pepPi'
# Calculate and load pep distributions
pbCalDist sp060115 proteins060115 10090 mm8 >pbCalDist.out
wc pbCalDist.out
hgsql mm8
load data local infile "pepExonCntDist.tab" into table mm8.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table mm8.pepCCntDist;
load data local infile "pepHydroDist.tab" into table mm8.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table mm8.pepMolWtDist;
load data local infile "pepResDist.tab" into table mm8.pepResDist;
load data local infile "pepIPCntDist.tab" into table mm8.pepIPCntDist;
load data local infile "pepPiDist.tab" into table mm8.pepPiDist;
quit
# Calculate frequency distributions
pbCalResStd sp060115 10090 mm8
# Create pbAnomLimit and pbResAvgStd tables
hgsql mm8 -e "drop table pbAnomLimit"
hgsql mm8 -e "drop table pbResAvgStd"
hgsql mm8 < ~/src/hg/lib/pbAnomLimit.sql
hgsql mm8 < ~/src/hg/lib/pbResAvgStd.sql
hgsql mm8 -e 'load data local infile "pbResAvgStd.tab" into table mm8.pbResAvgStd;'
hgsql mm8 -e 'load data local infile "pbAnomLimit.tab" into table mm8.pbAnomLimit;'
# Create pbStamp table for PB
hgsql mm8 -e "drop table pbStamp"
hgsql mm8 < ~/src/hg/lib/pbStamp.sql
hgsql mm7 -N -e 'select * from pbStamp' > pbStamp.tab
hgsql mm8 -e 'load data local infile "pbStamp.tab" into table mm8.pbStamp'
# ENABLE PB FOR mm8 IN HGCENTRALTEST
echo " insert into gdbPdb values('mm8', 'proteins060115')" \
| hgsql -h genome-testdb hgcentraltest
echo "update dbDb set hgPbOk = 1 where name = 'mm8';" \
| hgsql -h genome-testdb hgcentraltest
# Adjust drawing parameters for Proteome Browser stamps
# Now invoke Proteome Browser and adjust various drawing parameters
# (mostly the ymax of each stamp) if necessary, by updating the
# pbStamp.tab file and then delete and reload the pbStamp table.
hgsql mm8 -e "drop table pbStamp"
hgsql mm8 < ~/src/hg/lib/pbStamp.sql
hgsql mm8 -e 'load data local infile "pbStamp.tab" into table mm8.pbStamp'
# Perform preliminary review of Proteome Browser for mm8, then
# notify QA for formal review.
# BUILD MISC STUFF FOR KG
# Build mrnaRefseq table
# First make sure the entrez DB is updated. (recently updated on 2/8/06).
# Build and load the mm8 mrnaRefseq table from two sources:
#   mrnaRefseq1.tab - mrna/refseq pairs from the entrez database, limited
#                     to mRNAs whose name appears as qName in mm8.all_mrna
#   mrnaRefseq2.tab - every refGene name paired with itself
ssh hgwdev
cd /cluster/store9/kg/kgMm8A
# the output name must be mrnaRefseq1.tab, the file the cat below reads
hgsql entrez -N -e \
'select mrna, refseq from entrezRefseq, entrezMrna, mm8.all_mrna where qName=mrna and entrezRefseq.geneID=entrezMrna.geneID' \
>mrnaRefseq1.tab
hgsql mm8 -N -e 'select name, name from refGene' >mrnaRefseq2.tab
# merge both lists and remove duplicate pairs
cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab
hgsql mm8 -e 'drop table mrnaRefseq'
hgsql mm8 < ~/src/hg/lib/mrnaRefseq.sql
hgsql mm8 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'
# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 3/8/06 Fan)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProAlias tables. The hgKgGetText takes
# about 5 minutes when the database is not too busy. The rest
# is real quick.
ssh hgwdev
cd /cluster/store9/kg/kgMm8A
mkdir index
cd index
hgKgGetText mm8 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/store9/kg/kgMm8A/index/knownGene.ix /gbdb/mm8/knownGene.ix
ln -s /cluster/store9/kg/kgMm8A/index/knownGene.ixx /gbdb/mm8/knownGene.ixx
# BUILD KNOWN GENE LIST FOR GOOGLE.
# make knownGeneLists.html mm8GeneList.html mm5GeneList.html rm3GeneList.html
cd /cluster/data/mm8/bed
rm -rf knownGeneList/mm8
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/mm8
hgKnownGeneList mm8
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/mm8
mkdir -p /usr/local/apache/htdocs/knownGeneList/mm8
cp -Rfp knownGeneList/mm8/* /usr/local/apache/htdocs/knownGeneList/mm8
##################################################################################
# Create description.html for mm8
mkdir -p ~/kent/src/hg/makeDb/trackDb/mouse/mm8
cd ~/kent/src/hg/makeDb/trackDb/mouse/mm8
cp ../hg17/description.html .
vi description.html
# Change release date and build number and change hg17 to mm8
# Check it into CVS
mkdir -p /cluster/data/mm8/html
cp -p description.html /cluster/data/mm8/html
ln -s /cluster/data/mm8/html/description.html /gbdb/mm8/html/description.html
# BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED 2006-03-08, DONE 2006-02-14 - Fan)
# This should be done after KG tables are complete from known genes build
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/mm8/bed/geneSorter.2006-03-08
# remove old symbolic link
rm /cluster/data/mm8/bed/geneSorter
ln -s /cluster/data/mm8/bed/geneSorter.2006-03-08 /cluster/data/mm8/bed/geneSorter
cd /cluster/data/mm8/bed/geneSorter
hgClusterGenes mm8 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/mm8/bed/geneSorter/blastp
cd /cluster/data/mm8/bed/geneSorter/blastp
pepPredToFa mm8 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/mm8/blastp
mkdir -p /cluster/bluearc/mm8/blastp
cp -p /cluster/data/mm8/bed/geneSorter/blastp/known.* /cluster/bluearc/mm8/blastp
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/mm8/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# Make parasol run directory
ssh pk
mkdir /cluster/data/mm8/bed/geneSorter/blastp/self
cd /cluster/data/mm8/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/mm8/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# 'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para push
para check
# Completed: 7730 of 7730 jobs
# CPU time in finished jobs: 35194s 586.56m 9.78h 0.41d 0.001 y
# IO & Wait Time: 29033s 483.89m 8.06h 0.34d 0.001 y
# Average job time: 8s 0.14m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 43s 0.72m 0.01h 0.00d
# Submission to last job: 206s 3.43m 0.06h 0.00d
# Load into database. This takes about 20 minutes
ssh hgwdev
cd /cluster/data/mm8/bed/geneSorter/blastp/self/run/out
bash
time hgLoadBlastTab mm8 knownBlastTab *.tab
# Scanning through 7730 files
# Loading database with 5270545 rows
# real 13m30.534s
cd /cluster/data/mm8/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene mm8 refGene knownGene knownToRefSeq
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm8 > refToLl.txt
hgMapToGene mm8 refGene knownGene knownToLocusLink -lookup=refToLl.txt
hgsql -e "select count(*) from knownToLocusLink;" mm8
# 27636
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt mm8 knownGene name proteinID Pfam knownToPfam
hgsql -e "select count(*) from knownToPfam;" mm8
# 29479
############################################################################
### MAKE THE affyU74 TRACK - needed for the Gene Sorter (DONE
#
# MAKE THE affyU74 TRACK using Affy consensus sequences instead of
# target sequences. Recalculate alignments and load data
# ----------------------------------
# Load up semi-local disk with target sequences for Affy mouse U74 chips.
# ssh kkr1u00
# mkdir -p /iscratch/i/affy
# This /projects filesystem is not available on kkr1u00
# but it is on kk
# ssh kk
# cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy
ssh kkr1u00
iSync
# Run cluster job to do alignments
# On kk, create the dated affyU74 build directory and, inside it, the run
# directory with its psl subdirectory.
ssh kk
# create the directory the next line changes into (the path was
# previously cut short at /cluster/data/mm8/)
mkdir -p /cluster/data/mm8/bed/affyU74.2006-03-08
cd /cluster/data/mm8/bed/affyU74.2006-03-08
mkdir run
cd run
mkdir psl
#echo /scratch/mus/mm8/maskedContigs/*.fa | wordLine stdin > genome.lst
echo /scratch/hg/mm8/nib/*.nib | wordLine stdin > genome.lst
ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst affy.lst gsub jobList
para create jobList
para try
para check
para push
# Completed: 102 of 102 jobs
# CPU time in finished jobs: 5846s 97.43m 1.62h 0.07d 0.000 y
# IO & Wait Time: 367s 6.12m 0.10h 0.00d 0.000 y
# Average job time: 61s 1.02m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 189s 3.15m 0.05h 0.00d
# Submission to last job: 200s 3.33m 0.06h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU74.psl.
ssh kk
cd /cluster/data/mm8/bed/affyU74.2006-03-08/run
pslSort dirs raw.psl tmp psl
# change filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least minAli = 0.95.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
#pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl ../all_affyU74.psl /dev/null
# Sort by chromosome and load into database.
ssh hgwdev
cd /cluster/data/mm8/bed/affyU74.2006-03-08
pslSortAcc nohead chrom temp all_affyU74.psl
cat chrom/*.psl > affyU74.psl
# shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
# and reload data into table
mv affyU74.psl affyU74.psl.orig
cut -f 1-9 affyU74.psl.orig >j1.tmp
cut -f 10 affyU74.psl.orig | sed -e 's/:/\t/' | cut -f 2 > j2.tmp
cut -f 11-21 affyU74.psl.orig >j3.tmp
paste j1.tmp j2.tmp j3.tmp >affyU74.psl
hgLoadPsl mm8 affyU74.psl
rm -rf chrom temp run
## MAKE THE affyGnfU74 TRACKs (DONE 3/8/06, Fan)
# Make bed files and load consensus sequences for Affy U74 chip set.
# This needs to be done after affyU74 is already made.
ssh hgwdev
mkdir -p /cluster/data/mm8/bed/affyGnf.2006-03-08
cd /cluster/data/mm8/bed/affyGnf.2006-03-08
# may need to build this command in src/hg/affyGnf
~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2006-03-08/affyU74.psl \
/projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \
affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2
~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2006-03-08/affyU74.psl \
/projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \
affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2
~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2006-03-08/affyU74.psl \
/projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \
affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2
# edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
mkdir sav
cp *.bed sav -p
cat sav/affyGnfU74A.bed|sed -e "s/U74Av2://" >affyGnfU74A.bed
cat sav/affyGnfU74B.bed|sed -e "s/U74Bv2://" >affyGnfU74B.bed
cat sav/affyGnfU74C.bed|sed -e "s/U74Cv2://" >affyGnfU74C.bed
# and reload data into table
hgLoadBed -strict mm8 affyGnfU74A affyGnfU74A.bed
hgLoadBed -strict mm8 affyGnfU74B affyGnfU74B.bed
hgLoadBed -strict mm8 affyGnfU74C affyGnfU74C.bed
# Add in sequence data for U74 tracks.
# Copy consensus sequence to /gbdb if it isn't already
# [THE SYM LINKS WERE ALREADY DONE.]
# mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
# fix broken symlinks after directory structure changed
# /projects/compbiodata ----> /projects/compbio/data
rm U74*
# make correct symlinks (hartera, 2005-05-03)
ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa .
ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa .
ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa .
# used perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
# ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4.
# reload sequences with prefix removed so acc matches name used in
# other dependent tables
hgLoadSeq -abbr=U74Av2: mm8 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa
hgLoadSeq -abbr=U74Bv2: mm8 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa
hgLoadSeq -abbr=U74Cv2: mm8 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa
### GNF ATLAS 2 (DONE 3/9/06, Fan)
# Align probes from GNF1M chip.
ssh kk
cd /cluster/data/mm8/bed
mkdir -p geneAtlas2/run/psl
cd geneAtlas2/run
echo /scratch/hg/mm8/nib/*.nib | wordLine stdin > genome.lst
ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst
echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub
gensub2 genome.lst mrna.lst gsub spec
para create spec
para try
para check
para push
para time
# Completed: 34 of 34 jobs
# CPU time in finished jobs: 53165s 886.08m 14.77h 0.62d 0.002 y
# IO & Wait Time: 241s 4.02m 0.07h 0.00d 0.000 y
# Average job time: 1571s 26.18m 0.44h 0.02d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3929s 65.48m 1.09h 0.05d
# Submission to last job: 3929s 65.48m 1.09h 0.05d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyGnf1m.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1m.psl /dev/null
#rm -r contig.psl raw.psl psl
# Load probes and alignments from GNF1M into database.
ssh hgwdev
cd /cluster/data/mm8/bed/geneAtlas2
# ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes
hgLoadPsl mm8 affyGnf1m.psl
hgLoadSeq mm8 /gbdb/hgFixed/affyProbes/gnf1m.fa
# Load up track
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \
affyGnf1m.psl
# Note that the unmapped 5000 records are from all-N sequences.
hgLoadBed -strict mm8 gnfAtlas2 gnfAtlas2.bed
# MOUSE AFFYMETRIX MOE430 TRACK (DONE Fan 2006-03-09)
# mkdir -p /projects/compbio/data/microarray/affyMouse
# Download MOE430A and MOE430B consensus sequences from Affymetrix web site
# http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430
# unzip MOE430*_consensus.zip
# check for duplicate probes: there are none, all have unique names
# check for duplicate probes: 100 from 136745_at to 1367551_a_at
# remove "consensus:" and ";" from FASTA headers to shorten probeset
# names for database
# sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa
# sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa
# cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
# /cluster/bluearc/affy/
# (THE ABOVE WAS ALREADY DONE)
# Set up cluster job to align MOE430 consensus sequences to mm8
ssh kkr1u00
cd /cluster/data/mm8/bed
mkdir -p affyMOE430
cd affyMOE430
# mkdir -p /iscratch/i/affy
# cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy
# iSync
ssh kk
cd /cluster/data/mm8/bed/affyMOE430
ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst
echo /scratch/hg/mm8/nib/*.nib | wordLine stdin > genome.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 genome.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
# Do the job with usual para try/check/push/time etc.
# Completed: 34 of 34 jobs
# CPU time in finished jobs: 9196s 153.26m 2.55h 0.11d 0.000 y
# IO & Wait Time: 362s 6.04m 0.10h 0.00d 0.000 y
# Average job time: 281s 4.69m 0.08h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 639s 10.65m 0.18h 0.01d
# Submission to last job: 639s 10.65m 0.18h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyMOE430.psl
pslSort dirs raw.psl tmp psl
# only use alignments that cover 30% of sequence and have at least
# 95% identity in aligned region.
# low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl affyMOE430.psl /dev/null
# Load alignments and sequences into database
ssh hgwdev
cd /cluster/data/mm8/bed/affyMOE430
# shorten names in psl file
sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
mv affyMOE430.psl.bak affyMOE430.psl
# load track into database
hgLoadPsl mm8 affyMOE430.psl
# Add consensus sequences for MOE430
# Copy sequences to gbdb if they are not there already
# mkdir -p /gbdb/hgFixed/affyProbes
# ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
# /gbdb/hgFixed/affyProbes
hgLoadSeq -abbr=MOE430 mm8 /gbdb/hgFixed/affyProbes/MOE430_all.fa
# Clean up
# rm batch.bak contig.psl raw.psl
# BELOW TWO THINGS WERE DONE BY RACHEL ALREADY FOR MM4
# add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/
# add affyMOE430.html file and then do make alpha to add to trackDb table
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes an hour.)
hgMapToGene mm8 affyGnf1m knownGene knownToGnf1m
hgExpDistance mm8 hgFixed.gnfMouseAtlas2MedianRatio \
hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m
# Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio
# Got 22937 unique elements in hgFixed.gnfMouseAtlas2MedianRatio
# Create table that maps between known genes and RefSeq
hgMapToGene mm8 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene mm8 affyU74 knownGene knownToU74
hgMapToGene mm8 affyMOE430 knownGene knownToMOE430
hgMapToGene mm8 affyMOE430 -prefix=A: knownGene knownToMOE430A
# Format and load Rinn et al sex expression data
mkdir /cluster/data/mm8/bed/rinnSex
cd /cluster/data/mm8/bed/rinnSex
hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \
../affyMOE430/affyMOE430.psl
hgLoadBed mm8 rinnSex rinnSex.bed
# Format and load the GNF data
mkdir /cluster/data/mm8/bed/affyGnf95
cd /cluster/data/mm8/bed/affyGnf95
~/src/hg/affyGnf/affyPslAndAtlasToBed -newType ../affyU95.psl \
/projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \
affyGnfU95.tab affyGnfU95Exps.tab -shortOut
# this .sql load was in preceding instructions, but this .sql file
# appears to not exist and it doesn't seem to be needed anyway.
# Everything below this seems to create tables OK.
# hgsql mm8 < ~/kent/src/hg/affyGnf/affyGnfU95.sql
# Create table that gives distance in expression space between
# GNF genes. These commands take about 15 minutes each
# The affyGnfU74?Exps arguments appear to be unused in
# hgExpDistance
cd /cluster/data/mm8/bed/geneSorter
hgExpDistance mm8 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74
hgExpDistance mm8 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74
hgExpDistance mm8 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene mm8 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" mm8
# row count changed to 22978
# Create expression distance table - takes about an hour
hgExpDistance mm8 hgFixed.gnfMouseAtlas2MedianRatio \
hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2 &
# hgsql -e "select count(*) from gnfAtlas2Distance;" mm8
# row count changed to 22937000
# HGNEAR PROTEIN BLAST TABLES (DONE 3/14/06 Fan)
ssh hgwdev
mkdir /cluster/data/mm8/bed/hgNearBlastp
cd /cluster/data/mm8/bed/hgNearBlastp
cat << _EOF_ > config.ra
# Latest mouse vs. other Gene Sorter orgs:
# human, rat, zebrafish, worm, yeast, fly
targetGenesetPrefix mouse
targetDb mm8
queryDbs hg18 rn4 danRer3 ce2 sacCer1 dm2
mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
hg18Fa /cluster/data/hg18/bed/geneSorter/blastp/known.faa
rn4Fa /cluster/data/rn4/bed/blastp/known.faa
danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
dm2Fa /cluster/data/dm2/bed/flybase4.1/flybasePep.fa
buildDir /cluster/data/mm8/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/mm8HgNearBlastp
_EOF_
doHgNearBlastp.pl config.ra >do.log
# output was like this:
# ...
# Scanning through 671 files
# Loading database with 14470 rows
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/mm8.split
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/mm8.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/hg18.split
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/hg18.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/rn4.split
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/rn4.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/danRer3.split
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/danRer3.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/ce2.split
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/ce2.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/sacCer1.split
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/sacCer1.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/dm2.split
# ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/dm2.formatdb
# ssh -x pk rmdir /san/sanvol1/scratch/mm8HgNearBlastp
#
# *** All done!
# *** Check these tables in mm8:
# *** mouseBlastTab hgBlastTab rnBlastTab drBlastTab ceBlastTab scBlastTab dmBlastTab
# *** and mmBlastTab in these databases:
# *** hg18 rn4 danRer3 ce2 sacCer1 dm2
# MAKE ORGANISM-SPECIFIC HGNEARDATA FILES
cd ~/kent/src/hg/near/hgNear/hgNearData
mkdir -p Mouse/mm8
cd Mouse/mm8
# Start from the mm7 copy of otherOrgs.ra; cp needs an explicit
# destination, here the new Mouse/mm8 directory.
cp ../mm7/otherOrgs.ra .
# Edit otherOrgs.ra to reflect the latest genomes used in blastp jobs
vi otherOrgs.ra
# then check it into CVS.
# ENABLE HGNEAR FOR mm8 IN HGCENTRALTEST
echo "update dbDb set hgNearOk = 1 where name = 'mm8';" \
| hgsql -h genome-testdb hgcentraltest
# END OF HGNEAR STUFF
#########################################################################
# BLASTZ panTro2 after chr9 re-masked (DONE - 2006-03-30 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastzPanTro2.2006-03-28
cd /cluster/data/mm8/bed
rm blastz.panTro2
ln -s blastzPanTro2.2006-03-28 blastz.panTro2
cd blastz.panTro2
cat << '_EOF_' > DEF
# mouse vs chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/hg/panTro2/nib
SEQ2_LEN=/scratch/hg/panTro2/chrom.sizes
SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzPanTro2.2006-03-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
# broken during blastz run due to Panasas failure
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=cat `pwd`/DEF > cat.out 2>&1 &
# Do not have this measurement for the first time around, tables
# got loaded again before I thought of that.
time nice -n +19 featureBits mm8 chainPanTro2Link \
> fb.mm8.chainPanTro2Link
# 963977790 bases of 2567283971 (37.549%) in intersection
# For panTro1 this was:
time nice -n +19 featureBits mm8 chainPanTro1Link \
> fb.mm8.chainPanTro1Link
# 901276629 bases of 2567283971 (35.106%) in intersection
ssh pk
mv /cluster/data/panTro2/bed/blastz.mm8.swap \
/cluster/data/panTro2/bed/blastz.mm8.swap.2006-03-21
mkdir /cluster/data/panTro2/bed/blastz.mm8.swap
cd /cluster/data/panTro2/bed/blastz.mm8.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
/cluster/data/mm8/bed/blastzPanTro2.2006-03-28/DEF \
> blastz.out 2>&1 &
# completed the downloads manually since they failed due to the
# existing downloads. Then cleanup:
ssh hgwbeta
cd /cluster/data/panTro2/bed/blastz.mm8.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=cleanup /cluster/data/mm8/bed/blastzPanTro2.2006-03-28/DEF \
> cleanup.out 2>&1 &
time nice -n +19 featureBits panTro2 chainMm8Link \
> fb.panTro2.chainMm8Link 2>&1 &
# 978002566 bases of 2909512873 (33.614%) in intersection
# first time before the chr9 fix was:
# 986978326 bases of 2909512873 (33.922%) in intersection
#########################################################################
# BLASTZ panTro2 (DONE - 2006-03-15 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.panTro2.2006-02-23
cd /cluster/data/mm8/bed
ln -s blastz.panTro2.2006-02-23 blastz.panTro2
cd blastz.panTro2
cat << '_EOF_' > DEF
# mouse vs chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/hg/panTro2/nib
SEQ2_LEN=/scratch/hg/panTro2/chrom.sizes
SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzPanTro2.2006-03-15
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
# broken during chain step due to missing files on the Iservers
# completed chain run manually, then continuing
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 &
# broken during loadUp due to script bug, ran loadUp.csh manually
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap `pwd`/DEF > swap.out 2>&1 &
# mistakenly did PanTro1 here ... should have been PanTro2
time nice -n +19 featureBits mm8 chainPanTro1Link
# 901276629 bases of 2567283971 (35.106%) in intersection
time nice -n +19 featureBits panTro2 chainMm8Link \
> fb.panTro2.chainMm8Link 2>&1
# 986978326 bases of 2909512873 (33.922%) in intersection
#############################################################################
# UPDATED mm8.knownToVisiGene (2006-03-15 galt)
ssh hgwdev
knownToVisiGene mm8
#############################################################################
# BLASTZ SELF (DONE - 2006-03-20 - 2006-03-22 - Hiram)
# using chain min score of 10,000 to cut down on volume of data
ssh pk
mkdir /cluster/data/mm8/bed/blastzSelf.2006-03-20
cd /cluster/data/mm8/bed
ln -s blastzSelf.2006-03-20 blastz.mm8
cd blastzSelf.2006-03-20
cat << '_EOF_' > DEF
# mouse vs mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=200
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Mouse Mm8
SEQ2_DIR=/scratch/hg/mm8/nib
SEQ2_LEN=/scratch/hg/mm8/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzSelf.2006-03-20
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
cd /cluster/data/mm8/bed/blastzSelf.2006-03-20
time /cluster/home/hiram/kent/src/utils/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
`pwd`/DEF > blastz.out 2>&1 &
# broke during the load step due to doBlastz script changes,
# finished the load manually, then:
time /cluster/home/hiram/kent/src/utils/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
-continue=download `pwd`/DEF > download.out 2>&1 &
ssh kolossus
cd /cluster/data/mm8/bed/blastzSelf.2006-03-20
time HGDB_CONF=~/.hg.conf.read-only featureBits mm8 \
chainSelfLink >fb.mm8.chainSelfLink 2>&1
cat fb.mm8.chainSelfLink
# 362483673 bases of 2567283971 (14.119%) in intersection
#############################################################################
# UPDATED mm8.knownToVisiGene (2006-04-05 galt)
ssh hgwdev
knownToVisiGene mm8
############################################################################
# LIFTOVER (DROPUNDER) CHAINS TO MM7 (2006-04-06 kate)
# Split (using makeLoChain-split) of mm7 is doc'ed in makeMm7.doc
# Do what makeLoChain-split says to do next (start blat alignment)
ssh kk
cd /cluster/data/mm8/bed/liftOver
makeLoChain-align mm8 /scratch/hg/mm8/nib mm7 \
/iscratch/i/mm7/split10k \
/cluster/bluearc/mm7/11.ooc >&! align.log &
# Do what its output says to do next (start cluster job)
cd /cluster/data/mm8/bed/blat.mm7.2006-04-06/run
para shove
para time >&! run.time
#CPU time in finished jobs: 906023s 15100.39m 251.67h 10.49d 0.029 y
#IO & Wait Time: 22074s 367.90m 6.13h 0.26d 0.001 y
#Average job time: 343s 5.72m 0.10h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 4260s 71.00m 1.18h 0.05d
#Submission to last job: 4965s 82.75m 1.38h 0.06d
# lift alignments
ssh kkr1u00
cd /cluster/data/mm8/bed/liftOver
makeLoChain-lift mm8 mm7 >&! lift.log &
# chain alignments
ssh kki
cd /cluster/data/mm8/bed/liftOver
makeLoChain-chain mm8 /scratch/hg/mm8/nib \
mm7 /scratch/hg/mm7/nib >&! chain.log &
# Do what its output says to do next (start cluster job)
cd /cluster/data/mm8/bed/blat.mm7.2006-04-06/chainRun
para shove
para time >&! run.time
#CPU time in finished jobs: 3884s 64.73m 1.08h 0.04d 0.000 y
#IO & Wait Time: 594s 9.91m 0.17h 0.01d 0.000 y
#Average job time: 86s 1.44m 0.02h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 245s 4.08m 0.07h 0.00d
#Submission to last job: 401s 6.68m 0.11h 0.00d
# net alignment chains
ssh kkstore03
cd /cluster/data/mm8/bed/liftOver
makeLoChain-net mm8 mm7 >&! net.log &
# load reference to over.chain into database table,
# and create symlinks /gbdb and download area
ssh hgwdev
cd /cluster/data/mm8/bed/liftOver
makeLoChain-load mm8 mm7 >&! load.log &
# test by converting a region using the "convert" link on
# the browser, and comparing to blat of the same region
#############################################################################
# Create Allen Brain Atlas mapping. (DONE 2006-04-12 galt)
# compile allenCollectSeq
ssh hgwdev
cd ~/kent/src/hg/makeDb/allenBrain/allenCollectSeq
make
# Set up directory
ssh kk
cd /cluster/data/mm8/bed
mkdir allenBrain
cd allenBrain
# In /san/sanvol1/visiGene/offline/allenBrain/probesAndData/
# allen20051021.tab (converted from spreadsheet mailed by Susan Sunkin <SusanS@alleninstitute.org>)
# probeSeq.20051027.fasta (also from Susan).
# Create a list of probe sequences filling ones missing from probeSeq.20051027.fasta
# with some NCBI and TIGR files, and some downloaded one at a time.
allenCollectSeq /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20051021.tab /san/sanvol1/visiGene/offline/allenBrain/probesAndData/probeSeq.20051027.fasta /cluster/data/mm7/bed/ncbiXm/ncbiNm.fa /cluster/data/mm7/bed/ncbiXm/ncbiXm.fa /cluster/data/mm6/bed/tigrMgiTc/tigrMgiTc.fa ~/kent/src/hg/makeDb/allenBrain/allenCollectSeq/extra.fa allProbes.fa allProbes.tab missing.tab allenBrainUrl.tab
# Set up a blat run to align the probes.
mkdir split
faSplit sequence allProbes.fa 200 split/rp
mkdir run
cd run
ls -1 ../split/*.fa > mrna.lst
ls -1 /scratch/hg/mm8/nib/*.nib > genome.lst
mkdir psl
cat << '_EOF_' > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 genome.lst mrna.lst gsub spec
para create spec
# Then do the usual para try/push/time/check until the run is finished
# Then do sorting and near-best-in-genome step on file server
ssh kkstore02
cd /cluster/data/mm8/bed/allenBrain/run
pslSort dirs raw.psl tmp psl
pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null
sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl
# Clean up big files no longer needed
rm raw.psl
rm -r psl
rm -r ../split
# Load up database
ssh hgwdev
cd /cluster/data/mm8/bed/allenBrain
# Make a new table that contains the URLs for the allen brain genes
# Make this one first since all.joiner considers it the master table.
hgsql mm8 < ~/kent/src/hg/lib/allenBrainUrl.sql
hgsql mm8 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl;'
# Make probe alignment table, and load sequence.
hgLoadPsl mm8 allenBrainAli.psl
mkdir /gbdb/mm8/allenBrain
ln -s /cluster/data/mm8/bed/allenBrain/allProbes.fa /gbdb/mm8/allenBrain/allProbes.fa
hgLoadSeq mm8 /gbdb/mm8/allenBrain/allProbes.fa
# Make mapping between known genes and allenBrain
hgMapToGene mm8 allenBrainAli -type=psl knownGene knownToAllenBrain
#########################################################################
# BLASTZ HUMAN Hg17 (DONE - 2006-04-13 - 2006-04-19 - Hiram)
ssh pk
mkdir /cluster/data/mm8/bed/blastzHg17.2006-04-13
cd /cluster/data/mm8/bed
ln -s blastzHg17.2006-04-13 blastz.hg17
cd blastzHg17.2006-04-13
cat << '_EOF_' > DEF
# mouse vs human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Human Hg17 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/scratch/hg/hg17/bothMaskedNibs
SEQ2_SMSK=/scratch/hg/hg17/linSpecRep.notInMouse
SEQ2_LEN=/cluster/data/hg17/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastzHg17.2006-04-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
cd /cluster/data/mm8/bed/blastzHg17.2006-04-13
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
# real 656m20.633s
# Then to swap over to Hg17
mkdir /cluster/data/hg17/bed/blastz.mm8.swap
cd /cluster/data/hg17/bed
ln -s blastz.mm8.swap blastz.mm8
cd blastz.mm8.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap /cluster/data/mm8/bed/blastzHg17.2006-04-13/DEF \
> swap.out 2>&1 &
ssh hgwdev
time nice -n +19 featureBits mm8 chainHg17Link
# 984380268 bases of 2567283971 (38.343%) in intersection
time nice -n +19 featureBits hg17 chainMm8Link
# 994530172 bases of 2881515245 (34.514%) in intersection
cd /cluster/data/mm8/bed/blastzHg17.2006-04-13
time nice -n +19 featureBits mm8 chainHg17Link > fb.mm8.chainHg17Link 2>&1
# 990554882 bases of 2567283971 (38.584%) in intersection
time nice -n +19 featureBits hg17 chainMm8Link > fb.hg17.chainMm8Link 2>&1
# 997368618 bases of 2866216770 (34.797%) in intersection
########################################################################
# BLASTZ/CHAIN/NET XENTRO2 (DONE - 2006-04-20 - Hiram)
ssh kk
mkdir /cluster/data/mm8/bed/blastz.xenTro2.2006-04-20
cd /cluster/data/mm8/bed
ln -s blastz.xenTro2.2006-04-20 blastz.xenTro2
cd blastz.xenTro2.2006-04-20
cat << '_EOF_' > DEF
# mouse vs. frog
BLASTZ=/cluster/bin/penn/blastz.v7
# Use same params as used for mammal-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/cluster/data/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Frog xenTro2 - single chunk big enough to run two of the
# largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/mm8/bed/blastz.xenTro2.2006-04-20
'_EOF_'
# << emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
# XXX running 2006-04-20
# Then to swap over to xenTro2
mkdir /cluster/data/xenTro2/bed/blastz.mm8.swap
cd /cluster/data/xenTro2/bed
ln -s blastz.mm8.swap blastz.mm8
cd blastz.mm8.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-swap /cluster/data/mm8/bed/blastz.xenTro2.2006-04-20/DEF \
> swap.out 2>&1 &
ssh hgwdev
cd /cluster/data/mm8/bed/blastz.xenTro2.2006-04-20
time nice -n +19 featureBits mm8 chainXenTro2Link \
> fb.mm8.chainXenTro2Link 2>&1 &
# 68050843 bases of 2567283971 (2.651%) in intersection
cd /cluster/data/xenTro2/bed/blastz.mm8.swap
time nice -n +19 featureBits xenTro2 chainMm8Link \
> fb.xenTro2.chainMm8Link 2>&1
# 72840135 bases of 1359412157 (5.358%) in intersection
#######################################################################
## LIFTOVER To Mm7 (DONE - 2006-04-21 - 2006-04-24 - Hiram)
ssh kkr1u00
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
mm7 /cluster/data/mm7/nib
# as it says, DO THIS NEXT:
ssh kk
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
mm8 /scratch/hg/mm8/nib mm7 /iscratch/i/mm7/split10k \
/cluster/data/mm7/11.ooc
# as it says, DO THIS NEXT:
cd /cluster/data/mm8/bed/blat.mm7.2006-04-21/run
para try, check, push, check, ...
# Completed: 1360 of 1360 jobs
# CPU time in finished jobs: 3890058s 64834.31m 1080.57h 45.02d 0.123 y
# IO & Wait Time: 13326s 222.09m 3.70h 0.15d 0.000 y
# Average job time: 2870s 47.84m 0.80h 0.03d
# Longest finished job: 27224s 453.73m 7.56h 0.32d
# Submission to last job: 80553s 1342.55m 22.38h 0.93d
# as it says, DO THIS NEXT:
# this does the liftUp and makes the psl files
ssh kkr1u00
cd /cluster/data/mm8/bed
ln -s blat.mm7.2006-04-21 blat.mm7
time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh mm8 mm7
# real 16m5.091s
# as it says, DO THIS NEXT:
# this prepares the batch to run for the chaining
ssh kki
time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
mm8 /cluster/data/mm8/nib mm7 /cluster/data/mm7/nib
# as it says, DO THIS NEXT:
# running the chain batch
cd /cluster/data/mm8/bed/blat.mm7.2006-04-21/chainRun
para try, check, push, check, ...
# Completed: 40 of 40 jobs
# CPU time in finished jobs: 5381s 89.68m 1.49h 0.06d 0.000 y
# IO & Wait Time: 2119s 35.32m 0.59h 0.02d 0.000 y
# Average job time: 188s 3.12m 0.05h 0.00d
# Longest finished job: 652s 10.87m 0.18h 0.01d
# Submission to last job: 685s 11.42m 0.19h 0.01d
ssh kkstore04
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh mm8 mm7
# Created /cluster/data/mm8/bed/liftOver/mm8ToMm7.over.chain.gz
# as it says, DO THIS NEXT:
ssh hgwdev
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh mm8 mm7
# It says this:
# Now, add link for
# /usr/local/apache/htdocs/goldenPath/mm8/liftOver/mm8ToMm7.over.chain
# to hgLiftOver
# But I believe that link was already done:
cd /gbdb/mm8/liftOver
ls -og mm8ToMm7*
# lrwxrwxrwx 1 53 Apr 24 12:32 mm8ToMm7.over.chain.gz -> \
# /cluster/data/mm8/bed/liftOver/mm8ToMm7.over.chain.gz
########################################################################
## CYTOBAND - ideogram track (DONE - 2006-04-28 - Hiram)
ssh hgwdev
cd /cluster/data/mm8/pre_release
# The .wgetrc is the anonymous user
WGETRC=`pwd`/.wgetrc
export WGETRC
wget --timestamping \
ftp://ftp.ncbi.nih.gov/genomes/M_musculus/pre_release/ideogram
mkdir /cluster/data/mm8/cytoBand
cd /cluster/data/mm8/cytoBand
# Create bed file
$HOME/kent/src/utils/createNcbiCytoBand.pl \
/cluster/data/mm8/pre_release/ideogram
# Load the bed file
hgLoadBed -strict -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql mm8 \
cytoBand cytoBand.bed
# Make cytoBandIdeo track for ideogram gif on hgTracks page.
# For mouse cytoBandIdeo is just a replicate of the cytoBand track.
hgsql -e "drop table cytoBandIdeo;" mm8
hgsql mm8 -e "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;"
#########################################################################
# GENSCAN PREDICTIONS (DONE - 2006-05-03 - 2006-05-05 - Hiram)
ssh kkstore04
# The relative paths used below (per-chromosome directories, randomContigs/
# and mm8.2bit) only resolve from the assembly directory
# NOTE(review): /cluster/data/mm8 is presumed to be that directory - confirm
cd /cluster/data/mm8
# Create a 2bit file with the full chrom sequences and the
# random contigs, all hard masked
cat ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \
| maskOutFa stdin hard stdout \
| faToTwoBit stdin mm8Chroms_RandomContigs.hard.2bit
# make sure it still has all the unmasked sequence in it:
twoBitToFa mm8Chroms_RandomContigs.hard.2bit stdout \
| faSize stdin
# 2661205088 bases (1183272085 N's 1477933003 real 1477933003
# upper 0 lower) in 99 sequences in 1 files
twoBitToFa mm8.2bit stdout | faSize stdin
# 2664455088 bases (97171400 N's 2567283688 real 1477933003 upper
# 1089350685 lower) in 34 sequences in 1 files
# note the 'real' bases are the same, the lowers have become N's
# 1089350685 + 97171400 = 1186522085
# 1186522085 - 1183272085 = 3250000 == N's in gaps between contigs
# And, make sure there aren't any sequences in this lot that have
# become all N's with no sequence left in them:
twoBitToFa mm8Chroms_RandomContigs.hard.2bit stdout \
| faCount stdin > chroms_randoms.faCount
# the lowest three are:
egrep -v "^#|^total" chroms_randoms.faCount \
| awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3
# MmUn_162590_36 1631
# Mm1_163269_36 1581
# MmUn_102813_36 1479
# creating 4,000,000 sized chunks, the chroms stay together as
# single pieces. The contigs get grouped together into 4,000,000
# sized fasta files. You don't want to break these things up
# because genscan will be doing its own internal 2.4 million
# window on these pieces, and the gene names are going to be
# constructed from the sequence name in these fasta files. The
# gene names are much better when they are this simple chrN.M
# numbering scheme, or in the case of a contig: contig_name.M
# where the M is a sequence number that genscan will assign to
# each gene it discovers.
mkdir hardChunks
twoBitToFa mm8Chroms_RandomContigs.hard.2bit stdout \
| faSplit about stdin 4000000 hardChunks/c_
rsync -a --progress hardChunks/ /cluster/bluearc/mm8/hardChunks/
ssh hgwdev
mkdir /cluster/data/mm8/bed/genscan
cd /cluster/data/mm8/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh kki
cd /cluster/data/mm8/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
# Since we split on gaps, we have no chunks like that. You can
# verify with faCount on the chunks.
ls -1S /cluster/bluearc/mm8/hardChunks/c_*.fa > genome.list
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << emacs
gensub2 genome.list single template jobList
para create jobList
para try, check, push, check, ...
# Completed: 673 of 673 jobs
# CPU time in finished jobs: 76339s 1272.32m 21.21h 0.88d 0.002 y
# IO & Wait Time: 2327s 38.78m 0.65h 0.03d 0.000 y
# Average job time: 117s 1.95m 0.03h 0.00d
# Longest finished job: 1993s 33.22m 0.55h 0.02d
# Submission to last job: 7526s 125.43m 2.09h 0.09d
# There was a failed job, going to kolossus and running with a
# reduced window size:
ssh kolossus
cd /cluster/data/mm8/bed/genscan
time /cluster/bin/x86_64/gsBig /cluster/bluearc/mm8/hardChunks/c_01.fa \
gtf/c_01.gtf -trans=pep/c_01.pep -subopt=subopt/c_01.bed \
-exe=hg3rdParty/genscanlinux/genscan \
-par=hg3rdParty/genscanlinux/HumanIso.smat \
-tmp=/scratch/tmp -window=2000000
# real 258m34.800s
# cat and lift the results into single files
ssh kkstore04
cd /cluster/data/mm8/bed/genscan
cat gtf/c_*.gtf | liftUp -type=.gtf genscan.gtf \
../../jkStuff/liftAll.lft carry stdin
cat subopt/c_*.bed | liftUp -type=.bed genscanSubopt.bed \
../../jkStuff/liftAll.lft carry stdin
cat pep/c_*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/mm8/bed/genscan
ldHgGene mm8 -gtf genscan genscan.gtf
# Read 44899 transcripts in 323099 lines in 1 files
# 44899 groups 34 seqs 1 sources 1 feature types
# 44899 gene predictions
hgPepPred mm8 generic genscanPep genscan.pep
hgLoadBed -strict mm8 genscanSubopt genscanSubopt.bed
# Loaded 530201 elements of size 6
# check the numbers
time nice -n +19 featureBits mm8 genscan
# 54455852 bases of 2567283971 (2.121%) in intersection
time nice -n +19 featureBits mm8 knownGene:cds
# 28459053 bases of 2567283971 (1.109%) in intersection
featureBits mm7 genscan
# 54864694 bases of 2583394090 (2.124%) in intersection
time nice -n +19 featureBits mm7 knownGene:cds
# 27531524 bases of 2583394090 (1.066%) in intersection
featureBits mm6 genscan
# 54894283 bases of 2597150411 (2.114%) in intersection
featureBits mm5 genscan
# 55024722 bases of 2615483787 (2.104%) in intersection
featureBits mm4 genscan
# 56164126 bases of 2627444668 (2.138%) in intersection
featureBits mm3 genscan
# 51697165 bases of 2505900260 (2.063%) in intersection
featureBits mm8 genscanSubopt
# 57048581 bases of 2567283971 (2.222%) in intersection
featureBits mm7 genscanSubopt
# 57512333 bases of 2583394090 (2.226%) in intersection
featureBits mm6 genscanSubopt
# 57856316 bases of 2597150411 (2.228%) in intersection
featureBits mm5 genscanSubopt
# 58474899 bases of 2615483787 (2.236%) in intersection
featureBits mm4 genscanSubopt
# 59601009 bases of 2627444668 (2.268%) in intersection
featureBits mm3 genscanSubopt
# 56085184 bases of 2505900260 (2.238%) in intersection
##########################################################################
# BUILD NIBB IMAGE PROBES (in progress 2007-05-05 Jim)
# Make directory on san for cluster job and copy in sequence
ssh pk
mkdir /san/sanvol1/scratch/mm8/nibbPics
cd /san/sanvol1/scratch/mm8/nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
# Make parasol job dir and sequence list files
mkdir run
cd run
mkdir psl
ls -1 /scratch/hg/mm8/nib/*.nib > genome.lst
echo ../nibbImageProbes.fa > mrna.lst
# Create parasol gensub template file
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
#ENDLOOP
'_EOF_'
# Create parasol batch
gensub2 genome.lst mrna.lst gsub spec
para create spec
# Do para try/push/time etc.
#Completed: 49 of 49 jobs
#CPU time in finished jobs: 12585s 209.74m 3.50h 0.15d 0.000 y
#IO & Wait Time: 411s 6.86m 0.11h 0.00d 0.000 y
#Average job time: 265s 4.42m 0.07h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1145s 19.08m 0.32h 0.01d
#Submission to last job: 1195s 19.92m 0.33h 0.01d
# Sort the raw psl on field 10 onward, let pslReps keep the near-best
# alignments, re-sort on fields 14 and 16 (numeric), then cut the target
# name down from a nib path to a bare chrom name.
# NOTE(review): fields 10/14/16 are presumably qName/tName/tStart of the
# psl format - confirm against the psl spec
catDir psl | sort -k 10 \
| pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
| sort -k 14,14 -k 16,16n \
| sed 's#/scratch/hg/mm8/nib/chr#chr#' \
| sed 's/\.nib//' > ../nibbImageProbes.psl
# (the dot is escaped so that only a literal ".nib" is removed, not any
# character that happens to be followed by "nib")
# Make bed file and copy in stuff
ssh hgwdev
cd /cluster/data/mm8/bed
mkdir nibbPics
cd nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
cp /san/sanvol1/scratch/mm8/nibbPics/nibbImageProbes.psl .
# Load into database
ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa /gbdb/mm8/nibbImageProbes.fa
hgLoadSeq mm8 /gbdb/mm8/nibbImageProbes.fa
hgLoadPsl mm8 nibbImageProbes.psl
#############################################################################
# miRNA track (DONE - 2006-05-22 - Fan)
# data from: Michel.Weber@ibcg.biotoul.fr
# notify them when done.
ssh hgwdev
cd /cluster/data/mm8/bed
mkdir miRNA-2006-05-22
# enter the directory just created (the name must match the mkdir above)
cd miRNA-2006-05-22
# save the mm8_miRNA_track_may2006.txt file from email
# turn every space into a tab to produce the file handed to hgLoadBed
sed -e 's/ /\t/g' mm8_miRNA_track_may2006.txt > miRNA.tab
hgLoadBed -strict mm8 miRNA miRNA.tab
# check previous release track before update
featureBits mm8 miRNA
# 28630 bases of 2567283971 (0.001%) in intersection
featureBits mm7 miRNA
# 20620 bases of 2583394090 (0.001%) in intersection
featureBits mm6 miRNA
# 21167 bases of 2597150411 (0.001%) in intersection
featureBits mm5 miRNA
# 17957 bases of 2615483787 (0.001%) in intersection
#########################################################################
# BLASTZ CHICKEN galGal3 (DONE 5/24/06 angie)
ssh pk
mkdir /cluster/data/mm8/bed/blastz.galGal3.2006-05-23
cd /cluster/data/mm8/bed/blastz.galGal3.2006-05-23
cat << '_EOF_' > DEF
# mouse vs chicken
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_SMSK=/san/sanvol1/scratch/mm8/linSpecRep/notInNonMammal
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/san/sanvol1/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastz.galGal3.2006-05-23
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/gg3vsmm8 \
-bigClusterHub=pk -smallClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose \
>& do.log & tail -f do.log
ln -s blastz.galGal3.2006-05-23 /cluster/data/mm8/bed/blastz.galGal3
#########################################################################
# ADD LINK TO GENENETWORK (DONE. 5/31/06 Fan).
# Copy geneNetwork ID list from mm7
ssh hgwdev
mkdir -p /cluster/data/mm8/bed/geneNetwork
cd /cluster/data/mm8/bed/geneNetwork
hgsql mm7 -N -e 'select * from geneNetworkId' > geneNetworkId.tab
hgsql mm8 -e 'drop table geneNetworkId'
hgsql mm8 < ~/src/hg/lib/geneNetworkId.sql
hgsql mm8 -e \
'load data local infile "geneNetworkId.tab" into table geneNetworkId'
############################################################################
# SGP GENES (DONE - 2006-06-12 - Hiram)
ssh kkstore02
cd /cluster/data/mm8/bed
ln -s /cluster/store8/mm8/bed/sgp .
cd sgp
# They don't do chrM
for C in `awk '{print $1}' /cluster/data/mm8/chrom.sizes | grep -v chrM`
do
wget --timestamping \
"http://genome.imim.es/genepredictions/M.musculus/mmMar2006/SGP/humangp200603/${C}.gtf" \
-O "${C}.gtf"
done
ssh hgwdev
cd /cluster/data/mm8/bed/sgp
ldHgGene -gtf -genePredExt mm8 sgpGene chr*.gtf
featureBits mm8 -enrichment refGene:CDS sgpGene
# refGene:CDS 1.063%, sgpGene 1.455%, both 0.918%, cover 86.32%,
# enrich 59.32x
#########################################################################
# BUILD KNOWN GENE LIST FOR GOOGLE. (DONE. 6/6/06 Fan).
cd /cluster/data/mm8/bed
rm -rf knownGeneList/mm8
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/mm8
hgKnownGeneList mm8
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/mm8
mkdir -p /usr/local/apache/htdocs/knownGeneList/mm8
cp -Rfp knownGeneList/mm8/* /usr/local/apache/htdocs/knownGeneList/mm8
#########################################################################
### IGTC (Int'l GeneTrap Consortium) (DONE - 2006-06-12 - angie)
### fasta added 2006-06-21
### Doug Stryke <stryke@cgl.ucsf.edu> in Tom Ferrin's lab
### NOTE -- as of 2007-03-01 the igtc track will be automatically
### updated on hgwdev by the scripts monthlyUpdateIgtc.csh and
### updateIgtc.pl in kent/src/hg/utils/automation/ .
ssh hgwdev
mkdir /cluster/data/mm8/bed/igtc
cd /cluster/data/mm8/bed/igtc
wget http://www.genetrap.org/blattrack/genetrap_mm8.psl
grep -v ^track genetrap_mm8.psl \
| hgLoadPsl mm8 -table=igtc stdin
# Probe fasta is shared by all assemblies:
wget http://www.genetrap.org/blattrack/genetrap.fasta
mkdir /gbdb/mm8/igtc
ln -s /cluster/data/mm8/bed/igtc/genetrap.fasta /gbdb/mm8/igtc/
hgLoadSeq -replace mm8 /gbdb/mm8/igtc/genetrap.fasta
#########################################################################
# REGULATORY POTENTIAL (DONE - 2006-06-12 - Hiram)
# download data from "James Taylor" <james@bx.psu.edu>
ssh kkstore04
cd /cluster/data/mm8/bed
mkdir /cluster/store8/mm8/bed/regPotential7X
ln -s /cluster/store8/mm8/bed/regPotential7X .
cd regPotential7X
# This is a lot of data
time for C in 1 2 3 4 5 6 7 8 9 X 10 11 12 13 14 15 16 17 18 19
do
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_mm8/chr${C}.scores.truncated.bz2"
done
# real 79m32.840s
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_mm8/trackDb.html" -O description.html
time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X
do
bzcat chr${C}.scores.truncated.bz2
done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 22m28.583s
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/mm8/bed/regPotential7X
ln -s /cluster/data/mm8/bed/regPotential7X/regPotential7X.wib \
/gbdb/mm8/wib/regPotential7X.wib
# using the tmpDir is faster since it is on local disk and it will
# clean up any temporary .tab file it creates there
time hgLoadWiggle -tmpDir=/scratch/tmp \
mm8 regPotential7X regPotential7X.wig
# real 0m28.683s
# create a histogram
ssh kolossus
cd /cluster/data/mm8/bed/regPotential7X
time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \
-hMinVal=0.0 -db=mm8 regPotential7X > histogram.data 2>&1
# real 18m29.167s
# create download gzip files from the bz2 files:
ssh kkstore04
cd /cluster/data/mm8/bed/regPotential7X
# One gzip download file per bz2 score file, named from the text that
# precedes the first '.' of the bz2 file name
for scoreFile in chr*.scores.truncated.bz2; do
  chrom=${scoreFile%%.*}
  # progress message stays on one line until the conversion finishes
  printf '%s' "${chrom}.regPotential7X.mm8.gz working ... "
  bzcat ${scoreFile} | gzip > ${chrom}.regPotential7X.mm8.gz
  printf '\n'
done
#############################################################################
# SIB Transcriptome (DONE Aug 29, 2007 - JK)
# Create working directory and download data from where Christian Iseli
# (Christian.Iseli@licr.org) put it, and unpack. The download takes about
# ten minutes (161M file).
cd /cluster/data/mm8/bed
mkdir sibTranscriptome
cd sibTranscriptome
wget ftp://ftp.licr.org/pub/databases/trome/mouse/MTR.gtf.gz
wget ftp://ftp.licr.org/pub/databases/trome/mouse/txg.tar.gz
tar -zxvf txg.tar.gz
# Load up sibGene table
zcat MTR.gtf.gz | ldHgGene mm8 sibGene stdin
# Do a little data cleanup and transformation and load splice graphs into database.
sed 's/altGraphX/sibTxGraph/' ~/src/hg/lib/altGraphX.sql > sibTxGraph.sql
cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql mm8 sibTxGraph stdin
# Create sibAltEvents track for analysed alt-splices.
cat txg/*.txg | txgAnalyze stdin /cluster/data/mm8/mm8.2bit sibAltEvents.bed
awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed
hgLoadBed mm8 sibAltEvents foo.bed
#########################################################################
# MAP CONTIGS TRACK (DONE - 2005-10-04 - Hiram)
ssh hgwdev
mkdir -p /cluster/data/mm8/bed/ctgPos
cd /cluster/data/mm8/bed/ctgPos
# hgCtgPos uses the lift files... but mouse lift files are for the
# 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs
# from the assembly. (In the future, we should go with the NT's!)
# So... just for this release, go straight from the seq_contig.md
# to the table def'n: contig, size, chrom, chromStart, chromEnd
# This script is an improvement from before, this is now doing the
# randoms properly.
cat << '_EOF_' > seqContigToCtgPos.pl
#!/usr/bin/env perl
# Read seq_contig.md lines (stdin or files named on the command line) and
# print one tab-separated ctgPos row per contig:
# contig, size, chrom, chromStart, chromEnd
use warnings;
use strict;
# gap inserted after every contig laid out on a chrN_random
my $randomGap = 50000;
# chrom prefix of the random currently being laid out
my $prevRandom = "";
# next free position on that random
my $randomPosition = 0;
while (my $line = <>)
    {
    chomp($line);
    my @a = split('\s+', $line);
    if ($a[1] =~ m/\|/)
        {
        # second field contains a '|': the part before it names the
        # chrN_random this contig is appended to
        my @b = split('\|', $a[1]);
        if ($b[0] ne $prevRandom)
            {
            # first contig of a different random: restart at position 0
            $randomPosition = 0;
            $prevRandom = $b[0];
            }
        my $size = $a[3] - $a[2] + 1;
        my $start = $randomPosition;
        my $end = $start + $size;
        printf "%s\t%d\tchr%s_random\t%d\t%d\n", $a[5], $size, $b[0], $start, $end;
        # the same gap follows every contig, on the Un random as on the others
        $randomPosition = $end + $randomGap;
        }
    elsif ($a[5] =~ m/^N[TC]_\d+$/)
        {
        # placed NT_/NC_ contig: start is shifted down by one so that
        # size equals end - start (presumably 1-based input coordinates)
        my $start = $a[2] - 1;
        my $end = $a[3];
        my $size = $end - $start;
        printf "%s\t%d\tchr%s\t%d\t%d\n", $a[5], $size, $a[1], $start, $end;
        }
    }
'_EOF_'
# << emacs happy
chmod +x seqContigToCtgPos.pl
# First run: only the seq_contig.md lines matching ref_strain or C57BL
egrep "ref_strain|C57BL" ../../seq_contig.md \
| ./seqContigToCtgPos.pl > ctgPos.tab
# NOTE(review): this second run reads the unfiltered seq_contig.md and
# overwrites the ctgPos.tab written just above, so the table loaded below
# comes from this run - confirm which of the two runs was intended
cat ../../seq_contig.md | ./seqContigToCtgPos.pl > ctgPos.tab
# Drop any existing ctgPos table, recreate it from ctgPos.sql, then load
hgsql mm8 -e "drop table ctgPos;"
hgsql mm8 < ~/kent/src/hg/lib/ctgPos.sql
hgsql mm8 -e 'load data local infile "ctgPos.tab" into table ctgPos;'
featureBits -countGaps mm8 ctgPos
# 2573322222 bases of 2664455088 (96.580%) in intersection
featureBits -countGaps mm7 ctgPos
# 2608810329 bases of 2847717329 (91.611%) in intersection
featureBits -countGaps mm6 ctgPos
# 2638893452 bases of 3079633452 (85.689%) in intersection
featureBits -countGaps mm5 ctgPos
# 2557081173 bases of 3164952073 (80.794%) in intersection
#####################################################################
#### LOAD ENSEMBL GENES (DONE - 2006-06-21 - Hiram)
# ADDED PEPTIDE TABLE, ENSPEP (DONE, 2006-07-11, hartera)
# ADDED STABLE URL TO TRACKDB BLOCK (V39, JUN 2006) (2008-01-10, rhead)
mkdir /cluster/data/mm8/bed/ensGene
# work in the mm8 directory just created (not the mm7 one)
cd /cluster/data/mm8/bed/ensGene
# Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
# Choose Ensembl 39 and Mus musculus, click next
# It displays status in a window on the right, indicating how many
# entries are here, currently: 27,967
# The next page is the "filter" step, we do not want any filters,
# nothing is changed on this page, click next
# Now we are on the "output" tab, the filter in the window on the right
# indicates that 27,967 passed the filter. (there is no filter)
# Now, on this output page, change the pull-down menu item from
# its default of "features" to read "structures"
# All the check-boxes now change.
# Mark the check box GTF under output format
# Under Gene Ensembl Attributes,
# Unselect Biotype
# Select
# Ensembl Gene ID
# Ensembl Transcript ID
# External Gene ID
# gzip compression
# and give it a filename: ensGeneMm8
# it will add the .gff.gz suffix
# press "export"
# The random coordinates are given in contig
# coordinates, need to lift them to chroms, create a lift file:
# (cat, not echo: echo does not read the here-document and would leave
# the script file empty)
cat << '_EOF_' > mkRandomNTLift.sh
#!/bin/sh
# For every chrN_random listed in chrom.sizes, print one tab-separated
# line per mm8.ctgPos row of that chrom, taking the ctgPos columns in the
# order 4, 1, 2, 3 and appending the chrom size from chrom.sizes.
# NOTE(review): with ctgPos laid out as contig, size, chrom, chromStart,
# chromEnd this gives offset, contig, contigSize, chrom, chromSize -
# confirm against the ctgPos table definition
grep random /cluster/data/mm8/chrom.sizes | while read R
do
chr=`echo $R | awk '{print $1}'`
size=`echo $R | awk '{print $2}'`
hgsql -N -e "select * from ctgPos where chrom=\"$chr\";" mm8 | \
awk '
BEGIN {size="'$size'"}
{
printf "%s\t%s\t%s\t%s\t%s\n", $4, $1, $2, $3, size
}
'
done
'_EOF_'
# << happy emacs
chmod +x ./mkRandomNTLift.sh
./mkRandomNTLift.sh > randomNT.lft
# Add "chr" to front of each line (that is a normal chrom number)
# in the gene data gtf file to make
# it compatible with ldHgGene and convert the chrMT name, and lift
# the random coordinates
zcat ensGeneMm8.gff.gz \
| sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \
| liftUp ensGene.gtf randomNT.lft carry stdin
ldHgGene mm8 ensGene ensGene.gtf
# Read 34831 transcripts in 597575 lines in 1 files
# 34831 groups 34 seqs 1 sources 4 feature types
# 34831 gene predictions
featureBits mm8 ensGene
# 56159487 bases of 2567283971 (2.188%) in intersection
featureBits mm7 ensGene
# 57484684 bases of 2583394090 (2.225%) in intersection
featureBits mm6 ensGene
# 54791625 bases of 2597150411 (2.110%) in intersection
# Load ensGtp table.
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper.
# Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
# Choose Ensembl 39 and Mus musculus, click next
# Follow this sequence through the pages:
# 1) No filters in the filter section, click next go to Output
# 2) Select "Structures".
# 3) select Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# 4) select "Text, tab separated" gzip and name the output file as "ensGtp"
# 5) download the output file "ensGtp.tsv.gz"
# the tsv.gz is added automatically to the ensGtp
# Something is unusual in this download. The lines are duplicated
# about 8 times more than necessary
zcat ensGtp.tsv.gz | wc -l
# 284554
zcat ensGtp.tsv.gz | sort -u | wc -l
# 34832
hgsql mm8 < ~/kent/src/hg/lib/ensGtp.sql
# The 'tail -n +2' skips the first line which is just column
# heading labels. The sort -u will eliminate the duplicate lines:
zcat ensGtp.tsv.gz | tail -n +2 | sort -u \
| hgsql mm8 -e \
'load data local infile "/dev/stdin" into table ensGtp;'
hgsql -e "select count(*) from ensGtp;" mm8
# 34831
# properly, one less than the count above
# uncompress only now: the load above still reads the .gz through zcat
gunzip ensGtp.tsv.gz
# clean up
gzip ensGene.gtf
rm genePred.tab
# Now, an experiment to determine if the Ensembl peptide sequences
# are the same thing we get here upon translation of the CDS coding
# sequence from the genome
mkdir /cluster/data/mm8/bed/ensGene/testPeptides
cd /cluster/data/mm8/bed/ensGene/testPeptides
getRnaPred -cdsOnly mm8 ensGene all stdout | gzip > all.cdsOnly.gz
# Obtaining protein sequence from EnsMart
# Select "sequences" from the pull-down on the output page
# check Peptide in the "Sequences" selection area
# and "Ensembl Transcript ID (versioned) in the Transcript
# Attributes area
# Text,Fasta output, gzip, file name: ensPepMm8
# becomes ensPepMm8.fasta.gz
# A special faToTab.pl script to allow an exclude list, first need
# to obtain the exclude list from the ensembl set:
zcat ensPepMm8.fasta.gz \
| ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \
| sed -e "/^$/d; s/\*$//" | sort > ensPepMm8.fa.tab
# extract the exclude list from that
grep "Sequence unavailable" ensPepMm8.fa.tab \
| awk '{print $1}' > excludeList.txt
# now filter via that exclude list, remove the final '*' character
# from their protein sequence and sort by name
zcat ensPepMm8.fasta.gz \
| ~/kent/src/utils/faToTab/faToTab.pl excludeList.txt /dev/stdin \
| sed -e "/^$/d; s/\*$//" | sort > ensPepMm8.fa.tab
# and then our peptides, same filter, remove the final 'Z' character
# from this protein sequence (the stop codon):
zcat all.cdsOnly.gz | faTrans stdin stdout \
| ~/kent/src/utils/faToTab/faToTab.pl excludeList.txt /dev/stdin \
| sed -e "/^$/d; s/Z$//" | sort > all.fa.tab
# do we have the same lists:
awk '{print $1}' ensPepMm8.fa.tab > ensList
awk '{print $1}' all.fa.tab > ucscList
diff ensList ucscList
# no differences in the name list, numbering:
wc -l ensList ucscList
# 31302 ensList
# 31302 ucscList
# How many proteins different:
diff ensPepMm8.fa.tab all.fa.tab | grep "^>" | awk '{print $2}' | wc -l
# 37
# Taking a look at that difference, it is difficult to see the
# individual differences, some are single amino acid
# differences, others are more radically different:
diff ensPepMm8.fa.tab all.fa.tab | less
# Conclusion, the 37 differences out of 31,302 are not worth the
# trouble to load up the entire Ensembl peptide table
# Add Ensembl peptide table - requested by a user (hartera, 2006-07-11)
ssh hgwdev
cd /cluster/data/mm8/bed/ensGene
cat << EOF > ensPep.sql
CREATE TABLE ensPep (
name varchar(255) not null, # Name of gene - same as in genePred
seq longblob not null, # Peptide sequence
#Indices
PRIMARY KEY(name(64))
);
EOF
cp ./testPeptides/ensPepMm8.fa.tab.gz .
gunzip ensPepMm8.fa.tab.gz
hgLoadSqlTab mm8 ensPep ensPep.sql ensPepMm8.fa.tab -warn
###########################################################################
## MAKE SUPERFAMILY TRACK (DONE, 6/22/06, Fan)
# If mm8.superfamily already exists, drop it.
cd /cluster/data/mm8/bed
mkdir /cluster/data/mm8/bed/sf.20060622
ln -s sf.20060622 sf
cd sf
hgSuperfam mm8 superfam060619 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If mm8.sfDescription exists, drop it.
hgsql mm8 < ~/src/hg/lib/sfDescription.sql
hgsql mm8 -e 'LOAD DATA local INFILE "sfDescription.tab" into table mm8.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed mm8 superfamily superfamily.tab -tab
# Create knownToEnsembl table
hgMapToGene mm8 ensGene knownGene knownToEnsembl
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/060619/ass_18-Jun-2006.tab \
| hgKnownToSuper mm8 mm stdin
# 26547 records output
###########################################################################
# dbSNP BUILD 126 (Heather, August 2006)
# Set up directory structure
ssh kkstore02
cd /cluster/data/dbSNP/126/mouse
mkdir mm8
cd mm8
mkdir data
mkdir schema
mkdir rs_fasta
# Get data from NCBI (anonymous FTP)
cd /cluster/data/dbSNP/126/mouse/mm8/data
ftp ftp.ncbi.nih.gov
cd snp/organisms/mouse_10090/database/organism_data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
get b126_SNPContigLoc_36_1.bcp.gz
# ContigLocusId has function
get b126_SNPContigLocusId_36_1.bcp.gz
get b126_ContigInfo_36_1.bcp.gz
# MapInfo has alignment weights
get b126_SNPMapInfo_36_1.bcp.gz
# SNP has univar_id, validation status and heterozygosity
get SNP.bcp.gz
# Get schema from NCBI
cd /cluster/data/dbSNP/126/mouse/mm8/schema
ftp ftp.ncbi.nih.gov
cd snp/organisms/mouse_10090/database/organism_schema
get mouse_10090_table.sql.gz
# Get fasta files from NCBI
# using headers of fasta files for molType
# (the rs_fasta directory was made under mouse/mm8, next to data and schema)
cd /cluster/data/dbSNP/126/mouse/mm8/rs_fasta
ftp ftp.ncbi.nih.gov
cd snp/organisms/mouse_10090/rs_fasta
prompt
mget *.gz
# add rs_fasta to seq/extFile
# 2 edits first: strip header to just rsId, and remove duplicates
# work on /cluster/store12 (kkstore05) which has more disk space
cp rs_ch*.fas.gz /cluster/store12/snp/126/mouse/rs_fasta
ssh kkstore05
cd /cluster/store12/snp/126/mouse/rs_fasta
# concat into rsAll.fas
cat << '_EOF_' > concat.csh
#!/bin/csh -ef
# uncompress every rs_ch*.fas.gz in this directory into a single rsAll.fas
rm -f rsAll.fas
foreach file (rs_ch*.fas.gz)
echo $file
zcat $file >> rsAll.fas
end
'_EOF_'
# run it, so that rsAll.fas exists for snpCleanSeq below
/bin/csh concat.csh
# snpCleanSeq strips the header and skips duplicates
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa
rm rsAll.fas
# load on hgwdev
ssh hgwdev
mkdir /gbdb/mm8/snp
ln -s /cluster/store12/snp/126/mouse/rs_fasta/snp.fa /gbdb/mm8/snp/snp.fa
cd /cluster/store12/snp/126/mouse/rs_fasta
hgLoadSeq mm8 /gbdb/mm8/snp/snp.fa
# look up id in extFile
# move into separate table
hgsql mm8 < snpSeq.sql
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 9642470' mm8
hgsql -e 'delete from seq where extFile = 9642470' mm8
hgsql -e 'alter table snpSeq add index acc (acc)' mm8
# clean up after hgLoadSeq
rm seq.tab
# Simplify names of data files
cd /cluster/data/dbSNP/126/mouse/mm8/data
mv b126_ContigInfo_36_1.bcp.gz ContigInfo.gz
mv b126_SNPContigLoc_36_1.bcp.gz ContigLoc.gz
mv b126_SNPContigLocusId_36_1.bcp.gz ContigLocusId.gz
mv b126_SNPMapInfo_36_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz
ls -1 *.gz > filelist
# edit table descriptions
cd /cluster/data/dbSNP/126/mouse/mm8/schema
# get CREATE statements from mouse_10090_table.sql for our 5 tables
# store in table.tmp
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp
# Get updated UniVariation table
cd /cluster/data/dbSNP/126/shared
ftp ftp.ncbi.nih.gov
cd snp/database/shared_data
get UniVariation.bcp.gz
cd ../shared_schema
get dbSNP_main_table.sql.gz
# get UniVariation CREATE statement from dbSNP_main_table.sql
# use mssqlToMysql.sed to convert
# get header lines from rs_fasta
cd /cluster/data/dbSNP/126/mouse/mm8/rs_fasta
/bin/csh gnl.csh
# load on kkr5u00
ssh kkr5u00
# the statement is the argument of -e; the database to connect to comes last
hgsql -e 'create database mm8snp126' mysql
cd /cluster/data/dbSNP/126/mouse/mm8/schema
# create the tables described in table.sql
hgsql mm8snp126 < table.sql
cd ../data
/bin/csh load.csh
# note rowcount
# ContigLoc 23811983
# SNP 10837184
# MapInfo 23570302
# ContigLocusId 10317095
cd /cluster/data/dbSNP/126/shared
hgsql mm8snp126 < UniVariation.sql
zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' mm8snp126
# create working /scratch dir
cd /scratch/snp/126
mkdir mouse
cd mouse
# get mm8 ctgPos, load into mm8snp126, compare contig list between ctgPos and ContigInfo
# No issues in non-random
# No PAR issues
# get gnl files
cp /cluster/data/dbSNP/126/mouse/mm8/rs_fasta/*.gnl .
# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_strain"
# filter ContigLoc into ContigLocFilter
# this lifts from contig coords to chrom coords
# phys_pos_from is used to check coords for non-random chroms
# errors reported to stdout
# this gets rid of alternate assemblies (using ContigInfo)
# this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo)
# assumes all contigs are positively oriented; will abort if not true
# Note for mouse we also screen on assembly = "C57BL/6J" in MapInfo
# mysql> desc ContigLocFilter;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromName | varchar(32) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | start | int(11) | NO | | | |
# | end | int(11) | YES | | NULL | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter mm8snp126 ref_strain C57BL/6J
# note rowcount
# ContigLocFilter 7923033
# how many are positive strand? hopefully 90%
# mysql> select count(*) from ContigLocFilter where orientation = 0;
# 7779413
# note count by loc_type
# mysql> select count(*), loc_type from ContigLocFilter group by loc_type;
# +----------+----------+
# | count(*) | loc_type |
# +----------+----------+
# | 2144 | 1 |
# | 7903966 | 2 |
# | 13105 | 3 |
# | 1052 | 4 |
# | 523 | 5 |
# | 2243 | 6 |
# +----------+----------+
# filter ContigLocusId into ContigLocusIdFilter
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter mm8snp126 ref_strain
# note rowcount
# ContigLocusIdFilter 3484757
# condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions)
# assumes SNPs are in numerical order; will errAbort if not true
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense mm8snp126
# note rowcount; expect about 50% (ascertainment bias for SNPs within genes)
# ContigLocusIdCondense 2789998
# could delete ContigLocusIdFilter table here
# create chrN_snpFasta tables from *.gnl files
# we are just using molType, but also storing class and observed
# need chromInfo for this
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta mm8snp126
# (could start using pipeline.csh here)
# (pipeline.csh takes about 35 minutes to run)
# split ContigLocFilter by chrom
# create the first chrN_snpTmp
# we will reuse this table name, adding/changing columns as we go
# at this point chrN_snpTmp will have the same description as ContigLocFilter
# this opens a file handle for every chrom, so will not scale to scaffold-based assemblies
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom mm8snp126 ref_strain
# adjust coords using loc_type
# possible errors logged to snpLocType.error:
# Unknown locType
# Between with end != start + 1
# Between with allele != '-'
# Exact with end != start
# Range with end < start
# possible exceptions logged to snpLocType.exceptions:
# RefAlleleWrongSize
# This run no errors, no exceptions
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype mm8snp126 ref_strain
# expand allele as necessary
# report syntax errors to snpExpandAllele.errors
# possible exceptions logged to snpExpandAllele.exceptions:
# RefAlleleWrongSize
# This run no errors, no exceptions
# 200? alleles expanded
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele mm8snp126 ref_strain
# the next few steps prepare for working in UCSC space
# sort by position
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort mm8snp126 ref_strain
# rename MT --> M (pipeline.csh takes care of this)
hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" mm8snp126
# get mm8 nib files
# get mm8 chromInfo, load into mm8snp126 with edited path
# lookup reference allele in nibs
# keep reverse complement to use in error checking (snpCheckAlleles)
# check here for SNPs larger than 1024
# errAbort if detected
# check for coords that are too large, log to snpRefUCSC.error and skip
# This run no errors
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC mm8snp126
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +--------------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# +--------------------+-------------+------+-----+---------+-------+
# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAlleles.exceptions
# if SNP is positive strand, expect allele == refUCSC
# log RefAlleleMismatch if not
# if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp
# If allele == refUCSCRevComp, log RefAlleleNotRevComp
# If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
# 0 RefAlleleMismatch
# 9621 RefAlleleNotRevComp
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles mm8snp126
# add class and observed using univar_id from SNP table
# to get class (subsnp_class) and observed (var_str) from UniVariation
# log errors to snpClassAndObserved.errors
# errors detected:
# class = 0 in UniVariation
# class > 8 in UniVariation
# univar_id = 0 in SNP
# no row in SNP for snp_id in chrN_snpTmp
# This run we got:
# 3 class = 0 in UniVariation
# 0 class > 8 in UniVariation
# 2890606 univar_id = 0 in SNP (strange, but okay)
# 0 no row in SNP for snp_id in chrN_snpTmp
# dbSNP has class = 'in-del'
# we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved mm8snp126
# morph chrN_snpTmp
# +--------------------+---------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+---------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | class | varchar(255) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# | observed | blob | YES | | NULL | |
# +--------------------+---------------+------+-----+---------+-------+
# generate exceptions for class and observed
# SingleClassBetweenLocType
# SingleClassRangeLocType
# NamedClassWrongLocType
# ObservedWrongFormat
# ObservedWrongSize
# ObservedMismatch
# RangeSubstitutionLocTypeExactMatch
# SingleClassTriAllelic
# SingleClassQuadAllelic
# This will also detect IUPAC symbols in allele
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved mm8snp126
# add function
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction mm8snp126
# add validation status and heterozygosity
# log error if validation status > 31 or missing
# no errors this run
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP mm8snp126
# add molType
# errors detected: missing or duplicate molType
# 57709 duplicates
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype mm8snp126
# generate chrN_snp126 and snp126Exceptions tables
cp snpCheckAlleles.exceptions snpCheckAlleles.tab
cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
cp snpExpandAllele.exceptions snpExpandAllele.tab
cp snpLocType.exceptions snpLocType.tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable mm8snp126 126
# concat into snp126.tab
# cat chr*_snp126.tab >> snp126.tab
/bin/sh concat.sh
# check for multiple alignments
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple mm8snp126
mysql> load data local infile 'snpMultiple.tab' into table snp126Exceptions;
# load on hgwdev
cp snp126.tab /cluster/home/heather/transfer/snp
hgsql mm8snp126 -e 'select * from snp126Exceptions' > /cluster/home/heather/transfer/snp/snp126Exceptions.tab
ssh hgwdev
mysql> load data local infile 'snp126.tab' into table snp126;
mysql> load data local infile 'snp126Exceptions.tab' into table snp126Exceptions;
# create indexes
mysql> alter table snp126 add index name (name);
mysql> alter table snp126 add index chrom (chrom, bin);
mysql> alter table snp126Exceptions add index name(name);
# create snp126ExceptionDesc table
cd /cluster/data/dbSNP
hgsql mm8 < snp126ExceptionDesc.sql
# add counts to exception.mouse.126, can start with exception.template
hgsql -e 'select count(*), exception from snp126Exceptions group by exception' mm8
mysql> load data local infile 'exception.mouse.126' into table snp126ExceptionDesc;
mysql> select count(*), exception from snp126Exceptions group by exception;
+----------+---------------------------+
| count(*) | exception |
+----------+---------------------------+
| 97271 | MultipleAlignments |
| 1600 | ObservedMismatch |
| 27 | ObservedWrongFormat |
| 272 | ObservedWrongSize |
| 9621 | RefAlleleNotRevComp |
| 11169 | SingleClassBetweenLocType |
| 346 | SingleClassQuadAllelic |
| 5023 | SingleClassRangeLocType |
| 3905 | SingleClassTriAllelic |
+----------+---------------------------+
####################################################################
## redoing STS markers track to get them more correct
## (DONE - 2006-09-15 - Hiram)
# Went into the updateBed.pl script, reworked it, made it safer,
# debugged a lot of things and placed it into the source tree.
ssh hgwdev
mkdir /cluster/data/mm8/bed/STSmarkers.2006-08-29
cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
# with that fixed script, create a new stsInfoMouse.bed file:
# Update the m m 7 directory name here to m m 8
# for the next build of m m 9, ...etc... and so forth
time ~/kent/src/hg/stsMarkers/updateBed.pl \
/cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed \
../STSmarkers/downloads/MRK_Dump2.rpt \
../STSmarkers/downloads/PRB_PrimerSeq.rpt \
../STSmarkers/downloads/MRK_Sequence.rpt \
../STSmarkers/downloads/UniSTS_mouse.alias \
../STSmarkers/downloads/UniSTS_mouse.sts \
-g ../STSmarkers/downloads/10090.WI-Genetic.txt \
-r ../STSmarkers/downloads/10090.WI_MRC_RH.txt \
-verbose 2> dbg.updateBed | sed -e "s/\t*$//" > newbedfile
~/kent/src/hg/stsMarkers/cleanInfo.pl -mouse newbedfile \
| sed -e "s/\t*$//" > mm8.stsInfoMouse.bed
# copy the stsInfoMouse.bed file from working dir to the marker
# info storage folder. added 2 new steps by Yontao
# be wary of the archive name here, check the directory and get
# the name right here.
mv /cluster/store5/mouseMarker/stsInfoMouse.bed \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.firstTime
cp -p mm8.stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed
# comparing to previous, numbers increase slightly each time
wc -l /cluster/store5/mouseMarker/stsInfoMouse.bed \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.firstTime \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
# 60631 /cluster/store5/mouseMarker/stsInfoMouse.bed
# 60440 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.firstTime
# 59843 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7
# 58980 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6
# 58493 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
# and from that, create new primer fa, epcr, etc:
time ~/kent/src/hg/stsMarkers/luConvertPrimerToFa \
mm8.stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info
# the mouseC.fa file will be empty; the mouseP.* counts should be more than last time
wc -l mouse?.* ../STSmarkers/mouse?.*
# 0 mouseC.fa
# 308384 mouseP.fa
# 34666 mouseP.info
# 0 ../STSmarkers/mouseC.fa
# 305991 ../STSmarkers/mouseP.fa
# 34475 ../STSmarkers/mouseP.info
# the equivalent Mm7 files:
# 0 0 0 mouseC.fa
# 300968 300914 6798466 mouseP.fa
# 33838 169275 2153113 mouseP.info
# 334806 470189 8951579 total
# the equivalent Mm6 files:
# 0 0 0 mouseC.fa
# 293305 293251 6624638 mouseP.fa
# 32890 164528 2087271 mouseP.info
# 326195 457779 8711909 total
# the equivalent Mm5 files:
# 0 0 0 mouseC.fa
# 286740 286686 6474893 mouseP.fa
# 32232 161234 2044810 mouseP.info
# 318972 447920 8519703 total
# copy the primers over to some filesystem close to the klusters
# and split them up to have a small number of sequences in one file
mkdir /cluster/bluearc/mm8/stsMarkers.2006-08-29
cp -p mouseP.fa /cluster/bluearc/mm8/stsMarkers.2006-08-29
cd /cluster/bluearc/mm8/stsMarkers.2006-08-29
cp -p /cluster/data/mm8/11.ooc .
mkdir split
# 356 files for 34,666 sequences, == about 97 sequences per file
faSplit sequence mouseP.fa 400 split/mm_
# PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE.
# This process could convert to a modern version of blat with the
# filters as described, for example, in the STS markers build in Hg18
# CLUSTER RUN FOR THE STS PRIMERS
ssh kk
cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
mkdir primer
mkdir ePCR
cd primer
mkdir out
# interestingly, this blat.2 binary did not function correctly
# when given nib files. It has only about 1/4th of the number of
# alignments as it gets when it used fa files for the target
# sequence.
ls -1S /cluster/bluearc/mm8/stsMarkers.2006-08-29/split > primers.list
# re-using chrom sequences from first time
ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list
cat << '_EOF_' > runBlat2.csh
#!/bin/csh -fe
set primer = /cluster/bluearc/mm8/stsMarkers.2006-08-29/split/$1
set fa = /cluster/bluearc/mm8/stsMarkers/chroms/$2
set ooc = /cluster/bluearc/mm8/stsMarkers.2006-08-29/11.ooc
set root2 = $2:r
mkdir -p out/${root2}
set out = $3
/cluster/bin/i386/blat.2 ${fa} ${primer} -ooc=${ooc} \
-minMatch=1 -minScore=0 -minIdentity=80 -oneOff ${out}
'_EOF_'
# << happy emacs
chmod +x runBlat2.csh
cat << '_EOF_' > template
#LOOP
./runBlat2.csh $(path1) $(path2) {check out line+ out/$(root2)/$(root1).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 primers.list chr.list template jobList
para create jobList
para try ... check ... push ... etc ...
# Completed: 12104 of 12104 jobs
# CPU time in finished jobs: 1078733s 17978.89m 299.65h 12.49d 0.034 y
# IO & Wait Time: 13537140s 225618.99m 3760.32h 156.68d 0.429 y
# Average job time: 1208s 20.13m 0.34h 0.01d
# Longest finished job: 11831s 197.18m 3.29h 0.14d
# Submission to last job: 20458s 340.97m 5.68h 0.24d
# on the file server
ssh kkstore04
cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer
time pslSort dirs primers.raw.psl temp out/chr*
# real 3m30.758s
# -rw-rw-r-- 1 588001891 Sep 15 10:02 primers.raw.psl
# filter alignments for (qEnd-qStart) vs. (tEnd-tStart)
# should not be more than 100 bases different.
# This filters out about 948,260 alignments, or
# 17.4% = 100.0 * 948260 / 5462936
time pslSort dirs stdout temp out/chr* | awk -F"\t" '
{ if (((($13 - $12) - ($17 - $16)) > -100) &&
((($13 - $12) - ($17 - $16)) < 100)) {print}
}
' > primers.psl.100
rmdir temp
wc -l *.100 *.psl
# 5462936 primers.raw.psl
# 4514676 primers.psl.100
# 948260 difference
# a rough comparison with previous results:
wc -l /cluster/data/mm8/bed/STSmarkers/primer/primers.psl.100
# 4500528 /cluster/data/mm8/bed/STSmarkers/primer/primers.psl.100
# another kluster run for the ePCR
ssh pk
cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR
ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list
# pick up e-PCR source from
# ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/
# version 2.3.1 11 Feb 2005
# Had to add the following to both re-PCR_main.cpp and
# e-PCR_main.cpp to get them to compile on kolossus:
// max and min Copied from /usr/include/mysql/my_global.h
#define max(a, b) ((a) >? (b))
#define min(a, b) ((a) <? (b))
mkdir out
cat << '_EOF_' > runPCR
#!/bin/csh -fe
/cluster/bin/x86_64/e-PCR \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/mouseP.info \
/cluster/bluearc/mm8/stsMarkers/chroms/$1 N=1 M=50 W=5 > $2
'_EOF_'
# << happy emacs
chmod +x runPCR
cat << '_EOF_' > template
#LOOP
./runPCR $(path1) {check out line+ out/$(num1).epcr}
#ENDLOOP
'_EOF_'
# << the mouseP.info was created above
gensub2 chr.list single template jobList
para create jobList
para try
para check
para push
... etc ...
# There is a single job that produces no output:
./runPCR chrX_random.fa out/30.epcr
# WARNING: 96 STSs have primer shorter than W
# WARNING: 21 STSs have ambiguities within W of 3' end
# Not sure what's up with that
# Completed: 33 of 34 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 64904s 1081.73m 18.03h 0.75d 0.002 y
# IO & Wait Time: 1860s 31.00m 0.52h 0.02d 0.000 y
# Average job time: 2023s 33.72m 0.56h 0.02d
# Longest finished job: 4861s 81.02m 1.35h 0.06d
# Submission to last job: 4862s 81.03m 1.35h 0.06d
ssh kkstore04
cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR
# all those results become all.epcr
cat out/*.epcr > all.epcr
# comparing to previous results:
wc -l all.epcr
# 58162 all.epcr
wc -l /cluster/data/mm8/bed/STSmarkers/ePCR/all.epcr
# 58088 all.epcr
cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer
~/kent/src/hg/stsMarkers/filterSTSPrimers \
-mouse ../mm8.stsInfoMouse.bed primers.psl.100 \
../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat
# The output should show an increasing count:
# Reading name info
# Reading primer info
# Processing file
# 100000
# 200000
# 300000
# ...
# 4500000
# Determining ePCR not found from ePCR results
# Out of 25749 ePCR alignments examined, not found: 520
#
wc -l primers.psl.filter.blat
# 34043 primers.psl.filter.blat
wc -l /cluster/data/mm8/bed/STSmarkers/primer/primers.psl.filter.blat
# 34026 primers.psl.filter.blat
# create file accession_info.rdb
touch empty_sequence.inf
~/kent/src/hg/stsMarkers/compileAccInfo -mouse \
/cluster/data/mm8 empty_sequence.inf
# 20502 processed
mv accession_info.rdb accession_info.rdb.tmp
~/kent/src/hg/stsMarkers/sorttbl -x Chr Ord Start \
< accession_info.rdb.tmp > accession_info.rdb
# The -x prints the debug statement:
# sort arg: -t" " +0 -1 +1 -2g +2 -3g
rm accession_info.rdb.tmp
# comparing results to previous
# Continuing the trend that began with Mm7, the numbers in
# accession_info.rdb continue to decrease. Even Mm8 has far fewer
# fragments than did mm7:
# e.g.:
[hiram@kkstore04 /cluster/data] wc -l mm8/*/chr*.agp | tail -1
# 21910 total
[hiram@kkstore04 /cluster/data] wc -l mm7/*/chr*.agp | tail -1
# 70125 total
[hiram@kkstore04 /cluster/data] wc -l mm6/*/chr*.agp | tail -1
# 170812 total
wc -l accession_info.rdb
# 20385 accession_info.rdb
wc -l ../../STSmarkers/primer/accession_info.rdb
# 20385 ../../STSmarkers/primer/accession_info.rdb
# creates epcr.not.found.nomatch and epcr.not.found.psl
~/kent/src/hg/stsMarkers/epcrToPsl -mouse \
epcr.not.found ../mouseP.info \
accession_info.rdb /cluster/data/mm8 2> dbg.epcrToPsl
# the dbg.epcrToPsl has a number of lines complaining about bad
# primers in ../mouseP.info - and indeed they are bad primers,
# they do not have a second primer.
# Comparing results to previous:
wc -l epcr*
# 520 epcr.not.found
# 0 epcr.not.found.nomatch
# 520 epcr.not.found.psl
wc -l ../../STSmarkers/primer/epcr*
# 501 ../../STSmarkers/primer/epcr.not.found
# 0 ../../STSmarkers/primer/epcr.not.found.nomatch
# 501 ../../STSmarkers/primer/epcr.not.found.psl
# Mm7 wc epcr*
wc -l /cluster/data/mm7/bed/STSmarkers/primer/epcr*
# 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found
# 0 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.nomatch
# 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.psl
# 158 /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl
# 1106 total
# Mm6 wc epcr*
wc -l /cluster/data/mm6/bed/STSmarkers/primer/epcr*
# 472 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found
# 63 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.nomatch
# 404 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.psl
# 158 /cluster/data/mm6/bed/STSmarkers/primer/epcrToPsl
# 1097 total
cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
wc -l primers.psl.filter
# 34563 primers.psl.filter
wc -l ../../STSmarkers/primer/primers.psl.filter
# 34527 primers.psl.filter
wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter
# 34460 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter
wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter
# 33532 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter
wc -l /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted
# 33691 /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted
# create primers.psl.filter.initial
# The PATH setting allows extractPslInfo to find other programs that it
# is going to use.
PATH=~/kent/src/hg/stsMarkers:$PATH \
~/kent/src/hg/stsMarkers/extractPslInfo primers.psl.filter
wc -l *.initial
# 34545 primers.psl.filter.initial
wc -l ../../STSmarkers/primer/*.initial
# 34513 ../../STSmarkers/primer/primers.psl.filter.initial
wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial
# 34443 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial
wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial
# 33514 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial
wc -l \
/cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial
# 33689
# create primers.psl.filter.initial.acc
PATH=~/kent/src/hg/stsMarkers:$PATH \
~/kent/src/hg/stsMarkers/findAccession -agp \
-mouse primers.psl.filter.initial /cluster/data/mm8
wc -l primers.psl.filter.initial.acc
# 34545 primers.psl.filter.initial.acc
wc -l ../../STSmarkers/primer/primers.psl.filter.initial.acc
# 34513 primers.psl.filter.initial.acc
# this needs to be -rat as that specifies how to scan the
# stsInfoMouse.bed file and it does not work if you use -mouse
# it is not clear what -mouse would mean to this script, some other file
# format perhaps from the stsInfoMouse.bed format.
~/kent/src/hg/stsMarkers/getStsId -rat \
../mm8.stsInfoMouse.bed primers.psl.filter.initial.acc \
| sort -k4,4n > primers.final
wc -l primers.final
# 34545 primers.final
wc -l ../../STSmarkers/primer/primers.final
# 34513 primers.final
cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
# stsMarkers.final is empty for mouse
touch stsMarkers.final dummy
PATH=~/kent/src/hg/stsMarkers:$PATH \
~/kent/src/hg/stsMarkers/combineSeqPrimerPos \
stsMarkers.final primer/primers.final > stsMarkers_pos.rdb
wc -l stsMarkers_pos.rdb
# 33048 stsMarkers_pos.rdb
wc -l ../STSmarkers/stsMarkers_pos.rdb
# 33075 stsMarkers_pos.rdb
PATH=~/kent/src/hg/stsMarkers:$PATH \
~/kent/src/hg/stsMarkers/createStsBed \
mm8.stsInfoMouse.bed stsMarkers_pos.rdb 500 \
| sort -k1,1 -k2,2n | sed -e "s/ //g" > stsMapMouse.bed
# The sed removes unneeded blanks
# verify score profile remains similar
awk -F'\t' '{print $5}' stsMapMouse.bed | sort -n | uniq -c
# 546 500
# 1650 750
# 27705 1000
awk -F'\t' '{print $5}' ../STSmarkers/stsMapMouse.bed | sort -n | uniq -c
# 546 500
# 1648 750
# 27692 1000
wc -l stsMapMouse.bed
# 29901 stsMapMouse.bed
wc -l ../STSmarkers/stsMapMouse.bed
# 29888 stsMapMouse.bed
# loading STS markers tables
ssh hgwdev
cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
~/kent/src/hg/stsMarkers/ucscAlias.pl \
mm8.stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
# this does leave messages in ucscStsAlias.warnings but they seem
# to be very similar to Mm6 with just a few new ones
wc -l ucscStsAlias.tab
# 146767 ucscStsAlias.tab
wc -l ../STSmarkers/ucscStsAlias.tab
# 146064 ucscStsAlias.tab
# After extensive comparison with the currently existing STS markers, it
# appears that this new set only has a couple of new ones, and a couple
# of ones have been dropped. It seems that the primary correction has
# been to the marker positions.
ssh hgwdev
cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
# Saving the existing tables for archival purposes
hgsql -e "alter table stsInfoMouseNew rename as stsInfoMouseNewFeb2006;" mm8
hgsql -e "alter table stsAlias rename as stsAliasFeb2006;" mm8
hgsql -e "alter table all_sts_primer rename as all_sts_primerFeb2006;" mm8
hgsql -e "alter table stsMapMouseNew rename as stsMapMouseNewFeb2006;" mm8
hgsql mm8 < ~/kent/src/hg/lib/stsAlias.sql
hgsql -e \
'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm8
hgsql mm8 < ~/kent/src/hg/lib/stsMapMouseNew.sql
hgsql -e \
'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm8
hgsql mm8 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
hgsql -e \
'load data local infile "mm8.stsInfoMouse.bed" into table stsInfoMouseNew;' mm8
hgLoadPsl -nobin -table=all_sts_primer mm8 primer/primers.psl.filter
# load of all_sts_primer did not go as planned: 34563 record(s), 0
# row(s) skipped, 1 warning(s) loading primer/primers.psl.filter
# After warnings, checkTableCoords to find problems:
checkTableCoords -verboseBlocks mm8 all_sts_primer
# mm8.all_sts_primer item 61999 chr10:62418012-62418048: blocks 0 and 1 overlap.
# mm8.all_sts_primer has 1 records with overlapping blocks.
# Strip the offending item from the load:
# Verify the grep takes out only one item:
wc -l primer/primers.psl.filter
# 34563 primer/primers.psl.filter
grep -P "\t61999\t" primer/primers.psl.filter | wc -l
# 1
# and thus leaves the rest
grep -v -P "\t61999\t" primer/primers.psl.filter | wc -l
# 34562
grep -v -P "\t61999\t" primer/primers.psl.filter > fixed.primers.psl.filter
hgLoadPsl -nobin -table=all_sts_primer mm8 fixed.primers.psl.filter
# load primer sequences
rm /gbdb/mm8/stsMarker/mouseP.fa
ln -s /cluster/data/mm8/bed/STSmarkers.2006-08-29/mouseP.fa \
/gbdb/mm8/stsMarker/mouseP.fa
# PLEASE NOTE: If you are going to reload this business, use the
# -replace option on this hgLoadSeq
# hgLoadSeq -replace mm8 /gbdb/mm8/stsMarker/mouseP.fa
# otherwise there will be a problem that the seq and extFile tables
# will be out of sync.
hgLoadSeq -replace mm8 /gbdb/mm8/stsMarker/mouseP.fa
# Adding /gbdb/mm8/stsMarker/mouseP.fa
# 34666 sequences
# Warning: load of seq did not go as planned: 34666 record(s),
# 0 row(s) skipped, 1 warning(s) loading ./seq.tab
featureBits mm8 all_sts_primer
# 3700897 bases of 2567283971 (0.144%) in intersection
featureBits mm8 all_sts_primerFeb2006
# 3746196 bases of 2567283971 (0.146%) in intersection
featureBits mm7 all_sts_primer
# 3757119 bases of 2583394090 (0.145%) in intersection
featureBits mm6 all_sts_primer
# 3677372 bases of 2597150411 (0.142%) in intersection
featureBits mm8 stsMapMouseNew
# 4812616 bases of 2567283971 (0.187%) in intersection
featureBits mm8 stsMapMouseNewFeb2006
# 4801964 bases of 2567283971 (0.187%) in intersection
featureBits mm7 stsMapMouseNew
# 4805958 bases of 2583394090 (0.186%) in intersection
featureBits mm6 stsMapMouseNew
# 4638338 bases of 2597150411 (0.179%) in intersection
hgsql -N mm8 -e "select count(*) from stsAlias;"
# 146767
hgsql -N mm8 -e "select count(*) from stsAliasFeb2006;"
# 141981
hgsql -N mm7 -e "select count(*) from stsAlias;"
# 140649
hgsql -N mm6 -e "select count(*) from stsAlias;"
# 137738
hgsql -N mm5 -e "select count(*) from stsAlias;"
# 122944
hgsql -N mm8 -e "select count(*) from stsInfoMouseNew;"
# 60440
hgsql -N mm7 -e "select count(*) from stsInfoMouseNew;"
# 59843
hgsql -N mm6 -e "select count(*) from stsInfoMouseNew;"
# 58980
hgsql -N mm5 -e "select count(*) from stsInfoMouseNew;"
# 58493
# compare old and new name lists, not much difference:
awk '{print $4}' stsMapMouse.bed | sort -u > mm8.nameList
# in common with previous version
comm -12 ../STSmarkers/mm8.nameList mm8.nameList | wc -l
# 28687
# unique to previous version
comm -23 ../STSmarkers/mm8.nameList mm8.nameList | wc -l
# 11
# unique to this new set
comm -13 ../STSmarkers/mm8.nameList mm8.nameList | wc -l
# 20
##########################################################################
# N-SCAN gene predictions (nscanGene) - (2006-08-30 markd)
cd /cluster/data/mm8/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
mv ardor.wustl.edu/jeltje/mm8/chr_ptx .
rm -rf ardor.wustl.edu
rm chr_*/index.html*
gzip chr_*/*
chmod a-w chr_*/*.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt mm8 nscanGene chr_gtf/chr*.gtf.gz
# load protein, add .1 suffix to match transcript id
hgPepPred -suffix=.1 mm8 generic nscanPep chr_ptx/chr*.fa.gz
rm *.tab
# update trackDb; need a mm8-specific page to describe informants
mouse/mm8/nscanGene.html (copy from hg18 and edit)
mouse/mm8/trackDb.ra
# changed search regex to
termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]
#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 9/18/06 angie)
# File emailed from Ginger Cheng <ginger2@u.washington.edu>
mkdir /cluster/data/mm8/bed/genomicSuperDups
cd /cluster/data/mm8/bed/genomicSuperDups
awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' mm8_WGAC.tab \
| hgLoadBed mm8 genomicSuperDups stdin \
-tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
# 8/29/07 Gak! Kayla found that the strand values were "+" and "_" -- fix:
hgsql mm8 -e 'update genomicSuperDups set strand = "-" where strand = "_";'
#####################################################################
# CELERA COVERAGE (WSSD -- DEPTH OF COVERAGE) (DONE 10/16/06 angie)
# File emailed from Ginger Cheng <ginger2@u.washington.edu>
mkdir /cluster/data/mm8/bed/wssd
cd /cluster/data/mm8/bed/wssd
tail +2 mm8_WSSD_DOC.tab \
| hgLoadBed mm8 wssdCoverage stdin
#####################################################################
## NIA Mouse Gene Index - (DONE, Fan, 10/6/06)
# requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
ssh hgwdev
mkdir -p /cluster/data/mm8/bed/NIAGene061003
cd /cluster/data/mm8/bed
ln -s NIAGene061003 NIAGene
cd NIAGene
wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex/mm8/download/T-fasta.fa.gz
wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex/mm8/download/T-psl.txt.gz
gzip -d *.gz
cut -f 1-21 T-psl.txt >NIAGene.tab
hgLoadPsl mm8 NIAGene.tab
mkdir /gbdb/mm8/NIAGene
ln -s /cluster/data/mm8/bed/NIAGene/T-fasta.fa /gbdb/mm8/NIAGene/T-fasta.fa
hgLoadSeq mm8 /gbdb/mm8/NIAGene/T-fasta.fa
# Create/edit/check in NIAGene.html and trackDb.ra under
kent/src/hg/makeDb/trackDb/mouse/mm8
#####################################################################
# LOAD GENEID GENES (DONE - 2006-10-09 - Fan)
ssh hgwdev
mkdir -p /cluster/data/mm8/bed/geneid/download
cd /cluster/data/mm8/bed/geneid/download
bash
awk '{print $1}' ../../../chrom.sizes | while read C
do
echo $C
wget --timestamping \
http://genome.imim.es/genepredictions/M.musculus/mmMar2006/geneid_v1.2/$C.gtf
wget --timestamping \
http://genome.imim.es/genepredictions/M.musculus/mmMar2006/geneid_v1.2/$C.prot
done
exit
# Add missing .1 to protein id's
foreach f (*.prot)
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
end
cd ..
ldHgGene -genePredExt -gtf mm8 geneid download/*.gtf
#Read 35954 transcripts in 284585 lines in 34 files
# 35954 groups 34 seqs 1 sources 3 feature types
# 35954 gene predictions
hgPepPred mm8 generic geneidPep download/*-fixed.prot
featureBits mm8 -enrichment refGene geneid
# refGene 1.842%, geneid 1.592%, both 0.883%, cover 47.95%, enrich 30.13x
featureBits mm7 -enrichment refGene geneid
# refGene 1.835%, geneid 1.579%, both 0.866%, cover 47.18%, enrich 29.88x
#####################################################################
# RN4 RECIPROCAL BEST CHAINS/NETS (DONE - 2006-10-10 - Angie)
doRecipBest.pl mm8 rn4 \
>& /cluster/data/mm8/bed/blastz.rn4/axtChain/recipBest.log &
tail -f /cluster/data/mm8/bed/blastz.rn4/axtChain/recipBest.log
##############################################################################
############################################################################
# Load CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2006-10-10 markd)
cd /cluster/data/genbank/data/ccds/
ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
get CCDS.20061010.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/data/genbank/data/ccds/CCDS.20061010.tar.gz
# import ccds database tables
/cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds mm8 ccdsInfo ccdsGene
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=mm8 -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords mm8 -verbose=2 ccdsGene
joinerCheck -database=mm8 -identifier=ccdsGeneId ~/compbio/kent/src/hg/makeDb/schema/all.joiner
rm -rf /scratch/tmp/ccds
# build initial version of ccdsMgcMap table
./x86_64/mkCcdsGeneMap -loadDb -db=mm8 -loadDb ccdsGene mgcGenes ccdsMgcMap
# load trackDb
cd kent/src/hg/makeDb/trackDb
make alpha
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
# << emacs
############################################################################
# JAX TRACKS (DONE 10/20/06 angie - UPDATED 7/18/07, 9/27/07)
# Table jaxQTL renamed to jaxQtl on 1/7/10 (see NOTE FOR NEXT TIME below)
ssh kkstore04
mkdir /cluster/data/mm8/bed/jax/2007_09
cd /cluster/data/mm8/bed/jax/2007_09
wget ftp://ftp.informatics.jax.org/pub/gbrowse/\*
wget ftp://ftp.informatics.jax.org/pub/reports/MGI_PhenotypicAllele.rpt
# Jax Rep Transcript track
# SEQ_RepTransGenomic_rpt.gff --> jaxRepTranscript{,Alias}
# -- names like AK016604_4933401J01Rik, NM_001011874_AY534250
# -- aliases ~ MGI:\d+
# Use simple perl script to uniquify transcript names and make alias.tab.
# Inspired by the mm6 version, but format has changed.
../2007_07/parseRepTranscript.pl SEQ_RepTransGenomic_rpt.gff \
> jaxRepTranscript.gff
# Jax Allele track
# AL_*.gff --> jaxAllele{,Info}
# -- bed12Source -- add type from filename
# -- names like NM_011283_Rp1h<tm1Jnz>, XM_129721_Slc9a2<tm1Ges>
# -- Info: name, mgiID, source {"Gene trapped", ...}
cp ../2007_07/parseAllele.pl .
# Edit to accommodate latest format tweaks.
rm -f jaxAllele.bed jaxAlleleInfo.tab fixJaxAllele.sql
foreach f (AL*.gff)
set type = `echo $f:t:r \
| sed -e 's/AL_//; s/GTRAP/GeneTrapped/; s/IND/Induced/; \
s/OTHER/Other/; s/SPON/Spontaneous/; s/TARG/Targeted/; \
s/TRANS/Transgenic/;'`
parseAllele.pl $f \
| ldHgGene mm8 placeholder stdin -nobin -out=stdout \
| /cluster/bin/scripts/genePredToBed \
| sed -e 's/$/'"\t$type"'/' \
>> jaxAllele.bed
end
# This round's formatting inconsistencies:
#source not given for NM_015770_a<jIs(17_In2)1Gso>
#source not given for NM_029931_Mllt3<T(4Mllt3_9Mll)1Thr>
#source not given for NM_009521_Wnt3<In(11Trp53_11Wnt3)8Brd>
#source not given for NM_011640_Trp53<In(11Trp53_11Wnt3)8Brd>
#source not given for NM_001081049_Mll1<T(4Mllt3_9Mll)1Thr>
#Missing > for mRNA name NM_001081193_Lemd3<Gt(XST167)Byg
# Jax Phenotype track
# MP_*.gff --> jaxPhenotype{,Alias}
# -- bed12Source -- add type from filename
# -- names like NM_001001488_Atp8b1
rm -f jaxPhenotype.bed jaxPhenotypeAlias.tab fixJaxPhenotype.sql
foreach f (MP_*.gff)
set type = `echo $f:t:r \
| perl -wpe 's/MP_[0-9]*_//; s/[_-](\w)/\u$1/g; s/^(\w)/\u$1/; \
s@AdiposeTissue@Adipose@; \
s@BehaviorNeurological@Behavior@; \
s@CardiovascularSystem@Cardiovascular@; \
s@DigestiveAlimentary@Digestive@; \
s@EndocrineExocrineGland@Gland@; \
s@GrowthSize@Growth Size@; \
s@HearingEar@Hearing/Ear@; \
s@HematopoieticSystem@Hematopoietic@; \
s@HomeostasisMetabolism@Homeostasis@; \
s@ImmuneSystem@Immune@; \
s@LethalityEmbryonicPerinatal@Embryonic Lethal@; \
s@LethalityPostnatal@Postnatal Lethal@; \
s@LifeSpanPostWeaningAging@Life Span@; \
s@LimbsDigitsTail@Limbs and Tail@; \
s@LiverBiliarySystem@Liver and Bile@; \
s@NervousSystem@Nervous System@; \
s@RenalUrinarySystem@Renal/Urinary@; \
s@ReproductiveSystem@Reproductive@; \
s@RespiratorySystem@Respiratory@; \
s@SkinCoatNails@Skin/Coat/Nails@; \
s@TasteOlfaction@Taste/Smell@; \
s@TouchVibrissae@Touch@; \
s@Tumorigenesis@Tumorigenesis@; \
s@VisionEye@Vision/Eye@;'`
echo $type
../2006_10/parsePhenotype.pl $f \
| ldHgGene mm8 placeholder stdin -nobin -out=stdout \
| /cluster/bin/scripts/genePredToBed \
| sed -e 's@$@'"\t$type"'@' \
>> jaxPhenotype.bed
end
sort -u jaxPhenotypeAlias.tab > tmp
mv tmp jaxPhenotypeAlias.tab
# Jax QTL track
# QTL*.gff --> jaxQtl2 (or 3?)... but we're missing MIT SSLP marker
# and CM distance for 2, or those plus flanking markers for 3...
perl -wpe 'chomp; s/\s*$//; \
($chr, undef, undef, $start, $end, undef, $strand, undef, $info) = \
split("\t"); \
if ($info =~ /QTL (\w+); Dbxref "(MGI:\d+)"; Alias .*; Note "([^"]+)"/) { \
($name, $mgiID, $desc) = ($1, $2, $3); \
} else { die "parse\n$info"; } \
$start--; \
s/^.*$/$chr\t$start\t$end\t$name\t1000\t$strand\t\t$mgiID\t$desc\t0.0\n/;' \
QTL_build36_03_alias.gff > jaxQtl.bed
# Extract phenotype-allele relationships:
# Make a file for the one code not already in a filename:
cp /dev/null MP_0003012_no_phenotypic_analysis
# Wrote a script to extract the phenotype-allele relationships --
# it uses the filenames to map MP:* codes to our phenotype names.
../2007_07/parsePhenotypicAllele.pl MGI_PhenotypicAllele.rpt \
> jaxAllelePheno.tab
# The file "err" has messages about missing data (no gene name in
# PhenotypicAllele.rpt, or gene/mgiId not found in jaxAlleleInfo).
# Load tables
ssh hgwdev
cd /cluster/data/mm8/bed/jax/2007_09
# jaxRepTranscript
ldHgGene mm8 jaxRepTranscript jaxRepTranscript.gff
hgsql mm8 < fixJaxRepTranscript.sql
sed -e 's/genericAlias/jaxRepTranscriptAlias/g' \
~/kent/src/hg/lib/genericAlias.sql > jaxRepTranscriptAlias.sql
hgLoadSqlTab mm8 jaxRepTranscriptAlias \
jaxRepTranscriptAlias.sql jaxRepTranscriptAlias.tab
# jaxAllele
sed -e 's/bed12Source/jaxAllele/g' \
$HOME/kent/src/hg/lib/bed12Source.sql > jaxAllele.sql
hgLoadBed -sqlTable=jaxAllele.sql mm8 jaxAllele jaxAllele.bed
hgsql mm8 < fixJaxAllele.sql
hgLoadSqlTab mm8 jaxAlleleInfo \
~/kent/src/hg/lib/jaxAlleleInfo.sql jaxAlleleInfo.tab
# jaxPhenotype
sed -e 's/bed12Source/jaxPhenotype/g' \
$HOME/kent/src/hg/lib/bed12Source.sql > jaxPhenotype.sql
hgLoadBed -tab -sqlTable=jaxPhenotype.sql mm8 jaxPhenotype jaxPhenotype.bed
hgsql mm8 < fixJaxPhenotype.sql
sed -e 's/genericAlias/jaxPhenotypeAlias/' \
~/kent/src/hg/lib/genericAlias.sql > jaxPhenotypeAlias.sql
hgLoadSqlTab mm8 jaxPhenotypeAlias \
jaxPhenotypeAlias.sql jaxPhenotypeAlias.tab
### NOTE FOR NEXT TIME ###
### Call the table jaxQtl instead of jaxQTL -- QA doesn't like jaxQTL.
+### (brooke) In fact, QA renamed the table to jaxQtl on 1/7/10 on hgwdev and
+### mysqlbeta with this command: mysql> alter table jaxQTL rename to jaxQtl;
+### (to make trackDb load with a single trackDb.ra entry for mm8 and mm9)
### Use -sqlTable=$HOME/kent/src/hg/lib/jaxQtl.sql .
# jaxQTL
hgLoadBed -tab -notItemRgb -noBin \
-sqlTable=$HOME/kent/src/hg/lib/jaxQTL.sql \
mm8 jaxQTL jaxQtl.bed
checkTableCoords -verbose=2 mm8 jaxQTL
#mm8.jaxQTL item Scpro11 chr18:131504376-131504512: chromEnd > chromSize 90736837
#mm8.jaxQTL item Tswt chr18:134822025-134822132: chromEnd > chromSize 90736837
#mm8.jaxQTL item Ath13 chr14:164794113-164794369: chromEnd > chromSize 123978870
#mm8.jaxQTL item Dob7 chr11:131434708-131434798: chromEnd > chromSize 121798632
# Fix coords > chromSize:
perl -wpe 's/^(\w+)\t(\d+)$/ \
delete from jaxQTL where chrom="$1" and chromStart >= $2; \
update jaxQTL set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \
../../../chrom.sizes \
| hgsql mm8
checkTableCoords -verbose=2 mm8 jaxQTL
# phenotype-allele relationships
hgLoadSqlTab mm8 jaxAllelePheno \
~/kent/src/hg/lib/jaxAllelePheno.sql jaxAllelePheno.tab
# Check joiner:
runJoiner.csh mm8 jaxAllele
runJoiner.csh mm8 jaxPhenotype
##########################################################################
# SWAP/CHAIN/NET GASACU1 (DONE 10/23/06 angie)
ssh kkstore04
mkdir /cluster/data/mm8/bed/blastz.gasAcu1.swap
cd /cluster/data/mm8/bed/blastz.gasAcu1.swap
doBlastzChainNet.pl -swap /cluster/data/gasAcu1/bed/blastz.mm8/DEF \
-chainMinScore=2000 -chainLinearGap=loose >& do.log & tail -f do.log
ln -s blastz.gasAcu1.swap /cluster/data/mm8/bed/blastz.gasAcu1
nice featureBits mm8 chainGasAcu1Link
#52781141 bases of 2567283971 (2.056%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET FELCAT3 (Done Nov 15 2006 heather)
# working in /cluster/data/felCat3 because /cluster/data/mm8 is 94% full
mkdir /cluster/data/felCat3/bed/blastz.mm8.2006-11-14
ln -s /cluster/data/felCat3/bed/blastz.mm8.2006-11-14 /cluster/data/mm8/bed/blastz.felCat3
cd /cluster/data/felCat3/bed/blastz.mm8.2006-11-14
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Mouse mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Cat felCat3
SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit
SEQ2_LEN=/san/sanvol1/scratch/felCat3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/felCat3/bed/blastz.mm8.2006-11-14
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
# Run the blastz/chain/net pipeline in the background, logging to do.log.
# Every option line carries a trailing backslash so the shell sees one
# command; without them the options after "DEF" are dropped and then run as
# commands of their own.
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/felCat3/blastz.mm8 >& do.log &
tail -f do.log
# Coverage check on chr1 once the chain table is loaded.
nice featureBits -chrom=chr1 mm8 chainFelCat3Link
# 36333124 bases of 191450312 (18.978%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET BOSTAU3 (Done March 2007 heather)
mkdir /cluster/data/mm8/bed/blastz.bosTau3.2007-03-14
ln -s /cluster/data/mm8/bed/blastz.bosTau3.2007-03-14 /cluster/data/mm8/bed/blastz.bosTau3
cd /cluster/data/mm8/bed/blastz.bosTau3
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Mouse mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau3
SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit
SEQ2_LEN=/san/sanvol1/scratch/bosTau3/chrom.sizes
SEQ2_LIMIT=500
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastz.bosTau3.2007-03-14
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/bosTau3/blastz.mm8 >& do.log &
tail -f do.log
nice featureBits -chrom=chr1 mm8 chainBosTau3Link
# 49896121 bases of 191450312 (26.062%) in intersection
#############################################################################
# REBUILD miRNA TRACK (DONE - 2006-12-01 - Fan)
# updated data from: Michel.Weber@ibcg.biotoul.fr
# notify them when done.
ssh hgwdev
cd /cluster/data/mm8/bed
mkdir miRNA-2006-12-01
cd miRNA-2006-12-01
# save the mmu8_miRNA.txt file from email
# add the following line in mmu8_miRNA.txt per email from Michel
# (this is a data row to paste into the file, not a command):
#	chrM 16114 16209 mmu-mir-805 480 -
hgLoadBed -strict mm8 miRNA mmu8_miRNA.txt
# check previous release track before update
featureBits mm8 miRNA
# 33033 bases of 2567283971 (0.001%) in intersection
featureBits mm7 miRNA
# 20620 bases of 2583394090 (0.001%) in intersection
#############################################################################
# Create Allen Brain Atlas mapping. (Done 2007-02-08 Galt)
# We are creating several things: a psl probe-track for the RR on mouse,
# a link out from kg to the probe to the ABA website,
# and a set of gene/probe info which visiGene will use.
# (This needs to be done after have created sequences in
# ncbiXm and tigrMgiTc as above.)
# metadata.log and SRGEsequence.log was provided by
# Susan Sunkin <SusanS@alleninstitute.org>
# this is an update to the visiGene with 6000 new images.
# See mm6.txt for steps not needing to be repeated.
# copy in the data files (directory already exists from previous build)
ssh hgwdev
cd /cluster/data/mm8/bed/allenBrain
mkdir old
mv * old/
cp /cluster/data/mm6/bed/allenBrain/allen20061204.tab .
cp /cluster/data/mm6/bed/allenBrain/probeSeq.20061204.fasta .
cp /cluster/data/mm6/bed/allenBrain/allProbes.fa .
cp /cluster/data/mm6/bed/allenBrain/allProbes.tab .
cp /cluster/data/mm6/bed/allenBrain/allenBrainUrl.tab .
# Set up a blat run to align the probes.
ssh pk
cd /cluster/data/mm8/bed/allenBrain
# split the probe sequences into about 200 pieces for the cluster jobs
mkdir split
faSplit sequence allProbes.fa 200 split/rp
mkdir run
cd run
# mrna.lst = probe pieces, genome.lst = mm8 nib files; both lists are
# given to gensub2 below to generate the job spec
ls -1 ../split/*.fa > mrna.lst
ls -1 /scratch/hg/mm8/nib/*.nib > genome.lst
mkdir psl
cat << '_EOF_' > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.lst mrna.lst gsub spec
para create spec
# Then do the usual para try/push/time/check until the run is finished
#Completed: 6596 of 6596 jobs
#CPU time in finished jobs: 27258s 454.30m 7.57h 0.32d 0.001 y
#IO & Wait Time: 19700s 328.33m 5.47h 0.23d 0.001 y
#Average job time: 7s 0.12m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 39s 0.65m 0.01h 0.00d
#Submission to last job: 549s 9.15m 0.15h 0.01d
# Then do sorting and near-best-in-genome step on file server
ssh kkstore
cd /cluster/data/mm8/bed/allenBrain/run
pslSort dirs raw.psl tmp psl
pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null
sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl
# Clean up big files no longer needed
rm raw.psl
rm -r psl
rm -r ../split
# Load up database
ssh hgwdev
cd /cluster/data/mm8/bed/allenBrain
# Make a new table that contains the URLs for the allen brain genes
# Make this one first since all.joiner considers it the master table.
hgsql mm8 -e 'drop table allenBrainUrl'
hgsql mm8 < ~/kent/src/hg/lib/allenBrainUrl.sql
hgsql mm8 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl'
# Make probe alignment table, and load sequence.
hgLoadPsl mm8 allenBrainAli.psl
rm /gbdb/mm8/allenBrain/allProbes.fa
ln -s /cluster/data/mm8/bed/allenBrain/allProbes.fa /gbdb/mm8/allenBrain/allProbes.fa
hgLoadSeq -replace mm8 /gbdb/mm8/allenBrain/allProbes.fa
# Make mapping between known genes and allenBrain
hgMapToGene mm8 allenBrainAli -type=psl knownGene knownToAllenBrain
##########################################################################
# xxBlastTab - Help filter out unwanted paralogs (Galt 2007-01-11)
#
# We are starting with xxBlastTab tables already built in the usual way with
# blastall/blastp, probably with doHgNearBlastp.pl script.
#
# we want to update mm8 for human and rat,
# so check ./hgGeneData/Mouse/mm8/otherOrgs.ra for current settings
ssh hgwdev
synBlastp.csh mm8 hg18
#mm8.hgBlastTab
#new number of unique query values:
#25178
#new number of unique target values
#15328
#old number of unique query values:
#28286
#old number of unique target values
#15901
synBlastp.csh mm8 rn4
#mm8.rnBlastTab:
#new number of unique query values:
#11163
#new number of unique target values
#6573
#old number of unique query values:
#23183
#old number of unique target values
#6890
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna mm8
##########################################################################
## WindowMasker (DONE - 2007-01-30 - Hiram)
ssh kolossus
mkdir /cluster/data/mm8/bed/WindowMasker.2007-01-29
cd /cluster/data/mm8/bed/WindowMasker.2007-01-29
# copy *.csh scripts from
# /cluster/data/danRer4/bed/WindowMasker.2006-12-04
# and fixup the db name and work directory in those scripts, then:
time nice -n +19 ./doCount.csh > doCount.out 2>&1
# real 67m32.178s
time nice -n +19 ./doSdust.csh >doSdust.out 2>&1
# real 477m24.667s
ssh kkstore04
cd /cluster/data/mm8/bed/WindowMasker.2007-01-29
gzip windowmasker.sdust.bed
time nice -n +19 ./applyMask.csh > applyMask.out 2>&1
time nice -n +19 ./addTrf.csh > addTrf.out 2>&1
twoBitToFa mm8.sdTrf.2bit stdout | faSize stdin
# 2664455088 bases (97171400 N's 2567283688 real 1644888505 upper
# 922395183 lower) in 34 sequences in 1 files
ssh hgwdev
cd /cluster/data/mm8/bed/WindowMasker.2007-01-29
##########################################################################
## AUGUSTUS ab initio predictions (DONE, 2007-01-30 - Mario)
ssh hgwdev
mkdir /cluster/data/mm8/bed/augustus
cd /cluster/data/mm8/bed/augustus
# get the program AUGUSTUS, e.g. from the web
wget http://augustus.gobics.de/binaries/augustus.2.0.1.src.tar.gz
# unpack
tar xzf augustus.2.0.1.src.tar.gz
# compile the binary if necessary
cd augustus/src
make augustus
# create output directory
cd /cluster/data/mm8/bed/augustus
mkdir out err
# create file with sequences and their sizes by modifying chrom.sizes
cat ../../chrom.sizes | perl -e 'while(<>){s/chr([0-9a-zA-Z]+)(_random|)/\/cluster\/data\/mm8\/$1\/chr$1$2.fa.masked/; print;}' > seq.lst
# create the job list
# (the --command string reuses the augustus binary and config directory
# that were built under /cluster/data/panTro2/bed/augustus)
augustus/scripts/createAugustusJoblist.pl --sequences seq.lst --chunksize 5300000 --overlap 300000 --command "/cluster/data/panTro2/bed/augustus/augustus/src/augustus --AUGUSTUS_CONFIG_PATH=/cluster/data/panTro2/bed/augustus/augustus/config --species=human --sample=100 --/augustus/verbosity=0" --outputdir /cluster/data/mm8/bed/augustus/out/ --errordir /cluster/data/mm8/bed/augustus/err/ --joblist job.lst
# NOTE(review): the original log went straight to "para try"; a batch has to
# be created from job.lst first, so that step is recorded here -- confirm.
para create job.lst
para try
para check
para push
# CPU time in finished jobs: 2984823s 49747.06m 829.12h 34.55d 0.095 y
# IO & Wait Time: 19258s 320.96m 5.35h 0.22d 0.001 y
# Average job time: 5403s 90.05m 1.50h 0.06d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 7896s 131.60m 2.19h 0.09d
# Submission to last job: 15716s 261.93m 4.37h 0.18d
# check the error files, should be no errors
cat err/*.err
cat out/*.gff | augustus/scripts/join_aug_pred.pl > augustus.pep.gff
augustus/scripts/getAnnoFasta.pl augustus.pep.gff
cat augustus.pep.gff | egrep "CDS|codon"> augustus.gff
# load into database
ssh hgwdev
# Work in the mm8 augustus directory, where augustus.gff was written above.
# (The original log said /cluster/data/panTro2/bed/augustus/ here and loaded
# the peptides into panTro2 -- leftovers from copying the panTro2 recipe.)
cd /cluster/data/mm8/bed/augustus
ldHgGene -bin mm8 augustus augustus.gff
# 32377 gene predictions
# augustus.pep.aa is presumably the output of getAnnoFasta.pl above -- confirm
hgPepPred mm8 generic augustusPep augustus.pep.aa
featureBits mm8 augustus
# 35380585 bases of 2567283971 (1.378%) in intersection
#########################################################################
## BLASTZ ANOCAR1 - Lizard - (DONE - 2007-02-19 - 2007-02-20 - Hiram)
ssh kkstore04
mkdir /cluster/data/mm8/bed/blastz.anoCar1.2007-02-19
cd /cluster/data/mm8/bed/blastz.anoCar1.2007-02-19
cat << '_EOF_' > DEF
# Mouse vs lizard
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse Mm8
SEQ1_DIR=/san/sanvol1/scratch/mm8/mm8.sdTrf.2bit
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastz.anoCar1.2007-02-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=2 -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/mm8AnoCar1 > do.log 2>&1 &
# real 544m52.722s
# appears to have successfully finished
ssh hgwdev
cd /cluster/data/mm8/bed/blastz.anoCar1.2007-02-19
time nice -n +19 featureBits mm8 chainAnoCar1Link \
> fb.mm8.chainAnoCar1Link.txt 2>&1
# real 1m37.380s
# 96286498 bases of 2567283971 (3.751%) in intersection
# running the swap to anoCar1 - instructions in anoCar1.txt
cd /cluster/data/anoCar1/bed/blastz.mm8.swap
time nice -n +19 featureBits anoCar1 chainMm8Link \
> fb.anoCar1.chainMm8Link.txt 2>&1
# real 2m1.527s
# 82784787 bases of 1741478929 (4.754%) in intersection
#############################################################################
# UPDATED mm8.knownToVisiGene (DONE galt 2007-02-15)
#########################################################################
# BLASTZ ORNANA1 (PLATYPUS) - (DONE 2007-03-02 angie)
ssh kkstore04
mkdir /cluster/data/mm8/bed/blastz.ornAna1.2007-02-27
cd /cluster/data/mm8/bed/blastz.ornAna1.2007-02-27
cat << '_EOF_' > DEF
# mouse vs. platypus
# Use same params as used for hg18-danRer4
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: mm8
SEQ1_DIR=/scratch/hg/mm8/nib
SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: ornAna1
SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
SEQ2_LEN=/iscratch/i/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/cluster/data/mm8/bed/blastz.ornAna1.2007-02-27
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF \
-workhorse kkr6u00 \
-blastzOutRoot /cluster/bluearc/mm8.ornAna1 \
>& do.log & tail -f do.log
############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-03-02 markd)
# see hg17.txt for build temporary ccds database for CCDS.20070228
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds mm8 ccdsInfo ccdsGene
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=mm8 -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords mm8 -verbose=2 ccdsGene
# update all.joiner to include mm8 in ccdsDb
joinerCheck -database=mm8 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=mm8 ccdsGene mgcGenes ccdsMgcMap
# load trackDb
cd kent/src/hg/makeDb/trackDb
make alpha
# check in browser
# request push of these tables (listed as comments so that pasting this
# section into a shell does not try to run the table names as commands):
#	ccdsGene
#	ccdsInfo
#	ccdsKgMap
#	ccdsMgcMap
# << emacs
############################################################################
# CGAP SAGE (DONE Andy 2007-03-01)
ssh hgwdev
cd san/andy/mouseSage/
wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm.libraries.gz
wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm_long.frequencies.gz
wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_mm_long_forward_v36.1.tar.gz
wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_mm_long_reverse_v36.1.tar.gz
tar xvfz SAGE_mm_long_forward_v36.1.tar.gz
tar xvfz SAGE_mm_long_reverse_v36.1.tar.gz
rm *.tar.gz
chmod a+r -R mm_*
chmod +x mm_*
cd mm_forward/
cat * | awk 'BEGIN{OFS="\t"}{print $1, $3, $4, $2, 1000, "+"}' > ../unlifted.bed
cd ../mm_reverse/
cat * | awk 'BEGIN{OFS="\t"}{print $1, $4, $3, $2, 1000, "-"}' >> ../unlifted.bed
ctgPosToLft mm8 mm8.lft
liftUp lifted.bed mm8.lft warn unlifted.bed
# Build the 8-column mapping.bed from lifted.bed.  On "-" strand rows
# thickStart is the end coordinate; on other rows start is decremented by 1
# and thickStart is start - 4.  thickEnd is always thickStart + 4.
# The awk program must stay on ONE line: when it was wrapped after
# "thickStart = start", awk treated the following "- 4" as a separate
# statement (and csh does not allow a newline inside single quotes).
awk 'BEGIN{OFS="\t"}{strand = $6; start = $2; end = $3; if (strand == "-") { thickStart = end; } else { start = start - 1; thickStart = start - 4; } thickEnd = thickStart + 4; print $1, start, end, $4, $5, strand, thickStart, thickEnd; }' lifted.bed > mapping.bed
gunzip *.gz
rm -rf mm_forward/ mm_reverse/ unlifted.bed lifted.bed mm8.lft
# Rewrite column 13 (sex) of Mm.libraries as a comma-terminated list:
# "unknown" -> "", "male and female" -> "male,female,", "male" -> "male,",
# anything else -> "female,".  Columns 1-12 and 14-21 pass through unchanged.
# The awk program must stay on ONE line: the earlier wrap fell inside the
# string literal "male and female", which is a syntax error.
# "tail -n +2" drops the first output line; the old "tail +2" spelling is
# not accepted by current tail implementations.
awk 'BEGIN{FS="\t"}{sex = $13; for (i=1; i<=12; i++) { printf("%s\t", $i); } if (sex == "unknown") { sex = ""; } else if (sex == "male and female") { sex = "male,female,"} else if (sex == "male") { sex = "male,"} else {sex = "female,"}; printf("%s\t", sex); for (i=14; i<=20; i++) { printf("%s\t", $i); } print $21}' Mm.libraries | tail -n +2 > massaged.Mm.libraries
cgapSageBedAddFreqs -noEmpty mapping.bed Mm_long.frequencies massaged.Mm.libraries cgapSage.bed
ln -s ~/hg/lib/cgapSage/cgapSageLib.sql
ln -s ~/hg/lib/cgapSage/cgapSage.sql
hgLoadBed -sqlTable=cgapSage.sql mm8 cgapSage cgapSage.bed
hgLoadSqlTab mm8 cgapSageLib cgapSageLib.sql massaged.Mm.libraries
############################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-04-02)
ssh kkstore04
bash
mkdir /cluster/data/mm8/blastDb
cd /cluster/data/mm8
ls noMask/*.fa | grep -v random > temp.lst
ls randomContigs/*.fa >> temp.lst
cat `cat temp.lst` > temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
rm temp.fa
cd blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
mkdir -p /san/sanvol1/scratch/mm8/blastDb
cd /cluster/data/mm8/blastDb
for i in nhr nin nsq;
do
echo $i
cp *.$i /san/sanvol1/scratch/mm8/blastDb
done
mkdir -p /cluster/data/mm8/bed/tblastn.hg18KG
cd /cluster/data/mm8/bed/tblastn.hg18KG
echo /san/sanvol1/scratch/mm8/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 2733 query.lst
# we want around 150000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(150000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(150000/2733) = 669.165940
mkdir -p /cluster/bluearc/mm8/bed/tblastn.hg18KG/kgfa
split -l 670 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/mm8/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/mm8/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/mm8/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/mm8/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/mm8/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/mm8/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
exit # back to bash
ssh pk
cd /cluster/data/mm8/bed/tblastn.hg18KG
para create blastSpec
para time
# Completed: 150315 of 150315 jobs
# CPU time in finished jobs: 24349624s 405827.07m 6763.78h 281.82d 0.772 y
# IO & Wait Time: 1825515s 30425.24m 507.09h 21.13d 0.058 y
# Average job time: 174s 2.90m 0.05h 0.00d
# Longest finished job: 673s 11.22m 0.19h 0.01d
# Submission to last job: 79743s 1329.05m 22.15h 0.92d
ssh kkstore04
cd /cluster/data/mm8/bed/tblastn.hg18KG
for i in blastOut/*
do
echo "cd $i; cat *.psl | pslSortAcc nohead chrom /tmp/ stdin ; cd ../.."
done > sort.jobs
sh -x sort.jobs
tcsh
mkdir chainRun
cd chainRun
# gensub2 template: one chainOne job per input psl
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
# chainOne runs simpleChain on the psl given as $1 and writes the result to
# c.<basename of $1>.psl in the same directory as the input.  The trailing
# backslash keeps the output path on the simpleChain command line; without
# it the script would try to execute the output path as a second command.
# The #!/bin/sh line matches the blastSome job script above.
cat << '_EOF_' > chainOne
#!/bin/sh
/cluster/home/braney/bin/x86_64/simpleChain -prot -outPsl -maxGap=150000 $1 \
`dirname $1`/c.`basename $1`.psl
'_EOF_'
chmod +x chainOne
ls ../blastOut/*/chrom/*.psl > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh pk
cd /cluster/data/mm8/bed/tblastn.hg18KG/chainRun
para create chainSpec
para maxNode 30
# then the usual: para try, check, push, check etc.
# two batches
# Completed: 2574 of 2574 jobs
# CPU time in finished jobs: 3338223s 55637.04m 927.28h 38.64d 0.106 y
# IO & Wait Time: 21934s 365.57m 6.09h 0.25d 0.001 y
# Average job time: 1305s 21.76m 0.36h 0.02d
# Longest finished job: 88204s 1470.07m 24.50h 1.02d
# Submission to last job: 92614s 1543.57m 25.73h 1.07d
# Completed: 2871 of 2871 jobs
# CPU time in finished jobs: 2495054s 41584.24m 693.07h 28.88d 0.079 y
# IO & Wait Time: 47207s 786.78m 13.11h 0.55d 0.001 y
# Average job time: 885s 14.76m 0.25h 0.01d
# Longest finished job: 59971s 999.52m 16.66h 0.69d
# Submission to last job: 78852s 1314.20m 21.90h 0.91d
ssh kkstore04
cd /cluster/data/mm8/bed/tblastn.hg18KG/blastOut
bash
for i in kg??
do
cat $i/chrom/c.*.psl|awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/mm8/bed/tblastn.hg18KG/preLift.psl
cd /cluster/data/mm8/bed/tblastn.hg18KG
liftUp -type=.psl -nohead stdout ../../jkStuff/liftAll.lft carry preLift.psl | sort -k 14,14 -k 16,16n -k 17,17n > blastHg18KG.psl
pslCheck blastHg18KG.psl
# load table
ssh hgwdev
cd /cluster/data/mm8/bed/tblastn.hg18KG
hgLoadPsl mm8 blastHg18KG.psl
# check coverage
nice featureBits mm8 blastHg18KG
# 40445290 bases of 2567283971 (1.575%) in intersection
# In comparison to cat and dog:
nice featureBits felCat3 blastHg18KG
# 15218612 bases of 1642698377 (0.926%) in intersection
nice featureBits canFam2 blastHg18KG
# 32565727 bases of 2384996543 (1.365%) in intersection
featureBits mm8 refGene:cds blastHg18KG -enrichment
# refGene:cds 1.157%, blastHg18KG 1.575%, both 0.927%, cover 80.15%, enrich
# 50.88x
ssh kkstore04
rm -rf /cluster/data/mm8/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/mm8/bed/tblastn.hg18KG/blastOut
#end tblastn
# EXONIPHY MM8, lifted from hg18 (DONE acs 2007-04-08)
ssh hgwdev
cd /cluster/data/mm8/bed
mkdir exoniphy
cd exoniphy
hgLoadGenePred -genePredExt mm8 exoniphy exoniphyMm8.gp
# exoniphyMm8.gp was prepared at Cornell as follows
hgsql hg18 -e "select * from exoniphy" --skip-column-names > exoniphyHg18.gp
liftOver -genePred exoniphyHg18.gp /usr/data/hg18/dbDerived/netSynteny/hg18.mm8.syn.chain exoniphyMm8.gp unmapped
# (where hg18.mm8.syn.chain represents the human/mouse syntenic net)
#########################################################################
# BLASTZ/CHAIN/NET HORSE (DONE 2/21/07 Fan)
ssh kkstore05
mkdir /cluster/data/equCab1/bed/blastz.mm8.2007-02-17
cd /cluster/data/equCab1/bed/blastz.mm8.2007-02-17
cat << '_EOF_' > DEF
# Horse vs. Mouse
BLASTZ_M=50
# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500
SEQ1_CHUNK=30000000
SEQ1_LAP=10000
# QUERY: Mouse mm8
SEQ2_DIR=/scratch/hg/mm8/mm8.2bit
SEQ2_LEN=/cluster/data/mm8/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/equCab1/bed/blastz.mm8.2007-02-17
TMPDIR=/scratch/tmp
'_EOF_'
# Fix script coloring _EOF_
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab1/blastz.mm8 >& do.log &
tail -f do.log
ssh hgwdev
cd /cluster/data/equCab1/bed/blastz.mm8.2007-02-17
ln -s blastz.mm8.2007-02-17 /cluster/data/equCab1/bed/blastz.mm8
nice featureBits equCab1 -chrom=chr1 chainMm8Link
# 70800969 bases of 177498097 (39.888%) in intersection
bash
time nice -n 19 featureBits equCab1 chainMm8Link \
> fb.equCab1.chainMm8Link.txt 2>&1
# 903993981 bases of 2421923695 (37.325%) in intersection
ssh kkstore05
mkdir /cluster/data/mm8/bed/blastz.equCab1.swap
cd /cluster/data/mm8/bed/blastz.equCab1.swap
bash
time doBlastzChainNet.pl \
/cluster/data/equCab1/bed/blastz.mm8.2007-02-17/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 &
tail -f swap.log
# real 76m34.873s
ssh hgwdev
cd /cluster/data/mm8/bed/blastz.equCab1.swap
bash
time nice -n 19 featureBits mm8 chainEquCab1Link \
> fb.mm8.chainEquCab1Link.txt 2>&1
# 906568751 bases of 2567283971 (35.312%) in intersection
#########################################################################
# CGAP SAGE (Done 2007-05-04)
ssh hgwdev
cd /san/sanVol1/scratch/andy
mkdir cgapSage.mm8
cd cgapSage.mm8
wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm.libraries.gz
wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm_long.frequencies.gz
# Collect the class="single", locType="exact" SNPs.  "tail -n +2" drops the
# hgsql column-header row (the old "tail +2" spelling is not accepted by
# current tail implementations); "cut -f2-" drops the first column.
hgsql -e 'select * from snp126 where class="single" and locType="exact"' mm8 \
| tail -n +2 | cut -f2- > snps.txt
# Names of SNPs flagged with any of these four exceptions ...
hgsql -e 'select name from snp126Exceptions where exception="ObservedWrongSize"
or exception="SingleClassBetweenLocType" or exception="SingleClassRangeLocType"
or exception="MultipleAlignment"' mm8 \
| tail -n +2 > exceptions
# ... are removed from the SNP list.
tabGrep -v exceptions 4 snps.txt > tmp
mv tmp snps.txt
rm exceptions
# Keep only the SNPs that do not overlap a simpleRepeat item.
hgsql -e 'select chrom,chromStart,chromEnd,name from simpleRepeat' mm8 | tail -n +2 > trf.bed
cut -f1-4 snps.txt > snps.bed
overlapSelect -nonOverlapping trf.bed snps.bed /dev/stdout | cut -f4 > goodSnps.txt
tabGrep goodSnps.txt 4 snps.txt > tmp
mv tmp snps.txt
rm trf.bed goodSnps.txt snps.bed
ln -s /cluster/data/mm8/mm8.2bit
ln -s /cluster/data/mm8/chrom.sizes
ln -s ~/kent/src/hg/lib/cgapSage/cgapSageLib.sql
# NOTE(review): Mm.libraries.gz is fetched above but no gunzip step is
# recorded, and cleanLibs.awk is not shown in this doc -- confirm both.
tail -n +2 Mm.libraries | awk -f cleanLibs.awk > libs.txt
hgLoadSqlTab mm8 cgapSageLib cgapSageLib.sql libs.txt
partitionSequence.pl -lstDir small 5000000 30 mm8.2bit chrom.sizes 0 > sequence.lst
grep -v small sequence.lst > seq.lst
cat small/* >> seq.lst
mv seq.lst sequence.lst
rm -rf small/
for part in `cat sequence.lst`; do ./doJobList.sh $part >> jobList; done
ssh pk
cd /san/sanVol1/scratch/andy/cgapSage.mm8
para create jobList
para try
para push
# takes like 5-10 min
exit # back to hgwdev
find output/ -name '*.bed' -exec cat '{}' >> output.bed \;
cgapSageDupeRemove output.bed tmp.bed
cgapSageDupeRemove -unique tmp.bed final.bed
ln -s ~/kent/src/hg/lib/cgapSage/cgapSage.sql
hgLoadBed -sqlTable=cgapSage.sql -tab mm8 cgapSage final.bed
#############################################################################
# REBUILD miRNA TRACK (DONE - 2007-05-31 - Fan)
# updated data from: Michel.Weber@ibcg.biotoul.fr
# notify them when done.
ssh hgwdev
cd /cluster/data/mm8/bed
mkdir miRNA-2007-05-31
cd miRNA-2007-05-31
# save the mouse_miRNA_track_may2007.txt file from email
cat mouse_miRNA_track_may2007.txt|sed -e 's/ /\t/g' > miRNA.tab
hgLoadBed mm8 miRNA miRNA.tab
# check previous release track before update
featureBits mm8 miRNA
#33398 bases of 2567283971 (0.001%) in intersection
featureBits mm7 miRNA
# 20620 bases of 2583394090 (0.001%) in intersection
#############################################################################
# LIFTOVER TO MM9 (DONE 7/25/07 angie)
ssh kkstore04
# -debug run to create run dir, preview scripts...
doSameSpeciesLiftOver.pl -debug mm8 mm9 \
-ooc /san/sanvol1/scratch/mm8/11.ooc
# Real run:
cd /cluster/data/mm8/bed/blat.mm9.2007-07-24
doSameSpeciesLiftOver.pl mm8 mm9 \
-ooc /san/sanvol1/scratch/mm8/11.ooc \
>& do.log & tail -f do.log
#############################################################################
# CONTRAST GENES (2007-10-02 markd)
# received predictions from Sam Gross <ssgross@stanford.edu>
cd /cluster/data/mm8/bed/contrastGene/
wget http://www.stanford.edu/~ssgross/contrast.mm8.bed
# this is a custom track, not a pure BED
tail +2 contrast.mm8.bed | hgLoadBed -tab mm8 contrastGene stdin
# verify
# load track db (ra and contrastGene.html are global)
# request push of contrastGene
###########################################################################
# loading affy mouse Exon probes and transcripts (DONE - 2007-10-04 - Hiram)
# data was supplied from Venu Valmeekam Venu_Valmeekam@affymetrix.com
# dropped via FTP to genome-test
ssh hgwdev
mkdir /cluster/data/mm8/bed/affyMoEx1
cd /cluster/data/mm8/bed/affyMoEx1
# the files received:
# -rw-r--r-- 1 8909954 Oct 3 10:48 transcript_cluster_mm.bed.gz
# -rw-r--r-- 1 48178714 Oct 4 13:35 probe_mm_score.bed.gz
# loading:
hgLoadBed -tmpDir=/scratch/tmp mm8 affyMoEx1Probe probe_mm_score.bed.gz
# Loaded 4549897 elements of size 6
hgLoadBed -tmpDir=/scratch/tmp mm8 affyMoEx1Transcript \
transcript_cluster_mm.bed.gz
# Loaded 270140 elements of size 12
# working on description pages for these with Venu.
# I manually set the scores in the affyMoEx1Transcript track to
# 1000 so it would work OK (not color) with the useScore 1 so that
# the affyMoEx1Probe would color itself on the score
###########################################################################
# LIFT RM ALIGN FILES, MAKE PER-CHROM DOWNLOADS (DONE 12/7/07 angie)
# Lifting of .align files is now automated by doRepeatMasker.pl, but we
# got a user request for .align files from this pre-automation db.
ssh kkstore04
cd /cluster/data/mm8
mkdir downloads/RMalign
foreach c (?{,?})
echo linking/lifting to contigs of $c:t
foreach ctgdir ($c/chr$c{,_random}_?{,?})
set ctg = $ctgdir:t
if (! -f $ctgdir/$ctg.fa.align) then
pushd $ctgdir
liftRMAlign.pl $ctg.lft > $ctg.fa.align
popd
endif
ln -s $ctg/$ctg.fa.align $c/
end
set chr = chr$c:t
if (-e $c/lift/ordered.lft && ! -z $c/lift/ordered.lft) then
echo lifting contigs to chr$c
liftRMAlign.pl $c/lift/ordered.lft \
| gzip -c > downloads/RMalign/$chr.fa.align.gz
endif
if (-e $c/lift/random.lft && ! -z $c/lift/random.lft) then
echo lifting contigs to chr${c}_random
liftRMAlign.pl $c/lift/random.lft \
| gzip -c > downloads/RMalign/${chr}_random.fa.align.gz
endif
end
# Got some messages like these for chunks that fall entirely
# within gaps (e.g. centromere, huge unbridged...)
#FYI Couldn't open chr1_1_00.fa.align: No such file or directory
#...
#FYI Couldn't open chr1_1_05.fa.align: No such file or directory
#FYI Couldn't open chr1_17_02.fa.align: No such file or directory
#...
md5sum downloads/RMalign/*.gz > downloads/RMalign/md5sum.txt
ssh hgwdev ln -s /cluster/data/mm8/downloads/RMalign \
/usr/local/apache/htdocs/goldenPath/mm8/
############################################################################
# Reload CCDS (2007-12-12 markd)
# import ccds database as described in ccds.txt
set db=mm8
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
#   ccdsGene
#   ccdsInfo
#   ccdsKgMap
#   ccdsMgcMap
# << emacs
############################################################################
# Reload CCDS (2008-02-01 markd)
# import ccds database as described in ccds.txt
set db=mm8
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
#   ccdsGene
#   ccdsInfo
#   ccdsKgMap
# << emacs
############################################################################
# Broad whole-genome ChIP-Seq in stem and progenitor cells
# Mikkelsen et al., Nature Aug. 2, 2007
# Requested by David Haussler
# 21 data sets, ~4M sequences/dataset
# 7 antibodies (histone meth & pol2),
# 4 cell sources (ES, NP, MEF, ES+)
# alignments/ sequences and mappings for 27bp reads
# format: chrom, start, end, strand, read_id, mismatches, sequence
# densities/ indication of #reads near the base, 25bp fixed window, -1 if unalignable base
# Allele-specific fragment counts
# format: chr start allele1 allele2 # #
# Enriched intervals by HMM
# BED3
# Enriched intervals by fixed-size windows
# BED3
# Also, gene expression data
# Track organization:
# Broad ChIP ES supertrack, with tracks:
# - Broad Stem ChIP Seq (read alignments)
# - Broad Stem ChIP Sig (density in 25bp windows)
# - Broad Stem ChIP Sites (regions from HMM, windowing)
# Each track has subtracks for different cell types and antibodies
# Also, a track for the expression data: Broad ES
#
ssh kkstore04
cd /cluster/data/mm8/bed
mkdir -p broadStemChip
cd broadStemChip/
wget -r ftp://ftp.broad.mit.edu/pub/papers/chipseq/
mv pub/papers/chipseq .
rm -fr pub
# original data
ln -s chipseq lab
cd lab
###############
# Sites track
# HMM Sites -- BED3
mkdir -p hmmSites
cd hmmSites
tar xvfz ../HMMIntervals.tar.gz
ssh hgwdev
cd /cluster/data/mm8/bed/broadStemChip
# Write hmmSites.csh: load every Broad HMM interval file (BED3 after the
# header line is dropped) into its own mm8 table.
# Run it from /cluster/data/mm8/bed/broadStemChip.
cat > hmmSites.csh << 'EOF'
# The HMM_ES_*.txt files were unpacked into lab/hmmSites, and lab is a
# symlink to chipseq, so address them relative to broadStemChip in the
# same way windowSites.csh addresses chipseq/windowSites.
foreach f (chipseq/hmmSites/HMM_ES_*.txt)
    set b = $f:t
    # HMM_ES_<mark>.txt  ->  H3<mark>me3
    set ab = `echo $b | perl -wpe 's/HMM_ES_(.+).txt/H3$1me3/'`
    echo $ab
    # skip the first line and prefix each remaining line with chr
    tail +2 $f | sed 's/^/chr/' | \
        hgLoadBed mm8 broadStemChipHmmSites${ab}Es stdin
end
'EOF'
# Fix script coloring EOF
csh hmmSites.csh >&! hmmSites.log
# Loaded 1788 - 19523 elements in 5 tracks
# H3K{20,27,36,4,9}me3
mkdir -p WindowSites
cd WindowSites
tar xvfz ../WindowIntervals.tar.gz
cd ..
awk '{print $4}' *K*.txt | sort -n | head -1
# Sites from Window algorithm -- BED3 plus float score
# min: 2.75, max: 275.50
# distribution of data values:
awk '{print $4}' *K*.txt | sort | textHistogram -binSize=10 maxBinCount=30 -real stdin
# 0.000000 ************************************************************ 38346
# 10.000000 ************************** 16385
# 20.000000 ************** 9186
# 30.000000 ********* 5705
# 40.000000 ******* 4607
# 50.000000 ****** 3686
# 60.000000 **** 2243
# 70.000000 ** 1094
# 80.000000 * 382
# 90.000000 112
# 100.000000 31
# 110.000000 10
# 120.000000 3
# 130.000000 2
# 140.000000 0
# 150.000000 0
# 160.000000 0
# 170.000000 0
# 180.000000 0
# 190.000000 1
# 200.000000 0
# 210.000000 0
# 220.000000 2
# 230.000000 0
# 240.000000 0
# 250.000000 0
# 260.000000 0
# 270.000000 1
# To range score display from 300 to 1000, use:
# (x * 2) + 300
mkdir windowSites
cat > windowSites.csh << 'EOF'
foreach f (chipseq/windowSites/*.K*.txt)
set b = $f:t
set ab = `echo $b | perl -wpe 's/\w+.(\w)(\w+).txt/H3\u$1\L$2me3/'`
set cell = `echo $b | perl -wpe 's/(\w)(\w+).*/\u$1\L$2/'`
tail +2 $f | awk '{printf "%s\t%d\t%d\t \t%d\t%s\n", $1, $2, $3, ($4 * 2) + 300, $4}' > windowSites/$cell.$ab.tab
# using kate's version, testing -renameSqlTable option
/cluster/home/kate/bin/x86_64/hgLoadBed mm8 -tab -noNameIx -renameSqlTable \
-sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \
broadStemChipWinSites${ab}${cell} windowSites/$cell.$ab.tab
end
'EOF'
# Fix script coloring EOF
csh windowSites.csh >&! windowSites.log
###############
# Signal track
# indication of #reads near the base, 25bp fixed window, -1 if unalignable base
ssh kkstore04
cd /cluster/data/mm8/bed/broadStemChip/lab/densities
mkdir -p alignable
cd alignable
tar xvfz ../alignable.tar.gz
cd ../..
# Get a list of the datasets
mkdir -p signal
tar tfz chipseq/densities/chr1.tar.gz | \
perl -wpe 's/chr\w.(\w+.\w+).txt/$1/' > signal/datasets.txt
# ignore control (whole-cell extract)
grep -v WCE signal/datasets.txt > signal/subtracks.txt
wc -l signal/subtracks.txt
# 18
# Extract datasets from by-chrom packaging
# Weed out missing data which are represented as -1 values
# Convert to wiggle
# First version of makeWig.csh: for each dataset in signal/subtracks.txt,
# stream every per-chrom density file through fixStepToBedGraph.pl, drop
# values ending in -1, convert back with wigBedToStep and wigEncode the result.
# The NEWER here-document that follows writes to the same file name, so this
# version is overwritten before makeWig.csh is run.
cat > makeWig.csh << 'EOF'
foreach s (`cat signal/subtracks.txt`)
set ab = `echo $s | perl -wpe 's/\w+.(\w)(\w+)/H3\u$1\L$2/'`
set cell = `echo $s | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'`
set table = broadStemChipSignal${ab}${cell}
echo $table
rm -f signal/$s.wigVar
foreach f (chipseq/densities/chr*.tar.gz)
set c = $f:t:r:r
(echo "fixedStep chrom=$c start=1 step=25 span=25"; \
tar xfzO $f $c.$s.txt) | \
nice fixStepToBedGraph.pl | \
nice grep -v '\-1$' | \
nice wigBedToStep stdin stdout >> signal/$s.wigVar
end
nice wigEncode signal/$s.wigVar signal/$s.wig signal/$s.wib
end
'EOF'
# Fix script coloring EOF
# NEWER
# Rewrite makeWig.csh: for each dataset in signal/subtracks.txt build one
# variableStep wiggle covering all chroms, dropping the -1 values that mark
# unalignable bases, then wigEncode it under the table name.
cat > makeWig.csh << 'EOF'
foreach s (`cat signal/subtracks.txt`)
    set ab = `echo $s | perl -wpe 's/\w+.(\w)(\w+)/\u$1\L$2/'`
    set cell = `echo $s | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'`
    set table = broadStemChipSignal${ab}${cell}
    echo $table
    # Output is accumulated with >> under the table name, so that is the
    # file that has to be cleared before a rerun.
    rm -f signal/$table.wigVar
    foreach f (chipseq/densities/chr*.tar.gz)
        set c = $f:t:r:r
        echo "variableStep chrom=$c span=25" >> signal/$table.wigVar
        # one input value per 25bp window: positions 1, 26, 51, ...
        tar xfzO $f $c.$s.txt | \
            awk 'BEGIN {pos = 1} {print pos, $1; pos += 25}' | \
            grep -v '\-1$' >> signal/$table.wigVar
    end
    cd signal
    nice wigEncode $table.wigVar $table.wig $table.wib
    cd ..
end
'EOF'
# Fix script coloring EOF
# Runs in the background; let it finish before compressing its output.
# (This stanza was recorded twice; it only needs to be run once.)
csh makeWig.csh >&! makeWig.log &
# check output and cleanup
cd signal
gzip *.wigVar
######## Load wiggles?
ssh hgwdev
mkdir /gbdb/mm8/broadStemChip
cd /cluster/data/mm8/bed/broadStemChip
# Write loadWig.csh: for every .wib file in signal/, load the .wig of the
# same base name into an mm8 table of that name with hgLoadWiggle.
# NOTE(review): the "ln -s" line below is only echoed, not executed, so this
# script does not itself create the links under /gbdb/mm8/broadStemChip --
# confirm they were made separately.
cat > loadWig.csh << \_EOF_
#!/bin/csh -fe
cd /cluster/data/mm8/bed/broadStemChip/signal
foreach f (*.wib)
set wi = $f:t:r
set wig = $wi.wig
echo Start: $wig
echo "ln -s `pwd`/$f /gbdb/mm8/broadStemChip/$wi.wib"
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 $wi $wig
echo Finished: $wig
end
_EOF_
chmod +x loadWig.csh
time nice -n +19 ./loadWig.csh >> loadWig.log 2>&1 &
# Try it by hand.
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 broadStemChipSignalH3Es broadStemChipSignalH3Es.wig
# Now Try it again.
cat > loadWig.csh << \_EOF_
#!/bin/csh -fe
cd /cluster/data/mm8/bed/broadStemChip/signal
foreach f (*.wib)
set wi = $f:t:r
set wig = $wi.wig
time hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 $wi $wig
echo Finished: $wig
end
_EOF_
# Try it again.
time nice -n +19 ./loadWig.csh >> loadWig.log 2>&1 &
# | broadStemChipSignalH3Es |
# | broadStemChipSignalK20Es |
# | broadStemChipSignalK27Es |
# | broadStemChipSignalK27Mef |
# | broadStemChipSignalK27Np |
# | broadStemChipSignalK36Es |
# | broadStemChipSignalK36Eshyb |
# | broadStemChipSignalK36Mef |
# | broadStemChipSignalK36Np |
# | broadStemChipSignalK4Es |
# | broadStemChipSignalK4Eshyb |
# | broadStemChipSignalK4Mef |
# | broadStemChipSignalK4Np |
# | broadStemChipSignalK9Es |
# | broadStemChipSignalK9Eshyb |
# | broadStemChipSignalK9Mef |
# | broadStemChipSignalK9Np |
# | broadStemChipSignalRpolEs |
# Noticed tables badly named, renamed them and corresponding files
hgsql mm8
rename table broadStemChipSignalK4Es to broadStemChipSignalH3K4Es ;
rename table broadStemChipSignalK4Eshyb to broadStemChipSignalH3K4Eshyb ;
rename table broadStemChipSignalK4Mef to broadStemChipSignalH3K4Mef ;
rename table broadStemChipSignalK4Np to broadStemChipSignalH3K4Np ;
rename table broadStemChipSignalK9Es to broadStemChipSignalH3K9Es ;
rename table broadStemChipSignalK9Eshyb to broadStemChipSignalH3K9Eshyb ;
rename table broadStemChipSignalK9Mef to broadStemChipSignalH3K9Mef ;
rename table broadStemChipSignalK9Np to broadStemChipSignalH3K9Np ;
rename table broadStemChipSignalK20Es to broadStemChipSignalH4K20Es ;
rename table broadStemChipSignalK27Es to broadStemChipSignalH3K27Es ;
rename table broadStemChipSignalK27Mef to broadStemChipSignalH3K27Mef ;
rename table broadStemChipSignalK27Np to broadStemChipSignalH3K27Np ;
rename table broadStemChipSignalK36Es to broadStemChipSignalH3K36Es ;
rename table broadStemChipSignalK36Eshyb to broadStemChipSignalH3K36Eshyb;
rename table broadStemChipSignalK36Mef to broadStemChipSignalH3K36Mef ;
rename table broadStemChipSignalK36Np to broadStemChipSignalH3K36Np ;
# | broadStemChipSignalH3K4Es |
# | broadStemChipSignalH3K4Eshyb |
# | broadStemChipSignalH3K4Mef |
# | broadStemChipSignalH3K4Np |
# | broadStemChipSignalH3K9Es |
# | broadStemChipSignalH3K9Eshyb |
# | broadStemChipSignalH3K9Mef |
# | broadStemChipSignalH3K9Np |
# | broadStemChipSignalH4K20Es |
# | broadStemChipSignalH3K27Es |
# | broadStemChipSignalH3K27Mef |
# | broadStemChipSignalH3K27Np |
# | broadStemChipSignalH3K36Es |
# | broadStemChipSignalH3K36Eshyb |
# | broadStemChipSignalH3K36Mef |
# | broadStemChipSignalH3K36Np |
# | broadStemChipSignalH3Es |
# | broadStemChipSignalRpolEs |
### ### ### Finished Signals 2008-05-08
######### Alignments
### Sample from ES.H3.txt.gz
# chr10 63848447 63848474 - 3084.4.1 0 GAGAGCCAATGGCTAGGCAGGGCATCA
### Convert to
#chr10 63848447 63848474 3084.4.1 0 - 63848447 63848474 0,255,0 0 GAGAGCCAATGGCTAGGCAGGGCATCA
# convert to bed-9+ color at 9, mismatch at 10 and seq at 11; grabbed some example code from encodeHg17.txt PET
ssh hgwdev
cd /cluster/data/mm8/bed/broadStemChip/lab/alignments
mkdir bed
cd bed
# Write makeBed9PlusFromAlignments.csh (a perl filter despite the suffix).
cat > makeBed9PlusFromAlignments.csh << \_EOF_
#!/usr/bin/perl
# Reorder Broad alignment records
#   chrom, start, end, strand, read_id, mismatches, sequence
# into BED9 plus two extra columns
#   chrom, start, end, name, score, strand, thickStart, thickEnd,
#   reserved, mismatches, sequence
# The score is 1000 minus 100 per mismatch; the reserved (color) column is
# written as a 0,0,0 placeholder.

use warnings;
use strict;

while (my $line = <>) {
    next if ($line =~ /^track/ || $line =~ /^\s*\#/);
    chomp $line;

    # prefer tab-separated input, fall back to any whitespace
    my @fields = split("\t", $line);
    if (@fields < 7) {
        @fields = split(/\s+/, $line);
        die "Expecting at least 7 tab-sep fields but got fewer, line $.\n"
            if (@fields < 7);
    }

    my ($chrom, $beg, $end, $strand, $name, $mismatch, $seq) = @fields;
    my $score = 1000 - ($mismatch * 100);    # 0=1000 1=900 2=800

    print join("\t",
               $chrom, $beg, $end, $name, $score, $strand,
               $beg, $end,        # thickStart, thickEnd
               "0,0,0",           # color to be set later
               $mismatch, $seq) . "\n";
}
_EOF_
# Write makeColoredBedOnStrand.csh (a perl filter despite the suffix).
cat << \_EOF_ > makeColoredBedOnStrand.csh
#!/usr/bin/perl
# Fill the "reserved" (color) column, column 9, of BED9+ lines from the
# strand (column 6) and the mismatch count (column 10):
#   + strand -> blue, any other strand -> green,
#   with a darker shade for 1 mismatch and the darkest for 2 or more.

use warnings;
use strict;

# three shades per strand, indexed by mismatch count (0, 1, 2-or-more)
my @blues = ("0,0,255","0,0,204","0,0,170");
my @greens = ("0,255,0","0,187,0","0,136,0");

while (<>) {
    next if (/^track/ || /^\s*\#/);
    chomp;
    my @words = split("\t");
    # the mismatch count is read from $words[9], so 10 fields are required
    if (scalar(@words) < 10) {
        @words = split(/\s+/);
        die "Expecting at least 10 tab-sep fields but got fewer, line $.\n"
            if (scalar(@words) < 10);
    }
    die "More than 9 mismatches found line $.\n"
        if ($words[9] > 9);

    # everything above 2 mismatches shares the darkest shade
    my $shade = ($words[9] > 2) ? 2 : $words[9];
    if ($words[5] eq '+') {
        $words[8] = $blues[$shade];     # blue
    } else {
        $words[8] = $greens[$shade];    # green
    }
    print join("\t", @words) . "\n";
}
_EOF_
cat << \_EOF_ > convertToBed.csh
#!/bin/csh -fe
cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
foreach f (../*.txt.gz)
set root = `echo $f:t:r:r`
zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
echo $root.bed.gz done
end
_EOF_
chmod +x makeBed9PlusFromAlignments.csh
chmod +x makeColoredBedOnStrand.csh
chmod +x convertToBed.csh
zcat ../ES.H3.txt.gz | head | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh
## How to make bash work ???
#for f in ../*.txt.gz; do
# root=${f##*/}
# root=${root%.*}
# root=${root%.*}
# zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
# echo $root.bed.gz done
#done
ssh kkstore04
cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
time nice -n +19 ./convertToBed.csh > convert.log 2>&1 &
# failed because mismatches exceeded 2, so used following to determine max mismatches: 6 in ES.H3
zcat ../ES.H3.txt.gz | head -100 | awk '{print $6}' | sort -n | uniq -c | wc -l
# real 55m8.275s
# Two were not gzipped!
cat << \_EOF_ > convertTxtToBed.csh
#!/bin/csh -fe
cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
foreach f (../ES.*.txt)
set root = `echo $f:t:r`
./makeBed9PlusFromAlignments.csh < $f | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
echo $root.bed.gz done
end
_EOF_
chmod +x convertTxtToBed.csh
time nice -n +19 ./convertTxtToBed.csh >> convert.log 2>&1 &
# Add comments:
# Write commentBedFiles.csh: prepend a one-line "#" comment, assembled from
# lines of ../readme.txt, to each converted BED file.  The commented copy is
# written as new.<root>.bed.gz next to the original.
cat << \_EOF_ > commentBedFiles.csh
#!/bin/csh -fe
cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
set descr1 = `grep Primary ../readme.txt | tr -d "\r"`
set descr2 = `grep pluripotent ../readme.txt | tr -d "\r"`
# Every cell type is needed, not just ES: the renames that follow expect a
# new.<cell>.<antibody>.bed.gz file for ES, ESHyb, MEF and NP alike.
foreach f (*.bed.gz)
    # do not re-comment output left behind by an earlier run
    if ("$f" =~ new.*) continue
    set root = `echo $f:t:r:r`
    set comment = `grep $root ../readme.txt | tr -d "\r"`
    echo "# $comment - ${descr1} ${descr2}" > new.${root}.bed
    zcat $f >> new.${root}.bed
    gzip new.${root}.bed
end
_EOF_
# the script is invoked as ./commentBedFiles.csh, so it has to be executable
chmod +x commentBedFiles.csh
ssh kkstore04
cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
time nice -n +19 ./commentBedFiles.csh > comment.log 2>&1 &
# Rename to match other identifiers?
# | broadStemChipHmmSitesH3K20me3Es |
# | broadStemChipHmmSitesH3K27me3Es |
# | broadStemChipHmmSitesH3K36me3Es |
# | broadStemChipHmmSitesH3K4me3Es |
# | broadStemChipHmmSitesH3K9me3Es |
# | broadStemChipWinSitesH3K27me3Es |
# | broadStemChipWinSitesH3K27me3Mef |
# | broadStemChipWinSitesH3K27me3Np |
# | broadStemChipWinSitesH3K4me3Es |
# | broadStemChipWinSitesH3K4me3Mef |
# | broadStemChipWinSitesH3K4me3Np |
# | broadStemChipWinSitesH3K9me3Es |
# | broadStemChipWinSitesH3K9me3Mef |
# | broadStemChipWinSitesH3K9me3Np |
zcat new.ES.K9.bed.gz | head -1 | awk '{ print $5 }'
head -1 new.*.bed | awk '{ print $5 }'
for f in new.ES.K*.gz; do zcat $f | head -1 | awk '{ print $2,$5 "Es"}'; done
for f in new.ES.WCE.*.gz; do zcat $f | head -1 | awk '{ print $2,"WceEs"}'; done
for f in new.ES.H3.*.gz; do zcat $f | head -1 | awk '{ print $2,"H3panEs"}'; done
for f in new.ES.R*.gz; do zcat $f | head -1 | awk '{ print $2,"RPolEs"}'; done
for f in new.ESHyb.*.gz; do zcat $f | head -1 | awk '{ print $2,"ES" $6 "EsHyb"}'; done
for f in new.MEF.K*.gz; do zcat $f | head -1 | awk '{ print $2,$4 "Mef"}'; done
for f in new.MEF.WCE.*.gz; do zcat $f | head -1 | awk '{ print $2,"WceMef"}'; done
for f in new.NP.K*.gz; do zcat $f | head -1 | awk '{ print $2,$5 "Np"}'; done
for f in new.NP.WCE.*.gz; do zcat $f | head -1 | awk '{ print $2,"WceNp"}'; done
mv new.ES.K20.bed.gz H4K20Me3Es.bed.gz
mv new.ES.K27.bed.gz H3K27Me3Es.bed.gz
mv new.ES.K36.bed.gz H3K36Me3Es.bed.gz
mv new.ES.K4.bed.gz H3K4Me3Es.bed.gz
mv new.ES.K9.bed.gz H3K9Me3Es.bed.gz
mv new.ES.WCE.bed.gz WceEs.bed.gz
mv new.ES.H3.bed.gz H3panEs.bed.gz
mv new.ES.RPol.bed.gz RPolEs.bed.gz
mv new.ESHyb.K36.bed.gz ESH3K36Me3EsHyb.bed.gz
mv new.ESHyb.K4.bed.gz ESH3K4Me3EsHyb.bed.gz
mv new.ESHyb.K9.bed.gz ESH3K9Me3EsHyb.bed.gz
mv new.MEF.K27.bed.gz H3K27Me3Mef.bed.gz
mv new.MEF.K36.bed.gz H3K36Me3Mef.bed.gz
mv new.MEF.K4.bed.gz H3K4Me3Mef.bed.gz
mv new.MEF.K9.bed.gz H3K9Me3Mef.bed.gz
mv new.MEF.WCE.bed.gz WceMef.bed.gz
mv new.NP.K27.bed.gz H3K27Me3Np.bed.gz
mv new.NP.K36.bed.gz H3K36Me3Np.bed.gz
mv new.NP.K4.bed.gz H3K4Me3Np.bed.gz
mv new.NP.K9.bed.gz H3K9Me3Np.bed.gz
mv new.NP.WCE.bed.gz WceNp.bed.gz
#hgLoadBed mm8 broadStemChipAlign${root} ${f}
time nice -n +19 hgLoadBed mm8 broadStemChipAlignmentsWceEs WceEs.bed.gz &
### Failed! All that work to put a nice comment in the bed file, and hgLoadBed does not handle it!
### Fixed this in hgLoadBed.c
cat << \_EOF_ > myBedTbl.sql
CREATE TABLE myBedTbl (
bin smallint unsigned not null,
chrom varchar(255) not null,
chromStart int unsigned not null,
chromEnd int unsigned not null,
name varchar(255) not null,
score int unsigned not null,
strand char(1) not null,
thickStart int unsigned not null,
thickEnd int unsigned not null,
reserved int unsigned not null,
mismatchCount int unsigned not null,
seq varchar(255) not null,
#Indices
INDEX(name(16)),
INDEX(chrom(5),bin)
)
_EOF_
cat << \_EOF_ > loadBedFiles.csh
#!/bin/csh -fe
cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
foreach f (*.bed.gz)
set root = `echo $f:t:r:r`
~/bin/x86_64/hgLoadBed -sqlTable=myBedTbl.sql -renameSqlTable mm8 broadStemChipAlignments${root} ${f}
echo broadStemChipAlignments${root} ${f} done
end
_EOF_
chmod +x loadBedFiles.csh
time nice -n +19 ./loadBedFiles.csh &
# real 62m46.504s
# Noticed 3 tables badly named, renamed them and corresponding files
hgsql mm8
rename table broadStemChipAlignmentsESH3K36Me3EsHyb to broadStemChipAlignmentsH3K36Me3EsHyb;
rename table broadStemChipAlignmentsESH3K4Me3EsHyb to broadStemChipAlignmentsH3K4Me3EsHyb;
rename table broadStemChipAlignmentsESH3K9Me3EsHyb to broadStemChipAlignmentsH3K9Me3EsHyb;
# edited trackDb.broadStem.ra
#   broadStemChipAlignmentsH3K4Me3Es
#   broadStemChipAlignmentsH3K4Me3Mef
#   broadStemChipAlignmentsH3K4Me3Np
#   broadStemChipAlignmentsH3K9Me3Es
#   broadStemChipAlignmentsH3K9Me3Mef
#   broadStemChipAlignmentsH3K9Me3Np
#   broadStemChipAlignmentsH4K20Me3Es
#   broadStemChipAlignmentsH3K27Me3Es
#   broadStemChipAlignmentsH3K27Me3Mef
#   broadStemChipAlignmentsH3K27Me3Np
#   broadStemChipAlignmentsH3K36Me3Es
#   broadStemChipAlignmentsH3K36Me3Mef
#   broadStemChipAlignmentsH3K36Me3Np
#   broadStemChipAlignmentsH3K9Me3EsHyb
#   broadStemChipAlignmentsH3K36Me3EsHyb
#   broadStemChipAlignmentsH3K4Me3EsHyb
#   broadStemChipAlignmentsWceEs
#   broadStemChipAlignmentsWceMef
#   broadStemChipAlignmentsWceNp
#   broadStemChipAlignmentsRPolEs
#   broadStemChipAlignmentsH3panEs
### ### ### Finished Alignments 2008-04-29
### ### ### Edited mouse/mm8/trackDb.broadStem.ra to include new broadChromatinChIPSeq
### ### ### track with 53 subtracks covering sites (HMM, Windowing), signal & alignments
### ### ### for ES, MEF, NP, ES_hybrid cell lines
### ### ### and H3K4me3 H3K9me3 H4K20me3 H3K27me3 H3K36me3 antibodies
### ### ### and WCE, RPOL-II and pan-H3 controls
############################################################################
# Adding more tracks from Broad (Meissner2008)
# (Start 2008-7-14 Tim Done: 2008-07-18)
ssh kkstore04
cd /cluster/data/mm8/bed/broadStemChip/chipseq
mkdir -p Meissner2008
cd Meissner2008/
wget -r ftp://ftp.broad.mit.edu/pub/papers/chipseq/Meissner2008/
mv pub/papers/chipseq/Meissner2008 .
rm -fr pub
# original data
ln -s chipseq lab
cd lab
###############
# Sites track
mkdir windowSites/Meissner2008
cd windowSites/Meissner2008
tar xvfz ../../Meissner2008/WindowIntervals.tar.gz
awk '{print $4}' *.sites | sort -n | head -1
# Sites from Window algorithm -- BED3 plus float score
# min: 2.50, max: 275.50
# distribution of data values:
awk '{print $4}' *.sites | sort | textHistogram -binSize=10 maxBinCount=30 -real stdin
# 0.000000 ************************************************************ 155307
# 10.000000 **************** 42020
# 20.000000 ****** 14576
# 30.000000 **** 10408
# 40.000000 ** 5717
# 50.000000 * 2299
# 60.000000 718
# 70.000000 232
# 80.000000 60
# 90.000000 15
# 100.000000 3
# 110.000000 6
# 120.000000 1
# 130.000000 1
# 140.000000 1
mv Brain.H3K27me3.sites ../Brain.K27me3.sites
mv Brain.H3K4me2.sites ../Brain.K4me2.sites
mv Brain.H3K4me3.sites ../Brain.K4me3.sites
mv ES.H3K4me1.sites ../ES.K4me1.sites
mv ES.H3K4me2.sites ../ES.K4me2.sites
mv NP.H3K4me1.sites ../NP.K4me1.sites
mv NP.H3K4me2.sites ../NP.K4me2.sites
mv readme.txt ../readme.Meissner2008.txt
cd ..
rmdir Meissner2008/
# Continue to distinguish by .sites
# Brain.K27me3.sites ES.K27.txt ES.K4me2.sites MEF.K4.txt NP.K4.txt NP.K9.txt
# Brain.K4me2.sites ES.K4.txt ES.K9.txt MEF.K9.txt NP.K4me1.sites readme.Meissner2008.txt
# Brain.K4me3.sites ES.K4me1.sites MEF.K27.txt NP.K27.txt NP.K4me2.sites readme.txt
# To range score display from 300 to 1000, use THE SAME CONVERSION AS for the whole group:
# (x * 2) + 300
cd /cluster/data/mm8/bed/broadStemChip
# windowSites/ was already created here for the first batch of window-site
# tracks; -p keeps this step from failing when the directory exists.
mkdir -p windowSites
cat > windowSites.Meissner2008.csh << \_EOF_
foreach f (chipseq/windowSites/*.sites)
set b = $f:t
set ab = `echo $b | perl -wpe 's/\w+.(\w)(\w+).sites/H3$1\L$2/'`
set cell = `echo $b | perl -wpe 's/(\w)(\w+).*/\u$1\L$2/'`
echo $cell $ab $b
tail +2 $f | awk '{printf "%s\t%d\t%d\t \t%d\t%s\n", $1, $2, $3, ($4 * 2) + 300, $4}' > windowSites/$cell.$ab.tab
# using kate's version, testing -renameSqlTable option
/cluster/home/kate/bin/x86_64/hgLoadBed mm8 -tab -noNameIx -renameSqlTable \
-sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \
broadStemChipWinSites${ab}${cell} windowSites/$cell.$ab.tab
end
_EOF_
# Fix script coloring EOF
chmod +x windowSites.Meissner2008.csh
csh windowSites.Meissner2008.csh > windowSites.Meissner2008.log 2>&1
###############
# Signal track
# indication of #reads near the base, 25bp fixed window, -1 if unalignable base
ssh kkstore04
cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/densities
mkdir -p alignable
cd alignable
foreach f (../*.tar.gz)
tar xvfz $f
end
cd ../..
# Get a list of the datasets
#mkdir -p signal
tar tfz chipseq/Meissner2008/densities/chr1.tar.gz | \
perl -wpe 's/chr\w.(\w+.\w+).txt/$1/' > signal/datasetsMeissner2008.txt
# ignore control (whole-cell extract)
grep -v WCE signal/datasetsMeissner2008.txt > signal/subtracksMeissner2008.txt
wc -l signal/subtracksMeissner2008.txt
# 7
# Extract datasets from by-chrom packaging
# Weed out missing data which are represented as -1 values
# Convert to wiggle
cat > makeWigMeissner2008.csh << \_EOF_
foreach s (`cat signal/subtracksMeissner2008.txt`)
set ab = `echo $s | perl -wpe 's/\w+.(\w)(\w+)/\u$1\u$2/'`
set cell = `echo $s | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'`
set table = broadStemChipSignal${ab}${cell}
echo $table $s
rm -f signal/$table.wigVar
foreach f (chipseq/Meissner2008/densities/chr*.tar.gz)
set c = $f:t:r:r
echo "variableStep chrom=$c span=25" >> signal/$table.wigVar
tar xfzO $f $c.$s.txt | \
awk 'BEGIN {pos = 1} {print pos, $1; pos += 25}' | \
grep -v '\-1$' >> signal/$table.wigVar
end
cd signal
nice wigEncode $table.wigVar $table.wig $table.wib
cd ..
end
_EOF_
# Fix script coloring EOF
chmod +x makeWigMeissner2008.csh
csh makeWigMeissner2008.csh > makeWigMeissner2008.log 2>&1 &
# check output and cleanup
cd signal
gzip *.wigVar
######## Load wiggles?
ssh hgwdev
#mkdir /gbdb/mm8/broadStemChip
cd /cluster/data/mm8/bed/broadStemChip
cat > loadWigMeissner2008.csh << \_EOF_
#!/bin/csh -fe
cd /cluster/data/mm8/bed/broadStemChip/signal
foreach f (*H3K*me*.wib)
set wi = $f:t:r
set wig = $wi.wig
echo Start: $wig
echo "ln -s `pwd`/$f /gbdb/mm8/broadStemChip/$wi.wib"
hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 $wi $wig
echo Finished: $wig
end
_EOF_
chmod +x loadWigMeissner2008.csh
./loadWigMeissner2008.csh
time nice -n +19 ./loadWigMeissner2008.csh >> loadWigMeissner2008.log 2>&1 &
# Noticed tables badly named, renamed them and corresponding files
# hgsql mm8
# rename table broadStemChipSignalH3Es to broadStemChipSignalH3panEs
# rename table broadStemChipSignalH3K27Es to broadStemChipSignalH3K27me3Es
# rename table broadStemChipSignalH3K27Mef to broadStemChipSignalH3K27me3Mef
# rename table broadStemChipSignalH3K27Np to broadStemChipSignalH3K27me3Np
# rename table broadStemChipSignalH3K36Es to broadStemChipSignalH3K36me3Es
# rename table broadStemChipSignalH3K36EsHyb to broadStemChipSignalH3K36Esme3Hyb
# rename table broadStemChipSignalH3K36Mef to broadStemChipSignalH3K36me3Mef
# rename table broadStemChipSignalH3K36Np to broadStemChipSignalH3K36me3Np
# rename table broadStemChipSignalH3K4Es to broadStemChipSignalH3K4me3Es
# rename table broadStemChipSignalH3K4EsHyb to broadStemChipSignalH3K4Esme3Hyb
# rename table broadStemChipSignalH3K4Mef to broadStemChipSignalH3K4me3Mef
# rename table broadStemChipSignalH3K4Np to broadStemChipSignalH3K4me3Np
# rename table broadStemChipSignalH3K9Es to broadStemChipSignalH3K9me3Es
# rename table broadStemChipSignalH3K9EsHyb to broadStemChipSignalH3K9Esme3Hyb
# rename table broadStemChipSignalH3K9Mef to broadStemChipSignalH3K9me3Mef
# rename table broadStemChipSignalH3K9Np to broadStemChipSignalH3K9me3Np
# rename table broadStemChipSignalH4K20Es to broadStemChipSignalH4K20me3Es
######### Alignments
### Sample from Brain.H3K27me3.aligned.gz
#chr10 63848447 63848474 - 3084.4.1 0 GAGAGCCAATGGCTAGGCAGGGCATCA
### Convert to
#chr10 63848447 63848474 3084.4.1 0 - 63848447 63848474 0,255,0 0 GAGAGCCAATGGCTAGGCAGGGCATCA
# convert to bed-9+ color at 9, mismatch at 10 and seq at 11; grabbed some example code from encodeHg17.txt PET
ssh hgwdev
cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments
mkdir bed
cd bed
# Reuse the make*.csh converters written for the first batch of alignments.
# After "cd bed" the paths must be relative to this directory, in the same
# way myBedTbl.sql is copied further down.
cp ../../../alignments/bed/make* .
# cat << \_EOF_ > makeBed9PlusFromAlignments.csh
# #!/usr/bin/perl
# # replace "reserved" field of BED >=9 fields with RGB value from 8-scale
# # black->red palette, based on score value.
#
# use warnings;
# use strict;
#
# while (<>) {
# next if (/^track/ || /^\s*\#/);
# chomp;
# my @words = split("\t");
# if (scalar(@words) < 7) {
# @words = split(/\s+/);
# die "Expecting at least 7 tab-sep fields but got fewer, line $.\n"
# if (scalar(@words) < 7);
# }
# my @newWordOrder = ("","","","","","","","","","","");
# $newWordOrder[0] = $words[0]; # chr
# $newWordOrder[1] = $words[1]; # beg
# $newWordOrder[2] = $words[2]; # end
# $newWordOrder[3] = $words[4]; # name
# #$newWordOrder[4] = "0"; # score
# $newWordOrder[4] = 1000 - ($words[5] * 100); # score 0=1000 1=900 2=800
# $newWordOrder[5] = $words[3]; # strand
# $newWordOrder[6] = $words[1]; # beg
# $newWordOrder[7] = $words[2]; # end
# $newWordOrder[8] = "0,0,0"; # color to be set later
# $newWordOrder[9] = $words[5]; # mismatch
# $newWordOrder[10] = $words[6]; # seq
#
# print join("\t", @newWordOrder) . "\n";
# }
# _EOF_
#
# cat << \_EOF_ > makeColoredBedOnStrand.csh
# #!/usr/bin/perl
# # replace "reserved" field of BED >=9 fields with RGB value from 8-scale
# # black->red palette, based on score value.
#
# use warnings;
# use strict;
#
# # palette consistes of red, green blue
# my @blues = ("0,0,255","0,0,204","0,0,170");
# my @greens = ("0,255,0","0,187,0","0,136,0");
#
# while (<>) {
# next if (/^track/ || /^\s*\#/);
# chomp;
# my @words = split("\t");
# if (scalar(@words) < 9) {
# @words = split(/\s+/);
# die "Expecting at least 9 tab-sep fields but got fewer, line $.\n"
# if (scalar(@words) < 9);
# }
# die "More than 9 mismatches found line $.\n"
# if ($words[9] > 9);
# my $strand = $words[5];
# if ($strand eq '+') {
# if( $words[9] > 2 ) {
# $words[8] = $blues[2]; # green
# } else {
# $words[8] = $blues[$words[9]]; # green
# }
# } else {
# if( scalar($words[9]) > 2 ) {
# $words[8] = $greens[2]; # blue
# } else {
# $words[8] = $greens[$words[9]]; # blue
# }
# }
# print join("\t", @words) . "\n";
# }
# _EOF_
# chmod +x makeBed9PlusFromAlignments.csh
# chmod +x makeColoredBedOnStrand.csh
cat << \_EOF_ > convertToBed.csh
#!/bin/csh -fe
cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments/bed
foreach f (../*.aligned.gz)
set root = `echo $f:t:r:r`
zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
echo $root.bed.gz done
end
_EOF_
chmod +x convertToBed.csh
zcat ../Brain.H3K27me3.aligned.gz | head | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh
# chr14 12537326 12537362 205CY.7.1 1000 - 12537326 125373620,255,0 0 GGGATATGGACTGAAATAATTAGGAAAGAAATAACT
## How to make bash work ???
#for f in ../*.txt.gz; do
# root=${f##*/}
# root=${root%.*}
# root=${root%.*}
# zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
# echo $root.bed.gz done
#done
# Run the full conversion on kkstore04 from the bed directory: lowest
# scheduling priority (nice +19), in the background, with stdout and stderr
# both captured in convert.log.
ssh kkstore04
cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments/bed
time nice -n +19 ./convertToBed.csh > convert.log 2>&1 &
# real 25m22.762s
# Brain.H3K27me3.bed.gz done
# Brain.H3K4me2.bed.gz done
# Brain.H3K4me3.bed.gz done
# ES.H3K4me1.bed.gz done
# ES.H3K4me2.bed.gz done
# NP.H3K4me1.bed.gz done
# NP.H3K4me2.bed.gz done
# zcat Brain.H3K27me3.bed.gz | head -2
# chr14 12537326 12537362 205CY.7.1 1000 - 12537326 12537362 0,255,0 0 GGGATATGGACTGAAATAATTAGGAAAGAAATAACT
# chr2 70236933 70236969 205CY.7.2 900 + 70236933 70236969 0,0,204 1 GAATCCTTGAACATATTTATAATCATTCTTTTTAAT
# Compared to: zcat ../../../alignments/bed/ES.K20.bed.gz | head -2
# chr8 77978889 77978916 3080.2.1 1000 + 77978889 77978916 0,0,255 0 GAAGGAAATCAGTCTTTGTTGAGCAGT
# chr12 38598403 38598430 3080.2.2 1000 + 38598403 38598430 0,0,255 0 GATATTTCATTCCTTGGAGAAGGGTAA
# Copy the BED table definition from the ../../../alignments/bed directory
# into the current directory so hgLoadBed can be pointed at it.
cp ../../../alignments/bed/myBedTbl.sql .
# cat << \_EOF_ > myBedTbl.sql
# CREATE TABLE myBedTbl (
# bin smallint unsigned not null,
# chrom varchar(255) not null,
# chromStart int unsigned not null,
# chromEnd int unsigned not null,
# name varchar(255) not null,
# score int unsigned not null,
# strand char(1) not null,
# thickStart int unsigned not null,
# thickEnd int unsigned not null,
# reserved int unsigned not null,
# mismatchCount int unsigned not null,
# seq varchar(255) not null,
# #Indices
# INDEX(name(16)),
# INDEX(chrom(5),bin)
# )
# _EOF_
# Write loadBedFiles.csh, which loads every *.bed.gz in the bed directory
# into its own mm8 table.  For a file named <cell>.<antibody>.bed.gz:
#   root  = file name minus the two extensions     (e.g. Brain.H3K27me3)
#   ab    = the part after the dot                 (e.g. H3K27me3)
#   cell  = the part before the dot, capitalized   (e.g. Brain, ES -> Es)
#   table = broadStemChipAlignments<ab><cell>
# hgLoadBed is given myBedTbl.sql as the table definition; -renameSqlTable
# presumably rewrites the myBedTbl name in that SQL to the target table
# name -- confirm against hgLoadBed usage.
# The heredoc delimiter is escaped (\_EOF_) so the script text is written
# out literally, without expansion by the invoking shell.
cat << \_EOF_ > loadBedFiles.csh
#!/bin/csh -fe
cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments/bed
foreach f (*.bed.gz)
set root = `echo $f:t:r:r`
set ab = `echo $root | perl -wpe 's/\w+.(\w)(\w+)/\u$1\u$2/'`
set cell = `echo $root | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'`
set table = broadStemChipAlignments${ab}${cell}
~/bin/x86_64/hgLoadBed -sqlTable=myBedTbl.sql -renameSqlTable mm8 ${table} ${f}
echo ${table} ${f} done
end
_EOF_
chmod +x loadBedFiles.csh
# Run the load at lowest priority in the background; all output goes to load.log.
time nice -n +19 ./loadBedFiles.csh > load.log 2>&1 &
# real 28m9.939s
# broadStemChipAlignmentsH3K27me3Brain
# broadStemChipAlignmentsH3K4me1Es
# broadStemChipAlignmentsH3K4me1Np
# broadStemChipAlignmentsH3K4me2Brain
# broadStemChipAlignmentsH3K4me2Es
# broadStemChipAlignmentsH3K4me2Np
# broadStemChipAlignmentsH3K4me3Brain
#
# broadStemChipWinSitesH3K27me3Brain
# broadStemChipWinSitesH3K4me1Es
# broadStemChipWinSitesH3K4me1Np
# broadStemChipWinSitesH3K4me2Brain
# broadStemChipWinSitesH3K4me2Es
# broadStemChipWinSitesH3K4me2Np
# broadStemChipWinSitesH3K4me3Brain
#
# broadStemChipSignalH3K27me3Brain
# broadStemChipSignalH3K4me1Es
# broadStemChipSignalH3K4me1Np
# broadStemChipSignalH3K4me2Brain
# broadStemChipSignalH3K4me2Es
# broadStemChipSignalH3K4me2Np
# broadStemChipSignalH3K4me3Brain
# edited trackDb.broadStem.ra
############################################################################
# mm8 - Mouse - Ensembl Genes (DONE - 2008-03-06 - hiram)
ssh kkstore04
cd /cluster/data/mm8
# Write the configuration file that is handed to doEnsGeneUpdate.pl below.
# NOTE(review): the closing delimiter is written with its quotes ('_EOF_').
# Presumably this was run under csh/tcsh, where the terminator line must be
# identical to the quoted word; sh/bash would expect a bare _EOF_ there.
# Confirm which shell is in use before re-running.
cat << '_EOF_' > mm8.ensGene.ra
# required db variable
db mm8
# optional liftRandoms yes/no or absent
liftRandoms yes
# optional nameTranslation, the sed command that will transform
# Ensemble names to UCSC names. With quotes just to make sure.
nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/"
# optionally update the knownToEnsembl table after ensGene updated
knownToEnsembl yes
'_EOF_'
# << happy emacs
# Run the Ensembl gene update for Ensembl version 46 using that config.
doEnsGeneUpdate.pl -ensVersion=46 mm8.ensGene.ra
ssh hgwdev
cd /cluster/data/mm8/bed/ensGene.46
# Measure how many bases of mm8 the ensGene table covers; result recorded below.
featureBits mm8 ensGene
# 56654064 bases of 2567283971 (2.207%) in intersection
############################################################################
# Reload CCDS from CCDS.20080502 dump (2008-05-03 markd)
# import ccds database as described in ccds.txt
# csh-style variable assignments; ${db} and ${ncbiBld} are used by the
# commands below.
set db=mm8
set ncbiBld=36.1
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ${ncbiBld} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
# sanity-check the coordinates of the freshly loaded ccdsGene table
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
#   ccdsGene
#   ccdsInfo
#   ccdsKgMap
# << emacs
############################################################################
# AGILENT CGH PROBES (Done 2008-05-13, Andy)
# (see hg18.txt)
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
# Vertebrate-wide transMap alignments were built.  Tracks are created and
# loaded by a single Makefile. This is available from:
#   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
# See doc/builds.txt for specific details.
############################################################################
#############################################################################
# MOUSE TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08)
# (to build the affyExonTissues track, see the steps outlined in hg18.txt)
#############################################################################
########################################################################
## AFFY ALL EXON PROBESETS (MM8) (DONE 2009-01-29, Andy)
ssh hgwdev
mkdir /hive/data/genomes/mm8/bed/affyAllExonProbes
cd /hive/data/genomes/mm8/bed/affyAllExonProbes
# Download the annotation archive.  affycookies.txt is not created anywhere
# in these notes; presumably it holds the cookies of a logged-in Affymetrix
# session and must be put in place by hand first.
wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na20/exon/MoEx-1_0-st-v1.r2.dt1.mm8.zip
# Unpack the archive: the steps below read the .csv and later delete the
# .zip and the other extracted files, so the download has to be expanded here.
unzip MoEx-1_0-st-v1.r2.dt1.mm8.zip
# Short alias for the extracted annotation CSV.
ln -s MoEx-1_0-st-v1.r2.dt1.mm8.csv mm8.csv
# Convert the CSV to a sorted BED file:
#   - drop the first 12 lines and turn commas into tabs
#   - keep CSV columns 1,5-8,12 (seen by awk as $1..$6), strip double quotes,
#     and discard rows containing "---"
#   - score by the value of $6: core=1000, extended=700, full=300, other=100
#   - emit $2, $4-1, $5, "$1|$6", score, $3; the -1 presumably converts a
#     1-based start to BED's 0-based start -- confirm against the CSV header
sed '1,12d' mm8.csv | tr ',' '\t' | cut -f 1,5-8,12 \
| sed 's/\"//g' | grep -v "\-\-\-" \
| awk 'BEGIN{FS="\t";OFS="\t";}{if ($6 == "core") score = 1000; else if ($6 == "extended") score = 700; else if ($6 == "full") score = 300; else score = 100; name = $1"|"$6; print $2, $4-1, $5, name, score, $3}' \
| bedSort stdin mm8.bed
hgLoadBed mm8 affyAllExonProbes mm8.bed
# Clean up: remove the archive and its other extracted files, the bed.tab
# file (presumably left behind by hgLoadBed), the cookie file and the
# symlink, then compress what is kept.
rm MoEx-1_0-st-v1.r2.dt1.mm8.{cor,ext,full,zip}* bed.tab affycookies.txt mm8.csv
gzip MoEx-1_0-st-v1.r2.dt1.mm8.csv mm8.bed
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
# update genbank.conf:
#   mm8.upstreamGeneTbl = refGene
#   mm8.upstreamMaf = multiz17way /hive/data/genomes/mm8/bed/multiz17way/species.lst
#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08)
ssh hgwdev
mkdir /cluster/data/mm8/bed/mrnaPcr
cd /cluster/data/mm8/bed/mrnaPcr
# Dump the knownGene table to a genePred file and convert it to BED.
hgsql mm8 -NBe 'select * from knownGene' > knownGene.gp
genePredToBed knownGene.gp > ucscGenes.bed
# Build a two-column substitution file: kgId -> kgId__geneSymbol.
# The perl one-liner dies on any line that does not match.
hgsql mm8 -NBe 'select kgId,geneSymbol from kgXref' \
| perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
> idSub.txt
# Rewrite column 4 of the BED file using idSub.txt.
subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
# Fetch the mm8 sequence for each BED item (keeping the substituted names)
# and pack it into a 2bit file, ignoring duplicate names.
sequenceForBed -keepName -db=mm8 -bedIn=ucscGenesIdSubbed.bed \
-fastaOut=stdout \
| faToTwoBit -ignoreDups stdin kgTargetSeq.2bit
# Generate PSL records from the first 10 genePred columns; the second
# output of genePredToFakePsl is discarded to /dev/null.
cut -f 1-10 knownGene.gp \
| genePredToFakePsl mm8 stdin kgTargetAli.psl /dev/null
# Load up the UCSC Genes target PSL table and put 2bit in /gbdb:
cd /cluster/data/mm8/bed/mrnaPcr
hgLoadPsl mm8 kgTargetAli.psl
mkdir /gbdb/mm8/targetDb
ln -s /cluster/data/mm8/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/mm8/targetDb/
# Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
# /gbdb/mm8/targetDb/kgTargetSeq.2bit .
ssh hgwdev
# Add records to hgcentraltest blatServers and targetDb:
# (the host and port below, blat13:17803, presumably refer to the gfServer
# requested above -- confirm)
hgsql hgcentraltest -e \
'INSERT into blatServers values ("mm8Kg", "blat13", 17803, 0, 1);'
hgsql hgcentraltest -e \
'INSERT into targetDb values("mm8Kg", "UCSC Genes", \
"mm8", "kgTargetAli", "", "", \
"/gbdb/mm8/targetDb/kgTargetSeq.2bit", 1, now(), "");'
#############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
# Vertebrate-wide transMap alignments were built.  Tracks are created and
# loaded by a single Makefile. This is available from:
#   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
# See doc/builds.txt for specific details.
############################################################################