src/hg/makeDb/doc/mm9.txt 1.126
1.126 2010/02/23 03:37:21 rhead
Added note for next time about NIAGene track.
Index: src/hg/makeDb/doc/mm9.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/mm9.txt,v
retrieving revision 1.125
retrieving revision 1.126
diff -b -B -U 1000000 -r1.125 -r1.126
--- src/hg/makeDb/doc/mm9.txt 16 Feb 2010 04:47:43 -0000 1.125
+++ src/hg/makeDb/doc/mm9.txt 23 Feb 2010 03:37:21 -0000 1.126
@@ -1,10214 +1,10217 @@
# for emacs: -*- mode: sh; -*-
# This file describes browser build for the mouse
# genome, April 2007, ncbi mouse_37 - Mm9
#
# "$Id$"
#
#######################################################################
# DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2007-04-05 - Hiram)
#
# Examine disk space issues, find some goodly amount of space
ssh kkstore02
mkdir /cluster/store5/mm9
ln -s /cluster/store5/mm9 /cluster/data/mm9
cd /cluster/data/mm9
## After testing with the pre-release below, the real thing begins here
mkdir mouse_37
cd mouse_37
## Ouch, the files are no longer delivered conveniently in a single
## directory. They are in several locations now ...
NCBI=ftp://ftp.ncbi.nih.gov/genomes
MAPVIEW=MapView/Mus_musculus/sequence/BUILD.37.1/initial_release
for F in README README_CURRENT_BUILD
do
wget --dont-remove-listing --timestamping \
"${NCBI}/M_musculus/${F}" -O ${F}
done
for F in allcontig.agp.gz seq_contig.md.gz ideogram.gz
do
wget --dont-remove-listing --timestamping \
"${NCBI}/${MAPVIEW}/${F}" -O ${F}
done
# survey the strains contained in seq_contig.md.gz
zcat seq_contig.md.gz | awk '{print $9}' | sort | uniq -c | sort -rn
13075 Celera
360 C57BL/6J
101 129/SvJ
93 129/Sv
79 unknown
75 129/SvEvTac
40 NOD
26 129S7/SvEv
14 129/Ola
7 129
6 Cast/Ei
6 BALB/c
3 SJL/J
3 C3H
3 B6/CBAF1J
3 AKR/J
3 A/J
2 Spret/Ei
1 group_label
1 129/J
# we will work on the C57BL/6J strain
mkdir -p chrAgp
cd chrAgp
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
do
wget --dont-remove-listing --timestamping \
"${NCBI}/M_musculus/Assembled_chromosomes/mm_ref_chr${C}.agp.gz" \
-O chr${C}.agp.gz
done
cd ..
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
do
zcat chrAgp/chr${C}.agp.gz | grep "^c"
done > chrOnly.agp
mkdir -p chrfasta
cd chrfasta
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
do
wget --dont-remove-listing --timestamping \
"${NCBI}/M_musculus/Assembled_chromosomes/mm_ref_chr${C}.fa.gz" \
-O chr${C}.fa.gz
done
cd ..
mkdir chrUn
mkdir chrM
# chrUn and chrM are fetched from their own CHR_Un / CHR_MT directories.
# Each URL must be closed with its own double quote, otherwise the shell
# treats the rest of the command as part of the string.
wget --dont-remove-listing --timestamping \
"${NCBI}/M_musculus/CHR_Un/mm_ref_chrUn.fa.gz" -O chrUn/chrUn.fa.gz
# The mitochondrial file is mm_ref_chrMT.fa.gz; it is read back under that
# name from chrM/ when the chrNamesFixed files are built.
wget --dont-remove-listing --timestamping \
"${NCBI}/M_musculus/CHR_MT/mm_ref_chrMT.fa.gz" \
-O chrM/mm_ref_chrMT.fa.gz
mkdir contigFasta
for C in 1 2 3 4 5 6 7 8 9
do
wget --dont-remove-listing --timestamping \
"${NCBI}/M_musculus/CHR_0${C}/mm_ref_chr${C}.fa.gz" \
-O contigFasta/chr${C}.fa.gz
done
for C in 0 1 2 3 4 5 6 7 8 9
do
wget --dont-remove-listing --timestamping \
"${NCBI}/M_musculus/CHR_1${C}/mm_ref_chr1${C}.fa.gz" \
-O contigFasta/chr1${C}.fa.gz
done
for C in X Y Un MT
do
wget --dont-remove-listing --timestamping \
"${NCBI}/M_musculus/CHR_${C}/mm_ref_chr${C}.fa.gz" \
-O contigFasta/chr${C}.fa.gz
done
mv contigFasta/chrMT.fa.gz contigFasta/chrM.fa.gz
## split up the contigFasta files into their individual contigs
## the sed fixes the fasta header name to just be the contig name
mkdir splitContigs
## one splitContigs/<chr> directory per downloaded contig fasta file;
## every record is renamed to its bare contig accession and written to
## its own file by faSplit
for gz in contigFasta/chr*.fa.gz
do
    file=$(basename ${gz})
    chrom=${file/.fa.gz/}
    echo $gz $file $chrom
    echo -n "${chrom} working ... "
    mkdir -p splitContigs/${chrom}
    zcat ${gz} \
        | sed -e "s/.*ref|/>/; s/|.*//" \
        | faSplit byname stdin splitContigs/${chrom}/
    echo "done"
done
## create agp files for the randoms from seq_contig.md and allcontig.agp
## both fragment and contig agp files
$HOME/kent/src/hg/mouseStuff/buildTools/seqContigToAgp.pl \
randomFragments.agp randomContigs.agp 2> randomContigs.err
## create contig agp file for non-randoms
$HOME/kent/src/hg/mouseStuff/buildTools/mkContigAgp.pl allContigs.agp
## combine the two contig agp files
cat allContigs.agp randomContigs.agp > mm9.contigs.agp
## separate the random contigs from the non-random contigs
$HOME/kent/src/hg/mouseStuff/buildTools/sortRandoms.pl \
randomContigs.agp > mvRandoms.sh
## inspect mvRandoms.sh and then run it if it is OK
chmod +x mvRandoms.sh
./mvRandoms.sh
## verify all contigs exist properly
$HOME/kent/src/hg/mouseStuff/buildTools/checkContigs.pl mm9.contigs.agp
## create all contigs fasta file
cd splitContigs
find . -type f | xargs cat > ../mm9.contigs.fa
## create assembled sequence from these contigs and agp file
cd ..
agpToFa -simpleMulti mm9.contigs.agp all mm9.assembled.fa mm9.contigs.fa
## create fragments agp file
cat chrOnly.agp randomFragments.agp > mm9.fragments.agp
## verify this agp too will work with the assembled fasta
## need 2bit file to avoid fasta file ordering difficulty
faToTwoBit mm9.assembled.fa mm9.assembled.2bit
checkAgpAndFa mm9.fragments.agp mm9.assembled.2bit
## it has a problem with chrY because it is supposed to end with:
chrY 2902556 5902555 29 N 3000000 centromere no
chrY 5902556 15902555 30 N 10000000 contig no
## edit mm9.contigs.agp to add these two lines, and repeat the agpToFa
## after that, this check fails on chrX_random
## this is supposed to be a gap, with N's
## chrX_random 300319 303472 46 N 3154 fragment yes
# Loop: chrX_random, dnaOffset=300318, seqSize=1785075
# agpFrag->chromStart: 300318, agpFrag->chromEnd: 303472, dnaOffset: 300318
# FASTA gap entry
# Bad char a found at index 300349
# Invalid Agp or Fasta file entry for sequence chrX_random
# agpMatchesFaEntry failed; exiting
## this comes from the use of a single fragment in two parts,
## from allcontig.agp
NT_165789.2 296206 300318 45 W CAAA01187194.1 1 4113 +
NT_165789.2 300319 300349 46 N 31 fragment no
NT_165789.2 300350 303372 47 W CAAA01187194.1 4145 7167
## which I processed into:
chrX_random 296206 300318 45 W CAAA01187194.1 1 4113 +
chrX_random 300319 303472 46 N 3154 fragment yes
## should have been
chrX_random 296206 300318 45 W CAAA01187194.1 1 4113 +
chrX_random 300319 300349 46 N 31 fragment yes
chrX_random 300350 303372 47 W CAAA01187194.1 4145 7167 +
### NCBI had this as a non-bridged fragment, a 'no' - I'm making it a yes
## so, edit the randomFragments.agp to fixup that line as indicated
## the chrOnly.agp file also needs an entry for chrM, add this
## line to chrOnly.agp:
chrM 1 16299 1 F NC_005089.1 1 16299 +
## now have successful business:
checkAgpAndFa mm9.fragments.agp mm9.assembled.2bit
# All AGP and FASTA entries agree - both files are valid
## let's get the sequence in order in the fasta file
faSplit byname mm9.assembled.fa splitChr/
cut -f1 mm9.fragments.agp | uniq -c
## using the order of this fragments.agp file
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y M \
13_random 16_random 17_random 1_random 3_random 4_random 5_random \
7_random 8_random 9_random Un_random X_random Y_random
do
cat splitChr/chr${C}.fa
done > mm9.fragorder.assembled.fa
## now that fasta file should also be OK
checkAgpAndFa mm9.fragments.agp mm9.fragorder.assembled.fa
# All AGP and FASTA entries agree - both files are valid
## now ready to give this agp and fasta file off to makeGenomeDb.pl
## pre-release testing download sequence ###############################
mkdir ncbi
cd ncbi
cp -p /cluster/data/mm8/ncbi/.wgetrc .
WGETRC=`pwd`/.wgetrc
export WGETRC
time nice -n +19 wget --timestamping --force-directories \
--directory-prefix=. --dont-remove-listing --recursive \
--level=4 --no-parent --no-host-directories --cut-dirs=1 \
ftp://ftp-private.ncbi.nih.gov/mouse_37
# Downloaded: 2,599,733,765 bytes in 196 files
# The pre-release sequence, April 5th:
mkdir /cluster/data/mm9/pre_release
cd /cluster/data/mm9/pre_release
# The .wgetrc is the anonymous user
cat << '_EOF_' > .wgetrc
login = anonymous
passwd = <your email address>
'_EOF_'
# << happy emacs
chmod 600 .wgetrc
WGETRC=`pwd`/.wgetrc
export WGETRC
wget --timestamping --force-directories --directory-prefix=. \
--dont-remove-listing --recursive --level=4 --no-parent \
--no-host-directories --cut-dirs=3 \
ftp://ftp.ncbi.nih.gov/genomes/M_musculus/pre_release
## Ran a quick test build with that to see if it would work
### this procedure run for the pre_release and the mouse_37 sequence
### for pre_release the sed was:
# zcat chrfasta/chr${C}.fa.gz | sed -e "s/^>lcl|/>/; s/.fa.*//"
mkdir chrNamesFixed
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
do
zcat chrfasta/chr${C}.fa.gz | sed -e "s/^>gi.*/>chr${C}/" \
| gzip -c > chrNamesFixed/chr${C}.fa.gz
echo chr${C} done
done
zcat chrM/mm_ref_chrMT.fa.gz | sed -e "s/^>gi.*/>chrM/" \
| gzip -c > chrNamesFixed/chrM.fa.gz
## later on, an error was discovered in the processing of chrY_random
# a lot of gaps of size zero were inserted. They didn't cause any
# disruption to the assembly track, they only caused extra gap entries
# that were useless. So, to fixup, remove anything in the chrY_random_gap
# table that has a size of zero:
hgsql -e 'delete from chrY_random_gap where size<"1";' mm9
## And, fixing the one fragment on chrX_random
## The gold row starting at 296205 is replaced by the 31-base fragment gap
## plus the two pieces of CAAA01187194.1 on either side of it.
## Start coordinates in these statements are one less than the 1-based
## starts in the corrected AGP lines shown earlier (e.g. 300319 -> 300318).
hgsql -e 'INSERT chrX_random_gap VALUES("587", "chrX_random",
"300318", "300349", "46", "N", "31", "fragment", "yes")' mm9
hgsql -e 'DELETE from chrX_random_gold where chromStart="296205";' mm9
hgsql -e 'INSERT chrX_random_gold VALUES("587", "chrX_random",
"296205", "300318", "45", "W", "CAAA01187194.1", "0", "4113", "+")' mm9
## NOTE(review): the corrected AGP shown earlier numbers this second piece
## as part 47, but the row below is inserted with "45" -- confirm the
## intended value before reusing this statement.
hgsql -e 'INSERT chrX_random_gold VALUES("587", "chrX_random",
"300349", "303372", "45", "W", "CAAA01187194.1", "4144", "7167", "+")' mm9
##########################################################################
## final makeGenomeDb.pl (DONE - 2007-07-19 - Hiram)
## to make this go again, some things need to be removed or set-aside
ssh hgwdev
hgsql -e 'delete from dbDb where name="mm9";' hgcentraltest
rm -fr /gbdb/mm9
ssh kkstore06
cd /cluster/data/mm9
# Set the earlier build products aside under pre_release names so that the
# makeGenomeDb.pl run that follows does not find them in place.
mv mm9.config.ra mm9.config.pre_release.ra
mv bed bed.pre_release
mv mm9.unmasked.2bit mm9.unmasked.2bit.pre_release
mv mm9.agp mm9.agp.pre_release
mv mm9.randoms.2bit mm9.randoms.2bit.pre_release
mv mm9.rmsk.2bit mm9.rmsk.2bit.pre_release
mv mm9.rmskTrf.2bit mm9.rmskTrf.2bit.pre_release
rm mm9.2bit
# Removes every entry in this directory whose name is exactly one or two
# characters long -- presumably the per-chromosome directories (1, 10, X,
# ...) left by the earlier build; TODO confirm before re-running.
rm -fr ? ??
mv dbDbInsert.sql dbDbInsert.sql.pre_release
mv makeGenomeDb.out makeGenomeDb.out.pre_release
mv chrom.lst chrom.lst.pre_release
mv jkStuff jkStuff.pre_release
## ask cluster-admin to rename the existing mm9 db to be mm9prerelease
cat << '_EOF_' > mm9.config.ra
# Config parameters for makeGenomeDb.pl:
db mm9
scientificName Mus musculus
commonName Mouse
assemblyDate Jul. 2007
assemblyLabel NCBI Build 37
orderKey 121
mitoAcc none
fastaFiles /cluster/data/mm9/mouse_37/mm9.fragorder.assembled.fa
agpFiles /cluster/data/mm9/mouse_37/mm9.fragments.agp
# qualFiles /dev/null
dbDbSpeciesDir mouse
'_EOF_'
# << happy emacs
time nice -n +19 makeGenomeDb.pl mm9.config.ra > makeGenomeDb.out 2>&1 &
# real 24m24.468s
ssh hgwdev
featureBits mm9 gold
# 2620346158 bases of 2620346158 (100.000%) in intersection
featureBits mm8 gold
# 2567283971 bases of 2567283971 (100.000%) in intersection
featureBits mm9 gap
# 105419323 bases of 2620346158 (4.023%) in intersection
featureBits mm8 gap
# 97171117 bases of 2567283971 (3.785%) in intersection
# verify index is correct:
hgsql mm9 -e "show index from gc5Base;"
# should see good numbers in Cardinality column
# Reset default position to be like Mm8
hgsql -e \
'update dbDb set defaultPos="chr12:50258170-50263946" where name="mm9";' \
hgcentraltest
# create initial symlink for 2bit sequence
mkdir /gbdb/mm9
mkdir /gbdb/mm9/html
ln -s /cluster/data/mm9/mm9.unmasked.2bit /gbdb/mm9/mm9.2bit
## enter the trackDb business (was done in the pre-release test)
##########################################################################
## Initial pre-release makeGenomeDb.pl (DONE - 2007-04-05 - Hiram)
ssh kkstore02
cd /cluster/data/mm9
cat << '_EOF_' > mm9.config.ra
# Config parameters for makeGenomeDb.pl:
db mm9
scientificName Mus musculus
commonName Mouse
assemblyDate Apr. 2007
assemblyLabel NCBI Build 37
orderKey 121
mitoAcc 33115104
fastaFiles /cluster/data/mm9/pre_release/chrNamesFixed/chr*.fa.gz
agpFiles /cluster/data/mm9/pre_release/chrOnly.agp
# qualFiles /dev/null
dbDbSpeciesDir mouse
'_EOF_'
# << happy emacs
time nice -n +19 makeGenomeDb.pl mm9.config.ra > makeGenomeDb.out 2>&1 &
# real 24m24.468s
##########################################################################
## Repeat masker (DONE - 2007-04-05 - Hiram)
## RE-DONE with final sequence 2007-07-19 - Hiram
ssh kkstore06
## use screen for this
mkdir /cluster/data/mm9/bed/RepeatMasker
cd /cluster/data/mm9/bed/RepeatMasker
time nice -n +19 doRepeatMasker.pl -bigClusterHub=kk \
-buildDir=/cluster/data/mm9/bed/RepeatMasker mm9 > do.out 2>&1 &
# real 1726m32.849s
# Completed: 5467 of 5467 jobs
# CPU time in finished jobs: 54774630s 912910.50m 15215.17h 633.97d 1.737 y
# IO & Wait Time: 432302s 7205.04m 120.08h 5.00d 0.014 y
# Average job time: 10098s 168.30m 2.81h 0.12d
# Longest finished job: 20982s 349.70m 5.83h 0.24d
# Submission to last job: 100294s 1671.57m 27.86h 1.16d
ssh kkstore06
cd /cluster/data/mm9
twoBitToFa mm9.rmsk.2bit stdout | faSize stdin
# 2725765481 bases (105419509 N's 2620345972 real 1466644650 upper
# 1153701322 lower) in 35 sequences in 1 files
# %42.33 masked total, %44.03 masked real
##############################################################################
## simpleRepeat masking (DONE - 2007-04-07 - Hiram)
## RE-DONE with final sequence 2007-07-19 - Hiram
ssh kolossus
## use screen for this
mkdir /cluster/data/mm9/bed/simpleRepeat
cd /cluster/data/mm9/bed/simpleRepeat
time nice -n +19 twoBitToFa ../../mm9.unmasked.2bit stdout \
| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
-bedAt=simpleRepeat.bed -tempDir=/scratch/tmp
# real 253m44.602s
# Appears to have an error on something:
# sh: line 1: 18346 File size limit exceeded/cluster/bin/i386/trf /scratch/tmp/stdin_kolossus_3af1_fe9700.tf 2 7 7 80 10 50 2000 -m -d
# Expecting 14 words line 4593 of /scratch/tmp/stdin_kolossus_3af1_fe9700.tf.2.7.7.80.10.50.2000.dat got 1
# Let's try running this on the kki kluster, by chrom
ssh kkr1u00
mkdir /iscratch/i/mus/mm9
cd /iscratch/i/mus/mm9
cp -p /cluster/data/mm9/mm9.unmasked.2bit .
cp -p /cluster/data/mm9/chrom.sizes .
cut -f1 chrom.sizes | while read C
do
twoBitToFa -noMask -seq=${C} mm9.unmasked.2bit stdout | gzip -c > ${C}.fa.gz
echo ${C}
done
for R in 2 3 4 5 6 7 8
do
rsync -a --progress /iscratch/i/mus/mm9/ kkr${R}u00:/iscratch/i/mus/mm9/
done
ssh kki
mkdir /cluster/data/mm9/bed/simpleRepeat/trf
cd /cluster/data/mm9/bed/simpleRepeat/trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set C = $1
set GZ = /iscratch/i/mus/mm9/$C.fa.gz
mkdir -p /scratch/tmp/$C
zcat $GZ > /scratch/tmp/$C/$C.fa
pushd /scratch/tmp/$C
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $C.fa \
/dev/null -bedAt=$C.bed -tempDir=/scratch/tmp/$C
popd
rm -f $C.bed
cp -p /scratch/tmp/$C/$C.bed .
rm -fr /scratch/tmp/$C
'_EOF_'
# << happy emacs
chmod +x runTrf
cat << '_EOF_' > template
#LOOP
./runTrf $(path1) {check out line $(root1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 /iscratch/i/mus/mm9/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
para try ... check ... push ... etc ...
## none of these jobs had any trouble; comparing line counts of these result
## bed files with those of the previous failed run indicates they are identical
# Completed: 35 of 35 jobs
# CPU time in finished jobs: 14620s 243.66m 4.06h 0.17d 0.000 y
# IO & Wait Time: 272s 4.54m 0.08h 0.00d 0.000 y
# Average job time: 425s 7.09m 0.12h 0.00d
# Longest finished job: 1386s 23.10m 0.39h 0.02d
# Submission to last job: 1790s 29.83m 0.50h 0.02d
cat *.bed > ../simpleRepeat.bed
cd ..
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
ssh hgwdev
cd /cluster/data/mm9/bed/simpleRepeat
time nice -n +19 hgLoadBed mm9 simpleRepeat \
simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 1167619 elements of size 16
# real 0m33.312s
nice -n +19 featureBits mm9 simpleRepeat
# 80054947 bases of 2620346158 (3.055%) in intersection
## clean up the /iscratch/i/mus/mm9/ directory
## for downloads:
mkdir trfMaskChrom
cd trfMaskChrom
ln -s ../trf/chr*.bed .
###########################################################################
# CREATE MICROSAT TRACK (DONE - 2007-07-20 - Hiram)
ssh hgwdev
mkdir /cluster/data/mm9/bed/microsat
cd /cluster/data/mm9/bed/microsat
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
../simpleRepeat/simpleRepeat.bed > microsat.bed
hgLoadBed mm9 microsat microsat.bed
# Loaded 195688 elements of size 4
featureBits mm9 microsat
# 8713212 bases of 2620346158 (0.333%) in intersection
featureBits mm8 microsat
# 8570611 bases of 2567283971 (0.334%) in intersection
#############################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2007-07-21 - Hiram)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kkstore06
cd /cluster/data/mm9/bed/simpleRepeat
mkdir trfMask
## write a period <= 12 subset of each per-chrom trf bed file into trfMask/
## under the same file name
for bed in trf/chr*.bed
do
    name=${bed/trf\//}
    echo "${bed} -> ${name}"
    awk '{if ($5 <= 12) print;}' ${bed} > trfMask/${name}
done
## Add trfMask to repeat masked sequence
ssh kkstore06
cd /cluster/data/mm9
cat << '_EOF_' > addTrf.csh
#!/bin/csh -efx
# This script will fail if any of its commands fail.
set DB = mm9
set WORK_DIR = /cluster/data/${DB}
cd ${WORK_DIR}
set inputTwoBit = ${WORK_DIR}/${DB}.rmsk.2bit
set outputTwoBit = ${WORK_DIR}/${DB}.rmskTrf.2bit
cat /cluster/data/${DB}/bed/simpleRepeat/trfMask.bed \
| twoBitMask -add -type=.bed ${inputTwoBit} stdin ${outputTwoBit}
twoBitToFa ${outputTwoBit} stdout | faSize stdin > faSize.${DB}.rmskTrf.txt
'_EOF_'
# << happy emacs
chmod +x ./addTrf.csh
time ./addTrf.csh
cat faSize.mm9.rmskTrf.txt
# 2725765481 bases (105419509 N's 2620345972 real 1465037892 upper
# 1155308080 lower) in 35 sequences in 1 files
# %42.38 masked total, %44.09 masked real
ln -s mm9.rmskTrf.2bit mm9.2bit
# fixup /gbdb/mm9/mm9.2bit symlink to this newly masked sequence
## copy to san for genbank kluster run
cd /cluster/data/mm9
cp -p mm9.rmskTrf.2bit /san/sanvol1/scratch/mm9/mm9.2bit
############################################################################
# BLATSERVERS ENTRY (DONE - 2007-04-09 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("mm9", "blat14", "17790", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("mm9", "blat14", "17791", "0", "1");' \
hgcentraltest
# test it with some sequence
########################################################################
## CYTOBAND - ideogram track (DONE - 2007-08-15 - Hiram)
ssh hgwdev
mkdir /cluster/data/mm9/bed/cytoBand
cd /cluster/data/mm9/bed/cytoBand
# Create bed file
# (this script fixed up to eliminate one of the lines from ideogram file)
$HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl ../../mouse_37/ideogram
### doesn't work, the ideogram file is corrupted, use the one fetched below
## as so:
$HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl ideogram
## can now verify before load:
$HOME/kent/src/utils/ncbi/cytoBandVerify.pl
# everything checks out OK on 21 chroms
# Load the bed file
hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \
mm9 cytoBand cytoBand.bed
# Make cytoBandIdeo track for ideogram gif on hgTracks page.
# For mouse cytoBandIdeo is just a replicate of the cytoBand track.
hgsql -e "drop table cytoBandIdeo;" mm9
hgsql mm9 -e "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;"
## fetch updated ideogram.gz file that has been fixed by NCBI
NCBI=ftp://ftp.ncbi.nih.gov/genomes
MAPVIEW=MapView/Mus_musculus/sequence/BUILD.37.1/updates
wget --dont-remove-listing --timestamping \
"${NCBI}/${MAPVIEW}/ideogram.gz" -O ideogram.gz
## run through the createNcbiCytoBand.pl process above, and then load
## can now verify before load:
$HOME/kent/src/utils/ncbi/cytoBandVerify.pl
# everything checks out OK on 21 chroms
##########################################################################
## GENBANK alignments (DONE - 2007-08-03 - Hiram)
## next time: don't forget to make the 11.ooc file, see below
## generate a lift file that specifies segments separated by non-bridged
## gaps
## make the ooc file
ssh kolossus
cd /cluster/data/mm9
time blat mm9.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=912
# real 2m29.455s
cp -p 11.ooc /san/sanvol1/scratch/mm9
cp -p 11.ooc jkStuff
## also setup /iscratch/i/mus/mm9/ with these files for
## other kluster runs:
# -rw-rw-r-- 1 712923274 Jul 21 13:31 mm9.2bit
# -rw-rw-r-- 1 17179 Jul 23 16:18 nonBridgedGap.lft
# -rw-rw-r-- 1 122352 Jul 24 11:32 11.ooc
ssh hgwdev
cd /cluster/data/mm9/jkStuff
gapToLift mm9 nonBridgedGap.lft
# WARNING: gap at end of chromosome at chrY:5902555-15902555
# WARNING: overlapping gap at chrY:2902555-5902555 and chrY:5902555-15902555
## These warnings are true, chrY has two gaps next to each other, and
## the second one is actually the end of the chrom. This is the way the
## NCBI supplied AGP file is. (this seems to be normal in hg18 too ...)
cp -p nonBridgedGap.lft /san/sanvol1/scratch/mm9
cd ..
cp -p mm9.rmskTrf.2bit /san/sanvol1/scratch/mm9/mm9.2bit
## The genbank.conf entry looks like:
# mm9
mm9.serverGenome = /cluster/data/mm9/mm9.2bit
mm9.clusterGenome = /san/sanvol1/scratch/mm9/mm9.2bit
mm9.ooc = /cluster/data/mm9/11.ooc
mm9.align.unplacedChroms = *
mm9.lift = /cluster/data/mm9/jkStuff/nonBridgedGap.lft
mm9.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
mm9.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
mm9.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
mm9.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
mm9.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
mm9.downloadDir = mm9
mm9.refseq.mrna.xeno.load = yes
mm9.refseq.mrna.xeno.loadDesc = yes
mm9.mgcTables.default = full
mm9.mgcTables.mgc = all
ssh kkstore02
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial mm9 &
## var/build/logs/2007.07.26-21:57:22.mm9.initalign.log
## logFile: var/build/logs/2007.07.23-16:44:31.mm9.initalign.log
# real 771m12.978s
# a couple of failed jobs, finish off the align step manually
ssh kk
cd /cluster/bluearc/genbank/work/initial.mm9/align
para time
# Completed: 50580 of 50580 jobs
# CPU time in finished jobs: 14556484s 242608.06m 4043.47h 168.48d 0.462 y
# IO & Wait Time: 988518s 16475.30m 274.59h 11.44d 0.031 y
# Average job time: 307s 5.12m 0.09h 0.00d
# Longest finished job: 1815s 30.25m 0.50h 0.02d
# Submission to last job: 40513s 675.22m 11.25h 0.47d
## after recovery of the alignments jobs
ssh kkstore02
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -continue=finish -initial mm9 &
# var/build/logs/2007.07.27-11:02:00.mm9.initalign.log
# real 169m53.124s
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad mm9
# var/dbload/hgwdev/logs/2007.07.27-14:10:22.dbload.log
# real 54m55.707s
## the two measurements are for two different runs of genbank,
## the first configured as "ordered", the second configured as "finished"
featureBits mm9 refGene:cds
# 30105171 bases of 2620346127 (1.149%) in intersection
# 30113840 bases of 2620346127 (1.149%) in intersection
featureBits mm9 refGene
# 51164928 bases of 2620346127 (1.953%) in intersection
# 51175624 bases of 2620346127 (1.953%) in intersection
featureBits mm9 mrna
# 135379415 bases of 2620346127 (5.166%) in intersection
# 137195240 bases of 2620346127 (5.236%) in intersection
featureBits mm9 mgcGenes
# 33676155 bases of 2620346127 (1.285%) in intersection
# 34012201 bases of 2620346127 (1.298%) in intersection
featureBits mm9 est
# 184121510 bases of 2620346127 (7.027%) in intersection
# 188799620 bases of 2620346127 (7.205%) in intersection
featureBits mm9 intronEst
# 52305179 bases of 2620346127 (1.996%) in intersection
# 52812173 bases of 2620346127 (2.015%) in intersection
featureBits mm9 xenoMrna
# 46119254 bases of 2620346127 (1.760%) in intersection
# 51438566 bases of 2620346127 (1.963%) in intersection
featureBits mm9 xenoRefGene
# 40378885 bases of 2620346127 (1.541%) in intersection
# 44298281 bases of 2620346127 (1.691%) in intersection
# enable daily alignment and update of hgwdev (DONE - 2007-08-03 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add mm9 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added mm9 - Mus musculus" etc/align.dbs etc/hgwdev.dbs
make etc-update
#########################################################################
# MAP CONTIGS TRACK (DONE - 2007-07-23 - Hiram)
## can take contig information directly from previously created
## mm9.contigs.agp
ssh hgwdev
mkdir /cluster/data/mm9/bed/ctgPos
cd /cluster/data/mm9/bed/ctgPos
grep CONTIG ../../mouse_37/mm9.contigs.agp \
| awk '{printf "%s\t%d\t%s\t%d\t%d\n", $6, $8, $1, $2-1, $3}' \
> mm9.ctgPos.tab
hgsql mm9 < ~/kent/src/hg/lib/ctgPos.sql
hgsql mm9 -e 'load data local infile "mm9.ctgPos.tab" into table ctgPos;'
featureBits -countGaps mm9 ctgPos
# 2623952781 bases of 2725765481 (96.265%) in intersection
featureBits -countGaps mm8 ctgPos
# 2573322222 bases of 2664455088 (96.580%) in intersection
#########################################################################
## Create downloads directory (DONE - 2007-07-25 - Hiram)
ssh hgwdev
mkdir /cluster/data/mm9/bed/simpleRepeat/trfMaskChrom
cd /cluster/data/mm9/bed/simpleRepeat/trfMaskChrom
ln -s ../trf/chr*.bed .
cd /cluster/data/mm9
time nice -n +19 /cluster/bin/scripts/makeDownloads.pl mm9 \
> do.downloads.out 2>&1
# real 41m18.282s
## failed during jkStuff/doInstall.csh:
# foreach size ( 1000 2000 5000 )
# echo 1000
# featureBits mm9 refGene:upstream:1000 -fa=stdout
# setpriority: Permission denied.
# Error writing 50 bytes: Operation not permitted
## remove the "nice" statements from the csh, and finish it off
## edit the README files to indicate correct information
##########################################################################
# MGI LIFTOVER FROM MM8 (DONE 2007-07-26 angie)
ssh kolossus
mkdir /cluster/data/mm9/bed/jaxLiftOver
cd /cluster/data/mm9/bed/jaxLiftOver
ldHgGene -out=stdout -nobin placeholder placeholder \
/cluster/data/mm8/bed/jax/2007_07/jaxRepTranscript.gff \
| liftOver stdin -minBlocks=0.5 \
/cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \
-genePred jaxRepTranscriptLift.{gp,unmapped}
#Read 31587 transcripts in 232925 lines in 1 files
wc -l jaxRepTranscriptLift.{gp,unmapped}
# 31470 jaxRepTranscriptLift.gp
# 234 jaxRepTranscriptLift.unmapped
liftOver -minBlocks=0.5 /cluster/data/mm8/bed/jax/2007_07/jaxAllele.bed \
/cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \
-bedPlus=12 jaxAlleleLift.{bed,unmapped}
wc -l jaxAlleleLift.{bed,unmapped}
# 12372 jaxAlleleLift.bed
# 2 jaxAlleleLift.unmapped
liftOver -minBlocks=0.5 /cluster/data/mm8/bed/jax/2007_07/jaxPhenotype.bed \
/cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \
-bedPlus=12 -tab jaxPhenotypeLift.{bed,unmapped}
wc -l jaxPhenotypeLift.{bed,unmapped}
# 23806 jaxPhenotypeLift.bed
# 0 jaxPhenotypeLift.unmapped
liftOver -minBlocks=0.5 /cluster/data/mm8/bed/jax/2007_07/jaxQtl.bed \
/cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \
-bedPlus=6 -tab jaxQtlLift.{bed,unmapped}
wc -l jaxQtlLift.{bed,unmapped}
# 1539 jaxQtlLift.bed
# 12 jaxQtlLift.unmapped
# Load lifted track tables and original auxiliary tables:
ssh hgwdev
# The lifted .gp/.bed files were written to the mm9 jaxLiftOver directory
# created for the liftOver runs, so the load must run from there (not from
# an mm8 path).
cd /cluster/data/mm9/bed/jaxLiftOver
# jaxRepTranscriptLift
ldHgGene -predTab mm9 jaxRepTranscriptLift jaxRepTranscriptLift.gp
#31470 gene predictions
sed -e 's/jaxRepTranscript/jaxRepTranscriptLift/g' \
/cluster/data/mm8/bed/jax/2007_07/fixJaxRepTranscript.sql \
> fixJaxRepTranscriptLift.sql
hgsql mm9 < fixJaxRepTranscriptLift.sql
hgLoadSqlTab mm9 jaxRepTranscriptAlias \
/cluster/data/mm8/bed/jax/2007_07/jaxRepTranscriptAlias.sql \
/cluster/data/mm8/bed/jax/2007_07/jaxRepTranscriptAlias.tab
hgsql mm9 -e 'rename table jaxRepTranscriptAlias to jaxRepTranscriptLiftAlias;'
# jaxAlleleLift
sed -e 's/bed12Source/jaxAlleleLift/g' \
$HOME/kent/src/hg/lib/bed12Source.sql > jaxAlleleLift.sql
hgLoadBed -sqlTable=jaxAlleleLift.sql mm9 jaxAlleleLift jaxAlleleLift.bed
#Loaded 12372 elements of size 13
sed -e 's/jaxAllele/jaxAlleleLift/g' \
/cluster/data/mm8/bed/jax/2007_07/fixJaxAllele.sql > fixJaxAlleleLift.sql
# empty file, but just in case it has something in the future...
hgsql mm9 < fixJaxAlleleLift.sql
hgLoadSqlTab mm9 jaxAlleleInfo \
~/kent/src/hg/lib/jaxAlleleInfo.sql \
/cluster/data/mm8/bed/jax/2007_07/jaxAlleleInfo.tab
# jaxPhenotypeLift
sed -e 's/bed12Source/jaxPhenotypeLift/g' \
~/kent/src/hg/lib/bed12Source.sql > jaxPhenotypeLift.sql
hgLoadBed -tab -sqlTable=jaxPhenotypeLift.sql mm9 jaxPhenotypeLift \
jaxPhenotypeLift.bed
#Loaded 23806 elements of size 13
sed -e 's/jaxPhenotype/jaxPhenotypeLift/g' \
/cluster/data/mm8/bed/jax/2007_07/fixJaxPhenotype.sql \
> fixJaxPhenotypeLift.sql
# empty file, but just in case it has something in the future...
hgsql mm9 < fixJaxPhenotypeLift.sql
hgLoadSqlTab mm9 jaxPhenotypeAlias \
/cluster/data/mm8/bed/jax/2007_07/jaxPhenotypeAlias.sql \
/cluster/data/mm8/bed/jax/2007_07/jaxPhenotypeAlias.tab
hgsql mm9 -e 'rename table jaxPhenotypeAlias to jaxPhenotypeLiftAlias;'
# phenotype-allele relationships
hgLoadSqlTab mm9 jaxAllelePheno \
~/kent/src/hg/lib/jaxAllelePheno.sql \
/cluster/data/mm8/bed/jax/2007_07/jaxAllelePheno.tab
# jaxQTLLift
# Derive the jaxQTLLift table definition from the jaxQTL one by renaming
# the table. The space before the continuation backslash keeps the sed
# expression and the input file name as two separate arguments (and lets
# the leading ~ expand).
sed -e 's/jaxQTL/jaxQTLLift/g' \
~/kent/src/hg/lib/jaxQTL.sql > jaxQTLLift.sql
hgLoadBed -tab -notItemRgb -noBin \
-sqlTable=jaxQTLLift.sql \
mm9 jaxQTLLift jaxQtlLift.bed
#Loaded 1539 elements of size 10
# Add row to mm9.grp for Phenotype and Allele track group:
hgsql mm9 -e 'insert into grp values("phenoAllele", "Phenotype and Allele", 4.5);'
##########################################################################
## Creating pushQ (DONE - 2007-07-26 - Hiram)
ssh hgwdev
mkdir /cluster/data/mm9/pushQ
cd /cluster/data/mm9/pushQ
/cluster/bin/scripts/makePushQSql.pl mm9 > mm9.sql 2> stderr.out
## check the stderr.out for anything that needs to be fixed
## copy mm9.sql to hgwbeta:/tmp
scp mm9.sql hgwbeta:/tmp
## then on hgwbeta
ssh hgwbeta
cd /tmp
hgsql qapushq < mm9.sql
#############################################################################
# STS MARKERS DATA DOWNLOAD (DONE - 2007-07-26 - Hiram)
ssh kkstore06
mkdir -p /cluster/data/mm9/bed/STSmarkers/downloads
cd /cluster/data/mm9/bed/STSmarkers/downloads
# these files appear to be new almost every day
time nice -n +19 wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts
time nice -n +19 wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
# The new feature in the .aliases file this time is names with
# spaces in them!  This changes our parsing business below;
# hopefully the spaces in the names won't cause trouble elsewhere.
time nice -n +19 wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/*
# these reports from jax.org appear to be changing daily
time nice -n +19 wget --timestamping \
ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt
time nice -n +19 wget --timestamping \
ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt
time nice -n +19 wget --timestamping \
ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt
ls -ogrt
# -rw-rw-r-- 1 676 Mar 11 2004 README
# -rw-rw-r-- 1 396858 Jan 28 2005 10090.MGI.txt
# -rw-rw-r-- 1 390139 Mar 16 2005 10090.WI_MRC_RH.txt
# -rw-rw-r-- 1 240688 Mar 16 2005 10090.WI-YAC.txt
# -rw-rw-r-- 1 173344 Mar 16 2005 10090.WI-Genetic.txt
# -rw-rw-r-- 1 25691253 Jan 13 2006 UniSTS.aliases
# -rw-rw-r-- 1 4582158 Jul 5 11:40 UniSTS_mouse.sts
# -rw-rw-r-- 1 2841773 Jul 26 03:13 PRB_PrimerSeq.rpt
# -rw-rw-r-- 1 5149790 Jul 26 03:13 MRK_Sequence.rpt
# -rw-rw-r-- 1 5697140 Jul 26 03:13 MRK_Dump2.rpt
# I note the UniSTS.aliases file is over twice as big as was in
# Mm7 build. I wonder what got into it ...
# What got into it was that it was completely broken. It appeared
# to have a vast section of itself duplicated again in the file.
# It was cleaned up via:
echo -e "#Unique ID\tAliases" > uniqueSTS.aliases
grep -v "^#" UniSTS.aliases | sort -n | uniq >> uniqueSTS.aliases
mv UniSTS.aliases UniSTS.aliases.broken
mv uniqueSTS.aliases UniSTS.aliases
# back to our work area, update the bed file
# to do this we need a new UniSTS_mouse.alias file
# it is created by a combination of information from several
# of the above files ! AND ! the previous stsInfoMouse.bed file
# the db reference here is to the previous build
time nice -n +19 ~/kent/src/hg/stsMarkers/fetchAllAliases.csh mm8
# Here is a normal set of errors:
# processing UniSTS_mouse.sts to find aliases
# # ERROR: KNOWN(==OK) duplicate ID: '108991' encountered at line
# # 2384
# processing MGI.aliases
# fetching existing aliases from previous stsInfoMouse.bed file
# found 27648 potential errors in
# /cluster/data/mm8/bed/STSmarkers/stsInfoMouse.bed
# to see the errors: grep ERROR stsInfoAliases.txt
# verify those stsInfoMouse.bed aliases with UniSTS.aliases
# those errors in the previous stsInfoMouse.bed file are an
# accumulation of errors from a long long time ago in this chain
# of processing. Some day it might be nice to fix them, but they
# don't seem to bother anything, so they continue to be carried
# forward, and a couple of new ones are added with each assembly.
####################################################################
## STS markers data processing track (DONE - 2007-07-26 - Hiram)
ssh hgwdev
cd /cluster/data/mm9/bed/STSmarkers
# create a new stsInfoMouse.bed file:
# Update the m m 8 directory name here to m m 9
# for the next build of m m 10, ...etc... and so forth
time ~/kent/src/hg/stsMarkers/updateBed.pl \
/cluster/data/mm8/bed/STSmarkers/stsInfoMouse.bed \
downloads/MRK_Dump2.rpt \
downloads/PRB_PrimerSeq.rpt \
downloads/MRK_Sequence.rpt \
downloads/UniSTS_mouse.alias \
downloads/UniSTS_mouse.sts \
-g downloads/10090.WI-Genetic.txt \
-r downloads/10090.WI_MRC_RH.txt \
-verbose 2> dbg.updateBed | sed -e "s/\t*$//" > newbedfile
~/kent/src/hg/stsMarkers/cleanInfo.pl -mouse newbedfile \
| sed -e "s/\t*$//" > mm9.stsInfoMouse.bed
# copy the stsInfoMouse.bed file from working dir to the marker
# info storage folder. added 2 new steps by Yontao
# be wary of the archive name here, check the directory and get
# the name right here.
mv /cluster/store5/mouseMarker/stsInfoMouse.bed \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.secondTime
cp -p mm9.stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed
# comparing to previous, numbers increase slightly each time
wc -l /cluster/store5/mouseMarker/stsInfoMouse.bed \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.secondTime \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \
/cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
# 66782 /cluster/store5/mouseMarker/stsInfoMouse.bed
# 60631 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.secondTime
# 59843 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7
# 58980 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6
# 58493 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
# and from that, create new primer fa, epcr, etc:
time ~/kent/src/hg/stsMarkers/luConvertPrimerToFa \
mm9.stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info
# the mouseC.fa file will be empty; the mouseP.* files should have
# more lines than last time
wc -l mouse?.*
# 0 mouseC.fa
# 359647 mouseP.fa
# 41247 mouseP.info
# the equivalent Mm8 files:
# 0 mouseC.fa
# 308384 mouseP.fa
# 34666 mouseP.info
# copy the primers over to some filesystem close to the klusters
# and split them up to have a small number of sequences in one file
mkdir /cluster/bluearc/mm9/stsMarkers
cp -p mouseP.fa /cluster/bluearc/mm9/stsMarkers
cd /cluster/bluearc/mm9/stsMarkers
cp -p /cluster/data/mm9/11.ooc .
mkdir split
# 356 files for 41,247 sequences, == about 116 sequences per file
faSplit sequence mouseP.fa 400 split/mm_
# PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE.
# This process could convert to a modern version of blat with the
# filters as described, for example, in the STS markers build in Hg18
# CLUSTER RUN FOR THE STS PRIMERS
ssh kk
cd /cluster/data/mm9/bed/STSmarkers
mkdir primer
mkdir ePCR
cd primer
mkdir out
# interestingly, this blat2.2 binary did not function correctly
# when given nib files. It has only about 1/4th of the number of
# alignments as it gets when it used fa files for the target
# sequence.
ls -1S /cluster/bluearc/mm9/stsMarkers/split > primers.list
# will fetch chrom sequences from the 2bit file
cut -f1 /cluster/data/mm9/chrom.sizes > chr.list
## next time, make this script produce its results in /scratch/tmp
## then move result file to output instead of writing result
## to output
cat << '_EOF_' > runBlat2
#!/bin/csh -fe
# Align one split primer file against one chromosome with blat.2,
# working in a node-local scratch directory.
#   $1 - primer fa file name within the stsMarkers split directory
#   $2 - sequence name to extract from mm9.2bit
#   $3 - result psl file, relative to the directory the job starts in
set primer = /cluster/bluearc/mm9/stsMarkers/split/$1
set root1 = $1:r
set fa = $root1.$2.fa
set ooc = /cluster/bluearc/mm9/stsMarkers/11.ooc
set root2 = $2:r
set tmpDir = /scratch/tmp/$root1.$root2
# clear out anything left behind by an earlier failed attempt at this
# job, otherwise the mkdir fails and -e stops the script on every retry
rm -fr $tmpDir
mkdir $tmpDir
mkdir -p out/${root2}
set out = $3
pushd $tmpDir
twoBitToFa -seq=$2 /iscratch/i/mus/mm9/mm9.2bit ${fa}
cp -p ${primer} primer.fa
cp -p ${ooc} 11.ooc
/cluster/bin/i386/blat.2 ${fa} primer.fa -ooc=11.ooc \
-minMatch=1 -minScore=0 -minIdentity=80 -oneOff result.psl
popd
# back in the starting directory, so the relative result path resolves
cp -p ${tmpDir}/result.psl ${out}
rm -fr ${tmpDir}
'_EOF_'
# << happy emacs
chmod +x runBlat2
cat << '_EOF_' > template
#LOOP
./runBlat2 $(path1) $(path2) {check out line+ out/$(root2)/$(root1).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 primers.list chr.list template jobList
para create jobList
para try ... check ... push ... etc ...
# Completed: 12425 of 12425 jobs
# CPU time in finished jobs: 1438098s 23968.31m 399.47h 16.64d 0.046 y
# IO & Wait Time: 237582s 3959.69m 65.99h 2.75d 0.008 y
# Average job time: 135s 2.25m 0.04h 0.00d
# Longest finished job: 2150s 35.83m 0.60h 0.02d
# Submission to last job: 4736s 78.93m 1.32h 0.05d
# on the file server
ssh kkstore06
cd /cluster/data/mm9/bed/STSmarkers/primer
time nice -n +19 pslSort dirs primers.raw.psl temp out/chr*
# real 1m34.193s
# -rw-rw-r-- 1 700293557 Aug 6 10:22 primers.raw.psl
# filter alignments for (qEnd-qStart) vs. (tEnd-tStart)
# should not be more than 100 bases different.
# This filters out about 948,260 alignments, or
# %17.4 = 100.0 * 948260 / 5462936
time nice -n +19 pslSort dirs stdout temp out/chr* \
| awk -F"\t" '{
  # query span minus target span
  spanDiff = ($13 - $12) - ($17 - $16)
  if (spanDiff > -100 && spanDiff < 100) print
}' > primers.100.psl
rmdir temp
wc -l *.psl
# 5340677 primers.100.psl
# 6498150 primers.raw.psl
echo "6498150-5340677" | bc -q
# 1157473 difference
# a rough comparison with previous results:
wc -l primers.100.psl \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.100
# 5340677 primers.100.psl
# 4514676 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.100
# another kluster run for the ePCR
ssh pk
cd /cluster/data/mm9/bed/STSmarkers/ePCR
cut -f1 /cluster/data/mm9/chrom.sizes > chr.list
# Using previously fetched e-PCR source from
# ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/
# version 2.3.1 11 Feb 2005
# Had to add the following to both re-PCR_main.cpp and
# e-PCR_main.cpp to get them to compile on kolossus:
# // max and min Copied from /usr/include/mysql/my_global.h
#define max(a, b) ((a) >? (b))
#define min(a, b) ((a) <? (b))
mkdir out
cat << '_EOF_' > runPCR
#!/bin/csh -fe
# Run e-PCR with the mouseP.info primers against one chromosome,
# working in a node-local scratch directory.
#   $1 - sequence name to extract from mm9.2bit
#   $2 - result file
set chr = $1
set out = $2
set wrkdir = /scratch/tmp/epcr.mm9.$chr
set fa = $chr.fa
set tmpResult = $chr.result.epcr
# remove leftovers from an earlier failed attempt at this job so that
# the mkdir below does not stop the script under -e
rm -fr $wrkdir
mkdir $wrkdir
twoBitToFa -seq=$chr /san/sanvol1/scratch/mm9/mm9.2bit $wrkdir/$fa
pushd $wrkdir
/cluster/bin/x86_64/e-PCR \
/cluster/data/mm9/bed/STSmarkers/mouseP.info $fa N=1 M=50 W=5 > $tmpResult
popd
cp -p $wrkdir/$tmpResult $out
rm $wrkdir/$tmpResult
rm $wrkdir/$fa
rmdir $wrkdir
'_EOF_'
# << happy emacs
chmod +x runPCR
cat << '_EOF_' > template
#LOOP
./runPCR $(path1) {check out line+ out/$(root1).epcr}
#ENDLOOP
'_EOF_'
# << the mouseP.info was created above
gensub2 chr.list single template jobList
para create jobList
para try
para check
para push
... etc ...
## two of those produce zero results:
# -rw-rw-r-- 1 0 Aug 6 12:53 chr3_random.epcr
# -rw-rw-r-- 1 0 Aug 6 12:53 chr16_random.epcr
## hence, the two crashed jobs in the check display:
# Completed: 33 of 35 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 80940s 1349.01m 22.48h 0.94d 0.003 y
# IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y
# Average job time: 2327s 38.78m 0.65h 0.03d
# Longest finished job: 6980s 116.33m 1.94h 0.08d
# Submission to last job: 15589s 259.82m 4.33h 0.18d
ssh kkstore06
cd /cluster/data/mm9/bed/STSmarkers/ePCR
# all those results become all.epcr
cat out/*.epcr > all.epcr
# comparing to previous results, should have more with new results:
wc -l all.epcr /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR/all.epcr
# 87623 all.epcr
# 58162 /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR/all.epcr
cd /cluster/data/mm9/bed/STSmarkers/primer
~/kent/src/hg/stsMarkers/filterSTSPrimers \
-mouse ../mm9.stsInfoMouse.bed primers.100.psl \
../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat
# The output should show an increasing count:
# Reading name info from: ../mm9.stsInfoMouse.bed
# Reading primer info from: ../mouseP.info
# Reading ePCR info from: ../ePCR/all.epcr
# Reading alignment results from: primers.100.psl
# 100000
# 200000
# ...
# 5200000
# 5300000
# Determining ePCR not found from ePCR results
# Out of 26332 ePCR alignments examined, not found: 527
## compare with previous build results
wc -l primers.psl.filter.blat \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.blat
# 35537 primers.psl.filter.blat
# 34043 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.blat
## ouch, mm9 does not have lift files for contigs to chroms
## let's make a contig lift file
cd /cluster/data/mm9/jkStuff
cp -p /cluster/data/cb3/jkStuff/agpToLift.pl .
grep CONTIG ../mouse_37/mm9.contigs.agp \
| ./agpToLift.pl /dev/stdin > mm9.contigs.lift
awk '{if (! match($5,"N")) print}' ../mouse_37/mm9.fragments.agp \
| /cluster/data/rn3/jkStuff/agpToLift.pl ../chrom.sizes /dev/stdin \
> mm9.fragments.lift
cd ..
mkdir ctgLifts
splitFileByColumn -col=4 jkStuff/mm9.contigs.lift ctgLifts
mkdir fragmentLifts
splitFileByColumn -col=4 jkStuff/mm9.fragments.lift fragmentLifts
## distribute those in the old-style lift directory hierarchy
# wipe and recreate each chrom lift directory, then copy in whichever
# of the ordered and random contig lift files exist for that chrom
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 Un X Y M; do
  rm -fr "${C}/lift"
  mkdir -p "${C}/lift"
  if test -f "ctgLifts/chr${C}.contigs.lift"; then
    cp -p "ctgLifts/chr${C}.contigs.lift" "${C}/lift/ordered.lft"
  fi
  if test -f "ctgLifts/chr${C}_random.contigs.lift"; then
    cp -p "ctgLifts/chr${C}_random.contigs.lift" "${C}/lift/random.lft"
  fi
done
## not the fragments
# for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 Un X Y M
#do
# mkdir -p ${C}/lift
# if [ -f fragmentLifts/chr${C}.fragments.lift ]; then
# cp -p fragmentLifts/chr${C}.fragments.lift ${C}/lift/ordered.lft
# fi
# if [ -f fragmentLifts/chr${C}_random.fragments.lift ]; then
# cp -p fragmentLifts/chr${C}_random.fragments.lift ${C}/lift/random.lft
# fi
#done
## now, after that side trip, back to the primer business
# create file accession_info.rdb
touch empty_sequence.inf
~/kent/src/hg/stsMarkers/compileAccInfo -mouse \
/cluster/data/mm9 empty_sequence.inf
# 20363 processed
mv accession_info.rdb accession_info.rdb.tmp
~/kent/src/hg/stsMarkers/sorttbl -x Chr Ord Start \
< accession_info.rdb.tmp > accession_info.rdb
# The -x prints the debug statement:
# sort arg: -t" " +0 -1 +1 -2g +2 -3g
rm accession_info.rdb.tmp
# comparing results to previous
# Continuing the trend that began with Mm7, the numbers in
# accession_info.rdb continue to decrease. Even Mm8 has far fewer
# fragments than Mm7 did:
# e.g.:
[hiram@kkstore06 /cluster/data] wc -l mm9/?/chr*.agp mm9/??/chr*.agp | tail -1
# 21699 total
[hiram@kkstore06 /cluster/data] wc -l mm8/*/chr*.agp | tail -1
# 21910 total
[hiram@kkstore06 /cluster/data] wc -l mm7/*/chr*.agp | tail -1
# 70125 total
[hiram@kkstore06 /cluster/data] wc -l mm6/*/chr*.agp | tail -1
# 170812 total
# Line counts of this build's accession_info.rdb and the mm8 one.
# A stray primers.psl.filter.blat path used to follow this command with
# no continuation before it, so it would have been run as a command; the
# recorded output lists only these two files.
wc -l accession_info.rdb \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/accession_info.rdb
# 20333 accession_info.rdb
# 20385 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/accession_info.rdb
# creates epcr.not.found.nomatch and epcr.not.found.psl
~/kent/src/hg/stsMarkers/epcrToPsl -mouse \
epcr.not.found ../mouseP.info \
accession_info.rdb /cluster/data/mm9/mm9.2bit 2> dbg.epcrToPsl
# the dbg.epcrToPsl has a number of lines complaining about bad
# primers in ../mouseP.info - and indeed they are bad primers,
# they do not have a second primer.
# Comparing results to previous:
wc -l epcr* \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr*
# 527 epcr.not.found
# 0 epcr.not.found.nomatch
# 527 epcr.not.found.psl
# 520 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr.not.found
# 0 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr.not.found.nomatch
# 520 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr.not.found.psl
# Mm7 wc epcr*
wc -l /cluster/data/mm7/bed/STSmarkers/primer/epcr*
# 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found
# 0 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.nomatch
# 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.psl
# 158 /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl
# 1106 total
cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
wc -l primers.psl.filter \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter
# 36064 primers.psl.filter
# 34563 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter
# create primers.psl.filter.lifted.initial
# The PATH setting allows extractPslInfo to find other programs that it
# is going to use.
PATH=~/kent/src/hg/stsMarkers:$PATH \
~/kent/src/hg/stsMarkers/extractPslInfo primers.psl.filter
wc -l *.initial \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.initial
# 36040 primers.psl.filter.initial
# 34545 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.initial
# create primers.psl.filter.lifted.initial.acc
PATH=~/kent/src/hg/stsMarkers:$PATH \
~/kent/src/hg/stsMarkers/findAccession -agp \
-mouse primers.psl.filter.initial /cluster/data/mm9
wc -l *.initial.acc /cluster/data/mm8/bed\
/STSmarkers.2006-08-29/primer/primers.psl.filter.initial.acc
# 36040 primers.psl.filter.initial.acc
# 34545 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.initial.acc
# this needs to be -rat as that specifies how to scan the
# stsInfoMouse.bed file and it does not work if you use -mouse
# it is not clear what -mouse would mean to this script, some other file
# format perhaps from the stsInfoMouse.bed format.
~/kent/src/hg/stsMarkers/getStsId -rat \
../mm9.stsInfoMouse.bed primers.psl.filter.initial.acc \
| sort -k4,4n > primers.final
wc -l primers.final \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.final
# 36040 primers.final
# 34545 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.final
cd /cluster/data/mm9/bed/STSmarkers
# stsMarkers.final is empty for mouse
touch stsMarkers.final dummy
PATH=~/kent/src/hg/stsMarkers:$PATH \
~/kent/src/hg/stsMarkers/combineSeqPrimerPos \
stsMarkers.final primer/primers.final > stsMarkers_pos.rdb
wc -l stsMarkers_pos.rdb \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMarkers_pos.rdb
# 34232 stsMarkers_pos.rdb
# 33048 /cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMarkers_pos.rdb
PATH=~/kent/src/hg/stsMarkers:$PATH \
~/kent/src/hg/stsMarkers/createStsBed \
mm9.stsInfoMouse.bed stsMarkers_pos.rdb 500 \
| sort -k1,1 -k2,2n | sed -e "s/ //g" > stsMapMouse.bed
# The sed removes unneeded blanks
# verify score profile remains similar
awk -F'\t' '{print $5}' stsMapMouse.bed | sort -n | uniq -c
# 591 500
# 1774 750
# 28529 1000
awk -F'\t' '{print $5}' \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMapMouse.bed \
| sort -n | uniq -c
# 546 500
# 1650 750
# 27705 1000
wc -l stsMapMouse.bed \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMapMouse.bed
# 30894 stsMapMouse.bed
# 29901 /cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMapMouse.bed
## check the names, look for odd ones
## the bogus names "-" were fixed for mm9
awk -F'\t' '{print $4}' stsMapMouse.bed | sort | head
awk -F'\t' '{print $4}' stsMapMouse.bed | sort | tail
# loading STS markers tables
ssh hgwdev
cd /cluster/data/mm9/bed/STSmarkers
~/kent/src/hg/stsMarkers/ucscAlias.pl \
mm9.stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
# this does leave messages in ucscStsAlias.warnings but they seem
# to be very similar to Mm6 with just a few new ones
wc -l ucscStsAlias.tab \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/ucscStsAlias.tab
# 146359 ucscStsAlias.tab
# 146767 /cluster/data/mm8/bed/STSmarkers.2006-08-29/ucscStsAlias.tab
ssh hgwdev
cd /cluster/data/mm9/bed/STSmarkers
## when reloading:
hgsql -e "drop table stsAlias;" mm9
hgsql -e "drop table stsMapMouseNew;" mm9
hgsql -e "drop table stsInfoMouseNew;" mm9
hgsql mm9 < ~/kent/src/hg/lib/stsAlias.sql
hgsql -e \
'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm9
hgsql mm9 < ~/kent/src/hg/lib/stsMapMouseNew.sql
hgsql -e \
'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm9
hgsql mm9 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
hgsql -e \
'load data local infile "mm9.stsInfoMouse.bed" into table stsInfoMouseNew;' mm9
hgsql -e "drop table all_sts_primer;" mm9
hgLoadPsl -nobin -table=all_sts_primer mm9 primer/primers.psl.filter
# load of all_sts_primer did not go as planned: 36064 record(s),
# 0 row(s) skipped, 1 warning(s) loading primer/primers.psl.filter
# After warnings, checkTableCoords to find problems:
checkTableCoords -verboseBlocks mm9 all_sts_primer
# mm9.all_sts_primer item 61999 chr10:62485403-62485439: blocks 0 and 1 overlap.
# mm9.all_sts_primer has 1 records with overlapping blocks.
# Strip the offending item from the load
hgsql -e 'delete from all_sts_primer where tName="chr10" AND tStart=62485403 AND tEnd=62485439;' mm9
# load primer sequences
mkdir /gbdb/mm9/stsMarker
ln -s /cluster/data/mm9/bed/STSmarkers/mouseP.fa \
/gbdb/mm9/stsMarker/mouseP.fa
# PLEASE NOTE: if you are going to reload this business, use the
# -replace option on this hgLoadSeq
# hgLoadSeq -replace mm9 /gbdb/mm9/stsMarker/mouseP.fa
# otherwise there will be a problem that the seq and extFile tables
# will be out of sync.
hgLoadSeq -replace mm9 /gbdb/mm9/stsMarker/mouseP.fa
# Adding /gbdb/mm9/stsMarker/mouseP.fa
# 41247 sequences
# Warning: load of seq did not go as planned: 41330 record(s), 0 row(s)
# skipped, 1 warning(s) loading ./seq.tab
## joinerCheck should be clean:
joinerCheck -keys -identifier=mouseStsTrueName -database=mm9 all.joiner
# Checking keys on database mm9
# mm9.stsAlias.trueName - hits 146350 of 146359 ok
# mm9.all_sts_primer.qName - hits 35537 of 36063 ok
# mm9.stsMapMouseNew.name - hits 30894 of 30894 ok
featureBits mm9 all_sts_primer
# 3795229 bases of 2620346127 (0.145%) in intersection
featureBits mm8 all_sts_primer
# 3700897 bases of 2567283971 (0.144%) in intersection
featureBits mm9 stsMapMouseNew
# 4884563 bases of 2620346127 (0.186%) in intersection
featureBits mm8 stsMapMouseNew
# 4812616 bases of 2567283971 (0.187%) in intersection
hgsql -N mm9 -e "select count(*) from stsAlias;"
# 146359
hgsql -N mm8 -e "select count(*) from stsAlias;"
# 146767
hgsql -N mm9 -e "select count(*) from stsInfoMouseNew;"
# 66782
hgsql -N mm8 -e "select count(*) from stsInfoMouseNew;"
# 60631
# compare old and new name lists, not much difference:
awk '{print $4}' stsMapMouse.bed | sort -u > mm9.nameList
# in common with previous version:
comm -12 \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/mm8.nameList \
mm9.nameList | wc -l
# 28596
# unique to previous version:
comm -23 \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/mm8.nameList \
mm9.nameList | wc -l
# 111
# unique to this new set:
comm -13 \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/mm8.nameList \
mm9.nameList | wc -l
# 1017
###########################################################################
# Reset default position to be same area as Mm8, 2007-08-02 - Hiram
hgsql -e \
'update dbDb set defaultPos="chr12:57795963-57815592" where name="mm9";' \
hgcentraltest
##############################################################################
# CLONE ENDS - BACEND TRACK (DONE - 2007-08-02 - 2007-08-03 - Hiram)
ssh kkstore06
cd /cluster/data/mm9
# check disk space: 1.2T free
df -h .
# Filesystem Size Used Avail Use% Mounted on
# /export/cluster/store4
# 2.3T 997G 1.2T 46% /cluster/store4
mkdir -p bed/cloneend/ncbi
cd bed/cloneend/ncbi
wget --timestamping \
ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/mus_musculus/*
cd /cluster/data/mm9/bed/cloneend
# seems like the *.mfa files were split just for convenience
# concatenate, and convert the title line of the fasta sequences
cat << '_EOF_' > convert.pl
#!/usr/bin/env perl
# Rewrite each fasta title line to the bare accession that follows a
# gb or dbj tag; sequence lines pass through unchanged.
use strict;
use warnings;

while (my $line = <>) {
  # sequence data: copy through untouched
  if ($line !~ m/^>/) {
    print $line;
    next;
  }
  # title line: fields are separated by the pipe character
  my @fields = split('\|', $line);
  my $found = 0;
  for my $i (0 .. $#fields) {
    next unless ($fields[$i] eq "gb" || $fields[$i] eq "dbj");
    # accession.version follows the tag, keep only the accession
    my ($name) = split(/\./, $fields[$i+1]);
    print ">$name\n";
    $found = 1;
    last;
  }
  die("Failed for $line\n") if (!$found);
}
'_EOF_'
# << happy emacs
chmod +x convert.pl
# decompress every mfa piece in glob order, rewrite the title lines,
# and recompress into a single fasta file
zcat ncbi/*.mfa.gz | ./convert.pl | gzip > cloneEnds.fa.gz
# make sure nothing got broken:
faSize ncbi/*.mfa.gz
# 498162791 bases (16779168 N's 481383623 real 304962409 upper
# 176421214 lower) in 789466 sequences in 44 files
faSize cloneEnds.fa.gz
# 498162791 bases (16779168 N's 481383623 real 304962409 upper
# 176421214 lower) in 789466 sequences in 1 files
# identical numbers, curiously, these are exactly the same numbers
# as were seen during the build of Mm7. Do these things not
# change with time ?
# concatenate the text files, too
# decompress every text piece in glob order and recompress as one file
zcat ncbi/*.txt.gz | gzip > all.txt.gz
# generate cloneEndPairs.txt and cloneEndSingles.txt
zcat all.txt.gz | ~/kent/src/hg/utils/cloneEndParse.pl /dev/stdin
# Reading in end info
# Writing out pair info
# Writing out singleton info
# 354485 pairs and 78423 singles
# faSplit does not function correctly if given a .gz source file
# AND, we need the unzipped file for sequence loading below
gunzip cloneEnds.fa.gz
# split
mkdir split
cd split
## adjust split size based on previous kluster performance, see below
faSplit sequence ../cloneEnds.fa 500 cloneEnds
# Check to ensure no breakage:
faSize c*.fa
# 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
# lower) in 789466 sequences in 98 files
# %35.41 masked total, %36.65 masked real
# same numbers as before
# Copy to san for cluster runs
mkdir /san/sanvol1/scratch/mm9/cloneEnds
rsync -a --progress --stats ./ /san/sanvol1/scratch/mm9/cloneEnds/
rm *
cd ..
rmdir split
# may as well remove the previous assembly copy:
rm -fr /san/sanvol1/scratch/mm8/cloneEnds
# load sequences
ssh hgwdev
mkdir /gbdb/mm9/cloneend
cd /gbdb/mm9/cloneend
ln -s /cluster/data/mm9/bed/cloneend/cloneEnds.fa .
cd /tmp
hgLoadSeq mm9 /gbdb/mm9/cloneend/cloneEnds.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/mm9/cloneend/cloneEnds.fa
# 789466 sequences
# Updating seq table
# Advisory lock has been released
# All done
## clean up garbage
rm seq.tab
############################################################################
# BACEND SEQUENCE ALIGNMENTS (DONE - 2007-08-06 - Hiram)
ssh kkstore06
mkdir /cluster/data/mm9/noMask
cd /cluster/data/mm9/
# Need an unmasked sequence for this work
# one unmasked fa file per sequence named in chrom.sizes; the echo
# shows progress by printing each command before it runs
for C in $(cut -f1 chrom.sizes); do
  echo twoBitToFa -noMask -seq="${C}" mm9.2bit "noMask/${C}.fa"
  twoBitToFa -noMask -seq="${C}" mm9.2bit "noMask/${C}.fa"
done
# verify nothing broken
faSize noMask/c*.fa
# 2725765481 bases (105419509 N's 2620345972 real 2620345972 upper 0 lower) in
# 35 sequences in 35 files
# note, this was the same as long ago when the mm9.2bit was measured:
# 2725765481 bases (105419509 N's 2620345972 real 1466644650 upper
# 1153701322 lower) in 35 sequences in 1 files
# copy to san for kluster run
mkdir /san/sanvol1/scratch/mm9/noMask
rsync -a --progress --stats noMask/ /san/sanvol1/scratch/mm9/noMask/
# 11.ooc file is already there from the genbank build
# and now for the kluster run
ssh pk
mkdir /cluster/data/mm9/bed/bacends
cd /cluster/data/mm9/bed/bacends
mkdir out
# allow blat to run politely in /tmp while it writes output, then
# copy results to results file:
cat << '_EOF_' > runBlat
#!/bin/csh -fe
# blat one split cloneEnds file against one unmasked chromosome in a
# private /scratch/tmp directory, then copy the psl to the result file
#   $1 - chromosome fa root name under the san noMask directory
#   $2 - query fa root name under the san cloneEnds directory
#   $3 - result psl file
set root1 = $1
set root2 = $2
set result = $3
set tmpDir = /scratch/tmp/${root1}_${root2}
# always begin with a fresh working directory
rm -fr ${tmpDir}
mkdir ${tmpDir}
cp -p /san/sanvol1/scratch/mm9/11.ooc ${tmpDir}
cp -p /san/sanvol1/scratch/mm9/noMask/${root1}.fa ${tmpDir}
cp -p /san/sanvol1/scratch/mm9/cloneEnds/${root2}.fa ${tmpDir}
pushd ${tmpDir}
/cluster/bin/x86_64/blat ${root1}.fa ${root2}.fa \
-ooc=11.ooc ${root1}.${root2}.psl
popd
# back in the starting directory: place the result, then tidy up
mkdir -p out/${root2}
rm -f ${result}
cp -p ${tmpDir}/${root1}.${root2}.psl ${result}
rm -fr ${tmpDir}
'_EOF_'
# << happy emacs
chmod +x runBlat
cat << '_EOF_' > template
#LOOP
./runBlat $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
# << emacs happy
ls -1S /san/sanvol1/scratch/mm9/cloneEnds/cloneEnds*.fa > bacEnds.lst
ls -1S /san/sanvol1/scratch/mm9/noMask/chr*.fa > chrom.lst
gensub2 chrom.lst bacEnds.lst template jobList
para create jobList
# 17150 jobs written to batch
para try, check, push, etc ...
# Completed: 17150 of 17150 jobs
# CPU time in finished jobs: 698826s 11647.09m 194.12h 8.09d 0.022 y
# IO & Wait Time: 262556s 4375.94m 72.93h 3.04d 0.008 y
# Average job time: 56s 0.93m 0.02h 0.00d
# Longest finished job: 332s 5.53m 0.09h 0.00d
# Submission to last job: 250536s 4175.60m 69.59h 2.90d
ssh kkstore06
cd /cluster/data/mm9/bed/bacends
screen
mkdir temp
time nice -n +19 pslSort dirs raw.psl temp out/* > pslSort.out 2>&1 &
# real 22m4.019s
# -rw-rw-r-- 1 8423154460 Aug 6 13:40 raw.psl
time nice -n +19 pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 \
-noIntrons raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
# real 6m1.174s
# -rw-rw-r-- 1 1236810588 Aug 6 13:51 bacEnds.psl
# split this large psl file into pieces with 100,000 lines each
# to prepare for a sort
time nice -n +19 ~/kent/src/hg/pslSplitOnTarget/pslSplitLineCount.pl \
100000 bacEnds.psl split/bacends
# real 0m15.389s
# save original file, then sort
mv bacEnds.psl bacEnds.psl.save
time pslSort dirs bacEnds.psl temp split
# real 2m19.131s
# -rw-rw-r-- 1 1236810588 Aug 6 14:38 bacEnds.psl
## compare to previous results
wc -l bacEnds.psl /cluster/data/mm8/bed/bacends/bacEnds.psl
# 10294737 bacEnds.psl
# 10229750 /cluster/data/mm8/bed/bacends/bacEnds.psl
## work at top-level directory after this
mkdir /cluster/data/mm9/bacends
cp -p bacEnds.psl /cluster/data/mm9/bacends
############################################################################
# BACEND PAIRS TRACK (DONE - 2007-08-06 - Hiram)
ssh kolossus
cd /cluster/data/mm9/bacends
time nice -n +19 pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds
# real 0m49.120s
## produces files:
# -rw-rw-r-- 1 199185 Aug 6 14:46 bacEnds.slop
# -rw-rw-r-- 1 144486 Aug 6 14:46 bacEnds.short
# -rw-rw-r-- 1 24399410 Aug 6 14:46 bacEnds.pairs
# -rw-rw-r-- 1 25421100 Aug 6 14:46 bacEnds.orphan
# -rw-rw-r-- 1 201794 Aug 6 14:46 bacEnds.mismatch
# -rw-rw-r-- 1 15928 Aug 6 14:46 bacEnds.long
# create header required by "rdb" tools: two tab-separated lines of ten
# fields each, the column names and then the column definitions.
# printf is used instead of "echo -e" so the tabs are produced the same
# way whether or not the shell's echo understands -e.
printf 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes\n' > header
printf '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10\n' >> header
cat header bacEnds.pairs | \
/cluster/bin/scripts/row score ge 300 | \
/cluster/bin/scripts/sorttbl chr start | \
/cluster/bin/scripts/headchg -del > bacEndPairs.bed
# -rw-rw-r-- 1 24201067 Aug 6 14:49 bacEndPairs.bed
cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \
/cluster/bin/scripts/sorttbl chr start | \
/cluster/bin/scripts/headchg -del > bacEndPairsBad.bed
# -rw-rw-r-- 1 6888559 Aug 6 14:49 bacEndPairsBad.bed
/cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed >j1.out
# -rw-rw-r-- 1 989173324 Aug 6 14:52 j1.out
cat j1.out | /cluster/bin/scripts/sorttbl tname tstart >j2.out
# -rw-rw-r-- 1 989173324 Aug 6 15:07 j2.out
cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl
# -rw-rw-r-- 1 989173165 Aug 6 15:08 bacEnds.load.psl
rm j1.out j2.out
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $5}' bacEndPairs.bed | sort -u
# result should be the scores, no extraneous strings:
# 1000
# 300
# 375
# 500
# 750
# edit the file and fix it if it has a bad name.
wc -l bacEnds.load.psl /cluster/data/mm8/bacends/bacEnds.load.psl
# 8167555 bacEnds.load.psl
# 8132116 /cluster/data/mm8/bacends/bacEnds.load.psl
# load into database
ssh hgwdev
cd /cluster/data/mm9/bacends
hgLoadBed -notItemRgb mm9 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 239101 elements of size 11
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed -notItemRgb mm9 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 84679 elements of size 11
# NOTE: truncates file to 0 if -nobin is used
time hgLoadPsl mm9 -table=all_bacends bacEnds.load.psl
# load of all_bacends did not go as planned: 8167555 record(s), 0 row(s)
# skipped, 2 warning(s) loading psl.tab
# real 4m1.142s
## to find out what the warnings are about:
## first, on hgwdev, dump the loaded table
hgsql -N -e "select qName from all_bacends;" mm9 \
| sort -u > all_bacends.qName.txt
## then on kkstore06 compare the resulting load with the requested load file
diff psl.tab mm9.all_bacends.txt
## this diff shows that two markers (AG326808 and AG609381) had their
## qBaseInsert count changed from a negative number to zero, since that
## field is unsigned
## joinerCheck should be clean:
joinerCheck -keys -identifier=bacEndNames -database=mm9 all.joiner
# Checking keys on database mm9
# mm9.bacEndPairs.lfNames - hits 478202 of 478202 ok
featureBits mm9 all_bacends
# 349085662 bases of 2620346127 (13.322%) in intersection
featureBits mm8 all_bacends
# 327086559 bases of 2567283971 (12.741%) in intersection
featureBits mm7 all_bacends
# 334161740 bases of 2583394090 (12.935%) in intersection
featureBits mm6 all_bacends
# 336981828 bases of 2597150411 (12.975%) in intersection
featureBits mm5 all_bacends
# 268502414 bases of 2615483787 (10.266%) in intersection
featureBits mm4 all_bacends
# 243096171 bases of 2627444668 (9.252%) in intersection
featureBits mm9 bacEndPairs
# 209909804 bases of 2620346127 (8.011%) in intersection
featureBits mm8 bacEndPairs
# 2572527283 bases of 2567283971 (100.204%) in intersection
featureBits mm7 bacEndPairs
# 2578837424 bases of 2583394090 (99.824%) in intersection
featureBits mm6 bacEndPairs
# 2570768812 bases of 2597150411 (98.984%) in intersection
featureBits mm5 bacEndPairs
# 2567958504 bases of 2615483787 (98.183%) in intersection
featureBits mm4 bacEndPairs
# 2549945356 bases of 2627444668 (97.050%) in intersection
featureBits mm9 bacEndPairsBad
# 48850302 bases of 2620346127 (1.864%) in intersection
#######################################################################
# Special one-off bacEnds added (DONE - 2008-01-09 - Hiram)
ssh hgwdev
# BAC RP23-473N24 was reported missing
# its two ends are AZ095043 and AZ095046
# end AZ095046 maps just fine to the correct location on chr7
# the end AZ095043 does not map correctly when using the -ooc
# option to blat. Run the blat without ooc and it does the
# correct thing. From the genbank record:
cd /cluster/data/mm9/bed/bacends
cat << '_EOF_' > AZ095043.fa
>AZ095043
TTTATCATGAATGGGTGTTGTATCTTGTCGAAGCTTTTTCCGCATCTAACGAGATGATCATGTGGTTTTT
GTCTTTGAGTTTGTTTATATAATGGATTACATTGATGGATTTTCATATATTAAACCATCCCTGCATCCCT
GGAATAAAACCTACTTGGTCAGGATGGATGACTGCCAAGGCGGACCGGG
'_EOF_'
blat /san/sanvol1/scratch/mm9/noMask/chr7.fa AZ095043.fa AZ095043.raw.psl
pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 \
-noIntrons AZ095043.raw.psl AZ095043.psl /dev/null
# before adding this one item:
hgsql -e "select count(*) from all_bacends;" mm9
# 8167555
hgLoadPsl -table=all_bacends -append mm9 AZ095043.psl
# verify one row added
hgsql -e "select count(*) from all_bacends;" mm9
# 8167556
# Using the Mm6 records from all_bacends and bacEndPairs as a guide
# The bed record for this BAC is therefore:
cat << '_EOF_' > RP23-473N24.bed
chr7 150015932 150193247 RP23-473N24 1000 - all_bacends 2 150015932,150192880 172,367 AZ095043,AZ095046
'_EOF_'
# verify rows before adding this one new row
hgsql -e "select count(*) from bacEndPairs;" mm9
# 239101
# YOW ! The -oldTable option didn't work ! I'm guessing that with
# the -sqlTable argument it became confused
hgLoadBed -oldTable -notItemRgb mm9 bacEndPairs RP23-473N24.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# reload everything:
cat ../../bacends/bacEndPairs.bed RP23-473N24.bed \
| hgLoadBed -notItemRgb mm9 bacEndPairs stdin \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
#######################################################################
## create random contigs for genscan and other alignment tasks
## DONE - 2007-08-07 - Hiram
ssh kkstore06
# every path below (N/lift/random.lft, mm9.2bit, randomContigs/) is
# relative to the assembly directory, so go there first
cd /cluster/data/mm9
mkdir randomContigs
# one contig fasta per chrN_random: each N/lift/random.lft is handed to
# lft2BitToFa.pl together with mm9.2bit
for L in ?/lift/random.lft ??/lift/random.lft
do
# D is the chrom directory name: the part of the path before /lift
D=${L/\/lift*}
echo $L $D
~/kent/src/hg/utils/lft2BitToFa.pl mm9.2bit ${L} \
> randomContigs/chr${D}_random.ctg.fa
done
#
# Verify these *.ctg.fa files have the same bases as the ordinary
# chr*_random.fa files:
## don't have these fasta files yet, extract them from the 2bit
grep random chrom.sizes | cut -f1 | sed -e "s/^chr//; s/_random//" \
| while read C
do
echo "twoBitToFa -seq=chr${C}_random mm9.2bit ${C}/chr${C}_random.fa"
twoBitToFa -seq=chr${C}_random mm9.2bit ${C}/chr${C}_random.fa
done
## now we can measure them
faSize ?/chr?_random.fa ??/chr??_random.fa
# 70853964 bases (9033771 N's 61820193 real 26427973 upper
# 35392220 lower) in 13 sequences in 13 files
## and our contig versions
faSize randomContigs/*.ctg.fa
# 62053964 bases (233771 N's 61820193 real 26427973 upper
# 35392220 lower) in 189 sequences in 13 files
## note, same number of real, upper and lower, only different N's
## it would be nice to have the actual chroms too
grep -v random chrom.sizes | cut -f1 | sed -e "s/^chr//" \
| while read C
do
echo "twoBitToFa -seq=chr${C} mm9.2bit ${C}/chr${C}.fa"
twoBitToFa -seq=chr${C} mm9.2bit ${C}/chr${C}.fa
done
# measure that result
faSize ?/chr?.fa ??/chr??.fa
# 2654911517 bases (96385738 N's 2558525779 real 1438609919
# upper 1119915860 lower) in 22 sequences in 22 files
## is this the amount of sequence specified in chrom.sizes ?
grep -v random chrom.sizes | ave -col=2 stdin | grep total
# total 2654911517.000000
## same number, nothing lost
#########################################################################
# GENSCAN PREDICTIONS (DONE - 2007-08-07 - 2007-08-10 - Hiram)
ssh kkstore06
# the chrom directories and randomContigs/ used below live in the
# assembly directory
cd /cluster/data/mm9
# Create a 2bit file with the full chrom sequences and the
# random contigs, all hard masked
## later it was found that chr16_random.ctg.fa should not be in
## this genscan run. So, it was temporarily taken out of this directory
## and this sequence was rerun to avoid it.
cat ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \
| maskOutFa stdin hard stdout \
| faToTwoBit stdin mm9Chroms_RandomContigs.hard.2bit
# with chr16_random removed:
# 2716961487 bases (1251923595 N's 1465037892 real 1465037892 upper 0
# lower) in 210 sequences in 1 files
# make sure it still has all the unmasked sequence in it: (incl 16)
twoBitToFa mm9Chroms_RandomContigs.hard.2bit stdout \
| faSize stdin
# 2716965481 bases (1251927589 N's 1465037892 real 1465037892 upper
# 0 lower) in 211 sequences in 1 files
twoBitToFa mm9.2bit stdout | faSize stdin
# 2725765481 bases (105419509 N's 2620345972 real 1465037892 upper
# 1155308080 lower) in 35 sequences in 1 files
# note the upper bases are the same, the lowers have become N's
# lower 1155308080 + upper 1465037892 = 2620345972 real
# N's 1251927589 - N's 105419509 = 1146508080 ==
# N's in gaps between contigs
# And, make sure there aren't any sequences in this lot that have
# become all N's with no sequence left in them. This drives genscan nuts
twoBitToFa mm9Chroms_RandomContigs.hard.2bit stdout \
| faCount stdin > chroms_randoms.faCount
# the lowest four are (tail -4, to match the four rows recorded below).
# $2-$7 is taken here to be sequence length minus N count, i.e. the
# bases left after hard masking - confirm against the column header
# line that faCount writes at the top of chroms_randoms.faCount
egrep -v "^#|^total" chroms_randoms.faCount \
| awk '{print $1,$2-$7}' | sort -k2,2nr | tail -4
# NT_166474.1 75
# NT_166461.1 66
# NT_166481.1 39
# NT_166325.1 0
## This last one is the entire chr16_random and it is only 3,994 bases
## long and is marked entirely by RepeatMasker as a line. It needs
## to be skipped during the run of genscan. Go back to the 2bit creation
## and do not include chr16_random
# creating 4,000,000 sized chunks, the chroms stay together as
# single pieces. The contigs get grouped together into 4,000,000
# sized fasta files. You don't want to break these things up
# because genscan will be doing its own internal 2.4 million
# window on these pieces, and the gene names are going to be
# constructed from the sequence name in these fasta files. The
# gene names are much better when they are this simple chrN.M
# numbering scheme, or in the case of a contig: contig_name.M
# where the M is a sequence number that genscan will assign to
# each gene it discovers.
mkdir hardChunks
twoBitToFa mm9Chroms_RandomContigs.hard.2bit stdout \
| faSplit about stdin 4000000 hardChunks/c_
ssh kkr1u00
mkdir /iscratch/i/mus/mm9/hardChunks
cd /iscratch/i/mus/mm9/hardChunks
rsync -a --progress /cluster/data/mm9/hardChunks/ .
for R in 2 3 4 5 6 7 8
do
rsync -a --progress ./ kkr${R}u00:/iscratch/i/mus/mm9/hardChunks/
done
ssh hgwdev
mkdir /cluster/data/mm9/bed/genscan
cd /cluster/data/mm9/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh kki
cd /cluster/data/mm9/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
# Since we split on gaps, we have no chunks like that. You can
# verify with faCount on the chunks.
ls -1Sr /iscratch/i/mus/mm9/hardChunks/c_*.fa > genome.list
## for next time, this isn't a parasol-safe method of operation:
## if genscan is writing answers to gtf/ pep/ and subopt/ during
## its operation and it fails, parasol wouldn't be able to verify that
## it was complete merely by file existence check. This should work
## in scratch/tmp entirely, then copy results back after it is done.
# Create template file, for gensub2. For example (3-line file):
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.list single template jobList
para create jobList
para try, check, push, check, ...
# Completed: 35 of 36 jobs
# CPU time in finished jobs: 279581s 4659.68m 77.66h 3.24d 0.009 y
# IO & Wait Time: 3390s 56.50m 0.94h 0.04d 0.000 y
# Average job time: 8085s 134.75m 2.25h 0.09d
# Longest finished job: 32422s 540.37m 9.01h 0.38d
# Submission to last job: 122301s 2038.35m 33.97h 1.42d
# There was a failed job, going to kolossus and running it again,
# it takes a very long time, and fails with this cryptic error:
# No overlap between a and b in mergeTwo
ssh kolossus
cd /cluster/data/mm9/bed/genscan
time /cluster/bin/x86_64/gsBig /iscratch/i/mus/mm9/hardChunks/c_06.fa \
gtf/c_06.gtf -trans=pep/c_06.pep -subopt=subopt/c_06.bed \
-exe=hg3rdParty/genscanlinux/genscan \
-par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp \
-window=2400000
# real 922m2.382s
# run it with a reduced window size to see if it will complete
time nice -n +19 /cluster/bin/x86_64/gsBig \
/iscratch/i/mus/mm9/hardChunks/c_06.fa \
gtf/c_06.gtf -trans=pep/c_06.pep -subopt=subopt/c_06.bed \
-exe=hg3rdParty/genscanlinux/genscan \
-par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp \
-window=2000000
# real 648m24.682s
## that one failed too, with an error:
# /scratch/tmp/temp_gsBig_10943_chr7_38.genscan is not a GENSCAN output file
## and the contents of that file said:
# Insufficient memory error: results may be unreliable.
# Try running program an a portion of sequence.
# Let's try splitting up this chr7 on the gaps, which there are plenty
# of in this hard masked sequence. Ended up breaking the chr7 sequence
# with the non bridged lift file. See the lft2BitToFa.pl file in
# the chr7_split directory.
# on kkstore06
ssh kkstore06
mkdir /cluster/data/mm9/bed/genscan/chr7_split
cd /cluster/data/mm9/bed/genscan/chr7_split
./lft2BitToFa.pl ../../../mm9.2bit *.lft > chr7.contigs.hard.fa
mkdir /cluster/data/mm9/bed/genscan/chr7_run
cd /cluster/data/mm9/bed/genscan/chr7_run
mkdir split
faSplit sequence ../chr7_split/chr7.contigs.hard.fa 100 split/chr7_
## Now, on the small kluster
ssh kki
cd /cluster/data/mm9/bed/genscan/chr7_run
mkdir gtf pep subopt
# Create template file, for gensub2. For example (3-line file):
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=../hg3rdParty/genscanlinux/genscan -par=../hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << happy emacs
ls -1S split/chr7_*.fa > chr7.list
gensub2 chr7.list single template jobList
para create jobList
para try ... check ... push ... etc...
# Completed: 15 of 15 jobs
# CPU time in finished jobs: 4226s 70.43m 1.17h 0.05d 0.000 y
# IO & Wait Time: 215s 3.59m 0.06h 0.00d 0.000 y
# Average job time: 296s 4.93m 0.08h 0.00d
# Longest finished job: 861s 14.35m 0.24h 0.01d
# Submission to last job: 861s 14.35m 0.24h 0.01d
# lift these chr7 results into a single file,
# fixup the gene names with the sed to remove the lift name effect
ssh kkstore06
cd /cluster/data/mm9/bed/genscan/chr7_run
cat gtf/chr7_*.gtf | liftUp -type=.gtf stdout \
../chr7_split/nonBridgedChr7.lft error stdin \
| sed -e "s/chr7\.\([0-9][0-9]*\)\./chr7.\1/g" > chr7.gtf
cat subopt/chr7_*.bed | liftUp -type=.bed stdout \
../chr7_split/nonBridgedChr7.lft error stdin \
| sed -e "s/chr7\.\([0-9][0-9]*\)\./chr7.\1/g" > chr7.subopt.bed
cat pep/chr7_*.pep | sed -e "s/chr7\.\([0-9][0-9]*\)\./chr7.\1/g" > chr7.pep
## these results become the c_06 results in the main run
cp -p chr7.pep ../pep/c_06.pep
cp -p chr7.subopt.bed ../subopt/c_06.bed
cp -p chr7.gtf ../gtf/c_06.gtf
## after the chr7 business above, back to the mainline processing
# cat and lift the results into single files
ssh kkstore06
cd /cluster/data/mm9/bed/genscan
cat gtf/c_*.gtf | liftUp -type=.gtf genscan.gtf \
../../jkStuff/mm9.contigs.lift carry stdin
cat subopt/c_*.bed | liftUp -type=.bed genscanSubopt.bed \
../../jkStuff/mm9.contigs.lift carry stdin
cat pep/c_*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/mm9/bed/genscan
ldHgGene mm9 -gtf genscan genscan.gtf
# Read 45189 transcripts in 324075 lines in 1 files
# 45189 groups 34 seqs 1 sources 1 feature types
# 45189 gene predictions
hgPepPred mm9 generic genscanPep genscan.pep
hgLoadBed mm9 genscanSubopt genscanSubopt.bed
# Loaded 525904 elements of size 6
# check the numbers
time nice -n +19 featureBits mm9 genscan
# 55293837 bases of 2620346127 (2.110%) in intersection
time nice -n +19 featureBits mm8 genscan
# 54455852 bases of 2567283971 (2.121%) in intersection
time nice -n +19 featureBits mm8 knownGene:cds
# 28459053 bases of 2567283971 (1.109%) in intersection
featureBits mm7 genscan
# 54864694 bases of 2583394090 (2.124%) in intersection
time nice -n +19 featureBits mm7 knownGene:cds
# 27531524 bases of 2583394090 (1.066%) in intersection
featureBits mm9 genscanSubopt
# 57044145 bases of 2620346127 (2.177%) in intersection
featureBits mm8 genscanSubopt
# 57048581 bases of 2567283971 (2.222%) in intersection
featureBits mm7 genscanSubopt
# 57512333 bases of 2583394090 (2.226%) in intersection
featureBits mm6 genscanSubopt
# 57856316 bases of 2597150411 (2.228%) in intersection
featureBits mm5 genscanSubopt
# 58474899 bases of 2615483787 (2.236%) in intersection
featureBits mm4 genscanSubopt
# 59601009 bases of 2627444668 (2.268%) in intersection
featureBits mm3 genscanSubopt
# 56085184 bases of 2505900260 (2.238%) in intersection
#############################################################################
# BLASTZ SELF (DONE - 2007-08-07 - 2007-08-31 - Hiram)
# using chain min score of 10,000 to cut down on volume of data
# trying a two pass sequence, chroms with chroms, then randoms to chroms
# swap the randoms, then combine the three results into a final set
ssh kkstore06
cd /cluster/data/mm9
time nice -n +19 faToTwoBit ?/chr?.fa ??/chr??.fa mm9.chroms.2bit
time nice -n +19 faToTwoBit randomContigs/chr*.ctg.fa mm9.randomContigs.2bit
ssh kkr1u00
cd /iscratch/i/mus/mm9
# stage the two 2bit files on this Iserver and record their sizes
cp -p /cluster/data/mm9/mm9.chroms.2bit .
cp -p /cluster/data/mm9/mm9.randomContigs.2bit .
twoBitInfo mm9.chroms.2bit mm9.chroms.sizes
# (file name was misspelled mm9.randomContgs.sizes; it now matches the
# mm9.randomContigs.sizes name used elsewhere in this section)
twoBitInfo mm9.randomContigs.2bit mm9.randomContigs.sizes
# copy this directory to the other Iservers
for R in 2 3 4 5 6 7 8
do
rsync -a --progress ./ kkr${R}u00:/iscratch/i/mus/mm9/
done
ssh kkstore06
mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07
cd /cluster/data/mm9/bed/blastzSelf.2007-08-07
# DEF file for the chroms-vs-chroms self alignment: target (SEQ1_*) and
# query (SEQ2_*) are both mm9.chroms.2bit.
# The QUERY section previously repeated SEQ1_LEN, leaving SEQ2_LEN
# unset; it is SEQ2_LEN here.
# NOTE(review): the load failure recorded further down says the
# SEQ?_LEN settings must be readable on hgwdev - confirm these paths
# before re-using this DEF.
cat << '_EOF_' > DEF
# mouse vs mouse
BLASTZ_H=2000
BLASTZ_M=200
# TARGET: Mouse Mm9
SEQ1_DIR=/iscratch/i/mus/mm9/mm9.chroms.2bit
SEQ1_LEN=/cluster/data/mm9/mm9.chroms.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Mouse Mm9
SEQ2_DIR=/iscratch/i/mus/mm9/mm9.chroms.2bit
SEQ2_LEN=/cluster/data/mm9/mm9.chroms.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzSelf.2007-08-07
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
## run this in a screen on kkstore06
cd /cluster/data/mm9/bed/blastzSelf.2007-08-07
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
-stop=load `pwd`/DEF > blastz.out 2>&1 &
# This was a tricky one to complete. A situation was fixed in the
# blastz-run-ucsc script which may have helped, but then there were
# 32 jobs that would only complete on the kki kluster. The kk nodes
# complained about running out of memory. After a completed run was
# finished, and verified:
ssh kkstore06
cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/psl
find . -type f | wc -l
# 77284
wc -l ../run.blastz/jobList
# wc -l ../run.blastz.jobList
# finished the rest by continuing at the 'cat' step:
time doBlastzChainNet.pl -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
-continue=cat -stop=load `pwd`/DEF > cat.out 2>&1 &
# real 285m33.094s
# failed during the load because of the SEQ?_LEN specification pointing
# to /iscratch/i which is not available on hgwdev. So, only use
# the primary /cluster/data/mm9/chrom.sizes for the DEF file in the future
# ran the load step manually to complete with the loadUp.csh fixed.
ssh kolossus
cd /cluster/data/mm9/bed/blastzSelf.2007-08-07
time nice -n +19 featureBits mm9 chainSelfLink \
> fb.mm9.chainSelfLink.noRandoms.txt 2>&1
# real 24m54.883s
cat fb.mm9.chainSelfLink.noRandoms.txt
# 323062218 bases of 2620346127 (12.329%) in intersection
cd /cluster/data/mm9/bed
ln -s blastzSelf.2007-08-07 blastz.mm9
## prepare 2bit file of only the randoms
ssh kkstore06
cd /cluster/data/mm9
faToTwoBit ?/chr?_random.fa ??/chr??_random.fa mm9.randoms.2bit
# and the sizes files
twoBitInfo mm9.randomContigs.2bit mm9.randomContigs.sizes
twoBitInfo mm9.randoms.2bit mm9.randoms.sizes
# a cluster run for just these bits of sequence
mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsOnly
cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsOnly
cat << '_EOF_' > DEF
# mouse vs mouse randoms
PATH=/cluster/bin/penn/x86_64:/cluster/bin/penn:/cluster/bin/scripts:/cluster/bin/x86_64:/bin:/usr/bin
BLASTZ_H=2000
BLASTZ_M=200
# TARGET: Mouse Mm9
SEQ1_DIR=/iscratch/i/mus/mm9/mm9.chroms.2bit
SEQ1_LEN=/cluster/data/mm9/mm9.chroms.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Mouse Mm9 randoms only
SEQ2_DIR=/cluster/data/mm9/mm9.randoms.2bit
SEQ2_LEN=/cluster/data/mm9/mm9.randoms.sizes
SEQ2_CTGDIR=/cluster/data/mm9/mm9.randomContigs.2bit
SEQ2_CTGLEN=/cluster/data/mm9/mm9.randomContigs.sizes
SEQ2_LIFT=/cluster/data/mm9/jkStuff/mm9.contigs.lift
SEQ2_CHUNK=10000000
SEQ2_LIMIT=20
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsOnly
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 -ignoreSelf \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
-stop=net `pwd`/DEF > blastz.out 2>&1 &
# now swap the primary chroms back to the randoms
mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsSwap
cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsSwap
chainSwap ../randomsOnly/axtChain/mm9.mm9.all.chain.gz stdout \
| nice chainSort stdin stdout | nice gzip -c \
> mm9.mm9.all.chain.gz
# And then combine all three sets together
mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07/allTogetherNow
cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/allTogetherNow
chainSplit chain ../axtChain/mm9.mm9.all.chain.gz \
../randomsOnly/axtChain/mm9.mm9.all.chain.gz \
../randomsSwap/mm9.mm9.all.chain.gz
# get them sorted by score correctly: every chain/*.chain file is
# sorted into a file of the same name under chainSort/
mkdir chainSort
for chainFile in $(cd chain; ls *.chain)
do
echo $chainFile
chainSort chain/$chainFile chainSort/$chainFile
done
# re-number the chains consistently
chainMergeSort chainSort/*.chain | nice gzip -c > mm9.mm9.all.chain.gz
# neither per-file directory is kept: chain/ is rebuilt from the merged
# file by the chainSplit below.  (This used to be: rm chain, mv
# chainSort to chain, rm chain again - same end result.)
rm -fr chain chainSort
# and for loading, split this consistently numbered set
time nice -n +19 chainSplit chain mm9.mm9.all.chain.gz
# real 5m0.666s
## using a manually fixed up netChains.csh script:
time nice -n +19 ./netChains.csh > netChains.out 2>&1
# real 147m53.147s
ssh hgwdev
## using a manually fixed up loadUp.csh script:
# (from ../axtChain/loadUp.csh)
time nice -n +19 ./loadUp.csh > loadUp.out 2>&1 &
# real 99m17.895s
time nice -n +19 featureBits mm9 chainSelfLink > fb.mm9.chainSelfLink 2>&1
# real 30m3.402s
# 378849408 bases of 2620346127 (14.458%) in intersection
cat /cluster/data/mm8/bed/blastzSelf.2006-03-20/fb.mm8.chainSelfLink
# 362483673 bases of 2567283971 (14.119%) in intersection
# finish off the nets
time nice -n +19 netClass -verbose=0 -noAr noClass.net mm9 mm9 mm9.mm9.net
# real 1m9.538s
# load nets (not needed for the RR, but useful on genome-test)
time nice -n +19 netFilter -minGap=10 mm9.mm9.net \
| hgLoadNet -verbose=0 mm9 netSelf stdin
# real 0m40.709s
## We don't deliver this track to the RR, so downloads are not necessary
#############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2007-08-07 - Hiram)
ssh kkr1u00
# collect the RepeatMasker .out files for every chrom in one directory
mkdir /iscratch/i/mus/mm9/rmsk
cd /cluster/data/mm9
cp -p */chr*.fa.out /iscratch/i/mus/mm9/rmsk
# copy this directory to the other Iservers; source and destination are
# /iscratch/i/mus/mm9, where rmsk was just created (the rsync used to
# name /iscratch/i/mm9, which no step in this section creates)
cd /iscratch/i/mus/mm9
for R in 2 3 4 5 6 7 8
do
rsync -a --progress ./ kkr${R}u00:/iscratch/i/mus/mm9/
done
cd rmsk
ssh kki
mkdir /cluster/data/mm9/linSpecRep
cd /cluster/data/mm9/linSpecRep
ls -1S /iscratch/i/mus/mm9/rmsk > fa.list
# mkLSR: per-job wrapper script (csh).
#   $1 - name of a file in /iscratch/i/mus/mm9/rmsk (fa.list above is
#        an ls of that directory)
# It runs DateRepeats on $1 inside the rmsk directory with mouse as the
# query and human, rat, dog and cow as the comparison species, then
# copies the resulting
#   $1_homo-sapiens_rattus_canis-familiaris_bos-taurus
# file into the directory the job was started from and removes it from
# rmsk.  Any stale copy in rmsk is removed before DateRepeats runs.
cat << '_EOF_' > mkLSR
#!/bin/csh -fe
pushd /iscratch/i/mus/mm9/rmsk
rm -f $1_homo-sapiens_rattus_canis-familiaris_bos-taurus
/cluster/bluearc/RepeatMasker070517/DateRepeats \
$1 -query mouse -comp human -comp rat -comp dog -comp cow
popd
/bin/cp -p \
/iscratch/i/mus/mm9/rmsk/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus .
rm -f /iscratch/i/mus/mm9/rmsk/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus
'_EOF_'
# << happy emacs
chmod +x mkLSR
# gensub2 template: one mkLSR job per line of fa.list; the
# {check out ...} clause names the file each job is expected to leave
# in this directory
cat << '_EOF_' > template
#LOOP
./mkLSR $(path1) {check out line+ $(path1)_homo-sapiens_rattus_canis-familiaris_bos-taurus}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 fa.list single template jobList
para try ... check ... push ... etc...
para time
# Completed: 35 of 35 jobs
# CPU time in finished jobs: 1498s 24.96m 0.42h 0.02d 0.000 y
# IO & Wait Time: 193s 3.22m 0.05h 0.00d 0.000 y
# Average job time: 48s 0.81m 0.01h 0.00d
# Longest finished job: 102s 1.70m 0.03h 0.00d
# Submission to last job: 3399s 56.65m 0.94h 0.04d
ssh kkstore06
cd /cluster/data/mm9/linSpecRep
mkdir notInHuman notInRat notInDog notInCow notInRabbit
# Split every DateRepeats result file into one lineage-specific .out.spec
# file per comparison species.  extractRepeats takes a 1-based index as its
# first argument; index 1..4 is paired with notInHuman, notInRat, notInDog,
# notInCow (presumably the -comp order given to DateRepeats -- confirm).
for F in chr*.out_homo-sapiens*
do
    # base name = everything before the first ".fa.out", e.g. chr1
    B=${F%%.fa.out*}
    echo "${B}"
    N=0
    for D in notInHuman notInRat notInDog notInCow
    do
        N=$((N + 1))
        /cluster/bin/scripts/extractRepeats "${N}" "${F}" > \
            "${D}/${B}.out.spec"
    done
done
# the notInHuman, notInDog, and notInCow ended up being
# identical. Only the notInRat was different than them
# To check identical
# Print "<checksum> <blocks> <path>" for every *.out.spec file below the
# current directory, ordered by file name (third "/"-separated field), so
# that the same chromosome from each notIn* directory ends up on adjacent
# lines and identical checksums are easy to spot.
find . -name "*.out.spec" \
| while IFS= read -r FN
do
    # the unquoted $(...) is deliberate: it collapses the column padding
    # that sum puts between checksum and block count
    echo $(sum -r < "${FN}") "${FN}"
done \
| sort -k1,1n | sort -t"/" -k3,3
# Copy to iscratch for use in kluster runs
ssh kkr1u00
mkdir -p /iscratch/i/mus/mm9/linSpecRep/notInRat
mkdir -p /iscratch/i/mus/mm9/linSpecRep/notInOthers
cd /iscratch/i/mus/mm9/linSpecRep/notInRat
cp -p /cluster/data/mm9/linSpecRep/notInRat/* .
cd /iscratch/i/mus/mm9/linSpecRep/notInOthers
cp -p /cluster/data/mm9/linSpecRep/notInHuman/* .
# copy this directory to the other Iservers
cd /iscratch/i/mus/mm9
# Mirror the current directory (./) into /iscratch/i/mus/mm9/ on each of
# the Iservers kkr2u00 through kkr8u00.
for N in 2 3 4 5 6 7 8; do
    rsync -a --progress ./ "kkr${N}u00:/iscratch/i/mus/mm9/"
done
# and we can do the Iservers simply:
ssh kkr1u00
cd /iscratch/i/mm9
# no longer need these two directories
rm -fr fa rmsk
rsync -a --progress /cluster/bluearc/scratch/hg/mm9/ .
# Replicate /iscratch/i/mm9/ from this host to the same path on each of
# the Iservers kkr2u00 through kkr8u00.
for N in 2 3 4 5 6 7 8; do
    rsync -a --progress /iscratch/i/mm9/ "kkr${N}u00:/iscratch/i/mm9/"
done
# We also need the nibs for blastz runs with lineage specific repeats
ssh kkstore06
mkdir /cluster/data/mm9/nib
cd /cluster/data/mm9
# Convert every per-chromosome fasta file found in the one- and
# two-character chromosome directories into a soft-masked nib file in nib/.
for FA in ?/chr*.fa ??/chr*.fa
do
    # nib name = file name without directory and without the .fa extension
    # (anchored strips, so a ".fa" elsewhere in the name is left alone)
    F=${FA##*/}
    F=${F%.fa}
    echo faToNib -softMask "${FA}" "nib/${F}.nib"
    faToNib -softMask "${FA}" "nib/${F}.nib"
done
# copied to /cluster/bluearc/scratch/data/mm9/nib/
# and everything else we will need for kluster runs into
# /cluster/bluearc/scratch/data/mm9/
# Ask cluster-admin to sync /scratch/ filesystem to kluster nodes
#########################################################################
# BLASTZ RAT Rn4 (DONE - 2007-08-09 - 2007-08-15 - Hiram)
# re-run a second time with tighter parameters, see below for second run
ssh kkstore06
mkdir /cluster/data/mm9/bed/blastzRn4.2007-08-09
cd /cluster/data/mm9/bed/blastzRn4.2007-08-09
# Started this before the rsync to /scratch/data/mm9/ had completed,
# hence the /cluster/bluearc/scratch/data/mm9/ location is used
# here.
cat << '_EOF_' > DEF
# mouse vs rat
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm9
SEQ1_DIR=/cluster/bluearc/scratch/data/mm9/nib
SEQ1_SMSK=/cluster/bluearc/scratch/data/mm9/notInRat
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/iscratch/i/rn4/nib
SEQ2_SMSK=/iscratch/i/rn4/linSpecRep.notInMouse
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzRn4.2007-08-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
## had to fix the blast-run-ucsc script to get these to complete.
# the chr16_random sequence was causing problems because it has no usable
# sequence in it for blastz to work with. And finally, two jobs needed to
# be run manually on kolossus, don't know what happened with them,
# although their output was immense:
# -rw-rw-r-- 1 15054644 Aug 14 10:22 chr2.nib:chr2:80000000-90010000_chr7.nib:chr7:0-10000000.psl
# -rw-rw-r-- 1 18992595 Aug 14 11:02 chr2.nib:chr2:80000000-90010000_chr3.nib:chr3:70000000-80000000.psl
# I suspect there is something going on with large results and running on
# the kk nodes. I'm getting the same trouble with the self blastz.
# then, continuing with the cat
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
-continue=cat `pwd`/DEF > cat.out 2>&1 &
# real 239m51.356s
cat fb.mm9.chainRn4Link.txt
# 1791195056 bases of 2620346127 (68.357%) in intersection
cat /cluster/data/mm8/bed/blastz.rn4/fb.mm8.chainRn4Link
# 1770319811 bases of 2567283971 (68.957%) in intersection
cd /cluster/data/mm9/bed
ln -s blastzRn4.2007-08-09 blastz.rn4
mkdir /cluster/data/rn4/bed/blastz.mm9.swap
cd /cluster/data/rn4/bed/blastz.mm9.swap
time ~/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
-swap /cluster/data/mm9/bed/blastzRn4.2007-08-09/DEF > swap.out 2>&1 &
# real 209m11.032s
cd /cluster/data/rn4/bed
ln -s blastz.mm9.swap blastz.mm9
cat /cluster/data/rn4/bed/blastz.mm9/fb.rn4.chainMm9Link.txt
# 1788261968 bases of 2571531505 (69.541%) in intersection
cat /cluster/data/rn4/bed/blastz.mm8/fb.rn4.chainMm8Link.txt
# 1791093685 bases of 2571531505 (69.651%) in intersection
#########################################################################
## multiple alignment preparation stats
# The following table will keep track of the pairwise alignments
# completed. (An entry of the form % NN.Nxx means not done yet.)
# featureBits chainLink measures
# chainMm9Link chain linearGap
# distance on Mm9 on other minScore
# 1 0.1587 - rat rn4 (% 68.357) (% 69.541) 3000 medium
# 2 0.4677 - human hg18 (% 38.499) (% 35.201) 3000 medium
# 3 0.4686 - chimp panTro2 (% 37.5xx) (% 33.6xx) 3000 medium
# 4 0.4960 - macaque rheMac2 (% 34.7xx) (% 33.1xx) 3000 medium
# 5 0.5131 - rabbit oryCun1 (% 19.3xx) (no swap ) 3000 medium
# 6 0.6142 - armadillo dasNov1 (% 16.8xx) (no swap ) 3000 medium
# 7 0.6230 - dog canFam2 (% 32.2xx) (% 34.2xx) 3000 medium
# 8 0.6256 - elephant loxAfr1 (% 18.3xx) (no swap ) 3000 medium
# 9 0.6344 - cow bosTau2 (% 26.8xx) (% 24.2xx) 3000 medium
# 10 0.7805 - tenrec echTel1 (% 11.4xx) (no swap ) 5000 loose
# 11 1.0698 - opossum monDom4 (% 8.2xx) (% 6.0xx) 5000 loose
# 12 1.3425 - chicken galGal2 (% 2.5xx) (% 5.4xx) 5000 loose
# 13 1.7936 - frog xenTro2 (% 2.6xx) (% 5.3xx) 5000 loose
# 14 2.0157 - tetraodon tetNig1 (% 1.9xx) (% 13.7xx) 5000 loose
# 15 2.0562 - fugu fr1 (% 1.9xx) (% 13.5xx) 5000 loose
# 16 2.1059 - zebrafish danRer5 (% 2.1xx) (% 3.5xx) 5000 loose
##########################################################################
## BLASTZ SWAP from Hg18 to Mm9 (DONE - 2007-08-15 - Hiram)
# also in hg18.txt
cat /cluster/data/hg18/bed/blastzMm9.2007-08-09/fb.hg18.chainMm9Link.txt
# 1014323175 bases of 2881515245 (35.201%) in intersection
# Then to swap over to Mm9
ssh kkstore06
mkdir /cluster/data/mm9/bed/blastz.hg18.swap
cd /cluster/data/mm9/bed/blastz.hg18.swap
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 -swap -bigClusterHub=pk -chainMinScore=3000 \
-chainLinearGap=medium \
/cluster/data/hg18/bed/blastz.mm9/DEF > swap.out 2>&1 &
# real 67m21.146s
cat /cluster/data/mm9/bed/blastz.hg18.swap/fb.mm9.chainHg18Link.txt
# 1008812599 bases of 2620346127 (38.499%) in intersection
cat /cluster/data/mm8/bed/blastz.hg18/fb.mm8.chainHg18Link
# 984380268 bases of 2567283971 (38.343%) in intersection
cd /cluster/data/mm9/bed
ln -s blastz.hg18.swap blastz.hg18
## make swapped syntenic net
cd /cluster/data/mm9/bed/blastz.hg18.swap
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 -bigClusterHub=pk -chainMinScore=3000 \
-swap -syntenicNet -chainLinearGap=medium -continue=syntenicNet \
/cluster/data/hg18/bed/blastz.mm9/DEF > syntenic.out 2>&1 &
## real 20m49.712s
#########################################################################
# BLASTZ RAT Rn4 (DONE - 2007-08-30 - Hiram)
# re-run this second time with tighter parameters
ssh kkstore06
mkdir /cluster/data/mm9/bed/blastzRn4.2007-08-30
cd /cluster/data/mm9/bed/blastzRn4.2007-08-30
# (The first run, blastzRn4.2007-08-09, was started before the rsync to
# /scratch/data/mm9/ had completed and used /cluster/bluearc/scratch/data/mm9/;
# the DEF below uses the /scratch/data/mm9/ location.)
cat << '_EOF_' > DEF
# mouse vs rat
# Specially tuned blastz parameters from Webb Miller
BLASTZ_ABRIDGE_REPEATS=0
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_Y=15000
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/blastz/human_chimp.v2.q
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/scratch/hg/rn4/rn4.2bit
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzRn4.2007-08-30
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
-stop=net \
`pwd`/DEF > blastz.out 2>&1 &
# this runs much faster than the usual blastz run
# failed when it got to the kki run since /scratch/hg/rn4/ was not
# complete on the Iservers. Fixup that, then, continue:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
-continue=chainMerge -stop=net \
`pwd`/DEF > chainMerge.out 2>&1 &
# And then, kolossus had no /scratch/data/ directory, go there and
# make this a symlink to /iscratch/data/
# and run the axtChain/netChains.csh script manually on kolossus
#########################################################################
# BLASTZ/CHAIN/NET oryLat1 (DONE - 2007-08-31 - Hiram)
ssh kkstore04
mkdir /cluster/data/mm9/bed/blastzOryLat1.2007-08-30
cd /cluster/data/mm9/bed/blastzOryLat1.2007-08-30
cat << '_EOF_' > DEF
# mouse vs medaka
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
# chrUn in Scaffolds for this alignment run
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=40000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzOryLat1.2007-08-30
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# real 512m56.909s
# had a single failed kk job, finished manually, then:
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
# real 11m5.508s
## typical failure:
# HgStepManager: executing step 'net' Fri Aug 31 10:02:51 2007.
# netChains: looks like previous stage was not successful (can't find [mm9.oryLat1.]all.chain[.gz]).
# continuing
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-continue=net -bigClusterHub=kk -verbose=2 > net.log 2>&1 &
# real 21m33.501s
cat fb.mm9.chainOryLat1Link.txt
# 50650171 bases of 2620346127 (1.933%) in intersection
# and the swap
mkdir /cluster/data/oryLat1/bed/blastz.mm9.swap
cd /cluster/data/oryLat1/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/mm9/bed/blastzOryLat1.2007-08-30/DEF \
-chainMinScore=5000 -qRepeats=windowmaskerSdust -chainLinearGap=loose \
-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
cat fb.oryLat1.chainMm9Link.txt
# 45488232 bases of 700386597 (6.495%) in intersection
#########################################################################
# LOAD ACEMBLY (DONE 9/17/07 angie)
ssh kkstore06
mkdir /cluster/data/mm9/bed/acembly
cd /cluster/data/mm9/bed/acembly
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_37_Sep07.mouse.genes/AceView.mm_37.genes_gff.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_37_Sep07.mouse.genes/AceView.mm_37.good_proteins_fasta.tar.gz
tar xvzf AceView.mm_37.genes_gff.tar.gz
tar xvzf AceView.mm_37.good_proteins_fasta.tar.gz
cd AceView.mm_37.genes_gff
# If the result of this command is > 0, then some lines have end < start
# and need to be fixed:
awk '$5 < $4 {print;}' *.gff | wc -l
#0
# Add "chr" prefix:
sed -e 's/^/chr/;' x1*.gff > acembly.gff
# Extract annotation types from original gff:
perl -wpe 's/^.*Gene_type (\w+); transcript_id (\S+);.*/$2\t$1/; \
s/Main$/main/ || s/Putative$/putative/ || \
die "Unrecognized class:\n$_\n";' *.gff \
| sort -u \
> acemblyClass.tab
# Keep tabs on the transcript names that end in -unspliced --
# the first time around, had to add that suffix to some protein names
# in order to get all of them to match. runJoiner is the real test.
grep unspliced acemblyClass.tab | wc -l
#54774
# Pare down proteins to just the ones that we have transcripts for:
cd /cluster/data/mm9/bed/acembly/AceView.mm_37.good_proteins_fasta
awk '{print $1;}' ../AceView.mm_37.genes_gff/acemblyClass.tab \
> transcriptNames.txt
cat *.fasta \
| faSomeRecords stdin transcriptNames.txt acemblyPep.fa
grep unspliced acemblyPep.fa | wc -l
#45033
# Danielle Thierry-Mieg explained that noncoding genes are included so
# the number of proteins can be smaller than the number of transcripts.
# Load tables
ssh hgwdev
cd /cluster/data/mm9/bed/acembly/AceView.mm_37.genes_gff
ldHgGene -gtf mm9 acembly acembly.gff
#Read 173008 transcripts in 2366104 lines in 1 files
# 173008 groups 21 seqs 1 sources 5 feature types
hgLoadSqlTab mm9 acemblyClass ~/kent/src/hg/lib/acemblyClass.sql \
acemblyClass.tab
cd /cluster/data/mm9/bed/acembly/AceView.mm_37.good_proteins_fasta
hgPepPred mm9 generic acemblyPep acemblyPep.fa
rm acemblyPep.tab
runJoiner.csh mm9 acembly
# mm9.acemblyPep.name - hits 149560 of 149560 ok
# mm9.acemblyClass.name - hits 173008 of 173008 ok
#########################################################################
# BLASTZ RAT Rn4 (DONE - 2007-08-30 - 2007-09-11 - Hiram)
# re-run this third time with a special matrix from Bob Harris/Webb Miller
cat /cluster/data/blastz/mouse_rat.q
# A C G T
# 56 -109 -45 -137
# -109 100 -103 -45
# -45 -103 100 -109
# -137 -45 -109 56
# O=600 E=55
ssh kkstore06
mkdir /cluster/data/mm9/bed/blastzRn4.2007-08-31
cd /cluster/data/mm9/bed/blastzRn4.2007-08-31
# (The first run, blastzRn4.2007-08-09, was started before the rsync to
# /scratch/data/mm9/ had completed and used /cluster/bluearc/scratch/data/mm9/;
# the DEF below uses the /scratch/data/mm9/ location.)
cat << '_EOF_' > DEF
# mouse vs rat
# Specially tuned blastz parameters from Webb Miller
BLASTZ_ABRIDGE_REPEATS=0
BLASTZ_O=600
BLASTZ_E=55
BLASTZ_Y=15000
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/blastz/mouse_rat.q
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/scratch/hg/rn4/rn4.2bit
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzRn4.2007-08-31
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
cd /cluster/data/mm9/bed/blastzRn4.2007-08-31
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
-stop=net `pwd`/DEF > blastz.out 2>&1 &
# real 243m51.078s
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
-continue=download -stop=download `pwd`/DEF > download.out 2>&1 &
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
-continue=cleanup -syntenicNet `pwd`/DEF > syntenicNet.out 2>&1 &
cat fb.mm9.chainRn4Link.txt
# 1713186474 bases of 2620346127 (65.380%) in intersection
# and the swap
mkdir /cluster/data/rn4/bed/blastz.mm9.swap
cd /cluster/data/rn4/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzRn4.2007-08-31/DEF \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
-swap -syntenicNet > swap.out 2>&1 &
# real 314m59.840s
cat fb.rn4.chainMm9Link.txt
# 1711034941 bases of 2571531505 (66.538%) in intersection
#########################################################################
# EXONIPHY MM9, lifted from hg18 (DONE - 2007-09-05 - Hiram)
# needed for ucscGenes10 building
# create a syntenic liftOver chain file
ssh kolossus
cd /cluster/data/hg18/bed/blastz.mm9/axtChain
time nice -n +19 netFilter -syn hg18.mm9.net.gz \
| netChainSubset -verbose=0 stdin hg18.mm9.all.chain.gz stdout \
| chainStitchId stdin stdout | gzip -c > hg18.mm9.syn.chain.gz
# real 5m55.575s
# slightly smaller than the ordinary liftOver chain file:
# -rw-rw-r-- 1 77849682 Aug 14 16:49 hg18.mm9.over.chain.gz
# -rw-rw-r-- 1 73972671 Sep 5 15:27 hg18.mm9.syn.chain.gz
# exoniphyMm9.gp is prepared as follows
ssh hgwdev
mkdir /cluster/data/mm9/bed/exoniphy
cd /cluster/data/mm9/bed/exoniphy
hgsql hg18 -e "select * from exoniphy" -N > exoniphyHg18.gp
time nice -n +19 liftOver -genePred exoniphyHg18.gp \
/cluster/data/hg18/bed/blastz.mm9/axtChain/hg18.mm9.syn.chain.gz \
exoniphyMm9.gp unmapped
# real 52m0.335s
wc -l *
# 178162 exoniphyHg18.gp
# 172859 exoniphyMm9.gp
# 10606 unmapped
ssh hgwdev
cd /cluster/data/mm9/bed/exoniphy
nice -n +19 hgLoadGenePred -genePredExt mm9 exoniphy exoniphyMm9.gp
nice -n +19 featureBits mm9 exoniphy
# 25931742 bases of 2620346127 (0.990%) in intersection
nice -n +19 featureBits mm8 exoniphy
# 25952211 bases of 2567283971 (1.011%) in intersection
#########################################################################
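# BLASTZ canFam2 (DONE - 2006-02-18 - Hiram)
# NOTE(review): the 2006-02-18 date predates the mm9 assembly (April 2007)
# and the 2007-09-04 work directory used below; the DONE date needs correcting.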
ssh kkstore06
# establish a screen to control this job
screen
mkdir /cluster/data/mm9/bed/blastzCanFam2.2007-09-04
cd /cluster/data/mm9/bed/blastzCanFam2.2007-09-04
cat << '_EOF_' > DEF
# mouse vs dog
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_SMSK=/scratch/data/mm9/notInOthers
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog CanFam2
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse
SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzCanFam2.2007-09-04
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
# real 871m24.249s
cat fb.mm9.chainCanFam2Link.txt
# 848004408 bases of 2620346127 (32.362%) in intersection
mkdir /cluster/data/canFam2/bed/blastz.mm9.swap
cd /cluster/data/canFam2/bed/blastz.mm9.swap
time /cluster/bin/scripts/doBlastzChainNet.pl \
/cluster/data/mm9/bed/blastzCanFam2.2007-09-04/DEF \
-verbose=2 -bigClusterHub=pk -chainMinScore=3000 \
-chainLinearGap=medium -swap > swap.out 2>&1 &
# real 57m59.126s
cat fb.canFam2.chainMm9Link.txt
# 832145360 bases of 2384996543 (34.891%) in intersection
# need syntenic net for the multiz
cd /cluster/data/mm9/bed/blastzCanFam2.2007-09-04
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-syntenicNet -continue=syntenicNet `pwd`/DEF > synNet.out 2>&1 &
# real 19m1.302s
#########################################################################
# BLASTZ/CHAIN/NET RHEMAC2 (DONE - 2007-09-05 - Hiram)
# Won't put this in Conservation -- special request for ancestor recon.
ssh kkstore06
# use a screen to control this job
# XXX note for next time, naming convention is different here than all the
# others, and there is a missing TMPDIR in the DEF file
screen
mkdir /cluster/data/mm9/bed/blastz.rheMac2.2007-09-05
cd /cluster/data/mm9/bed/blastz.rheMac2.2007-09-05
# DEF file for the mm9 (target) vs rheMac2 (query) blastz run.
# SEQ1_* settings describe the target and SEQ2_* the query; the query chunk
# size must be given as SEQ2_CHUNK (every other DEF in this doc sets both
# SEQ1_CHUNK and SEQ2_CHUNK).
cat << '_EOF_' > DEF
# Mouse vs. macaque
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_SMSK=/scratch/data/mm9/notInOthers
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Macaque (rheMac2)
SEQ2_DIR=/san/sanvol1/scratch/rheMac2/nib
SEQ2_SMSK=/cluster/bluearc/rheMac2/linSpecRep/notInRodent
SEQ2_LEN=/cluster/data/rheMac2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastz.rheMac2.2007-09-05
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
-syntenicNet `pwd`/DEF > do.log 2>&1 &
# real 1017m13.247s
# some kk kluster difficulties, fixup and complete manually
# Completed: 87616 of 87616 jobs
# CPU time in finished jobs: 26547195s 442453.25m 7374.22h 307.26d 0.842 y
# IO & Wait Time: 3384143s 56402.38m 940.04h 39.17d 0.107 y
# Average job time: 342s 5.69m 0.09h 0.00d
# Longest finished job: 3159s 52.65m 0.88h 0.04d
# Submission to last job: 65814s 1096.90m 18.28h 0.76d
# then, continuing
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
-continue=cat -syntenicNet `pwd`/DEF > cat.log 2>&1 &
# real 255m52.382s
cat fb.mm9.chainRheMac2Link.txt
# 998017006 bases of 2620346127 (38.087%) in intersection
mkdir /cluster/data/rheMac2/bed/blastz.mm9.swap
cd /cluster/data/rheMac2/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastz.rheMac2.2007-09-05/DEF \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet > swap.log 2>&1 &
# real 178m31.911s
cat fb.rheMac2.chainMm9Link.txt
# 1094006509 bases of 2646704109 (41.335%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET Orangutan ponAbe1 (DONE - 2007-09-05 - Hiram)
ssh kkstore01
# use a screen to control this job
screen
mkdir /cluster/data/mm9/bed/blastzPonAbe1.2007-09-05
cd /cluster/data/mm9/bed/blastzPonAbe1.2007-09-05
# next time, have SEQ2_CHUNK at 30000000 and SEQ2_LIMIT at 100
# this caused over 500,000 pk jobs, that is too many
cat << '_EOF_' > DEF
# mouse vs orangutan
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Orangutan ponAbe1
SEQ2_DIR=/scratch/data/ponAbe1/ponAbe1.2bit
SEQ2_LEN=/cluster/data/ponAbe1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzPonAbe1.2007-09-05
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-stop=load -chainMinScore=3000 \
-chainLinearGap=medium -bigClusterHub=pk > do.log 2>&1 &
# real 897m58.156s
# some pk kluster difficulties, fixup and complete manually
# Completed: 511290 of 511290 jobs
# CPU time in finished jobs: 11448015s 190800.24m 3180.00h 132.50d 0.363 y
# IO & Wait Time: 1852197s 30869.96m 514.50h 21.44d 0.059 y
# Average job time: 26s 0.43m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 349s 5.82m 0.10h 0.00d
# Submission to last job: 54771s 912.85m 15.21h 0.63d
# then, continuing
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-continue=cat -stop=load -chainMinScore=3000 \
-chainLinearGap=medium -bigClusterHub=pk > cat.log 2>&1 &
# ran into trouble on the kki chain run with stuff missing
# from the Iservers /scratch/data/ - rsync them up and get
# the run done manually
# Completed: 24 of 24 jobs
# CPU time in finished jobs: 17718s 295.30m 4.92h 0.21d 0.001 y
# IO & Wait Time: 203s 3.38m 0.06h 0.00d 0.000 y
# Average job time: 747s 12.45m 0.21h 0.01d
# Longest finished job: 3673s 61.22m 1.02h 0.04d
# Submission to last job: 3886s 64.77m 1.08h 0.04d
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-continue=chainMerge -stop=load -chainMinScore=3000 \
-chainLinearGap=medium -bigClusterHub=pk > chainMerge.log 2>&1 &
# real 55m27.522s
cat fb.mm9.chainPonAbe1Link.txt
# 913843325 bases of 2620346127 (34.875%) in intersection
mkdir /cluster/data/ponAbe1/bed/blastz.mm9.swap
cd /cluster/data/ponAbe1/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzPonAbe1.2007-09-05/DEF \
-stop=load -chainMinScore=3000 \
-swap -chainLinearGap=medium -bigClusterHub=pk > swap.log 2>&1 &
# create the syntenic maf nets:
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-continue=download -syntenicNet -chainMinScore=3000 \
-chainLinearGap=medium -bigClusterHub=pk > syntenicNet.log 2>&1 &
# real 20m55.024s
# create reciprocal best chains/nets
ssh hgwdev
cd /cluster/data/mm9/bed/blastzPonAbe1.2007-09-05
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 ponAbe1 \
> rbest.log 2>&1 &
# real 53m43.377s
#########################################################################
# BLASTZ/CHAIN/NET Marmoset calJac1 (DONE - 2007-09-06 - 2007-09-07 - Hiram)
ssh kkstore06
# use a screen to control this job
screen
mkdir /cluster/data/mm9/bed/blastzCalJac1.2007-09-06
cd /cluster/data/mm9/bed/blastzCalJac1.2007-09-06
# next time, try SEQ2_CHUNK at 40000000, SEQ2_LIMIT at 75
# this created 285,570 kluster jobs, that is too many
cat << '_EOF_' > DEF
# mouse vs marmoset
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Marmoset calJac1
SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzCalJac1.2007-09-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-stop=load -chainMinScore=3000 \
-chainLinearGap=medium -bigClusterHub=pk > do.log 2>&1 &
# real 897m58.156s
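# some pk kluster difficulties, fixup and complete manually
# NOTE(review): the elapsed time and para statistics recorded here are
# identical to those of the ponAbe1 run and disagree with the 285,570 jobs
# noted for this run; they look like a copy/paste leftover.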
# Completed: 511290 of 511290 jobs
# CPU time in finished jobs: 11448015s 190800.24m 3180.00h 132.50d 0.363 y
# IO & Wait Time: 1852197s 30869.96m 514.50h 21.44d 0.059 y
# Average job time: 26s 0.43m 0.01h 0.00d
# Longest finished job: 349s 5.82m 0.10h 0.00d
# Submission to last job: 54771s 912.85m 15.21h 0.63d
# then, continuing
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-continue=cat -stop=load -chainMinScore=3000 \
-chainLinearGap=medium -bigClusterHub=pk > cat.log 2>&1 &
# real 669m34.473s
cat fb.mm9.chainCalJac1Link.txt
# 863961573 bases of 2620346127 (32.971%) in intersection
mkdir /cluster/data/calJac1/bed/blastz.mm9.swap
cd /cluster/data/calJac1/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \
-stop=load -chainMinScore=3000 \
-swap -chainLinearGap=medium -bigClusterHub=pk > swap.log 2>&1 &
# real 217m10.835s
cat fb.calJac1.chainMm9Link.txt
# 887586922 bases of 2929139385 (30.302%) in intersection
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 /cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \
-continue=download -chainMinScore=3000 \
-swap -chainLinearGap=medium -bigClusterHub=pk > download.log 2>&1 &
# real 1m9.876s
# run the syntenic nets
time nice -n +19 doBlastzChainNet.pl -verbose=2 DEF \
-continue=download -chainMinScore=3000 \
-syntenicNet -chainLinearGap=medium -bigClusterHub=pk \
> syntenicNet.log 2>&1 &
# real 22m51.080s
# create reciprocal best chains/nets
ssh hgwdev
cd /cluster/data/mm9/bed/blastzCalJac1.2007-09-06
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 calJac1 \
> rbest.log 2>&1 &
# real 47m18.467s
#########################################################################
# BLASTZ/CHAIN/NET Fugu fr2 (DONE - 2007-09-06 - 2007-09-07 - Hiram)
ssh kkstore02
# use a screen to control this job
screen
mkdir /cluster/data/mm9/bed/blastzFr2.2007-09-06
cd /cluster/data/mm9/bed/blastzFr2.2007-09-06
cat << '_EOF_' > DEF
# mouse vs fugu
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Fugu fr2
# Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzFr2.2007-09-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# real 156m55.151s
# fixup broken kluster jobs, complete manually
# Completed: 70395 of 70395 jobs
# CPU time in finished jobs: 4339015s 72316.91m 1205.28h 50.22d 0.138 y
# IO & Wait Time: 486414s 8106.90m 135.12h 5.63d 0.015 y
# Average job time: 69s 1.14m 0.02h 0.00d
# Longest finished job: 1098s 18.30m 0.30h 0.01d
# Submission to last job: 18352s 305.87m 5.10h 0.21d
# and then continuing
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
# real 5m43.977s
# Still, the typical failure
# HgStepManager: executing step 'net' Thu Sep 6 16:04:56 2007.
# netChains: looks like previous stage was not successful (can't find [mm9.fr2.]all.chain[.gz]).
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-continue=net -bigClusterHub=kk -verbose=2 > net.log 2>&1 &
# real 178m15.798s
cat fb.mm9.chainFr2Link.txt
# 47018710 bases of 2620346127 (1.794%) in intersection
mkdir /cluster/data/fr2/bed/blastz.mm9.swap
cd /cluster/data/fr2/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
/cluster/data/mm9/bed/blastzFr2.2007-09-06/DEF \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
# real 15m32.368s
cat fb.fr2.chainMm9Link.txt
# 42413565 bases of 393312790 (10.784%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET Tetraodon tetNig1 (DONE - 2007-09-06 - 2007-09-07 - Hiram)
ssh kkstore01
# use a screen to control this job
screen
mkdir /cluster/data/mm9/bed/blastzTetNig1.2007-09-06
cd /cluster/data/mm9/bed/blastzTetNig1.2007-09-06
cat << '_EOF_' > DEF
# mouse vs tetraodon
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tetraodon tetNig1
# Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzTetNig1.2007-09-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# real 535m2.474s
# Typical failure
# HgStepManager: executing step 'net' Fri Sep 7 01:13:06 2007.
# netChains: looks like previous stage was not successful (can't find [mm9.tetNig1.]all.chain[.gz]).
# continuing
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-continue=net -bigClusterHub=kk -verbose=2 > net.log 2>&1 &
cat fb.mm9.chainTetNig1Link.txt
# 46206292 bases of 2620346127 (1.763%) in intersection
mkdir /cluster/data/tetNig1/bed/blastz.mm9.swap
cd /cluster/data/tetNig1/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzTetNig1.2007-09-06/DEF \
-chainMinScore=5000 \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-swap -bigClusterHub=kk > swap.log 2>&1 &
# real 19m58.885s
cat fb.tetNig1.chainMm9Link.txt
# 42256263 bases of 342403326 (12.341%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET Stickleback gasAcu1 (DONE - 2007-09-06 - 2007-09-07 - Hiram)
ssh kkstore01
# use a screen to control this job
screen
mkdir /cluster/data/mm9/bed/blastzGasAcu1.2007-09-06
cd /cluster/data/mm9/bed/blastzGasAcu1.2007-09-06
cat << '_EOF_' > DEF
# mouse vs stickleback
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: stickleback gasAcu1
SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
SEQ2_CHUNK=35000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzGasAcu1.2007-09-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# Completed: 52725 of 52725 jobs
# CPU time in finished jobs: 4110432s 68507.19m 1141.79h 47.57d 0.130 y
# IO & Wait Time: 413069s 6884.49m 114.74h 4.78d 0.013 y
# Average job time: 86s 1.43m 0.02h 0.00d
# Longest finished job: 1140s 19.00m 0.32h 0.01d
# Submission to last job: 71194s 1186.57m 19.78h 0.82d
# had some jobs fail on the kk run, finish manually, then continuing:
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
# real 120m36.209s
# failed kki chain job due to san outage on kkr7u00, finished manually:
# Completed: 24 of 24 jobs
# CPU time in finished jobs: 1807s 30.12m 0.50h 0.02d 0.000 y
# IO & Wait Time: 258s 4.29m 0.07h 0.00d 0.000 y
# Average job time: 86s 1.43m 0.02h 0.00d
# Longest finished job: 257s 4.28m 0.07h 0.00d
# Submission to last job: 9851s 164.18m 2.74h 0.11d
# continuing
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 -verbose=2 \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-continue=chainMerge -bigClusterHub=kk > chainMerge.log 2>&1 &
# real 21m7.089s
cat fb.mm9.chainGasAcu1Link.txt
# 48448585 bases of 2620346127 (1.849%) in intersection
mkdir /cluster/data/gasAcu1/bed/blastz.mm9.swap
cd /cluster/data/gasAcu1/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
/cluster/data/mm9/bed/blastzGasAcu1.2007-09-06/DEF \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
cat fb.gasAcu1.chainMm9Link.txt
# 43730193 bases of 446627861 (9.791%) in intersection
#########################################################################
# BLASTZ Zebrafish danRer5 (DONE - 2007-09-11 - 2007-09-12 - Hiram)
# re-run a second time with BLASTZ_Q, see below
ssh kkstore06
screen # use screen to manage this job
mkdir /cluster/data/mm9/bed/blastzDanRer5.2007-09-11
cd /cluster/data/mm9/bed/blastzDanRer5.2007-09-11
cat << '_EOF_' > DEF
# Mouse (mm9) vs zebrafish (danRer5)
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - zebrafish (danRer5)
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
SEQ2_LEN=/cluster/data/danRer5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzDanRer5.2007-09-11
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 &
# real 222m47.787s
cat fb.mm9.chainDanRer5Link.txt
# 48497464 bases of 2620346127 (1.851%) in intersection
mkdir /cluster/data/danRer5/bed/blastz.mm9.swap
cd /cluster/data/danRer5/bed/blastz.mm9.swap
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-chainMinScore=5000 \
/cluster/data/mm9/bed/blastzDanRer5.2007-09-11/DEF \
-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> swap.log 2>&1 &
# real 9m47.163s
cat fb.danRer5.chainMm9Link.txt
# 34017483 bases of 1435609608 (2.370%) in intersection
#########################################################################
# BLASTZ Zebrafish danRer5 (DONE - 2007-09-13 - Hiram)
# second time, forgot to include BLASTZ_Q the first time
ssh kkstore06
screen # use screen to manage this job
mkdir /cluster/data/mm9/bed/blastzDanRer5.2007-09-13
cd /cluster/data/mm9/bed/blastzDanRer5.2007-09-13
# This is the wrong way overlap, but it seems to work
cat << '_EOF_' > DEF
# Mouse (mm9) vs zebrafish (danRer5)
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - zebrafish (danRer5)
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
SEQ2_LEN=/cluster/data/danRer5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzDanRer5.2007-09-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 &
# real 369m16.947s
cat fb.mm9.chainDanRer5Link.txt
# 84513268 bases of 2620346127 (3.225%) in intersection
mkdir /cluster/data/danRer5/bed/blastz.mm9.swap
cd /cluster/data/danRer5/bed/blastz.mm9.swap
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-chainMinScore=5000 \
/cluster/data/mm9/bed/blastzDanRer5.2007-09-13/DEF \
-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> swap.log 2>&1 &
# real 21m44.784s
cat fb.danRer5.chainMm9Link.txt
# 66400782 bases of 1435609608 (4.625%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET Guinea Pig cavPor2 (DONE - 2007-09-19 - kate)
ssh kkstore06
mkdir /cluster/data/mm9/bed/blastzCavPor2.2007-09-19
cd /cluster/data/mm9/bed/blastzCavPor2.2007-09-19
cat << '_EOF_' > DEF
# mouse vs guinea pig
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Guinea pig cavPor2
SEQ2_DIR=/san/sanvol1/scratch/cavPor2/cavPor2.2bit
SEQ2_LEN=/san/sanvol1/scratch/cavPor2/chrom.sizes
# chunking similar to cat (similar number of scaffolds)
SEQ2_CHUNK=30000000
SEQ2_LIMIT=500
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzCavPor2.2007-09-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
doBlastzChainNet.pl `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=pk >& do.log &
# load nets manually -- automated loading fails as classification info
# not available (no database)
ssh hgwdev
cd /cluster/data/mm9/bed/blastz.cavPor2/axtChain
netFilter -minGap=10 noClass.net | hgLoadNet -warn mm9 netCavPor2 stdin
netFilter -minGap=10 mm9.cavPor2.rbest.net.gz | \
hgLoadNet -warn mm9 netRBestCavPor2 stdin
doBlastzChainNet.pl `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-continue=download >& do2.log &
# reciprocal best net mafs for multiz
~/kent/src/hg/utils/automation/doRecipBest.pl mm9 cavPor2 >&! rbest.log &
time nice -n +19 featureBits mm9 chainCavPor2Link \
> fb.mm9.chainCavPor2Link.txt 2>&1
cat fb.mm9.chainCavPor2Link.txt
# 480194223 bases of 2620346127 (18.326%) in intersection
# create the syntenic maf nets (these are unneeded):
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
-chainLinearGap=medium -continue=syntenicNet -syntenicNet \
-bigClusterHub=pk > syntenicNet.log 2>&1
#########################################################################
## 4-Way Multiz (DONE - 2007-09-07 - Hiram)
ssh hgwdev
mkdir /cluster/data/mm9/bed/multiz4way
cd /cluster/data/mm9/bed/multiz4way
ln -s ../multiz30way/mm9.guess.30way.nh ./30way.nh
# leave mm9, rn4, canFam2 and hg18
/cluster/bin/phast/tree_doctor \
--prune panTro2,ponAbe1,rheMac2,calJac1,otoGar1,tupBel1,cavPor2,oryCun1,sorAra1,eriEur1,felCat3,equCab1,bosTau3,dasNov1,loxAfr1,echTel1,monDom4,ornAna1,galGal3,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat1,danRer4 30way.nh
# this leaves us with:
cat << '_EOF_' > 4way.nh
((hg18:0.126901,
(rn4:0.084383,mm9:0.076274):0.249544):0.019763,canFam2:0.187963);
'_EOF_'
# << happy emacs
# Use this specification in the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to obtain a gif image for htdocs/images/phylo/mm9_4way.gif
/cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt
# Use this output to create the table below
grep -y mm9 4way.distances.txt | sort -k3,3n
#
# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure
#
# featureBits chainLink measures
# chainMm9Link chain linearGap
# distance on mm9 on other minScore
# 1 0.160657 - rat rn4 (% 65.380) (% xx.xxx) 5000 medium
# 2 0.452719 - human hg18 (% 38.499) (% 35.201) 3000 medium
# 3 0.533544 - dog canFam2 (% 32.362) (% 34.891) 3000 medium
# using the syntenic nets
cd /cluster/data/mm9/bed/multiz4way
mkdir mafLinks
mkdir mafLinks/rn4
cd mafLinks/rn4
ln -s ../../../blastzRn4.2007-08-31/mafSynNet/*.maf.gz .
mkdir ../hg18
cd ../hg18
ln -s ../../../blastz.hg18/mafSynNet/*.maf.gz .
mkdir ../canFam2
cd ../canFam2
ln -s ../../../blastz.canFam2/mafSynNet/*.maf.gz .
# Copy MAFs to some appropriate NFS server for kluster run
mkdir /san/sanvol1/scratch/mm9/multiz4way
cd /san/sanvol1/scratch/mm9/multiz4way
time nice -n +19 rsync -a --copy-links --progress \
/cluster/data/mm9/bed/multiz4way/mafLinks/ .
# 1 minute to copy 2.4 Gb
# determine what is the newest version of multiz and use that
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn
# the autoMultiz cluster run
ssh pk
cd /cluster/data/mm9/bed/multiz4way
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
4way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
mkdir run maf
cd run
# NOTE: you need to set the db and multiz dirname properly in this script
cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = mm9
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/$db/multiz4way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz4way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz
cat << '_EOF_' > template
#LOOP
./autoMultiz $(root1) {check out line+ /cluster/data/mm9/bed/multiz4way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
awk '{print $1}' /cluster/data/mm9/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
# 35 jobs
para try ... check ... push ... etc ...
# Completed: 35 of 35 jobs
# CPU time in finished jobs: 27901s 465.02m 7.75h 0.32d 0.001 y
# IO & Wait Time: 562s 9.37m 0.16h 0.01d 0.000 y
# Average job time: 813s 13.55m 0.23h 0.01d
# Longest finished job: 2222s 37.03m 0.62h 0.03d
# Submission to last job: 2222s 37.03m 0.62h 0.03d
# combine results into a single file for loading and gbdb reference
ssh kkstore06
cd /cluster/data/mm9/bed/multiz4way
time nice -n +19 catDir maf > multiz4way.maf
# real 2m43.409s
# makes a 6.5 Gb file:
# -rw-rw-r-- 1 6883356263 Sep 7 11:00 multiz4way.maf
# Create per-chrom individual maf files for downloads
# NOT NECESSARY HERE - DONE LATER WITH THE ANNOTATED MAFS
ssh kkstore04
cd /cluster/data/mm9/bed/multiz4way
mkdir mafDownloads
time for M in maf/chr*.maf
do
B=`basename $M`
cp -p ${M} mafDownloads/${B}
gzip mafDownloads/${B}
echo ${B} done
done
# real 5m9.273
# deliver to downloads *!* NOT NECESSARY HERE - DONE LATER WITH
# THE ANNOTATED MAFS
ssh hgwdev
ln -s /cluster/data/mm9/bed/multiz4way/mafDownloads \
/usr/local/apache/htdocs/goldenPath/mm9/multiz4way
# Load into database
ssh hgwdev
cd /cluster/data/mm9/bed/multiz4way
mkdir /gbdb/mm9/multiz4way
ln -s /cluster/data/mm9/bed/multiz4way/multiz4way.maf \
/gbdb/mm9/multiz4way
time nice -n +19 hgLoadMaf mm9 multiz4way
# Loaded 5072051 mafs in 1 files from /gbdb/mm9/multiz4way
# real 2m33.680s
time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
-maxSize=50000 mm9 multiz4waySummary multiz4way.maf
# Created 1330454 summary blocks from 9893113 components
# and 5068764 mafs from multiz4way.maf
# real 3m27.620s
# Create tree image for details page
# You can get a better image from the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# with mm9 on top:
# (((mouse_mm9:0.076274,rat_rn4:0.084383):0.249544,human_hg18:0.126901):0.019763,
# dog_canFam2:0.187963);
#########################################################################
### GNF ATLAS 2 - required for UCSC Gene/Gene Sorter build
# (DONE - 2007-09-10 - Hiram)
# Align probes from GNF1M chip.
ssh pk
mkdir -p /cluster/data/mm9/bed/geneAtlas2/run/psl
cd /cluster/data/mm9/bed/geneAtlas2/run
cut -f1 /cluster/data/mm9/chrom.sizes > genome.list
ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > probe.list
cat << '_EOF_' > template
#LOOP
blat -fine -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.list probe.list template jobList
para create jobList
para try ... check ... push ... etc.
para time
# Completed: 35 of 35 jobs
# CPU time in finished jobs: 14865s 247.75m 4.13h 0.17d 0.000 y
# IO & Wait Time: 160s 2.66m 0.04h 0.00d 0.000 y
# Average job time: 429s 7.15m 0.12h 0.00d
# Longest finished job: 1151s 19.18m 0.32h 0.01d
# Submission to last job: 1166s 19.43m 0.32h 0.01d
# Do sort and best in genome filter
# to create affyGnf1m.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl \
../affyGnf1m.psl /dev/null
# Load probes and alignments from GNF1M into database.
ssh hgwdev
cd /cluster/data/mm9/bed/geneAtlas2
# ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes
hgLoadPsl mm9 affyGnf1m.psl
hgLoadSeq mm9 /gbdb/hgFixed/affyProbes/gnf1m.fa
# 31309 sequences
# Load up track
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \
affyGnf1m.psl
#Loaded 34863 rows of expression data from hgFixed.gnfMouseAtlas2MedianRatio
# Mapped 30117, multiply-mapped 1723, missed 882, unmapped 4746
# Note that the unmapped 5000 records are from all-N sequences.
hgLoadBed mm9 gnfAtlas2 gnfAtlas2.bed
# Loaded 31840 elements of size 15
featureBits mm9 gnfAtlas2
# 12921627 bases of 2620346127 (0.493%) in intersection
featureBits mm8 gnfAtlas2
# 12858280 bases of 2567283971 (0.501%) in intersection
# during the build of UCSC genes, this sequence takes place:
hgMapToGene mm9 affyGnf1m knownGene knownToGnf1m
hgExpDistance mm9 hgFixed.gnfMouseAtlas2MedianRatio \
hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m
# this hgExpDistance command takes some time, maybe an hour or so ?
# Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio
# Got 31145 unique elements in hgFixed.gnfMouseAtlas2MedianRatio
hgMapToGene mm9 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
############################################################################
### affyU74 TRACK - needed for the Gene Sorter (DONE - 2007-09-10 - Hiram)
#
# MAKE THE affyU74 TRACK using Affy consensus sequences instead of
# target sequences. Recalculate alignments and load data
#
# The affy data has previously been loaded to iscratch in:
# /iscratch/i/affy
# It originates from:
# /projects/compbio/data/microarray/affyGnfMouse/sequences/
# Run cluster job to do alignments
ssh kk
mkdir -p /cluster/data/mm9/bed/affyU74/run/psl
cd /cluster/data/mm9/bed/affyU74/run
cut -f1 /cluster/data/mm9/chrom.sizes > genome.list
ls -1 /iscratch/i/affy/U74*consensus.fa > affy.list
cat << '_EOF_' > template
#LOOP
blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.list affy.list template jobList
para create jobList
para try ... check ... push ... etc.
para time
# Completed: 105 of 105 jobs
# CPU time in finished jobs: 5891s 98.18m 1.64h 0.07d 0.000 y
# IO & Wait Time: 738s 12.31m 0.21h 0.01d 0.000 y
# Average job time: 63s 1.05m 0.02h 0.00d
# Longest finished job: 199s 3.32m 0.06h 0.00d
# Submission to last job: 215s 3.58m 0.06h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU74.psl.
pslSort dirs raw.psl tmp psl
# change filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least minAli = 0.95.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
#pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl \
../all_affyU74.psl /dev/null
# Processed 40512 alignments
# Sort by chromosome and load into database.
ssh hgwdev
cd /cluster/data/mm9/bed/affyU74
pslSortAcc nohead chrom temp all_affyU74.psl
# Processed 30609 lines into 1 temp files
cat chrom/*.psl > affyU74.psl
# shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
# and reload data into table
mv affyU74.psl affyU74.psl.orig
cut -f 1-9 affyU74.psl.orig >j1.tmp
cut -f 10 affyU74.psl.orig | sed -e 's/:/\t/' | cut -f 2 > j2.tmp
cut -f 11-21 affyU74.psl.orig >j3.tmp
paste j1.tmp j2.tmp j3.tmp >affyU74.psl
hgLoadPsl mm9 affyU74.psl
rm -rf chrom temp run j?.tmp
# creating the gene sorter tables runs the following:
hgMapToGene mm9 affyU74 knownGene knownToU74
############################################################################
## MAKE THE affyGnfU74 TRACKs (DONE 3/8/06, Fan)
# Make bed files and load consensus sequences for Affy U74 chip set.
#This needs to be done after affyU74 is already made.
ssh hgwdev
mkdir -p /cluster/data/mm9/bed/affyGnf
cd /cluster/data/mm9/bed/affyGnf
# may need to build this command in src/hg/affyGnf
~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74/affyU74.psl \
/projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \
affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2
# 89 experiments
# 10043 rows of expression data
# 30609 records in ../affyU74/affyU74.psl
# 10309 records written to affyGnfU74A.bed
~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74/affyU74.psl \
/projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \
affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2
# 20 experiments
# 12477 rows of expression data
# 30609 records in ../affyU74/affyU74.psl
# 11324 records written to affyGnfU74B.bed
~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74/affyU74.psl \
/projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \
affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2
# 20 experiments
# 11934 rows of expression data
# 30609 records in ../affyU74/affyU74.psl
# 7773 records written to affyGnfU74C.bed
# edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
# (these files do not appear to have these long names in them to begin with)
mkdir sav
mv *.bed sav
sed -e "s/U74Av2://" sav/affyGnfU74A.bed > affyGnfU74A.bed
sed -e "s/U74Bv2://" sav/affyGnfU74B.bed > affyGnfU74B.bed
sed -e "s/U74Cv2://" sav/affyGnfU74C.bed > affyGnfU74C.bed
# and reload data into table
hgLoadBed mm9 affyGnfU74A affyGnfU74A.bed
# Loaded 10309 elements of size 15
hgLoadBed mm9 affyGnfU74B affyGnfU74B.bed
# Loaded 11324 elements of size 15
hgLoadBed mm9 affyGnfU74C affyGnfU74C.bed
# Loaded 7773 elements of size 15
# Add in sequence data for U74 tracks.
# This business is already in gbdb - 2007-09-10 - Hiram
# You do not need to repeat this symlink sequence
# Copy consensus sequence to /gbdb if it isn't already
# mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
# fix broken symlinks after directory structure changed
# /projects/compbiodata ----> /projects/compbio/data
rm U74*
# make correct symlinks (hartera, 2005-05-03)
ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa .
ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa .
ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa .
# used perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
# ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4.
# reload sequences with prefix removed so acc matches name used in
# other dependent tables
hgLoadSeq -abbr=U74Av2: mm9 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa
# 12422 sequences
hgLoadSeq -abbr=U74Bv2: mm9 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa
# 12411 sequences
hgLoadSeq -abbr=U74Cv2: mm9 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa
# 11868 sequences
# building the gene sorter runs the following commands
hgExpDistance mm9 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \
-lookup=knownToU74
# real 7m6.223s
# Have 9636 elements in affyGnfU74A
# Got 15902 unique elements in affyGnfU74A
hgExpDistance mm9 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \
-lookup=knownToU74
# real 2m12.727s
# Have 11025 elements in affyGnfU74B
# Got 10442 unique elements in affyGnfU74B
hgExpDistance mm9 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \
-lookup=knownToU74
# real 0m29.270s
# Have 7487 elements in affyGnfU74C
# Got 3259 unique elements in affyGnfU74C
##########################################################################
# BUILD NIBB IMAGE PROBES (DONE - 2007-09-10 - Hiram)
ssh pk
mkdir -p /cluster/data/mm9/bed/nibbPics/run
cd /cluster/data/mm9/bed/nibbPics
cp -p /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
cd run
mkdir psl
ls -1 /scratch/data/mm9/nib/*.nib > genome.list
echo ../nibbImageProbes.fa > probe.list
# Create parasol gensub template file
cat << '_EOF_' > template
#LOOP
blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch
gensub2 genome.list probe.list template jobList
para create jobList
para try ... check ... push ... etc... time
# Completed: 35 of 35 jobs
# CPU time in finished jobs: 9983s 166.39m 2.77h 0.12d 0.000 y
# IO & Wait Time: 146s 2.43m 0.04h 0.00d 0.000 y
# Average job time: 289s 4.82m 0.08h 0.00d
# Longest finished job: 729s 12.15m 0.20h 0.01d
# Submission to last job: 729s 12.15m 0.20h 0.01d
# Make sort and filter
catDir psl | sort -k 10 \
| pslReps stdin stdout /dev/null -nohead -minAli=0.60 \
-nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
| sort -k 14,14 -k 16,16n \
| sed 's#/scratch/data/mm9/nib/chr#chr#' \
| sed 's/.nib//' > ../nibbImageProbes.psl
# Make bed file and copy in stuff
ssh hgwdev
cd /cluster/data/mm9/bed/nibbPics
# Load into database
ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa \
/gbdb/mm9/nibbImageProbes.fa
hgLoadSeq mm9 /gbdb/mm9/nibbImageProbes.fa
hgLoadPsl mm9 nibbImageProbes.psl
#########################################################################
# Creating visiGene tables for gene sorter business
# (DONE - 2007-09-10 - Hiram)
# This business has cumulative effects on the visiGene database
# for safety purposes, backup the visiGene database
ssh hgwdev
mkdir -p /cluster/data/mm9/bed/vgProbes/visiGene.bak
cd /cluster/data/mm9/bed/vgProbes/visiGene.bak
hgsqldump --all -c --tab=. visiGene
cd /cluster/data/mm9/bed/vgProbes
mkdir working
cd /cluster/data/mm9/bed/vgProbes
cp -p ~/kent/src/hg/visiGene/vgProbeTrack/*.sql .
# this SEQ appears to find nothing new ?
vgProbeTrack SEQ working mm9
# rc = 0 = count of primers for mrna search for taxon 10090
# rc = 0 = count of primers for genome search for taxon 10090
# bac list read done.
# found seq for 0 bacEndPairs
# rc = 0 = count of refSeq mrna for mm9
# rc = 0 = count of genRef mrna for mm9
# rc = 0 = count of genbank mrna for mm9
# rc = 0 = count of flatRef mrna for mm9
# rc = 0 = count of flatAll mrna for mm9
# rc = 0 = count of linkRef mrna for mm9
# rc = 0 = count of linkAll mrna for mm9
# rc = 0 = count of kgAlRef mrna for mm9
# rc = 0 = count of kgAlAll mrna for mm9
# and then, this creates the vgProbes table in mm9
vgProbeTrack ALI working mm9 -sqlPath=..
hgsql -e "select count(*) from vgProbes;" mm9
# 24924
hgsql -e "select count(*) from vgProbes;" mm8
# 24615
# this appears to build working/vgPrbExt.fa and it loaded some sequences
vgProbeTrack EXT working mm9
# this copies over all the items from vgProbes to start vgAllProbes
vgProbeTrack SELFMAP working mm9 -sqlPath=..
# this adds frog alignments to vgAllProbes
vgProbeTrack -sqlPath=.. REMAP working mm9 nibb nibbImageProbes \
/gbdb/mm9/nibbImageProbes.fa
hgsql -e "select count(*) from vgAllProbes;" mm9
# 26289
hgsql -e "select count(*) from vgAllProbes;" mm8
# 25994
# finally, gathering together all alignments used and updates seq table
vgProbeTrack EXTALL working mm9
# Then, during the gene sorter build, it does:
knownToVisiGene mm9
vgGetText visiGene.text mm7 mm8 mm9 hg17 hg18
# probe has 26611 rows
# gene has 20413 rows
# imageProbe has 125765 rows
wc -l visiGene.text
# 124186 visiGene.text
# compare to existing:
wc -l /usr/local/apache/cgi-bin/visiGeneData/visiGene.text
# 124186 /usr/local/apache/cgi-bin/visiGeneData/visiGene.text
#########################################################################
# Create Allen Brain Atlas mapping. (DONE - 2007-09-24 - Hiram)
# Set up directory
ssh kkstore06
mkdir /cluster/data/mm9/bed/allenBrain
cd /cluster/data/mm9/bed/allenBrain
# find most recent update of allProbes.fa to use for these alignments
cp -p /cluster/data/mm6/bed/allenBrain/allProbes.fa ./allenBrainProbes.fa
cp -p /cluster/data/mm6/bed/allenBrain/allenBrainUrl.tab .
# Set up a blat run to align the probes.
mkdir split
faSplit sequence allenBrainProbes.fa 200 split/rp
mkdir run
ssh pk
cd /cluster/data/mm9/bed/allenBrain/run
ls -1 ../split/*.fa > probe.list
ls -1 /scratch/data/mm9/nib/*.nib > genome.list
mkdir psl
cat << '_EOF_' > template
#LOOP
runBlat $(path1) $(path2) $(root1) $(root2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
cat << '_EOF_' > runBlat
#!/bin/csh -ef
set ooc = /scratch/data/mm9/11.ooc
set tmpDir = /scratch/tmp/mm9
set workDir = $tmpDir/$3_$4
set pslOut = $3_$4.psl
mkdir -p $tmpDir
mkdir $workDir
blat -ooc=$ooc $1 $2 $workDir/$pslOut
mv $workDir/$pslOut psl/$pslOut
rmdir $workDir
rmdir --ignore-fail-on-non-empty $tmpDir
'_EOF_'
# << happy emacs
chmod +x runBlat
gensub2 genome.list probe.list template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 6790 of 6790 jobs
# CPU time in finished jobs: 28129s 468.81m 7.81h 0.33d 0.001 y
# IO & Wait Time: 23014s 383.57m 6.39h 0.27d 0.001 y
# Average job time: 8s 0.13m 0.00h 0.00d
# Longest finished job: 29s 0.48m 0.01h 0.00d
# Submission to last job: 363s 6.05m 0.10h 0.00d
# Then do sorting and near-best-in-genome step on file server
ssh kkstore06
cd /cluster/data/mm9/bed/allenBrain/run
pslSort dirs raw.psl tmp psl
pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 \
-nearTop=0.001 /dev/null
# Processed 63183 alignments
sort -k14,14 -k16,16n ../best.psl > ../allenBrainAli.psl
# Clean up big files no longer needed
rm raw.psl batch.bak
rm -r psl
rm -r ../split
# Load up database
ssh hgwdev
cd /cluster/data/mm9/bed/allenBrain
# Make a new table that contains the URLs for the allen brain genes
# Make this one first since all.joiner considers it the master table.
hgsql mm9 < ~/kent/src/hg/lib/allenBrainUrl.sql
hgsql mm9 -e \
'load data local infile "allenBrainUrl.tab" into table allenBrainUrl;'
# Make probe alignment table, and load sequence.
hgLoadPsl mm9 allenBrainAli.psl
mkdir /gbdb/mm9/allenBrain
ln -s /cluster/data/mm9/bed/allenBrain/allenBrainProbes.fa \
/gbdb/mm9/allenBrain/allenBrainProbes.fa
hgLoadSeq -replace mm9 /gbdb/mm9/allenBrain/allenBrainProbes.fa
# Make mapping between known genes and allenBrain
hgMapToGene mm9 allenBrainAli -type=psl knownGene knownToAllenBrain
#########################################################################
# MOUSE AFFYMETRIX MOE430 TRACK (DONE - 2007-09-10 - Hiram)
# mkdir -p /projects/compbio/data/microarray/affyMouse
# Download MOE430A and MOE430B consensus sequences from Affymetrix web site
# http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430
# unzip MOE430*_consensus.zip
# check for duplicate probes: there are none, all have unique names
# check for duplicate probes: 100 from 136745_at to 1367551_a_at
# remove "consensus:" and ";" from FASTA headers to shorten probeset
# names for database
# sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa
# sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa
# cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
# /cluster/bluearc/affy/
# (THE ABOVE WAS ALREADY DONE)
# Set up cluster job to align MOE430 consensus sequences to mm9
ssh kk
mkdir /cluster/data/mm9/bed/affyMOE430
cd /cluster/data/mm9/bed/affyMOE430
ls -1 /iscratch/i/affy/MOE430_all.fa > probe.list
cut -f1 /cluster/data/mm9/chrom.sizes > genome.list
cat << '_EOF_' > template
#LOOP
blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.list probe.list template jobList
mkdir psl
para create jobList
# Do the job with usual para try/check/push/time etc.
# Completed: 35 of 35 jobs
# CPU time in finished jobs: 9093s 151.55m 2.53h 0.11d 0.000 y
# IO & Wait Time: 217s 3.62m 0.06h 0.00d 0.000 y
# Average job time: 266s 4.43m 0.07h 0.00d
# Longest finished job: 602s 10.03m 0.17h 0.01d
# Submission to last job: 602s 10.03m 0.17h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyMOE430.psl
pslSort dirs raw.psl tmp psl
# only use alignments that cover 30% of sequence and have at least
# 95% identity in aligned region.
# low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 \
raw.psl affyMOE430.psl /dev/null
# Load alignments and sequences into database
ssh hgwdev
cd /cluster/data/mm9/bed/affyMOE430
# shorten names in psl file
sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
mv affyMOE430.psl.bak affyMOE430.psl
# load track into database
hgLoadPsl mm9 affyMOE430.psl
# Add consensus sequences for MOE430
# Copy sequences to gbdb if they are not there already
# mkdir -p /gbdb/hgFixed/affyProbes
# ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
# /gbdb/hgFixed/affyProbes
hgLoadSeq -abbr=MOE430 mm9 /gbdb/hgFixed/affyProbes/MOE430_all.fa
# Clean up
rm batch.bak raw.psl
# and then, during the gene sorter build, it does:
hgMapToGene mm9 affyMOE430 knownGene knownToMOE430
hgMapToGene mm9 affyMOE430 -prefix=A: knownGene knownToMOE430A
#########################################################################
# creating UCSC genes track (DONE - 2007-08-31 - 2007-09-25 - Hiram)
# working on the script mm9.ucscGenes10.csh in src/hg/makeDb/doc
# The tracks created above were done as they were encountered
# in working through that script. Worked through that script
# approximately one kluster run at a time, using a large if (1 == 0)
# statement to skip over business that had been successfully completed.
# After it reached the point where it had begun to load the tables
# into the tempDb and started to fail at the missing tables affyGnf1m
# the successfully loaded tables in tempDb were moved to mm9 and
# the track began to function. Then, working through the affy
# alignments above, and completing the loading of the knownTo tables
# for the gene sorter as they were completed. Now continuing below
# with the rest of the steps manually since it is not necessary to
# use the tempDb and its /gbdb/ directory. Everything is now taking
# place in the mm9 database.
# example script to transfer appropriate tables from one DB to another
# while saving the first set too
# For every table in mm9UCGenes that survives the egrep filter:
#   - print the row count in mm9 (C1), in mm9UCGenes (C2), and C2 - C1
#   - move the existing mm9 table aside to mm9UCGenes.<table>_try0
#   - move the mm9UCGenes table into mm9
# NOTE: the egrep pattern must stay on ONE line.  A newline inside the
# quoted pattern makes egrep treat the remainder as a second pattern
# whose first alternative is a bare "s", and with -v that would drop
# every table whose name contains an "s".
hgsql -N -e "show tables;" mm9UCGenes | \
egrep -v "allenBrainAli|allenBrainUrl|extFile|knownToEnsembl|vgProbes|vgAllProbes|^seq$|trackDb|history|chromInfo" | while read T
do
    echo -n "=== table ${T}: "
    C1=`hgsql -N -e "select count(*) from ${T}" mm9`
    # stderr is discarded for the mm9UCGenes count only
    C2=`hgsql -N -e "select count(*) from ${T}" mm9UCGenes 2> /dev/null`
    D=`echo "${C1}" "${C2}" | awk '{printf "%d", $2-$1}'`
    echo "${C1} - ${C2} - ${D}"
    # echo the two renames before running them, so the log shows what was done
    echo "rename table mm9.${T} to mm9UCGenes.${T}_try0"
    echo "rename table mm9UCGenes.${T} to mm9.${T}"
    hgsql -e "rename table mm9.${T} to mm9UCGenes.${T}_try0" mysql
    hgsql -e "rename table mm9UCGenes.${T} to mm9.${T}" mysql
done
# The egrep -v knocks out tables that are redundant, should be the same
# in both DBs
#########################################################################
# running the blastP operation to the other genomes for the gene sorter
# (DONE - 2007-09-10 - Hiram)
mkdir /cluster/data/mm9/bed/ucsc.10/hgNearBlastp
cd /cluster/data/mm9/bed/ucsc.10/hgNearBlastp
cat << '_EOF_' > config.ra
# Latest human vs. other Gene Sorter orgs:
# mouse, rat, zebrafish, worm, yeast, fly
targetGenesetPrefix known
targetDb mm9
queryDbs hg18 rn4 danRer4 dm2 ce4 sacCer1
mm9Fa /cluster/data/mm9/bed/ucsc.10/ucscGenes.faa
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
rn4Fa /cluster/data/rn4/bed/blastp/known.faa
danRer4Fa /cluster/data/danRer4/bed/blastp/ensembl.faa
dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
ce4Fa /cluster/data/ce4/bed/hgNearBlastp/070731/ce4.sangerPep.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
buildDir /cluster/data/mm9/bed/ucsc.10/hgNearBlastp
scratchDir /san/sanvol1/scratch/mm9/jkgHgNearBlastp
'_EOF_'
# << happy emacs
# takes about an hour
time nice -n +19 doHgNearBlastp.pl config.ra > do.log 2>&1 &
#########################################################################
# fixup the blastP tables to remove non-syntenic hits
# (DONE - 2007-09-11 - Hiram)
# This was all re-done 2007-09-25, see below:
###### Update blast tabs after UCSC genes rebuild (DONE - 2007-09-25 - Hiram)
# Remove non-syntenic hits for human and rat
# Takes a few minutes
cd /cluster/data/mm9/bed/ucsc.10
synBlastp.csh mm9 rn4
# old number of unique query values: 31610
# old number of unique target values 7072
# new number of unique query values: 13973
# new number of unique target values 6888
synBlastp.csh mm9 hg18
# old number of unique query values: 38136
# old number of unique target values 17214
# new number of unique query values: 0
# new number of unique target values 0
# Make reciprocal best subset for the blastp pairs that are too
# far for synteny to help
cd /cluster/data/mm9/bed/ucsc.10/hgNearBlastp
# Us vs. others
# For each distant organism: concatenate the per-job blastp output in both
# directions, reduce it to reciprocal best hits, and load both results.
# Each organism gets its own table in mm9; loading all of them into one
# fixed table name would leave only the last organism's hits in it.
# The other database receives its mouse hits as mmBlastTab.
foreach otherDb (danRer4 dm2 ce4 sacCer1)
    set aToB = run.mm9.$otherDb
    set bToA = run.$otherDb.mm9
    # pick the mm9-side table for this organism
    switch ($otherDb)
    case danRer4:
        set mm9Tab = drBlastTab
        breaksw
    case dm2:
        set mm9Tab = dmBlastTab
        breaksw
    case ce4:
        set mm9Tab = ceBlastTab
        breaksw
    case sacCer1:
        set mm9Tab = scBlastTab
        breaksw
    endsw
    cat $aToB/out/*.tab > $aToB/all.tab
    cat $bToA/out/*.tab > $bToA/all.tab
    blastRecipBest $aToB/all.tab $bToA/all.tab \
        $aToB/recipBest.tab $bToA/recipBest.tab
    hgLoadBlastTab mm9 $mm9Tab $aToB/recipBest.tab
    hgLoadBlastTab $otherDb mmBlastTab $bToA/recipBest.tab
end
# Clean up
cat run.mm9.mm9/out/*.tab | gzip -c > run.mm9.mm9/all.tab.gz
cat run.mm9.hg18/out/*.tab | gzip -c > run.mm9.hg18/all.tab.gz
cat run.hg18.mm9/out/*.tab | gzip -c > run.hg18.mm9/all.tab.gz
cat run.mm9.rn4/out/*.tab | gzip -c > run.mm9.rn4/all.tab.gz
cat run.rn4.mm9/out/*.tab | gzip -c > run.rn4.mm9/all.tab.gz
gzip run.*/all.tab
rm -r run.*/out
#########################################################################
# Update BLASTTAB blast tabs after UCSC genes rebuild
## (DONE - 2007-09-25 - Hiram)
# Log in to hgwdev and create the dated work directory for this run
ssh hgwdev
mkdir -p /cluster/data/mm9/bed/hgNearBlastp/070924
cd /cluster/data/mm9/bed/hgNearBlastp/070924
# Get the proteins used by all hgNear organisms:
pepPredToFa hg18 knownGenePep hg18.known.faa
pepPredToFa mm9 knownGenePep mm9.known.faa
pepPredToFa rn4 knownGenePep rn4.known.faa
pepPredToFa danRer4 ensPep danRer4.ensPep.faa
pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa
pepPredToFa ce4 sangerPep ce4.sangerPep.faa
pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa
cat << '_EOF_' > config.ra
# Latest human vs. other Gene Sorter orgs:
# mouse, rat, zebrafish, worm, yeast, fly
targetGenesetPrefix known
targetDb mm9
queryDbs hg18 rn4 danRer4 dm3 ce4 sacCer1
recipBest danRer4 dm3 ce4 sacCer1
mm9Fa /cluster/data/mm9/bed/hgNearBlastp/070924/mm9.known.faa
hg18Fa /cluster/data/mm9/bed/hgNearBlastp/070924/hg18.known.faa
rn4Fa /cluster/data/mm9/bed/hgNearBlastp/070924/rn4.known.faa
danRer4Fa /cluster/data/mm9/bed/hgNearBlastp/070924/danRer4.ensPep.faa
dm3Fa /cluster/data/mm9/bed/hgNearBlastp/070924/dm3.flyBasePep.faa
ce4Fa /cluster/data/mm9/bed/hgNearBlastp/070924/ce4.sangerPep.faa
sacCer1Fa /cluster/data/mm9/bed/hgNearBlastp/070924/sacCer1.sgdPep.faa
buildDir /cluster/data/mm9/bed/hgNearBlastp/070924
scratchDir /san/sanvol1/scratch/mm9HgNearBlastp
'_EOF_'
# << happy emacs
# Run with -noLoad so we can eyeball files, manually load mm9 tables now,
# and after release of mm9 Gene Sorter on the RR, overload other
# databases' mmBlastTab tables.
time nice -n +19 doHgNearBlastp.pl -noLoad config.ra > do.log 2>&1 &
tail -f do.log
Follow instructions at end of do.log, piecewise:
- first execute all of the run.mm9.* load scripts
- then execute the run.hg18.mm9 and run.rn4.mm9 scripts
- then run Galt's script (this is why we load hg18 and rn4 early):
synBlastp.csh mm9 hg18
synBlastp.csh mm9 rn4
-- The following was performed 2007-10-11
- After mm9 hgNear/Gene Sorter is enabled on the RR:
- run the remaining run.*.mm9 load scripts
- then modify each $queryDb's hgGeneData/$org/$queryDb/otherOrg.ra
to specify mm9 for mouse
- then do a push request for $queryDbs.mmBlastTab and hgGeneData
#########################################################################
# MAKE FOLDUTR TABLES (DONE - 2007-09-11 - Hiram)
# First set up directory structure and extract UTR sequence on hgwdev
# Beware running this on pk since the program RNAfold which is used
# during this process is only found on /cluster/bin/i386/
# And there is no way for this cluster setup to verify success
# of that program since it is hidden away in rnaFoldBig
# Need to fix rnaFoldBig to recognize RNAfold missing ...
ssh hgwdev
mkdir /cluster/data/mm9/bed/ucsc.10/rnaStruct
cd /cluster/data/mm9/bed/ucsc.10/rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa mm9 knownGene utr3 utr3/utr.fa
utrFa mm9 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
faSplit sequence utr3/utr.fa 10000 utr3/split/s
faSplit sequence utr5/utr.fa 10000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > template << '_EOF_'
#LOOP
rnaFoldBig split/$(path1) fold
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 in.lst single template jobList
cp -p template ../utr5
cd ../utr5
gensub2 in.lst single template jobList
ssh kk
cd /cluster/data/mm9/bed/ucsc.10/rnaStruct/utr3
para make jobList
# Completed: 9750 of 9750 jobs
# CPU time in finished jobs: 377924s 6298.73m 104.98h 4.37d 0.012 y
# IO & Wait Time: 38985s 649.75m 10.83h 0.45d 0.001 y
# Average job time: 43s 0.71m 0.01h 0.00d
# Longest finished job: 3432s 57.20m 0.95h 0.04d
# Submission to last job: 11280s 188.00m 3.13h 0.13d
cd ../utr5
para make jobList
# Completed: 9253 of 9253 jobs
# CPU time in finished jobs: 44949s 749.16m 12.49h 0.52d 0.001 y
# IO & Wait Time: 51547s 859.11m 14.32h 0.60d 0.002 y
# Average job time: 10s 0.17m 0.00h 0.00d
# Longest finished job: 1100s 18.33m 0.31h 0.01d
# Submission to last job: 1398s 23.30m 0.39h 0.02d
# Load database
ssh hgwdev
cd /cluster/data/mm9/bed/ucsc.10/rnaStruct/utr5
hgLoadRnaFold mm9 foldUtr5 fold
# Parsed 35796 files
cd ../utr3
hgLoadRnaFold -warnEmpty mm9 foldUtr3 fold
# only one is empty: uc009gyo.1
# Seems to be a problem in
# RNAfold, so not easy for us to fix. Consequence is not too bad, just a
# few 3' UTRs will be missing annotation. (in this case, only one)
# Clean up
tar cvzf ./fold.tgz ./fold
rm -r split fold err batch.bak
cd ../utr5
tar cvzf ./fold.tgz ./fold
rm -r split fold err batch.bak
#########################################################################
# Make pfam run. Actual cluster run is about 6 hours.
# (DONE - 2007-09-12 - Hiram)
# First get pfam global HMMs into /san/sanvol1/pfam somehow.
ssh pk
mkdir /san/sanvol1/scratch/mm9/ucscGenes
cd /san/sanvol1/scratch/mm9/ucscGenes
mkdir splitProt
faSplit sequence /cluster/data/mm9/bed/ucsc.10/ucscGenes.faa \
10000 splitProt/
mkdir pfam
cd pfam
mkdir out
ls -1 ../splitProt > gene.list
cat << '_EOF_' > doPfam
#!/bin/csh -ef
/san/sanvol1/pfam/hmmpfam -E 0.1 /san/sanvol1/pfam/Pfam_fs $1 \
> /scratch/tmp/mm9.$2
mv /scratch/tmp/mm9.$2 $3
'_EOF_'
# << happy emacs
chmod a+x doPfam
cat << '_EOF_' > template
#LOOP
doPfam ../splitProt/$(path1) $(root1).pf {check out line out/$(root1).pf}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 gene.list single template jobList
para create jobList
para try ... check ... push ... etc... time
# after some kluster difficulties
# Completed: 9666 of 9666 jobs
# CPU time in finished jobs: 3535078s 58917.96m 981.97h 40.92d 0.112 y
# IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y
# Average job time: 287s 4.78m 0.08h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3430s 57.17m 0.95h 0.04d
# Submission to last job: 79051s 1317.52m 21.96h 0.91d
# Make up pfamDesc.tab by converting pfam to a ra file first
cat << '_EOF_' > makePfamRa.awk
/^NAME/ {print}
/^ACC/ {print}
/^DESC/ {print; printf("\n");}
'_EOF_'
# << happy emacs
awk -f makePfamRa.awk /cluster/store12/pfam/Pfam_fs > pfamDesc.ra
raToTab -cols=ACC,NAME,DESC pfamDesc.ra stdout | \
awk -F '\t' '{
printf("%s\t%s\t%s\n", gensub(/\.[0-9]+/, "", "g", $1), $2, $3);
}' > pfamDesc.tab
# Convert output to tab-separated file.
cd /cluster/data/mm9/bed/ucsc.10
catDir /san/sanvol1/scratch/mm9/ucscGenes/pfam/out \
| hmmPfamToTab -eValCol stdin ucscPfam.tab
# Convert output to knownToPfam table
awk '{printf("%s\t%s\n", $2, gensub(/\.[0-9]+/, "", "g", $1));}' \
/san/sanvol1/scratch/mm9/ucscGenes/pfam/pfamDesc.tab > sub.foo
cut -f 1,4 ucscPfam.tab | subColumn 2 stdin sub.foo knownToPfam.tab
hgLoadSqlTab mm9 knownToPfam ~/kent/src/hg/lib/knownTo.sql \
knownToPfam.tab
cut -f 1-4 ucscPfam.tab > load.ucscPfam.tab
hgLoadSqlTab mm9 ucscPfam ~/kent/src/hg/lib/ucscPfam.sql load.ucscPfam.tab
cp -p /san/sanvol1/scratch/mm9/ucscGenes/pfam/pfamDesc.tab .
hgLoadSqlTab mm9 pfamDesc ~/kent/src/hg/lib/pfamDesc.sql pfamDesc.tab
#########################################################################
# Do scop run. Takes about 3.5 hours (DONE - 2007-09-12 - Hiram)
# First get scop HMMs into /san/sanvol1/scop somehow.
ssh pk
mkdir /san/sanvol1/scratch/mm9/ucscGenes/scop
cd /san/sanvol1/scratch/mm9/ucscGenes/scop
mkdir out
ls -1 ../splitProt > gene.list
cat << '_EOF_' > doScop
#!/bin/tcsh -ef
/san/sanvol1/pfam/hmmpfam -E 0.1 /san/sanvol1/scop/scop.hmm $1 \
> /scratch/tmp/mm9.$2
mv /scratch/tmp/mm9.$2 $3
'_EOF_'
chmod a+x doScop
cat << '_EOF_' > template
#LOOP
doScop ../splitProt/$(path1) $(root1).pf {check out line out/$(root1).pf}
#ENDLOOP
'_EOF_'
gensub2 gene.list single template jobList
para create jobList
para try ... check ... push ... etc... time
# Completed: 9666 of 9666 jobs
# CPU time in finished jobs: 3532425s 58873.76m 981.23h 40.88d 0.112 y
# IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y
# Average job time: 347s 5.78m 0.10h 0.00d
# Longest finished job: 6512s 108.53m 1.81h 0.08d
# Submission to last job: 12348s 205.80m 3.43h 0.14d
# Convert scop output to tab-separated files
ssh hgwdev
cd /cluster/data/mm9/bed/ucsc.10
catDir /san/sanvol1/scratch/mm9/ucscGenes/scop/out | \
hmmPfamToTab -eValCol -scoreCol stdin scopPlusScore.tab
scopCollapse scopPlusScore.tab /cluster/store12/scop/model.tab \
ucscScop.tab scopDesc.tab knownToSuper.tab
hgLoadSqlTab mm9 knownToSuper ~/kent/src/hg/lib/knownToSuper.sql \
knownToSuper.tab
hgLoadSqlTab mm9 ucscScop ~/kent/src/hg/lib/ucscScop.sql ucscScop.tab
hgLoadSqlTab mm9 scopDesc ~/kent/src/hg/lib/scopDesc.sql scopDesc.tab
# XXX - ccds is not yet available for Mm9 according to Mark
# Regenerate ccdsKgMap table
# /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=mm9 -loadDb \
# mm9.ccdsGene knownGene ccdsKgMap
# Map old to new mapping - maybe next time, this is first genes on mm9
# hgsql mm9 -N -e 'select * from knownGene' > knownGene_1.gp
# genePredToBed knownGene_1.gp >knownGene_1.bed
# cat refSeq/*.bed mrna/*.bed | txGeneExplainUpdate1 knownGene_1.bed \
# ucscGenes.bed stdin abWalk.bed kg2ToKg3.bed
# hgLoadSqlTab $tempDb kg1ToKg2 ~/kent/src/hg/lib/kg2ToKg3.sql kg2ToKg3.bed
# Build kgSpAlias table, which combines content of both kgAlias and kgProtAlias tables.
hgsql mm9 -N -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql mm9 -N -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
sort -u j.tmp > kgSpAlias.tab
rm j.tmp
hgLoadSqlTab mm9 kgSpAlias ~/kent/src/hg/lib/kgSpAlias.sql ./kgSpAlias.tab
#########################################################################
# Building PROTEOME BROWSER TABLES (DONE - 2007-09-12 - Hiram)
# These are instructions for building tables
# needed for the Proteome Browser.
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap2 table
# ARE REBUILT.
# This build is based on proteins DBs dated 070202.
# Create the working directory
ssh hgwdev
mkdir /cluster/data/mm9/bed/ucsc.10/pb
cd /cluster/data/mm9/bed/ucsc.10/pb
# Build the pepMwAa table
hgsql proteins070202 -N -e \
"select info.acc, molWeight, aaSize from sp070202.info, sp070202.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgLoadSqlTab mm9 pepMwAa ~/kent/src/hg/lib/pepMwAa.sql ./pepMwAa.tab
# Build the pepPi table
# protAcc.list is assembled from two queries: accessions from
# sp070202.info for taxon 10090, followed by the knownGene proteinIDs
# that contain a "-".  Both queries use -N so that no column-header
# line (e.g. "acc") ends up in the list handed to pbCalPi.
hgsql proteins070202 -N -e \
"select info.acc from sp070202.info, sp070202.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.list
hgsql mm9 -N \
-e 'select proteinID from knownGene where proteinID like "%-%"' \
| sort -u >> protAcc.list
pbCalPi protAcc.list sp070202 pepPi.tab
hgLoadSqlTab mm9 pepPi ~/kent/src/hg/lib/pepPi.sql ./pepPi.tab
# Calculate and load pep distributions
pbCalDist sp070202 proteins070202 10090 mm9
hgLoadSqlTab mm9 pepExonCntDist ~/kent/src/hg/lib/pepExonCntDist.sql \
./pepExonCntDist.tab
hgLoadSqlTab mm9 pepCCntDist ~/kent/src/hg/lib/pepCCntDist.sql \
./pepCCntDist.tab
hgLoadSqlTab mm9 pepHydroDist ~/kent/src/hg/lib/pepHydroDist.sql \
./pepHydroDist.tab
hgLoadSqlTab mm9 pepMolWtDist ~/kent/src/hg/lib/pepMolWtDist.sql \
./pepMolWtDist.tab
hgLoadSqlTab mm9 pepResDist ~/kent/src/hg/lib/pepResDist.sql \
./pepResDist.tab
hgLoadSqlTab mm9 pepIPCntDist ~/kent/src/hg/lib/pepIPCntDist.sql \
./pepIPCntDist.tab
hgLoadSqlTab mm9 pepPiDist ~/kent/src/hg/lib/pepPiDist.sql ./pepPiDist.tab
# Calculate frequency distributions
pbCalResStd sp070202 10090 mm9
# Create pbAnomLimit and pbResAvgStd tables
hgLoadSqlTab mm9 pbAnomLimit ~/kent/src/hg/lib/pbAnomLimit.sql \
./pbAnomLimit.tab
hgLoadSqlTab mm9 pbResAvgStd ~/kent/src/hg/lib/pbResAvgStd.sql \
./pbResAvgStd.tab
hgsql -N -e "select * from pbStamp;" mm8 > pbStamp.tab
hgLoadSqlTab mm9 pbStamp ~/kent/src/hg/lib/pbStamp.sql \
./pbStamp.tab
# Turn on protein and gene sorter
hgsql -e 'update dbDb set hgNearOk=1,hgPbOk=1 where name="mm9";' \
hgcentraltest
# Add mm9 to gdbPdb, pointing to proteins070202
mysql> insert into gdbPdb values('mm9','proteins070202');
############################################################################
# BUILD KNOWN GENE LIST FOR GOOGLE. (DONE - 2007-10-03 - Hiram)
cd /cluster/data/mm9/bed
rm -rf knownGeneList/mm9
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/mm9
hgKnownGeneList mm9
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/mm9
rsync -a --progress ./knownGeneList/mm9/ \
/usr/local/apache/htdocs/knownGeneList/mm9/
# if this is a new listing, add it to the top level
# knownGeneLists.html file
############################################################################
# SGP GENES (DONE - 2007-10-01 - Hiram)
ssh kkstore06
mkdir /cluster/data/mm9/bed/sgp
cd /cluster/data/mm9/bed/sgp
# They don't do chrM (we could just let that one fail ...)
for C in `awk '{print $1}' /cluster/data/mm9/chrom.sizes | grep -v chrM`
do
wget --timestamping \
"http://genome.imim.es/genepredictions/M.musculus/mmJul2007/SGP/humangp200603/${C}.gtf" \
-O "${C}.gtf"
done
ssh hgwdev
cd /cluster/data/mm9/bed/sgp
ldHgGene -gtf -genePredExt mm9 sgpGene chr*.gtf
# Read 35983 transcripts in 290486 lines in 34 files
# 35983 groups 32 seqs 1 sources 3 feature types
# 35983 gene predictions
featureBits mm9 -enrichment refGene:CDS sgpGene
# refGene:CDS 1.165%, sgpGene 1.439%, both 1.005%, cover 86.28%,
# enrich 59.96x
featureBits mm8 -enrichment refGene:CDS sgpGene
# refGene:CDS 1.186%, sgpGene 1.455%, both 1.025%, cover 86.47%,
# enrich 59.42x
featureBits mm9 -enrichment knownGene:CDS sgpGene
# knownGene:CDS 1.278%, sgpGene 1.439%, both 1.080%, cover 84.53%,
# enrich 58.74x
featureBits mm8 -enrichment knownGene:CDS sgpGene
# knownGene:CDS 1.109%, sgpGene 1.455%, both 0.931%, cover 83.98%,
# enrich 57.71x
#####################################################################
# LOAD GENEID GENES (DONE - 2007-10-01 - Hiram)
ssh kkstore06
mkdir -p /cluster/data/mm9/bed/geneid/download
cd /cluster/data/mm9/bed/geneid/download
bash
awk '{print $1}' ../../../chrom.sizes | while read C
do
echo $C
wget --timestamping \
"http://genome.imim.es/genepredictions/M.musculus/mmJul2007/geneid_v1.2/${C}.gtf" \
-O ${C}.gtf
wget --timestamping \
"http://genome.imim.es/genepredictions/M.musculus/mmJul2007/geneid_v1.2/${C}.prot" \
-O ${C}.prot
done
exit
# Add missing .1 to protein id's
foreach f (*.prot)
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
end
ssh hgwdev
cd /cluster/data/mm9/bed/geneid
ldHgGene -genePredExt -gtf mm9 geneid download/*.gtf
# Read 36708 transcripts in 287399 lines in 35 files
# 36708 groups 34 seqs 1 sources 3 feature types
# 36708 gene predictions
# the chr16_random file is empty, do not attempt to use it
hgPepPred mm9 generic geneidPep \
`ls download/*-fixed.prot | grep -v chr16_random`
featureBits mm9 -enrichment refGene geneid
# refGene 1.975%, geneid 1.590%, both 0.956%, cover 48.39%, enrich 30.44x
featureBits mm8 -enrichment refGene geneid
# refGene 2.010%, geneid 1.592%, both 0.974%, cover 48.44%, enrich 30.43x
featureBits mm7 -enrichment refGene geneid
# refGene 2.002%, geneid 1.579%, both 0.952%, cover 47.57%, enrich 30.12x
featureBits mm9 -enrichment knownGene geneid
# knownGene 2.686%, geneid 1.590%, both 1.047%, cover 38.97%, enrich 24.52x
featureBits mm8 -enrichment knownGene geneid
# knownGene 2.130%, geneid 1.592%, both 0.900%, cover 42.23%, enrich 26.53x
featureBits mm7 -enrichment knownGene geneid
# knownGene 2.058%, geneid 1.579%, both 0.859%, cover 41.72%, enrich 26.42x
#########################################################################
# BLASTZ/CHAIN/NET Orangutan ponAbe2 (DONE - 2007-09-21 - Hiram)
ssh kkstore02
# use a screen to control this job
screen
mkdir /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
cd /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
cat << '_EOF_' > DEF
# mouse vs orangutan
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0
# QUERY: Orangutan ponAbe2
SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-stop=load -chainMinScore=3000 \
-chainLinearGap=medium -bigClusterHub=pk > do.log 2>&1 &
# real 62m34.156s
# some pk kluster difficulties, fixup and complete manually
# Completed: 104880 of 104880 jobs
# CPU time in finished jobs: 7142978s 119049.64m 1984.16h 82.67d 0.227 y
# IO & Wait Time: 556393s 9273.21m 154.55h 6.44d 0.018 y
# Average job time: 73s 1.22m 0.02h 0.00d
# Longest finished job: 507s 8.45m 0.14h 0.01d
# Submission to last job: 65973s 1099.55m 18.33h 0.76d
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-continue=cat -chainMinScore=3000 \
-chainLinearGap=medium -bigClusterHub=pk > cat.log 2>&1 &
# real 166m20.442s
cat fb.mm9.chainPonAbe2Link.txt
# 914561309 bases of 2620346127 (34.902%) in intersection
# And, for the swap
mkdir /cluster/data/ponAbe2/bed/blastz.mm9.swap
cd /cluster/data/ponAbe2/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzPonAbe2.2007-09-19/DEF \
-chainMinScore=3000 -swap -chainLinearGap=medium \
-bigClusterHub=pk > swap.log 2>&1 &
# real 102m23.209s
cat fb.ponAbe2.chainMm9Link.txt
# 948458190 bases of 3093572278 (30.659%) in intersection
# create the syntenic maf nets:
ssh hgwdev
cd /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-continue=syntenicNet -syntenicNet -chainMinScore=3000 \
-chainLinearGap=medium -bigClusterHub=pk > syntenicNet.log 2>&1 &
# real 22m16.544s
########################################################################
# BLASTZ/CHAIN/NET Frog X. tropicalis xenTro2 (DONE - 2007-09-23 - Hiram)
ssh kkstore04
screen # use screen to manage this job
# XXX note for next time, missing the TMPDIR in the DEF file
mkdir /cluster/data/mm9/bed/blastzXenTro2.2007-09-19
cd /cluster/data/mm9/bed/blastzXenTro2.2007-09-19
cat << '_EOF_' > DEF
# Mouse (mm9) vs frog (xenTro2)
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0
# QUERY: Frog xenTro2 - single chunk big enough to run two of the
# largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/cluster/data/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=10000
BASE=/cluster/data/mm9/bed/blastzXenTro2.2007-09-19
'_EOF_'
# << emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# real 1050m55.259s
# after kk difficulties, finishing the first kluster run manually
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
# Completed: 126539 of 126540 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 15750656s 262510.93m 4375.18h 182.30d 0.499 y
# IO & Wait Time: 843281s 14054.69m 234.24h 9.76d 0.027 y
# Average job time: 131s 2.19m 0.04h 0.00d
# Longest finished job: 2039s 33.98m 0.57h 0.02d
# Submission to last job: 79275s 1321.25m 22.02h 0.92d
# A single job kept having trouble, finished it on kolossus:
ssh kolossus
cd /cluster/data/mm9/bed/blastzXenTro2.2007-09-19/run.blastz
time nice -n +19 /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
/scratch/data/mm9/mm9.2bit:chr2:80000000-90000000 qParts/part008.lst ../DEF \
../psl/mm9.2bit:chr2:80000000-90000000/mm9.2bit:chr2:80000000-90000000_part008.lst.psl
# continuing after that
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-continue=cat -bigClusterHub=kk -chainMinScore=5000 \
-chainLinearGap=loose `pwd`/DEF > cat.out 2>&1 &
# real 62m17.627s
cat fb.mm9.chainXenTro2Link.txt
# 82054987 bases of 2620346127 (3.131%) in intersection
# Then to swap over to xenTro2
mkdir /cluster/data/xenTro2/bed/blastz.mm9.swap
cd /cluster/data/xenTro2/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=kk -chainMinScore=5000 \
/cluster/data/mm9/bed/blastzXenTro2.2007-09-19/DEF \
-chainLinearGap=loose > swap.out 2>&1 &
# real 47m53.428s
ssh hgwdev
# Measure chain-link coverage on both sides of the mm9/xenTro2 alignment.
# Use the run directory that was created for this alignment
# (blastzXenTro2.2007-09-19).
cd /cluster/data/mm9/bed/blastzXenTro2.2007-09-19
time nice -n +19 featureBits mm9 chainXenTro2Link \
> fb.mm9.chainXenTro2Link 2>&1 &
# 68050843 bases of 2567283971 (2.651%) in intersection
# NOTE(review): the figure above does not agree with the
# fb.mm9.chainXenTro2Link.txt result recorded earlier for this run
# (82054987 bases of 2620346127); it looks carried over from another
# assembly's notes - re-run to confirm.
cd /cluster/data/xenTro2/bed/blastz.mm9.swap
# the swap of this alignment is measured via the Mm9 chain table on xenTro2
time nice -n +19 featureBits xenTro2 chainMm9Link \
> fb.xenTro2.chainMm9Link 2>&1
# 72840135 bases of 1359412157 (5.358%) in intersection
# NOTE(review): this figure was recorded next to a command that named
# chainMm8Link - confirm it against chainMm9Link.
#########################################################################
## BLASTZ Lizard anoCar1 - (DONE - 2007-09-21 - Hiram)
ssh kkstore04
mkdir /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19
cd /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19
cat << '_EOF_' > DEF
# Mouse (mm9) vs lizard (anoCar1)
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0
# QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
SEQ2_LEN=/cluster/data/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=10000
BASE=/cluster/data/mm9/bed/blastzAnoCar1.2007-09-19
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-qRepeats=windowmaskerSdust \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# real 911m49.918s
# after kk difficulties, finishing the first kluster run manually
# Completed: 86355 of 86355 jobs
# CPU time in finished jobs: 11171051s 186184.18m 3103.07h 129.29d 0.354 y
# IO & Wait Time: 662082s 11034.70m 183.91h 7.66d 0.021 y
# Average job time: 137s 2.28m 0.04h 0.00d
# Longest finished job: 1467s 24.45m 0.41h 0.02d
# Submission to last job: 62938s 1048.97m 17.48h 0.73d
# continuing
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
DEF -chainMinScore=5000 \
-continue=cat -qRepeats=windowmaskerSdust \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
# real 31m44.652s
cat fb.mm9.chainAnoCar1Link.txt
# 89239796 bases of 2620346127 (3.406%) in intersection
# and for the swap
mkdir /cluster/data/anoCar1/bed/blastz.mm9.swap
cd /cluster/data/anoCar1/bed/blastz.mm9.swap
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
/cluster/data/mm9/bed/blastzAnoCar1.2007-09-19/DEF -chainMinScore=5000 \
-swap -qRepeats=windowmaskerSdust \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
# real 29m12.291s
cat fb.anoCar1.chainMm9Link.txt
# 85923556 bases of 1741478929 (4.934%) in intersection
#########################################################################
# BLASTZ Chicken galGal3 (DONE - 2007-09-25 - Hiram)
ssh kkstore03
screen # use screen to control this job
mkdir /cluster/data/mm9/bed/blastzGalGal3.2007-09-21
cd /cluster/data/mm9/bed/blastzGalGal3.2007-09-21
# This partitioning is too large to run on kk, must run this on pk
# or change the partitioning
cat << '_EOF_' > DEF
# mouse vs chicken
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_SMSK=/scratch/data/mm9/notInOthers
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/hg/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzGalGal3.2007-09-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# real 587m53.468s
# Completed: 16680 of 17168 jobs
# Crashed: 488 jobs
# CPU time in finished jobs: 7758569s 129309.48m 2155.16h 89.80d 0.246 y
# IO & Wait Time: 190128s 3168.80m 52.81h 2.20d 0.006 y
# Average job time: 477s 7.94m 0.13h 0.01d
# Longest finished job: 6501s 108.35m 1.81h 0.08d
# Submission to last job: 271554s 4525.90m 75.43h 3.14d
# the kk cluster could not complete some of these jobs. A recovery job
# list was created from the remaining jobs and completed on pk
# Completed: 488 of 488 jobs
# CPU time in finished jobs: 1226144s 20435.73m 340.60h 14.19d 0.039 y
# IO & Wait Time: 6875s 114.58m 1.91h 0.08d 0.000 y
# Average job time: 2527s 42.11m 0.70h 0.03d
# Longest finished job: 3872s 64.53m 1.08h 0.04d
# Submission to last job: 11739s 195.65m 3.26h 0.14d
# continuing
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-continue=cat -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> cat.log 2>&1 &
# real 18m35.814s
cat fb.mm9.chainGalGal3Link.txt
# 97711788 bases of 2620346127 (3.729%) in intersection
# and the swap
mkdir /cluster/data/galGal3/bed/blastz.mm9.swap
cd /cluster/data/galGal3/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 -verbose=2 \
/cluster/data/mm9/bed/blastzGalGal3.2007-09-21/DEF \
-swap -chainLinearGap=loose -bigClusterHub=pk > swap.log 2>&1 &
# real 12m54.737s
cat fb.galGal3.chainMm9Link.txt
# 84990797 bases of 1042591351 (8.152%) in intersection
#########################################################################
# BLASTZ Platypus ornAna1 - (DONE - 2007-09-21 - 2007-09-25 - Hiram)
ssh kkstore05
mkdir /cluster/data/mm9/bed/blastzOrnAna1.2007-09-21
cd /cluster/data/mm9/bed/blastzOrnAna1.2007-09-21
cat << '_EOF_' > DEF
# mouse vs. platypus
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0
# QUERY: ornAna1
SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzOrnAna1.2007-09-21
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# real 912m18.732s
cat fb.mm9.chainOrnAna1Link.txt
# 141953739 bases of 2620346127 (5.417%) in intersection
# and the swap
mkdir /cluster/data/ornAna1/bed/blastz.mm9.swap
cd /cluster/data/ornAna1/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 -verbose=2 \
/cluster/data/mm9/bed/blastzOrnAna1.2007-09-21/DEF \
-swap -chainLinearGap=loose -bigClusterHub=kk > swap.log 2>&1 &
# real 123m16.632s
cat fb.ornAna1.chainMm9Link.txt
# 135570580 bases of 1842236818 (7.359%) in intersection
#########################################################################
# Blastz Chimp panTro2 - (DONE - 2007-09-24 - 2007-09-25 - Hiram)
ssh kkstore04
mkdir /cluster/data/mm9/bed/blastzPanTro2.2007-09-24
cd /cluster/data/mm9/bed/blastzPanTro2.2007-09-24
cat << '_EOF_' > DEF
# Mouse vs Chimp
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_SMSK=/scratch/data/mm9/notInOthers
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/hg/panTro2/nib
SEQ2_LEN=/cluster/data/panTro2/chrom.sizes
SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzPanTro2.2007-09-24
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
DEF > blastz.out 2>&1 &
# real 701m23.446s
cat fb.mm9.chainPanTro2Link.txt
# 987180081 bases of 2620346127 (37.674%) in intersection
# and the swap
mkdir /cluster/data/panTro2/bed/blastz.mm9.swap
cd /cluster/data/panTro2/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzPanTro2.2007-09-24/DEF \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap > swap.log 2>&1 &
# real 87m25.448s
cat fb.panTro2.chainMm9Link.txt
# 997050630 bases of 2909485072 (34.269%) in intersection
# create syntenic maf nets:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-syntenicNet -continue=syntenicNet DEF > syntenicNet.out 2>&1 &
# real 25m13.118s
#########################################################################
# Blastz Horse equCab1 - (DONE - 2007-09-24 - 2007-09-25 - Hiram)
ssh kkstore05
mkdir /cluster/data/mm9/bed/blastzEquCab1.2007-09-24
cd /cluster/data/mm9/bed/blastzEquCab1.2007-09-24
cat << '_EOF_' > DEF
# Mouse vs Horse
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse EquCab1
SEQ2_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ2_LEN=/cluster/data/equCab1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzEquCab1.2007-09-24
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
DEF > blastz.out 2>&1 &
# real 1582m34.597s
cat fb.mm9.chainEquCab1Link.txt
# 911418189 bases of 2620346127 (34.782%) in intersection
# and the swap
mkdir /cluster/data/equCab1/bed/blastz.mm9.swap
cd /cluster/data/equCab1/bed/blastz.mm9.swap
# swap run: same cluster hub, chain score and gap settings as the
# mm9-target run above, with -swap added
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzEquCab1.2007-09-24/DEF \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
> swap.out 2>&1 &
# real ~110m
cat fb.equCab1.chainMm9Link.txt
# 901367656 bases of 2421923695 (37.217%) in intersection
# create the syntenic maf nets
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=syntenicNet -syntenicNet DEF > syntenicNet.out 2>&1 &
# real 29m40.546s
#########################################################################
# Blastz Cow bosTau3 (DONE - 2007-09-25 - Hiram)
ssh kkstore05
screen # use a screen to control this job
mkdir /cluster/data/mm9/bed/blastzBosTau3.2007-09-25
cd /cluster/data/mm9/bed/blastzBosTau3.2007-09-25
cat << '_EOF_' > DEF
# Mouse vs Cow
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau3
SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit
SEQ2_LEN=/cluster/data/bosTau3/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzBosTau3.2007-09-25
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -chainMinScore=3000 -verbose=2 \
DEF -bigClusterHub=pk -chainLinearGap=medium > do.log 2>&1 &
# real 733m40.065s
cat fb.mm9.chainBosTau3Link.txt
# 690515959 bases of 2620346127 (26.352%) in intersection
# and for the swap
mkdir /cluster/data/bosTau3/bed/blastz.mm9.swap
cd /cluster/data/bosTau3/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -chainMinScore=3000 -verbose=2 \
/cluster/data/mm9/bed/blastzBosTau3.2007-09-25/DEF \
-swap -bigClusterHub=pk -chainLinearGap=medium > swap.log 2>&1 &
# real 100m20.707s
cat fb.bosTau3.chainMm9Link.txt
# 707779988 bases of 2731807384 (25.909%) in intersection
# create the syntenic maf nets
time nice -n +19 doBlastzChainNet.pl -chainMinScore=3000 -verbose=2 \
-syntenicNet -continue=syntenicNet \
DEF -bigClusterHub=pk -chainLinearGap=medium > syntenicNet.log 2>&1 &
# real 16m28.741s
#########################################################################
# Blastz Opossum monDom4 (DONE - 2007-09-25 - 2007-09-27 - Hiram)
ssh kkstore04
screen # use screen to manage this job
mkdir /cluster/data/mm9/bed/blastzMonDom4.2007-09-25
cd /cluster/data/mm9/bed/blastzMonDom4.2007-09-25
# the opossum chroms are too large to work with on the kk, must run this
# on the pk kluster
# Write the DEF file that is passed to doBlastzChainNet.pl below. The
# here-document has to be closed by the same '_EOF_' word that opens it.
cat << '_EOF_' > DEF
# Mouse vs. opossum
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
SEQ2_DIR=/scratch/hg/monDom4/monDom4.2bit
SEQ2_LEN=/cluster/data/monDom4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzMonDom4.2007-09-25
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 &
# real 811m19.320s
# problem on kki run, monDom4 wasn't distributed on the Iservers to
# /scratch/hg/monDom4/ - straighten that up, and finish that run, then
# continuing
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-continue=chainMerge -chainLinearGap=loose \
-bigClusterHub=pk -verbose=2 > chainMerge.log 2>&1 &
# real 158m9.287s
cat fb.mm9.chainMonDom4Link.txt
# 255535025 bases of 2620346127 (9.752%) in intersection
# and for the swap
mkdir /cluster/data/monDom4/bed/blastz.mm9.swap
cd /cluster/data/monDom4/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 -verbose=2 \
/cluster/data/mm9/bed/blastzMonDom4.2007-09-25/DEF \
-swap -chainLinearGap=loose \
-bigClusterHub=pk > swap.log 2>&1 &
# real 59m19.005s
cat fb.monDom4.chainMm9Link.txt
# 254018516 bases of 3501643220 (7.254%) in intersection
#########################################################################
# Blastz Tenrec echTel1 (DONE - 2007-09-25 - 2007-09-27 - Hiram)
ssh kkstore02
screen # use a screen to control this job
mkdir /cluster/data/mm9/bed/blastzEchTel1.2007-09-25
cd /cluster/data/mm9/bed/blastzEchTel1.2007-09-25
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - Tenrec echTel1
SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit
SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=800
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzEchTel1.2007-09-25
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-bigClusterHub=kk -chainLinearGap=medium DEF > do.log 2>&1 &
# real 2721m33.204s
cat fb.mm9.chainEchTel1Link.txt
# 291920039 bases of 2620346127 (11.141%) in intersection
# and for the swap
mkdir /cluster/data/echTel1/bed/blastz.mm9.swap
cd /cluster/data/echTel1/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
/cluster/data/mm9/bed/blastzEchTel1.2007-09-25/DEF \
-swap -bigClusterHub=kk -chainLinearGap=medium > swap.log 2>&1 &
# real 520m9.198s
cat fb.echTel1.chainMm9Link.txt
# 298656963 bases of 2111581369 (14.144%) in intersection
# create syntenic maf nets
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
DEF -continue=syntenicNet -bigClusterHub=kk \
-syntenicNet -chainLinearGap=medium > syntenicNet.log 2>&1 &
# real 3m4.285s
# create reciprocal best chains/nets
ssh hgwdev
cd /cluster/data/mm9/bed/blastzEchTel1.2007-09-25
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 echTel1 \
> rbest.log 2>&1 &
# real 34m12.936s
#########################################################################
# Blastz Tree Shrew tupBel1 (DONE - 2007-09-27 - 2007-10-01 - Hiram)
ssh kkstore05
screen # use screen to control this job
mkdir /cluster/data/mm9/bed/blastzTupBel1.2007-09-27
cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27
cat << '_EOF_' > DEF
# Mouse vs. Tree Shrew
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tree shrew tupBel1
SEQ2_DIR=/san/sanvol1/scratch/tupBel1/tupBel1.2bit
SEQ2_LEN=/cluster/data/tupBel1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzTupBel1.2007-09-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
> chainMerge.log 2>&1 &
# real 1262m32.699s
# the load should fail due to missing repeat masker tables in tupBel1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
> net.log 2>&1 &
# real 69m41.901s
# and indeed it did, Loading the net track
ssh hgwdev
cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27/axtChain
cp -p noClass.net mm9.tupBel1.net
time nice -n +19 netFilter -minGap=10 mm9.tupBel1.net \
| hgLoadNet -warn mm9 netTupBel1 stdin
cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27
# run featureBits in the foreground so the measurement file is complete
# before it is displayed
time nice -n +19 featureBits mm9 chainTupBel1Link \
> fb.mm9.chainTupBel1Link.txt 2>&1
cat fb.mm9.chainTupBel1Link.txt
# 552865662 bases of 2620346127 (21.099%) in intersection
# and, to finish it all off, with syntenic net
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-continue=download -bigClusterHub=pk \
-syntenicNet -chainLinearGap=medium DEF > syntenicNet.log 2>&1 &
# real 14m42.816s
# create reciprocal best chains/nets
ssh hgwdev
cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 tupBel1 \
> rbest.log 2>&1 &
# real 41m12.278s
#########################################################################
# Blastz Bush Baby otoGar1 (DONE - 2007-09-27 - 2007-09-28 - Hiram)
ssh kkstore05
screen # use screen to control this job
mkdir /cluster/data/mm9/bed/blastzOtoGar1.2007-09-27
cd /cluster/data/mm9/bed/blastzOtoGar1.2007-09-27
cat << '_EOF_' > DEF
# Mouse vs. Bush Baby
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Bush baby otoGar1
SEQ2_DIR=/san/sanvol1/scratch/otoGar1/otoGar1.2bit
SEQ2_LEN=/cluster/data/otoGar1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzOtoGar1.2007-09-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
> chainMerge.log 2>&1 &
# real 873m23.531s
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
> net.log 2>&1 &
# real 67m7.172s
cat fb.mm9.chainOtoGar1Link.txt
# 601932945 bases of 2620346127 (22.972%) in intersection
# and run the syntenicNet and cleanup
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
-syntenicNet > syntenicNet.log 2>&1 &
# real 13m57.573s
# create reciprocal best chains/nets
ssh hgwdev
cd /cluster/data/mm9/bed/blastzOtoGar1.2007-09-27
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 otoGar1 \
> rbest.log 2>&1 &
# real 40m1.428s
#########################################################################
# Blastz Armadillo dasNov1 (DONE - 2007-09-27 - 2007-10-02 - Hiram)
ssh kkstore04
screen # use screen to control this job
mkdir /cluster/data/mm9/bed/blastzDasNov1.2007-09-27
cd /cluster/data/mm9/bed/blastzDasNov1.2007-09-27
cat << '_EOF_' > DEF
# Mouse vs. Armadillo
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Armadillo dasNov1
SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit
SEQ2_LEN=/cluster/data/dasNov1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzDasNov1.2007-09-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
> load.log 2>&1 &
# real 3607m35.169s
cat fb.mm9.chainDasNov1Link.txt
# 433593082 bases of 2620346127 (16.547%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
-syntenicNet > syntenicNet.log 2>&1 &
# real 15m7.642s
# create reciprocal best chains/nets
ssh hgwdev
cd /cluster/data/mm9/bed/blastzDasNov1.2007-09-27
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 dasNov1 \
> rbest.log 2>&1 &
# real 39m18.156s
#########################################################################
# Blastz Rabbit oryCun1 (DONE - 2007-09-28 - 2007-09-29 - Hiram)
ssh kkstore04
screen # use screen to control this job
mkdir /cluster/data/mm9/bed/blastzOryCun1.2007-09-28
cd /cluster/data/mm9/bed/blastzOryCun1.2007-09-28
cat << '_EOF_' > DEF
# Mouse vs. Rabbit
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rabbit oryCun1
SEQ2_DIR=/scratch/hg/oryCun1/oryCun1.2bit
SEQ2_LEN=/cluster/data/oryCun1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzOryCun1.2007-09-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
> chainMerge.log 2>&1 &
# real 2126m59.162s
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
> load.log 2>&1 &
# real 53m28.279s
cat fb.mm9.chainOryCun1Link.txt
# 496428446 bases of 2620346127 (18.945%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
-syntenicNet > syntenicNet.log 2>&1 &
# real 9m27.321s
# create reciprocal best chains/nets
ssh hgwdev
cd /cluster/data/mm9/bed/blastzOryCun1.2007-09-28
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 oryCun1 \
> rbest.log 2>&1 &
# real 37m32.151s
#########################################################################
# Blastz Cat felCat3 (DONE - 2007-09-28 - 2007-09-29 - Hiram)
ssh kkstore05
screen # use screen to control this job
mkdir /cluster/data/mm9/bed/blastzFelCat3.2007-09-28
cd /cluster/data/mm9/bed/blastzFelCat3.2007-09-28
cat << '_EOF_' > DEF
# Mouse vs. Cat
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cat felCat3
SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit
SEQ2_LEN=/cluster/data/felCat3/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzFelCat3.2007-09-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
> chainMerge.log 2>&1 &
# real 1597m21.032s
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
> load.log 2>&1 &
# real 39m30.078s
cat fb.mm9.chainFelCat3Link.txt
# 499894253 bases of 2620346127 (19.077%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
-syntenicNet > syntenicNet.log 2>&1 &
# real 9m42.624s
# create reciprocal best chains/nets
ssh hgwdev
cd /cluster/data/mm9/bed/blastzFelCat3.2007-09-28
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 felCat3 \
> rbest.log 2>&1 &
# real 36m40.000s
#########################################################################
# Blastz Elephant loxAfr1 (DONE - 2007-09-28 - 2007-10-02 - Hiram)
ssh kkstore04
screen # use screen to control this job
mkdir /cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28
cd /cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28
cat << '_EOF_' > DEF
# Mouse vs. Elephant
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Elephant loxAfr1
SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit
SEQ2_LEN=/cluster/data/loxAfr1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
> load.log 2>&1 &
# real 2981m3.302s
# had two failed jobs in that state where their results existed,
# but parasol thought they were not done. Continuing, and now
# all the way to syntenicNet. Will probably fail during the load
# since not everything is there for db loxAfr1
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-continue=cat -bigClusterHub=pk -chainLinearGap=medium DEF \
-syntenicNet > syntenicNet.log 2>&1 &
# real 166m4.710s
# it did get through everything to a successful completion
cat fb.mm9.chainLoxAfr1Link.txt
# 473014688 bases of 2620346127 (18.052%) in intersection
# create reciprocal best chains/nets
ssh hgwdev
cd /cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 loxAfr1 \
> rbest.log 2>&1 &
# real 41m56.201s
#########################################################################
# Blastz Hedgehog eriEur1 (DONE - 2007-09-28 - 2007-10-02 - Hiram)
ssh kkstore05
screen # use screen to control this job
mkdir /cluster/data/mm9/bed/blastzEriEur1.2007-09-28
cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28
cat << '_EOF_' > DEF
# Mouse vs. Hedgehog
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Hedgehog eriEur1
SEQ2_DIR=/san/sanvol1/scratch/eriEur1/eriEur1.2bit
SEQ2_LEN=/cluster/data/eriEur1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzEriEur1.2007-09-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
> load.log 2>&1 &
# failed during the load since the db eriEur1 does not exist
ssh hgwdev
cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28/axtChain
cp -p noClass.net mm9.eriEur1.net
time nice -n +19 netFilter -minGap=10 mm9.eriEur1.net \
| hgLoadNet -warn mm9 netEriEur1 stdin
cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28
# run featureBits in the foreground so the measurement file is complete
# before it is displayed
time nice -n +19 featureBits mm9 chainEriEur1Link \
> fb.mm9.chainEriEur1Link.txt 2>&1
cat fb.mm9.chainEriEur1Link.txt
# 262604655 bases of 2620346127 (10.022%) in intersection
# continuing through syntenic nets (actually unneeded)
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
-syntenicNet > syntenicNet.log 2>&1 &
# create reciprocal best chains/nets
ssh hgwdev
cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 eriEur1 \
> rbest.log 2>&1 &
# real 33m27.296s
#########################################################################
# Blastz Shrew sorAra1 (DONE - 2007-09-28 - 2007-10-01 - Hiram)
ssh kkstore05
screen # use screen to control this job
mkdir /cluster/data/mm9/bed/blastzSorAra1.2007-09-28
cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28
cat << '_EOF_' > DEF
# Mouse vs. Shrew
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Shrew sorAra1
SEQ2_DIR=/san/sanvol1/scratch/sorAra1/sorAra1.2bit
SEQ2_LEN=/cluster/data/sorAra1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzSorAra1.2007-09-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# run the pipeline up to and including the chainMerge step; stdout and
# stderr are both captured in chainMerge.log
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
> chainMerge.log 2>&1 &
# real 2478m57.242s
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
> load.log 2>&1 &
# real 15m55.272s
# as expected, fails during load since there is no sorAra1 database
# load nets without class
ssh hgwdev
cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28/axtChain
cp -p noClass.net mm9.sorAra1.net
time nice -n +19 netFilter -minGap=10 mm9.sorAra1.net \
| hgLoadNet -warn mm9 netSorAra1 stdin
cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28
time nice -n +19 featureBits mm9 chainSorAra1Link \
> fb.mm9.chainSorAra1Link.txt 2>&1
cat fb.mm9.chainSorAra1Link.txt
# 250412778 bases of 2620346127 (9.556%) in intersection
# and, to finish it all off, with syntenic net
time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
-continue=download -bigClusterHub=pk \
-syntenicNet -chainLinearGap=medium DEF > syntenicNet.log 2>&1 &
# real 3m49.961s
# create reciprocal best chains/nets
ssh hgwdev
cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 sorAra1 \
> rbest.log 2>&1 &
# real 27m3.076s
#########################################################################
## 30-Way Multiz (WORKING - 2007-10-01 - Hiram)
## The blastz alignments for this 30-way are documented at:
## http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
##
ssh hgwdev
mkdir /cluster/data/mm9/bed/multiz30way
cd /cluster/data/mm9/bed/multiz30way
# take the 28-way tree from hg18 and insert the two new genomes.
# rearrange to get mm9 on the top of the graph
# paste this tree into the on-line phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to create the image for the tree diagram
cat << '_EOF_' > mm9OnTop.fullNames.nh
((((((((
(((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607,
GuineaPig_cavPor2:0.202990):0.034350,
Rabbit_oryCun1:0.208548):0.014587,
((((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,
Orangutan_ponAbe2:0.02):0.013037,Rhesus_rheMac2:0.031973):0.0365,
Marmoset_calJac1:0.07):0.0365,Bushbaby_otoGar1:0.151185):0.015682,
TreeShrew_tupBel1:0.162844):0.006272):0.019763,
((Shrew_sorAra1:0.248532,Hedgehog_eriEur1:0.222255):0.045693,
(((Dog_canFam2:0.101137,Cat_felCat3:0.098203):0.048213,
Horse_equCab1:0.099323):0.007287,
Cow_bosTau3:0.163945):0.012398):0.018928):0.030081,
(Armadillo_dasNov1:0.133274,(Elephant_loxAfr1:0.103030,
Tenrec_echTel1:0.232706):0.049511):0.008424):0.213469,
Opossum_monDom4:0.320721):0.088647,
Platypus_ornAna1:0.488110):0.118797,
(Chicken_galGal3:0.395136,Lizard_anoCar1:0.513962):0.093688):0.151358,
Frog_xenTro2:0.778272):0.174596,
(((Tetraodon_tetNig1:0.203933,Fugu_fr2:0.239587):0.203949,
(Stickleback_gasAcu1:0.314162,Medaka_oryLat1:0.501915):0.055354):0.346008,
Zebrafish_danRer5:0.730028):0.174596);
'_EOF_'
# << happy emacs
# create a species list from that file:
sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' mm9OnTop.fullNames.nh \
| sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
| sed -e "s/.*_//; s/:.*//" | sort > species.list
# verify that species.list has 30 db names in it
# create a stripped down nh file for use in autoMZ run
echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' mm9OnTop.fullNames.nh \
| sed -e "s/ / /g"` > tree.30.nh
# that looks like, as a single line:
(((((((( (((mm9 rn4) cavPor2) oryCun1) ((((((hg18 panTro2) ponAbe2) rheMac2)
calJac1) otoGar1) tupBel1)) ((sorAra1 eriEur1) (((canFam2 felCat3) equCab1)
bosTau3))) (dasNov1 (loxAfr1 echTel1))) monDom4) ornAna1) (galGal3 anoCar1))
xenTro2) (((tetNig1 fr2) (gasAcu1 oryLat1)) danRer5))
# verify all blastz's exist
# listMafs.csh prints one line per db in species.list: the first of
# mafRBestNet, mafSynNet, mafNet under /cluster/data/mm9/bed/blastz.$db
# that holds a chr1.maf.gz, or "$db mafs not found" when none of them does.
# NOTE(review): species.list also contains mm9 itself; presumably there is
# no blastz.mm9 directory, so a "mm9 mafs not found" line would be
# expected - confirm.
cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/mm9/bed/multiz30way
foreach db (`cat species.list`)
set bdir = /cluster/data/mm9/bed/blastz.$db
if (-e $bdir/mafRBestNet/chr1.maf.gz) then
echo "$db mafRBestNet"
else if (-e $bdir/mafSynNet/chr1.maf.gz) then
echo "$db mafSynNet"
else if (-e $bdir/mafNet/chr1.maf.gz) then
echo "$db mafNet"
else
echo "$db mafs not found"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./listMafs.csh
# see what it says, shouldn't be anything with "mafs not found"
./listMafs.csh
# copy net mafs to cluster-friendly storage, splitting chroms
# into 50MB chunks to improve run-time
# NOTE: splitting will be different for scaffold-based reference assemblies
ssh hgwdev
mkdir /cluster/data/mm9/bed/multiz30way/run.split
cd /cluster/data/mm9/bed/multiz30way/run.split
# this works by examining the rmsk table for likely repeat areas
# that won't be used in blastz
mafSplitPos mm9 50 mafSplit.bed
ssh kki
cd /cluster/data/mm9/bed/multiz30way/run.split
# doSplit.csh <db>
#   Splits the mm9-vs-<db> net mafs with mafSplit, using mafSplit.bed from
#   the current directory, into
#   /san/sanvol1/scratch/mm9/splitStrictMafNet/<db>/ and gzips the results.
#   The source directory is the first one that exists under
#   /cluster/data/mm9/bed/blastz.<db>: mafRBestNet, mafSynNet, mafNet.
#   Exits 1 when the output directory already exists, when blastz.<db> is
#   missing, or when none of the three maf directories is present.
#   On success the chosen maf directory name is written to <db>.done, the
#   file the parasol template below checks for.
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set db = $1
set sdir = /san/sanvol1/scratch/mm9/splitStrictMafNet
mkdir -p $sdir
if (-e $sdir/$db) then
    echo "directory $sdir/$db already exists -- remove and retry"
    exit 1
endif
set bdir = /cluster/data/mm9/bed/blastz.$db
if (! -e $bdir) then
    echo "directory $bdir not found"
    exit 1
endif
mkdir -p $sdir/$db
if (-e $bdir/mafRBestNet) then
    set mdir = $bdir/mafRBestNet
else if (-e $bdir/mafSynNet) then
    set mdir = $bdir/mafSynNet
else if (-e $bdir/mafNet) then
    set mdir = $bdir/mafNet
else
    echo "$bdir maf dir not found"
    exit 1
endif
echo $mdir
foreach f ($mdir/*)
    set c = $f:t:r:r
    echo " $c"
    nice mafSplit mafSplit.bed $sdir/$db/ $f
end
echo "gzipping $sdir/$db mafs"
nice gzip $sdir/$db/*
echo $mdir > $db.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
grep -v mm9 ../species.list > split.list
cat << '_EOF_' > template
#LOOP
doSplit.csh $(path1) {check out line+ $(path1).done}
#ENDLOOP
'_EOF_'
gensub2 split.list single template jobList
para create jobList
# 29 jobs
# start these gently, this is a good load on the san filesystem
para try
# let that run to a couple completions, a few minutes, then again:
para try
# etc ...
# Completed: 29 of 29 jobs
# CPU time in finished jobs: 9476s 157.94m 2.63h 0.11d 0.000 y
# IO & Wait Time: 1531s 25.51m 0.43h 0.02d 0.000 y
# Average job time: 380s 6.33m 0.11h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1081s 18.02m 0.30h 0.01d
# Submission to last job: 1391s 23.18m 0.39h 0.02d
# ready for the multiz run
ssh pk
cd /cluster/data/mm9/bed/multiz30way
# actually, the result directory here should be maf.split instead of maf
mkdir -p maf run
cd run
mkdir penn
# use latest penn utilities
P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $P/{autoMZ,multiz,maf_project} penn
# list chrom chunks, any db dir will do; better would be for the
# splitter to generate this file
# We temporarily use __ instead of . to delimit chunk in filename
# so we can use $(root) to get basename
# The sorted, de-duplicated chunk names are written directly to
# chromChunks.list, the file that wc and gensub2 read below.
# The sed dots are escaped so only a literal ".maf.gz" suffix is removed.
find /san/sanvol1/scratch/mm9/splitStrictMafNet -type f \
| while read -r F; do basename "$F"; done \
| sed -e 's/\.maf\.gz$//' -e 's/\./__/' | sort -u > chromChunks.list
wc -l chromChunks.list
# 75
# autoMultiz.csh <chunk> <result.maf>
#   <chunk> is a chunk name with "__" in place of "." (see chromChunks.list).
#   Work is done in /scratch/tmp/mm9/multiz.<chunk>, which is removed and
#   recreated at the start and removed again at the end.
#   ../tree.30.nh and ../species.list are copied into that directory.
#   For every species in species.list other than mm9 the pairwise maf
#   /san/sanvol1/scratch/mm9/splitStrictMafNet/<species>/<chunk>.maf is
#   unpacked (.gz) or copied to mm9.<species>.sing.maf; when neither file
#   exists a maf holding only the "##maf version=1 scoring=autoMZ" header
#   line is written instead.
#   autoMZ from the run directory's penn/ subdirectory is then run on the
#   tree and the *.sing.maf files, and <chunk>.maf is copied to <result.maf>.
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = mm9
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet
rm -fr $tmp
mkdir -p $tmp
cp ../tree.30.nh ../species.list $tmp
pushd $tmp
foreach s (`cat species.list`)
set c2 = `echo $c | sed 's/__/./'`
set in = $pairs/$s/$c2.maf
set out = $db.$s.sing.maf
if ($s == mm9) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.30.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/mm9/bed/multiz30way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << emacs
gensub2 chromChunks.list single template jobList
para create jobList
# 75 jobs
# three of these jobs failed with memory allocation error:
# maf_project.v12: Ran out of memory trying to allocate 64.
# autoMZ.v1: command 'maf_project /scratch/tmp/mm9/multiz.chr10__1/_MZ_16482_lef
# t.maf19 mm9 > /scratch/tmp/mm9/multiz.chr10__1/_MZ_16482_U1' failed
# the 72 completed jobs run time:
# Completed: 72 of 75 jobs
# CPU time in finished jobs: 501143s 8352.38m 139.21h 5.80d 0.016 y
# IO & Wait Time: 22628s 377.14m 6.29h 0.26d 0.001 y
# Average job time: 7275s 121.24m 2.02h 0.08d
# Longest finished job: 15957s 265.95m 4.43h 0.18d
# Submission to last job: 16473s 274.55m 4.58h 0.19d
# performed a para recover on the jobList and used the kki kluster
# to run the last three jobs:
# Completed: 3 of 3 jobs
# CPU time in finished jobs: 50762s 846.03m 14.10h 0.59d 0.002 y
# IO & Wait Time: 1795s 29.92m 0.50h 0.02d 0.000 y
# Average job time: 17519s 291.98m 4.87h 0.20d
# Longest finished job: 17887s 298.12m 4.97h 0.21d
# Submission to last job: 17887s 298.12m 4.97h 0.21d
# put the split maf results back together into single chroms
ssh kkstore06
cd /cluster/data/mm9/bed/multiz30way
# here is where the result directory maf should have already been maf.split
mv maf maf.split
mkdir maf
# going to sort out the redundant header garbage to leave a cleaner maf
# One pass per chromosome: C is the part of each maf.split file name that
# precedes "__". Each pass rebuilds maf/${C}.maf from that chromosome's chunks.
for C in `ls maf.split | sed -e "s#__.*##" | sort -u`
do
echo ${C}
# first line of every chunk, de-duplicated, starts the merged file
head -q -n 1 maf.split/${C}__*.maf | sort -u > maf/${C}.maf
# remaining "#" lines, minus the "maf version=1" and "eof maf" lines, with
# the _MZ_* words and __<digit> chunk suffixes removed, de-duplicated
# NOTE(review): s#__[0-9]## removes one digit only; a chunk number of two
# or more digits would leave a stray digit - confirm none exist
grep -h "^#" maf.split/${C}__*.maf | egrep -v "maf version=1|eof maf" | \
sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u >> maf/${C}.maf
# every non-comment line of the chunks, in glob order
grep -h -v "^#" maf.split/${C}__*.maf >> maf/${C}.maf
# last line of every chunk, de-duplicated, closes the merged file
tail -q -n 1 maf.split/${C}__*.maf | sort -u >> maf/${C}.maf
done
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/mm9/multiz30way/maf
ln -s /cluster/data/mm9/bed/multiz30way/maf/*.maf \
/gbdb/mm9/multiz30way/maf
cd /cluster/data/mm9/bed/multiz30way
# this generates a large 1 Gb multiz30way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /scratch/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/mm9/multiz30way/maf mm9 multiz30way
# real 11m38.695s
# Loaded 15881850 mafs in 34 files from /gbdb/mm9/multiz30way/maf
# load summary table
time nice -n +19 cat /gbdb/mm9/multiz30way/maf/*.maf \
| hgLoadMafSummary mm9 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz30waySummary stdin
# Created 5648546 summary blocks from 154642836 components and 15872991
# mafs from stdin
# real 19m44.355s
# Gap Annotation
# prepare bed files with gap info
ssh kkstore06
mkdir /cluster/data/mm9/bed/multiz30way/anno
cd /cluster/data/mm9/bed/multiz30way/anno
mkdir maf run
# For every db in species.list, check for /cluster/data/$DB/$DB.N.bed:
# list the file when it exists, otherwise print the command that makes it.
for DB in `cat ../species.list`
do
CDIR="/cluster/data/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
# NOTE(review): this only prints the twoBitInfo command, it does not run
# it; presumably the printed line is meant to be run by hand - confirm
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `grep -v mm9 ../../species.list`
do
echo "${DB} "
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
ssh kki
cd /cluster/data/mm9/bed/multiz30way/anno/run
# doAnno.csh <chrom> <outFile>: pipes multiz30way/maf/<chrom>.maf into
# mafAddIRows (reading stdin, with -nBeds=nBeds and mm9.2bit); <outFile>
# is handed to mafAddIRows as its last argument
cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set dir = /cluster/data/mm9/bed/multiz30way
set c = $1
cat $dir/maf/${c}.maf | \
nice mafAddIRows -nBeds=nBeds stdin /cluster/data/mm9/mm9.2bit $2
'_EOF_'
# << happy emacs
chmod +x doAnno.csh
# gensub2 template: one doAnno.csh job per chrom.list entry, with
# anno/maf/<chrom>.maf as the second argument
cat << '_EOF_' > template
#LOOP
./doAnno.csh $(root1) {check out line+ /cluster/data/mm9/bed/multiz30way/anno/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
# there is no 16_random maf file
# chrom.list = column 1 of chrom.sizes minus any name matching 16_random
cut -f1 /cluster/data/mm9/chrom.sizes | grep -v 16_random > chrom.list
gensub2 chrom.list single template jobList
para create jobList
para try
# Crashed: 1 jobs
# CPU time in finished jobs: 18129s 302.15m 5.04h 0.21d 0.001 y
# IO & Wait Time: 10273s 171.22m 2.85h 0.12d 0.000 y
# Average job time: 861s 14.34m 0.24h 0.01d
# Longest finished job: 4376s 72.93m 1.22h 0.05d
# one job was too large for this memory:
# job: ./doAnno.csh chr1 /cluster/data/mm9/bed/multiz30way/anno/maf/chr1.maf
# needLargeMem: Out of memory - request size 1129396 bytes, errno: 12
# going to hgwdev for this one:
ssh hgwdev
cd /cluster/data/mm9/bed/multiz30way/anno/run
time ./doAnno.csh chr1 ../maf/chr1.maf
# real 17m34.550s
ssh hgwdev
cd /cluster/data/mm9/bed/multiz30way/anno
mkdir -p /gbdb/mm9/multiz30way/anno/maf
ln -s /cluster/data/mm9/bed/multiz30way/anno/maf/*.maf \
/gbdb/mm9/multiz30way/anno/maf
# by loading this into the table multiz30way, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /scratch/tmp
time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/mm9/multiz30way/anno/maf \
mm9 multiz30way
# Loaded 16799995 mafs in 34 files from /gbdb/mm9/multiz30way/anno/maf
# real 18m12.171s
# This step may be useless. The original mafs should have the same
# summary.
# Feed the annotated mafs of every chrom whose chrom.sizes column 2
# exceeds 1000000 into hgLoadMafSummary; awk emits the maf paths directly.
awk '$2 > 1000000 { printf "/gbdb/mm9/multiz30way/anno/maf/%s.maf\n", $1 }' \
  /cluster/data/mm9/chrom.sizes \
  | xargs cat \
  | hgLoadMafSummary mm9 -minSize=30000 -mergeGap=1500 \
      -maxSize=200000 multiz30waySummary stdin
# Created 5648546 summary blocks from 154642836 components and 16790208
# mafs from stdin
# by loading this into the table multiz30waySummary, it will replace
# the previously loaded table with the unannotated mafs
# real 30m26.542s
#############################################################################
## Annotate 30-way multiple alignment with gene annotations
## (WORKING - 2007-10-18 - Hiram)
# Gene frames
## survey all genomes to see what type of gene track to use
ssh hgwdev
mkdir /cluster/data/mm9/bed/multiz30way/frames
cd /cluster/data/mm9/bed/multiz30way/frames
# dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them
cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
foreach db (`egrep -v "sorAra1|eriEur1|cavPor2" ../species.list`)
echo -n "${db}: "
echo -n "Tables: "
set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
foreach table ($tables)
if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
$table == "knownGene") then
set count = `hgsql $db -N -e "select count(*) from $table"`
echo -n "${table}: ${count}, "
endif
end
set orgName = `hgsql hgcentraltest -N -e \
"select scientificName from dbDb where name='$db'"`
set orgId = `hgsql mm9 -N -e \
"select id from organism where name='$orgName'"`
if ($orgId == "") then
echo "Mrnas: 0"
else
set count = `hgsql mm9 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
echo "Mrnas: ${count}"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./showGenes.csh
# given this output, manually sorted for this display:
# hg18: Tables: ensGene: 43569, knownGene: 56722, mgcGenes: 29028, refGene: 25902, Mrnas: 208990
# mm9: Tables: knownGene: 49409, mgcGenes: 22947, refGene: 21004, Mrnas: 5092390
# rn4: Tables: ensGene: 33745, knownGene: 8202, mgcGenes: 5400, refGene: 14333, Mrnas: 34471
# canFam2: Tables: ensGene: 25568, refGene: 833, Mrnas: 1708
# danRer5: Tables: ensGene: 31740, mgcGenes: 13037, refGene: 12879, Mrnas: 33184
# fr2: Tables: ensGene: 22102, Mrnas: 1098
# gasAcu1: Tables: ensGene: 28840, Mrnas: 2326
# monDom4: Tables: ensGene: 33878, refGene: 163, Mrnas: 398
# ornAna1: Tables: ensGene: 25981, refGene: 3, Mrnas: 141
# oryLat1: Tables: ensGene: 23087, Mrnas: 980
# panTro2: Tables: ensGene: 32852, refGene: 26160, Mrnas: 1277
# rheMac2: Tables: ensGene: 38561, refGene: 412, Mrnas: 3169
# bosTau3: Tables: mgcGenes: 9617, refGene: 10287, Mrnas: 26808
# equCab1: Tables: refGene: 304, Mrnas: 1396
# felCat3: Tables: refGene: 401, Mrnas: 882
# galGal3: Tables: refGene: 4210, Mrnas: 31217
# xenTro2: Tables: mgcGenes: 6255, refGene: 7086, Mrnas: 19155
# anoCar1: Tables: Mrnas: 12
# calJac1: Tables: Mrnas: 949
# dasNov1: Tables: Mrnas: 18
# echTel1: Tables: Mrnas: 0
# loxAfr1: Tables: Mrnas: 12
# oryCun1: Tables: Mrnas: 3786
# otoGar1: Tables: Mrnas: 0
# ponAbe2: Tables: Mrnas: 2
# tetNig1: Tables: Mrnas: 99495
# tupBel1: Tables: Mrnas: 47
# use knownGene for hg18, mm9
# use ensGene for rn4, canFam2, danRer5, fr2, gasAcu1, monDom4, ornAna1,
# oryLat1, panTro2, rheMac2
# use refGene for bosTau3, xenTro2
# use Mrnas for galGal3, tetNig1
# barely can use Mrnas for equCab1, felCat3, anoCar1, dasNov1,
# loxAfr1, oryCun1, ponAbe2
# no annotations for calJac1, echTel1, otoGar1, sorAra1, eriEur1, cavPor2
# tupBel1
mkdir genes

#######################################
# Dump one gene table as single-coverage genePred for several databases.
# Arguments: $1 - gene table name (knownGene, ensGene, refGene)
#            $2... - database names
# Outputs:   genes/<db>.gp.gz per database (staged in /scratch/tmp),
#            and a "<db> done" line on stdout
#######################################
dumpGenePreds() {
  local table=$1
  local db
  shift
  for db in "$@"; do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ${table}" ${db} \
      | genePredSingleCover stdin stdout \
      | gzip -2c > /scratch/tmp/${db}.tmp.gz
    mv /scratch/tmp/${db}.tmp.gz genes/${db}.gp.gz
    echo "${db} done"
  done
}

# knownGene
dumpGenePreds knownGene hg18 mm9
# ensGene
dumpGenePreds ensGene rn4 canFam2 danRer5 fr2 gasAcu1 monDom4 \
  ornAna1 oryLat1 panTro2 rheMac2
# refGene
dumpGenePreds refGene bosTau3 xenTro2
# and finally, using the mrna tables
# use Mrnas for galGal3 tetNig1 equCab1 felCat3 anoCar1 dasNov1
# loxAfr1 oryCun1 ponAbe2
for DB in galGal3 tetNig1 equCab1 felCat3 anoCar1 dasNov1 \
loxAfr1 oryCun1 ponAbe2
do
# mktemp creates a uniquely named temp.XXXXXX file in the current
# directory; its name serves as a unique suffix for the work files below
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
tmpMrna=${DB}.mrna.${tmpExt}
tmpCds=${DB}.cds.${tmpExt}
# query output columns: 1 = all_mrna.qName, 2 = cds.name, 3.. = every
# all_mrna column; only mRNAs whose gbCdnaInfo.cds is non-zero are kept
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$DB > ${tmpMrnaCds}
# columns 1-2 (qName, cds.name) become the -cdsFile for mrnaToGene
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
# keep column 4 onward; this also drops column 3, the first all_mrna
# column (presumably the bin index - TODO confirm)
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
# gzipped result is staged in /scratch/tmp, then moved into genes/
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
# remove the placeholder file that mktemp created
rm -f $tmpExt
echo "${DB} done"
done
ssh kkstore06
cd /cluster/data/mm9/bed/multiz30way/frames
# leaving out calJac1, echTel1, otoGar1, sorAra1, eriEur1, cavPor2
# tupBel1 since no gene preds there
time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames mm9 stdin stdout rn4 genes/rn4.gp.gz hg18 genes/hg18.gp.gz rheMac2 genes/rheMac2.gp.gz ponAbe2 genes/ponAbe2.gp.gz panTro2 genes/panTro2.gp.gz equCab1 genes/equCab1.gp.gz dasNov1 genes/dasNov1.gp.gz oryCun1 genes/oryCun1.gp.gz felCat3 genes/felCat3.gp.gz canFam2 genes/canFam2.gp.gz loxAfr1 genes/loxAfr1.gp.gz bosTau3 genes/bosTau3.gp.gz monDom4 genes/monDom4.gp.gz ornAna1 genes/ornAna1.gp.gz galGal3 genes/galGal3.gp.gz anoCar1 genes/anoCar1.gp.gz xenTro2 genes/xenTro2.gp.gz gasAcu1 genes/gasAcu1.gp.gz danRer5 genes/danRer5.gp.gz tetNig1 genes/tetNig1.gp.gz fr2 genes/fr2.gp.gz oryLat1 genes/oryLat1.gp.gz | gzip > multiz30way.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multiz30way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
67 loxAfr1
79 dasNov1
116 ponAbe2
491 anoCar1
1807 tetNig1
2429 felCat3
4892 equCab1
9156 oryCun1
85568 bosTau3
118192 galGal3
129442 xenTro2
208239 rn4
224420 rheMac2
226866 panTro2
228563 hg18
243074 canFam2
329523 danRer5
334418 ornAna1
347708 oryLat1
369267 monDom4
374016 gasAcu1
380839 fr2
# load the resulting file
ssh hgwdev
cd /cluster/data/mm9/bed/multiz30way/frames
time nice -n +19 hgLoadMafFrames mm9 multiz30wayFrames \
multiz30way.mafFrames.gz
# real 1m1.893s
# enable the trackDb entries:
# frames multiz30wayFrames
# irows on
#############################################################################
# phastCons 30-way (WORKING - 2007-10-16 - Hiram)
# split 30way mafs into 10M chunks and generate sufficient statistics
# files for phastCons
ssh kki
mkdir /cluster/data/mm9/bed/multiz30way/msa.split
cd /cluster/data/mm9/bed/multiz30way/msa.split
mkdir -p /san/sanvol1/scratch/mm9/multiz30way/cons/ss
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/mm9/bed/multiz30way/maf
set WINDOWS = /san/sanvol1/scratch/mm9/multiz30way/cons/ss
pushd $WINDOWS
set c = $1
rm -fr $c
mkdir $c
twoBitToFa -seq=$c /scratch/data/mm9/mm9.2bit /scratch/tmp/mm9.$c.fa
# need to truncate odd-ball scaffold/chrom names that include dots
# as phastCons utils can't handle them
set CLEAN_MAF = /scratch/tmp/$c.clean.maf.$$
perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $CLEAN_MAF
/cluster/bin/phast/$MACHTYPE/msa_split $CLEAN_MAF -i MAF \
-M /scratch/tmp/mm9.$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
rm -f $CLEAN_MAF /scratch/tmp/mm9.$c.fa
popd
date >> $c.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
# << happy emacs
# do the easy ones first to see some immediate results
ls -1S -r ../maf | sed -e "s/.maf//" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... etc
-
# completed shorter jobs in a few hours, there is a problem of swapping
# going on here, two of these jobs on a single node can consume all of its
# memory and then some. Three jobs failed to complete, finish them up
# manually on hgwdev, the processes grow to over 8 Gb in memory for chr1,
# chr11 and chr2
# Estimate phastCons parameters
time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \
/san/sanvol1/scratch/mm9/multiz30way/cons/ss/chrY/chrY.1-10000000.ss \
--tree "(((((((((((mm9,rn4),cavPor2),oryCun1),((((((hg18,panTro2),ponAbe2),rheMac2),calJac1),otoGar1),tupBel1)),((sorAra1,eriEur1),(((canFam2,felCat3),equCab1),bosTau3))),(dasNov1,(loxAfr1,echTel1))),monDom4),ornAna1),(galGal3,anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat1)),danRer5))" \
--out-root starting-tree
# real 107m46.703s
# Tried this on chr13 too:
# real 4619m42.984s
# that is almost 77 hours on hgwdev == 3.2 days
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.400
# This 0.400 is used in the --gc argument below
# got 0.404 with chrM.starting-tree.mod
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh pk
mkdir -p /cluster/data/mm9/bed/multiz30way/cons/run.cons
cd /cluster/data/mm9/bed/multiz30way/cons/run.cons
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working directory
# $cwd:t which is the "grp" in this script. It is one of:
# all euarchontoglires placental
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.2007-05-04
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/mm9/bed/multiz30way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/mm9/multiz30way/cons
if (-s $cons/$grp/$grp.non-inf) then
cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf .
cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
cp -p $cons/$grp/$grp.mod .
cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 4
touch $san/$grp/pp/$c $san/$grp/bed/$c
rm -f $san/$grp/pp/$c/$f.pp
rm -f $san/$grp/bed/$c/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
pushd /san/sanvol1/scratch/mm9/multiz30way/cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/mm9/bed/multiz30way/cons/run.cons/ss.list
popd
# run for all species
cd ..
mkdir -p all run.cons/all
cd all
cp ../../chrY.starting-tree.mod all.mod
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create template file for "all" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 294 of 294 jobs
# CPU time in finished jobs: 25724s 428.73m 7.15h 0.30d 0.001 y
# IO & Wait Time: 8951s 149.19m 2.49h 0.10d 0.000 y
# Average job time: 118s 1.97m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 226s 3.77m 0.06h 0.00d
# Submission to last job: 582s 9.70m 0.16h 0.01d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/mm9/multiz30way/cons/all
cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/mm9/bed/multiz30way/cons/all
# load into database
ssh hgwdev
cd /cluster/data/mm9/bed/multiz30way/cons/all
time nice -n +19 hgLoadBed mm9 phastConsElements30way mostConserved.bed
# Loaded 2782368 elements of size 5
# real 1m15.673s
# compare with previous tracks
hgsql mm9 -s -N -e "select count(*) from phastConsElements30way"
# 2782368
hgsql mm8 -s -N -e "select count(*) from phastConsElements17way"
# 1883370
# Try for 5% overall cov, and 70% CDS cov
# --rho .31 --expected-length 45 --target-coverage .3
# chrY mod tree
featureBits mm9 -enrichment refGene:cds phastConsElements30way
# refGene:cds 1.167%, phastConsElements30way 4.789%,
# both 0.582%, cover 49.90%, enrich 10.42x
featureBits mm9 -enrichment knownGene:cds phastConsElements30way
# knownGene:cds 1.278%, phastConsElements30way 4.789%,
# both 0.627%, cover 49.03%, enrich 10.24x
# --rho .31 --expected-length 45 --target-coverage .3 elim non-autho
# chr13 mod tree
featureBits mm9 -enrichment refGene:cds mostConserved.bed
# refGene:cds 1.167%, mostConserved.bed 4.128%,
# both 0.614%, cover 52.59%, enrich 12.74x
# --rho .31 --expected-length 45 --target-coverage .3 elim non-autho
# 28-way mod tree adjusted to 30-way
featureBits mm9 -enrichment refGene:cds mostConserved.bed
# refGene:cds 1.167%, mostConserved.bed 5.841%, both 0.862%, cover
# 73.90%, enrich 12.65x
featureBits mm8 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.188%, phastConsElements17way 5.398%,
# both 0.832%, cover 70.05%, enrich 12.98x
featureBits mm8 -enrichment knownGene:cds phastConsElements17way
# knownGene:cds 1.109%, phastConsElements17way 5.398%,
# both 0.776%, cover 69.99%, enrich 12.97x
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
cd /san/sanvol1/scratch/mm9/multiz30way/cons/all
# gzipAscii.sh: for each pp/chr* directory, concatenate its .pp files,
# ordered numerically on the second dot-separated field of the path (the
# start coordinate in names like chr1.1-10000000.pp), into
# phastCons30wayScores/<chrom>.data.gz
# The script runs under #!/bin/sh, so the chrom name is taken with the POSIX
# prefix strip ${D#pp/} rather than the bash-only ${D/pp\/} substitution.
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
mkdir -p phastCons30wayScores
for D in pp/chr*
do
# strip the leading "pp/" to get the chrom name (POSIX expansion)
C=${D#pp/}
out=phastCons30wayScores/${C}.data.gz
echo "${D} > ${C}.data.gz"
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
gzip > ${out}
done
'_EOF_'
# << happy emacs
chmod +x gzipAscii.sh
time nice -n +19 ./gzipAscii.sh
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
cd /san/sanvol1/scratch/mm9/multiz30way/cons/all
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
# Stream every .pp file, chrom directory by chrom directory and within
# each directory in numeric order of the second dot-separated field,
# into wigEncode.
for chromDir in pp/chr*; do
  ls ${chromDir}/*.pp | sort -t. -k2 -n
done \
  | xargs cat \
  | wigEncode -noOverlap stdin phastCons30way.wig phastCons30way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/mm9/bed/multiz30way/cons/all
ln -s `pwd`/phastCons30way.wib /gbdb/mm9/multiz30way/phastCons30way.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
phastCons30way phastCons30way.wig
# real 0m42.728s
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/mm9/bed/multiz30way
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=mm9 phastCons30way > histogram.data 2>&1
# real 28m24.388s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm9 Histogram phastCons30way track"
set xlabel " phastCons30way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
### Create a phastCons data set for Euarchontoglires
# setup euarchontoglires-only run
ssh pk
cd /cluster/data/mm9/bed/multiz30way/cons
mkdir euarchontoglires run.cons/euarchontoglires
cd euarchontoglires
# euarchontoglires-only: exclude all but these for phastCons tree:
/cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
--prune-all-but=mm9,rn4,cavPor2,oryCun1,hg18,panTro2,ponAbe2,rheMac2,calJac1,otoGar1,tupBel1 \
> euarchontoglires.mod
# and place the removed ones in the non-inf file so phastCons will
# truly ignore them:
echo "sorAra1,eriEur1,canFam2,felCat3,equCab1,bosTau3,dasNov1,loxAfr1,echTel1,monDom4,ornAna1,galGal3,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat1,danRer5" \
> euarchontoglires.non-inf
cd ../run.cons/euarchontoglires
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create template file for "euarchontoglires" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Three of these jobs fail to produce any output:
# chr5_random/chr5_random.1-357350.bed
# chr7_random/chr7_random.1-362490.bed
# chrY_random/chrY_random.50000001-58682461.bed
# Completed: 291 of 294 jobs
# Crashed: 3 jobs
# CPU time in finished jobs: 17184s 286.40m 4.77h 0.20d 0.001 y
# IO & Wait Time: 30139s 502.31m 8.37h 0.35d 0.001 y
# Average job time: 163s 2.71m 0.05h 0.00d
# Longest finished job: 296s 4.93m 0.08h 0.00d
# Submission to last job: 2775s 46.25m 0.77h 0.03d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires
cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/mm9/bed/multiz30way/cons/euarchontoglires
# load into database
ssh hgwdev
cd /cluster/data/mm9/bed/multiz30way/cons/euarchontoglires
time nice -n +19 hgLoadBed mm9 phastConsElements30wayEuarch \
mostConserved.bed
# Loaded 1021674 elements of size 5
# real 0m23.402s
# verify coverage
featureBits mm9 phastConsElements30wayEuarch
# 103492546 bases of 2620346127 (3.950%) in intersection
# Create the downloads .pp files, from which the phastCons wiggle data
# is calculated
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires
mkdir downloads
# One gzipped download file per pp/chr* directory: its .pp files are
# concatenated in numeric order of the second dot-separated path field.
for ppDir in pp/chr*; do
  chrom=${ppDir#pp/}
  ls ${ppDir}/*.pp \
    | sort -t. -k2 -n \
    | xargs cat \
    | gzip -c > downloads/${chrom}.euarchontoglires.pp.data.gz
  echo ${ppDir} ${chrom} done
done
# Create merged posterior probability file and wiggle track data files
cd /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires
ls downloads/chr*.data.gz | xargs zcat \
| wigEncode -noOverlap stdin phastCons30wayEuarch.wig phastCons30wayEuarch.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
## load table with wiggle data
ssh hgwdev
cd /cluster/data/mm9/bed/multiz30way/cons/euarchontoglires
cp -p /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires/*.wi? .
ln -s `pwd`/phastCons30wayEuarch.wib \
/gbdb/mm9/multiz30way/phastCons30wayEuarch.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
phastCons30wayEuarch phastCons30wayEuarch.wig
# real 0m44.161s
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=mm9 phastCons30wayEuarch > histogram.data 2>&1
# real 3m22.364s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm9 Histogram phastCons30wayEuarch track"
set xlabel " phastCons30wayEuarch score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
### Create a phastCons data set for Placentals
# setup placental-only run
ssh pk
cd /cluster/data/mm9/bed/multiz30way/cons
mkdir placental run.cons/placental
cd placental
# placental-only: exclude all but these for phastCons tree:
/cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
--prune-all-but=mm9,rn4,cavPor2,oryCun1,hg18,panTro2,ponAbe2,rheMac2,calJac1,otoGar1,tupBel1,sorAra1,eriEur1,canFam2,felCat3,equCab1,bosTau3,dasNov1,loxAfr1,echTel1 \
> placental.mod
# and place the removed ones in the non-inf file so phastCons will
# truly ignore them:
echo "monDom4,ornAna1,galGal3,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat1,danRer5" \
> placental.non-inf
cd ../run.cons/placental
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create template file for "placental" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/placental/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# One of these jobs fails to produce any output:
# chr5_random/chr5_random.1-357350.bed
# Completed: 293 of 294 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 21121s 352.01m 5.87h 0.24d 0.001 y
# IO & Wait Time: 33985s 566.42m 9.44h 0.39d 0.001 y
# Average job time: 188s 3.13m 0.05h 0.00d
# Longest finished job: 324s 5.40m 0.09h 0.00d
# Submission to last job: 3511s 58.52m 0.98h 0.04d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/mm9/multiz30way/cons/placental
cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/mm9/bed/multiz30way/cons/placental
# load into database
ssh hgwdev
cd /cluster/data/mm9/bed/multiz30way/cons/placental
time nice -n +19 hgLoadBed mm9 phastConsElements30wayPlacental \
mostConserved.bed
# Loaded 1990870 elements of size 5
# real 0m48.084s
# verify coverage
featureBits mm9 phastConsElements30wayPlacental
# 111626429 bases of 2620346127 (4.260%) in intersection
# Create the downloads .pp files, from which the phastCons wiggle data
# is calculated
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/mm9/multiz30way/cons/placental
mkdir downloads
# One gzipped download file per pp/chr* directory: its .pp files are
# concatenated in numeric order of the second dot-separated path field.
for ppDir in pp/chr*; do
  chrom=${ppDir#pp/}
  ls ${ppDir}/*.pp \
    | sort -t. -k2 -n \
    | xargs cat \
    | gzip -c > downloads/${chrom}.placental.pp.data.gz
  echo ${ppDir} ${chrom} done
done
# Create merged posterior probability file and wiggle track data files
cd /san/sanvol1/scratch/mm9/multiz30way/cons/placental
ls downloads/chr*.data.gz | xargs zcat \
| wigEncode -noOverlap stdin phastCons30wayPlacental.wig \
phastCons30wayPlacental.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
## load table with wiggle data
ssh hgwdev
cd /cluster/data/mm9/bed/multiz30way/cons/placental
cp -p /san/sanvol1/scratch/mm9/multiz30way/cons/placental/*.wi? .
ln -s `pwd`/phastCons30wayPlacental.wib \
/gbdb/mm9/multiz30way/phastCons30wayPlacental.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
phastCons30wayPlacental phastCons30wayPlacental.wig
# real 0m44.585s
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=mm9 phastCons30wayPlacental > histogram.data 2>&1
# real 28m24.388s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm9 Histogram phastCons30wayPlacental track"
set xlabel " phastCons30wayPlacental score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
#############################################################################
## Downloads for 30way Conservation (DONE - 2007-11-01 - Hiram)
ssh kkstore06
# Collect the per-chrom score files of the three phastCons runs from the
# san into downloads/phastCons30wayScores/{all,placental,euarchontoglires}
scoresDir=/cluster/data/mm9/bed/multiz30way/downloads/phastCons30wayScores
consDir=/san/sanvol1/scratch/mm9/multiz30way/cons
mkdir ${scoresDir}
cd ${scoresDir}
mkdir placental euarchontoglires all
cp -p ${consDir}/all/phastCons30wayScores/*.data.gz all/
cp -p ${consDir}/placental/downloads/*.data.gz placental/
cp -p ${consDir}/euarchontoglires/downloads/*.data.gz euarchontoglires/
# finish in the same directory the original command sequence ended in
cd euarchontoglires
# rebuilt 2007-12-27 to fix difficulty in mafFrags when species.lst
# did not have mm9 as the first one
# upstream mafs (mafFrags takes a while)
ssh kkstore06
# NOTE(review): this directory is only created by the mkdir further down
# in this section; presumably it already existed when this step was
# rebuilt - confirm
cd /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf
# bash script
#!/bin/sh
# For each upstream size S, write upstream<S>.maf.gz:
#   featureBits emits refGene:upstream:<S> regions as bed on stdout,
#   perl replaces the first "_up..." run (up to the next tab) on each line
#   with a tab followed by 0,
#   sort orders by column 1, then numerically by column 2,
#   mafFrags reads that bed on stdin with -orgs=species.list and its
#   output is gzipped
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
featureBits mm9 refGene:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags mm9 multiz30way \
stdin stdout \
-orgs=/cluster/data/mm9/bed/multiz30way/species.list \
| gzip -c > upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
# append (not overwrite) the upstream checksums to md5sum.txt
md5sum up*.gz >> md5sum.txt
ssh kkstore06
# Quality-annotated mafs: copy, gzip and checksum inside maf_qual/.
# The cd is required: the relative cp source (../../../qual/maf) and the
# *.maf / *.gz globs below are all resolved from inside this directory.
mkdir -p /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf_qual
cd /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf_qual
cp -p ../../../qual/maf/*.maf .
time nice -n +19 gzip *.maf
# real 77m3.592s
time nice -n +19 md5sum *.gz > md5sum.txt
# real 4m52.044s
# Gap-annotated mafs: same procedure inside maf/, so that this md5sum.txt
# does not overwrite the one written in maf_qual/
mkdir -p /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf
cd /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf
cp -p ../../../anno/maf/*.maf .
time nice -n +19 gzip *.maf
# real 86m2.341s
time nice -n +19 md5sum *.gz > md5sum.txt
# real 4m30.087s
# create syn.net files for downloads for those organisms which
# used the mafSynNet in the multiz30way
ssh kkstore06
cd /cluster/data/mm9/bed
for DB in rn4 hg18 rheMac2 ponAbe2 panTro2 equCab1 canFam2 bosTau3
do
cd /cluster/data/mm9/bed/blastz.${DB}/axtChain
time nice -n +19 netFilter -syn mm9.${DB}.net.gz \
| gzip -c > mm9.${DB}.syn.net.gz
ls -og mm9.${DB}.syn.net.gz
md5sum mm9.${DB}.syn.net.gz >> md5sum.txt
done
for DB in calJac1 cavPor2 tupBel1 otoGar1 dasNov1 oryCun1 felCat3 \
loxAfr1 eriEur1 sorAra1 echTel1
do
cd /cluster/data/mm9/bed/blastz.${DB}/axtChain
ls -l mm9.${DB}.rbest.net.gz
md5sum mm9.${DB}.rbest.net.gz >> md5sum.txt
md5sum mm9.${DB}.rbest.chain.gz >> md5sum.txt
grep rbest md5sum.txt
done
# create symlinks to make everything show up
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/mm9
# NOTE(review): the DB values in these loops start with the glob character
# "?" and every use of ${DB} is unquoted, so each path is resolved by
# filename expansion where it is used; presumably this lets one pattern
# match both blastz.<db> and a differently capitalised vs<Db> directory -
# confirm
for DB in ?n4 ?g18 ?heMac2 ?onAbe2 ?anTro2 ?quCab1 ?anFam2 ?osTau3
do
# show the syn.net file, link it into the vs* directory, show the link
ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.syn.net.gz
ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.syn.net.gz \
vs${DB}/
ls -Lld vs${DB}/mm9.*.syn.net.gz
done
# rbest.net links for the remaining species
# NOTE(review): the loop after this one repeats the same rbest.net links
# and adds the rbest.chain links; run as written, its ln -s of each net
# finds the link already present
for DB in ?alJac1 ?avPor2 ?upBel1 ?toGar1 ?asNov1 ?ryCun1 ?elCat3 \
?oxAfr1 ?riEur1 ?orAra1 ?chTel1
do
ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz
ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz \
vs${DB}/
ls -Lld vs${DB}/mm9.${DB}.rbest.net.gz
# list any rbest entries in that directory's md5sum.txt
grep rbest vs${DB}/md5sum.txt
done
# rbest.net and rbest.chain links for the same species list
for DB in ?alJac1 ?avPor2 ?upBel1 ?toGar1 ?asNov1 ?ryCun1 ?elCat3 \
?oxAfr1 ?riEur1 ?orAra1 ?chTel1
do
ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz
ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz \
vs${DB}/
ls -Lld vs${DB}/mm9.${DB}.rbest.net.gz
ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.chain.gz
ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.chain.gz \
vs${DB}/
ls -Lld vs${DB}/mm9.${DB}.rbest.chain.gz
grep rbest vs${DB}/md5sum.txt
done
###########################################################################
#
# BUILD miRNA TRACK (DONE - 2007-10-05 - Fan)
# updated data from: Michel.Weber@ibcg.biotoul.fr
# notify them when done.
ssh hgwdev
trackDir=/cluster/data/mm9/bed/miRNA-2007-10-05
mkdir ${trackDir}
cd ${trackDir}
# the miRNAtrack-mm9.txt file from email has to be saved here first;
# every blank in it is turned into a tab to produce the bed file
sed -e 's/ /\t/g' miRNAtrack-mm9.txt > miRNA.tab
hgLoadBed mm9 miRNA miRNA.tab
# Add the miRNA section to makeDb/trackDb/mouse/mm9/trackDb.ra
vi ~/src/hg/makeDb/trackDb/mouse/mm9/trackDb.ra
# check previous release track before update
featureBits mm8 miRNA
#33398 bases of 2567283971 (0.001%) in intersection
featureBits mm9 miRNA
#39718 bases of 2620346127 (0.002%) in intersection
###########################################################################
# RE-BUILD miRNA TRACK (DONE - 2008-05-29 - Fan)
# updated data from: Michel.Weber@ibcg.biotoul.fr
# notify them when done.
ssh hgwdev
cd /cluster/data/mm9/bed
mkdir miRNA-2008-05-28
cd miRNA-2008-05-28
# save the mouse_miRNA_may2008.doc as mouse_miRNA_may2008.txt
# and replace all blanks with tabs.
cp mouse_miRNA_may2008.txt miRNA.tab
hgLoadBed mm9 miRNA miRNA.tab
# check previous release track before update
featureBits mm8 miRNA
#33398 bases of 2567283971 (0.001%) in intersection
featureBits mm9 miRNA
#43236 bases of 2620346127 (0.002%) in intersection
#############################################################################
# N-SCAN gene predictions (nscanGene) - (2006-08-30 markd)
# obtained NSCAN predictions from michael brent's group
# at WUSTL
cd /cluster/data/mm9/bed/nscan/
wget http://mblab.wustl.edu/predictions/mouse/mm9/mm9.gtf
wget http://mblab.wustl.edu/predictions/mouse/mm9/mm9.prot.fa
wget http://mblab.wustl.edu/predictions/mouse/mm9/readme.txt
bzip2 mm9.*
chmod a-w *
mv ardor.wustl.edu/jeltje/mm9/chr_ptx .
rm -rf ardor.wustl.edu
rm chr_*/index.html*
gzip chr_*/*
chmod a-w chr_*/*.gz
# load track
ldHgGene -bin -gtf -genePredExt mm9 nscanGene mm9.gtf.bz2
hgPepPred mm9 generic nscanPep mm9.prot.fa.bz2
rm *.tab
# update trackDb; need a mm9-specific page to describe informants
mouse/mm9/nscanGene.html (copy from hg18 and edit)
mouse/mm9/trackDb.ra
# changed search regex to
termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]
#########################################################################
# CPGISLANDS (DONE - 2007-10-25 - Hiram)
ssh hgwdev
mkdir /cluster/data/mm9/bed/cpgIsland
cd /cluster/data/mm9/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
# gcc readseq.c cpg_lh.c -o cpglh.exe
cd ../..
ln -s hg3rdParty/cpgIslands/cpglh.exe .
# cpglh.exe requires hard-masked (N) .fa's.
# make the hard masked sequences from these soft masked sequences
ssh kkstore06
time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
do
echo "maskOutFa ${CHR} hard ${CHR}.masked"
nice -n +19 maskOutFa ${CHR} hard ${CHR}.masked
done
# about 2 minutes
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
cd /cluster/data/mm9/bed/cpgIsland
time for F in ../../*/chr*.fa.masked
do
FA=${F/*\/}
C=${FA/.fa.masked/}
echo "./cpglh.exe ${FA} > ${C}.cpg"
nice -n +19 ./cpglh.exe ${F} > ${C}.cpg
done > cpglh.out 2>&1 &
# about 3 minutes
# Several chroms have 0 results:
# -rw-rw-r-- 1 0 Oct 25 11:11 chr16_random.cpg
# -rw-rw-r-- 1 0 Oct 25 11:12 chr3_random.cpg
# -rw-rw-r-- 1 0 Oct 25 11:12 chr5_random.cpg
# -rw-rw-r-- 1 0 Oct 25 11:13 chr7_random.cpg
# -rw-rw-r-- 1 0 Oct 25 11:13 chrM.cpg
# -rw-rw-r-- 1 0 Oct 25 11:13 chrX_random.cpg
# -rw-rw-r-- 1 0 Oct 25 11:13 chrY.cpg
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << happy emacs
awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed
ssh hgwdev
cd /cluster/data/mm9/bed/cpgIsland
hgLoadBed mm9 cpgIslandExt -tab \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 15963 elements of size 10
featureBits mm9 cpgIslandExt
# 10496250 bases of 2620346127 (0.401%) in intersection
featureBits mm8 cpgIslandExt
# 10456823 bases of 2567283971 (0.407%) in intersection
featureBits mm7 cpgIslandExt
# 10439328 bases of 2583394090 (0.404%) in intersection
featureBits mm6 cpgIslandExt
# 10432360 bases of 2597150411 (0.402%) in intersection
featureBits mm5 cpgIslandExt
# 10422989 bases of 2615483787 (0.399%) in intersection
featureBits mm4 cpgIsland
# 11109692 bases of 2627444668 (0.423%) in intersection
featureBits mm3 cpgIsland
# 10102968 bases of 2505900260 (0.403%) in intersection
#############################################################################
# LIFTOVER (DROPUNDER) TO MM8 (DONE - 2007-11-05 - Hiram)
ssh kkstore06
screen # use a screen to control this job
# -debug run to create run dir, preview scripts...
doSameSpeciesLiftOver.pl -debug mm9 mm8 \
-ooc /san/sanvol1/scratch/mm9/11.ooc
# Real run:
cd /cluster/data/mm9/bed/blat.mm8.2007-11-05
time nice -n +19 doSameSpeciesLiftOver.pl mm9 mm8 \
-ooc /san/sanvol1/scratch/mm9/11.ooc > do.log 2>&1 &
########################################################################
# ANNOTATE 30-WAY ALIGNMENT WITH QUALITY DATA (2007-11-07 rico at bx.psu.edu)
#
# The basic idea here is to create a qac file which has quality data for each
# (chromosome/scaffold/etc) and then index the qac file. Once this is done,
# mafAddQRows can be used to add the quality data to a given maf. The agp
# files are used so that gaps can be represented in the qac files as a special
# value.
## create .qac and .qdx files for each species in the 30-way alignment
## results are stored in /cluster/store12/rico/quality
o human (hg18)
Unable to find quality data.
o chimp (panTro2)
in.agp = cat /cluster/data/panTro2/wustl/*.agp > all.agp
in.qac = /cluster/data/panTro2/bed/quality/qac/panTro2.qac
qacAddGapIdx in.agp in.qac panTro2.qac panTro2.qdx
o rhesus (rheMac2)
in.agp: /cluster/data/rheMac2/downloads
(cat v1.edit4.chrome.ctgs.final.fix.agp; sed -e 's/^ChrUr/chrUr/' v1.edit4.ChrUr.ctgs.agp ) > all.agp
in.qa = /cluster/data/rheMac2/qual/rheMac2.qual.qv.gz
qaAgpToQacIdx in.agp in.qa rheMac2.qac rheMac2.qdx
o bushbaby (otoGar1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/bushbaby/otoGar1
in.agp = assembly.agp
in.qa = Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx in.agp in.qa otoGar1.qac otoGar1.qdx
o treeshrew (tupBel1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/treeShrew/tupBel1
in.agp = assembly.agp
in.qa = Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx in.agp in.qa tupBel1.qac tupBel1.qdx
o rat (rn4)
in.agp: /cluster/data/rn4
#!/bin/sh
# Build rn4.agp by concatenating the per-chromosome AGP files, in the
# order the chromosomes are listed in chrom.sizes.  Each file is expected
# at <dir>/<chrom>.agp relative to the current directory.
rm -f rn4.agp
for chrom in $(awk '{print $1}' chrom.sizes)
do
    # directory name: drop everything through the first "r", then
    # everything from the first "_" on (chr1_random -> 1_random -> 1)
    num=${chrom#*r}
    num=${num%%_*}
    if [ -f "$num/${chrom}.agp" ]; then
        cat "$num/${chrom}.agp" >> rn4.agp
    else
        # stop at the first gap rather than leave an incomplete rn4.agp
        echo "Missing agp file for $chrom" >&2
        exit 1
    fi
done
in.qa: /cluster/data/rn4/downloads
#!/bin/sh
# Build rn4.qa from the *.qual.gz files in the current directory.
# Each file's first line is replaced by a ">chrom" header, where chrom is
# the file name minus its "Rnor3.4" prefix and ".fa.qual.gz" suffix, with
# every "-" turned into "_".
rm -f rn4.qa
for file in *.qual.gz
do
    # an unmatched glob stays literal; skip it instead of writing a bogus record
    [ -f "$file" ] || continue
    printf 'Processing %s ... ' "$file"
    chrom=$(printf '%s\n' "$file" \
        | sed -e 's/^Rnor3.4//;s/\.fa\.qual\.gz$//' | tr '-' '_')
    # "tail -n +2" drops the original first line; the bare "tail +2"
    # spelling is obsolete and not portable
    (echo ">$chrom" ; gzip -dc "$file" | tail -n +2) >> rn4.qa
    echo "done."
done
qaAgpToQacIdx in.agp in.qa rn4.qac rn4.qdx
o mouse (mm9)
Unable to find quality data.
o guinea pig (cavPor2)
in.agp = /cluster/data/cavPor2/downloads/assembly.agp
in.qa = /cluster/data/cavPor2/downloads/Draft_v2.agp.chromosome.qual.gz
qaAgpToQacIdx in.agp in.qa cavPor2.qac cavPor2.qdx
o rabbit (oryCun1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/rabbit/oryCun1
in.agp = assembly.agp
in.qa = Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx in.agp in.qa oryCun1.qac oryCun1.qdx
o shrew (sorAra1)
in.agp = /cluster/data/sorAra1/downloads/assembly.agp
in.qa = /cluster/data/sorAra1/downloads/Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx in.agp in.qa sorAra1.qac sorAra1.qdx
o hedgehog (eriEur1)
in.agp = /cluster/data/eriEur1/downloads/assembly.agp
in.qa = /cluster/data/eriEur1/downloads/Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx in.agp in.qa eriEur1.qac eriEur1.qdx
o dog (canFam2)
in.agp = /cluster/store9/canFam2/broad/UCSC_Dog2.0.agp
in.qac = /cluster/store9/canFam2/bed/quality/chrom.qac
qacAddGapIdx in.agp in.qac canFam2.qac canFam2.qdx
o cat (felCat3)
in.agp = /cluster/data/felCat3/downloads/assembly.agp
in.qa = /cluster/data/felCat3/downloads/Draft_v3.agp.chromosome.qual.gz
qaAgpToQacIdx in.agp in.qa felCat3.qac felCat3.qdx
o horse (equCab1)
in.agp = /cluster/data/equCab1/downloads/assembly.agp
in.qa = /cluster/data/equCab1/downloads/Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx in.agp in.qa equCab1.qac equCab1.qdx
o cow (bosTau3)
in.agp = /cluster/data/bosTau3/fixup/UCSC.agp
in.qac = /cluster/data/bosTau3/fixup/chrom.qac
qacAddGapIdx in.agp in.qac bosTau3.qac bosTau3.qdx
o armadillo (dasNov1)
/cluster/data/dasNov1/broad
combineQuals assembly.agp.gz assembly.quals.gz combined.quals
qaAgpToQacIdx assembly.agp.gz combined.quals dasNov1.qac dasNov1.qdx
o elephant (loxAfr1)
/cluster/data/loxAfr1/broad
combineQuals assembly.agp assembly.quals.gz combined.quals
qaAgpToQacIdx assembly.agp combined.quals loxAfr1.qac loxAfr1.qdx
o tenrec (echTel1)
/cluster/data/echTel1/broad
combineQuals assembly.agp assembly.quals.gz combined.quals
qaAgpToQacIdx assembly.agp combined.quals echTel1.qac echTel1.qdx
o opossum (monDom4)
/cluster/data/monDom4/broad.mit.edu
in.qa = gzip -dc Monodelphis4.0.agp.chromosome.qual.gz \
| sed -e 's/^>\([^.]*\)\.1-.*/>chr\1/;/^>.*Monodelphis4.0)/d' > monDom4.qa
in.agp = Monodelphis4.0.agp
qaAgpToQacIdx in.agp in.qa monDom4.qac monDom4.qdx
o platypus (ornAna1)
Unable to find quality data.
o chicken (galGal3)
Unable to find quality data.
o lizard (anoCar1)
in.agp = /cluster/data/anoCar1/downloads/assembly.agp
in.qac = /cluster/data/anoCar1/downloads/scaffold.lifted.qac
qacAddGapIdx in.agp in.qac anoCar1.qac anoCar1.qdx
o frog (xenTro2)
Unable to find quality data.
o tetraodon (tetNig1)
Unable to find quality data.
o fugu (fr2)
Unable to find quality data.
o stickleback (gasAcu1)
in.agp = /cluster/data/gasAcu1/downloads/UCSC.gasAcu1.agp
in.qa = /cluster/data/gasAcu1/downloads/UCSC.gasAcu1.qual
qaAgpToQacIdx in.agp in.qa gasAcu1.qac gasAcu1.qdx
o medaka (oryLat1)
in.agp = /cluster/data/oryLat1/downloads/chr.agp.txt-fixed
in.qac = /cluster/data/oryLat1/bed/qual/fixed.chroms.qac
qacAddGapIdx in.agp in.qac oryLat1.qac oryLat1.qdx
o zebrafish (danRer5)
Unable to find quality data.
o orangutan (ponAbe2)
Unable to find quality data.
o marmoset (calJac1)
Unable to find quality data.
## copy all .qac and .qdx files to the san
cp *.{qac,qdx} /san/sanvol1/rico/quality
## create species list (species.lst) containing the following
anoCar1 /san/sanvol1/rico/quality
bosTau3 /san/sanvol1/rico/quality
canFam2 /san/sanvol1/rico/quality
cavPor2 /san/sanvol1/rico/quality
dasNov1 /san/sanvol1/rico/quality
echTel1 /san/sanvol1/rico/quality
equCab1 /san/sanvol1/rico/quality
eriEur1 /san/sanvol1/rico/quality
felCat3 /san/sanvol1/rico/quality
gasAcu1 /san/sanvol1/rico/quality
loxAfr1 /san/sanvol1/rico/quality
monDom4 /san/sanvol1/rico/quality
oryCun1 /san/sanvol1/rico/quality
oryLat1 /san/sanvol1/rico/quality
otoGar1 /san/sanvol1/rico/quality
panTro2 /san/sanvol1/rico/quality
rheMac2 /san/sanvol1/rico/quality
rn4 /san/sanvol1/rico/quality
sorAra1 /san/sanvol1/rico/quality
tupBel1 /san/sanvol1/rico/quality
## the following script will add quality data to each of the mafs
cat > addQData << 'EOF'
#!/bin/sh
INPUT_DIR=/cluster/data/mm9/bed/multiz30way/anno/maf
OUTPUT_DIR=/cluster/data/mm9/bed/multiz30way/qual/maf
for maf in `ls -1Sr ${INPUT_DIR}/*.maf`
do
file=`basename $maf`
mafAddQRows species.lst $maf ${OUTPUT_DIR}/$file
done
'EOF'
# << emacs
#########################################################################
### IGTC (Int'l GeneTrap Consortium) (DONE - 2007-10-01 - angie)
### Doug Stryke <stryke@cgl.ucsf.edu> in Tom Ferrin's lab
### NOTE -- the igtc track is automatically updated on hgwdev by the
### scripts monthlyUpdateIgtc.csh and updateIgtc.pl in
### kent/src/hg/utils/automation/ .
#########################################################################
# Load CCDS (2007-12-12 markd)
# import ccds database as described in ccds.txt
set db=mm9
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap
checkTableCoords ${db} -verbose=2 ccdsGene
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
############################################################################
# Reload CCDS (2008-02-01 markd)
# import ccds database as described in ccds.txt
set db=mm9
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
# << emacs
############################################################################
# dbSNP BUILD 128 (DONE 2/8/08 angie)
# updated snp128ExceptionDesc (tweaked wording) 3/11/08
# Set up build directory
ssh kkstore06
mkdir -p /cluster/store3/dbSNP128/{mouse,shared}
# dbSNP 128 field encodings (*.bcp.gz) were already downloaded --
# see hg18.txt.
########################## DOWNLOAD #############################
cd /cluster/data/dbSNP/128/mouse
mkdir data schema rs_fasta
# Get data from NCBI (anonymous FTP)
wget ftp://ftp.ncbi.nih.gov/snp/00readme.txt
cd /cluster/data/dbSNP/128/mouse/data
alias wg wget --timestamping
set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/database
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
wg $ftpSnpDb/organism_data/b128_SNPContigLoc_37_1.bcp.gz
wg $ftpSnpDb/organism_data/b128_SNPContigLocusId_37_1.bcp.gz
wg $ftpSnpDb/organism_data/b128_ContigInfo_37_1.bcp.gz
# MapInfo has alignment weights
wg $ftpSnpDb/organism_data/b128_SNPMapInfo_37_1.bcp.gz
# SNP has univar_id, validation status and heterozygosity
wg $ftpSnpDb/organism_data/SNP.bcp.gz
# Get schema
cd /cluster/data/dbSNP/128/mouse/schema
wg $ftpSnpDb/organism_schema/mouse_10090_table.sql.gz
# Get fasta files
# using headers of fasta files for molType, class, observed
cd /cluster/data/dbSNP/128/mouse/rs_fasta
wg ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/rs_fasta/\*.gz
########################## LOAD NCBI TABLES #############################
# Simplify names of data files -- strip version & extras to get
# local canonical table names.
cd /cluster/data/dbSNP/128/mouse/data
foreach f (*.bcp.gz)
set new = `echo $f \
| sed -e 's/^b128_SNP//; s/^b128_//; s/_37_1//; s/.bcp//;'`
mv $f $new
echo $new
end
# Extract just the tables that we need from the NCBI MS SQL table
# creation file, and get CREATE statements from
# mouse_10090_table.sql for our 5 tables
cd /cluster/data/dbSNP/128/mouse/schema
zcat mouse_10090_table.sql.gz \
| perl -we '$/ = "\nGO\n\n\n"; \
while (<>) { \
next unless /^CREATE TABLE \[(b128_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_37_1)?\]/; \
s/b128_(SNP)?//; s/_37_1//; \
s/[\[\]]//g; s/GO\n\n\n/;/; s/smalldatetime/datetime/g; \
s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \
s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \
s/(image|varchar\s+\(\d+\))/BLOB/g; \
print; \
}' \
> table.sql
# load on kolossus or a small cluster machine (mysql5 is OK for this).
ssh kkr3u00
hgsql '' -e 'create database mm9snp128'
cd /cluster/data/dbSNP/128/mouse/schema
hgsql mm9snp128 < table.sql
cd ../data
foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
zcat $t.gz \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable mm9snp128 $t placeholder stdin
end
# There were some warnings (many cleared up by the perl substitution)
# but no rows were dropped. I eyeballed a few examples, seemed OK,
# e.g. no value given for a field where NULL is OK.
foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
echo -n "${t}:\t"
hgsql -N -B mm9snp128 -e 'select count(*) from '$t
end
#ContigInfo: 13636
#ContigLoc: 31733892
#ContigLocusId: 12883378
#MapInfo: 28464204
#SNP: 14380527
# compare contig list in mm9.ctgPos vs ContigInfo (for the reference
# strain, not the alts included in ContigInfo)
ssh hgwdev hgsql mm9 -NBe 'select * from ctgPos;' \
| sed -re 's/^(N[A-Z]_[0-9]+)\.[0-9]+/\1/;' \
> ctgPos.tab
awk '{print $1;}' ctgPos.tab | sort > /tmp/1
# Take a look at the group_label values and choose a set that matches
# the reference assembly:
hgsql mm9snp128 -NBe 'select distinct(group_label) from ContigInfo'
# Looks like just the reference strain, C57BL/6J, will do.
hgsql mm9snp128 -NBe 'select contig_acc from ContigInfo \
where group_label = "C57BL/6J"' \
| sort > /tmp/2
diff /tmp/1 /tmp/2
# No diff, good.
# Make sure there are no orient != 0 contigs among those selected.
hgsql mm9snp128 -NBe \
'select count(*) from ContigInfo where orient != 0 and \
group_label = "C57BL/6J";'
#0
#################### EXTRACT INFO FROM NCBI TABLES ####################
mkdir -p /scratch/snp/128/mouse
cd /scratch/snp/128/mouse
time hgsql mm9snp128 -e \
'alter table ContigLoc add index (ctg_id); \
alter table ContigInfo add index (ctg_id);'
#0.002u 0.001s 6:18.71 0.0% 0+0k 0+0io 1pf+0w
time hgsql mm9snp128 -e \
'alter table ContigInfo add index (group_label(9));'
#0.002u 0.002s 0:00.35 0.0% 0+0k 0+0io 1pf+0w
# Since there is only one group_label for mouse, just use snp_id
# as key. If there is more than one group_label to pick up, then
# don't use this as a template -- use hg18.txt.
hgsql mm9snp128 -NBe \
'select snp_id, ContigInfo.contig_acc, asn_from, asn_to, \
loc_type, orientation, allele, phys_pos_from \
from ContigLoc, ContigInfo \
where ContigLoc.ctg_id = ContigInfo.ctg_id and \
ContigInfo.group_label = "C57BL/6J";' \
| sort \
> ucscContigLoc.txt
# took ~7 minutes
# The IDs are non-unique (can be multiply mapped). This is OK if
# everything else that we relate to these uniquely maps to snp_id.
wc -l ucscContigLoc.txt
#16232825 ucscContigLoc.txt
awk '{print $1;}' ucscContigLoc.txt | uniq | wc -l
#14304640
# SNP -> valid, avHet, avHetSE
# SNP has only snp_id as identifier, nothing relating to assembly.
hgsql mm9snp128 -NBe \
'select snp_id, validation_status, avg_heterozygosity, het_se \
from SNP;' \
| sort \
> ucscSNP.txt
# Check ID uniqueness:
wc -l ucscSNP.txt
#14380527 ucscSNP.txt
awk '{print $1;}' ucscSNP.txt | uniq | wc -l
#14380527
# ContigLocusId -> func
# ContigLocusId has only snp_id as an identifier (it gives one
# example contig if the SNP is on multiple contigs).
# The sort options and awk are to convert multiple entries with different
# function classes for the same SNP into one entry per SNP with a list
# of function classes.
hgsql mm9snp128 -NBe \
'select snp_id, fxn_class from ContigLocusId;' \
| sort -u -k1,1 -k2,2n \
| awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \
else { if (prevId) {print prevId "\t" prevFunc;} \
prevFunc = $2 ","; }} \
{prevId = $1;} \
END {print prevId "\t" prevFunc;}' \
> ucscFunc.txt
# Check ID uniqueness:
wc -l ucscFunc.txt
#5878591 ucscFunc.txt
awk '{print $1;}' ucscFunc.txt | sort -u | wc -l
#5878591
# MapInfo -> weight
# MapInfo needs assembly+snp_ids in order to have unique IDs.
time hgsql mm9snp128 -e \
'alter table MapInfo add index (assembly(9));'
#0.000u 0.004s 2:22.64 0.0% 0+0k 0+0io 0pf+0w
hgsql mm9snp128 -NBe \
'select snp_id, weight from MapInfo where assembly = "C57BL/6J";' \
| sort \
> weight.txt
# ~1 minute
# Check ID uniqueness:
wc -l weight.txt
#14304640 weight.txt
awk '{print $1;}' weight.txt | uniq | wc -l
#14304640
awk '{print $2;}' weight.txt | sort -n | uniq -c
#13954580 1
# 113119 2
# 169755 3
# 67186 10
# SNPs w/weight 0 and 10 will be discarded later.
# fasta headers -> observed, molType, class
zcat /cluster/data/dbSNP/128/mouse/rs_fasta/rs_ch*.fas.gz \
| grep '^>gnl' \
| perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
| sort \
> ucscGnl.txt
# ~4 minutes
wc -l ucscGnl.txt
#14380527 ucscGnl.txt
awk '{print $1;}' ucscGnl.txt | uniq | wc -l
#14380527
############### JOIN NCBI COLUMNS TO GET UCSC SNP COLUMNS ################
# Join files by ID.
time join -a 1 -e MISSING -t ' ' ucscContigLoc.txt weight.txt \
> ucscCL+w.txt
#26.811u 4.091s 1:02.59 49.3% 0+0k 0+0io 0pf+0w
wc -l ucscCL+w.txt
#16232825 ucscCL+w.txt
# Same as ucscContigLoc.txt above, good.
# Any missing weights?
grep MISSING ucscCL+w.txt | head
# No output, good.
# Join the files with SNP-only IDs.
time join -e MISSING -t ' ' ucscGnl.txt ucscSNP.txt \
> ucscG+S.txt
#16.591u 1.935s 0:28.44 65.1% 0+0k 0+0io 0pf+0w
wc -l ucscG+S.txt
#14380527 ucscG+S.txt
# Same as ucscSNP.txt and ucscGnl.txt above.
grep MISSING ucscG+S.txt | wc -l
#0
time join -a 1 -e MISSING -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 \
-t ' ' ucscG+S.txt ucscFunc.txt \
> ucscG+S+F.txt
#17.438u 2.115s 0:24.83 78.6% 0+0k 0+0io 0pf+0w
wc -l ucscG+S+F.txt
#14380527 ucscG+S+F.txt
grep MISSING ucscG+S+F.txt | wc -l
#8501936
# Not surprising -- ucscFunc.txt has only 5878591 lines.
expr 14380527 - 5878591
#8501936
# Final join -- treat ContigLoc as authoritative (since it has coords).
# Arrange columns in same order as in the SNP table, with extras for
# checking at the end (phys_pos_from).
# chr chrS chrE name strand refN obs molT cls val aH aHSE fxn locT wt ...
time join -a 1 -e MISSING -t ' ' \
-o '1.2 1.3 1.4 1.1 1.6 1.7 2.2 2.3 2.4 2.5 2.6 2.7 2.8 1.5 1.9 1.8' \
ucscCL+w.txt ucscG+S+F.txt \
> ucscNcbiSnp.ctg.txt
#41.401u 6.045s 1:02.04 76.4% 0+0k 0+0io 0pf+0w
wc -l ucscNcbiSnp.ctg.txt
#16232825 ucscNcbiSnp.ctg.txt
grep MISSING ucscNcbiSnp.ctg.txt | awk '{print $4;}' | uniq | wc -l
#8432812
# a bit less than the 8501936 missing FUNC's above... perhaps some
# of those did not have any mappings in ucscContigLoc.txt.
# Lift the map contig coordinates to chrom coordinates (~2m);
sed -re 's/\t(N[A-Z]_[0-9]+)\.[0-9]+\t/\t\1\t/;' \
/cluster/data/mm9/jkStuff/mm9.contigs.lift > liftContigs.lft
time liftUp ucscNcbiSnp.bed liftContigs.lft warn ucscNcbiSnp.ctg.txt
#131.007u 7.438s 2:26.48 94.5% 0+0k 0+0io 0pf+0w
wc -l ucscNcbiSnp.bed
#16232825 ucscNcbiSnp.bed
# At this point, move back from /scratch to /cluster/data.
nice gzip ucscNcbiSnp.bed
cp -p ucscNcbiSnp.bed.gz /cluster/data/dbSNP/128/mouse/
# Translate NCBI's encoding into UCSC's, and perform a bunch of
# checks. This is where developer involvement is most likely as
# NCBI extends the encodings used in dbSNP.
cd /cluster/data/dbSNP/128/mouse/
gunzip ucscNcbiSnp.bed.gz
time snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/mm9/mm9.2bit \
snp128
#count of snps with weight 0 = 0
#count of snps with weight 1 = 13954580
#count of snps with weight 2 = 226238
#count of snps with weight 3 = 712684
#count of snps with weight 10 = 1339323
#Found no errors.
#162.963u 9.783s 3:02.77 94.5% 0+0k 0+0io 1pf+0w
wc -l snp*
# 14893502 snp128.bed
# 22 snp128.sql
# 0 snp128Errors.bed
# 18 snp128ExceptionDesc.tab
# 1898314 snp128Exceptions.bed
# Make one big fasta file. (note: snp126 skipped chrUn... but it's small
# compared to chr1, chr2 etc.)
# Some of the fasta files have SNPs that were not mapped to the reference
# assembly. Make sure there is no overlap with snp128.bed, and then
# move them out of the way.
zcat rs_fasta/rs_chNotOn.fas.gz \
| perl -we 'while (<>) { \
next unless /^>gnl/; s/^>gnl.dbSNP.(rs\d+).*/$1/; print; }' \
| sort | grep -Fwf - snp128.bed | head
^chNotOn^chAltOnly
# No output from either command -- good.
mkdir rs_fasta/omitted
mv rs_fasta/rs_ch{AltOnly,NotOn}.fas.gz rs_fasta/omitted/
zcat rs_fasta/rs_ch*.fas.gz \
| perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
> snp128.fa
# Check for duplicates.
grep ^\>rs snp128.fa | sort > /scratch/tmp/seqHeaders
wc -l /scratch/tmp/seqHeaders
#14304640 /scratch/tmp/seqHeaders
uniq /scratch/tmp/seqHeaders | wc -l
#14304640
# Use hgLoadSeq to generate .tab output for sequence file offsets,
# and keep only the columns that we need: acc and file_offset.
# Index it and translate to snpSeq table format.
time hgLoadSeq -test placeholder snp128.fa
#42.866u 4.977s 0:48.09 99.4% 0+0k 0+0io 4pf+0w
cut -f 2,6 seq.tab > snp128Seq.tab
rm seq.tab
ssh hgwdev
# Load up main track tables.
cd /cluster/data/dbSNP/128/mouse
time nice hgLoadBed -tab -noSort -onServer -tmpDir=/scratch/tmp \
mm9 snp128 -sqlTable=snp128.sql snp128.bed
#Loaded 14893502 elements of size 17
#67.395u 12.818s 8:43.01 15.3% 0+0k 0+0io 0pf+0w
sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125Exceptions.sql \
> snp128Exceptions.sql
time nice hgLoadBed -tab -onServer -tmpDir=/scratch/tmp \
mm9 snp128Exceptions -sqlTable=snp128Exceptions.sql \
snp128Exceptions.bed
#Loaded 1898314 elements of size 5
#8.925u 1.354s 0:52.66 19.5% 0+0k 0+0io 0pf+0w
sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
> snp128ExceptionDesc.sql
# 3/11/08: reloaded snp128ExceptionDesc (tweaked wording)
hgLoadSqlTab mm9 snp128ExceptionDesc snp128ExceptionDesc.sql \
snp128ExceptionDesc.tab
# Load up sequences.
sed -e 's/snpSeq/snp128Seq/' ~/kent/src/hg/lib/snpSeq.sql \
> snp128Seq.sql
mkdir -p /gbdb/mm9/snp
ln -s /cluster/data/dbSNP/128/mouse/snp128.fa /gbdb/mm9/snp/snp128.fa
time nice hgLoadSqlTab mm9 snp128Seq snp128Seq.sql snp128Seq.tab
#0.000u 0.003s 3:02.66 0.0% 0+0k 0+0io 0pf+0w
# Put in a link where one would expect to find the track build dir...
ln -s /cluster/data/dbSNP/128/mouse /cluster/data/mm9/bed/snp128
#########################################################################
# BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2008-03-11,12 - Hiram)
ssh kkstore06
screen # use a screen to manage this multi-day job
mkdir /cluster/data/mm9/bed/blastzBosTau4.2008-03-11
cd /cluster/data/mm9/bed/blastzBosTau4.2008-03-11
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau4
SEQ2_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit
SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=200
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzBosTau4.2008-03-11
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-syntenicNet > do.log 2>&1 &
# real 460m51.297s
cat fb.mm9.chainBosTau4Link.txt
# 690095394 bases of 2620346127 (26.336%) in intersection
mkdir /cluster/data/bosTau4/bed/blastz.mm9.swap
cd /cluster/data/bosTau4/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzBosTau4.2008-03-11/DEF \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet > swap.log 2>&1 &
# real 117m39.571s
cat fb.bosTau4.chainMm9Link.txt
# 707444627 bases of 2731830700 (25.896%) in intersection
#######################################################################
# BLASTZ/CHAIN/NET Lamprey petMar1 (DONE - 2008-04-14 - Hiram)
ssh kkstore06
screen # use screen to control this job
mkdir /cluster/data/mm9/bed/blastzPetMar1.2008-04-14
cd /cluster/data/mm9/bed/blastzPetMar1.2008-04-14
cat << '_EOF_' > DEF
# Mouse vs. Lamprey
# using the "distant" genome alignment parameters
# see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Mouse
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Lamprey petMar1
SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/scratch/data/petMar1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzPetMar1.2008-04-14
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzPetMar1.2008-04-14/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust -bigClusterHub=pk > do.log 2>&1 &
cat fb.mm9.chainPetMar1Link.txt
# 29113438 bases of 2620346127 (1.111%) in intersection
# That is OK, now for the swap:
mkdir /cluster/data/petMar1/bed/blastz.mm9.swap
cd /cluster/data/petMar1/bed/blastz.mm9.swap
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/mm9/bed/blastzPetMar1.2008-04-14/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust -bigClusterHub=pk > swap.log 2>&1 &
# real 33m29.076s
cat fb.petMar1.chainMm9Link.txt
# 26052507 bases of 831696438 (3.132%) in intersection
#######################################################################
# BLASTZ/CHAIN/NET Lancelet braFlo1 (DONE - 2008-04-14 - Hiram)
ssh kkstore06
screen # use screen to control this job
mkdir /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14
cd /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14
cat << '_EOF_' > DEF
# Mouse vs. Lancelet
# using the "distant" genome alignment parameters
# see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Mouse
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Lancelet braFlo1 - largest chunk big enough for largest scaffold
# Largest scaffold 7,200,735 - 3032 scaffolds + chrM
SEQ2_DIR=/scratch/data/braFlo1/braFlo1.2bit
SEQ2_LEN=/scratch/data/braFlo1/chrom.sizes
SEQ2_CTGDIR=/scratch/data/braFlo1/braFlo1UnScaffolds.2bit
SEQ2_CTGLEN=/scratch/data/braFlo1/braFlo1UnScaffolds.sizes
SEQ2_LIFT=/scratch/data/braFlo1/braFlo1.lift
SEQ2_CHUNK=10000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzBraFlo1.2008-04-14
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzBraFlo1.2008-04-14/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust -bigClusterHub=kk > do.log 2>&1 &
# real 408m36.691s
cat fb.mm9.chainBraFlo1Link.txt
# 26725980 bases of 2620346127 (1.020%) in intersection
# That is OK, now for the swap:
mkdir /cluster/data/braFlo1/bed/blastz.mm9.swap
cd /cluster/data/braFlo1/bed/blastz.mm9.swap
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/mm9/bed/blastzBraFlo1.2008-04-14/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust -bigClusterHub=kk > swap.log 2>&1 &
# real 12m23.402s
cat fb.braFlo1.chainMm9Link.txt
# 31517169 bases of 923355587 (3.413%) in intersection
###########################################################################
# LOAD Transcriptome data (DONE - 2008-05-06 - Hiram)
# data from Christian Iseli 'Christian.Iseli at licr.org'
ssh hgwdev
mkdir /cluster/data/mm9/bed/transcriptome
cd /cluster/data/mm9/bed/transcriptome
# fetch the transcript annotations (GTF) and the txg tarball
wget --timestamping ftp://ftp.licr.org/pub/MTr.gtf.gz
wget --timestamping ftp://ftp.licr.org/pub/txg.tar.gz
# convert the GTF to genePredExt and load it as the transcriptome table;
# the input name must match the downloaded MTr.gtf.gz exactly (case matters)
gtfToGenePred -genePredExt MTr.gtf.gz MTr.gp
hgLoadGenePred mm9 transcriptome -genePredExt MTr.gp
tar xvzf txg.tar.gz
# Do a little data cleanup and transformation and
# load splice graphs into database.
sed 's/altGraphX/sibTxGraph/' ~/kent/src/hg/lib/altGraphX.sql \
> sibTxGraph.sql
cat txg/*.txg | txgToAgx stdin stdout \
| hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql mm9 sibTxGraph stdin
# Loaded 52065 elements of size 18
# Create sibAltEvents track for analysed alt-splices.
cat txg/*.txg \
| txgAnalyze stdin /cluster/data/mm9/mm9.2bit stdout \
| awk '$2 >= 0' | sort | uniq > sibAltEvents.bed
hgLoadBed mm9 sibAltEvents sibAltEvents.bed
#############################################################################
# BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-17 - larrym)
ssh kkstore04
screen # use screen to control this multi-day job
mkdir /cluster/data/mm9/bed/blastz.equCab2.2008-04-15
cd /cluster/data/mm9/bed/blastz.equCab2.2008-04-15
cat << '_EOF_' > DEF
# Mouse vs. Horse
BLASTZ_M=50
# TARGET: Mouse MM9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastz.equCab2.2008-04-15
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab2/blastz.hg18 >>& do.log &
ln -s blastz.equCab2.2008-04-15 /cluster/data/mm9/bed/blastz.equCab2
############################################################################
# Reload CCDS from CCDS.20080502 dump (2008-05-03 markd)
# import ccds database as described in ccds.txt
set db=mm9
set ncbiBld=37.1
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ${ncbiBld} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
#   ccdsGene
#   ccdsInfo
#   ccdsKgMap
# << emacs
############################################################################
# update vega genes to version 31 (v49 of Ensembl genes)
# (DONE - 2008-05-15 - Hiram)
mkdir /cluster/data/mm9/bed/vega31_49
cd /cluster/data/mm9/bed/vega31_49
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/mouse/gtf_file.gz"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/mouse/CHANGELOG.gz"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/mouse/catalog.txt"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/mouse/pep/Mus_musculus.VEGA.apr.pep.tot.fa.gz"
# processing similar to the same processing for Ensembl genes,
# from /cluster/data/mm9/bed/ensGene.49/process/doProcess.csh
cp -p /cluster/data/mm9/bed/ensGene.49/process/randoms.mm9.lift .
# Rewrite the sequence name at the start of each GTF line: a name beginning
# with a digit, X or Y (optionally followed by digits) gets a "chr" prefix,
# and a leading MT becomes chrM.  The result is passed through liftUp with
# randoms.mm9.lift and saved compressed as allGenes.gtf.gz.
zcat gtf_file.gz \
| sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \
| liftUp -type=.gtf stdout randoms.mm9.lift carry stdin \
| gzip > allGenes.gtf.gz
# Convert the lifted GTF to extended genePred; the -infoOut side file
# (infoOut.txt) is consumed by extractGtf.pl below.
gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \
| gzip > mm9.allGenes.gp.gz
# Derive ensGtp.tab from infoOut.txt.
/cluster/home/hiram/kent/src/hg/utils/automation/extractGtf.pl \
infoOut.txt > ensGtp.tab
genePredCheck -db=mm9 mm9.allGenes.gp.gz
# checked: 54208 failed: 0
zcat allGenes.gtf.gz | grep -i pseudo > pseudo.gtf
zcat allGenes.gtf.gz | grep -v -i pseudo > not.pseudo.gtf
gtfToGenePred -genePredExt pseudo.gtf pseudo.gp
gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp
genePredCheck -db=mm9 pseudo.gp
# checked: 3989 failed: 0
genePredCheck -db=mm9 not.pseudo.gp
# checked: 50219 failed: 0
hgLoadGenePred -genePredExt mm9 vegaGene not.pseudo.gp
hgLoadGenePred -genePredExt mm9 vegaPseudoGene pseudo.gp
############################################################################
# BLASTZ/CHAIN/NET 2X Ground squirrel: speTri0 (In progress 2008-05-16 kate)
ssh kkstore06
cd /cluster/data/mm9/bed
mkdir blastzSpeTri0.2008-05-16
cd blastzSpeTri0.2008-05-16
cat << '_EOF_' > DEF
# Mouse vs. Ground squirrel
BLASTZ_M=50
# TARGET: Mouse MM9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Ground squirrel speTri0
SEQ2_DIR=/scratch/data/speTri0/speTri0.2bit
SEQ2_LEN=/cluster/data/speTri0/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=500
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzSpeTri0.2008-05-16
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium >& do.log &
ln -s blastzSpeTri0.2008-05-16 /cluster/data/mm9/bed/blastz.speTri0
# create reciprocal best chains/nets
ssh hgwdev
cd /cluster/data/mm9/bed/blastz.speTri0
/cluster/bin/scripts/doRecipBest.pl mm9 speTri0 >&! rbest.log &
# failed coverage check, shouldn't be fatal ?
# resume creating axt's and maf's
# use axtChain/doRecipBest.csh to create resume.csh
ssh kkstore06
cd /cluster/data/mm9/bed/blastz.speTri0/axtChain
csh resume.csh >&! resume.log &
ssh hgwdev
cd /cluster/data/mm9/bed/blastz.speTri0
featureBits mm9 chainSpeTri0Link > fb.mm9.chainSpeTri0Link.txt
cat fb.mm9.chainSpeTri0Link.txt
# 673393210 bases of 2620346127 (25.699%) in intersection
#################
# Rodent multiz (mouse, guinea pig, ground squirrel)
# for Jurgen Schmitz (2008-06-07 kate)
# Redo with unfiltered net mafs, to maximize squirrel sequence
ssh kkstore06
mkdir /cluster/data/mm9/bed/multiz3way
cd /cluster/data/mm9/bed/multiz3way
mkdir mafLinks
mkdir mafLinks/cavPor3
cd mafLinks/cavPor3
# high quality mammalian genome, so use syntenic net
ln -s ../../../blastz.cavPor3/mafSynNet/*.maf.gz .
# Link the speTri0 pairwise MAFs into mafLinks/speTri0.
mkdir ../speTri0
cd ../speTri0
# low coverage genome, so use reciprocal best
#ln -s ../../../blastz.speTri0/mafRBestNet/*.maf.gz .
# redo with unfiltered, to get more squirrel sequence
# NOTE(review): "maftNet" looks like a typo for "mafNet" -- confirm the
# directory name under blastz.speTri0 before re-running this step.
ln -s ../../../blastz.speTri0/maftNet/*.maf.gz .
# Copy MAFs to kluster-friendly disk
mkdir -p /san/sanvol1/scratch/mm9/multiz3way
cd /san/sanvol1/scratch/mm9/multiz3way
rsync -a --copy-links --progress \
/cluster/data/mm9/bed/multiz3way/mafLinks/ .
# get latest PSU utilities
mkdir penn
set p=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $p/{autoMZ,multiz,maf_project} penn
# the autoMultiz cluster run
ssh pk
cd /cluster/data/mm9/bed/multiz3way
# create species list and stripped down tree for autoMZ
cat > tree.nh << 'EOF'
((mm9 cavPor3) speTri0)
'EOF'
cat > species.lst << 'EOF'
mm9 cavPor3 speTri0
'EOF'
mkdir run maf
cd run
cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = mm9
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/$db/multiz3way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz3way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz
cat << '_EOF_' > template
#LOOP
./autoMultiz $(root1) {check out line+ /cluster/data/mm9/bed/multiz3way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
awk '{print $1}' /cluster/data/mm9/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
# 35 jobs
para try
para check
#Completed: 35 of 35 jobs
#CPU time in finished jobs: 6086s 101.43m 1.69h 0.07d 0.000 y
#IO & Wait Time: 240s 4.00m 0.07h 0.00d 0.000 y
#Average job time: 181s 3.01m 0.05h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 502s 8.37m 0.14h 0.01d
#Submission to last job: 506s 8.43m 0.14h 0.01d
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/mm9
mkdir multizRodent3way
cd multizRodent3way
ln -s /cluster/data/mm9/bed/multiz3way/maf .
cat > README.txt << 'EOF'
This directory contains multiple alignments of 2 rodent genome
assemblies to the mouse genome (mm9, Mar. 2006):
_ guinea pig Cavia porcellus Feb. 2008, cavPor3
_ ground squirrel Spermophilus tridecemlineatus Jun. 2006, speTri0
'EOF'
# << emacs
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
# Vertebrate-wide transMap alignments were built.  Tracks are created and
# loaded by a single Makefile.  This is available from:
#   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
# See doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
# Vertebrate-wide transMap alignments were built.  Tracks are created and
# loaded by a single Makefile.  This is available from:
#   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
# See doc/builds.txt for specific details.
############################################################################
#########################################################################
# ORegAnno - Open Regulatory Annotations
# loaded July 7, 2008
# updated Sept 29, 2008
# loaded by Belinda Giardine, in same manner as hg18 ORegAnno track
############################################################################
# JAX/MGI TRACKS (DONE 8/20/09 angie)
# Previously done 6/11/09 in /hive/data/genomes/mm9/bed/jax/2009_06 (pushed)
# Previously done 4/24/09 in /hive/data/genomes/mm9/bed/jax/2009_04 (not pushed)
# Previously done 9/24/08 in /cluster/data/mm9/bed/jax/2008_09
mkdir -p /hive/data/genomes/mm9/bed/jax/2009_08
cd /hive/data/genomes/mm9/bed/jax/2009_08
wget ftp://ftp.informatics.jax.org/pub/gbrowse/\*
wget ftp://ftp.informatics.jax.org/pub/reports/MGI_PhenotypicAllele.rpt
# Jax Rep Transcript track
# SEQ_RepTransGenomic_rpt.gff --> jaxRepTranscript{,Alias}
# -- names like AK016604_4933401J01Rik, NM_001011874_AY534250
# -- aliases ~ MGI:\d+
# Use simple perl script to uniquify transcript names and make alias.tab.
# (Copied /hive/data/genomes/mm8/bed/jax/2007_07/parseRepTranscript.pl and
# modified to tweak a regex for tweaked name NR_027008_Gt(ROSA)26Sor_1)
../2009_06/parseRepTranscript.pl SEQ_RepTransGenomic_rpt.gff \
| sed -e 's/^/chr/; s/chrMT/chrM/;' \
> jaxRepTranscript.gff
# Jax Allele track
# AL_*.gff --> jaxAllele{,Info}
# -- bed12Source -- add type from filename
# -- names like NM_011283_Rp1h<tm1Jnz>, XM_129721_Slc9a2<tm1Ges>
# -- Info: name, mgiID, source {"Gene trapped", ...}
rm -f jaxAllele.bed jaxAlleleInfo.tab fixJaxAllele.sql
foreach f (AL*.gff)
set type = `echo $f:t:r \
| sed -e 's/AL_//; s/GTRAP/GeneTrapped/; s/IND/Induced/; \
s/OTHER/Other/; s/SPON/Spontaneous/; s/TARG/Targeted/; \
s/TRANS/Transgenic/;'`
/hive/data/genomes/mm8/bed/jax//2007_09/parseAllele.pl $f \
| ldHgGene mm9 placeholder stdin -nobin -out=stdout \
| /cluster/bin/scripts/genePredToBed \
| sed -e 's/^/chr/; s/$/'"\t$type"'/;' \
>> jaxAllele.bed
end
# This round's formatting inconsistencies:
#source not given for NM_010230_Fmn1<ld-Is(17_In2)1Gso>
#source not given for NM_015770_a<jIs(17_In2)1Gso>
#source not given for NM_009521_Wnt3<In(11Trp53_11Wnt3)8Brd>
#source not given for NM_001127233_Trp53<In(11Trp53_11Wnt3)8Brd>
#source not given for NM_029931_Mllt3<T(4Mllt3_9Mll)1Thr>
# Jax Phenotype track
# MP_*.gff --> jaxPhenotype{,Alias}
# -- bed12Source -- add type from filename
# -- names like NM_001001488_Atp8b1
rm -f jaxPhenotype.bed jaxPhenotypeAlias.tab fixJaxPhenotype.sql
# For each MP_*.gff file, derive a display name for the phenotype class from
# the file name (strip the MP_<digits>_ prefix, CamelCase the rest, then
# shorten known names), echo it, and append the file's items to
# jaxPhenotype.bed with that name added as a trailing tab-separated column.
# The final m// lists the class names that are accepted unchanged.  It is
# grouped and anchored on both ends so that an unrecognized name reaches the
# die; the earlier ungrouped pattern ended in a bare "|$" alternative, which
# matches any input, so the die could never fire.
foreach f (MP_*.gff)
set type = `echo $f:t:r \
| perl -wpe 's/MP_[0-9]*_//; s/[_-](\w)/\u$1/g; s/^(\w)/\u$1/; \
s@AdiposeTissue@Adipose@ || \
s@BehaviorNeurological@Behavior@ || \
s@CardiovascularSystem@Cardiovascular@ || \
s@DigestiveAlimentary@Digestive@ || \
s@EndocrineExocrineGland@Gland@ || \
s@GrowthSize@Growth Size@ || \
s@HearingEar@Hearing/Ear@ || \
s@HematopoieticSystem@Hematopoietic@ || \
s@HomeostasisMetabolism@Homeostasis@ || \
s@ImmuneSystem@Immune@ || \
s@LethalityEmbryonicPerinatal@Embryonic Lethal@ || \
s@LethalityPostnatal@Postnatal Lethal@ || \
s@LifeSpanPostWeaningAging@Life Span@ || \
s@LimbsDigitsTail@Limbs and Tail@ || \
s@LiverBiliarySystem@Liver and Bile@ || \
s@NervousSystem@Nervous System@ || \
s@RenalUrinarySystem@Renal/Urinary@ || \
s@ReproductiveSystem@Reproductive@ || \
s@RespiratorySystem@Respiratory@ || \
s@SkinCoatNails@Skin/Coat/Nails@ || \
s@TasteOlfaction@Taste/Smell@ || \
s@TouchVibrissae@Touch@ || \
s@Tumorigenesis@Tumorigenesis@ || \
s@VisionEye@Vision/Eye@ || \
m/^(Craniofacial|Cellular|Embryogenesis|Muscle|Normal|Other|Pigmentation|Skeleton)$/ || \
die "Unrec $_";'`
echo $type
/hive/data/genomes/mm8/bed/jax/2006_10/parsePhenotype.pl $f \
| ldHgGene mm9 placeholder stdin -nobin -out=stdout \
| /cluster/bin/scripts/genePredToBed \
| sed -e 's/^/chr/; s@$@'"\t$type"'@;' \
>> jaxPhenotype.bed
end
sort -u jaxPhenotypeAlias.tab > tmp
mv tmp jaxPhenotypeAlias.tab
# Jax QTL track
# QTL*.gff --> jaxQtl2 (or 3?)... but we're missing MIT SSLP marker
# and CM distance for 2, or those plus flanking markers for 3...
cmp MGI_QTL.gff ../2009_06/MGI_QTL.gff
# No output, so skip this part:
# Disabled on this round because MGI_QTL.gff is unchanged from 2009_06.
# csh needs "then" for a multi-line if ... endif; without it the guard is
# malformed and the perl below is not actually skipped.
# When enabled, the perl turns each MGI_QTL.gff line into a jaxQtl.bed row:
# start/end are swapped if reversed, start is decremented unless already 0,
# and name, MGI id and note text are pulled from the attribute column.
if (0) then
perl -wpe 'chomp; s/\s*$//; \
($c, undef, undef, $start, $end, undef, $strand, undef, $info) = \
split("\t"); \
if ($info =~ /QTL (\S+); Dbxref "(MGI:\d+)"; Alias .*; Note "([^"]+)"/) { \
($name, $mgiID, $desc) = ($1, $2, $3); \
} else { die "parse\n$info"; } \
if ($start > $end) { $tmp = $end; $end = $start; $start = $tmp; } \
$start-- unless $start == 0; \
s/^.*$/chr$c\t$start\t$end\t$name\t1000\t$strand\t\t$mgiID\t$desc\t0.0\n/;' \
MGI_QTL.gff > jaxQtl.bed
endif
# Extract phenotype-allele relationships:
# Make a file for the one code not already in a filename:
cp /dev/null MP_0003012_no_phenotypic_analysis
# Wrote a script to extract the phenotype-allele relationships --
# it uses the filenames to map MP:* codes to our phenotype names.
/hive/data/genomes/mm8/bed/jax/2007_07/parsePhenotypicAllele.pl \
MGI_PhenotypicAllele.rpt > jaxAllelePheno.tab
# The file "err" has messages about missing data (no gene name in
# PhenotypicAllele.rpt, or gene/mgiId not found in jaxAlleleInfo).
# Load tables
# jaxRepTranscript
ldHgGene mm9 jaxRepTranscript jaxRepTranscript.gff
#35505 gene predictions
hgsql mm9 < fixJaxRepTranscript.sql
hgLoadSqlTab mm9 jaxRepTranscriptAlias \
~/kent/src/hg/lib/genericAlias.sql jaxRepTranscriptAlias.tab
checkTableCoords mm9 jaxRepTranscript
# jaxAllele
hgLoadBed -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed12Source.sql \
mm9 jaxAllele jaxAllele.bed
#Loaded 15904 elements of size 13
# fixJaxAllele.sql is empty so don't need to do this:
# hgsql mm9 < fixJaxAllele.sql
hgLoadSqlTab mm9 jaxAlleleInfo \
~/kent/src/hg/lib/jaxAlleleInfo.sql jaxAlleleInfo.tab
# jaxPhenotype
hgLoadBed -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed12Source.sql \
-tab mm9 jaxPhenotype jaxPhenotype.bed
#Loaded 32922 elements of size 13
# fixJaxPhenotype.sql is empty so don't need to do this:
# hgsql mm9 < fixJaxPhenotype.sql
hgLoadSqlTab mm9 jaxPhenotypeAlias \
~/kent/src/hg/lib/genericAlias.sql jaxPhenotypeAlias.tab
# jaxQtl
cmp MGI_QTL.gff ../2009_06/MGI_QTL.gff
# No output ==> no data change, skip the following lines:
# hgLoadBed -tab -notItemRgb -noBin \
# -sqlTable=$HOME/kent/src/hg/lib/jaxQtl.sql \
# mm9 jaxQtl jaxQtl.bed
checkTableCoords -verbose=2 mm9 jaxQtl
# No output, good.
# phenotype-allele relationships
hgLoadSqlTab mm9 jaxAllelePheno \
~/kent/src/hg/lib/jaxAllelePheno.sql jaxAllelePheno.tab
# Check joiner:
runJoiner.csh mm9 jaxRepTranscript
runJoiner.csh mm9 jaxAllele
runJoiner.csh mm9 jaxPhenotype
############################################################################
# WOLD RNA-seq
#
# wig files: bed format, 25mers
ave mm9Brain.wig
#min=1, max=12989, median=6
#7.4M reads
woldRnaSeqBrain
##########################################################################
# Fix equCab2 nets and chains to remove duplicate scaffold_34 (DONE - 2008-08-19 - larrym)
# Database argument is mm9: the tables reported below are the mm9
# chr*_chainEquCab2 tables (chr1-19, X, Y, M and their _random chroms) plus
# netEquCab2.  This line was previously recorded with hg18.
fixChainNetEquCab2 mm9
deleted: 3100 from chr1_chainEquCab2
deleted: 7362 from chr10_chainEquCab2
deleted: 8472 from chr11_chainEquCab2
deleted: 1078 from chr12_chainEquCab2
deleted: 2227 from chr13_chainEquCab2
deleted: 2 from chr13_random_chainEquCab2
deleted: 3605 from chr14_chainEquCab2
deleted: 6773 from chr15_chainEquCab2
deleted: 3400 from chr16_chainEquCab2
deleted: 0 from chr16_random_chainEquCab2
deleted: 3741 from chr17_chainEquCab2
deleted: 3 from chr17_random_chainEquCab2
deleted: 334 from chr18_chainEquCab2
deleted: 5620 from chr19_chainEquCab2
deleted: 5 from chr1_random_chainEquCab2
deleted: 23003 from chr2_chainEquCab2
deleted: 1265 from chr3_chainEquCab2
deleted: 0 from chr3_random_chainEquCab2
deleted: 2567 from chr4_chainEquCab2
deleted: 0 from chr4_random_chainEquCab2
deleted: 967 from chr5_chainEquCab2
deleted: 0 from chr5_random_chainEquCab2
deleted: 3419 from chr6_chainEquCab2
deleted: 10493 from chr7_chainEquCab2
deleted: 0 from chr7_random_chainEquCab2
deleted: 1284 from chr8_chainEquCab2
deleted: 1 from chr8_random_chainEquCab2
deleted: 10185 from chr9_chainEquCab2
deleted: 1 from chr9_random_chainEquCab2
deleted: 4 from chrM_chainEquCab2
deleted: 8 from chrUn_random_chainEquCab2
deleted: 1585 from chrX_chainEquCab2
deleted: 3 from chrX_random_chainEquCab2
deleted: 19 from chrY_chainEquCab2
deleted: 70 from chrY_random_chainEquCab2
deleted: 18173 from netEquCab2
#########################################################################
# BLASTZ/CHAIN/NET oryLat2 (DONE - 2008-08-25,27 - Hiram)
ssh kkstore06
screen # use a screen to manage this longish running job
mkdir /cluster/data/mm9/bed/blastzOryLat2.2008-08-25
cd /cluster/data/mm9/bed/blastzOryLat2.2008-08-25
cat << '_EOF_' > DEF
# Mouse vs. Medaka
BLASTZ=/cluster/bin/penn/x86_64/lastz
# typical parameters for a genome that is distant from human
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Mouse mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp)
SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/blastzOryLat2.2008-08-25
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl `pwd`/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 > do.log 2>&1 &
# real 124m28.816s
# problems with memk today, continuing:
time doBlastzChainNet.pl `pwd`/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-continue=cat -qRepeats=windowmaskerSdust \
-smallClusterHub=pk -bigClusterHub=pk -verbose=2 > cat.log 2>&1 &
# the kluster is acting up, took several attempts to get one of the
# simple cat jobs done, not sure why it was having trouble, continuing:
time doBlastzChainNet.pl `pwd`/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-continue=chainRun -qRepeats=windowmaskerSdust \
-smallClusterHub=pk -bigClusterHub=pk -verbose=2 > chainRun.log 2>&1 &
time doBlastzChainNet.pl `pwd`/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -continue=chainMerge -qRepeats=windowmaskerSdust \
-smallClusterHub=pk -bigClusterHub=pk -verbose=2 > chainMerge.log 2>&1 &
# real 14m58.355s
cat fb.mm9.chainOryLat2Link.txt
# 50975949 bases of 2620346127 (1.945%) in intersection
cd /cluster/data/mm9/bed
ln -s blastzOryLat2.2008-08-25 blastz.oryLat2
# That is OK, now for the swap:
mkdir /cluster/data/oryLat2/bed/blastz.mm9.swap
cd /cluster/data/oryLat2/bed/blastz.mm9.swap
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/mm9/bed/blastzOryLat2.2008-08-25/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust \
-smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
# real 15m26.642s
cat fb.oryLat2.chainMm9Link.txt
# 45837267 bases of 700386597 (6.545%) in intersection
#######################################
# Wold RNA-seq data (Done Jul 30 mikep)
#
df .
#Filesystem 1K-blocks Used Available Use% Mounted on
#kkstore06-10:/export/cluster/store4
# 2402304448 2183573728 96700640 96% /cluster/store4
ssh kkstore06
cd /cluster/store4/mm9/bed/woldRnaSeq/
# naming convention: woldRnaSeq (Signal) Tissue Replicate
# rename input wigs to convention
mv mm9Brain.wig woldRnaSeqSignalBrain1.wigbed
mv mm9Brain2.wig woldRnaSeqSignalBrain2.wigbed
mv mm9Liver.wig woldRnaSeqSignalLiver1.wigbed
mv mm9Liver2.wig woldRnaSeqSignalLiver2.wigbed
mv mm9Muscle.wig woldRnaSeqSignalMuscle1.wigbed
mv mm9Muscle2.wig woldRnaSeqSignalMuscle2.wigbed
# wigEncode it all
# Encode every tissue/replicate signal file into its .wig/.wib pair.
for sample in Brain1 Brain2 Liver1 Liver2 Muscle1 Muscle2
do
base=woldRnaSeqSignal${sample}
wigEncode ${base}.wigbed ${base}.wig ${base}.wib
done
#Converted woldRnaSeqSignalBrain1.wigbed, upper limit 12989.00, lower limit 1.00
#Converted woldRnaSeqSignalBrain2.wigbed, upper limit 1482.24, lower limit 0.04
#Converted woldRnaSeqSignalLiver1.wigbed, upper limit 44652.00, lower limit 1.00
#Converted woldRnaSeqSignalLiver2.wigbed, upper limit 2567.53, lower limit 0.06
#Converted woldRnaSeqSignalMuscle1.wigbed, upper limit 60949.00, lower limit 1.00
#Converted woldRnaSeqSignalMuscle2.wigbed, upper limit 2726.96, lower limit 0.06
# Load on hgwdev
ssh hgwdev
# Link each .wib into /gbdb/mm9/wib/ and load the matching .wig as a table
# named after the tissue and replicate.
for sample in Brain1 Brain2 Liver1 Liver2 Muscle1 Muscle2
do
track=woldRnaSeqSignal${sample}
ln -s /cluster/data/mm9/bed/woldRnaSeq/${track}.wib /gbdb/mm9/wib/
hgLoadWiggle mm9 ${track} ${track}.wig
done
rm wiggle.tab
# do the beds
# Unpack every bed tarball under data/.  The archive name is quoted so a
# path containing whitespace stays one argument, and an unmatched glob
# (left as the literal pattern) is skipped instead of being handed to tar.
for F in data/*beds*tgz
do
[ -e "$F" ] || continue
echo "untaring $F"
tar zxvf "$F"
done
# How many records in the beds?
wc -l *bed
# 8868804 mm9Brain1.multi.bed
# 856281 mm9Brain1.splices.bed
# 14488584 mm9Brain1.uniqs.bed
# 16180919 mm9Brain2.multi.bed
# 54100 mm9Brain2.spike.bed
# 1570776 mm9Brain2.splices.bed
# 26519333 mm9Brain2.uniqs.bed
# 12794917 mm9Liver1.multi.bed
# 1030969 mm9Liver1.splices.bed
# 13133048 mm9Liver1.uniqs.bed
# 17783124 mm9Liver2.multi.bed
# 414618 mm9Liver2.spike.bed
# 1372984 mm9Liver2.splices.bed
# 17673014 mm9Liver2.uniqs.bed
# 12048985 mm9Muscle1.multi.bed
# 1150895 mm9Muscle1.splices.bed
# 13936012 mm9Muscle1.uniqs.bed
# 16033642 mm9Muscle2.multi.bed
# 589787 mm9Muscle2.spike.bed
# 1347749 mm9Muscle2.splices.bed
# 16632816 mm9Muscle2.uniqs.bed
# 194481357 total
# Just do the splices ones
# For each tissue/replicate: drop "track" lines from the splices BED, rewrite
# it as 12 columns (columns 7-8 are copies of columns 2-3, column 9 is 0),
# and load the result as its own table.
for sample in Brain1 Brain2 Liver1 Liver2 Muscle1 Muscle2
do
table=woldRnaSeqSplices${sample}
egrep -v "^track" mm9${sample}.splices.bed \
| gawk -v OFS="\t" '{print $1,$2,$3,$4,$5,$6,$2,$3,0,$10,$11,$12}' \
> ${table}.bed
hgLoadBed mm9 ${table} ${table}.bed
done
rm bed.tab
#########################################################################
# KOMP/IKMC (KNOCKOUT MOUSE PROJECT became Int'l Knockout Mouse Cons) (DONE 12/8/09 angie)
# done 7/24/09 w/files emailed from Carol 7/24
# done 5/7/09 w/files emailed from Carol Bult 5/7
# done 2/12/09 w/files emailed from Carol Bult 2/12
# done 10/21/08 w/files emailed from Carol Bult 10/18
ssh hgwdev
mkdir -p /hive/data/genomes/mm9/bed/komp/2009_12
cd /hive/data/genomes/mm9/bed/komp/2009_12
# Save files emailed from Carol Bult 12/7 as
# 20091204_ikmc.gff.gz
# Make bed12 with itemRgb:
# Build komp.bed from the IKMC GFF in two passes inside one perl script.
# Pass 1 (while loop): split each tab-separated line into chrom (col 1),
#   center (col 3), start/end (cols 4-5), id (col 9), color name (col 10) and
#   gene name (col 11).  Lines with an empty start are warned about and
#   skipped.  Yellow/Green/Blue are mapped to RGB triples (anything else to
#   0,0,0), start is decremented, and the design id is taken from the second
#   "; "-separated field of the id column (die if it does not parse).  Blocks
#   are grouped under the key chrom|center|name_designId; blocks with
#   end <= 0 are dropped.
# Pass 2 (foreach loop): per group, sort blocks by start; chromStart is the
#   first block's start, chromEnd is the end of the last block in that order,
#   and the item color is the first block's color (die if blocks disagree).
#   Emits 12 columns with score 0, strand "." and both thick coordinates set
#   to chromStart.
# The output is sorted by chrom, then numerically by start.
zcat 20091204_ikmc.gff.gz \
| perl -we \
'while (<>) { \
s/\r?\n$//; \
($chr, undef, $ctr, $s, $e, undef, undef, undef, $id, $col, $n) = split("\t"); \
if ($s eq "") { warn "$_\n"; s/^.*//; next; } # Some lines have no coords. \
$col = ($col eq "Yellow") ? "255,215,0" : \
($col eq "Green") ? "0,240,0" : \
($col eq "Blue") ? "0,0,200" : "0,0,0"; \
$s--; \
$id =~ s/^MGI:\d+; (\w+); .*/$1/ || die "Cant parse id \"$id\""; \
my $geneId = join("|", $chr, $ctr, "${n}_$id"); \
push @{$geneBlks{$geneId}}, [$s, $e, $col] unless $e <= 0; \
} \
warn "Got " . scalar(keys %geneBlks) . " genes.\n"; \
foreach my $geneId (keys %geneBlks) { \
my @blks = @{$geneBlks{$geneId}}; \
my ($chrom, $center, $name) = split(/\|/, $geneId); \
my $blkCount = @blks; \
@blks = sort {$a->[0] <=> $b->[0]} @blks; \
my $chromStart = $blks[0]->[0]; \
my $chromEnd = $blks[$blkCount-1]->[1]; \
my $color = $blks[0]->[2]; \
my $blkStarts = ""; \
my $blkSizes = ""; \
foreach my $blk (@blks) { \
my ($start, $end, $col) = @{$blk}; \
$blkStarts .= ($start - $chromStart) . ","; \
$blkSizes .= ($end - $start) . ","; \
if ($col ne $color) { die "Blocks of $geneId of colors $color and $col"; } \
} \
print join("\t", $chrom, $chromStart, $chromEnd, $name, 0, ".", $chromStart, \
$chromStart, $color, $blkCount, $blkSizes, $blkStarts) . "\n"; \
}' \
| sort -k 1,1 -k 2n,2n > komp.bed
#Got 36359 genes.
# No stderr empty-coord warnings this time (no unmapped items).
# Make an alias-style table with associated info (MGI ID and status):
zcat 20091204_ikmc.gff.gz \
| perl -wpe 's/\r?\n$//; @w = split("\t"); \
if ($w[3] eq "") { s/^.*//; next; } # Some lines have no coords. \
if ($w[4] <= 0) { s/^.*//; next; } # A few lines have end=0. \
$w[8] =~ m/^(MGI:\d+); (\w+); (\w.*)/ || die; \
($mgi, $designId, $status) = ($1, $2, $3); \
$_ = "$w[10]_$designId\t$mgi,$w[2],$status\n";' \
| sort -u > kompExtra.tab
wc -l kompExtra.tab
#36359 kompExtra.tab
# Load 'em up:
hgLoadBed mm9 komp komp.bed
#Loaded 32185 elements of size 12
hgLoadSqlTab mm9 kompExtra $HOME/kent/src/hg/lib/genericAlias.sql kompExtra.tab
checkTableCoords -verbose=2 mm9 komp
#mm9.komp item Tekt3_41479 chr11:62887195-62896116: blocks 3 and 4 overlap.
#mm9.komp item Tekt3_41478 chr11:62887195-62896116: blocks 3 and 4 overlap.
#mm9.komp item Tekt3_41477 chr11:62887195-62896116: blocks 3 and 4 overlap.
#mm9.komp item Tekt3_41476 chr11:62887195-62896116: blocks 3 and 4 overlap.
#mm9.komp item Cntn5_44827 chr9:10008998-10019351: blocks 1 and 2 overlap.
# Carol talked to the Sanger folks about those... pls waive.
# Note from July '09: Carol noticed some very long items and is asking
# Sanger about them. Here's how to check it ourselves next time:
hgsql mm9 -e 'select name, (chromEnd-chromStart) as length from komp \
where chromEnd - chromStart > 1000000 order by length desc;'
#+----------------------+----------+
#| name | length |
#+----------------------+----------+
#| Ankrd22_67616 | 51920750 |
#| Ptprd_VG12763 | 2270723 |
#| Macrod2_VG12650 | 1997658 |
#| A430089I19Rik_71812 | 1814706 |
#| 1700049E17Rik2_68957 | 1596021 |
#| Pcdh15_VG15967 | 1550393 |
#| Gpc5_VG15750 | 1431812 |
#| Lrrc4c_VG10110 | 1313498 |
#| Agbl4_VG16439 | 1266664 |
#| Prkg1_VG15918 | 1197272 |
#| Ptprt_VG10147 | 1139158 |
#| Ccl21b_67667 | 1019106 |
#+----------------------+----------+
runJoiner.csh mm9 komp
# mm9.kompExtra.name - hits 36359 of 36359 ok
#########################################################################
### Affy MOE430 version 2 (DONE - 2008-09-25,10-02 - Hiram)
# Align probes from MOE430v2 chip.
# Data was picked up manually from the Affymetrix WEB site
# while logged in to the Affymetrix system, from the page:
# http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430-20
# found links to the following files:
# -rw-r--r-- 1 51429336 Dec 1 2003 Mouse430_2.probe_fasta
# -rw-r--r-- 1 163849 Dec 2 2003 Mouse430_2_control
# -rw-r--r-- 1 89662619 Dec 2 2003 Mouse430_2.consensus
# -rw-r--r-- 1 30999528 Dec 2 2003 Mouse430_2.target
# -rw-r--r-- 1 24828845 Jun 12 2006 Mouse430_2.link.psl
# -rw-r--r-- 1 119301329 Aug 18 2006 Mouse430_2_ortholog.csv
# -rw-rw-rw- 1 95467111 Jul 7 22:05 Mouse430_2.na26.annot.csv
# -rw-r--r-- 1 3188 Jul 8 13:23 3prime-IVT.AFFX_README.NetAffx-CSV-Files.txt
# placed into: /hive/data/genomes/mm9/bed/affyMOE430v2/affyData
# The GNF folks pointed to data available at:
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE10246
ssh memk
# cat ../affyData/Mouse430_2.probe_fasta \
# | sed -e "s/probe:Mouse430_2:/MOE320v2_/; s/:.*//" > MOE430v2_probes.fa
# cat ../affyData/Mouse430_2.target \
# | sed -e "s/target:Mouse430_2:/MOE320v2_/; s/;.*//" > MOE430v2_target.fa
mkdir /hive/data/genomes/mm9/bed/affyMOE430v2/run
cd /hive/data/genomes/mm9/bed/affyMOE430v2/run
mkdir psl
cut -f1 ../../../chrom.sizes > genome.list
cat ../affyData/Mouse430_2.consensus \
| sed -e "s/consensus:Mouse430_2://; s/;.*//" > affyMOE430v2.fa
ls -1 /hive/data/genomes/mm9/bed/affyMOE430v2/run/affyMOE430v2.fa \
> probe.list
cat << '_EOF_' > template
#LOOP
blat -fine -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib $(path2) {check out line+ psl/$(root1).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.list probe.list template jobList
para create jobList
# para try ... check ... push ... etc.
para time
# Completed: 35 of 35 jobs
# CPU time in finished jobs: 22222s 370.36m 6.17h 0.26d 0.001 y
# IO & Wait Time: 104s 1.74m 0.03h 0.00d 0.000 y
# Average job time: 638s 10.63m 0.18h 0.01d
# Longest finished job: 1580s 26.33m 0.44h 0.02d
# Submission to last job: 1589s 26.48m 0.44h 0.02d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyMOE430v2.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl \
../affyMOE430v2.psl /dev/null
# Load probes and alignments from MOE430v2 into database.
ssh hgwdev
cd /hive/data/genomes/mm9/bed/affyMOE430v2
mkdir /projects/compbio/data/microarray/affyMOE430v2
cp -p run/affyMOE430v2.fa /projects/compbio/data/microarray/affyMOE430v2
ln -s /projects/compbio/data/microarray/affyMOE430v2/affyMOE430v2.fa \
/gbdb/hgFixed/affyProbes
hgLoadPsl mm9 affyMOE430v2.psl
hgLoadSeq mm9 /gbdb/hgFixed/affyProbes/affyMOE430v2.fa
# 45037 sequences
pslToBed affyMOE430v2.psl affyMOE430v2Probes.bed
hgLoadBed -tmpDir=/scratch/tmp mm9 affyMOE430v2Probes affyMOE430v2Probes.bed
# Loaded 46193 elements of size 12
# this is temporary, for use with bedMergeExpData below
# Create a similar formatted file to the one used in MOE430
zcat geoData/GSE10246_series_matrix.txt.gz \
| egrep "^\"1|source_name|Sample_title" \
| sed -e "s/\!Sample_title/#Probe Set/; s#\!Sample_source_name_ch1##;" \
| sed -e "s/\"//g" > gnfMOE430v2.AD.txt
# create gnfMouseAtlas3AllExps and gnfMouseAtlas3All tables in hgFixed
hgGnfMicroarray gnfMouseAtlas3AllExps gnfMouseAtlas3All \
gnfMOE430v2.AD.txt -chip=affyMOE430v2
# 182 experiments
# from that table, create median ratio table
# create table gnfMouseAtlas3AllRatio in hgFixed from hgFixed.gnfMouseAtlas3All
# and classification file ../hgMedianMicroarray/gnfMOE430v2.ra
hgRatioMicroarray gnfMouseAtlas3All gnfMouseAtlas3AllRatio \
-clump=$HOME/kent/src/hg/makeDb/hgMedianMicroarray/gnfMOE430v2.ra
# add those ratios to the probe locations to make a bed 15 microarray type
bedMergeExpData hgFixed.gnfMouseAtlas3AllRatio mm9.affyMOE430v2Probes \
gnfMouseAtlas3AllRatio.bed
# no longer need this table
# do not need this table for the genome browser display
hgsql -e "drop table affyMOE430v2Probes;" mm9
hgLoadBed mm9 gnfMouseAtlas3 gnfMouseAtlas3AllRatio.bed
hgMapToGene mm9 gnfMouseAtlas3 knownGene \
knownToGnfMouseAtlas3 '-type=bed 12'
time hgExpDistance mm9 hgFixed.gnfMouseAtlas3AllRatio \
hgFixed.gnfMouseAtlas3AllExps gnfMouseAtlas3Distance \
-lookup=knownToGnfMouseAtlas3
# Have 45036 elements in hgFixed.gnfMouseAtlas3AllRatio
# Got 39872 unique elements in hgFixed.gnfMouseAtlas3AllRatio
# Loaded gnfMouseAtlas3Distance
# real 34m56.844s
# user 58m1.892s
# sys 1m44.821s
# Take the median value over multiple replicates creating
# hgFixed.gnfMouseAtlas3MedianRatio and gnfMouseAtlas3MedianExps
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed gnfMouseAtlas3AllRatio gnfMouseAtlas3AllExps \
$HOME/kent/src/hg/makeDb/hgMedianMicroarray/gnfMOE430v2.ra \
gnfMouseAtlas3MedianRatio gnfMouseAtlas3MedianExps -minExps=1
# Also make a median version of the absolute measurements
hgMedianMicroarray hgFixed gnfMouseAtlas3All gnfMouseAtlas3AllExps \
$HOME/kent/src/hg/makeDb/hgMedianMicroarray/gnfMOE430v2.ra \
gnfMouseAtlas3AllMedian gnfMouseAtlas3AllMedianExps -minExps=1
time hgExpDistance mm9 hgFixed.gnfMouseAtlas3MedianRatio \
hgFixed.gnfMouseAtlas3MedianExps gnfMouseAtlas3MedianDistance \
-lookup=knownToGnfMouseAtlas3
# Have 45037 elements in hgFixed.gnfMouseAtlas3MedianRatio
# Got 39872 unique elements in hgFixed.gnfMouseAtlas3MedianRatio
# XXX - working Mon Nov 24 10:01:43 PST 2008
# real 16m5.102s
# user 41m54.581s
# sys 1m28.595s
# 182 experiments
# Convert these to ratios using the median of medians of non-cancerous
# cell types as the denominator as so:
cd ~/src/hg/makeDb/hgRatioMicroarray
cd ../hgMedianMicroarray
# create tables gnfMOE430v2MedianRatio gnfMOE430v2MedianExps in hgFixed
hgMedianMicroarray hgFixed gnfMOE430v2AllRatio gnfMOE430v2AllExps \
gnfMOE430v2.ra gnfMOE430v2MedianRatio gnfMOE430v2MedianExps -minExps=1
# Also make a median version of the absolute measurements
# create gnfMOE430v2Median
hgMedianMicroarray hgFixed gnfMOE430v2All gnfMOE430v2AllExps \
gnfMOE430v2.ra gnfMOE430v2Median gnfMOE430v2MedianExps -minExps=1
cd /hive/data/genomes/mm9/bed/affyMOE430v2
# Load up microarray track
hgMapMicroarray gnfMOE430v2.bed hgFixed.gnfMOE430v2MedianRatio \
affyMOE430v2.psl
# Loaded 45037 rows of expression data from hgFixed.gnfMOE430v2MedianRatio
# Mapped 44106, multiply-mapped 2087, missed 0, unmapped 931
hgLoadBed mm9 gnfMOE430v2 gnfMOE430v2.bed
# Loaded 46193 elements of size 15
#######################################
hgExpDistance mm9 hgFixed.gnfMouseAtlas2MedianRatio \
hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m
# Convert these to ratios using the median of medians of non-cancerous
# cell types as the denominator as so:
cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray gnfMouseAtlas2All gnfMouseAtlas2AllRatio -clump=../hgMedianMicroarray/gnfMouseAtlas2.ra
# Take the median value over multiple replicates and put in this table:
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed gnfMouseAtlas2AllRatio gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2MedianRatio gnfMouseAtlas2MedianExps -minExps=1
# Also make a median version of the absolute measurements
hgMedianMicroarray hgFixed gnfMouseAtlas2All gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2Median gnfMouseAtlas2MedianExps -minExps=1
############################################################################
# hgPal downloads
ssh hgwdev
screen
bash
rm -rf /cluster/data/mm9/bed/multiz30way/pal
mkdir /cluster/data/mm9/bed/multiz30way/pal
cd /cluster/data/mm9/bed/multiz30way/pal
cat > order.lst <<EOF
mm9
rn4
cavPor2
oryCun1
hg18
panTro2
rheMac2
ponAbe2
calJac1
otoGar1
tupBel1
sorAra1
eriEur1
canFam2
felCat3
equCab1
bosTau3
dasNov1
loxAfr1
echTel1
monDom4
ornAna1
galGal3
anoCar1
xenTro2
gasAcu1
danRer5
tetNig1
fr2
oryLat1
EOF
mz=multiz30way
gp=refGene
db=mm9
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.jobs
time sh -x $gp.jobs > $gp.jobs.log 2>&1 &
sleep 1
tail -f $gp.jobs.log
# real 196m7.752s
# user 11m26.917s
# sys 3m41.587s
zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
# we're only distributing exons at the moment
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
mz=multiz30way
gp=knownGene
db=mm9
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.$mz.jobs
time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
sleep 1
tail -f $gp.$mz.job.log
# real 216m43.721s
# user 18m33.552s
# sys 5m42.639s
zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
# now do the canonical set
cd /cluster/data/mm9/bed/multiz30way/pal
mz=multiz30way
gp=knownCanonical
db=mm9
for j in `awk '{print $1}' /cluster/data/mm9/chrom.sizes`
do
echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed
done
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -geneBeds=$j.known.bed $db $mz knownGene order.lst stdout | gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.$mz.jobs
time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
sleep 1
tail -f $gp.$mz.job.log
# real 192m17.168s
# user 10m28.659s
# sys 3m53.467s
rm *.known.bed
zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
db=mm9
mz=multiz30way
gp=knownCanonical
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
#############################################################################
# MOUSE TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08)
# (to build the affyExonTissues track, see the steps outlined in hg18.txt)
#############################################################################
########################################################################
## AFFY ALL EXON PROBESETS (HG18/MM9/RN4) (DONE 2009-01-29, Andy)
## (instructions are in hg18.txt)
########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
# update genbank.conf:
# mm9.upstreamGeneTbl = refGene
# mm9.upstreamMaf = multiz30way /hive/data/genomes/mm9/bed/multiz30way/species.list
#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08)
ssh hgwdev
mkdir /cluster/data/mm9/bed/mrnaPcr
cd /cluster/data/mm9/bed/mrnaPcr
genePredToBed /cluster/data/mm9/bed/ucsc.10/ucscGenes.gp > ucscGenes.bed
hgsql mm9 -NBe 'select kgId,geneSymbol from kgXref' \
| perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
> idSub.txt
subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
sequenceForBed -keepName -db=mm9 -bedIn=ucscGenesIdSubbed.bed \
-fastaOut=stdout \
| faToTwoBit stdin kgTargetSeq.2bit
cut -f 1-10 /cluster/data/mm9/bed/ucsc.10/ucscGenes.gp \
| genePredToFakePsl mm9 stdin kgTargetAli.psl /dev/null
# Load up the UCSC Genes target PSL table and put 2bit in /gbdb:
cd /cluster/data/mm9/bed/mrnaPcr
hgLoadPsl mm9 kgTargetAli.psl
mkdir /gbdb/mm9/targetDb
ln -s /cluster/data/mm9/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/mm9/targetDb/
# Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
# /gbdb/mm9/targetDb/kgTargetSeq.2bit .
ssh hgwdev
# Add records to hgcentraltest blatServers and targetDb:
hgsql hgcentraltest -e \
'INSERT into blatServers values ("mm9Kg", "blat13", 17805, 0, 1);'
hgsql hgcentraltest -e \
'INSERT into targetDb values("mm9Kg", "UCSC Genes", \
"mm9", "kgTargetAli", "", "", \
"/gbdb/mm9/targetDb/kgTargetSeq.2bit", 1, now(), "");'
#############################################################################
# TEST BLASTZ with Rn5 (DONE - 2008-11-26,30 - Hiram)
mkdir /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26
cd /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26
cat << '_EOF_' > DEF
# mouse vs rat
# Specially tuned blastz parameters from Webb Miller
BLASTZ=blastz
BLASTZ_ABRIDGE_REPEATS=0
BLASTZ_O=600
BLASTZ_E=55
BLASTZ_Y=15000
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/scratch/data/blastz/mouse_rat.q
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat Rn5
SEQ2_DIR=/scratch/data/rn5/rn5.2bit
SEQ2_LEN=/scratch/data/rn5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/mm9/bed/blastzRn5.2008-11-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
cd /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium \
-stop=net `pwd`/DEF > do.log 2>&1 &
# real 403m22.371s
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-debug -chainMinScore=5000 -chainLinearGap=medium \
-continue=load -stop=load `pwd`/DEF > load.log 2>&1 &
# real 44m59.528s
cat fb.mm9.chainRn5BlastzLink.txt
# 1751593467 bases of 2620346127 (66.846%) in intersection
cat /cluster/data/mm9/bed/blastzRn4.2007-08-31/fb.mm9.chainRn4Link.txt
# 1713186474 bases of 2620346127 (65.380%) in intersection
mkdir /hive/data/genomes/rn5/bed/blastz.mm9.swap
cd /hive/data/genomes/rn5/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/mm9/bed/blastzRn5.2008-11-26/DEF \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium \
-swap -stop=net > swap.log 2>&1 &
# real 63m51.690s
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/mm9/bed/blastzRn5.2008-11-26/DEF \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium \
-debug -swap -continue=load -stop=load > load.log 2>&1 &
cat fb.rn5.chainMm9BlastzLink.txt
# 1901280009 bases of 3372561689 (56.375%) in intersection
#############################################################################
# AFFY EXON PROBE LIFT MM8->MM9 (DONE, 2008-12-17 Andy)
ssh hgwdev
cd /hive/data/genomes/mm9/bed
mkdir affyMoEx1
cd affyMoEx1/
# Dump the mm8 probe table as BED. "tail -n +2" drops the hgsql header row
# and "cut -f2-" drops the first column of the table.
# (Use "tail -n +2" rather than the obsolete "tail +2", which newer tail
# implementations may interpret as a file named "+2".)
echo "select * from affyMoEx1Probe" | \
hgsql mm8 | tail -n +2 | cut -f2- > mm8.affyMoEx1Probe.bed
# Lift the probes to mm9; probes that fail to lift are reported in unmapped.txt
liftOver mm8.affyMoEx1Probe.bed /gbdb/mm8/liftOver/mm8ToMm9.over.chain.gz \
affyMoEx1Probe.bed unmapped.txt
grep Partially unmapped.txt | wc -l
#199
grep Split unmapped.txt | wc -l
#190
grep Deleted unmapped.txt | wc -l
#354
wc -l mm8.affyMoEx1Probe.bed
#4549897
## Out of 4.5 million probes in mm8, we've lost 743 in different ways
## attempting to lift. That's an acceptable number.
hgLoadBed mm9 affyMoEx1Probe{,.bed}
# Same procedure for the transcript table. The unlifted transcripts go to
# their own file (unmappedTranscripts.txt) so that the probe results in
# unmapped.txt are not overwritten; both files are read below.
echo "select * from affyMoEx1Transcript" | \
hgsql mm8 | tail -n +2 | cut -f2- > mm8.affyMoEx1Transcript.bed
liftOver mm8.affyMoEx1Transcript.bed /gbdb/mm8/liftOver/mm8ToMm9.over.chain.gz \
affyMoEx1Transcript.bed unmappedTranscripts.txt
hgLoadBed mm9 affyMoEx1Transcript{,.bed}
## Put unlifted IDs into a downloadable file.
## Each failure category is pulled out by taking the line after the
## matching reason line (grep -A1) and keeping only the "chr..." record.
mkdir /usr/local/apache/htdocs/goldenPath/mm9/unlifted
grep -A1 Deleted unmapped.txt | grep "^chr" > affyMoEx1Probe.mm8Deleted.bed
grep -A1 Partially unmapped.txt | grep "^chr" > affyMoEx1Probe.mm8PartiallyDeleted.bed
grep -A1 Split unmapped.txt | grep "^chr" > affyMoEx1Probe.mm8Split.bed
grep -A1 Deleted unmappedTranscripts.txt | grep "^chr" > affyMoEx1Transcript.mm8Deleted.bed
grep -A1 Partially unmappedTranscripts.txt | grep "^chr" > affyMoEx1Transcript.mm8PartiallyDeleted.bed
cp affyMoEx1*.mm8*.bed /usr/local/apache/htdocs/goldenPath/mm9/unlifted
## mm8 and mm9 track descriptions differ:
## 1. Copy mouse/trackDb.ra setting to mouse/mm9/trackDb.ra and add
## origAssembly mm8 line.
## 2. Make a new paragraph in a new affyMouseExon.html in mm9 to include
## details about the lift and how many didn't lift.
#############################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2009-04-07)
# bash if not using bash shell already
ssh kolossus
mkdir /cluster/data/mm9/blastDb
cd /cluster/data/mm9
awk '{if ($2 > 1000000) print $1}' mm9Chroms_RandomContigs.hard.sizes > 1meg.lst
twoBitToFa -seqList=1meg.lst mm9Chroms_RandomContigs.hard.2bit temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
rm temp.fa 1meg.lst
awk '{if ($2 <= 1000000) print $1}' mm9Chroms_RandomContigs.hard.sizes > less1meg.lst
twoBitToFa -seqList=less1meg.lst mm9Chroms_RandomContigs.hard.2bit temp.fa
faSplit about temp.fa 1000000 blastDb/y
cd blastDb
for i in *.fa
do
/hive/data/outside/blast229/formatdb -i $i -p F
done
rm *.fa
ls *.nsq | wc -l
# 2712
mkdir -p /cluster/data/mm9/bed/tblastn.hg18KG
cd /cluster/data/mm9/bed/tblastn.hg18KG
echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 2712 query.lst
# we want around 250000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(250000/`wc query.lst | awk '{print $1}'`\)
# 36727/(250000/2712) = 398.414496
mkdir -p kgfa
split -l 398 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/mm9/bed/tblastn.hg18KG
# blastGsub is the gensub2 template used below to produce blastSpec: one
# blastSome job per (query.lst entry, kg.lst entry) pair, with the expected
# output psl under blastOut/.
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
# blastSome is the per-job script. Arguments as used in its body:
#   $1 - blast database (passed to blastall -d)
#   $2 - query fasta file (passed to blastall -i)
#   $3 - final output psl file
# It runs tblastn with successively stricter e-values until one blastall
# run exits successfully, converts the result with blastToPsl, lifts the
# target side via /cluster/data/mm9/blastDb.lft and the query side (-pslQ)
# via the hg18KG protein.lft, and moves the result to $3 only if pslCheck
# accepts it. Intermediate files are kept under /tmp; the script exits 1
# (after removing them) when no blastall run or the blastToPsl step succeeds.
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/mm9/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
# expand the template over every database chunk x protein chunk combination
gensub2 query.lst kg.lst blastGsub blastSpec
# leave the tcsh started above
exit
ssh swarm
cd /cluster/data/mm9/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 252216 of 252216 jobs
# CPU time in finished jobs: 14882096s 248034.93m 4133.92h 172.25d 0.472 y
# IO & Wait Time: 1019014s 16983.57m 283.06h 11.79d 0.032 y
# Average job time: 63s 1.05m 0.02h 0.00d
# Longest finished job: 184s 3.07m 0.05h 0.00d
# Submission to last job: 15667s 261.12m 4.35h 0.18d
ssh swarm
cd /cluster/data/mm9/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS ../blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
para create chainSpec
# para try, check, push, check etc.
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 5736s 95.59m 1.59h 0.07d 0.000 y
# IO & Wait Time: 21289s 354.82m 5.91h 0.25d 0.001 y
# Average job time: 291s 4.84m 0.08h 0.00d
# Longest finished job: 472s 7.87m 0.13h 0.01d
# Submission to last job: 496s 8.27m 0.14h 0.01d
cd /cluster/data/mm9/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort u.*.psl m60* | uniq > ../unliftBlastHg18KG.psl
cd ..
pslCheck unliftBlastHg18KG.psl
liftUp -nohead temp.psl ../../jkStuff/mm9.contigs.lift carry unliftBlastHg18KG.psl
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n temp.psl > blastHg18KG.psl
rm temp.psl
pslCheck blastHg18KG.psl
# load table
ssh hgwdev
cd /cluster/data/mm9/bed/tblastn.hg18KG
hgLoadPsl mm9 blastHg18KG.psl
# check coverage
featureBits mm9 blastHg18KG
# 30285278 bases of 2620346127 (1.156%) in intersection
featureBits mm9 knownGene:cds blastHg18KG -enrichment
# knownGene:cds 1.278%, blastHg18KG 1.156%, both 0.969%, cover 75.86%, enrich 65.64x
featureBits mm9 refGene:cds blastHg18KG -enrichment
# refGene:cds 1.205%, blastHg18KG 1.156%, both 0.940%, cover 78.04%, enrich 67.52x
rm -rf blastOut
#end tblastn
#############################################################################
# LASTZ Swap Human Hg19 (DONE - 2009-05-14 - Hiram)
# the original
cd /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
cat fb.hg19.chainMm9Link.txt
# 1022734273 bases of 2897316137 (35.299%) in intersection
# and the swap
mkdir /hive/data/genomes/mm9/bed/blastz.hg19.swap
cd /hive/data/genomes/mm9/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13/DEF \
-swap -noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 131m58.763s
cat fb.mm9.chainHg19Link.txt
# 1013880568 bases of 2620346127 (38.693%) in intersection
#############################################################################
# RE-BUILD miRNA TRACK (DONE, 2009-06-09-2009-06-11, hartera)
# The miRNA track from miRBase is out of date so update the track.
mkdir -p /hive/data/genomes/mm9/bed/miRNA-2009-06-09
cd /hive/data/genomes/mm9/bed/miRNA-2009-06-09
# Download GFF file of latest miRNA annotations from miRBase at the
# Wellcome Trust Sanger Institute (WTSI). This is Release 13.0.
# (March 2009)
wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/CURRENT/genomes/mmu.gff
# Re-format, need to add "chr" to the beginning of each line.
sed -e 's/^/chr/' mmu.gff > mmMirBaseFormat.gff
# Remove extra "chr" in comment lines
perl -pi.bak -e 's/chr#/#/' mmMirBaseFormat.gff
# Change chrMT to chrM
perl -pi.bak -e 's/chrMT/chrM/' mmMirBaseFormat.gff
# Remove all but ID name in last field
sed -e 's/\";//g' mmMirBaseFormat.gff | sed -e 's/ID=\"/transcript_id=/g' \
| sed -e 's/ACC=\"MI[0-9]*\s//' > mmMirBaseFormatIdOnly.gff
# Load into database.
ldHgGene -exon=miRNA mm9 miRNARel13 mmMirBaseFormatIdOnly.gff
# Does not load as mmu-mir-692-2 is on two chroms, chr4 and chr13.
# These are alignments not genePreds so convert to BED for loading into
# the database.
sed -e 's/\";//g' mmMirBaseFormat.gff | sed -e 's/ID=\"//g' \
| sed -e 's/ACC=\"MI[0-9]*\s//' > mmMirBaseFormatIdOnly.gff
# chr1 . miRNA 20669091 20669163 . +
# . mmu-mir-206
# use score 960 for + strand and 480 for - strand. This will show
# up black on the track for + strand and grey for - strand.
# Re-do below and re-load track as appears off by 1 compared to
# Ensembl track and other miRNA resources (2009-06-11)
# Confirmed with Sam Griffith-Jones that the coordinates in the
# GFF file are 1-based. (2009-06-12).
awk 'BEGIN {FS="\t"} {OFS="\t"} \
{if ($0 !~ /#/ && $7 == "+") print $1, $4-1, $5, $9, 960, $7; \
else if ($0 !~ /#/ && $7 == "-") print $1, $4-1, $5, $9, 480, $7;}' \
mmMirBaseFormatIdOnly.gff > mmMirBaseFormatIdOnly.bed
# Remove previous table
hgsql -e 'drop table miRNA' mm9
hgLoadBed mm9 miRNA mmMirBaseFormatIdOnly.bed
# Reading mmMirBaseFormatIdOnly.bed
# Loaded 568 elements of size 6
# Sorted
# Creating table definition for miRNARel13
# Saving bed.tab
# Loading mm9
hgsql -e 'select count(*) from miRNA;' mm9
# 568
# The previous version had 493 miRNAs.
hgsql -e 'select count(distinct name) from miRNA;' mm9
# 541
# The previous version had 466 unique miRNAs.
############################################################################
# Re-Run equCab2 alignment (DONE - 2009-06-29,07-02 - Hiram)
mkdir /hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29
cd /hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29
cat << '_EOF_' > DEF
# Mouse vs. Horse
BLASTZ_M=50
# TARGET: Mouse MM9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/scratch/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 360m10.094s
time doBlastzChainNet.pl `pwd`/DEF \
-continue=chainMerge -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 &
# real 225m4.178s
cat fb.mm9.chainEquCab2Link.txt
# 912421053 bases of 2620346127 (34.821%) in intersection
mkdir /hive/data/genomes/equCab2/bed/blastz.mm9.swap
cd /hive/data/genomes/equCab2/bed/blastz.mm9.swap
time doBlastzChainNet.pl \
/hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29/DEF \
-swap -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 122m25.314s
cat fb.equCab2.chainMm9Link.txt
# 902295813 bases of 2428790173 (37.150%) in intersection
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
# Vertebrate-wide transMap alignments were built. Tracks are created and loaded
# by a single Makefile. This is available from:
# svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
# see doc/builds.txt for specific details.
############################################################################
# VEGA GENES UPDATE TO BUILD 35 (DONE, 2009-07-30 - 2009-09-09, hartera)
# Needs updating as the current version is build 31 from May 2008.
# 2009-08-03 (hartera) - Added code to register track handler for
# vegaGeneComposite.
# 2009-08-15 - 2009-08-16 (hartera) - Added code to allow use of radio buttons
# on the configuration page for the track item labels. Modified code so it
# can be shared with Ensembl to create the links to Vega transcript, gene
# and protein reports on the details pages.
# 2009-08-22 - Finished code for adding Vega report URLs to the details pages.
# Loaded the vegaGtp table.
# 2009-09-01 and 2009-09-03 (hartera). Loaded a vegaPep table for the protein
# sequence link on the details pages.
# 2009-09-04 Re-load all tables as some reverted to the older version during
# mySQL 5 upgrade.
# 2009-09-08 - 2009-09-09 Code change to change message on details page when
# no protein is available and change to trackDb to make vegaGene items a
# darker blue colour. Reloaded vegaPep after removing proteins whose
# transcripts are not in vegaGtp to make all.joiner happy.
mkdir -p /hive/data/genomes/mm9/bed/vega35
cd /hive/data/genomes/mm9/bed/vega35
# Download the VEGA genes for mouse from the ftp site
# This file is from 03/17/09.
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/mouse/gtf_file.gz"
# add chr in front of chromosome names and lift up the randoms
# processing similar to the same processing for Ensembl genes,
# from /cluster/data/mm9/bed/ensGene.49/process/doProcess.csh
cp -p /cluster/data/mm9/bed/ensGene.49/process/randoms.mm9.lift .
zcat gtf_file.gz \
| sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \
| liftUp -type=.gtf stdout randoms.mm9.lift carry stdin \
| gzip > allGenes.gtf.gz
# Got 189 lifts in randoms.mm9.lift
gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \
| gzip > mm9.allGenes.gp.gz
/cluster/home/hartera/kent/src/hg/utils/automation/extractGtf.pl \
infoOut.txt > ensGtp.tab
genePredCheck -db=mm9 mm9.allGenes.gp.gz
# checked: 59381 failed: 0
zcat allGenes.gtf.gz | grep -i pseudo > pseudo.gtf
zcat allGenes.gtf.gz | grep -v -i pseudo > not.pseudo.gtf
# Modify the GTF files so that the gene name goes into the
# name2 field of the genePred.
perl -pi.bak -e 's/gene_id/other_gene_id/' *pseudo.gtf
perl -pi.bak -e 's/gene_name/gene_id/' *pseudo.gtf
gtfToGenePred -genePredExt pseudo.gtf pseudo.gp
gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp
genePredCheck -db=mm9 pseudo.gp
# checked: 4305 failed: 0
genePredCheck -db=mm9 not.pseudo.gp
# checked: 55076 failed: 0
hgLoadGenePred -genePredExt mm9 vegaGene not.pseudo.gp
hgLoadGenePred -genePredExt mm9 vegaPseudoGene pseudo.gp
# clean up
rm *.bak
# 2009-08-03 (hartera)
# Added code to src/hg/hgTracks/simpleTracks.c to register a track
# handler for vegaGeneComposite that is now used for this data. This used
# vegaGeneMethods to display the name2 field (gene) as the item label in
# the track.
# 2009-08-15 - 2009-08-16 (hartera)
# Information extracted from the attributes in the GTF file was saved as
# ensGtp.tab, so change the name to vegaGtp.
mv ensGtp.tab vegaGtp.tab
# ensGtp table definition is in ~/kent/src/hg/lib/ensGtp.sql
# There is an index on the protein field so it can not be NULL.
# If there is no protein, the gene name is given.
# Added code to hgTracks.c and hgTrackUi.c to allow the use of
# radio buttons on the track configuration page to select the
# gene name, accession or both to be displayed in the track.
# The gene name is displayed by default.
# Added code to hgc.c so that Ensembl and Vega can share code to
# create links on the details pages to the Vega reports for transcript,
# gene and protein through these IDs. Created new function
# printEnsemblOrVegaCustomUrl().
# 2009-08-22 (hartera)
# Loaded the vegaGtp table. Use ensGtp.sql to create the table.
# vegaGtp associates geneId/transcriptId/proteinId
# for the links to Vega reports from the details page.
cd /hive/data/genomes/mm9/bed/vega35
cp ~/kent/src/hg/lib/ensGtp.sql .
# 11 of the gene names for noncoding transcripts are too long for the
# protein ID field so change this field in ensGtp.sql to allow 40 chars
# instead of 20 and re-load the table.
hgsql -e 'drop table vegaGtp;' mm9
hgLoadSqlTab mm9 vegaGtp ensGtp.sql vegaGtp.tab
# Loaded successfully
# Added code to hgc.c to use printEnsemblOrVegaCustomUrl() in
# doVegaGene() to add the links to Vega reports on the details pages.
# Code was added so that there is no protein sequence link on the details
# page if there is none available e.g. noncoding.
# 2009-09-01 (hartera)
# Coding genes are displaying the message that there is no protein
# prediction available. Need to add a vegaPep table.
cd /hive/data/genomes/mm9/bed/vega35
# Download the protein FASTA file for Vega35
wget --timestamping "ftp://ftp.sanger.ac.uk/pub/vega/mouse/pep/*.tot.fa.gz"
# from the Ensembl process:
zcat Mus_musculus.VEGA.mar.pep.tot.fa.gz \
| sed -e 's/^>.* Transcript:/>/;' | gzip > vegaPep.txt.gz
zcat vegaPep.txt.gz \
| ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \
| sed -e '/^$/d; s/*$//' | sort > vegaPep.mm9.fa.tab
# Load table (2009-09-03, hartera)
hgPepPred mm9 tab vegaPep vegaPep.mm9.fa.tab
# Add vegaPep to the trackDb.ra entry for the vegaGeneComposite track
# in the type line for src/hg/makeDb/trackDb/mouse/mm9/trackDb.ra.
# Check that the vegaPep table looks ok and then check protein-coding and
# noncoding transcript details pages for protein links.
# 2009-09-04, hartera
# Re-load tables after upgrade to mySQL 5 as they had reverted back to
# tables with the previous Vega dataset.
cd /hive/data/genomes/mm9/bed/vega35
hgsql -e 'drop table vegaGene;' mm9
hgsql -e 'drop table vegaPseudoGene;' mm9
hgLoadGenePred -genePredExt mm9 vegaGene not.pseudo.gp
hgLoadGenePred -genePredExt mm9 vegaPseudoGene pseudo.gp
hgsql -e 'drop table vegaGtp;' mm9
hgLoadSqlTab mm9 vegaGtp ensGtp.sql vegaGtp.tab
hgsql -e 'drop table vegaPep;' mm9
hgPepPred mm9 tab vegaPep vegaPep.mm9.fa.tab
# 2009-09-08 (hartera). Changed message in code for details page when no
# protein sequence is available to be more explanatory. "Non-protein
# coding gene or gene fragment, no protein prediction available." Changed
# the colouring for the vegaGene subtrack to be darker blue so there is
# more of a contrast between vegaGene and vegaPseudoGene subtracks.
# 2009-09-09 (hartera) - re-loaded vegaPep table with only those proteins
# that have a transcript ID in vegaGtp.
# all.joiner is complaining as there are about 1,000 extra proteins in
# vegaPep that do not have transcripts in vegaGtp. Decided to remove these
# and e-mailed the HAVANA group to ask about the discrepancy.
# Restrict vegaPep to the proteins whose transcript ID is present in vegaGtp
# with an OTTMUSP protein ID, then reload the table from that subset.
cd /hive/data/genomes/mm9/bed/vega35
# Unique IDs from column 2 of vegaGtp.tab and column 1 of the peptide tab file.
awk '{print $2}' vegaGtp.tab | sort | uniq > vegaGtp.tx.ids
awk '{print $1}' vegaPep.mm9.fa.tab | sort | uniq > vegaPep.tx.ids
wc -l *.tx.ids
# 59381 vegaGtp.tx.ids
# 30956 vegaPep.tx.ids
# Number of transcripts that have a protein ID:
hgsql -Ne 'select transcript from vegaGtp where protein like "OTTMUSP%";' \
mm9 | sort | uniq > vegaGtpWithProt.tx.ids
wc -l vegaGtpWithProt.tx.ids
# 29902 vegaGtpWithProt.tx.ids
# find those that are common to both.
comm -12 vegaGtp.tx.ids vegaPep.tx.ids > pepandGtp.tx.ids
wc -l pepandGtp.tx.ids
# 29902 pepandGtp.tx.ids
comm -12 pepandGtp.tx.ids vegaGtpWithProt.tx.ids | wc -l
# 29902
# Therefore all the vegaGtp transcripts with a protein ID are in the
# protein FASTA file.
# Dump only the vegaPep columns (p.*). A bare "select *" over this join also
# writes the vegaGtp columns into the file that is handed to hgPepPred below.
hgsql -Ne 'select p.* from vegaPep as p, vegaGtp as g where g.protein \
like "OTTMUSP%" and p.name = g.transcript;' mm9 \
> vegaPepOnlyInGtp.mm9.fa.tab
wc -l vegaPepOnlyInGtp.mm9.fa.tab
# 29902 vegaPepOnlyInGtp.mm9.fa.tab
# Replace the table with the filtered peptide set.
hgsql -e 'drop table vegaPep;' mm9
hgPepPred mm9 tab vegaPep vegaPepOnlyInGtp.mm9.fa.tab
############################################################################
# Blastz Elephant loxAfr3 (DONE - 2009-08-12 - Hiram)
mkdir /hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12
cd /hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12
cat << '_EOF_' > DEF
# Mouse vs. Elephant
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Elephant loxAfr3
SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit
SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/mm9/bed/lastzLoxAfr3.2009-08-12
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
# real 498m44.261s
cat fb.mm9.chainLoxAfr3Link.txt
# 684326090 bases of 2620346127 (26.116%) in intersection
# trying syntenic nets
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
# about 20 minutes
mkdir /hive/data/genomes/loxAfr3/bed/blastz.mm9.swap
cd /hive/data/genomes/loxAfr3/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12/DEF \
-swap -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
-syntenicNet > swap.log 2>&1 &
# real 123m9.342s
cat fb.loxAfr3.chainMm9Link.txt
# 673856452 bases of 3118565340 (21.608%) in intersection
#########################################################################
## NIA Mouse Gene Index - (DONE, Fan, 9/9/09)
+# NOTE FOR NEXT TIME: this track fails pslCheck because every row in the
+# NIAGene table has a tSize of 198000000. Future tables should contain the
+# proper chromosome lengths in the tSize field. (Brooke, 2/22/10)
# Download the NIA gene index PSL and FASTA files, load the alignments into
# the NIAGene table and register the sequences through /gbdb.
ssh hgwdev
mkdir -p /cluster/data/mm9/bed/NIAGene090903
cd /cluster/data/mm9/bed
# NIAGene is a symlink to the dated build directory.
ln -s NIAGene090903 NIAGene
cd NIAGene
# NOTE(review): the FASTA download was recorded here as T-fasta.ff.gz, but
# every later step (and the hgLoadSeq log below) uses T-fasta.fa, so the
# remote name is presumed to be T-fasta.fa.gz - confirm against the NIA site.
wget --timestamping http://lgsun.grc.nia.nih.gov/geneindex/mm9/download/T-fasta.fa.gz
wget --timestamping http://lgsun.grc.nia.nih.gov/geneindex/mm9/download/T-psl.txt.gz
gzip -d *.gz
# Only the first 21 columns of the supplied PSL file are loaded.
cut -f 1-21 T-psl.txt >NIAGene.tab
hgLoadPsl mm9 NIAGene.tab
mkdir /gbdb/mm9/NIAGene
ln -s /cluster/data/mm9/bed/NIAGene/T-fasta.fa /gbdb/mm9/NIAGene/T-fasta.fa
hgLoadSeq mm9 /gbdb/mm9/NIAGene/T-fasta.fa
#Creating seq.tab file
#Adding /gbdb/mm9/NIAGene/T-fasta.fa
#257758 sequences
#Updating seq table
#Warning: load of seq did not go as planned: 257758 record(s), 0 row(s) skipped, 257758 warning(s) loading ./seq.tab
#Advisory lock has been released
#All done
# not sure what the warnings are about, but the track seems to be working.
# Create/edit/check in NIAGene.html and trackDb.ra under
# kent/src/hg/makeDb/trackDb/mouse/mm9
#####################################################################
# LASTZ Tetraodon TetNig2 (DONE - 2009-09-15 - Hiram)
mkdir /hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15
cd /hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15
cat << '_EOF_' > DEF
# mouse vs tetraodon
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5
# QUERY: Tetraodon TetNig2 - single chunk big enough to single largest item
SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# about 124 minutes
cat fb.mm9.chainTetNig2Link.txt
# 45642112 bases of 2620346127 (1.742%) in intersection
# running the swap
mkdir /hive/data/genomes/tetNig2/bed/blastz.mm9.swap
cd /hive/data/genomes/tetNig2/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15/DEF \
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-swap > swap.log 2>&1 &
# real 10m34.797s
cat fb.tetNig2.chainMm9Link.txt
# 41176381 bases of 302314788 (13.620%) in intersection
##############################################################################
# BUILD REST TRACK (DONE 9/16/09, Fan)
# Load the REST ChIP-PET BED file into mm9, then shift the start coordinates.
mkdir /hive/data/genomes/mm9/bed/REST
cd /hive/data/genomes/mm9/bed/REST
# Receive bed data file, REST_ChIP_PET_mm9.bed,
# from Rory JOHNSON [johnsonrb@gis.a-star.edu.sg].
hgLoadBed mm9 REST REST_ChIP_PET_mm9.bed
# Discovered mm9's extFile and history tables were out of sync.
# Bob and Hiram fixed the problem. Reload and it was successful.
# Created REST.html based on Rory's original doc and later updates.
# Added track definition and search term into trackDb/mouse/mm9/trackDb.ra
# Fix the 0 base problem. (Fan 9/20/09, per Rory's email)
# Decrements chromStart by one on every row.
# NOTE(review): the table is loaded above as "REST" but updated here as
# "rest"; confirm which name the table actually has, since MySQL table names
# can be case-sensitive.
hgsql mm9 -e 'update rest set chromStart = chromStart -1'
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
############################################################################
# ADD LINK TO GENENETWORK (DONE. 11/06/09 Fan).
# Received geneNetwork ID list file, GN_mouse_RefSeq.txt, for mm9 from
# GeneNetwork, Zhou Xiaodong [xiaodong.zhou@gmail.com].
# Create the geneNetworkId table in mm9 and fill it from the supplied ID list.
ssh hgwdev
mkdir -p /cluster/data/mm9/bed/geneNetwork
cd /cluster/data/mm9/bed/geneNetwork
# Create the table from its SQL definition.
# NOTE(review): other sections of this file reference ~/kent/src/hg/lib;
# confirm that ~/src/hg/lib is the intended path.
hgsql mm9 < ~/src/hg/lib/geneNetworkId.sql
# Bulk-load GN_mouse_RefSeq.txt from the current directory into the table.
hgsql mm9 -e \
'load data local infile "GN_mouse_RefSeq.txt" into table geneNetworkId'
#########################################################################
# LASTZ/CHAIN/NET swap danRer6 (DONE - 2009-12-18 - Galt)
# original alignment to danRer6
cd /hive/data/genomes/danRer6/bed/lastzMm9.2009-12-17
cat fb.danRer6.chainMm9Link.txt
# 77099032 bases of 1506896106 (5.116%) in intersection
# running the swap - DONE - 2009-12-18
mkdir /hive/data/genomes/mm9/bed/blastz.danRer6.swap
cd /hive/data/genomes/mm9/bed/blastz.danRer6.swap
time nice +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/danRer6/bed/lastzMm9.2009-12-17/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-swap >& swap.log &
# real 183m21.102s
cat fb.mm9.chainDanRer6Link.txt
# 73444297 bases of 2620346127 (2.803%) in intersection
#######################################################################
# Vega gene update (DONE - 2010-01-15 - Hiram)
# lookup version number at the Vega WEB site:
# http://vega.sanger.ac.uk/index.html
# and FTP site:
# ftp://ftp.sanger.ac.uk/pub/vega/
cd /hive/data/genomes/mm9
# step wise to verify operation
doEnsGeneUpdate.pl -vegaGene -ensVersion=37 -stop=download mm9.ensGene.ra
doEnsGeneUpdate.pl -vegaGene -ensVersion=37 \
-continue=process -stop=process mm9.ensGene.ra
doEnsGeneUpdate.pl -vegaGene -ensVersion=37 \
-continue=load -stop=load mm9.ensGene.ra
doEnsGeneUpdate.pl -vegaGene -ensVersion=37 \
-continue=cleanup mm9.ensGene.ra
featureBits mm9 vegaGene
# 53838752 bases of 2620346127 (2.055%) in intersection
featureBits mm9 vegaPseudoGene
# 3060300 bases of 2620346127 (0.117%) in intersection
########################################################################
# Blastz Rabbit oryCun2 (DONE - 2010-01-15 - Hiram)
ssh hgwdev
screen # use screen to control this job
mkdir /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15
cd /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15
cat << '_EOF_' > DEF
# Mouse vs. Rabbit
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rabbit at chunk 20,000,000 all but 36 contigs can fit in a single job
SEQ2_DIR=/scratch/data/oryCun2/oryCun2.2bit
SEQ2_LEN=/scratch/data/oryCun2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/oryCun2/oryCun2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/oryCun2/oryCun2.contigs.sizes
SEQ2_LIFT=/hive/data/genomes/oryCun2/contigs/oryCun2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> do.log 2>&1 &
cat fb.mm9.chainOryCun2Link.txt
# 670229789 bases of 2620346127 (25.578%) in intersection
# 496428446 bases of 2620346127 (18.945%) in intersection
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-continue=syntenicNet -bigClusterHub=swarm \
-syntenicNet > syntenicNet.log 2>&1 &
# about 20 minutes
# create reciprocal best chains/nets
ssh hgwdev
cd /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15
# this needs blastz.oryCun2 symlink to function
time nice -n +19 doRecipBest.pl mm9 oryCun2 > rbest.log 2>&1 &
# real 37m32.151s
mkdir /hive/data/genomes/oryCun2/bed/blastz.mm9.swap
cd /hive/data/genomes/oryCun2/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-swap -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
> swap.log 2>&1 &
# real 84m6.571s
cat fb.oryCun2.chainMm9Link.txt
# 669602734 bases of 2604023284 (25.714%) in intersection
#########################################################################
# ailMel1 Panda alignment (DONE - 2010-02-04 - Hiram)
mkdir /hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04
cd /hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04
cat << '_EOF_' > DEF
# Mouse vs. Panda
# parameters from the Panda paper supplemental where they describe
# their lastz parameters
BLASTZ_K=2200
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_H=2000
BLASTZ_C=2
BLASTZ_T=2
# our usual M
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Panda
SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit
SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 501m27.760s
cat fb.mm9.chainAilMel1Link.txt
# 749595031 bases of 2620346127 (28.607%) in intersection
mkdir /hive/data/genomes/ailMel1/bed/blastz.mm9.swap
cd /hive/data/genomes/ailMel1/bed/blastz.mm9.swap
time doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04/DEF \
-swap -noLoadChainSplit -bigClusterHub=swarm -smallClusterHub=memk \
-workhorse=hgwdev \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 54m57.140s
cat fb.ailMel1.chainMm9Link.txt
# 739076250 bases of 2245312831 (32.916%) in intersection
############################################################################
# susScr1 Pig BLASTZ/CHAIN/NET (WORKING - 2010-01-21 - Hiram)
screen # use a screen to manage this multi-day job
mkdir /hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21
cd /hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21
cat << '_EOF_' > DEF
# Pig vs. Mouse
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Pig SusScr1
SEQ2_DIR=/scratch/data/susScr1/susScr1.2bit
SEQ2_LEN=/scratch/data/susScr1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 875m26.114s
cat fb.mm9.chainSusScr1Link.txt
# 616833828 bases of 2620346127 (23.540%) in intersection
mkdir /hive/data/genomes/susScr1/bed/blastz.mm9.swap
cd /hive/data/genomes/susScr1/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21/DEF \
-swap -noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 69m27.221s
cat fb.susScr1.chainMm9Link.txt
# 656445475 bases of 2231332019 (29.419%) in intersection
#########################################################################
# CRG MAPABILITY (2010-02-05 - 2010-02-09, hartera, DONE)
# Data was provided by Thomas Derrien (thomas.derrien.crg.es) and Paolo Ribeca
# from the Guigo lab at the Center for Genomic Regulation (CRG) in Barcelona
# on 2010-02-04.
# Data was produced using their GEM mapper aligner, taking sliding k-mer
# windows of the mouse genome that were mapped back onto the genome with up
# to 2 mismatches. For each window, a mappability score is computed as
# S = 1/(number of matches found), and the BigWig index was created according
# to this score.
# 2010-02-09. Loaded database and added data to /gbdb/
# Added trackDb entry for the Mapability track.
# Generate and run a tcsh download script for the CRG mapability files.
mkdir -p /hive/data/genomes/mm9/bed/crgMapability
cd /hive/data/genomes/mm9/bed/crgMapability
# Scratch file: the script's interpreter line followed by one URL per line.
cat << 'EOF' > temp
#!/bin/tcsh -ef
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-36_mm9.bw.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-50_mm9.bw.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-75_mm9.bw.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-100_mm9.bw.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-40_mm9.bw.bz2
'EOF'
# Lines containing '#' are copied through unchanged; every other line is
# wrapped as: wget --timestamping "<line>"
awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \
temp > download.csh
rm temp
chmod +x download.csh
# Run the generated script in the background, logging to download.log.
./download.csh >& download.log &
# Add the data to /gbdb/ and load the file names into tables (2010-01-26)
# Unpack the downloaded bigWig files, link each one into /gbdb/mm9/bbi under
# a crgMapabilityAlign<N>mer.bw name, and record that path in a one-row table
# with the same base name (tcsh syntax).
cd /hive/data/genomes/mm9/bed/crgMapability
bunzip2 *.bz2
# Add data to gbdb
mkdir -p /gbdb/mm9/bbi/
# Symlink files with names as crgMapabilityAlignXmer.bw to /gbdb/mm9/bbi
# and load file name into a table - one per dataset. Each table
# represents a subtrack.
# The loop globs *.bw directly instead of parsing `ls` output, so an aliased
# ls cannot decorate the file names. <N> is the text between the "-" and the
# first "_" that follows it in each file name.
foreach f (*.bw)
echo $f
set num=`echo $f | cut -d "-" -f2 | cut -d "_" -f1`
set mer="${num}mer"
set nf="crgMapabilityAlign${mer}.bw"
echo $nf
ln -s `pwd`/${f} /gbdb/mm9/bbi/${nf}
hgsql mm9 -e "drop table if exists crgMapabilityAlign${mer}; \
create table crgMapabilityAlign${mer} (fileName varchar(255) not null); \
insert into crgMapabilityAlign${mer} values ('/gbdb/mm9/bbi/${nf}');"
end
# Added a trackDb entry for this mapability track in
# kent/src/hg/makeDb/trackDb/mouse/mm9/trackDb.ra
# use bigWigInfo to check min and max values. Created a mapability.html
# description page.
#####################################################################
# tRNAs track (2010-01-15, Fan DONE)
#
# Load the tRNAs BED table and copy the tRNA images into gbdb.
ssh hgwdev
cd /hive/data/genomes/mm9/bed
mkdir tRNAs
cd tRNAs
# Get data files from /projects/lowelab/users/lowe/Browser/vertebrates/
cp -p /projects/lowelab/users/lowe/Browser/vertebrates/mm9-tRNAs.bed .
cp -p /projects/lowelab/users/lowe/Browser/vertebrates/mm9_tRNAs_images.tar.gz .
# Recreate the table from scratch using the tRNAs.sql definition.
hgsql mm9 -e 'drop table if exists tRNAs'
hgLoadBed -tab mm9 tRNAs mm9-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql
mkdir gif
cd gif
gzip -d ../mm9_tRNAs_images.tar.gz
# gzip leaves the .tar beside the original archive, i.e. in the parent
# directory, so it is read from ../ while extracting into gif/.
tar -xvf ../mm9_tRNAs_images.tar
mkdir /hive/data/gbdb/mm9/RNA-img
cp -p * /hive/data/gbdb/mm9/RNA-img
#####################################################################
# LASTZ/CHAIN/NET Marmoset calJac3 (DONE - 2010-02-12 - Hiram)
# use a screen to control this job
screen
mkdir /hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12
cd /hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12
cat << '_EOF_' > DEF
# mouse vs marmoset
BLASTZ_M=50
# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Marmoset (calJac3)
SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
SEQ2_LIMIT=75
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 `pwd`/DEF \
-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# real 445m42.381s
cat fb.mm9.chainCalJac3Link.txt
# 859869647 bases of 2620346127 (32.815%) in intersection
# Swap: rerun the chain/net steps from the mm9 DEF with -swap, working in the
# calJac3 tree.
mkdir /hive/data/genomes/calJac3/bed/blastz.mm9.swap
cd /hive/data/genomes/calJac3/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12/DEF \
-swap -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 90m38.739s
# The other swap sections in this file read fb.<query>.chainMm9Link.txt;
# the Hg19 name previously recorded here does not belong to an mm9 swap.
cat fb.calJac3.chainMm9Link.txt
# 861811978 bases of 2752505800 (31.310%) in intersection
#####################################################################