src/hg/makeDb/doc/monDom5.txt 1.18
1.18 2010/02/06 00:17:33 hiram
re-run the Panda chain net
Index: src/hg/makeDb/doc/monDom5.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/monDom5.txt,v
retrieving revision 1.17
retrieving revision 1.18
diff -b -B -U 1000000 -r1.17 -r1.18
--- src/hg/makeDb/doc/monDom5.txt 25 Jan 2010 19:16:13 -0000 1.17
+++ src/hg/makeDb/doc/monDom5.txt 6 Feb 2010 00:17:33 -0000 1.18
@@ -1,1537 +1,1537 @@
# for emacs: -*- mode: sh; -*-
# Creating the assembly for Monodelphis domestica
# South American, Short-tailed Opossum
# http://www.genome.gov/11510687
# http://www.genome.gov/12512285
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+---------------------------------+
# | tableName | type |
# +-------------+---------------------------------+
# | refGene | genePred refPep refMrna |
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | ensGene | genePred ensPep |
# | genscan | genePred genscanPep |
# +-------------+---------------------------------+
#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2007-05-11 - Hiram)
ssh kkstore04
mkdir -p /cluster/store8/monDom5/broad
ln -s /cluster/store8/monDom5 /cluster/data/monDom5
cd /cluster/data/monDom4/broad.mit.edu
time nice -n +19 wget --timestamping \
'ftp://broad.mit.edu/pub/assemblies/mammals/monodelphis/monDom5/*'
# real 52m48.859s
# user 0m2.122s
# sys 0m22.031s
# This was a symlink to ../monDom2/
# lrwxrwxrwx 1 36 May 10 16:00 BACread_2_BACclone.txt.gz -> ../monDom2/BACread_2_BACclone.txt.gz
# so actually fetch the file here
rm BACread_2_BACclone.txt.gz
time nice -n +19 wget --timestamping \
'ftp://broad.mit.edu/pub/assemblies/mammals/monodelphis/monDom2/BACread_2_BACclone.txt.gz'
# it looks like they've been using our tools, their agp file has
# the extra gap at the end of chrUn:
# chrUn 103240612 103241611 16995 N 1000 clone no
# Their fasta file is in 5,000,000 chunks
# fixup the split fasta files
time nice -n +19 gunzip Monodelphis5.0.agp.chromosome.qual.gz \
Monodelphis5.0.agp.chromosome.fasta.gz
# real 5m24.942s
# user 1m31.895s
# sys 0m51.726s
mkdir splitFa
time nice -n +19 faSplit -verbose=2 byname \
Monodelphis5.0.agp.chromosome.fasta splitFa/
# 2m44s
mkdir chrFa
# combine the Broad split files into single chrom fasta files
time for C in 1 2 3 4 5 6 7 8 X Un
do
rm -f chrFa/chr${C}.fa
echo ">chr${C}" > chrFa/chr${C}.fa
echo -n "chrFa/chr${C}.fa working ... "
ls splitFa/${C}.*-*.fa | sort -t"." -k2,2n | while read F
do
grep -v "^>" ${F} >> chrFa/chr${C}.fa
done
echo "done"
done
# verify nothing was lost, should be the same totals here
faSize chrFa/chr*.fa
# 3605614649 bases (103971429 N's 3501643220 real 3501643220 upper 0
# lower) in 10 sequences in 10 files
faSize Monodelphis5.0.agp.chromosome.fasta
# 3605614649 bases (103971429 N's 3501643220 real 3501643220 upper 0
# lower) in 726 sequences in 1 files
# put them together into a single file:
cat chrFa/chr*.fa > ucscChroms.fa
# create a lift file from the information in the fasta headers
cat << '_EOF_' > liftBroadToChroms.pl
#!/usr/bin/env perl
use strict;
use warnings;
open (FH, 'grep "^>" Monodelphis5.0.agp.chromosome.fasta|') or
die "can not grep Monodelphis5.0.agp.chromosome.fasta";
my %liftSpec; # key is chrom_start, value is end
my %chrSize; # key is chrom, value is size
my @liftLines; # index is line number, value is line to output
$chrSize{'1'} = 0;
$chrSize{'2'} = 0;
$chrSize{'3'} = 0;
$chrSize{'4'} = 0;
$chrSize{'5'} = 0;
$chrSize{'6'} = 0;
$chrSize{'7'} = 0;
$chrSize{'8'} = 0;
$chrSize{'X'} = 0;
$chrSize{'Un'} = 0;
my $lineCount = 0;
my $prevChr = "";
my $chrStart = 0;
while (my $line = <FH>) {
my ($chr_pos, $name) = split('\s+',$line);
my ($chr, $range) = split('\.',$chr_pos);
$chr =~ s/>//;
if ($chr ne $prevChr) {
$chrStart = 0;
$prevChr = $chr;
print STDERR "chr$chr starting\n";
}
my ($start,$end) = split('-',$range);
my $key = sprintf("%s_%d", $chr, $start);
$liftSpec{$key} = $end;
$chrSize{$chr} = $end if ($end > $chrSize{$chr});
my $fragSize = $end - $start + 1;
$liftLines[$lineCount++] = sprintf "%d\t%s.%d-%d\t%d\tchr%s",
$chrStart, $chr, $start, $end, $fragSize, $chr;
$chrStart += $fragSize;
}
close (FH);
for (my $i = 0; $i < $lineCount; ++$i) {
my ($chrStart, $fragName, $fragSize, $chr) = split('\t',$liftLines[$i]);
$chr =~ s/chr//;
printf "%s\t%d\n", $liftLines[$i], $chrSize{$chr};
}
'_EOF_'
# << happy emacs
chmod +x liftBroadToChroms.pl
./liftBroadToChroms.pl liftBroad.lft
# split up the quality file to get it put together into a single
# chrom based file
cat << '_EOF_' > splitQual.pl
#!/usr/bin/env perl
use strict;
use warnings;
open (FH, "<Monodelphis5.0.agp.chromosome.qual") or
die "can not read Monodelphis5.0.agp.chromosome.qual";
# open an initial output file to get OUT established, not a real chr name
my $fileName = "splitQual/0.1.fa";
open (OUT,">$fileName") or die "can not write to $fileName";
while (my $line = <FH>) {
if ($line =~ m/^>/) {
close(OUT);
$line =~ s/>//;
$line =~ s/-.*//;
$fileName = "splitQual/$line";
open (OUT,">$fileName") or die "can not write to $fileName";
printf STDERR "writing to $fileName";
} else {
print OUT $line;
}
}
close (FH);
close (OUT);
'_EOF_'
# << happy emacs
chmod +x splitQual.pl
mkdir splitQual
./splitQual.pl
# put them back together in order as full chroms
for C in 1 2 3 4 5 6 7 8 X Un
do
echo ">chr${C}" > chrQual/chr${C}.qual.fa
ls splitQual/${C}.* | sort -t'.' -k2,2n | while read F
do
cat $F >> chrQual/chr${C}.qual.fa
done
done
# real 6m7.079s
# and as a single file
for C in 1 2 3 4 5 6 7 8 X Un
do
cat chrQual/chr${C}.qual.fa
done | gzip -c > ucscChroms.qual.fa.gz
# real 4m41.229s
# and turn it into a qac file
qaToQac ucscChroms.qual.fa.gz ucscChroms.qac
# real 3m49.380s
#########################################################################
# create genome assembly database (DONE - 2008-11-25 - Hiram)
cd /hive/data/genomes/monDom5/
cat << '_EOF_' > monDom5.config.ra
# Config parameters for makeGenomeDb.pl:
db monDom5
clade mammal
scientificName Monodelphis domestica
commonName Opossum
assemblyDate Oct. 2006
assemblyLabel Broad Institute monDom5 (NCBI project 12561, accession AAFR03000000)
orderKey 354
mitoAcc NC_006299
fastaFiles /hive/data/genomes/monDom5/broad/ucscChroms.fa
agpFiles /hive/data/genomes/monDom5/broad/Monodelphis5.0.agp
qualFiles /hive/data/genomes/monDom5/broad/ucscChroms.qac
dbDbSpeciesDir opossum
'_EOF_'
# << happy emacs
time makeGenomeDb.pl -verbose=2 -stop=seq monDom5.config.ra > seq.log 2>&1
# real 3m3.414s
time makeGenomeDb.pl -verbose=2 -continue=agp -stop=agp monDom5.config.ra \
> agp.log 2>&1
# real 0m36.723s
time makeGenomeDb.pl -verbose=2 -continue=db -stop=db monDom5.config.ra \
> db.log 2>&1
# real 36m30.041
time makeGenomeDb.pl -verbose=2 -continue=dbDb -stop=dbDb \
monDom5.config.ra > dbDb.log 2>&1
# real 0m1.074s
time makeGenomeDb.pl -verbose=2 -continue=trackDb -stop=trackDb \
monDom5.config.ra > trackDb.log 2>&1
# check in the trackDb files and the browser should be up and running
#########################################################################
# monDom5 - Opossum - Ensembl Genes version 50 (DONE - 2008-12-02 - hiram)
ssh hgwdev
cd /hive/data/genomes/monDom5
cat << '_EOF_' > monDom5.ensGene.ra
# required db variable
db monDom5
# optional nameTranslation, the sed command that will transform
# Ensemble names to UCSC names. With quotes just to make sure.
nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/"
'_EOF_'
# << happy emacs
doEnsGeneUpdate.pl -ensVersion=50 monDom5.ensGene.ra
ssh hgwdev
cd /hive/data/genomes/monDom5/bed/ensGene.50
featureBits monDom5 ensGene
# 32970520 bases of 3501660299 (0.942%) in intersection
*** All done! (through the 'makeDoc' step)
*** Steps were performed in /hive/data/genomes/monDom5/bed/ensGene.50
############################################################################
# monDom5 - Opossum - Ensembl Genes version 51 (DONE - 2008-12-02 - hiram)
ssh hgwdev
cd /hive/data/genomes/monDom5
cat << '_EOF_' > monDom5.ensGene.ra
# required db variable
db monDom5
# optional nameTranslation, the sed command that will transform
# Ensemble names to UCSC names. With quotes just to make sure.
nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/"
'_EOF_'
# << happy emacs
doEnsGeneUpdate.pl -ensVersion=51 monDom5.ensGene.ra
ssh hgwdev
cd /hive/data/genomes/monDom5/bed/ensGene.51
featureBits monDom5 ensGene
# 32959755 bases of 3501660299 (0.941%) in intersection
*** All done! (through the 'makeDoc' step)
*** Steps were performed in /hive/data/genomes/monDom5/bed/ensGene.51
#########################################################################
## REPEATMASKER (DONE 2008-11-26 Andy)
screen -S monDom5RepeatMasker # to manage this several day job
mkdir /hive/data/genomes/monDom5/bed/repeatMasker
cd /hive/data/genomes/monDom5/bed/repeatMasker
time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl -workhorse=hgwdev \
-bigClusterHub=swarm -buildDir=`pwd` monDom5 > do.log
## (detach screen Ctrl+A+D)
## ... (reattach on hgwdev: screen -r monDom5RepeatMasker)
#
#Checking finished jobs
#cat run.time
#HgStepManager: executing step 'cat' Wed Nov 26 07:29:04 2008.
#Use of uninitialized value in substitution (s///) at /cluster/home/aamp/kent/src/hg/utils/automation/HgAutomate.pm line 262.
#Could not extract server from output of "df /hive/data/genomes/monDom5/bed/repeatMasker":
#/dev/hivedev 171434397696 55883525376 115550872320 33% /hive
#...
#user 0m0.099s
#sys 0m0.070s
## checking the problem, it's that I don't have a $HOST in my env.
## ... so, tcsh it is.
tcsh
time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl -continue=cat \
-workhorse=hgwdev -bigClusterHub=swarm -buildDir=`pwd` \
monDom5 > doAfterCat.log
#0.389u 0.348s 1:12:48.52 0.0% 0+0k 0+0io 1pf+0w
#########################################################################
## SIMPLE REPEATS TRF (DONE 2008-11-26 - Andy)
screen
mkdir /hive/data/genomes/monDom5/bed/simpleRepeat
cd /hive/data/genomes/monDom5/bed/simpleRepeat
time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
-buildDir=/cluster/data/monDom5/bed/simpleRepeat monDom5 > do.log
#0.233u 0.153s 2:52:17.94 0.0% 0+0k 0+0io 0pf+0w
cat fb.simpleRepeat
# 83450133 bases of 3501660299 (2.383%) in intersection
## after RM run is done, add this mask:
cd /hive/data/genomes/monDom5
rm monDom5.2bit
twoBitMask monDom5.rmsk.2bit -add bed/simpleRepeat/trfMask.bed monDom5.2bit
## can safely ignore warning about >=13 fields in bed file
twoBitToFa monDom5.2bit stdout | faSize stdin > monDom5.2bit.faSize.txt
# 3605631728 bases (103971429 N's 3501660299 real 1542619938 upper 1959040361 lower)
# %54.33 masked total, %55.95 masked real
## link to gbdb
ln -s `pwd`/monDom5.2bit /gbdb/monDom5
###########################################################################
# WINDOWMASKER (DONE 2008-12-02 Andy)
ssh kolossus
mkdir /hive/data/genomes/monDom5/bed/WindowMasker
cd /hive/data/genomes/monDom5/bed/WindowMasker
screen -S monDom5_WindowMasker
tcsh
~/kent/src/hg/utils/automation/doWindowMasker.pl monDom5 -buildDir=`pwd` -workhorse=kolossus >& wm.log
## oops forgot to time it (or nice it). It took less than 8 hrs though
## load result to prep for cleaning
ssh hgwdev
cd /hive/data/genomes/monDom5/bed/WindowMasker
hgLoadBed monDom5 windowmaskerSdust windowmasker.sdust.bed.gz
# Loaded 1346187 elements of size 3
featureBits monDom5 windowmaskerSdust
# 1619987578 bases of 3501660299 (46.263%) in intersection
# eliminate the gaps from the masking (WM bug)
featureBits monDom5 -not gap -bed=notGap.bed
# 170473138 bases of 170473138 (100.000%) in intersection
screen -S monDom5_WindowMasker
time nice featureBits monDom5 windowmasker.sdust.bed.gz notGap.bed \
-bed=stdout | gzip -c > cleanWMask.bed.gz
#1516026449 bases of 3501660299 (43.295%) in intersection
#
#real 12m17.912s
#user 3m56.801s
#sys 0m13.704s
## reload track to get it clean
hgLoadBed monDom5 windowmaskerSdust cleanWMask.bed.gz
## Loaded 22241063 elements of size 4
##########################################################################
## CPG ISLANDS (DONE 2008-12-03, Andy)
ssh hgwdev
screen -S monDom5_CpG
~/kent/src/hg/utils/automation/doCpgIslands.pl monDom5
## [detach screen]
## took an hour or two
featureBits monDom5 cpgIslandExt
# 14553072 bases of 3501660299 (0.416%) in intersection
##########################################################################
## LIFTOVER CHAIN MONDOM4->MONDOM5 (DONE 2008-12-06, Andy)
ssh hgwdev
screen -S monDom5_liftOver
tcsh
~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl monDom4 monDom5
## [detach]
## ... 8 hippos
ssh swarm
cd /hive/data/genomes/monDom4/bed/blat.monDom5.2008-12-04/run.blat
## edit job.csh and add -maxIntron=1 -mask=lower
## to blat command and up minScore to 5000 and minIdentity to 99
para stop
para push
## Took several hours after hippos were dealt with and -continue net
## had to be run as well after it crashed running out of tmp disk space
cd /hive/data/genomes/monDom4/bed/blat.monDom5.2008-12-04
rm -rf run.chain/ run.blat/
mv monDom4ToMonDom5.over.chain.gz ../liftOver/
cd /gbdb/monDom4/liftOver
#######################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE 2008-12-11, Andy)
ssh hgwdev
screen -S monDom5_11.ooc
cd /hive/data/genomes/monDom5
## use repMatch of 1228 as this genome is ~ %20 larger than human
## 1024 + (1024 * 0.2) = 1228
time nice blat monDom5.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1228
# Wrote 43992 overused 11-mers to 11.ooc
# real 3m24.793s
#######################################################################
# GENBANK AUTOUPDATE SETUP (DONE 2008-12-15 Andy)
ssh hgwdev
screen -S monDom5_genbank
cd ~/kent/src/hg/makeDb/genbank/conf/
## edit genbank.conf: search monDom4, copy and paste that whole entry
## and replace monDom4 with monDom5:
## cvs ci genbank.conf
make etc-update
ssh genbank
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial monDom5 &
# logFile: var/build/logs/2008.12.11-16:37:37.monDom5.initalign.log
#real 633m58.880s
##user 96m10.868s
#sys 35m42.207s
exit
## we're back on hgwdev
nice -n 19 ./bin/gbDbLoadStep -drop -initialLoad monDom5 &
#logFile: var/dbload/genbank/logs/2008.12.15-10:44:36.dbload.log
cd ~/kent/src/hg/makeDb/genbank
cvsup
## add monDom5 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added monDom5" etc/align.dbs etc/hgwdev.dbs
make etc-update
##########################################################################
## BLAT (DONE 2008-01-20 Andy)
# e-mail cluster-admin and ask for a server and tell them about
# /gbdb/monDom5/monDom5.2bit
# then
ssh hgwdev
hgsql -N hgcentraltest
mysql> insert into blatServers (db, host, port, isTrans, canPcr) values ('monDom5', 'blatx', '17778', '1', '0');
Query OK, 1 row affected (0.01 sec)
mysql> insert into blatServers (db, host, port, isTrans, canPcr) values ('monDom5', 'blatx', '17779', '0', '1');
Query OK, 1 row affected (0.00 sec)
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE 2009-02-05 braney )
# bash if not using bash shell already
cd /cluster/data/monDom5
grep chrUn monDom5.agp | /cluster/bin/scripts/agpToLift > jkStuff/Un.lft
mkdir /cluster/data/monDom5/blastDb
cat broad/Monodelphis5.0.agp | awk '/^chrUn/ {if ($5 == "W")
{
printf "echo \\>%s\n",$6 ;
printf "twoBitToFa monDom5.2bit:chrUn:%d-%d stdout | tail -n +2\n", $2,$3
}}' > script
sh script > chrUn.fa
grep -v chrUn chrom.sizes | awk '{ printf "twoBitToFa monDom5.2bit:%s stdout \n", $1}' > script
sh script > chrNotUn.fa
cat chrUn.fa chrNotUn.fa | faToTwoBit stdin monDom5.chrUnAsContigs.2bit
twoBitInfo monDom5.chrUnAsContigs.2bit monDom5.chrUnAsContigs.sizes
awk '{if ($2 > 1000000) print $1}' monDom5.chrUnAsContigs.sizes > 1meg.lst
twoBitToFa -seqList=1meg.lst monDom5.chrUnAsContigs.2bit temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
# 4366 pieces of 4366 written
rm temp.fa 1meg.lst
awk '{if ($2 <= 1000000) print $1}' monDom5.chrUnAsContigs.sizes > less1meg.lst
twoBitToFa -seqList=less1meg.lst monDom5.chrUnAsContigs.2bit temp.fa
faSplit about temp.fa 1000000 blastDb/y
ls blastDb/y* | wc -l
# 88
rm temp.fa less1meg.lst
cd blastDb
for i in *.fa
do
/hive/data/outside/blast229/formatdb -i $i -p F
done
rm *.fa &
ls *.nsq | wc -l
# 4454
mkdir -p /cluster/data/monDom5/bed/tblastn.hg18KG
cd /cluster/data/monDom5/bed/tblastn.hg18KG
echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 4454 query.lst
# we want around 1000000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(1000000/`wc query.lst | awk '{print $1}'`\)
# 36727/(1000000/4454) = 163.582058
mkdir -p kgfa
split -l 164 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/monDom5/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/monDom5/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
exit
ssh swarm
cd /cluster/data/monDom5/bed/tblastn.hg18KG
gensub2 query.lst kg.lst blastGsub blastSpec
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 997696 of 997696 jobs
# CPU time in finished jobs: 21259114s 354318.56m 5905.31h 246.05d 0.674 y
# IO & Wait Time: 3461687s 57694.79m 961.58h 40.07d 0.110 y
# Average job time: 25s 0.41m 0.01h 0.00d
# Longest finished job: 76s 1.27m 0.02h 0.00d
# Submission to last job: 31794s 529.90m 8.83h 0.37d
ssh swarm
cd /cluster/data/monDom5/bed/tblastn.hg18KG
cat << _EOF_ > splitOne
(cd \$1;
grep -h -v contig q.*.psl | pslSplitOnTarget stdin chroms
grep -h contig q.*.psl > chroms/contigs.psl
)
_EOF_
chmod +x splitOne
for i in `pwd`/blastOut/kg??; do
echo ./splitOne $i
done > split.jobs
para create split.jobs
para maxJob 10
para shove
# Completed: 224 of 224 jobs
# CPU time in finished jobs: 910s 15.17m 0.25h 0.01d 0.000 y
# IO & Wait Time: 128754s 2145.89m 35.76h 1.49d 0.004 y
# Average job time: 579s 9.65m 0.16h 0.01d
# Longest finished job: 683s 11.38m 0.19h 0.01d
# Submission to last job: 13166s 219.43m 3.66h 0.15d
ssh swarm
cd /cluster/data/monDom5/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(
f=`basename $1`;
d=`dirname $1`;
if simpleChain -prot -outPsl -maxGap=150000 $1 $d/c.$f.tmp
then
mv $d/c.$f.tmp $d/c.$f
else
rm $d/c.$f.tmp
fi
)
'_EOF_'
chmod +x chainOne
ls -1dS ../blastOut/kg??/chroms/*.psl > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
para create chainSpec
para try, check, push, check etc.
# Completed: 2270 of 2270 jobs
# CPU time in finished jobs: 10821559s 180359.32m 3005.99h 125.25d 0.343 y
# IO & Wait Time: 249403s 4156.72m 69.28h 2.89d 0.008 y
# Average job time: 4877s 81.28m 1.35h 0.06d
# Longest finished job: 205913s 3431.88m 57.20h 2.38d
# Submission to last job: 205932s 3432.20m 57.20h 2.38d
cd /cluster/data/monDom5/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat $i/chroms/c.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../unLift.psl
cd ..
pslCheck unLift.psl
# checked: 71490 failed: 0 errors: 0
liftUp blastHg18KG.psl ../../jkStuff/Un.lft carry unLift.psl
pslCheck -prot blastHg18KG.psl
# checked: 71490 failed: 0 errors: 0
# load table
ssh hgwdev
cd /cluster/data/monDom5/bed/tblastn.hg18KG
hgLoadPsl monDom5 blastHg18KG.psl
# check coverage
featureBits monDom5 blastHg18KG
# 33631308 bases of 3501660299 (0.960%) in intersection
featureBits monDom4 blastHg18KG
# 19424941 bases of 3501643220 (0.555%) in intersection
featureBits monDom5 all_mrna blastHg18KG -enrichment
# all_mrna 0.006%, blastHg18KG 0.960%, both 0.003%, cover 43.79%, enrich 45.59x
rm -rf blastOut
#end tblastn
#########################################################################
## NSCAN LIFT (FROM MONDOM4) (DONE 2008-12-08 Andy)
ssh hgwdev
mkdir /hive/data/genomes/monDom5/bed/nscanGene.lifted
cd /hive/data/genomes/monDom5/bed/nscanGene.lifted
echo "select * from nscanGene" | hgsql monDom4 | sed '1d' > nscanGene.monDom4.gp
liftOver nscanGene.monDom4.gp /gbdb/monDom4/liftOver/monDom4ToMonDom5.over.chain.gz \
| nscanGene.monDom5.gp unmapped.gp
ldHgGene -predTab monDom5 nscanGene nscanGene.monDom5.gp
##########################################################################
## GENSCAN (DONE 2008-01-22 Andy)
ssh hgwdev
mkdir -p /hive/data/genomes/monDom5/bed/genscan/{fasta,gtf,pep,subopt}
cd /hive/data/genomes/monDom5/bed/genscan
twoBitToFa ../../monDom5.2bit stdout | maskOutFa stdin hard stdout \
| faSplit -maxN=5000000 -lift=chunks.lft gap stdin 8000000 fasta/chunk
cvs co hg3rdParty/genscanlinux
cat << '_EOF_' > gsub
#LOOP
gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << emacs
ls -1 fasta/* > chunk.lst
ssh swarm
cd /hive/data/genomes/monDom5/bed/genscan
gensub2 chunk.lst single gsub spec
para create spec
para push
para time
#Completed: 421 of 421 jobs
#CPU time in finished jobs: 60788s 1013.13m 16.89h 0.70d 0.002 y
#IO & Wait Time: 5097s 84.96m 1.42h 0.06d 0.000 y
#Average job time: 156s 2.61m 0.04h 0.00d
#Longest finished job: 367s 6.12m 0.10h 0.00d
#Submission to last job: 378s 6.30m 0.10h 0.00d
catDir subopt | liftUp my.bed chunks.lft error stdin
sed 's/chunk/opossum/' my.bed > genscan.subopt.bed
catDir gtf | liftUp my.gtf chunks.lft error stdin
sed 's/chunk/opossum/g' my.gtf > genscan.gtf
catDir pep | sed 's/chunk/opossum/' > genscan.pep.fa
ldHgGene -gtf monDom5 genscan genscan.gtf
hgPepPred monDom5 generic genscanPep genscan.pep.fa
hgLoadBed monDom5 genscanSubopt genscan.subopt.bed
#########################################################################
# BIGZIPS/DOWNLOADS (DONE, 2009-06-08 Andy)
ssh hgwdev
mkdir /hive/data/genomes/monDom5/downloads
mkdir /hive/data/genomes/monDom5/downloads/bigZips
mkdir /hive/data/genomes/monDom5/downloads/chromosomes
cd /hive/data/genomes/monDom5/downloads/chromosomes
cp /usr/local/apache/htdocs/goldenPath/monDom4/chromosomes/README.txt .
# edit that readme to provide correct references and details
ln -s `pwd` /usr/local/apache/htdocs/goldenPath/monDom5/chromosomes
cd /hive/data/genomes/monDom5/downloads/chromosomes
for c in `cut -f1 ../../chrom.sizes`; do
echo $c;
twoBitToFa ../../monDom5.unmasked.2bit:$c stdout \
| gzip -c > $c.fa.gz;
done
md5sum *.fa.gz R*.txt > md5sum.txt
cd ../../
for c in `cut -f1 chrom.sizes`; do
echo ${c#chr};
pushd ${c#chr};
cp ../downloads/chromosomes/${c}.fa.gz .
gunzip ${c}.fa.gz;
popd;
done
tar cvzf downloads/bigZips/chromFa.tar.gz ?/chr*.fa Un/chrUn.fa
tar cvzf downloads/bigZips/chromOut.tar.gz ?/chr*.fa.out Un/chrUn.fa.out
for c in `cut -f1 chrom.sizes`; do echo $c; twoBitToFa monDom5.2bit:$c ${c#chr}/${c}.fa.masked; done
tar cvzf downloads/bigZips/chromFaMasked.tar.gz ?/chr*.fa.masked \
Un/chrUn.fa.masked
#real 17m19.439s
cd bed/simpleRepeat/
mkdir trfMask
cd trfMask
ln -s ../trfMaskChrom/* .
cd ../
tar -hcvzf chromTrf.tar.gz ./trfMask
mv chromTrf.tar.gz ../../downloads/bigZips/
cd ../../downloads/bigZips/
cp ../../broad/Monodelphis5.0.agp .
gzip Monodelphis5.0.agp
cp /usr/local/apache/htdocs/goldenPath/monDom4/bigZips/README.txt .
# [edit]
ln -s /hive/data/genomes/monDom5/downloads/bigZips \
/usr/local/apache/htdocs/goldenPath/monDom5/bigZips
cd /usr/local/apache/htdocs/goldenPath/monDom5/bigZips
ln -s /hive/data/genomes/monDom5/monDom5.2bit .
md5sum *.gz *.2bit README.txt > md5sum.txt
cd ../chromosomes/
md5sum *.gz R*.txt > md5sum.txt
cd ../
mkdir database
cd database/
cp ../../monDom4/database/README.txt .
# [edit README.txt]
#########################################################################
#nscanGene (2009-06-23 markd)
# nscanGene track from WUSTL
cd /hive/data/genomes/monDom5/bed/nscan
wget http://mblab.wustl.edu/predictions/possum/monDom5/README
wget http://mblab.wustl.edu/predictions/possum/monDom5/monDom5.gtf
wget -r -np -e robots=off -l 1 http://mblab.wustl.edu/predictions/possum/monDom5/chr_ptx/
nice bzip2 chr_ptx/* monDom5.gtf
# load track
gtfToGenePred -genePredExt monDom5.gtf.bz2 stdout| hgLoadGenePred -genePredExt monDom5 nscanGene stdin
bzcat chr_ptx/*.fa.bz2 | hgPepPred monDom5 generic nscanPep stdin
rm *.tab
# validate same number of transcripts and peptides are loaded
hgsql -Ne 'select count(*) from nscanGene' monDom5
hgsql -Ne 'select count(*) from nscanPep' monDom5
# validate search expression
hgc-sql -Ne 'select name from nscanGene' monDom5 | egrep -v -e '^chr[0-9a-zA-Z_]+\.[0-9]+\.[0-9]+(\.[0-9a-z]+)?$' |wc -l
#########################################################################
# lastz Horse equCab2 (DONE - 2009-06-29,07-02 - Hiram)
mkdir /hive/data/genomes/monDom5/bed/lastzEquCab2.2009-06-29
cd /hive/data/genomes/monDom5/bed/lastzEquCab2.2009-06-29
cat << '_EOF_' > DEF
# opossum vs. Horse
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Opossum (monDom5)
SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit
SEQ1_LEN=/scratch/data/monDom5/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse equCab2
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/scratch/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/monDom5/bed/lastzEquCab2.2009-06-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-workhorse=hgwdev \
-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
# real 1353m4.161s
# had some trouble with the first kluster run due to various factors
# not the least of which was a major power outage
time doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-continue=cat -workhorse=hgwdev \
-chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 &
# real 957m25.164s
cat fb.monDom5.chainEquCab2Link.txt
# 355004426 bases of 3501660299 (10.138%) in intersection
mkdir /hive/data/genomes/equCab2/bed/blastz.monDom5.swap
cd /hive/data/genomes/equCab2/bed/blastz.monDom5.swap
time doBlastzChainNet.pl \
/hive/data/genomes/monDom5/bed/lastzEquCab2.2009-06-29/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-swap -workhorse=hgwdev \
-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
# real 320m3.573s
cat fb.equCab2.chainMonDom5Link.txt
# 351787662 bases of 2428790173 (14.484%) in intersection
#########################################################################
# Set default position to OPN1LW gene 2009-07-20, Brooke
hgsql -e \
"update dbDb set defaultPos='chrX:14658820-14669558' where name='monDom5'" \
hgcentraltest
#########################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
###########################################################################
# ALIGNMENTS/CHAINS/NETS (DONE Dec 2008, Andy)
#
# To be honest I didn't really concentrate on getting the whole enchilada of
# make-notes into record, because the whole process is so robotic.
#
# I'll start with the DEF files. These have varying parameters based on
# the query species. So here's the various DEF parameters:
#
DB H Y L K T M A_R* Q 1CHUNK 1LAP 2CHUNK 2LAP 2LIMIT
bosTau4 - 3400 6000 2200 - - - HoxD55 1M 10K 20M 0 100
canFam2 2000 3400 10000 2200 - 50 0 HoxD55 1M 10K 30M 0 -
danRer5 2000 3400 6000 2200 - - - HoxD55 5M 10K 10M 0 -
galGal3 2000 3400 10000 2200 - - - HoxD55 5M 10K 20M 0 -
hg18 2000 3400 10000 2200 - 50 0 HoxD55 5M 10K 10M 0 -
macEug1 - 3400 - - 2 10 - - 10M 320K 500K 0 100
mm9 - 3400 6000 2200 - - - HoxD55 5M 10K 20M 0 -
ornAna1 2000 3400 6000 2200 - 50 - HoxD55 5M 10K 20M 0 300
panTro2 2000 3400 10000 2200 - 50 0 HoxD55 1M 10K 30M 0 -
ponAbe2 2000 3400 10000 2200 - 50 0 HoxD55 1M 10K 30M 0 -
rheMac2 2000 3400 10000 2200 - 50 0 HoxD55 1M 10K 30M 0 -
rn4 - 3400 6000 2200 - - - HoxD55 2M 10K 10M 0 -
rn5 - 3400 6000 2200 - - - - 5M 10K 20M 0 -
xenTro2 2000 3400 8000 2200 - 50 - HoxD55 5M 10K 20M 0 100
* BLASTZ_ABRIDGE_REPEATS
# In most of those cases I was looking to the DEF variables in the monDom4 version
# of the alignment. In the case of macEug1, the variable settings were given by
# Webb Miller.
#
# After all the DEF files are created, it's time to run doBlastzChainNet.pl
# In the case of monDom5, they all have the profile of this:
cd /hive/data/genomes/monDom5/bed
mkdir blastz.otherDb
cd blastz.otherDb
screen -S otherDb_monDom5
doBlastzChainNet.pl -bigClusterHub swarm -stop cat DEF >& doUntilCat.log
#[detach screen, come back when it's done]
# The reason I'd stop after the cat step is that I was having better results
# using swarm for chaining instead of memk. At the time, memk only had a dozen
# nodes, and usually only 8 would be online. Because of the large chromosomes,
# the chaining step would be bottlenecked by the hippo jobs on the big chroms.
# It was going much more quickly when the cluster could accommodate more
# concurrent jobs. On memk the jobs were allocated 8GB of RAM and they normally
# only get 2GB on swarm, so in case that mattered, I did this
#[re-attach screen]
doBlastzChainNet.pl -bigClusterHub swarm -smallClusterHub swarm -minChainScore 3000 DEF >& doAfterCat.log
#[detach screen]
ssh swarm
cd /hive/data/genomes/monDom5/bed/blastz.otherDb/axtChain/run
para check
# check to see the jobs have started, once they have:
para stop; para resetCounts; para -ram=8g -cpu=4 create jobList; para push
# check to see things are all fine, and log out of swarm
# after that all should have completed fine. Stopping the cluster run manually
# while the doBlastzChainNet.pl script is running doesn't typically confuse it.
# I suppose it's possible that the script while check on the run in the small
# period while it's stopped, but it didn't happen to me.
#
# In retrospect I think the memk trick is possibly avoidable now that there are
# more memk nodes. However the oppossum is a particularly difficult species to
# chain, so I'm just mentioning it anyway. Later when doing alignments on hg19,
# memk was up to the task just fine.
#
# Also of note is the inconsistency of DEF parameter settings. When monDom6
# happens, someone will probably look here to see what was done with monDom5.
# If I have any advice it's to take an approach like the one with hg19 and
# use consistent parameters as much as possible and set them according to
# different tiers of evolutionary distance.
####################################################################
# RELOAD CHAINS/NETS AS NON-SPLIT (2009-06-09, Andy)
for d in blastz.*; do
if [ $d != "blastz.bosTau4" ]; then
db=${d#blastz.};
Db=`echo $db | sed 's/^./\u&/'`;
echo Loading $db chains into monDom5...;
time nice -n +19 hgLoadChain -tIndex monDom5 chain$Db \
blastz.$db/axtChain/monDom5.$db.all.chain.gz;
fi;
done >& unsplit/chainReloads.log
# problem with macEug1
cd blastz.macEug1/axtChain/
time nice -n +19 hgLoadChain -tIndex monDom5 chainMacEug1 monDom5.macEug1.all.chain.gz
#Loading 19668859 chains into monDom5.chainMacEug1
#Can't start query:
#load data local infile 'link.tab' into table chainMacEug1Link
#
#mySQL error 1114: The table 'chainMacEug1Link' is full
#real 146m54.273s
#
wc -l link.tab
#200440062 link.tab
#19668859 chain.tab
randomLines link.tab 10000000 stdout | awk '{print length($0)}' | sort | uniq -c
randomLines chain.tab 1000000 stdout | awk '{print length($0)}' | sort | uniq -c
# 92 chain, 42 link
sed "s/hgLoadChain.*/hgsqldump monDom5 chainRn4Link --no-data --skip-comments\n\
| sed \'s\/Rn4\/MacEug1\/; s\/TYPE=MyISAM\/ENGINE=MyISAM max_rows=201000000\n\
avg_row_length=42 pack_keys=1 CHARSET=latin1\/\' | hgsql monDom5 \n\
/" loadUp.csh > manualLoadUp.csh
hgsqldump monDom5 chainRn4 --no-data --skip-comments | sed \'s\/Rn4\/MacEug1\/; s\/TYPE=MyISAM\/ENGINE=MyISAM max_rows=20000000 avg_row_length=92 pack_keys=1 CHARSET=latin1\/\' | hgsql monDom5 \n\
hgsql monDom5 -e \"load data local infile \'chain.tab\' into table chainMacEug1\"\n\
hgsql monDom5 -e \"load data local infile \'link.tab\' into table chainMacEug1Link\"\n\
hgsql monDom5 -e \"INSERT into history (ix, startId, endId, who, what, modTime, errata) VALUES(NULL,0,0,\'aamp\',\'Loaded 19668859 chains into macEug1 chain table manually\', NOW(), NULL)\"\
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
############################################################################
ssh hgwdev
mkdir /hive/data/genomes/monDom5/bed/multiz9way
cd /hive/data/genomes/monDom5/bed/multiz9way
## Build the tree by pruning out the 8 species from the recent 44way,
## changing monDom4 to monDom5, and adding macEug1
## (Wallaby placement deduced from (Margulies et al.,PNAS 2004) ).
/cluster/bin/phast/tree_doctor \
--prune-all-but=hg18,mm9,canFam2,ornAna1,galGal3,xenTro2,danRer5,monDom4 \
/hive/data/genomes/hg18/bed/multiz44way/44way.nh | \
sed 's/:0\.[[:digit:]]\+/ /g; s/,//g; s/;//; s/\ )/)/g; s/monDom4/(monDom5 macEug1)/' \
> tree.nh
sed 's/(//g; s/)//g' tree.nh > species.lst
## Make per-chrom maf links
mkdir mafLinks
cd mafLinks
bash
for db in hg18 mm9 canFam2 macEug1 ornAna1 galGal3 xenTro2 danRer5; do
mkdir $db; pushd $db;
if [ -e /hive/data/genomes/monDom5/bed/blastz.${db}/mafSynNet ]; then
ln -s /hive/data/genomes/monDom5/bed/blastz.${db}/mafSynNet/* .
else
ln -s /hive/data/genomes/monDom5/bed/blastz.${db}/mafNet/* .
fi
popd
done
cd ../
## Split mafs up
mkdir mafSplit
mafSplitPos -minGap=50000 monDom5 10 mafSplit.bed
for db in `ls -1 mafLinks`; do
mkdir mafSplit/$db
pushd mafSplit/$db
mafSplit ../../mafSplit.bed monDom5_ ../../mafLinks/${db}/chr*.maf.gz \
-verbose=2
popd
done
## Make file list for cluster run
cd mafSplit/canFam2/
find . -type f | sort -u | sed -e "s#./##" > ../../9-way.split.lst
## Double-check the file sets are all the same in each database subdir
## Cluster run on split mafs
ssh swarm
cd /hive/data/genomes/monDom5/bed/multiz9way
mkdir -p multizRun/run/penn multizRun/maf
cd multizRun/run/
cp -p /cluster/bin/penn/multiz.2008-11-25/{multiz,maf_project,autoMZ} penn/
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = monDom5
set c = $1
set result = $2
set run = `pwd`
set tmp = $run/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/monDom5/bed/multiz9way/mafSplit
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.lst $tmp
pushd $tmp
foreach s (`sed -e "s/ $db//" species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
/bin/zcat $in.gz > $out
else if (-e $in) then
ln -s $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
/bin/rm -f $result
/bin/cp -p $tmp/$c.maf $result
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db
/bin/rmdir --ignore-fail-on-non-empty $run/tmp
'_EOF_'
# << emacs
chmod +x autoMultiz.csh
cat << '_EOF' > gsub
#LOOP
./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/monDom5/bed/multiz9way/multizRun/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << emacs
para -ram=4g -cpu=2 create jobList
para push
para time
#Completed: 332 of 332 jobs
#CPU time in finished jobs: 17639s 293.98m 4.90h 0.20d 0.001 y
#IO & Wait Time: 46958s 782.64m 13.04h 0.54d 0.001 y
#Average job time: 195s 3.24m 0.05h 0.00d
#Longest finished job: 1693s 28.22m 0.47h 0.02d
#Submission to last job: 1706s 28.43m 0.47h 0.02d
## Sew up the mafs into one file
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/multizRun
mkdir ../maf
ls -1 maf | sed 's/monDom5_//; s/\..*//' | sort -u | while read chrom; do
head -q -n 1 maf/monDom5_${chrom}.*.maf | sort -u > ../maf/${chrom}.maf
grep -h "^#" maf/monDom5_${chrom}.*.maf | egrep -v "maf version=1|eof maf" | sed -e "s#${chrom}.[0-9][0-9]*#${chrom}#g; s#_MZ_[^ ]* # #g;" | sort -u >> ../maf/${chrom}.maf
grep -h -v "^#" `ls maf/monDom5_${chrom}.*.maf | sort -t. -k2,2n` >> ../maf/${chrom}.maf
tail -q -n 1 maf/monDom5_${chrom}.*.maf | sort -u >> ../maf/${chrom}.maf
echo done with $chrom
done
cd ../
rm -rf multizRun/maf/ mafSplit/
## gap annotation
mkdir -p anno/{maf,run}
cd anno/
for db in `cat ../species.lst`; do
dbDir="/hive/data/genomes/${db}"
if [ ! -f ${dbDir}/${db}.N.bed ]; then
echo "creating ${db}.N.bed"
twoBitInfo -nBed ${dbDir}/${db}.2bit ${dbDir}/${db}.N.bed
else
ls -og ${dbDir}/${db}.N.bed
fi
done
cd run/
for db in `sed -e "s/monDom5 //" ../../species.lst`; do
echo "${db} "
ln -s /hive/data/genomes/${db}/${db}.N.bed ${db}.bed
echo ${db}.bed >> nBeds
ln -s /hive/data/genomes/${db}/chrom.sizes ${db}.len
echo ${db}.len >> sizes
done
ssh hgwdevnew
cd /hive/data/genomes/monDom5/bed/multiz9way/run
for c in `ls -1 ../../maf | sed -e "s/.maf//"`; do
echo starting $c
time nice mafAddIRows -nBeds=nBeds ../../maf/${c}.maf /hive/data/genomes/monDom5/monDom5.2bit ../maf/${c}.maf
done
exit # hgwdevnew
cd ../../
## quality information (only available for 6 of 9 species including monDom5)
mkdir -p quals/{maf,qa}
cd quals/qa/
ln -s /hive/data/genomes/monDom5/monDom5.{qac,qdx} .
ln -s /hive/data/genomes/hg18/bed/multiz44way/quals/allInOnePlace/{galGal3,canFam2,ornAna1}.* .
qaToQac /hive/data/genomes/rn5/baylor/Rnor4.chroms.fa.qual.gz \
rn5.noIdx.qac
qacAddGapIdx /hive/data/genomes/rn5/baylor/Rnor4.scaffold_chr.agp \
rn5.noIdx.qac rn5.qac rn5.qdx
rm rn5.noIdx.qac
qaToQac /hive/data/genomes/macEug1/baylor/macEug1.contigs.onlyInAgp.qa.gz \
macEug1.noIdx.qac
qacAddGapIdx /hive/data/genomes/macEug1/baylor/macEug1.agp macEug1.noIdx.qac \
macEug1.qac macEug1.qdx
for db in canFam2 galGal3 macEug1 monDom5 ornAna1 rn5; do
echo $db | awk 'BEGIN{OFS="\t"}{print $1, "/hive/data/genomes/monDom5/bed/multiz9way/quals/qa";}';
done > quals.lst
cat << '_EOF_' > gsub
#LOOP
mafAddQRows quals.lst $(path1) {check out line+ maf/$(file1) }
#ENDLOOP
'_EOF_'
# << emacs
ssh swarm
cd /hive/data/genomes/monDom5/bed/multiz9way/quals
mkdir inMaf
cd inMaf/
ln -s ../../anno/maf/* .
cd ../
ls -1 inMaf/* > in.lst
gensub2 in.lst single gsub jobList
para -ram=8g -cpu=4 create jobList
para push
para time
#Completed: 11 of 11 jobs
#CPU time in finished jobs: 5419s 90.32m 1.51h 0.06d 0.000 y
#IO & Wait Time: 19745s 329.08m 5.48h 0.23d 0.001 y
#Average job time: 2288s 38.13m 0.64h 0.03d
#Longest finished job: 5196s 86.60m 1.44h 0.06d
#Submission to last job: 5206s 86.77m 1.45h 0.06d
cd ../
## Gene frames
mkdir -p genes/{genePreds,inMaf,chrFrames}
cd genes/genePreds/
## human/mouse use known genes
for db in hg18 mm9; do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${db} | \
genePredSingleCover stdin stdout | gzip -c > ${db}.gp.gz
done
## monDom5/canFam2/ornAna1/galGal3/xenTro2/danRer5 use ensGene
for db in monDom5 canFam2 ornAna1 galGal3 xenTro2 danRer5; do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" $db | \
genePredSingleCover stdin stdout | gzip -c > ${db}.gp.gz
done
cd ../inMaf/
ln -s ../../quals/maf/* .
cd ../
ls -1 inMaf/ | sed 's/.maf//' > chr.lst
ls -1 genePreds/ | sed 's/.gp.gz//' > gp.lst
cat << '_EOF_' > mafFrames.csh
#!/bin/csh -fe
set c = $1
set g = $2
cat inMaf/${c}.maf | genePredToMafFrames monDom5 stdin stdout \
${g} genePreds/${g}.gp.gz | gzip -c > chrFrames/${c}.${g}.mafFrames.gz
'_EOF_'
cat << '_EOF_' > gsub
#LOOP
./mafFrames.csh $(root1) $(root2) {check out exists+ chrFrames/$(root1).$(root2).mafFrames.gz}
#ENDLOOP
'_EOF_'
ssh swarm
cd /hive/data/genomes/monDom5/bed/multiz9way/genes
gensub2 chr.lst gp.lst gsub jobList
para -ram=8g -cpu=4 create jobList
para push
para time
#Completed: 88 of 88 jobs
#CPU time in finished jobs: 1610s 26.83m 0.45h 0.02d 0.000 y
##IO & Wait Time: 3101s 51.69m 0.86h 0.04d 0.000 y
#Average job time: 54s 0.89m 0.01h 0.00d
#Longest finished job: 95s 1.58m 0.03h 0.00d
#Submission to last job: 211s 3.52m 0.06h 0.00d
exit # [exit swarm]
find chrFrames -type f | while read frameFile; do
zcat ${frameFile}
done | sort -k1,1 -k2,2n > multiz9wayFrames.bed
hgLoadMafFrames monDom5 multiz9wayFrames{,.bed}
## make downloads
mkdir -p download/maf
cd download/maf
cp -p ../../quals/maf/chr*.maf .
gzip --rsyncable *.maf
md5sum *.gz > md5sum.txt
mkdir -p /usr/local/apache/htdocs/goldenPath/monDom5/multiz9way/maf
ln -s `pwd -P`/* /usr/local/apache/htdocs/goldenPath/monDom5/multiz9way/maf
cd ../../
## load mafs into database
cd anno/maf
mkdir -p /gbdb/monDom5/multiz9way/maf
ln -s `pwd -P`/*.maf /gbdb/monDom5/multiz9way/maf
pushd /data/tmp
nice -n +19 hgLoadMaf -pathPrefix=/gbdb/monDom5/multiz9way/maf monDom5 multiz9way
nice -n +19 cat /gbdb/monDom5/multiz9way/maf/*.maf | hgLoadMafSummary monDom5 multiz9waySummary stdin
popd
cd ../../
## phastCons
## 1. collect all mafs with all nine species in components.
mkdir cons
cd cons/
for chrom in `ls -1 ../initialMafs/`; do
for db in `cat ../species.lst`; do
maf=../initialMafs/$chrom
if [ -e withAllNine.$chrom ]; then
maf=withAllNine.$chrom
fi
mafFilter -needComp=$db $maf > tmp.maf
mv tmp.maf withAllNine.$chrom
done
echo Done with withAllNine.$chrom
done
## 2. split into 10 Mbase pieces and generate .ss files
mkdir msa.split ss
cd msa.split/
ln -s ../../../../monDom5.2bit
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set c = $1
set maf = /hive/data/genomes/monDom5/bed/multiz9way/cons/maf/withAllNine.${c}.maf
set chromDir = /hive/data/genomes/monDom5/bed/multiz9way/cons/ss/$c
rm -fr $chromDir
mkdir $chromDir
pushd $chromDir > /dev/null
twoBitToFa -seq=$c monDom5.2bit monDom5.${c}.fa
msa_split $maf -i MAF \
-M monDom5.${c}.fa -o SS -r $chromDir/$c -w 20000000,0 -I 100 -B 5000
rm -f monDom5.${c}.fa
popd > /dev/null
date >> ${c}.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
./doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
# << happy emacs
ssh swarm
cd /hive/data/genomes/monDom5/bed/multiz9way/cons/msa.split
ls -1 ../maf/ | sed 's/withAllNine.//; s/.maf//' > chr.lst
gensub2 chr.lst single gsub jobList
para -ram=8g -cpu=4 create jobList
para push
para time
#Completed: 11 of 11 jobs
#CPU time in finished jobs: 805s 13.42m 0.22h 0.01d 0.000 y
#IO & Wait Time: 367s 6.11m 0.10h 0.00d 0.000 y
#Average job time: 107s 1.78m 0.03h 0.00d
#Longest finished job: 211s 3.52m 0.06h 0.00d
#Submission to last job: 220s 3.67m 0.06h 0.00d
cd ../
## another run, full chroms
mkdir ss.chrom msa_view.run
cd msa_view.run
ln -s ../../../../monDom5.2bit
cat << '_EOF_' > doChrom.csh
#!/bin/csh -ef
set c = $1
twoBitToFa -seq=$c monDom5.2bit ${c}.fa
msa_view ../maf/withAllNine.${c}.maf -i MAF -M ${c}.fa -o SS > ../ss.chrom/${c}.ss
rm ${c}.fa
'_EOF_'
# << emacs
cat << '_EOF_' > gsub
#LOOP
./doChrom.csh $(root1) {check out line+ ../ss.chrom/$(root1).ss }
#ENDLOOP
'_EOF_'
ls -1 ../maf/ | sed 's/withAllNine.//; s/.maf//' > chr.lst
gensub2 chr.lst single gsub jobList
para -ram=8g -cpu=4 create jobList
para push
para time
#Completed: 11 of 11 jobs
#CPU time in finished jobs: 769s 12.81m 0.21h 0.01d 0.000 y
#IO & Wait Time: 426s 7.10m 0.12h 0.00d 0.000 y
#Average job time: 109s 1.81m 0.03h 0.00d
#Longest finished job: 211s 3.52m 0.06h 0.00d
#Submission to last job: 217s 3.62m 0.06h 0.00d
mkdir phyloFit.run
cd phyloFit.run/
ls -1 ../ss.chrom/* > chr.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/phast/x86_64/phyloFit --msa-format SS --tree start.tree $(path1) { check out line+ ../trees/$(root1).mod }
#ENDLOOP
'_EOF_'
gensub2 chr.lst single gsub jobList
para -ram=8g -cpu=4 create jobList
para push
para time
cd ../trees/
phyloBoot --read-mods chr{1,2,3,4,5,6,7,8}.mod --output-average chroms1-8.mod
######## Let's back up a minute
exit # back to hgwdev
mkdir msa_split.chr1
cd msa_split.chr1/
twoBitToFa ../../../../monDom5.2bit:chr1 chr1.fa
time msa_split ../maf/withAllNine.chr1.maf --refseq chr1.fa --in-format MAF \
--windows 100000000,1000 --out-format SS \
--between-blocks 5000 --out-root ssChr1
ssh swarm
cd /hive/data/genomes/monDom5/bed/multiz9way/cons/phastCons.estimateTrees.run
ls -1 ../msa_split.chr1/* > ss.lst
grep BACKGROUND ../trees/chroms1-8.mod | awk '{printf "%0.3f\n", $3 + $4;}'
#0.383
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/phast/x86_64/phastCons --gc 0.383 --target-coverage 0.17 --expected-length 12 --estimate-trees --no-post-probs trees/$(root1) $(path1) ../trees/chroms1-8.mod
#ENDLOOP
'_EOF_'
gensub2 ss.lst single gsub jobList
para -ram=8g -cpu=4 create jobList
para push
para time
#Completed: 8 of 8 jobs
#CPU time in finished jobs: 51655s 860.91m 14.35h 0.60d 0.002 y
#IO & Wait Time: 950s 15.84m 0.26h 0.01d 0.000 y
#Average job time: 6576s 109.59m 1.83h 0.08d
#Longest finished job: 8998s 149.97m 2.50h 0.10d
#Submission to last job: 9217s 153.62m 2.56h 0.11d
## OK now average these
cd ../
ls trees/*.cons.mod > cons.txt
phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod
ls trees/*.noncons.mod > noncons.txt
phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod
## Now break the big mafs into .ss files
cd ../
mkdir msa_split.bigMafs ss.splits
cd msa_split.bigMafs/
ls -1 ../../initialMafs/* > mafs.lst
cat << '_EOF_' > msa.sh
#!/bin/bash
chr=`basename $1 .maf`
mkdir -p $chr
twoBitToFa ../../../../monDom5.2bit:$chr ${chr}.fa
/cluster/bin/phast/x86_64/msa_split $1 -M ${chr}.fa -i MAF -o SS -w 10000000,1000 -B 5000 -r ${chr}/ss
rm ${chr}.fa
'_EOF_'
chmod +x msa.sh
cat << '_EOF_' > gsub
#LOOP
./msa.sh $(path1) { check exists $(root1) }
#ENDLOOP
'_EOF_'
ssh swarm
cd /hive/data/genomes/monDom5/bed/multiz9way/cons/msa_split.bigMafs
gensub2 mafs.lst single gsub spec
para create spec
para push
para time
## Run phastCons on the many .ss files
mkdir ../phast.run
cd ../phast.run/
find ../msa_split.bigMafs/ -type f -name '*.ss' > ss.lst
ln -s ../phastCons.estimateTrees.run/*.mod .
cat << '_EOF_' > phast.sh
#!/bin/bash
ss=$1
wig=$2
bed=$3
mkdir -p `dirname $wig`
mkdir -p `dirname $bed`
/cluster/bin/phast/x86_64/phastCons --target-coverage 0.17 --expected-length 12 --most-conserved $bed --score $ss ave.cons.mod,ave.noncons.mod > $wig
'_EOF_'
chmod +x phast.sh
cat << '_EOF_' > gsub
#LOOP
./phast.sh $(path1) wig/$(lastDir1)/$(root1).wig bed/$(lastDir1)/$(root1).bed
#ENDLOOP
'_EOF_'
gensub2 ss.lst single gsub spec
para create spec
para push
para time
#Completed: 367 of 367 jobs
#CPU time in finished jobs: 9131s 152.18m 2.54h 0.11d 0.000 y
#IO & Wait Time: 6169s 102.82m 1.71h 0.07d 0.000 y
#Average job time: 42s 0.69m 0.01h 0.00d
#Longest finished job: 70s 1.17m 0.02h 0.00d
#Submission to last job: 343s 5.72m 0.10h 0.00d
## Stitch up wig files, make .wibs, load in DB
############################################################################
# susScr1 Pig BLASTZ/CHAIN/NET (DONE - 2010-01-21,22 - Hiram)
screen # use a screen to manage this multi-day job
mkdir /hive/data/genomes/monDom5/bed/lastzSusScr1.2010-01-21
cd /hive/data/genomes/monDom5/bed/lastzSusScr1.2010-01-21
cat << '_EOF_' > DEF
# Pig vs. Opossum
BLASTZ_M=50
# TARGET: Opossum MonDom5
SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit
SEQ1_LEN=/scratch/data/monDom5/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Pig SusScr1
SEQ2_DIR=/scratch/data/susScr1/susScr1.2bit
SEQ2_LEN=/scratch/data/susScr1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/monDom5/bed/lastzSusScr1.2010-01-21
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
# real 992m51.260s
cat fb.monDom5.chainSusScr1Link.txt
# 179898391 bases of 3501660299 (5.138%) in intersection
mkdir /hive/data/genomes/susScr1/bed/blastz.monDom5.swap
cd /hive/data/genomes/susScr1/bed/blastz.monDom5.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/monDom5/bed/lastzSusScr1.2010-01-21/DEF \
-swap -noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
# real 97m35.156s
cat fb.susScr1.chainMonDom5Link.txt
# 182834626 bases of 2231332019 (8.194%) in intersection
#########################################################################
-# ailMel1 Panda BLASTZ/CHAIN/NET (DONE - 2010-01-22 - Hiram)
+# ailMel1 Panda BLASTZ/CHAIN/NET (DONE - 2010-02-04 - Hiram)
screen # use a screen to manage this multi-day job
- mkdir /hive/data/genomes/monDom5/bed/lastzAilMel1.2010-01-22
- cd /hive/data/genomes/monDom5/bed/lastzAilMel1.2010-01-22
+ mkdir /hive/data/genomes/monDom5/bed/lastzAilMel1.2010-02-04
+ cd /hive/data/genomes/monDom5/bed/lastzAilMel1.2010-02-04
cat << '_EOF_' > DEF
# Panda vs. Opossum
BLASTZ_M=50
# TARGET: Opossum MonDom5
SEQ1_DIR=/scratch/data/monDom5/monDom5.2bit
SEQ1_LEN=/scratch/data/monDom5/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Panda AilMel1
SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit
SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
-BASE=/hive/data/genomes/monDom5/bed/lastzAilMel1.2010-01-22
+BASE=/hive/data/genomes/monDom5/bed/lastzAilMel1.2010-02-04
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
- # real 293m25.573s
+ # real 492m43.088s
cat fb.monDom5.chainAilMel1Link.txt
- # 223233047 bases of 3501660299 (6.375%) in intersection
+ # 223510659 bases of 3501660299 (6.383%) in intersection
mkdir /hive/data/genomes/ailMel1/bed/blastz.monDom5.swap
cd /hive/data/genomes/ailMel1/bed/blastz.monDom5.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
- /hive/data/genomes/monDom5/bed/lastzAilMel1.2010-01-22/DEF \
+ /hive/data/genomes/monDom5/bed/lastzAilMel1.2010-02-04/DEF \
-swap -noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
- # real 65m0.711s
+ # real 69m35.464s
cat fb.ailMel1.chainMonDom5Link.txt
- # 210235815 bases of 2225124764 (9.448%) in intersection
+ # 211209682 bases of 2245312831 (9.407%) in intersection
#########################################################################