src/hg/makeDb/doc/monDom5.txt 1.8
1.8 2009/06/08 23:35:31 aamp
big zips/downloads
Index: src/hg/makeDb/doc/monDom5.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/monDom5.txt,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 1000000 -r1.7 -r1.8
--- src/hg/makeDb/doc/monDom5.txt 7 Feb 2009 00:33:51 -0000 1.7
+++ src/hg/makeDb/doc/monDom5.txt 8 Jun 2009 23:35:31 -0000 1.8
@@ -1,708 +1,759 @@
# for emacs: -*- mode: sh; -*-
# Creating the assembly for Monodelphis domestica
# South American, Short-tailed Opossum
# http://www.genome.gov/11510687
# http://www.genome.gov/12512285
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+---------------------------------+
# | tableName | type |
# +-------------+---------------------------------+
# | refGene | genePred refPep refMrna |
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | ensGene | genePred ensPep |
# | genscan | genePred genscanPep |
# +-------------+---------------------------------+
#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2007-05-11 - Hiram)
ssh kkstore04
mkdir -p /cluster/store8/monDom5/broad
ln -s /cluster/store8/monDom5 /cluster/data/monDom5
cd /cluster/data/monDom4/broad.mit.edu
time nice -n +19 wget --timestamping \
'ftp://broad.mit.edu/pub/assemblies/mammals/monodelphis/monDom5/*'
# real 52m48.859s
# user 0m2.122s
# sys 0m22.031s
# This was a symlink to ../monDom2/
# lrwxrwxrwx 1 36 May 10 16:00 BACread_2_BACclone.txt.gz -> ../monDom2/BACread_2_BACclone.txt.gz
# so actually fetch the file here
rm BACread_2_BACclone.txt.gz
time nice -n +19 wget --timestamping \
'ftp://broad.mit.edu/pub/assemblies/mammals/monodelphis/monDom2/BACread_2_BACclone.txt.gz'
# it looks like they've been using our tools, their agp file has
# the extra gap at the end of chrUn:
# chrUn 103240612 103241611 16995 N 1000 clone no
# Their fasta file is in 5,000,000 chunks
# fixup the split fasta files
time nice -n +19 gunzip Monodelphis5.0.agp.chromosome.qual.gz \
Monodelphis5.0.agp.chromosome.fasta.gz
# real 5m24.942s
# user 1m31.895s
# sys 0m51.726s
mkdir splitFa
time nice -n +19 faSplit -verbose=2 byname \
Monodelphis5.0.agp.chromosome.fasta splitFa/
# 2m44s
mkdir chrFa
# combine the Broad split files into single chrom fasta files
time for C in 1 2 3 4 5 6 7 8 X Un
do
rm -f chrFa/chr${C}.fa
echo ">chr${C}" > chrFa/chr${C}.fa
echo -n "chrFa/chr${C}.fa working ... "
ls splitFa/${C}.*-*.fa | sort -t"." -k2,2n | while read F
do
grep -v "^>" ${F} >> chrFa/chr${C}.fa
done
echo "done"
done
# verify nothing was lost, should be the same totals here
faSize chrFa/chr*.fa
# 3605614649 bases (103971429 N's 3501643220 real 3501643220 upper 0
# lower) in 10 sequences in 10 files
faSize Monodelphis5.0.agp.chromosome.fasta
# 3605614649 bases (103971429 N's 3501643220 real 3501643220 upper 0
# lower) in 726 sequences in 1 files
# put them together into a single file:
cat chrFa/chr*.fa > ucscChroms.fa
# create a lift file from the information in the fasta headers
cat << '_EOF_' > liftBroadToChroms.pl
#!/usr/bin/env perl
use strict;
use warnings;
open (FH, 'grep "^>" Monodelphis5.0.agp.chromosome.fasta|') or
die "can not grep Monodelphis5.0.agp.chromosome.fasta";
my %liftSpec; # key is chrom_start, value is end
my %chrSize; # key is chrom, value is size
my @liftLines; # index is line number, value is line to output
$chrSize{'1'} = 0;
$chrSize{'2'} = 0;
$chrSize{'3'} = 0;
$chrSize{'4'} = 0;
$chrSize{'5'} = 0;
$chrSize{'6'} = 0;
$chrSize{'7'} = 0;
$chrSize{'8'} = 0;
$chrSize{'X'} = 0;
$chrSize{'Un'} = 0;
my $lineCount = 0;
my $prevChr = "";
my $chrStart = 0;
while (my $line = <FH>) {
my ($chr_pos, $name) = split('\s+',$line);
my ($chr, $range) = split('\.',$chr_pos);
$chr =~ s/>//;
if ($chr ne $prevChr) {
$chrStart = 0;
$prevChr = $chr;
print STDERR "chr$chr starting\n";
}
my ($start,$end) = split('-',$range);
my $key = sprintf("%s_%d", $chr, $start);
$liftSpec{$key} = $end;
$chrSize{$chr} = $end if ($end > $chrSize{$chr});
my $fragSize = $end - $start + 1;
$liftLines[$lineCount++] = sprintf "%d\t%s.%d-%d\t%d\tchr%s",
$chrStart, $chr, $start, $end, $fragSize, $chr;
$chrStart += $fragSize;
}
close (FH);
for (my $i = 0; $i < $lineCount; ++$i) {
my ($chrStart, $fragName, $fragSize, $chr) = split('\t',$liftLines[$i]);
$chr =~ s/chr//;
printf "%s\t%d\n", $liftLines[$i], $chrSize{$chr};
}
'_EOF_'
# << happy emacs
chmod +x liftBroadToChroms.pl
./liftBroadToChroms.pl liftBroad.lft
# split up the quality file to get it put together into a single
# chrom based file
cat << '_EOF_' > splitQual.pl
#!/usr/bin/env perl
use strict;
use warnings;
open (FH, "<Monodelphis5.0.agp.chromosome.qual") or
die "can not read Monodelphis5.0.agp.chromosome.qual";
# open an initial output file to get OUT established, not a real chr name
my $fileName = "splitQual/0.1.fa";
open (OUT,">$fileName") or die "can not write to $fileName";
while (my $line = <FH>) {
if ($line =~ m/^>/) {
close(OUT);
$line =~ s/>//;
$line =~ s/-.*//;
$fileName = "splitQual/$line";
open (OUT,">$fileName") or die "can not write to $fileName";
printf STDERR "writing to $fileName";
} else {
print OUT $line;
}
}
close (FH);
close (OUT);
'_EOF_'
# << happy emacs
chmod +x splitQual.pl
mkdir splitQual
./splitQual.pl
# put them back together in order as full chroms
for C in 1 2 3 4 5 6 7 8 X Un
do
echo ">chr${C}" > chrQual/chr${C}.qual.fa
ls splitQual/${C}.* | sort -t'.' -k2,2n | while read F
do
cat $F >> chrQual/chr${C}.qual.fa
done
done
# real 6m7.079s
# and as a single file
for C in 1 2 3 4 5 6 7 8 X Un
do
cat chrQual/chr${C}.qual.fa
done | gzip -c > ucscChroms.qual.fa.gz
# real 4m41.229s
# and turn it into a qac file
qaToQac ucscChroms.qual.fa.gz ucscChroms.qac
# real 3m49.380s
#########################################################################
# create genome assembly database (DONE - 2008-11-25 - Hiram)
cd /hive/data/genomes/monDom5/
cat << '_EOF_' > monDom5.config.ra
# Config parameters for makeGenomeDb.pl:
db monDom5
clade mammal
scientificName Monodelphis domestica
commonName Opossum
assemblyDate Oct. 2006
assemblyLabel Broad Institute monDom5 (NCBI project 12561, accession AAFR03000000)
orderKey 354
mitoAcc NC_006299
fastaFiles /hive/data/genomes/monDom5/broad/ucscChroms.fa
agpFiles /hive/data/genomes/monDom5/broad/Monodelphis5.0.agp
qualFiles /hive/data/genomes/monDom5/broad/ucscChroms.qac
dbDbSpeciesDir opossum
'_EOF_'
# << happy emacs
time makeGenomeDb.pl -verbose=2 -stop=seq monDom5.config.ra > seq.log 2>&1
# real 3m3.414s
time makeGenomeDb.pl -verbose=2 -continue=agp -stop=agp monDom5.config.ra \
> agp.log 2>&1
# real 0m36.723s
time makeGenomeDb.pl -verbose=2 -continue=db -stop=db monDom5.config.ra \
> db.log 2>&1
# real 36m30.041
time makeGenomeDb.pl -verbose=2 -continue=dbDb -stop=dbDb \
monDom5.config.ra > dbDb.log 2>&1
# real 0m1.074s
time makeGenomeDb.pl -verbose=2 -continue=trackDb -stop=trackDb \
monDom5.config.ra > trackDb.log 2>&1
# check in the trackDb files and the browser should be up and running
#########################################################################
# monDom5 - Opossum - Ensembl Genes version 50 (DONE - 2008-12-02 - hiram)
ssh hgwdev
cd /hive/data/genomes/monDom5
cat << '_EOF_' > monDom5.ensGene.ra
# required db variable
db monDom5
# optional nameTranslation, the sed command that will transform
# Ensemble names to UCSC names. With quotes just to make sure.
nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/"
'_EOF_'
# << happy emacs
doEnsGeneUpdate.pl -ensVersion=50 monDom5.ensGene.ra
ssh hgwdev
cd /hive/data/genomes/monDom5/bed/ensGene.50
featureBits monDom5 ensGene
# 32970520 bases of 3501660299 (0.942%) in intersection
*** All done! (through the 'makeDoc' step)
*** Steps were performed in /hive/data/genomes/monDom5/bed/ensGene.50
############################################################################
# monDom5 - Opossum - Ensembl Genes version 51 (DONE - 2008-12-02 - hiram)
ssh hgwdev
cd /hive/data/genomes/monDom5
cat << '_EOF_' > monDom5.ensGene.ra
# required db variable
db monDom5
# optional nameTranslation, the sed command that will transform
# Ensemble names to UCSC names. With quotes just to make sure.
nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/"
'_EOF_'
# << happy emacs
doEnsGeneUpdate.pl -ensVersion=51 monDom5.ensGene.ra
ssh hgwdev
cd /hive/data/genomes/monDom5/bed/ensGene.51
featureBits monDom5 ensGene
# 32959755 bases of 3501660299 (0.941%) in intersection
*** All done! (through the 'makeDoc' step)
*** Steps were performed in /hive/data/genomes/monDom5/bed/ensGene.51
#########################################################################
## REPEATMASKER (DONE 2008-11-26 Andy)
screen -S monDom5RepeatMasker # to manage this several day job
mkdir /hive/data/genomes/monDom5/bed/repeatMasker
cd /hive/data/genomes/monDom5/bed/repeatMasker
time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl -workhorse=hgwdev \
-bigClusterHub=swarm -buildDir=`pwd` monDom5 > do.log
## (detach screen Ctrl+A+D)
## ... (reattach on hgwdev: screen -r monDom5RepeatMasker)
#
#Checking finished jobs
#cat run.time
#HgStepManager: executing step 'cat' Wed Nov 26 07:29:04 2008.
#Use of uninitialized value in substitution (s///) at /cluster/home/aamp/kent/src/hg/utils/automation/HgAutomate.pm line 262.
#Could not extract server from output of "df /hive/data/genomes/monDom5/bed/repeatMasker":
#/dev/hivedev 171434397696 55883525376 115550872320 33% /hive
#...
#user 0m0.099s
#sys 0m0.070s
## checking the problem, it's that I don't have a $HOST in my env.
## ... so, tcsh it is.
tcsh
time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl -continue=cat \
-workhorse=hgwdev -bigClusterHub=swarm -buildDir=`pwd` \
monDom5 > doAfterCat.log
#0.389u 0.348s 1:12:48.52 0.0% 0+0k 0+0io 1pf+0w
#########################################################################
## SIMPLE REPEATS TRF (DONE 2008-11-26 - Andy)
screen
mkdir /hive/data/genomes/monDom5/bed/simpleRepeat
cd /hive/data/genomes/monDom5/bed/simpleRepeat
time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
-buildDir=/cluster/data/monDom5/bed/simpleRepeat monDom5 > do.log
#0.233u 0.153s 2:52:17.94 0.0% 0+0k 0+0io 0pf+0w
cat fb.simpleRepeat
# 83450133 bases of 3501660299 (2.383%) in intersection
## after RM run is done, add this mask:
cd /hive/data/genomes/monDom5
rm monDom5.2bit
twoBitMask monDom5.rmsk.2bit -add bed/simpleRepeat/trfMask.bed monDom5.2bit
## can safely ignore warning about >=13 fields in bed file
twoBitToFa monDom5.2bit stdout | faSize stdin > monDom5.2bit.faSize.txt
# 3605631728 bases (103971429 N's 3501660299 real 1542619938 upper 1959040361 lower)
# %54.33 masked total, %55.95 masked real
## link to gbdb
ln -s `pwd`/monDom5.2bit /gbdb/monDom5
###########################################################################
# WINDOWMASKER (DONE 2008-12-02 Andy)
ssh kolossus
mkdir /hive/data/genomes/monDom5/bed/WindowMasker
cd /hive/data/genomes/monDom5/bed/WindowMasker
screen -S monDom5_WindowMasker
tcsh
~/kent/src/hg/utils/automation/doWindowMasker.pl monDom5 -buildDir=`pwd` -workhorse=kolossus >& wm.log
## oops forgot to time it (or nice it). It took less than 8 hrs though
## load result to prep for cleaning
ssh hgwdev
cd /hive/data/genomes/monDom5/bed/WindowMasker
hgLoadBed monDom5 windowmaskerSdust windowmasker.sdust.bed.gz
# Loaded 1346187 elements of size 3
featureBits monDom5 windowmaskerSdust
# 1619987578 bases of 3501660299 (46.263%) in intersection
# eliminate the gaps from the masking (WM bug)
featureBits monDom5 -not gap -bed=notGap.bed
# 170473138 bases of 170473138 (100.000%) in intersection
screen -S monDom5_WindowMasker
time nice featureBits monDom5 windowmasker.sdust.bed.gz notGap.bed \
-bed=stdout | gzip -c > cleanWMask.bed.gz
#1516026449 bases of 3501660299 (43.295%) in intersection
#
#real 12m17.912s
#user 3m56.801s
#sys 0m13.704s
## reload track to get it clean
hgLoadBed monDom5 windowmaskerSdust cleanWMask.bed.gz
## Loaded 22241063 elements of size 4
##########################################################################
## CPG ISLANDS (DONE 2008-12-03, Andy)
ssh hgwdev
screen -S monDom5_CpG
~/kent/src/hg/utils/automation/doCpgIslands.pl monDom5
## [detach screen]
## took an hour or two
featureBits monDom5 cpgIslandExt
# 14553072 bases of 3501660299 (0.416%) in intersection
##########################################################################
## LIFTOVER CHAIN MONDOM4->MONDOM5 (DONE 2008-12-06, Andy)
ssh hgwdev
screen -S monDom5_liftOver
tcsh
~/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl monDom4 monDom5
## [detach]
## ... 8 hippos
ssh swarm
cd /hive/data/genomes/monDom4/bed/blat.monDom5.2008-12-04/run.blat
## edit job.csh and add -maxIntron=1 -mask=lower
## to blat command and up minScore to 5000 and minIdentity to 99
para stop
para push
## Took several hours after hippos were dealt with and -continue net
## had to be run as well after it crashed running out of tmp disk space
cd /hive/data/genomes/monDom4/bed/blat.monDom5.2008-12-04
rm -rf run.chain/ run.blat/
mv monDom4ToMonDom5.over.chain.gz ../liftOver/
cd /gbdb/monDom4/liftOver
#######################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE 2008-12-11, Andy)
ssh hgwdev
screen -S monDom5_11.ooc
cd /hive/data/genomes/monDom5
## use repMatch of 1228 as this genome is ~ %20 larger than human
## 1024 + (1024 * 0.2) = 1228
time nice blat monDom5.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1228
# Wrote 43992 overused 11-mers to 11.ooc
# real 3m24.793s
#######################################################################
# GENBANK AUTOUPDATE SETUP (DONE 2008-12-15 Andy)
ssh hgwdev
screen -S monDom5_genbank
cd ~/kent/src/hg/makeDb/genbank/conf/
## edit genbank.conf: search monDom4, copy and paste that whole entry
## and replace monDom4 with monDom5:
## cvs ci genbank.conf
make etc-update
ssh genbank
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial monDom5 &
# logFile: var/build/logs/2008.12.11-16:37:37.monDom5.initalign.log
#real 633m58.880s
##user 96m10.868s
#sys 35m42.207s
exit
## we're back on hgwdev
nice -n 19 ./bin/gbDbLoadStep -drop -initialLoad monDom5 &
#logFile: var/dbload/genbank/logs/2008.12.15-10:44:36.dbload.log
cd ~/kent/src/hg/makeDb/genbank
cvsup
## add monDom5 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added monDom5" etc/align.dbs etc/hgwdev.dbs
make etc-update
##########################################################################
## BLAT (DONE 2008-01-20 Andy)
# e-mail cluster-admin and ask for a server and tell them about
# /gbdb/monDom5/monDom5.2bit
# then
ssh hgwdev
hgsql -N hgcentraltest
mysql> insert into blatServers (db, host, port, isTrans, canPcr) values ('monDom5', 'blatx', '17778', '1', '0');
Query OK, 1 row affected (0.01 sec)
mysql> insert into blatServers (db, host, port, isTrans, canPcr) values ('monDom5', 'blatx', '17779', '0', '1');
Query OK, 1 row affected (0.00 sec)
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE 2009-02-05 braney )
# bash if not using bash shell already
cd /cluster/data/monDom5
grep chrUn monDom5.agp | /cluster/bin/scripts/agpToLift > jkStuff/Un.lft
mkdir /cluster/data/monDom5/blastDb
cat broad/Monodelphis5.0.agp | awk '/^chrUn/ {if ($5 == "W")
{
printf "echo \\>%s\n",$6 ;
printf "twoBitToFa monDom5.2bit:chrUn:%d-%d stdout | tail -n +2\n", $2,$3
}}' > script
sh script > chrUn.fa
grep -v chrUn chrom.sizes | awk '{ printf "twoBitToFa monDom5.2bit:%s stdout \n", $1}' > script
sh script > chrNotUn.fa
cat chrUn.fa chrNotUn.fa | faToTwoBit stdin monDom5.chrUnAsContigs.2bit
twoBitInfo monDom5.chrUnAsContigs.2bit monDom5.chrUnAsContigs.sizes
awk '{if ($2 > 1000000) print $1}' monDom5.chrUnAsContigs.sizes > 1meg.lst
twoBitToFa -seqList=1meg.lst monDom5.chrUnAsContigs.2bit temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
# 4366 pieces of 4366 written
rm temp.fa 1meg.lst
awk '{if ($2 <= 1000000) print $1}' monDom5.chrUnAsContigs.sizes > less1meg.lst
twoBitToFa -seqList=less1meg.lst monDom5.chrUnAsContigs.2bit temp.fa
faSplit about temp.fa 1000000 blastDb/y
ls blastDb/y* | wc -l
# 88
rm temp.fa less1meg.lst
cd blastDb
for i in *.fa
do
/hive/data/outside/blast229/formatdb -i $i -p F
done
rm *.fa &
ls *.nsq | wc -l
# 4454
mkdir -p /cluster/data/monDom5/bed/tblastn.hg18KG
cd /cluster/data/monDom5/bed/tblastn.hg18KG
echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 4454 query.lst
# we want around 1000000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(1000000/`wc query.lst | awk '{print $1}'`\)
# 36727/(1000000/4454) = 163.582058
mkdir -p kgfa
split -l 164 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/monDom5/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/monDom5/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
exit
ssh swarm
cd /cluster/data/monDom5/bed/tblastn.hg18KG
gensub2 query.lst kg.lst blastGsub blastSpec
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 997696 of 997696 jobs
# CPU time in finished jobs: 21259114s 354318.56m 5905.31h 246.05d 0.674 y
# IO & Wait Time: 3461687s 57694.79m 961.58h 40.07d 0.110 y
# Average job time: 25s 0.41m 0.01h 0.00d
# Longest finished job: 76s 1.27m 0.02h 0.00d
# Submission to last job: 31794s 529.90m 8.83h 0.37d
ssh swarm
cd /cluster/data/monDom5/bed/tblastn.hg18KG
cat << _EOF_ > splitOne
(cd \$1;
grep -h -v contig q.*.psl | pslSplitOnTarget stdin chroms
grep -h contig q.*.psl > chroms/contigs.psl
)
_EOF_
chmod +x splitOne
for i in `pwd`/blastOut/kg??; do
echo ./splitOne $i
done > split.jobs
para create split.jobs
para maxJob 10
para shove
# Completed: 224 of 224 jobs
# CPU time in finished jobs: 910s 15.17m 0.25h 0.01d 0.000 y
# IO & Wait Time: 128754s 2145.89m 35.76h 1.49d 0.004 y
# Average job time: 579s 9.65m 0.16h 0.01d
# Longest finished job: 683s 11.38m 0.19h 0.01d
# Submission to last job: 13166s 219.43m 3.66h 0.15d
ssh swarm
cd /cluster/data/monDom5/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(
f=`basename $1`;
d=`dirname $1`;
if simpleChain -prot -outPsl -maxGap=150000 $1 $d/c.$f.tmp
then
mv $d/c.$f.tmp $d/c.$f
else
rm $d/c.$f.tmp
fi
)
'_EOF_'
chmod +x chainOne
ls -1dS ../blastOut/kg??/chroms/*.psl > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
para create chainSpec
para try, check, push, check etc.
# Completed: 2270 of 2270 jobs
# CPU time in finished jobs: 10821559s 180359.32m 3005.99h 125.25d 0.343 y
# IO & Wait Time: 249403s 4156.72m 69.28h 2.89d 0.008 y
# Average job time: 4877s 81.28m 1.35h 0.06d
# Longest finished job: 205913s 3431.88m 57.20h 2.38d
# Submission to last job: 205932s 3432.20m 57.20h 2.38d
cd /cluster/data/monDom5/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat $i/chroms/c.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../unLift.psl
cd ..
pslCheck unLift.psl
# checked: 71490 failed: 0 errors: 0
liftUp blastHg18KG.psl ../../jkStuff/Un.lft carry unLift.psl
pslCheck -prot blastHg18KG.psl
# checked: 71490 failed: 0 errors: 0
# load table
ssh hgwdev
cd /cluster/data/monDom5/bed/tblastn.hg18KG
hgLoadPsl monDom5 blastHg18KG.psl
# check coverage
featureBits monDom5 blastHg18KG
# 33631308 bases of 3501660299 (0.960%) in intersection
featureBits monDom4 blastHg18KG
# 19424941 bases of 3501643220 (0.555%) in intersection
featureBits monDom5 all_mrna blastHg18KG -enrichment
# all_mrna 0.006%, blastHg18KG 0.960%, both 0.003%, cover 43.79%, enrich 45.59x
rm -rf blastOut
#end tblastn
#########################################################################
## NSCAN LIFT (FROM MONDOM4) (DONE 2008-12-08 Andy)
ssh hgwdev
mkdir /hive/data/genomes/monDom5/bed/nscanGene.lifted
cd /hive/data/genomes/monDom5/bed/nscanGene.lifted
echo "select * from nscanGene" | hgsql monDom4 | sed '1d' > nscanGene.monDom4.gp
liftOver nscanGene.monDom4.gp /gbdb/monDom4/liftOver/monDom4ToMonDom5.over.chain.gz \
| nscanGene.monDom5.gp unmapped.gp
ldHgGene -predTab monDom5 nscanGene nscanGene.monDom5.gp
##########################################################################
## GENSCAN (DONE 2008-01-22 Andy)
ssh hgwdev
mkdir -p /hive/data/genomes/monDom5/bed/genscan/{fasta,gtf,pep,subopt}
cd /hive/data/genomes/monDom5/bed/genscan
twoBitToFa ../../monDom5.2bit stdout | maskOutFa stdin hard stdout \
| faSplit -maxN=5000000 -lift=chunks.lft gap stdin 8000000 fasta/chunk
cvs co hg3rdParty/genscanlinux
cat << '_EOF_' > gsub
#LOOP
gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << emacs
ls -1 fasta/* > chunk.lst
ssh swarm
cd /hive/data/genomes/monDom5/bed/genscan
gensub2 chunk.lst single gsub spec
para create spec
para push
para time
#Completed: 421 of 421 jobs
#CPU time in finished jobs: 60788s 1013.13m 16.89h 0.70d 0.002 y
#IO & Wait Time: 5097s 84.96m 1.42h 0.06d 0.000 y
#Average job time: 156s 2.61m 0.04h 0.00d
#Longest finished job: 367s 6.12m 0.10h 0.00d
#Submission to last job: 378s 6.30m 0.10h 0.00d
catDir subopt | liftUp my.bed chunks.lft error stdin
sed 's/chunk/opossum/' my.bed > genscan.subopt.bed
catDir gtf | liftUp my.gtf chunks.lft error stdin
sed 's/chunk/opossum/g' my.gtf > genscan.gtf
catDir pep | sed 's/chunk/opossum/' > genscan.pep.fa
ldHgGene -gtf monDom5 genscan genscan.gtf
hgPepPred monDom5 generic genscanPep genscan.pep.fa
hgLoadBed monDom5 genscanSubopt genscan.subopt.bed
+
+#########################################################################
+# BIGZIPS/DOWNLOADS (DONE, 2009-06-08 Andy)
+ ssh hgwdev
+ mkdir /hive/data/genomes/monDom5/downloads
+ mkdir /hive/data/genomes/monDom5/downloads/bigZips
+ mkdir /hive/data/genomes/monDom5/downloads/chromosomes
+ cd /hive/data/genomes/monDom5/downloads/chromosomes
+ cp /usr/local/apache/htdocs/goldenPath/monDom4/chromosomes/README.txt .
+ # edit that readme to provide correct references and details
+ ln -s `pwd` /usr/local/apache/htdocs/goldenPath/monDom5/chromosomes
+ cd /hive/data/genomes/monDom5/downloads/chromosomes
+ for c in `cut -f1 ../../chrom.sizes`; do
+ echo $c;
+ twoBitToFa ../../monDom5.unmasked.2bit:$c stdout \
+ | gzip -c > $c.fa.gz;
+ done
+ md5sum *.fa.gz R*.txt > md5sum.txt
+ cd ../../
+ for c in `cut -f1 chrom.sizes`; do
+ echo ${c#chr};
+ pushd ${c#chr};
+ cp ../downloads/chromosomes/${c}.fa.gz .
+ gunzip ${c}.fa.gz;
+ popd;
+ done
+ tar cvzf downloads/bigZips/chromFa.tar.gz ?/chr*.fa Un/chrUn.fa
+ tar cvzf downloads/bigZips/chromOut.tar.gz ?/chr*.fa.out Un/chrUn.fa.out
+ for c in `cut -f1 chrom.sizes`; do echo $c; twoBitToFa monDom5.2bit:$c ${c#chr}/${c}.fa.masked; done
+ tar cvzf downloads/bigZips/chromFaMasked.tar.gz ?/chr*.fa.masked \
+ Un/chrUn.fa.masked
+ #real 17m19.439s
+ cd bed/simpleRepeat/
+ mkdir trfMask
+ cd trfMask
+ ln -s ../trfMaskChrom/* .
+ cd ../
+ tar -hcvzf chromTrf.tar.gz ./trfMask
+ mv chromTrf.tar.gz ../../downloads/bigZips/
+ cd ../../downloads/bigZips/
+ cp ../../broad/Monodelphis5.0.agp .
+ gzip Monodelphis5.0.agp
+ cp /usr/local/apache/htdocs/goldenPath/monDom4/bigZips/README.txt .
+ # [edit]
+ ln -s /hive/data/genomes/monDom5/downloads/bigZips \
+ /usr/local/apache/htdocs/goldenPath/monDom5/bigZips
+ cd /usr/local/apache/htdocs/goldenPath/monDom5/bigZips
+ ln -s /hive/data/genomes/monDom5/monDom5.2bit .
+ md5sum *.gz *.2bit README.txt > md5sum.txt
+ cd ../chromosomes/
+ md5sum *.gz R*.txt > md5sum.txt