src/hg/makeDb/doc/anoCar1.txt 1.21
1.21 2009/07/21 21:01:40 markd
added transmap update blurb
Index: src/hg/makeDb/doc/anoCar1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/anoCar1.txt,v
retrieving revision 1.20
retrieving revision 1.21
diff -b -B -U 1000000 -r1.20 -r1.21
--- src/hg/makeDb/doc/anoCar1.txt 1 Jul 2008 16:52:59 -0000 1.20
+++ src/hg/makeDb/doc/anoCar1.txt 21 Jul 2009 21:01:40 -0000 1.21
@@ -1,853 +1,862 @@
# for emacs: -*- mode: sh; -*-
# Lizard - Anolis carolinensis - Broad Institute 1.0
#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2007-02-16 - Hiram)
ssh kkstore05
mkdir /cluster/store12/anoCar1
ln -s /cluster/store12/anoCar1 /cluster/data/anoCar1
mkdir /cluster/data/anoCar1/downloads
cd /cluster/data/anoCar1/downloads
foreach f (assembly.agp \
BasicAssemblyOneLiner.out \
ForDistribution.command \
assembly.bases.gz \
assembly.links \
assembly.quals.gz \
source)
wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/reptiles/lizard/AnoCar1.0/$f
end
faSize assembly.bases.gz
# 1741478929 bases (0 N's 1741478929 real 1741478929 upper 0 lower)
# in 50470 sequences in 1 files
# Discovered later that this quals file needs to be lifted
qaToQac assembly.quals.gz stdout \
| qacAgpLift assembly.agp stdin scaffold.lifted.qac
## Calculate N50
# Find total length in sequence
sort -k2nr chrom.sizes | awk '{sum+=$2;print NR,sum,$2,$1}' | tail -1
# 7233 1781602899 340 scaffold_7232
# half of 1781602899 is 890801449
# run this again, scanning the list until the sum reaches 890801449
sort -k2nr chrom.sizes | awk '{sum+=$2;print NR,sum,$2,$1}' | less
# 204 889215106 2440512 scaffold_201
# 205 891654231 2439125 scaffold_205
#########################################################################
## Create .ra file and run makeGenomeDb.pl
ssh hgwdev
cd /cluster/data/anoCar1
cat << '_EOF_' >anoCar1.config.ra
# Config parameters for makeGenomeDb.pl:
db anoCar1
clade vertebrate
genomeCladePriority 66
scientificName Anolis carolinensis
commonName Lizard
assemblyDate Jan. 2007
assemblyLabel Broad Institute AnoCar (1.0)
orderKey 440
mitoAcc none
fastaFiles /cluster/data/anoCar1/downloads/assembly.bases.gz
agpFiles /cluster/data/anoCar1/downloads/assembly.agp
qualFiles /cluster/data/anoCar1/downloads/scaffold.lifted.qac
dbDbSpeciesDir lizard
'_EOF_'
time makeGenomeDb.pl -verbose=2 anoCar1.config.ra > makeGenomeDb.out 2>&1
# broken down during the quals step since assembly.quals.gz needed
# to be lifted. do the qaToQac | qacAgpLift sequence, fixup the
# specification above for qualFiles, and finish off the quals loading:
ssh kkstore05
cd /cluster/data/anoCar1/bed/qual
qacToWig -fixed ../../downloads/scaffold.lifted.qac stdout \
| wigEncode stdin qual.wig qual.wib
ssh hgwdev
cd /cluster/data/anoCar1/bed/qual
time nice -n +19 hgLoadWiggle \
-pathPrefix=/gbdb/anoCar1/wib anoCar1 quality qual.wig
## continue
ssh hgwdev
cd /cluster/data/anoCar1
time makeGenomeDb.pl -verbose=2 -continue=dbDb anoCar1.config.ra \
> makeDbDb.out 2>&1
## better orderKey to get Lizard between frog and fish
hgsql -e 'update dbDb set orderKey="440" where name="anoCar1";' \
hgcentraltest
## fixup that number in the .ra file as mentioned above, was 375
##########################################################################
## Photograph - permission to use obtained from R. Steven Rainwater
## (DONE - 2007-02-16 - Hiram)
## Fetch photo from:
wget --timestamping \
"http://rainwaterreptileranch.org/steve/photos/herps/anole2.jpeg" \
-O R.Steven.Rainwater.Anole.Lizard.jpg
convert -quality 80 -sharpen 0 -crop "168x272+264+37" \
R.Steven.Rainwater.Anole.Lizard.jpg Anolis_carolinensis.jpg
################################################
## WINDOWMASKER (Working - 2007-02-16 - Hiram)
ssh kkstore05
cd /cluster/data/anoCar1/bed/
time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl anoCar1 \
-workhorse=kolossus > wmRun.log 2>&1 &
# real 172m58.813s
# Save the log
mv wmRun.log WindowMasker.2007-02-16
# Masking statistics
cd WindowMasker.2007-02-18
twoBitToFa anoCar1.wmsk.2bit stdout | faSize stdin
# 1781602899 bases (40123970 N's 1741478929 real
# 1009685313 upper 731793616 lower) in 7233 sequences in 1 files
twoBitToFa anoCar1.wmsk.sdust.2bit stdout | faSize stdin
# 1781602899 bases (40123970 N's 1741478929 real
# 1000477327 upper 741001602 lower) in 7233 sequences in 1 files
ssh hgwdev
hgLoadBed -strict anoCar1 windowmaskerSdust windowmasker.sdust.bed.gz
# Loaded 8354004 elements of size 3
# why does featureBits show more bases masked than what faSize
# measured as lower case ? Because this counts the masked sequence
# in the gaps, and the faSize doesn't have the gaps.
time nice -n +19 featureBits anoCar1 windowmaskerSdust
# 781125572 bases of 1741478929 (44.854%) in intersection
time nice -n +19 featureBits -countGaps anoCar1 windowmaskerSdust
# 781125572 bases of 1781602899 (43.844%) in intersection
# Curiously, WM overlaps gaps ?
time nice -n +19 featureBits -countGaps anoCar1 windowmaskerSdust gap
# 40123970 bases of 1781602899 (2.252%) in intersection
# 741001602 + 40123970 == 781125572
#########################################################################
# SIMPLE REPEATS (TRF) (DONE 2007-02-16 - Hiram)
# (dropped chromEnd index 2007-05-10 - kuhn)
ssh kolossus
mkdir /cluster/data/anoCar1/bed/simpleRepeat
cd /cluster/data/anoCar1/bed/simpleRepeat
time nice -n 19 twoBitToFa ../../anoCar1.unmasked.2bit stdout \
| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
-bedAt=simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 &
# real 164m34.988s
# user 159m32.859s
# sys 4m31.649s
ssh kkstore05
cd /cluster/data/anoCar1/bed/simpleRepeat
# Make a filtered version for sequence masking:
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
splitFileByColumn trfMask.bed trfMaskChrom
# Load unfiltered repeats into the database:
ssh hgwdev
nice -n +19 hgLoadBed anoCar1 simpleRepeat \
/cluster/data/anoCar1/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 569322 elements of size 16
nice -n +19 featureBits anoCar1 simpleRepeat
# 50290171 bases of 1741478929 (2.888%) in intersection
#########################################################################
## Add TRF mask to WindowMasker masked sequence, and fixup the bogus
## window masked N's (DONE - 2007-02-17 - Hiram)
ssh kkstore05
cd /cluster/data/anoCar1/bed/WindowMasker.2007-02-16
## Curious, twoBitMask would not accept stdout for its output 2bit
twoBitMask anoCar1.wmsk.sdust.2bit \
-add ../simpleRepeat/trfMask.bed tmp.2bit
twoBitToFa tmp.2bit stdout \
| sed -e "s/n/N/g" | faToTwoBit stdin anoCar1.2bit
## check it:
twoBitToFa anoCar1.2bit stdout | faSize stdin
# 1781602899 bases (40123970 N's 1741478929 real
# 1000242640 upper 741236289 lower) in 7233 sequences in 1 files
## trfMask contributes:
awk '{sum+=$3-$2} END{print sum}' ../simpleRepeat/trfMask.bed
# 16534332
## and we measured wmsk.sdust before: 741001602
16534332 + 741001602
#########################################################################
## BLASTZ Hg18 swap (DONE - 2007-02-18 - Hiram)
## the original blastz to hg18 measured
time nice -n +19 featureBits hg18 chainAnoCar1Link \
> fb.hg18.chainAnoCar1Link.txt 2>&1
# real 2m28.318s
# 137554843 bases of 2881515245 (4.774%) in intersection
ssh kkstore05
mkdir /cluster/data/anoCar1/bed/blastz.hg18.swap
cd /cluster/data/anoCar1/bed/blastz.hg18.swap
time doBlastzChainNet.pl \
/cluster/data/hg18/bed/blastz.anoCar1.2007-02-17/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=2 -bigClusterHub=pk -swap > swap.log 2>&1 &
time nice -n +19 featureBits anoCar1 chainHg18Link \
> fb.anoCar1.chainHg18Link.txt 2>&1
# real 3m16.810s
# 112434396 bases of 1741478929 (6.456%) in intersection
#########################################################################
# GENSCAN PREDICTIONS (DONE - 2006-05-03 - 2006-05-05 - Hiram)
ssh kkstore05
# Create a 2bit file all hard masked
cd /cluster/data/anoCar1
time nice -n +10 twoBitToFa anoCar1.2bit stdout \
| maskOutFa stdin hard stdout \
| faToTwoBit stdin anoCar1.hard.2bit
# make sure it still has all the unmasked sequence in it:
nice -n +19 twoBitToFa anoCar1.hard.2bit stdout | faSize stdin
# 1781602899 bases (781360259 N's 1000242640 real
# 1000242640 upper 0 lower) in 7233 sequences in 1 files
nice -n +19 twoBitToFa anoCar1.2bit stdout | faSize stdin
# 1781602899 bases (40123970 N's 1741478929 real
# 1000242640 upper 741236289 lower) in 7233 sequences in 1 files
# same number of total bases, the lowers have become Ns:
# 781360259 == 741236289 + 40123970
# the lower "reals" disappear from the "real" count:
# 1000242640 == 1741478929 - 741236289
# And, make sure there aren't any sequences in this lot that have
# become all N's with no sequence left in them:
twoBitToFa anoCar1.hard.2bit stdout \
| faCount stdin > anoCar1.hard.faCount
# 181 scaffolds end up with less than 100 bases of sequence left
egrep -v "^#|^total" anoCar1.hard.faCount | awk '{print $1,$2-$7}' \
| sort -k2nr | awk '{if ($2 < 100) { print }}' | wc -l
# 181
# leaving 7.052 scaffolds with more than 100 bases of seqence left:
egrep -v "^#|^total" anoCar1.hard.faCount | awk '{print $1,$2-$7}' \
| sort -k2nr | awk '{if ($2 >= 100) { print }}' | wc -l
# 7052
# make a list of those to extract their sequence:
egrep -v "^#|^total" anoCar1.hard.faCount | awk '{print $1,$2-$7}' \
| sort -k2nr | awk '{if ($2 >= 100) { print $1 }}' \
| sort > hard.genscan.list
twoBitToFa -seqList=hard.genscan.list anoCar1.hard.2bit genscan.hard.fa
# What do we have left to work with:
faSize genscan.hard.fa
# 1780575355 bases (780338843 N's 1000236512 real
# 1000236512 upper 0 lower) in 7052 sequences in 1 files
# creating 4,000,000 sized chunks, the largest scaffolds remain
# in single pieces, the scaffolds smaller than 4,000,000 are grouped
# into 4,000,000 sized fasta files. You don't want to break these
# things up because genscan will be doing its own internal 2.4 million
# window on these pieces, and the gene names are going to be
# constructed from the sequence name in these fasta files. The
# gene names are much better when they are this simple scaffoldN.M
# numbering scheme.
mkdir genscanSplit
faSplit about genscan.hard.fa 4000000 genscanSplit/c_
ssh hgwdev
mkdir /cluster/data/anoCar1/bed/genscan
cd /cluster/data/anoCar1/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh kki
cd /cluster/data/anoCar1/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
# Since we split on gaps, we have no chunks like that. You can
# verify with faCount on the chunks.
ls -1S /cluster/data/anoCar1/genscanSplit/c_*.fa > chunk.list
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 chunk.list single template jobList
para create jobList
para try, check, push, check, ...
# Completed: 319 of 319 jobs
# CPU time in finished jobs: 39037s 650.62m 10.84h 0.45d 0.001 y
# IO & Wait Time: 1015s 16.91m 0.28h 0.01d 0.000 y
# Average job time: 126s 2.09m 0.03h 0.00d
# Longest finished job: 459s 7.65m 0.13h 0.01d
# Submission to last job: 3495s 58.25m 0.97h 0.04d
# cat results into single files
ssh kkstore05
cd /cluster/data/anoCar1/bed/genscan
cat gtf/c_*.gtf > genscan.gtf
cat subopt/c_*.bed > genscanSubopt.bed
cat pep/c_*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/anoCar1/bed/genscan
ldHgGene anoCar1 -gtf genscan genscan.gtf
# Read 28102 transcripts in 184045 lines in 1 files
# 28102 groups 4190 seqs 1 sources 1 feature types
# 28102 gene predictions
hgPepPred anoCar1 generic genscanPep genscan.pep
hgLoadBed -strict anoCar1 genscanSubopt genscanSubopt.bed
# Loaded 286327 elements of size 6
# check the numbers
time nice -n +19 featureBits anoCar1 genscan
# 31394034 bases of 1741478929 (1.803%) in intersection
#########################################################################
## BLASTZ GasAcu1/Stickleback swap (DONE - 2007-02-19 - Hiram)
## the original blastz to gasAcu1 measured
time nice -n +19 featureBits gasAcu1 chainAnoCar1Link \
> fb.gasAcu1.chainAnoCar1Link.txt 2>&1
# real 0m51.499s
# 56386298 bases of 446627861 (12.625%) in intersection
ssh kkstore05
mkdir /cluster/data/anoCar1/bed/blastz.gasAcu1.swap
cd /cluster/data/anoCar1/bed/blastz.gasAcu1.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-19/DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap > swap.log 2>&1 &
time nice -n +19 featureBits anoCar1 chainGasAcu1Link \
> fb.anoCar1.chainGasAcu1Link.txt 2>&1
# real 1m14.245s
# 54464074 bases of 1741478929 (3.127%) in intersection
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-02-19)
ssh kkstore05
bash # if not using bash shell already
mkdir /cluster/data/anoCar1/blastDb
cd /cluster/data/anoCar1
twoBitToFa anoCar1.unmasked.2bit temp.fa
faSplit sequence temp.fa 500 blastDb/
rm temp.fa
cd blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
mkdir -p /san/sanvol1/scratch/anoCar1/blastDb
cd /cluster/data/anoCar1/blastDb
for i in nhr nin nsq;
do
echo $i
cp *.$i /san/sanvol1/scratch/anoCar1/blastDb
done
mkdir -p /cluster/data/anoCar1/bed/tblastn.hg18KG
cd /cluster/data/anoCar1/bed/tblastn.hg18KG
echo /san/sanvol1/scratch/anoCar1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 496 query.lst
# we want around 200000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(200000/496) = 91.082960
mkdir -p /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/kgfa
split -l 90 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/anoCar1/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
# back to bash
exit
ssh pk
cd /cluster/data/anoCar1/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 202864 of 202864 jobs
# CPU time in finished jobs: 14747966s 245799.43m 4096.66h 170.69d 0.468 y
# IO & Wait Time: 1561940s 26032.34m 433.87h 18.08d 0.050 y
# Average job time: 80s 1.34m 0.02h 0.00d
# Longest finished job: 722s 12.03m 0.20h 0.01d
# Submission to last job: 93277s 1554.62m 25.91h 1.08d
ssh kkstore05
cd /cluster/data/anoCar1/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
exit
chmod +x chainOne
ls -1dS /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh kk
cd /cluster/data/anoCar1/bed/tblastn.hg18KG/chainRun
para create chainSpec
para maxNode 30
para try, check, push, check etc.
# Completed: 409 of 409 jobs
# CPU time in finished jobs: 2911s 48.52m 0.81h 0.03d 0.000 y
# IO & Wait Time: 65231s 1087.18m 18.12h 0.75d 0.002 y
# Average job time: 167s 2.78m 0.05h 0.00d
# Longest finished job: 287s 4.78m 0.08h 0.00d
# Submission to last job: 2318s 38.63m 0.64h 0.03d
ssh kkstore05
cd /cluster/data/anoCar1/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/anoCar1/bed/tblastn.hg18KG/blastHg18KG.psl
cd ..
pslCheck blastHg18KG.psl
# load table
ssh hgwdev
cd /cluster/data/anoCar1/bed/tblastn.hg18KG
hgLoadPsl anoCar1 blastHg18KG.psl
# check coverage
featureBits anoCar1 blastHg18KG
# 21571582 bases of 1741478929 (1.239%) in intersection
ssh kkstore05
rm -rf /cluster/data/anoCar1/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut
#end tblastn
#########################################################################
## BLASTZ galGal3/Chicken swap (DONE - 2007-02-19 - Hiram)
## the original blastz to galGal3 measured
time nice -n +19 featureBits galGal3 chainAnoCar1Link \
> fb.galGal3.chainAnoCar1Link.txt 2>&1
# real 0m43.752s
# 106743952 bases of 1042591351 (10.238%) in intersection
ssh kkstore05
mkdir /cluster/data/anoCar1/bed/blastz.galGal3.swap
cd /cluster/data/anoCar1/bed/blastz.galGal3.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18/DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap > swap.log 2>&1 &
ssh hgwdev
cd /cluster/data/anoCar1/bed/blastz.galGal3.swap
time nice -n +19 featureBits anoCar1 chainGalGal3Link \
> fb.anoCar1.chainGalGal3Link.txt 2>&1
# real 1m8.359s
# 109074507 bases of 1741478929 (6.263%) in intersection
#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2007-02-19 - Hiram)
# This will find repeats within the genome that should not be matched
# against. Uses 11-mers.
# Use -repMatch=620 (based on size -- for human we use 1024, and
# lizard size is ~60.4% of human judging by gapless anoCar1 vs. hg18
# genome sizes from featureBits.
# hg18 / anoCar1 non-gap bases
# 2881515245 / 1741478929 = 1.654636
# anoCar1 / hg18 non-gap bases
# 1741478929 / 2881515245 = 0.604362
# thus 1024 * 0.604362 ~= 620
ssh kkstore05
blat /cluster/data/anoCar1/anoCar1.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/data/anoCar1/11.ooc -repMatch=620
# Wrote 32070 overused 11-mers to /cluster/data/anoCar1/11.ooc
cp -p /cluster/data/anoCar1/11.ooc /san/sanvol1/scratch/anoCar1
cp -p /cluster/data/anoCar1/jkStuff/liftAll.lft \
/san/sanvol1/scratch/anoCar1
#########################################################################
# GENBANK AUTO UPDATE (DONE - 2007-02-20 - Hiram)
# Make a liftAll.lft that specifies 5M chunks for genbank:
# only a few of the largest scaffolds will be broken up, most of them not
ssh kkstore05
cd /cluster/data/anoCar1
simplePartition.pl anoCar1.2bit 5000000 /tmp/anoCar1
find /tmp/anoCar1 -type f | grep lft | xargs cat > jkStuff/liftAll.lft
rm -r /tmp/anoCar1
cp -p jkStuff/liftAll.lft /san/sanvol1/scratch/anoCar1
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add anoCar1 just after xenTro2
# anoCar1
anoCar1.serverGenome = /cluster/data/anoCar1/anoCar1.2bit
anoCar1.clusterGenome = /san/sanvol1/scratch/anoCar1/anoCar1.2bit
anoCar1.ooc = /san/sanvol1/scratch/anoCar1/11.ooc
anoCar1.lift = /san/sanvol1/scratch/anoCar1/liftAll.lft
anoCar1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
anoCar1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
anoCar1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
anoCar1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
anoCar1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
anoCar1.refseq.mrna.native.load = yes
anoCar1.genbank.est.native.load = no
anoCar1.refseq.mrna.xeno.load = yes
anoCar1.genbank.mrna.xeno.load = yes
anoCar1.downloadDir = anoCar1
anoCar1.perChromTables = no
cvs ci -m "Added anoCar1." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
# Edit src/lib/gbGenome.c to add new species.
#
cvs ci -m "Added Anolis carolinensis (lizard)." src/lib/gbGenome.c
make install-server
cd /cluster/data/genbank
screen
# This is a call to a script that will push our jobs out to the cluster
# since it's a big job.
nice -n +19 bin/gbAlignStep -initial anoCar1 &
# logFile: var/build/logs/2007.02.19-22:10:25.anoCar1.initalign.log
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad anoCar1
# real 8m9.108s
# enable daily alignment and update of hgwdev (DONE - 2007-02-20 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add anoCar1 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added anoCar1." etc/align.dbs etc/hgwdev.dbs
make etc-update
### (2007-05-16 markd)
# modify genbank to not load native RefSeq, since there are none.
# remove empty files and rerun gbDbLoadStep
anoCar1.refseq.mrna.native.load = no
nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad anoCar1
##########################################################################
## BLASTZ FROG xenTro2 (WORKING - 2007-02-20 - Hiram)
ssh kkstore04
mkdir /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20
cd /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20
cat << '_EOF_' > DEF
# Lizard vs frog
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Lizard AnoCar1 - largest chunk big enough for largest scaffold
SEQ1_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
SEQ1_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=30
# TARGET: Frog xenTro2 - single chunk big enough for the largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/xenTro2/xenTro2.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
SEQ2_CHUNK=8000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-blastzOutRoot /cluster/bluearc/anoCar1XenTro2 > do.log 2>&1 &
## real 1522m46.550s
## this broke down during chaining because this file:
# -rw-rw-r-- 1 5982855 Feb 21 14:39 ../../pslParts/part039.lst.psl.g
## has a bogus character control-V in the middle on a number
## manually running through that one chain step with that line removed
## got the chaining completed, then continuing:
time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=chainMerge \
-blastzOutRoot /cluster/bluearc/anoCar1XenTro2 > chainMerge.log 2>&1 &
# real 119m12.634s
ssh hgwdev
cd /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20]
time nice -n +19 featureBits anoCar1 chainXenTro2Link \
> fb.anoCar1.chainXenTro2Link.txt 2>&1
# real 11m33.086s
# 83873500 bases of 1741478929 (4.816%) in intersection
ssh kkstore04
mkdir /cluster/data/xenTro2/bed/blastz.anoCar1.swap
cd /cluster/data/xenTro2/bed/blastz.anoCar1.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20/DEF \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap > swap.log 2>&1 &
############################################################################
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("anoCar1", "blat11", "17780", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("anoCar1", "blat11", "17781", "0", "1");' \
hgcentraltest
# test it with some sequence
#########################################################################
## BLASTZ mm8/Mouse swap (DONE - 2007-02-20 - Hiram)
## the original blastz to mm8 measured
time nice -n +19 featureBits mm8 chainAnoCar1Link \
> fb.mm8.chainAnoCar1Link.txt 2>&1
# real 1m37.380s
# 106743952 bases of 1042591351 (10.238%) in intersection
ssh kkstore04
mkdir /cluster/data/anoCar1/bed/blastz.mm8.swap
cd /cluster/data/anoCar1/bed/blastz.mm8.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm8/bed/blastz.anoCar1.2007-02-19/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk \
-swap > swap.log 2>&1 &
ssh hgwdev
cd /cluster/data/anoCar1/bed/blastz.mm8.swap
time nice -n +19 featureBits anoCar1 chainMm8Link \
> fb.anoCar1.chainMm8Link.txt 2>&1
# real 2m1.527s
# 82784787 bases of 1741478929 (4.754%) in intersection
# % Coverage of Lizard by: (chainMinScore,chainLinearGap,type masking)
# 6.456 - Human hg18 (5000,loose,windowMasker)
# 6.263 - Chicken galGal3 (5000,loose,windowMasker)
# 4.816 - Frog xenTro2 (5000,loose,windowMasker)
# 4.754 - Mouse mm8 (5000,loose,windowMasker)
# 3.127 - Stickleback gasAcu1 (5000,loose,windowMasker)
# % coverage of Chicken by:
# 10.238 - Lizard anoCar1 (5000,loose,windowMasker)
# 8.795 - Human hg18 (5000,loose,rmsk)
# 6.745 - Mouse mm8 (5000,loose,rmsk)
# 5.330 - Frog xenTro2 (5000,loose,rmsk)
# 3.144 - Stickleback gasAcu1 (2000,loose,windowMasker)
# % coverage of Frog by:
# 6.217 - Lizard anoCar1 (5000,loose,windowMasker)
# 5.634 - Human hg18 (5000,loose,rmsk)
# 5.358 - Mouse mm8 (5000,loose,rmsk)
# 4.776 - Chicken galGal3 (5000,loose,rmsk)
# x.xxx - Stickleback gasAcu1 (not yet done)
# % coverage of Human by:
# 34.514 - Mouse mm8 (3000,medium,rmsk)
# 4.774 - Lizard anoCar1 (5000,loose,windowMasker)
# 3.589 - Chicken galGal3 (5000,loose,rmsk)
# 2.623 - Frog xenTro2 (5000,loose,rmsk)
# 1.923 - Stickleback gasAcu1 (2000,loose,rmsk)
##########################################################################
## RepeatMasker run to cover all bases (DONE - 2007-03-07 - Hiram)
ssh kkstore02
mkdir /cluster/data/anoCar1/bed/RepeatMasker
cd /cluster/data/anoCar1/bed/RepeatMasker
time nice -n +19 doRepeatMasker.pl -verbose=2 -bigClusterHub=kk \
-buildDir=/cluster/data/anoCar1/bed/RepeatMasker anoCar1 > do.log 2>&1 &
############################################################################
## DOWNLOADS - (DONE - 2007-02-12 - 2007-02-16 - Hiram)
ssh hgwdev
cd /cluster/data/anoCar1
ln -s bed/RepeatMasker/anoCar1.fa.out .
~/kent/src/hg/utils/automation/makeDownloads.pl anoCar1 \
> makeDownloads.out 2>&1
# Doesn't work due to missing Repeat masker outputs
# Create WindowMasker separate files by chrom, for downloads
ssh kkstore05
cd /cluster/data/anoCar1/goldenPath/bigZips
ln -s ../../bed/WindowMasker.2007-02-16/windowmasker.sdust.bed.gz \
./anoCar1.WMSdust.bed.gz
# get GenBank native mRNAs
ssh hgwdev
cd /cluster/data/genbank
./bin/x86_64/gbGetSeqs -db=anoCar1 -native \
GenBank mrna /cluster/data/anoCar1/goldenPath/bigZips/mrna.fa
# get GenBank xeno mRNAs
./bin/x86_64/gbGetSeqs -db=anoCar1 -xeno \
GenBank mrna /cluster/data/anoCar1/goldenPath/bigZips/xenoMrna.fa
ssh kkstore05
cd /cluster/data/anoCar1/goldenPath/bigZips
gzip *.fa
md5sum *.gz > md5sum.txt
# Edit the README.txt file to be correct
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/anoCar1/bigZips
ln -s /cluster/data/anoCar1/goldenPath/bigZips/* .
############################################################################
## Default position set at IFG-1 (DONE - 2007-04-09 - Hiram)
ssh hgwdev
hgsql -e 'update dbDb set defaultPos="scaffold_72:3056494-3141055"
where name="anoCar1";' hgcentraltest
############################################################################
# SWAP ORNANA1 CHAIN/NET (DONE 5/2/07 angie)
ssh kkstore05
mkdir /cluster/data/anoCar1/bed/blastz.ornAna1.swap
cd /cluster/data/anoCar1/bed/blastz.ornAna1.swap
doBlastzChainNet.pl -swap \
/cluster/data/ornAna1/bed/blastz.anoCar1/DEF >& do.log &
tail -f do.log
ln -s blastz.ornAna1.swap /cluster/data/anoCar1/bed/blastz.ornAna1
############################################################################
# SWAP Mouse Mm9 chain/net (DONE - 2007-09-21 - hiram)
ssh kkstore04
screen # control this sequence with screen
# the original
cd /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19
cat fb.mm9.chainAnoCar1Link.txt
# 89239796 bases of 2620346127 (3.406%) in intersection
# and for the swap
mkdir /cluster/data/anoCar1/bed/blastz.mm9.swap
cd /cluster/data/anoCar1/bed/blastz.mm9.swap
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
/cluster/data/mm9/bed/blastzAnoCar1.2007-09-19/DEF -chainMinScore=5000 \
-swap -qRepeats=windowmaskerSdust \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
# real 29m12.291s
cat fb.anoCar1.chainMm9Link.txt
# 85923556 bases of 1741478929 (4.934%) in intersection
#########################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
+############################################################################
+# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
+
+vertebrate-wide transMap alignments were built Tracks are created and loaded
+by a single Makefile. This is available from:
+ svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
+
+see doc/builds.txt for specific details.
+############################################################################