src/hg/makeDb/doc/canFam2.txt 1.22
1.22 2009/08/10 16:38:13 hartera
Documented production of zebrafish danRer5 chains, nets and downloads.
Index: src/hg/makeDb/doc/canFam2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/canFam2.txt,v
retrieving revision 1.21
retrieving revision 1.22
diff -b -B -U 1000000 -r1.21 -r1.22
--- src/hg/makeDb/doc/canFam2.txt 21 Jul 2009 21:01:41 -0000 1.21
+++ src/hg/makeDb/doc/canFam2.txt 10 Aug 2009 16:38:13 -0000 1.22
@@ -1,2912 +1,2956 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on the
# Dog (Canis familiaris) May 2005 release.
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+---------------------------------+
# | tableName | type |
# +-------------+---------------------------------+
# | refGene | genePred refPep refMrna |
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | nscanGene | genePred nscanPep |
# | genscan | genePred genscanPep |
# +-------------+---------------------------------+
# CREATE BUILD DIRECTORY (DONE 6/1/05 angie)
ssh kkstore01
mkdir /cluster/store9/canFam2
ln -s /cluster/store9/canFam2 /cluster/data/canFam2
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 6/1/05 angie)
mkdir /cluster/data/canFam2/M
cd /cluster/data/canFam2/M
# go to http://www.ncbi.nlm.nih.gov/ and search Nucleotide for
# "canis familiaris mitochondrion genome". That shows the gi number:
# 17737322
# Use that number in the entrez linking interface to get fasta:
wget -O chrM.fa \
'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=17737322&dopt=FASTA'
# Edit chrM.fa: make sure the long fancy header line says it's the
# Canis familiaris mitochondrion complete genome, and then replace the
# header line with just ">chrM".
# MAKE JKSTUFF AND BED DIRECTORIES (DONE 6/1/05 angie)
# This used to hold scripts -- better to keep them inline in the .doc
# so they're in CVS. Now it should just hold lift file(s) and
# temporary scripts made by copy-paste from this file.
mkdir /cluster/data/canFam2/jkStuff
# This is where most tracks will be built:
mkdir /cluster/data/canFam2/bed
# DOWNLOAD AGP, FASTA & QUAL (DONE 6/2/05 angie)
ssh kkstore01
mkdir /cluster/data/canFam2/broad
cd /cluster/data/canFam2/broad
ftp ftp.broad.mit.edu
prompt
bin
cd pub/assemblies/mammals/canFam2
mget contigs.bases.gz AAEX02.full_AGP Dog2.0.agp supercontigs
mget assembly.agp assembly.format contigs.quals.gz supercontigs.summary
bye
# Jean Chang is going to remove AAEX02.full_AGP to avoid confusion,
# so make sure we can tell people how to quickly regenerate it from
# Dog2.0.agp:
perl -wpe 'if (/contig_(\d+)/) { \
$a = sprintf "AAEX020%05d", $1+1; s/contig_\d+/$a/; }' \
Dog2.0.agp > /tmp/1.agp
diff /tmp/1.agp AAEX02.full_AGP | wc -l
#0
# Meanwhile, the AGP has chr01, chr02 instead of chr1, chr2... yecch.
# Substitute those out:
sed -e 's/^chr0/chr/' Dog2.0.agp > UCSC_Dog2.0.agp
# BUILD CHROM FA (DONE 6/28/05 angie)
ssh kkstore01
cd /cluster/data/canFam2/broad
awk '{print $1;}' UCSC_Dog2.0.agp | uniq > ../chrom.lst
nice gunzip contigs.bases.gz
foreach chr (`cat ../chrom.lst`)
set c = `echo $chr | sed -e 's/chr//'`
mkdir ../$c
awk '$1 == "'$chr'" {print;}' UCSC_Dog2.0.agp > ../$c/$chr.agp
agpToFa -simpleMultiMixed ../$c/$chr.agp $chr ../$c/$chr.fa contigs.bases
end
faSize ../*/chr*.fa
#2531673953 bases (146677410 N's 2384996543 real 2384996543 upper 0 lower) in 41 sequences in 41 files
#Total size: mean 61748145.2 sd 25246010.8 min 16727 (chrM) max 126883977 (chrX) median 61280721
#N count: mean 3577497.8 sd 1401887.8
#U count: mean 58170647.4 sd 24664411.9
#L count: mean 0.0 sd 0.0
# checkAgpAndFa prints out way too much info -- keep the end/stderr only:
cd /cluster/data/canFam2
foreach agp (?{,?}/chr*.agp)
set fa = $agp:r.fa
echo checking consistency of $agp and $fa
checkAgpAndFa $agp $fa | tail -1
end
nice gzip broad/contigs.bases
echo "chrM" >> chrom.lst
# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 6/28/05 angie)
ssh kkstore01
cd /cluster/data/canFam2
foreach agp (?{,?}/chr*.agp)
set fa = $agp:r.fa
echo splitting $agp and $fa
cp -p $agp $agp.bak
cp -p $fa $fa.bak
nice splitFaIntoContigs $agp $fa . -nSize=5000000
end
# No _random's in this assembly, so no need to clean up after them.
# Make a "pseudo-contig" for processing chrM too:
mkdir M/chrM_1
sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
mkdir M/lift
echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
echo "chrM_1" > M/lift/ordered.lst
set msize = `faSize M/chrM.fa | awk '{print $1;}'`
echo "0\tM/chrM_1\t$msize\tchrM\t$msize" > M/lift/ordered.lft
foreach f ( ?{,?}/chr*.fa.bak )
nice faCmp $f $f:r
end
# MAKE LIFTALL.LFT (DONE 6/28/05 angie)
ssh kkstore01
cd /cluster/data/canFam2
cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft
# CREATING DATABASE (DONE 6/28/05 angie)
ssh hgwdev
hgsql '' -e 'create database canFam2'
# Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
#/dev/sdc1 1.8T 940G 720G 57% /var/lib/mysql
# CREATING GRP TABLE FOR TRACK GROUPING (DONE 6/28/05 angie)
ssh hgwdev
hgsql canFam2 -e \
"create table grp (PRIMARY KEY(NAME)) select * from hg17.grp"
# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) 2BIT (DONE 6/28/05 angie)
# Make .2bit, unmasked until RepeatMasker and TRF steps are done.
# Do this now so we can load up RepeatMasker and run featureBits;
# can also load up other tables that don't depend on masking.
ssh kkstore01
cd /cluster/data/canFam2
nice faToTwoBit ?{,?}/chr*.fa canFam2.2bit
mkdir bed/chromInfo
twoBitInfo canFam2.2bit stdout \
| awk '{print $1 "\t" $2 "\t/gbdb/canFam2/canFam2.2bit";}' \
> bed/chromInfo/chromInfo.tab
# Make symbolic links from /gbdb/canFam2/ to the real .2bit.
ssh hgwdev
mkdir /gbdb/canFam2
ln -s /cluster/data/canFam2/canFam2.2bit /gbdb/canFam2/
# Load /gbdb/canFam2/canFam2.2bit paths into database and save size info.
cd /cluster/data/canFam2
hgsql canFam2 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql canFam2 -e 'load data local infile \
"/cluster/data/canFam2/bed/chromInfo/chromInfo.tab" \
into table chromInfo;'
echo "select chrom,size from chromInfo" | hgsql -N canFam2 > chrom.sizes
# take a look at chrom.sizes, should be 41 lines
wc chrom.sizes
# 41 82 603 chrom.sizes
# GOLD AND GAP TRACKS (DONE 6/28/05 angie)
ssh hgwdev
cd /cluster/data/canFam2
hgGoldGapGl -noGl -chromLst=chrom.lst canFam2 /cluster/data/canFam2 .
# featureBits fails if there's no chrM_gap, so make one:
# echo "create table chrM_gap like chr1_gap" | hgsql canFam2
# oops, that won't work until v4.1, so do this for the time being:
hgsql canFam2 -e "create table chrM_gap select * from chr1_gap where 0=1"
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CANFAM2 (DONE 8/1/05 angie)
ssh hgwdev
cd $HOME/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add canFam2 in all the right places and do
make update
cvs commit makefile
mkdir -p dog/canFam2
cvs add dog dog/canFam2
cvs ci -m "trackDb dir for dog genome(s)" dog/canFam2
# Do this in a clean (up-to-date, no edits) tree:
make alpha
# Add dbDb entry (not a new organism so defaultDb and genomeClade already
# have entries):
hgsql -h genome-testdb hgcentraltest \
-e 'insert into dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
values("canFam2", "May 2005", \
"/gbdb/canFam2/nib", "Dog", "chr14:11072309-11078928", 1, \
18, "Dog", "Canis familiaris", \
"/gbdb/canFam2/html/description.html", 0, 0, \
"Broad Institute v. 2.0");'
# REPEAT MASKING (DONE braney/angie 7-10-05)
#- Split contigs into 500kb chunks, at gaps if possible:
ssh kkstore01
cd /cluster/data/canFam2
foreach chr (`cat chrom.lst`)
set c = `echo $chr | sed -e 's/chr//'`
foreach d ($c/chr${c}*_?{,?})
cd $d
echo "splitting $d"
set contig = $d:t
faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
cd ../..
end
end
#- Make the run directory and job list:
cd /cluster/data/canFam2
cat << '_EOF_' > jkStuff/RMDog
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/canFam2/$2
/bin/cp $2 /tmp/canFam2/$2/
cd /tmp/canFam2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec dog $2
popd
/bin/cp /tmp/canFam2/$2/$2.out ./
if (-e /tmp/canFam2/$2/$2.align) /bin/cp /tmp/canFam2/$2/$2.align ./
if (-e /tmp/canFam2/$2/$2.tbl) /bin/cp /tmp/canFam2/$2/$2.tbl ./
if (-e /tmp/canFam2/$2/$2.cat) /bin/cp /tmp/canFam2/$2/$2.cat ./
/bin/rm -fr /tmp/canFam2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/canFam2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/canFam2
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMDog
mkdir RMRun
cp /dev/null RMRun/RMJobs
foreach chr (`cat chrom.lst`)
set c = `echo $chr | sed -e 's/chr//'`
foreach d ($c/chr${c}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/canFam2/jkStuff/RMDog \
/cluster/data/canFam2/$d $f \
'{'check out line+ /cluster/data/canFam2/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
end
#- Do the run
ssh kk
cd /cluster/data/canFam2/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
# Completed: 6149 of 6149 jobs
# CPU time in finished jobs: 32138805s 535646.75m 8927.45h 371.98d 1.019 y
# IO & Wait Time: 346449s 5774.15m 96.24h 4.01d 0.011 y
# Average job time: 5283s 88.05m 1.47h 0.06d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 8642s 144.03m 2.40h 0.10d
# Submission to last job: 147094s 2451.57m 40.86h 1.70d
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kkstore01
cd /cluster/data/canFam2
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out > /dev/null
end
#- Lift pseudo-contigs to chromosome level
foreach c (`cat chrom.lst`)
echo lifting $c
set dir=`echo $c | sed "s/chr//" `
cd $dir
liftUp $c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` > /dev/null
cd ..
end
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/canFam2
hgLoadOut canFam2 */chr*.fa.out
# VERIFY REPEATMASKER RESULTS (DONE 2005-07-11 braney)
# Eyeball some repeat annotations in the browser, compare to lib seqs.
# Run featureBits on canFam2 and on a comparable genome build, and compare:
ssh hgwdev
featureBits canFam2 rmsk
# 968054174 bases of 2384996543 (40.589%) in intersection
#canFam1 is
#896773874 bases of 2359845093 (38.001%) in intersection
# SIMPLE REPEATS (TRF) (DONE 2005-07-11 braney)
# PARTIALLY REDONE 2005-12-02 angie -- See partial redo notes below...
ssh kkstore01
mkdir /cluster/data/canFam2/bed/simpleRepeat
cd /cluster/data/canFam2/bed/simpleRepeat
mkdir trf
cp /dev/null jobs.csh
foreach d (/cluster/data/canFam2/*/chr*_?{,?})
set ctg = $d:t
foreach f ($d/${ctg}.fa)
set fout = $f:t:r.bed
echo $fout
echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" >> jobs.csh
end
end
csh -ef jobs.csh >&! jobs.log &
# check on this with
tail -f jobs.log
wc -l jobs.csh
ls -1 trf | wc -l
endsInLf trf/*
# When job is done do:
liftUp simpleRepeat.bed /cluster/data/canFam2/jkStuff/liftAll.lft warn trf/*.bed
# Load into the database:
ssh hgwdev
hgLoadBed canFam2 simpleRepeat \
/cluster/data/canFam2/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# load of simpleRepeat did not go as planned: 637318 record(s), 0 row(s) skipped, 1172 warning(s) loading bed.tab
featureBits canFam2 simpleRepeat
# 52855902 bases of 2384996543 (2.216%) in intersection
# canFam1 is
#36509895 bases of 2359845093 (1.547%) in intersection
# 2005-12-02 Error with simpleRepeats found during QA. 1622 entries have what
# appears to be a parsing error. The "sequence" value is a float number of
# format X.XX instead of an ATCG sequence. Lines removed from active tables.
# Copy of original data on hgwdev.canFam2.simpleRepeat_1202problem
# Partial redo 12/2/05 -- Jen found some lines with invalid values in
# the database. All were from trf/chr10_1.bed which I moved aside to
# trf/chr10_1.bed.bad. When I reran the chr10_1 job from jobs.csh,
# it produced a valid output file. So I did the rest of the steps and
# reloaded.
ssh kolossus
cd /cluster/data/canFam2/bed/simpleRepeat
mv trf/chr10_1.bed{,.bad}
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf /cluster/data/canFam2/10/chr10_1/chr10_1.fa /dev/null -bedAt=trf/chr10_1.bed -tempDir=/tmp
mv simpleRepeat.bed simpleRepeat.bed.bad
liftUp simpleRepeat.bed /cluster/data/canFam2/jkStuff/liftAll.lft warn trf/*.bed
# on hgwdev:
featureBits canFam2 simpleRepeat
#52885863 bases of 2384996543 (2.217%) in intersection
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
ssh hgwdev
cd /cluster/data/canFam2/bed
mkdir microsat
cd microsat
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
/cluster/bin/i386/hgLoadBed canFam2 microsat microsat.bed
# PROCESS SIMPLE REPEATS INTO MASK (DONE 2005-07-11 braney)
# 2005-12-02 angie: continuing redo of chr10_1, see below.
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kkstore01
cd /cluster/data/canFam2/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/chr*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/canFam2
mkdir bed/simpleRepeat/trfMaskChrom
foreach c (`cat chrom.lst`)
set dir=`echo $c | sed "s/chr//" `
if (-e $dir/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' $dir/lift/ordered.lst > $dir/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$dir.bed jkStuff/liftAll.lft warn `cat $dir/lift/oTrf.lst`
endif
end
# Here's the coverage for the filtered TRF:
ssh hgwdev
cat /cluster/data/canFam2/bed/simpleRepeat/trfMaskChrom/*.bed > /tmp/filtTrf.bed
featureBits canFam2 /tmp/filtTrf.bed
#23111877 bases of 2384996543 (0.969%) in intersection
# canFam1 was
#23017541 bases of 2359845093 (0.975%) in intersection
featureBits canFam2 /tmp/filtTrf.bed \!rmsk
# 1205611 bases of 2384996543 (0.051%) in intersection
#canFam1 was
#1275941 bases of 2359845093 (0.054%) in intersection
# 12/2/2005 -- since I regenerated trf/chr10_1.bed above, continue here...
ssh kolossus
cd /cluster/data/canFam2/bed/simpleRepeat
mv trfMask/chr10_1.bed{,.orig}
awk '{if ($5 <= 12) print;}' trf/chr10_1.bed > trfMask/chr10_1.bed
mv trfMaskChrom/chr10.bed{,.orig}
cd ../..
liftUp bed/simpleRepeat/trfMaskChrom/chr10.bed jkStuff/liftAll.lft warn `cat 10/lift/oTrf.lst`
ssh hgwdev
cat /cluster/data/canFam2/bed/simpleRepeat/trfMaskChrom/*.bed > /tmp/filtTrf.bed
featureBits canFam2 /tmp/filtTrf.bed
#23133898 bases of 2384996543 (0.970%) in intersection
featureBits canFam2 /tmp/filtTrf.bed \!rmsk
#1206965 bases of 2384996543 (0.051%) in intersection
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 2005-07-11 braney)
# REDONE 2005-12-02 angie (to pick up the chr10_1 simpleRepeat redo)
ssh kkstore01
cd /cluster/data/canFam2
# Soft-mask (lower-case) the contig and chr .fa's,
# then make hard-masked versions from the soft-masked.
set trfCtg=bed/simpleRepeat/trfMask
set trfChr=bed/simpleRepeat/trfMaskChrom
foreach f (*/chr*.fa)
echo "repeat- and trf-masking $f"
maskOutFa -soft $f $f.out $f
set chr = $f:t:r
maskOutFa -softAdd $f $trfChr/$chr.bed $f
echo "hard-masking $f"
maskOutFa $f hard $f.masked
end
# Tons of warnings like this, mostly for L1M*:
#WARNING: negative rEnd: -189 chrX:117586389-117586475 L1M3e
foreach c (`cat chrom.lst`)
set c=`echo $c | sed "s/chr//" `
echo "repeat- and trf-masking contigs of chr$c"
foreach d ($c/chr*_?{,?})
set ctg=$d:t
set f=$d/$ctg.fa
maskOutFa -soft $f $f.out $f
maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
maskOutFa $f hard $f.masked
end
end
#- Rebuild the nib files, using the soft masking in the fa:
foreach f (*/chr*.fa)
faToNib -softMask $f nib/$f:t:r.nib
end
# Make one big 2bit file as well, and make a link to it in
# /gbdb/canFam2/nib because hgBlat looks there:
faToTwoBit */chr*.fa canFam2.2bit
ssh hgwdev
ln -s /cluster/data/canFam2/canFam2.2bit /gbdb/canFam2/nib/
# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE 8/1/05 angie)
ssh hgwdev
mkdir /gbdb/canFam2/html
# Write ~/kent/src/hg/makeDb/trackDb/dog/canFam2/description.html
# with a description of the assembly and some sample position queries.
chmod a+r $HOME/kent/src/hg/makeDb/trackDb/dog/canFam2/description.html
# Check it in and copy (ideally using "make alpha" in trackDb) to
# /gbdb/canFam2/html
# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 8/1/05 angie)
# pitakluster (blastz etc):
ssh pk
mkdir /san/sanvol1/scratch/canFam2
rsync -av /cluster/data/canFam2/nib /san/sanvol1/scratch/canFam2/
mkdir /san/sanvol1/scratch/canFam2/rmsk
cp -p /cluster/data/canFam2/*/chr*.fa.out \
/san/sanvol1/scratch/canFam2/rmsk
# small cluster (chaining etc):
ssh kkr1u00
mkdir -p /iscratch/i/canFam2/nib
rsync -av /cluster/data/canFam2/nib /iscratch/i/canFam2/
iSync
# big cluster (genbank):
ssh kkstore01
mkdir /cluster/bluearc/scratch/hg/canFam2
rsync -av /cluster/data/canFam2/nib /cluster/bluearc/scratch/hg/canFam2/
# ask cluster-admin to rsync that to all big cluster nodes' /scratch/...
# MAKE LINEAGE-SPECIFIC REPEATS VS. HUMAN, MOUSE (DONE 8/1/05 angie)
ssh kolossus
cd /san/sanvol1/scratch/canFam2/rmsk
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Human in extra column 1, Mouse in extra column 2
foreach outfl ( *.out )
echo "$outfl"
/cluster/bluearc/RepeatMasker/DateRepeats \
${outfl} -query dog -comp human -comp mouse
end
# Now extract human (extra column 1), mouse (extra column 2).
cd ..
mkdir linSpecRep.notInHuman
mkdir linSpecRep.notInMouse
foreach f (rmsk/*.out_homo-sapiens_mus-musculus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInHuman/$base.out.spec
/cluster/bin/scripts/extractLinSpecReps 2 $f > \
linSpecRep.notInMouse/$base.out.spec
end
wc -l rmsk/*.out
#4533630 total
wc -l linSpecRep.notInHuman/*
#1542788 total
wc -l linSpecRep.notInMouse/*
#1546408 total
# Clean up.
rm rmsk/*.out_h*
# PRODUCING GENSCAN PREDICTIONS (DONE 8/11/05 angie)
ssh hgwdev
mkdir /cluster/data/canFam2/bed/genscan
cd /cluster/data/canFam2/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh kki
cd /cluster/data/canFam2/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
rm -f genome.list
touch genome.list
foreach f ( `ls -1S /cluster/data/canFam2/*/chr*_*/chr*_?{,?}.fa.masked` )
egrep '[ACGT]' $f > /dev/null
if ($status == 0) echo $f >> genome.list
end
wc -l genome.list
#495
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para make jobList
#Completed: 493 of 495 jobs
#Crashed: 2 jobs
#Average job time: 918s 15.30m 0.25h 0.01d
#Longest finished job: 30383s 506.38m 8.44h 0.35d
#Submission to last job: 132236s 2203.93m 36.73h 1.53d
# If there are crashes, diagnose with "para problems" and "para crashed".
# If a job crashes due to genscan running out of memory, re-run it
# manually with "-window=1200000" instead of "-window=2400000".
ssh kkr7u00
cd /cluster/data/canFam2/bed/genscan
/cluster/bin/x86_64/gsBig /cluster/data/canFam2/1/chr1_23/chr1_23.fa.masked gtf/chr1_23.fa.gtf -trans=pep/chr1_23.fa.pep -subopt=subopt/chr1_23.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
/cluster/bin/x86_64/gsBig /cluster/data/canFam2/27/chr27_1/chr27_1.fa.masked gtf/chr27_1.fa.gtf -trans=pep/chr27_1.fa.pep -subopt=subopt/chr27_1.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
ls -1 gtf | wc -l
# 495
endsInLf gtf/*
# Convert these to chromosome level files as so:
ssh kkstore01
cd /cluster/data/canFam2/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/canFam2/bed/genscan
ldHgGene -gtf canFam2 genscan genscan.gtf
hgPepPred canFam2 generic genscanPep genscan.pep
hgLoadBed canFam2 genscanSubopt genscanSubopt.bed
# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE 8/1/05 angie)
ssh kolossus
# numerator is canFam2 gapless bases as reported by featureBits,
# denominator is hg17 gapless bases as reported by featureBits,
# 1024 is threshold used for human -repMatch:
calc \( 2384996543 / 2867328468 \) \* 1024
# ( 2384996543 / 2867328468 ) * 1024 = 851.746316
# ==> use -repMatch=852 according to size scaled down from 1024 for human.
mkdir /cluster/bluearc/canFam2
mkdir /cluster/data/canFam2/bed/ooc
cd /cluster/data/canFam2/bed/ooc
ls -1 /cluster/data/canFam2/nib/chr*.nib > nib.lst
blat nib.lst /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/canFam2/11.ooc -repMatch=852
#Wrote 27388 overused 11-mers to /cluster/bluearc/canFam2/11.ooc
# AUTO UPDATE GENBANK MRNA RUN (DONE 8/8/05 angie)
# 2/22/06: added refGene
ssh hgwdev
# Update genbank config and source in CVS:
cd ~/kent/src/hg/makeDb/genbank
cvsup .
# Edit etc/genbank.conf and add these lines:
# canFam2 (dog)
canFam2.genome = /scratch/hg/canFam2/nib/chr*.nib
canFam2.lift = /cluster/data/canFam2/jkStuff/liftAll.lft
canFam2.refseq.mrna.native.load = no
canFam2.refseq.mrna.xeno.load = yes
canFam2.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005
canFam2.genbank.mrna.xeno.load = yes
canFam2.genbank.est.xeno.load = no
canFam2.downloadDir = canFam2
cvs ci etc/genbank.conf
# Edit src/align/gbBlat to add /cluster/bluearc/canFam2/11.ooc
cvs diff src/align/gbBlat
make
cvs ci src/align/gbBlat
# Install to /cluster/data/genbank:
make install-server
ssh eieio
cd /cluster/data/genbank
# This is an -initial run, (xeno) refseq only:
nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial canFam2 &
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam2
featureBits canFam2 xenoRefGene
#41278562 bases of 2384996543 (1.731%) in intersection
# Clean up:
rm -rf work/initial.canFam2
ssh eieio
cd /cluster/data/genbank
# This is an -initial run, mRNA only:
nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial canFam2 &
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam2
featureBits canFam2 mrna
#2061533 bases of 2384996543 (0.086%) in intersection
featureBits canFam2 xenoMrna
#63057460 bases of 2384996543 (2.644%) in intersection
# Clean up:
rm -rf work/initial.canFam2
ssh eieio
# -initial for ESTs:
nice bin/gbAlignStep -srcDb=genbank -type=est -initial canFam2 &
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 canFam2 &
featureBits canFam2 intronEst
#16241242 bases of 2384996543 (0.681%) in intersection
featureBits canFam2 est
#41719045 bases of 2384996543 (1.749%) in intersection
# Clean up:
rm -rf work/initial.canFam2
# 2/22/06: add native refseq
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
# genbank.conf: canFam2.refseq.mrna.native.load = yes
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=refseq -type=mrna -orgCat=native -initial \
canFam2 &
tail -f var/build/logs/2006.02.22-21:03:48.canFam2.initalign.log
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam2 &
tail -f var/dbload/hgwdev/logs/2006.02.22-21:19:53.dbload.log
featureBits canFam2 refGene
#1222436 bases of 2384996543 (0.051%) in intersection
# SWAP CHAINS FROM HG17, BUILD NETS ETC. (DONE 8/3/05 angie - REDONE 12/13/05 - REDONE 2/8/06)
# hg17-canFam2 was redone 12/12/05 (makeHg17.doc) so re-running this...
# and again 2/8/06, sheesh...
mkdir /cluster/data/canFam2/bed/blastz.hg17.swap
cd /cluster/data/canFam2/bed/blastz.hg17.swap
doBlastzChainNet.pl -swap /cluster/data/hg17/bed/blastz.canFam2/DEF \
>& do.log &
tail -f do.log
# Add {chain,net}Hg17 to trackDb.ra if necessary.
# RE-RUN NETTOAXT, AXTTOMAF FOR HG17 (DONE 10/31/05 angie)
# Obsoleted 12/13/05 by re-run of canFam2-hg17 above.
# Kate fixed netToAxt to avoid duplicated blocks, which is important
# for input to multiz. Regenerate maf using commands from sub-script
# netChains.csh generated by doBlastzChainNet.pl above.
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.hg17.swap/axtChain
netSplit canFam2.hg17.net.gz net
chainSplit chain canFam2.hg17.all.chain.gz
cd ..
mv axtNet axtNet.orig
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/cluster/data/canFam2/nib /iscratch/i/hg17/nib stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.canFam2.hg17.net.axt.gz
end
rm -r mafNet
mkdir mafNet
foreach f (axtNet/*.canFam2.hg17.net.axt.gz)
axtToMaf -tPrefix=canFam2. -qPrefix=hg17. $f \
/cluster/data/canFam2/chrom.sizes /cluster/data/hg17/chrom.sizes \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
rm -r axtChain/{chain,net}/ axtNet.orig
# QUALITY SCORES (DONE 8/11/05 angie)
ssh kkstore01
mkdir /cluster/data/canFam2/bed/quality
cd /cluster/data/canFam2/bed/quality
qaToQac ../../broad/contigs.quals.gz stdout \
| qacAgpLift ../../broad/UCSC_Dog2.0.agp stdin chrom.qac
mkdir wigData
# Build .wig, .wib files in current directory so that "wigData/" doesn't
# appear in the .wig's:
cd wigData
foreach agp (../../../*/chr*.agp)
set chr = $agp:t:r
set abbrev = `echo $chr | sed -e 's/^chr//; s/_random/r/;'`
echo $chr to qual_$abbrev wiggle
qacToWig -fixed ../chrom.qac -name=$chr stdout \
| wigEncode stdin qual_$abbrev.{wig,wib}
end
# Verify size of .wib file = chrom length
foreach f (*.wib)
set abbrev = $f:t:r
set chr = `echo $abbrev | sed -e 's/^qual_/chr/; s/r$/_random/;'`
set wibLen = `ls -l $f | awk '{print $5;}'`
set chromLen = `grep -w $chr ../../../chrom.sizes | awk '{print $2;}'`
if ($wibLen != $chromLen) then
echo "ERROR: $chr size is $chromLen but wib size is $wibLen"
else
echo $chr OK.
endif
end
# /gbdb & load:
ssh hgwdev
cd /cluster/data/canFam2/bed/quality/wigData
mkdir -p /gbdb/canFam2/wib
ln -s `pwd`/*.wib /gbdb/canFam2/wib
hgLoadWiggle canFam2 quality *.wig
# GC 5 BASE WIGGLE TRACK (DONE 8/11/05 angie)
ssh kki
mkdir /cluster/data/canFam2/bed/gc5Base
cd /cluster/data/canFam2/bed/gc5Base
cat > doGc5Base.csh <<'_EOF_'
#!/bin/csh -fe
set chr = $1
set c = `echo $chr | sed -e 's/^chr//; s/_random/r/;'`
/cluster/bin/x86_64/hgGcPercent \
-chr=${chr} -wigOut -doGaps -file=stdout -win=5 canFam2 \
/iscratch/i/canFam2/nib \
| /cluster/bin/x86_64/wigEncode stdin gc5Base_${c}{.wig,.wib}
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doGc5Base.csh
cp /dev/null spec
foreach c (`cat ../../chrom.lst`)
echo "./doGc5Base.csh $c" >> spec
end
para make spec
para time
#Completed: 41 of 41 jobs
#Average job time: 23s 0.38m 0.01h 0.00d
#Longest finished job: 44s 0.73m 0.01h 0.00d
#Submission to last job: 259s 4.32m 0.07h 0.00d
# /gbdb and load track on hgwdev
ssh hgwdev
cd /cluster/data/canFam2/bed/gc5Base
mkdir -p /gbdb/canFam2/wib
ln -s `pwd`/*.wib /gbdb/canFam2/wib
hgLoadWiggle canFam2 gc5Base *.wig
# EXTRACT LINEAGE-SPECIFIC REPEATS FOR RAT (DONE 8/12/05 angie)
ssh kolossus
cd /panasas/store/canFam2/rmsk
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
foreach outfl ( *.out )
echo "$outfl"
/cluster/bluearc/RepeatMasker/DateRepeats \
${outfl} -query dog -comp rat
end
# Now extract rat (extra column 1):
cd ..
mkdir linSpecRep.notInRat
foreach f (rmsk/*.out_rattus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractRepeats 1 $f > \
linSpecRep.notInRat/$base.out.spec
end
# Clean up.
rm rmsk/*.out_rat*
# LOAD CPGISLANDS (DONE 8/12/05 angie)
ssh hgwdev
mkdir -p /cluster/data/canFam2/bed/cpgIsland
cd /cluster/data/canFam2/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
mv cpglh.exe /cluster/data/canFam2/bed/cpgIsland/
ssh kolossus
cd /cluster/data/canFam2/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
set fout=$f:t:r:r.cpg
echo running cpglh on $f to $fout
./cpglh.exe $f > $fout
end
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
# load into database:
ssh hgwdev
cd /cluster/data/canFam2/bed/cpgIsland
hgLoadBed canFam2 cpgIslandExt -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
wc -l cpgIsland.bed
# 47804 cpgIsland.bed
featureBits canFam2 cpgIslandExt
#38974374 bases of 2384996543 (1.634%) in intersection
# ANDY LAW CPGISLANDS (DONE 8/12/05 angie)
# See notes in makeGalGal2.doc.
ssh kolossus
mkdir /cluster/data/canFam2/bed/cpgIslandGgfAndy
cd /cluster/data/canFam2/bed/cpgIslandGgfAndy
# Use masked sequence since this is a mammal...
cp /dev/null cpgIslandGgfAndyMasked.bed
foreach f (../../*/chr*.fa.masked)
set chr = $f:t:r:r
echo preproc and run on masked $chr
/cluster/home/angie/bin/i386/preProcGgfAndy $f \
| /cluster/home/angie/ggf-andy-cpg-island.pl \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndyMasked.bed
end
# load into database:
ssh hgwdev
cd /cluster/data/canFam2/bed/cpgIslandGgfAndy
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
hgLoadBed canFam2 cpgIslandGgfAndyMasked -tab -noBin \
-sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
featureBits canFam2 cpgIslandExt
#38974374 bases of 2384996543 (1.634%) in intersection
featureBits canFam2 cpgIslandGgfAndyMasked
#99187178 bases of 2384996543 (4.159%) in intersection
wc -l ../cpgIsland/cpgIsland.bed *bed
# 47804 ../cpgIsland/cpgIsland.bed
# 138037 cpgIslandGgfAndyMasked.bed
# MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN & BEYOND (DONE 8/15/05 angie)
# In an email 2/13/04, Arian said we could treat all human repeats as
# lineage-specific for human-chicken blastz. Do the same for dog.
# Scripts expect *.out.spec filenames, so set that up:
ssh kkstore01
cd /cluster/data/canFam2
mkdir /panasas/store/canFam2/linSpecRep.notInNonMammal
foreach f (/panasas/store/canFam2/rmsk/chr*.fa.out)
ln $f /panasas/store/canFam2/linSpecRep.notInNonMammal/$f:t:r:r.out.spec
end
# SWAP CHAINS FROM MM6, BUILD NETS ETC. (DONE 8/15/05 angie)
# REDONE by hiram 12/7/05 -- tables loaded as {chain,net}Mm6u1 then
# renamed to {chain,net}Mm6 12/13/05 angie
mkdir /cluster/data/canFam2/bed/blastz.mm6.swap
cd /cluster/data/canFam2/bed/blastz.mm6.swap
doBlastzChainNet.pl -swap /cluster/data/mm6/bed/blastz.canFam2/DEF \
>& do.log
echo "check /cluster/data/canFam2/bed/blastz.mm6.swap/do.log" \
| mail -s "check do.log" $USER
# Add {chain,net}Mm6 to trackDb.ra if necessary.
# RE-RUN NETTOAXT, AXTTOMAF FOR MM6 (DONE 11/1/05 angie)
# Obsoleted by re-run of canFam2-mm6 above.
# Kate fixed netToAxt to avoid duplicated blocks, which is important
# for input to multiz. Regenerate maf using commands from sub-script
# netChains.csh generated by doBlastzChainNet.pl above.
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.mm6.swap/axtChain
netSplit canFam2.mm6.net.gz net
chainSplit chain canFam2.mm6.all.chain.gz
cd ..
mv axtNet axtNet.orig
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/cluster/data/canFam2/nib /cluster/data/mm6/nib stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.canFam2.mm6.net.axt.gz
end
rm -r mafNet
mkdir mafNet
foreach f (axtNet/*.canFam2.mm6.net.axt.gz)
axtToMaf -tPrefix=canFam2. -qPrefix=mm6. $f \
/cluster/data/canFam2/chrom.sizes /cluster/data/mm6/chrom.sizes \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
rm -r axtChain/{chain,net}/ axtNet.orig
# SWAP CHAINS FROM RN3, BUILD NETS ETC. (DONE 8/16/05 angie)
mkdir /cluster/data/canFam2/bed/blastz.rn3.swap
cd /cluster/data/canFam2/bed/blastz.rn3.swap
doBlastzChainNet.pl -swap /cluster/data/rn3/bed/blastz.canFam2/DEF \
>& do.log
echo "check /cluster/data/canFam2/bed/blastz.rn3.swap/do.log" \
| mail -s "check do.log" $USER
# Add {chain,net}Rn3 to trackDb.ra if necessary.
    # Found that downloads in rn3 were missing 2005-12-21 - remake - Hiram
cd /cluster/data/rn3/bed/blastz.canFam2
/cluster/bin/scripts/doBlastzChainNet.pl -continue=download \
/cluster/data/rn3/bed/blastz.canFam2/DEF \
-chainMinScore=1000 -chainLinearGap=loose > downloads.out 2>&1
# RE-RUN NETTOAXT, AXTTOMAF FOR RN3 (DONE 11/2/05 angie)
# Kate fixed netToAxt to avoid duplicated blocks, which is important
# for input to multiz. Regenerate maf using commands from sub-script
# netChains.csh generated by doBlastzChainNet.pl above.
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.rn3.swap/axtChain
netSplit canFam2.rn3.net.gz net
chainSplit chain canFam2.rn3.all.chain.gz
cd ..
mv axtNet axtNet.orig
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/cluster/data/canFam2/nib /cluster/data/rn3/nib stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.canFam2.rn3.net.axt.gz
end
rm -r mafNet
mkdir mafNet
foreach f (axtNet/*.canFam2.rn3.net.axt.gz)
axtToMaf -tPrefix=canFam2. -qPrefix=rn3. $f \
/cluster/data/canFam2/chrom.sizes /cluster/data/rn3/chrom.sizes \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
rm -r axtChain/{chain,net}/ axtNet.orig
# MAKE THIS THE DEFAULT ASSEMBLY WHEN THERE ARE ENOUGH TRACKS (DONE 11/28/05 angie)
hgsql -h genome-testdb hgcentraltest \
-e 'update defaultDb set name = "canFam2" where genome = "Dog";'
# MAKE Human Proteins track (DONE braney 2005-12-10)
ssh kkstore01
cd /cluster/data/canFam2
cat */chr*/*.lft > jkStuff/subChr.lft
mkdir blastDb
for i in */*/*_*_*.fa; do ln `pwd`/$i blastDb; done
cd blastDb
for i in *.fa; do /cluster/bluearc/blast229/formatdb -p F -i $i; rm $i; done
ssh pk
destDir=/san/sanvol1/scratch/canFam2/blastDb
mkdir -p $destDir
cd /cluster/data/canFam2/blastDb
for i in nin nsq nhr; do cp *.$i $destDir; done
mkdir -p /cluster/data/canFam2/bed/tblastn.hg17KG
cd /cluster/data/canFam2/bed/tblastn.hg17KG
find $destDir -name "*.nsq" | sed "s/\.nsq//" > target.lst
mkdir kgfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(500000/`wc target.lst | awk "{print \\\$1}"`\)
# 37365/(500000/6149) = 459.514770
split -l 460 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
rm -rf /cluster/bluearc/canFam2/bed/tblastn.hg17KG/blastOut
mkdir -p /cluster/bluearc/canFam2/bed/tblastn.hg17KG/blastOut
ln -s /cluster/bluearc/canFam2/bed/tblastn.hg17KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
# Parasol job template: one job per (blastDb partition, protein batch) pair.
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
# blastSome <blastDb partition> <protein batch fa> <output psl>:
# tblastn one batch of hg17KG proteins against one canFam2 blastDb
# partition, then lift the hits up to canFam2 chromosome coordinates.
# The e-value loop retries with successively stricter cutoffs until
# blastall succeeds; the three liftUp calls go query (protein) coords,
# then sub-chunk, then contig -> chromosome on the target side.
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast2211x86_64/bin/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/canFam2/jkStuff/subChr.lft carry $f.3
liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/canFam2/jkStuff/liftAll.lft carry $f.4
mv $3.tmp $3
rm -f $f.1 $f.2
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
exit
chmod +x blastSome
gensub2 target.lst kg.lst blastGsub blastSpec
para create blastSpec
para push
# Completed: 504218 of 504218 jobs
# CPU time in finished jobs: 28863259s 481054.31m 8017.57h 334.07d 0.915 y
# IO & Wait Time: 4255045s 70917.42m 1181.96h 49.25d 0.135 y
# Average job time: 66s 1.09m 0.02h 0.00d
# Longest finished job: 443s 7.38m 0.12h 0.01d
# Submission to last job: 166648s 2777.47m 46.29h 1.93d
ssh kki
cd /cluster/data/canFam2/bed/tblastn.hg17KG
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1) $(path2)
#ENDLOOP
'_EOF_'
# chainSome <blastOut/kgNN dir> <chrom>: chain the per-chrom tblastn hits
# for one protein batch into ../c.<chrom>.<batch>.psl.
# NOTE(fix): in the original doc the simpleChain command line was wrapped
# mid-word ("bas"/"ename"), breaking the `basename $1` substitution;
# restored to a single line here.
cat << '_EOF_' > chainSome
(cd $1; cat q.$2_*.psl | /cluster/home/braney/bin/i386/simpleChain -chrom=$2 -prot -outPsl -maxGap=250000 stdin ../c.$2.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec ../../chrom.lst
para create chainSpec
para push
# Completed: 3362 of 3362 jobs
# CPU time in finished jobs: 1078406s 17973.43m 299.56h 12.48d 0.034 y
# IO & Wait Time: 85905s 1431.75m 23.86h 0.99d 0.003 y
# Average job time: 346s 5.77m 0.10h 0.00d
# Longest finished job: 22556s 375.93m 6.27h 0.26d
# Submission to last job: 106376s 1772.93m 29.55h 1.23d
ssh kkstore01
cd /cluster/data/canFam2/bed/tblastn.hg17KG/blastOut
for i in kg??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.*.$i.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -u -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/canFam2/bed/tblastn.hg17KG/blastHg17KG.psl
cd ..
wc blastHg17KG.psl
# 64343 1351203 10913814 blastHg17KG.psl
    # (doc fix: original said blastDm2FB.psl, a copy-paste from the dm2 doc)
    pslUniq blastHg17KG.psl stdout | wc
# 18827 395367 2992653
cat kgfa/*fa | grep ">" | wc
# 309494 309494 4810327
ssh hgwdev
cd /cluster/data/canFam2/bed/tblastn.hg17KG
hgLoadPsl canFam2 blastHg17KG.psl
featureBits canFam2 blastHg17KG
# 35781547 bases of 2384996543 (1.500%) in intersection
exit
# back to kksilo
rm -rf blastOut
# End tblastn
# BLASTZ SELF (DONE braney 8-29-2005)
# For future reference -- doBlastzChainNet.pl should have been used
ssh pk
mkdir -p /cluster/data/canFam2/bed/blastz.canFam2
cd /cluster/data/canFam2/bed/blastz.canFam2
# Blastz DEF parameter file for the dog self-alignment: target and query
# are both the canFam2 nibs, split into 10 Mb chunks with 10 kb overlap.
# Repeats are not abridged (BLASTZ_ABRIDGE_REPEATS=0).
cat << '_EOF_' > DEF
# dog vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0
# TARGET
# Dog
SEQ1_DIR=/scratch/hg/canFam2/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Dog
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/data/canFam2/bed/blastz.canFam2
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
cp /cluster/data/canFam2/chrom.sizes S1.len
cp /cluster/data/canFam2/chrom.sizes S2.len
mkdir run
cd run
partitionSequence.pl 10000000 10000 /cluster/data/canFam2/nib ../S1.len \
-xdir xdir.sh -rawDir ../lav \
> canFam2.lst
csh -ef xdir.sh
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-run-ucsc $(path1) $(path2) ../DEF {check out line ../lav/$(file1)/$(file1)_$(file2).lav}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 canFam2.lst canFam2.lst gsub jobList
para create jobList
para try, check, push, check, ...
# Completed: 70756 of 70756 jobs
# CPU time in finished jobs: 5743034s 95717.24m 1595.29h 66.47d 0.182 y
# IO & Wait Time: 199800s 3330.00m 55.50h 2.31d 0.006 y
# Average job time: 84s 1.40m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2758s 45.97m 0.77h 0.03d
# Submission to last job: 17852s 297.53m 4.96h 0.21d
# convert lav files to psl
# First, combine all files per partition (too many files per chrom
# to do in one swoop!). Then do another round to collect all files
# per chrom.
ssh kki
cd /cluster/data/canFam2/bed/blastz.canFam2
mkdir pslChrom pslParts run.lavToPsl
cd run.lavToPsl
ls -1d ../lav/* | sed -e 's@/$@@' > parts.lst
# For self alignments, we need lavToAxt's -dropSelf behavior,
# so go lav->axt->psl... could add -dropSelf to lavToPsl, but that
# might be somewhat invasive and perhaps not really much faster (?)
# because the sequence must be dug up in order to rescore...
# do.csh <lav partition dir> <output .psl.gz>:
# combine one partition's lav files, convert lav -> axt (-dropSelf
# discards trivial self-alignments) -> psl, and sort by target
# position ready for the per-chromosome merge below.
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cat $1/*.lav \
| lavToAxt -dropSelf stdin /iscratch/i/canFam2/nib \
/iscratch/i/canFam2/nib stdout \
| axtToPsl stdin ../S1.len ../S2.len stdout \
| sort -k 14,14 -k 16n,17n \
| gzip -c > $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh $(path1) {check out exists ../pslParts/$(file1).psl.gz }
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 parts.lst single gsub jobList
para create jobList
para try, check, push, check, ...
# Completed: 266 of 275 jobs
# Crashed: 9 jobs
# CPU time in finished jobs: 1228s 20.47m 0.34h 0.01d 0.000 y
# IO & Wait Time: 7117s 118.61m 1.98h 0.08d 0.000 y
# Average job time: 31s 0.52m 0.01h 0.00d
# Longest finished job: 74s 1.23m 0.02h 0.00d
# Submission to last job: 641s 10.68m 0.18h 0.01d
awk '{print $1;}' /cluster/data/canFam2/chrom.sizes > chroms.lst
# doChrom.csh <chrom> <output .psl.gz>: merge all per-partition psl
# files for one chromosome into a single target-sorted psl.
cat << '_EOF_' > doChrom.csh
#!/bin/csh -ef
zcat ../pslParts/$1.*.psl.gz \
| sort -k 14,14 -k 16n,17n \
| gzip -c > $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChrom.csh
# Parasol job template: one doChrom.csh job per chromosome.
cat << '_EOF_' > gsub.chroms
#LOOP
./doChrom.csh $(root1) {check out exists ../pslChrom/$(root1).psl.gz }
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 chroms.lst single gsub.chroms jobList
para create jobList
para try, check, push, check, ...
# Completed: 40 of 40 jobs
# CPU time in finished jobs: 463s 7.71m 0.13h 0.01d 0.000 y
# IO & Wait Time: 178s 2.97m 0.05h 0.00d 0.000 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest finished job: 38s 0.63m 0.01h 0.00d
# Submission to last job: 61s 1.02m 0.02h 0.00d
# CHAIN SELF BLASTZ
# For future reference -- doBlastzChainNet.pl should have been used
# REDONE partially, see below (from the filtering step onward, to get rid
# of duplicates, 12/6/05 angie)
# CHAINS REMERGED to reassign IDs so that they're unique genome-wide, and
# nets deleted to avoid confusion, 12/14/05 angie. The IDs were not unique
# genome-wide (only per-chrom) because the filtering was done directly on
# run1/chain files which had not yet had their IDs reassigned by
# chainMergeSort, and after filtering, chainMergeSort -saveId was used... doh.
# It would have been OK if the filtering had been done post-merging,
# or if -saveId had been omitted after filtering. caught by joinerCheck.
# Also could have been avoided by just using doBlastzChainNet.pl the first time.
# Run axtChain on little cluster
ssh kki
cd /cluster/data/canFam2/bed/blastz.canFam2
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S ../../pslChrom/*.psl.gz > input.lst
# Parasol job template: one doChain job per per-chrom psl file.
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# doChain <psl.gz> <output chain>: chain the self-alignment psl for one
# chromosome, then filter out chains that chainAntiRepeat judges to be
# repeat-induced artifacts.
cat << '_EOF_' > doChain
#!/bin/csh -ef
setenv PATH /cluster/bin/x86_64:$PATH
axtChain -verbose=0 -psl $1 /iscratch/i/canFam2/nib \
/iscratch/i/canFam2/nib stdout \
| chainAntiRepeat /iscratch/i/canFam2/nib /iscratch/i/canFam2/nib \
stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 40 of 41 jobs
#Crashed: 1 jobs
#Average job time: 185s 3.09m 0.05h 0.00d
#Longest job: 490s 8.17m 0.14h 0.01d
#Submission to last job: 74789s 1246.48m 20.77h 0.87d
# now on the cluster server, sort chains
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain
# Had these steps actually been followed it would have saved some
# trouble, but instead some custom filtering was performed directly
# on run1/chain files and they never got their IDs reassigned:
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=10000 /tmp/score.$f:t:r
echo ""
end
# trim to minScore=20000 to cut some of the fluff
mkdir chainFilt
foreach f (chain/*.chain)
chainFilter -minScore=20000 $f > chainFilt/$f:t
end
gzip -c all.chain > all.chain.unfiltered.gz
rm -r chain
mv chainFilt chain
chainMergeSort -saveId chain/*.chain > all.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain canFam2 ${c}_chainSelf $i
end
# 12/6/05 angie -- Jen found that all chains were duplicated --
# turns out that axtChain/run1/chain/ contained some chr*.filt.chain
# in addition to chr*.psl.chain, so *.chain resulted in duplicates.
# Redo the filtering and chain merge, then re-run
# doBlastzChainNet.pl -continue net
# to redo the subsequent steps.
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain
chainMergeSort run1/chain/chr*.psl.chain \
| gzip -c > all.chain.unfiltered.gz
rm -r chain
chainSplit chain all.chain.unfiltered.gz
mkdir chainFilt
foreach f (run1/chain/chr*.psl.chain)
chainFilter -minScore=20000 $f > chainFilt/$f:t:r:r.chain
end
rm -r chain
mv chainFilt chain
chainMergeSort -saveId chain/*.chain \
| gzip -c > canFam2.canFam2.all.chain.gz
ssh hgwdev
cd /cluster/data/canFam2/bed/blastz.canFam2
rm -r axtNet mafNet
rm -r /usr/local/apache/htdocs/goldenPath/canFam2/vsSelf/
doBlastzChainNet.pl -continue net DEF >& redo.log &
tail -f redo.log
# 12/14/05 angie -- Jen ran joinerCheck and it complained about
# non-unique IDs in the chain tables. fixing...
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain
chainSplit chain canFam2.canFam2.all.chain.gz
mv canFam2.canFam2.all.chain.gz canFam2.canFam2.all.chain.badIDs.gz
chainMergeSort chain/*.chain \
| gzip -c > canFam2.canFam2.all.chain.gz
rm -r chain
chainSplit chain canFam2.canFam2.all.chain.gz
# Load chains into database
ssh hgwdev
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain canFam2 ${c}_chainSelf $i
end
cd ..
rm -r chain
# I removed the self net in order to avoid confusion, so no need to
# regenerate it.
# NET SELF BLASTZ
# For future reference -- doBlastzChainNet.pl should have been used
# REDONE 12/6/05 angie as part of doBlastzChainNet.pl command above
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain
chainPreNet all.chain.gz ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain
netClass -noAr noClass.net canFam2 canFam2 self.net
rm noClass.net
# Load the nets into database
netFilter -minGap=10 self.net | hgLoadNet canFam2 netSelf stdin
# Add entries for chainSelf, netSelf to dog/canFam2 trackDb
# ADD SELF DOWNLOADABLE FILES (DONE 11/21/05 angie)
# For future reference -- doBlastzChainNet.pl should have been used for
# all steps from blastz up to this point.
# REDONE 12/6/05 angie as part of doBlastzChainNet.pl command above
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain
mv self.net canFam2.canFam2.net
gzip canFam2.canFam2.net
cd ..
doBlastzChainNet.pl -continue download -stop download DEF \
>& doDownload.log
tail -f doDownload.log
# MULTIZ.V10 4WAY (DOG/H/M/R) (DONE 11/7/05 angie - REDONE 12/13/05)
# hg17 and mm6 alignments were redone -- re-run this to pick up improvements.
# Tree: ((canFam2 hg17) (mm6 rn3))
ssh kkstore01
mkdir /cluster/data/canFam2/bed/multiz4way.2005-12-13
rm -f /cluster/data/canFam2/bed/multiz4way
ln -s /cluster/data/canFam2/bed/multiz4way.2005-12-13 \
/cluster/data/canFam2/bed/multiz4way
cd /cluster/data/canFam2/bed/multiz4way
# Setup: Copy pairwise MAF to /san/sanvol1/scratch:
mkdir /san/sanvol1/scratch/dogMultiz4way
foreach db (hg17 mm6 rn3)
echo $db
cp -pR /cluster/data/canFam2/bed/blastz.$db/mafNet \
/san/sanvol1/scratch/dogMultiz4way/$db
end
ls -lLR /san/sanvol1/scratch/dogMultiz4way
# Make output dir and script to use /cluster/bin/scripts/runMultizV10.csh:
mkdir maf
# doMultizAll.csh <chrom>: build the 4-way MAF for one chromosome in
# tree order ((canFam2 hg17) (mm6 rn3)) — first combine mm6 and rn3,
# then add hg17 — using a node-local scratch dir for intermediates.
cat << '_EOF_' > doMultizAll.csh
#!/bin/csh -fex
set chr = $1
set tmpDir = /scratch/dogMultiz4way.$chr
mkdir $tmpDir
set mafScratch = /san/sanvol1/scratch/dogMultiz4way
# Really should write a perl script to take a tree like this and generate
# commands like the ones below:
# ((canFam2 hg17) (mm6 rn3))
/cluster/bin/scripts/runMultizV10.csh \
$mafScratch/mm6/$chr.maf.gz \
$mafScratch/rn3/$chr.maf.gz \
0 canFam2.$chr $tmpDir $tmpDir/$chr.MusRat.maf
/cluster/bin/scripts/runMultizV10.csh \
$mafScratch/hg17/$chr.maf.gz \
$tmpDir/$chr.MusRat.maf \
1 canFam2.$chr $tmpDir maf/$chr.maf
rm -f $tmpDir/*.maf
rmdir $tmpDir
'_EOF_'
# << for emacs
chmod 775 doMultizAll.csh
awk '{print "./doMultizAll.csh " $1;}' /cluster/data/canFam2/chrom.sizes \
> jobs.lst
# Run on brawny cluster
ssh pk
cd /cluster/data/canFam2/bed/multiz4way
para make jobs.lst
para time
#Completed: 41 of 41 jobs
#Average job time: 1296s 21.61m 0.36h 0.02d
#Longest finished job: 2879s 47.98m 0.80h 0.03d
#Submission to last job: 2879s 47.98m 0.80h 0.03d
ls -1 missing*
#ls: No match.
# make /gbdb/ links to 4way maf files:
ssh hgwdev
mkdir -p /gbdb/canFam2/multiz4way/maf/multiz4way
ln -s /cluster/data/canFam2/bed/multiz4way/maf/chr*.maf \
/gbdb/canFam2/multiz4way/maf/multiz4way/
# load into database
cd /tmp
hgLoadMaf -warn canFam2 multiz4way \
-pathPrefix=/gbdb/canFam2/multiz4way/maf/multiz4way
# load summary table to replace pairwise
cat /cluster/data/canFam2/bed/multiz4way/maf/chr*.maf \
| nice hgLoadMafSummary canFam2 multiz4waySummary stdin
# Dropped unused indexes (2006-05-09 kate)
# NOTE: this is not required in the future, as the loader
# has been fixed to not generate these indexes
hgsql canFam2 -e "alter table multiz4waySummary drop index chrom_2"
hgsql canFam2 -e "alter table multiz4waySummary drop index chrom_3"
# put 4way MAF out for download
ssh kolossus
cd /cluster/data/canFam2/bed/multiz4way
mkdir mafDownload
foreach f (maf/*.maf)
nice gzip -c $f > mafDownload/$f:t.gz
end
cd mafDownload
md5sum *.maf.gz > md5sum.txt
# make a README.txt
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/canFam2/multiz4way
ln -s /cluster/data/canFam2/bed/multiz4way/mafDownload/{*.maf.gz,*.txt} \
/usr/local/apache/htdocs/goldenPath/canFam2/multiz4way
# Cleanup
rm -rf /san/sanvol1/scratch/dogMultiz4way/
# PHASTCONS 4WAY WITH METHODS FROM PAPER (DONE 11/9/05 angie - REDONE 12/13/05)
# ((canFam2,hg17),(mm6,rn3))
# redo of hg17, mm6 -> multiz -> phastCons.
ssh kkstore01
mkdir -p /san/sanvol1/scratch/canFam2/chrom
cp -p /cluster/data/canFam2/?{,?}/chr*.fa /san/sanvol1/scratch/canFam2/chrom/
# Split chrom fa into smaller windows for phastCons:
ssh pk
mkdir /cluster/data/canFam2/bed/multiz4way/phastCons
mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.split
cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.split
set WINDOWS = /san/sanvol1/scratch/canFam2/phastCons/WINDOWS
rm -fr $WINDOWS
mkdir -p $WINDOWS
# doSplit.sh <chrom maf>: msa_split one chromosome's MAF into ~1 Mb
# SS-format windows for phastCons, gzipping the results into the shared
# WINDOWS dir; intermediate files live in a node-local tmp dir.
cat << 'EOF' > doSplit.sh
#!/bin/csh -ef
set PHAST=/cluster/bin/phast
set FA_SRC=/san/sanvol1/scratch/canFam2/chrom
set WINDOWS=/san/sanvol1/scratch/canFam2/phastCons/WINDOWS
set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
-O canFam2,hg17,mm6,rn3 \
-w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
chmod a+x doSplit.sh
rm -f jobList
foreach file (/cluster/data/canFam2/bed/multiz4way/maf/*.maf)
if (-s $file) then
echo "doSplit.sh {check in exists+ $file}" >> jobList
endif
end
para make jobList
para time
#Completed: 41 of 41 jobs
#Average job time: 132s 2.21m 0.04h 0.00d
#Longest finished job: 232s 3.87m 0.06h 0.00d
#Submission to last job: 232s 3.87m 0.06h 0.00d
############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
# Use consEntropy --NH to make it suggest a --expected-lengths param
# that we should try next. Adam ran this on hg17 to find out the
# total entropy of the hg17 model:
# consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
#Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
#Relative entropy: H=0.608216 bits/site
#Required length: N=16.085437 sites
#Total entropy: NH=9.783421 bits
# Our target is that NH result: 9.7834 bits.
# Use phyloFit to make an initial model:
ssh kolossus
cd /cluster/data/canFam2/bed/multiz4way/phastCons
/cluster/bin/phast/msa_view ../maf/chr{2,20,37}.maf \
--aggregate canFam2,hg17,mm6,rn3 \
-i MAF -o SS > all.ss
/cluster/bin/phast/phyloFit all.ss \
--tree "((canFam2,hg17),(mm6,rn3))" \
-i SS --out-root starting-tree
cat starting-tree.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -378601835.277927
#BACKGROUND: 0.285684 0.213599 0.213743 0.286974
#RATE_MAT:
# -0.889874 0.169851 0.563150 0.156874
# 0.227172 -1.149929 0.168661 0.754097
# 0.752694 0.168547 -1.148977 0.227736
# 0.156169 0.561285 0.169622 -0.887076
#TREE: ((canFam2:0.161521,hg17:0.166335):0.114597,(mm6:0.071038,rn3:0.076403):0.114597);
# also get GC content from model -- if similar enough, no need to extract it
# separately above.
awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod
#0.427342
# OK, use .427 for --gc below.
# Use values of --target-coverage and --expected-lengths sort of like
# those in the latest run on human, 0.25 and 12, just as a starting point.
# Multiply each subst rate on the TREE line by 3.75 which is roughly the
# ratio of noncons to cons in
# /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.estimate/ave.*
/cluster/bin/phast/tree_doctor -s 3.75 starting-tree.mod \
> starting-tree.noncons.mod
/cluster/bin/phast/consEntropy --NH 9.7834 0.25 12 \
starting-tree{,.noncons}.mod
#( Solving for new omega: 12.000000 13.069907 13.002421 13.002166 )
#Transition parameters: gamma=0.250000, omega=12.000000, mu=0.083333, nu=0.027778
#Relative entropy: H=0.803151 bits/site
#Expected min. length: L_min=11.957635 sites
#Expected max. length: L_max=9.271001 sites
#Phylogenetic information threshold: PIT=L_min*H=9.603785 bits
#Recommended expected length: omega=13.002166 sites (for L_min*H=9.783400)
# OK, use --expected-lengths 13.
############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ###########
# We're here because the actual target coverage was not satisfactory,
# so we're changing the --target-coverage param. Given that we're
# changing that, take a guess at how we should change --expected-lengths
# in order to also hit the total entropy target.
cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate
# SECOND ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.18 13 ave.{cons,noncons}.mod
#ERROR: too many iterations, not converging; try without --NH.
# -- it gets that error unless I raise coverage to 0.34. Well,
# stick with 13 for now...
# THIRD ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.15 13 ave.{cons,noncons}.mod
#ERROR: too many iterations, not converging; try without --NH.
# -- it gets that error unless I raise coverage to 0.31. Well,
# stick with 13 for now...
# Now set up cluster job to estimate model parameters given free params
# --target-coverage and --expected-lengths and the data.
ssh pk
mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate
cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate
# REDO ONLY
cp ../../../multiz4way.2005-11-07/phastCons/run.estimate/ave.*.mod .
# REDO ONLY -- skip the first iteration stuff from here on out, and
# do subsequent iteration stuff using params from 11-07 run.
# FIRST ITERATION: Use ../starting-tree.mod:
# REDO ONLY
cp ../../../multiz4way.2005-11-07/phastCons/run.estimate/ave.*.mod .
# REDO ONLY -- skip the First iteration stuff from here on out, and
# do subsequent iteration stuff using params from 11-07 run.
# doEstimate.sh <ss.gz> <log> <tree out root>: estimate cons/noncons
# tree models for one alignment window with phastCons.
# NOTE: the second heredoc below overwrites the first; create only the
# version matching the current iteration (first vs. subsequent).
cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.427 --nrates 1,1 \
--no-post-probs --ignore-missing \
--expected-lengths 13 --target-coverage 0.25 \
--quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
# SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.427 --nrates 1,1 \
--no-post-probs --ignore-missing \
--expected-lengths 13 --target-coverage 0.15 \
--quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
chmod a+x doEstimate.sh
rm -fr LOG TREES
mkdir -p LOG TREES
rm -f jobList
foreach f (/san/sanvol1/scratch/canFam2/phastCons/WINDOWS/*.ss.gz)
set root = $f:t:r:r
echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
end
para make jobList
para time
#Completed: 2434 of 2434 jobs
#Average job time: 29s 0.48m 0.01h 0.00d
#Longest finished job: 59s 0.98m 0.02h 0.00d
#Submission to last job: 214s 3.57m 0.06h 0.00d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ssh kolossus
cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate
ls -1 TREES/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
--output-average ave.cons.mod > cons_summary.txt
ls -1 TREES/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
--output-average ave.noncons.mod > noncons_summary.txt
grep TREE ave*.mod
# FIRST ITERATION:
#ave.cons.mod:TREE: ((canFam2:0.063241,hg17:0.067179):0.047808,(mm6:0.029756,rn3:0.031924):0.047808);
#ave.noncons.mod:TREE: ((canFam2:0.201135,hg17:0.213229):0.151937,(mm6:0.094208,rn3:0.101249):0.151937);
# SECOND ITERATION:
#ave.cons.mod:TREE: ((canFam2:0.056147,hg17:0.059693):0.042487,(mm6:0.026610,rn3:0.028533):0.042487);
#ave.noncons.mod:TREE: ((canFam2:0.192186,hg17:0.203958):0.145480,(mm6:0.090727,rn3:0.097451):0.145480);
# THIRD ITERATION:
#ave.cons.mod:TREE: ((canFam2:0.052533,hg17:0.055873):0.039758,(mm6:0.024967,rn3:0.026765):0.039758);
#ave.noncons.mod:TREE: ((canFam2:0.188439,hg17:0.200074):0.142743,(mm6:0.089242,rn3:0.095829):0.142743);
# REDO:
#ave.cons.mod:TREE: ((canFam2:0.052883,hg17:0.056022):0.039955,(mm6:0.024815,rn3:0.026742):0.039955);
#ave.noncons.mod:TREE: ((canFam2:0.188924,hg17:0.199837):0.142839,(mm6:0.088391,rn3:0.095457):0.142839);
cat cons_summary.txt
# look over the files cons_summary.txt and noncons_summary.txt.
# The means and medians should be roughly equal and the stdevs
# should be reasonably small compared to the means, particularly
# for rate matrix parameters (at bottom) and for branches to the
# leaves of the tree. The stdevs may be fairly high for branches
# near the root of the tree; that's okay. Some min values may be
# 0 for some parameters. That's okay, but watch out for very large
# values in the max column, which might skew the mean. If you see
# any signs of bad outliers, you may have to track down the
# responsible .mod files and throw them out. I've never had to do
# this; the estimates generally seem pretty well behaved.
# NOTE: Actually, a random sample of several hundred to a thousand
# alignment fragments (say, a number equal to the number of
# available cluster nodes) should be more than adequate for
# parameter estimation. If pressed for time, use this strategy.
# FIRST ITERATION ONLY:
# Check the total entropy figure to see if we're way off.
# This takes an hour for 4way (exponential in #species) and has never
# produced a different answer from the input after the first iteration,
# so do this for the first iteration only:
/cluster/bin/phast/consEntropy --NH 9.7834 0.25 13 ave.{cons,noncons}.mod
#ERROR: too many iterations, not converging; try without --NH
# Dang. That is a new one. Oh well, I'll proceed with 13.
# Now we are ready to set up the cluster job for computing the
# conservation scores and predicted elements. The we measure the
# conserved elements coverage, and if that's not satisfactory then we
# adjust parameters and repeat.
ssh pk
mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.phast
cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.phast
# doPhastCons.sh <window ss.gz>: score one window with the averaged
# cons/noncons models, writing conserved elements (--viterbi bed) to
# ELEMENTS/<chrom>/ and gzipped posterior probabilities to
# POSTPROBS/<chrom>/.
cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
--expected-lengths 13 --target-coverage 0.15 \
--quiet --seqname $chr --idpref $pref \
--viterbi /san/sanvol1/scratch/canFam2/phastCons/ELEMENTS/$chr/$pref.bed --score \
--require-informative 0 \
> $tmpfile
gzip -c $tmpfile > /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS/$chr/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
chmod a+x doPhastCons.sh
rm -fr /san/sanvol1/scratch/canFam2/phastCons/{POSTPROBS,ELEMENTS}
mkdir -p /san/sanvol1/scratch/canFam2/phastCons/{POSTPROBS,ELEMENTS}
foreach chr (`awk '{print $1;}' /cluster/data/canFam2/chrom.sizes`)
mkdir /san/sanvol1/scratch/canFam2/phastCons/{POSTPROBS,ELEMENTS}/$chr
end
rm -f jobList
foreach f (/san/sanvol1/scratch/canFam2/phastCons/WINDOWS/*.ss.gz)
echo doPhastCons.sh $f >> jobList
end
para make jobList
para time
#Completed: 2434 of 2434 jobs
#Average job time: 11s 0.18m 0.00h 0.00d
#Longest finished job: 39s 0.65m 0.01h 0.00d
#Submission to last job: 74s 1.23m 0.02h 0.00d
# back on kolossus:
# combine predictions and transform scores to be in 0-1000 interval
cd /cluster/data/canFam2/bed/multiz4way/phastCons
cp /dev/null all.bed
foreach d (/san/sanvol1/scratch/canFam2/phastCons/ELEMENTS/*)
echo $d:t
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
$d/*.bed \
| /cluster/bin/scripts/lodToBedScore >> all.bed
end
ssh hgwdev
# Now measure coverage of CDS by conserved elements.
# We want the "cover" figure to be close to 68.9%.
# However we don't have dog gene annotations -- we just have xenoRefGene
# which is of course biased towards conserved genes. So cover will be
# artificially high. Shoot for "all.bed 5%" and make sure cover is
# somewhat higher than 68.9%.
cd /cluster/data/canFam2/bed/multiz4way/phastCons
featureBits -enrichment canFam2 xenoRefGene:cds all.bed
# FIRST ITERATION: too high, reduce target-coverage:
#xenoRefGene:cds 1.268%, all.bed 6.812%, both 1.091%, cover 86.06%, enrich 12.63x
# SECOND ITERATION: still too high.
#xenoRefGene:cds 1.268%, all.bed 5.432%, both 1.048%, cover 82.67%, enrich 15.22x
# THIRD ITERATION: close enough.
#xenoRefGene:cds 1.268%, all.bed 4.960%, both 1.025%, cover 80.83%, enrich 16.29x
# REDO: close enough.
#xenoRefGene:cds 1.268%, all.bed 4.960%, both 1.025%, cover 80.85%, enrich 16.30x
# Having met the CDS coverage target, load up the results.
hgLoadBed canFam2 phastConsElements4way all.bed
# Create wiggle
ssh pk
mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.wib
cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.wib
rm -rf /san/sanvol1/scratch/canFam2/phastCons/wib
mkdir -p /san/sanvol1/scratch/canFam2/phastCons/wib
# Generate doWigEncode: a per-chromosome cluster job script. For chromosome
# $1, it concatenates the phastCons posterior-probability windows in genomic
# order (numeric sort on the window-start field between the dots of the
# .pp.gz file names) and encodes the stream into a wiggle index (.wig) plus
# binary data (.wib) pair named ${chr}_phastCons.wig / .wib.
cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /san/sanvol1/scratch/canFam2/phastCons/wib
zcat `ls -1 /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS/$chr/*.pp.gz \
| sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
chmod a+x doWigEncode
rm -f jobList
foreach chr (`ls -1 /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS \
| sed -e 's/\/$//'`)
echo doWigEncode $chr >> jobList
end
para make jobList
para time
#Completed: 41 of 41 jobs
#Average job time: 35s 0.59m 0.01h 0.00d
#Longest finished job: 67s 1.12m 0.02h 0.00d
#Submission to last job: 67s 1.12m 0.02h 0.00d
# back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want
# the raw scores) from san/sanvol1
cd /cluster/data/canFam2/bed/multiz4way/phastCons
rm -rf wib POSTPROBS
rsync -av /san/sanvol1/scratch/canFam2/phastCons/wib .
rsync -av /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS .
# load wiggle component of Conservation track
ssh hgwdev
mkdir /gbdb/canFam2/multiz4way/wib
cd /cluster/data/canFam2/bed/multiz4way/phastCons
chmod 775 . wib
chmod 664 wib/*.wib
ln -s `pwd`/wib/*.wib /gbdb/canFam2/multiz4way/wib/
hgLoadWiggle canFam2 phastCons4way \
-pathPrefix=/gbdb/canFam2/multiz4way/wib wib/*.wig
rm wiggle.tab
# and clean up san/sanvol1.
rm -r /san/sanvol1/scratch/canFam2/phastCons/{ELEMENTS,POSTPROBS,wib}
rm -r /san/sanvol1/scratch/canFam2/chrom
# Offer raw scores for download since researchers are likely to be interested:
# back on kolossus
cd /cluster/data/canFam2/bed/multiz4way/phastCons/POSTPROBS
mkdir ../postprobsDownload
foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`)
zcat `ls -1 $chr/$chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \
> ../postprobsDownload/$chr.pp.gz
end
cd ../postprobsDownload
md5sum *.gz > md5sum.txt
# Make a README.txt there too.
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/canFam2/phastCons4way
cd /usr/local/apache/htdocs/goldenPath/canFam2/phastCons4way
ln -s /cluster/data/canFam2/bed/multiz4way/phastCons/postprobsDownload/* .
# UNCERTIFIED ASSEMBLY REGIONS (DONE 11/9/05 angie)
ssh hgwdev
mkdir /cluster/data/canFam2/bed/uncertified
cd /cluster/data/canFam2/bed/uncertified
mv ../../broad/canFam2.0uncertifiedRegions.txt .
tail +2 canFam2.0uncertifiedRegions.txt \
| awk -F"\t" '{print "chr" $1 "\t" $2 "\t" $3 "\t" $7 "\t" $6;}' \
| sed -e 's/Missing read partners/MRP/; s/Haplotype inconsistency/HI/; \
s/Negative gap/NG/; s/Linking inconsistency/LI/; \
s/Illogical links/IL/; s/; /+/g;' \
> uncertified.bed
hgLoadBed -tab canFam2 uncertified uncertified.bed
# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 11/21/05 angie)
# REDONE parts: Sequence and simpleRepeat. 12/5/05 angie
ssh kolossus
cd /cluster/data/canFam2
#- Build the .tar.gz files -- no genbank for now.
# Generate jkStuff/zipAll.csh: rebuilds the bigZips/ download tarballs from
# the per-chromosome directories (? and ?? match chrom dirs like 1, 12, Un):
# AGP files, RepeatMasker .out files, soft-masked and hard-masked FASTA, and
# the TRF simpleRepeat per-chromosome beds.
cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf bigZips
mkdir bigZips
tar cvzf bigZips/chromAgp.tar.gz ?{,?}/chr*.agp
tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out
tar cvzf bigZips/chromFa.tar.gz ?{,?}/chr*.fa
tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}/chr*.fa.masked
cd bed/simpleRepeat
tar cvzf ../../bigZips/chromTrf.tar.gz trfMaskChrom/chr*.bed
cd ../..
'_EOF_'
# << this line makes emacs coloring happy
csh -efx ./jkStuff/zipAll.csh |& tee zipAll.log
#- Look at zipAll.log to make sure all file lists look reasonable.
cd bigZips
md5sum *.gz > md5sum.txt
# Make a README.txt
cd ..
mkdir chromGz
foreach f ( ?{,?}/chr*.fa )
echo $f:t:r
gzip -c $f > chromGz/$f:t.gz
end
cd chromGz
md5sum *.gz > md5sum.txt
# Make a README.txt
#- Link the .gz and .txt files to hgwdev:/usr/local/apache/...
ssh hgwdev
set gp = /usr/local/apache/htdocs/goldenPath/canFam2
mkdir -p $gp/bigZips
ln -s /cluster/data/canFam2/bigZips/{chrom*.tar.gz,*.txt} $gp/bigZips
mkdir -p $gp/chromosomes
ln -s /cluster/data/canFam2/chromGz/{chr*.gz,*.txt} $gp/chromosomes
# Take a look at bigZips/* and chromosomes/*
# Can't make refGene upstream sequence files - no refSeq for dog.
mkdir database
# Create README.txt files in database/ to explain the files.
# CHORI BAC END PAIRS (DONE 11/22/05 angie)
# Rachel downloaded and parsed BAC end sequences and pair info from
# CHORI and NCBI -- see makeCanFam1.doc "BAC END PAIRS". Use those
# files and align to canFam2.
# Do BLAT alignments of BAC ends to the genome on the pitakluster.
# copy over masked contigs to the san
ssh kkstore01
cd /cluster/data/canFam2
mkdir /san/sanvol1/scratch/canFam2/maskedContigs
foreach d (?{,?}/chr*_?{,?})
echo $d:t
cp -p $d/$d:t.fa /san/sanvol1/scratch/canFam2/maskedContigs/
end
# copy over 11.ooc file to the san
cp -p /cluster/bluearc/canFam2/11.ooc /san/sanvol1/scratch/canFam2/
# make output directory, run directory and bed/ directory
mkdir -p /san/sanvol1/scratch/canFam2/bacendsRun/psl
mkdir /cluster/data/canFam2/bed/bacends
ssh pk
cd /san/sanvol1/scratch/canFam2/bacendsRun
echo '#LOOP\n/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/canFam2/11.ooc {check out line+ /san/sanvol1/scratch/canFam2/bacendsRun/psl/$(root1).$(root2).psl}\n#ENDLOOP' > gsub
ls -1S /san/sanvol1/scratch/canFam2/maskedContigs/*.fa > contigs.lst
ls -1S /san/sanvol1/scratch/canFam1/bacends/bacends*.fa > bacends.lst
gensub2 contigs.lst bacends.lst gsub jobList
para make jobList
para time
#Completed: 49005 of 49005 jobs
#Average job time: 7s 0.12m 0.00h 0.00d
#Longest finished job: 105s 1.75m 0.03h 0.00d
#Submission to last job: 1884s 31.40m 0.52h 0.02d
# back on kkstore01, retrieve pitakluster results
cd /cluster/data/canFam2/bed/bacends
rsync -av /san/sanvol1/scratch/canFam2/bacendsRun/{*.lst,batch*,g*,j*,p*} .
# lift alignments
ssh kolossus
cd /cluster/data/canFam2/bed/bacends
pslSort dirs raw.psl tmp psl
# started 08:15, PID: 16561
pslCheck raw.psl >& check.log
wc -l check.log
#4379 check.log
grep '< previous block' check.log | wc -l
#2194
grep -v 'Error: invalid PSL' check.log | grep -v '< previous block' | wc -l
#0
wc -l raw.psl
#27352955 raw.psl -- 2194 / 27352955 = 0.000080 = 0.008% overlapping
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl bacEnds.psl /dev/null
# Processed 27147454 alignments
wc -l bacEnds.psl
#769149 bacEnds.psl
liftUp bacEnds.lifted.psl /cluster/data/canFam2/jkStuff/liftAll.lft \
warn bacEnds.psl
awk '{print $10}' bacEnds.lifted.psl | sort | uniq | wc -l
#317042
faSize /san/sanvol1/scratch/canFam1/bacends/bacends*.fa
#290482800 bases (10817785 N's 279665015 real 279665015 upper 0 lower) in 393408 sequences in 99 files
calc 317042 / 393408
#317042 / 393408 = 0.805886
# Make BAC end pairs track:
mkdir pairs
cd pairs
set ncbiDir = /cluster/data/ncbi/bacends/dog/bacends.1
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose ../bacEnds.lifted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds
wc -l *
# 40 bacEnds.long
# 460 bacEnds.mismatch
# 44366 bacEnds.orphan
# 133248 bacEnds.pairs
# 478 bacEnds.short
# 816 bacEnds.slop
# 179408 total
# Filter by score and sort by {chrom,chromStart}:
awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
cat bacEnds.{slop,short,long,mismatch,orphan} \
| awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
wc -l *.bed
# 133241 bacEndPairs.bed
# 46057 bacEndPairsBad.bed
# 179298 total
extractPslLoad -noBin ../bacEnds.lifted.psl bacEndPairs.bed \
bacEndPairsBad.bed \
| sorttbl tname tstart | headchg -del \
> bacEnds.load.psl
# load into database
ssh hgwdev
cd /cluster/data/canFam2/bed/bacends/pairs
hgLoadBed canFam2 bacEndPairs bacEndPairs.bed -notItemRgb \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
#Loaded 133241 elements of size 11
# note - this next track isn't pushed to RR, just used for assembly QA
hgLoadBed canFam2 bacEndPairsBad bacEndPairsBad.bed -notItemRgb \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
#Loaded 46057 elements of size 11
hgLoadPsl canFam2 -table=all_bacends bacEnds.load.psl
#load of all_bacends did not go as planned: 748544 record(s), 0 row(s) skipped, 928 warning(s) loading psl.tab
# Diagnose...
echo select \* from all_bacends | hgsql -N canFam2 > /tmp/1
diff psl.tab /tmp/1 | less
# Looks like some rows of psl.tab have negative numbers in the
# qBaseInsert column!
# load BAC end sequences into seq table so alignments may be viewed
# symlink to FASTA sequence file in ncbi directory
mkdir -p /gbdb/canFam2/bacends
ln -s /cluster/data/ncbi/bacends/dog/bacends.1/canFamBacends.fa \
/gbdb/canFam2/bacends/canFamBacends.fa
hgLoadSeq canFam2 /gbdb/canFam2/bacends/canFamBacends.fa
#393408 sequences
# featureBits canFam1 all_bacends
#211644790 bases of 2359845093 (8.969%) in intersection
featureBits canFam2 all_bacends
#218709219 bases of 2384996543 (9.170%) in intersection
# featureBits canFam1 bacEndPairs
#2334084046 bases of 2359845093 (98.908%) in intersection
featureBits canFam2 bacEndPairs
#2353239742 bases of 2384996543 (98.668%) in intersection
# featureBits canFam1 bacEndPairsBad
# 548130287 bases of 2359845093 (23.227%) in intersection
featureBits canFam2 bacEndPairsBad
#534195657 bases of 2384996543 (22.398%) in intersection
# add trackDb entry and html
# Clean up
ssh pk
rm -r /san/sanvol1/scratch/canFam2/bacendsRun
# ACCESSIONS FOR CONTIGS (DONE 2006-01-04 kate)
# Found WGS project entry in Genbank for canFam2:
# LOCUS AAEX02000000 35696 rc DNA linear MAM 13-DEC-2005
# DEFINITION Canis familiaris whole genome shotgun sequencing project.
# Project has a total of 35696 contigs, accessioned as
# AAEX02000001-AAEX02035696, also named cont2.0--cont2.35695
# Poked around after submitting "canis[ORGN] AND WGS[KYWD]" search
# to Entrez Nucleotide
cd /cluster/data/canFam2/broad
grep contig Dog2.0.agp | wc -l
# 35696
grep contig Dog2.0.agp | head -1
# chr01 3000001 3024025 2 W contig_30884 ...
hgsql canFam2 -Ns -e "select frag from chr1_gold limit 1"
# contig_30884
grep contig_0 Dog2.0.agp
# chrUn 57343783 57365839 7071 W contig_0
grep contig_35695 Dog2.0.agp
# chrUn 73893245 73902697 11515 W contig_35695
grep contig_35696 Dog2.0.agp
# nothing
#
# (checked that the contig names and counts match what is in Genbank entries)
# Looked at summary file of contig entries from Genbank by
# clicking the project entry (AAEX02000000), then clicking on
# the WGS contig accession list at the end of the page
# the AGP and our Gap tables frag names contig_* correspond to the
# cont2.* names and AAEX02* names (-1) in the Genbank entries,
# so we don't need to parse out the comments from the summary
# entries.
# create file mapping our contig names to Genbank accessions
cd /cluster/data/canFam2/bed
mkdir -p contigAcc
cd contigAcc
sed -n 's/.*contig_\([0-9][0-9]*\).*/\1/p' \
/cluster/data/canFam2/broad/Dog2.0.agp | \
sort -un | \
awk '{printf ("contig_%d\tAAEX020%05d.1\n", $1, $1+1)}' > contigAcc.tab
wc -l contigAcc.tab
# 35696 contigAcc.tab
# load into database
hgsql canFam2 < $HOME/kent/src/hg/lib/contigAcc.sql
echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
hgsql canFam2
hgsql canFam2 -Nse "SELECT COUNT(*) FROM contigAcc"
# 35696
############################################################################
# QA NOTE (2007-08-16 brooke):
# rhesus nets on canFam2 were requested by Merck (see the following
# thread: http://www.soe.ucsc.edu/pipermail/genome/2007-August/014414.html).
# We may want to consider adding rhesus nets/chains to next dog release.
############################################################################
# BLASTZ BOSTAU2 (DONE - 2006-01-17 - 2006-01-18 - Hiram)
ssh kk
mkdir /cluster/data/canFam2/bed/blastzBosTau2.2006-01-17
cd /cluster/data/canFam2/bed/blastzBosTau2.2006-01-17
cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
BLASTZ_M=50
# dog vs. cow
# TARGET: Dog
SEQ1_DIR=/iscratch/i/canFam2/nib
SEQ1_LEN=/iscratch/i/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow (bosTau2)
SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit
SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=300
BASE=/cluster/data/canFam2/bed/blastzBosTau2.2006-01-17
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
# running 2006-01-17 13:58
    # done 2006-01-18 03:40
# real 823m27.169s
ssh hgwdev
cd /cluster/data/canFam2/bed/blastzBosTau2.2006-01-17
time featureBits canFam2 chainBosTau2Link \
> fb.canFam2.chainBosTau2Link 2>&1 &
# real 32m3.460s
# 1376066425 bases of 2384996543 (57.697%) in intersection
# SWAP CHAINS FROM RN4, BUILD NETS ETC. (DONE 2/25/06 angie)
mkdir /cluster/data/canFam2/bed/blastz.rn4.swap
cd /cluster/data/canFam2/bed/blastz.rn4.swap
doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.canFam2/DEF \
>& do.log
echo "check /cluster/data/canFam2/bed/blastz.rn4.swap/do.log" \
| mail -s "check do.log" $USER
ln -s blastz.rn4.swap /cluster/data/canFam2/bed/blastz.rn4
############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram)
ssh pk
cd /cluster/data/mm8/bed/blastzCanFam2.2006-02-18
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits canFam2 chainMm8Link
# 816262344 bases of 2384996543 (34.225%) in intersection
############################################################################
## BLASTZ swap from panTro2 alignments (DONE 2006-05-04 markd)
# FINISHED (2006-07-21 kate)
ssh hgwdev
mkdir /cluster/data/canFam2/bed/blastz.panTro2.swap
ln -s blastz.panTro2.swap /cluster/data/canFam2/bed/blastz.panTro2
cd /cluster/data/canFam2/bed/blastz.panTro2.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
/cluster/data/panTro2/bed/blastz.canFam2/DEF >& swap.out&
# create the net files
ssh hgwdev
cd /cluster/data/canFam2/bed/blastz.panTro2.swap/axtChain
nice netClass -verbose=0 -noAr noClass.net canFam2 panTro2 canFam2.panTro2.net
nice gzip canFam2.panTro2.net
# Mark stopped here
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -continue=load \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
/cluster/data/panTro2/bed/blastz.canFam2/DEF >& swap.2.out&
##########################################################################
# NSCAN track - (2006-06-01 markd)
cd /cluster/data/canFam2/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
wget -nv http://genes.cs.wustl.edu/jeltje/canFam2/canFam2.nscan.gtf
gzip canFam2.nscan.gtf
wget -nv -r -np http://genes.cs.wustl.edu/jeltje/canFam2/chr_ptx/
cat genes.cs.wustl.edu/jeltje/canFam2/chr_ptx/*.fa >canFam2.nscan.pep.fa
gzip canFam2.nscan.pep.fa
rm -rf genes.cs.wustl.edu
chmod a-w *.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt canFam2 nscanGene canFam2.nscan.gtf.gz
# add .a suffix to match transcript id
WRONG, see below: hgPepPred -suffix=.a canFam2 generic nscanPep canFam2.nscan.pep.fa.gz
rm *.tab
# update trackDb; need a canFam2-specific page to describe informants
dog/canFam2/nscanGene.html
dog/canFam2/trackDb.ra
# 2006-07-24 markd: reloaded without -suffix
hgPepPred canFam2 generic nscanPep canFam2.nscan.pep.fa.gz
##########################################################################
# MAKE Human Proteins (hg18) track (DONE braney 2006-08-17)
ssh pk
destDir=/san/sanvol1/scratch/canFam2/blastDb
mkdir -p /cluster/data/canFam2/bed/tblastn.hg18KG
cd /cluster/data/canFam2/bed/tblastn.hg18KG
find $destDir -name "*.nsq" | sed "s/\.nsq//" > target.lst
mkdir kgfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(500000/`wc target.lst | awk "{print \\\$1}"`\)
# 36727/(500000/6149) = 451.668646
split -l 452 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
rm -rf /cluster/bluearc/canFam2/bed/tblastn.hg18KG/blastOut
mkdir -p /cluster/bluearc/canFam2/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/canFam2/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
# Generate blastGsub: parasol gensub2 template pairing each blast DB chunk
# (path1, from target.lst) with each split protein query file (path2, from
# kg.lst); each job must produce a non-empty per-query PSL in blastOut/.
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
# Generate blastSome: the per-job cluster script. Arguments:
#   $1 - blast database chunk;  $2 - protein query FASTA;  $3 - output PSL.
# It runs tblastn, stepping the e-value cutoff from loose (0.01) to strict
# (1E-11) until a blastall invocation succeeds (works around jobs that blow
# up at looser cutoffs), converts the blast report to PSL, then lifts the
# result through protein -> subChr -> chromosome coordinates into $3.
# Exits 0 on success; cleans up /tmp intermediates and exits 1 on failure.
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast2211x86_64/bin/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2
liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/canFam2/jkStuff/subChr.lft carry $f.3
liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/canFam2/jkStuff/liftAll.lft carry $f.4
mv $3.tmp $3
rm -f $f.1 $f.2
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
exit
chmod +x blastSome
gensub2 target.lst kg.lst blastGsub blastSpec
para create blastSpec
para push
# Completed: 504218 of 504218 jobs
# CPU time in finished jobs: 28268903s 471148.38m 7852.47h 327.19d 0.896 y
# IO & Wait Time: 5565118s 92751.97m 1545.87h 64.41d 0.176 y
# Average job time: 67s 1.12m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 710s 11.83m 0.20h 0.01d
# Submission to last job: 91362s 1522.70m 25.38h 1.06d
ssh kki
cd /cluster/data/canFam2/bed/tblastn.hg18KG
tcsh
# Generate chainGsub: gensub2 template pairing each blastOut/kg?? directory
# (path1, from chain.lst) with each chromosome (path2, from chrom.lst).
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1) $(path2)
#ENDLOOP
'_EOF_'
# Generate chainSome: for one blastOut/kg?? directory ($1) and one chromosome
# ($2), concatenate that chromosome's per-query PSLs and chain them with
# simpleChain into ../c.$2.<dirname>.psl.
# NOTE: the original doc had the command line wrapped mid-word
# ("bas / ename $1"), which would not run; rejoined to `basename $1` here.
cat << '_EOF_' > chainSome
(cd $1; cat q.$2_*.psl | /cluster/home/braney/bin/i386/simpleChain -chrom=$2 -prot -outPsl -maxGap=250000 stdin ../c.$2.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst ../../chrom.lst chainGsub chainSpec
para create chainSpec
para push
# Completed: 3335 of 3335 jobs
# CPU time in finished jobs: 1056945s 17615.76m 293.60h 12.23d 0.034 y
# IO & Wait Time: 75548s 1259.13m 20.99h 0.87d 0.002 y
# Average job time: 340s 5.66m 0.09h 0.00d
# Longest finished job: 25716s 428.60m 7.14h 0.30d
# Submission to last job: 61450s 1024.17m 17.07h 0.71d
ssh kkstore04
cd /cluster/data/canFam2/bed/tblastn.hg18KG/blastOut
for i in kg??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.*.$i.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -k 14,14 -k 16,16n -k 18,18n u.*.psl m60* | uniq > /cluster/data/canFam2/bed/tblastn.hg18KG/blastHg18KG.psl
cd ..
wc blastHg18KG.psl
# 62741 1317561 12491782 blastHg18KG.psl
# hg17 64343 1351203 10913814 blastHg18KG.psl
pslUniq blastHg18KG.psl stdout | wc
# 36272 761712 8427975
# hg17 18827 395367 2992653
cat kgfa/*fa | grep ">" | wc
# 303516 303516 4739313
# hg17 309494 309494 4810327
ssh hgwdev
cd /cluster/data/canFam2/bed/tblastn.hg18KG
hgLoadPsl canFam2 blastHg18KG.psl
featureBits canFam2 blastHg18KG
# 32565727 bases of 2384996543 (1.365%) in intersection
# hg17 35781547 bases of 2384996543 (1.500%) in intersection
exit
# back to kksilo
rm -rf blastOut
# End tblastn
#########################################################################
# BLASTZ/CHAIN/NET FELCAT3 (Done Nov 16 2006 heather)
# working in /cluster/data/felCat3 because /cluster/data/canFam2 is 94% full
mkdir /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16
ln -s /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16 /cluster/data/canFam2/bed/blastz.felCat3
cd /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/hg/canFam2/nib
SEQ1_LEN=/scratch/hg/canFam2/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=200000000
SEQ1_LAP=0
# QUERY: Cat felCat3
SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit
SEQ2_LEN=/san/sanvol1/scratch/felCat3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/felCat3/bed/blastz.canFam2.2006-11-16
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk
-chainMinScore=3000 -chainLinearGap=medium
-blastzOutRoot /cluster/bluearc/felCat3/blastz.canFam2 >& do.log &
tail -f do.log
nice featureBits -chrom=chr1 canFam2 chainFelCat3Link
# 65223887 bases of 121613690 (53.632%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET DOG AND HORSE (DONE 2/19/07 Fan)
ssh kkstore05
mkdir /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18
cd /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18
cat << '_EOF_' > DEF
# Horse vs. Dog
BLASTZ_M=50
# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500
SEQ1_CHUNK=30000000
SEQ1_LAP=10000
# QUERY: Dog canFam2
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_LEN=/cluster/data/canFam2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/equCab1/bed/blastz.canFam2.2007-02-18
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub pk \
-blastzOutRoot /cluster/bluearc/equCab1CanFam2 >& do.log &
tail -f do.log
# 6 jobs failed.
# Robert re-ran those 6 jobs. (2/22/07)
doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \
-continue cat -bigClusterHub pk \
-blastzOutRoot /cluster/bluearc/equCab1CanFam2 >& do2.log &
tail -f do2.log
ln -s blastz.canFam2.2007-02-18 /cluster/data/equCab1/bed/blastz.canFam2
ssh hgwdev
nice featureBits equCab1 -chrom=chrI chainCanFam2Link
# 128747357 bases of 177498097 (72.534%) in intersection
bash
time nice -n 19 featureBits equCab1 chainCanFam2Link \
> fb.equCab1.chainCanFam2Link.txt 2>&1
# 1717664968 bases of 2421923695 (70.922%) in intersection
ssh kkstore04
mkdir /cluster/data/canFam2/bed/blastz.equCab1.swap
cd /cluster/data/canFam2/bed/blastz.equCab1.swap
bash
time doBlastzChainNet.pl \
/cluster/data/equCab1/bed/blastz.canFam2.2007-02-18/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 &
# real 115m55.880s
cat *.txt
# 1673177547 bases of 2384996543 (70.154%) in intersection
#########################################################################
#########################################################################
# BLASTZ OPOSSUM monDom4 (DONE 2007-05-27 braney)
ssh kolossus
mkdir /cluster/data/canFam2/bed/blastz.monDom4
cd /cluster/data/canFam2/bed/blastz.monDom4
cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Dog (canFam2)
SEQ1_DIR=/san/sanvol1/scratch/canFam2/nib
SEQ1_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
#SEQ2_DIR=/iscratch/i/monDom4/monDom4RMExtra.2bit
SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
#SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes
SEQ2_LEN=/cluster/data/monDom4/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/canFam2/bed/blastz.monDom4
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
>& blastz.out 2>&1 &
#########################################################################
# Blastz Marmoset calJac1 (DONE - 2007-11-28 - 30 - Hiram)
ssh kkstore04
screen # use screen to control this job
# managing disk space on full filesystem
mkdir -p /cluster/store8/canFam2/bed/blastzCalJac1.2007-11-28
ln -s /cluster/store8/canFam2/bed/blastzCalJac1.2007-11-28 \
/cluster/data/canFam2/bed
cd /cluster/data/canFam2/bed/blastzCalJac1.2007-11-28
cat << '_EOF_' > DEF
# Dog vs. Marmoset
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/cluster/data/canFam2/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Marmoset calJac1, largest contig 2,551,648, 49,724 contigs
SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=2000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/cluster/data/canFam2/bed/blastzCalJac1.2007-11-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
-chainLinearGap=medium -bigClusterHub=kk -verbose=2 \
> do.log 2>&1 &
# real 864m49.290s
# Completed: 266999 of 267000 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 17058860s 284314.33m 4738.57h 197.44d 0.541 y
# IO & Wait Time: 1031855s 17197.59m 286.63h 11.94d 0.033 y
# Average job time: 68s 1.13m 0.02h 0.00d
# Longest finished job: 1302s 21.70m 0.36h 0.02d
# Submission to last job: 47998s 799.97m 13.33h 0.56d
# one job ran out of memory on the kk nodes. It was finished on kolossus
/cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
/scratch/data/canFam2/nib/chr21.nib:chr21:20000000-40010000 qParts/part238.lst \
../DEF \
../psl/chr21.nib:chr21:20000000-40010000/chr21.nib:chr21:20000000-40010000_part238.lst.psl
# continuing
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
-continue=cat -chainLinearGap=medium -bigClusterHub=kk -verbose=2 \
> cat.log 2>&1 &
# had a problem with files missing from /scratch/data/ on
# the Iservers - rsync that directory /iscratch/data/ full on
# kkr1u00 from /cluster/bluearc/scratch/data/ and push it to
# the other Iservers
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
-continue=chainMerge -chainLinearGap=medium -bigClusterHub=kk \
-verbose=2 > chainMerge.log 2>&1 &
# real 79m39.909s
cat fb.canFam2.chainCalJac1Link.txt
# 1369690756 bases of 2384996543 (57.429%) in intersection
mkdir /cluster/data/calJac1/bed/blastz.canFam2.swap
cd /cluster/data/calJac1/bed/blastz.canFam2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/canFam2/bed/blastzCalJac1.2007-11-28/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > swap.log 2>&1 &
# encountered difficulties with /scratch/data/ on kolossus
# had to finish the netChains.csh script manually, then continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/canFam2/bed/blastzCalJac1.2007-11-28/DEF \
-continue=load -chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > load.log 2>&1 &
# real 56m44.375s
cat fb.calJac1.chainCanFam2Link.txt
# 1451345669 bases of 2929139385 (49.549%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2008-03-11 - Hiram)
ssh kkstore04
screen # use a screen to manage this multi-day job
mkdir /cluster/data/canFam2/bed/blastzBosTau4.2008-03-11
cd /cluster/data/canFam2/bed/blastzBosTau4.2008-03-11
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/cluster/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau4
SEQ2_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit
SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/canFam2/bed/blastzBosTau4.2008-03-11
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
`pwd`/DEF -verbose=2 \
-bigClusterHub=memk -chainMinScore=3000 -chainLinearGap=medium \
-syntenicNet > do.log 2>&1
# the pk became free as this job had 3 days to go. So, chill out
# the batch on memk, let it finish its running jobs, then finish
# the batch manually on pk. Continuing:
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
`pwd`/DEF -verbose=2 -smallClusterHub=memk \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=cat -syntenicNet > cat.log 2>&1
# real 175m41.674s
cat fb.canFam2.chainBosTau4Link.txt
# 1367017426 bases of 2384996543 (57.317%) in intersection
mkdir /cluster/data/bosTau4/bed/blastz.canFam2.swap
cd /cluster/data/bosTau4/bed/blastz.canFam2.swap
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
/cluster/data/canFam2/bed/blastzBosTau4.2008-03-11/DEF \
-verbose=2 -smallClusterHub=memk \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet > swap.log 2>&1
# real 388m46.213s
cat fb.bosTau4.chainCanFam2Link.txt
# 1447088347 bases of 2731830700 (52.971%) in intersection
#############################################################################
# BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-18 - larrym)
ssh kkstore04
screen # use screen to control this multi-day job
mkdir /cluster/data/canFam2/bed/blastz.equCab2.2008-04-18
cd /cluster/data/canFam2/bed/blastz.equCab2.2008-04-18
cat << '_EOF_' > DEF
# Dog vs. Horse
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/cluster/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/canFam2/bed/blastz.equCab2.2008-04-18
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab2/blastz.canFam2 >& do.log &
# 0.117u 0.073s 9:20:00.22 0.0% 0+0k 0+0io 1pf+0w
# six parasol jobs failed; retried with:
para -retries=5 push
# disabled bad servers:
#   1 kkr10u31.kilokluster.ucsc.edu
#   1 kkr11u16.kilokluster.ucsc.edu
#   1 kkr12u29.kilokluster.ucsc.edu
pushd /cluster/bluearc/equCab2/blastz.canFam2/psl
find . -name '*.psl' | wc
#   64625   64625 6657995
# This is the correct number of psl files, so continue with the cat step:
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -continue=cat \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab2/blastz.canFam2 >>& do.log &
# 0.212u 0.173s 2:49:45.35 0.0% 0+0k 0+0io 62pf+0w
ln -s blastz.equCab2.2008-04-18 /cluster/data/canFam2/bed/blastz.equCab2
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
# lastz Poodle canFamPoodle1 (DONE - 2009-06-06,22 - Hiram)
mkdir /hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06
cd /hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06
cat << '_EOF_' > DEF
# Tasha boxer dog vs Shadow poodle
# parameters for very close alignment from human-chimp example
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Tasha, canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/scratch/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Shadow poodle, canFamPoodle1
SEQ2_DIR=/scratch/data/canFamPoodle1/canFamPoodle1.2bit
SEQ2_LEN=/scratch/data/canFamPoodle1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LAP=0
SEQ2_LIMIT=800
BASE=/hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# Full pipeline run: memk cluster for the small steps, pk for the big
# lastz batch.  NOTE(review): -noDbNameCheck presumably because target
# (canFam2) and query (canFamPoodle1) names share the canFam prefix --
# confirm against doBlastzChainNet.pl usage; -noLoadChainSplit loads
# the chain track as one table instead of per-chromosome tables.
time nice -n +19 doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noDbNameCheck -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium > do.log 2>&1
# real 8250m12.188s
# Completed: 374825 of 374825 jobs
# CPU time in finished jobs: 187350504s 3122508.40m 52041.81h 2168.41d 5.941 y
# IO & Wait Time: 4127960s 68799.33m 1146.66h 47.78d 0.131 y
# Average job time: 511s 8.51m 0.14h 0.01d
# Longest finished job: 2339s 38.98m 0.65h 0.03d
# Submission to last job: 494836s 8247.27m 137.45h 5.73d
# the lastz run thought it failed, but it didn't, continuing:
time nice -n +19 doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-continue=cat -noDbNameCheck -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium > cat.log 2>&1
# real 4841m48.581s <- this is actually a fb time on cavPor3
# this finish step was less than an hour
cat fb.canFam2.chainCanFamPoodle1Link.txt
# 1405528799 bases of 2384996543 (58.932%) in intersection
# Swap the alignments to build the canFam2 chain/net tracks on the
# poodle assembly (canFamPoodle1 becomes the target).
mkdir /hive/data/genomes/canFamPoodle1/bed/blastz.canFam2.swap
cd /hive/data/genomes/canFamPoodle1/bed/blastz.canFam2.swap
time nice -n +19 doBlastzChainNet.pl \
-verbose=2 \
/hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06/DEF \
-swap -noDbNameCheck -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1
# real 8585m58.789s
# featureBits coverage summary for the swapped chains:
cat fb.canFamPoodle1.chainCanFam2Link.txt
# 1377478896 bases of 1517497798 (90.773%) in intersection
############################################################################
# Re-Run equCab2 alignment (DONE - 2009-06-29 - Hiram)
# Re-run of the dog (canFam2) vs. horse (equCab2) alignment.  Same
# parameters as the 2008-04-18 run above, but the chrUn scaffold files
# (SEQ2_CTGDIR/SEQ2_CTGLEN) now live under /hive instead of /san, and
# the big-cluster hub is swarm instead of pk.
mkdir /hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29
cd /hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29
cat << '_EOF_' > DEF
# Dog vs. Horse
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/scratch/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 338m57.973s
cat fb.canFam2.chainEquCab2Link.txt
# 1676663178 bases of 2384996543 (70.300%) in intersection
# this is identical to what went down before ?
# Swap to build the canFam2 chain/net tracks on the horse assembly:
mkdir /hive/data/genomes/equCab2/bed/blastz.canFam2.swap
cd /hive/data/genomes/equCab2/bed/blastz.canFam2.swap
time doBlastzChainNet.pl \
/hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29/DEF \
-swap -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 286m51.658s
cat fb.equCab2.chainCanFam2Link.txt
# 1721407500 bases of 2428790173 (70.875%) in intersection
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
# BLASTZ FOR ZEBRAFISH (danRer5) (DONE, 2009-08-07, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
    mkdir /hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07
    cd /hive/data/genomes/canFam2/bed
    ln -s blastz.danRer5.2009-08-07 blastz.danRer5
    cd /hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07

    cat << '_EOF_' > DEF
# dog (canFam2) vs zebrafish (danRer5)
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50

# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/canFam2.2bit
SEQ1_LEN=/hive/data/genomes/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - zebrafish (danRer5)
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
SEQ2_LEN=/hive/data/genomes/danRer5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    chmod +x DEF
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
    -chainMinScore=5000 -chainLinearGap=loose \
    >& doBlastz.log &
# 0.806u 0.488s 3:58:52.14 0.0% 0+0k 0+0io 0pf+0w

cat fb.canFam2.chainDanRer5Link.txt
# 29234486 bases of 2384996543 (1.226%) in intersection

# DO BLASTZ SWAP TO CREATE CANFAM2 CHAINS, NETS, AXNET, MAFNET AND
# ALIGNMENT DOWNLOADS ON DANRER5 - documented in danRer5.txt