src/hg/makeDb/doc/dm2.txt 1.23
1.23 2009/02/23 23:40:23 angie
bdtnpChipper (old change): corrected regex, found another subtrack.
Index: src/hg/makeDb/doc/dm2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/dm2.txt,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 1000000 -r1.22 -r1.23
--- src/hg/makeDb/doc/dm2.txt 3 Feb 2009 00:06:40 -0000 1.22
+++ src/hg/makeDb/doc/dm2.txt 23 Feb 2009 23:40:23 -0000 1.23
@@ -1,5943 +1,5947 @@
# for emacs: -*- mode: sh; -*-
# Drosophila Melanogaster --
#
# euchromatin (2L, 2R, 3L, 3R, 4, X):
# Berkeley Drosophila Genome Project (fruitfly.org) release 4 (Apr. 2004)
# http://www.fruitfly.org/annot/release4.html
#
# heterochromatin (2h, 3h, 4h, Xh, Yh, U):
# Drosophila Heterochromatin Genome Project (dhgp.org) release 3.2
# (submitted to GenBank June 2004)
# http://www.dhgp.org/index_release_notes.html
#
# Gene annotations:
# FlyBase (http://flybase.bio.indiana.edu/) last updated ???
#
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+-------------------------+
# | tableName | type |
# +-------------+-------------------------+
# | flyBaseGene | genePred flyBasePep |
# | refGene | genePred refPep refMrna |
# | nscanGene | genePred |
# | geneid | genePred geneidPep |
# | genscan | genePred genscanPep |
# | augustus | genePred |
# +-------------+-------------------------+
# DOWNLOAD SEQUENCE (DONE 9/8/04 angie)
ssh kksilo
mkdir /cluster/store8/dm2
ln -s /cluster/store8/dm2 /cluster/data/dm2
cd /cluster/data/dm2
wget ftp://ftp.fruitfly.org/pub/download/compressed/na_euchromatin_genomic_dmel_RELEASE4.FASTA.gz
zcat na_euchromatin_genomic_dmel_RELEASE4.FASTA.gz \
| faSplit byname stdin dummyArg
# Follow FlyBase's lead on the chromosome names, but still use our
# "chr" prefix:
foreach c (2L 2R 3L 3R 4 X)
mkdir $c
sed -e 's/^>arm_/>chr/' arm_$c.fa > $c/chr$c.fa
echo arm_$c.fa size:
faSize arm_$c.fa
echo $c/chr$c.fa size:
faSize $c/chr$c.fa
echo comparison:
faCmp arm_$c.fa $c/chr$c.fa
echo ""
end
# heterochromatin sold separately... ftp://ftp.dhgp.org/pub/DHGP still
# has release 3.2 as its latest. But that's June 2004... wtf, grab it:
wget ftp://ftp.dhgp.org/pub/DHGP/Release3.2/FASTA/super-scaffolds/heterochromatin_super-scaffolds-genomic_dmel_RELEASE3.2.FASTA.tar.gz
tar xvzf heterochromatin_super-scaffolds-genomic_dmel_RELEASE3.2.FASTA.tar.gz
foreach c (2h 3h 4h Xh Yh U)
mkdir $c
perl -wpe 's/^>(\w+).*/>chr$1/' $c.FASTA > $c/chr$c.fa
echo $c.FASTA size:
faSize $c.FASTA
echo $c/chr$c.fa size:
faSize $c/chr$c.fa
echo comparison:
faCmp $c.FASTA $c/chr$c.fa
echo ""
end
# Carefully review output of those commands, then:
rm *.fa *.FASTA
# put away the big download files
mkdir downloads
mv *.gz downloads/
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 9/9/04 angie)
mkdir /cluster/data/dm2/M
cd /cluster/data/dm2/M
# go to http://www.ncbi.nih.gov/ and search Nucleotide for
# "drosophila melanogaster mitochondrion genome". That shows the gi number:
# 5835233
# Use that number in the entrez linking interface to get fasta:
wget -O chrM.fa \
'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5835233&dopt=FASTA'
# Edit chrM.fa: make sure the long fancy header line says it's the
# Drosophila melanogaster mitochondrion complete genome, and then replace the
# header line with just ">chrM".
# SPLIT CHROM FA INTO SMALLER CHUNKS BY GAPS (DONE 9/9/04 angie)
ssh kksilo
cd /cluster/data/dm2
foreach c (?{,?})
faSplit -minGapSize=100 -lift=$c/chr$c.lft \
gap $c/chr$c.fa 2000000 $c/chr${c}_
end
foreach ctgFa (?{,?}/chr*_*.fa)
set ctg = $ctgFa:r
mkdir $ctg
mv $ctgFa $ctg
end
# CREATING DATABASE (DONE 9/9/04 angie)
ssh hgwdev
# Make sure there is at least 5 gig free for the database
df -h /var/lib/mysql
#/dev/sdc1 1.8T 535G 1.1T 33% /var/lib/mysql
# Create the database.
hgsql '' -e 'create database dm2'
# EXTRACT GAP INFORMATION FROM FASTA, LOAD GAP TRACK (DONE 9/9/04 angie)
ssh kksilo
cd /cluster/data/dm2
# Do as Jim suggested back when we got dm1 sequence:
# I think that we can probably just show all gaps as bridged
# in the non-h chromosomes, and as unbridged in the h chromosomes
# and leave it at that.
# Extract gaps using scaffoldFaToAgp. It's really meant for a different
# purpose, so clean up its output: remove the .lft and .agp, and remove
# the last line of .gap (extra gap added at end). Also substitute in
# the correct chrom name in .gap.
foreach c (?{,?})
set chr = chr$c
pushd $c
mv $chr.lft $chr.lft.bak
scaffoldFaToAgp -minGapSize=100 $chr.fa
rm $chr.{lft,agp}
mv $chr.lft.bak $chr.lft
set chrSize = `faSize $chr.fa | awk '{print $1;}'`
set origLines = `cat $chr.gap | wc -l`
awk '($2 != '$chrSize'+1) {print;}' $chr.gap \
| sed -e "s/chrUn/$chr/" > $chr.gap2
set newLines = `cat $chr.gap2 | wc -l`
if ($newLines == ($origLines - 1)) then
mv $chr.gap2 $chr.gap
else
echo "Error: $chr/$chr.gap2 has wrong number of lines."
endif
popd
end
# Call the gaps unbridged in chrU and chr*h:
foreach c (U ?h)
set chr = chr$c
sed -e 's/yes/no/' $c/$chr.gap > $c/$chr.gap2
mv $c/$chr.gap2 $c/$chr.gap
end
ssh hgwdev
hgLoadGap dm2 /cluster/data/dm2
# MAKE JKSTUFF AND BED DIRECTORIES (DONE 9/9/04 angie)
# This used to hold scripts -- better to keep them inline in the .doc
# so they're in CVS. Now it should just hold lift file(s) and
# temporary scripts made by copy-paste from this file.
mkdir /cluster/data/dm2/jkStuff
# This is where most tracks will be built:
mkdir /cluster/data/dm2/bed
# MAKE LIFTALL.LFT (DONE 9/9/04 angie)
ssh kksilo
cd /cluster/data/dm2
cat ?{,?}/chr*.lft > jkStuff/liftAll.lft
# RUN REPEAT MASKER (DONE 9/9/04 angie)
# Note: drosophila library ("drosophila.lib") is dated May 27 '03.
# Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make
# RepeatMasker runs manageable on the cluster ==> results need lifting.
# Split contigs into 500kb chunks:
ssh kksilo
cd /cluster/data/dm2
foreach d ( */chr*_?{,?} )
cd $d
set contig = $d:t
faSplit -minGapSize=100 -lift=$contig.lft -maxN=500000 \
gap $contig.fa 500000 ${contig}_
cd ../..
end
#- Make the run directory and job list:
cd /cluster/data/dm2
cat << '_EOF_' > jkStuff/RMDrosophila
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/dm2/$2
/bin/cp $2 /tmp/dm2/$2/
cd /tmp/dm2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2
popd
/bin/cp /tmp/dm2/$2/$2.out ./
if (-e /tmp/dm2/$2/$2.tbl) /bin/cp /tmp/dm2/$2/$2.tbl ./
if (-e /tmp/dm2/$2/$2.cat) /bin/cp /tmp/dm2/$2/$2.cat ./
/bin/rm -fr /tmp/dm2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/dm2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/dm2
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMDrosophila
mkdir RMRun
cp /dev/null RMRun/RMJobs
foreach d ( ?{,?}/chr*_?{,?} )
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/dm2/jkStuff/RMDrosophila \
/cluster/data/dm2/$d $f /cluster/data/dm2/$d \
'{'check out line+ /cluster/data/dm2/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
# do the run
ssh kk9
cd /cluster/data/dm2/RMRun
para create RMJobs
para try, check, push, check,...
#Completed: 288 of 288 jobs
#Average job time: 3169s 52.82m 0.88h 0.04d
#Longest job: 4752s 79.20m 1.32h 0.06d
#Submission to last job: 13769s 229.48m 3.82h 0.16d
# Lift up the split-contig .out's to contig-level .out's
ssh kksilo
cd /cluster/data/dm2
foreach d ( ?{,?}/chr*_?{,?} )
cd $d
set contig = $d:t
liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
cd ../..
end
# Lift up the contig-level .out's to chr-level
foreach c (?{,?})
cd $c
if (-e chr$c.lft && ! -z chr$c.lft) then
echo lifting $c
/cluster/bin/i386/liftUp chr$c.fa.out chr$c.lft warn \
`awk '{print $2"/"$2".fa.out";}' chr$c.lft` > /dev/null
else
echo Can\'t find $c/chr$c.lft \!
endif
cd ..
end
# Load the .out files into the database with:
ssh hgwdev
hgLoadOut dm2 /cluster/data/dm2/?{,?}/*.fa.out
# SIMPLE REPEATS (TRF) (DONE 9/9/04 angie)
ssh kksilo
mkdir /cluster/data/dm2/bed/simpleRepeat
cd /cluster/data/dm2/bed/simpleRepeat
mkdir trf
cp /dev/null jobs.csh
foreach f (/cluster/data/dm2/?{,?}/chr*_*/chr?{,?}_?{,?}.fa)
set fout = $f:t:r.bed
echo $fout
echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
>> jobs.csh
end
tcsh jobs.csh >&! jobs.log &
# check on this with
tail -f jobs.log
wc -l jobs.csh
ls -1 trf | wc -l
# When job is done do:
liftUp simpleRepeat.bed /cluster/data/dm2/jkStuff/liftAll.lft warn \
trf/*.bed
# Load this into the database as so
ssh hgwdev
hgLoadBed dm2 simpleRepeat \
/cluster/data/dm2/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 9/9/04 angie)
# make a filtered version # of the trf output:
# keep trf's with period <= 12:
ssh kksilo
cd /cluster/data/dm2/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/*.bed)
echo "filtering $f"
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/dm2
mkdir bed/simpleRepeat/trfMaskChrom
foreach c (?{,?})
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed $c/chr$c.lft warn \
`awk '{print "bed/simpleRepeat/trfMask/"$2".bed";}' $c/chr$c.lft`
end
# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 9/9/04 angie)
ssh kksilo
cd /cluster/data/dm2
foreach c (?{,?})
echo repeat- and trf-masking chr$c.fa
/cluster/home/kent/bin/i386/maskOutFa -soft \
$c/chr$c.fa $c/chr$c.fa.out $c/chr$c.fa
/cluster/home/kent/bin/i386/maskOutFa -softAdd \
$c/chr$c.fa bed/simpleRepeat/trfMaskChrom/chr$c.bed $c/chr$c.fa
end
foreach c (?{,?})
echo repeat- and trf-masking contigs of chr$c
foreach ctgFa ($c/chr*/chr${c}_?{,?}.fa)
set trfMask=bed/simpleRepeat/trfMask/$ctgFa:t:r.bed
/cluster/home/kent/bin/i386/maskOutFa -soft $ctgFa $ctgFa.out $ctgFa
/cluster/home/kent/bin/i386/maskOutFa -softAdd $ctgFa $trfMask $ctgFa
end
end
# STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 9/9/04 angie)
# Translate to nib
ssh kksilo
cd /cluster/data/dm2
mkdir nib
foreach c (?{,?})
faToNib -softMask $c/chr$c.fa nib/chr$c.nib
end
# Make symbolic links from /gbdb/dm2/nib to the real nibs.
ssh hgwdev
mkdir -p /gbdb/dm2/nib
foreach f (/cluster/data/dm2/nib/chr*.nib)
ln -s $f /gbdb/dm2/nib
end
# Load /gbdb/dm2/nib paths into database and save size info.
hgsql dm2 < ~/src/hg/lib/chromInfo.sql
hgNibSeq -preMadeNib dm2 /gbdb/dm2/nib /cluster/data/dm2/?{,?}/chr?{,?}.fa
echo "select chrom,size from chromInfo" | hgsql -N dm2 \
> /cluster/data/dm2/chrom.sizes
# CREATING GRP TABLE FOR TRACK GROUPING (DONE 9/9/04 angie)
# Copy all the data from the table "grp"
# in the existing database dm1 to the new database
ssh hgwdev
hgsql dm2 -e "create table grp (PRIMARY KEY(NAME)) select * from dm1.grp"
# MAKE GCPERCENT (DONE 2/2/04 angie)
ssh hgwdev
mkdir /cluster/data/dm2/bed/gcPercent
cd /cluster/data/dm2/bed/gcPercent
# create and load gcPercent table
hgsql dm2 < ~/src/hg/lib/gcPercent.sql
hgGcPercent dm2 ../../nib
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR DROSOPHILA (DONE 9/9/04 angie)
# Warning: must genome and organism fields must correspond
# with defaultDb values
echo 'INSERT INTO dbDb \
(name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) values \
("dm2", "Apr. 2004", "/gbdb/dm2/nib", "D. melanogaster", \
"chr2L:825964-851061", 1, 55, "D. melanogaster", \
"Drosophila melanogaster", "/gbdb/dm2/html/description.html", \
0, 0, "BDGP v. 4 / DHGP v. 3.2");' \
| hgsql -h genome-testdb hgcentraltest
echo 'update defaultDb set name = "dm2" where genome = "D. melanogaster"' \
| hgsql -h genome-testdb hgcentraltest
# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd ~/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add dm2 to DBS and do
make update
# go public on genome-test
cvs commit makefile
make alpha
# Add trackDb directories and description.html
mkdir drosophila/dm2
cvs add drosophila/dm2
# Write ~/kent/src/hg/makeDb/trackDb/drosophila/dm2/description.html
# with a description of the assembly and some sample position queries.
chmod a+r drosophila/dm2/description.html
# Check it in and copy (via "make alpha" in trackDb/) to
# /cluster/data/dm2/html/.
cvs add drosophila/dm2/description.html
cvs commit drosophila/dm2
mkdir -p /gbdb/dm2/html
make alpha
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR DROSOPHILA (DONE 2/?/05 galt/angie)
ssh hgwdev
# Get appropriate hostname and port numbers from cluster admins:
echo 'insert into blatServers values("dm2", "blat14", 17794, 1, 0); \
insert into blatServers values("dm2", "blat14", 17795, 0, 1);' \
| hgsql -h genome-testdb hgcentraltest
# PUT NIBS ON ISCRATCH (DONE 9/9/04 angie)
ssh kkr1u00
mkdir /iscratch/i/dm2
cd /iscratch/i/dm2
cp -pR /cluster/data/dm2/nib .
iSync
# Added "contigs" (chunks) 9/16/04
mkdir maskedContigs
cp -p /cluster/data/dm2/*/chr*_*/chr?{,?}_?{,?}.fa maskedContigs
iSync
# AUTO UPDATE GENBANK MRNA RUN (DONE 2/1/05 angie)
# Update genbank config and source in CVS:
cd ~/kent/src/hg/makeDb/genbank
cvsup .
# See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
# edits, check them in if necessary:
diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf
# Edit etc/genbank.conf and add these lines:
# dm2 (D. melanogaster)
dm2.genome = /iscratch/i/dm2/nib/chr*.nib
dm2.lift = /cluster/data/dm2/jkStuff/liftAll.lft
dm2.genbank.mrna.xeno.load = yes
dm2.genbank.est.xeno.load = no
dm2.downloadDir = dm2
cvs ci etc/genbank.conf
# Install to /cluster/data/genbank:
make install-server
ssh eieio
cd /cluster/data/genbank
# This is an -initial run, refseq only:
nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial dm2 &
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -drop -initialLoad dm2
featureBits dm2 refGene
#28230571 bases of 131698467 (21.436%) in intersection
# Clean up:
rm -rf work/initial.dm2
ssh eieio
cd /cluster/data/genbank
# This is an -initial run, mRNA only:
nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial dm2 &
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -drop -initialLoad dm2
featureBits dm2 mrna
#23752517 bases of 131698467 (18.036%) in intersection
featureBits dm2 xenoMrna
#5943805 bases of 131698467 (4.513%) in intersection
# Clean up:
rm -rf work/initial.dm2
ssh eieio
# -initial for ESTs:
nice bin/gbAlignStep -srcDb=genbank -type=est -initial dm2 &
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep dm2
featureBits dm2 intronEst
#12644283 bases of 131698467 (9.601%) in intersection
featureBits dm2 est
#30993609 bases of 131698467 (23.534%) in intersection
# Clean up:
rm -rf work/initial.dm2
# PRODUCING GENSCAN PREDICTIONS (DONE 9/10/04 angie)
# Check out hg3rdParty/genscanlinux to get latest genscan:
ssh hgwdev
mkdir /cluster/data/dm2/bed/genscan
cd /cluster/data/dm2/bed/genscan
cvs co hg3rdParty/genscanlinux
ssh kksilo
cd /cluster/data/dm2/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Make hard-masked contigs
foreach f (/cluster/data/dm2/?{,?}/chr*/chr?{,?}_?{,?}.fa)
maskOutFa $f hard $f.masked
end
# Generate a list file, contigs.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
rm -f contigs.list
touch contigs.list
foreach f ( `ls -1S /cluster/data/dm2/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa.masked` )
egrep '[ACGT]' $f > /dev/null
if ($status == 0) echo $f >> contigs.list
end
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 contigs.list single gsub jobList
# Run on small cluster -- genscan needs big mem.
ssh kki
cd /cluster/data/dm2/bed/genscan
para create jobList
para try, check, push, check, ...
#Completed: 81 of 81 jobs
#Average job time: 48s 0.80m 0.01h 0.00d
#Longest job: 212s 3.53m 0.06h 0.00d
#Submission to last job: 589s 9.82m 0.16h 0.01d
# If there are crashes, diagnose with "para problems".
# If a job crashes due to genscan running out of memory, re-run it
# manually with "-window=1200000" instead of "-window=2400000".
# Convert these to chromosome level files as so:
ssh kksilo
cd /cluster/data/dm2/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/dm2/bed/genscan
ldHgGene -gtf dm2 genscan genscan.gtf
hgPepPred dm2 generic genscanPep genscan.pep
hgLoadBed dm2 genscanSubopt genscanSubopt.bed
featureBits dm2 genscan
#24711981 bases of 131698467 (18.764%) in intersection
# MYTOUCH FIX - Jen - 2006-01-24
sudo mytouch dm2 genscanPep 0412141800.00
# MAKE DOWNLOADABLE FILES (DONE 1/26/05 angie)
ssh kksilo
cd /cluster/data/dm2
mkdir zips
zip -j zips/chromOut.zip ?{,?}/chr?{,?}.fa.out
zip -j zips/chromFa.zip ?{,?}/chr?{,?}.fa
foreach f (?{,?}/chr?{,?}.fa)
maskOutFa $f hard $f.masked
end
zip -j zips/chromFaMasked.zip ?{,?}/chr?{,?}.fa.masked
cd bed/simpleRepeat
zip ../../zips/chromTrf.zip trfMaskChrom/chr*.bed
cd ../..
# Make a starter mrna.zip -- it will get updated regularly on the RR.
/cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank \
-db=dm2 -native genbank mrna zips/mrna.fa
gzip zips/mrna.fa
foreach f (zips/*.zip)
echo $f
unzip -t $f | tail -1
end
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/dm2
cd /usr/local/apache/htdocs/goldenPath/dm2
mkdir bigZips database
# Create README.txt files in bigZips/ and database/ to explain the files.
cp -p /cluster/data/dm2/zips/*.{zip,gz} bigZips
cd bigZips
md5sum *.{zip,gz} > md5sum.txt
# BLASTZ D.PSEUDOOBSCURA (DONE 9/10/04 angie)
# CHAIN PSEUDOOBSCURA BLASTZ (DONE 9/10/04 angie)
# NET PSEUDOOBSCURA BLASTZ (DONE 9/11/04 angie)
# GENERATE DP2 MAF FOR MULTIZ FROM NET (DONE 9/11/04 angie)
# MAKE VSDP2 DOWNLOADABLES (DONE 2/15/05 angie)
# -- dp2; Originally run with default blastz params; obsoleted by dp3 which
# was later run with better params, below.
# BLASTZ ANOPHELES (DONE 9/13/04 angie)
# CHAIN ANOPHELES BLASTZ (DONE 9/13/04 angie)
# NET ANOPHELES BLASTZ (DONE 9/13/04 angie)
# MAKE VSANOGAM1 DOWNLOADABLES (DONE 1/28/05 angie)
# GENERATE ANOGAM1 MAF FOR MULTIZ FROM NET (DONE 9/13/04 angie)
# -- Originally run with "human-fugu" blastz params; replaced by a run with
# better params, below.
# MULTIZ MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 9/22/04 angie)
# put the MAFs on bluearc
ssh kksilo
mkdir -p /cluster/bluearc/multiz.flymo/my
cp /cluster/data/dm2/bed/blastz.droYak1.2004-09-10/mafNet/*.maf \
/cluster/bluearc/multiz.flymo/my
mkdir -p /cluster/bluearc/multiz.flymo/mp
cp /cluster/data/dm2/bed/blastz.dp2.2004-09-10/mafNet/*.maf \
/cluster/bluearc/multiz.flymo/mp
mkdir -p /cluster/bluearc/multiz.flymo/ma
cp /cluster/data/dm2/bed/blastz.anoGam1.2004-09-13/mafNet/*.maf \
/cluster/bluearc/multiz.flymo/ma
ssh kki
mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1
cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1
mkdir mypa
# Use PSU's new var_multiz:
cat << '_EOF_' > doMultiz
#!/bin/csh -ef
set path = (/cluster/bin/penn/var_multiz.2004.08.12 $path)
set chr = $1:t:r
if (-s $1 && -s $2) then
set tmp = /scratch/$chr.tmp.maf
var_multiz $1 $2 0 0 > $tmp
maf_project $tmp dm2.$chr > /scratch/$chr.myp.maf
rm $tmp
else if (-s $1) then
cp $1 /scratch/$chr.myp.maf
else if (-s $2) then
cp $2 /scratch/$chr.myp.maf
endif
if (-s /scratch/$chr.myp.maf && -s $3) then
set tmp = /scratch/$chr.tmp.maf
var_multiz /scratch/$chr.myp.maf $3 1 0 > $tmp
maf_project $tmp dm2.$chr > $4
rm $tmp
else if (-s /scratch/$chr.myp.maf) then
cp /scratch/$chr.myp.maf $4
else if (-s $3) then
cp $3 $4
endif
rm /scratch/$chr.myp.maf
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doMultiz
cp /dev/null jobList
foreach chr (`awk '{print $1;}' /cluster/data/dm2/chrom.sizes`)
set f1 = /cluster/bluearc/multiz.flymo/my/$chr.maf
set f2 = /cluster/bluearc/multiz.flymo/mp/$chr.maf
set f3 = /cluster/bluearc/multiz.flymo/ma/$chr.maf
echo "doMultiz $f1 $f2 $f3 mypa/$chr.maf" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 13 of 13 jobs
#Average job time: 246s 4.09m 0.07h 0.00d
#Longest job: 811s 13.52m 0.23h 0.01d
#Submission to last job: 811s 13.52m 0.23h 0.01d
du -sh mypa
#417M mypa
# clean up bluearc
rm -r /cluster/bluearc/multiz.flymo
# setup external files for database reference
ssh hgwdev
mkdir /gbdb/dm2/mzDy1Dp2Ag1_phast
ln -s /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/mypa/chr*.maf \
/gbdb/dm2/mzDy1Dp2Ag1_phast/
# load into database
cd /tmp
hgLoadMaf -warn dm2 mzDy1Dp2Ag1_phast
cd /gbdb/dm2
mkdir d_pseudoobscura_mypa d_yakuba_mypa a_gambiae_mypa
cd /tmp
ln -s /cluster/data/dm2/bed/blastz.dp2.2004-09-10/mafNet/*.maf \
/gbdb/dm2/d_pseudoobscura_mypa
hgLoadMaf -WARN dm2 d_pseudoobscura_mypa
ln -s /cluster/data/dm2/bed/blastz.droYak1.2004-09-10/mafNet/*.maf \
/gbdb/dm2/d_yakuba_mypa
hgLoadMaf -WARN dm2 d_yakuba_mypa
ln -s /cluster/data/dm2/bed/blastz.anoGam1.2004-09-13/mafNet/*.maf \
/gbdb/dm2/a_gambiae_mypa
hgLoadMaf -WARN dm2 a_gambiae_mypa
# PHASTCONS MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 9/22/04 angie)
ssh kksilo
# copy chrom fa to bluearc, break up the genome-wide MAFs into pieces
mkdir -p /cluster/bluearc/dm2/chrom
cp -p /cluster/data/dm2/?{,?}/chr*.fa /cluster/bluearc/dm2/chrom/
ssh kki
mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.split
cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.split
set WINDOWS = /cluster/bluearc/dm2/phastCons/WINDOWS
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.sh
#!/bin/csh -ef
set PHAST=/cluster/bin/phast
set FA_SRC=/cluster/bluearc/dm2/chrom
set WINDOWS=/cluster/bluearc/dm2/phastCons/WINDOWS
set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O dm2,droYak1,dp2,anoGam1 \
-w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
chmod a+x doSplit.sh
rm -f jobList
foreach file (/cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/mypa/*.maf)
if (-s $file) then
echo "doSplit.sh {check in line+ $file}" >> jobList
endif
end
para create jobList
para try, check, push, check...
#Completed: 13 of 13 jobs
#Average job time: 23s 0.38m 0.01h 0.00d
#Longest job: 53s 0.88m 0.01h 0.00d
#Submission to last job: 53s 0.88m 0.01h 0.00d
cd ..
# use the model previously estimated (see makeDm1.doc) as a starting model
sed -e 's/dm1/dm2/g' /cluster/data/dm1/bed/phastCons4way/rev-dg.mod \
> starting-tree.mod
# -- Because of the very long branch length to anoGam1 being pretty
# much impossible to estimate from alignment data, edit that file to
# reduce the anoGam1 branch length from 2.66 to 0.5. Otherwise
# estimation process blows up. So our starting tree becomes
#TREE: (((dm2:0.058,droYak1:0.074):0.133,dp2:0.200):0,anoGam1:0.5);
# Get genome-wide average GC content (for all species together,
# not just the reference genome). If you have a globally
# estimated tree model, as above, you can get this from the
# BACKGROUND line in the .mod file. E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.276938 0.223190 0.223142 0.276730
# add up the C and G:
awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod
#0.446
# Great, use 0.446 as the --gc parameter in phastCons below:.
# Now set up cluster job to estimate model parameters.
# Parameters will be estimated separately for each alignment fragment
# then will be combined across fragments.
# Use --gc from above, and --target-coverage computed as follows:
# 1. Jim would like phastConsElements coverage to be 25% for fly.
# 2. 83% of dm2 is covered by chainDroYak1Link, so use .25 / .83 = .30
# as an initial --target-coverage.
# 3. If actual coverage is different from our target, come back to this
# step, adjust --target-coverage and rerun up through phastConsElements.
# -- Actually, Adam suggests starting with what proved to work for worm:
# --target-coverage 0.4 and --expected-lengths 25 (not 12 as for mammal)
# -- OK, that led to too-high coverage:
# 51511266 bases of 131698467 (39.113%) in intersection
# so next try 0.3, 30:
# 49886398 bases of 131698467 (37.879%) in intersection
# how about 0.3, 20?
# 42067218 bases of 131698467 (31.942%) in intersection
# 0.25, 20?
# 38397215 bases of 131698467 (29.155%) in intersection
# 0.25, 15?
# 35039841 bases of 131698467 (26.606%) in intersection
# -- that's close enough to the target. If the "bumpiness" of the
# wiggle looks aesthetically pleasing then we're done.
# -- OK, Adam would like to see more smoothing so that coding exons
# stand out better, so try 0.20, 25:
# 37574714 bases of 131698467 (28.531%) in intersection
# More smooting would be nice for coding exons, so tried 0.20, 50,
# but Adam didn't like that as much overall so stick with 0.20, 25.
mkdir run.estimate
cd run.estimate
cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.435 --nrates 1,1 \
--no-post-probs --ignore-missing --expected-lengths 25 \
--target-coverage 0.20 --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
chmod a+x doEstimate.sh
rm -fr LOG TREES
mkdir -p LOG TREES
rm -f jobList
foreach f (/cluster/bluearc/dm2/phastCons/WINDOWS/*.ss.gz)
set root = $f:t:r:r
echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
end
# run cluster job
ssh kk9
cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.estimate
para create jobList
para try, check, push, check, ...
#Completed: 139 of 139 jobs
#Average job time: 133s 2.22m 0.04h 0.00d
#Longest job: 221s 3.68m 0.06h 0.00d
#Submission to last job: 328s 5.47m 0.09h 0.00d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ssh kksilo
cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.estimate
ls -1 TREES/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
--output-average ave.cons.mod > cons_summary.txt
ls -1 TREES/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
--output-average ave.noncons.mod > noncons_summary.txt
grep TREE ave*.mod
#ave.cons.mod:TREE: (((dm2:0.028707,droYak1:0.019740):0.039091,dp2:0.065825):0.086142,anoGam1:0.086142);
#ave.noncons.mod:TREE: (((dm2:0.118202,droYak1:0.079147):0.162923,dp2:0.275544):0.360361,anoGam1:0.360361);
# look over the files cons_summary.txt and noncons_summary.txt.
# The means and medians should be roughly equal and the stdevs
# should be reasonably small compared to the means, particularly
# for rate matrix parameters (at bottom) and for branches to the
# leaves of the tree. The stdevs may be fairly high for branches
# near the root of the tree; that's okay. Some min values may be
# 0 for some parameters. That's okay, but watch out for very large
# values in the max column, which might skew the mean. If you see
# any signs of bad outliers, you may have to track down the
# responsible .mod files and throw them out. I've never had to do
# this; the estimates generally seem pretty well behaved.
# NOTE: Actually, a random sample of several hundred to a thousand
# alignment fragments (say, a number equal to the number of
# available cluster nodes) should be more than adequate for
# parameter estimation. If pressed for time, use this strategy.
# Now we are ready to set up the cluster job for computing the
# conservation scores and predicted elements. It's all downhill
# from here.
ssh kk9
mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.phast
cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.phast
cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
--expected-lengths 25 --target-coverage 0.20 --quiet --seqname $chr \
--idpref $pref \
--viterbi /cluster/bluearc/dm2/phastCons/ELEMENTS/$pref.bed --score \
--require-informative 0 \
> $tmpfile
gzip -c $tmpfile > /cluster/bluearc/dm2/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
chmod a+x doPhastCons.sh
rm -fr /cluster/bluearc/dm2/phastCons/{POSTPROBS,ELEMENTS}
mkdir -p /cluster/bluearc/dm2/phastCons/{POSTPROBS,ELEMENTS}
rm -f jobList
foreach f (/cluster/bluearc/dm2/phastCons/WINDOWS/*.ss.gz)
echo doPhastCons.sh $f >> jobList
end
para create jobList
para try, check, push, check, ...
#Completed: 139 of 139 jobs
#Average job time: 23s 0.38m 0.01h 0.00d
#Longest job: 49s 0.82m 0.01h 0.00d
#Submission to last job: 52s 0.87m 0.01h 0.00d
# back on kksilo:
# combine predictions and transform scores to be in 0-1000 interval
# do in a way that avoids limits on numbers of args
cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
/cluster/bluearc/dm2/phastCons/ELEMENTS/*.bed \
| /cluster/bin/scripts/lodToBedScore > all.bed
ssh hgwdev
cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
featureBits dm2 all.bed
#37574714 bases of 131698467 (28.531%) in intersection
# OK, close enough.
hgLoadBed dm2 phastConsElements all.bed
# Create wiggle on the small cluster
ssh kki
mkdir /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.wib
cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons/run.wib
rm -rf /cluster/bluearc/dm2/phastCons/wib
mkdir -p /cluster/bluearc/dm2/phastCons/wib
cat << 'EOF' > doWigAsciiToBinary
#!/bin/csh -ef
set chr = $1
zcat `ls -1 /cluster/bluearc/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \
| sort -t\. -k2,2n` \
| wigAsciiToBinary -chrom=$chr \
-wibFile=/cluster/bluearc/dm2/phastCons/wib/${chr}_phastCons stdin
'EOF'
# << for emacs
chmod a+x doWigAsciiToBinary
rm -f jobList
foreach chr (`ls -1 /cluster/bluearc/dm2/phastCons/POSTPROBS \
| awk -F\. '{print $1}' | sort -u`)
echo doWigAsciiToBinary $chr >> jobList
end
para create jobList
para try, check, push, check, ...
#Completed: 13 of 13 jobs
#Average job time: 14s 0.24m 0.00h 0.00d
#Longest job: 36s 0.60m 0.01h 0.00d
#Submission to last job: 36s 0.60m 0.01h 0.00d
# back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want
# the raw scores) from bluearc
cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
rm -rf wib POSTPROBS
rsync -av /cluster/bluearc/dm2/phastCons/wib .
rsync -av /cluster/bluearc/dm2/phastCons/POSTPROBS .
# load wiggle component of Conservation track
ssh hgwdev
mkdir -p /gbdb/dm2/wib/mzDy1Dp2Ag1_phast
cd /cluster/data/dm2/bed/multiz.droYak1dp2anoGam1/phastCons
chmod 775 . wib
chmod 664 wib/*.wib
ln -s `pwd`/wib/*.wib /gbdb/dm2/wib/mzDy1Dp2Ag1_phast/
hgLoadWiggle dm2 mzDy1Dp2Ag1_phast_wig \
-pathPrefix=/gbdb/dm2/wib/mzDy1Dp2Ag1_phast wib/*.wig
#NOT DONE -- still using dm1 for this:
# make top-5000 list and launcher on Adam's home page:
sort -k5,5nr raw.bed | head -5000 > top5000.bed
/cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
/cse/grads/acs/public_html/dm-top5000-4way \
"top 5000 conserved elements (4way)" dm2
# and clean up bluearc.
rm -r /cluster/bluearc/dm2/phastCons
rm -r /cluster/bluearc/dm2/chrom
# LIFTOVER BDGP 3.2 ANNOTATIONS FROM DM1 (TEMPORARY) (DONE 9/15/04 angie)
ssh hgwdev
mkdir /cluster/data/dm2/bed/bdgp3.2.liftOver
cd /cluster/data/dm2/bed/bdgp3.2.liftOver
hgsql dm1 -N -e 'select * from bdgpGene' > dm1.bdgpGene.tab
hgsql dm1 -N -e 'select * from bdgpNonCoding' > dm1.bdgpNonCoding.tab
ssh kksilo
cd /cluster/data/dm2/bed/bdgp3.2.liftOver
# lift files to try out:
# blastz:
# /cluster/data/dm1/bed/blastz.dm2.2004-09-15/axtChain/dm1ToDm2.over.chain \
# blat on dm1 chroms vs. dm2 2Mb chunks:
# /cluster/data/dm1/bed/blat.dm2.2004-09-16/dm1ToDm2.over.chain \
# blat on dm1 chroms vs. dm2 3kb chunks:
# /cluster/data/dm1/bed/blat.dm2.2004-09-17/dm1ToDm2.over.chain \
liftOver -genePred dm1.bdgpGene.tab \
/cluster/data/dm1/bed/bedOver/dm1ToDm2.over.chain \
bdgpLiftGene.tab bdgpLiftGene.unmapped.tab
liftOver -genePred dm1.bdgpNonCoding.tab \
/cluster/data/dm1/bed/bedOver/dm1ToDm2.over.chain \
bdgpLiftNonCoding.tab bdgpLiftNonCoding.unmapped.tab
# Not perfect but not bad:
wc -l bdgp*.tab
# 18707 bdgpLiftGene.tab
# 78 bdgpLiftGene.unmapped.tab
# 2134 bdgpLiftNonCoding.tab
# 62 bdgpLiftNonCoding.unmapped.tab
ssh hgwdev
cd /cluster/data/dm2/bed/bdgp3.2.liftOver
ldHgGene -predTab dm2 bdgpLiftGene bdgpLiftGene.tab
ldHgGene -predTab dm2 bdgpLiftNonCoding bdgpLiftNonCoding.tab
# Copy over the gene info tables
hgsql dm2 -e 'create table bdgpLiftGeneInfo (PRIMARY KEY(bdgpName(7)), INDEX(flyBaseId(11))) select * from dm1.bdgpGeneInfo'
hgsql dm2 -e 'create table bdgpLiftNonCodingInfo (INDEX(bdgpName(16)), INDEX(flyBaseId(11))) select * from dm1.bdgpNonCodingInfo'
# FLYBASE ANNOTATIONS (DONE 1/12/05 angie)
# REPLACED 2/23/05 -- SEE "FLYBASE 4.1" BELOW
ssh kksilo
mkdir /cluster/data/dm2/bed/flybase
cd /cluster/data/dm2/bed/flybase
foreach c (2L 2R 3L 3R 4 X)
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.0_20041119/gff/dmel-$c-r4.0.gff.gz
end
zcat *.gff.gz > flybase.gff3
# What data sources are represented in this file?
grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c
# excerpt (many other sources, including blastx:... , sim4:... and
# tblastx:...; also various other types for source "."):
18747 . CDS
62629 . exon
13472 . gene
19301 . mRNA
70 . ncRNA
40 . pseudogene
96 . rRNA
28 . snRNA
28 . snoRNA
288 . tRNA
36921 . transcription_start_site
1571 . transposable_element
4680 . transposable_element_insertion_site
# What keywords are defined in the 9th field?
grep -v '^#' flybase.gff3 \
| awk '{print $9;}' | perl -wpe 's/=[^;]+;/\n/g; s/=.*$//;' \
| sort | uniq -c
# This incarnation of gff3 looks similar to the one encountered in
# dp3, but still uses slightly different keywords and there are different
# data sources of interest. So one-shot perl-script again:
extractGenes.pl flybase.gff3
# Get predicted proteins (for main annotations only)
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.0_20041119/fasta/dmel-all-translation-r4.0.fasta.gz
zcat dmel-all-translation-r4.0.fasta.gz \
| perl -wpe 's/^(>\w+)-P(\w)/$1-R$2/' > flybasePep.fa
ssh hgwdev
cd /cluster/data/dm2/bed/flybase
# Protein-coding genes:
ldHgGene -gtf dm2 flyBaseGene flybase.gtf
hgPepPred dm2 generic flyBasePep flybasePep.fa
# Noncoding genes:
hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed
# Cross-referencing info for both coding and noncoding:
hgsql dm2 < $HOME/kent/src/hg/lib/flyBase2004Xref.sql
hgsql dm2 -e 'load data local infile "flyBase2004Xref.tab" \
into table flyBase2004Xref'
# add upstream* downloadable files (added 2/10/05)
cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips
foreach size (1000 2000 5000)
echo upstream$size
featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \
| gzip -c > upstream$size.fa.gz
end
md5sum upstream*.fa.gz >> md5sum.txt
# BLASTZ D. PSEUDOOBSCURA (DP3) (DONE 1/18/05 angie)
# CHAIN PSEUDOOBSCURA BLASTZ (DONE 1/18/05 angie)
# NET PSEUDOOBSCURA BLASTZ (DONE 1/18/05 angie)
# GENERATE DP3 AXTNET AND MAF FOR MULTIZ (DONE 1/18/05 angie)
# MAKE VSDP3 DOWNLOADABLES (DONE 1/18/05 angie)
# -- Originally run with default blastz params; replaced by a run with
# better params, below.
# BLASTZ.V7 EVAL: BLASTZ D. PSEUDOOBSCURA (DP3) (DONE 1/18/05 angie)
ssh kk9
mkdir /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
ln -s blastzv7.dp3.2005-01-18 /cluster/data/dm2/bed/blastzv7.dp3
# Note the full path for BLASTZ so we get blastz.v7 (which is not in
# PSU's CVS, argh!):
cat << '_EOF_' > DEF
# D.melanogaster vs. D.pseudoobscura
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=/cluster/bin/penn/blastz.2004-12-27/blastz-source/blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
# unused: SEQ1_RMSK=
SEQ1_SMSK=
SEQ1_FLAG=-drosophila
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - D. pseudoobscura
SEQ2_DIR=/iscratch/i/dp3/nib
# unused: SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=-drosophila
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# Create run dir, job list, and raw/ dir structure:
mkdir run
~/kent/src/utils/blastz-make-joblist DEF \
/cluster/data/dm2/chrom.sizes /cluster/data/dp3/chrom.sizes \
> run/j
csh -ef ./xdir.sh
cd run
# cluster run
para create j
para try, check, push, check, ....
#Completed: 552 of 552 jobs
#Average job time: 100s 1.66m 0.03h 0.00d
#Longest job: 480s 8.00m 0.13h 0.01d
#Submission to last job: 733s 12.22m 0.20h 0.01d
# back on kksilo...
mkdir /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/run.1
cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/run.1
/cluster/bin/scripts/blastz-make-out2lav ../DEF .. > j
# small cluster run
ssh kki
cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/run.1
para create j
para try, check, push, check, ....
#Completed: 23 of 23 jobs
#Average job time: 8s 0.14m 0.00h 0.00d
#Longest job: 29s 0.48m 0.01h 0.00d
#Submission to last job: 60s 1.00m 0.02h 0.00d
# Translate .lav to axt:
ssh kksilo
cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
rm -r raw
mkdir axtChrom
foreach c (lav/*)
pushd $c
set chr=$c:t
set out=axtChrom/$chr.axt
echo "Translating $chr lav to $out"
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /cluster/data/dm2/nib /cluster/data/dp3/nib stdout \
| axtSort stdin ../../$out
popd
end
# BLASTZ.V7 EVAL: CHAIN PSEUDOOBSCURA BLASTZ (DONE 1/18/05 angie)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18
mkdir -p axtChain/run1
cd axtChain/run1
mkdir chain
ls -1S /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh -ef
axtChain -verbose=0 $1 \
/iscratch/i/dm2/nib \
/iscratch/i/dp3/nib stdout \
| chainAntiRepeat /iscratch/i/dm2/nib /iscratch/i/dp3/nib \
stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 13 of 13 jobs
#Average job time: 10s 0.17m 0.00h 0.00d
#Longest job: 22s 0.37m 0.01h 0.00d
#Submission to last job: 22s 0.37m 0.01h 0.00d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=10000 /tmp/score.$f:t:r
echo ""
end
# Load chains into database
ssh hgwdev
cd /cluster/data/dm2/bed/blastzv7.dp3.2005-01-18/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain dm2 ${c}_chainBz7Dp3 $i
end
# Compare to regular (blastz.v6) chains:
featureBits dm2 chainDp3Link
#76557614 bases of 131698467 (58.131%) in intersection
featureBits dm2 chainBz7Dp3Link
#76340831 bases of 131698467 (57.966%) in intersection
# Looks like v6 found more than v7 in general, but not too different.
# Look at some cases where blastz.v7 found something but v6 didn't:
featureBits dm2 chainBz7Dp3Link \!chainDp3Link -minSize=20 -bed=stdout
#chrYh 212776 212811 chrYh.1
...
#chr3R 18027130 18027160 chr3R.59
...
#chr2h 300968 301032 chr2h.1
...
#34843 bases of 131698467 (0.026%) in intersection
# BLATZ EVAL: D. PSEUDOOBSCURA (DP3) (IN PROGRESS 1/24/05 angie)
ssh kk9
mkdir /cluster/data/dm2/bed/blatz.dp3.2004-01-19
cd /cluster/data/dm2/bed/blatz.dp3.2004-01-19
mkdir chainRaw
partitionSequence.pl 10000000 10000 /iscratch/i/dm2/nib \
/cluster/data/dm2/chrom.sizes > dm2.lst
partitionSequence.pl 10000000 10000 /iscratch/i/dp3/nib \
/cluster/data/dp3/chrom.sizes > dp3.lst
cat << '_EOF_' > gsub
#LOOP
blatz $(path1) $(path2) {check out line chainRaw/$(file1)_$(file2).chain}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 dm2.lst dp3.lst gsub spec
para create spec
para try, check, push, check, ...
#Completed: 549 of 552 jobs
#Crashed: 3 jobs
#Average job time: 289s 4.82m 0.08h 0.00d
#Longest job: 1664s 27.73m 0.46h 0.02d
#Submission to last job: 6206s 103.43m 1.72h 0.07d
# With default blatz params, 3 jobs crashed due to out-of-mem,
# all others succeeded. Try again with thresholds more like
# blastz human-mouse, i.e. -minGapless=3000? No, Jim says forget
# the crashers for now (narrow down later) and just forge ahead
# with the comparison to blastz (v6).
ssh kksilo
cd /cluster/data/dm2/bed/blatz.dp3.2004-01-19
sed -e 's@/iscratch/i/d../nib/@@g; s/.nib//g;' chainRaw/* \
| chainMergeSort stdin > all.chain
chainSplit chain all.chain
# take a look at score distr's -- LOTS of low-scoring chains
foreach f (chain/*.chain)
grep ^chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=10000 /tmp/score.$f:t:r
echo ""
end
# Jim recommends -minScore=5000 for cross-species. Filter for now:
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
chainSplit chain all.chain
gzip all.chain.unfiltered
# Load chains into database
ssh hgwdev
cd /cluster/data/dm2/bed/blatz.dp3.2004-01-19/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain dm2 ${c}_chainBz1Dp3 $i
end
# Compare to regular (blastz.v6) chains:
featureBits dm2 -enrichment flyBaseGene:CDS chainDp3Link
#flyBaseGene:CDS 16.551%, chainDp3Link 58.131%, both 15.303%, cover 92.46%, enrich 1.59x
featureBits dm2 -enrichment flyBaseGene:CDS chainBz1Dp3Link
#flyBaseGene:CDS 16.551%, chainBz1Dp3Link 79.858%, both 16.179%, cover 97.76%, enrich 1.22x
# Look at some examples where blatz covers some CDS that blastz doesn't:
featureBits -chrom=chr3R dm2 flyBaseGene:CDS chainBz1Dp3Link \
\!chainDp3Link -minSize=30 -bed=/tmp/bz1Cds.bed
# BLASTZ D.YAKUBA (DONE 1/26/05 angie)
# CHAIN YAKUBA BLASTZ (DONE 1/26/05 angie)
# NET YAKUBA BLASTZ (DONE 1/26/05 angie)
# GENERATE DROYAK1 MAF FOR MULTIZ FROM NET (DONE 1/26/05 angie)
# MAKE VSDROYAK1 DOWNLOADABLES (DONE 1/26/05 angie)
# -- Originally run with default blastz params; replaced by a run with
# better params, below.
# BLASTZ D.ANANASSAE (DONE 1/28/05 angie)
# CHAIN ANANASSAE BLASTZ (DONE 1/28/05 angie)
# NET ANANASSAE BLASTZ (DONE 1/28/05 angie)
# GENERATE DROANA1 AXTNET AND MAF FOR MULTIZ (DONE 1/28/05 angie)
# MAKE VSDROANA1 DOWNLOADABLES (DONE 1/28/05 angie)
# -- Originally run with default blastz params; replaced by a run with
# better params, below.
# BLASTZ D.VIRILIS (DONE 1/28/05 angie)
# CHAIN VIRILIS BLASTZ (DONE 1/28/05 angie)
# NET VIRILIS BLASTZ (DONE 1/28/05 angie)
# GENERATE DROVIR1 AXTNET AND MAF FOR MULTIZ (DONE 1/28/05 angie)
# MAKE VSDROVIR1 DOWNLOADABLES (DONE 1/28/05 angie)
# -- Originally run with default blastz params; replaced by a run with
# better params, below.
# BLASTZ D.MOJAVENSIS (DONE 1/28/05 angie)
# CHAIN MOJAVENSIS BLASTZ (DONE 1/28/05 angie)
# NET MOJAVENSIS BLASTZ (DONE 1/28/05 angie)
# GENERATE DROMOJ1 AXTNET AND MAF FOR MULTIZ (DONE 1/28/05 angie)
# MAKE VSDROMOJ1 DOWNLOADABLES (DONE 1/28/05 angie)
# -- Originally run with default blastz params; replaced by a run with
# better params, below.
# BLASTZ A.MELLIFERA (DONE 1/28/05 angie)
# CHAIN MELLIFERA BLASTZ (DONE 1/31/05 angie)
# NET MELLIFERA BLASTZ (DONE 1/31/05 angie)
# GENERATE APIMEL1 AXTNET AND MAF FOR MULTIZ (DONE 1/31/05 angie)
# MAKE VSAPIMEL1 DOWNLOADABLES (DONE 1/31/05 angie)
# -- Originally run on apiMel2 with "human-fugu" blastz params; outdated by
# apiMel2 which was later run with better params, below.
# BLASTZ A.MELLIFERA 2.0 (DONE 2/8/05 Andy)
# CHAIN MELLIFERA 2.0 BLASTZ (DONE 2/9/05 Andy)
# NET MELLIFERA 2.0 BLASTZ (DONE 2/9/05 Andy)
# GENERATE APIMEL2 AXTNET AND MAF FOR MULTIZ (DONE 2/9/05 Andy)
# MAKE VSAPIMEL2 DOWNLOADABLES (DONE 2/9/2005 Andy)
# -- Originally run with "human-fugu" blastz params; replaced by a run with
# better params, below.
# MULTIZ.V10 8WAY (6 FLIES, MOSQUITO, HONEYBEE) (REDONE 3/7/05 angie
# originally done 2/1/05
# REDONE 3/7/05 angie -- User found that yakuba was missing (due to
# pairwise maf files having different .suffixes, and the script tolerating
# missing files... yikes!!!). :(
# REPLACED with 9WAY 5/23/05
# MULTIZ.V10 9WAY (7 FLIES, MOSQUITO, HONEYBEE) (DONE 5/24/05 angie)
# Better, looser blastz params were used to redo alignments of all
# other insects to dm2. Much better coverage on the pairwise inputs
# now, so re-run multiz and phastCons to get a better Conservation
# track... also using dp3 and apiMel2 instead of dp2 and apiMel1,
# and adding droSim1.
# Tree (9-way):
# ((((((dm2 droYak1) droAna1) dp3) (droVir1 droMoj1)) anoGam1) apiMel2)
ssh kkstore01
mkdir /cluster/data/dm2/bed/multiz9way.2005-05-23
ln -s /cluster/data/dm2/bed/multiz9way.2005-05-23 \
/cluster/data/dm2/bed/multiz9way
cd /cluster/data/dm2/bed/multiz9way
# Setup: Copy pairwise MAF to /santest/scratch:
mkdir /santest/scratch/flyMultiz9way
foreach db (droSim1 droYak1 droAna1 dp3 droVir1 droMoj1 anoGam1 apiMel2)
cp -pR /cluster/data/dm2/bed/blastz.$db/mafNet \
/santest/scratch/flyMultiz9way/$db
end
ls -lLR /santest/scratch/flyMultiz9way
# Make output dir:
mkdir maf
# Create script to run multiz.v10 in the right order:
cat << '_EOF_' > doMultiz.csh
#!/bin/csh -fe
set chr = $1
set tmp = /scratch/flyMultiz9way.$chr
mkdir $tmp
set REF = dm2.$chr
set SIM = /santest/scratch/flyMultiz9way/droSim1/$chr.maf.gz
set YAK = /santest/scratch/flyMultiz9way/droYak1/$chr.maf.gz
set ANA = /santest/scratch/flyMultiz9way/droAna1/$chr.maf.gz
set PSE = /santest/scratch/flyMultiz9way/dp3/$chr.maf.gz
set VIR = /santest/scratch/flyMultiz9way/droVir1/$chr.maf.gz
set MOJ = /santest/scratch/flyMultiz9way/droMoj1/$chr.maf.gz
set ANO = /santest/scratch/flyMultiz9way/anoGam1/$chr.maf.gz
set API = /santest/scratch/flyMultiz9way/apiMel2/$chr.maf.gz
set DEST = /cluster/data/dm2/bed/multiz9way/maf/$chr.maf
set MZ10 = /cluster/bin/penn/multiz.v10
set PROJECT = /cluster/bin/penn/maf_project
if ( -s $SIM && -s $YAK ) then
echo "Aligning $SIM $YAK..."
zcat $SIM > $tmp/$chr.Sim.maf
zcat $YAK > $tmp/$chr.Yak.maf
$MZ10 $tmp/$chr.Sim.maf $tmp/$chr.Yak.maf 1 > $tmp/$chr.tmp.maf
echo "Projecting on $REF..."
$PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYak.maf
else if ( -s $SIM ) then
touch missing.$chr.Sim
zcat $SIM > $tmp/$chr.SimYak.maf
else if ( -s $YAK ) then
touch missing.$chr.Yak
zcat $YAK > $tmp/$chr.SimYak.maf
endif
if ( -s $tmp/$chr.SimYak.maf && -s $ANA ) then
echo "Aligning $tmp/$chr.SimYak.maf $ANA..."
zcat $ANA > $tmp/$chr.Ana.maf
$MZ10 $tmp/$chr.SimYak.maf $tmp/$chr.Ana.maf 1 > $tmp/$chr.tmp.maf
echo "Projecting on $REF..."
$PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAna.maf
else if ( -s $tmp/$chr.SimYak.maf ) then
touch missing.$chr.Ana
cp $tmp/$chr.SimYak.maf $tmp/$chr.SimYakAna.maf
else if ( -s $ANA ) then
touch missing.$chr.SimYak
zcat $ANA > $tmp/$chr.SimYakAna.maf
endif
if ( -s $PSE && -s $tmp/$chr.SimYakAna.maf ) then
echo "Adding $PSE..."
zcat $PSE > $tmp/$chr.Pse.maf
$MZ10 $tmp/$chr.SimYakAna.maf $tmp/$chr.Pse.maf 1 > $tmp/$chr.tmp.maf
echo "Projecting on $REF..."
$PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAnaPse.maf
else if ( -s $PSE ) then
touch missing.$chr.SimYakAna
zcat $PSE > $tmp/$chr.SimYakAnaPse.maf
else if ( -s $tmp/$chr.SimYakAna.maf ) then
touch missing.$chr.Pse
cp $tmp/$chr.SimYakAna.maf $tmp/$chr.SimYakAnaPse.maf
endif
# droVir1 and droMoj1 are a subtree -- run on just them, then fold in:
if ( -s $VIR && -s $MOJ ) then
echo "Aligning $VIR $MOJ..."
zcat $VIR > $tmp/$chr.Vir.maf
zcat $MOJ > $tmp/$chr.Moj.maf
$MZ10 $tmp/$chr.Vir.maf $tmp/$chr.Moj.maf 0 > $tmp/$chr.tmp.maf
echo "Projecting on $REF..."
$PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.VirMoj.maf
else if ( -s $VIR ) then
touch missing.$chr.Moj
zcat $VIR > $tmp/$chr.VirMoj.maf
else if ( -s $MOJ ) then
touch missing.$chr.Vir
zcat $MOJ > $tmp/$chr.VirMoj.maf
endif
if ( -s $tmp/$chr.VirMoj.maf && -s $tmp/$chr.SimYakAnaPse.maf ) then
echo "Adding $tmp/$chr.VirMoj.maf..."
$MZ10 $tmp/$chr.SimYakAnaPse.maf $tmp/$chr.VirMoj.maf 1 > $tmp/$chr.tmp.maf
echo "Projecting on $REF..."
$PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAnaPseVirMoj.maf
else if ( -s $tmp/$chr.VirMoj.maf ) then
touch missing.$chr.SimYakAnaPse
cp $tmp/$chr.VirMoj.maf $tmp/$chr.SimYakAnaPseVirMoj.maf
else if ( -s $tmp/$chr.SimYakAnaPse.maf ) then
touch missing.$chr.VirMoj
cp $tmp/$chr.SimYakAnaPse.maf $tmp/$chr.SimYakAnaPseVirMoj.maf
endif
if ( -s $ANO && -s $tmp/$chr.SimYakAnaPseVirMoj.maf ) then
echo "Adding $ANO..."
zcat $ANO > $tmp/$chr.Ano.maf
$MZ10 $tmp/$chr.SimYakAnaPseVirMoj.maf $tmp/$chr.Ano.maf 1 > $tmp/$chr.tmp.maf
echo "Projecting on $REF..."
$PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.SimYakAnaPseVirMojAno.maf
else if ( -s $ANO ) then
touch missing.$chr.SimYakAnaPseVirMoj
zcat $ANO > $tmp/$chr.SimYakAnaPseVirMojAno.maf
else if ( -s $tmp/$chr.SimYakAnaPseVirMoj.maf ) then
touch missing.$chr.Ano
cp $tmp/$chr.SimYakAnaPseVirMoj.maf $tmp/$chr.SimYakAnaPseVirMojAno.maf
endif
if ( -s $API && -s $tmp/$chr.SimYakAnaPseVirMojAno.maf ) then
echo "Adding $API..."
zcat $API > $tmp/$chr.Api.maf
$MZ10 $tmp/$chr.SimYakAnaPseVirMojAno.maf $tmp/$chr.Api.maf 1 > $tmp/$chr.tmp.maf
echo "Projecting on $REF..."
$PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.final.maf
cp $tmp/$chr.final.maf $DEST
else if ( -s $API ) then
touch missing.$chr.SimYakAnaPseVirMojAno
zcat $API > $DEST
else if ( -s $tmp/$chr.SimYakAnaPseVirMojAno.maf ) then
touch missing.$chr.Api
cp $tmp/$chr.SimYakAnaPseVirMojAno.maf $DEST
endif
rm $tmp/$chr.*.maf
rmdir $tmp
'_EOF_'
# << keep emacs coloring happy
chmod 755 doMultiz.csh
awk '{print "./doMultiz.csh " $1;}' /cluster/data/dm2/chrom.sizes \
> jobs.lst
# Run on small cluster
ssh kki
cd /cluster/data/dm2/bed/multiz9way
para create jobs.lst
para try, check, push, check, ...
#Completed: 13 of 13 jobs
#Average job time: 2000s 33.34m 0.56h 0.02d
#Longest finished job: 5969s 99.48m 1.66h 0.07d
#Submission to last job: 15605s 260.08m 4.33h 0.18d
ls -1 missing*
#ls: No match.
# make /gbdb/ links to 9way maf files:
ssh hgwdev
mkdir -p /gbdb/dm2/multiz9way/maf/multiz9way
ln -s /cluster/data/dm2/bed/multiz9way/maf/chr*.maf \
/gbdb/dm2/multiz9way/maf/multiz9way/
# load into database
cd /tmp
hgLoadMaf -warn dm2 multiz9way \
-pathPrefix=/gbdb/dm2/multiz9way/maf/multiz9way
# load summary table to replace pairwise
time cat /cluster/data/dm2/bed/multiz9way/maf/chr*.maf \
| nice hgLoadMafSummary dm2 multiz9waySummary stdin
# put 9way MAF out for download
ssh kkstore01
cd /cluster/data/dm2/bed/multiz9way
mkdir mafDownload
foreach f (maf/*.maf)
nice gzip -c $f > mafDownload/$f:t.gz
end
cd mafDownload
md5sum *.maf.gz > md5sum.txt
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/dm2/multiz9way
ln -s /cluster/data/dm2/bed/multiz9way/mafDownload/{*.maf.gz,md5sum.txt} \
/usr/local/apache/htdocs/goldenPath/dm2/multiz9way
# make a README.txt
# Cleanup
rm -rf /santest/scratch/flyMultiz9way/
# PHASTCONS 8WAY WITH METHODS FROM PAPER (DONE 3/8/05 angie)
# originally done 2/2/05
# REDONE 3/8/05 angie -- after regenerating multiz alignments above.
# Same procedure as for latest dm1 8way -- using the param estimation
# methods described in Adam's Genome Research paper.
# REPLACED with 9way 5/23/05
# PHASTCONS 9WAY WITH METHODS FROM PAPER (DONE 5/27/05 angie)
ssh kkstore01
mkdir -p /santest/scratch/dm2/chrom
cp -p /cluster/data/dm2/?{,?}/chr*.fa /santest/scratch/dm2/chrom/
# Split chrom fa into smaller windows for phastCons:
ssh kki
mkdir /cluster/data/dm2/bed/multiz9way/phastCons
mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.split
cd /cluster/data/dm2/bed/multiz9way/phastCons/run.split
set WINDOWS = /santest/scratch/dm2/phastCons/WINDOWS
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.sh
#!/bin/csh -ef
set PHAST=/cluster/bin/phast
set FA_SRC=/santest/scratch/dm2/chrom
set WINDOWS=/santest/scratch/dm2/phastCons/WINDOWS
set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
-O dm2,droSim1,droYak1,droAna1,dp3,droVir1,droMoj1,anoGam1,apiMel2 \
-w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
chmod a+x doSplit.sh
rm -f jobList
foreach file (/cluster/data/dm2/bed/multiz9way/maf/*.maf)
if (-s $file) then
echo "doSplit.sh {check in exists+ $file}" >> jobList
endif
end
para create jobList
para try, check, push, check...
#Completed: 13 of 13 jobs
#Average job time: 44s 0.73m 0.01h 0.00d
#Longest finished job: 117s 1.95m 0.03h 0.00d
#Submission to last job: 117s 1.95m 0.03h 0.00d
############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
# Use consEntropy --NH to make it suggest a --expected-lengths param
# that we should try next. Adam ran this on hg17 to find out the
# total entropy of the hg17 model:
# consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
#Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
#Relative entropy: H=0.608216 bits/site
#Required length: N=16.085437 sites
#Total entropy: NH=9.783421 bits
# Our target is that NH result: 9.7834 bits.
# Use phyloFit to make an initial model:
ssh kolossus
cd /cluster/data/dm2/bed/multiz9way/phastCons
/cluster/bin/phast/msa_view ../maf/chr{2L,3R,4,X}.maf \
--aggregate dm2,droSim1,droYak1,droAna1,dp3,droVir1,droMoj1,anoGam1,apiMel2 \
-i MAF -o SS > all.ss
/cluster/bin/phast/phyloFit all.ss \
--tree "(((((((dm2,droSim1),droYak1),droAna1),dp3),(droVir1,droMoj1)),anoGam1),apiMel2)" \
-i SS --out-root starting-tree
cat starting-tree.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -393252389.008460
#BACKGROUND: 0.286960 0.213375 0.213302 0.286363
#RATE_MAT:
# -0.918686 0.202545 0.451023 0.265118
# 0.272394 -1.108053 0.230681 0.604978
# 0.606771 0.230760 -1.109751 0.272220
# 0.265671 0.450782 0.202768 -0.919221
#TREE: (((((((dm2:0.031118,droSim1:0.035465):0.029649,droYak1:0.076216):0.089605,droAna1:0.210217):0.044178,dp3:0.215369):0.046188,(droVir1:0.128011,droMoj1:0.139931):0.195030):0.109752,anoGam1:0.342365):0.228550,apiMel2:0.228550);
# also get GC content from model -- if similar enough, no need to extract it
# separately above.
awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod
#0.426677
# OK, use .427 for --gc below.
# Use the values of --target-coverage and --expected-lengths from the
# last iteration of the 8way run, 0.573 and 36.8.
# Multiply each subst rate on the TREE line by 3.5 which is roughly the
# ratio of noncons to cons in
# /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod
/cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \
> starting-tree.noncons.mod
/cluster/bin/phast/consEntropy --NH 9.7834 0.573 36.8 \
starting-tree{,.noncons}.mod
#( Solving for new omega: 36.800000 36.416935 36.415050 )
#Transition parameters: gamma=0.573000, omega=36.800000, mu=0.027174, nu=0.036465
#Relative entropy: H=1.931731 bits/site
#Expected min. length: L_min=5.081081 sites
#Expected max. length: L_max=5.398876 sites
#Total entropy: L_min*H=9.815283 bits
#Recommended expected length: omega=36.415050 sites (for L_min*H=9.783400)
# OK, use --expected-lengths 36.4.
############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ###########
# We're here because the actual target coverage was not satisfactory,
# so we're changing the --target-coverage param. Given that we're
# changing that, take a guess at how we should change --expected-lengths
# in order to also hit the total entropy target.
cd /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate
# SECOND ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.52 36.8 ave.{cons,noncons}.mod
#Recommended expected length: omega=32.240755 sites (for L_min*H=9.783400)
# OK, --expected-lengths 32.2
# THIRD ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.45 32.2 ave.{cons,noncons}.mod
#Recommended expected length: omega=27.095993 sites (for L_min*H=9.783400)
# ==> 27.1
# FOURTH ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.425 27.1 ave.{cons,noncons}.mod
#Recommended expected length: omega=25.440532 sites (for L_min*H=9.783400)
# ==> 25.4
# FIFTH ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.393 25.4 ave.{cons,noncons}.mod
#Recommended expected length: omega=23.420199 sites (for L_min*H=9.783400)
# ==> 23.4
# SIXTH ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.398 23.4 ave.{cons,noncons}.mod
#Recommended expected length: omega=23.736472 sites (for L_min*H=9.783400)
# ==> 23.7
# Now set up cluster job to estimate model parameters given free params
# --target-coverage and --expected-lengths and the data.
ssh kk9
mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate
cd /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate
# FIRST ITERATION: Use ../starting-tree.mod:
cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.427 --nrates 1,1 \
--no-post-probs --ignore-missing \
--expected-lengths 36.4 --target-coverage 0.573 \
--quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
# SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.427 --nrates 1,1 \
--no-post-probs --ignore-missing \
--expected-lengths 23.7 --target-coverage 0.398 \
--quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
chmod a+x doEstimate.sh
rm -fr LOG TREES
mkdir -p LOG TREES
rm -f jobList
foreach f (/santest/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
set root = $f:t:r:r
echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
end
# run cluster job
para create jobList
para try, check, push, check, ...#
#Completed: 139 of 139 jobs
#Average job time: 4131s 68.85m 1.15h 0.05d
#Longest finished job: 8787s 146.45m 2.44h 0.10d
#Submission to last job: 8886s 148.10m 2.47h 0.10d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ssh kolossus
cd /cluster/data/dm2/bed/multiz9way/phastCons/run.estimate
ls -1 TREES/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
--output-average ave.cons.mod > cons_summary.txt
ls -1 TREES/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
--output-average ave.noncons.mod > noncons_summary.txt
grep TREE ave*.mod
# FIRST ITERATION:
#ave.cons.mod:TREE: (((((((dm2:0.019592,droSim1:0.027067):0.014875,droYak1:0.047389):0.041218,droAna1:0.116484):0.020166,dp3:0.125948):0.018479,(droVir1:0.073409,droMoj1:0.080389):0.106180):0.056105,anoGam1:0.235476):0.127958,apiMel2:0.127958);
#ave.noncons.mod:TREE: (((((((dm2:0.059164,droSim1:0.077944):0.047044,droYak1:0.144159):0.135352,droAna1:0.373788):0.067147,dp3:0.405695):0.062350,(droVir1:0.234677,droMoj1:0.258111):0.355922):0.191534,anoGam1:0.773044):0.423894,apiMel2:0.423894);
# SECOND ITERATION:
#ave.cons.mod:TREE: (((((((dm2:0.019133,droSim1:0.026408):0.014525,droYak1:0.046282):0.040238,droAna1:0.113841):0.019632,dp3:0.123153):0.017914,(droVir1:0.071760,droMoj1:0.078554):0.104006):0.054630,anoGam1:0.230601):0.124952,apiMel2:0.124952);
#ave.noncons.mod:TREE: (((((((dm2:0.058357,droSim1:0.076814):0.046405,droYak1:0.142227):0.133457,droAna1:0.368999):0.065975,dp3:0.400678):0.061001,(droVir1:0.231694,droMoj1:0.254750):0.352032):0.188199,anoGam1:0.764503):0.418063,apiMel2:0.418063);
# THIRD ITERATION:
#ave.cons.mod:TREE: (((((((dm2:0.018454,droSim1:0.025454):0.014002,droYak1:0.044650):0.038757,droAna1:0.109828):0.018811,dp3:0.118845):0.017034,(droVir1:0.069220,droMoj1:0.075740):0.100602):0.052271,anoGam1:0.222690):0.120267,apiMel2:0.120267);
#ave.noncons.mod:TREE: (((((((dm2:0.057221,droSim1:0.075276):0.045493,droYak1:0.139568):0.130778,droAna1:0.362074):0.064250,dp3:0.393146):0.058938,(droVir1:0.227290,droMoj1:0.249820):0.346379):0.183000,anoGam1:0.750810):0.409058,apiMel2:0.409058);
# FOURTH ITERATION:
#ave.cons.mod:TREE: (((((((dm2:0.018200,droSim1:0.025097):0.013808,droYak1:0.044040):0.038208,droAna1:0.108335):0.018509,dp3:0.117244):0.016712,(droVir1:0.068273,droMoj1:0.074693):0.099332):0.051396,anoGam1:0.219717):0.118553,apiMel2:0.118553);
#ave.noncons.mod:TREE: (((((((dm2:0.056840,droSim1:0.074761):0.045186,droYak1:0.138661):0.129841,droAna1:0.359716):0.063642,dp3:0.390643):0.058209,(droVir1:0.225781,droMoj1:0.248129):0.344408):0.181133,anoGam1:0.745987):0.406113,apiMel2:0.406113);
# FIFTH ITERATION:
#ave.cons.mod:TREE: (((((((dm2:0.017875,droSim1:0.024638):0.013564,droYak1:0.043255):0.037506,droAna1:0.106409):0.018128,dp3:0.115161):0.016304,(droVir1:0.067046,droMoj1:0.073340):0.097682):0.050261,anoGam1:0.215779):0.116296,apiMel2:0.116296);
#ave.noncons.mod:TREE: (((((((dm2:0.056321,droSim1:0.074061):0.044780,droYak1:0.137445):0.128624,droAna1:0.356472):0.062873,dp3:0.387024):0.057266,(droVir1:0.223703,droMoj1:0.245817):0.341759):0.178650,anoGam1:0.739131):0.401733,apiMel2:0.401733);
# SIXTH ITERATION:
#ave.cons.mod:TREE: (((((((dm2:0.017924,droSim1:0.024709):0.013600,droYak1:0.043377):0.037613,droAna1:0.106707):0.018185,dp3:0.115488):0.016365,(droVir1:0.067237,droMoj1:0.073549):0.097937):0.050437,anoGam1:0.216405):0.116660,apiMel2:0.116660);
#ave.noncons.mod:TREE: (((((((dm2:0.056410,droSim1:0.074182):0.044846,droYak1:0.137649):0.128814,droAna1:0.357039):0.062987,dp3:0.387708):0.057413,(droVir1:0.224066,droMoj1:0.246214):0.342186):0.179046,anoGam1:0.740329):0.402594,apiMel2:0.402594);
cat cons_summary.txt
# look over the files cons_summary.txt and noncons_summary.txt.
# The means and medians should be roughly equal and the stdevs
# should be reasonably small compared to the means, particularly
# for rate matrix parameters (at bottom) and for branches to the
# leaves of the tree. The stdevs may be fairly high for branches
# near the root of the tree; that's okay. Some min values may be
# 0 for some parameters. That's okay, but watch out for very large
# values in the max column, which might skew the mean. If you see
# any signs of bad outliers, you may have to track down the
# responsible .mod files and throw them out. I've never had to do
# this; the estimates generally seem pretty well behaved.
# NOTE: Actually, a random sample of several hundred to a thousand
# alignment fragments (say, a number equal to the number of
# available cluster nodes) should be more than adequate for
# parameter estimation. If pressed for time, use this strategy.
# Check the total entropy figure to see if we're way off.
# We probably don't need to do this, since it has always been very close
# if not the same as what we used above, but it only takes a second.
# FIRST ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.573 36.4 ave.{cons,noncons}.mod
#Recommended expected length: omega=36.779882 sites (for L_min*H=9.783400)
# OK, tweak --expected-lengths to 36.8 below (and for next iteration).
# SECOND ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.52 32.2 ave.{cons,noncons}.mod
#Recommended expected length: omega=32.239216 sites (for L_min*H=9.783400)
# ==> keep at 32.2.
# THIRD ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.45 27.1 ave.{cons,noncons}.mod
#Recommended expected length: omega=27.102528 sites (for L_min*H=9.783400)
# ==> keep at 27.1.
# FOURTH ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.425 25.4 ave.{cons,noncons}.mod
#Recommended expected length: omega=25.445150 sites (for L_min*H=9.783400)
# ==> keep at 25.4
# FIFTH ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.393 23.4 ave.{cons,noncons}.mod
#Recommended expected length: omega=23.427025 sites (for L_min*H=9.783400)
# ==> keep at 23.4
# SIXTH ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.398 23.7 ave.{cons,noncons}.mod
#Recommended expected length: omega=23.735901 sites (for L_min*H=9.783400)
# ==> keep at 23.7
# Now we are ready to set up the cluster job for computing the
# conservation scores and predicted elements. The we measure the
# conserved elements coverage, and if that's not satisfactory then we
# adjust parameters and repeat.
ssh kk9
mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.phast
cd /cluster/data/dm2/bed/multiz9way/phastCons/run.phast
cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
--expected-lengths 23.7 --target-coverage 0.398 \
--quiet --seqname $chr --idpref $pref \
--viterbi /santest/scratch/dm2/phastCons/ELEMENTS/$pref.bed --score \
--require-informative 0 \
> $tmpfile
gzip -c $tmpfile > /santest/scratch/dm2/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
chmod a+x doPhastCons.sh
rm -fr /santest/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
mkdir -p /santest/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
rm -f jobList
foreach f (/santest/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
echo doPhastCons.sh $f >> jobList
end
para create jobList
para try, check, push, check, ...
#Completed: 139 of 139 jobs
#Average job time: 18s 0.31m 0.01h 0.00d
#Longest finished job: 29s 0.48m 0.01h 0.00d
#Submission to last job: 46s 0.77m 0.01h 0.00d
# back on kolossus:
# combine predictions and transform scores to be in 0-1000 interval
cd /cluster/data/dm2/bed/multiz9way/phastCons
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
/santest/scratch/dm2/phastCons/ELEMENTS/*.bed \
| /cluster/bin/scripts/lodToBedScore > all.bed
ssh hgwdev
# Now measure coverage of CDS by conserved elements.
# We want the "cover" figure to be close to 68.9%.
cd /cluster/data/dm2/bed/multiz9way/phastCons
featureBits -enrichment dm2 flyBaseGene:cds all.bed
# FIRST ITERATION: too high; decrease --target-coverage, re-estimate.
#flyBaseGene:cds 16.567%, all.bed 42.995%, both 12.251%, cover 73.95%, enrich 1.72x
# SECOND ITERATION: still too high.
#flyBaseGene:cds 16.567%, all.bed 41.408%, both 12.030%, cover 72.62%, enrich 1.75x
# THIRD ITERATION: still...
#flyBaseGene:cds 16.567%, all.bed 39.143%, both 11.693%, cover 70.58%, enrich 1.80x
# FOURTH ITERATION: dang, should have used a calculator.
#flyBaseGene:cds 16.567%, all.bed 38.329%, both 11.563%, cover 69.80%, enrich 1.82x
# FIFTH ITERATION: close... but overshot. Back off just a bit.
#flyBaseGene:cds 16.567%, all.bed 37.277%, both 11.388%, cover 68.74%, enrich 1.84x
# SIXTH ITERATION: done.
#flyBaseGene:cds 16.567%, all.bed 37.445%, both 11.417%, cover 68.92%, enrich 1.84x
# Having met the CDS coverage target, load up the results.
hgLoadBed dm2 phastConsElements9way all.bed
# Create wiggle on the small cluster
ssh kki
mkdir /cluster/data/dm2/bed/multiz9way/phastCons/run.wib
cd /cluster/data/dm2/bed/multiz9way/phastCons/run.wib
rm -rf /santest/scratch/dm2/phastCons/wib
mkdir -p /santest/scratch/dm2/phastCons/wib
cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /santest/scratch/dm2/phastCons/wib
zcat `ls -1 /santest/scratch/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \
| sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
chmod a+x doWigEncode
rm -f jobList
foreach chr (`ls -1 /santest/scratch/dm2/phastCons/POSTPROBS \
| awk -F\. '{print $1}' | sort -u`)
echo doWigEncode $chr >> jobList
end
para create jobList
para try, check, push, check, ...
#Completed: 13 of 13 jobs
#Average job time: 10s 0.16m 0.00h 0.00d
#Longest finished job: 23s 0.38m 0.01h 0.00d
#Submission to last job: 23s 0.38m 0.01h 0.00d
# back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want
# the raw scores) from santest
cd /cluster/data/dm2/bed/multiz9way/phastCons
rm -rf wib POSTPROBS
rsync -av /santest/scratch/dm2/phastCons/wib .
rsync -av /santest/scratch/dm2/phastCons/POSTPROBS .
# load wiggle component of Conservation track
ssh hgwdev
mkdir /gbdb/dm2/multiz9way/wib
cd /cluster/data/dm2/bed/multiz9way/phastCons
chmod 775 . wib
chmod 664 wib/*.wib
ln -s `pwd`/wib/*.wib /gbdb/dm2/multiz9way/wib/
hgLoadWiggle dm2 phastCons9way \
-pathPrefix=/gbdb/dm2/multiz9way/wib wib/*.wig
rm wiggle.tab
# make top-5000 list and launcher on Adam's home page:
sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \
> top5000.bed
/cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
/cse/grads/acs/public_html/dm-top5000-9way \
"top 5000 conserved elements (9way)" dm2
# and clean up santest.
rm -r /santest/scratch/dm2/phastCons/{ELEMENTS,POSTPROBS,wib}
rm -r /santest/scratch/dm2/chrom
# Offer raw scores for download since fly folks are likely to be interested:
ssh kkstore01
cd /cluster/data/dm2/bed/multiz9way/phastCons/POSTPROBS
mkdir ../postprobsDownload
foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`)
zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \
> ../postprobsDownload/$chr.pp.gz
end
cd ../postprobsDownload
md5sum *.gz > md5sum.txt
# Make a README.txt there too.
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/dm2/phastCons9way
cd /usr/local/apache/htdocs/goldenPath/dm2/phastCons9way
ln -s /cluster/data/dm2/bed/multiz9way/phastCons/postprobsDownload/* .
# MAP FEEP PROBES (DONE 2/3/05 angie)
ssh kolossus
mkdir /cluster/data/dm2/bed/flyFeep
cd /cluster/data/dm2/bed/flyFeep
set feepDir = /projects/compbio/data/microarray/flyFEEP
# Make per-chrom probe FASTA files:
foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh)
echo $c
awk '$3 == "'$c'" {printf ">%s\n%s\n", $1, $7;}' \
$feepDir/probes3.1-Exons.1_apr11_exon \
$feepDir/probes3.1-Noncoding.1_apr11_tiling \
> chr${c}_exonNonExonProbes.fa
awk '$1 != "ID" && $5 == "'$c'+" {printf ">%s\n%s\n", $1, $8;}' \
$feepDir/probes3.1-junctions.txt \
> chr${c}_junctionProbes.fa
end
# For exon/non-exon probes (36 consecutive bases), require 36 matching
# bases (for some reason -minScore=36 doesn't filter anything out!).
foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh)
blat -noHead \
/cluster/data/dm2/nib/chr$c.nib chr${c}_exonNonExonProbes.fa stdout \
| grep ^36 | uniq \
> chr${c}_exonNonExonProbes.psl
end
# For splice junction probes (2 blocks of 18 consecutive bases),
# use a small tileSize and -fine, and still require 36 matching bases:
# This was slow, do as small cluster job next time, will still take hours.
foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh)
blat -noHead -tileSize=6 -fine \
/cluster/data/dm2/nib/chr$c.nib chr${c}_junctionProbes.fa stdout \
| grep ^36 | uniq \
> chr${c}_junctionProbes.psl
end
# Check on how many probes were aligned:
foreach f (chr*exon*.fa chr*junc*.fa)
set probes = `cat $f | wc -l`
set probes = `expr $probes / 2`
set aligned = `cat $f:r.psl | wc -l`
set reallyAligned = `awk '{print $10;}' $f:r.psl | sort -u | wc -l `
set not = `expr $probes - $reallyAligned`
echo $f:r"\t"$probes"\t"$aligned"\t"$reallyAligned"\t"\($not\)
end
# set count aligned reallyA not
#chr2h_exonNonExonProbes 644 611 611 (33)
#chr2L_exonNonExonProbes 26631 26749 26627 (4)
#chr2R_exonNonExonProbes 26100 26111 26098 (2)
#chr3h_exonNonExonProbes 833 833 833 (0)
#chr3L_exonNonExonProbes 28756 28773 28749 (7)
#chr3R_exonNonExonProbes 35060 35064 35055 (5)
#chr4_exonNonExonProbes 1798 1797 1797 (1)
#chrX_exonNonExonProbes 28912 28917 28912 (0)
#chrXh_exonNonExonProbes 354 354 354 (0)
#chrYh_exonNonExonProbes 97 97 97 (0)
#chr2h_junctionProbes 3 3 3 (0)
#chr2L_junctionProbes 5048 5015 5012 (36)
#chr2R_junctionProbes 5697 5709 5652 (45)
#chr3h_junctionProbes 1 1 1 (0)
#chr3L_junctionProbes 6498 6437 6429 (69)
#chr3R_junctionProbes 8747 8695 8684 (63)
#chr4_junctionProbes 1056 1048 1048 (8)
#chrXh_junctionProbes 50 50 50 (0)
#chrX_junctionProbes 3688 3684 3680 (8)
#chrYh_junctionProbes 0 0 0 (0)
# Most probes successfully mapped, but a fair number of duplicates.
# Dug into one of those (159937) and found that its probe sequence
# halves were from a tandem repeat! (And didn't map to where they
# were supposed to in dm1 either.) I suspect the probe sequences
# might not be correct in the files I downloaded. Sent Chris Mason
# an email about that.
# If the probe dm1 locations are correct (I sure hope so), just not
# some given probe sequences, I could extract dm1 sequence and map
# that to dm2 instead of mapping the given probes as above.
# Translate PSL to bed12 (this is too simple for PSL in general, but
# since we require the entire query to map perfectly, it works here):
awk '{printf "%s\t%d\t%d\t%s\t0\t%s\t%d\t%d\t0\t%d\t%s\t%s\n", \
$14, $16, $17, $10, $9, $16, $17, $18, $19, $20;}' \
chr*.psl \
> flyFeepProbesBed12.bed
# Load probes
ssh hgwdev
cd /cluster/data/dm2/bed/flyFeep
# Add scores to bed:
hgMapMicroarray -bedIn flyFeep.bed hgFixed.flyFeepMedianRatio \
flyFeepProbesBed12.bed
hgLoadBed dm2 flyFeep flyFeep.bed
# (back on fileserver) Winnow probes by Bussemaker lab's lists:
set dm1FeepDir = /cluster/data/dm1/bed/flyFeep
$dm1FeepDir/winnowBed.pl $dm1FeepDir/probesExpressedAboveBackground.txt \
flyFeep.bed \
> flyFeepPEAB.bed
$dm1FeepDir/winnowBed.pl $dm1FeepDir/probesAnovaDiffExpressed.txt \
flyFeep.bed \
> flyFeepAnova.bed
# (back on hgwdev) Load winnowed sets:
hgLoadBed dm2 flyFeepPEAB flyFeepPEAB.bed
hgLoadBed dm2 flyFeepAnova flyFeepAnova.bed
# FLYBASE 4.1 ANNOTATIONS (DONE 2/28/05 angie)
ssh kksilo
mkdir /cluster/data/dm2/bed/flybase4.1
cd /cluster/data/dm2/bed/flybase4.1
foreach c (2L 2R 3L 3R 4 X)
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.1_20050207/gff/dmel-$c-r4.1.gff.gz
end
zcat *.gff.gz > flybase.gff3
# What data sources are represented in this file?
grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c
# excerpt (many other sources, including blastx:... , sim4:... and
# tblastx:...; also various other types for source "."):
18941 . CDS
63033 . exon
14066 . gene
18941 . mRNA
144 . ncRNA
39 . pseudogene
96 . rRNA
29 . snRNA
28 . snoRNA
295 . tRNA
36921 . transcription_start_site
1571 . transposable_element
16404 . transposable_element_insertion_site
# What keywords are defined in the 9th field?
grep -v '^#' flybase.gff3 \
| awk '{print $9;}' | perl -wpe 's/=[^;]+;/\n/g; s/=.*$//;' \
| sort | uniq -c
# Once again, the previous round's parsing needed some updates to
# handle the new data.
extractGenes.pl flybase.gff3
#Oddball parentless ID=Sps2-exon-10336472..10336531 for exon
# Get predicted proteins (for main annotations only)
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.1_20050207/fasta/dmel-all-translation-r4.1.fasta.gz
zcat dmel-all-translation-r4.1.fasta.gz \
| perl -wpe 's/^(>\w+)-P(\w)/$1-R$2/' > flybasePep.fa
ssh hgwdev
cd /cluster/data/dm2/bed/flybase4.1
# Protein-coding genes:
ldHgGene -gtf dm2 flyBaseGene flybase.gtf
hgPepPred dm2 generic flyBasePep flybasePep.fa
# Fix typo caught by Galt & all.joiner:
hgsql dm2 -e 'update flyBasePep set name = "CG6207-RC" where name = "GLCAT-P-PC"'
# Noncoding genes:
hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed
# Cross-referencing info for both coding and noncoding:
hgsql dm2 < $HOME/kent/src/hg/lib/flyBase2004Xref.sql
hgsql dm2 -e 'load data local infile "flyBase2004Xref.tab" \
into table flyBase2004Xref'
# Some featureBits comparisons with refGene which is pretty much like
# version 4.0 but apparently includes some noncoding genes too:
featureBits dm2 refGene \!flyBaseGene -minSize=1000 -bed=stdout
#chr3R 8221731 8222987 chr3R.1
#...
#87577 bases of 131698467 (0.066%) in intersection
# This shows only the dropped isoforms/duplicates (ignores noncoding):
featureBits dm2 refGene \!flyBaseGene \!flyBaseNoncoding -minSize=1000 \
-bed=stdout
#chr3R 15620310 15621493 chr3R.1
#chr2L 4698094 4700940 chr2L.1
#chrX 3638700 3639895 chrX.1
#49911 bases of 131698467 (0.038%) in intersection
# add upstream* downloadable files
cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips
foreach size (1000 2000 5000)
echo upstream$size
nice featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \
| nice gzip -c > upstream$size.fa.gz
end
md5sum *.zip *.gz > md5sum.txt
# FLYREG (DONE 2/23/05 angie)
ssh kksilo
mkdir /cluster/data/dm2/bed/flyreg
cd /cluster/data/dm2/bed/flyreg
wget http://www.gen.cam.ac.uk/casey/data/Bergman2004/v2.0/Footprint.GFF
# This is not GTF; it should really be bed +. The contributor,
# Casey Bergman, says that coords are 0-based half-open, so
# translation will be even easier.
grep -v '^#' Footprint.GFF \
| perl -wpe 'if (! s/^(\S+)\tBergman_data\tbinding_site\t(\d+)\t(\d+)\t.\t.\t.\tFactor \"([^\"]+)\"; Target \"([^\"]+)\"; PMID \"(\d+)\"; FPID \"(\d+)\"$/chr$1\t$2\t$3\t$4\t$5\t$6\t$7/) { die "Cant parse line $.:\n$_\n"; }' \
> flyreg2.bed
ssh hgwdev
hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/flyreg2.sql \
dm2 flyreg2 /cluster/data/dm2/bed/flyreg/flyreg2.bed
# Add Dan Pollard's MEME motif data for the footprints:
ssh kksilo
cd /cluster/data/dm2/bed/flyreg
wget http://rana.lbl.gov/~dan/matrices/bergman2004/footprint_matrices.txt
/cluster/data/dm1/bed/flyreg/extractMatrices.pl \
footprint_matrices.txt > flyregMotif.tab
ssh hgwdev
cd /cluster/data/dm2/bed/flyreg
sed -e 's/dnaMotif/flyregMotif/' $HOME/kent/src/hg/lib/dnaMotif.sql \
> flyregMotif.sql
hgsql dm2 < flyregMotif.sql
hgsql dm2 -e 'load data local infile "flyregMotif.tab" into table flyregMotif'
# LIFTOVER DM1 FLYREG, COMPARE TO DM2 FLYREG (DONE 2/25/05 angie)
# Casey Bergman is very eager to see alternate confirmation of his
# dm1->dm2 mapping of flyreg, so run liftOver on it:
ssh kksilo
cd /cluster/data/dm2/bed/flyreg
liftOver /cluster/data/dm1/bed/flyreg/flyreg.bed \
/cluster/data/dm1/bed/bedOver/dm1ToDm2.over.chain \
flyregLift.bed flyregLift.unmapped
wc -l flyreg2.bed flyregLift.bed flyregLift.unmapped
# 1362 flyreg2.bed
# 1363 flyregLift.bed
# 0 flyregLift.unmapped
awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5;}' flyreg2.bed | sort \
> /tmp/1
awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5;}' flyregLift.bed | sort \
> /tmp/2
diff -c /tmp/[12] > newVsLift
# Analyzed the differences: 2 duplicates in original (uniquified in v2),
# 1 new item in v2, and 4 items incorrectly mapped by liftOver (fooled
# by what looks like a slightly diverged local duplication) but probably
# correctly mapped by Casey.
# SELF ALIGNMENTS (DONE 2/24/05 angie)
# Doing this largely as a test of doBlastzChainNet.pl...
ssh kksilo
mkdir /cluster/data/dm2/bed/blastz.dm2.2005-02-23
cd /cluster/data/dm2/bed/blastz.dm2.2005-02-23
cat << '_EOF_' > DEF
# D. melanogaster vs. self
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. melanogaster
SEQ2_DIR=/iscratch/i/dm2/nib
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/dm2/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.dm2.2005-02-23
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2SelfOut >& do.log &
tail -f do.log
# The script died at the start of the cat step when kki couldn't see
# /panasas (doh!). Asked cluster-admin to fix that.
# When fixed, restarted with -continue, appending to do.log:
rm -r run.cat
doBlastzChainNet.pl -continue cat DEF \
-blastzOutRoot /panasas/store/dm2SelfOut >>& do.log &
tail -f do.log
rmdir /panasas/store/dm2SelfOut
ln -s blastz.dm2.2005-02-23 /cluster/data/dm2/bed/blastz.dm2
# chainSelf is already in top-level trackDb.ra, so no need to add.
# Add /usr/local/apache/htdocs/goldenPath/dm2/vsSelf/README.txt
# BLASTZ/CHAIN/NET DROSIM1 (DONE 4/13/05 angie)
# -- Originally run with default blastz params; replaced by a run with
# better params, below.
# BLASTZ/CHAIN/NET ANOGAM1 -- LOOSER PARAMS (DONE 5/19/05 angie)
# Lower the L threshold from 6000 to 4000 (5-18) and then 3000 (5-19),
# see if that increases coverage from this measurement from the previous
# "human-fugu" params:
# featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainAnoGam1Link
#flyBaseGene 22.967%, chainAnoGam1Link 15.999%, both 12.567%, cover 54.72%, enrich 3.42x
# 5/19, L=3000: too much... low-complexity-anchored alignments cluttering.
# but interesting to know we can get this kind of coverage:
#flyBaseGene 22.967%, chainAnoGam1Link 60.227%, both 18.542%, cover 80.73%, enrich 1.34x
# So use results of this L=4000 run:
ssh kkstore01
mkdir /cluster/data/dm2/bed/blastz.anoGam1.2005-05-18
cd /cluster/data/dm2/bed/blastz.anoGam1.2005-05-18
cat << '_EOF_' > DEF
# D.melanogaster vs. A. gambiae
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - A. gambiae
SEQ2_DIR=/iscratch/i/anoGam1/nib
SEQ2_CHUNK=5000000
SEQ2_LAP=0
SEQ2_LEN=/cluster/data/anoGam1/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.anoGam1.2005-05-18
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /santest/scratch/dm2anoGam1 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainAnoGam1Link
# 5/18, L=4000:
#flyBaseGene 22.967%, chainAnoGam1Link 22.072%, both 14.077%, cover 61.29%, enrich 2.78x
# Looks good in the browser -- top-level net chains extend farther
# without too much extra crap like we saw with L=3000.
rm /cluster/data/dm2/bed/blastz.anoGam1
ln -s blastz.anoGam1.2005-05-18 /cluster/data/dm2/bed/blastz.anoGam1
# BLASTZ/CHAIN/NET APIMEL2 -- LOOSER PARAMS (DONE 5/19/05 angie)
# Since L=4000 noticeably helped mosquito, try it on the honeybee,
# which had this coverage with L=6000:
#flyBaseGene 22.967%, chainApiMel2Link 13.767%, both 7.802%, cover 33.97%, enrich 2.47x
mkdir /cluster/data/dm2/bed/blastz.apiMel2.2005-05-19
cd /cluster/data/dm2/bed/blastz.apiMel2.2005-05-19
cat << '_EOF_' > DEF
# D.melanogaster vs. A. mellifera
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - A. mellifer
SEQ2_DIR=/iscratch/i/apiMel2/nib
SEQ2_CHUNK=5000000
SEQ2_LAP=0
SEQ2_LEN=/cluster/data/apiMel2/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.apiMel2.2005-05-19
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /santest/scratch/dm2apiMel2 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainApiMel2Link
#flyBaseGene 22.967%, chainApiMel2Link 36.701%, both 11.095%, cover 48.31%, enrich 1.32x
# Wow, quite an increase in coverage! And drop in enrichment... some
# of the new alignments are on the low-complexity side but not nearly
# as bad as mosquito at L=3000. Never know what to expect from these
# param experiments! :| Well, using L=4000 seems like a net gain so
# go with it...
rm /cluster/data/dm2/bed/blastz.apiMel2
ln -s blastz.apiMel2.2005-05-19 /cluster/data/dm2/bed/blastz.apiMel2
# BLASTZ/CHAIN/NET DROMOJ1 -- LOOSER PARAMS (DONE 5/19/05 angie)
# Here's what L=6000 got us:
#flyBaseGene 22.967%, chainDroMoj1Link 59.129%, both 20.159%, cover 87.77%, enrich 1.48x
# OK, let's see what droMoj1 looks like at L=4000...
mkdir /cluster/data/dm2/bed/blastz.droMoj1.2005-05-19
cd /cluster/data/dm2/bed/blastz.droMoj1.2005-05-19
cat << '_EOF_' > DEF
# D. melanogaster vs. D. mojavensis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. mojavensis
SEQ2_DIR=/iscratch/i/droMoj1/droMoj1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droMoj1/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droMoj1.2005-05-19
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /santest/scratch/dm2droMoj1 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroMoj1Link
#flyBaseGene 22.967%, chainDroMoj1Link 65.901%, both 20.698%, cover 90.12%, enrich 1.37x
# Wow, actually looks pretty good in the browser. Finds more non-top-
# level alignments than L=6000, but not a huge number, and most look
# reasonable in the detailed view.
rm -f /cluster/data/dm2/bed/blastz.droMoj1
ln -s blastz.droMoj1.2005-05-19 /cluster/data/dm2/bed/blastz.droMoj1
# BLASTZ/CHAIN/NET DROVIR1 -- LOOSER PARAMS (DONE 5/20/05 angie)
# coverage with default blastz params:
#flyBaseGene 22.967%, chainDroVir1Link 47.957%, both 18.585%, cover 80.92%, enrich 1.69x
# OK, let's see what droVir1 looks like at L=4000...
mkdir /cluster/data/dm2/bed/blastz.droVir1.2005-05-19
cd /cluster/data/dm2/bed/blastz.droVir1.2005-05-19
cat << '_EOF_' > DEF
# D. melanogaster vs. D. virilis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. virilis
SEQ2_DIR=/santest/scratch/droVir1/droVir1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/santest/scratch/droVir1/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droVir1.2005-05-19
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /santest/scratch/dm2droVir1 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir1Link
#flyBaseGene 22.967%, chainDroVir1Link 66.657%, both 20.876%, cover 90.90%, enrich 1.36x
rm -f /cluster/data/dm2/bed/blastz.droVir1
ln -s blastz.droVir1.2005-05-19 /cluster/data/dm2/bed/blastz.droVir1
# BLASTZ/CHAIN/NET DP3 -- LOOSER PARAMS (DONE 5/20/05 angie)
# coverage with default blastz params was not too bad but let's see if
# we can do better:
#flyBaseGene 22.967%, chainDp3Link 67.232%, both 20.112%, cover 87.57%, enrich 1.30x
mkdir /cluster/data/dm2/bed/blastz.dp3.2005-05-20
cd /cluster/data/dm2/bed/blastz.dp3.2005-05-20
cat << '_EOF_' > DEF
# D. melanogaster vs. D. pseudoobscura
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. pseudoobscura
SEQ2_DIR=/iscratch/i/dp3/nib
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/dp3/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.dp3.2005-05-20
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /santest/scratch/dm2dp3 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDp3Link
#flyBaseGene 22.967%, chainDp3Link 74.901%, both 21.335%, cover 92.89%, enrich 1.24x
# Again, reasonable extension of coverage without too much crap.
rm -f /cluster/data/dm2/bed/blastz.dp3
ln -s blastz.dp3.2005-05-20 /cluster/data/dm2/bed/blastz.dp3
# BLASTZ/CHAIN/NET DROANA1 -- LOOSER PARAMS (DONE 5/21/05 angie)
# coverage with default blastz params:
#flyBaseGene 22.967%, chainDroAna1Link 73.508%, both 20.981%, cover 91.35%, enrich 1.24x
# OK, let's see what droAna1 looks like at L=4000...
mkdir /cluster/data/dm2/bed/blastz.droAna1.2005-05-20
cd /cluster/data/dm2/bed/blastz.droAna1.2005-05-20
cat << '_EOF_' > DEF
# D. melanogaster vs. D. ananassae
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. ananassae
SEQ2_DIR=/santest/scratch/droAna1/droAna1.2bit
SEQ2_CHUNK=1000000
SEQ2_LAP=10000
SEQ2_LEN=/santest/scratch/droAna1/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droAna1.2005-05-20
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /santest/scratch/dm2droAna1 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroAna1Link
#flyBaseGene 22.967%, chainDroAna1Link 80.519%, both 21.949%, cover 95.57%, enrich 1.19x
rm -f /cluster/data/dm2/bed/blastz.droAna1
ln -s blastz.droAna1.2005-05-20 /cluster/data/dm2/bed/blastz.droAna1
# BLASTZ/CHAIN/NET DROYAK1 -- LOOSER PARAMS (DONE 5/21/05 angie)
# coverage with default blastz params:
#flyBaseGene 22.967%, chainDroYak1Link 90.253%, both 22.586%, cover 98.34%, enrich 1.09x
# OK, let's see what droYak1 looks like at L=4000...
mkdir /cluster/data/dm2/bed/blastz.droYak1.2005-05-20
cd /cluster/data/dm2/bed/blastz.droYak1.2005-05-20
cat << '_EOF_' > DEF
# D. melanogaster vs. D. yakuba
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. yakuba
SEQ2_DIR=/iscratch/i/droYak1/nib
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droYak1/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droYak1.2005-05-20
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /santest/scratch/dm2droYak1 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroYak1Link
#flyBaseGene 22.967%, chainDroYak1Link 91.896%, both 22.736%, cover 98.99%, enrich 1.08x
rm -f /cluster/data/dm2/bed/blastz.droYak1
ln -s blastz.droYak1.2005-05-20 /cluster/data/dm2/bed/blastz.droYak1
# BLASTZ/CHAIN/NET DROSIM1 -- LOOSER PARAMS (DONE 5/21/05 angie)
# coverage with default blastz params:
#flyBaseGene 22.967%, chainDroSim1Link 90.656%, both 22.139%, cover 96.40%, enrich 1.06x
# OK, let's see what droSim1 looks like at L=4000...
mkdir /cluster/data/dm2/bed/blastz.droSim1.2005-05-21
cd /cluster/data/dm2/bed/blastz.droSim1.2005-05-21
cat << '_EOF_' > DEF
# D. melanogaster vs. D. simulans
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. simulans
SEQ2_DIR=/iscratch/i/droSim1/droSim1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droSim1/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droSim1.2005-05-21
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /santest/scratch/dm2droSim1 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroSim1Link
#flyBaseGene 22.967%, chainDroSim1Link 91.756%, both 22.420%, cover 97.62%, enrich 1.06x
rm -f /cluster/data/dm2/bed/blastz.droSim1
ln -s blastz.droSim1.2005-05-21 /cluster/data/dm2/bed/blastz.droSim1
# AUGUSTUS GENE PREDICTIONS (Done 6/1/2005 Andy)
ssh hgwdev
cd /cluster/data/dm2/bed
mkdir augustus
cd augustus/
cat > cleanAugustus.awk << _EOF_
# Add the chrom name to the gene and transcript IDs.
{
ch = \$1
while (match(\$0, /"g[0-9\.]+"/))
{
before = substr(\$0, 1, RSTART)
name = substr(\$0, RSTART+1, RLENGTH)
after = substr(\$0, RSTART + RLENGTH + 1)
\$0 = before "" ch "." name "" after
}
print
}
_EOF_
wget http://augustus.gobics.de/predictions/dm2/dm2.allchr.augustus.gtf.gz
zcat dm2.allchr.augustus.gtf.gz | awk -f cleanAugustus.awk | gzip > dm2.allchr.augustus.clean.gtf.gz
ldHgGene -gtf dm2 augustus dm2.allchr.augustus.clean.gtf.gz
rm dm2.allchr.augustus.gtf.gz
# BLAT FlyBase predicted DM proteins against DM (DONE 2005-06-27 braney)
ssh hgwdev
cd /cluster/data/dm2/bed
mkdir blat.dm2FB
cd blat.dm2FB
pepPredToFa dm2 flyBasePep dm2FB.fa
hgPepPred dm2 generic blastFBPep01 dm2FB.fa
ssh kk
cd /cluster/data/dm2/bed/blat.dm2FB
cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
# << keep emacs happy
chmod +x blatSome
ls -1S /iscratch/i/dm2/nib/*.nib > bug.lst
mkdir fbfas
cd fbfas
faSplit sequence ../dm2FB.fa 5010 fb
cd ..
ls -1S fbfas/*.fa > fb.lst
cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
gensub2 bug.lst fb.lst blatGsub blatSpec
mkdir psl
cd psl
foreach i (`cat ../bug.lst`)
mkdir `basename $i .nib`
end
cd ..
para create blatSpec
para push
# Completed: 63193 of 63193 jobs
# CPU time in finished jobs: 712543s 11875.72m 197.93h 8.25d 0.023 y
# IO & Wait Time: 172878s 2881.30m 48.02h 2.00d 0.005 y
# Average job time: 14s 0.23m 0.00h 0.00d
# Longest finished job: 2172s 36.20m 0.60h 0.03d
# Submission to last job: 3192s 53.20m 0.89h 0.04d
ssh eieio
cd /cluster/data/dm2/bed/blat.dm2FB
pslSort dirs raw.psl /tmp psl/*
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cov90.psl /dev/null
sort -rn cov90.psl | pslUniq stdin dm2FB.psl
pslxToFa dm2FB.psl dm2FB_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
fbName dm2 dm2FB.psl blastFBRef01
ssh hgwdev
cd /cluster/data/dm2/bed/blat.dm2FB
hgsql dm2 < ~/kent/src/hg/lib/blastRef.sql
echo "rename table blastRef to blastFBRef01" | hgsql dm2
echo "load data local infile 'blastFBRef01' into table blastFBRef01" | hgsql dm2
# MAKE Human Proteins track (DONE 06-30-05 braney)
ssh kk
mkdir -p /cluster/data/dm2/blastDb
cd /cluster/data/dm2/blastDb
for i in ../*/*/*_[0-9]*_[0-9]*.fa; do ln -s $i .; done
for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
rm *.fa *.log
ssh kkr1u00
mkdir -p /iscratch/i/dm2/blastDb
cp /cluster/data/dm2/blastDb/* /iscratch/i/dm2/blastDb
iSync 2>&1 > sync.out
mkdir -p /cluster/data/dm2/bed/tblastn.hg17KG
cd /cluster/data/dm2/bed/tblastn.hg17KG
ls -1S /iscratch/i/dm2/blastDb/*.nsq | sed "s/\.nsq//" > target.lst
mkdir kgfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(150000/`wc target.lst | awk "{print \\\$1}"`\)
# 37365/(150000/288) = 71.740800
split -l 72 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
rm -rf /cluster/bluearc/dm2/bed/tblastn.hg17KG/blastOut
mkdir -p /cluster/bluearc/dm2/bed/tblastn.hg17KG/blastOut
ln -s /cluster/bluearc/dm2/bed/tblastn.hg17KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/dm2/jkStuff/subChr.lft warn $f.2
liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/dm2/jkStuff/liftAll.lft warn $f.3
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
chmod +x blastSome
gensub2 target.lst kg.lst blastGsub blastSpec
ssh kk
cd /cluster/data/dm2/bed/tblastn.hg17KG
para create blastSpec
para push
# Completed: 149472 of 149472 jobs
# CPU time in finished jobs: 5837692s 97294.87m 1621.58h 67.57d 0.185 y
# IO & Wait Time: 1404308s 23405.13m 390.09h 16.25d 0.045 y
# Average job time: 48s 0.81m 0.01h 0.00d
# Longest finished job: 1095s 18.25m 0.30h 0.01d
# Submission to last job: 50465s 841.08m 14.02h 0.58d
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
ssh kki
cd /cluster/data/dm2/bed/tblastn.hg17KG
tcsh
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
para create chainSpec
para push
# Completed: 519 of 519 jobs
# CPU time in finished jobs: 3835s 63.91m 1.07h 0.04d 0.000 y
# IO & Wait Time: 5794s 96.57m 1.61h 0.07d 0.000 y
# Average job time: 19s 0.31m 0.01h 0.00d
# Longest finished job: 114s 1.90m 0.03h 0.00d
# Submission to last job: 824s 13.73m 0.23h 0.01d
ssh kkstore01
cd /cluster/data/dm2/bed/tblastn.hg17KG/blastOut
for i in kg??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -u -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/dm2/bed/tblastn.hg17KG/blastHg17KG.psl
cd ..
ssh hgwdev
cd /cluster/data/dm2/bed/tblastn.hg17KG
hgLoadPsl dm2 blastHg17KG.psl
exit
# back to kksilo
rm -rf blastOut
# End tblastn
# BLASTZ/CHAIN/NET TRICAS1 (DONE 7/11/05 Andy)
ssh kkstore01
mkdir /cluster/data/dm2/bed/blastz.triCas1.2005-07-10
cd /cluster/data/dm2/bed/blastz.triCas1.2005-07-10
cat << "_EOF_" > DEF
# D. melanogaster vs. T. castaneum
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. virilis
SEQ2_DIR=/panasas/store/triCas1/triCas1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/panasas/store/triCas1/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.triCas1.2005-07-10
_EOF_
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2triCas1 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir1Link
#flyBaseGene 22.967%, chainDroVir1Link 66.657%, both 20.876%, cover 90.90%, enrich 1.36x
rm -f /cluster/data/dm2/bed/blastz.triCas1
ln -s blastz.triCas1.2005-05-19 /cluster/data/dm2/bed/blastz.triCas1
# GENEID PREDICTIONS FROM IMIM (DONE 7/26/05 angie)
ssh hgwdev
mkdir /cluster/data/dm2/bed/geneid
cd /cluster/data/dm2/bed/geneid
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
wget http://genome.imim.es/genepredictions/D.melanogaster/golden_path_200404/geneidv1.2/$chr.gtf
end
ldHgGene -gtf -genePredExt dm2 geneid *.gtf
# BLASTZ/CHAIN/NET DROERE1 (DONE 8/8/05 angie)
mkdir /cluster/data/dm2/bed/blastz.droEre1.2005-08-08
cd /cluster/data/dm2/bed/blastz.droEre1.2005-08-08
cat << '_EOF_' > DEF
# D. melanogaster vs. D. erecta
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. erecta
SEQ2_DIR=/iscratch/i/droEre1/droEre1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droEre1/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droEre1.2005-08-08
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2droEre1 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroEre1Link
#flyBaseGene 22.967%, chainDroEre1Link 91.103%, both 22.725%, cover 98.95%, enrich 1.09x
rm -f /cluster/data/dm2/bed/blastz.droEre1
ln -s blastz.droEre1.2005-08-08 /cluster/data/dm2/bed/blastz.droEre1
# BLASTZ/CHAIN/NET DROANA2 (DONE 8/8/05 angie)
mkdir /cluster/data/dm2/bed/blastz.droAna2.2005-08-08
cd /cluster/data/dm2/bed/blastz.droAna2.2005-08-08
cat << '_EOF_' > DEF
# D. melanogaster vs. D. ananassae
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. ananassae
SEQ2_DIR=/iscratch/i/droAna2/droAna2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droAna2/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droAna2.2005-08-08
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2droAna2 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroAna2Link
#flyBaseGene 22.967%, chainDroAna2Link 81.385%, both 22.050%, cover 96.01%, enrich 1.18x
rm -f /cluster/data/dm2/bed/blastz.droAna2
ln -s blastz.droAna2.2005-08-08 /cluster/data/dm2/bed/blastz.droAna2
# BLASTZ/CHAIN/NET DROMOJ2 (DONE 8/9/05 angie)
mkdir /cluster/data/dm2/bed/blastz.droMoj2.2005-08-08
cd /cluster/data/dm2/bed/blastz.droMoj2.2005-08-08
cat << '_EOF_' > DEF
# D. melanogaster vs. D. mojavensis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. mojavensis
SEQ2_DIR=/iscratch/i/droMoj2/droMoj2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droMoj2/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droMoj2.2005-08-08
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2droMoj2 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroMoj2Link
#flyBaseGene 22.967%, chainDroMoj2Link 66.524%, both 20.799%, cover 90.56%, enrich 1.36x
rm -f /cluster/data/dm2/bed/blastz.droMoj2
ln -s blastz.droMoj2.2005-08-08 /cluster/data/dm2/bed/blastz.droMoj2
# BLASTZ/CHAIN/NET DROGRI1 (DONE 8/9/05 angie)
mkdir /cluster/data/dm2/bed/blastz.droGri1.2005-08-08
cd /cluster/data/dm2/bed/blastz.droGri1.2005-08-08
cat << '_EOF_' > DEF
# D. melanogaster vs. D. grimshawi
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. grimshawi
SEQ2_DIR=/iscratch/i/droGri1/droGri1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droGri1/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droGri1.2005-08-08
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2droGri1 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroGri1Link
#flyBaseGene 22.967%, chainDroGri1Link 68.520%, both 20.822%, cover 90.66%, enrich 1.32x
rm -f /cluster/data/dm2/bed/blastz.droGri1
ln -s blastz.droGri1.2005-08-08 /cluster/data/dm2/bed/blastz.droGri1
# BLASTZ/CHAIN/NET DROVIR2 (DONE 8/12/05 angie)
mkdir /cluster/data/dm2/bed/blastz.droVir2.2005-08-11
cd /cluster/data/dm2/bed/blastz.droVir2.2005-08-11
cat << '_EOF_' > DEF
# D. melanogaster vs. D. virilis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. virilis
SEQ2_DIR=/iscratch/i/droVir2/droVir2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droVir2/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droVir2.2005-08-11
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2droVir2 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir2Link
#flyBaseGene 22.967%, chainDroVir2Link 67.025%, both 20.901%, cover 91.01%, enrich 1.36x
rm -f /cluster/data/dm2/bed/blastz.droVir2
ln -s blastz.droVir2.2005-08-11 /cluster/data/dm2/bed/blastz.droVir2
# RE-RUN NETTOAXT, AXTTOMAF IN PREP FOR MULTIZ (DONE 8/19/05 angie)
# Kate checked in a fix to netToAxt, to prevent overlapping blocks
# which were causing problems for (display of) multiple alignments.
# Before running multiz, regenerate the axtNet/ and mafNet/ pairwise
# inputs (axtNet downloads should probably be repushed after this).
# The /cluster/data/dm2/bed/blastz.*/axtChain/netChains.csh files
# contain the original axtNet and mafNet commands, adapted here:
ssh kolossus
cd /tmp
cat << '_EOF_' > /cluster/data/dm2/jkStuff/reNetToAxtMaf.csh
#!/bin/csh -efx
foreach oDb (droSim1 droYak1 droEre1 droAna2 dp3 droVir2 droMoj2 droGri1 \
anoGam1 apiMel2 triCas1)
set oSeq = /iscratch/i/$oDb/$oDb.2bit
if (! -e $oSeq) then
set oSeq = /iscratch/i/$oDb/nib
endif
cd /cluster/data/dm2/bed/blastz.$oDb/axtChain
netSplit dm2.$oDb.net.gz net
chainSplit chain dm2.$oDb.all.chain.gz
cd ..
mv axtNet axtNet.bak
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/iscratch/i/dm2/nib $oSeq stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.dm2.$oDb.net.axt.gz
end
mv mafNet mafNet.bak
mkdir mafNet
foreach f (axtNet/*.dm2.$oDb.net.axt.gz)
axtToMaf -tPrefix=dm2. -qPrefix=$oDb. $f \
/cluster/data/dm2/chrom.sizes /cluster/data/$oDb/chrom.sizes \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
end
'_EOF_'
# << for emacs
chmod 775 /cluster/data/dm2/jkStuff/reNetToAxtMaf.csh
/cluster/data/dm2/jkStuff/reNetToAxtMaf.csh >& re.log
echo $status
#0
# Made sure everything ran smoothly. Spot-checked some .bak's vs.
# new to make sure they're of comparable size -- newer can be slightly
# smaller due to lack of dup's, or slightly larger due to more headers.
# Clean up:
foreach oDb (droSim1 droYak1 droEre1 droAna2 dp3 droVir2 droMoj2 droGri1 \
anoGam1 apiMel2 triCas1)
rm -r /cluster/data/dm2/bed/blastz.$oDb/axtChain/{chain,net}
rm -r /cluster/data/dm2/bed/blastz.$oDb/{axt,maf}Net.bak
end
# MULTIZ.V10 12WAY (7 FLIES, MOSQUITO, HONEYBEE) (DONE 8/20/05 angie)
# Since 9way, adding D. erecta, D. grimshawi, T. castaneum;
# dro{Ana,Moj,Vir}1 --> dro{Ana,Moj,Vir}2.
# Tree (12-way):
# ((((((((dm2 droSim1) (droYak1 droEre1)) droAna2) dp3) ((droVir2 droMoj2) droGri1)) anoGam1) apiMel2) triCas1)
ssh kkstore01
mkdir /cluster/data/dm2/bed/multiz12way.2005-08-20
ln -s /cluster/data/dm2/bed/multiz12way.2005-08-20 \
/cluster/data/dm2/bed/multiz12way
cd /cluster/data/dm2/bed/multiz12way
# Setup: Copy pairwise MAF to /san/sanvol1/scratch:
mkdir /san/sanvol1/scratch/flyMultiz12way
foreach db (droSim1 droYak1 droEre1 droAna2 dp3 droVir2 droMoj2 droGri1 \
anoGam1 apiMel2 triCas1)
echo $db
cp -pR /cluster/data/dm2/bed/blastz.$db/mafNet \
/san/sanvol1/scratch/flyMultiz12way/$db
end
ls -lLR /san/sanvol1/scratch/flyMultiz12way
# Make output dir:
mkdir maf
# Create scripts to run multiz.v10 in the right order:
# (Thanks to Andy for the idea to use a subroutine for individual
# multiz invocations)
cat << '_EOF_' > /cluster/bin/scripts/runMultizV10.csh
#!/bin/csh -fex
# Run multiz on two inputs.
set closerMaf = $1
set fartherMaf = $2
set v = $3
set ref = $4
set tmpDir = $5
set outMaf = $6
if ($outMaf == "") then
echo "usage: $0 closerMaf fartherMaf v ref tmpDir outMaf"
exit 1
endif
set MZ10 = /cluster/bin/penn/multiz.v10
set PROJECT = /cluster/bin/penn/maf_project
if (-e $closerMaf && $closerMaf:e == "gz") then
gunzip -c $closerMaf > $tmpDir/closer.maf
set closerMaf = $tmpDir/closer.maf
endif
if (-e $fartherMaf && $fartherMaf:e == "gz") then
gunzip -c $fartherMaf > $tmpDir/farther.maf
set fartherMaf = $tmpDir/farther.maf
endif
set closerSubbed = `echo $closerMaf | sed -e 's@/@_@g;'`
set fartherSubbed = `echo $fartherMaf | sed -e 's@/@_@g;'`
if ( -s $closerMaf && -s $fartherMaf ) then
$MZ10 $fartherMaf $closerMaf $v > $tmpDir/tmp.maf
$PROJECT $tmpDir/tmp.maf $ref > $outMaf
else if ( -s $closerMaf ) then
touch missing.$fartherSubbed
cp $closerMaf $outMaf
else if ( -s $fartherMaf ) then
touch missing.$closerSubbed
cp $fartherMaf $outMaf
else
touch missing.$closerSubbed
touch missing.$fartherSubbed
endif
'_EOF_'
# << for emacs
cat << '_EOF_' > doMultizAll.csh
#!/bin/csh -fex
set chr = $1
set tmpDir = /scratch/flyMultiz12way.$chr
mkdir $tmpDir
set mafScratch = /san/sanvol1/scratch/flyMultiz12way
# Really should write a perl script to take a tree like this and generate
# commands like the ones below:
# ((((((((dm2 droSim1) (droYak1 droEre1)) droAna2) dp3) ((droVir2 droMoj2) droGri1)) anoGam1) apiMel2) triCas1)
/cluster/bin/scripts/runMultizV10.csh \
$mafScratch/droYak1/$chr.maf.gz \
$mafScratch/droEre1/$chr.maf.gz \
0 dm2.$chr $tmpDir $tmpDir/$chr.YakEre.maf
/cluster/bin/scripts/runMultizV10.csh \
$mafScratch/droSim1/$chr.maf.gz \
$tmpDir/$chr.YakEre.maf \
1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEre.maf
/cluster/bin/scripts/runMultizV10.csh \
$tmpDir/$chr.SimYakEre.maf \
$mafScratch/droAna2/$chr.maf.gz \
1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAna.maf
/cluster/bin/scripts/runMultizV10.csh \
$tmpDir/$chr.SimYakEreAna.maf \
$mafScratch/dp3/$chr.maf.gz \
1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPse.maf
/cluster/bin/scripts/runMultizV10.csh \
$mafScratch/droVir2/$chr.maf.gz \
$mafScratch/droMoj2/$chr.maf.gz \
0 dm2.$chr $tmpDir $tmpDir/$chr.VirMoj.maf
/cluster/bin/scripts/runMultizV10.csh \
$tmpDir/$chr.SimYakEreAnaPse.maf \
$tmpDir/$chr.VirMoj.maf \
1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMoj.maf
/cluster/bin/scripts/runMultizV10.csh \
$tmpDir/$chr.SimYakEreAnaPseVirMoj.maf \
$mafScratch/droGri1/$chr.maf.gz \
1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMojGri.maf
/cluster/bin/scripts/runMultizV10.csh \
$tmpDir/$chr.SimYakEreAnaPseVirMojGri.maf \
$mafScratch/anoGam1/$chr.maf.gz \
1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMojGriAno.maf
/cluster/bin/scripts/runMultizV10.csh \
$tmpDir/$chr.SimYakEreAnaPseVirMojGriAno.maf \
$mafScratch/apiMel2/$chr.maf.gz \
1 dm2.$chr $tmpDir $tmpDir/$chr.SimYakEreAnaPseVirMojGriAnoApi.maf
/cluster/bin/scripts/runMultizV10.csh \
$tmpDir/$chr.SimYakEreAnaPseVirMojGriAnoApi.maf \
$mafScratch/triCas1/$chr.maf.gz \
1 dm2.$chr $tmpDir maf/$chr.maf
rm -f $tmpDir/*.maf
rmdir $tmpDir
'_EOF_'
# << for emacs
chmod 775 /cluster/bin/scripts/runMultizV10.csh
chmod 775 doMultizAll.csh
awk '{print "./doMultizAll.csh " $1;}' /cluster/data/dm2/chrom.sizes \
> jobs.lst
# Run on small cluster
ssh kki
cd /cluster/data/dm2/bed/multiz12way
para make jobs.lst
para time
#Completed: 13 of 13 jobs
#Average job time: 124s 2.07m 0.03h 0.00d
#Longest finished job: 538s 8.97m 0.15h 0.01d
#Submission to last job: 15659s 260.98m 4.35h 0.18d
ls -1 missing*
#ls: No match.
# make /gbdb/ links to 12way maf files:
ssh hgwdev
mkdir -p /gbdb/dm2/multiz12way/maf/multiz12way
ln -s /cluster/data/dm2/bed/multiz12way/maf/chr*.maf \
/gbdb/dm2/multiz12way/maf/multiz12way/
# load into database
cd /tmp
hgLoadMaf -warn dm2 multiz12way \
-pathPrefix=/gbdb/dm2/multiz12way/maf/multiz12way
# load summary table to replace pairwise
cat /cluster/data/dm2/bed/multiz12way/maf/chr*.maf \
| nice hgLoadMafSummary dm2 multiz12waySummary stdin
# Dropped unused indexes (2006-05-09 kate)
# NOTE: this is not required in the future, as the loader
# has been fixed to not generate these indexes
hgsql dm2 -e "alter table multiz9waySummary drop index chrom_2"
hgsql dm2 -e "alter table multiz9waySummary drop index chrom_3"
# put 12way MAF out for download
ssh kolossus
cd /cluster/data/dm2/bed/multiz12way
mkdir mafDownload
foreach f (maf/*.maf)
nice gzip -c $f > mafDownload/$f:t.gz
end
cd mafDownload
md5sum *.maf.gz > md5sum.txt
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/dm2/multiz12way
ln -s /cluster/data/dm2/bed/multiz12way/mafDownload/{*.maf.gz,md5sum.txt} \
/usr/local/apache/htdocs/goldenPath/dm2/multiz12waya
# make a README.txt
# Cleanup
rm -rf /san/sanvol1/scratch/flyMultiz12way/
# PHASTCONS 12WAY WITH METHODS FROM PAPER (DONE 9/20/05 angie)
# ((((((((dm2,droSim1),(droYak1,droEre1)),droAna2),dp3),((droVir2,droMoj2),droGri1)),anoGam1),apiMel2),triCas1)
ssh kkstore01
mkdir -p /san/sanvol1/scratch/dm2/chrom
cp -p /cluster/data/dm2/?{,?}/chr*.fa /san/sanvol1/scratch/dm2/chrom/
# Split chrom fa into smaller windows for phastCons:
ssh pk
mkdir /cluster/data/dm2/bed/multiz12way/phastCons
mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.split
cd /cluster/data/dm2/bed/multiz12way/phastCons/run.split
set WINDOWS = /san/sanvol1/scratch/dm2/phastCons/WINDOWS
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.sh
#!/bin/csh -ef
set PHAST=/cluster/bin/phast
set FA_SRC=/san/sanvol1/scratch/dm2/chrom
set WINDOWS=/san/sanvol1/scratch/dm2/phastCons/WINDOWS
set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
-O dm2,droSim1,droYak1,droEre1,droAna2,dp3,droVir2,droMoj2,droGri1,anoGam1,apiMel2,triCas1 \
-w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
chmod a+x doSplit.sh
rm -f jobList
foreach file (/cluster/data/dm2/bed/multiz12way/maf/*.maf)
if (-s $file) then
echo "doSplit.sh {check in exists+ $file}" >> jobList
endif
end
para make jobList
para time
#Completed: 13 of 13 jobs
#Average job time: 81s 1.35m 0.02h 0.00d
#Longest finished job: 219s 3.65m 0.06h 0.00d
#Submission to last job: 219s 3.65m 0.06h 0.00d
############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
# Use consEntropy --NH to make it suggest a --expected-lengths param
# that we should try next. Adam ran this on hg17 to find out the
# total entropy of the hg17 model:
# consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
#Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
#Relative entropy: H=0.608216 bits/site
#Required length: N=16.085437 sites
#Total entropy: NH=9.783421 bits
# Our target is that NH result: 9.7834 bits.
# Use phyloFit to make an initial model:
ssh kolossus
cd /cluster/data/dm2/bed/multiz12way/phastCons
/cluster/bin/phast/msa_view ../maf/chr{2L,3R,4,X}.maf \
--aggregate dm2,droSim1,droYak1,droEre1,droAna2,dp3,droVir2,droMoj2,droGri1,anoGam1,apiMel2,triCas1 \
-i MAF -o SS > all.ss
/cluster/bin/phast/phyloFit all.ss \
--tree "((((((((dm2,droSim1),(droYak1,droEre1)),droAna2),dp3),((droVir2,droMoj2),droGri1)),anoGam1),apiMel2),triCas1)" \
-i SS --out-root starting-tree
cat starting-tree.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -496988087.624743
#BACKGROUND: 0.288383 0.211930 0.211868 0.287819
#RATE_MAT:
# -0.922499 0.207559 0.437636 0.277304
# 0.282436 -1.104124 0.227654 0.594033
# 0.595687 0.227721 -1.105784 0.282377
# 0.277848 0.437405 0.207862 -0.923114
#TREE: ((((((((dm2:0.028572,droSim1:0.042955):0.010136,(droYak1:0.058491,droEre1:0.056179):0.048245):0.060386,droAna2:0.259454):0.024698,dp3:0.263678):0.012004,((droVir2:0.126666,droMoj2:0.147640):0.138076,droGri1:0.248877):0.126753):0.094211,anoGam1:0.382011):0.052522,apiMel2:0.456711):0.211776,triCas1:0.211776);
# also get GC content from model -- if similar enough, no need to extract it
# separately above.
awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod
#0.423798
# OK, use .424 for --gc below.
# Use the values of --target-coverage and --expected-lengths from the
# last iteration of the 9way run, 0.398 and 23.7.
# Multiply each subst rate on the TREE line by 3.5 which is roughly the
# ratio of noncons to cons in
# /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod
/cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \
> starting-tree.noncons.mod
/cluster/bin/phast/consEntropy --NH 9.7834 0.398 23.7 \
starting-tree{,.noncons}.mod
#( Solving for new omega: 23.700000 24.343724 24.335125 )
#Transition parameters: gamma=0.398000, omega=23.700000, mu=0.042194, nu=0.027896
#Relative entropy: H=2.651656 bits/site
#Expected min. length: L_min=3.660302 sites
#Expected max. length: L_max=3.808284 sites
#Phylogenetic information threshold: PIT=L_min*H=9.705862 bits
#Recommended expected length: omega=24.335125 sites (for L_min*H=9.783400)
# OK, use --expected-lengths 24.3.
############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ###########
# We're here because the actual target coverage was not satisfactory,
# so we're changing the --target-coverage param. Given that we're
# changing that, take a guess at how we should change --expected-lengths
# in order to also hit the total entropy target.
cd /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate
# SECOND ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.393 24.1 ave.{cons,noncons}.mod
#Recommended expected length: omega=23.801397 sites (for L_min*H=9.783400)
# OK, --expected-lengths 23.8
# THIRD ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.45 32.2 ave.{cons,noncons}.mod
#Recommended expected length: omega=27.095993 sites (for L_min*H=9.783400)
# ==> 27.1
# Now set up cluster job to estimate model parameters given free params
# --target-coverage and --expected-lengths and the data.
ssh pk
mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate
cd /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate
# FIRST ITERATION: Use ../starting-tree.mod:
cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.424 --nrates 1,1 \
--no-post-probs --ignore-missing \
--expected-lengths 24.3 --target-coverage 0.398 \
--quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
# SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.424 --nrates 1,1 \
--no-post-probs --ignore-missing \
--expected-lengths 23.8 --target-coverage 0.393 \
--quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
chmod a+x doEstimate.sh
rm -fr LOG TREES
mkdir -p LOG TREES
rm -f jobList
foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
set root = $f:t:r:r
echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
end
para make jobList
para time
#Completed: 139 of 139 jobs
#Average job time: 6022s 100.37m 1.67h 0.07d
#Longest finished job: 14397s 239.95m 4.00h 0.17d
#Submission to last job: 14419s 240.32m 4.01h 0.17d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ssh kolossus
cd /cluster/data/dm2/bed/multiz12way/phastCons/run.estimate
ls -1 TREES/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
--output-average ave.cons.mod > cons_summary.txt
ls -1 TREES/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
--output-average ave.noncons.mod > noncons_summary.txt
grep TREE ave*.mod
# FIRST ITERATION:
#ave.cons.mod:TREE: ((((((((dm2:0.015283,droSim1:0.028475):0.004064,(droYak1:0.031904,droEre1:0.033703):0.024654):0.021822,droAna2:0.128599):0.008507,dp3:0.135654):0.003923,((droVir2:0.064832,droMoj2:0.074814):0.064856,droGri1:0.130122):0.056057):0.035045,anoGam1:0.225562):0.032118,apiMel2:0.207067):0.100280,triCas1:0.100280);
#ave.noncons.mod:TREE: ((((((((dm2:0.049188,droSim1:0.086476):0.013855,(droYak1:0.101928,droEre1:0.105461):0.080386):0.076467,droAna2:0.435801):0.030536,dp3:0.463395):0.014225,((droVir2:0.218767,droMoj2:0.254568):0.225537,droGri1:0.443544):0.200405):0.126301,anoGam1:0.786660):0.117326,apiMel2:0.720501):0.348402,triCas1:0.348402);
# SECOND ITERATION:
#ave.cons.mod:TREE: ((((((((dm2:0.015261,droSim1:0.028419):0.004049,(droYak1:0.031866,droEre1:0.033655):0.024630):0.021816,droAna2:0.128628):0.008503,dp3:0.135728):0.003923,((droVir2:0.064840,droMoj2:0.074832):0.064900,droGri1:0.130188):0.056115):0.035117,anoGam1:0.225895):0.032170,apiMel2:0.207375):0.100397,triCas1:0.100397);
#ave.noncons.mod:TREE: ((((((((dm2:0.049200,droSim1:0.086486):0.013823,(droYak1:0.101982,droEre1:0.105514):0.080420):0.076507,droAna2:0.436376):0.030537,dp3:0.464200):0.014233,((droVir2:0.219076,droMoj2:0.254936):0.225862,droGri1:0.444226):0.200721):0.126658,anoGam1:0.788468):0.117561,apiMel2:0.722456):0.349173,triCas1:0.349173);
# THIRD ITERATION:
cat cons_summary.txt
# look over the files cons_summary.txt and noncons_summary.txt.
# The means and medians should be roughly equal and the stdevs
# should be reasonably small compared to the means, particularly
# for rate matrix parameters (at bottom) and for branches to the
# leaves of the tree. The stdevs may be fairly high for branches
# near the root of the tree; that's okay. Some min values may be
# 0 for some parameters. That's okay, but watch out for very large
# values in the max column, which might skew the mean. If you see
# any signs of bad outliers, you may have to track down the
# responsible .mod files and throw them out. I've never had to do
# this; the estimates generally seem pretty well behaved.
# NOTE: Actually, a random sample of several hundred to a thousand
# alignment fragments (say, a number equal to the number of
# available cluster nodes) should be more than adequate for
# parameter estimation. If pressed for time, use this strategy.
# FIRST ITERATION ONLY:
# Check the total entropy figure to see if we're way off.
# This takes an hour for 12way (exponential in #species) and has never
# produced a different answer from the input after the first iteration,
# so do this for the first iteration only:
/cluster/bin/phast/consEntropy --NH 9.7834 0.398 24.1 ave.{cons,noncons}.mod
#Recommended expected length: omega=24.095425 sites (for L_min*H=9.783400)
# ==> use 24.1 instead of 24.3 below.
# Now we are ready to set up the cluster job for computing the
# conservation scores and predicted elements. The we measure the
# conserved elements coverage, and if that's not satisfactory then we
# adjust parameters and repeat.
ssh pk
mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.phast
cd /cluster/data/dm2/bed/multiz12way/phastCons/run.phast
cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
--expected-lengths 23.8 --target-coverage 0.393 \
--quiet --seqname $chr --idpref $pref \
--viterbi /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/$pref.bed --score \
--require-informative 0 \
> $tmpfile
gzip -c $tmpfile > /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
chmod a+x doPhastCons.sh
rm -fr /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
mkdir -p /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
rm -f jobList
foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
echo doPhastCons.sh $f >> jobList
end
para make jobList
para time
#Completed: 139 of 139 jobs
#Average job time: 15s 0.25m 0.00h 0.00d
#Longest finished job: 19s 0.32m 0.01h 0.00d
#Submission to last job: 45s 0.75m 0.01h 0.00d
# back on kolossus:
# combine predictions and transform scores to be in 0-1000 interval
cd /cluster/data/dm2/bed/multiz12way/phastCons
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
/san/sanvol1/scratch/dm2/phastCons/ELEMENTS/*.bed \
| /cluster/bin/scripts/lodToBedScore > all.bed
ssh hgwdev
# Now measure coverage of CDS by conserved elements.
# We want the "cover" figure to be close to 68.9%.
cd /cluster/data/dm2/bed/multiz12way/phastCons
featureBits -enrichment dm2 flyBaseGene:cds all.bed
# FIRST ITERATION: just a bit too high, reduce target-coverage a bit:
#flyBaseGene:cds 16.567%, all.bed 35.522%, both 11.442%, cover 69.07%, enrich 1.94x
# SECOND ITERATION: OK, this is the first time I've seen a reduction in
# target-coverage result in an increase in CDS coverage. Dang. Well,
# I'll take it as a sign that we're close enough (esp. since each iteration
# take a half-day now).
#flyBaseGene:cds 16.567%, all.bed 35.514%, both 11.446%, cover 69.09%, enrich 1.95x
# Having met the CDS coverage target, load up the results.
hgLoadBed dm2 phastConsElements12way all.bed
# Create wiggle
ssh pk
mkdir /cluster/data/dm2/bed/multiz12way/phastCons/run.wib
cd /cluster/data/dm2/bed/multiz12way/phastCons/run.wib
rm -rf /san/sanvol1/scratch/dm2/phastCons/wib
mkdir -p /san/sanvol1/scratch/dm2/phastCons/wib
cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /san/sanvol1/scratch/dm2/phastCons/wib
zcat `ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \
| sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
chmod a+x doWigEncode
rm -f jobList
foreach chr (`ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS \
| awk -F\. '{print $1}' | sort -u`)
echo doWigEncode $chr >> jobList
end
para make jobList
para time
#Completed: 13 of 13 jobs
#Average job time: 10s 0.16m 0.00h 0.00d
#Longest finished job: 24s 0.40m 0.01h 0.00d
#Submission to last job: 99s 1.65m 0.03h 0.00d
# back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want
# the raw scores) from san/sanvol1
cd /cluster/data/dm2/bed/multiz12way/phastCons
rm -rf wib POSTPROBS
rsync -av /san/sanvol1/scratch/dm2/phastCons/wib .
rsync -av /san/sanvol1/scratch/dm2/phastCons/POSTPROBS .
# load wiggle component of Conservation track
ssh hgwdev
mkdir /gbdb/dm2/multiz12way/wib
cd /cluster/data/dm2/bed/multiz12way/phastCons
chmod 775 . wib
chmod 664 wib/*.wib
ln -s `pwd`/wib/*.wib /gbdb/dm2/multiz12way/wib/
hgLoadWiggle dm2 phastCons12way \
-pathPrefix=/gbdb/dm2/multiz12way/wib wib/*.wig
rm wiggle.tab
# and clean up san/sanvol1.
rm -r /san/sanvol1/scratch/dm2/phastCons/{ELEMENTS,POSTPROBS,wib}
rm -r /san/sanvol1/scratch/dm2/chrom
# Offer raw scores for download since fly folks are likely to be interested:
ssh kkstore01
cd /cluster/data/dm2/bed/multiz12way/phastCons/POSTPROBS
mkdir ../postprobsDownload
foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`)
zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \
> ../postprobsDownload/$chr.pp.gz
end
cd ../postprobsDownload
md5sum *.gz > md5sum.txt
# Make a README.txt there too.
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/dm2/phastCons12way
cd /usr/local/apache/htdocs/goldenPath/dm2/phastCons12way
ln -s /cluster/data/dm2/bed/multiz12way/phastCons/postprobsDownload/* .
# MAKE 11.OOC FILE FOR BLAT (DONE 9/19/05 angie)
# Use -repMatch=100 (based on size -- for human we use 1024, and
# fly size is ~4.4% of human judging by gapless dm1 genome size from
# featureBits -- we would use 45, but bump that up a bit to be more
# conservative).
ssh kolossus
blat /cluster/data/dm2/dm2.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/dm2/11.ooc -repMatch=100
#Wrote 3031 overused 11-mers to /cluster/bluearc/dm2/11.ooc
# BAC END PAIRS (DONE 9/20/05 angie)
# Excerpt from an email from Roger Hoskins, 9/17/05:
# > There are two end-sequenced BAC libraries. The RPCI-98 BAC library,
# > produced by Pieter and Kazu, is called "BDGP" on the Genoscope
# > website; individual clones are named BACR01A01 - BACR48H12 (96-well
# > format; R stands for EcoRI). The DrosBAC library, produced in France
# > for the European Drosophila Genome Project, is called "EDGP" on the
# > Genoscope website; individual clones are named BACN01A01 - BACN47H12
# > and BACH48A01-BACH61H12 (N stands for NdeII; H stands for HinDIII).
# >
# > It's not necessary to query GenBank for these data. The BAC end
# > sequences are available from the Genoscope web site, Resources page:
# > http://www.genoscope.cns.fr/externe/English/Outils/
# > For RPCI-98:
# > http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AL
# > For DrosBAC (EDGP):
# > http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AM
# >
# > Note that if you decide to do this, it would be important to have a
# > fairly high stringency imposed on the sequence alignments for mapping.
# > BACs only go to one location in the genome; paired ends must point
# > toward each other and be separated by an interval of say 50kb to 250
ssh kolossus
# fileserver is unhappy... so work on bluearc for now, copy over later:
mkdir -p /cluster/bluearc/dm2/bed/bacends
cd /cluster/bluearc/dm2/bed/bacends
wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AL
wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AM
faSize *
#40016446 bases (3778317 N's 36238129 real 33260718 upper 2977391 lower) in 41500 sequences in 2 files
#Total size: mean 964.3 sd 202.7 min 1 (AL0AA030DD12A1) max 1373 (AM0AA015AE07BP1) median 1009
#N count: mean 91.0 sd 80.8
#U count: mean 801.5 sd 207.4
#L count: mean 71.7 sd 45.5
cat banque_Projet_A* > bacends.fa
# Edit the <!DOCTYPE.../HTML> part out of bacends.fa.
# Fly is much smaller than human so we can get away with one job
# per chrom, no splitting...
ssh kki
cd /cluster/bluearc/dm2/bed/bacends
mkdir out
echo bacends.fa > bacends.lst
ls -1S /iscratch/i/dm2/nib/* > dm2.lst
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/blat -noHead $(path1) $(path2) -ooc=/cluster/bluearc/dm2/11.ooc {check out exists out/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
gensub2 dm2.lst bacends.lst gsub jobList
para make jobList
para time
#Completed: 13 of 13 jobs
#Average job time: 79s 1.32m 0.02h 0.00d
#Longest finished job: 360s 6.00m 0.10h 0.00d
#Submission to last job: 360s 6.00m 0.10h 0.00d
# Back on kolossus:
cat out/*.psl \
| pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
stdin bacEnds.unpaired.psl /dev/null
#..Processed 318892 alignments
# Translate the particular style of FASTA headers into bacEndPairs.txt:
perl -we 'while (<>) { \
if (/^>(\w+)\s+BAC=(\w+)\s+END=(\w+)/) { \
if (! defined $bacs{$2}) { \
$bacs{$2}->[0] = ""; $bacs{$2}->[1] = ""; \
} \
if ($3 eq "T7") { \
$bacs{$2}->[0] .= "$1,"; \
} else { \
$bacs{$2}->[1] .= "$1,"; \
} \
} elsif (/^>/) { \
die "parse:\n$_\n"; \
} \
} \
foreach $bac (keys %bacs) { \
$t7 = $bacs{$bac}->[0]; $t7 = "," if ($t7 eq ""); \
$other = $bacs{$bac}->[1]; $other = "," if ($other eq ""); \
print "$t7\t$other\t$bac\n"; \
}' bacends.fa \
> bacEndPairs.txt
# Roger's suggested 50kb-250kb range lost some (especially on the short
# side), so I ended up going with Terry's range for human, 35kb-350kb:
pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 \
-slop -short -long -orphan -mismatch -verbose \
bacEnds.unpaired.psl bacEndPairs.txt all_bacends bacEnds
wc -l bacEnds.*
# 21 bacEnds.long
# 180 bacEnds.mismatch
# 7037 bacEnds.orphan
# 11417 bacEnds.pairs
# 37 bacEnds.short
# 126 bacEnds.slop
# 118041 bacEnds.unpaired.psl
# (With Roger's sugg. 50kb-250kb, there were 575 short, 53 long)
# Filter by score and sort by {chrom,chromStart}:
awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
cat bacEnds.{slop,short,long,mismatch,orphan} \
| awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
wc -l *.bed
# 11410 bacEndPairs.bed
# 4842 bacEndPairsBad.bed
# Filter the alignments into those we'll need in the all_bacends table:
extractPslLoad -noBin bacEnds.unpaired.psl \
bacEndPairs.bed bacEndPairsBad.bed \
| sorttbl tname tstart | headchg -del \
> bacEnds.load.psl
cd ..
mv /cluster/bluearc/dm2/bed/bacends /cluster/data/dm2/bed/bacends
# load into database
ssh hgwdev
cd /cluster/data/dm2/bed/bacends
hgLoadBed dm2 bacEndPairs bacEndPairs.bed \
-notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed dm2 bacEndPairsBad bacEndPairsBad.bed \
-notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
hgLoadPsl dm2 -table=all_bacends bacEnds.load.psl
# load BAC end sequences
mkdir -p /gbdb/dm2/bacends
ln -s /cluster/data/dm2/bed/bacends/bacends.fa /gbdb/dm2/bacends/
hgLoadSeq dm2 /gbdb/dm2/bacends/bacends.fa
##########################################################################
# NSCAN track - (2005-09-29 markd) loaded proteins 2005-10-13
cd /cluster/data/dm2/bed/nscan/
# obtained NSCAN-EST predictions from michael brent's group at WUSTL
wget http://genome.cse.wustl.edu/predictions/Drosophila/Dm2_09_19_05/Dm2_09_19_05.tar.gz
tar -zxf Dm2_09_19_05.tar.gz
# change protein fasta file to have transcript id in header
foreach f (chr_ptx/*.ptx)
awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
end
ldHgGene -gtf -genePredExt dm2 nscanGene chr_gtf/chr*.gtf
hgPepPred mm6 generic nscanPep chr_ptx/chr*.fix
rm -rf chr_* *.tab
# update trackDb; need a dm2-specific page to describe informants
mouse/dm2/nscanGene.html
mouse/dm2/trackDb.ra
# BLASTZ/CHAIN/NET DROYAK2 (DONE 11/23/05 angie)
mkdir /cluster/data/dm2/bed/blastz.droYak2.2005-11-23
cd /cluster/data/dm2/bed/blastz.droYak2.2005-11-23
cat << '_EOF_' > DEF
# D. melanogaster vs. D. yakuba
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. yakuba
SEQ2_DIR=/iscratch/i/droYak2/droYak2.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droYak2/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droYak2.2005-11-23
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /cluster/bluearc/dm2droYak2 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroYak2Link
#flyBaseGene 22.967%, chainDroYak2Link 91.943%, both 22.758%, cover 99.09%, enrich 1.08x
rm -f /cluster/data/dm2/bed/blastz.droYak2
ln -s blastz.droYak2.2005-11-23 /cluster/data/dm2/bed/blastz.droYak2
# BLASTZ/CHAIN/NET DROPER1 (DONE 11/29/05 angie)
mkdir /cluster/data/dm2/bed/blastz.droPer1.2005-11-28
cd /cluster/data/dm2/bed/blastz.droPer1.2005-11-28
cat << '_EOF_' > DEF
# D. melanogaster vs. D. persimilis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. persimilis
SEQ2_DIR=/iscratch/i/droPer1/droPer1.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droPer1/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droPer1.2005-11-28
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2droPer1 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroPer1Link
#flyBaseGene 22.967%, chainDroPer1Link 74.744%, both 21.320%, cover 92.83%, enrich 1.24x
rm -f /cluster/data/dm2/bed/blastz.droPer1
ln -s blastz.droPer1.2005-11-28 /cluster/data/dm2/bed/blastz.droPer1
# BLASTZ/CHAIN/NET DROSEC1 (DONE 12/5/05 angie)
mkdir /cluster/data/dm2/bed/blastz.droSec1.2005-11-30
cd /cluster/data/dm2/bed/blastz.droSec1.2005-11-30
cat << '_EOF_' > DEF
# D. melanogaster vs. D. sechellia
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. sechellia
SEQ2_DIR=/iscratch/i/droSec1/droSec1.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droSec1/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droSec1.2005-11-30
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2droSec1 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroSec1Link
#flyBaseGene 22.967%, chainDroSec1Link 92.860%, both 22.759%, cover 99.09%, enrich 1.07x
rm -f /cluster/data/dm2/bed/blastz.droSec1
ln -s blastz.droSec1.2005-11-30 /cluster/data/dm2/bed/blastz.droSec1
# FLYBASE 4.2 ANNOTATIONS (DONE (except for contacting flybase-help) 1/13/06 angie)
# RELOADED flyBaseNoncoding and flyBase2004Xref 5/30/06 after changing
# extractGenes to catch some new RNA keywords after user reported loss of
# many mir's. (They were marked "ncRNA" in 4.1 but "miRNA" in 4.2.)
# Now extractGenes catches [a-z]+RNA.
# RELOADED flyBaseGene 7/12/06 after user reported a problem with split
# start codons -- cause by error in extractGenes.pl that caused us not to
# notice a single-base overlap between CDS range and exon range.
ssh kkstore01
mkdir /cluster/data/dm2/bed/flybase4.2
cd /cluster/data/dm2/bed/flybase4.2
foreach c (2L 2R 3L 3R 4 X)
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.2_20050909/gff/dmel-$c-r4.2.1.gff.gz
end
zcat *.gff.gz > flybase.gff3
# What data sources are represented in this file?
grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c
# excerpt (many other sources, including blastx:... , sim4:... and
# tblastx:...; also various other types for source "."):
19178 . CDS
63583 . exon
14380 . gene
19178 . mRNA
66 . miRNA
113 . ncRNA
49 . pseudogene
96 . rRNA
46 . snRNA
63 . snoRNA
294 . tRNA
36921 . transcription_start_site
6006 . transposable_element
33268 . transposable_element_insertion_site
# What keywords are defined in the 9th field?
grep -v '^#' flybase.gff3 \
| awk '{print $9;}' | perl -wpe 's/=[^;]+;/\n/g; s/=.*$//;' \
| sort | uniq -c
117483 Dbxref
3111982 ID
1428395 Name
847565 Parent
20326 cyto_range
46928 dbxref_2nd
15705 gbunit
6 species
92077 synonym
10032 synonym_2nd
# Wow... for the first time, it looks like the same flavor of GFF3
# has been used as last time!
cp ../flybase4.1/extractGenes.pl .
extractGenes.pl flybase.gff3
# Get predicted proteins (for main annotations only)
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.2_20050909/fasta/dmel-all-translation-r4.2.1.fasta
perl -wpe 's/^(>\w+)-P(\w)/$1-R$2/' dmel-all-translation-r4.2.1.fasta \
> flybasePep.fa
ssh hgwdev
cd /cluster/data/dm2/bed/flybase4.2
# Protein-coding genes:
ldHgGene -gtf dm2 flyBaseGene flybase.gtf
hgPepPred dm2 generic flyBasePep flybasePep.fa
# Noncoding genes:
hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed
# Cross-referencing info for both coding and noncoding:
hgLoadSqlTab dm2 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \
flyBase2004Xref.tab
# Some featureBits comparisons with refGene which shoudl be pretty much
# like version 4.0 with a few noncoding genes tossed in... but this is
# a bigger difference than we expect:
featureBits dm2 refGene \!flyBaseGene -minSize=1000 -bed=stdout
#516631 bases of 131698467 (0.392%) in intersection
# Hmmm, flyBaseGene has nothing on the last 175,000bp of chrX!:
# chrX:22,039,591-22,224,390
# nor the first 40,000bp of chrX:
# chrX:1-102,664
# nor the last 270,000bp of chr3L:
# chr3L:23,273,167-23,771,897
# nor the first 500,000bp of chr2R:
# chr2R:1-600,000
# Also quite a few on the chr*h but I won't look at all those.
#TODO: contact flybase-help about the lack of annotations in those ranges...
# This shows only the dropped isoforms/duplicates (ignores noncoding):
featureBits dm2 refGene \!flyBaseGene \!flyBaseNoncoding -minSize=1000 \
-bed=stdout
#chr3R 15620310 15621493 chr3R.1
#chr2L 4698094 4700940 chr2L.1
#chrX 3638700 3639895 chrX.1
#474282 bases of 131698467 (0.360%) in intersection
# add upstream* downloadable files
cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips
foreach size (1000 2000 5000)
echo upstream$size
nice featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \
| nice gzip -c > upstream$size.fa.gz
end
nice md5sum *.zip *.gz > md5sum.txt
# 2/6/06: fill in missing symbol values with accession so hgNear
# has something to draw a link on...
hgsql dm2 -e 'update flyBase2004Xref set symbol = name where symbol = ""'
# 7/12/06: reload flyBaseGene after correcting extractGenes.pl.
ldHgGene -gtf dm2 flyBaseGene flybase.gtf
# FLYBASE ACODE CROSS-REFERENCING DATA (DONE 1/27/06 angie)
# 10/11/06 -- reloaded after updating flyBaseGene, but no change.
ssh hgwdev
mkdir /cluster/data/dm2/bed/flyBase
cd /cluster/data/dm2/bed/flyBase
wget -O genes.txt.061011 \
ftp://flybase.bio.indiana.edu/flybase/genes/genes.txt
# Had to edit genes.txt.060126 to get around these two lines that didn't
# start with acode symbols:
#line 1609033
#Spiracle sheaths dark.
#line 3664838 (3664839 of original)
#Ommatidia are no
hgFlyBase dm2 genes.txt.061011 -geneTable=flyBaseGene
#Processed 57068 records in 4067314 lines
# MAP UNIPROT IDS TO CG*-R* IDS (DONE 1/27/06 angie - REDONE 10/11/06)
# The UniProt.description table actually contains the most specific
# mapping of UniProt IDs to CG*-R* transcript IDs. Use that when
# available, and otherwise use the more generic mapping to gene IDs
# that we extracted from the acode into fbUniProt.
ssh hgwdev
mkdir /cluster/data/dm2/bed/uniProtToFlyBase
cd /cluster/data/dm2/bed/uniProtToFlyBase
set taxon = `hgsql -N uniProt -e 'select id from taxon where binomial = "Drosophila melanogaster";'`
# Grab UniProt descriptions of all D. mel. proteins so we can parse out
# CG*-R* transcript IDs (and CG* gene IDs when -R* not specified).
hgsql -N uniProt -e \
'select description.* from description,accToTaxon where accToTaxon.taxon = '$taxon' and accToTaxon.acc = description.acc' \
> uniProt.description.txt
# fbUniProt maps FBgn* IDs to multiple UniProt IDs; use flyBase2004Xref
# to map FBgn* to CG* (actually CG*-R* which implies more precision than
# there really is).
hgsql -N dm2 -e \
'select name,uniProtId from flyBase2004Xref,fbUniProt where fbgn=geneId' \
| sort > flyBaseMapping.txt
# Grab the complete list of CG*-R* transcript IDs:
hgsql -N dm2 -e 'select name from flyBaseGene' | sort > transcriptIds.txt
ssh kolossus
cd /cluster/data/dm2/bed/uniProtToFlyBase
# Wrote a perl script, gleanDescription.pl, to parse the various patterns
# used to encode real mappings (not mappings to homologs or things that
# look like CG*-P? IDs but aren't really).
chmod a+x gleanDescription.pl
./gleanDescription.pl uniProt.description.txt | sort > uniProtMapping.txt
# Look for duplicates:
awk '{print $1;}' uniProtMapping.txt | uniq -c | awk '$1 != 1 {print;}'
# 2 CG17131-RA
# 5 CG32637-RA
grep CG17131-PA uniProt.description.txt
#Q8MS37 RE15579p (CG17131-PA).
#Q7PLU3 CG17131-PA.3.
# I looked up both Q8MS37 and Q7PLU3 on http://www.pir.uniprot.org/
# and both do mention CG17131-PA... Q8MS37 has been updated more recently
# so I'll just delete the Q7PLU3 mapping.
grep CG32637-PA uniProt.description.txt
#Q5K354 CG32637-PA protein (Fragment).
#Q5K356 CG32637-PA protein (Fragment).
#Q5K357 CG32637-PA protein (Fragment).
#Q5K361 CG32637-PA protein (Fragment).
#Q8IR71 CG32637-PA.
# Here it's more clear-cut -- delete the fragments.
egrep -v '(Q7PLU3|Q5K354|Q5K356|Q5K357|Q5K361)' uniProtMapping.txt \
> tmp; mv tmp uniProtMapping.txt
# Wrote a perl script, vagueDescription.pl, to parse out CG* without
# isoform specs -- can use those as backups when we don't have a more
# specific mapping for a transcript.
chmod a+x vagueDescription.pl
./vagueDescription.pl uniProt.description.txt | sort \
> uniProtMappingVague.txt
# Write a perl script, combineMappings.pl, to add any additional mappings
# available from acode to the UniProt mappings.
chmod a+x combineMappings.pl
./combineMappings.pl transcriptIds.txt \
uniProtMapping.txt uniProtMappingVague.txt flyBaseMapping.txt \
> flyBaseToUniProt.txt
# Make a 2-column version for loading:
awk '{print $1 "\t" $2;}' flyBaseToUniProt.txt > flyBaseToUniProtLoad.txt
# Load into a genericAlias-type table (this ignores the third column).
ssh hgwdev
cd /cluster/data/dm2/bed/uniProtToFlyBase
sed -e 's/genericAlias/flyBaseToUniProt/' \
~/kent/src/hg/lib/genericAlias.sql > flyBaseToUniProt.sql
hgLoadSqlTab dm2 flyBaseToUniProt flyBaseToUniProt.sql \
flyBaseToUniProtLoad.txt
##########################################################################
# HGNEAR TABLES
# HGNEAR PROTEIN BLAST TABLES (DONE 1/31/06 angie - REDONE 4/6/07)
# mmBlastTab redone 5/1/06 -- makeMm8.doc run used outdated proteins
# hgBlastTab redone 5/9/06 -- makeHg18.doc run used outdated proteins
# Redone 4/3/07 with flybase4.3; mitochondrial proteins yanked 4/6/07
# because they aren't in flyBase2004Xref and were causing joinerCheck errors.
ssh hgwdev
mkdir /cluster/data/dm2/bed/hgNearBlastp
cd /cluster/data/dm2/bed/hgNearBlastp
mkdir 070403
cd 070403
# Get the proteins used by the other hgNear organisms:
pepPredToFa hg18 knownGenePep hg18.known.faa
pepPredToFa mm8 knownGenePep mm8.known.faa
pepPredToFa rn4 knownGenePep rn4.known.faa
pepPredToFa danRer3 ensPep danRer3.ensPep.faa
pepPredToFa dm2 flyBasePep dm2.flyBasePep.faa
pepPredToFa ce2 sangerPep ce2.sangerPep.faa
pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa
# Galt's synBlastp.csh filtering, which uses liftOver chains,
# would be the next step if dm2 were reasonably closely related
# (like mammal-mammal) to another Gene Sorter organism. But it is
# pretty distant from all the others, so specify recipBest for all
# query databases.
cat << _EOF_ > config.ra
# Latest fly vs. other Gene Sorter orgs:
# human, mouse, rat, zebrafish, worm, yeast
targetGenesetPrefix flyBase
targetDb dm2
queryDbs hg18 mm8 rn4 danRer3 ce2 sacCer1
recipBest hg18 mm8 rn4 danRer3 ce2 sacCer1
dm2Fa /cluster/data/dm2/bed/hgNearBlastp/070403/dm2.flyBasePep.faa
hg18Fa /cluster/data/dm2/bed/hgNearBlastp/070403/hg18.known.faa
mm8Fa /cluster/data/dm2/bed/hgNearBlastp/070403/mm8.known.faa
rn4Fa /cluster/data/dm2/bed/hgNearBlastp/070403/rn4.known.faa
danRer3Fa /cluster/data/dm2/bed/hgNearBlastp/070403/danRer3.ensPep.faa
ce2Fa /cluster/data/dm2/bed/hgNearBlastp/070403/ce2.sangerPep.faa
sacCer1Fa /cluster/data/dm2/bed/hgNearBlastp/070403/sacCer1.sgdPep.faa
buildDir /cluster/data/dm2/bed/hgNearBlastp/070403
scratchDir /san/sanvol1/scratch/dm2HgNearBlastp
_EOF_
# Run with -noLoad so we can eyeball files, manually load dm2 tables now,
# and later overload other databases' dmBlastTab tables.
~/kent/src/hg/utils/automation/doHgNearBlastp.pl -noLoad -workhorse kkr4u00 config.ra >& do.log &
tail -f do.log
# Ran these manually:
# *** -noLoad was specified -- you can run this script manually to load dm2 tables:
run.dm2.dm2/loadPairwise.csh
#Loading database with 1171077 rows
# *** -noLoad was specified -- you can run these scripts manually to load dm2 tables:
run.dm2.hg18/loadPairwise.csh
#Loading database with 5833 rows
run.dm2.mm8/loadPairwise.csh
#Loading database with 5557 rows
run.dm2.rn4/loadPairwise.csh
#Loading database with 3015 rows
run.dm2.danRer3/loadPairwise.csh
#Loading database with 5253 rows
run.dm2.ce2/loadPairwise.csh
#Loading database with 4537 rows
run.dm2.sacCer1/loadPairwise.csh
#Loading database with 2172 rows
# 4/6/07: remove mitochondrial genes from *BlastTab because they
# are missing from flyBase2004Xref and caused joinerCheck errors.
# Would have been better to have joinerCheck'd flyBasePep before
# running doHgNearBlastp.
hgsql dm2 -e 'delete from flyBaseBlastTab where target like "mt:%"'
foreach table (hgBlastTab mmBlastTab rnBlastTab \
drBlastTab ceBlastTab scBlastTab)
echo $table
hgsql dm2 -e 'delete from '$table' where query like "mt:%"'
end
# 4/6/07: dm2 FlyBase/hgNear/hgGene update is in the pushQ, so I think
# it's a fine time to update dmBlastTab in other databases with the
# latest, so hgwdev is all consistent. When that push is done, will
# need another push request for *.dmBlastTab loaded here.
# *** -noLoad was specified -- you can run these scripts manually to load dmBlastTab in query databases:
run.hg18.dm2/loadPairwise.csh
run.mm8.dm2/loadPairwise.csh
run.rn4.dm2/loadPairwise.csh
run.danRer3.dm2/loadPairwise.csh
run.ce2.dm2/loadPairwise.csh
run.sacCer1.dm2/loadPairwise.csh
# Yank the "mt:%" targets from those to avoid joinerCheck errors.
foreach db (hg18 mm8 rn4 danRer3 ce2 sacCer1)
hgsql $db -e 'delete from dmBlastTab where target like "mt:%"'
end
# CLUSTER GENES AND MAP TO OTHER DATA (DONE 2/6/06 angie - REDONE 4/3/07)
# flyBaseToRefSeq regenerated 5/9/06
ssh hgwdev
# First we have to add a proteinId column to flyBaseGene.
hgClusterGenes dm2 flyBaseGene flyBaseIsoforms flyBaseCanonical \
-protAccQuery='select name,alias from flyBaseToUniProt'
#Got 13566 clusters, from 19452 genes in 13 chromosomes
# Create table that maps between flyBase genes and RefSeq
# Re-ran 5/9/06 to resolve some joinerCheck errors about retired refSeqs:
hgMapToGene dm2 refGene flyBaseGene flyBaseToRefSeq
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt dm2 flyBaseToUniProt name alias Pfam flyBaseToPfam
# Create a table that maps between flyBase genes and the
# Stanford Microarray Project expression data. (see makeHgFixed.doc)
hgsql dm2 -e 'drop table if exists flyBaseToCG; create table flyBaseToCG \
select name,SUBSTRING_INDEX(name,"-",1) as value from flyBaseGene'
hgsql dm2 -e 'create index name on flyBaseToCG(name(12))'
hgExpDistance dm2 hgFixed.arbFlyLifeMedianRatio dummyArg \
arbExpDistance -lookup=flyBaseToCG
#TODO: add the yale expr data too (feep)
# Make sure that GO database is up to date.
See README in /cluster/store1/geneOntology.
# MAKE A DESCRIPTION TABLE FOR HGNEAR KNOWNDETAILS (DONE 2/6/06 - REDONE 4/3/07)
# hgNear's knownDetails column type expects a single table that links
# transcript ID with uniProt description, so whip one up here:
# (used by hgGene too though hgGene could join on the fly)
ssh hgwdev
hgsql dm2 -e 'drop table if exists flyBaseToDescription; \
create table flyBaseToDescription \
select flyBaseToUniProt.name,uniProt.description.val \
from flyBaseToUniProt,uniProt.description \
where flyBaseToUniProt.alias = uniProt.description.acc'
hgsql dm2 -e 'create index name on flyBaseToDescription(name(12))'
# FLYP2P (DONE 2/6/06 angie)
# See hg/near/makeNear.doc...
# MAKE ORGANISM-SPECIFIC HGNEARDATA AND HGGENEDATA FILES (DONE 2/7/06 angie)
cd ~/kent/src/hg/near/hgNear/hgNearData
# Made dm1 and dm2 subdirs of D_melanogaster with separate sets of
# .ra files... see cvs revision history.
# Similarly for hgGene/hgGeneData.
# ENABLE HGNEAR FOR DM2 IN HGCENTRALTEST (DONE 2/7/06 angie)
hgsql -h genome-testdb hgcentraltest \
-e "update dbDb set hgNearOk = 1 where name = 'dm2';"
# REDO DM2-HG18 PAIRWISE BLAST (DONE 2/24/06 angie)
# Oops, hgBlastTab got overwritten with dm2 4.1 genes by the
# hg18 build process. Tweak back to 4.2:
ssh hgwdev
mkdir /cluster/data/dm2/bed/hgNearBlastp
cd /cluster/data/dm2/bed/hgNearBlastp
cat << _EOF_ > config.ra.hg18redo
# Redo vs. hg18 with 4.2 genes:
targetGenesetPrefix flyBase
targetDb dm2
queryDbs hg18
dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
buildDir /cluster/data/dm2/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/dm2HgNearBlastp
_EOF_
doHgNearBlastp.pl -noSelf config.ra.hg18redo >& do.log & tail -f do.log
# END OF HGNEAR STUFF
#############################################################################
# RECIPROCAL BEST DM2-DROSIM1 (DONE 2/28/06 angie - REDONE 6/19/06)
# Special request for user (Toshio Yoshimatsu and JJ Emerson, UChicago)
ssh kolossus
cd /cluster/data/dm2/bed/blastz.droSim1/axtChain
# Swap dm2-best chains to be droSim1-referenced:
chainStitchId dm2.droSim1.over.chain.gz stdout \
| chainSwap stdin stdout \
| chainSort stdin droSim1.dm2.tBest.chain
# Net those on droSim1 to get droSim1-ref'd reciprocal best net:
chainPreNet droSim1.dm2.tBest.chain \
/cluster/data/{droSim1,dm2}/chrom.sizes stdout \
| chainNet -minSpace=1 stdin /cluster/data/{droSim1,dm2}/chrom.sizes \
stdout /dev/null \
| netSyntenic stdin droSim1.dm2.rbest.net
# Extract droSim1-ref'd reciprocal best chain:
netChainSubset droSim1.dm2.rbest.net droSim1.dm2.tBest.chain stdout \
| chainStitchId stdin stdout \
| chainSort stdin droSim1.dm2.rbest.chain
# Swap to get dm2-ref'd reciprocal best chain:
chainSwap droSim1.dm2.rbest.chain stdout \
| chainSort stdin dm2.droSim1.rbest.chain
# Net those on dm2 to get dm2-ref'd reciprocal best net:
chainPreNet dm2.droSim1.rbest.chain \
/cluster/data/{dm2,droSim1}/chrom.sizes stdout \
| chainNet -minSpace=1 -minScore=0 \
stdin /cluster/data/{dm2,droSim1}/chrom.sizes stdout /dev/null \
| netSyntenic stdin dm2.droSim1.rbest.net
# Compress and clean up.
rm droSim1.dm2.tBest.chain
gzip *.rbest.*
md5sum *.rbest.*.gz > md5sum.txt
# Test coverage of *.rbest.* -- should be equal.
netToBed -maxGap=1 droSim1.dm2.rbest.net.gz droSim1.dm2.rbest.net.bed
netToBed -maxGap=1 dm2.droSim1.rbest.net.gz dm2.droSim1.rbest.net.bed
chainToPsl droSim1.dm2.rbest.chain.gz \
/cluster/data/{droSim1,dm2}/chrom.sizes \
/cluster/data/droSim1/droSim1.2bit /cluster/data/dm2/dm2.2bit \
droSim1.dm2.rbest.chain.psl
chainToPsl dm2.droSim1.rbest.chain.gz \
/cluster/data/{dm2,droSim1}/chrom.sizes \
/cluster/data/dm2/dm2.2bit /cluster/data/droSim1/droSim1.2bit \
dm2.droSim1.rbest.chain.psl
ssh hgwdev
cd /cluster/data/dm2/bed/blastz.droSim1/axtChain
featureBits droSim1 droSim1.dm2.rbest.net.bed
#104047585 bases of 127256433 (81.762%) in intersection
featureBits droSim1 droSim1.dm2.rbest.chain.psl
#104047585 bases of 127256433 (81.762%) in intersection
featureBits dm2 dm2.droSim1.rbest.chain.psl
#104047585 bases of 131698467 (79.004%) in intersection
featureBits dm2 dm2.droSim1.rbest.net.bed
#104047585 bases of 131698467 (79.004%) in intersection
mkdir experiments
mv *.bed *.psl experiments
# Put out for download (hgwdev-only)
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/dm2/vsDroSim1/reciprocalBest
cd /usr/local/apache/htdocs/goldenPath/dm2/vsDroSim1/reciprocalBest
rm -f *.gz md5sum.txt
ln -s /cluster/data/dm2/bed/blastz.droSim1/axtChain/*.rbest.*.gz .
ln -s /cluster/data/dm2/bed/blastz.droSim1/axtChain/md5sum.txt .
# Make a README.txt.
# RECIPROCAL BEST DM2-DROYAK2 (DONE 6/20/06 angie)
# Special request for user (Toshio Yoshimatsu and JJ Emerson, UChicago)
ssh kolossus
cd /cluster/data/dm2/bed/blastz.droYak2/axtChain
# Swap dm2-best chains to be droYak2-referenced:
chainStitchId dm2.droYak2.over.chain.gz stdout \
| chainSwap stdin stdout \
| chainSort stdin droYak2.dm2.tBest.chain
# Net those on droYak2 to get droYak2-ref'd reciprocal best net:
chainPreNet droYak2.dm2.tBest.chain \
/cluster/data/{droYak2,dm2}/chrom.sizes stdout \
| chainNet -minSpace=1 stdin /cluster/data/{droYak2,dm2}/chrom.sizes \
stdout /dev/null \
| netSyntenic stdin droYak2.dm2.rbest.net
# Extract droYak2-ref'd reciprocal best chain:
netChainSubset droYak2.dm2.rbest.net droYak2.dm2.tBest.chain stdout \
| chainStitchId stdin droYak2.dm2.rbest.chain
# Swap to get dm2-ref'd reciprocal best chain:
chainSwap droYak2.dm2.rbest.chain stdout \
| chainSort stdin dm2.droYak2.rbest.chain
# Net those on dm2 to get dm2-ref'd reciprocal best net:
chainPreNet dm2.droYak2.rbest.chain \
/cluster/data/{dm2,droYak2}/chrom.sizes stdout \
| chainNet -minSpace=1 -minScore=0 \
stdin /cluster/data/{dm2,droYak2}/chrom.sizes stdout /dev/null \
| netSyntenic stdin dm2.droYak2.rbest.net
# Compress and clean up.
rm droYak2.dm2.tBest.chain
gzip *.rbest.*
md5sum *.rbest.*.gz > md5sum.rbest.txt
# Test coverage of *.rbest.* -- should be equal.
netToBed -maxGap=1 droYak2.dm2.rbest.net.gz droYak2.dm2.rbest.net.bed
netToBed -maxGap=1 dm2.droYak2.rbest.net.gz dm2.droYak2.rbest.net.bed
chainToPsl droYak2.dm2.rbest.chain.gz \
/cluster/data/{droYak2,dm2}/chrom.sizes \
/cluster/data/droYak2/droYak2.2bit /cluster/data/dm2/dm2.2bit \
droYak2.dm2.rbest.chain.psl
chainToPsl dm2.droYak2.rbest.chain.gz \
/cluster/data/{dm2,droYak2}/chrom.sizes \
/cluster/data/dm2/dm2.2bit /cluster/data/droYak2/droYak2.2bit \
dm2.droYak2.rbest.chain.psl
ssh hgwdev
cd /cluster/data/dm2/bed/blastz.droYak2/axtChain
featureBits droYak2 droYak2.dm2.rbest.net.bed
#105927235 bases of 162681153 (65.113%) in intersection
featureBits droYak2 droYak2.dm2.rbest.chain.psl
#105927235 bases of 162681153 (65.113%) in intersection
featureBits dm2 dm2.droYak2.rbest.chain.psl
#105927235 bases of 131698467 (80.432%) in intersection
featureBits dm2 dm2.droYak2.rbest.net.bed
#105927235 bases of 131698467 (80.432%) in intersection
mkdir experiments
mv *.bed *.psl experiments
# Put out for download (hgwdev-only)
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/dm2/vsDroYak2/reciprocalBest
cd /usr/local/apache/htdocs/goldenPath/dm2/vsDroYak2/reciprocalBest
rm -f *.gz md5sum.txt
ln -s /cluster/data/dm2/bed/blastz.droYak2/axtChain/*.rbest.*.gz .
ln -s /cluster/data/dm2/bed/blastz.droYak2/axtChain/md5sum.rbest.txt md5sum.txt
# Make a README.txt.
########################################################################
### microRNA targets tracks (DONE - 2006-03-29 - 2006-04-26 - Hiram)
### from: http://pictar.bio.nyu.edu/ Rajewsky Lab
### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu
### Yi-Lu Wang ylw205@nyu.edu
### dg@thp.Uni-Koeln.DE
ssh hgwdev
mkdir /cluster/data/dm2/bed/picTar
cd /cluster/data/dm2/bed/picTar
wget --timestamping \
'http://pictar.bio.nyu.edu/ucsc/new_melanogaster_S1_bed' \
-O newMelanogasterS1.bed
wget --timestamping \
'http://pictar.bio.nyu.edu/ucsc/new_melanogaster_S3_bed' \
-O newMelanogasterS3.bed
grep -v "^track" newMelanogasterS1.bed \
| hgLoadBed -strict dm2 picTarMiRNAS1 stdin
# Loaded 24103 elements of size 9
grep -v "^track" newMelanogasterS3.bed \
| hgLoadBed -strict dm2 picTarMiRNAS3 stdin
# Loaded 11480 elements of size 9
nice -n +19 featureBits dm2 picTarMiRNAS1
# 59881 bases of 131698467 (0.045%) in intersection
nice -n +19 featureBits dm2 picTarMiRNAS3
# 26231 bases of 131698467 (0.020%) in intersection
# FLYBASE IN SITU IMAGES / EXPRESSION (DONE 5/3/06 angie - REDONE 4/7/07)
# FlyBase has downloadable in situ images for BACs:
# ftp://flybase.net/flybase/images/bac_insitu_pic/*
# and FBti's:
# ftp://flybase.net/flybase/images/in-situ-images/insitus.zip
# but what Jim is interested in is the insitus for expression data.
# Don't see that in the ftp listing, but they do make it easy to link in.
ssh hgwdev
cd /cluster/data/dm2/bed/flyBase
wget -O summary.txt.070406 \
'http://www.fruitfly.org/cgi-bin/ex/bquery.pl?qpage=entry&qtype=summarytext'
# remove header line.
hgLoadSqlTab dm2 bdgpExprLink ~/kent/src/hg/lib/bdgpExprLink.sql \
summary.txt.070406
###########################################################################
# LIFTOVER CHAINS DM2 -> DM3 (DONE 8/1/06 angie)
# Started 8/1 -- This creates the directory and scripts:
~/kent/src/utils/doSameSpeciesLiftOver.pl dm2 dm3 -debug
# Run it for real and log it:
cd /cluster/data/dm2/bed/blat.dm3.2006-08-01
~/kent/src/utils/doSameSpeciesLiftOver.pl dm2 dm3 \
>& do.log & tail -f do.log
###########################################################################
# BLASTZ/CHAIN/NET DROERE2 (DONE 9/27/06 angie)
ssh kkstore04
mkdir /cluster/data/dm2/bed/blastz.droEre2.2006-09-27
cd /cluster/data/dm2/bed/blastz.droEre2.2006-09-27
cat << '_EOF_' > DEF
# D. melanogaster vs. D. erecta
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. erecta
SEQ2_DIR=/iscratch/i/droEre2/droEre2.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droEre2/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droEre2.2006-09-27
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2droEre2 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroEre2Link
#flyBaseGene 23.333%, chainDroEre2Link 90.665%, both 23.063%, cover 98.84%, enrich 1.09x
rm -f /cluster/data/dm2/bed/blastz.droEre2
ln -s blastz.droEre2.2006-09-27 /cluster/data/dm2/bed/blastz.droEre2
###########################################################################
# BLASTZ/CHAIN/NET DROGRI2 (DONE 9/27/06 angie)
ssh kkstore04
mkdir /cluster/data/dm2/bed/blastz.droGri2.2006-09-27
cd /cluster/data/dm2/bed/blastz.droGri2.2006-09-27
cat << '_EOF_' > DEF
# D. melanogaster vs. D. grimshawi
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. grimshawi
SEQ2_DIR=/iscratch/i/droGri2/droGri2.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droGri2/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droGri2.2006-09-27
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2droGri2 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroGri2Link
#flyBaseGene 23.333%, chainDroGri2Link 63.808%, both 20.793%, cover 89.12%, enrich 1.40x
rm -f /cluster/data/dm2/bed/blastz.droGri2
ln -s blastz.droGri2.2006-09-27 /cluster/data/dm2/bed/blastz.droGri2
###########################################################################
# BLASTZ/CHAIN/NET DROWIL1 (DONE 9/27/06 angie)
ssh kkstore04
mkdir /cluster/data/dm2/bed/blastz.droWil1.2006-09-27
cd /cluster/data/dm2/bed/blastz.droWil1.2006-09-27
cat << '_EOF_' > DEF
# D. melanogaster vs. D. willistoni
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. willistoni
SEQ2_DIR=/iscratch/i/droWil1/droWil1.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droWil1/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droWil1.2006-09-27
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /cluster/bluearc/dm2droWil1 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroWil1Link
#flyBaseGene 23.333%, chainDroWil1Link 67.722%, both 21.106%, cover 90.45%, enrich 1.34x
rm -f /cluster/data/dm2/bed/blastz.droWil1
ln -s blastz.droWil1.2006-09-27 /cluster/data/dm2/bed/blastz.droWil1
###########################################################################
# BLASTZ/CHAIN/NET DROANA3 (DONE 9/27/06 angie)
ssh kkstore04
mkdir /cluster/data/dm2/bed/blastz.droAna3.2006-09-27
cd /cluster/data/dm2/bed/blastz.droAna3.2006-09-27
cat << '_EOF_' > DEF
# D. melanogaster vs. D. ananassae
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. ananassae
SEQ2_DIR=/iscratch/i/droAna3/droAna3.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droAna3/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droAna3.2006-09-27
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2droAna3 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroAna3Link
#flyBaseGene 23.333%, chainDroAna3Link 79.307%, both 22.211%, cover 95.19%, enrich 1.20x
rm -f /cluster/data/dm2/bed/blastz.droAna3
ln -s blastz.droAna3.2006-09-27 /cluster/data/dm2/bed/blastz.droAna3
###########################################################################
# BLASTZ/CHAIN/NET DROMOJ3 (DONE 9/27/06 angie)
ssh kkstore04
mkdir /cluster/data/dm2/bed/blastz.droMoj3.2006-09-27
cd /cluster/data/dm2/bed/blastz.droMoj3.2006-09-27
cat << '_EOF_' > DEF
# D. melanogaster vs. D. mojavensis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. mojavensis
SEQ2_DIR=/iscratch/i/droMoj3/droMoj3.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droMoj3/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droMoj3.2006-09-27
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2droMoj3 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroMoj3Link
#flyBaseGene 23.333%, chainDroMoj3Link 60.956%, both 20.676%, cover 88.61%, enrich 1.45x
rm -f /cluster/data/dm2/bed/blastz.droMoj3
ln -s blastz.droMoj3.2006-09-27 /cluster/data/dm2/bed/blastz.droMoj3
###########################################################################
# BLASTZ/CHAIN/NET DROVIR3 (DONE 9/27/06 angie)
ssh kkstore04
mkdir /cluster/data/dm2/bed/blastz.droVir3.2006-09-27
cd /cluster/data/dm2/bed/blastz.droVir3.2006-09-27
cat << '_EOF_' > DEF
# D. melanogaster vs. D. virilis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. virilis
SEQ2_DIR=/iscratch/i/droVir3/droVir3.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droVir3/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.droVir3.2006-09-27
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dm2droVir3 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDroVir3Link
#flyBaseGene 23.333%, chainDroVir3Link 62.998%, both 20.854%, cover 89.38%, enrich 1.42x
rm -f /cluster/data/dm2/bed/blastz.droVir3
ln -s blastz.droVir3.2006-09-27 /cluster/data/dm2/bed/blastz.droVir3
###########################################################################
# BLASTZ/CHAIN/NET DP4 (DONE 9/27/06 angie)
ssh kkstore04
mkdir /cluster/data/dm2/bed/blastz.dp4.2006-09-27
cd /cluster/data/dm2/bed/blastz.dp4.2006-09-27
cat << '_EOF_' > DEF
# D. melanogaster vs. D. pseudoobscura
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - D. pseudoobscura
SEQ2_DIR=/iscratch/i/dp4/dp4.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/dp4/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.dp4.2006-09-27
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /cluster/bluearc/dm2dp4 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainDp4Link
#flyBaseGene 23.333%, chainDp4Link 73.891%, both 21.518%, cover 92.22%, enrich 1.25x
rm -f /cluster/data/dm2/bed/blastz.dp4
ln -s blastz.dp4.2006-09-27 /cluster/data/dm2/bed/blastz.dp4
###########################################################################
# BLASTZ/CHAIN/NET TRICAS2 (DONE 9/28/06)
ssh kkstore04
mkdir /cluster/data/dm2/bed/blastz.triCas2.2006-09-28
cd /cluster/data/dm2/bed/blastz.triCas2.2006-09-28
cat <<_EOF_ > DEF
# D. melanogaster vs. T. castaneum
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm2/chrom.sizes
# QUERY - T. castaneum
SEQ2_DIR=/iscratch/i/triCas2/triCas2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/iscratch/i/triCas2/chrom.sizes
BASE=/cluster/data/dm2/bed/blastz.triCas2.2006-09-28
_EOF_
doBlastzChainNet.pl DEF \
-blastzOutRoot /cluster/bluearc/dm2triCas2 >& do.log &
tail -f do.log
featureBits dm2 -chrom=chr2L -enrichment flyBaseGene chainTriCas2Link
#flyBaseGene 23.333%, chainTriCas2Link 16.438%, both 10.740%, cover 46.03%, enrich 2.80x
rm -f /cluster/data/dm2/bed/blastz.triCas2
ln -s blastz.triCas2.2006-09-28 /cluster/data/dm2/bed/blastz.triCas2
###########################################################################
# MULTIZ15WAY (DONE 9/28/06 angie)
# (((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)
ssh kkstore04
mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28
cd /cluster/data/dm2/bed/multiz15way.2006-09-28
# copy MAF's to cluster-friendly server
mkdir /san/sanvol1/scratch/dm2/mafNet
foreach s (droSim1 droSec1 droYak2 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 anoGam1 apiMel2 triCas2)
echo $s
rsync -av /cluster/data/dm2/bed/blastz.$s/mafNet/* \
/san/sanvol1/scratch/dm2/mafNet/$s/
end
echo '(((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)' \
> tree-commas.nh
sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst
ssh pk
cd /cluster/data/dm2/bed/multiz15way.2006-09-28
mkdir maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = dm2
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/mafNet
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/dm2/bed/multiz15way.2006-09-28/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << emacs
awk '{print $1}' /cluster/data/dm2/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para make jobList
para time
#Completed: 13 of 13 jobs
#CPU time in finished jobs: 37955s 632.58m 10.54h 0.44d 0.001 y
#IO & Wait Time: 315s 5.25m 0.09h 0.00d 0.000 y
#Average job time: 2944s 49.06m 0.82h 0.03d
#Longest finished job: 9401s 156.68m 2.61h 0.11d
#Submission to last job: 9429s 157.15m 2.62h 0.11d
# ANNOTATE MULTIZ15WAY MAF AND LOAD TABLES (DONE 9/28/2006 angie)
ssh kolossus
mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28/anno
cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno
mkdir maf run
cd run
rm -f sizes nBeds
foreach db (`cat /cluster/data/dm2/bed/multiz15way.2006-09-28/species.lst`)
ln -s /cluster/data/$db/chrom.sizes $db.len
if (! -e /cluster/data/$db/$db.N.bed) then
twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
endif
ln -s /cluster/data/$db/$db.N.bed $db.bed
echo $db.bed >> nBeds
echo $db.len >> sizes
end
echo date > jobs.csh
# do smaller jobs first:
foreach f (`ls -1rS ../../maf/*.maf`)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
/cluster/data/dm2/dm2.2bit ../maf/`basename $f` >> jobs.csh
end
echo date >> jobs.csh
csh -efx jobs.csh >&! jobs.log & tail -f jobs.log
# 4min.
# Load anno/maf
ssh hgwdev
cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf
mkdir -p /gbdb/dm2/multiz15way/anno/maf
ln -s /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf/*.maf \
/gbdb/dm2/multiz15way/anno/maf
date
nice hgLoadMaf -pathPrefix=/gbdb/dm2/multiz15way/anno/maf dm2 multiz15way
date
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kolossus
cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf
cat *.maf \
| nice hgLoadMafSummary dm2 -minSize=30000 -mergeGap=1500 -maxSize=200000 \
-test multiz15waySummary stdin
#Created 112880 summary blocks from 10610488 components and 1151094 mafs from stdin
ssh hgwdev
cd /cluster/data/dm2/bed/multiz15way.2006-09-28/anno/maf
sed -e 's/mafSummary/multiz15waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
> /tmp/multiz15waySummary.sql
time nice hgLoadSqlTab dm2 multiz15waySummary /tmp/multiz15waySummary.sql \
multiz15waySummary.tab
#0.000u 0.001s 0:01.59 0.0% 0+0k 0+0io 0pf+0w
rm *.tab
ln -s multiz15way.2006-09-28 /cluster/data/dm2/bed/multiz15way
# MULTIZ15WAY DOWNLOADABLES (UNANNOTATED) (DONE 10/3/2006 angie)
# Annotated MAF is now documented, so use anno/maf for downloads.
ssh hgwdev
mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads
cd /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads
# upstream mafs
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits dm2 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags dm2 multiz15way up.bed upstream$i.maf \
-orgs=../species.lst
rm up.bed
end
date
'EOF'
# << emacs
time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
#86.335u 42.090s 6:47.44 31.5% 0+0k 0+0io 0pf+0w
ssh kolossus
cd /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../anno/maf/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
# << emacs
time csh -efx downloads.csh >&! downloads.log
#325.699u 4.199s 5:34.17 98.7% 0+0k 0+0io 1pf+0w
nice gzip up*.maf
nice md5sum up*.maf.gz >> md5sum.txt
# Make a README.txt .
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/dm2/multiz15way
mkdir $dir
ln -s /cluster/data/dm2/bed/multiz15way.2006-09-28/mafDownloads/*.{gz,txt} $dir
# MULTIZ15WAY MAF FRAMES (DONE 9/29/2006 angie)
# multiz15wayFrames regenerated 11/28/06.
ssh hgwdev
mkdir /cluster/data/dm2/bed/multiz15way.2006-09-28/frames
cd /cluster/data/dm2/bed/multiz15way.2006-09-28/frames
# The following is adapted from MarkD's Makefile used for mm7...
#------------------------------------------------------------------------
# get the genes for all genomes (when available)
# currently we have nothing for:
# droSec1 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 triCas2
# mRNAs with CDS. single select to get cds+psl, then split that up and
# create genePred
# using mrna table as genes: droSim1 droYak2 anoGam1 apiMel2
mkdir genes
foreach queryDb (droSim1 droYak2 anoGam1 apiMel2)
set tmpExt = `mktemp temp.XXXXXX`
set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
set tmpMrna = ${queryDb}.mrna.${tmpExt}
set tmpCds = ${queryDb}.cds.${tmpExt}
echo $queryDb
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
${queryDb} > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
stdout \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
# using refGene for dm2
# genePreds; (must keep only the first 10 columns for knownGene)
foreach queryDb (dm2)
if ($queryDb == "dm2") then
set geneTbl = refGene
endif
hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f 1-10 \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
# MarkD's 1-pass maf methodology
# This section was re-run *with* dm2 genes (oops, thanks Brooke) 11/28/06.
ssh kolossus
cd /cluster/data/dm2/bed/multiz15way/frames
nice tcsh # easy way to get process niced
(cat ../anno/maf/*.maf \
| time genePredToMafFrames dm2 stdin stdout \
dm2 genes/dm2.gp.gz \
droSim1 genes/droSim1.gp.gz droYak2 genes/droYak2.gp.gz \
anoGam1 genes/anoGam1.gp.gz apiMel2 genes/apiMel2.gp.gz \
| gzip -c > multiz15way.mafFrames.gz) >& frames.log &
ssh hgwdev
cd /cluster/data/dm2/bed/multiz15way/frames
hgLoadMafFrames dm2 multiz15wayFrames multiz15way.mafFrames.gz
# PHASTCONS 15WAY WITH METHODS FROM PAPER (DONE 10/3/06 angie)
# (((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)
# NOTE FROM QA (5/7/08): Overlapping data points were found in dm3 phastCons
# data by user Michael Hiller:
# http://www.soe.ucsc.edu/pipermail/genome/2008-April/016241.html
# The overlapping data points were caused by a bug in phastCons that Adam has
# since fixed. The bug is present in dm2 phastCons data, but as it is not the
# most recent assembly for this organism, we are going to leave it as-is. An
# example region containing the overlap is: chr3R:6,999,990-7,000,010 .
ssh kkstore04
mkdir -p /san/sanvol1/scratch/dm2/chrom
cp -p /cluster/data/dm2/?{,?}/chr*.fa /san/sanvol1/scratch/dm2/chrom/
# Split chrom fa into smaller windows for phastCons:
ssh pk
mkdir /cluster/data/dm2/bed/multiz15way/phastCons
mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.split
cd /cluster/data/dm2/bed/multiz15way/phastCons/run.split
set WINDOWS = /san/sanvol1/scratch/dm2/phastCons/WINDOWS
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.sh
#!/bin/csh -ef
set PHAST=/cluster/bin/phast
set FA_SRC=/san/sanvol1/scratch/dm2/chrom
set WINDOWS=/san/sanvol1/scratch/dm2/phastCons/WINDOWS
set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
-O dm2,droSim1,droSec1,droYak2,droEre2,droAna3,dp4,droPer1,droWil1,droVir3,droMoj3,droGri2,anoGam1,apiMel2,triCas2 \
-w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
chmod a+x doSplit.sh
rm -f jobList
foreach file (/cluster/data/dm2/bed/multiz15way/maf/*.maf)
if (-s $file) then
echo "doSplit.sh {check in exists+ $file}" >> jobList
endif
end
para make jobList
para time
#Completed: 13 of 13 jobs
#CPU time in finished jobs: 1075s 17.92m 0.30h 0.01d 0.000 y
#IO & Wait Time: 101s 1.68m 0.03h 0.00d 0.000 y
#Average job time: 90s 1.51m 0.03h 0.00d
#Longest finished job: 259s 4.32m 0.07h 0.00d
#Submission to last job: 270s 4.50m 0.07h 0.00d
############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
# Use consEntropy --NH to make it suggest a --expected-lengths param
# that we should try next. Adam ran this on hg17 to find out the
# total entropy of the hg17 model:
# consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
#Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
#Relative entropy: H=0.608216 bits/site
#Required length: N=16.085437 sites
#Total entropy: NH=9.783421 bits
# Our target is that NH result: 9.7834 bits.
# Use phyloFit to make an initial model:
ssh kolossus
cd /cluster/data/dm2/bed/multiz15way/phastCons
/cluster/bin/phast/msa_view ../maf/chr{2L,3R,4,X}.maf \
--aggregate dm2,droSim1,droSec1,droYak2,droEre2,droAna3,dp4,droPer1,droWil1,droVir3,droMoj3,droGri2,anoGam1,apiMel2,triCas2 \
-i MAF -o SS > all.ss
/cluster/bin/phast/phyloFit all.ss \
--tree "(((((((((dm2,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel2),triCas2)" \
-i SS --out-root starting-tree
cat starting-tree.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -533323268.267133
#BACKGROUND: 0.285852 0.214455 0.214368 0.285325
#RATE_MAT:
# -0.924927 0.204278 0.451988 0.268661
# 0.272287 -1.098668 0.225404 0.600978
# 0.602709 0.225494 -1.100568 0.272364
# 0.269158 0.451705 0.204631 -0.925493
#TREE: (((((((((dm2:0.032478,(droSim1:0.017650,droSec1:0.015740):0.017736):0.026088,(droYak2:0.058116,droEre2:0.055952):0.031922):0.084953,droAna3:0.218988):0.051563,(dp4:0.013624,droPer1:0.015374):0.210705):0.046101,droWil1:0.292357):0.019351,((droVir3:0.109131,droMoj3:0.142623):0.047595,droGri2:0.154583):0.189999):0.113599,anoGam1:0.357817):0.092848,apiMel2:0.386309):0.169225,triCas2:0.169225);
# also get GC content from model -- if similar enough, no need to extract it
# separately above.
awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod
#0.428823
# OK, use .429 for --gc below.
# Use the values of --target-coverage and --expected-lengths from the
# last iteration of the 9way run, 0.398 and 23.7.
# Multiply each subst rate on the TREE line by 3.5 which is roughly the
# ratio of noncons to cons in
# /cluster/data/dm2/bed/multiz8way/phastCons/run.estimate/ave.*.mod
/cluster/bin/phast/tree_doctor -s 3.5 starting-tree.mod \
> starting-tree.noncons.mod
/cluster/bin/phast/consEntropy --NH 9.7834 0.393 23.8 \
starting-tree{,.noncons}.mod
# Uh-oh.... out of memory. OK! Well, proceed with params from before.
# Now set up cluster job to estimate model parameters given free params
# --target-coverage and --expected-lengths and the data.
ssh pk
mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.estimate
cd /cluster/data/dm2/bed/multiz15way/phastCons/run.estimate
# FIRST ITERATION: Use ../starting-tree.mod:
cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.429 --nrates 1,1 \
--no-post-probs --ignore-missing \
--expected-lengths 23.8 --target-coverage 0.393 \
--quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
# SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.429 --nrates 1,1 \
--no-post-probs --ignore-missing \
--expected-lengths 23.8 --target-coverage 0.393 \
--quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
chmod a+x doEstimate.sh
rm -fr LOG TREES
mkdir -p LOG TREES
rm -f jobList
foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
set root = $f:t:r:r
echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
end
para make jobList
para time
#Completed: 139 of 139 jobs
#CPU time in finished jobs: 1771638s 29527.30m 492.12h 20.51d 0.056 y
#IO & Wait Time: 1154s 19.23m 0.32h 0.01d 0.000 y
#Average job time: 12754s 212.56m 3.54h 0.15d
#Longest finished job: 18435s 307.25m 5.12h 0.21d
#Submission to last job: 26561s 442.68m 7.38h 0.31d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ssh kolossus
cd /cluster/data/dm2/bed/multiz15way/phastCons/run.estimate
ls -1 TREES/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
--output-average ave.cons.mod > cons_summary.txt
ls -1 TREES/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
--output-average ave.noncons.mod > noncons_summary.txt
grep TREE ave*.mod
# FIRST ITERATION:
#ave.cons.mod:TREE: (((((((((dm2:0.017975,(droSim1:0.012862,droSec1:0.012586):0.011123):0.012087,(droYak2:0.031478,droEre2:0.032714):0.015973):0.033568,droAna3:0.107755):0.019847,(dp4:0.008814,droPer1:0.009578):0.104435):0.016369,droWil1:0.150471):0.005457,((droVir3:0.056362,droMoj3:0.072079):0.020331,droGri2:0.083731):0.092042):0.049021,anoGam1:0.218175):0.046658,apiMel2:0.187335):0.106621,triCas2:0.106621);
#ave.noncons.mod:TREE: (((((((((dm2:0.059235,(droSim1:0.039799,droSec1:0.039025):0.035598):0.041349,(droYak2:0.104304,droEre2:0.106715):0.054017):0.119983,droAna3:0.377089):0.071962,(dp4:0.028079,droPer1:0.031175):0.369228):0.060569,droWil1:0.536488):0.019784,((droVir3:0.195542,droMoj3:0.254453):0.073077,droGri2:0.290773):0.336972):0.180643,anoGam1:0.783559):0.174661,apiMel2:0.670401):0.382291,triCas2:0.382291);
# SECOND ITERATION:
cat cons_summary.txt
# look over the files cons_summary.txt and noncons_summary.txt.
# The means and medians should be roughly equal and the stdevs
# should be reasonably small compared to the means, particularly
# for rate matrix parameters (at bottom) and for branches to the
# leaves of the tree. The stdevs may be fairly high for branches
# near the root of the tree; that's okay. Some min values may be
# 0 for some parameters. That's okay, but watch out for very large
# values in the max column, which might skew the mean. If you see
# any signs of bad outliers, you may have to track down the
# responsible .mod files and throw them out. I've never had to do
# this; the estimates generally seem pretty well behaved.
# NOTE: Actually, a random sample of several hundred to a thousand
# alignment fragments (say, a number equal to the number of
# available cluster nodes) should be more than adequate for
# parameter estimation. If pressed for time, use this strategy.
# No consEntropy check because it runs out of mem on an 8G-mem small
# cluster node and has not completed in a day on 16G-mem kolossus.
# Now we are ready to set up the cluster job for computing the
# conservation scores and predicted elements. The we measure the
# conserved elements coverage, and if that's not satisfactory then we
# adjust parameters and repeat.
ssh pk
mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.phast
cd /cluster/data/dm2/bed/multiz15way/phastCons/run.phast
cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
--expected-lengths 23.8 --target-coverage 0.393 \
--quiet --seqname $chr --idpref $pref \
--viterbi /san/sanvol1/scratch/dm2/phastCons/ELEMENTS/$pref.bed --score \
--require-informative 0 \
> $tmpfile
gzip -c $tmpfile > /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
chmod a+x doPhastCons.sh
rm -fr /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
mkdir -p /san/sanvol1/scratch/dm2/phastCons/{POSTPROBS,ELEMENTS}
rm -f jobList
foreach f (/san/sanvol1/scratch/dm2/phastCons/WINDOWS/*.ss.gz)
echo doPhastCons.sh $f >> jobList
end
para make jobList
para time
#Completed: 139 of 139 jobs
#CPU time in finished jobs: 1388s 23.14m 0.39h 0.02d 0.000 y
#IO & Wait Time: 402s 6.70m 0.11h 0.00d 0.000 y
#Average job time: 13s 0.21m 0.00h 0.00d
#Longest finished job: 17s 0.28m 0.00h 0.00d
#Submission to last job: 170s 2.83m 0.05h 0.00d
# back on kolossus:
# combine predictions and transform scores to be in 0-1000 interval
cd /cluster/data/dm2/bed/multiz15way/phastCons
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
/san/sanvol1/scratch/dm2/phastCons/ELEMENTS/*.bed \
| /cluster/bin/scripts/lodToBedScore > all.bed
ssh hgwdev
# Now measure coverage of CDS by conserved elements.
# We want the "cover" figure to be close to 68.9%.
cd /cluster/data/dm2/bed/multiz15way/phastCons
featureBits -enrichment dm2 flyBaseGene:cds all.bed
# FIRST ITERATION: 68.9'll do!
#flyBaseGene:cds 16.649%, all.bed 37.421%, both 11.471%, cover 68.90%, enrich 1.84x
# Having met the CDS coverage target, load up the results.
hgLoadBed dm2 phastConsElements15way all.bed
# Create wiggle
ssh pk
mkdir /cluster/data/dm2/bed/multiz15way/phastCons/run.wib
cd /cluster/data/dm2/bed/multiz15way/phastCons/run.wib
rm -rf /san/sanvol1/scratch/dm2/phastCons/wib
mkdir -p /san/sanvol1/scratch/dm2/phastCons/wib
cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /san/sanvol1/scratch/dm2/phastCons/wib
zcat `ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS/$chr.*.pp.gz \
| sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
chmod a+x doWigEncode
rm -f jobList
foreach chr (`ls -1 /san/sanvol1/scratch/dm2/phastCons/POSTPROBS \
| awk -F\. '{print $1}' | sort -u`)
echo doWigEncode $chr >> jobList
end
para make jobList
para time
#Completed: 13 of 13 jobs
#CPU time in finished jobs: 71s 1.18m 0.02h 0.00d 0.000 y
#IO & Wait Time: 28s 0.47m 0.01h 0.00d 0.000 y
#Average job time: 8s 0.13m 0.00h 0.00d
#Longest finished job: 17s 0.28m 0.00h 0.00d
#Submission to last job: 101s 1.68m 0.03h 0.00d
# back on kkstore04, copy wibs, wigs and POSTPROBS (people sometimes want
# the raw scores) from san/sanvol1
cd /cluster/data/dm2/bed/multiz15way/phastCons
rm -rf wib POSTPROBS
rsync -av /san/sanvol1/scratch/dm2/phastCons/wib .
rsync -av /san/sanvol1/scratch/dm2/phastCons/POSTPROBS .
# load wiggle component of Conservation track
ssh hgwdev
mkdir /gbdb/dm2/multiz15way/wib
cd /cluster/data/dm2/bed/multiz15way/phastCons
chmod 775 . wib
chmod 664 wib/*.wib
ln -s `pwd`/wib/*.wib /gbdb/dm2/multiz15way/wib/
hgLoadWiggle dm2 phastCons15way \
-pathPrefix=/gbdb/dm2/multiz15way/wib wib/*.wig
rm wiggle.tab
# and clean up san/sanvol1.
rm -r /san/sanvol1/scratch/dm2/phastCons/{ELEMENTS,POSTPROBS,wib}
rm -r /san/sanvol1/scratch/dm2/chrom
# Offer raw scores for download since fly folks are likely to be interested:
ssh kkstore04
cd /cluster/data/dm2/bed/multiz15way/phastCons/POSTPROBS
mkdir ../postprobsDownload
foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`)
zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \
> ../postprobsDownload/$chr.pp.gz
end
cd ../postprobsDownload
md5sum *.gz > md5sum.txt
# Make a README.txt there too.
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/dm2/phastCons15way
cd /usr/local/apache/htdocs/goldenPath/dm2/phastCons15way
ln -s /cluster/data/dm2/bed/multiz15way/phastCons/postprobsDownload/* .
# make a tree picture using phyloGif.
cd /cluster/data/dm2/bed/multiz15way
/usr/local/apache/cgi-bin/phyloGif -phyloGif_tree=tree-commas-orgNames.nh \
-phyloGif_width=260 -phyloGif_height=480 > dm2_15way.gif
cp -p dm2_15way.gif ~/browser/images/phylo/dm2_15way.gif
# Check in browser/images/phylo/dm2_15way.gif. In a clean ~/browser:
cd ~/browser
make alpha
#########################################################################
# ORegAnno Open Regulatory Annotation Belinda Giardine April 2007
#
# Update data via a reload
# trackDb is at top level
# parser for dump from Obi Griffith is at PSU.
# cp data to UCSC and load
hgsql dm2
truncate table oreganno;
truncate table oregannoAttr;
truncate table oregannoLink;
#load the tables
hgsql hgFixed
load data local infile "oregannoAttr.dm2.txt" into table oregannoAttr;
load data local infile "oregannoLink.dm2.txt" into table oregannoLink;
grep "^chr" oreganno.dm2.txt | sort -k1,1 -k2,2n > oreganno.bed
hgLoadBed dm2 oreganno oreganno.bed -noSort -oldTable -tab
rm oreganno.bed
#########################################################################
# FLYBASE 4.3 ANNOTATIONS (DONE (except for contacting flybase-help) 10/11/06 angie - UPDATED 4/6/07)
ssh kkstore04
mkdir /cluster/data/dm2/bed/flybase4.3
cd /cluster/data/dm2/bed/flybase4.3
foreach c (2L 2R 3L 3R 4 X)
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.3_20060303/gff/$c.gff
end
cat *.gff > flybase.gff3
# What data sources are represented in this file?
grep -v '^#' flybase.gff3 | awk '{print $2 "\t" $3;}' | sort | uniq -c
# excerpt (many other sources, including blastx:... , sim4:... and
# tblastx:...; also various other types for source "FlyBase" and "."):
85778 FlyBase CDS
64452 FlyBase exon
14449 FlyBase gene
19376 FlyBase mRNA
66 FlyBase miRNA
116 FlyBase ncRNA
51 FlyBase pseudogene
96 FlyBase rRNA
46 FlyBase snRNA
63 FlyBase snoRNA
292 FlyBase tRNA
36921 . transcription_start_site
6003 FlyBase transposable_element
35734 FlyBase transposable_element_insertion_site
# Notable differences since 4.2: "FlyBase" instead of "." for most of
# those, and now there seens to be more than one CDS per exon while it
# use to be ~1 CDS per gene. Hmmmm....
# What keywords are defined in the 9th field? Watch out for stray ;'s
# from html-escaped special characters...
grep -v '^#' flybase.gff3 \
| awk '{print $9;}' \
| perl -wpe 's/&\w\w\w\w?;//g; s/=[^;]+;/\n/g; s/=.*$//;' \
| sort | uniq -c
4250095
74483 Alias
31137 Dbxref
2795760 ID
1432800 Name
765 Ontology_term
515484 Parent
1046375 Synonym
17901 associated_with
15773 cyto_range
36666 derived_cyto_location
96162 evidence
15657 gbunit
12101 programversion
11837 putative_ortholog_of
11790 to_name
121126 to_species
# Notable differences: synonym -> Synonym; no more synonym_2nd or
# dbxref_2nd (or species); some new keywords but I think we can ignore.
# Copy over 4.2 script and modify to fit the latest Immortal Unchanging
# Widely Adopted Standard GFF3.
cp ../flybase4.2/extractGenes.pl .
extractGenes.pl flybase.gff3
# Get predicted proteins (for main annotations only)
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r4.3_20060303/fasta/dmel-all-translation-r4.3.fasta
# save some space.
nice gzip ?{,?}.gff dmel-all-transl*
# Translate the fancy headers (some of which already start with CG-P's
# that match the FBtr's -- but not all!) into CG-R's to match flyBaseGene:
zcat dmel-all-translation-r4.3.fasta.gz \
| perl -we 'open(X, "flyBase2004Xref.tab") || die; \
while (<X>) { \
@w = split("\t"); \
$fbtr2cgr{$w[3]} = $w[0]; \
} \
while (<>) { \
if (/^>.*parent=(FBtr\d+)/) { \
$cgr = $fbtr2cgr{$1}; \
s/^>.*/>$cgr/ if ($cgr); \
} \
print; \
} \
' \
> flybasePep.fa
ssh hgwdev
cd /cluster/data/dm2/bed/flybase4.3
# Protein-coding genes:
ldHgGene -gtf dm2 flyBaseGene flyBase*.gtf
hgPepPred dm2 generic flyBasePep flybasePep.fa
# Noncoding genes:
hgLoadBed dm2 flyBaseNoncoding flyBaseNoncoding.bed
# Cross-referencing info for both coding and noncoding:
hgLoadSqlTab dm2 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \
flyBase2004Xref.tab
# -- had to remove a couple duplicate lines in flyBase2004Xref.tab.
# CG4110-RA and CG13904-RA had multiple mappings, both with consecutive
# FBtr's so I tossed out the earlier FBtr for both.
# Do this next time around, to catch joinerCheck errors before
# running doHgNearBlastp with these proteins:
runJoiner.csh dm2 flyBasePep
# Some featureBits comparisons with refGene which shoudl be pretty much
# like version 4.0 with a few noncoding genes tossed in... but this is
# a bigger difference than we expect:
featureBits dm2 refGene \!flyBaseGene -minSize=1000 -bed=stdout
#604659 bases of 131698467 (0.459%) in intersection
# Hmmm, flyBaseGene has nothing on the last 165,000bp of chrX!:
# chrX:22,058,584-22,224,390
# Here's anothre 176,000 gap in chrX:
# chrX:21,836,109-22,012,685
# nor the last 496,000bp of chr3L:
# chr3L:23,275,448-23,771,897
# nor the first 527,000bp of chr2R:
# chr2R:1-527,716
# No files for chr2h, chr3h, chr4h, chrXh, chrYh, chrU, so those are
# entirely devoid of FlyBase Genes... yikes.
#TODO: contact flybase-help about the lack of annotations in those ranges...
# This shows only the dropped isoforms/duplicates (ignores noncoding):
featureBits dm2 refGene \!flyBaseGene \!flyBaseNoncoding -minSize=1000 \
-bed=stdout
#chr3R 15620310 15621493 chr3R.1
#chr2L 4698094 4700940 chr2L.1
#chrX 3638700 3639895 chrX.1
#474282 bases of 131698467 (0.360%) in intersection
# add upstream* downloadable files
cd /usr/local/apache/htdocs/goldenPath/dm2/bigZips
foreach size (1000 2000 5000)
echo upstream$size
nice featureBits dm2 flyBaseGene:upstream:$size -fa=stdout \
| nice gzip -c > upstream$size.fa.gz
end
nice md5sum *.zip *.gz > md5sum.txt
# Now go redo all the sections above for tables related to flyBase*.
# 4/6/07: remove mitochondrial genes from flyBasePep because they
# are missing from flyBase2004Xref and caused joinerCheck errors.
hgsql dm2 -e 'delete from flyBasePep where name like "mt:%"'
#########################################################################
# AFFYMETRIX TIMECOURSE (DONE 12/21/06 angie)
ssh kkstore04
mkdir /cluster/data/dm2/bed/affyDrosDev
cd /cluster/data/dm2/bed/affyDrosDev
# Download files from Affy:
wget -r --level=1 --directory-prefix=transfrags --no-directories \
http://transcriptome.affymetrix.com/download/publication/dros_dev/transfrags
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
mkdir $chr
pushd $chr
foreach t (1 2 3 4 5 6 7 8 9 10 11 12)
wget --timestamping http://transcriptome.affymetrix.com/download/publication/dros_dev/graphs/$chr/Dro_Total_AS_${t}_C01.sig.gr
end
popd
end
# Transfrags -- Add chromosome to make bed3 (start is 0-based per Sujit):
foreach t (1 2 3 4 5 6 7 8 9 10 11 12)
cp /dev/null affyDrosDevTransfrags$t.bed
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
if ($chr == "chrM") continue
awk '{print "'$chr'\t" $1 "\t" $2;}' \
transfrags/$chr.Dro_Total_AS_${t}_C01 >> affyDrosDevTransfrags$t.bed
end
end
# Transcription level -- Add wig format variableStep lines and
# agglomerate into per-timepoint .wig files:
foreach t (1 2 3 4 5 6 7 8 9 10 11 12)
cp /dev/null affyDrosDevSignal$t.varStep.wig
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
if ($chr == "chrM") continue
echo 'variableStep chrom='$chr >> affyDrosDevSignal$t.varStep.wig
cat $chr/Dro_Total_AS_${t}_C01.sig.gr >> affyDrosDevSignal$t.varStep.wig
end
wigEncode affyDrosDevSignal$t.varStep.wig affyDrosDevSignal$t.wi{g,b}
end
#Converted affyDrosDevSignal1.varStep.wig, upper limit 1156.00, lower limit -860.00
#Converted affyDrosDevSignal2.varStep.wig, upper limit 1499.75, lower limit -580.50
#Converted affyDrosDevSignal3.varStep.wig, upper limit 1627.50, lower limit -586.00
#Converted affyDrosDevSignal4.varStep.wig, upper limit 1794.50, lower limit -718.50
#Converted affyDrosDevSignal5.varStep.wig, upper limit 2277.25, lower limit -913.00
#Converted affyDrosDevSignal6.varStep.wig, upper limit 2095.25, lower limit -864.00
#Converted affyDrosDevSignal7.varStep.wig, upper limit 1071.75, lower limit -560.50
#Converted affyDrosDevSignal8.varStep.wig, upper limit 1182.50, lower limit -1019.00
#Converted affyDrosDevSignal9.varStep.wig, upper limit 2028.75, lower limit -942.50
#Converted affyDrosDevSignal10.varStep.wig, upper limit 1212.00, lower limit -779.00
#Converted affyDrosDevSignal11.varStep.wig, upper limit 1456.75, lower limit -454.50
#Converted affyDrosDevSignal12.varStep.wig, upper limit 1203.50, lower limit -459.50
# --> trackDb setting: type wig -1019.0 2277.25
# Load tables (--> composite tracks)
ssh hgwdev
cd /cluster/data/dm2/bed/affyDrosDev
mkdir /gbdb/dm2/affyDrosDev
ln -s /cluster/data/dm2/bed/affyDrosDev/affyDrosDevSignal*.wib \
/gbdb/dm2/affyDrosDev
foreach t (1 2 3 4 5 6 7 8 9 10 11 12)
hgLoadBed dm2 affyDrosDevTransfrags$t affyDrosDevTransfrags$t.bed
hgLoadWiggle dm2 affyDrosDevSignal$t \
-pathPrefix=/gbdb/dm2/affyDrosDev affyDrosDevSignal$t.wig
end
rm bed.tab wiggle.tab
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE, 2007-01-26, braney)
ssh kkstore04
bash # if not using bash shell already
# use blastDb database made for hg17 protein track
mkdir -p /san/sanvol1/scratch/dm2/blastDb;
cd /cluster/data/dm2/blastDb
for i in nhr nin nsq;
do
cp *.$i /san/sanvol1/scratch/dm2/blastDb;
done
mkdir /cluster/data/dm2/bed/tblastn.hg18KG
cd /cluster/data/dm2/bed/tblastn.hg18KG
echo /san/sanvol1/scratch/dm2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 288 query.lst
# we want around 250000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(250000/288) = 42.309504
mkdir -p /cluster/bluearc/dm2/bed/tblastn.hg18KG/kgfa
split -l 60 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/dm2/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/dm2/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/dm2/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/dm2/jkStuff/subChr.lft warn $f.2
liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/dm2/jkStuff/liftAll.lft warn $f.3
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
# then run the Blast cluster jobs
ssh pk
cd /cluster/data/dm2/bed/tblastn.hg18KG
para create blastSpec
para try, check, push, check etc.
para time
# Completed: 176544 of 176544 jobs
# CPU time in finished jobs: 1614468s 26907.81m 448.46h 18.69d 0.051 y
# IO & Wait Time: 589357s 9822.61m 163.71h 6.82d 0.019 y
# Average job time: 12s 0.21m 0.00h 0.00d
# Longest finished job: 73s 1.22m 0.02h 0.00d
# Submission to last job: 58680s 978.00m 16.30h 0.68d
ssh kkstore03
cd /cluster/data/dm2/bed/tblastn.hg18KG
tcsh
mkdir chainRun
cd chainRun
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS \
/cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh kk
cd /cluster/data/dm2/bed/tblastn.hg18KG/chainRun
para create chainSpec
para try, check, push, check etc.
# Completed: 613 of 613 jobs
# CPU time in finished jobs: 6560s 109.33m 1.82h 0.08d 0.000 y
# IO & Wait Time: 31904s 531.74m 8.86h 0.37d 0.001 y
# Average job time: 63s 1.05m 0.02h 0.00d
# Longest finished job: 176s 2.93m 0.05h 0.00d
# Submission to last job: 1338s 22.30m 0.37h 0.02d
ssh kkstore03
cd /cluster/data/dm2/bed/tblastn.hg18KG/blastOut
bash # if using another shell
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/dm2/bed/tblastn.hg18KG/blastHg18KG.psl
pslCheck /cluster/data/dm2/bed/tblastn.hg18KG/blastHg18KG.psl
# this is ok.
# load table
ssh hgwdev
cd /cluster/data/dm2/bed/tblastn.hg18KG
hgLoadPsl dm2 blastHg18KG.psl
# check coverage
featureBits dm2 blastHg18KG
# 5946352 bases of 131698467 (4.515%) in intersection
featureBits dm2 refGene:cds blastHg18KG -enrichment
# refGene:cds 17.097%, blastHg17KG 4.427%, both 4.246%, cover 24.83%, enrich 5.61x
ssh kkstore04
rm -rf /cluster/data/dm2/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/dm2/bed/tblastn.hg18KG/blastOut
#end human proteins
#################################################################
#########################################################################
# CHROMOSOME BANDS (DONE 6/19/07 angie)
ssh hgwdev
cd /cluster/data/dm2/bed/flybase4.3
grep chromosome_band flybase.gff3 \
| perl -wpe 'chomp; @w = split; \
$w[3]--; $w[3] = 0 if ($w[3] < 0); \
$w[8] =~ s/^.*ID=band-(\w+);$/$1/; \
if ($w[4] <= 0) { s/^.*$//; } else { \
s/^.*$/chr$w[0]\t$w[3]\t$w[4]\t$w[8]\t\n/; }' \
> cytoBand.txt
# Did a bit of manual cleanup... there are some rows that have negative
# starts that should clearly have negative ends, but for some reason
# their end coords have been substituted with 10, 20, 30, 40 --
# I just deleted those 12 rows.
hgLoadSqlTab dm2 cytoBand ~/kent/src/hg/lib/cytoBand.sql cytoBand.txt
# Drop or modify the items that are off the end of their chroms:
awk '{print "delete from dm2.cytoBand where chrom = \""$1"\" and chromStart >= "$2"; update dm2.cytoBand set chromEnd = "$2" where chrom = \""$1"\" and chromEnd > "$2";";}' \
/cluster/data/dm2/chrom.sizes \
| hgsql dm2
checkTableCoords -verbose=2 dm2 cytoBand
# Normally we would just create cytoBandIdeo by select * from cytoBand --
# but that gives too high of a resolution for any bands to be visible!
# So make a boiled-down cytoBandIdeo that just has {number, letter}
# instead of {number, letter, number}.
perl -we 'while (<>) { \
chomp; @w=split(" "); \
if ($w[3] =~ /(\d+[A-Z]+)\d+/) { \
$b = $1; \
} else { die "doh!" } \
if (! defined $lastB) { \
$start = $w[1]; \
} elsif ($lastB ne $b) { \
print "$chrom\t$start\t$end\t$lastB\t\n"; \
$start = $w[1]; \
} \
($chrom, $end, $lastB) = ($w[0], $w[2], $b); \
} \
print "$chrom\t$start\t$end\t$lastB\t\n";' cytoBand.txt \
> cytoBandIdeo.txt
hgLoadSqlTab dm2 cytoBandIdeo ~/kent/src/hg/lib/cytoBandIdeo.sql \
cytoBandIdeo.txt
# Drop or modify the items that are off the end of their chroms:
awk '{print "delete from dm2.cytoBandIdeo where chrom = \""$1"\" and chromStart >= "$2"; update dm2.cytoBandIdeo set chromEnd = "$2" where chrom = \""$1"\" and chromEnd > "$2";";}' \
/cluster/data/dm2/chrom.sizes \
| hgsql dm2
checkTableCoords -verbose=2 dm2 cytoBandIdeo
#########################################################################
# EvoFold track (done, 10.02.2007, Jakob Skou Pedersen)
#
# EvoFold predictions were made based on the drosophila subset (12
# species) of the 15-way multiz insect alignment. The data generation
# process will change in the future and is therefore not described in
# detail here. Instead we start with the final set of predictions
# supplied by me.
ssh hgwdev
mkdir -p /cluster/data/dm2/bed/evofold
cd /cluster/data/dm2/bed/evofold
cp /cluster/home/jsp/projects/flies/finalDataSets/finalPredAll.bed foldsDm2.bed
# The folds_hg17.bed is a 9-column bed file: column 1-6 provide
# standard bed information, column 7 is element length, column 8 is
# the RNA secondary structure in parentheses format, and column nine
# is a commaseparated list of position specific confidence scores
# (floats).
hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql dm2 evofold foldsDm2.bed
#########################################################################
# BDTNP CHIP-CHIP (DONE 4/9/08 angie)
# New data added 2/2/09, see next section.
# Berkeley Drosophila Transcription Network Project's ChIP-Chip data,
# from Mark Biggin (MDBiggin at lbl dot gov).
ssh kkstore04
mkdir /cluster/data/dm2/bed/bdtnpChipper
cd /cluster/data/dm2/bed/bdtnpChipper
wget http://rana.lbl.gov/~smacarth/ChIPChip/Public_Release/wiggle/all_public_080409_1FDR.wig.bz2
wget http://rana.lbl.gov/~smacarth/ChIPChip/Public_Release/wiggle/all_public_080409_25FDR.wig.bz2
# The files contain concatenated custom tracks; split those out into
# one .wig file per track, with UCSC table names:
bzcat all_public* | perl -we \
'my $fh; \
while (<>) { \
if (/^track .* name="([\w-]+)"/) { \
close($fh) if ($fh); \
my $name = $1; \
$name =~ s/([a-zI]+)_(H14)?(\d+)?_\d+_675bp_smoothed_scores-sym-(\d+)/bdtnp\u$1$3Fdr$4/ \
|| die "parse:\n$name\n\t"; \
open($fh, ">$name.wig") || die "Cant open >$name.wig: $!\n"; \
} else { \
die "data before track line? at input line $.:\n$_" if (! $fh); \
print $fh $_; \
} \
}'
# These two warnings are for the polII filenames, no problem:
#Use of uninitialized value in concatenation (.) or string at -e line 6, <> line 714856.
#Use of uninitialized value in concatenation (.) or string at -e line 6, <> line 3415135.
mkdir wigEncoded
cd wigEncoded
foreach f (../*Fdr*.wig)
wigEncode $f $f:t:r.{wig,wib}
end
#Converted ../bdtnpBcd1Fdr1.wig, upper limit 8.14, lower limit 0.80
#Converted ../bdtnpBcd1Fdr25.wig, upper limit 8.14, lower limit 0.80
#Converted ../bdtnpBcd2Fdr1.wig, upper limit 10.74, lower limit 0.68
#Converted ../bdtnpBcd2Fdr25.wig, upper limit 10.74, lower limit 0.68
#Converted ../bdtnpCad1Fdr1.wig, upper limit 16.20, lower limit 1.06
#Converted ../bdtnpCad1Fdr25.wig, upper limit 16.20, lower limit 0.48
#Converted ../bdtnpGt2Fdr1.wig, upper limit 22.22, lower limit 1.21
#Converted ../bdtnpGt2Fdr25.wig, upper limit 22.22, lower limit 0.49
#Converted ../bdtnpHb1Fdr1.wig, upper limit 14.38, lower limit 0.92
#Converted ../bdtnpHb1Fdr25.wig, upper limit 14.38, lower limit 0.86
#Converted ../bdtnpHb2Fdr1.wig, upper limit 24.25, lower limit 0.88
#Converted ../bdtnpHb2Fdr25.wig, upper limit 24.25, lower limit 0.78
#Converted ../bdtnpKni1Fdr1.wig, upper limit 8.08, lower limit 1.78
#Converted ../bdtnpKni1Fdr25.wig, upper limit 8.08, lower limit 1.01
#Converted ../bdtnpKni2Fdr1.wig, upper limit 8.44, lower limit 1.07
#Converted ../bdtnpKni2Fdr25.wig, upper limit 8.44, lower limit 0.76
#Converted ../bdtnpKr1Fdr1.wig, upper limit 14.47, lower limit 0.68
#Converted ../bdtnpKr1Fdr25.wig, upper limit 14.47, lower limit 0.68
#Converted ../bdtnpKr2Fdr1.wig, upper limit 12.92, lower limit 0.66
#Converted ../bdtnpKr2Fdr25.wig, upper limit 12.92, lower limit 0.66
#Converted ../bdtnpPolIIFdr1.wig, upper limit 30.86, lower limit 0.93
#Converted ../bdtnpPolIIFdr25.wig, upper limit 30.86, lower limit 0.70
#Converted ../bdtnpZ2Fdr1.wig, upper limit 11.49, lower limit 0.95
#Converted ../bdtnpZ2Fdr25.wig, upper limit 11.49, lower limit 0.79
ssh hgwdev
mkdir /gbdb/dm2/bdtnp
ln -s /cluster/data/dm2/bed/bdtnpChipper/wigEncoded/*.wib /gbdb/dm2/bdtnp/
cd /cluster/data/dm2/bed/bdtnpChipper/wigEncoded
foreach f (*.wig)
hgLoadWiggle -pathPrefix=/gbdb/dm2/bdtnp dm2 $f:r $f
end
rm wiggle.tab
# Print out template for 24 subtrack defs (longLabel will need editing):
set pri = 1
foreach f (*.wig)
echo " track $f:r"
echo " subTrack bdtnpChipper"
set fac = `echo $f:r | perl -wpe 's/^bdtnp([A-Za-z]+).*/\l$1/ || die;'`
set FAC = `echo $fac | perl -wpe 's/^(\w+)$/\U$1/;'`
set ab = `echo $f:r | perl -wpe 's/^bdtnp[A-Za-z]+(\d)?F.*/$1/ || die;'`
set fdr = `echo $f:r | perl -wpe 's/.*Fdr(\d+)$/$1/ || die;'`
echo " subGroups fdr=$fdr factor=$FAC"
echo " shortLabel $fac AB $ab FDR $fdr%"
echo " longLabel ($fac) antibody $ab, stage 4-5 embryos, False Discovery Rate (FDR) $fdr%"
if ($fdr == 1) then
echo " color 50,50,200"
else
echo " color 25,150,25"
endif
echo " priority $pri"
echo ""
set pri = `expr $pri + 1`
end
#########################################################################
# BDTNP CHIP-CHIP (DONE 2/2/09 angie)
ssh hgwdev
cd /hive/data/genomes/dm2/bed/bdtnpChipper
wget http://rana.lbl.gov/~simi/wiggle/BDTNP_all_new_020209_1FDR.wig.bz2
wget http://rana.lbl.gov/~simi/wiggle/BDTNP_all_new_020209_25FDR.wig.bz2
# The files contain concatenated custom tracks; split those out into
# one .wig file per track, with UCSC table names:
bzcat BDTNP_all_new* | perl -we \
'my $fh; \
while (<>) { \
- if (/^track .* name="([\w-]+)"/) { \
+ if (/^track .* name="([\w-\.]+)"/) { \
close($fh) if ($fh); \
my $name = $1; \
$name =~ s/_FQ_/_1_/; $name =~ s/_BQ_/_2_/; \
$name =~ s/([a-zA-Z0-9]+)_(H14)?(\d+)?_\d+_(FactorResults\.)?675bp_smoothed_scores-sym-(\d+)/bdtnp\u$1$3Fdr$5/ \
|| die "parse:\n$name\n\t"; \
print "$name\n"; \
open($fh, ">$name.wig") || die "Cant open >$name.wig: $!\n"; \
+ } elsif (/^track /) { \
+ die "missed a track line:\n$_\n\t"; \
} else { \
die "data before track line? at input line $.:\n$_" if (! $fh); \
print $fh $_; \
} \
}'
cd wigEncoded
foreach f (../*Fdr*.wig)
if (! -e $f:t:r.wig) then
wigEncode $f $f:t:r.{wig,wib}
endif
end
#Converted ../bdtnpD1Fdr1.wig, upper limit 20.33, lower limit 0.51
#Converted ../bdtnpD1Fdr25.wig, upper limit 20.33, lower limit 0.51
#Converted ../bdtnpDa2Fdr1.wig, upper limit 26.09, lower limit 0.79
#Converted ../bdtnpDa2Fdr25.wig, upper limit 26.09, lower limit 0.49
#Converted ../bdtnpDl3Fdr1.wig, upper limit 16.97, lower limit 0.65
#Converted ../bdtnpDl3Fdr25.wig, upper limit 16.97, lower limit 0.62
#Converted ../bdtnpFtz3Fdr1.wig, upper limit 16.54, lower limit 1.07
#Converted ../bdtnpFtz3Fdr25.wig, upper limit 16.54, lower limit 0.84
#Converted ../bdtnpH1Fdr1.wig, upper limit 18.52, lower limit 0.68
#Converted ../bdtnpH1Fdr25.wig, upper limit 18.52, lower limit 0.68
#Converted ../bdtnpH2Fdr1.wig, upper limit 13.23, lower limit 0.68
#Converted ../bdtnpH2Fdr25.wig, upper limit 13.23, lower limit 0.65
#Converted ../bdtnpHkb1Fdr1.wig, upper limit 11.84, lower limit 1.03
#Converted ../bdtnpHkb1Fdr25.wig, upper limit 11.84, lower limit 0.35
#Converted ../bdtnpHkb2Fdr1.wig, upper limit 10.73, lower limit 0.87
#Converted ../bdtnpHkb2Fdr25.wig, upper limit 10.73, lower limit 0.33
#Converted ../bdtnpHkb3Fdr1.wig, upper limit 10.50, lower limit 0.90
#Converted ../bdtnpHkb3Fdr25.wig, upper limit 10.50, lower limit 0.37
#Converted ../bdtnpMad2Fdr1.wig, upper limit 12.58, lower limit 0.67
#Converted ../bdtnpMad2Fdr25.wig, upper limit 12.58, lower limit 0.67
+#Converted ../bdtnpMed2Fdr1.wig, upper limit 12.58, lower limit 0.67
+#Converted ../bdtnpMed2Fdr25.wig, upper limit 12.58, lower limit 0.67
#Converted ../bdtnpPrd1Fdr1.wig, upper limit 13.97, lower limit 0.78
#Converted ../bdtnpPrd1Fdr25.wig, upper limit 13.97, lower limit 0.78
#Converted ../bdtnpPrd2Fdr1.wig, upper limit 12.00, lower limit 0.84
#Converted ../bdtnpPrd2Fdr25.wig, upper limit 12.00, lower limit 0.84
#Converted ../bdtnpRun1Fdr1.wig, upper limit 10.02, lower limit 0.60
#Converted ../bdtnpRun1Fdr25.wig, upper limit 10.02, lower limit 0.60
#Converted ../bdtnpRun2Fdr1.wig, upper limit 7.58, lower limit 1.16
#Converted ../bdtnpRun2Fdr25.wig, upper limit 7.58, lower limit 0.58
#Converted ../bdtnpShn2Fdr1.wig, upper limit 12.62, lower limit 1.19
#Converted ../bdtnpShn2Fdr25.wig, upper limit 12.62, lower limit 0.81
#Converted ../bdtnpShn3Fdr1.wig, upper limit 10.28, lower limit 1.09
#Converted ../bdtnpShn3Fdr25.wig, upper limit 10.28, lower limit 0.95
#Converted ../bdtnpSlp11Fdr1.wig, upper limit 40.39, lower limit 1.06
#Converted ../bdtnpSlp11Fdr25.wig, upper limit 40.39, lower limit 0.62
#Converted ../bdtnpSna1Fdr1.wig, upper limit 8.41, lower limit 0.97
#Converted ../bdtnpSna1Fdr25.wig, upper limit 8.41, lower limit 0.77
#Converted ../bdtnpSna2Fdr1.wig, upper limit 14.30, lower limit 0.57
#Converted ../bdtnpSna2Fdr25.wig, upper limit 14.30, lower limit 0.57
#Converted ../bdtnpTFIIB1Fdr1.wig, upper limit 21.81, lower limit 0.59
#Converted ../bdtnpTFIIB1Fdr25.wig, upper limit 21.81, lower limit 0.59
#Converted ../bdtnpTll1Fdr1.wig, upper limit 16.66, lower limit 0.98
#Converted ../bdtnpTll1Fdr25.wig, upper limit 16.66, lower limit 0.57
#Converted ../bdtnpTwi1Fdr1.wig, upper limit 26.11, lower limit 0.77
#Converted ../bdtnpTwi1Fdr25.wig, upper limit 26.11, lower limit 0.68
#Converted ../bdtnpTwi2Fdr1.wig, upper limit 26.66, lower limit 0.79
#Converted ../bdtnpTwi2Fdr25.wig, upper limit 26.66, lower limit 0.55
foreach f (/hive/data/genomes/dm2/bed/bdtnpChipper/wigEncoded/*.wib)
if (! -l /gbdb/dm2/bdtnp/$f:t) then
ln -s $f /gbdb/dm2/bdtnp/
hgLoadWiggle -pathPrefix=/gbdb/dm2/bdtnp dm2 $f:t:r $f:t:r.wig
endif
end
rm wiggle.tab
# Print out template for new subtrack defs (longLabel will need editing):
set pri = 25
foreach f (`ls -l *.wig | grep Feb | awk '{print $9;}'`)
echo " track $f:r"
echo " subTrack bdtnpChipper off"
set fac = `echo $f:r | perl -wpe 's/^bdtnp([A-Za-z]+).*/\l$1/ || die;'`
set FAC = `echo $fac | perl -wpe 's/^(\w+)$/\U$1/;'`
set ab = `echo $f:r | perl -wpe 's/^bdtnp[A-Za-z]+(\d+)?Fdr.*/$1/ || die;'`
set fdr = `echo $f:r | perl -wpe 's/.*Fdr(\d+)$/$1/ || die;'`
echo " subGroups fdr=$fdr factor=$FAC"
echo " shortLabel $fac AB $ab FDR $fdr%"
echo " longLabel ($fac) antibody $ab, stage 4-5 embryos, False Discovery Rate (FDR) $fdr%"
if ($fdr == 1) then
echo " color 50,50,200"
else
echo " color 25,150,25"
endif
echo " priority $pri"
echo ""
set pri = `expr $pri + 1`
end
#########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
dm2.upstreamGeneTbl = refGene
dm2.upstreamMaf = multiz15way /hive/data/genomes/dm2/bed/multiz15way/species.lst