src/hg/makeDb/doc/dm3.txt 1.24
1.24 2009/02/23 23:41:00 angie
Added bdtnpChipper (liftOver from dm2).
Index: src/hg/makeDb/doc/dm3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/dm3.txt,v
retrieving revision 1.23
retrieving revision 1.24
diff -b -B -U 1000000 -r1.23 -r1.24
--- src/hg/makeDb/doc/dm3.txt 27 Oct 2008 20:10:07 -0000 1.23
+++ src/hg/makeDb/doc/dm3.txt 23 Feb 2009 23:41:00 -0000 1.24
@@ -1,2744 +1,2860 @@
# for emacs: -*- mode: sh; -*-
# Drosophila melanogaster -- BDGP Release 5.0 (Apr. 2006)
#########################################################################
# DOWNLOAD SEQUENCE (DONE 7/7/06 angie)
ssh kkstore05
mkdir /cluster/store12/dm3
ln -s /cluster/store12/dm3 /cluster/data/dm3
mkdir /cluster/data/dm3/downloads
cd /cluster/data/dm3/downloads
wget ftp://ftp.fruitfly.org/pub/download/compressed/dmel_release5-2006-04-16.tgz
tar xvzf dmel_release5-2006-04-16.tgz
cd Dmel_Release5
faSize na*
#168717020 bases (6368725 N's 162348295 real 162348295 upper 0 lower) in 14 sequences in 14 files
#Total size: mean 12051215.7 sd 11751902.6 min 204112 (XHet) max 29004656 (ArmUextra) median 10049037
# Follow FlyBase's lead on the chromosome names, but still use our
# "chr" prefix:
mkdir ../renamedChromFa
foreach f (na_*.dmel.RELEASE5)
set chr = `echo $f:r:r | sed -r -e 's/^na_(arm)?/chr/;'`
echo $chr
if (`grep '^>' $f | wc -l` != 1) then
echo $f does not have exactly one fasta record\!
break
endif
sed -e 's/^>.*/>'$chr'/' $f > ../renamedChromFa/$chr.fa
end
cd ../renamedChromFa/
faSize *
#168717020 bases (6368725 N's 162348295 real 162348295 upper 0 lower) in 14 sequences in 14 files
#Total size: mean 12051215.7 sd 11751902.6 min 204112 (chrXHet) max 29004656 (chrUextra) median 10049037
#########################################################################
# MAKE GENOME DB (DONE 7/10/06 angie)
ssh kkstore05
cd /cluster/data/dm3
cat > dm3.config.ra <<EOF
# Config parameters for makeGenomeDb.pl:
db dm3
clade insect
scientificName Drosophila melanogaster
assemblyDate Apr. 2006
assemblyLabel BDGP Release 5
orderKey 50
mitoAcc 5835233
fastaFiles /cluster/data/dm3/downloads/renamedChromFa/*.fa
dbDbSpeciesDir drosophila
fakeAgpMinContigGap 20
fakeAgpMinScaffoldGap 50000
EOF
~/kent/src/utils/makeGenomeDb.pl dm3.config.ra \
>& makeGenomeDb.log & tail -f makeGenomeDb.log
#########################################################################
# REPEATMASKER (DONE 7/31/06 angie - REDONE 6/20/07)
# 6/19/07 -- Debugged the coverage drop w/Robert Hubley. It was due to
# version lag between the NCBI taxonomy label for the Drosophila genus
# vs. the label used in the Library file. I fixed the local lib file,
# and Robert said it will be fixed in the next released lib file.
ssh kkstore05
# Run -debug to create the dir structure and preview the scripts:
~/kent/src/hg/utils/automation/doRepeatMasker.pl dm3 -verbose 3 -debug
# Run it for real and tail the log:
~/kent/src/hg/utils/automation/doRepeatMasker.pl dm3 -verbose 3 \
>& /cluster/data/dm3/bed/RepeatMasker.2007-06-20/do.log &
tail -f /cluster/data/dm3/bed/RepeatMasker.2007-06-20/do.log
# RepeatMasker and lib version from do.log:
# May 17 2007 (open-3-1-8) version of RepeatMasker
#grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl
#CC RELEASE 20061006; *
# Compare coverage to previous assembly:
featureBits dm3 rmsk
#44719009 bases of 162367812 (27.542%) in intersection
featureBits dm2 rmsk
#16178239 bases of 131698467 (12.284%) in intersection
# Wow! What an increase... largely due to the new chrUextra:
featureBits dm3 rmsk -chrom=chrUextra
#20898714 bases of 25541756 (81.822%) in intersection
featureBits dm3 rmsk -chrom=chr3R
#1544878 bases of 27905053 (5.536%) in intersection
featureBits dm2 rmsk -chrom=chr3R
#1538239 bases of 27905053 (5.512%) in intersection
#########################################################################
# SIMPLE REPEATS (TRF) (DONE 8/1/06 angie)
ssh kolossus
nice tcsh
mkdir /cluster/data/dm3/bed/simpleRepeat
cd /cluster/data/dm3/bed/simpleRepeat
twoBitToFa ../../dm3.unmasked.2bit stdout \
| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
-bedAt=simpleRepeat.bed -tempDir=/tmp \
>& trf.log & tail -f trf.log
# Only about 20 minutes, but dm3 is only 168Mbp...
# Make a filtered version for sequence masking:
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
splitFileByColumn trfMask.bed trfMaskChrom
# Load unfiltered repeats into the database:
ssh hgwdev
hgLoadBed dm3 simpleRepeat \
/cluster/data/dm3/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Compare coverage to previous assembly:
featureBits dm3 simpleRepeat
#6837416 bases of 162367812 (4.211%) in intersection
featureBits dm2 simpleRepeat
#1597029 bases of 131698467 (1.213%) in intersection
# Looks like the increase is due to chrUextra:
featureBits dm3 simpleRepeat -chrom=chrUextra
#3871170 bases of 25541756 (15.156%) in intersection
featureBits dm3 simpleRepeat -chrom=chr2L
#225239 bases of 23011344 (0.979%) in intersection
#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 8/1/06 angie - REDONE 6/21/07)
ssh kolossus
cd /cluster/data/dm3
time twoBitMask dm3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed dm3.2bit
# This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has 10 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
#0.136u 0.272s 0:01.98 20.2% 0+0k 0+0io 3pf+0w
# Link to it from /gbdb:
ssh hgwdev
ln -s /cluster/data/dm3/dm3.2bit /gbdb/dm3/dm3.2bit
#########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 8/4/06 angie - REDONE 6/21/07)
cd /cluster/data/dm3
makeDownloads.pl dm3 -verbose=2 >& jkStuff/downloads.log &
tail -f jkStuff/downloads.log
# Edit README.txt files when done:
# *** Edit each README.txt to resolve any notes marked with "***":
# /cluster/data/dm3/goldenPath/database/README.txt
# /cluster/data/dm3/goldenPath/bigZips/README.txt
# /cluster/data/dm3/goldenPath/chromosomes/README.txt
# 6/21/07 -- since I'm regenerating these after the GenBank tables are
# in place... our bigZips/upstream* files are from FlyBase Genes --
# remove the refseq versions generated by the script and restore
# the FlyBase upstream:
ssh hgwdev
rm /cluster/data/dm3/goldenPath/bigZips/upstream*
rm /usr/local/apache/htdocs/goldenPath/dm3/bigZips/upstream*
ln -s /cluster/data/dm3/bed/flybase5.1/bigZips/up* \
/usr/local/apache/htdocs/goldenPath/dm3/bigZips/
cat /cluster/data/dm3/bed/flybase5.1/bigZips/md5sum.txt \
>> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt
# Edited out the old up*.gz sums.
#########################################################################
# BLASTZ/CHAIN/NET DP3 (DONE 8/14/06 angie)
mkdir /cluster/data/dm3/bed/blastz.dp3.2006-08-14
cd /cluster/data/dm3/bed/blastz.dp3.2006-08-14
cat << '_EOF_' > DEF
# D. melanogaster vs. D. pseudoobscura
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm3/dm3.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. pseudoobscura
SEQ2_DIR=/iscratch/i/dp3/nib
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/dp3/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.dp3.2006-08-14
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /cluster/bluearc/dm3dp3 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L chainDp3Link
#17523073 bases of 23011344 (76.150%) in intersection
#dm2: 74.901%
ln -s blastz.dp3.2006-08-14 /cluster/data/dm3/bed/blastz.dp3
#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE 8/18/06 angie - REDONE 6/19/07)
# Use -repMatch=100 (based on size -- for human we use 1024, and
# fly size is ~4.4% of human judging by gapless dm1 genome size from
# featureBits -- we would use 45, but bump that up a bit to be more
# conservative).
ssh kolossus
blat /cluster/data/dm3/dm3.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/dm3/11.ooc -repMatch=100
#Wrote 7657 overused 11-mers to /cluster/bluearc/dm3/11.ooc
# More than twice as many as dm2... probably due to chrUextra.
# 6/19/07 -- That seems to be adversely affecting genbank alignments,
# so bump up repMatch to get a .ooc count similar to dm2's 3031.
# repMatch #overused
# 100 7657
# 150 3709
# 165 3152 <-- close enough
# 180 2688
# 200 2242
blat /cluster/data/dm3/dm3.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/dm3/11.scaled.ooc -repMatch=165
#Wrote 3152 overused 11-mers to /cluster/bluearc/dm3/11.scaled.ooc
# MarkD did a test run, results look good, so I'll replace the
# original .ooc. Also, Mark requested moving the ooc to the same
# dir as the genbank .2bit (iscratch).
mv /cluster/bluearc/dm3/11.scaled.ooc /cluster/bluearc/dm3/11.ooc
foreach i (1 2 3 4 6 7)
rsync -av /cluster/bluearc/dm3/11.ooc kkr${i}u00:/iscratch/i/dm3/
end
# Edited makeDb/genbank/etc/genbank.conf to specify new ooc location.
#########################################################################
# GENBANK AUTO UPDATE (DONE 10/9/06 angie - REDONE 6/20/07 markd)
# 6/20/07 - MarkD rebuilt with regenerated 11.ooc.
# Make a liftAll.lft that specifies 5M chunks for genbank:
ssh kkstore05
cd /cluster/data/dm3
~/kent/src/utils/simplePartition.pl dm3.2bit 5000000 /tmp/dm3P
cat /tmp/dm3P/*/*.lft > jkStuff/liftAll.lft
rm -r /tmp/dm3P
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add dm3 just after dm2...
# dm3 (D. melanogaster)
dm3.serverGenome = /cluster/data/dm3/dm3.2bit
dm3.clusterGenome = /iscratch/i/dm3/dm3.2bit
dm3.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
dm3.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
dm3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
dm3.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
dm3.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
dm3.ooc = /iscratch/i/dm3/11.ooc
dm3.lift = /cluster/data/dm3/jkStuff/liftAll.lft
dm3.genbank.mrna.xeno.load = yes
dm3.downloadDir = dm3
cvs ci -m "Added dm3." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial dm3 &
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad dm3 &
# enable daily alignment and update of hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add dm3 to:
etc/align.dbs
etc/hgwdev.dbs
cvs commit
make etc-update
# 6/20/07 rebuild by MarkD (realign and reload native cDNAs using new
# ooc file):
ssh kkstore02
cd /cluster/data/genbank
rm -f data/aligned/genbank.159.0/dm3/*/*native*
rm -f data/aligned/refseq.23/dm3/*/*native*
./bin/gbAlignStep -initial -orgCat=native dm3
./bin/gbDbLoadStep -drop -initialLoad dm3
#########################################################################
# BACEND PAIRS TRACK (DONE 8/18/06 angie)
ssh kkstore05
# Plenty of room at the moment:
df -h /cluster/data/dm3
# 4.5T 2.1T 2.4T 47% /cluster/store12
mkdir -p /cluster/data/dm3/bed/cloneend/ncbi
cd /cluster/data/dm3/bed/cloneend/ncbi
wget --timestamping \
ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/drosophila_melanogaster/\*
# Wow... the directory exists, but contains no files.
# Use the libs that Roger Hoskins pointed us to for dm2.
mkdir /cluster/data/dm3/bed/bacends
cd /cluster/data/dm3/bed/bacends
wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AL
wget http://www.genoscope.cns.fr/externe/sequences/banque_Projet_AM
faSize *
#40016446 bases (3778317 N's 36238129 real 33260718 upper 2977391 lower) in 41500 sequences in 2 files
#Total size: mean 964.3 sd 202.7 min 1 (AL0AA030DD12A1) max 1373 (AM0AA015AE07BP1) median 1009
#N count: mean 91.0 sd 80.8
#U count: mean 801.5 sd 207.4
#L count: mean 71.7 sd 45.5
cat banque_Projet_A* > bacends.fa
# Edit the <!DOCTYPE.../HTML> part out of bacends.fa.
# In dm2, we parsed bacEndPairs.txt out of sequence headers.
# The sequences have not changed since then, so just copy over
# bacEndPairs.txt.
cp /cluster/data/dm2/bed/bacends/bacEndPairs.txt .
# Fly is much smaller than human so we can get away with one job
# per chrom, no splitting...
ssh kki
mkdir /cluster/bluearc/dm3/bacends
cd /cluster/bluearc/dm3/bacends
cp /cluster/data/dm3/bed/bacends/bacends.fa .
mkdir out
echo bacends.fa > bacends.lst
awk '{print "/iscratch/i/dm3/dm3.2bit:" $1}' /cluster/data/dm3/chrom.sizes \
> dm3.lst
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/blat -noHead $(path1) $(path2) -ooc=/cluster/bluearc/dm3/11.ooc {check out exists out/$(file1).$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
gensub2 dm3.lst bacends.lst gsub jobList
para make jobList
para time
#Completed: 15 of 15 jobs
#CPU time in finished jobs: 1721s 28.68m 0.48h 0.02d 0.000 y
#IO & Wait Time: 23s 0.39m 0.01h 0.00d 0.000 y
#Average job time: 116s 1.94m 0.03h 0.00d
#Longest finished job: 744s 12.40m 0.21h 0.01d
#Submission to last job: 744s 12.40m 0.21h 0.01d
# Back on kolossus:
cd /cluster/bluearc/dm3/bacends
cat out/*.psl \
| pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
stdin bacEnds.unpaired.psl /dev/null
# Roger's suggested 50kb-250kb range lost some (especially on the short
# side) in dm2, so I ended up going with Terry's range for human, 35kb-350kb:
# NOTE FOR NEXT TIME: Use Roger's range -- physical sizes don't exceed 250kb!
pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 \
-slop -short -long -orphan -mismatch -verbose \
bacEnds.unpaired.psl /cluster/data/dm3/bed/bacends/bacEndPairs.txt \
all_bacends bacEnds
wc -l bacEnds.*
# 17 bacEnds.long
# 96 bacEnds.mismatch
# 6935 bacEnds.orphan
# 11957 bacEnds.pairs
# 28 bacEnds.short
# 120 bacEnds.slop
# 186593 bacEnds.unpaired.psl
# Filter by score and sort by {chrom,chromStart}:
awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
cat bacEnds.{slop,short,long,mismatch,orphan} \
| awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
wc -l *.bed
# 11938 bacEndPairs.bed
# 4824 bacEndPairsBad.bed
# Filter the alignments into those we'll need in the all_bacends table:
extractPslLoad -noBin bacEnds.unpaired.psl \
bacEndPairs.bed bacEndPairsBad.bed \
| sorttbl tname tstart | headchg -del \
> bacEnds.load.psl
mv * /cluster/data/dm3/bed/bacends/
# load into database
ssh hgwdev
cd /cluster/data/dm3/bed/bacends
# load BAC end sequences
mkdir -p /gbdb/dm3/bacends
ln -s /cluster/data/dm3/bed/bacends/bacends.fa /gbdb/dm3/bacends/
hgLoadSeq dm3 /gbdb/dm3/bacends/bacends.fa
hgLoadBed dm3 bacEndPairs bacEndPairs.bed \
-notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed dm3 bacEndPairsBad bacEndPairsBad.bed \
-notItemRgb -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
hgLoadPsl dm3 -table=all_bacends bacEnds.load.psl
# Compares favorably to previous release? (good cov up, bad cov down):
# Well, this is way up:
featureBits dm3 all_bacends
#32765057 bases of 162367812 (20.180%) in intersection
featureBits dm2 all_bacends
#22221379 bases of 131698467 (16.873%) in intersection
# The percentage dropped but I think that's because of the increase in
# assembly size (yep, once again blame it on chrUextra :). The absolute
# count is up which seems good.
featureBits dm3 bacEndPairs
#15484492 bases of 162367812 (9.537%) in intersection
featureBits dm2 bacEndPairs
#14771250 bases of 131698467 (11.216%) in intersection
# Clean up.
rm bed.tab
rm -r out
rm -r /cluster/bluearc/dm3/bacends/
# end BACEND PAIRS TRACK
#########################################################################
# GC5BASE (DONE 8/18/06 angie)
# In the future this will be included in the makeGenomeDb.pl run.
# Make gc5Base wiggle files.
ssh kolossus
mkdir -p /cluster/data/dm3/bed/gc5Base
cd /cluster/data/dm3/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=0 dm3 \
/cluster/data/dm3/dm3.unmasked.2bit \
| wigEncode stdin gc5Base.{wig,wib}
ssh hgwdev
# Load gc5base
mkdir -p /gbdb/dm3/wib
rm -f /gbdb/dm3/wib/gc5Base.wib
ln -s /cluster/data/dm3/bed/gc5Base.wib /gbdb/dm3/wib
hgLoadWiggle -pathPrefix=/gbdb/dm3/wib \
dm3 gc5Base /cluster/data/dm3/bed/gc5Base/gc5Base.wig
#########################################################################
# BLASTZ/CHAIN/NET DROSIM1 (DONE 8/18/06 angie)
mkdir /cluster/data/dm3/bed/blastz.droSim1.2006-08-18
cd /cluster/data/dm3/bed/blastz.droSim1.2006-08-18
cat << '_EOF_' > DEF
# D. melanogaster vs. D. simulans
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. simulans
SEQ2_DIR=/san/sanvol1/scratch/droSim1/droSim1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droSim1/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droSim1.2006-08-18
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -bigClusterHub=pk -smallClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/dm3droSim1 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L chainDroSim1Link
#21667700 bases of 23011344 (94.161%) in intersection
# dm2: 91.756%
rm -f /cluster/data/dm3/bed/blastz.droSim1
ln -s blastz.droSim1.2006-08-18 /cluster/data/dm3/bed/blastz.droSim1
#########################################################################
# BLASTZ/CHAIN/NET DROYAK2 (DONE 8/18/06 angie)
mkdir /cluster/data/dm3/bed/blastz.droYak2.2006-08-18
cd /cluster/data/dm3/bed/blastz.droYak2.2006-08-18
cat << '_EOF_' > DEF
# D. melanogaster vs. D. yakuba
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. yakuba
SEQ2_DIR=/san/sanvol1/scratch/droYak2/droYak2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droYak2/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droYak2.2006-08-18
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -bigClusterHub=pk -smallClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/dm3droYak2 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L chainDroYak2Link
#21965072 bases of 23011344 (95.453%) in intersection
# dm2: 91.943%
rm -f /cluster/data/dm3/bed/blastz.droYak2
ln -s blastz.droYak2.2006-08-18 /cluster/data/dm3/bed/blastz.droYak2
#########################################################################
# BLASTZ/CHAIN/NET DROSEC1 (DONE 8/21/06 angie)
mkdir /cluster/data/dm3/bed/blastz.droSec1.2006-08-18
cd /cluster/data/dm3/bed/blastz.droSec1.2006-08-18
cat << '_EOF_' > DEF
# D. melanogaster vs. D. sechellia
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. sechellia
SEQ2_DIR=/san/sanvol1/scratch/droSec1/droSec1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droSec1/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droSec1.2006-08-18
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -bigClusterHub=pk -smallClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/dm3droSec1 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L chainDroSec1Link
#21913800 bases of 23011344 (95.230%) in intersection
# dm2: 92.860%
rm -f /cluster/data/dm3/bed/blastz.droSec1
ln -s blastz.droSec1.2006-08-18 /cluster/data/dm3/bed/blastz.droSec1
#########################################################################
# BLASTZ/CHAIN/NET DROERE1 (DONE 8/22/06 angie)
mkdir /cluster/data/dm3/bed/blastz.droEre1.2006-08-21
cd /cluster/data/dm3/bed/blastz.droEre1.2006-08-21
cat << '_EOF_' > DEF
# D. melanogaster vs. D. sechellia
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. sechellia
SEQ2_DIR=/san/sanvol1/scratch/droEre1/droEre1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droEre1/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droEre1.2006-08-21
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -bigClusterHub=pk -smallClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/dm3droEre1 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L chainDroEre1Link
#21706597 bases of 23011344 (94.330%) in intersection
# dm2: 91.103%
rm -f /cluster/data/dm3/bed/blastz.droEre1
ln -s blastz.droEre1.2006-08-21 /cluster/data/dm3/bed/blastz.droEre1
# BLASTZ/CHAIN/NET DROANA2 (DONE 8/22/06 angie)
mkdir /cluster/data/dm3/bed/blastz.droAna2.2006-08-22
cd /cluster/data/dm3/bed/blastz.droAna2.2006-08-22
cat << '_EOF_' > DEF
# D. melanogaster vs. D. ananassae
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/dm3.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. ananassae
SEQ2_DIR=/san/sanvol1/scratch/droAna2/droAna2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droAna2/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droAna2.2006-08-22
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -bigClusterHub pk -smallClusterHub pk \
-blastzOutRoot /san/sanvol1/scratch/dm3droAna2 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L chainDroAna2Link
#19329381 bases of 23011344 (83.999%) in intersection
#dm3: 81.385%
rm -f /cluster/data/dm3/bed/blastz.droAna2
ln -s blastz.droAna2.2006-08-22 /cluster/data/dm3/bed/blastz.droAna2
#########################################################################
# LIFTOVER FLYBASE 4.2 ANNOTATIONS FROM DM1 (TEMPORARY) (DONE 8/18/06 angie)
# Until FlyBase releases Release 5 annotations, liftOver'd 4.2 genes
# will be better than nothing.
ssh hgwdev
mkdir /cluster/data/dm3/bed/flybase4.2.liftOver
cd /cluster/data/dm3/bed/flybase4.2.liftOver
hgsql dm2 -N -e 'select * from flyBaseGene' > dm2.flyBaseGene.tab
hgsql dm2 -N -e 'select * from flyBaseNoncoding' > dm2.flyBaseNoncoding.tab
liftOver -genePred dm2.flyBaseGene.tab \
/cluster/data/dm2/bed/liftOver/dm2ToDm3.over.chain.gz \
flyBaseLiftGene.tab flyBaseLiftGene.unmapped.tab
liftOver -bedPlus=12 -hasBin dm2.flyBaseNoncoding.tab \
/cluster/data/dm2/bed/liftOver/dm2ToDm3.over.chain.gz \
flyBaseLiftNoncoding.tab flyBaseLiftNoncoding.unmapped.tab
# Not quite perfect but not bad:
wc -l *.tab
# 19178 dm2.flyBaseGene.tab
# 727 dm2.flyBaseNoncoding.tab
# 19173 flyBaseLiftGene.tab
# 10 flyBaseLiftGene.unmapped.tab
# 727 flyBaseLiftNoncoding.tab
# 0 flyBaseLiftNoncoding.unmapped.tab
ldHgGene -predTab dm3 flyBaseLiftGene flyBaseLiftGene.tab
hgLoadBed -hasBin dm3 flyBaseLiftNoncoding flyBaseLiftNoncoding.tab
# Copy pep table from dm2:
hgPepPred dm3 tab flyBaseLiftGenePep \
/cluster/data/dm2/bed/flybase4.2/flyBasePep.tab
###########################################################################
# BLASTZ/CHAIN/NET DROSIM1 (DONE 11/14/06 angie)
ssh kkstore04
mkdir /cluster/data/dm3/bed/blastz.droSim1.2006-11-14
cd /cluster/data/dm3/bed/blastz.droSim1.2006-11-14
cat << '_EOF_' > DEF
# D. melanogaster vs. D. simulans
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. simulans
SEQ2_DIR=/san/sanvol1/scratch/droSim1/droSim1.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droSim1/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droSim1.2006-11-14
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-bigClusterHub pk -blastzOutRoot /cluster/bluearc/dm3droSim1 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroSim1Link
#flyBaseLiftGene 22.721%, chainDroSim1Link 94.107%, both 22.094%, cover 97.24%, enrich 1.03x
rm -f /cluster/data/dm3/bed/blastz.droSim1
ln -s blastz.droSim1.2006-11-14 /cluster/data/dm3/bed/blastz.droSim1
###########################################################################
# BLASTZ/CHAIN/NET DROSEC1 (DONE 11/29/06 angie)
ssh kkstore04
mkdir /cluster/data/dm3/bed/blastz.droSec1.2006-11-28
cd /cluster/data/dm3/bed/blastz.droSec1.2006-11-28
cat << '_EOF_' > DEF
# D. melanogaster vs. D. sechellia
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. sechellia
SEQ2_DIR=/san/sanvol1/scratch/droSec1/droSec1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/san/sanvol1/scratch/droSec1/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droSec1.2006-11-28
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-bigClusterHub pk -blastzOutRoot /cluster/bluearc/dm3droSec1 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroSec1Link
#flyBaseLiftGene 22.721%, chainDroSec1Link 95.197%, both 22.491%, cover 98.99%, enrich 1.04x
rm -f /cluster/data/dm3/bed/blastz.droSec1
ln -s blastz.droSec1.2006-11-28 /cluster/data/dm3/bed/blastz.droSec1
###########################################################################
# BLASTZ/CHAIN/NET DROYAK2 (DONE 11/15/06 angie)
ssh kkstore04
mkdir /cluster/data/dm3/bed/blastz.droYak2.2006-11-15
cd /cluster/data/dm3/bed/blastz.droYak2.2006-11-15
cat << '_EOF_' > DEF
# D. melanogaster vs. D. yakuba
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. yakuba
SEQ2_DIR=/san/sanvol1/scratch/droYak2/droYak2.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droYak2/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droYak2.2006-11-15
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-bigClusterHub pk -blastzOutRoot /cluster/bluearc/dm3droYak2 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroYak2Link
#flyBaseLiftGene 22.721%, chainDroYak2Link 95.403%, both 22.489%, cover 98.98%, enrich 1.04x
rm -f /cluster/data/dm3/bed/blastz.droYak2
ln -s blastz.droYak2.2006-11-15 /cluster/data/dm3/bed/blastz.droYak2
###########################################################################
# BLASTZ/CHAIN/NET DROPER1 (DONE 11/20/06 angie)
ssh kkstore04
mkdir /cluster/data/dm3/bed/blastz.droPer1.2006-11-15
cd /cluster/data/dm3/bed/blastz.droPer1.2006-11-15
cat << '_EOF_' > DEF
# D. melanogaster vs. D. persimilis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. persimilis
SEQ2_DIR=/san/sanvol1/scratch/droPer1/droPer1.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/san/sanvol1/scratch/droPer1/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droPer1.2006-11-15
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-bigClusterHub pk -blastzOutRoot /cluster/bluearc/dm3droPer1 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroPer1Link
#flyBaseLiftGene 22.721%, chainDroPer1Link 76.570%, both 20.929%, cover 92.11%, enrich 1.20x
rm -f /cluster/data/dm3/bed/blastz.droPer1
ln -s blastz.droPer1.2006-11-15 /cluster/data/dm3/bed/blastz.droPer1
###########################################################################
# BLASTZ/CHAIN/NET DROERE2 (DONE 11/30/06 angie)
ssh kkstore04
mkdir /cluster/data/dm3/bed/blastz.droEre2.2006-11-29
cd /cluster/data/dm3/bed/blastz.droEre2.2006-11-29
cat << '_EOF_' > DEF
# D. melanogaster vs. D. erecta
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. erecta
SEQ2_DIR=/san/sanvol1/scratch/droEre2/droEre2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droEre2/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droEre2.2006-11-29
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-bigClusterHub pk -smallClusterHub pk \
-blastzOutRoot /cluster/bluearc/dm3droEre2 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroEre2Link
#flyBaseLiftGene 22.721%, chainDroEre2Link 94.227%, both 22.472%, cover 98.90%, enrich 1.05x
rm -f /cluster/data/dm3/bed/blastz.droEre2
ln -s blastz.droEre2.2006-11-29 /cluster/data/dm3/bed/blastz.droEre2
###########################################################################
# BLASTZ/CHAIN/NET DROGRI2 (DONE 12/1/06 angie)
ssh kkstore04
mkdir /cluster/data/dm3/bed/blastz.droGri2.2006-11-30
cd /cluster/data/dm3/bed/blastz.droGri2.2006-11-30
cat << '_EOF_' > DEF
# D. melanogaster vs. D. grimshawi
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. grimshawi
SEQ2_DIR=/san/sanvol1/scratch/droGri2/droGri2.2bit
SEQ2_CHUNK=50000000
SEQ2_LAP=10000
SEQ2_LEN=/san/sanvol1/scratch/droGri2/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droGri2.2006-11-30
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-bigClusterHub pk -smallClusterHub pk \
-blastzOutRoot /cluster/bluearc/dm3droGri2 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroGri2Link
#flyBaseLiftGene 22.721%, chainDroGri2Link 66.170%, both 20.274%, cover 89.23%, enrich 1.35x
rm -f /cluster/data/dm3/bed/blastz.droGri2
ln -s blastz.droGri2.2006-11-30 /cluster/data/dm3/bed/blastz.droGri2
###########################################################################
# BLASTZ/CHAIN/NET DROWIL1 (DONE 12/4/06 angie)
ssh kkstore04
mkdir /cluster/data/dm3/bed/blastz.droWil1.2006-12-04
cd /cluster/data/dm3/bed/blastz.droWil1.2006-12-04
cat << '_EOF_' > DEF
# D. melanogaster vs. D. willistoni
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. willistoni
SEQ2_DIR=/san/sanvol1/scratch/droWil1/droWil1.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/san/sanvol1/scratch/droWil1/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droWil1.2006-12-04
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-bigClusterHub pk -smallClusterHub pk \
-blastzOutRoot /cluster/bluearc/dm3droWil1 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroWil1Link
#flyBaseLiftGene 22.721%, chainDroWil1Link 73.133%, both 20.583%, cover 90.59%, enrich 1.24x
rm -f /cluster/data/dm3/bed/blastz.droWil1
ln -s blastz.droWil1.2006-12-04 /cluster/data/dm3/bed/blastz.droWil1
###########################################################################
# BLASTZ/CHAIN/NET DROANA3 (DONE 12/4/06 angie)
ssh kkstore04
mkdir /cluster/data/dm3/bed/blastz.droAna3.2006-12-04
cd /cluster/data/dm3/bed/blastz.droAna3.2006-12-04
cat << '_EOF_' > DEF
# D. melanogaster vs. D. ananassae
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. ananassae
SEQ2_DIR=/san/sanvol1/scratch/droAna3/droAna3.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/san/sanvol1/scratch/droAna3/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droAna3.2006-12-04
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-bigClusterHub pk -smallClusterHub pk \
-blastzOutRoot /cluster/bluearc/dm3droAna3 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroAna3Link
#flyBaseLiftGene 22.721%, chainDroAna3Link 83.841%, both 21.651%, cover 95.29%, enrich 1.14x
rm -f /cluster/data/dm3/bed/blastz.droAna3
ln -s blastz.droAna3.2006-12-04 /cluster/data/dm3/bed/blastz.droAna3
###########################################################################
# BLASTZ/CHAIN/NET DROMOJ3 (DONE 12/4/06 angie)
ssh kkstore04
mkdir /cluster/data/dm3/bed/blastz.droMoj3.2006-12-04
cd /cluster/data/dm3/bed/blastz.droMoj3.2006-12-04
cat << '_EOF_' > DEF
# D. melanogaster vs. D. mojavensis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm3/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. mojavensis
SEQ2_DIR=/iscratch/i/droMoj3/droMoj3.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droMoj3/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droMoj3.2006-12-04
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-blastzOutRoot /panasas/store/dm3droMoj3 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroMoj3Link
#flyBaseLiftGene 22.721%, chainDroMoj3Link 65.253%, both 20.171%, cover 88.78%, enrich 1.36x
rm -f /cluster/data/dm3/bed/blastz.droMoj3
ln -s blastz.droMoj3.2006-12-04 /cluster/data/dm3/bed/blastz.droMoj3
###########################################################################
# BLASTZ/CHAIN/NET DROVIR3 (DONE 12/4/06 angie)
ssh kkstore04
mkdir /cluster/data/dm3/bed/blastz.droVir3.2006-12-04
cd /cluster/data/dm3/bed/blastz.droVir3.2006-12-04
cat << '_EOF_' > DEF
# D. melanogaster vs. D. virilis
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm3/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. virilis
SEQ2_DIR=/iscratch/i/droVir3/droVir3.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droVir3/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.droVir3.2006-12-04
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-blastzOutRoot /cluster/bluearc/dm3droVir3 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDroVir3Link
#flyBaseLiftGene 22.721%, chainDroVir3Link 67.299%, both 20.340%, cover 89.52%, enrich 1.33x
rm -f /cluster/data/dm3/bed/blastz.droVir3
ln -s blastz.droVir3.2006-12-04 /cluster/data/dm3/bed/blastz.droVir3
###########################################################################
# BLASTZ/CHAIN/NET DP4 (DONE 12/4/06 angie)
ssh kkstore04
mkdir /cluster/data/dm3/bed/blastz.dp4.2006-12-04
cd /cluster/data/dm3/bed/blastz.dp4.2006-12-04
cat << '_EOF_' > DEF
# D. melanogaster vs. D. pseudoobscura
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - D. pseudoobscura
SEQ2_DIR=/san/sanvol1/scratch/dp4/dp4.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/dp4/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.dp4.2006-12-04
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-blastzOutRoot /san/sanvol1/scratch/dm3dp4 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainDp4Link
#flyBaseLiftGene 22.721%, chainDp4Link 76.299%, both 20.966%, cover 92.28%, enrich 1.21x
rm -f /cluster/data/dm3/bed/blastz.dp4
ln -s blastz.dp4.2006-12-04 /cluster/data/dm3/bed/blastz.dp4
###########################################################################
# BLASTZ/CHAIN/NET ANOGAM1 (DONE 12/5/06)
ssh kkstore04
mkdir /cluster/data/dm3/bed/blastz.anoGam1.2006-12-04
cd /cluster/data/dm3/bed/blastz.anoGam1.2006-12-04
cat <<_EOF_ > DEF
# D. melanogaster vs. A. gambiae
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm3/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - A. gambiae
SEQ2_DIR=/iscratch/i/anoGam1/nib
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/anoGam1/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.anoGam1.2006-12-04
_EOF_
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-blastzOutRoot /cluster/bluearc/dm3anoGam1 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainAnoGam1Link
#flyBaseLiftGene 22.721%, chainAnoGam1Link 19.536%, both 12.711%, cover 55.94%, enrich 2.86x
rm -f /cluster/data/dm3/bed/blastz.anoGam1
ln -s blastz.anoGam1.2006-12-04 /cluster/data/dm3/bed/blastz.anoGam1
###########################################################################
# BLASTZ/CHAIN/NET APIMEL3 (DONE 7/5/07 except for loading of netApiMel3)
ssh kkstore05
mkdir /cluster/data/dm3/bed/blastz.apiMel3.2006-12-11
cd /cluster/data/dm3/bed/blastz.apiMel3.2006-12-11
cat <<_EOF_ > DEF
# D. melanogaster vs. A. mellifera
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - D. melanogaster
SEQ1_DIR=/san/sanvol1/scratch/dm3/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - A. mellifera
SEQ2_DIR=/san/sanvol1/scratch/apiMel3/apiMel3.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/apiMel3/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.apiMel3.2006-12-11
_EOF_
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-bigClusterHub pk -smallClusterHub pk \
-blastzOutRoot /cluster/bluearc/dm3apiMel3 >& do.log &
tail -f do.log
# Died in loading phase -- netClass failed because there is no apiMel3 db!
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainApiMel3Link
#flyBaseLiftGene 22.721%, chainApiMel3Link 17.865%, both 8.790%, cover 38.68%, enrich 2.17x
rm -f /cluster/data/dm3/bed/blastz.apiMel3
ln -s blastz.apiMel3.2006-12-11 /cluster/data/dm3/bed/blastz.apiMel3
# 7/5/07: continue to make download files.
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-bigClusterHub pk -smallClusterHub pk \
-blastzOutRoot /cluster/bluearc/dm3apiMel3 \
-continue download >>& do.log &
tail -f do.log
###########################################################################
# BLASTZ/CHAIN/NET TRICAS2 (DONE 12/11/06)
ssh kkstore04
mkdir /cluster/data/dm3/bed/blastz.triCas2.2006-12-11
cd /cluster/data/dm3/bed/blastz.triCas2.2006-12-11
cat <<_EOF_ > DEF
# D. melanogaster vs. T. castaneum
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm3/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm3/chrom.sizes
# QUERY - T. castaneum
SEQ2_DIR=/iscratch/i/triCas2/triCas2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/iscratch/i/triCas2/chrom.sizes
BASE=/cluster/data/dm3/bed/blastz.triCas2.2006-12-11
_EOF_
doBlastzChainNet.pl DEF -chainMinScore 3000 \
-blastzOutRoot /cluster/bluearc/dm3triCas2 >& do.log &
tail -f do.log
featureBits dm3 -chrom=chr2L -enrichment flyBaseLiftGene chainTriCas2Link
#flyBaseLiftGene 22.721%, chainTriCas2Link 20.039%, both 10.463%, cover 46.05%, enrich 2.30x
rm -f /cluster/data/dm3/bed/blastz.triCas2
ln -s blastz.triCas2.2006-12-11 /cluster/data/dm3/bed/blastz.triCas2
###########################################################################
# MULTIZ15WAY (DONE 12/11/06 angie)
# (((((((((dm3,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel3),triCas2)
ssh kkstore04
mkdir /cluster/data/dm3/bed/multiz15way.2006-12-11
cd /cluster/data/dm3/bed/multiz15way.2006-12-11
# copy MAF's to cluster-friendly server
mkdir /san/sanvol1/scratch/dm3/mafNet
foreach s (droSim1 droSec1 droYak2 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 anoGam1 apiMel3 triCas2)
echo $s
rsync -av /cluster/data/dm3/bed/blastz.$s/mafNet/* \
/san/sanvol1/scratch/dm3/mafNet/$s/
end
echo '(((((((((dm3,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel3),triCas2)' \
> tree-commas.nh
sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst
ssh pk
cd /cluster/data/dm3/bed/multiz15way.2006-12-11
mkdir maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = dm3
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/mafNet
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/dm3/bed/multiz15way.2006-12-11/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << emacs
awk '{print $1}' /cluster/data/dm3/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para make jobList
para time
#Completed: 15 of 15 jobs
#CPU time in finished jobs: 44158s 735.97m 12.27h 0.51d 0.001 y
#IO & Wait Time: 320s 5.33m 0.09h 0.00d 0.000 y
#Average job time: 2965s 49.42m 0.82h 0.03d
#Longest finished job: 9309s 155.15m 2.59h 0.11d
#Submission to last job: 9309s 155.15m 2.59h 0.11d
# ANNOTATE MULTIZ15WAY MAF AND LOAD TABLES (DONE 12/11/2006 angie)
ssh kolossus
mkdir /cluster/data/dm3/bed/multiz15way.2006-12-11/anno
cd /cluster/data/dm3/bed/multiz15way.2006-12-11/anno
mkdir maf run
cd run
rm -f sizes nBeds
foreach db (`cat /cluster/data/dm3/bed/multiz15way.2006-12-11/species.lst`)
ln -s /cluster/data/$db/chrom.sizes $db.len
if (! -e /cluster/data/$db/$db.N.bed) then
twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
endif
ln -s /cluster/data/$db/$db.N.bed $db.bed
echo $db.bed >> nBeds
echo $db.len >> sizes
end
echo date > jobs.csh
# do smaller jobs first:
foreach f (`ls -1rS ../../maf/*.maf`)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
/cluster/data/dm3/dm3.2bit ../maf/`basename $f` >> jobs.csh
end
echo date >> jobs.csh
csh -efx jobs.csh >&! jobs.log & tail -f jobs.log
# 5min.
# Load anno/maf
ssh hgwdev
cd /cluster/data/dm3/bed/multiz15way.2006-12-11/anno/maf
mkdir -p /gbdb/dm3/multiz15way/anno/maf
ln -s /cluster/data/dm3/bed/multiz15way.2006-12-11/anno/maf/*.maf \
/gbdb/dm3/multiz15way/anno/maf
date
nice hgLoadMaf -pathPrefix=/gbdb/dm3/multiz15way/anno/maf dm3 multiz15way
date
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kolossus
cd /cluster/data/dm3/bed/multiz15way.2006-12-11/anno/maf
cat *.maf \
| nice hgLoadMafSummary dm3 -minSize=30000 -mergeGap=1500 -maxSize=200000 \
-test multiz15waySummary stdin
#Created 171317 summary blocks from 13563512 components and 1633505 mafs from stdin
ssh hgwdev
cd /cluster/data/dm3/bed/multiz15way.2006-12-11/anno/maf
sed -e 's/mafSummary/multiz15waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
> /tmp/multiz15waySummary.sql
time nice hgLoadSqlTab dm3 multiz15waySummary /tmp/multiz15waySummary.sql \
multiz15waySummary.tab
#0.000u 0.002s 0:02.59 0.0% 0+0k 0+0io 2pf+0w
rm *.tab
ln -s multiz15way.2006-12-11 /cluster/data/dm3/bed/multiz15way
# MULTIZ15WAY DOWNLOADABLES (ANNOTATED) (DONE 12/11/2006 angie -- UPSTREAM REDONE 5/18/08, 10/27/08)
# 5/18/08: regenerating upstream* to fix bug (truncated refGene names to
# NM_[0-9]{6} but there are some valid NM_[0-9]{9} IDs now).
# 10/27/08: regenerating with FlyBase Genes and mafFrags -txStarts.
# Annotated MAF is now documented, so use anno/maf for downloads.
ssh hgwdev
mkdir /cluster/data/dm3/bed/multiz15way.2006-12-11/mafDownloads
cd /cluster/data/dm3/bed/multiz15way.2006-12-11/mafDownloads
# upstream mafs
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits dm3 flyBaseGene:upstream:$i -fa=/dev/null -bed=stdout \
| sed -re 's/ [^\t]*\t/\t/;' > up.bed
nice mafFrags -txStarts dm3 multiz15way up.bed upstream$i.maf \
-orgs=../species.lst
rm up.bed
end
date
'EOF'
# << emacs
time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
#171.365u 94.950s 21:16.74 20.8% 0+0k 0+0io 0pf+0w
ssh kolossus
cd /cluster/data/dm3/bed/multiz15way.2006-12-11/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../anno/maf/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
# << emacs
time csh -efx downloads.csh >&! downloads.log
#459.706u 5.793s 8:23.71 92.4% 0+0k 0+0io 0pf+0w
nice gzip up*.maf
nice md5sum up*.maf.gz >> md5sum.txt
# Make a README.txt .
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/dm3/multiz15way
mkdir $dir
ln -s /cluster/data/dm3/bed/multiz15way.2006-12-11/mafDownloads/*.{gz,txt} $dir
# MULTIZ15WAY MAF FRAMES (DONE 12/11/2006 angie)
ssh hgwdev
mkdir /cluster/data/dm3/bed/multiz15way.2006-12-11/frames
cd /cluster/data/dm3/bed/multiz15way.2006-12-11/frames
# The following is adapted from MarkD's Makefile used for mm7...
#------------------------------------------------------------------------
# get the genes for all genomes (when available)
# currently we have nothing for:
# droSec1 droEre2 droAna3 dp4 droPer1 droWil1 droVir3 droMoj3 droGri2 triCas2
# mRNAs with CDS. single select to get cds+psl, then split that up and
# create genePred
# using mrna table as genes: droSim1 droYak2 anoGam1
mkdir genes
foreach queryDb (droSim1 droYak2 anoGam1)
set tmpExt = `mktemp temp.XXXXXX`
set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
set tmpMrna = ${queryDb}.mrna.${tmpExt}
set tmpCds = ${queryDb}.cds.${tmpExt}
echo $queryDb
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
${queryDb} > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
stdout \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
# using refGene for dm3
# genePreds; (must keep only the first 10 columns for knownGene)
set gpFields = 'name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds'
foreach queryDb (dm3)
if ($queryDb == "dm3") then
set geneTbl = refGene
endif
hgsql -N -e "select $gpFields from $geneTbl" ${queryDb} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
# MarkD's 1-pass maf methodology
ssh kolossus
cd /cluster/data/dm3/bed/multiz15way/frames
nice tcsh # easy way to get process niced
(cat ../anno/maf/*.maf \
| time genePredToMafFrames dm3 stdin stdout \
dm3 genes/dm3.gp.gz \
droSim1 genes/droSim1.gp.gz droYak2 genes/droYak2.gp.gz \
anoGam1 genes/anoGam1.gp.gz \
| gzip -c > multiz15way.mafFrames.gz) >& frames.log
ssh hgwdev
cd /cluster/data/dm3/bed/multiz15way/frames
hgLoadMafFrames dm3 multiz15wayFrames multiz15way.mafFrames.gz
# PHASTCONS 15WAY (DONE 12/11/06 angie)
# (((((((((dm3,(droSim1,droSec1)),(droYak2,droEre2)),droAna3),(dp4,droPer1)),droWil1),((droVir3,droMoj3),droGri2)),anoGam1),apiMel3),triCas2)
# Estimation for dm2 phastCons w/same tree took a long time, and didn't
# lead to any real change in params (thank god no iteration req'd).
# So skip estimation and use the tree from dm2.
ssh kkstore04
mkdir -p /san/sanvol1/scratch/dm3/chrom
foreach chr (`awk '{print $1;}' /cluster/data/dm3/chrom.sizes`)
twoBitToFa -seq=$chr /cluster/data/dm3/dm3.2bit \
/san/sanvol1/scratch/dm3/chrom/$chr.fa
end
# Split chrom fa into smaller windows for phastCons:
ssh pk
mkdir /cluster/data/dm3/bed/multiz15way/phastCons
mkdir /cluster/data/dm3/bed/multiz15way/phastCons/run.split
cd /cluster/data/dm3/bed/multiz15way/phastCons/run.split
set WINDOWS = /san/sanvol1/scratch/dm3/phastCons/WINDOWS
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.sh
#!/bin/csh -ef
set PHAST=/cluster/bin/phast
set FA_SRC=/san/sanvol1/scratch/dm3/chrom
set WINDOWS=/san/sanvol1/scratch/dm3/phastCons/WINDOWS
set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
-O dm3,droSim1,droSec1,droYak2,droEre2,droAna3,dp4,droPer1,droWil1,droVir3,droMoj3,droGri2,anoGam1,apiMel3,triCas2 \
-w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
chmod a+x doSplit.sh
rm -f jobList
foreach file (/cluster/data/dm3/bed/multiz15way/maf/*.maf)
if (-s $file) then
echo "doSplit.sh {check in exists+ $file}" >> jobList
endif
end
para make jobList
para time
#Completed: 15 of 15 jobs
#CPU time in finished jobs: 1728s 28.80m 0.48h 0.02d 0.000 y
#IO & Wait Time: 1578s 26.30m 0.44h 0.02d 0.000 y
#Average job time: 220s 3.67m 0.06h 0.00d
#Longest finished job: 1946s 32.43m 0.54h 0.02d
#Submission to last job: 1962s 32.70m 0.55h 0.02d
############### SKIP PARAMETER ESTIMATION #############
mkdir /cluster/data/dm3/bed/multiz15way/phastCons/run.estimate
cd /cluster/data/dm3/bed/multiz15way/phastCons/run.estimate
foreach mod (/cluster/data/dm2/bed/multiz15way/phastCons/run.estimate/ave*.mod)
sed -e 's/dm2/dm3/g; s/apiMel2/apiMel3/g;' $mod > $mod:t
end
############### COMPUTE SCORES AND ELEMENTS #############
ssh pk
mkdir /cluster/data/dm3/bed/multiz15way/phastCons/run.phast
cd /cluster/data/dm3/bed/multiz15way/phastCons/run.phast
cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
--expected-lengths 23.8 --target-coverage 0.393 \
--quiet --seqname $chr --idpref $pref \
--viterbi /san/sanvol1/scratch/dm3/phastCons/ELEMENTS/$pref.bed --score \
--require-informative 0 \
> $tmpfile
gzip -c $tmpfile > /san/sanvol1/scratch/dm3/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
chmod a+x doPhastCons.sh
rm -fr /san/sanvol1/scratch/dm3/phastCons/{POSTPROBS,ELEMENTS}
mkdir -p /san/sanvol1/scratch/dm3/phastCons/{POSTPROBS,ELEMENTS}
rm -f jobList
foreach f (/san/sanvol1/scratch/dm3/phastCons/WINDOWS/*.ss.gz)
echo doPhastCons.sh $f >> jobList
end
para make jobList
para time
#Completed: 178 of 178 jobs
#CPU time in finished jobs: 1836s 30.60m 0.51h 0.02d 0.000 y
#IO & Wait Time: 16855s 280.92m 4.68h 0.20d 0.001 y
#Average job time: 105s 1.75m 0.03h 0.00d
#Longest finished job: 544s 9.07m 0.15h 0.01d
#Submission to last job: 579s 9.65m 0.16h 0.01d
# back on kolossus:
# combine predictions and transform scores to be in 0-1000 interval
cd /cluster/data/dm3/bed/multiz15way/phastCons
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
/san/sanvol1/scratch/dm3/phastCons/ELEMENTS/*.bed \
| /cluster/bin/scripts/lodToBedScore > all.bed
ssh hgwdev
# Now measure coverage of CDS by conserved elements.
# We want the "cover" figure to be close to 68.9%.
cd /cluster/data/dm3/bed/multiz15way/phastCons
featureBits -enrichment dm3 flyBaseLiftGene:cds all.bed
# 69.26... not exactly our 68.9% target, but close enough.
#flyBaseLiftGene:cds 13.517%, all.bed 32.328%, both 9.361%, cover 69.26%, enrich 2.14x
# Having met the CDS coverage target, load up the results.
hgLoadBed dm3 phastConsElements15way all.bed
# Create wiggle
ssh pk
mkdir /cluster/data/dm3/bed/multiz15way/phastCons/run.wib
cd /cluster/data/dm3/bed/multiz15way/phastCons/run.wib
rm -rf /san/sanvol1/scratch/dm3/phastCons/wib
mkdir -p /san/sanvol1/scratch/dm3/phastCons/wib
cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /san/sanvol1/scratch/dm3/phastCons/wib
zcat `ls -1 /san/sanvol1/scratch/dm3/phastCons/POSTPROBS/$chr.*.pp.gz \
| sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
chmod a+x doWigEncode
rm -f jobList
foreach chr (`ls -1 /san/sanvol1/scratch/dm3/phastCons/POSTPROBS \
| awk -F\. '{print $1}' | sort -u`)
echo doWigEncode $chr >> jobList
end
para make jobList
para time
#Completed: 15 of 15 jobs
#CPU time in finished jobs: 92s 1.53m 0.03h 0.00d 0.000 y
#IO & Wait Time: 71s 1.19m 0.02h 0.00d 0.000 y
#Average job time: 11s 0.18m 0.00h 0.00d
#Longest finished job: 24s 0.40m 0.01h 0.00d
#Submission to last job: 24s 0.40m 0.01h 0.00d
# back on kkstore04, copy wibs, wigs and POSTPROBS (people sometimes want
# the raw scores) from san/sanvol1
cd /cluster/data/dm3/bed/multiz15way/phastCons
rm -rf wib POSTPROBS
rsync -av /san/sanvol1/scratch/dm3/phastCons/wib .
rsync -av /san/sanvol1/scratch/dm3/phastCons/POSTPROBS .
# load wiggle component of Conservation track
ssh hgwdev
mkdir /gbdb/dm3/multiz15way/wib
cd /cluster/data/dm3/bed/multiz15way/phastCons
chmod 775 . wib
chmod 664 wib/*.wib
ln -s `pwd`/wib/*.wib /gbdb/dm3/multiz15way/wib/
hgLoadWiggle dm3 phastCons15way \
-pathPrefix=/gbdb/dm3/multiz15way/wib wib/*.wig
rm wiggle.tab
# and clean up san/sanvol1.
rm -r /san/sanvol1/scratch/dm3/phastCons/{ELEMENTS,POSTPROBS,wib}
rm -r /san/sanvol1/scratch/dm3/chrom
# Offer raw scores for download since fly folks are likely to be interested:
ssh kkstore04
cd /cluster/data/dm3/bed/multiz15way/phastCons/POSTPROBS
mkdir ../postprobsDownload
foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`)
zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \
> ../postprobsDownload/$chr.pp.gz
end
cd ../postprobsDownload
md5sum *.gz > md5sum.txt
# Make a README.txt there too.
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/dm3/phastCons15way
cd /usr/local/apache/htdocs/goldenPath/dm3/phastCons15way
ln -s /cluster/data/dm3/bed/multiz15way/phastCons/postprobsDownload/* .
# make a tree picture using phyloGif.
cd /cluster/data/dm3/bed/multiz15way
cp tree-commas.nh tree-commas-orgNames.nh
# Edit tree-commas-orgNames.nh to use org names.
/usr/local/apache/cgi-bin/phyloGif -phyloGif_tree=tree-commas-orgNames.nh \
-phyloGif_width=260 -phyloGif_height=480 > dm3_15way.gif
cp -p dm3_15way.gif ~/browser/images/phylo/dm3_15way.gif
# Check in browser/images/phylo/dm3_15way.gif. In a clean ~/browser:
cd ~/browser
make alpha
#########################################################################
# FLYBASE 5.1 ANNOTATIONS (DONE 4/6/07 angie)
# OBSOLETED -- see FLYBASE 5.3 ANNOTATIONS BELOW
ssh kkstore05
mkdir /cluster/data/dm3/bed/flybase5.1
cd /cluster/data/dm3/bed/flybase5.1
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.1/gff/dmel-all-r5.1.gff.gz
gunzip dmel-all-r5.1.gff.gz
# What data sources are represented in this file?
grep -v '^#' dmel-all-r5.1.gff | awk '{print $2 "\t" $3;}' | sort | uniq -c
# excerpt (many other sources, including blastx:... , sim4:... and
# tblastx:...; also various other types for source "FlyBase" and "."):
67107 FlyBase CDS
65481 FlyBase exon
14601 FlyBase gene
19781 FlyBase mRNA
92 FlyBase miRNA
95 FlyBase ncRNA
51 FlyBase pseudogene
98 FlyBase rRNA
47 FlyBase snRNA
65 FlyBase snoRNA
314 FlyBase tRNA
6003 FlyBase transposable_element
37267 FlyBase transposable_element_insertion_site
# Very similar to 4.3 except no more ". transcription_start_site".
# What keywords are defined in the 9th field?
# Oh dear, 4.3 had HTML-escaped special characters but 5.1 has
# unescaped ; willy-nilly in there with the ; which separate var=val's.
# | perl -wpe 's/&\w\w\w\w?;//g; s/=[^;]+;/\n/g; s/=.*$//;' \
grep -v '^#' dmel-all-r5.1.gff \
| awk '{print $9;}' \
| perl -wpe 'while (s/(\w+)=//) {print "$1\n";} s/^.*\n$//;' \
| sort | uniq -c
17604 Alias
375522 Dbxref
5523024 ID
5521150 Name
3415862 Parent
3962606 Target
18400 associated_with
15770 cyto_range
37956 derived_cyto_location
15600 gbunit
12101 programversion
11795 putative_ortholog_of
13174 to_species
# Notable differences from 4.3: no more Synonym; a lot fewer Alias,
# a *lot* more Dbxref; some keywords have disappeared but not ones
# we use.
# Copy over 4.3 script and modify to fit the latest Immortal Unchanging
# Widely Adopted Standard GFF3.
# Alias no longer holds the CG*/CR* ID's; now it's Dbxref, with a
# prefix that must be stripped.
# Two pseudogenes do not have CR*; must get those from gene line.
# 'dmel_mitochondrion_genome' --> 'chrM'
cp /cluster/data/dm2/bed/flybase4.3/extractGenes.pl .
extractGenes.pl dmel-all-r5.1.gff
mv flyBaseGene.src=FlyBase.gtf flyBaseGene.gtf
# Get predicted proteins (for main annotations only)
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.1/fasta/dmel-all-translation-r5.1.fasta.gz
# Use the FBgn IDs in headers to retrieve CG-R's to match flyBaseGene:
zcat dmel-all-translation-r5.1.fasta.gz \
| perl -we 'open(X, "flyBase2004Xref.tab") || die; \
while (<X>) { \
@w = split("\t"); \
$fbtr2cgr{$w[3]} = $w[0]; \
} \
while (<>) { \
if (/^>.*parent=FBgn\d+,(FBtr\d+)/) { \
$cgr = $fbtr2cgr{$1}; \
if ($cgr) { \
s/^>.*/>$cgr/; \
} else { die; } \
} \
print; \
} \
' \
> flyBasePep.fa
ssh hgwdev
cd /cluster/data/dm3/bed/flybase5.1
# Protein-coding genes:
ldHgGene -gtf dm3 flyBaseGene flyBase*.gtf
# 19781 groups 7 seqs 1 sources 2 feature types
hgPepPred dm3 generic flyBasePep flyBasePep.fa
# Noncoding genes:
hgLoadBed dm3 flyBaseNoncoding flyBaseNoncoding.bed
#Loaded 762 elements of size 12
# Cross-referencing info for both coding and noncoding:
hgLoadSqlTab dm3 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \
flyBase2004Xref.tab
# Make sure there are no joinerCheck errors at this point, because
# if left here they can spread via doHgNearBlastp:
runJoiner.csh dm3 flyBasePep
# See if there are still large regions where refGene has genes but
# flybase doesn't:
featureBits dm3 refGene \!flyBaseGene -minSize=1000 -bed=stdout
# Not so bad now -- some cases where sequence might be duplicated and
# we make different calls about where the exons land, but no huge
# missing regions of regular chroms (just no annotations on chr*Het).
# add upstream* downloadable files
ssh hgwdev
mkdir /cluster/data/dm3/bed/flybase5.1/bigZips
cd /cluster/data/dm3/bed/flybase5.1/bigZips
foreach size (1000 2000 5000)
echo upstream$size
nice featureBits dm3 flyBaseGene:upstream:$size -fa=stdout \
| nice gzip -c > upstream$size.fa.gz
end
nice md5sum *.gz > md5sum.txt
ln -s /cluster/data/dm3/bed/flybase5.1/bigZips/*.gz \
/usr/local/apache/htdocs/goldenPath/dm3/bigZips/
cat md5sum.txt \
>> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt
#########################################################################
# CHROMOSOME BANDS (DONE (5.1) 4/5/07 angie - REDONE (5.3) 9/18/07)
# 10/21/08 (5.12): generated files, but no change since 5.3, so I did not
# reload the tables.
ssh hgwdev
cd /cluster/data/dm3/bed/flybase5.3
grep chromosome_band dmel-all-r5.3.gff \
| perl -wpe 'chomp; @w = split; \
$w[3]--; $w[3] = 0 if ($w[3] < 0); \
$w[8] =~ s/^.*Name=band-(\w+)(;.*)?$/$1/; \
if ($w[8] !~ /^\d+[A-Z]\d+$/) { s/^.*$//; next; } \
if ($w[4] <= 0) { s/^.*$//; } else { \
s/^.*$/chr$w[0]\t$w[3]\t$w[4]\t$w[8]\tn\/a\n/; }' \
> cytoBand.txt
hgLoadSqlTab dm3 cytoBand ~/kent/src/hg/lib/cytoBand.sql cytoBand.txt
# Remove the items that are off the end of their chroms:
perl -wpe 's/^(\w+)\t(\d+)$/ \
delete from cytoBand where chrom="$1" and chromStart >= $2; \
update cytoBand set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \
../../chrom.sizes \
| hgsql dm3
checkTableCoords -verbose=2 dm3 cytoBand
# As of 5.3, the GFF now includes what we need to make cytoBandIdeo
# directly:
grep chromosome_band dmel-all-r5.3.gff \
| perl -wpe 'chomp; @w = split; \
$w[3]--; $w[3] = 0 if ($w[3] < 0); \
$w[8] =~ s/^.*Name=band-(\w+)(;.*)?$/$1/; \
if ($w[8] !~ /^\d+[A-Z]$/) { s/^.*$//; next; } \
if ($w[4] <= 0) { s/^.*$//; } else { \
s/^.*$/chr$w[0]\t$w[3]\t$w[4]\t$w[8]\tn\/a\n/; }' \
> cytoBandIdeo.txt
hgLoadSqlTab dm3 cytoBandIdeo ~/kent/src/hg/lib/cytoBandIdeo.sql \
cytoBandIdeo.txt
# Remove the items that are off the end of their chroms:
perl -wpe 's/^(\w+)\t(\d+)$/ \
delete from cytoBandIdeo where chrom="$1" and chromStart >= $2; \
update cytoBandIdeo set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \
../../chrom.sizes \
| hgsql dm3
checkTableCoords -verbose=2 dm3 cytoBandIdeo
#########################################################################
# FLYBASE ACODE CROSS-REFERENCING DATA (N/A 4/5/07 angie)
# Hmmm, it seems that flybase no longer offser acode. No dice on
# ftping from the old location
# (ftp://flybase.bio.indiana.edu/flybase/genes/genes.txt -- not
# flybase.net either)
# or googling site:flybase.net acode.
# Looks like Chado XML is what is offered now. And an interrupted
# fetch of it is hanging my FireFox.
# When it recovers, I guess I could try downloading it properly
# and seeing if it has the same info that hgFlyBase used to parse
# out of acode.
ftp://ftp.flybase.net/releases/FB2006_01/chado-xml/chado_genes.xml.gz
# This might be smaller...
ftp://ftp.flybase.net/releases/FB2006_01/dmel_r5.1/chado-xml/chado_dmel_genes.xml.gz
#########################################################################
# MAP UNIPROT IDS TO CG*-R* IDS
# FlyBase 5.1 history: (DONE 4/5/07 angie - REDONE 6/18/07)
# Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations.
# FlyBase 5.3 history: (DONE 9/20/07 angie)
# FlyBase 5.12 history: (DONE 10/24/08 angie)
# In dm2, I combined three sources: specific transcript mappings
# from uniProt.description, less specific gene mappings from
# uniProt.description, and fbUniProt mappings extracted from
# FlyBase acode. We don't have flyBase acode anymore, so combine
# just the first two (uniProt) sources.
ssh hgwdev
mkdir /cluster/data/dm3/bed/uniProtToFlyBase
cd /cluster/data/dm3/bed/uniProtToFlyBase
set taxon = `hgsql -N uniProt -e 'select id from taxon where binomial = "Drosophila melanogaster";'`
# Grab UniProt descriptions of all D. mel. proteins so we can parse out
# CG*-R* transcript IDs (and CG* gene IDs when -R* not specified).
hgsql -N uniProt -e \
'select description.* from description,accToTaxon where \
description.val not like "CG% (Fragment)." and \
accToTaxon.taxon = '$taxon' and accToTaxon.acc = description.acc' \
> uniProt.description.txt
# Grab the complete list of CG*-R* transcript IDs:
hgsql -N dm3 -e 'select name from flyBaseGene' | sort > transcriptIds.txt
ssh kolossus
cd /cluster/data/dm3/bed/uniProtToFlyBase
# Wrote a perl script, gleanDescription.pl, to parse the various patterns
# used to encode real mappings (not mappings to homologs or things that
# look like CG*-P? IDs but aren't really). Update the dm2 copy to handle
# the new CG\d+-R\w\w transcripts:
cp /cluster/data/dm2/bed/uniProtToFlyBase/gleanDescription.pl .
# Loosened a couple regexes in gleanDescription.pl for 5.12.
./gleanDescription.pl uniProt.description.txt | sort > uniProtMapping.txt
# Look for duplicates (if we didn't trim the "(Fragment)." entries above,
# there would be 82! But now we have only one.
foreach cg (`cut -f 1 uniProtMapping.txt | uniq -c \
| awk '$1 != 1 {print $2;}' | sed -e 's/-R/-P/'`)
echo $cg
grep $cg uniProt.description.txt
echo ""
end
#CG9339-PH
#Q6Q4F0 CG9339-PH.
#Q9VIH7 CG9339-PA, isoform A (CG9339-PB, isoform B) (CG9339-PH, isoform H) (LD10117p).
# I looked up both Q6Q4F0 and Q9VIH7 on http://www.pir.uniprot.org/;
# Q9VIH7 is a bit longer, so I'll arbitrarily go with that one and
# delete the Q6Q4F0 mapping from uniProtMapping.txt.
# Wrote a perl script, vagueDescription.pl, to parse out CG* without
# isoform specs -- can use those as backups when we don't have a more
# specific mapping for a transcript. (updated from dm2)
cp /cluster/data/dm2/bed/uniProtToFlyBase/vagueDescription.pl .
./vagueDescription.pl uniProt.description.txt | sort \
> uniProtMappingVague.txt
# acode is no longer available, but UniProt mappings can be gleaned from
# FlyBase GFF3.
perl -we 'while (<>) { chomp; @w = split; \
next unless ($w[8] && $w[1] eq "FlyBase" && $w[2] eq "gene" && \
$w[8] =~ /FlyBase_Annotation_IDs:(CG\d+)/); \
$cg = $1; \
while ($w[8] =~ s/^.*?UniProt\/[-\w]+:(\w+)//) { \
print "$cg\t$1\n"; \
} \
}' ../flybase5.12/dmel-all-r5.12.gff \
> flyBaseCgToUniProt.txt
# Using dm2 perl script, combineMappings.pl, to add any additional mappings
# available from acode to the UniProt mappings. Since the second arg
# is vague-gene and third arg is vague-transcript, feed both vague-gene
# files in as the second input:
cat uniProtMappingVague.txt flyBaseCgToUniProt.txt \
| /cluster/data/dm2/bed/uniProtToFlyBase/combineMappings.pl transcriptIds.txt \
uniProtMapping.txt - /dev/null \
> flyBaseToUniProt.txt
# Make a 2-column version for loading -- and add back in the transcripts for
# which we couldn't find a UniProt acc, because they have to have something in
# flyBaseToUniProt in order for hgNear's oneGeneQuery to work!:
join -t ' ' -a 1 -e n/a -o 1.1,2.2 transcriptIds.txt flyBaseToUniProt.txt \
> flyBaseToUniProtLoad.txt
# Load into a genericAlias-type table:
ssh hgwdev
cd /cluster/data/dm3/bed/uniProtToFlyBase
sed -e 's/genericAlias/flyBaseToUniProt/' \
~/kent/src/hg/lib/genericAlias.sql > flyBaseToUniProt.sql
hgLoadSqlTab dm3 flyBaseToUniProt flyBaseToUniProt.sql \
flyBaseToUniProtLoad.txt
#########################################################################
# HGNEAR TABLES
# HGNEAR PROTEIN BLAST TABLES
# FlyBase 5.1 history: (dm3.* DONE 4/6/07 - dm3.* REDONE 6/18/07,
# *.dmBlastTab DONE 7/18/07)
# Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations.
# FlyBase 5.3 history: (DONE 9/18/07)
# FlyBase 5.12 history: (DONE 10/21/08)
ssh hgwdev
mkdir /cluster/data/dm3/bed/hgNearBlastp
cd /cluster/data/dm3/bed/hgNearBlastp
mkdir 081021
cd 081021
# Get the proteins used by the other hgNear organisms:
pepPredToFa hg18 knownGenePep hg18.known.faa
pepPredToFa mm9 knownGenePep mm9.known.faa
pepPredToFa rn4 knownGenePep rn4.known.faa
pepPredToFa danRer3 ensPep danRer3.ensPep.faa
pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa
pepPredToFa ce6 sangerPep ce6.sangerPep.faa
pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa
# Galt's synBlastp.csh filtering, which uses liftOver chains,
# would be the next step if dm3 were reasonably closely related
# (like mammal-mammal) to another Gene Sorter organism. But it is
# pretty distant from all the others, so specify recipBest for all
# query databases.
cat << _EOF_ > config.ra
# Latest fly vs. other Gene Sorter orgs:
# human, mouse, rat, zebrafish, worm, yeast
targetGenesetPrefix flyBase
targetDb dm3
queryDbs hg18 mm9 rn4 danRer3 ce6 sacCer1
recipBest hg18 mm9 rn4 danRer3 ce6 sacCer1
dm3Fa /cluster/data/dm3/bed/hgNearBlastp/081021/dm3.flyBasePep.faa
hg18Fa /cluster/data/dm3/bed/hgNearBlastp/081021/hg18.known.faa
mm9Fa /cluster/data/dm3/bed/hgNearBlastp/081021/mm9.known.faa
rn4Fa /cluster/data/dm3/bed/hgNearBlastp/081021/rn4.known.faa
danRer3Fa /cluster/data/dm3/bed/hgNearBlastp/081021/danRer3.ensPep.faa
ce6Fa /cluster/data/dm3/bed/hgNearBlastp/081021/ce6.sangerPep.faa
sacCer1Fa /cluster/data/dm3/bed/hgNearBlastp/081021/sacCer1.sgdPep.faa
buildDir /cluster/data/dm3/bed/hgNearBlastp/081021
scratchDir /san/sanvol1/scratch/dm3HgNearBlastp
_EOF_
# Run with -noLoad so we can eyeball files, manually load dm3 tables now,
# and later overload other databases' dmBlastTab tables.
doHgNearBlastp.pl -noLoad config.ra >& do.log &
tail -f do.log
# Ran the load scripts for dm3 tables manually as suggested by the
# output of doHgNearBlastp.pl:
# *** -noLoad was specified -- you can run this script manually to load dm3 tables:
run.dm3.dm3/loadPairwise.csh
#Loading database with 904530 rows
# *** -noLoad was specified -- you can run these scripts manually to load dm3 tables:
run.dm3.hg18/loadPairwise.csh
#Loading database with 5946 rows
run.dm3.mm9/loadPairwise.csh
#Loading database with 5906 rows
run.dm3.rn4/loadPairwise.csh
#Loading database with 3058 rows
run.dm3.danRer3/loadPairwise.csh
#Loading database with 5328 rows
run.dm3.ce6/loadPairwise.csh
#Loading database with 4665 rows
run.dm3.sacCer1/loadPairwise.csh
#Loading database with 2205 rows
# 7/18/07,9/18/07,10/21/08: dm3 is on RR, so time to load*.dmBlastTab:
# *** -noLoad was specified -- you can run these scripts manually to load dmBlastTab in query databases:
run.hg18.dm3/loadPairwise.csh
#Loading database with 5946 rows
run.mm9.dm3/loadPairwise.csh
#Loading database with 5906 rows
run.rn4.dm3/loadPairwise.csh
#Loading database with 3058 rows
run.danRer3.dm3/loadPairwise.csh
#Loading database with 5328 rows
run.ce6.dm3/loadPairwise.csh
#Loading database with 4665 rows
run.sacCer1.dm3/loadPairwise.csh
#Loading database with 2205 rows
# CLUSTER GENES AND MAP TO OTHER DATA
# FlyBase 5.1 history: (DONE 4/5/07 angie - REDONE 6/18/07)
# Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations.
# FlyBase 5.3 history: (DONE 9/20/07 angie)
# FlyBase 5.12 history: (DONE 10/24/08 angie)
ssh hgwdev
hgClusterGenes dm3 flyBaseGene flyBaseIsoforms flyBaseCanonical \
-protAccQuery='select name,alias from flyBaseToUniProt'
#Got 13784 clusters, from 21236 genes in 15 chromosomes
# Create table that maps between flyBase genes and RefSeq
hgMapToGene dm3 refGene flyBaseGene flyBaseToRefSeq
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt dm3 flyBaseToUniProt name alias Pfam flyBaseToPfam
# Create a table that maps between flyBase genes and the
# Stanford Microarray Project expression data. (see makeHgFixed.doc)
hgsql dm3 -e 'drop table if exists flyBaseToCG; create table flyBaseToCG \
select name,SUBSTRING_INDEX(name,"-",1) as value from flyBaseGene'
hgsql dm3 -e 'create index name on flyBaseToCG(name(12))'
nice hgExpDistance dm3 hgFixed.arbFlyLifeMedianRatio dummyArg \
arbExpDistance -lookup=flyBaseToCG
# MAKE A DESCRIPTION TABLE FOR HGNEAR KNOWNDETAILS
# FlyBase 5.1 history: (DONE 4/5/07 angie - REDONE 6/18/07)
# Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations.
# FlyBase 5.3 history: (DONE 9/20/07 angie)
# FlyBase 5.12 history: (DONE 10/24/08 angie)
# hgNear's knownDetails column type expects a single table that links
# transcript ID with uniProt description, so whip one up here:
# (used by hgGene too though hgGene could join on the fly)
ssh hgwdev
hgsql dm3 -e 'drop table if exists flyBaseToDescription; \
create table flyBaseToDescription \
select flyBaseToUniProt.name,uniProt.description.val \
from flyBaseToUniProt,uniProt.description \
where flyBaseToUniProt.alias = uniProt.description.acc'
hgsql dm3 -e 'create index name on flyBaseToDescription(name(12))'
# FLYP2P
# FlyBase 5.1 history: (DONE 4/5/07 angie - REDONE 6/18/07)
# Redone 6/18/07 after folding in DHGP's chr*Het heterochromatin annotations.
# FlyBase 5.3 history: (hgLoadNetDist REDONE 9/18/07 angie)
# FlyBase 5.12 history: (hgLoadNetDist REDONE 10/21/08 angie)
# See hg/near/makeNear.doc...
mkdir /cluster/data/dm3/bed/p2p
cd /cluster/data/dm3/bed/p2p
# Watch out... hgLoadNetDist overwrites and removes flyP2P.tab!
# so rename here:
cp -p /cluster/data/dm1/p2p/flyP2P.tab flyP2P.raw.txt
ssh -x kolossus hgNetDist /cluster/data/dm3/bed/p2p/{flyP2P.raw.txt,tmp.tab}
hgLoadNetDist tmp.tab dm3 flyP2P \
-sqlRemap='select fbgn,name from flyBase2004Xref'
#hgLoadNetDist 512 id-remapping misses, see missing.tab
# That's OK, some genes were in dm1 but are not in dm3 (flybase 3 vs. 5)
# MAKE ORGANISM-SPECIFIC HGNEARDATA AND HGGENEDATA FILES (DONE 4/6/07 angie)
cd ~/kent/src/hg/near/hgNear/hgNearData
# Made dm3 subdir of D_melanogaster with usual .ra files.
# Similarly for hgGene/hgGeneData.
# Make alpha on genome-test for hgNear, hgGene and trackDb DBS=dm3 .
# ENABLE HGNEAR FOR DM3 IN HGCENTRALTEST (DONE 4/6/07 angie)
hgsql -h genome-testdb hgcentraltest \
-e "update dbDb set hgNearOk = 1 where name = 'dm3';"
# END OF HGNEAR STUFF
#########################################################################
#########################################################################
# BDGP IN SITU IMAGE LINKS (DONE 4/7/06)
# grabbed data again 10/24/08, no change.
# HGGENE
ssh hgwdev
cd /cluster/data/dm3/bed/bdgpExprLink
wget -O summary.txt.081024 \
'http://www.fruitfly.org/cgi-bin/ex/bquery.pl?qpage=entry&qtype=summarytext'
# remove header line.
hgLoadSqlTab dm3 bdgpExprLink ~/kent/src/hg/lib/bdgpExprLink.sql \
summary.txt.070406
#########################################################################
# "FLYBASE 5.1" DHGP HETEROCHROMATIN ANNOTATIONS (DONE 6/18/07 angie)
# OBSOLETED by load of FlyBase 5.3 genes 9/18/07.
# These heterochromatin annotations have just been published in Science
# http://www.sciencemag.org/cgi/content/full/316/5831/1586
# -- future updates will be from FlyBase, but for now the GFF is in the
# paper's Supplemental Data!
ssh kkstore05
mkdir /cluster/data/dm3/bed/flybase5.1/Het
cd /cluster/data/dm3/bed/flybase5.1/Het
wget ftp://ftp.dhgp.org/pub/DHGP/Science_2007_Supplemental_Data/Supplemental_File_B_GFF/R5_all_GFF_results.tar.gz
foreach chr (2LHet 2RHet 3LHet 3RHet U Uextra XHet YHet)
wget ftp://ftp.dhgp.org/pub/DHGP/Science_2007_Supplemental_Data/Supplemental_File_C_FASTA/${chr}_translation_dmel_RELEASE5-1.FASTA
end
tar xvzf R5_all_GFF_results.tar.gz
# Of course they invented a completely new encoding of names and
# properties into GFF 9th col and FASTA headers. So instead of
# attempting to reuse the flybase-parsing script, just pick out
# the rows we need and parse each row type... we will not get
# enough info to complete flyBase2004Xref, but c'est la vie.
# There are files for non-het chroms... I took a look at 2L_annotation
# and its annotations seem to be relative to chr2L:22000975... adding
# that offset, almost all of the ones I checked appear in our current
# dm3 flyBaseGene. So I'll just use the Het files and ignore the others.
foreach type (3_UTR 5_UTR CDS CDS_exon annotation exon transcript)
echo $type
awk '{print $3;}' {*Het,U*}_${type}_dmel* | sort | uniq -c
end
# Looks like the file-types and $3-types we need are these:
# CDS_exon CDS_exon
# annotation exon, gene, ncRNA, pseudogene, rRNA
# I will exclude RR* (repeat region)
# I'll build up a .gff type by type:
cp /dev/null extract.gff
awk '$9 !~ /^name=RR/ && $3 == "CDS_exon" {print;}' {*Het,U*}_CDS_exon*GFF \
| perl -wpe 's/^/chr/; s/CDS_exon/CDS/; \
s/name=\w+:\d+_(C\w\d+-R[A-Z]+).*/$1/ || die "$_";' \
>> extract.gff
awk '$9 !~ /^genegrp=RR/ && $3 == "exon" {print;}' {*Het,U*}_annotation*GFF \
| perl -wpe 's/^/chr/; \
s/genegrp=\w+; transgrp=(C\w\d+-R[A-Z]+).*/$1/ || die "$_";' \
>> extract.gff
# All of the rows break down into either CG* IDs (coding) or CR* (nonc):
wc -l extract.gff
#3245 extract.gff
awk '$9 ~ /^CR/ {print;}' extract.gff | wc -l
#261
awk '$9 ~ /^CG/ {print;}' extract.gff | wc -l
#2984
expr 261 + 2984
#3245
# Break out coding and noncoding, and sort for readability:
awk '$9 ~ /^CR/ {print;}' extract.gff \
| sort -k 9,9 -k 1,1 -k 4n,4n \
> flyBaseNoncodingHet.gff
awk '$9 ~ /^CG/ {print;}' extract.gff \
| sort -k 9,9 -k 1,1 -k 4n,4n \
> flyBaseGeneHet.gff
# gff -> genePred -> bed for noncoding
ldHgGene dummy dummy flyBaseNoncodingHet.gff -nobin -out=stdout \
| genePredToBed \
| awk 'BEGIN {OFS="\t";} $7 == 0 && $8 == 0 {$7 = $2; $8 = $2;} {print;}' \
> flyBaseNoncodingHet.bed
#Read 127 transcripts in 261 lines in 1 files
# Now get what we can get for flyBase2004Xref:
awk '$3 ~ /^(gene|pseudogene|.*RNA)$/ {print;}' {*Het,U*}_annotation*GFF \
| perl -pe 's/^\w+\tgadfly\t(pseudogene|\w+RNA)?(gene)?.*name=(\w+); symbol=([^;]*); (dbxref=FlyBase:(FBgn\d+))?.*$/$3\t$4\t\t\t$6\t\t\t$1/ \
|| die "$_";\
s/^(\w+)\t\t/$1\t$1\t/;' \
| sort -k 1,1 \
> flyBase2004XrefHet.tab
# That almost works but it needs C*-R* IDs and has only C*'s.
# Join with the C*-R* IDs:
awk '{print $9;}' fly*gff \
| sort -u \
| perl -wpe 's/^(\w+)/$1\t$1/;' \
> cgTocgr.txt
join -j 1 -a 2 -t' ' -o '1.2 2.2 2.3 2.4 2.5 2.6 2.7 2.8' \
cgTocgr.txt flyBase2004XrefHet.tab \
> tmp.tab
mv tmp.tab flyBase2004XrefHet.tab
# Uh-oh, some of these are already in flyBase2004Xref, so we'll get
# mysql errors (unique index) from trying to overload... and with
# more of the fields filled in, so omit them from the new load.
awk '{print $1;}' ../flyBase2004Xref.tab | sort > tmp1
awk '{print $1;}' flyBase2004XrefHet.tab | sort > tmp2
comm -1 -2 tmp1 tmp2
#CG41431-RA
#CG41432-RA
#CG41434-RA
#CG41436-RA
# Only 4 such -- manually removed them from flyBase2004XrefHet.tab.
perl -wpe 's/^>(\w+)-P(\w+)/>$1-R$2/' {*Het,U*}_*.FASTA \
> flyBasePepHet.fa
# Manually removed those 4 from the fa too.
# Also removed this lone RR element from the fa: RR40941-RA
# Now *additively* load the new data on top of the old:
ssh hgwdev
cd /cluster/data/dm3/bed/flybase5.1/Het
# Protein-coding genes:
ldHgGene -oldTable -nobin dm3 flyBaseGene flyBaseGeneHet.gff
#Read 503 transcripts in 2984 lines in 1 files
# 503 groups 7 seqs 1 sources 2 feature types
cat ../flyBasePep.fa flyBasePepHet.fa \
| hgPepPred dm3 generic flyBasePep stdin
# Noncoding genes:
hgLoadBed -oldTable dm3 flyBaseNoncoding flyBaseNoncodingHet.bed
#Loaded 127 elements of size 12
# Cross-referencing info for both coding and noncoding:
hgLoadSqlTab -oldTable dm3 flyBase2004Xref \
~/kent/src/hg/lib/flyBase2004Xref.sql flyBase2004XrefHet.tab
# Make sure there are no joinerCheck errors at this point, because
# if left here they can spread via doHgNearBlastp:
runJoiner.csh dm3 flyBasePep
# See if there are still large regions where refGene has genes but
# flybase doesn't:
featureBits dm3 refGene \!flyBaseGene -minSize=1000 -bed=stdout
# better now that the Het's (except for Uextra) are included.
# add upstream* downloadable files
ssh hgwdev
mkdir /cluster/data/dm3/bed/flybase5.1/bigZips
cd /cluster/data/dm3/bed/flybase5.1/bigZips
foreach size (1000 2000 5000)
echo upstream$size
nice featureBits dm3 flyBaseGene:upstream:$size -fa=stdout \
| nice gzip -c > upstream$size.fa.gz
end
nice md5sum *.gz > md5sum.txt
ln -s /cluster/data/dm3/bed/flybase5.1/bigZips/*.gz \
/usr/local/apache/htdocs/goldenPath/dm3/bigZips/
cat md5sum.txt \
>> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt
# Edited out the old upstream* md5sums.
#########################################################################
# TWEAK DEFAULT POSITION (DONE 6/18/07 angie)
ssh hgwdev
hgsql hgcentraltest -e 'update dbDb set defaultPos="chr2L:826001-851000" where name = "dm3";'
#########################################################################
# ORegAnno - Open Regulatory Annotations
# update Oct 26, 2007; July 7, 2008
# loaded by Belinda Giardine, in same manner as hg18 ORegAnno track
#########################################################################
# FLYBASE 5.3 ANNOTATIONS (DONE 9/17/07 angie)
# OBSOLETED by load of FlyBase 5.12 genes 10/21/08.
ssh kkstore05
mkdir /cluster/data/dm3/bed/flybase5.3
cd /cluster/data/dm3/bed/flybase5.3
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.3_FB2007_02/gff/dmel-all-r5.3.gff.gz
gunzip dmel-all-r5.3.gff.gz
# What data sources are represented in this file?
grep -v '^#' dmel-all-r5.3.gff | awk '{print $2 "\t" $3;}' | sort | uniq -c
# excerpt (many other sources, including blastx:... , sim4:... and
# tblastx:...; also various other types for source "FlyBase" and "."):
69531 FlyBase CDS
68426 FlyBase exon
15185 FlyBase gene
20738 FlyBase mRNA
90 FlyBase miRNA
105 FlyBase ncRNA
94 FlyBase pseudogene
164 FlyBase rRNA
47 FlyBase snRNA
249 FlyBase snoRNA
314 FlyBase tRNA
5385 FlyBase transposable_element
38724 FlyBase transposable_element_insertion_site
# Same types as 5.1, good.
# What keywords are defined in the 9th field?
grep -v '^#' dmel-all-r5.3.gff \
| awk '{print $9;}' \
| perl -wpe 'while (s/(\w+)=//) {print "$1\n";} s/^.*\n$//;' \
| sort | uniq -c
17505 Alias
412018 Dbxref
6571566 ID
6530043 Name
4008974 Parent
3048194 Target
16921 cyto_range
# Comparable to 5.1, good.
cp ../flybase5.1/extractGenes.pl .
# Mods to 5.1 script:
# - so many new keywords, just say which ones we're ignoring instead
# of die-ing if we don't recognize a new one.
# - protein line: Parent --> Derives_from.
# - some ncRNA's have CG-*, not CR-*, IDs:
time ./extractGenes.pl dmel-all-r5.3.gff
#Noncoding FBtr0113460 has CG ID (CG33294-RB)
#Noncoding FBtr0113490 has CG ID (CG34647-RA)
#Noncoding FBtr0113491 has CG ID (CG34648-RA)
#Noncoding FBtr0113492 has CG ID (CG34649-RA)
#Keywords that have been ignored in one or more line types:
#CDS: ID
#CDS: Name
#exon: Dbxref
#exon: ID
#exon: Name
#gene: Alias
#gene: Ontology_term
#gene: cyto_range
#gene: gbunit
#mRNA: Alias
#mRNA: Name
#mRNA: score
#mRNA: score_text
#noncoding: Alias
#noncoding: score
#noncoding: score_text
#protein: Alias
#protein: Dbxref
#protein: Name
#protein: derived_isoelectric_point
#protein: derived_molecular_weight
#71.391u 1.296s 1:12.75 99.9% 0+0k 0+0io 0pf+0w
mv flyBaseGene.src=FlyBase.gtf flyBaseGene.gtf
# Get predicted proteins (for main annotations only)
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.3_FB2007_02/fasta/dmel-all-translation-r5.3.fasta.gz
# Use the FBgn IDs in headers to retrieve CG-R's to match flyBaseGene:
zcat dmel-all-translation-r5.3.fasta.gz \
| perl -we 'open(X, "flyBase2004Xref.tab") || die; \
while (<X>) { \
@w = split("\t"); \
$fbtr2cgr{$w[3]} = $w[0]; \
} \
while (<>) { \
if (/^>.*parent=FBgn\d+,(FBtr\d+)/) { \
$cgr = $fbtr2cgr{$1}; \
if ($cgr) { \
s/^>.*/>$cgr/; \
} else { die; } \
} \
print; \
} \
' \
> flyBasePep.fa
ssh hgwdev
cd /cluster/data/dm3/bed/flybase5.3
# Protein-coding genes:
ldHgGene -gtf dm3 flyBaseGene flyBase*.gtf
# 20738 groups 14 seqs 1 sources 2 feature types
hgPepPred dm3 generic flyBasePep flyBasePep.fa
# Noncoding genes:
hgLoadBed dm3 flyBaseNoncoding flyBaseNoncoding.bed
#Loaded 1063 elements of size 12
# Cross-referencing info for both coding and noncoding:
hgLoadSqlTab dm3 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \
flyBase2004Xref.tab
# Make sure there are no joinerCheck errors amongst those 3 tables
# at this point, because if left here they can spread via doHgNearBlastp.
# Since this is an update of those tables to a new FlyBase version,
# and other hgNear tables already exist with the old version's set of
# IDs, ignore errors for missing CG items in tables that will be
# rebuilt in the hgNear process.:
runJoiner.csh dm3 flyBasePep
# See if there are still large regions where refGene has genes but
# flybase doesn't:
featureBits dm3 refGene \!flyBaseGene -minSize=1000 -bed=stdout
#335940 bases of 162367812 (0.207%) in intersection
# Spot-checked... some transcripts removed here and there, looks OK.
# add upstream* downloadable files
ssh hgwdev
mkdir /cluster/data/dm3/bed/flybase5.3/bigZips
cd /cluster/data/dm3/bed/flybase5.3/bigZips
foreach size (1000 2000 5000)
echo upstream$size
nice featureBits dm3 flyBaseGene:upstream:$size -fa=stdout \
| nice gzip -c > upstream$size.fa.gz
end
nice md5sum *.gz > md5sum.txt
ln -s /cluster/data/dm3/bed/flybase5.3/bigZips/*.gz \
/usr/local/apache/htdocs/goldenPath/dm3/bigZips/
# When updating, edit out the old upstream* sums from md5sum.txt, then:
cat md5sum.txt \
>> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt
#############################################################################
# N-SCAN GENES track ( markd)
# create a composite track with exists ab-inito and new PASA N-SCAN predictions
# download predictions
cd /cluster/data/dm3/bed/nscan
wget -nv http://mblab.wustl.edu/predictions/Drosophila/dm3/dm3.gtf
wget -nv http://mblab.wustl.edu/predictions/Drosophila/dm3/dm3.prot.fa
wget -nv http://mblab.wustl.edu/predictions/Drosophila/dm3/readme.txt
bzip2 dm3.*
chmod a-w *
ldHgGene -gtf -genePredExt dm3 nscanPasaGene dm3.gtf.bz2
hgPepPred dm3 generic nscanPasaPep dm3.prot.fa.bz2
rm *.tab
# update trackDb; need a dm3-specific page to describe informants and PASA
# make sure termRegex matches naming
drosophila/dm3/nscanPasaGene.html
drosophila/dm3/dm3/trackDb.ra
#########################################################################
##########################################################################
# EVOFOLD (Done, 10.02.2007) Jakob Skou Pedersen
# RNA secondary structure predictions lifted from dm2 and filtered
ssh -C hgwdev
mkdir -p /cluster/data/dm3/bed/evofold
cd /cluster/data/dm3/bed/evofold
echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" | hgsql dm2 | sed -e 1d > foldsDm2.bed
liftOver -minMatch=1.0 foldsDm2.bed /cluster/data/dm2/bed/liftOver/dm2ToDm3.over.chain.gz tmp.bed unmapped.bed
# remove elements which are wrong size after lifting
awk '$3-$2 == $7' tmp.bed | sort -k4,4 > rawFoldsDm3.bed
# structure filters
# first, remove pairs that can't form in human
cut -f 1-6 rawFoldsDm3.bed > tmp.bed
# sequenceForBed can be found and compiled from here: $HOME/kent/src/hg/altSplice/altSplice/
nice /cluster/home/sugnet/bin/i386/sequenceForBed -db=dm3 -bedIn=tmp.bed -fastaOut=tmp.fa
cat tmp.fa | sed -e 's/\.[+-]\.chr.*$//' \
| sed -e '/^>/s/$/\t/' | tr -d '\n' | sed -e 's/>/\n/g' | sed -e '1d' -e '$s/$/\n/' | sort -k1,1 > foldsDm3Seq.tab
join -1 4 -2 1 -o "1.4 1.8 2.2" rawFoldsDm3.bed foldsDm3Seq.tab | sed -e 's/ */\t/g' | sort -k1,1 \
| /cluster/home/jsp/scripts/tabFoldFilter.py > cleanFolds.tab
join -1 4 -2 1 -o "1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.2 1.9" rawFoldsDm3.bed cleanFolds.tab | sed -e 's/ */\t/g' > tmp1.bed
# second, remove poor predictions
# scripts can be found in cvs tree at: cvsroot/jsp/scripts/. They use a few modules which can be found at: cvsroot/jsp/py_modules
cat tmp1.bed | /cluster/home/jsp/scripts/bedRnassFilter.py --dangling --minAvrStemSize=3 | /cluster/home/jsp/scripts/bedRnassFilter.sh 1 3 \
| /cluster/home/jsp/scripts/roundListFloats.py -c9 > foldsDm3.bed
# clean up
rm tmp.bed tmp1.bed foldsDm2.bed foldsDm3Seq.tab rawFoldsDm3.bed tmp.fa cleanFolds.tab
# comment: all the above filtering did not remove any
# predictions... apparently dm2 and dm3 are quite similar.
# upload
hgLoadBed -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/evofold.sql dm3 evofold foldsDm3.bed
#############################################################################
# CONTRAST GENES (2007-10-02 markd)
# recieved predictions from Sam Gross <ssgross@stanford.edu>
cd /cluster/data/dm3/bed/contrastGene/
wget http://www.stanford.edu/~ssgross/contrast.dm3.bed
# this is a custom track, not a pure BED
tail +2 contrast.dm3.bed | hgLoadBed -tab dm3 contrastGene stdin
# verify
# load track db (ra and contrastGene.html are global
# request push of contrastGene
###########################################################################
# GENE DISRUPTION PROJECT/PSCREEN (DONE 5/1/08 angie)
mkdir /cluster/data/dm3/bed/pscreen
cd /cluster/data/dm3/bed/pscreen
# Saved Excel spreadsheet emailed by Karen Schulze kschulze at bcm dot
# tmc dot edu as tab-separated text. Translate to our table format;
dos2unix pscreen.txt
tail +2 pscreen.txt \
| perl -we ' \
while (<>) { \
chomp; \
($stock, $strain, $arm, $end, $ori, undef, undef, \
$gene1, undef, $gene2, undef, $gene3) = split("\t"); \
next if (! $end); \
$chr = "chr" . $arm; \
$strand = ($ori eq "Minus") ? "-" : "+"; \
$geneCount = 0; $geneIds = ""; \
foreach $gene ($gene1, $gene2, $gene3) { \
if ($gene) { $geneCount++; $geneIds .= "$gene,"; } \
} \
print join("\t", $chr, $end-1, $end, $strain, 0, $strand, \
$stock, $geneCount, $geneIds) . "\n"; \
}' \
| sort -k1,1 -k2n,2n \
> pscreen.bed
hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/pscreen.sql -notItemRgb \
-onServer -tab dm3 pscreen pscreen.bed
#Loaded 7672 elements of size 9
rm bed.tab
#########################################################################
# hgPal downloads (DONE braney 2008-09-30)
ssh hgwdev
screen
bash
rm -rf /cluster/data/dm3/bed/multiz15way/pal
mkdir /cluster/data/dm3/bed/multiz15way/pal
cd /cluster/data/dm3/bed/multiz15way/pal
echo $db > order.lst
echo "select settings from trackDb where tableName='$mz'" | \
hgsql $db | sed 's/\\n/\n/g'| grep speciesOrder | tr ' ' '\n' | \
tail -n +2 >> order.lst
mz=multiz15way
gp=refGene
db=dm3
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > refGene.jobs
time sh -x refGene.jobs > refGene.job.log 2>&1 &
sleep 1
tail -f refGene.job.log
# real 35m20.802s
# user 5m26.007s
# sys 1m25.462s
zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
# we're only distributing exons at the moment
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
mz=multiz15way
gp=flyBaseGene
db=dm3
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.$mz.jobs
time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
sleep 1
tail -f $gp.$mz.job.log
# real 35m13.992s
# user 5m18.930s
# sys 1m27.205s
zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
# we're only distributing exons at the moment
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
#########################################################################
# LIFTOVER TO DM2 (DONE 6/11/08 angie)
doSameSpeciesLiftOver.pl dm3 dm2 > & dm3ToDm2.log & tail -f dm3ToDm2.log
mv dm3ToDm2.log /cluster/data/dm3/bed/blat.dm2.2008-06-11/
#########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd -- UNDONE 2008-10-27 markd)
update genbank.conf:
dm3.upstreamGeneTbl = refGene
dm3.upstreamMaf = multiz15way /hive/data/genomes/dm3/bed/multiz15way/species.lst
# Those lines have been removed because Ann, Angie and Mark decided to
# use FlyBase Genes as the reference set instead of RefSeq.
#########################################################################
# FLYBASE 5.12 ANNOTATIONS (DONE 10/21/08 angie)
mkdir /hive/data/genomes/dm3/bed/flybase5.11
cd /hive/data/genomes/dm3/bed/flybase5.11
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.12_FB2008_09/gff/dmel-all-r5.12.gff.gz
gunzip dmel-all-r5.12.gff.gz
# What data sources are represented in this file?
grep -v '^#' dmel-all-r5.12.gff | awk '{print $2 "\t" $3;}' | sort | uniq -c
# excerpt (many other sources, including blastx:... , sim4:... and
# tblastx:...; also various other types for source "FlyBase" and "."):
72096 FlyBase CDS
69371 FlyBase exon
21140 FlyBase five_prime_UTR
15140 FlyBase gene
21243 FlyBase mRNA
90 FlyBase miRNA
154 FlyBase ncRNA
95 FlyBase pseudogene
161 FlyBase rRNA
47 FlyBase snRNA
249 FlyBase snoRNA
314 FlyBase tRNA
15546 FlyBase three_prime_UTR
5413 FlyBase transposable_element
43178 FlyBase transposable_element_insertion_site
# Same types as 5.1, good.
# What keywords are defined in the 9th field? (again, ignoring a bunch)
grep -v '^#' dmel-all-r5.12.gff \
| awk '{print $9;}' \
| perl -wpe 'while (s/(\w+)=//) {print "$1\n";} s/^.*\n$//;' \
| sort | uniq -c
79889 Alias
408477 Dbxref
21243 Derives_from
3079555 ID
7236214 Name
4460030 Parent
5376450 Target
1384 cyto_range
# Comparable to 5.1, good.
cp ../flybase5.3/extractGenes.pl .
time ./extractGenes.pl dmel-all-r5.12.gff
#Noncoding FBtr0070097 has CG ID (CG18275-RA)
#Keywords that have been ignored in one or more line types:
#CDS: ID
#CDS: Name
#exon: Dbxref
#exon: ID
#exon: Name
#gene: Alias
#gene: Ontology_term
#gene: derived_computed_cyto
#gene: fullname
#gene: gbunit
#mRNA: Alias
#mRNA: Name
#mRNA: score
#mRNA: score_text
#noncoding: Alias
#noncoding: score
#noncoding: score_text
#protein: Alias
#protein: Dbxref
#protein: Name
#protein: derived_isoelectric_point
#protein: derived_molecular_weight
#81.970u 2.490s 1:25.12 99.2% 0+0k 0+0io 0pf+0w
mv flyBaseGene.src=FlyBase.gtf flyBaseGene.gtf
# Get predicted proteins (for main annotations only)
wget ftp://flybase.net/genomes/Drosophila_melanogaster/dmel_r5.12_FB2008_09/fasta/dmel-all-translation-r5.12.fasta.gz
# Use the FBgn IDs in headers to retrieve CG-R's to match flyBaseGene:
zcat dmel-all-translation-r5.12.fasta.gz \
| perl -we 'open(X, "flyBase2004Xref.tab") || die; \
while (<X>) { \
@w = split("\t"); \
$fbtr2cgr{$w[3]} = $w[0]; \
} \
while (<>) { \
if (/^>.*parent=FBgn\d+,(FBtr\d+)/) { \
$cgr = $fbtr2cgr{$1}; \
if ($cgr) { \
s/^>.*/>$cgr/; \
} else { die; } \
} \
print; \
} \
' \
> flyBasePep.fa
# Load Protein-coding genes:
ldHgGene -gtf dm3 flyBaseGene flyBase*.gtf
# 21243 groups 14 seqs 1 sources 2 feature types
hgPepPred dm3 generic flyBasePep flyBasePep.fa
# Noncoding genes:
hgLoadBed dm3 flyBaseNoncoding flyBaseNoncoding.bed
#Loaded 1063 elements of size 12
# Cross-referencing info for both coding and noncoding:
hgLoadSqlTab dm3 flyBase2004Xref ~/kent/src/hg/lib/flyBase2004Xref.sql \
flyBase2004Xref.tab
# Make sure there are no joinerCheck errors amongst those 3 tables
# at this point, because if left here they can spread via doHgNearBlastp.
# Since this is an update of those tables to a new FlyBase version,
# and other hgNear tables already exist with the old version's set of
# IDs, ignore errors for missing CG items in tables that will be
# rebuilt in the hgNear process.:
runJoiner.csh dm3 flyBasePep
# See if there are still large regions where refGene has genes but
# flybase doesn't:
featureBits dm3 flyBaseGene -or flyBaseNoncoding -bed=tmp.bed
featureBits dm3 refGene \!tmp.bed -minSize=1000 -bed=stdout
#279626 bases of 162367812 (0.172%) in intersection
# Spot-checked... some transcripts removed here and there, looks OK.
# add upstream* downloadable files
mkdir /hive/data/genomes/dm3/bed/flybase5.12/bigZips
cd /hive/data/genomes/dm3/bed/flybase5.12/bigZips
foreach size (1000 2000 5000)
echo upstream$size
nice featureBits dm3 flyBaseGene:upstream:$size -fa=stdout \
| nice gzip -c > upstream$size.fa.gz
end
nice md5sum *.gz > md5sum.txt
rm /usr/local/apache/htdocs/goldenPath/dm3/bigZips/upstream?000.fa.gz
ln -s /hive/data/genomes/dm3/bed/flybase5.12/bigZips/*.gz \
/usr/local/apache/htdocs/goldenPath/dm3/bigZips/
# When updating, edit out the old upstream* sums from md5sum.txt, then:
cat md5sum.txt \
>> /usr/local/apache/htdocs/goldenPath/dm3/bigZips/md5sum.txt
# Now update a bunch of tables:
# arbExpDistance, *BlastTab, flyBase{Canonical,Isoforms}, flyBaseTo*,
# and any other tables used in conjunction with flyBaseGene in hgGene
# or hgNear.
#########################################################################
+# LIFTOVER BDTNP DATA FROM DM2 (DONE 1/29/09 angie - REDONE with more data 2/2/09)
+ ssh hgwdev
+ mkdir /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver
+ cd /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver
+ # Original files are varStep with unspecified step=1 -- translate
+ # to bed3, liftOver, and translate back to varStep:
+ foreach f (/hive/data/genomes/dm2/bed/bdtnpChipper/*.wig)
+ echo $f:t:r
+ grep -v ^track $f \
+ | perl -wpe 'chomp; @w = split(/[ \t=]/); \
+ if ($w[0] eq "variableStep") { $chr = $w[2]; $_ = ""; } \
+ elsif (defined $w[1]) { $_ = "$chr\t" . ($w[0]-1) . "\t$w[0]\t$w[1]\n"; }' \
+ | liftOver -bedPlus=4 stdin /hive/data/genomes/dm2/bed/liftOver/dm2ToDm3.over.chain.gz \
+ stdout $f:t:r.unmapped \
+ | sort -k1,1 -k2n,2n \
+ | perl -wpe 'chomp; @w=split("\t"); next unless $w[3]; \
+ print "variableStep chrom=$w[0]\n" if (\!defined $prevChr || $prevChr ne $w[0]); \
+ $_ = "$w[2]\t$w[3]\n"; $prevChr = $w[0];' \
+ > $f:t:r.lo.wig
+ end
+ wc -l *.unmapped | grep -v '^ 0 '
+# 2 bdtnpDl3Fdr25.unmapped
+# 2 total
+ # This is the one unmapped item:
+ cat bdtnpDl3Fdr25.unmapped
+##Deleted in new
+#chrX 3521682 3521683 0.983636307483747
+
+ mkdir wigEncoded
+ cd wigEncoded
+ foreach f (../*Fdr*.lo.wig)
+ wigEncode $f $f:t:r:r.{wig,wib}
+ end
+ # Same output as for dm2, good:
+#Converted ../bdtnpBcd1Fdr1.wig, upper limit 8.14, lower limit 0.80
+#Converted ../bdtnpBcd1Fdr25.wig, upper limit 8.14, lower limit 0.80
+#Converted ../bdtnpBcd2Fdr1.wig, upper limit 10.74, lower limit 0.68
+#Converted ../bdtnpBcd2Fdr25.wig, upper limit 10.74, lower limit 0.68
+#Converted ../bdtnpCad1Fdr1.wig, upper limit 16.20, lower limit 1.06
+#Converted ../bdtnpCad1Fdr25.wig, upper limit 16.20, lower limit 0.48
+#Converted ../bdtnpGt2Fdr1.wig, upper limit 22.22, lower limit 1.21
+#Converted ../bdtnpGt2Fdr25.wig, upper limit 22.22, lower limit 0.49
+#Converted ../bdtnpHb1Fdr1.wig, upper limit 14.38, lower limit 0.92
+#Converted ../bdtnpHb1Fdr25.wig, upper limit 14.38, lower limit 0.86
+#Converted ../bdtnpHb2Fdr1.wig, upper limit 24.25, lower limit 0.88
+#Converted ../bdtnpHb2Fdr25.wig, upper limit 24.25, lower limit 0.78
+#Converted ../bdtnpKni1Fdr1.wig, upper limit 8.08, lower limit 1.78
+#Converted ../bdtnpKni1Fdr25.wig, upper limit 8.08, lower limit 1.01
+#Converted ../bdtnpKni2Fdr1.wig, upper limit 8.44, lower limit 1.07
+#Converted ../bdtnpKni2Fdr25.wig, upper limit 8.44, lower limit 0.76
+#Converted ../bdtnpKr1Fdr1.wig, upper limit 14.47, lower limit 0.68
+#Converted ../bdtnpKr1Fdr25.wig, upper limit 14.47, lower limit 0.68
+#Converted ../bdtnpKr2Fdr1.wig, upper limit 12.92, lower limit 0.66
+#Converted ../bdtnpKr2Fdr25.wig, upper limit 12.92, lower limit 0.66
+#Converted ../bdtnpPolIIFdr1.wig, upper limit 30.86, lower limit 0.93
+#Converted ../bdtnpPolIIFdr25.wig, upper limit 30.86, lower limit 0.70
+#Converted ../bdtnpZ2Fdr1.wig, upper limit 11.49, lower limit 0.95
+#Converted ../bdtnpZ2Fdr25.wig, upper limit 11.49, lower limit 0.79
+#Converted ../bdtnpD1Fdr1.wig, upper limit 20.33, lower limit 0.51
+#Converted ../bdtnpD1Fdr25.wig, upper limit 20.33, lower limit 0.51
+#Converted ../bdtnpDa2Fdr1.wig, upper limit 26.09, lower limit 0.79
+#Converted ../bdtnpDa2Fdr25.wig, upper limit 26.09, lower limit 0.49
+#Converted ../bdtnpDl3Fdr1.wig, upper limit 16.97, lower limit 0.65
+#Converted ../bdtnpDl3Fdr25.wig, upper limit 16.97, lower limit 0.62
+#Converted ../bdtnpFtz3Fdr1.wig, upper limit 16.54, lower limit 1.07
+#Converted ../bdtnpFtz3Fdr25.wig, upper limit 16.54, lower limit 0.84
+#Converted ../bdtnpH1Fdr1.wig, upper limit 18.52, lower limit 0.68
+#Converted ../bdtnpH1Fdr25.wig, upper limit 18.52, lower limit 0.68
+#Converted ../bdtnpH2Fdr1.wig, upper limit 13.23, lower limit 0.68
+#Converted ../bdtnpH2Fdr25.wig, upper limit 13.23, lower limit 0.65
+#Converted ../bdtnpHkb1Fdr1.wig, upper limit 11.84, lower limit 1.03
+#Converted ../bdtnpHkb1Fdr25.wig, upper limit 11.84, lower limit 0.35
+#Converted ../bdtnpHkb2Fdr1.wig, upper limit 10.73, lower limit 0.87
+#Converted ../bdtnpHkb2Fdr25.wig, upper limit 10.73, lower limit 0.33
+#Converted ../bdtnpHkb3Fdr1.wig, upper limit 10.50, lower limit 0.90
+#Converted ../bdtnpHkb3Fdr25.wig, upper limit 10.50, lower limit 0.37
+#Converted ../bdtnpMad2Fdr1.wig, upper limit 12.58, lower limit 0.67
+#Converted ../bdtnpMad2Fdr25.wig, upper limit 12.58, lower limit 0.67
+#Converted ../bdtnpMed2Fdr1.wig, upper limit 12.58, lower limit 0.67
+#Converted ../bdtnpMed2Fdr25.wig, upper limit 12.58, lower limit 0.67
+#Converted ../bdtnpPrd1Fdr1.wig, upper limit 13.97, lower limit 0.78
+#Converted ../bdtnpPrd1Fdr25.wig, upper limit 13.97, lower limit 0.78
+#Converted ../bdtnpPrd2Fdr1.wig, upper limit 12.00, lower limit 0.84
+#Converted ../bdtnpPrd2Fdr25.wig, upper limit 12.00, lower limit 0.84
+#Converted ../bdtnpRun1Fdr1.wig, upper limit 10.02, lower limit 0.60
+#Converted ../bdtnpRun1Fdr25.wig, upper limit 10.02, lower limit 0.60
+#Converted ../bdtnpRun2Fdr1.wig, upper limit 7.58, lower limit 1.16
+#Converted ../bdtnpRun2Fdr25.wig, upper limit 7.58, lower limit 0.58
+#Converted ../bdtnpShn2Fdr1.wig, upper limit 12.62, lower limit 1.19
+#Converted ../bdtnpShn2Fdr25.wig, upper limit 12.62, lower limit 0.81
+#Converted ../bdtnpShn3Fdr1.wig, upper limit 10.28, lower limit 1.09
+#Converted ../bdtnpShn3Fdr25.wig, upper limit 10.28, lower limit 0.95
+#Converted ../bdtnpSlp11Fdr1.wig, upper limit 40.39, lower limit 1.06
+#Converted ../bdtnpSlp11Fdr25.wig, upper limit 40.39, lower limit 0.62
+#Converted ../bdtnpSna1Fdr1.wig, upper limit 8.41, lower limit 0.97
+#Converted ../bdtnpSna1Fdr25.wig, upper limit 8.41, lower limit 0.77
+#Converted ../bdtnpSna2Fdr1.wig, upper limit 14.30, lower limit 0.57
+#Converted ../bdtnpSna2Fdr25.wig, upper limit 14.30, lower limit 0.57
+#Converted ../bdtnpTFIIB1Fdr1.wig, upper limit 21.81, lower limit 0.59
+#Converted ../bdtnpTFIIB1Fdr25.wig, upper limit 21.81, lower limit 0.59
+#Converted ../bdtnpTll1Fdr1.wig, upper limit 16.66, lower limit 0.98
+#Converted ../bdtnpTll1Fdr25.wig, upper limit 16.66, lower limit 0.57
+#Converted ../bdtnpTwi1Fdr1.wig, upper limit 26.11, lower limit 0.77
+#Converted ../bdtnpTwi1Fdr25.wig, upper limit 26.11, lower limit 0.68
+#Converted ../bdtnpTwi2Fdr1.wig, upper limit 26.66, lower limit 0.79
+#Converted ../bdtnpTwi2Fdr25.wig, upper limit 26.66, lower limit 0.55
+
+ mkdir /gbdb/dm3/bdtnp
+ ln -s /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver/wigEncoded/*.wib /gbdb/dm3/bdtnp/
+ cd /hive/data/genomes/dm3/bed/bdtnpChipperLiftOver/wigEncoded
+ foreach f (*.wig)
+ hgLoadWiggle -pathPrefix=/gbdb/dm3/bdtnp dm3 $f:r $f
+ end
+ rm wiggle.tab
+
+