src/hg/makeDb/doc/droMoj2.txt 1.4
1.4 2009/03/16 21:20:35 angie
LiftOver to dm3 for genome list q.
Index: src/hg/makeDb/doc/droMoj2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/droMoj2.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/makeDb/doc/droMoj2.txt 21 Aug 2006 21:36:29 -0000 1.3
+++ src/hg/makeDb/doc/droMoj2.txt 16 Mar 2009 21:20:35 -0000 1.4
@@ -1,675 +1,686 @@
# for emacs: -*- mode: sh; -*-
# Drosophila mojavensis --
#
# Agencourt's 1 August 2005 assembly
#
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+---------------------------------+
# | tableName | type |
# +-------------+---------------------------------+
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | geneMapper | genePred |
# | genscan | genePred genscanPep |
# +-------------+---------------------------------+
# DOWNLOAD SEQUENCE (DONE 8/5/05 angie)
ssh kkstore02
mkdir /cluster/store11/droMoj2
cd /cluster/data
ln -s /cluster/store11/droMoj2 droMoj2
cd /cluster/data/droMoj2
mkdir jkStuff bed
mkdir downloads
cd downloads
wget http://rana.lbl.gov/drosophila/assemblies/dmoj_agencourt_arachne_01aug05.tar.gz
tar xvzf dmoj_agencourt_arachne_01aug05.tar.gz
cd agencourt_arachne_01aug2005
faSize scaffolds.fa
#194270144 bases (13750513 N's 180519631 real 180519631 upper 0 lower) in 6843 sequences in 1 files
#Total size: mean 28389.6 sd 757113.0 min 101 (scaffold_5710) max 34172700 (scaffold_6540) median 1671
#N count: mean 2009.4 sd 16940.3
#U count: mean 26380.2 sd 746670.3
#L count: mean 0.0 sd 0.0
# PARTITION SCAFFOLDS FOR REPEATMASKER RUN (DONE 8/5/05 angie)
# Chop up large scaffolds into ~500kb chunks for RepeatMasker,
# then glom the tiny scaffolds up into ~500k collections (looks like
# some almost-500k pieces are glommed together --> a few almost-1M chunks,
# but that's OK).
ssh kkstore02
cd /cluster/data/droMoj2
mkdir scaffoldsSplit
mv downloads/agencourt_arachne_01aug2005/scaffolds.fa .
faSplit size scaffolds.fa 500000 -oneFile \
scaffoldsSplit -lift=jkStuff/scaffoldsSplit.lft
mkdir chunks500k
faSplit about scaffoldsSplit.fa 500000 chunks500k/chunk_
# CREATING DATABASE (DONE 8/5/05 angie)
# Create the database.
ssh hgwdev
# Make sure there is at least 5 gig free for the database
df -h /var/lib/mysql
#/dev/sdc1 /dev/sdc1 1.8T 970G 690G 59% /var/lib/mysql
hgsql '' -e 'create database droMoj2'
# Copy the table "grp" from an existing database to the new database
hgsql droMoj2 -e 'create table grp (PRIMARY KEY(NAME)) select * from dm2.grp'
# RUN REPEAT MASKER (DONE 8/5/05 angie)
ssh kkstore02
cat /cluster/bluearc/RepeatMasker/Libraries/version
#RepBase Update 9.11, RM database version 20050112
# make the run directory, output directory, and job list
cd /cluster/data/droMoj2
cat << '_EOF_' > jkStuff/RMDrosophila
#!/bin/csh -fe
cd $1
/bin/mkdir -p /tmp/droMoj2/$2
/bin/cp ../chunks500k/$2 /tmp/droMoj2/$2/
pushd /tmp/droMoj2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2
popd
/bin/cp /tmp/droMoj2/$2/$2.out ./
/bin/rm -fr /tmp/droMoj2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/droMoj2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/droMoj2
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMDrosophila
mkdir RMRun RMOut
cp /dev/null RMRun/RMJobs
foreach f ( chunks500k/*.fa )
set chunk = $f:t
echo ../jkStuff/RMDrosophila \
/cluster/data/droMoj2/RMOut $chunk \
'{'check in line+ /cluster/data/droMoj2/$f'}' \
'{'check out line+ /cluster/data/droMoj2/RMOut/$chunk.out'}' \
>> RMRun/RMJobs
end
# do the run
ssh kk9
cd /cluster/data/droMoj2/RMRun
para make RMJobs
para time
#Completed: 379 of 379 jobs
#Average job time: 3453s 57.55m 0.96h 0.04d
#Longest finished job: 5584s 93.07m 1.55h 0.06d
#Submission to last job: 16519s 275.32m 4.59h 0.19d
# Lift up the split-scaffold .out's to scaffold .out's
ssh kkstore02
cd /cluster/data/droMoj2
foreach f (RMOut/*.fa.out)
liftUp $f:r:r.scaf.out jkStuff/scaffoldsSplit.lft warn $f > /dev/null
end
# Make a consolidated scaffold .out file too:
head -3 RMOut/chunk_00.fa.out > RMOut/scaffolds.fa.out
foreach f (RMOut/*.scaf.out)
tail +4 $f >> RMOut/scaffolds.fa.out
end
# Load the .out files into the database with:
ssh hgwdev
hgLoadOut droMoj2 /cluster/data/droMoj2/RMOut/scaffolds.fa.out
# hgLoadOut made a "scaffolds_rmsk" table even with -table=rmsk,
# but we want a non-split with no prefix table:
hgsql droMoj2 -e 'rename table scaffolds_rmsk to rmsk'
# Fix up the indices too:
hgsql droMoj2 -e 'drop index bin on rmsk; \
drop index genoStart on rmsk; \
drop index genoEnd on rmsk; \
create index bin on rmsk (genoName(7), bin); \
create index genoStart on rmsk (genoName(7), genoStart);'
# EXTRACT AGP FROM ASSEMBLY.LINKS FILE (DONE 1/23/06 angie)
ssh hgwdev
cd /cluster/data/droMoj2
# assembly.links includes negative gap values which are supposed to
# indicate overlap between contigs, but when scaffold sequences were
# generated, Mike Eisen chose to use +25 instead of the negative values.
# So replace negative gap values with 25 for consistency with the
# scaffold sequences, then translate to AGP:
perl -wpe 'next if (/^#/); @w = split; \
$w[6] = 25 if ($w[6] < 0); $w[7] = 25 if ($w[7] < 0); \
$_ = join("\t", @w) . "\n";' \
downloads/agencourt_arachne_01aug2005/assembly.links \
| ~/kent/src/utils/arachneLinksToAgp.pl \
> scaffolds.agp
nice checkAgpAndFa scaffolds.agp scaffolds.fa | tail
hgGoldGapGl -noGl droMoj2 scaffolds.agp
# EXTRACTING GAP INFO FROM BLOCKS OF NS (DONE 8/5/04 angie)
ssh kkstore02
mkdir /cluster/data/droMoj2/bed/fakeAgp
cd /cluster/data/droMoj2/bed/fakeAgp
faGapSizes ../../scaffolds.fa \
-niceSizes=5,10,20,25,30,40,50,100,250,500,1000,10000,100000
# A disproportionately large number of gaps are exactly 25bp long, so
# hgFakeAgp's default -minContigGap of 25 will be fine.
hgFakeAgp ../../scaffolds.fa fake.agp
ssh hgwdev
hgLoadGap -unsplit droMoj2 /cluster/data/droMoj2/bed/fakeAgp/fake.agp
# SIMPLE REPEATS (TRF) (DONE 8/5/05 angie)
ssh kolossus
mkdir /cluster/data/droMoj2/bed/simpleRepeat
cd /cluster/data/droMoj2/bed/simpleRepeat
nice trfBig -trf=/cluster/bin/i386/trf ../../scaffolds.fa \
/dev/null -bedAt=simpleRepeat.bed -tempDir=/tmp \
|& egrep -v '^(Removed|Tandem|Copyright|Loading|Allocating|Initializing|Computing|Scanning|Freeing)' \
> trf.log &
# check on this with
tail -f trf.log
# Load this into the database as so
ssh hgwdev
hgLoadBed droMoj2 simpleRepeat \
/cluster/data/droMoj2/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 8/5/05 angie)
# make a filtered version of the trf output:
# keep trf's with period <= 12:
ssh kkstore02
cd /cluster/data/droMoj2/bed/simpleRepeat
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 8/5/05 angie)
ssh kkstore02
cd /cluster/data/droMoj2
maskOutFa -soft scaffolds.fa bed/simpleRepeat/trfMask.bed \
scaffolds.fa
maskOutFa -softAdd scaffolds.fa RMOut/scaffolds.fa.out scaffolds.fa
# Now clean up the unmasked split scaffolds to avoid confusion later.
rm -r chunks500k scaffoldsSplit.fa jkStuff/scaffoldsSplit.lft
# STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 8/5/05 angie)
# Translate to 2bit
ssh kkstore02
cd /cluster/data/droMoj2
faToTwoBit scaffolds.fa droMoj2.2bit
# Make chromInfo.tab.
mkdir bed/chromInfo
twoBitInfo droMoj2.2bit stdout \
| awk '{printf "%s\t%s\t/gbdb/droMoj2/droMoj2.2bit\n", $1, $2;}' \
> bed/chromInfo/chromInfo.tab
# Make symbolic a link from /gbdb/droMoj2 to the 2bit.
ssh hgwdev
mkdir -p /gbdb/droMoj2
ln -s /cluster/data/droMoj2/droMoj2.2bit /gbdb/droMoj2/
# Load chromInfo table.
hgsql droMoj2 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql droMoj2 -e 'load data local infile \
"/cluster/data/droMoj2/bed/chromInfo/chromInfo.tab" into table chromInfo'
# Make chrom.sizes from chromInfo contents and check scaffold count.
hgsql droMoj2 -N -e 'select chrom,size from chromInfo' \
> /cluster/data/droMoj2/chrom.sizes
wc -l /cluster/data/droMoj2/chrom.sizes
# 6843 /cluster/data/droMoj2/chrom.sizes
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 8/5/05 angie)
# Warning: genome and organism fields must correspond
# with defaultDb values
hgsql -h genome-testdb hgcentraltest -e \
'INSERT INTO dbDb \
(name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) values \
("droMoj2", "Aug. 2005", "/gbdb/droMoj2", "D. mojavensis", \
"scaffold_6500:23838548-23875233", 1, 57, \
"D. mojavensis", \
"Drosophila mojavensis", "/gbdb/droMoj2/html/description.html", \
0, 0, "Agencourt 1 August 2005");'
# This is not the first droAna, so defaultDb and genomeClade already
# have entries for D. mojavensis.
# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd ~/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit trackDb/makefile to add droMoj2 to the DBS variable.
mkdir drosophila/droMoj2
# Create a simple drosophila/droMoj2/description.html file.
cvs add drosophila/droMoj2
cvs add drosophila/droMoj2/description.html
make update DBS=droMoj2 ZOO_DBS=
# go public on genome-test
cvs ci makefile
cvs ci drosophila/droMoj2
mkdir /gbdb/droMoj2/html
# in a clean, updated tree's kent/src/hg/makeDb/trackDb:
make alpha
# PUT SEQUENCE ON /ISCRATCH FOR BLASTZ (DONE 8/5/05 angie)
# First, agglomerate small scaffolds into chunks of ~100k median
# (many scaffolds are larger than that) so we don't have too many
# files for one dir, but keep a reasonably low job run time:
ssh kkstore02
cd /cluster/data/droMoj2
mkdir chunksUnsplit
faSplit about scaffolds.fa 100000 chunksUnsplit/chunk_
ssh kkr1u00
mkdir /iscratch/i/droMoj2
rsync -av /cluster/data/droMoj2/chunksUnsplit /iscratch/i/droMoj2/
rsync -av /cluster/data/droMoj2/droMoj2.2bit /iscratch/i/droMoj2/
iSync
# PRODUCING GENSCAN PREDICTIONS (DONE 8/5/05 angie)
ssh kkstore02
# Make hard-masked scaffolds and split up for processing:
cd /cluster/data/droMoj2
maskOutFa scaffolds.fa hard scaffolds.fa.masked
mkdir chunksUnsplitMasked
faSplit about scaffolds.fa.masked 100000 chunksUnsplitMasked/chunk_
mkdir /cluster/data/droMoj2/bed/genscan
cd /cluster/data/droMoj2/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
ls -1S ../../chunksUnsplitMasked/chunk*.fa > chunks.list
cat << '_EOF_' > gsub
#LOOP
gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
ssh kki
cd /cluster/data/droMoj2/bed/genscan
gensub2 chunks.list single gsub jobList
para make jobList
para time
#Completed: 304 of 304 jobs
#Average job time: 24s 0.40m 0.01h 0.00d
#Longest finished job: 1097s 18.28m 0.30h 0.01d
#Submission to last job: 1097s 18.28m 0.30h 0.01d
# If there are crashes, diagnose with "para problems".
# If a job crashes due to genscan running out of memory, re-run it
# manually with "-window=1200000" instead of "-window=2400000".
# Concatenate scaffold-level results:
ssh kkstore02
cd /cluster/data/droMoj2/bed/genscan
cat gtf/*.gtf > genscan.gtf
cat subopt/*.bed > genscanSubopt.bed
cat pep/*.pep > genscan.pep
# Clean up
rm -r /cluster/data/droMoj2/chunksUnsplitMasked
# Load into the database as so:
ssh hgwdev
cd /cluster/data/droMoj2/bed/genscan
ldHgGene -gtf droMoj2 genscan genscan.gtf
hgPepPred droMoj2 generic genscanPep genscan.pep
hgLoadBed droMoj2 genscanSubopt genscanSubopt.bed
# MAKE DOWNLOADABLE FILES (DONE 8/5/05 angie)
ssh kkstore02
mkdir /cluster/data/droMoj2/zips
cd /cluster/data/droMoj2
gzip -c RMOut/scaffolds.fa.out > zips/scaffoldOut.gz
gzip -c scaffolds.fa > zips/scaffoldFa.gz
gzip -c scaffolds.fa.masked > zips/scaffoldFaMasked.gz
gzip -c bed/simpleRepeat/trfMask.bed > zips/scaffoldTrf.gz
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/droMoj2
cd /usr/local/apache/htdocs/goldenPath/droMoj2
mkdir bigZips database
# Create README.txt files in bigZips/ and database/ to explain the files.
cd bigZips
ln -s /cluster/data/droMoj2/zips/*.gz .
nice md5sum *.gz > md5sum.txt
# MAKE 11.OOC FILE FOR BLAT (DONE 8/5/05 angie)
# Use -repMatch=100 (based on size -- for human we use 1024, and
# fly size is ~4.4% of human judging by gapless dm1 genome size from
# featureBits -- we would use 45, but bump that up a bit to be more
# conservative).
ssh kkr1u00
mkdir /cluster/bluearc/droMoj2
blat /cluster/data/droMoj2/droMoj2.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/droMoj2/11.ooc -repMatch=100
#Wrote 17296 overused 11-mers to /cluster/bluearc/droMoj2/11.ooc
cp -p /cluster/bluearc/droMoj2/*.ooc /iscratch/i/droMoj2/
iSync
# AUTO UPDATE GENBANK MRNA RUN (TODO 8/5/05 angie)
ssh hgwdev
# Update genbank config and source in CVS:
cd ~/kent/src/hg/makeDb/genbank
cvsup .
# Edit etc/genbank.conf and add these lines (note scaffold-browser settings):
# droMoj2 (D. mojavensis)
droMoj2.genome = /iscratch/i/droMoj2/droMoj2.2bit
droMoj2.mondoTwoBitParts = 1000
droMoj2.lift = no
droMoj2.refseq.mrna.native.load = no
droMoj2.refseq.mrna.xeno.load = yes
droMoj2.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005
droMoj2.genbank.mrna.xeno.load = yes
# GenBank has no D. mojavensis ESTs at this point... that may change.
droMoj2.genbank.est.native.load = no
droMoj2.genbank.est.xeno.load = no
droMoj2.downloadDir = droMoj2
droMoj2.perChromTables = no
cvs ci etc/genbank.conf
# Update src/align/gbBlat to use /iscratch/i/droMoj2/11.ooc
cvs diff src/align/gbBlat
make
cvs ci src/align/gbBlat
# Install to /cluster/data/genbank:
make install-server
ssh `fileServer /cluster/data/genbank/`
cd /cluster/data/genbank
# This is an -initial run, (xeno) RefSeq only:
nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial droMoj2 &
tail -f [its logfile]
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droMoj2
featureBits droMoj2 xenoRefGene
#16520385 bases of 165766797 (9.966%) in intersection
# Clean up:
rm -rf work/initial.droMoj2
# This is an -initial run, mRNA only:
nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial droMoj2 &
tail -f [its logfile]
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droMoj2
featureBits droMoj2 all_mrna
#19602 bases of 165766797 (0.012%) in intersection
featureBits droMoj2 xenoMrna
#17295487 bases of 165766797 (10.434%) in intersection
# Clean up:
rm -rf work/initial.droMoj2
# SWAP CHAINS FROM DM2, BUILD NETS ETC. (IN PROGRESS 8/9/05 angie)
mkdir /cluster/data/droMoj2/bed/blastz.dm2.swap
cd /cluster/data/droMoj2/bed/blastz.dm2.swap
doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.droMoj2/DEF \
>& do.log
echo "check /cluster/data/droMoj2/bed/blastz.dm2.swap/do.log" \
| mail -s "check do.log" $USER
# Add {chain,net}Dm2 to trackDb.ra if necessary.
# MAKE GCPERCENT (DONE 8/9/05 angie)
ssh hgwdev
mkdir /cluster/data/droMoj2/bed/gc5Base
cd /cluster/data/droMoj2/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 droMoj2 \
/cluster/data/droMoj2 | wigEncode stdin gc5Base.wig gc5Base.wib
mkdir /gbdb/droMoj2/wib
ln -s `pwd`/gc5Base.wib /gbdb/droMoj2/wib
hgLoadWiggle -pathPrefix=/gbdb/droMoj2/wib droMoj2 gc5Base gc5Base.wig
# MAKE THIS THE DEFAULT ASSEMBLY WHEN THERE ARE ENOUGH TRACKS (TODO 8/?/05 angie)
hgsql -h genome-testdb hgcentraltest -e \
'UPDATE defaultDb set name = "droMoj2" where genome = "D. mojavensis";'
# MAKE Drosophila Proteins track (DONE braney 2005-08-20)
ssh kkstore02
mkdir -p /cluster/data/droMoj2/blastDb
cd /cluster/data/droMoj2/blastDb
faSplit sequence ../scaffolds.fa 400 x
for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
rm *.fa *.log
ssh kkr1u00
mkdir -p /iscratch/i/droMoj2/blastDb
cp /cluster/data/droMoj2/blastDb/* /iscratch/i/droMoj2/blastDb
iSync > sync.out
ssh kk
mkdir -p /cluster/data/droMoj2/bed/tblastn.dm2FB
cd /cluster/data/droMoj2/bed/tblastn.dm2FB
ls -1S /iscratch/i/droMoj2/blastDb/*.nsq | sed "s/\.nsq//" > target.lst
mkdir fbfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl|awk "{print \\\$1}"`/\(80000/`wc target.lst | awk "{print \\\$1}"`\)
# 18929/(80000/396) = 93.698550
split -l 94 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb
cd fbfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S fbfa/*.fa > fb.lst
mkdir -p /cluster/bluearc/droMoj2/bed/tblastn.dm2FB/blastOut
ln -s /cluster/bluearc/droMoj2/bed/tblastn.dm2FB/blastOut
for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.2
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3 $f.8 $f.4
exit 1
'_EOF_'
chmod +x blastSome
gensub2 target.lst fb.lst blastGsub blastSpec
para create blastSpec
para push
# Completed: 79991 of 79992 jobs
# para.results: file not found. paraHub can't write to this dir?
# CPU time in finished jobs: 5722807s 95380.12m 1589.67h 66.24d 0.181 y
# IO & Wait Time: 517066s 8617.76m 143.63h 5.98d 0.016 y
# Average job time: 78s 1.30m 0.02h 0.00d
# Longest finished job: 44845s 747.42m 12.46h 0.52d
# Submission to last job: 78192s 1303.20m 21.72h 0.91d
ssh pk
cd /cluster/data/droMoj2/bed/tblastn.dm2FB
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/fb?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
para create chainSpec
para maxNode 20
para push
# Completed: 202 of 202 jobs
# CPU time in finished jobs: 27335s 455.59m 7.59h 0.32d 0.001 y
# IO & Wait Time: 3390s 56.50m 0.94h 0.04d 0.000 y
# Average job time: 152s 2.54m 0.04h 0.00d
# Longest finished job: 4228s 70.47m 1.17h 0.05d
# Submission to last job: 4228s 70.47m 1.17h 0.05d
ssh kkstore02
cd /cluster/data/droMoj2/bed/tblastn.dm2FB/blastOut
for i in fb??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/droMoj2/bed/tblastn.dm2FB/blastDm2FB.psl
cd ..
wc blastDm2FB.psl
# 19158 402318 5264815 blastDm2FB.psl
pslUniq blastDm2FB.psl stdout | wc
# 17958 377118 5085091
cat fbfa/*fa | grep ">" | wc
# 82338 82338 1300520
ssh hgwdev
cd /cluster/data/droMoj2/bed/tblastn.dm2FB
hgLoadPsl droMoj2 blastDm2FB.psl
featureBits droMoj2 blastDm2FB
# 18239436 bases of 180520992 (10.104%) in intersection
exit
# back to kkstore02
rm -rf blastOut
# End tblastn
# AUTO UPDATE GENBANK MRNA AND EST GENES RUN (DONE, 2005-08-22, markd)
# align with revised genbank process
cd ~kent/src/makeDb/genbank
cvs update -d etc
# edit etc/genbank.conf to add droMoj2, had to run on pk, due to kk
# being down. Set temporary locations for server files
# droMoj2 (D. mojavensis)
# genbank has 1 mRNA and 0 ESTs at time of initial build
droMoj2.serverGenome = /cluster/data/droMoj2/droMoj2.2bit
##droMoj2.clusterGenome = /iscratch/i/droMoj2/droMoj2.2bit
##droMoj2.ooc = /iscratch/i/droMoj2/11.ooc
droMoj2.clusterGenome = /san/sanvol1/scratch/droMoj2/droMoj2.2bit
droMoj2.ooc = /san/sanvol1/scratch/droMoj2/11.ooc
droMoj2.lift = no
droMoj2.refseq.mrna.native.load = no
droMoj2.refseq.mrna.xeno.load = yes
droMoj2.refseq.mrna.xeno.pslCDnaFilter = -minCover=0.15 -coverNearTop=0.005 -minId=0.75 -idNearTop=0.005 -maxRepMatch=0.4 -bestOverlap
droMoj2.genbank.mrna.xeno.load = yes
droMoj2.genbank.est.native.load = no
droMoj2.genbank.est.xeno.load = no
droMoj2.downloadDir = droMoj2
droMoj2.perChromTables = no
# update /cluster/data/genbank/
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial droMoj2 &
# when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad droMoj2&
# AUTO UPDATE GENBANK MRNA AND EST GENES RUN (DONE, 2005-08-31, markd)
# redo genbank revised alignment procedure once again to
# pickup local near best pslCDnaFilter
# align with revised genbank process
cd ~kent/src/makeDb/genbank
cvs update -d etc
# edit etc/genbank.conf to add droMoj2
# droMoj2 (D. mojavensis)
# genbank has 1 mRNA and 0 ESTs at time of initial build
droMoj2.serverGenome = /cluster/data/droMoj2/droMoj2.2bit
droMoj2.clusterGenome = /iscratch/i/droMoj2/droMoj2.2bit
droMoj2.ooc = /iscratch/i/droMoj2/11.ooc
droMoj2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
droMoj2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
droMoj2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
droMoj2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
droMoj2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
droMoj2.genbank.est.xeno.pslCDnaFilter = ${lowCover.genbank.est.xeno.pslCDnaFilter}
droMoj2.lift = no
droMoj2.refseq.mrna.native.load = no
droMoj2.refseq.mrna.xeno.load = yes
droMoj2.genbank.mrna.xeno.load = yes
droMoj2.genbank.est.native.load = no
droMoj2.genbank.est.xeno.load = no
droMoj2.downloadDir = droMoj2
droMoj2.perChromTables = no
# update /cluster/data/genbank/
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial droMoj2 &
# when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad droMoj2&
# GENEMAPPER PREDICTIONS FROM UCB (DONE 1/24/06 angie)
ssh hgwdev
mkdir /cluster/data/droMoj2/bed/geneMapper
cd /cluster/data/droMoj2/bed/geneMapper
wget http://bio.math.berkeley.edu/genemapper/GFF/rel0.2/DroMoj_20050801.gff
# Don't use -genePredExt... there are no start/stop_codon items, so
# all get marked "incmpl", and name2 always gets the same value as name.
# Get rid of custom track header lines:
egrep -v '^(track|browser) ' DroMoj_20050801.gff \
| ldHgGene -gtf droMoj2 geneMapper stdin
+###########################################################################
+# LIFTOVER TO DROMOJ3 (DONE 3/16/09 angie)
+ doSameSpeciesLiftOver.pl droMoj2 droMoj3 -debug \
+ -ooc=/hive/data/staging/data/droMoj2/11.ooc -workhorse=kolossus
+# *** Steps were performed in /hive/data/genomes/droMoj2/bed/blat.droMoj3.2009-03-16
+ cd /hive/data/genomes/droMoj2/bed/blat.droMoj3.2009-03-16/
+ doSameSpeciesLiftOver.pl droMoj2 droMoj3 \
+ -ooc=/hive/data/staging/data/droMoj2/11.ooc -workhorse=kolossus
+
+
+###########################################################################