src/hg/makeDb/doc/taeGut1.txt 1.17
1.17 2009/06/02 23:50:26 markd
rebuild incorrect data that was submitted
Index: src/hg/makeDb/doc/taeGut1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/taeGut1.txt,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 1000000 -r1.16 -r1.17
--- src/hg/makeDb/doc/taeGut1.txt 18 Apr 2009 05:13:37 -0000 1.16
+++ src/hg/makeDb/doc/taeGut1.txt 2 Jun 2009 23:50:26 -0000 1.17
@@ -1,710 +1,715 @@
# for emacs: -*- mode: sh; -*-
# zebra finch ( Taeniopygia guttata )
#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-08-05)
ssh kkstore03
mkdir /cluster/store7/taeGut1
ln -s /cluster/store7/taeGut1 /cluster/data
mkdir /cluster/data/taeGut1/fromWustl
cd /cluster/data/taeGut1/fromWustl
wget "http://genome.wustl.edu/pub/user/lhillier/private/T_guttata/Taeniopygia_guttata-3.2.4.tar.gz"
gunzip *
mkdir tmp
cd tmp
tar xvf ../*.tar
cat *.fa | sed 's/Chr/chr/' | sed 's/LG/chrLG/' | gzip -c > ../assembly.bases.gz
cat *.agp | sed 's/Chr/chr/' | sed 's/LG/chrLG/' > ../assembly.agp
# some scaffolds missing quality values
XXX - this is broken. These *qual files are already in chrom coordinates,
XXX - they do not need a lift. The lift had the effect of applying -mScore=98
XXX - to all values. --Hiram 2008-12-04
cat *qual | sed 's/Chr/chr/' | sed 's/LG/chrLG/' | qaToQac stdin stdout | qacAgpLift -mScore=98 ../assembly.agp stdin ../taeGut1.qual.qac
cd ..
cut -f 1 assembly.agp | uniq -c | wc -l
# Number of scaffolds: 69
#########################################################################
# Create .ra file and run makeGenomeDb.pl (DONE braney 2008-08-05)
ssh kkstore03
cd /cluster/data/taeGut1
cat << _EOF_ >taeGut1.config.ra
# Config parameters for makeGenomeDb.pl:
db taeGut1
clade vertebrate
genomeCladePriority 68
scientificName Taeniopygia guttata
commonName Zebra finch
assemblyDate Jul. 2008
assemblyLabel Washington University taeGut1
orderKey 437
mitoAcc NC_007897
fastaFiles /cluster/data/taeGut1/fromWustl/assembly.bases.gz
agpFiles /cluster/data/taeGut1/fromWustl/assembly.agp
qualFiles /cluster/data/taeGut1/fromWustl/taeGut1.qual.qac
dbDbSpeciesDir zebraFinch
_EOF_
# use 'screen' make sure on kkstore03
makeGenomeDb.pl -verbose=2 taeGut1.config.ra > makeGenomeDb.out 2>&1 &
# 'ctl-a ctl -d' returns to previous shell
cut -f 2 chrom.sizes | ave stdin
# Q1 432944.500000
# median 2517995.000000
# Q3 17897912.000000
# average 17616947.728571
# min 9909.000000
# max 175225315.000000
# count 70
# total 1233186341.000000
# standard deviation 35378770.640295
#########################################################################
# SIMPLE REPEATS TRF (DONE braney 2008-08-05)
ssh kkstore03
screen # use a screen to manage this job
mkdir /cluster/data/taeGut1/bed/simpleRepeat
cd /cluster/data/taeGut1/bed/simpleRepeat
#
doSimpleRepeat.pl -buildDir=/cluster/data/taeGut1/bed/simpleRepeat \
taeGut1 > do.log 2>&1 &
#### When done
ssh pk
para time
featureBits taeGut1 simpleRepeat
# 25363702 bases of 1222864691 (2.074%) in intersection
# Link to it from /gbdb:
ln -s /cluster/data/taeGut1/taeGut1.2bit /gbdb/taeGut1/taeGut1.2bit
#########################################################################
# Run WindowMasker (DONE braney 2008-08-05)
ssh kkstore03
screen
mkdir /cluster/data/taeGut1/bed/windowMasker
cd /cluster/data/taeGut1/bed/windowMasker
nice doWindowMasker.pl -workhorse=kolossus \
-buildDir=/cluster/data/taeGut1/bed/windowMasker taeGut1 > wmRun.log 2>&1 &
# load this initial data to get ready to clean it
ssh hgwdev
cd /cluster/data/taeGut1/bed/windowMasker
hgLoadBed taeGut1 windowmaskerSdust windowmasker.sdust.bed.gz
# Loaded 7487462 elements of size 3
# eliminate the gaps from the masking
featureBits taeGut1 -not gap -bed=notGap.bed
featureBits taeGut1 windowmaskerSdust
# 260083327 bases of 1222864691 (21.268%) in intersection
time nice -n +19 featureBits taeGut1 windowmaskerSdust notGap.bed \
-bed=stdout | gzip -c > cleanWMask.bed.gz
# 249761677 bases of 1222864691 (20.424%) in intersection
# reload track to get it clean
hgLoadBed taeGut1 windowmaskerSdust cleanWMask.bed.gz
featureBits taeGut1 windowmaskerSdust
# 249761677 bases of 1222864691 (20.424%) in intersection
# mask the sequence with this clean mask
zcat cleanWMask.bed.gz \
| twoBitMask ../../taeGut1.unmasked.2bit stdin \
-type=.bed taeGut1.cleanWMSdust.2bit
twoBitToFa taeGut1.cleanWMSdust.2bit stdout | faSize stdin \
> taeGut1.cleanWMSdust.faSize.txt
cat taeGut1.cleanWMSdust.faSize.txt
# 1233186341 bases (10321650 N's 1222864691 real 973103014 upper 249761677
# lower) in 70 sequences in 1 files
# Total size: mean 17616947.7 sd 35634216.3 min 9909 (chr16) max 175225315
# (chrUn) median 2517995
# N count: mean 147452.1 sd 400992.9
# U count: mean 13901471.6 sd 27798925.9
# L count: mean 3568024.0 sd 7705836.1
# %20.25 masked total, %20.42 masked real
cp taeGut1.cleanWMSdust.2bit /cluster/data/taeGut1/taeGut1.2bit
ln -s /cluster/data/taeGut1/taeGut1.2bit /gbdb/taeGut1
#########################################################################
# Create OOC file for genbank runs (DONE braney 2008-08-05)
# use same repMatch value as bosTau2
ssh kkstore03
cd /cluster/data/taeGut1
blat taeGut1.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=taeGut1.11.1005.ooc -repMatch=1005
# -rw-rw-r-- 1 braney protein 6368 Aug 5 14:31 taeGut1.11.1005.ooc
blat taeGut1.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=taeGut1.11.ooc -repMatch=1024
# -rw-rw-r-- 1 braney protein 6044 Jul 30 16:03 taeGut1.11.ooc
mkdir -p /cluster/bluearc/scratch/data/taeGut1
cp taeGut1.2bit taeGut1.11.ooc chrom.sizes /cluster/bluearc/scratch/data/taeGut1
# request admins to sync these to pk
#########################################################################
## Create a lift file for genbank work(DONE braney 2008-08-05)
ssh kkstore03
cd /cluster/data/taeGut1
cat fromWustl/assembly.agp | /cluster/bin/scripts/agpToLift -revStrand > jkStuff/liftAll.lft
#########################################################################
# GENBANK AUTO UPDATE (DONE braney 2008-08-06)
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add taeGut1
# taeGut1
taeGut1.serverGenome = /cluster/data/taeGut1/taeGut1.2bit
taeGut1.clusterGenome = /scratch/data/taeGut1/taeGut1.2bit
taeGut1.ooc = /scratch/data/taeGut1/11.ooc
taeGut1.align.unplacedChroms = chrUn,chr*_random
taeGut1.lift = /cluster/data/taeGut1/jkStuff/liftAll.lft
taeGut1.refseq.mrna.native.pslCDnaFilter =
${ordered.refseq.mrna.native.pslCDnaFilter}
taeGut1.refseq.mrna.xeno.pslCDnaFilter =
${ordered.refseq.mrna.xeno.pslCDnaFilter}
taeGut1.genbank.mrna.native.pslCDnaFilter =
${ordered.genbank.mrna.native.pslCDnaFilter}
taeGut1.genbank.mrna.xeno.pslCDnaFilter =
${ordered.genbank.mrna.xeno.pslCDnaFilter}
taeGut1.genbank.est.native.pslCDnaFilter =
${ordered.genbank.est.native.pslCDnaFilter}
taeGut1.refseq.mrna.native.load = yes
taeGut1.refseq.mrna.xeno.load = yes
taeGut1.genbank.mrna.xeno.load = yes
taeGut1.downloadDir = taeGut1
cvs ci -m "Added taeGut1." etc/genbank.conf
# add taeGut to src/lib/gbGenome.c
cvs ci -m "Added taeGut" src/lib/gbGenome.c
# update /cluster/data/genbank/:
make etc-update
ssh genbank
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial taeGut1 &
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad taeGut1
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add taeGut1 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added taeGut1 - zebra finch" etc/align.dbs etc/hgwdev.dbs
make etc-update
############################################################################
## Default position (DONE braney 2008-08-06)
ssh hgwdev
hgsql -e 'update dbDb set defaultPos="chrZ:56,293,325-56,333,199"
where name="taeGut1";' hgcentraltest
############################################################################
# MAKE DOWNLOADABLE FILES (restarted braney 2008-08-02)
ssh kkstore03
screen
cd /cluster/data/taeGut1
# edited makeDownloads.pl to take out repeat masker files
# makeDownloads.pl taeGut1 > downloads.log 2>&1 &
/cluster/bin/scripts/fripple taeGut1 > downloads.log 2>&1 &
############################################################################
## Adding a Photograph, from Art Arnold in email (restarted braney 2008-08-03)
mkdir /cluster/data/taeGut1/photograph
cd /cluster/data/taeGut1/photograph
ls -l Taeniopygia_guttata.jpg
# -rw-rw-r-- 1 braney protein 16160 Aug 4 09:43 Taeniopygia_guttata.jpg
# check this .jpg file into the browser doc source tree
cvs add -kb Taeniopygia_guttata.jpg
cvs commit Taeniopygia_guttata.jpg
cp -p Taeniopygia_guttata.jpg /usr/local/apache/htdocs/images
############################################################################
##########################################################################
## Creating pushQ (DONE - braney - 2008-08-06 )
ssh hgwdev
mkdir /cluster/data/taeGut1/pushQ
cd /cluster/data/taeGut1/pushQ
/cluster/bin/scripts/makePushQSql.pl taeGut1 > taeGut1.sql 2> stderr.out
## check the stderr.out for anything that needs to be fixed
scp taeGut1.sql hgwbeta.cse.ucsc.edu:/tmp
ssh hgwbeta
cd /tmp
hgsql qapushq < taeGut1.sql
############################################################################
# BLATSERVERS ENTRY (DONE - 2008-08-09 - braney)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("taeGut1", "blat2", "17782", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("taeGut1", "blat2", "17783", "0", "1");' \
hgcentraltest
############################################################################
# BAC ends (working braney...)
ssh kkstore03
mkdir /cluster/data/taeGut1/bed/bacEnds
cd /cluster/data/taeGut1/bed/bacEnds
mkdir fromWustl
cd fromWustl
wget "http://genome.wustl.edu/pub/organism/Other_Vertebrates/Taeniopygia_guttata/end_sequences/bac_ends.fa.gz"
gunzip *
ssh hgwdev
mkdir -p /gbdb/taeGut1/bacEnds
ln -s `pwd`/bac_ends.fa /gbdb/taeGut1/bacEnds
cd /tmp
hgLoadSeq taeGut1 /gbdb/taeGut1/bacEnds/bac_ends.fa
ssh kkstore03
cd /cluster/data/taeGut1/bed/bacEnds
rm -rf /san/sanvol1/scratch/taeGut1.splitEnds
mkdir -p /san/sanvol1/scratch/taeGut1.splitEnds
faSplit sequence fromWustl/bac_ends.fa 400 /san/sanvol1/scratch/taeGut1.splitEnds/x
mkdir run.blat
cd run.blat
ls /san/sanvol1/scratch/taeGut1.splitEnds/*.fa > inSeq.lst
rm -rf /cluster/bluearc/taeGut1.blatEnds
mkdir -p /cluster/bluearc/taeGut1.blatEnds
for i in `cat inSeq.lst`
do
f=`basename $i .fa`.psl
echo /cluster/bin/x86_64/blat /scratch/data/taeGut1/taeGut1.2bit \
-ooc=/scratch/data/taeGut1/taeGut1.11.ooc -tileSize=11 \
$i {check out line+ /cluster/bluearc/taeGut1.blatEnds/$f}
done > jobList
ssh memk
cd /cluster/data/taeGut1/bed/bacEnds/run.blat
para create jobList
para time
# CPU time in finished jobs: 124366s 2072.77m 34.55h 1.44d 0.004 y
# IO & Wait Time: 865s 14.41m 0.24h 0.01d 0.000 y
# Average job time: 383s 6.38m 0.11h 0.00d
# Longest finished job: 557s 9.28m 0.15h 0.01d
# Submission to last job: 4817s 80.28m 1.34h 0.06d
ssh kkstore03
cd /cluster/data/taeGut1/bed/bacEnds
for i in /cluster/bluearc/taeGut1.blatEnds/*.psl; do tail +6 $i; done | pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons stdin bacEnds.psl /dev/null
cat fromWustl/bac_ends.fa | awk '/>/ {print $1}' | \
tr -d '>'| sed 's/\./ /' | \
awk '{clones[$1] = $2 " " clones[$1] } \
END {for (i in clones) print i, clones[i]}' | \
awk 'BEGIN {OFS="\t"} \
{first="";second=""; \
for(ii=2; ii <= NF ; ii++) \
{if (($ii == "b1") || ($ii == "b2")) \
first=$1 "." $ii "," first; \
if (($ii == "g1") || ($ii == "g2")) \
second=$1 "." $ii "," second;}
print first,second,$1}' | \
awk '{if (NF == 3) print}' > allPairs.txt
time /cluster/bin/x86_64/pslPairs -tInsert=10000 \
-minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl allPairs.txt \
all_bacends bacEnds
# Filter by score and sort by {chrom,chromStart}:
awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
cat bacEnds.{slop,short,long,mismatch,orphan} \
| awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $5}' bacEndPairs.bed | sort -u
/cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed \
| sorttbl tname tstart | headchg -del \
> bacEnds.load.psl
wc -l bacEnds.*
# 526720 bacEnds.load.psl
# 65 bacEnds.long
# 1412 bacEnds.mismatch
# 33163 bacEnds.orphan
# 111379 bacEnds.pairs
# 625401 bacEnds.psl
# 530 bacEnds.short
# 312 bacEnds.slop
# 1298982 total
# load into database
ssh hgwdev
cd /cluster/data/taeGut1/bed/bacEnds
hgLoadBed -notItemRgb taeGut1 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# note - the "Bad" track isn't pushed to RR, just used for assembly QA.
hgLoadBed -notItemRgb taeGut1 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
hgLoadPsl taeGut1 -table=all_bacends bacEnds.load.psl
featureBits taeGut1 all_bacends
# 154544748 bases of 1222864691 (12.638%) in intersection
featureBits taeGut1 bacEndPairs
# 118749166 bases of 1222864691 (9.711%) in intersection
featureBits taeGut1 bacEndPairsBad
# 21778830 bases of 1222864691 (1.781%) in intersection
#########################################################################
# PRODUCING GENSCAN PREDICTIONS (DONE 2008-08-29 braney)
ssh hgwdev
mkdir /cluster/data/taeGut1/bed/genscan
cd /cluster/data/taeGut1/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
ssh swarm
cd /cluster/data/taeGut1/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt fasta
for i in ../../goldenPath/chromosomes/*.gz
do
echo $i
maskOutFa $i hard fasta/`basename $i .gz`
done
ls fasta/*.fa > genome.lst
wc -l genome.lst
# 70 genome.lst
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst single gsub jobList
para make jobList
para time
#Completed: 255 of 259 jobs
#Crashed: 4 jobs
#CPU time in finished jobs: 82258s 1370.97m 22.85h 0.95d 0.003 y
#IO & Wait Time: 1053s 17.55m 0.29h 0.01d 0.000 y
#Average job time: 327s 5.45m 0.09h 0.00d
#Longest finished job: 20131s 335.52m 5.59h 0.23d
#Submission to last job: 23171s 386.18m 6.44h 0.27d
# two jobs crashed due to genscan running out of memory, re-run it
# with "-window=1200000" instead of "-window=2400000".
para crashed | sed 's/2400000/1200000/' > crash.jobs
para create -batch=crashBatch crash.jobs
para push -batch=crashBatch
# Convert these to chromosome level files as so:
cat gtf/*.gtf > genscan.gtf
cat subopt/*.bed > genscanSubopt.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/taeGut1/bed/genscan
ldHgGene taeGut1 genscan genscan.gtf
hgPepPred taeGut1 generic genscanPep genscan.pep
hgLoadBed taeGut1 genscanSubopt genscanSubopt.bed
# cleanup
rm -rf fasta
#####################################################
# build 2bit with contigs from random chroms and chrUn (reDONE braney 2008-09-06)
cd /cluster/data/taeGut1
fgrep 'random
Un' taeGut1.agp | awk '{if ($5 == "W") print $1, $2-1, $3, $6, $9}' \
| while read chr start stop contig strand;
do
echo ">$contig";
if test $strand == '+'
then
twoBitToFa taeGut1.2bit:$chr:$start-$stop stdout \
| tail -n +2;
else
twoBitToFa taeGut1.2bit:$chr:$start-$stop stdout | \
faRc -keepCase stdin stdout | tail -n +2;
fi
done > unRandom.fa
fgrep -v 'random
Un' chrom.sizes | awk '{print $1}' | \
twoBitToFa taeGut1.2bit -seqList=stdin noUn.fa
cat noUn.fa unRandom.fa | faToTwoBit stdin taeGut1.blastz.2bit
twoBitInfo taeGut1.blastz.2bit taeGut1.blastz.sizes
#########################################################################
## BLASTZ galGal3 swap (reDONE braney 2008-09-06)
## the original blastz to galGal3 measured
ssh swarm
mkdir /cluster/data/taeGut1/bed/blastz.galGal3.swap
cd /cluster/data/taeGut1/bed/blastz.galGal3.swap
screen
doBlastzChainNet.pl \
/cluster/data/galGal3/bed/blastz.taeGut1.2008-09-06/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=2 -bigClusterHub=swarm -smallCluster=swarm \
-workhorse=kolossus -swap > swap.log 2>&1 &
ssh hgwdev
cd /cluster/data/taeGut1/bed/blastz.galGal3.swap
nice -n +19 featureBits taeGut1 chainGalGal3Link \
> fb.taeGut1l.chainGalGal3Link.txt 2>&1
# 802731012 bases of 1222864691 (65.643%) in intersection
#########################################################################
## BLASTZ hg18 swap (reDONE braney 2008-09-10)
ssh swarm
mkdir /cluster/data/taeGut1/bed/blastz.hg18.swap
cd /cluster/data/taeGut1/bed/blastz.hg18.swap
screen
doBlastzChainNet.pl \
/cluster/data/hg18/bed/blastz.taeGut1.2008-09-09/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=2 -bigClusterHub=swarm -smallCluster=swarm \
-workhorse=kolossus -swap > swap.log 2>&1 &
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-09-07)
# bash if not using bash shell already
ssh kolossus
mkdir /cluster/data/taeGut1/blastDb
cd /cluster/data/taeGut1
awk '{if ($2 > 1000000) print $1}' taeGut1.blastz.sizes > 1meg.lst
twoBitToFa -seqList=1meg.lst taeGut1.blastz.2bit temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
rm temp.fa 1meg.lst
awk '{if ($2 <= 1000000) print $1}' taeGut1.blastz.sizes > less1meg.lst
twoBitToFa -seqList=less1meg.lst taeGut1.blastz.2bit temp.fa
faSplit about temp.fa 1000000 blastDb/y
cd blastDb
for i in *.fa
do
/hive/data/outside/blast229/formatdb -i $i -p F
done
rm *.fa
ls *.nsq | wc -l
# 1243
mkdir -p /cluster/data/taeGut1/bed/tblastn.hg18KG
cd /cluster/data/taeGut1/bed/tblastn.hg18KG
echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 1243 query.lst
# we want around 150000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(150000/`wc query.lst | awk '{print $1}'`\)
# 36727/(150000/1243) = 304.344407
mkdir -p kgfa
split -l 304 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/taeGut1/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/taeGut1/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
exit
ssh swarm
cd /cluster/data/taeGut1/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 150403 of 150403 jobs
# CPU time in finished jobs: 7194206s 119903.43m 1998.39h 83.27d 0.228 y
# IO & Wait Time: 576761s 9612.69m 160.21h 6.68d 0.018 y
# Average job time: 52s 0.86m 0.01h 0.00d
# Longest finished job: 120s 2.00m 0.03h 0.00d
# Submission to last job: 12759s 212.65m 3.54h 0.15d
ssh swarm
cd /cluster/data/taeGut1/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS ../blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
para create chainSpec
para try, check, push, check etc.
# Completed: 121 of 121 jobs
# CPU time in finished jobs: 14143s 235.72m 3.93h 0.16d 0.000 y
# IO & Wait Time: 54085s 901.42m 15.02h 0.63d 0.002 y
# Average job time: 564s 9.40m 0.16h 0.01d
# Longest finished job: 995s 16.58m 0.28h 0.01d
# Submission to last job: 1047s 17.45m 0.29h 0.01d
cd /cluster/data/taeGut1/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort u.*.psl m60* | uniq > ../unliftBlastHg18KG.psl
cd ..
pslCheck unliftBlastHg18KG.psl
liftUp -nohead temp.psl ../../jkStuff/liftAll.lft carry unliftBlastHg18KG.psl
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n temp.psl > blastHg18KG.psl
rm temp.psl
pslCheck blastHg18KG.psl
# load table
ssh hgwdev
cd /cluster/data/taeGut1/bed/tblastn.hg18KG
hgLoadPsl taeGut1 blastHg18KG.psl
# check coverage
featureBits taeGut1 blastHg18KG
# 19587281 bases of 1222864691 (1.602%) in intersection
featureBits taeGut1 all_mrna blastHg18KG -enrichment
# all_mrna 0.117%, blastHg18KG 1.602%, both 0.049%, cover 41.81%, enrich 26.10x
rm -rf blastOut
#end tblastn
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
taeGut1.upstreamGeneTbl = refGene
#########################################
# RepeatMasker ( 2009-02-04 RMH and braney)
mkdir /hive/data/genomes/taeGut1/bed/RMRunRMH
cd /hive/data/genomes/taeGut1/bed/RMRunRMH
doRepeatMasker.pl -buildDir `pwd` taeGut1
featureBits taeGut1 rmsk
# 99288448 bases of 1222864691 (8.119%) in intersection
# was earlier...
# 97507442 bases of 1222864691 (7.974%) in intersection
#############################################################################
# N-SCAN gene predictions (nscanGene) - (2009-04-17 markd)
/mblab.wustl.edu/predictions/taeGut1
# obtained NSCAN predictions from michael brent's group
# at WUSTL
cd /cluster/data/taeGut1/bed/nscan/
wget -nv http://mblab.wustl.edu/predictions/taeGut1/taeGut1.gtf
wget -nv http://mblab.wustl.edu/predictions/taeGut1/taeGut1.prot.fa
bzip2 taeGut1.*
chmod a-w *
# load track
ldHgGene -bin -gtf -genePredExt taeGut1 nscanGene taeGut1.gtf.bz2
hgPepPred taeGut1 generic nscanPep taeGut1.prot.fa.bz2
rm *.tab
# create zebraFinch/nscanGene.html
# zebraFinch/taeGut1/trackDb.ra, add:
track nscanGene override
informant Zerba Finch N-SCAN uses chicken (galGal3) as the informant
searchTable nscanGene
searchType genePred
termRegex chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?
searchPriority 50
+ # a mismatch was found in nscanPep; WUStL recreate
+ wget -nv http://mblab.wustl.edu/predictions/taeGut1/taeGut1.prot.fa
+ bzip2 taeGut1.prot.fa
+ chmod a-w *
+ hgPepPred taeGut1 generic nscanPep taeGut1.prot.fa.bz2