src/hg/makeDb/doc/rn4.txt 1.38
1.38 2010/02/12 20:40:05 fanhsu
Added RGD Genes build pipeline.
Index: src/hg/makeDb/doc/rn4.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/rn4.txt,v
retrieving revision 1.37
retrieving revision 1.38
diff -b -B -U 1000000 -r1.37 -r1.38
--- src/hg/makeDb/doc/rn4.txt 9 Feb 2010 23:17:59 -0000 1.37
+++ src/hg/makeDb/doc/rn4.txt 12 Feb 2010 20:40:05 -0000 1.38
@@ -1,5105 +1,5222 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on the Rattus
# Norvegicus genome, November 2004 update (Rnor3.4) from Baylor.
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+-------------------------------------+
# | tableName | type |
# +-------------+-------------------------------------+
# | knownGene | genePred knownGenePep knownGeneMrna |
# | refGene | genePred refPep refMrna |
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | mgcGenes | genePred |
# | ensGene | genePred ensPep |
# | genscan | genePred genscanPep |
# +-------------+-------------------------------------+
# CREATE BUILD DIRECTORY (DONE 1/27/06 angie)
# df -h /cluster/store*, choose the one with the most space...
ssh kkstore01
mkdir /cluster/store9/rn4
ln -s /cluster/store9/rn4 /cluster/data/rn4
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 1/27/06 angie)
mkdir /cluster/data/rn4/M
cd /cluster/data/rn4/M
# go to http://www.ncbi.nih.gov/ and search Nucleotide for
# "Rattus norvegicus mitochondrion complete genome".
# There is more than one of those... I picked NC_001665 whose gi # is
# 5835177
# Use that number in the entrez linking interface to get fasta:
wget -O chrM.fa \
'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5835177&dopt=FASTA'
# Edit chrM.fa: make sure the long fancy header line says it's the
# Rattus norvegicus mitochondrion complete genome, and then replace the
# header line with just ">chrM".
# DOWNLOAD SEQUENCE (DONE 1/27/06 angie)
ssh kkstore01
cd /cluster/data/rn4
mkdir downloads
cd downloads
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/README
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Un X)
echo chr$c
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/chromosomes/Rnor3.4chr${c}.fa.gz
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/chromosomes/Rnor3.4chr${c}.fa.qual.gz
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/chromosomes/Rnor3.4chr${c}-random.fa.gz
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/chromosomes/Rnor3.4chr${c}-random.fa.qual.gz
echo ""
end
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Un X)
echo chr$c
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/contigs/chr{$c}.agp
end
mkdir bacs
cd bacs
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Un X)
echo chr$c
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/contigs/chr${c}.contig_bacs.fa.gz
end
cd ..
# Distribute into chrom dirs.
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Un X)
echo chr$c
mkdir ../$c
zcat Rnor3.4chr${c}.fa.gz \
| sed -e 's/^>gnl[|].*[|]/>/' > ../$c/chr${c}.fa
zcat Rnor3.4chr${c}-random.fa.gz \
| sed -e 's/^>gnl[|].*[|]/>/' > ../$c/chr${c}_random.fa
mv chr${c}.agp ../$c/
end
#mv: cannot stat `chrUn.agp': No such file or directory
# No agp for Un, nor *_random.... guess we'll have to use hgFakeAgp
# to at least get gaps for those.
# checkAgpAndFa prints out way too much info -- keep the end/stderr only:
foreach agp (*/chr*.agp)
set fa = $agp:r.fa
echo checking consistency of $agp and $fa
checkAgpAndFa $agp $fa | tail -1
end
faSize */chr*.fa
#2834127293 bases (267832528 N's 2566294765 real 2566294765 upper 0 lower) in 45 sequences in 45 files
#Total size: mean 62980606.5 sd 75162894.7 min 16300 (chrM) max 267910886 (chr1) median 6862066
#N count: mean 5951834.0 sd 6940811.4
#U count: mean 57028772.6 sd 68565220.7
#L count: mean 0.0 sd 0.0
# MAKE FAKE AGP WHERE NECESSARY (DONE 1/27/06 angie)
# In the chromosomal AGP, all gaps are marked "fragment" *except* for
# gaps of exactly 50000 which are "clone". There are gaps of >>50000
# marked "fragment"! However, in the chr*_random and chrUn here,
# that just seems wrong... chrUn has a gap of 2602000, how could that
# possibly be bridged? (Why are they wasting that kind of space?)
# So just count >= 50000 as "clone no", even though that is not what
# they do in their AGP for the chroms.
ssh kkstore01
cd /cluster/data/rn4
foreach f (?{,?}/chr*.fa)
set agp = $f:r.agp
if (! -e $agp) then
echo Faking missing AGP $agp
hgFakeAgp -minContigGap=50 -minScaffoldGap=50000 $f stdout \
| sed -e 's/contig/fragment/; s/scaffold/clone/' \
> $agp
endif
end
# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 1/27/06 angie)
ssh kkstore01
cd /cluster/data/rn4
foreach agp (*/chr*.agp)
set fa = $agp:r.fa
echo splitting $agp and $fa
cp -p $agp $agp.bak
cp -p $fa $fa.bak
splitFaIntoContigs $agp $fa . -nSize=5000000
end
# splitFaIntoContigs makes new dirs for _randoms. Move their contents
# back into the main chrom dirs and get rid of the _random dirs.
foreach d (*_random)
set base = `echo $d | sed -e 's/_random$//'`
mv $d/lift/oOut.lst $base/lift/rOut.lst
mv $d/lift/ordered.lft $base/lift/random.lft
mv $d/lift/ordered.lst $base/lift/random.lst
rmdir $d/lift
mv $d/* $base
rmdir $d
end
# checkAgpAndFa again to get a warm-fuzzy
foreach agp (*/chr*.agp)
set fa = $agp:r.fa
echo checking consistency of $agp and $fa
checkAgpAndFa $agp $fa | tail -1
end
# Make a "pseudo-contig" for processing chrM too:
mkdir M/chrM_1
sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
mkdir M/lift
set mSize=`faSize M/chrM.fa | awk '{print $1;}'`
echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
echo "chrM_1" > M/lift/ordered.lst
echo "0 M/chrM_1 "$mSize" chrM "$mSize"" > M/lift/ordered.lft
# MAKE JKSTUFF AND BED DIRECTORIES (DONE 1/27/06 angie)
# This used to hold scripts -- better to keep them inline here so
# they're in CVS. Now it should just hold lift file(s) and
# temporary scripts made by copy-paste from this file.
mkdir /cluster/data/rn4/jkStuff
# This is where most tracks will be built:
mkdir /cluster/data/rn4/bed
# REPEAT MASKING (DONE 3/16/06 angie)
# Originally run 1/30/06. Coverage was a bit low (43.020%, compared to
# 43.463% for rn3) so I made a test case (see below) for Robert Hubley,
# who fixed a bug and re-released RepeatMasker 3/14/06. So I'm
# re-running... original chr*.fa.out have been moved to .bak and
# lower-level .fa.out's have been removed. Added -ali at Robert H's
# request.
# Record RM version used:
ls -l /cluster/bluearc/RepeatMasker
#lrwxrwxrwx 1 angie protein 18 Mar 14 16:56 /cluster/bluearc/RepeatMasker -> RepeatMasker060314/
grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl
#CC RELEASE 20060314; *
# Run RepeatMasker on a dummy input, just to make it initialize its
# rat libraries once before the cluster run:
/cluster/bluearc/RepeatMasker/RepeatMasker -spec rat /dev/null
#Building species libraries in: /cluster/bluearc/RepeatMasker060314/Libraries/20060314/rattus
#- Split contigs into 500kb chunks, at gaps if possible:
ssh kkstore01
cd /cluster/data/rn4
foreach c (?{,?})
foreach d ($c/chr${c}*_?{,?})
cd $d
echo "splitting $d"
set contig = $d:t
faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
cd ../..
end
end
#- Make the run directory and job list:
cd /cluster/data/rn4
cat << '_EOF_' > jkStuff/RMRat
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/rn4/$2
/bin/cp $2 /tmp/rn4/$2/
cd /tmp/rn4/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -ali -spec rat $2
popd
/bin/cp /tmp/rn4/$2/$2.out ./
if (-e /tmp/rn4/$2/$2.tbl) /bin/cp /tmp/rn4/$2/$2.tbl ./
if (-e /tmp/rn4/$2/$2.cat) /bin/cp /tmp/rn4/$2/$2.cat ./
/bin/rm -fr /tmp/rn4/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/rn4/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/rn4
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMRat
mkdir RMRun
cp /dev/null RMRun/RMJobs
foreach c (?{,?})
foreach d ($c/chr${c}{,_random}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/rn4/jkStuff/RMRat \
/cluster/data/rn4/$d $f \
'{'check out line+ /cluster/data/rn4/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
end
#- Do the run
ssh pk
cd /cluster/data/rn4/RMRun
para create RMJobs
para make RMJobs; para time | mail -s 'RM cluster run finished' $USER
#Completed: 6487 of 6488 jobs
#Crashed: 1 jobs
#CPU time in finished jobs: 27797186s 463286.43m 7721.44h 321.73d 0.881 y
#IO & Wait Time: 41276s 687.94m 11.47h 0.48d 0.001 y
#Average job time: 4291s 71.52m 1.19h 0.05d
#Longest finished job: 6361s 106.02m 1.77h 0.07d
#Submission to last job: 133586s 2226.43m 37.11h 1.55d
# The one crash was a divide-by-zero error on chr1_24_04 -- the patch
# that Robert sent for open-3-1-3 was missing from open-3-1-4. So I
# applied it to open-3-1-4 and manually ran ProcessRepeats on the .cat.
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kkstore01
cd /cluster/data/rn4
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
> /dev/null
end
#- Lift pseudo-contigs to chromosome level
foreach c (?{,?})
echo lifting $c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
> /dev/null
endif
if (-e lift/random.lft && ! -z lift/random.lft) then
liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \
> /dev/null
endif
cd ..
end
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/rn4
hgLoadOut rn4 ?{,?}/chr*.fa.out
# Many warnings like this one:
#Strange perc. field -7282.5 line 396666 of 1/chr1.fa.out
#Strange perc. field -143.1 line 396666 of 1/chr1.fa.out
# That's from chr1_49, not chr1_24 so it is independent of the tweak to
# ProcessRepeats.
# There was also this warning at the end:
#note: 249 records dropped due to repStart > repEnd
# run with -verbose=2 for details
# Ran with -verbose=2 on kolossus... these are some of the warnings:
#bad rep range [4533, 4532] line 955 of 14/chr14.fa.out
#bad rep range [6936, 6935] line 15888 of 14/chr14.fa.out
#bad rep range [458, 457] line 30645 of 14/chr14.fa.out
#bad rep range [5798, 5797] line 85282 of 14/chr14.fa.out
#TODO: send those on to Robert too.
nice featureBits rn4 rmsk
#1137387810 bases of 2571531505 (44.230%) in intersection
# Good -- it's now higher than rn3:
#1117483165 bases of 2571104688 (43.463%) in intersection
# CREATING DATABASE (DONE 1/27/06 angie)
ssh hgwdev
hgsql '' -e 'create database rn4'
# Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
#/dev/sdc1 1.8T 1.6T 88G 95% /var/lib/mysql
# Wow... we sure have filled that sucker up.
# CREATING GRP TABLE FOR TRACK GROUPING (DONE 1/27/06 angie)
ssh hgwdev
hgsql rn4 -e "create table grp (PRIMARY KEY(NAME)) select * from rn3.grp"
# MAKE REPEATMASKER LOW-COVERAGE TEST CASE (DONE 3/10/06 angie)
# Eyeball some repeat annotations in the browser, compare to lib seqs.
# Run featureBits on rn4 and on previous genome build, and compare:
ssh hgwdev
nice featureBits rn4 rmsk
# Hmmmm... seems wrong for this to decrease...!
#1106283290 bases of 2571531505 (43.020%) in intersection
nice featureBits rn3 rmsk
#1117483165 bases of 2571104688 (43.463%) in intersection
# Do some comparisons on the smallest chrom, chr20.
featureBits rn3 -chrom=chr20 rmsk
#20269238 bases of 49432254 (41.004%) in intersection
# liftOver rn3 chr20 rmsk to rn4, find non-overlap and compare:
ssh kolossus
awk '$1 ~ /[0-9]/ {print $5 "\t" $6 "\t" $7;}' \
/cluster/data/rn3/20/chr20.fa.out > /tmp/rn3.chr20_rmsk.bed
liftOver /tmp/rn3.chr20_rmsk.bed \
/cluster/data/rn3/bed/blat.rn4.2006-02-07/rn3ToRn4.over.chain.gz \
/tmp/rn4.chr20_rmskLifted.bed /tmp/rn4.chr20.unmapped
featureBits rn4 -chrom=chr20 rmsk
#19994056 bases of 49409453 (40.466%) in intersection
featureBits rn4 -chrom=chr20 /tmp/rn4.chr20_rmskLifted.bed
#20117851 bases of 49409453 (40.717%) in intersection
# Wow, even after some small liftOver losses, the rn3 rmsk run is
# still bigger than the rn4 run! Look for large missing pieces:
featureBits rn4 -chrom=chr20 /tmp/rn4.chr20_rmskLifted.bed \!rmsk \
-minSize=500 -bed=stdout
#chr20 107997 108538 chr20.1
#chr20 15410534 15411405 chr20.2
#chr20 26448452 26448976 chr20.3
#chr20 34984144 34984704 chr20.4
#chr20 41862299 41862843 chr20.5
#chr20 49189694 49190470 chr20.6
#chr20 50960151 50960656 chr20.7
#chr20 53898688 53899203 chr20.8
#chr20 54856700 54857268 chr20.9
#634331 bases of 49409453 (1.284%) in intersection
# For the first such region above, the rn3 location is the same
# (chr20:107998-108538) and is covered by a LINE... 34.1% divergence.
# Well, that should make for a handy little test case because the
# sequence has not changed at all -- only the RM version.
mkdir /cluster/data/rn4/RMTest
cd /cluster/data/rn4/RMTest
twoBitToFa ../rn4.2bit -seq=chr20 -start=100000 -end=110000 chr20_frag.fa
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec rat chr20_frag.fa
mkdir open-3-1-3
mv chr20_frag.fa.* open-3-1-3/
# The version used for rn3:
/cluster/bluearc/RepeatMasker030619/RepeatMasker -s -spec rat chr20_frag.fa
mkdir 030619
mv chr20_frag.fa.* 030619/
# The next-latest version:
/cluster/bluearc/RepeatMasker051101/RepeatMasker -s -spec rat chr20_frag.fa
mkdir open-3-1-2
mv chr20_frag.fa.* open-3-1-2/
# 3-1-2 makes exactly the same .out as 3-1-3. 030619 has one more LINE...
# The first open- version that we have -- .out identical to 3-1-3:
/cluster/bluearc/RepeatMasker040909/RepeatMasker -s -spec rat chr20_frag.fa
mkdir open-3-0-5
mv chr20_frag.fa.* open-3-0-5/
# The only other version between 030619 and 040909:
/cluster/bluearc/RepeatMasker040130/RepeatMasker -s -spec rat chr20_frag.fa
mkdir 040130
mv chr20_frag.fa.* 040130/
# Sent the test case to Robert Hubley -- he found a rodent-specific bug.
# MAKE LIFTALL.LFT (DONE 1/27/06 angie)
ssh kkstore01
cd /cluster/data/rn4
cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft
# GOLD AND GAP TRACKS (DONE 1/27/06 angie)
ssh hgwdev
cd /cluster/data/rn4
hgGoldGapGl -noGl rn4 /cluster/data/rn4 .
# Wow, tons of warnings like this:
#unexpected coords (6960, 6960) for frag chrX_random_2 in chrom chrX_random
# -- There really are little single non-N bases between large blocks
# of N in the random fasta! Sheesh, what a mess. But randoms are
# the dregs...
# SIMPLE REPEATS (TRF) (DONE 1/30/06 angie)
ssh kolossus
mkdir /cluster/data/rn4/bed/simpleRepeat
cd /cluster/data/rn4/bed/simpleRepeat
mkdir trf
cp /dev/null jobs.csh
foreach d (/cluster/data/rn4/*/chr*_?{,?})
set ctg = $d:t
foreach f ($d/${ctg}.fa)
set fout = $f:t:r.bed
echo $fout
echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
>> jobs.csh
end
end
csh -ef jobs.csh >&! jobs.log &
# check on this with
tail -f jobs.log
wc -l jobs.csh
ls -1 trf | wc -l
#591
endsInLf trf/*
#trf/chrM_1.bed zero length
# When job is done do:
liftUp simpleRepeat.bed /cluster/data/rn4/jkStuff/liftAll.lft warn \
trf/*.bed
# Load into the database:
ssh hgwdev
hgLoadBed rn4 simpleRepeat \
/cluster/data/rn4/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
nice featureBits rn4 simpleRepeat
#72859247 bases of 2571531505 (2.833%) in intersection
# Compare to rn3:
nice featureBits rn3 simpleRepeat
#70073656 bases of 2571104688 (2.725%) in intersection
# SET UP DB ON KOLOSSUS (DONE 1/30/06 angie)
# to spare hgwdev, make an rn4 on kolossus so that we have the
# option of loading there and pushing to hgwdev. It will need the
# gap tables and also chromInfo (added below) so that we can run
# featureBits on it.
hgsql '' -e 'create database rn4'
cd /cluster/data/rn4
hgGoldGapGl -noGl rn4 /cluster/data/rn4 .
# CYTOBAND (DONE 1/30/06 angie)
ssh hgwdev
mkdir /cluster/data/rn4/bed/cytoBand
cd /cluster/data/rn4/bed/cytoBand
wget ftp://ftp.ncbi.nih.gov/genomes/R_norvegicus/mapview/ideogram.gz
zcat ideogram.gz | sort -k1,1 -k6n,6n > ideogram.sorted
/cluster/bin/scripts/createNcbiCytoBand ideogram.sorted
# Load the bed file into both cytoBand and cytoBandIdeo:
hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \
rn4 cytoBand cytoBand.bed
hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBandIdeo.sql \
rn4 cytoBandIdeo cytoBand.bed
# Doh, the file does not have stain information! All of the bands
# are the same as in rn3, so grab stain info from there.
hgsql rn4 -e '\
create table bandToStain select name,gieStain from rn3.cytoBand; \
update cytoBand,bandToStain \
set cytoBand.gieStain = bandToStain.gieStain \
where cytoBand.name = bandToStain.name; \
update cytoBandIdeo,bandToStain \
set cytoBandIdeo.gieStain = bandToStain.gieStain \
where cytoBandIdeo.name = bandToStain.name; \
drop table bandToStain;'
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
ssh hgwdev
cd /cluster/data/rn4/bed
mkdir microsat
cd microsat
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
/cluster/bin/i386/hgLoadBed rn4 microsat microsat.bed
# PROCESS SIMPLE REPEATS INTO MASK (DONE 1/30/06 angie)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kkstore01
cd /cluster/data/rn4/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/chr*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/rn4
mkdir bed/simpleRepeat/trfMaskChrom
foreach c (?{,?})
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# Here's the coverage for the filtered TRF:
ssh kolossus
cat /cluster/data/rn4/bed/simpleRepeat/trfMaskChrom/*.bed \
> /tmp/filtTrf.bed
featureBits rn4 /tmp/filtTrf.bed
#50551402 bases of 2571531505 (1.966%) in intersection
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 3/17/06 angie)
# Originally done 1/30 -- redoing after re-run of RepeatMasker.
ssh kkstore01
cd /cluster/data/rn4
# Soft-mask (lower-case) the contig and chr .fa's,
# then make hard-masked versions from the soft-masked.
set trfCtg=bed/simpleRepeat/trfMask
set trfChr=bed/simpleRepeat/trfMaskChrom
foreach f (?{,?}/chr*.fa)
echo "repeat- and trf-masking $f"
maskOutFa -soft $f $f.out $f
set chr = $f:t:r
maskOutFa -softAdd $f $trfChr/$chr.bed $f
echo "hard-masking $f"
maskOutFa $f hard $f.masked
end
foreach c (?{,?})
echo "repeat- and trf-masking contigs of chr$c, chr${c}_random"
foreach d ($c/chr*_?{,?})
set ctg=$d:t
set f=$d/$ctg.fa
maskOutFa -soft $f $f.out $f
maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
maskOutFa $f hard $f.masked
end
end
# Make 2bit (for hgBlat, browser):
faToTwoBit ?{,?}/chr*.fa rn4.2bit
# Make nib (for blastz w/linSpecRep, OK for genbank too):
mkdir nib
foreach f (?{,?}/chr*.fa)
echo $f:t:r
faToNib -softMask $f nib/$f:t:r.nib
end
# PUT NIBS ON /SCRATCH (DONE 3/17/06 angie)
# Originally done 1/30 -- redoing after re-run of RepeatMasker.
ssh kkstore01
mkdir /cluster/bluearc/scratch/hg/rn4
rsync -av /cluster/data/rn4/nib/* /cluster/bluearc/scratch/hg/rn4/nib/
cp -p /cluster/data/rn4/rn4.2bit /cluster/bluearc/scratch/hg/rn4/
# Ask cluster-admin to distribute to /scratch on big & small cluster
# MAKE CHROMINFO TABLE WITH 2BIT (DONE 1/30/06 angie)
ssh kkstore01
cd /cluster/data/rn4
mkdir bed/chromInfo
twoBitInfo rn4.2bit stdout \
| awk '{print $1 "\t" $2 "\t/gbdb/rn4/rn4.2bit";}' \
> bed/chromInfo/chromInfo.tab
# Link to 2bit from /gbdb/rn4/:
ssh hgwdev
mkdir /gbdb/rn4
ln -s /cluster/data/rn4/rn4.2bit /gbdb/rn4/
# Load /gbdb/rn4/rn4.2bit paths into database and save size info.
hgsql rn4 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql rn4 -e 'load data local infile \
"/cluster/data/rn4/bed/chromInfo/chromInfo.tab" \
into table chromInfo;'
echo "select chrom,size from chromInfo" | hgsql -N rn4 > chrom.sizes
# take a look at chrom.sizes size
wc chrom.sizes
# 45 90 789 chrom.sizes
# Load chromInfo on kolossus too (required for featureBits):
ssh kolossus
hgsql rn4 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql rn4 -e 'load data local infile \
"/cluster/data/rn4/bed/chromInfo/chromInfo.tab" \
into table chromInfo;'
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 1/30/06 angie)
# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd ~/kent/src/hg/makeDb/trackDb
cvsup
# Add trackDb directories and a description.html
mkdir rat/rn4
cvs add rat/rn4
cvs add rat/rn4/description.html
cvs ci rat/rn4
# Edit that makefile to add rn4 in all the right places and do
make update DBS=rn4
mkdir /gbdb/rn4/html
cvs ci makefile
# Go public on genome-test. In a clean tree (no mods, up-to-date):
cvs up makefile
make alpha
# Note: hgcentral*.genome values must correspond
# with defaultDb.genome values
hgsql -h genome-testdb hgcentraltest \
-e 'INSERT INTO dbDb \
(name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) values \
("rn4", "Nov. 2004", "/gbdb/rn4", "Rat", \
"chr2:22845558-23004134", 1, 30, "Rat", \
"Rattus norvegicus", "/gbdb/rn4/html/description.html", \
0, 0, "Baylor HGSC v. 3.4");'
# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 1/30/06 angie)
# chrom{Fa*,Out}, md5sum.txt REDONE, 3/17/06, after RepeatMasker re-run.
# upstream*.fa.gz added 6/23/06
ssh kolossus
cd /cluster/data/rn4
#- Build the .tar.gz files -- no genbank for now.
cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf bigZips
mkdir bigZips
tar cvzf bigZips/chromAgp.tar.gz ?{,?}/chr*.agp
tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out
tar cvzf bigZips/chromFa.tar.gz ?{,?}/chr*.fa
tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}/chr*.fa.masked
cd bed/simpleRepeat
tar cvzf ../../bigZips/chromTrf.tar.gz trfMaskChrom/chr*.bed
cd ../..
'_EOF_'
# << this line makes emacs coloring happy
csh -ef ./jkStuff/zipAll.csh >& zipAll.log &
tail -f zipAll.log
#- Look at zipAll.log to make sure all file lists look reasonable.
cd bigZips
md5sum *.gz > md5sum.txt
# Make a README.txt
cd ..
mkdir chromGz
foreach f ( ?{,?}/chr*.fa )
echo $f:t:r
gzip -c $f > chromGz/$f:t.gz
end
cd chromGz
md5sum *.gz > md5sum.txt
# Make a README.txt
#- Link the .gz and .txt files to hgwdev:/usr/local/apache/...
ssh hgwdev
set gp = /usr/local/apache/htdocs/goldenPath/rn4
mkdir -p $gp/bigZips
ln -s /cluster/data/rn4/bigZips/{chrom*.tar.gz,*.txt} $gp/bigZips
mkdir -p $gp/chromosomes
ln -s /cluster/data/rn4/chromGz/{chr*.gz,*.txt} $gp/chromosomes
# Take a look at bigZips/* and chromosomes/*
mkdir $gp/database
# Create README.txt files in database/ to explain the files.
# 6/23/06 -- Add upstream*.fa.gz.
cd /cluster/data/rn4/bigZips
foreach size (1000 2000 5000)
echo $size
featureBits rn4 refGene:upstream:$size -fa=stdout \
| nice gzip -c > upstream$size.fa.gz
end
nice md5sum up*.gz >> md5sum.txt
ln -s /cluster/data/rn4/bigZips/up*.gz $gp/bigZips/
# MAKE 11.OOC (DONE 2/10/06 angie)
ssh kolossus
cd /cluster/data/rn4
mkdir /cluster/bluearc/rn4
blat rn4.2bit /dev/null /dev/null \
-tileSize=11 -makeOoc=/cluster/bluearc/rn4/11.ooc -repMatch=1024
#Wrote 25608 overused 11-mers to /cluster/bluearc/rn4/11.ooc
# and for use in other contexts:
cp -p /cluster/bluearc/rn4/11.ooc /cluster/data/rn4
# GENBANK AUTO UPDATE (DONE 2/10/06 angie)
# align with revised genbank process. drop xeno ESTs.
cd ~/kent/src/makeDb/genbank
cvsup
# edit etc/genbank.conf to add rn4
# rn4
rn4.serverGenome = /cluster/data/rn4/rn4.2bit
rn4.clusterGenome = /iscratch/i/rn4/rn4.2bit
rn4.ooc = /cluster/bluearc/rn4/11.ooc
rn4.align.unplacedChroms = chrUn,chr*_random
rn4.lift = /cluster/data/rn4/jkStuff/liftAll.lft
rn4.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
rn4.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
rn4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
rn4.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
rn4.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
rn4.downloadDir = rn4
rn4.refseq.mrna.xeno.load = yes
rn4.refseq.mrna.xeno.loadDesc = yes
rn4.mgcTables.default = full
rn4.mgcTables.mgc = all
cvs ci etc/genbank.conf
# update /cluster/data/genbank/
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial rn4 &
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad rn4 &
# enable daily alignment and update of hgwdev
cd ~/kent/src/makeDb/genbank
cvsup
# add rn4 to:
etc/align.dbs
etc/hgwdev.dbs
cvs commit
make etc-update
# PRODUCING GENSCAN PREDICTIONS (DONE 1/31/06 angie)
ssh hgwdev
mkdir /cluster/data/rn4/bed/genscan
cd /cluster/data/rn4/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh kki
cd /cluster/data/rn4/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
cp /dev/null genome.list
foreach f ( `ls -1S /cluster/data/rn4/*/chr*_*/chr*_?{,?}.fa.masked` )
egrep '[ACGT]' $f > /dev/null
if ($status == 0) echo $f >> genome.list
end
wc -l genome.list
#591 genome.list
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para make jobList
para time
#Completed: 589 of 591 jobs
#Crashed: 2 jobs
#CPU time in finished jobs: 110813s 1846.88m 30.78h 1.28d 0.004 y
#IO & Wait Time: 8627s 143.79m 2.40h 0.10d 0.000 y
#Average job time: 203s 3.38m 0.06h 0.00d
#Longest finished job: 4230s 70.50m 1.18h 0.05d
#Submission to last job: 11260s 187.67m 3.13h 0.13d
# If there are crashes, diagnose with "para problems" / "para crashed".
# If a job crashes due to genscan running out of memory, re-run it
# manually with "-window=1200000" instead of "-window=2400000".
ssh kkr5u00
cd /cluster/data/rn4/bed/genscan
/cluster/bin/x86_64/gsBig /cluster/data/rn4/10/chr10_3/chr10_3.fa.masked gtf/chr10_3.fa.gtf -trans=pep/chr10_3.fa.pep -subopt=subopt/chr10_3.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
/cluster/bin/x86_64/gsBig /cluster/data/rn4/9/chr9_21/chr9_21.fa.masked gtf/chr9_21.fa.gtf -trans=pep/chr9_21.fa.pep -subopt=subopt/chr9_21.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
# Convert these to chromosome level files as so:
ssh kkstore01
cd /cluster/data/rn4/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/rn4/bed/genscan
ldHgGene rn4 genscan genscan.gtf
hgPepPred rn4 generic genscanPep genscan.pep
hgLoadBed rn4 genscanSubopt genscanSubopt.bed
# MAKE GCPERCENT (DONE 1/30/06 angie)
ssh kolossus
mkdir /cluster/data/rn4/bed/gc5Base
cd /cluster/data/rn4/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 rn4 \
/cluster/data/rn4 \
| wigEncode stdin gc5Base.wig gc5Base.wib
ssh hgwdev
mkdir /gbdb/rn4/wib
cd /cluster/data/rn4/bed/gc5Base
ln -s `pwd`/gc5Base.wib /gbdb/rn4/wib
hgLoadWiggle -pathPrefix=/gbdb/rn4/wib rn4 gc5Base gc5Base.wig
# ENSEMBL GENES (DONE 1/30/06 angie - UPDATED 6/13/06)
mkdir /cluster/data/rn4/bed/ensembl
cd /cluster/data/rn4/bed/ensembl
# Get the ensembl gene data from
# http://www.ensembl.org/Rattus_norvegicus/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Rattus_norvegicus choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. choose gzip compression. hit export.
# Save as ensembl.gff.gz
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
gunzip -c ensembl.gff.gz \
| perl -wpe 's/^([0-9]+|X|Y|M|Un(_random)?)/chr$1/ \
|| die "Line $. doesnt start with rat chrom:\n$_"; \
s/^chrMT/chrM/;' \
> ensGene.gtf
ssh hgwdev
ldHgGene -gtf -genePredExt rn4 ensGene \
/cluster/data/rn4/bed/ensembl/ensGene.gtf
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgLoadSqlTab rn4 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.txt
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
gunzip -c ensemblPep.fa.gz \
| perl -wpe 's/^>.*\|(ENSRNOT\d+\.\d+).*/>$1/' \
> ensPep.fa
hgPepPred rn4 generic ensPep ensPep.fa
# CPGISLANDS (WUSTL) (DONE 1/30/06 angie)
ssh hgwdev
mkdir /cluster/data/rn4/bed/cpgIsland
cd /cluster/data/rn4/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
mv cpglh.exe /cluster/data/rn4/bed/cpgIsland/
ssh kolossus
cd /cluster/data/rn4/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
set fout=$f:t:r:r.cpg
echo running cpglh on $f to $fout
nice ./cpglh.exe $f > $fout
end
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
# load into database:
ssh hgwdev
cd /cluster/data/rn4/bed/cpgIsland
# Reloaded without -noBin (bin added to .sql) 4/7/06.
hgLoadBed rn4 cpgIslandExt -tab \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# CPGISLANDS (ANDY LAW) (DONE 1/30/06 angie)
# See notes in makeGalGal2.doc
ssh kolossus
mkdir /cluster/data/rn4/bed/cpgIslandGgfAndy
cd /cluster/data/rn4/bed/cpgIslandGgfAndy
# Build the preProcGgfAndy program in
# kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE
# Use masked sequence since this is a mammal...
cp /dev/null cpgIslandGgfAndyMasked.bed
foreach f (../../*/chr*.fa.masked)
set chr = $f:t:r:r
echo preproc and run on masked $chr
~/bin/x86_64/preProcGgfAndy $f \
| /cluster/home/angie/ggf-andy-cpg-island.pl \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndyMasked.bed
end
# load into database:
ssh hgwdev
cd /cluster/data/rn4/bed/cpgIslandGgfAndy
# Reloaded without -noBin (bin added to .sql) 4/7/06.
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
hgLoadBed rn4 cpgIslandGgfAndyMasked -tab \
-sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
#Loaded 91097 elements of size 10
featureBits rn4 cpgIslandExt
#9629809 bases of 2571531505 (0.374%) in intersection
featureBits rn4 cpgIslandGgfAndyMasked
#43899646 bases of 2571531505 (1.707%) in intersection
wc -l ../cpgIsland/cpgIsland.bed *bed
# 15809 ../cpgIsland/cpgIsland.bed
# 91097 cpgIslandGgfAndyMasked.bed
# LIFTOVER RGD (DONE 2/9/06 angie)
# ftp://rgd.mcw.edu/pub/RGD_genome_annotations/ still has 3.1 (rn3) not
# 3.4 (rn4). Jim said these are interesting enough that we should lift
# over at least the QTLs.
ssh kolossus
mkdir /cluster/data/rn4/bed/rgdLiftover
cd /cluster/data/rn4/bed/rgdLiftover
wget ftp://rgd.mcw.edu/pub/RGD_genome_annotations/V3.1/GFF_files/rgd_rat_qtl_12052005.gff
awk '{print $1"\t"$4-1"\t"$5"\t"$10}' rgd_rat_qtl_12052005.gff \
| sed -e 's/Chr/chr/g; s/"//g; s/RGD://g; s/;//g' \
> rgdQtl_rn3.bed
awk '{printf "%s\t%s\t", $12, $10; \
for (i = 14;i <= NF; ++i ) {printf "%s ", $i} printf "\n"} ' \
rgd_rat_qtl_12052005.gff \
| sed -e 's/"//g; s/RGD://g; s/;\t/\t/g' \
> rgdQtlLink.tab
# liftOver rgdQtl_rn3.bed \
# /cluster/data/rn3/bed/blat.rn4.2006-02-07/rn3ToRn4.over.chain.gz \
# rgdQtl.bed rgdQtl.unmapped
# liftOver of rgdQtl_rn3.bed did not go well -- out of 903 qtls,
# only 95 were successfully mapped. 772 got "Partially split in new."
# Since these are enormous, presumably fuzzy-edged regions, just lift
# start and end. If the start and end of each feature seems to be
# liftOver'd coherently, "chain" those back up into big regions in rn4.
perl -we 'while (<>) { \
chomp; ($chr, $start, $end, $name) = split; \
print "$chr\t$start\t" . ($start+100) . "\tstart_$name\n"; \
print "$chr\t" . ($end-100) . "\t$end\tend_$name\n"; \
}' \
rgdQtl_rn3.bed > rgdQtl_rn3_endpoints.bed
liftOver rgdQtl_rn3_endpoints.bed \
/cluster/data/rn3/bed/blat.rn4.2006-02-07/rn3ToRn4.over.chain.gz \
rgdQtl_endpoints.bed rgdQtl_endpoints.unmapped
# Now 1706 out of 1806 survive... but 100 are deleted in new?!
# I manually checked 10 of them and they all are either in a gap
# or off the end of a chrom. Makes me wonder if these really are
# rn3 coords... oh well, I guess we can load them up and ask
# RGD about them in the meantime. I submitted a question using their
# online form -- my message ID is 439 and I can email rgd@rgd.mcw.edu
# with that ID if I don't hear from them in 3 biz days.
perl -we 'while (<>) { \
chomp; ($chr, $start, $end, $name) = split; \
if ($name =~ /^start_(\S+)/) { \
$starts{$1} = [$chr, $start]; \
} elsif ($name =~ /^end_(\S+)/) { \
$ends{$1} = [$chr, $end]; \
} else { die "parse error line $.: name $name\n"; } \
} \
foreach my $name (keys %starts) { \
if (defined $ends{$name}) { \
my ($sChr, $start) = @{$starts{$name}}; \
my ($eChr, $end) = @{$ends{$name}}; \
if (($sChr eq $eChr) && $end > $start) { \
print "$sChr\t$start\t$end\t$name\n"; \
} else { \
print STDERR "reject: [$sChr, $start] / [$eChr, $end] / $name\n"; \
} \
} \
}' \
rgdQtl_endpoints.bed \
| sort -k 1,1 -k 2n,2n > rgdQtl.bed
wc -l rgdQtl.bed
#806 rgdQtl.bed
ssh hgwdev
cd /cluster/data/rn4/bed/rgdLiftover
hgLoadBed rn4 rgdQtl rgdQtl.bed
hgsql rn4 < ~/kent/src/hg/lib/rgdQtlLink.sql
hgsql rn4 -e \
'load data local infile "rgdQtlLink.tab" into table rn4.rgdQtlLink;'
# MAKE LINEAGE-SPECIFIC REPEATS VS. HUMAN, MOUSE (DONE 2/8/06 angie)
ssh kolossus
mkdir /cluster/data/rn4/rmsk
cd /cluster/data/rn4/rmsk
ln -s ../?{,?}/chr*.fa.out .
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Human in extra column 1, Mouse in extra column 2
foreach outfl ( *.out )
echo "$outfl"
nice /cluster/bluearc/RepeatMasker/DateRepeats \
${outfl} -query rat -comp human -comp mouse
end
# Now extract human (extra column 1), mouse (extra column 2).
cd ..
mkdir linSpecRep.notInHuman
mkdir linSpecRep.notInMouse
foreach f (rmsk/*.out_homo-sapiens_mus-musculus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInHuman/$base.out.spec
/cluster/bin/scripts/extractLinSpecReps 2 $f > \
linSpecRep.notInMouse/$base.out.spec
end
wc -l rmsk/*.out
# 4417757 total
wc -l linSpecRep.notInHuman/*
# 2676056 total
wc -l linSpecRep.notInMouse/*
# 471417 total
# Clean up.
rm -r rmsk
# Distribute linSpecRep.* for cluster run
ssh kkstore01
mkdir /san/sanvol1/scratch/rn4
rsync -av /cluster/data/rn4/linSpecRep.notInHuman/* \
/san/sanvol1/scratch/rn4/linSpecRep.notInHuman/
rsync -av /cluster/data/rn4/linSpecRep.notInMouse/* \
/san/sanvol1/scratch/rn4/linSpecRep.notInMouse/
# SWAP/CHAIN/NET HG17 (DONE 2/10/06 angie)
mkdir /cluster/data/rn4/bed/blastz.hg17.swap
cd /cluster/data/rn4/bed/blastz.hg17.swap
doBlastzChainNet.pl -swap /cluster/data/hg17/bed/blastz.rn4/DEF \
-workhorse=kkr5u00 >& do.log & tail -f do.log
ln -s blastz.hg17.swap /cluster/data/rn4/bed/blastz.hg17
# MAKE LINEAGE-SPECIFIC REPEATS FOR NON-MAMMALS (DONE 2/13/06 angie)
# In an email 2/13/04 to Angie, Arian said we could treat all
# human repeats as lineage-specific for human-chicken blastz.
# Extrapolate to any mammal vs. anything at least as distant as chicken.
ssh kkr1u00
mkdir /iscratch/i/rn4/linSpecRep.notInNonMammal
foreach f (/cluster/data/rn4/*/chr*.fa.out)
cp -p $f /iscratch/i/rn4/linSpecRep.notInNonMammal/$f:t:r:r.out.spec
end
iSync
# BLASTZ CHICKEN (GALGAL2) (DONE 2/14/06 angie)
ssh kk
mkdir /cluster/data/rn4/bed/blastz.galGal2.2006-02-13
cd /cluster/data/rn4/bed/blastz.galGal2.2006-02-13
# Set L=10000 (higher threshold on blastz's outer loop) and abridge
# repeats.
cat << '_EOF_' > DEF
#rat vs. chicken
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Rat
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_SMSK=/iscratch/i/rn4/linSpecRep.notInNonMammal
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
# QUERY: Chicken
SEQ2_DIR=/iscratch/i/galGal2/nib
SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LEN=/cluster/data/galGal2/chrom.sizes
BASE=/cluster/data/rn4/bed/blastz.galGal2.2006-02-13
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot=/cluster/bluearc/blastzRn4GalGal2Out \
> do.log & tail -f do.log
cd /cluster/data/rn4/bed
ln -s /cluster/data/rn4/bed/blastz.galGal2.2006-02-13 blastz.galGal2
##########################################################################
# BACEND PAIRS TRACK
# DOWNLOAD CLONEENDS (BACENDS) FROM NCBI (DONE 2/23/06 angie)
ssh kkstore01
cd /cluster/data/rn4
# Plenty of room at the moment:
df -h .
# 1.3T 1009G 234G 82% /cluster/store9
mkdir -p bed/cloneend/ncbi
cd bed/cloneend/ncbi
wget --timestamping \
ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/rattus_norvegicus/\*
cd ..
# Extract unversioned accessions from headers and combine into one
# uncompressed file which we will link to from /gbdb/:
zcat ncbi/*ends*.mfa.gz \
| perl -wpe 'if (/^>/) { \
s/^>.*\|(gb|dbj)\|(\w+)\.\d+\|.*/>$2/ || die "parse line $.:$_\n"; }' \
> cloneEnds.fa
# Make sure the sequences are intact after the header-munging:
faSize ncbi/*.mfa.gz
#188448559 bases (4451 N's 188444108 real 102020307 upper 86423801 lower) in 307557 sequences in 18 files
#Total size: mean 612.7 sd 198.9 min 83 (gi|30387889|gb|CC181697.1|) max 1158 (gi|24002262|gb|BZ277934.1|) median 634
faSize cloneEnds.fa
#188448559 bases (4451 N's 188444108 real 102020307 upper 86423801 lower) in 307557 sequences in 1 files
#Total size: mean 612.7 sd 198.9 min 83 (CC181697) max 1158 (BZ277934) median 634
# Extract pairings from info files:
zcat ncbi/*info*.txt.gz \
| /cluster/bin/scripts/convertCloneEndInfo stdin
#134351 pairs and 38410 singles
# split for cluster run
mkdir /cluster/bluearc/rn4/cloneEnds
faSplit sequence cloneEnds.fa 100 /cluster/bluearc/rn4/cloneEnds/cloneEnds
# Check to ensure no breakage:
faSize /cluster/bluearc/rn4/cloneEnds/*.fa
#188448559 bases (4451 N's 188444108 real 102020307 upper 86423801 lower) in 307557 sequences in 98 files
#Total size: mean 612.7 sd 198.9 min 83 (CC181697) max 1158 (BZ277934) median 634
# same numbers as before
# load sequences
ssh hgwdev
mkdir /gbdb/rn4/cloneend
ln -s /cluster/data/rn4/bed/cloneend/cloneEnds.fa /gbdb/rn4/cloneend/
cd /tmp
hgLoadSeq rn4 /gbdb/rn4/cloneend/cloneEnds.fa
#307557 sequences
# BACEND SEQUENCE ALIGNMENTS (DONE 2/23/06 angie)
ssh kkstore01
# Make unmasked fasta.
cd /cluster/data/rn4
mkdir /cluster/bluearc/rn4/noMask
foreach f (?{,?}/chr*.fa)
echo $f:t:r
perl -wpe 'tr/a-z/A-Z/ if (! /^>/);' $f \
> /cluster/bluearc/rn4/noMask/$f:t
end
# kluster run
ssh pk
mkdir /cluster/data/rn4/bed/bacends
cd /cluster/data/rn4/bed/bacends
mkdir out
# allow blat to run politely in /tmp while it writes output, then
# copy results to results file:
cat << '_EOF_' > runBlat.csh
#!/bin/csh -ef
# Cluster job wrapper: blat one unmasked chromosome ($1) against one
# clone-end chunk ($2), leaving the PSL result at $3.
# blat runs inside a private /scratch/tmp dir so its output is written
# to local disk; the result is then moved back to the job directory.
set root1 = $1
set root2 = $2
set result = $3
# Start from a clean per-job scratch directory.
rm -fr /scratch/tmp/${root1}_${root2}
mkdir /scratch/tmp/${root1}_${root2}
pushd /scratch/tmp/${root1}_${root2}
/cluster/bin/x86_64/blat /cluster/bluearc/rn4/noMask/${root1}.fa \
/cluster/bluearc/rn4/cloneEnds/${root2}.fa \
-ooc=/cluster/bluearc/rn4/11.ooc ${root1}.${root2}.psl
popd
# Ensure the destination dir exists (relative to the job's cwd),
# then move the result into place and clean up scratch.
mkdir -p out/${root2}
rm -f ${result}
mv /scratch/tmp/${root1}_${root2}/${root1}.${root2}.psl ${result}
rm -fr /scratch/tmp/${root1}_${root2}
'_EOF_'
# << happy emacs
chmod +x runBlat.csh
cat << '_EOF_' > template
#LOOP
./runBlat.csh $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
# << emacs happy
ls -1S /cluster/bluearc/rn4/cloneEnds/cloneEnds???.fa > bacEnds.lst
ls -1S /cluster/bluearc/rn4/noMask/chr*.fa > contig.lst
gensub2 contig.lst bacEnds.lst template jobList
para make jobList
#Completed: 4410 of 4410 jobs
#CPU time in finished jobs: 450309s 7505.15m 125.09h 5.21d 0.014 y
#IO & Wait Time: 812287s 13538.12m 225.64h 9.40d 0.026 y
#Average job time: 286s 4.77m 0.08h 0.00d
#Longest finished job: 1834s 30.57m 0.51h 0.02d
#Submission to last job: 6982s 116.37m 1.94h 0.08d
ssh kkstore01
cd /cluster/data/rn4/bed/bacends
mkdir temp
time pslSort dirs raw.psl temp out/*
#538.065u 55.510s 14:46.80 66.9% 0+0k 0+0io 3pf+0w
time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons \
raw.psl bacEnds.psl /dev/null
#203.262u 8.877s 3:57.34 89.3% 0+0k 0+0io 2pf+0w
# BACEND PAIRS TRACK (DONE 2/23/06 angie)
ssh kolossus
cd /cluster/data/rn4/bed/bacends
time /cluster/bin/x86_64/pslPairs -tInsert=10000 \
-minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
../cloneend/cloneEndPairs.txt all_bacends bacEnds
#19.229u 4.720s 0:26.34 90.8% 0+0k 0+0io 0pf+0w
# Filter by score and sort by {chrom,chromStart}:
awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
cat bacEnds.{slop,short,long,mismatch,orphan} \
| awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $5}' bacEndPairs.bed | sort -u
/cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed \
| sorttbl tname tstart | headchg -del \
> bacEnds.load.psl
wc -l bacEnds.*
# 5590477 bacEnds.psl
# 4462768 bacEnds.load.psl
# 109315 bacEnds.pairs
# 705 bacEnds.long
# 98 bacEnds.lst
# 3925 bacEnds.mismatch
# 88292 bacEnds.orphan
# 558 bacEnds.short
# 687 bacEnds.slop
# load into database
ssh hgwdev
cd /cluster/data/rn4/bed/bacends
# Reloaded with updated .sql (no chromEnd index) 4/7/06.
hgLoadBed -strict -notItemRgb rn4 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
#Loaded 109139 elements of size 11
# note - the "Bad" track isn't pushed to RR, just used for assembly QA.
hgLoadBed -strict -notItemRgb rn4 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
#Loaded 38981 elements of size 11
time hgLoadPsl rn4 -table=all_bacends bacEnds.load.psl
#load of all_bacends did not go as planned: 4462768 record(s), 0 row(s) skipped, 1 warning(s) loading psl.tab
#165.730u 24.690s 10:57.43 28.9% 0+0k 0+0io 258pf+0w
# To diagnose:
echo select \* from all_bacends | hgsql -N rn4 > psl.tab.loaded
diff psl.tab*
#1491015c1491015
#< 120 908 82 0 0 3 -124 6 278579 - BZ232973 866 0 866 chr17 97296363 49307517 49587086 8 21,5,71,86,327,250,116,114, 0,21,27,98,184,382,632,752, 49307517,49307539,49307544,49335694,49336333,49470455,49586851,49586972,
#---
#> 120 908 82 0 0 3 0 6 278579 - BZ232973 866 0 866 chr17 97296363 49307517 49587086 8 21,5,71,86,327,250,116,114, 0,21,27,98,184,382,632,752, 49307517,49307539,49307544,49335694,49336333,49470455,49586851,49586972,
# Q gap bases was -124 in... clipped to 0 in the db.
# Identify and save the out/*.psl file that contains the -124:
grep '^>BZ232973' /cluster/bluearc/rn4/cloneEnds/*.fa
#/cluster/bluearc/rn4/cloneEnds/cloneEnds079.fa:>BZ232973
grep BZ232973 out/cloneEnds079/*.psl | grep .-124
#out/cloneEnds079/chr17.cloneEnds079.psl:908 82 0 0 3 -124 6 278579 - BZ232973 866 0 866 chr17 97296363 49307517 49587086 8 21,5,71,86,327,250,116,114, 0,21,27,98,184,382,632,752, 49307517,49307539,49307544,49335694,49336333,49470455,49586851,49586972,
cp -p out/cloneEnds079/chr17.cloneEnds079.psl .
featureBits rn4 all_bacends
#241203668 bases of 2571531505 (9.380%) in intersection
featureBits rn3 all_bacends
#232313031 bases of 2571104688 (9.036%) in intersection
featureBits rn4 bacEndPairs
#2687958438 bases of 2571531505 (104.528%) in intersection
featureBits rn3 bacEndPairs
#2717444119 bases of 2571104688 (105.692%) in intersection
featureBits rn4 bacEndPairsBad
#664080718 bases of 2571531505 (25.824%) in intersection
featureBits rn3 bacEndPairsBad
#table bacEndPairsBad not found for any chroms
# Clean up - this recovers >10G!
rm psl.tab psl.tab.loaded raw.psl bed.tab
rm -r out
rmdir temp
rm -r /cluster/bluearc/rn4/noMask/ /cluster/bluearc/rn4/cloneEnds/
# end BACEND PAIRS TRACK
##########################################################################
# SWAP/CHAIN/NET MM7 (DONE 2/24/06 angie)
mkdir /cluster/data/rn4/bed/blastz.mm7.swap
cd /cluster/data/rn4/bed/blastz.mm7.swap
doBlastzChainNet.pl -swap /cluster/data/mm7/bed/blastz.rn4/DEF \
>& do.log & tail -f do.log
ln -s blastz.mm7.swap /cluster/data/rn4/bed/blastz.mm7
# SWAP/CHAIN/NET HG18 (DONE 2/24/06 angie)
mkdir /cluster/data/rn4/bed/blastz.hg18.swap
cd /cluster/data/rn4/bed/blastz.hg18.swap
doBlastzChainNet.pl -swap /cluster/data/hg18/bed/blastz.rn4/DEF \
>& do.log & tail -f do.log
ln -s blastz.hg18.swap /cluster/data/rn4/bed/blastz.hg18
# MAKE LINEAGE-SPECIFIC REPEATS VS. DOG, COW, RABBIT (DONE 2/24/06 angie)
ssh kolossus
mkdir /cluster/data/rn4/rmsk
cd /cluster/data/rn4/rmsk
ln -s ../?{,?}/chr*.fa.out .
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Dog in extra column 1, cow in 2, rabbit in 3
foreach outfl ( *.out )
echo "$outfl"
nice /cluster/bluearc/RepeatMasker/DateRepeats \
${outfl} -query rat -comp dog -comp cow -comp rabbit
end
# Now extract dog (extra column 1), cow (column 2), rabbit (column 3).
cd ..
mkdir linSpecRep.notIn{Dog,Cow,Rabbit}
foreach f (rmsk/*.out_*)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractRepeats 1 $f > \
linSpecRep.notInDog/$base.out.spec
/cluster/bin/scripts/extractRepeats 2 $f > \
linSpecRep.notInCow/$base.out.spec
/cluster/bin/scripts/extractRepeats 3 $f > \
linSpecRep.notInRabbit/$base.out.spec
end
wc -l rmsk/*.out
# 4417757 total
wc -l linSpecRep.notInDog/*
# 2676056 total
wc -l linSpecRep.notInCow/*
# 2676056 total
wc -l linSpecRep.notInRabbit/*
# 2676056 total
# These all look identical to each other (and to human except for header):
foreach f (linSpecRep.notInDog/*)
cmp $f linSpecRep.notInCow/$f:t
cmp $f linSpecRep.notInRabbit/$f:t
end
# Consolidate to save space (probably could have done this for human too):
mv linSpecRep.notInDog linSpecRep.notInNonRodentMammal
# Clean up.
rm -r rmsk linSpecRep.notInCow linSpecRep.notInRabbit
# Distribute linSpecRep.* for cluster run
ssh kkstore01
rsync -av /cluster/data/rn4/linSpecRep.notInNonRodentMammal/* \
/cluster/bluearc/rn4/linSpecRep.notInNonRodentMammal/
# BLASTZ/CHAIN/NET CANFAM2 (DONE 2/24/06 angie)
ssh kkstore01
mkdir /cluster/data/rn4/bed/blastz.canFam2.2006-02-24
cd /cluster/data/rn4/bed/blastz.canFam2.2006-02-24
cat << '_EOF_' > DEF
# rat vs. dog
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Rat
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_SMSK=/cluster/bluearc/rn4/linSpecRep.notInNonRodentMammal
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_SMSK=/cluster/bluearc/canFam2/linSpecRep.notInRat
SEQ2_LEN=/cluster/data/canFam2/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.canFam2.2006-02-24
'_EOF_'
# << for emacs
doBlastzChainNet.pl DEF -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/blastzRn4CanFam2Out >& do.log &
tail -f do.log
ln -s blastz.canFam2.2006-02-24 /cluster/data/rn4/bed/blastz.canFam2
##########################################################################
# STS MARKERS
# NOTE: Instead of using UniSTS_rat.sts as usual, I made a local version
# which uses the name given in UniSTS.sts if none is given in UniSTS_rat.sts
# and "UniSTS:$id" if no name is given in UniSTS.sts. Some inconsistencies
# between UniSTS.sts and UniSTS_rat.sts reported 2/21/06; haven't heard
# back from them yet so I'll just forge ahead with latest UniSTS files.
# MAKE STSINFORAT (DONE 3/6/06 angie)
# This method was developed by Yontao. ytlu/script is under CVS control
# so I'm using ~/ytlu/script/perl/script/match but have copied several
# other scripts from /cluster/store4/ratMarker/code into
# kent/src/hg/stsMarkers/ to get them in CVS too.
ssh kkstore01
mkdir /cluster/data/rn4/bed/stsMarkers
mkdir /cluster/data/rn4/bed/stsMarkers/info
cd /cluster/data/rn4/bed/stsMarkers/info
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Rattus_norvegicus/10116.FHH_x_ACI.7.txt
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Rattus_norvegicus/10116.RH_map.3.4.txt
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Rattus_norvegicus/10116.SHRSP_x_BN.7.txt
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_rat.sts
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
wget --timestamping ftp://rgd.mcw.edu/pub/rhmap/3.4/RHv3_4_MapData.txt
wget --timestamping ftp://rgd.mcw.edu/pub/rhmap/3.4/RAW_MARKER_SEQINFO.txt
# UniSTS_rat.sts is missing a lot of names that can be found in UniSTS.sts
# so load that up and make a fixed-up UniSTS_rat.rat_extracted.sts:
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
awk -F"\t" '$5 != "-" {print $1 "\t" $5 "\t" $8;}' UniSTS.sts \
> UniSTS.names
cat > extractRat.pl <<'_EOF_'
#!/usr/bin/perl -w
use strict;
# Extract Rattus norvegicus records from a UniSTS .sts file (read on
# stdin / as args), filling in missing marker names from UniSTS.names
# (tab-separated: id, name, ...; built from UniSTS.sts upstream).
# Warns on stderr when the two sources disagree about a marker's name.
open(NAMES, "<UniSTS.names") || die "Can't open UniSTS.names: $!\n";
print STDERR "Hashing UniSTS.names...\n";
my %id2name;
# Build id -> name lookup from UniSTS.names.
while (<NAMES>) {
chomp;
my @words = split("\t");
$id2name{$words[0]} = $words[1];
}
print STDERR "Extracting rat and adding in names...\n";
while (<>) {
chomp;
my @words = split("\t");
# Column 7 is the organism; keep rat rows only.
next if ($words[7] ne "Rattus norvegicus");
if ($words[4] eq '-') {
# Name missing: take it from UniSTS.names, or synthesize
# "UniSTS:<id>" when no name is known there either.
$words[4] = $id2name{$words[0]};
$words[4] = "UniSTS:$words[0]" if (! defined $words[4]);
} elsif ((defined $id2name{$words[0]}) &&
($words[4] ne $id2name{$words[0]})) {
# Both sources name this id but differ -- report, keep input's name.
print STDERR "Disagreement: name of $words[0] given as " .
$id2name{$words[0]} . " and as $words[4]\n";
}
print join("\t", @words) . "\n";
}
'_EOF_'
# << emacs
chmod a+x extractRat.pl
./extractRat.pl UniSTS_rat.sts > UniSTS_rat.rat_extracted.sts
# Many warnings about different name given in UniSTS_rat.sts vs.
# UniSTS.sts -- reported to NCBI.
# Make UniSTS_rat.alias by joining the first columns of
# UniSTS_rat.rat_extracted.sts and UniSTS.aliases so we get just the
# aliases of items in UniSTS_rat.rat_extracted.sts:
/cluster/home/ytlu/ytlu/script/perl/script/match \
UniSTS_rat.rat_extracted.sts 1 UniSTS.aliases 1 \
> UniSTS_rat.alias
# Extract these columns from RHv3_4_MapData.txt:
# RGD_Sym TEMPL_SEQ FORWARD_PRIMER REVERSE_PRIMER EXP_SIZE
awk '{print $2,$1,$17,$8,$9,$10}' RHv3_4_MapData.txt | sed s/" "/"\t"/g \
> inMapData.txt
# Find entries not in RAW_MARKER_SEQINFO.txt but in inMapData.txt,
/cluster/home/ytlu/ytlu/script/perl/script/match -M \
RAW_MARKER_SEQINFO.txt 1 inMapData.txt 1 > inMapDataOnly.txt
cat inMapDataOnly.txt RAW_MARKER_SEQINFO.txt > marker_seqinfo.txt
# Combine sources into a "draft" stsInfoRat.tab file:
~/kent/src/hg/stsMarkers/createStsInfoRat \
UniSTS_rat.rat_extracted.sts marker_seqinfo.txt UniSTS_rat.alias \
10116.RH_map.3.4.txt 10116.FHH_x_ACI.7.txt 10116.SHRSP_x_BN.7.txt \
> stsInfoRat-draft
# clean some redundant and conflicting entries --> final stsInfoRat.tab:
~/kent/src/hg/stsMarkers/cleanInfo.pl -rat stsInfoRat-draft \
> ../stsInfoRat.withSeq.tab
# Create the primer.fa, cloneseq.fa, and primerinfo files for alignment:
cd ..
~/kent/src/hg/stsMarkers/luConvertPrimerToFa stsInfoRat.withSeq.tab \
ratPrimer.fa ratSeq.fa ratPrimer.info -a 200
# Strip the sequence column off so that it isn't loaded into the db
# (causing mysql warnings due to truncation). The seq column is not
# used from here on out:
perl -wpe '@w=split("\t"); $w[24] = ""; $_ = join("\t", @w) . "\n";' \
stsInfoRat.withSeq.tab > stsInfoRat.tab
# BLAT CLONE SEQUENCES (DONE 3/8/06 angie)
# Based on what was done for hg17, I think the thing to do is not use
# ooc, but filter out the results with tGapSize > 1000.
ssh kkstore01
cd /cluster/data/rn4
mkdir -p /cluster/bluearc/rn4/ctgFa
foreach d (*/chr*_?{,?})
cp -p $d/$d:t.fa /cluster/bluearc/rn4/ctgFa/
end
cp -p /cluster/data/rn4/bed/stsMarkers/ratSeq.fa \
/cluster/bluearc/rn4/
ssh pk
mkdir /cluster/data/rn4/bed/stsMarkers/clone
cd /cluster/data/rn4/bed/stsMarkers/clone
# A run with ooc, and then a run without ooc only on the unaligned seqs,
# would probably save time next time around...
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat {check in line+ $(path1)} {check in line+ /cluster/bluearc/rn4/ratSeq.fa} -stepSize=5 {check out line+ clone.out/$(root1).psl}
#ENDLOOP
'_EOF_'
# << emacs
mkdir clone.out
ls -1S /cluster/bluearc/rn4/ctgFa/chr*.fa > ctg.lst
gensub2 ctg.lst single gsub jobList
para make jobList
para time
#Completed: 591 of 591 jobs
#CPU time in finished jobs: 1628303s 27138.38m 452.31h 18.85d 0.052 y
#IO & Wait Time: 26091s 434.85m 7.25h 0.30d 0.001 y
#Average job time: 2799s 46.66m 0.78h 0.03d
#Longest finished job: 6138s 102.30m 1.71h 0.07d
#Submission to last job: 8714s 145.23m 2.42h 0.10d
ssh kolossus
cd /cluster/data/rn4/bed/stsMarkers/clone
# filter alignments for (qEnd-qStart) vs. (tEnd-tStart)
# should not be more than 100 bases different.
pslSort dirs stdout temp clone.out \
| pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons stdin \
clones.unlifted.tmp.psl /dev/null
awk '($8 < 1000) {print;}' clones.unlifted.tmp.psl > clones.unlifted.psl
wc -l clones.un*
# 19097 clones.unlifted.psl
# 22275 clones.unlifted.tmp.psl
rmdir temp
rm /cluster/bluearc/rn4/ratSeq.fa
# BLAT.2 STS PRIMERS (DONE 3/7/06 angie)
cp -p /cluster/data/rn4/bed/stsMarkers/ratPrimer.fa \
/cluster/bluearc/rn4/
ssh kk
mkdir /cluster/data/rn4/bed/stsMarkers/primer
cd /cluster/data/rn4/bed/stsMarkers/primer
# PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE.
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat.2 {check in line+ $(path1)} {check in line+ /cluster/bluearc/rn4/ratPrimer.fa} -ooc=/cluster/bluearc/rn4/11.ooc -minMatch=1 -minScore=0 -minIdentity=80 -oneOff {check out line+ primers.out/$(root1).psl}
#ENDLOOP
'_EOF_'
# << emacs
mkdir primers.out
ls -1S /cluster/bluearc/rn4/ctgFa/chr*.fa > ctg.lst
gensub2 ctg.lst single gsub jobList
para make jobList
para time
#Completed: 591 of 591 jobs
#CPU time in finished jobs: 466140s 7769.00m 129.48h 5.40d 0.015 y
#IO & Wait Time: 63525s 1058.75m 17.65h 0.74d 0.002 y
#Average job time: 896s 14.94m 0.25h 0.01d
#Longest finished job: 1756s 29.27m 0.49h 0.02d
#Submission to last job: 5330s 88.83m 1.48h 0.06d
ssh kkstore01
cd /cluster/data/rn4/bed/stsMarkers/primer
# filter alignments for (qEnd-qStart) vs. (tEnd-tStart)
# should not be more than 100 bases different.
pslSort dirs stdout temp primers.out \
| awk -F"\t" ' \
{ if (((($13 - $12) - ($17 - $16)) > -100) && \
((($13 - $12) - ($17 - $16)) < 100)) {print} \
}' \
> primers.psl.100.unlifted
rmdir temp
rm /cluster/bluearc/rn4/ratPrimer.fa
# Would be very nice to compare wc output with rn3 run, but I cannot
# find the rn3 run.
wc primers.psl.100.unlifted
# 8050086 169051724 799770587 primers.psl.100.unlifted
# E-PCR STS PRIMERS (DONE 3/7/06 angie)
ssh kkstore01
cp -p /cluster/data/rn4/bed/stsMarkers/ratPrimer.info \
/cluster/bluearc/rn4/
ssh kk
mkdir /cluster/data/rn4/bed/stsMarkers/ePCR
cd /cluster/data/rn4/bed/stsMarkers/ePCR
ls -1S /cluster/bluearc/rn4/ctgFa/chr*.fa > ctg.lst
# Hiram built latest e-PCR into /cluster/bin/{i386,x86_64}/e-PCR,
# see notes in makeMm7.doc.
mkdir epcr.out
cat << '_EOF_' > runPCR.csh
#!/bin/csh -fe
# e-PCR cluster job: $1 = primer info file, $2 = sequence fasta,
# $3 = output file.  N/M/W are e-PCR options (mismatches, margin,
# wordsize -- see e-PCR usage).
/cluster/bin/$MACHTYPE/e-PCR $1 $2 N=1 M=50 W=5 > $3
'_EOF_'
chmod +x runPCR.csh
cat << '_EOF_' > gsub
#LOOP
./runPCR.csh {check in line+ /cluster/bluearc/rn4/ratPrimer.info} {check in line+ $(path1)} {check out line+ epcr.out/$(num1).epcr}
#ENDLOOP
'_EOF_'
gensub2 ctg.lst single gsub jobList
para make jobList
para time
# Actually ran on pk -- kk is swamped at the moment.
#Completed: 591 of 591 jobs
#CPU time in finished jobs: 103485s 1724.75m 28.75h 1.20d 0.003 y
#IO & Wait Time: 29018s 483.63m 8.06h 0.34d 0.001 y
#Average job time: 224s 3.74m 0.06h 0.00d
#Longest finished job: 341s 5.68m 0.09h 0.00d
#Submission to last job: 482s 8.03m 0.13h 0.01d
ssh kkstore01
cd /cluster/data/rn4/bed/stsMarkers/ePCR
cat epcr.out/*.epcr > all.epcr
rm /cluster/bluearc/rn4/ratPrimer.info
rm -r /cluster/bluearc/rn4/ctgFa
# Would compare to rn3 if I could find it...
wc all.epcr
# 73885 295540 3915703 all.epcr
# COMBINE AND FILTER BLAT.2 AND E-PCR RESULTS (DONE 3/8/06 angie -- stsMapRat.bed REDONE 6/15/06)
ssh kolossus
cd /cluster/data/rn4/bed/stsMarkers/primer
# create accession_info.rdb: AGP info.
/cluster/bin/scripts/compileAccInfo -rat /cluster/data/rn4 /dev/null
#0 processed
# "0 processed" sounds a little scary but it's 0 because we didn't pass
# in a -pre rdb.
mv accession_info.rdb accession_info.rdb.tmp
/cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp \
> accession_info.rdb
# some lines have "chrN_random_M" in the Contig column but "chrN"
# (not chrN_random) in the chrom column.... looks weird, but reading
# compileAccInfo it seems intentional to lump ordered and random.
rm accession_info.rdb.tmp
# Would compare to rn3 if I could find it...
wc accession_info.rdb
# 138571 1524285 9921966 accession_info.rdb
# filterSTSPrimers combines blat.2 and epcr results.
# For human there is a program pslFilterPrimers that combines isPcr and
# epcr results... might be interesting to try that next time around.
/cluster/bin/scripts/filterSTSPrimers -rat \
../stsInfoRat.tab primers.psl.100.unlifted \
../ratPrimer.info ../ePCR/all.epcr \
| liftUp -nohead -type=.psl primers.psl.filter.blat \
../../../jkStuff/liftAll.lft warn stdin
# Would compare to rn3 if I could find it...
wc primers.psl.100.unlifted
# 8050086 169051724 799770587 primers.psl.100.unlifted
wc primers.psl.filter.blat
# 38707 812847 4100619 primers.psl.filter.blat
# filterSTSPrimers creates epcr.not.found which actually contains
# items that *were* found by epcr but not by blat. Translate to PSL:
/cluster/bin/scripts/epcrToPsl -rat epcr.not.found ../ratPrimer.info \
accession_info.rdb /cluster/data/rn4
# epcrToPsl creates epcr.not.found.nomatch and epcr.not.found.psl
# Would compare to rn3 if I could find it...
wc epcr*
# 748 2992 27112 epcr.not.found
# 0 0 0 epcr.not.found.nomatch
# 748 15708 72452 epcr.not.found.psl
# Lift the PSL:
mv epcr.not.found.psl epcr.not.found.unlifted.psl
liftUp -nohead epcr.not.found.psl \
../../../jkStuff/liftAll.lft warn epcr.not.found.unlifted.psl
wc epcr.not.found.psl
# 748 15708 80899 epcr.not.found.psl
cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
# create primers.psl.filter.lifted.initial
/cluster/bin/scripts/extractPslInfo primers.psl.filter
# Would compare to rn3 if I could find it...
wc primers.psl.filter.initial
# 39451 236706 2071215 primers.psl.filter.initial
# create primers.psl.filter.lifted.initial.acc
/cluster/bin/scripts/findAccession -agp \
-rat primers.psl.filter.initial /cluster/data/rn4
# it warns about missing AGP for M_random -- OK.
# Would compare to rn3 if I could find it...
wc primers.psl.filter.initial.acc
# 39451 276157 2579345 primers.psl.filter.initial.acc
/cluster/bin/scripts/getStsId -rat \
../stsInfoRat.tab primers.psl.filter.initial.acc \
| sort -k 4n primers.initial.acc.trans > primers.final
# Would compare to rn3 if I could find it...
wc primers.final
# 39451 276157 2232848 primers.final
# Lift clone sequence alignments
cd /cluster/data/rn4/bed/stsMarkers/clone
liftUp -nohead clones.psl \
../../../jkStuff/liftAll.lft warn clones.unlifted.psl
/cluster/bin/scripts/extractPslInfo clones.psl
# extractPslInfo creates clones.psl.initial
/cluster/bin/scripts/findAccession -agp \
-rat clones.psl.initial /cluster/data/rn4
# findAccession creates clones.psl.initial.acc
sort -k 4n clones.psl.initial.acc > clones.final
# Combine clone sequence alignments with primer alignments
cd /cluster/data/rn4/bed/stsMarkers
/cluster/bin/scripts/combineSeqPrimerPos \
clone/clones.final primer/primers.final
# creates stsMarkers_pos.rdb
# Would compare to rn3 if I could find it...
wc stsMarkers_pos.rdb
# 39291 275037 1893500 stsMarkers_pos.rdb
# 6/15/06: re-running from here after fixing createStsMapRat to make
# empty chroms instead of "0" when the chrom is not specified --
# hgTracks.c code depends on this.
~/kent/src/hg/stsMarkers/createStsMapRat \
stsInfoRat.tab stsMarkers_pos.rdb > stsMapRat.bed
# Would compare to rn3 if I could find it...
wc stsMapRat.bed
# 38802 490471 3205607 stsMapRat.bed
~/kent/src/hg/stsMarkers/createStsAlias stsInfoRat.tab \
| sort -u > stsAlias.tab
# Warnings about aliases with multiple trueNames... would be good to
# pass that on to NCBI, if we ever get a resolution for the previous
# questions.
# Would compare to rn3 if I could find it...
wc stsAlias.tab
# 69051 207153 1687764 stsAlias.tab
# compare old and new name lists:
ssh -x hgwdev 'hgsql -N rn3 -e "select name from stsMapRat" | sort -u' \
> rn3.nameList
awk '{print $4}' stsMapRat.bed | sort -u > rn4.nameList
comm -12 rn?.nameList | wc
# 32693 32693 290774 [in common]
comm -23 rn3.nameList rn4.nameList | wc
# 1208 1208 10659 [in rn3 only]
comm -13 rn3.nameList rn4.nameList | wc
# 5234 5234 48832 [in rn4 only]
# LOAD STS MARKER TABLES (DONE 3/8/06 angie - ratSeq.fa added 6/14/06, stsMapRat reloaded 6/15/06)
ssh hgwdev
cd /cluster/data/rn4/bed/stsMarkers
# Add primer sequences to /gbdb and load:
mkdir /gbdb/rn4/stsMarker
ln -s /cluster/data/rn4/bed/stsMarkers/ratPrimer.fa \
/gbdb/rn4/stsMarker/ratPrimer.fa
ln -s /cluster/data/rn4/bed/stsMarkers/ratSeq.fa \
/gbdb/rn4/stsMarker/ratSeq.fa
# PLEASE NOTE: if re-running hgLoadSeq, you MUST add
# -replace
# hgLoadSeq -replace rn4 /gbdb/rn4/stsMarker/ratPrimer.fa
# otherwise there will be a problem that the seq and extFile tables
# will be out of sync.
hgLoadSeq rn4 /gbdb/rn4/stsMarker/ratPrimer.fa
hgLoadSeq rn4 /gbdb/rn4/stsMarker/ratSeq.fa
hgLoadSqlTab rn4 stsAlias \
~/kent/src/hg/lib/stsAlias.sql stsAlias.tab
hgLoadSqlTab rn4 stsInfoRat \
~/kent/src/hg/lib/stsInfoRat.sql stsInfoRat.tab
# After some work on the stsInfoRat creation process which should help
# next time around (do not do this part next time, just for the record):
hgLoadSqlTab rn4 stsInfoRat \
~/kent/src/hg/lib/stsInfoRat.sql stsInfoRat.tab.new
#Warning: load of stsInfoRat did not go as planned: 48048 record(s), 0 row(s) skipped, 5 warning(s) loading stsInfoRat.tab
echo 'select * from stsInfoRat' | hgsql -N rn4 > /tmp/tmp
# compare /tmp/tmp and stsInfoRat.tmp -- looks like 5 alias fields are
# truncated at 255 chars, not so terrible because we have stsAlias.
# -- But since the IDs are off-by-one for a large stretch of that due
# -- to getting rid of a garbage line, I reloaded and manually deleted
# -- the unused and often truncated clone sequence values:
hgLoadSqlTab rn4 stsInfoRat \
~/kent/src/hg/lib/stsInfoRat.sql stsInfoRat.tab
#Warning: load of stsInfoRat did not go as planned: 48049 record(s), 0 row(s) skipped, 187463 warning(s) loading stsInfoRat.tab
hgsql rn4 -e 'update stsInfoRat set clone = "";'
# Again, next time around the above should not be necessary.
cat primer/primers.psl.filter clone/clones.psl \
| hgLoadPsl -table=all_sts_primer rn4 stdin
# Reloaded 6/15/06
hgLoadBed rn4 stsMapRat -tab \
-sqlTable=$HOME/kent/src/hg/lib/stsMapRat.sql stsMapRat.bed
#Loaded 38802 elements of size 15
nice featureBits rn4 all_sts_primer
#10103096 bases of 2571531505 (0.393%) in intersection
nice featureBits rn3 all_sts_primer
#9722909 bases of 2571104688 (0.378%) in intersection
nice featureBits rn4 stsMapRat
#11755453 bases of 2571531505 (0.457%) in intersection
nice featureBits rn3 stsMapRat
#10119869 bases of 2571104688 (0.394%) in intersection
# Clean up.
rm -r primer/primers.out/ primer/primers.psl.100.unlifted clone/clone.out/
# end STS MARKERS
##########################################################################
###############################################################################
# BUILD KNOWN GENES TABLES (DONE 3/2/06 angie)
# Use protein databases built by Fan: sp060115 and proteins060115
# See makeProteins060115.doc for details.
# Ask cluster-admin to rsync sp060115 and proteins060115 to kolossus
# so that database work can be done there; then final results will be
# rsync'd back to hgwdev.
# Create working subdirectories and temporary databases (kgRn4A)
ssh kolossus
mkdir /cluster/data/rn4/bed/kgRn4A
ln -s /cluster/data/rn4/bed/kgRn4A /cluster/store6/kgDB/bed/kgRn4A
ln -s /cluster/data/rn4/bed/kgRn4A /cluster/store11/kg/kgRn4A
hgsql rn4 -e "create database kgRn4A"
hgsql rn4 -e "create database kgRn4ATemp"
mkdir /cluster/bluearc/kgDB/kgRn4A
mkdir /cluster/bluearc/kgDB/kgRn4A/protBlat
ln -s /cluster/bluearc/kgDB/kgRn4A/protBlat \
/cluster/data/rn4/bed/kgRn4A/protBlat
# Save a copy of several genbank tables so that we can produce consistent
# results if we need to add more tables later. First, ask cluster-admin
# to rsync the tables from hgwdev rn4 --> kolossus. Save files too:
mkdir /cluster/data/rn4/bed/kgRn4A/genbankBackup
cd /cluster/data/rn4/bed/kgRn4A/genbankBackup
chmod 777 .
foreach t (all_mrna cds gbCdnaInfo gbExtFile gbLoaded gbSeq gbStatus \
mgcGenes \
refFlat refGene refLink refSeqAli refSeqStatus refSeqSummary \
xenoMrna xenoRefFlat xenoRefGene xenoRefSeqAli)
echo $t
hgsqldump --tab=. rn4 $t
end
gzip *.txt
chmod 775 .
# Get all rat protein sequences
cd /cluster/data/rn4/bed/kgRn4A/protBlat
hgsql -N sp060115 -e \
'select p.acc, p.val from protein p, accToTaxon x \
where x.taxon=10116 and p.acc=x.acc'\
| awk '{print ">" $1; print $2;}' \
> ratProt.fa
hgsql -N sp060115 -e \
'select v.varAcc, p.val from varAcc v, protein p, accToTaxon x \
where v.parAcc = p.acc and x.taxon=10116 and v.parAcc=x.acc'\
| awk '{print ">" $1; print $2;}' \
>> ratProt.fa
# Prepare and perform cluster run for protein/genome alignment
ssh kolossus
cd /cluster/bluearc/kgDB/kgRn4A/protBlat
mkdir prot
faSplit sequence ratProt.fa 2000 prot/prot
ls /cluster/bluearc/kgDB/kgRn4A/protBlat/prot/* > prot.lis
hgsql rn4 -N -e 'select chrom from chromInfo' > chrom.lis
ssh pk
cd /cluster/bluearc/kgDB/kgRn4A/protBlat
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -t=dnax -q=prot /scratch/hg/rn4/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgRn4A/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << emacs
mkdir result
gensub2 chrom.lis prot.lis gsub jobList
para make jobList
#Completed: 89100 of 89100 jobs
#CPU time in finished jobs: 4215176s 70252.93m 1170.88h 48.79d 0.134 y
#IO & Wait Time: 230997s 3849.96m 64.17h 2.67d 0.007 y
#Average job time: 50s 0.83m 0.01h 0.00d
#Longest finished job: 5794s 96.57m 1.61h 0.07d
#Submission to last job: 16257s 270.95m 4.52h 0.19d
# collect BLAT results
ssh kolossus
cd /cluster/bluearc/kgDB/kgRn4A/protBlat
pslSort -nohead dirs raw.psl temp result
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 raw.psl \
protBlat.psl /dev/null
hgLoadPsl -fastLoad rn4 protBlat.psl
#load of protBlat did not go as planned: 14768 record(s), 0 row(s) skipped, 48 warning(s) loading psl.tab
# Some of the lines had negative qGapSizes. ask Jim...
# create all_mrna.psl and tight_mrna.psl
zcat /cluster/data/rn4/bed/kgRn4A/genbankBackup/all_mrna.txt.gz \
| cut -f 2-22 > all_mrna.psl
pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 all_mrna.psl \
tight_mrna.psl /dev/null
# Use overlapSelect to get protein and mRNA alignment overlaps
overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \
-selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.stat
overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 \
-selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.out
# Create protein/mRNA pair and protein lists
cut -f 10,31 protMrna.out | sort -u > spMrna.tab
cut -f 10 protMrna.out | sort -u > protein.lis
# Load spMrna.tab into spMrna table in temp DB.
hgLoadSqlTab -notOnServer kgRn4ATemp spMrna ~/kent/src/hg/lib/spMrna.sql \
spMrna.tab
hgsql kgRn4ATemp -e 'create index mrnaID on spMrna(mrnaID)'
# Prepare and perform cluster run of protein/mRNA alignment
# Get mRNA fa file.
cd /cluster/data/rn4/bed/kgRn4A
/cluster/data/genbank/bin/i386/gbGetSeqs -native -db=rn4 \
-gbRoot=/cluster/data/genbank genbank mrna mrna.fa
# Create mrnaSeq table in kgRn4ATemp DB.
hgPepPred kgRn4ATemp generic mrnaSeq mrna.fa
# Prepare files for cluster run
cd /cluster/bluearc/kgDB/kgRn4A
~/kent/src/hg/protein/KG2.sh kgRn4A rn4 060115
# Perform cluster run of protein/mRNA alignment
~/kent/src/hg/protein/KG3.sh kgRn4A rn4 060115
# Collect cluster run results
cd kgBestMrna
cp /dev/null protMrnaRaw.psl
foreach d (out/*)
echo $d
cat $d/*.out >> protMrnaRaw.psl
end
# Filter out low quality alignments
pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl \
protMrnaBlat.psl /dev/null
#Processed 33623 alignments
cut -f 10,14 protMrnaBlat.psl | sort -u > protMrna.lis
wc protMrna.lis
# 21430 42860 334656 protMrna.lis
# Load BLAT results into temp DB.
ssh kolossus
cd /cluster/data/rn4/bed/kgRn4A/kgBestMrna
hgsql kgRn4ATemp -e 'create table chromInfo select * from rn4.chromInfo'
hgLoadPsl -fastLoad kgRn4ATemp protMrnaBlat.psl
#load of protMrnaBlat did not go as planned: 21432 record(s), 0 row(s) skipped, 97 warning(s) loading psl.tab
#-- tGapBases (tGapInsert) this time -- ask Jim
# Create CDS files from protein/mRNA alignment results.
hgsql kgRn4ATemp -N -e \
'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat \
order by qName,tName,tEnd-tStart desc'\
| sed 's/\t_\t/_/g; s/\t:\t/../g' > protMrna.cds
# Create protMrna.psl with proteinID_mrnaID as query ID.
cut -f 22-30 ../protBlat/protMrna.out > j1.tmp
cut -f 32-42 ../protBlat/protMrna.out > j2.tmp
cut -f 10,31 ../protBlat/protMrna.out | sed -e 's/\t/_/g' > j3.tmp
paste j1.tmp j3.tmp j2.tmp > protMrna.psl
rm j1.tmp j2.tmp j3.tmp
# Run mrnaToGene to create protMrna.gp
bash
mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp \
2>protMrna.err >protMrna.log
exit
# No need to move kgBestMrna anywhere else for now:
du -sh
#240M .
df -h .
#kkstore01-10:/export/cluster/store9
# 1.3T 1.1T 199G 84% /cluster/store9
# Prepare refGene and all_mrna gp files.
cd ..
zcat genbankBackup/refGene.txt.gz > ref.gp
hgsql rn4 -N -e \
'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna \
where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \
| sort -u > all_mrna.cds
bash
mrnaToGene -cdsFile=all_mrna.cds \
/cluster/bluearc/kgDB/kgRn4A/protBlat/all_mrna.psl all_mrna.gp \
2>all_mrna.err > all_mrna.log
exit
# Align proteins to RefSeq.
overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl \
-inFmt=genePred protBlat/protBlat.psl ref.gp ref.stat
overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 \
-selectFmt=psl -inFmt=genePred protBlat/protBlat.psl ref.gp protRef.gp
overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl \
-overlapThreshold=0.80 -inFmt=psl \
-selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out
cut -f 10,22 protRef.out | sort -u >spRef.tab
cut -f 10 protRef.out | sort -u >protRef.lis
hgLoadSqlTab kgRn4ATemp spRef ~/kent/src/hg/lib/spRef.sql spRef.tab
# Prepare and perform cluster runs for protein/RefSeq alignments
~/kent/src/hg/protein/KGRef2.sh kgRn4A rn4 060115
~/kent/src/hg/protein/KGRef3.sh kgRn4A rn4 060115
cd kgBestRef
cp /dev/null protRefRaw.psl
foreach d (out/*)
echo $d
cat $d/*.out >> protRefRaw.psl
end
# Filter out low quality alignments.
pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl \
protRefBlat.psl /dev/null
#Processed 15266 alignments
cut -f 10,14 protRefBlat.psl | sort -u > protRef.lis
wc protRef.lis
# 11957 23914 216980 protRef.lis
hgLoadPsl -fastLoad kgRn4ATemp protRefBlat.psl
#load of protRefBlat did not go as planned: 11958 record(s), 0 row(s) skipped, 45 warning(s) loading psl.tab
# Negative tGapSizes again.
# Run gene-check to filter out invalid gp entries
cd /cluster/data/rn4/bed/kgRn4A
cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp
/cluster/bin/i386/gene-check -incl-ok -ok-genepred-out \
kgCandidate0.passed.gp \
-nib-dir /cluster/data/rn4/nib kgCandidate0.gp kgCandidate0.check
ldHgGene -predTab kgRn4ATemp kgCandidate0 kgCandidate0.gp
tail +3 kgCandidate0.check > geneCheck.tab
hgLoadSqlTab kgRn4ATemp geneCheck ~/kent/src/hg/lib/geneCheck.sql \
geneCheck.tab
# Run kgCheck to get all KG candidates that pass the KG gene check criteria
kgCheck kgRn4ATemp rn4 kgCandidate0 geneCheck kgCandidate.tab
hgLoadSqlTab kgRn4ATemp kgCandidate ~/kent/src/hg/lib/kgCandidate.sql \
kgCandidate.tab
hgsql kgRn4ATemp -e 'create index alignID on kgCandidate(alignID)'
# Construct the kgCandidateX table that has alignID in the name field.
cut -f 2-10 kgCandidate.tab > j2.tmp
cut -f 11 kgCandidate.tab > j1.tmp
paste j1.tmp j2.tmp > kgCandidateX.tab
ldHgGene -predTab kgRn4ATemp kgCandidateX kgCandidateX.tab
# Score protein/mRna and protein/RefSeq alignments
ln -s /cluster/bluearc/kgDB/kgRn4A/protBlat/protein.lis .
kgResultBestMrna2 060115 kgRn4ATemp rn4 protMrnaBlat | sort -u \
> protMrnaBlatScore.tab
kgResultBestRef2 060115 kgRn4ATemp rn4 protRefBlat | sort -u \
> protRefScore.tab
# Combine scoring results and load them into temp DB.
cat protMrnaBlatScore.tab protRefScore.tab > protMrnaScore.tab
hgLoadSqlTab kgRn4ATemp protMrnaScore ~/kent/src/hg/lib/protMrnaScore.sql \
protMrnaScore.tab
hgsql kgRn4ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'
# Run kgGetCds to get CDS structure of each gene
kgGetCds kgRn4ATemp 060115 kgCandidateX stdout \
| sort -u > kgCandidateY.tab
hgLoadSqlTab kgRn4ATemp kgCandidateY ~/kent/src/hg/lib/kgCandidateY.sql \
kgCandidateY.tab
# Run kgPickPrep to replace long cds structure string with cdsId.
kgPickPrep kgRn4ATemp kgCandidateZ.tab
hgLoadSqlTab kgRn4ATemp kgCandidateZ ~/kent/src/hg/lib/kgCandidateZ.sql \
kgCandidateZ.tab
hgsql kgRn4ATemp -e 'create index cdsId on kgCandidateZ(cdsId)'
    # Run kgPick to pick the representative mRNA/protein pair for each
    # unique CDS structure.
kgPick kgRn4ATemp rn4 sp060115 kg3.tmp stdout \
| sort -u > dupSpMrna.tab
# Create put back list
/cluster/data/genbank/bin/i386/gbGetSeqs \
    -gbRoot=/cluster/data/genbank -db=rn4 -get=ra -native RefSeq mrna stdout \
| perl -wpe 's/^(\w+) (.*)/$1\t$2/ || next; $acc = $2 if ($1 eq "acc"); \
s/^/$acc\t/;' \
| sort -u | tail +2 > refRa.tab
hgLoadSqlTab rn4 refRa ~/kent/src/hg/lib/refRa.sql refRa.tab
hgsql rn4 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \
where r.attr="selenocysteine" and r.acc=r2.acc and r2.attr="rss" and \
r2.val="rev" and r3.acc=r.acc and r3.attr="org" and \
r3.val="Rattus norvegicus"' \
> kgPutBack2.tab
hgsql rn4 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \
where r.attr="cno" and r.val like "%ribosomal frameshift%" and \
r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and \
r3.acc=r.acc and r3.attr="org" and r3.val="Rattus norvegicus"' \
>> kgPutBack2.tab
hgsql rn4 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \
where r.attr="cno" and r.val like "%non-AUG%" and r.acc=r2.acc and \
r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and \
r3.attr="org" and r3.val="Rattus norvegicus"' \
>> kgPutBack2.tab
hgsql rn4 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \
where r.attr="translExcept" and r.acc=r2.acc and r2.attr="rss" and \
r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and \
r3.val="Rattus norvegicus"' \
>> kgPutBack2.tab
hgsql rn4 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \
where r.attr="exception" and r.acc=r2.acc and r2.attr="rss" and \
r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and \
r3.val="Rattus norvegicus"' \
>> kgPutBack2.tab
wc -l kgPutBack2.tab
#10 kgPutBack2.tab
hgLoadSqlTab kgRn4ATemp kgPutBack2 ~/kent/src/hg/lib/kgPutBack2.sql \
kgPutBack2.tab
kgPutBack kgRn4ATemp rn4 sp060115 kgPutBack2 kgPutBack2.gp
# Sort KG genes to make the kg4.gp table file.
cat kgPutBack2.gp kg3.tmp > kg4.tmp
~/kent/src/hg/protein/sortKg.pl kg4.tmp \
| cut -f 1-12 > knownGene.tab
# Load data into knownGene table in both kgRn4ATemp and rn4.
hgLoadSqlTab kgRn4ATemp knownGene ~/kent/src/hg/lib/knownGene.sql \
knownGene.tab
hgLoadSqlTab rn4 knownGene ~/kent/src/hg/lib/knownGene.sql \
knownGene.tab
# Load dupSpMrna table after knownGene table is loaded so that \
# joinerCheck does not complain.
hgLoadSqlTab rn4 dupSpMrna ~/kent/src/hg/lib/dupSpMrna.sql dupSpMrna.tab
# Perform analysis on KG
nice featureBits rn4 -enrichment refGene knownGene
#refGene 0.722%, knownGene 0.582%, both 0.539%, cover 74.72%, enrich 128.33x
nice featureBits rn4 -enrichment refGene:cds knownGene:cds
#refGene:cds 0.500%, knownGene:cds 0.377%, both 0.352%, cover 70.41%, enrich 186.83x
nice featureBits rn3 -enrichment refGene knownGene
#refGene 0.721%, knownGene 0.523%, both 0.411%, cover 56.96%, enrich 108.94x
nice featureBits rn3 -enrichment refGene:cds knownGene:cds
#refGene:cds 0.500%, knownGene:cds 0.392%, both 0.294%, cover 58.91%, enrich 150.16x
# Build knownGeneMrna and knownGenePep tables.
kgPepMrna kgRn4ATemp rn4 060115
hgPepPred rn4 tab knownGeneMrna knownGeneMrna.tab
hgLoadSqlTab rn4 knownGenePep ~/kent/src/hg/lib/knownGenePep.sql knownGenePep.tab
# Use hgwdev's entrez database (loaded by Fan, see makeHg18.doc)
    # to build the mrnaRefseq table.
ssh hgwdev
hgsql entrez -N -e \
'select mrna, refseq from entrezRefseq, entrezMrna \
where entrezRefseq.geneID=entrezMrna.geneID' \
> /cluster/data/rn4/bed/kgRn4A/mrnaRefseq.tmp
hgsql rn4 -N -e 'select name, name from refGene' \
>> /cluster/data/rn4/bed/kgRn4A/mrnaRefseq.tmp
ssh kolossus
cd /cluster/data/rn4/bed/kgRn4A
sort -u mrnaRefseq.tmp > mrnaRefseq.tab
hgLoadSqlTab rn4 mrnaRefseq ~/kent/src/hg/lib/mrnaRefseq.sql mrnaRefseq.tab
# Build kgXref table
kgXref2 kgRn4ATemp 060115 rn4
hgLoadSqlTab rn4 kgXref ~/kent/src/hg/lib/kgXref.sql kgXref.tab
# Build spMrna table
hgsql rn4 -N -e 'select proteinID, name from knownGene' > kgSpMrna.tab
hgLoadSqlTab rn4 spMrna ~/kent/src/hg/lib/spMrna.sql kgSpMrna.tab
# Build kgProtMap table
ln -s /cluster/bluearc/kgDB/kgRn4A/protBlat/tight_mrna.psl .
~/kent/src/hg/protein/kgProtMap2.sh kgRn4A rn4 060115
#####################################
# Build alias tables.
kgAliasM rn4 proteins060115
kgAliasKgXref rn4
kgAliasRefseq rn4
hgsql sp060115 -N -e \
'select name,gene.val from rn4.knownGene,displayId,gene \
where displayId.val=proteinID and displayId.acc=gene.acc' \
| sort -u > kgAliasP.tab
hgsql rn4 -N -e 'select name, name from knownGene' > kgAliasDup.tab
hgsql rn4 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >> kgAliasDup.tab
sort -u kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab \
kgAliasDup.tab > kgAlias.tab
hgLoadSqlTab rn4 kgAlias ~/kent/src/hg/lib/kgAlias.sql kgAlias.tab
kgProtAlias rn4 060115
hgsql rn4 -N -e \
'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""' \
| sort -u > kgProtAliasNCBI.tab
# include variant splice protein IDs
hgsql rn4 -N -e \
'select name, proteinID, parAcc from knownGene,sp060115.varAcc \
where varAcc=proteinID' \
| sort -u > kgProtAliasDup.tab
# include duplicate protein IDs from dupSpMrna table
hgsql rn4 -N -e \
'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna \
where name=mrnaID' \
| sort -u >> kgProtAliasDup.tab
# catch parent acc from dupProteinID too
hgsql rn4 -N -e\
'select name, knownGene.proteinID, parAcc \
from knownGene,dupSpMrna,sp060115.varAcc \
where name=mrnaID and dupProteinID=varAcc.varAcc' \
| sort -u >> kgProtAliasDup.tab
sort -u kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab \
> kgProtAliasAll.tab
hgLoadSqlTab rn4 kgProtAlias ~/kent/src/hg/lib/kgProtAlias.sql \
kgProtAliasAll.tab
# Build kgSpAlias table
hgsql rn4 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias \
where kgXref.kgID=kgAlias.kgID' > j.tmp
hgsql rn4 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias \
where kgXref.kgID=kgProtAlias.kgID' >> j.tmp
sort -u j.tmp | grep -v 'kgID' > rn4.kgSpAlias.tab
rm j.tmp
hgLoadSqlTab rn4 kgSpAlias ~/kent/src/hg/lib/kgSpAlias.sql \
rn4.kgSpAlias.tab
# Ask cluster-admin to rsync the following tables from kolossus rn4
# back to hgwdev rn4:
dupSpMrna
kgAlias
kgProtAlias
kgProtMap
kgSpAlias
kgXref
knownGene
knownGeneMrna
knownGenePep
mrnaRefseq
protBlat
refRa
spMrna
# Copy history comments from kolossus to hgwdev's rn4.history table.
hgsqldump rn4 history \
| egrep 'dupSpMrna|kgAlias|kgProtAlias|kgProtMap|kgSpAlias|kgXref|knownGene|knownGeneMrna|knownGenePep|mrnaRefseq|protBlat|refRa|spMrna' \
| perl -wpe 's/\([0-9]+,/(NULL,/' \
> kgHistory.sql
ssh hgwdev
hgsql rn4 < /cluster/data/rn4/bed/kgRn4A/kgHistory.sql
# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 3/6/06 angie)
# This depends on the go and uniProt databases as well as
    # the kgAlias and kgProtAlias tables.
ssh hgwdev
mkdir /cluster/data/rn4/bed/kgRn4A/index
cd /cluster/data/rn4/bed/kgRn4A/index
hgKgGetText rn4 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/data/rn4/bed/kgRn4A/index/knownGene.ix /gbdb/rn4/
ln -s /cluster/data/rn4/bed/kgRn4A/index/knownGene.ixx /gbdb/rn4/
# end KNOWN GENES
###############################################################################
# BLASTZ/CHAIN/NET DANRER3 - NO LINSPECREP (DONE 3/2/06 angie)
ssh kkstore01
mkdir /cluster/data/rn4/bed/blastz.danRer3.2006-03-02
cd /cluster/data/rn4/bed/blastz.danRer3.2006-03-02
cat << '_EOF_' > DEF
# rat vs zebrafish
BLASTZ=blastz.v7.x86_64
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Rat Rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer3)
# large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes
SEQ2_CHUNK=100000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.danRer3.2006-03-02
'_EOF_'
# << for emacs
doBlastzChainNet.pl DEF -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/blastzRn4DanRer3Out >& do.log &
tail -f do.log
ln -s blastz.danRer3.2006-03-02 /cluster/data/rn4/bed/blastz.danRer3
# BLASTZ/CHAIN/NET DANRER3 - WITH LINSPECREP (DONE 3/3/06 angie)
# The coverage decreased so I ended up not using this run.
# It would be interesting to try a run that uses all repeats
# *except for simple* as lineage-specific too sometime.
ssh kkstore01
# Set up rn4/linSpecRep.notInNonMammal
mkdir /cluster/bluearc/rn4/linSpecRep.notInNonMammal
cd /cluster/data/rn4
foreach f (?{,?}/chr*.fa.out)
echo $f:t:r:r
cp -p $f /cluster/bluearc/rn4/linSpecRep.notInNonMammal/$f:t:r:r.out.spec
end
# Rename tables from previous run; remove download dir
ssh hgwdev
foreach t (netDanRer3 \
`hgsql rn4 -N -e 'show tables like "%chainDanRer3%"'`)
set newT = `echo $t | sed -e 's/Rer3/Rer3NoLSR/'`
hgsql rn4 -e 'rename table '$t' to '$newT
end
rm -r /usr/local/apache/htdocs/goldenPath/rn4/vsDanRer3
ssh pk
mkdir /cluster/data/rn4/bed/blastz.danRer3.2006-03-03
cd /cluster/data/rn4/bed/blastz.danRer3.2006-03-03
cat << '_EOF_' > DEF
# rat vs zebrafish
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Rat Rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_SMSK=/cluster/bluearc/rn4/linSpecRep.notInNonMammal
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer3)
# large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes
SEQ2_CHUNK=100000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.danRer3.2006-03-03
'_EOF_'
# << for emacs
doBlastzChainNet.pl DEF -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/blastzRn4DanRer3Out >& do.log &
tail -f do.log
ln -s blastz.danRer3.2006-03-03 /cluster/data/rn4/bed/blastz.danRer3
# COMPARE danRer3 runs, without vs. with LSR (DONE 3/13/06 angie)
ssh kolossus
cd /cluster/data/rn4/bed
# have to use redirect because vprintf doesn't like being called
# twice on x86_64... should fix that sometime.
netStats /dev/null blastz.danRer3.2006-03-02/axtChain/rn4.danRer3.net.gz \
> blastz.danRer3.2006-03-02/axtChain/netStats.out
netStats /dev/null blastz.danRer3.2006-03-03/axtChain/rn4.danRer3.net.gz \
> blastz.danRer3.2006-03-03/axtChain/netStats.out
sdiff blastz.danRer3.2006-03-0[23]/axtChain/netStats.out
# A few points of comparison from that:
# Without: With:
# max depth: 11 7
# gap count: 618046 487550
# gap average size T: 809.1 1024.5
# gap average size Q: 529.2 688.0
# fill count: 60347 58415
# top count: 38000 37787
# top percent of total: 84.9% 87.5%
# nonSyn percent of total: 11.1% 9.6%
# syn percent of total: 2.5% 1.8%
# Coverage:
netToBed blastz.danRer3.2006-03-02/axtChain/{rn4.danRer3.net.gz,net.bed}
netToBed blastz.danRer3.2006-03-03/axtChain/{rn4.danRer3.net.gz,net.bed}
featureBits rn4 blastz.danRer3.2006-03-02/axtChain/net.bed
#513894027 bases of 2571531505 (19.984%) in intersection
featureBits rn4 blastz.danRer3.2006-03-03/axtChain/net.bed
#509076601 bases of 2571531505 (19.797%) in intersection
# OK, so coverage drops with LSR. Maybe some repeats are being snipped
# that could be aligned (ancestral)?
# Eyeball the one place where level == 11 in the 02 set
# ("select * from netDanRer3NoLSR where level >= 11;" on hgwdev):
# chr3 | 97071076 | 97071655, zoom out 1.5
# NoLSR has a lot more chains... more levels filled.
# with LSR gets a "Level 2" chain that NoLSR does not get at all.
# Who's to say which is right??? NoLSR looks messier to me but that
# is a very subjective judgment.
# Interestingly, there are no repeats on rn4 in that region.
# Panning to the right a ways, just long gaps for NoLSR but a few
# blocks for with. NoLSR gets some sizable alignments where with
# does not... a stretch of simple/low-complexity repeats there.
# Those should probably be excluded when we are using all repeats
# as lin-spec! Hopping over to a danRer3 region aligned only by
# NoLSR I see some more low-complexity. Should run that by Arian.
# Could do yet another run with all repeats except simple/low-complexity
# (what is DNA? maybe that too?) as linSpec.
# Panning to the right some more, a long dry spell for with LSR.
# There are olfactory receptor genes around which could indicate a
# tougher job than usual. Not much agreement between the nets though,
# yikes. High precision, but at what accuracy?
# I just worry about the reliability of what comes out of the pipeline...
# so sensitive to parameters.
# chr3:97,345,533-97,360,012 there is a withLSR chain but the net is empty.
# Since the coverage is higher without LSR, go with that for now.
# BLASTZ/CHAIN/NET XENTRO1 (DONE 3/20/06 angie)
ssh kkstore01
mkdir /cluster/data/rn4/bed/blastz.xenTro1.2006-03-20
cd /cluster/data/rn4/bed/blastz.xenTro1.2006-03-20
cat << '_EOF_' > DEF
# rat vs. frog
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
# Mammal to non-mammal, with threshold of 8000 as in mm8-xenTro1
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Rat rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Frog xenTro1 - single chunk big enough to run two of the
# largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit
SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/rn4/bed/blastz.xenTro1.2006-03-20
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/rn4XenTro1 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
>& do.log & tail -f do.log
ln -s blastz.xenTro1.2006-03-20 /cluster/data/rn4/bed/blastz.xenTro1
# BLASTZ/CHAIN/NET RHEMAC2 (DONE 3/23/06 angie)
# Won't put this in Conservation -- special request for ancestor recon.
ssh pk
mkdir /cluster/data/rn4/bed/blastz.rheMac2.2006-03-22
cd /cluster/data/rn4/bed/blastz.rheMac2.2006-03-22
cat << '_EOF_' > DEF
# Rat vs. macaque
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Rat (rn4)
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Macaque (rheMac2)
SEQ2_DIR=/san/sanvol1/scratch/rheMac2/nib
SEQ2_SMSK=/cluster/bluearc/rheMac2/linSpecRep/notInRodent
SEQ2_LEN=/cluster/data/rheMac2/chrom.sizes
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.rheMac2.2006-03-22
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot /san/sanvol1/rn4RheMac2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium DEF \
>& do.log & tail -f do.log
ln -s blastz.rheMac2.2006-03-22 /cluster/data/rn4/bed/blastz.rheMac2
#######################################################################
# BLASTZ/CHAIN/NET MONDOM4 (DONE 3/24/06 angie)
ssh pk
mkdir /cluster/data/rn4/bed/blastz.monDom4.2006-03-23
cd /cluster/data/rn4/bed/blastz.monDom4.2006-03-23
cat << '_EOF_' > DEF
# Rat vs. opossum
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
# Mammal to non-mammal -- well, I guess non-placental-mammal, with
# high L like chicken, and low M to guard against pileups w/monDom4:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=20
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Rat (rn4)
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
SEQ2_DIR=/cluster/bluearc/scratch/hg/monDom4/monDom4.2bit
SEQ2_LEN=/cluster/bluearc/scratch/hg/monDom4/chrom.sizes
SEQ2_CHUNK=50000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.monDom4.2006-03-23
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot /san/sanvol1/rn4MonDom4 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
-workhorse kkr7u00 >& do.log & tail -f do.log
ln -s blastz.monDom4.2006-03-23 /cluster/data/rn4/bed/blastz.monDom4
ssh kolossus
cd /cluster/data/rn4/bed/blastz.monDom4
time nice -n +19 featureBits rn4 chainMonDom4Link \
> fb.rn4.chainMonDom4Link 2>&1
cat fb.rn4.chainMonDom4Link
# 200166664 bases of 2571531505 (7.784%) in intersection
#######################################################################
# BLASTZ/CHAIN/NET BOSTAU2 (DONE 3/24/06 angie)
ssh pk
mkdir /cluster/data/rn4/bed/blastz.bosTau2.2006-03-23
cd /cluster/data/rn4/bed/blastz.bosTau2.2006-03-23
cat << '_EOF_' > DEF
# rat vs cow
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
# TARGET: Rat (rn4)
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Cow (bosTau2)
# large enough chunk to do chroms in one piece
SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit
SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes
SEQ2_CHUNK=150000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.bosTau2.2006-03-23
'_EOF_'
# << emacs
doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=pk -workhorse=kkr8u00 DEF \
>& do.log & tail -f do.log
ln -s blastz.bosTau2.2006-03-23 /cluster/data/rn4/bed/blastz.bosTau2
# SELF CHAINS (DONE - NOT LOADED INTO DB 3/27/06 angie)
# Won't put this in Conservation -- special request for ancestor recon.
ssh pk
mkdir /cluster/data/rn4/bed/blastz.rn4.2006-03-24
cd /cluster/data/rn4/bed/blastz.rn4.2006-03-24
cat << '_EOF_' > DEF
# rat vs rat
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=200
# TARGET: rat rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: rat rn4
SEQ2_DIR=/scratch/hg/rn4/nib
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.rn4.2006-03-24
'_EOF_'
# << emacs
doBlastzChainNet.pl -chainMinScore=10000 -chainLinearGap=medium \
-bigClusterHub=pk -workhorse=kkr7u00 DEF \
>& do.log & tail -f do.log
# It died during loading because hgwdev /var/lib/mysql ran out of room!
# There is enough space now but I deleted the tables anyway -- can load
# later if anyone wants to browse. Did -continue download so that the
# ancestor-reconstruction folks who requested this can find the chains
# in the usual place.
ln -s blastz.rn4.2006-03-24 /cluster/data/rn4/bed/blastz.rn4
# BLASTZ/CHAIN/NET PANTRO2 (DONE 3/27/06 angie)
# Won't put this in Conservation -- special request for ancestor recon.
ssh pk
mkdir /cluster/data/rn4/bed/blastz.panTro2.2006-03-27
cd /cluster/data/rn4/bed/blastz.panTro2.2006-03-27
cat << '_EOF_' > DEF
# Rat vs. chimp
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Rat (rn4)
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chimp (panTro2)
SEQ2_DIR=/scratch/hg/panTro2/nib
SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent
SEQ2_LEN=/cluster/data/panTro2/chrom.sizes
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.panTro2.2006-03-27
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot /san/sanvol1/rn4PanTro2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium DEF \
-workhorse=kkr5u00 >& do.log & tail -f do.log
ln -s blastz.panTro2.2006-03-27 /cluster/data/rn4/bed/blastz.panTro2
############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram)
ssh kk
cd /cluster/data/mm8/bed/blastzRn4.2006-02-16
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits rn4 chainMm8Link
# 1791093685 bases of 2571531505 (69.651%) in intersection
# BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED 2006-03-13, Fan)
# This should be done after KG tables are complete from known genes build
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/rn4/bed/geneSorter.2006-03-13
# remove old symbolic link
rm /cluster/data/rn4/bed/geneSorter
ln -s /cluster/data/rn4/bed/geneSorter.2006-03-13 /cluster/data/rn4/bed/geneSorter
cd /cluster/data/rn4/bed/geneSorter
hgClusterGenes rn4 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/rn4/bed/geneSorter/blastp
ln -s /cluster/data/rn4/bed/geneSorter/blastp /cluster/data/rn4/bed/blastp
cd /cluster/data/rn4/bed/geneSorter/blastp
pepPredToFa rn4 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
#TODO: swap rn4 to: xenTro1
# MULTIZ9WAY (DONE 3/30/06 angie)
# (((((((rn4 mm8) hg18) (canFam2 bosTau2)) monDom4) galGal2) xenTro1) danRer3)
ssh kkstore01
mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30
cd /cluster/data/rn4/bed/multiz9way.2006-03-30
# copy MAF's to cluster-friendly server
# These MAF's already on bluearc:
# canFam2, fr1, galGal2, panTro1, rn4
mkdir /san/sanvol1/scratch/rn4/mafNet
foreach s (mm8 hg18 canFam2 bosTau2 monDom4 galGal2 xenTro1 danRer3)
echo $s
rsync -av /cluster/data/rn4/bed/blastz.$s/mafNet/* \
/san/sanvol1/scratch/rn4/mafNet/$s/
end
# Prune the hg17 17way tree to just these 9 and update db names:
/cluster/bin/phast/tree_doctor \
--prune-all-but=rat_rn3,mouse_mm7,human_hg17,dog_canFam2,cow_bosTau2,monodelphis_monDom2,chicken_galGal2,xenopus_xenTro1,zebrafish_danRer3 \
--rename="rat_rn3 -> rat_rn4 ; mouse_mm7 -> mouse_mm8 ; human_hg17 -> human_hg18 ; monodelphis_monDom2 -> monodelphis_monDom4" \
/cluster/data/hg17/bed/multiz17way/17way.nh > 9way.nh
# *carefully* edit 9way.nh to move hg18 to appear after (rat mouse),
# save as 9way_ratFirst.nh, and make a tree picture:
/cluster/bin/phast/draw_tree 9way_ratFirst.nh > 9way.ps
ps2pdf 9way.ps > 9way.pdf
/cluster/bin/phast/all_dists 9way_ratFirst.nh > 9way.distances.txt
grep rn4 9way.distances.txt | sort -k3,3n | \
awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
cat distances.txt
#0.1587 mouse_mm8
#0.4724 human_hg18
#0.6277 dog_canFam2
#0.6392 cow_bosTau2
#1.0745 monodelphis_monDom4
#1.3472 chicken_galGal2
#1.7983 xenopus_xenTro1
#2.1105 zebrafish_danRer3
# the order in the browser display will be by tree topology,
# not by distance, but in this case (unlike human) they happen to match.
# create species list and stripped down tree for autoMZ
sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 9way_ratFirst.nh \
> tree-commas.nh
sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst
ssh pk
cd /cluster/data/rn4/bed/multiz9way.2006-03-30
mkdir maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = rn4
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/mafNet
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/rn4/bed/multiz9way.2006-03-30/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << emacs
awk '{print $1}' /cluster/data/rn4/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para make jobList
para time
#Completed: 45 of 45 jobs
#CPU time in finished jobs: 60564s 1009.39m 16.82h 0.70d 0.002 y
#IO & Wait Time: 2996s 49.94m 0.83h 0.03d 0.000 y
#Average job time: 1412s 23.54m 0.39h 0.02d
#Longest finished job: 6172s 102.87m 1.71h 0.07d
#Submission to last job: 6173s 102.88m 1.71h 0.07d
# Make .jpg for tree and install in htdocs/images/phylo/... don't forget
# to request a push of that file. The treeImage setting in trackDb.ra
# is phylo/rn4_9way.jpg (relative to htdocs/images).
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way.2006-03-30
pstopnm -stdout 9way.ps | pnmtojpeg > rn4_9way.jpg
ln -s /cluster/data/rn4/bed/multiz9way.2006-03-30/rn4_9way.jpg \
/usr/local/apache/htdocs/images/phylo/rn4_9way.jpg
# ANNOTATE MULTIZ9WAY MAF AND LOAD TABLES (DONE 4/4/2006 angie)
ssh kolossus
mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/anno
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/anno
mkdir maf run
cd run
rm -f sizes nBeds
foreach db (`cat /cluster/data/rn4/bed/multiz9way.2006-03-30/species.lst`)
ln -s /cluster/data/$db/chrom.sizes $db.len
if (! -e /cluster/data/$db/$db.N.bed) then
twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
endif
ln -s /cluster/data/$db/$db.N.bed $db.bed
echo $db.bed >> nBeds
echo $db.len >> sizes
end
echo date > jobs.csh
# do smaller jobs first:
foreach f (`ls -1rS ../../maf/*.maf`)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
/cluster/data/rn4/rn4.2bit ../maf/`basename $f` >> jobs.csh
echo "echo $f" >> jobs.csh
end
echo date >> jobs.csh
csh -efx jobs.csh >&! jobs.log & tail -f jobs.log
# 1.5 hrs.
# Load anno/maf
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/anno/maf
mkdir -p /gbdb/rn4/multiz9way/anno/maf
ln -s /cluster/data/rn4/bed/multiz9way.2006-03-30/anno/maf/*.maf \
/gbdb/rn4/multiz9way/anno/maf
cat > loadMaf.csh << 'EOF'
date
nice hgLoadMaf -pathPrefix=/gbdb/rn4/multiz9way/anno/maf rn4 multiz9way
date
'EOF'
# << emacs
csh -efx loadMaf.csh >&! loadMaf.log & tail -f loadMaf.log
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kkr7u00
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/anno/maf
cat *.maf \
| nice hgLoadMafSummary rn4 -minSize=30000 -mergeGap=1500 -maxSize=200000 \
-test multiz9waySummary stdin
#Created 1408247 summary blocks from 22055250 components and 8577575 mafs from stdin
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/anno/maf
sed -e 's/mafSummary/multiz9waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
> /tmp/multiz9waySummary.sql
time nice hgLoadSqlTab rn4 multiz9waySummary /tmp/multiz9waySummary.sql \
multiz9waySummary.tab
#0.000u 0.000s 4:04.62 0.0% 0+0k 0+0io 217pf+0w
rm *.tab
ln -s multiz9way.2006-03-30 /cluster/data/rn4/bed/multiz9way
# MULTIZ9WAY DOWNLOADABLES (ANNOTATED) (DONE 6/9/2006 angie)
# Annotated MAF is now documented, so use anno/maf for downloads.
ssh hgwdev
mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/mafDownloads
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/mafDownloads
# upstream mafs
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits rn4 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags rn4 multiz9way up.bed upstream$i.maf \
-orgs=../species.lst
rm up.bed
end
date
'EOF'
# << emacs
time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
#old hgwdev:
#75.280u 480.470s 17:07.43 54.0% 0+0k 0+0io 4640pf+0w
#new hgwdev:
#85.814u 171.368s 7:37.96 56.1% 0+0k 0+0io 0pf+0w
ssh kkstore04
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../anno/maf/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
# << emacs
time csh -efx downloads.csh >&! downloads.log
#1897.874u 25.317s 32:58.19 97.2% 0+0k 0+0io 2pf+0w
nice gzip up*.maf
nice md5sum up*.maf.gz >> md5sum.txt
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/rn4/multiz9way
mkdir $dir
ln -s /cluster/data/rn4/bed/multiz9way.2006-03-30/mafDownloads/{*.gz,md5sum.txt} $dir
cp /usr/local/apache/htdocs/goldenPath/mm7/multiz17way/README.txt $dir
# edit README.txt
# MULTIZ9WAY MAF FRAMES (DONE 4/5/2006 angie)
# rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
ssh hgwdev
mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/frames
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/frames
# The following is adapted from MarkD's Makefile used for mm7...
#------------------------------------------------------------------------
# get the genes for all genomes
# mRNAs with CDS. single select to get cds+psl, then split that up and
# create genePred
# using mrna table as genes: canFam2 bosTau2 danRer3
mkdir genes
foreach queryDb (canFam2 bosTau2 danRer3)
set tmpExt = `mktemp temp.XXXXXX`
set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
set tmpMrna = ${queryDb}.mrna.${tmpExt}
set tmpCds = ${queryDb}.cds.${tmpExt}
echo $queryDb
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
${queryDb} > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
stdout \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
# using knownGene for rn4 mm8 hg18
# using refGene for galGal2
# using mgcGenes for xenTro1
# no genes for monDom4
# genePreds; (must keep only the first 10 columns for knownGene)
foreach queryDb (rn4 mm8 hg18 galGal2 xenTro1)
if ($queryDb == "xenTro1") then
set geneTbl = mgcGenes
else if ($queryDb == "galGal2") then
set geneTbl = refGene
else
set geneTbl = knownGene
endif
hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f 1-10 \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
#------------------------------------------------------------------------
# create frames
set clusterDir = /cluster/bluearc/rn4/multiz9wayFrames
set multizDir = /cluster/data/rn4/bed/multiz9way.2006-03-30
set mafDir = $multizDir/mafDownloads
set geneDir = $multizDir/frames/genes
set clusterMafDir = ${clusterDir}/maf
set clusterGeneDir = ${clusterDir}/genes
set clusterFramesDir = ${clusterDir}/mafFrames.kki
# copy mafs to cluster storage
mkdir $clusterDir
ssh -x kkstore04 "rsync -av $mafDir/*.maf.gz $clusterMafDir/"
# copy genes to cluster storage
ssh -x kkstore04 "rsync -av $geneDir/*.gp.gz $clusterGeneDir/"
# run cluster jobs
set tmpExt = `mktemp temp.XXXXXX`
set paraDir = $multizDir/frames/para.${tmpExt}
mkdir mafFrames $paraDir
rm -f $paraDir/jobList
mkdir ${clusterFramesDir}
foreach queryDb (`cat /cluster/data/rn4/bed/multiz9way.2006-03-30/species.lst`)
mkdir ${clusterFramesDir}/${queryDb}
foreach c (`awk '{print $1;}' /cluster/data/rn4/chrom.sizes`)
if (-e ${clusterGeneDir}/${queryDb}.gp.gz) then
echo /cluster/bin/scripts/mkMafFrames.pl ${queryDb} rn4 \
${clusterGeneDir}/${queryDb}.gp.gz ${clusterMafDir}/$c.maf.gz \
${clusterFramesDir}/${queryDb}/$c.mafFrames \
>> $paraDir/jobList
endif
end
end
rm -f $tmpExt
ssh -x kki "cd ${paraDir} && para make jobList && para time"
#Completed: 360 of 360 jobs
#CPU time in finished jobs: 2027s 33.78m 0.56h 0.02d 0.000 y
#IO & Wait Time: 3816s 63.60m 1.06h 0.04d 0.000 y
#Average job time: 16s 0.27m 0.00h 0.00d
#Longest finished job: 124s 2.07m 0.03h 0.00d
#Submission to last job: 493s 8.22m 0.14h 0.01d
# combine results from cluster
foreach queryDb (`cat ../species.lst`)
echo $queryDb
ssh -x kolossus "cat ${clusterFramesDir}/${queryDb}/*.mafFrames | gzip -2c > ${multizDir}/frames/mafFrames/${queryDb}.mafFrames.gz"
end
#------------------------------------------------------------------------
# load the database
hgLoadMafFrames rn4 multiz9wayFrames mafFrames/*.mafFrames.gz
#------------------------------------------------------------------------
# clean up
rm -rf ${clusterDir}
# rebuild frames to get bug fix, using 1-pass maf methodology
# (2006-06-09 markd)
ssh kkstore04
cd /cluster/data/rn4/bed/multiz9way/frames
mv mafFrames/ mafFrames.old
nice tcsh # easy way to get process niced
(cat ../anno/maf/*.maf | time genePredToMafFrames rn4 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer3 genes/danRer3.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz rn4 genes/rn4.gp.gz xenTro1 genes/xenTro1.gp.gz | gzip >multiz9way.mafFrames.gz)>&log&
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way/frames
hgLoadMafFrames rn4 multiz9wayFrames multiz9way.mafFrames.gz >&log&
# PHASTCONS (DONE 4/2/2006 angie)
# Using Kate's process from makeHg17.doc.
# This process is distilled from Hiram and Adam's experiments
# on mouse (mm7) 17way track. Many parameters are now fixed, without
# being experimentally derived, either because the experiments
# were lengthy and produced similar results, or because they
# weren't runnable given the alignment size.
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliot Margulies' model
# from the (37-way) ENCODE alignments.
#
# NOTE: reusing cluster-friendly chrom fasta files created earlier
#cd /cluster/data/rn4
#foreach f (`cat chrom.lst`)
#echo $f
#cp $f/*.fa /cluster/bluearc/rn4/chrom
#end
# Split chromosome MAF's into windows and use to generate
# "sufficient statistics" (ss) files for phastCons input
# NOTE: as the SAN fs has lotsa space, we're leaving these
# big (temp) files unzipped, to save time during phastCons run.
# Note also the larger chunk sizes from previous runs -- this
# reduces run-time on the split, slows down the actual phastCons
# enough so jobs don't crash (jobs are very quick, just a minute
# or so), and according to Adam, will produce better results.
# The previous small chunks were probably required by
# the phyloFit step, which we are no longer using for the
# human alignments.
ssh pk
mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
/cluster/bin/phast/tree_doctor \
--prune-all-but=rn3,mm7,hg17,canFam2,bosTau2,monDom2,galGal2,xenTro1,danRer3 \
--rename="rn3 -> rn4 ; hg17 -> hg18 ; mm7 -> mm8 ; monDom2 -> monDom4" \
/san/sanvol1/scratch/mm7/cons/elliotsEncode.mod \
> elliotsEncodePruned.mod
mkdir run.split
cd run.split
set WINDOWS = /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/ss
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/rn4/bed/multiz9way.2006-03-30/maf
set WINDOWS = /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M /cluster/bluearc/rn4/chrom/$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
echo "Done" >> $c.done
'EOF'
# << emacs
chmod +x doSplit.csh
rm -f jobList
foreach f (../../maf/*.maf)
set c = $f:t:r
echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
end
para make jobList
para time
#Completed: 45 of 45 jobs
#CPU time in finished jobs: 2102s 35.03m 0.58h 0.02d 0.000 y
#IO & Wait Time: 8185s 136.42m 2.27h 0.09d 0.000 y
#Average job time: 229s 3.81m 0.06h 0.00d
#Longest finished job: 661s 11.02m 0.18h 0.01d
#Submission to last job: 661s 11.02m 0.18h 0.01d
# check tree model on 5MB chunk, using params recommended by Adam,
# (to verify branch lengths on 2X species)
ssh kolossus
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
--tree "`cat ../tree-commas.nh`" \
/san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/ss/chr7/chr7.119999202-129997313.ss \
-o phyloFit.tree
# Comment from makeHg17.doc:
# # he ok'ed the results -- not necessary for next human run
# TODO: maybe run these by Adam... the numbers in the RATE_MAT and
# TREE are different between the phyloFit and elliotsEncode models,
# not sure if the differences are significant. I will proceed with
# elliotsEncode as in makeHg17.doc.
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/run.cons
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/run.cons
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons
cp -p $san/ss/$c/$f.ss ../elliotsEncodePruned.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncodePruned.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$c $san/bed/$c
sleep 1
mv $tmp/$f.pp $san/pp/$c
mv $tmp/$f.bed $san/bed/$c
rm -fr $tmp
'EOF'
# << emacs
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 14 .007 .27
#ENDLOOP
'EOF'
# << emacs
# Create parasol batch and run it
ssh pk
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/run.cons
pushd /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons
ls -1S ss/chr*/chr*.ss \
| sed 's/.ss$//' \
> /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/run.cons/in.list
popd
gensub2 in.list single template jobList
para make jobList
para time
#Completed: 314 of 313 jobs (sic -- likely a transcription typo for 313 of 313)
#CPU time in finished jobs: 9623s 160.38m 2.67h 0.11d 0.000 y
#IO & Wait Time: 12815s 213.59m 3.56h 0.15d 0.000 y
#Average job time: 72s 1.19m 0.02h 0.00d
#Longest finished job: 108s 1.80m 0.03h 0.00d
#Submission to last job: 148s 2.47m 0.04h 0.00d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir
find ./bed -name "chr*.bed" \
| sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \
| sort -k7,7 -k9,9n \
| sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \
| xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin > phastConsElements9way.bed
cp -p phastConsElements9way.bed /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
# Measure coverage. If good, load elements into database and proceed with wiggle.
# Try for 5% overall cov, and 70% CDS cov.
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
featureBits rn4 -enrichment refGene:cds phastConsElements9way.bed
# FIRST ITERATION: doPhast (len cov rho) = (14 .008 .28)
#refGene:cds 0.499%, phastConsElements9way.bed 5.101%, both 0.359%, cover 71.97%, enrich 14.11x
mv phastConsElements9way.bed phastConsElements9way_14_008_28.bed
# SECOND ITERATION: doPhast (len cov rho) = (13 .007 .27)
#refGene:cds 0.499%, phastConsElements9way.bed 4.866%, both 0.355%, cover 71.07%, enrich 14.60x
mv phastConsElements9way.bed phastConsElements9way_13_007_27.bed
# THIRD ITERATION: doPhast (len cov rho) = (14 .007 .27)
#refGene:cds 0.499%, phastConsElements9way.bed 4.982%, both 0.356%, cover 71.32%, enrich 14.31x
mv phastConsElements9way.bed phastConsElements9way_14_007_27.bed
# FOURTH ITERATION: doPhast (len cov rho) = (14 .007 .28)
#refGene:cds 0.499%, phastConsElements9way.bed 5.073%, both 0.359%, cover 71.96%, enrich 14.19x
mv phastConsElements9way.bed phastConsElements9way_14_007_28.bed
# FIFTH ITERATION: doPhast (len cov rho) = (14 .006 .28)
#refGene:cds 0.499%, phastConsElements9way.bed 5.042%, both 0.359%, cover 71.95%, enrich 14.27x
mv phastConsElements9way.bed phastConsElements9way_14_006_28.bed
# Wow, the CDS coverage doesn't move much. Go with 14 .007 .27.
# When happy:
hgLoadBed -strict rn4 phastConsElements9way phastConsElements9way.bed
# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
# (NOTE: kolossus was actually used here, per the ssh below)
ssh kolossus
cd /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
#next time try Angie's simpler sort, below
time find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastCons9way.wig phastCons9way.wib
# about 23 minutes for above
cp -p phastCons9way.wi? /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
ln -s `pwd`/phastCons9way.wib /gbdb/rn4/multiz9way/phastCons9way.wib
hgLoadWiggle -pathPrefix=/gbdb/rn4/multiz9way rn4 \
phastCons9way phastCons9way.wig
# ~ 3 minute load
# PHASTCONS SCORES DOWNLOADABLES FOR 9WAY (DONE 4/3/2006 angie)
ssh kolossus
cd /cluster/data/rn4/bed/multiz9way.2006-03-30
mkdir phastConsDownloads
cd phastConsDownloads
cat > downloads.csh << 'EOF'
date
cd /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/pp
foreach chr (`awk '{print $1}' /cluster/data/rn4/chrom.sizes`)
echo $chr
cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> /cluster/data/rn4/bed/multiz9way.2006-03-30/phastConsDownloads/$chr.gz
end
date
'EOF'
# << emacs
csh -efx downloads.csh >&! downloads.log & tail -f downloads.log
# ~20min
md5sum *.gz > md5sum.txt
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastConsDownloads
set dir = /usr/local/apache/htdocs/goldenPath/rn4/phastCons9way
mkdir $dir
ln -s /cluster/data/rn4/bed/multiz9way.2006-03-30/phastConsDownloads/{*.gz,md5sum.txt} $dir
cp /usr/local/apache/htdocs/goldenPath/mm7/phastCons17Scores/README.txt $dir
# edit README.txt
# Clean up after phastCons run.
ssh kkstore04
rm /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/*.tab
rm -r /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons
# MIRNA (DONE 4/10/06 angie)
ssh hgwdev
mkdir /cluster/data/rn4/bed/miRNA
cd /cluster/data/rn4/bed/miRNA
wget ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/8.0/database_files/tables.sql
foreach t (mirna mirna_species mirna_mature mirna_pre_mature \
mirna_chromosome_build)
wget ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/8.0/database_files/$t.dat.gz
end
nice gunzip *.gz
hgsql '' -e 'create database miRNAtmp'
hgsql miRNAtmp < tables.sql
foreach t (mirna mirna_species mirna_mature mirna_pre_mature \
mirna_chromosome_build)
hgsql miRNAtmp -e 'load data local infile "'$t.dat'" into table '$t
end
set species = `hgsql miRNAtmp -N -e 'select auto_id from mirna_species where name = "Rattus norvegicus"'`
hgsql miRNAtmp -e "create table mirna_rat select * from mirna where auto_species = $species"
hgsql miRNAtmp -N -e "select count(*) from mirna_rat"
#| 234 |
# This is a tiny database so this query takes almost no time at all:
time nice hgsql miRNAtmp -N -e \
"select mirna_chromosome_build.xsome, \
mirna_chromosome_build.contig_start - 1, \
mirna_chromosome_build.contig_end, \
mirna_rat.mirna_id, 0, \
mirna_chromosome_build.strand, \
(mirna_chromosome_build.contig_start + mirna_mature.mature_from - 2), \
(mirna_chromosome_build.contig_start + mirna_mature.mature_to - 1) \
from mirna_rat, mirna_chromosome_build, mirna_mature, mirna_pre_mature \
where mirna_rat.auto_mirna = mirna_chromosome_build.auto_mirna and \
mirna_rat.auto_mirna = mirna_pre_mature.auto_mirna and \
mirna_pre_mature.auto_mature = mirna_mature.auto_mature" \
> mirna.almostBed
#0.000u 0.010s 0:00.03 33.3% 0+0k 0+0io 211pf+0w
awk '{$5 = 960;} $6 == "-" {$5 = 480;} {$1 = "chr" $1; print;}' \
mirna.almostBed > miRNA.bed
hgLoadBed rn4 miRNA miRNA.bed
hgsql miRNAtmp -e 'drop database miRNAtmp'
# AFFYRAE230 (DONE 4/11/06 angie)
# Rachel already set up consensus sequences in
# /iscratch/i/affy/RAE230_all.fa - see makeRn3.doc.
# Align consensus sequences to genome.
ssh kk
mkdir /cluster/data/rn4/bed/affyRat.2006-04-11
cd /cluster/data/rn4/bed/affyRat.2006-04-11
ls -1 /iscratch/i/affy/RAE230_all.fa > affy.lst
ls -1S /panasas/store/rn4/ctgFa/* > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/rn4/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para make para.spec
para time
#Completed: 591 of 591 jobs
#CPU time in finished jobs: 10704s 178.40m 2.97h 0.12d 0.000 y
#IO & Wait Time: 9584s 159.73m 2.66h 0.11d 0.000 y
#Average job time: 34s 0.57m 0.01h 0.00d
#Longest finished job: 83s 1.38m 0.02h 0.00d
#Submission to last job: 270s 4.50m 0.07h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyRAE230.psl
pslSort dirs raw.psl tmp psl
# only use alignments that cover 30% of sequence and have at least
# 95% identity in aligned region. minAli = 0.97 too high.
# low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl \
/dev/null
liftUp affyRAE230.psl ../../jkStuff/liftAll.lft warn contig.psl
# shorten names in psl file
sed -e 's/RAE230//' affyRAE230.psl > affyRAE230.psl.bak
mv affyRAE230.psl.bak affyRAE230.psl
ssh hgwdev
cd /cluster/data/rn4/bed/affyRat.2006-04-11
# Rachel already set up /gbdb/hgFixed/affyProbes/RAE230_all.fa -
# see makeRn3.doc.
hgLoadSeq -abbr=RAE230 rn4 /gbdb/hgFixed/affyProbes/RAE230_all.fa
# load track into database
hgLoadPsl rn4 affyRAE230.psl
# Load knownToRAE230:
hgMapToGene rn4 affyRAE230 knownGene knownToRAE230
# Clean up
rm -r psl tmp err batch.bak contig.psl raw.psl seq.tab
# AFFYRGU34A (DONE 4/11/06 angie)
# RG-U34A is the chip used for the GNF Atlas 2 expression data for Rat
# Rachel already set up consensus sequences in
# /iscratch/i/affy/RG-U34A_all.fa -see makeRn3.doc.
# Align consensus sequences to genome.
ssh kk
mkdir /cluster/data/rn4/bed/affyU34A
cd /cluster/data/rn4/bed/affyU34A
ls -1 /iscratch/i/affy/RG-U34A_all.fa > affy.lst
ls -1 /panasas/store/rn4/ctgFa/* > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/rn4/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para make para.spec
#Completed: 591 of 591 jobs
#CPU time in finished jobs: 4911s 81.85m 1.36h 0.06d 0.000 y
#IO & Wait Time: 10019s 166.99m 2.78h 0.12d 0.000 y
#Average job time: 25s 0.42m 0.01h 0.00d
#Longest finished job: 64s 1.07m 0.02h 0.00d
#Submission to last job: 388s 6.47m 0.11h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU34A.psl
pslSort dirs raw.psl tmp psl
# only use alignments that cover 30% of sequence and have at least
# 95% identity in aligned region. minAli = 0.97 too high.
# low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl \
/dev/null
liftUp affyU34A.psl ../../jkStuff/liftAll.lft warn contig.psl
# shorten names in psl file
sed -e 's/RG-U34A://' affyU34A.psl > affyU34A.psl.bak
mv affyU34A.psl.bak affyU34A.psl
ssh hgwdev
cd /cluster/data/rn4/bed/affyU34A
# Rachel already set up /gbdb/hgFixed/affyProbes/RG-U34A_all.fa -
# see makeRn3.doc.
hgLoadSeq -abbr=RG-U34A: rn4 /gbdb/hgFixed/affyProbes/RG-U34A_all.fa
# load track into database
hgLoadPsl rn4 affyU34A.psl
# create and load knownToU34A
hgMapToGene rn4 affyU34A knownGene knownToU34A
# Clean up
rm -r psl tmp err batch.bak contig.psl raw.psl seq.tab
# MERGE GNF ATLAS 2 EXPRESSION DATA AND ALIGNMENT (DONE 4/13/06 angie)
ssh hgwdev
cd /cluster/data/rn4/bed/affyU34A
hgMapMicroarray gnfRatAtlas2.bed hgFixed.gnfRatAtlas2MedianRatio \
affyU34A.psl
#Loaded 8022 rows of expression data from hgFixed.gnfRatAtlas2MedianRatio
#Mapped 7447, multiply-mapped 216, missed 695, unmapped 575
hgLoadBed rn4 gnfAtlas2 gnfRatAtlas2.bed
rm bed.tab
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene rn4 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
hgExpDistance rn4 hgFixed.gnfRatAtlas2MedianRatio \
hgFixed.gnfRatAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2
# HGNEAR PROTEIN BLAST TABLES -- TARGET (DONE 4/12/06 angie)
ssh hgwdev
mkdir /cluster/data/rn4/bed/hgNearBlastp
cd /cluster/data/rn4/bed/hgNearBlastp
cat << _EOF_ > config.ra
# Latest rat vs. other Gene Sorter orgs:
# human, mouse, zebrafish, fly, worm, yeast
targetGenesetPrefix known
targetDb rn4
queryDbs hg18 mm8 danRer3 dm2 ce2 sacCer1
rn4Fa /cluster/data/rn4/bed/blastp/known.faa
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
ce2Fa /cluster/data/ce2/bed/blastp/wormPep151.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
buildDir /cluster/data/rn4/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/rn4HgNearBlastp
_EOF_
# Do -targetOnly until rn4 has been released -- then it will be OK to
# update rnBlastTab in the other species.
doHgNearBlastp.pl -targetOnly config.ra >& do.log & tail -f do.log
# *** Check these tables in rn4:
# *** knownBaseBlastTab hgBlastTab mmBlastTab drBlastTab dmBlastTab ceBlastTab scBlastTab
# HGNEAR PROTEIN BLAST TABLES -- QUERIES (DONE 6/26/06 angie)
ssh hgwdev
cd /cluster/data/rn4/bed/hgNearBlastp
# Do -queryOnly after rn4 has been released:
doHgNearBlastp.pl -queryOnly config.ra >& do.queries.log &
tail -f do.queries.log
# *** Check rnBlastTab in these databases:
# *** hg18 mm8 danRer3 dm2 ce2 sacCer1
# MORE HGNEAR TABLES (DONE 4/13/06 angie)
ssh hgwdev
cd /tmp
# Create table that maps between known genes and RefSeq
hgMapToGene rn4 refGene knownGene knownToRefSeq
# Create knownToEnsembl column -- redone 6/13/06 after ensGene update
hgMapToGene rn4 ensGene knownGene knownToEnsembl
# MGI?
# Recomb rate - no doc, but ls -l /cluster/bin/scripts/*ecomb* and see
# ftp://ftp.ncbi.nih.gov/genomes/R_norvegicus/Mapping_data/
# SNPS - TBD by Heather's new process.
#TODO after SNPS:
# Make knownToCdsSnp column.
hgMapToGene rn4 snpMap knownGene knownToCdsSnp -all -cds
# BLASTZ/CHAIN/NET XENTRO2 (DONE 4/19/06 angie)
ssh kkstore04
mkdir /cluster/data/rn4/bed/blastz.xenTro2.2006-04-19
cd /cluster/data/rn4/bed/blastz.xenTro2.2006-04-19
cat << '_EOF_' > DEF
# rat vs. frog
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
# Mammal to non-mammal, with threshold of 8000 as in mm8-xenTro2
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Rat rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Frog xenTro2 - single chunk big enough to run two of the
# largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/rn4/bed/blastz.xenTro2.2006-04-19
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/rn4XenTro2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
>& do.log & tail -f do.log
ln -s blastz.xenTro2.2006-04-19 /cluster/data/rn4/bed/blastz.xenTro2
#########################################################################
# BLASTZ CHICKEN galGal3 (DONE 5/25/06 angie)
ssh pk
mkdir /cluster/data/rn4/bed/blastz.galGal3.2006-05-24
cd /cluster/data/rn4/bed/blastz.galGal3.2006-05-24
cat << '_EOF_' > DEF
# rat vs chicken
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Rat rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInNonMammal
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/san/sanvol1/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.galGal3.2006-05-24
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/gg3vsrn4 \
-bigClusterHub=pk -smallClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose \
>& do.log & tail -f do.log
ln -s blastz.galGal3.2006-05-24 /cluster/data/rn4/bed/blastz.galGal3
#########################################################################
###### ADD gdbPdb ENTRY FOR rn4 INTO THE CENTRAL DB. (DONE, 6/1/06, Fan)
mysql -u hgcat -p$HGPSWD -h genome-testdb -A hgcentraltest
delete from gdbPdb where genomeDb='rn4';
insert into gdbPdb values('rn4', 'proteins060115');
###########################################################
#BUILD KEGG PATHWAY TABLES. DONE 6/12/06. Fan.
ssh hgwdev
cd /cluster/store11/kg/kgRn4A
mkdir kegg
cd kegg
~/src/hg/protein/KGpath.sh kgRn4A rn4 060115
# This script exited early due to SOAP module of perl seems missing
# So go to hgwdevold and manually ran the following:
~/src/hg/protein/getKeggList2.pl rno > keggList.tab
hgsql rn4 < ~/kent/src/hg/lib/keggList.sql
hgsql -e 'LOAD DATA local INFILE "keggList.tab" into table keggList;' rn4
# go back to hgwdev
hgKegg3 rn4 rn4
hgsql rn4 -e "drop table keggMapDesc"
hgsql rn4 -e "drop table keggPathway"
hgsql rn4 <~/src/hg/lib/keggMapDesc.sql
hgsql rn4 <~/src/hg/lib/keggPathway.sql
hgsql rn4 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
hgsql rn4 -e 'load data local infile "keggPathway.tab" into table keggPathway'
#########################################################################
# BUILD SOME KNOWNTO... TABLES, DONE 6/12/06 Fan.
cd /cluster/data/rn4/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene rn4 refGene knownGene knownToRefSeq
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" rn4 > refToLl.txt
hgMapToGene rn4 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt rn4 knownGene name proteinID Pfam knownToPfam
hgsql -e "select count(*) from knownToPfam;" rn4
#########################################################################
# BLASTZ/CHAIN/NET DANRER4 - NO LINSPECREP (DONE, 2006-06-19, hartera)
ssh pk
mkdir /cluster/data/rn4/bed/blastz.danRer4.2006-06-19
cd /cluster/data/rn4/bed/blastz.danRer4.2006-06-19
# There are no lineage-specific repeats for these two species. All repeats
# were not used as this was found to lower coverage with rn4 and danRer3
# so use dynamic masking instead (M parameter for Blastz).
cat << '_EOF_' > DEF
# rat (rn4) vs zebrafish (danRer4)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=0
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Rat Rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer4)
# large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer4/nib
SEQ2_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
SEQ2_CHUNK=250000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.danRer4.2006-06-19
'_EOF_'
# << for emacs
chmod +x DEF
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF >& do.log &
tail -f do.log
# Took 2 hours and 50 minutes to run.
ln -s blastz.danRer4.2006-06-19 /cluster/data/rn4/bed/blastz.danRer4
# Do swap to create danRer4 vs rn4:
ssh pk
cd /cluster/data/rn4/bed/blastz.danRer4.2006-06-19
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& swap.log &
# Took about 10 minutes to run.
rm *.log
# Check README.txt is correct for alignment downloads.
featureBits rn4 chainDanRer4Link
# 67765280 bases of 2571531505 (2.635%) in intersection
featureBits rn4 refGene:cds chainDanRer4Link -chrom=chr1 -enrichment
# refGene:cds 0.731%, chainDanRer4Link 3.337%, both 0.518%,
# cover 70.86%, enrich 21.24x
featureBits rn4 refGene:cds chainDanRer3Link -chrom=chr1 -enrichment
# refGene:cds 0.731%, chainDanRer3Link 2.699%, both 0.475%,
# cover 64.89%, enrich 24.04x
featureBits rn4 refGene:cds netDanRer4 -chrom=chr1 -enrichment
# refGene:cds 0.731%, netDanRer4 29.999%, both 0.575%, cover 78.65%,
# enrich 2.62x
featureBits rn4 refGene:cds netDanRer3 -chrom=chr1 -enrichment
# refGene:cds 0.731%, netDanRer3 25.934%, both 0.530%, cover 72.52%,
# enrich 2.80x
#########################################################################
#### REBUILD kgProtMap TABLE (DONE. 6/19/06 Fan)
# Existing kgProtMap table only had 33 entries.
ssh kolossus
cd /cluster/store9/rn4/bed/kgRn4A
~/kent/src/hg/protein/kgProtMap2.sh kgRn4A rn4 060115
cd kgProtMap
hgLoadPsl -tNameIx rn4 psl.tmp/kgProtMap.psl
########################################################################
#### BUILD SUPERFAMILY RELATED TABLES (DONE - 2006-06-19 - Fan)
# Download latest Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk
mkdir /cluster/store10/superfamily/060619
ln -s /cluster/store10/superfamily/060619 /cluster/data/superfamily/060619
cd /cluster/data/superfamily/060619
# ftp over the following two files:
ass_18-Jun-2006.tab.gz
supfam_18-Jun-2006.sql.gz
gzip -d *.gz
# Load the Superfamily database
hgsql rn4 -e "create database superfam060619"
nice hgsql superfam060619 < supfam_18-Jun-2006.sql &
# This may take about an hour.
# Make sure to add an index on id of the des table of superfam060619.
hgsql superfam060619 -e "create index id on des(id);"
hgsql superfam060619 < ~/src/hg/lib/sfAssign.sql
hgsql superfam060619 -e 'load data local infile "ass_18-Jun-2006.tab" into table superfam060619.sfAssign;'
# Build or rebuild Superfamily track and create sf tables needed for PB
hgsql rn4 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/060619
hgsql rn4 -e 'load data local infile "ass_18-Jun-2006.tab" into table rn4.sfAssign;'
# If rn4.sfDes already exists, drop it.
hgsql superfam060619 -N -e "select * from des" >sfDes.tab
hgsql rn4 < ~/src/hg/lib/sfDes.sql
hgsql rn4 -e 'load data local infile "sfDes.tab" into table sfDes'
# Build ensemblXref3
# Get the ensembl gene/protein cross-reference data from Ensembl BioMart
# http://www.ensembl.org/Multi/martview
# Follow this sequence through the pages:
# Page 1) Select Ensembl39 and Rat. Hit next.
# Page 2) Do not select anything. Hit next.
# Page 3) Choose the "Feature" box, select Ensembl gene ID, transcript ID, peptide ID,
UniProt/TrEMBL ID, UniProt/SWISSPROT ID, and UniProt/SWISSPROT Accession
# Page 4) Choose "Text, tab separated". choose gzip compression. hit export.
# Save as ensXref.tsv.gz
ssh hgwdev
cd /cluster/data/rn4/bed/sf
hgsql rn4 < ~/src/hg/lib/ensemblXref3Temp.sql
hgsql rn4 -e \
'load data local infile "ensXref.tsv" into table ensemblXref3Temp ignore 1 lines'
hgsql rn4 -N -e \
'select gene, "0", transcript, "0", protein, "0", tremblAcc, swissDisplayId, swissAcc from ensemblXref3Temp' \
> ensemblXref3.tab
hgsql rn4 -e 'drop table ensemblXref3'
hgsql rn4 <~/src/hg/lib/ensemblXref3.sql
hgsql rn4 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3'
# If rn4.superfamily already exists, drop it.
cd /cluster/data/rn4/bed
mkdir /cluster/data/rn4/bed/sf.20060619
ln -s sf.20060619 sf
cd sf
hgSuperfam rn4 superfam060619 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If rn4.sfDescription exists, drop it.
hgsql rn4 < ~/src/hg/lib/sfDescription.sql
hgsql rn4 -e 'LOAD DATA local INFILE "sfDescription.tab" into table rn4.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed rn4 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/060619/ass_18-Jun-2006.tab \
| hgKnownToSuper rn4 rn stdin
# created 25287 rows in knownToSuper
# Build tables needed by pbGlobal in proteins060115
cd /cluster/data/superfamily/060619
hgsql proteins060115 -e 'delete from sfAssign'
hgsql proteins060115 -e 'delete from sfDes'
hgsql proteins060115 -e 'load data local infile "ass_18-Jun-2006.tab" into table sfAssign'
hgsql proteins060115 -e 'load data local infile "sfDes.tab" into table sfDes'
########################################################################
# Build rn4 PROTEOME BROWSER TABLES (DONE 6/20/06, Fan)
# These are instructions for building tables
# needed for the Proteome Browser.
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.
# This build is based on proteins DBs dated 060115.
# Create the working directory
ssh hgwdev
mkdir /cluster/store11/kg/kgRn4A/pb-2006-06-19
cd /cluster/data/rn4/bed
rm pb
ln -s /cluster/store11/kg/kgRn4A/pb-2006-06-19 pb
cd pb
# Define pep* tables in rn4 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql rn4 < pepAll.sql
# Build the pepMwAa table
hgsql proteins060115 -N -e \
"select info.acc, molWeight, aaSize from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10116 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql rn4 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'
# Build the pepPi table
hgsql proteins060115 -e \
"select info.acc from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10116 and accToTaxon.acc = info.acc" > protAcc.lis
hgsql rn4 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis
pbCalPi protAcc.lis sp060115 pepPi.tab
hgsql rn4 -e 'delete from pepPi'
hgsql rn4 -e 'load data local infile "pepPi.tab" into table rn4.pepPi'
# Calculate and load pep distributions
pbCalDist sp060115 proteins060115 10116 rn4 >pbCalDist.out
wc pbCalDist.out
hgsql rn4
load data local infile "pepExonCntDist.tab" into table rn4.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table rn4.pepCCntDist;
load data local infile "pepHydroDist.tab" into table rn4.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table rn4.pepMolWtDist;
load data local infile "pepResDist.tab" into table rn4.pepResDist;
load data local infile "pepIPCntDist.tab" into table rn4.pepIPCntDist;
load data local infile "pepPiDist.tab" into table rn4.pepPiDist;
quit
# Calculate frequency distributions
pbCalResStd sp060115 10116 rn4
# Create pbAnomLimit and pbResAvgStd tables
hgsql rn4 -e "drop table pbAnomLimit"
hgsql rn4 -e "drop table pbResAvgStd"
hgsql rn4 < ~/src/hg/lib/pbAnomLimit.sql
hgsql rn4 < ~/src/hg/lib/pbResAvgStd.sql
hgsql rn4 -e 'load data local infile "pbResAvgStd.tab" into table rn4.pbResAvgStd;'
hgsql rn4 -e 'load data local infile "pbAnomLimit.tab" into table rn4.pbAnomLimit;'
# Create pbStamp table for PB
hgsql rn4 -e "drop table pbStamp"
hgsql rn4 < ~/src/hg/lib/pbStamp.sql
hgsql rn3 -N -e 'select * from pbStamp' > pbStamp.tab
hgsql rn4 -e 'delete from pbStamp'
hgsql rn4 -e 'load data local infile "pbStamp.tab" into table rn4.pbStamp'
# Turn on Proteome Browser for rn4.
hgsql -e 'update dbDb set hgPbOk=1 where name="rn4"' \
-h genome-testdb hgcentraltest
# Adjust drawing parameters for Proteome Browser stamps
# Now invoke Proteome Browser and adjust various drawing parameters
# (mostly the ymax of each stamp) if necessary, by updating the
# pbStamp.tab file and then delete and reload the pbStamp table.
hgsql rn4 -e "drop table pbStamp"
hgsql rn4 < ~/src/hg/lib/pbStamp.sql
hgsql rn4 -e 'load data local infile "pbStamp.tab" into table rn4.pbStamp'
# Perform preliminary review of Proteome Browser for rn4, then
# notify QA for formal review.
################################################################################
# BUILD KNOWN GENE LIST FOR GOOGLE. (DONE, 6/6/06, Fan)
# make knownGeneLists.html rn4GeneList.html mm5GeneList.html rn3GeneList.html
cd /cluster/data/rn4/bed
rm -rf knownGeneList/rn4
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/rn4
hgKnownGeneList rn4
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/rn4
mkdir -p /usr/local/apache/htdocs/knownGeneList/rn4
cp -Rfp knownGeneList/rn4/* /usr/local/apache/htdocs/knownGeneList/rn4
################################################################################
##########################################################################
# N-SCAN gene predictions (nscanGene) - (2006-09-11 markd)
cd /cluster/data/rn4/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
wget -nv -r -l 1 -np --accept=gtf http://ardor.wustl.edu/predictions/rat/rn4/chr_gtf
wget -nv -r -l 1 -np --accept=fa http://ardor.wustl.edu/predictions/rat/rn4/chr_ptx
# clean up downloaded directories and compress files.
mv ardor.wustl.edu/predictions/rat/rn4/* .
rm -rf ardor.wustl.edu
gzip chr_*/*
chmod a-w chr_*/*.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt rn4 nscanGene chr_gtf/chr*.gtf.gz
# load protein, add .a suffix to match transcript id
hgPepPred -suffix=.a rn4 generic nscanPep chr_ptx/chr*.fa.gz
rm *.tab
# update trackDb; need a rn4-specific page to describe informants
#   rat/rn4/nscanGene.html (copy from mm8 and edit)
#   rat/rn4/trackDb.ra
###########################################################################
# dbSNP BUILD 125 (Heather, September 2006)
# We didn't have ctgPos.
# dbSNP build 125 uses chrom coords (phys_pos_from and phys_pos) in ContigLoc
# but not for chrUn. chrUn has contig coords only, and without ctgPos
# for supercontigs, I can't translate the chrUn contig coords into chrom coords.
# Also note: rn4 has no chrY, and there were no SNPs for chrM.
# Not having ctgPos also meant I couldn't process the new locTypes
# (4,5,6: rangeInsertion, rangeSubstitution, rangeDeletion)
# because those are also only given in contig coords
# These were a small number.
# Set up directory structure
ssh kkstore02
cd /cluster/data/dbSNP/125/organisms/rat_10116
mkdir data
mkdir schema
mkdir rs_fasta
# Get data from NCBI (anonymous FTP)
cd /cluster/data/dbSNP/125/organisms/rat_10116/data
ftp ftp.ncbi.nih.gov
cd snp/organisms/rat_10116/database/organism_data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
get b125_SNPContigLoc_3_1.bcp.gz
# ContigLocusId has function
get b125_SNPContigLocusId_3_1.bcp.gz
get b125_ContigInfo_3_1.bcp.gz
# MapInfo has alignment weights
get b125_SNPMapInfo_3_1.bcp.gz
# SNP has univar_id, validation status and heterozygosity
get SNP.bcp.gz
# Get schema from NCBI
cd /cluster/data/dbSNP/125/organisms/rat_10116/schema
ftp ftp.ncbi.nih.gov
cd snp/organisms/rat_10116/database/organism_schema
get rat_10116_table.sql.gz
# Get fasta files from NCBI
# using headers of fasta files for molType
cd /cluster/data/dbSNP/125/organisms/rat_10116/rs_fasta
ftp ftp.ncbi.nih.gov
cd snp/organisms/rat_10116/rs_fasta
prompt
mget *.gz
# add rs_fasta to seq/extFile
# 2 edits first: strip header to just rsId, and remove duplicates
# work on /cluster/store12 (kkstore05) which has more disk space
cp rs_ch*.fas.gz /cluster/store12/snp/125/rat/rs_fasta
ssh kkstore05
cd /cluster/store12/snp/125/rat/rs_fasta
# concat into rsAll.fas
cat << '_EOF_' > concat.csh
#!/bin/csh -ef
rm -f rsAll.fas
foreach file (rs_ch*.fas)
echo $file
zcat $file >> rsAll.fas
end
'_EOF_'
# << emacs
#snpCleanSeq strips the header and skips duplicates
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa
rm rsAll.fas
# load on hgwdev
ssh hgwdev
mkdir /gbdb/rn4/snp
ln -s /cluster/store12/snp/125/rat/rs_fasta/snp.fa /gbdb/rn4/snp/snp.fa
cd /cluster/store12/snp/125/rat/rs_fasta
hgLoadSeq rn4 /gbdb/rn4/snp/snp.fa
# look up id in extFile
hgsql rn4 < snpSeq.sql
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 420007' rn4
hgsql -e 'delete from seq where extFile = 420007' rn4
hgsql -e 'alter table snpSeq add index acc (acc)' rn4
# clean up after hgLoadSeq
rm seq.tab
# Simplify names of data files
cd /cluster/data/dbSNP/125/organisms/rat_10116/data
mv b125_ContigInfo_3_1.bcp.gz ContigInfo.gz
mv b125_SNPContigLoc_3_1.bcp.gz ContigLoc.gz
mv b125_SNPContigLocusId_3_1.bcp.gz ContigLocusId.gz
mv b125_SNPMapInfo_3_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz
ls -1 *.gz > filelist
# edit table descriptions
cd /cluster/data/dbSNP/125/organisms/rat_10116/schema
# get CREATE statements from rat_10116_table.sql for our 5 tables
# store in table.tmp
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp
# get header lines from rs_fasta
cd /cluster/data/dbSNP/125/organisms/rat_10116/rs_fasta
/bin/csh gnl.csh
# load on kkr5u00
ssh kkr5u00
hgsql -e 'create database rn4snp125'
cd /cluster/data/dbSNP/125/organisms/rat_10116/schema
hgsql rn4snp125 < table.sql
cd ../data
/bin/csh load.csh
# note rowcount
# ContigLoc 94945
# SNP 41658
# MapInfo 41658
# ContigLocusId 32954
# Get UniVariation from 126 (recently downloaded for mm8snp126)
cd /cluster/data/dbSNP/126/shared
hgsql rn4snp125 < UniVariation.sql
zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' rn4snp125
# create working /scratch dir
cd /scratch/snp/125
mkdir rat
cd rat
# no rn4.ctgPos table to compare against
# get gnl files
cp /cluster/data/dbSNP/125/organisms/rat_10116/rs_fasta/*.gnl .
# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_assembly"
# filter ContigLoc into ContigLocFilter
# this lifts from contig coords to chrom coords
# phys_pos_from is used to check coords for non-random chroms
# errors reported to stdout
# this gets rid of alternate assemblies (using ContigInfo)
# this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo)
# assumes all contigs are positively oriented; will abort if not true
mysql> desc ContigLocFilter;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromName | varchar(32) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | start | int(11) | NO | | | |
# | end | int(11) | YES | | NULL | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter125 rn4snp125 ref_assembly RGSC_v3.4.0
# note rowcount
# ContigLocFilter 45735
# how many are positive strand? hopefully 90%
# we get less than half here!
mysql> select count(*) from ContigLocFilter where orientation = 0;
# 23353
# note count by loc_type
mysql> select count(*), loc_type from ContigLocFilter group by loc_type;
# +----------+----------+
# | count(*) | loc_type |
# +----------+----------+
# | 57 | 1 |
# | 45482 | 2 |
# | 196 | 3 |
# +----------+----------+
# filter ContigLocusId into ContigLocusIdFilter
# need to filter on contig_acc because ctg_id = 0 for all rows in ContigLocusId
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter125 rn4snp125 ref_assembly
# note rowcount
# 32954 (same as ContigLocusId)
# condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions)
# assumes SNPs are in numerical order; will errAbort if not true
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense rn4snp125
# note rowcount; expect about 50% (ascertainment bias for SNPs within genes)
# actually we get about 80% here
# ContigLocusIdCondense 23754
# could delete ContigLocusIdFilter table here
# create chrN_snpFasta tables from *.gnl files
# we are just using molType, but also storing class and observed
# need chromInfo for this
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta rn4snp125
# (could start using pipeline.csh here)
# (pipeline.csh takes about 5 minutes to run)
# split ContigLocFilter by chrom
# create the first chrN_snpTmp
# we will reuse this table name, adding/changing columns as we go
# at this point chrN_snpTmp will have the same description as ContigLocFilter
# this opens a file handle for every chrom, so will not scale to scaffold-based assemblies
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom rn4snp125 ref_assembly
# check coords using loc_type
# possible errors logged to snpLocType.error:
# Unknown locType
# Between with allele != '-'
# Exact with end != start + 1
# Range with end < start
# possible exceptions logged to snpLocType.exceptions:
# RefAlleleWrongSize
# This run no errors, no exceptions
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype125 rn4snp125 ref_assembly
# expand allele as necessary
# report syntax errors to snpExpandAllele.errors
# possible exceptions logged to snpExpandAllele.exceptions:
# RefAlleleWrongSize
# This run just 1 error
# 0 alleles expanded
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele rn4snp125 ref_assembly
# the next few steps prepare for working in UCSC space
# sort by position
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort rn4snp125 ref_assembly
# get nib files
ssh hgwdev
cd /cluster/data/rn4
cp rn4.2bit /cluster/home/heather/transfer/snp/rn4
ssh kkr5u00
cd /scratch/snp/125/rat
cp /cluster/home/heather/transfer/snp/rn4/rn4.2bit .
# edit path in chromInfo, load into rn4snp125 with edited path
# lookup reference allele in nibs
# keep reverse complement to use in error checking (snpCheckAlleles)
# check here for SNPs larger than 1024
# errAbort if detected
# check for coords that are too large, log to snpRefUCSC.error and skip
# This run no errors
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC rn4snp125
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +--------------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# +--------------------+-------------+------+-----+---------+-------+
# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAllele.exceptions
# if SNP is positive strand, expect allele == refUCSC
# log RefAlleleMismatch if not
# if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp
# If allele == refUCSCRevComp, log RefAlleleNotRevComp
# If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
# 0 RefAlleleMismatch
# 1830 RefAlleleNotRevComp
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles rn4snp125
# add class and observed using univar_id from SNP table
# to get class (subsnp_class) and observed (var_str) from UniVariation
# log errors to snpClassAndObserved.errors
# errors detected:
# class = 0 in UniVariation
# class > 8 in UniVariation
# univar_id = 0 in SNP
# no row in SNP for snp_id in chrN_snpTmp
# This run we got:
# 3 class = 0 in UniVariation
# 0 class > 8 in UniVariation
# 0 univar_id = 0 in SNP
# 0 no row in SNP for snp_id in chrN_snpTmp
# dbSNP has class = 'in-del'
# we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved rn4snp125
# morph chrN_snpTmp
# +--------------------+---------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+---------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | class | varchar(255) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# | observed | blob | YES | | NULL | |
# +--------------------+---------------+------+-----+---------+-------+
# generate exceptions for class and observed
# SingleClassBetweenLocType
# SingleClassRangeLocType
# NamedClassWrongLocType
# ObservedWrongFormat
# ObservedWrongSize
# ObservedMismatch
# RangeSubstitutionLocTypeExactMatch
# SingleClassTriAllelic
# SingleClassQuadAllelic
# This will also detect IUPAC symbols in allele
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved rn4snp125
# add function
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction rn4snp125
# add validation status and heterozygosity
# log error if validation status > 31 or missing
# no errors this run
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP rn4snp125
# add molType
# errors detected: missing or duplicate molType
# this run no errors
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype rn4snp125
# generate chrN_snp125 and snp125Exceptions tables
cp snpCheckAlleles.exceptions snpCheckAlleles.tab
cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
cp snpExpandAllele.exceptions snpExpandAllele.tab
cp snpLocType.exceptions snpLocType.tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable rn4snp125 125
# concat into snp125.tab
# cat chr*_snp125.tab >> snp125.tab
/bin/sh concat.sh
# check for multiple alignments
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple rn4snp125 125
mysql> load data local infile 'snpMultiple.tab' into table snp125Exceptions;
# load on hgwdev
cp snp125.tab /cluster/home/heather/transfer/snp/rn4
hgsql rn4snp125 -e 'select * from snp125Exceptions' > /cluster/home/heather/transfer/snp/rn4/snp125Exceptions.tab
ssh hgwdev
cd transfer/snp/rn4
mysql> load data local infile 'snp125.tab' into table snp125;
mysql> load data local infile 'snp125Exceptions.tab' into table snp125Exceptions;
# create indexes
mysql> alter table snp125 add index name (name);
mysql> alter table snp125 add index chrom (chrom, bin);
mysql> alter table snp125Exceptions add index name(name);
# create snp125ExceptionDesc table
cd /cluster/data/dbSNP
hgsql rn4 < snp125ExceptionDesc.sql
# add counts to exception.human.125, can start with exception.template
hgsql -e 'select count(*), exception from snp125Exceptions group by exception' rn4snp125
mysql> load data local infile 'exception.rn4.snp125' into table snp125ExceptionDesc;
mysql> select count(*), exception from snp125Exceptions group by exception;
+----------+---------------------------+
| count(*) | exception |
+----------+---------------------------+
| 15252 | MultipleAlignments |
| 517 | ObservedMismatch |
| 1 | ObservedWrongFormat |
| 1 | ObservedWrongSize |
| 1830 | RefAlleleNotRevComp |
| 1 | RefAlleleWrongSize |
| 194 | SingleClassBetweenLocType |
| 1 | SingleClassQuadAllelic |
| 48 | SingleClassRangeLocType |
| 92 | SingleClassTriAllelic |
+----------+---------------------------+
##########################################################################
# xxBlastTab - Help filter out unwanted paralogs (Galt 2007-01-11)
#
# We are starting with xxBlastTab tables already built in the usual way with
# blastall/blastp, probably with doHgNearBlastp.pl script.
#
# we want to update rn4 for human and mouse,
# so check ./hgGeneData/Rat/rn4/otherOrgs.ra for current settings
ssh hgwdev
synBlastp.csh rn4 hg18
#rn4.hgBlastTab:
#new number of unique query values:
#7379
#new number of unique target values
#6373
#old number of unique query values:
#7977
#old number of unique target values
#6657
synBlastp.csh rn4 mm8
#rn4.mmBlastTab:
#new number of unique query values:
#7397
#new number of unique target values
#6502
#old number of unique query values:
#7988
#old number of unique target values
#6788
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna rn4
###########################################################################
# REVERSE LIFTOVER CHAINS RN4 -> RN3 (DONE 2007-02-09 kate)
# copy rn3 2bit to cluster-friendly dir
ssh kkstore06
cp -p /cluster/data/rn3/rn3.2bit /san/sanvol1/scratch/rn3
# create the directory and scripts
ssh kkstore04
cd /cluster/data/rn4
doSameSpeciesLiftOver.pl rn4 rn3 -debug -bigClusterHub pk
# go to created dir and run for real
cd /cluster/data/rn4/bed/blat.rn3.2007-02-09
doSameSpeciesLiftOver.pl rn4 rn3 -bigClusterHub pk >& do.log & tail -f do.log
# failed for unknown region during chainNet of chr4. Error message
# indicated the file /cluster/data/rn3/chrom.sizes couldn't be found.
# I extracted the remaining steps from the doNet.csh file and ran those.
doNet.csh >&! doNet.log
doSameSpeciesLiftOver.pl rn4 rn3 -continue load -bigClusterHub pk >& doLoad.log
###########################################################################
# QUALITY SCORES (DONE 2/22/07 angie)
ssh kolossus
nice tcsh
mkdir /cluster/data/rn4/bed/qual
cd /cluster/data/rn4/bed/qual
zcat ../../downloads/*.qual.gz \
| sed -re 's/^>gnl\|Rnor3.[0-9_]+\|/>/' \
| qaToQac stdin stdout \
| qacToWig -fixed stdin stdout \
| wigEncode stdin qual.{wig,wib}
#Made 1 .wig files in stdout
#Converted stdin, upper limit 99.00, lower limit 0.00
ssh hgwdev
cd /cluster/data/rn4/bed/qual
rm -f /gbdb/rn4/wib/qual.wib
ln -s /cluster/data/rn4/bed/qual/qual.wib /gbdb/rn4/wib/
hgLoadWiggle -pathPrefix=/gbdb/rn4/wib rn4 quality qual.wig
##########################################################################
# BLASTZ/CHAIN/NET HORSE AND RAT (DONE 2/18/07 Fan)
ssh kkstore05
mkdir /cluster/data/equCab1/bed/blastz.rn4.2007-02-18
cd /cluster/data/equCab1/bed/blastz.rn4.2007-02-18
cat << '_EOF_' > DEF
# Horse vs. Rat
BLASTZ_M=50
# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500
SEQ1_CHUNK=30000000
SEQ1_LAP=10000
# QUERY: Rat rn4
SEQ2_DIR=/scratch/hg/rn4/nib
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/equCab1/bed/blastz.rn4.2007-02-18
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub pk \
-blastzOutRoot /cluster/bluearc/equCab1Rn4 >& do.log &
tail -f do.log
ssh hgwdev
cd /cluster/data/equCab1/bed
ln -s blastz.rn4.2007-02-18 /cluster/data/equCab1/bed/blastz.rn4
nice featureBits equCab1 -chrom=chr1 chainRn4Link
# 67408374 bases of 177498097 (37.977%) in intersection
bash
time nice -n 19 featureBits equCab1 chainRn4Link \
> fb.equCab1.chainRn4Link.txt 2>&1
# 859116859 bases of 2421923695 (35.472%) in intersection
ssh kkstore05
mkdir /cluster/data/rn4/bed/blastz.equCab1.swap
cd /cluster/data/rn4/bed/blastz.equCab1.swap
bash
time doBlastzChainNet.pl \
/cluster/data/equCab1/bed/blastz.rn4.2007-02-18/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-verbose=2 -swap -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/equCab1Rn4 > swap.log 2>&1 &
ssh hgwdev
cd /cluster/data/rn4/bed/blastz.equCab1.swap
bash
time nice -n 19 featureBits rn4 chainEquCab1Link \
> fb.rn4.chainEquCab1Link.txt 2>&1
# 867029914 bases of 2571531505 (33.716%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET BOSTAU3 (Done March 2007 heather)
mkdir /cluster/data/rn4/bed/blastz.bosTau3.2007-03-25
ln -s /cluster/data/rn4/bed/blastz.bosTau3.2007-03-25 /cluster/data/rn4/bed/blastz.bosTau3
cd /cluster/data/rn4/bed/blastz.bosTau3
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Rat rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/scratch/hg/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau3
SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit
SEQ2_LEN=/san/sanvol1/scratch/bosTau3/chrom.sizes
SEQ2_LIMIT=500
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.bosTau3.2007-03-25
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/bosTau3/blastz.rn4 >& do.log &
tail -f do.log
nice featureBits -chrom=chr1 rn4 chainBosTau3Link
# 61426292 bases of 243058842 (25.272%) in intersection
#########################################################################
# BLASTZ Mouse mm9 swap (DONE - 2007-09-11 - Hiram)
# the original
ssh kkstore06
cd /cluster/data/mm9/bed/blastz.rn4
cat fb.mm9.chainRn4Link.txt
# 1713186474 bases of 2620346127 (65.380%) in intersection
# and the swap
mkdir /cluster/data/rn4/bed/blastz.mm9.swap
cd /cluster/data/rn4/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzRn4.2007-08-31/DEF \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
-swap -syntenicNet > swap.out 2>&1 &
#########################################################################
# RGD QTL (DONE 9/24/07 angie -- RELOADED 9/26/07)
ssh kkstore04
mkdir /cluster/data/rn4/bed/rgdQtl
cd /cluster/data/rn4/bed/rgdQtl
wget ftp://rgd.mcw.edu:21/pub/data_release/QTLS
# Make bed4 and rgdQtlLink:
perl -we 'open(BED, ">rgdQtl.bed") || die; \
open(LINK, ">rgdQtlLink.txt") || die; \
while (<>) { \
chomp; my @w = split("\t"); \
next unless ($w[1] =~ /^rat/ && $w[15]); \
$w[5] =~ s/^/chr/; \
$w[15] =~ s/^([-\d]+).*$/$1/ || die "parse start pos"; \
$w[16] =~ s/^(\d+).*$/$1/ || die "parse end pos"; \
if ($w[15] > $w[16]) { \
$tmp = $w[15]; $w[15] = $w[16]; $w[16] = $tmp; \
} \
$w[15]--; \
$w[15] = 0 if ($w[15] < 0); \
print BED "$w[5]\t$w[15]\t$w[16]\t$w[2]\n"; \
print LINK "$w[0]\t$w[2]\t$w[3]\n"; \
} \
close(BED); close(LINK);' \
QTLS
ssh hgwdev
cd /cluster/data/rn4/bed/rgdQtl
# 9/26/07: RGD said that the duplicate Gluco13 was supposed to be
# Gluco13 + Gluco31. Edited rgdQtl.bed to change Gluco13 -> 31 for
# the chr15 copy, and rgdQtlLink.txt for id 631516.
hgLoadBed rn4 rgdQtl rgdQtl.bed
hgLoadSqlTab rn4 rgdQtlLink ~/kent/src/hg/lib/rgdQtlLink.sql rgdQtlLink.txt
# Fix coords > chromSize:
perl -wpe 's/^(\w+)\t(\d+)$/ \
delete from rgdQtl where chrom="$1" and chromStart >= $2; \
update rgdQtl set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \
../../chrom.sizes \
| hgsql rn4
checkTableCoords -verbose=2 rn4 rgdQtl
runJoiner.csh rn4 rgdQtl
#############################################################################
# CONTRAST GENES (2007-10-02 markd)
# received predictions from Sam Gross <ssgross@stanford.edu>
cd /cluster/data/rn4/bed/contrastGene/
wget http://www.stanford.edu/~ssgross/contrast.rn4.bed
# this is a custom track, not a pure BED
tail +2 contrast.rn4.bed | hgLoadBed -tab rn4 contrastGene stdin
# verify
# load track db (ra and contrastGene.html are global
# request push of contrastGene
###########################################################################
# loading affy rat Exon probes and transcripts (DONE - 2007-10-04 - Hiram)
# data was supplied from Venu Valmeekam Venu_Valmeekam@affymetrix.com
# dropped via FTP to genome-test
ssh hgwdev
mkdir /cluster/data/rn4/bed/affyRaEx1
cd /cluster/data/rn4/bed/affyRaEx1
# the files received:
# -rw-r--r-- 1 10151819 Oct 3 10:53 transcript_cluster_rn.bed.gz
# -rw-r--r-- 1 42240033 Oct 4 13:32 probe_rn_score.bed.gz
# loading:
hgLoadBed -tmpDir=/scratch/tmp rn4 affyRaEx1Probe probe_rn_score.bed.gz
# Loaded 4025442 elements of size 6
hgLoadBed -tmpDir=/scratch/tmp rn4 affyRaEx1Transcript \
transcript_cluster_rn.bed.gz
# Loaded 360668 elements of size 12
# working on description pages for these with Venu.
# I manually set the scores in the affyRaEx1Transcript track to
# 1000 so it would work OK (not color) with the useScore 1 so that
# the affyRaEx1Probe would color itself on the score
#########################################################################
# EXONIPHY Rn4, lifted from hg18 (DONE - 2007-11-19 - Hiram)
# needed for ucscGenes10 building
# create a syntenic liftOver chain file
ssh kolossus
cd /cluster/data/hg18/bed/blastz.rn4/axtChain
time nice -n +19 netFilter -syn hg18.rn4.net.gz \
| netChainSubset -verbose=0 stdin hg18.rn4.all.chain.gz stdout \
| chainStitchId stdin stdout | gzip -c > hg18.rn4.syn.chain.gz
# real 5m10.947s
# slightly smaller than the ordinary liftOver chain file:
# -rw-rw-r-- 1 73768012 Feb 17 2006 hg18.rn4.over.chain.gz
# -rw-rw-r-- 1 69198780 Nov 19 14:13 hg18.rn4.syn.chain.gz
# exoniphyRn4.gp is prepared as follows
ssh hgwdev
mkdir /cluster/data/rn4/bed/exoniphy
cd /cluster/data/rn4/bed/exoniphy
hgsql hg18 -N -e "select * from exoniphy" > exoniphyHg18.gp
time nice -n +19 liftOver -genePred exoniphyHg18.gp \
/cluster/data/hg18/bed/blastz.rn4/axtChain/hg18.rn4.syn.chain.gz \
exoniphyRn4.gp unmapped
# real 32m0.405s
wc -l *
# 178162 exoniphyHg18.gp
# 166859 exoniphyRn4.gp
# 22606 unmapped
ssh hgwdev
cd /cluster/data/rn4/bed/exoniphy
nice -n +19 hgLoadGenePred -genePredExt rn4 exoniphy exoniphyRn4.gp
nice -n +19 featureBits rn4 exoniphy
# 24959589 bases of 2571531505 (0.971%) in intersection
nice -n +19 featureBits mm9 exoniphy
# 25931742 bases of 2620346127 (0.990%) in intersection
nice -n +19 featureBits mm8 exoniphy
# 25952211 bases of 2567283971 (1.011%) in intersection
############################################################################
# SGP GENES (DONE - 2007-12-20 - Hiram)
ssh kkstore04
mkdir /cluster/data/rn4/bed/sgp
cd /cluster/data/rn4/bed/sgp
mkdir gtf
for C in `awk '{print $1}' /cluster/data/rn4/chrom.sizes`
do
wget --timestamping \
"http://genome.imim.es/genepredictions/R.norvegicus/rnNov2004/SGP/humangp200603/${C}.gtf" \
-O "gtf/${C}.gtf"
done
ssh hgwdev
cd /cluster/data/rn4/bed/sgp
ldHgGene -gtf -genePredExt rn4 sgpGene gtf/chr*.gtf
# Read 36569 transcripts in 286299 lines in 45 files
# 36569 groups 44 seqs 1 sources 3 feature types
# 36569 gene predictions
featureBits rn4 sgpGene
# 36219236 bases of 2571531505 (1.408%) in intersection
featureBits rn4 -enrichment refGene:CDS sgpGene
# refGene:CDS 0.770%, sgpGene 1.408%, both 0.672%, cover 87.35%, enrich 62.02x
#####################################################################
# LOAD GENEID GENES (DONE - 2007-12-20 - Hiram)
ssh kkstore04
mkdir -p /cluster/data/rn4/bed/geneid
cd /cluster/data/rn4/bed/geneid
mkdir gtf prot
for C in `awk '{print $1}' /cluster/data/rn4/chrom.sizes`
do
echo $C
wget --timestamping \
"http://genome.imim.es/genepredictions/R.norvegicus/rnNov2004/geneid_v1.2/${C}.gtf" \
-O "gtf/${C}.gtf"
wget --timestamping \
"http://genome.imim.es/genepredictions/R.norvegicus/rnNov2004/geneid_v1.2/${C}.prot" \
-O "prot/${C}.prot"
done
# Add missing .1 to protein id's
cd prot
foreach f (*.prot)
perl -wpe 's/^(>chr\S+)$/$1.1/' $f > $f:r-fixed.prot
end
ssh hgwdev
cd /cluster/data/rn4/bed/geneid
ldHgGene -genePredExt -gtf rn4 geneid gtf/*.gtf
# Read 38143 transcripts in 287158 lines in 45 files
# 38143 groups 45 seqs 1 sources 3 feature types
# 38143 gene predictions
hgPepPred rn4 generic geneidPep prot/chr*-fixed.prot
featureBits rn4 geneid
# 40255245 bases of 2571531505 (1.565%) in intersection
featureBits rn4 -enrichment refGene:CDS geneid
# refGene:CDS 0.770%, geneid 1.565%, both 0.615%, cover 79.92%, enrich 51.05x
##########################################################################
# AGILENT CGH PROBES (Done 2008-05-13, Andy)
# (see hg18.txt)
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
#############################################################################
# RAT TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08)
# (to build the affyExonTissues track, see the steps outlined in hg18.txt)
#############################################################################
########################################################################
## AFFY ALL EXON PROBESETS (HG18/MM9/RN4) (DONE 2009-01-29, Andy)
## (instructions are in hg18.txt)
########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
rn4.upstreamGeneTbl = refGene
rn4.upstreamMaf = multiz9way /hive/data/genomes/rn4/bed/multiz9way/species.lst
#############################################################################
# TEST LASTZ Mouse Mm9 (DONE - 2008-11-26,12-01 - Hiram)
# to compare like with like for Rn5 lastz
screen # use screen to manage this job
mkdir /hive/data/genomes/rn4/bed/blastzMm9.2008-11-26
cd /hive/data/genomes/rn4/bed/blastzMm9.2008-11-26
cat << '_EOF_' > DEF
# rat vs mouse
# Specially tuned blastz parameters from Webb Miller
BLASTZ=lastz
BLASTZ_M=254
BLASTZ_ABRIDGE_REPEATS=0
BLASTZ_O=600
BLASTZ_E=55
BLASTZ_Y=15000
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/hive/data/genomes/rn4/bed/lastzMm9.2008-11-26/mouse_rat.q
# TARGET: Rat Rn4
SEQ1_DIR=/scratch/data/rn4/nib
SEQ1_LEN=/scratch/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Mouse Mm9
SEQ2_DIR=/scratch/data/mm9/nib
SEQ2_LEN=/scratch/data/mm9/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/rn4/bed/lastzMm9.2008-11-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium \
-stop=net `pwd`/DEF > do.log 2>&1 &
# real 313m17.777s
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium \
-continue=cat -stop=net `pwd`/DEF >cat.log 2>&1
# create a loadUp.csh file so it can be modified to change
# the table names to not clash with existing Mm9 tables
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium \
-debug -continue=load -stop=load `pwd`/DEF >load.log 2>&1
# real 79m48.305s
cd /hive/data/genomes/rn4/bed/lastzMm9.2008-11-26/axtChain
time ./load.csh
# real 23m39.747s
cat fb.rn4.chainMm9LastzLink.txt
# 1710794258 bases of 2571531505 (66.528%) in intersection
mkdir /hive/data/genomes/mm9/bed/blastz.rn4.swap
cd /hive/data/genomes/mm9/bed/blastz.rn4.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/rn4/bed/lastzMm9.2008-11-26/DEF \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium \
-swap -stop=net > swap.log 2>&1 &
# real 120m41.998s
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/rn4/bed/lastzMm9.2008-11-26/DEF \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium \
-debug -swap -continue=load -stop=load > load.log 2>&1 &
# real 27m24.819s
cat fb.mm9.chainRn4LastzLink.txt
# 1713323126 bases of 2620346127 (65.385%) in intersection
#############################################################################
# LIFTOVER TO Rn5 (DONE - 2008-12-14 - Hiram )
mkdir /hive/data/genomes/rn4/bed/blat.rn5.2008-12-14
cd /hive/data/genomes/rn4/bed/blat.rn5.2008-12-14
# -debug run to create run dir, preview scripts...
doSameSpeciesLiftOver.pl -debug rn4 rn5
# Real run:
time nice -n +19 doSameSpeciesLiftOver.pl rn4 rn5 > do.log 2>&1
#############################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
############################################################################
# ADD LINK TO GENENETWORK (DONE. 11/06/09 Fan).
# Received geneNetwork ID list file, GN_rat_RefSeq.txt, for rn4 from GeneNetwork, Zhou Xiaodong [xiaodong.zhou@gmail.com].
ssh hgwdev
mkdir -p /cluster/data/rn4/bed/geneNetwork
cd /cluster/data/rn4/bed/geneNetwork
hgsql rn4 < ~/src/hg/lib/geneNetworkId.sql
hgsql rn4 -e \
'load data local infile "GN_rat_RefSeq.txt" into table geneNetworkId'
#########################################################################
# BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2010-01-19,22 - Hiram)
ssh hgwdev
screen # use a screen to manage this multi-day job
mkdir /hive/data/genomes/rn4/bed/blastzBosTau4.2010-01-19
cd /hive/data/genomes/rn4/bed/blastzBosTau4.2010-01-19
cat << '_EOF_' > DEF
# Rat vs. Cow
BLASTZ_M=50
# TARGET: Rat Rn4
SEQ1_DIR=/scratch/data/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau4
SEQ2_DIR=/scratch/data/bosTau4/bosTau4.2bit
SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=100
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastzBosTau4.2010-01-19
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-workhorse=hgwdev -bigClusterHub=pk -chainMinScore=3000 \
-chainLinearGap=medium -noLoadChainSplit -syntenicNet > do.log 2>&1 &
# interrupted by storm-induced power failures; finished the lastz batch,
# then continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -workhorse=hgwdev -bigClusterHub=pk -chainMinScore=3000 \
-smallClusterHub=memk -chainLinearGap=medium -noLoadChainSplit \
-syntenicNet > cat.log 2>&1 &
# real 107m32.779s
cat fb.rn4.chainBosTau4Link.txt
# 649931321 bases of 2571531505 (25.274%) in intersection
mkdir /cluster/data/bosTau4/bed/blastz.rn4.swap
cd /cluster/data/bosTau4/bed/blastz.rn4.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/rn4/bed/blastzBosTau4.2010-01-19/DEF \
-noLoadChainSplit -workhorse=hgwdev -smallClusterHub=memk \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet > swap.log 2>&1 &
# real 77m18.645s
cat fb.bosTau4.chainRn4Link.txt
# 664253901 bases of 2731830700 (24.315%) in intersection
#########################################################################
# ailMel1 Panda alignment (DONE - 2010-01-21 - Hiram)
mkdir /hive/data/genomes/rn4/bed/lastzAilMel1.2010-01-21
cd /hive/data/genomes/rn4/bed/lastzAilMel1.2010-01-21
cat << '_EOF_' > DEF
# Rat vs. Panda
# parameters from the Panda paper supplemental where they describe
# their lastz parameters
BLASTZ_K=2200
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_H=2000
BLASTZ_C=2
BLASTZ_T=2
# our usual M
BLASTZ_M=50
# TARGET: Rat Rn4
SEQ1_DIR=/scratch/data/rn4/nib
SEQ1_LEN=/scratch/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Panda
SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit
SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/rn4/bed/lastzAilMel1.2010-01-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 129m23.300s
cat fb.rn4.chainAilMel1Link.txt
# 707335282 bases of 2571531505 (27.506%) in intersection
mkdir /hive/data/genomes/ailMel1/bed/blastz.rn4.swap
cd /hive/data/genomes/ailMel1/bed/blastz.rn4.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/rn4/bed/lastzAilMel1.2010-01-21/DEF \
-swap -noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 65m7.728s
cat fb.ailMel1.chainRn4Link.txt
# 693099721 bases of 2225124764 (31.149%) in intersection
############################################################################
# tRNAs track (2010-01-15, Fan DONE)
#
ssh hgwdev
cd /hive/data/genomes/rn4/bed
mkdir tRNAs
cd tRNAs
# Get data files from /projects/lowelab/users/lowe/Browser/vertebrates/
cp -p /projects/lowelab/users/lowe/Browser/vertebrates/rn4-tRNAs.bed .
cp -p \
/projects/lowelab/users/lowe/Browser/vertebrates/rn4_tRNAs_images.tar.gz\
.
hgsql rn4 -e 'drop table if exists tRNAs'
hgLoadBed -tab rn4 tRNAs rn4-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql
mkdir gif
cd gif
gzip -d ../rn4_tRNAs_images.tar.gz
tar -xvf rn4_tRNAs_images.tar
mkdir /hive/data/gbdb/rn4/RNA-img
cp -p * /hive/data/gbdb/rn4/RNA-img
#####################################################################
+# INITIAL RGD GENES BUILD (2010-02-12, Fan, DONE)
+#
+# NOTE: THE BUILD PIPELINE IS DEVELOPED. SEVERAL MAJOR PROBLEMS WERE
+# DISCOVERED IN THE ORIGINAL SOURCE DATA FILE. RGD IS LOOKING INTO THEM.
+#
+# THIS PIPELINE SHOULD BE ADJUSTED/MODIFIED AND RE-RUN AFTER RGD FIXES THOSE PROBLEMS
+# AND PROVIDES AN UPDATED VERSION OF THEIR DATA FILE.
+ssh hgwdev
+cd /hive/data/genomes/rn4/bed
+mkdir rgdGene
+cd rgdGene
+
+# download raw data from RGD
+wget --timestamping ftp://rgd.mcw.edu/pub/data_release/GFF/rgd_genes_gff.zip
+
+unzip rgd_genes_gff.zip
+
+cat gff/RGDgenes_chr*.gff3 >RGDgenes_all.gff3
+
+hgsql rn4 -e 'drop table rgdGeneRaw0'
+hgsql rn4 < ~/kent/src/hg/lib/rgdGeneRaw0.sql
+hgsql rn4 -e 'load data local infile "RGDgenes_all.gff3" into table rgdGeneRaw0'
+
+
+# build the rgdId.tab file
+hgsql rn4 -N -e 'select "RGD:",seqname from rgdGeneRaw0 '|cut -f 1 >j.rgd
+hgsql rn4 -N -e 'select * from rgdGeneRaw0 '|cut -f 9 >j.0
+
+cat j.0 |sed -e 's/RGD:/\t/'|cut -f 2 >j.1
+cat j.1 |sed -e 's/,/\t/'|sed -e 's/;/\t/'|cut -f 1 >j.2
+
+paste j.rgd j.2 |sed -e 's/\t//' >rgdId.tab
+rm j.*
+
+# fix some currently known problems of RGD Genes
+
+# First, remove gene records that contain exons of different strands.
+# create two temp tables rgdGeneRaw2 and rgdGeneRaw3
+
+cut -f 1-8 RGDgenes_all.gff3 >j.temp8
+paste j.temp8 rgdId.tab >rgdGeneRaw2.tab
+
+hgsql rn4 -e 'drop table rgdGeneRaw2'
+hgsql rn4 -e 'drop table rgdGeneRaw3'
+hgsql rn4 < ~/kent/src/hg/lib/rgdGeneRaw2.sql
+hgsql rn4 < ~/kent/src/hg/lib/rgdGeneRaw3.sql
+hgsql rn4 -e 'load data local infile "rgdGeneRaw2.tab" into table rgdGeneRaw2'
+hgsql rn4 -e 'load data local infile "rgdGeneRaw2.tab" into table rgdGeneRaw3'
+
+# create the one line script del1 containing the following line:
+
+hgsql rn4 -N -e "delete from rgdGeneRaw2 where rgdId='${2}' and feature='exon' and seqname='${1}' and strand = '${3}'"
+
+chmod +x del1
+hgsql rn4 -N -e \
+"select 'del1', x.seqname, x.rgdId, x.strand from rgdGeneRaw2 x, rgdGeneRaw3 y where x.rgdId=y.rgdId and x.seqname=y.seqname and x.feature='exon' and y.feature='gene' and x.strand!=y.strand;" |\
+sort -u > delAll
+chmod +x delAll
+
+./delAll
+
+hgsql rn4 -N -e 'select "chr" from rgdGeneRaw2 where feature != "CDS"' >j.chr
+hgsql rn4 -N -e 'select * from rgdGeneRaw2 where feature != "CDS"' >j.1
+paste j.chr j.1 |sed -e 's/chr\t/chr/'>j.2
+
+# remove one problematic gene
+cat j.2|grep -v "RGD:3683" >rgdGeneTemp.gff
+
+ldHgGene -gtf rn4 rgdGeneTemp rgdGeneTemp.gff
+
+hgsql rn4 -N -e 'select x.*, y.start-1, y.end from rgdGeneTemp x, rgdGeneRaw2 y where x.name=y.rgdId and y.feature="CDS"' >j.0
+
+cut -f 2-6 j.0 >j.1
+
+cut -f 12-13 j.0 >j.2
+
+cut -f 9-11 j.0 >j.3
+
+#paste j.1 j.2 j.3 >rgdGene2.tab
+paste j.1 j.2 j.3 >j.10
+
+cut -f 1-3 j.10 >j.1-3
+
+cut -f 9 j.10 |sed -e 's/,/\t/'|cut -f 1 >j.start
+
+cut -f 5-10 j.10 >j.5-10
+
+paste j.1-3 j.start j.5-10 >rgdGene2.tab
+
+hgsql rn4 -e 'drop table rgdGene2'
+hgsql rn4 < ~/kent/src/hg/lib/rgdGene2.sql
+
+hgsql rn4 -e 'load data local infile "rgdGene2.tab" into table rgdGene2'
+
+hgsql rn4 -e 'drop table rgdGeneTemp'
+
+rm j.*
+
+####### temp fix to make txEnd the same as the last exonEnd (via getLastExonEnd)
+
+getLastExonEnd rn4 |sed -e 's/,//' >j.out
+cut -f 1 j.out >j.1
+cut -f 1 rgdGene2.tab >j.0
+diff j.0 j.1
+
+cut -f 2 j.out >j.last
+
+cut -f 1-4 rgdGene2.tab > j.1-4
+cut -f 6-10 rgdGene2.tab > j.6-10
+
+paste j.1-4 j.last j.6-10 >rgdGene2.tab.new
+
+hgsql rn4 -e 'drop table rgdGene2'
+hgsql rn4 < ~/kent/src/hg/lib/rgdGene2.sql
+
+hgsql rn4 -e 'load data local infile "rgdGene2.tab.new" into table rgdGene2'
+rm j.*