src/hg/makeDb/doc/galGal1.txt 1.4
1.4 2009/11/25 21:48:39 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/galGal1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/galGal1.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/makeDb/doc/galGal1.txt 25 Nov 2009 18:29:18 -0000 1.3
+++ src/hg/makeDb/doc/galGal1.txt 25 Nov 2009 21:48:39 -0000 1.4
@@ -1,1713 +1,1713 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on the
# Chicken (Gallus gallus) January 2004 release.
# CREATE BUILD DIRECTORY (DONE 1/23/04 angie)
ssh kksilo
mkdir /cluster/store6/galGal1
ln -s /cluster/store6/galGal1 /cluster/data/galGal1
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 1/23/04 angie)
mkdir /cluster/data/galGal1/M
cd /cluster/data/galGal1/M
# go to http://www.ncbi.nih.gov/ and search Nucleotide for
# "gallus mitochondrion genome". That shows the gi number:
# 5834843
# Use that number in the entrez linking interface to get fasta:
wget -O chrM.fa \
'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5834843&dopt=FASTA'
# Edit chrM.fa: make sure the long fancy header line says it's the
# Gallus gallus mitochondrion complete genome, and then replace the
# header line with just ">chrM".
# DOWNLOAD GENOME SEQUENCE (DONE 1/28/04 angie)
ssh kksilo
cd /cluster/data/galGal1
# Contig sequences have been stable for a while:
wget ftp://genome.wustl.edu/private/chicken_040105/Fasta/chicken_040105.contigs.tar.gz
# Official location (password-protected):
# http://genome.wustl.edu/projects/chicken/chicken_analysis/chicken_040105.contigs.tar.gz
mkdir contigs
cd contigs
tar xvfz ../chicken_040105.contigs.tar.gz
foreach f (chicken_031214.pcap.contigs*)
set n = `echo $f | sed -e 's/chicken_031214.pcap.contigs//'`
mv $f contigs$n.fa
end
faSize *.fa
#1054180845 bases (107580 N's 1054073265 real) in 111864 sequences in 100 files
#Total size: mean 9423.8 sd 19990.9 min 52 (Contig28908.1) max 441791 (Contig132.21) median 2122
#N count: mean 1.0 sd 5.1
mkdir /cluster/bluearc/galGal1
cat ../contigs/*.fa > /cluster/bluearc/galGal1/allContigs.fa
# Get qual scores too
wget ftp://genome.wustl.edu/private/chicken_040105/Qual/chicken_040105.contigs.qual.tar.gz
# Official location (password-protected):
# http://genome.wustl.edu/projects/chicken/chicken_analysis/chicken_040105.contigs.qual.tar.gz
# BUILD AND CHECK CHROM-LEVEL SEQUENCE (DONE 1/30/04 angie)
# 1/30/04: ZW_random has been made a part of Z.
# on hgwdev, "hgsql galGal1" and get rid of any chrZW_random tables
ssh kolossus
cd /cluster/data/galGal1
wget ftp://genome.wustl.edu/private/lhillier/old/chicken_agp.040129.tar.gz
# Note: this January 30th tarfile is labeled January 29 -- rename it
# to avoid confusion with several Jan. 29 agp downloads.
mv chicken_agp.040129.tar.gz chicken_agp.040130.tar.gz
# Official location (password-protected):
# http://genome.wustl.edu/projects/chicken/chicken_analysis/chicken_agp.040129.tar.gz
# I downloaded that and cmp'd to chicken_agp.040130.tar.gz -- same.
tar xvzf chicken_agp.040130.tar.gz
cd chicken_agp
# confirmed that the only AGP changes from chicken_agp_29_third were
# that ZW_random went away and Z got some new stuff at the beginning.
cp /dev/null ../chrom.lst
foreach f (*.agp)
set chr = `echo $f:r | sed -e 's/^chr//'`
set base = `echo $chr | sed -e 's/_random$//'`
mkdir -p ../$base
cp -p $f ../$base
if ("$chr" == "$base") echo $chr >> ../chrom.lst
end
# OK, tack chrM on there too:
echo M >> ../chrom.lst
cd /cluster/data/galGal1
# get rid of the ZW_random and make a clean slate for Z (not Z_random)
rm -r ZW Z/chrZ.fa* Z/chrZ_? nib/chrZW_random.nib
foreach c (`cat chrom.lst`)
foreach agp ($c/chr$c{,_random}.agp)
if (-e $agp) then
set fa = $agp:r.fa
echo building $fa from $agp
agpToFa -simpleMultiMixed $agp $agp:t:r $fa \
/cluster/bluearc/galGal1/allContigs.fa
endif
end
end
# checkAgpAndFa prints out way too much info -- keep the end/stderr only:
foreach c (`cat chrom.lst`)
foreach agp ($c/chr$c{,_random}.agp)
if (-e $agp) then
set fa = $agp:r.fa
echo checking consistency of $agp and $fa
checkAgpAndFa $agp $fa | tail -1
endif
end
end
faSize */chr*.fa
#1172160385 bases (118070345 N's 1054090040 real) in 47 sequences in 47 files
#Total size: mean 24939582.7 sd 47753836.2 min 2535 (chrE50C23) max 229650211 (chrUn) median 7900282
#N count: mean 2512135.0 sd 12440150.2
# COMPARE BASE COUNTS WITH WUSTL'S (DONE 2/1/04 angie)
# Compare LaDeana's size summary file with our chrom faSize output:
cd /cluster/data/galGal1
# Download from password-protected location:
# http://genome.wustl.edu/projects/chicken/chicken_analysis/chicken.localized_per_chromosome
# Actually, we can only compare total bases with faSize because faSize
# doesn't differentiate between an N from a gap and an N base.
tail +4 chicken.localized_per_chromosome \
| perl -wpe 'next if (/^\s*$/); \
if (s/^(\w+)\s+(\d+)\s+(\d+)\s*$//) { \
($c, $noGapBP, $totalBP) = ($1, $2, $3); \
$c =~ s/Unlocalized/Un/; \
$chr = "chr$c"; \
$c =~ s/_random//; \
$faSize = "faSize $c/$chr.fa"; \
$faSize = `$faSize`; \
if ($faSize =~ /^(\d+) bases \((\d+) N.s (\d+) real/) { \
if ($totalBP != $1) { \
print "$chr totalBP: WUSTL $totalBP, us $1!\n"; \
} \
} else { print "parsed faSize wrong:\n$faSize\n"; } \
} else { print "what is this?\n$_\n"; }'
# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 1/30/04 angie)
ssh kksilo
cd /cluster/data/galGal1
foreach c (`cat chrom.lst`)
foreach agp ($c/chr$c{,_random}.agp)
if (-e $agp) then
set fa = $agp:r.fa
echo splitting $agp and $fa
cp -p $agp $agp.bak
cp -p $fa $fa.bak
splitFaIntoContigs $agp $fa . -nSize=5000000
endif
end
end
# splitFaIntoContigs makes new dirs for _randoms. Move their contents
# back into the main chrom dirs and get rid of the _random dirs.
foreach d (*_random)
set base = `echo $d | sed -e 's/_random$//'`
mv $d/lift/oOut.lst $base/lift/rOut.lst
mv $d/lift/ordered.lft $base/lift/random.lft
mv $d/lift/ordered.lst $base/lift/random.lst
rmdir $d/lift
mv $d/* $base
rmdir $d
end
# Make a "pseudo-contig" for processing chrM too:
mkdir M/chrM_1
sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
mkdir M/lift
echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
echo "chrM_1" > M/lift/ordered.lst
echo "0 M/chrM_1 16775 chrM 16775" > M/lift/ordered.lft
# MAKE JKSTUFF AND BED DIRECTORIES (DONE 1/23/04 angie)
# This used to hold scripts -- better to keep them inline here so
# they're in CVS. Now it should just hold lift file(s) and
# temporary scripts made by copy-paste from this file.
mkdir /cluster/data/galGal1/jkStuff
# This is where most tracks will be built:
mkdir /cluster/data/galGal1/bed
# CREATING DATABASE (DONE 1/28/04 angie)
ssh hgwdev
echo 'create database galGal1' | hgsql ''
# Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
# /dev/sdc1 1.8T 221G 1.5T 14% /var/lib/mysql
# CREATING GRP TABLE FOR TRACK GROUPING (DONE 1/28/04 angie)
ssh hgwdev
echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
| hgsql galGal1
# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS (DONE 1/30/04 angie)
# Make nib/, unmasked until RepeatMasker and TRF steps are done.
# Do this now so we can load up RepeatMasker and run featureBits;
# can also load up other tables that don't depend on masking.
ssh kksilo
cd /cluster/data/galGal1
mkdir nib
foreach c (`cat chrom.lst`)
foreach f ($c/chr${c}{,_random}.fa)
if (-e $f) then
echo "nibbing $f"
/cluster/bin/i386/faToNib $f nib/$f:t:r.nib
endif
end
end
# Make symbolic links from /gbdb/galGal1/nib to the real nibs.
ssh hgwdev
mkdir -p /gbdb/galGal1/nib
foreach f (/cluster/data/galGal1/nib/chr*.nib)
ln -s $f /gbdb/galGal1/nib
end
# Load /gbdb/galGal1/nib paths into database and save size info.
cd /cluster/data/galGal1
hgsql galGal1 < $HOME/kent/src/hg/lib/chromInfo.sql
hgNibSeq -preMadeNib galGal1 /gbdb/galGal1/nib */chr*.fa
echo "select chrom,size from chromInfo" | hgsql -N galGal1 > chrom.sizes
# take a look at chrom.sizes, should be 47 lines
wc chrom.sizes
# REPEAT MASKING (DONE 1/31/04 angie)
#- Split contigs into 500kb chunks, at gaps if possible:
ssh kksilo
cd /cluster/data/galGal1
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}*_?{,?})
cd $d
echo "splitting $d"
set contig = $d:t
~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
cd ../..
end
end
#- Make the run directory and job list:
cd /cluster/data/galGal1
cat << '_EOF_' > jkStuff/RMChicken
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/galGal1/$2
/bin/cp $2 /tmp/galGal1/$2/
cd /tmp/galGal1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec chicken $2
popd
/bin/cp /tmp/galGal1/$2/$2.out ./
if (-e /tmp/galGal1/$2/$2.align) /bin/cp /tmp/galGal1/$2/$2.align ./
if (-e /tmp/galGal1/$2/$2.tbl) /bin/cp /tmp/galGal1/$2/$2.tbl ./
if (-e /tmp/galGal1/$2/$2.cat) /bin/cp /tmp/galGal1/$2/$2.cat ./
/bin/rm -fr /tmp/galGal1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/galGal1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/galGal1
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMChicken
mkdir RMRun
rm -f RMRun/RMJobs
touch RMRun/RMJobs
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}{,_random}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/galGal1/jkStuff/RMChicken \
/cluster/data/galGal1/$d $f \
'{'check out line+ /cluster/data/galGal1/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
end
#- Do the run
ssh kk
cd /cluster/data/galGal1/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
#Completed: 2783 of 2783 jobs
#Average job time: 1448s 24.13m 0.40h 0.02d
#Longest job: 3201s 53.35m 0.89h 0.04d
#Submission to last job: 5738s 95.63m 1.59h 0.07d
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kksilo
cd /cluster/data/galGal1
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
> /dev/null
end
#- Lift pseudo-contigs to chromosome level
foreach c (`cat chrom.lst`)
echo lifting $c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
> /dev/null
endif
if (-e lift/random.lft && ! -z lift/random.lft) then
liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \
> /dev/null
endif
cd ..
end
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/galGal1
hgLoadOut galGal1 */chr*.fa.out
# VERIFY REPEATMASKER RESULTS (DONE 1/30/04 angie)
# Eyeball some repeat annotations in the browser, compare to lib seqs.
# Run featureBits on galGal1 and on a comparable genome build, and compare:
ssh hgwdev
featureBits galGal1 rmsk
#103553237 bases of 1054197620 (9.823%) in intersection
# No comparable species -- in conf call 01/21/04, Arian said about
# 8-10% should be covered.
# MAKE LIFTALL.LFT (DONE 1/30/04 angie)
ssh kksilo
cd /cluster/data/galGal1
cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft
# PUT UNMASKED CHUNKS ON BLUEARC (DONE 1/30/04 angie)
# Some of Terry's staging scripts require a flat dir containing
# all contigs; also, doesn't hurt to set aside unmasked seq.
ssh kksilo
cd /cluster/data/galGal1
mkdir /cluster/bluearc/galGal1/unmaskedChunks
foreach d (*/chr*_?{,?})
cp -p $d/${d:t}.fa /cluster/bluearc/galGal1/unmaskedChunks
end
# MAKE STS MARKERS TRACK (IN PROGRESS 1/30/04 angie -- need more info from Terry)
# Doing a dry run to see if I can duplicate Terry's mapping of
# markers to the prelim release.
set base = /cluster/data/galGal1/bed/markers
mkdir -p $base/primers $base/sts
# Need to find out how Terry built these:
cp -p /cluster/bluearc/gg1/markers/sts/convert.pl $base
cp -p /cluster/bluearc/gg1/markers/stsAlias.bed $base
cp -p /cluster/bluearc/gg1/markers/stsInfoGG.bed $base
cp -p /cluster/bluearc/gg1/10.ooc $base
# Need to find out where Terry got these files:
cp -p /cluster/bluearc/gg1/markers/primers/all.primers{,.fa} $base/primers
cp -p /cluster/bluearc/gg1/markers/sts/all.sequence.fa $base/sts
# Run e-PCR on primers
mkdir $base/primers/epcr
cd $base/primers/epcr
mkdir epcr.out
/cluster/bin/scripts/createCloneList \
/cluster/bluearc/galGal1/unmaskedChunks
# That script creates in.lst/$n.in files, where $n is
# the contigs number *plus 1*! in.lst/0.in is empty and makes
# para create complain, so get rid of it.
rm in.lst/0.in
egrep -v '/0.in$' files.lst > tmp.lst; mv tmp.lst files.lst
echo $base/primers/all.primers > epcr.lst
cat << '_EOF_' > template
#LOOP
/cluster/bin/scripts/runEpcr {check in line+ $(path1)} {check in line+ $(path2)} {check out exists epcr.out/$(root2).epcr}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
/cluster/bin/i386/gensub2 epcr.lst files.lst template jobList
ssh kk
set base = /cluster/data/galGal1/bed/markers
cd $base/primers/epcr
para create jobList
para try, check, push, check, ...
#Completed: 251 of 251 jobs
#Average job time: 30s 0.50m 0.01h 0.00d
#Longest job: 45s 0.75m 0.01h 0.00d
#Submission to last job: 46s 0.77m 0.01h 0.00d
cat `ls -1 epcr.out/* | sort -g` > all.epcr
# blat primers -- with old blat.2 !
mkdir $base/primers/blat
cd $base/primers/blat
mkdir primers.out
ls -1S /cluster/bluearc/galGal1/unmaskedChunks/*.fa > contigs.lst
echo $base/primers/all.primers.fa > primers.lst
cat << '_EOF_' > template
#LOOP
/cluster/home/kent/bin/i386/blat.2 -tileSize=10 -ooc=../../10.ooc -minMatch=1 -minScore=1 -minIdentity=75 {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ primers.out/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
/cluster/bin/i386/gensub2 contigs.lst primers.lst template jobList
ssh kk
set base = /cluster/data/galGal1/bed/markers
cd $base/primers/blat
para create jobList
para try, check, push, check, ...
#Completed: 261 of 261 jobs
#Average job time: 18s 0.30m 0.00h 0.00d
#Longest job: 27s 0.45m 0.01h 0.00d
#Submission to last job: 28s 0.47m 0.01h 0.00d
/cluster/bin/i386/pslSort dirs primers.psl temp primers.out
rm -r temp
# Make primers.final from e-PCR and blat results:
cd $base/primers
filterPrimers -chicken ../stsInfoGG.bed blat/primers.psl all.primers \
epcr/all.epcr > primers.psl.filter
extractPslInfo primers.psl.filter
mv primers.psl.filter.initial primers.initial
sort -k 4n primers.initial > primers.final
# blat sts sequences
mkdir $base/sts/blat
cd $base/sts/blat
mkdir stsMarkers.out
ls -1S /cluster/bluearc/galGal1/unmaskedChunks/*.fa > contigs.lst
echo $base/sts/all.sequence.fa > stsMarkers.lst
cat << '_EOF_' > template
#LOOP
/cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ stsMarkers.out/$(root1).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
/cluster/bin/i386/gensub2 contigs.lst stsMarkers.lst template jobList
ssh kk
set base = /cluster/data/galGal1/bed/markers
cd $base/sts/blat
para create jobList
para try, check, push, check, ...
#Completed: 261 of 261 jobs
#Average job time: 96s 1.59m 0.03h 0.00d
#Longest job: 199s 3.32m 0.06h 0.00d
#Submission to last job: 199s 3.32m 0.06h 0.00d
/cluster/bin/i386/pslSort dirs raw.psl temp stsMarkers.out
/cluster/bin/i386/pslReps -nearTop=0.01 -minCover=0.6 -minAli=0.8 \
-noIntrons raw.psl stsMarkers.psl /dev/null
rm -rf temp
rm -f raw.psl
cd $base/sts
filterPsl blat/stsMarkers.psl
extractUniqPsl stsMarkers.psl.filter stsMarkers.psl.filter
pslReps stsMarkers.psl.filter.dup.psl stsMarkers.psl.filter.1 /dev/null \
-noIntrons -minCover=0.80 -minAli=0.90 -nearTop=0.05
mv stsMarkers.psl.filter stsMarkers.psl.filter.orig
cat stsMarkers.psl.filter.1 stsMarkers.psl.filter.uniq.psl \
> stsMarkers.psl.filter
extractPslInfo -h stsMarkers.psl.filter
rm stsMarkers.psl.filter.1*
rm stsMarkers.psl.filter.dup.psl stsMarkers.psl.filter.names \
stsMarkers.psl.filter.uniq.psl
extractPslInfo -h stsMarkers.psl.filter
mv stsMarkers.psl.filter.initial stsMarkers.initial
sort -k 4n stsMarkers.initial > stsMarkers.final.orig
../convert.pl ../stsAlias.bed stsMarkers.final.orig > stsMarkers.final
cd $base
combineSeqPrimerPos $base/sts/stsMarkers.final \
$base/primers/primers.final > stsMarkers_pos.rdb
# This step needs work -- output is 0-length
createSTSbed stsInfoGG.bed stsMarkers_pos.rdb > stsMap.bed
ssh hgwdev
cd /cluster/data/galGal1/bed/markers
# Make an stsMarkersTmp so we can at least see the locations
awk '{printf "%s\t%s\t%s\t%s\t%d\n", $1, $2, $3, $4, $5 * 1000};' \
sts/stsMarkers.final.orig \
| liftUp -type=.bed stsMarkersTmp.bed ../../jkStuff/liftAll.lft warn stdin
hgLoadBed galGal1 stsMarkerTmp stsMarkersTmp.bed
# lift & load all_sts_primer
liftUp primers.filter.lifted.psl ../../jkStuff/liftAll.lft warn \
primers/primers.psl.filter
hgLoadPsl -table=all_sts_primer -fastLoad \
galGal1 primers.filter.lifted.psl
# lift & load all_sts_seq
liftUp stsMarkers.filter.lifted.psl ../../jkStuff/liftAll.lft warn \
sts/stsMarkers.psl.filter
hgLoadPsl -table=all_sts_seq -fastLoad \
galGal1 stsMarkers.filter.lifted.psl
# load sequences
mkdir /gbdb/galGal1/sts
ln -s /cluster/data/galGal1/bed/markers/sts/all.sequence.fa \
/gbdb/galGal1/sts/
ln -s /cluster/data/galGal1/bed/markers/primers/all.primers.fa \
/gbdb/galGal1/sts/
hgLoadSeq galGal1 /gbdb/galGal1/sts/*.fa
# BACENDS (DONE 1/31/04 angie)
ssh kksilo
mkdir /cluster/data/galGal1/bed/bacends
cd /cluster/data/galGal1/bed/bacends
# download
wget ftp://ftp.ncbi.nih.gov/genomes/BACENDS/gallus_gallus/AllBACends.mfa.gz
wget ftp://ftp.ncbi.nih.gov/genomes/BACENDS/gallus_gallus/cl_acc_gi_len.gz
gunzip *.gz
/cluster/bin/scripts/convertBacEndPairInfo cl_acc_gi_len >& pairInfo.log
# Next time around, don't keep the version number, so that the
# bacends.fa accs will match the cl_acc_gi_len accs. Use this
# replacement instead: 's/^>gi\|\d+\|gb\|(\w+)\.\d+\|.*/>$1/'
perl -wpe 's/^>gi\|\d+\|gb\|(\w+\.\d+)\|.*/>$1/' AllBACends.mfa \
> bacends.fa
faSize bacends.fa
#146921771 bases (2724913 N's 144196858 real) in 135555 sequences in 1 files
#Total size: mean 1083.9 sd 184.7 min 64 (CC322877.1) max 2431 (CC263290.1) median 1095
#N count: mean 20.1 sd 73.1
mkdir /cluster/bluearc/galGal1/bacends
faSplit sequence bacends.fa 20 /cluster/bluearc/galGal1/bacends/bacends
ls -1S /cluster/bluearc/galGal1/bacends/*.fa > bacends.lst
ls -1S /cluster/bluearc/galGal1/unmaskedChunks/*.fa > contigs.lst
cat << '_EOF_' > template
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -tileSize=10 -ooc=../markers/10.ooc {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
mkdir psl
gensub2 contigs.lst bacends.lst template jobList
ssh kk
cd /cluster/data/galGal1/bed/bacends
para create jobList
para try, check, push, check, ...
#Completed: 5220 of 5220 jobs
#Average job time: 105s 1.75m 0.03h 0.00d
#Longest job: 2431s 40.52m 0.68h 0.03d
#Submission to last job: 2608s 43.47m 0.72h 0.03d
# back on kksilo, filter and lift results:
pslSort dirs raw.psl temp psl
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons raw.psl \
bacEnds.psl /dev/null
liftUp bacEnds.lifted.psl ../../jkStuff/liftAll.lft warn bacEnds.psl
# pslSort ??
rm -r temp
rm raw.psl
extractUniqPsl bacEnds.lifted.psl bacEnds.psl.filter
pslReps bacEnds.psl.filter.dup.psl bacEnds.psl.filter.1 /dev/null \
-noIntrons -minCover=0.80 -minAli=0.90 -nearTop=0.05
extractUniqPsl bacEnds.psl.filter.1 bacEnds.psl.filter.1
rm bacEnds.psl.filter.1
pslReps bacEnds.psl.filter.1.dup.psl bacEnds.psl.filter.2 /dev/null \
-noIntrons -minCover=0.90 -minAli=0.95 -nearTop=0.01
extractUniqPsl bacEnds.psl.filter.2 bacEnds.psl.filter.2
rm bacEnds.psl.filter.2
pslReps bacEnds.psl.filter.2.dup.psl bacEnds.psl.filter.3 /dev/null \
-noIntrons -minCover=0.95 -minAli=0.98 -nearTop=0.01
cat bacEnds.psl.filter.3 \
bacEnds.psl.filter.uniq.psl \
bacEnds.psl.filter.1.uniq.psl \
bacEnds.psl.filter.2.uniq.psl \
> bacEnds.psl.filter.lifted
rm *.names bacEnds.psl.filter.3 bacEnds.psl.filter.uniq.psl \
bacEnds.psl.filter.1.uniq.psl bacEnds.psl.filter.2.uniq.psl
extractPslInfo -h -be bacEnds.psl.filter.lifted
mv bacEnds.psl.filter.lifted.initial bacEnds.initial
# Oops, should have stripped the version from bacends.fa acc's so
# they would match cl_acc_gi_len accs!
perl -wpe 's/^(\w+\s+\d+\s+\d+\s+\w+)\.\d+/$1/' bacEnds.initial \
> bacEndsNoVers.initial
mv bacEndsNoVers.initial bacEnds.initial
findAccession -agp -chicken bacEnds.initial /cluster/data/galGal1
# rm bacEnds.initial
sort -k 4 bacEnds.initial.acc > bacEnds.sort
compileBacEnds bacEnds.sort bacEndPairs.txt bacEndSingles.txt \
> bacEnds.temp
sorttbl chrom chromStart < bacEnds.temp > bacEnds.rdb
rm bacEnds.initial.acc bacEnds.sort bacEnds.temp
createBACpairs bacEnds.rdb bacEndPairs.txt > temp.rdb
sorttbl chrom1 -r ord1 chromStart1 < temp.rdb > bacEnds.pairs.rdb
rm temp.rdb
createBACpairs -bed bacEnds.rdb bacEndPairs.txt > bacEndPairs.rdb
sorttbl chrom chromStart < bacEndPairs.rdb \
| headchg -del > bacEndPairs.bed
rm bacEndPairs.rdb
createBacPairsBad -bed bacEnds.rdb bacEndPairs.txt > bacEndPairsBad.rdb
sorttbl chrom chromStart < bacEndPairsBad.rdb \
| headchg -del > bacEndPairsBad.bed
rm bacEndPairsBad.rdb
sorttbl chrom chromStart < bacEndPairsLong.rdb \
| headchg -del > bacEndPairsLong.bed
rm bacEndPairsLong.rdb
# This made something 0-length, but I'm not going to pursue for now:
findMissingEnd -max 500000 singleBac.txt bacEnds.psl.filter.2.dup.psl \
bacEnds.psl.filter.1.dup.psl bacEnds.psl.filter.dup.psl > bacEndsMiss.psl
mv bacEnds.psl.filter.lifted bacEnds.psl.filter.lifted.orig
cat bacEnds.psl.filter.lifted.orig bacEndsMiss.psl \
> bacEnds.psl.filter.lifted
rm bacEnds.psl.filter.2.dup.psl bacEnds.psl.filter.1.dup.psl \
bacEnds.psl.filter.dup.psl
# Doh... again, having to strip those damn version numbers:
perl -wpe \
'@w=split(/\t/); if (defined $w[9]){ $w[9] =~ s/(\w+)\.\d+/$1/; } \
$_=join("\t",@w)' \
bacEnds.psl.filter.lifted > tmp
mv tmp bacEnds.psl.filter.lifted
extractPslLoad bacEnds.psl.filter.lifted bacEndPairs.bed \
bacEndPairsBad.bed bacEndPairsLong.bed > bacEnds.psl.rdb
sorttbl tname tstart < bacEnds.psl.rdb | headchg -del > bacEnds.load.psl
rm bacEnds.psl.rdb
# OK, now before loading sequences, we really have to fix bacends.fa:
perl -wpe 's/^>(\w+)\.\d+/>$1/' bacends.fa \
> tmp
mv tmp bacends.fa
# Load seq's and alignments
ssh hgwdev
cd /cluster/data/galGal1/bed/bacends
mkdir /gbdb/galGal1/bacends
ln -s /cluster/data/galGal1/bed/bacends/bacends.fa /gbdb/galGal1/bacends
hgLoadSeq galGal1 /gbdb/galGal1/bacends/bacends.fa
hgLoadPsl -table=all_bacends -fastLoad \
galGal1 /cluster/data/galGal1/bed/bacends/bacEnds.load.psl
# Don't know which two rows prompted this... but all seem to have loaded.
#load of all_bacends did not go as planned: 76325 record(s), 0 row(s) skipped, 2 warning(s) loading psl.tab
~/bin/i386/hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql \
-hasBin galGal1 bacEndPairs bacEndPairs.bed
sed -e 's/bacEndPairs/bacEndPairsBad/g' \
$HOME/kent/src/hg/lib/bacEndPairs.sql > bacEndPairsBad.sql
~/bin/i386/hgLoadBed -sqlTable=bacEndPairsBad.sql \
-hasBin galGal1 bacEndPairsBad bacEndPairsBad.bed
# Doh! 12 lines of that have negative starts. Well, trim those
# for now and load the rest, then go back and find the neg-start
# cause...
egrep -v ' -[0-9]+' bacEndPairsBad.bed > bacEndPairsBadFudge.bed
~/bin/i386/hgLoadBed -sqlTable=bacEndPairsBad.sql \
-hasBin galGal1 bacEndPairsBad bacEndPairsBadFudge.bed
sed -e 's/bacEndPairs/bacEndPairsLong/g' \
$HOME/kent/src/hg/lib/bacEndPairs.sql > bacEndPairsLong.sql
~/bin/i386/hgLoadBed -sqlTable=bacEndPairsLong.sql \
-hasBin galGal1 bacEndPairsLong bacEndPairsLong.bed
# SIMPLE REPEATS (TRF) (DONE 1/30/04 angie)
# TRF runs pretty quickly now... it takes a few hours total runtime,
# so instead of binrsyncing and para-running, just do this on the
# local fileserver
ssh kksilo
mkdir /cluster/data/galGal1/bed/simpleRepeat
cd /cluster/data/galGal1/bed/simpleRepeat
mkdir trf
cp /dev/null jobs.csh
foreach d (/cluster/data/galGal1/*/chr*_?{,?})
set ctg = $d:t
foreach f ($d/${ctg}.fa)
set fout = $f:t:r.bed
echo $fout
echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
>> jobs.csh
end
end
csh jobs.csh >&! jobs.log &
# check on this with
tail -f jobs.log
wc -l jobs.csh
ls -1 trf | wc -l
# When job is done do:
liftUp simpleRepeat.bed /cluster/data/galGal1/jkStuff/liftAll.lft warn \
trf/*.bed
# Load into the database:
ssh hgwdev
hgLoadBed galGal1 simpleRepeat \
/cluster/data/galGal1/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
featureBits galGal1 simpleRepeat
# 8436953 bases of 1054197620 (0.800%) in intersection
# Seems awful low even though chicken is supposed to be a
# low-repeat-density animal. Run it by someone...
# PROCESS SIMPLE REPEATS INTO MASK (DONE 1/30/04/ angie)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kksilo
cd /cluster/data/galGal1/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/chr*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/galGal1
mkdir bed/simpleRepeat/trfMaskChrom
foreach c (`cat chrom.lst`)
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# Here's the coverage for the filtered TRF:
ssh hgwdev
cat /cluster/data/galGal1/bed/simpleRepeat/trfMaskChrom/*.bed \
> /tmp/filtTrf.bed
featureBits galGal1 /tmp/filtTrf.bed
# 4512560 bases of 1054197620 (0.428%) in intersection
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 1/31/04 angie)
ssh kksilo
cd /cluster/data/galGal1
# Soft-mask (lower-case) the contig and chr .fa's,
# then make hard-masked versions from the soft-masked.
set trfCtg=bed/simpleRepeat/trfMask
set trfChr=bed/simpleRepeat/trfMaskChrom
foreach f (*/chr*.fa)
echo "repeat- and trf-masking $f"
maskOutFa -soft $f $f.out $f
set chr = $f:t:r
maskOutFa -softAdd $f $trfChr/$chr.bed $f
echo "hard-masking $f"
maskOutFa $f hard $f.masked
end
foreach c (`cat chrom.lst`)
echo "repeat- and trf-masking contigs of chr$c, chr${c}_random"
foreach d ($c/chr*_?{,?})
set ctg=$d:t
set f=$d/$ctg.fa
maskOutFa -soft $f $f.out $f
maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
maskOutFa $f hard $f.masked
end
end
#- Rebuild the nib files, using the soft masking in the fa:
mkdir nib
foreach f (*/chr*.fa)
faToNib -softMask $f nib/$f:t:r.nib
end
# GOLD AND GAP TRACKS (DONE 1/30/04 angie)
ssh hgwdev
cd /cluster/data/galGal1
hgGoldGapGl -noGl -chromLst=chrom.lst galGal1 /cluster/data/galGal1 .
# featureBits fails if there's no chrM_gap, so make one:
# echo "create table chrM_gap like chr1_gap" | hgsql galGal1
# oops, that won't work until v4.1, so do this for the time being:
echo "create table chrM_gap select * from chr1_gap where 0=1" \
| hgsql galGal1
# MAKE GCPERCENT (DONE 1/30/04 angie -- DROPPED 2/4/04, replaced by gc20kBase)
ssh hgwdev
mkdir /cluster/data/galGal1/bed/gcPercent
cd /cluster/data/galGal1/bed/gcPercent
hgsql galGal1 < $HOME/kent/src/hg/lib/gcPercent.sql
hgGcPercent galGal1 ../../nib
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR GALGAL1 (DONE 1/28/04 angie)
ssh hgwdev
# Add dbDb and defaultDb entries:
echo 'insert into dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk) \
values("galGal1", "Jan. 2004", \
"/gbdb/galGal1/nib", "Chicken", "chr13:13124250-13128200", 1, \
35, "Chicken", "Gallus gallus", \
"/gbdb/galGal1/html/description.html", 0);' \
| hgsql -h genome-testdb hgcentraltest
echo 'insert into defaultDb values("Chicken", "galGal1");' \
| hgsql -h genome-testdb hgcentraltest
# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd $HOME/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add galGal1 in all the right places and do
make update
make alpha
cvs commit makefile
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR GALGAL1 (TODO)
ssh hgwdev
echo 'insert into blatServers values("galGal1", "blat?", "17778", "1"); \
insert into blatServers values("galGal1", "blat?", "17779", "0");' \
| hgsql -h genome-testdb hgcentraltest
# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE 1/28/04 angie)
ssh hgwdev
mkdir /gbdb/galGal1/html
# Write ~/kent/src/hg/makeDb/trackDb/chicken/galGal1/description.html
# with a description of the assembly and some sample position queries.
chmod a+r $HOME/kent/src/hg/makeDb/trackDb/chicken/galGal1/description.html
# Check it in and copy (ideally using "make alpha" in trackDb) to
# /gbdb/galGal1/html
# MAKE DOWNLOADABLE SEQUENCE FILES (TODO)
ssh kksilo
cd /cluster/data/galGal1
#- Build the .zip files
cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
zip -j zip/chromAgp.zip */chr*.agp
zip -j zip/chromOut.zip */chr*.fa.out
zip -j zip/chromFa.zip */chr*.fa
zip -j zip/chromFaMasked.zip */chr*.fa.masked
cd bed/simpleRepeat
zip ../../zip/chromTrf.zip trfMaskChrom/chr*.bed
cd ../..
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=galGal1 -native GenBank mrna \
/cluster/data/galGal1/zip/mrna.fa
cd /cluster/data/galGal1/zip
zip -j mrna.zip mrna.fa
'_EOF_'
# << this line makes emacs coloring happy
csh ./jkStuff/zipAll.csh |& tee zip/zipAll.log
cd zip
#- Look at zipAll.log to make sure all file lists look reasonable.
#- Check zip file integrity:
foreach f (*.zip)
unzip -t $f > $f.test
tail -1 $f.test
end
wc -l *.zip.test
#- Copy the .zip files to hgwdev:/usr/local/apache/...
ssh hgwdev
cd /cluster/data/galGal1/zip
set gp = /usr/local/apache/htdocs/goldenPath/galGal1
mkdir -p $gp/chromosomes
foreach f ( ../*/chr*.fa )
zip -j $gp/chromosomes/$f:t.zip $f
end
mkdir -p $gp/bigZips
cp -p *.zip $gp/bigZips
cd $gp
# Take a look at bigZips/* and chromosomes/*, update their README.txt's
# Can't make refGene upstream sequence files - no refSeq for chicken.
# Maybe ensGene when we get that.
# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 1/31/04 angie)
ssh kksilo
# Chrom-level mixed nibs that have been repeat- and trf-masked:
rm -rf /cluster/bluearc/galGal1/nib
mkdir /cluster/bluearc/galGal1/nib
cp -p nib/chr*.nib /cluster/bluearc/galGal1/nib
# Pseudo-contig fa that have been repeat- and trf-masked:
rm -rf /cluster/bluearc/galGal1/trfFa
mkdir /cluster/bluearc/galGal1/trfFa
foreach d (*/chr*_?{,?})
cp $d/$d:t.fa /cluster/bluearc/galGal1/trfFa
end
# MAKE CHICKEN LINEAGE-SPECIFIC REPEATS (DONE 2/18/04 angie)
# In an email 2/13/04, Arian said we could treat all human repeats as
# lineage-specific, but could exclude these from chicken as ancestral:
# L3, L3a, L3b, MIR, MIR3, MIRb, MIRm
ssh kksilo
cd /cluster/data/galGal1
mkdir /cluster/bluearc/galGal1/linSpecRep
foreach f (*/chr*.fa.out)
awk '$10 !~/^(L3|L3a|L3b|MIR|MIR3|MIRb|MIRm)$/ {print;}' $f \
> /cluster/bluearc/galGal1/linSpecRep/$f:t:r:r.out.spec
end
# SWAP BLASTZ HUMAN-CHICKEN TO CHICKEN-HUMAN (HG16) (DONE 2/19/04 angie)
ssh kolossus
mkdir /cluster/data/galGal1/bed/blastz.hg16.swap.2004-02-18
cd /cluster/data/galGal1/bed/blastz.hg16.swap.2004-02-18
set aliDir = /cluster/data/hg16/bed/blastz.galGal1.2004-02-18
cp $aliDir/S1.len S2.len
cp $aliDir/S2.len S1.len
mkdir unsorted axtChrom
cat $aliDir/axtChrom/chr*.axt \
| axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
| axtSplitByTarget stdin unsorted
# Sort the shuffled .axt files.
foreach f (unsorted/*.axt)
echo sorting $f:t:r
axtSort $f axtChrom/$f:t
end
du -sh $aliDir/axtChrom unsorted axtChrom
#1.4G /cluster/data/hg16/bed/blastz.galGal1.2004-02-18/axtChrom
#1.4G unsorted
#1.4G axtChrom
rm -r unsorted
# CHAIN HUMAN BLASTZ (DONE 2/19/04 angie)
# Run axtChain on little cluster
ssh kkr1u00
cd /cluster/data/galGal1/bed/blastz.hg16.swap.2004-02-18
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/galGal1/bed/blastz.hg16.swap.2004-02-18/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Make our own linear gap file with reduced gap penalties,
# in hopes of getting longer chains:
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtFilter -notQ=chrUn_random $1 \
| ~/bin/i386/axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../chickenHumanTuned.gap \
-minScore=5000 stdin \
/cluster/bluearc/galGal1/nib \
/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 45 of 45 jobs
#Average job time: 292s 4.86m 0.08h 0.00d
#Longest job: 906s 15.10m 0.25h 0.01d
#Submission to last job: 2233s 37.22m 0.62h 0.03d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/galGal1/bed/blastz.hg16.swap.2004-02-18/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=10000 /tmp/score.$f:t:r
echo ""
end
# Load chains into database
ssh hgwdev
cd /cluster/data/galGal1/bed/blastz.hg16.swap.2004-02-18/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain galGal1 ${c}_chainHg16 $i
end
# NET HUMAN BLASTZ (DONE 2/19/04 angie)
ssh kksilo
cd /cluster/data/galGal1/bed/blastz.hg16.swap.2004-02-18/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/galGal1/bed/blastz.hg16.swap.2004-02-18/axtChain
netClass noClass.net galGal1 hg16 human.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/galGal1/bed/blastz.hg16.swap.2004-02-18/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/galGal1/bed/blastz.hg16.swap.2004-02-18/axtChain
netFilter -minGap=10 human.net | hgLoadNet galGal1 netHg16 stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet galGal1 netSyntenyHg16 stdin
# Add entries for chaing16, netHg16, syntenyHg16 to chicken/galGal1 trackDb
# RUN AXTBEST ON CHICKEN/HUMAN (DONE 2/19/04 angie)
ssh kolossus
cd /cluster/data/galGal1/bed/blastz.hg16.swap.2004-02-18
mkdir axtBest pslBest
foreach f (axtChrom/chr*.axt)
set chr=$f:t:r
echo axtBesting $chr
axtBest $f $chr axtBest/$chr.axt -minScore=300
axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/$chr.psl
end
ssh hgwdev
cd /cluster/data/galGal1/bed/blastz.hg16.swap.2004-02-18
cat pslBest/chr*.psl | hgLoadPsl -table=blastzBestHg16 galGal1 stdin
# BLASTZ SELF (DONE 2/23/04 angie)
# lavToAxt has a new -dropSelf option that *will* replace axtDropOverlap,
# once I fix a bug in it...
ssh kk
mkdir -p /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19
cd /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19
# Abridge repeats (linSpecRep relative to human but should be fine right?)
cat << '_EOF_' > DEF
# chicken vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Chicken
SEQ1_DIR=/cluster/bluearc/galGal1/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/cluster/bluearc/galGal1/linSpecRep
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Human
SEQ2_DIR=/cluster/bluearc/galGal1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/galGal1/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/data/galGal1/bed/blastz.galGal1.2004-02-19
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place
chmod +x DEF
cp DEF ~angie/hummus/DEF.galGal1-galGal1
# source the DEF file
bash
. ./DEF
mkdir -p $BASE/run
~angie/hummus/make-joblist $DEF > $BASE/run/j
sh $BASE/xdir.sh
cd $BASE/run
# now edit j to prefix path to executable name
sed -e 's#^#/cluster/bin/penn/#' j > j2
wc -l j*
head j2
# make sure the j2 edits are OK, then use it:
mv j2 j
para create j
para try, check, push, check, ...
#Completed: 21025 of 21025 jobs
#Average job time: 400s 6.67m 0.11h 0.00d
#Longest job: 1473s 24.55m 0.41h 0.02d
#Submission to last job: 9966s 166.10m 2.77h 0.12d
# --- normalize (PennSpeak for lift)
ssh kkr1u00
cd /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19
# run bash shell if not running it already
source DEF
mkdir -p $BASE/run.1
mkdir -p $BASE/lav
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
> run.1/jobList
cd run.1
wc -l jobList
head jobList
# make sure the job list is OK
para create jobList
para push
#Completed: 145 of 145 jobs
#Average job time: 46s 0.77m 0.01h 0.00d
#Longest job: 328s 5.47m 0.09h 0.00d
#Submission to last job: 1192s 19.87m 0.33h 0.01d
# convert lav files to axt
ssh kkr1u00
cd /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
# Use blastz-self-chromlav2axt, which includes axtDropOverlap in
# the pipe (that helps to keep axtSort from barfing).
# usage: blastz-self-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-self-chromlav2axt /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19/lav/$(root1) {check out line+ /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19/axtChrom/$(root1).axt} /cluster/bluearc/galGal1/nib /cluster/bluearc/galGal1/nib /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19/S1.len /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19/S2.len
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
\ls -1S /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19/lav > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
para try, check, push, check,...
#Completed: 46 of 47 jobs
#Crashed: 1 jobs
#Average job time: 34s 0.56m 0.01h 0.00d
#Longest job: 294s 4.90m 0.08h 0.00d
#Submission to last job: 668s 11.13m 0.19h 0.01d
# chrUn crashed -- run on kolossus, in two passes!
# actually, it was a segv in lavToAxt (fixed), but this works anyway:
ssh kolossus
cd /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19
set base=`pwd`
set seq1_dir=/cluster/data/galGal1/nib
set seq2_dir=/cluster/data/galGal1/nib
foreach chr (chrUn)
set out=axtChrom/$chr.axt
echo "Translating $chr lav to $out"
foreach d (lav/$chr/*.lav)
set smallout=$d.axt
echo $smallout
~/bin/$MACHTYPE/lavToAxt $d $seq1_dir $seq2_dir stdout \
| axtDropOverlap stdin $base/S1.len $base/S2.len stdout \
| axtSort stdin $smallout
end
cat `ls -1 lav/$chr/*.lav.axt | sort -g` | axtSort stdin $out
end
# CHAIN SELF BLASTZ (TODO 2/19/04 angie)
# Run axtChain on little cluster
ssh kkr1u00
cd /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtFilter -notQ=chrUn $1 \
| ~/bin/i386/axtChain stdin \
/cluster/bluearc/galGal1/nib \
/cluster/bluearc/galGal1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 47 of 47 jobs
#Average job time: 140s 2.33m 0.04h 0.00d
#Longest job: 766s 12.77m 0.21h 0.01d
#Submission to last job: 766s 12.77m 0.21h 0.01d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# trim to minScore=20000
mkdir chainFilt
foreach f (chain/*.chain)
chainFilter -minScore=20000 $f > chainFilt/$f:t
end
# Load chains into database
ssh hgwdev
cd /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19/axtChain/chainFilt
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain galGal1 ${c}_chainSelf $i
end
# NET SELF BLASTZ (TODO 2/19/04 angie)
ssh kksilo
cd /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19/axtChain
netClass -noAr noClass.net galGal1 galGal1 self.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19/axtChain
rm noClass.net
netFilter -syn self.net > selfSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19/axtChain
netFilter -minGap=10 self.net | hgLoadNet galGal1 netSelf stdin
netFilter -minGap=10 selfSyn.net | hgLoadNet galGal1 netSyntenySelf stdin
# Add entries for chainSelf, netSelf, netSyntenySelf to
# chicken/galGal1 trackDb
# RUN AXTBEST ON SELF (TODO 2/19/04 angie)
# chain/net seem kinda sparse, let's try axtBest...
ssh kolossus
cd /cluster/data/galGal1/bed/blastz.galGal1.2004-02-19
mkdir axtBest pslBest
foreach f (axtChrom/chr*.axt)
set chr=$f:t:r
echo axtBesting $chr
axtBest $f $chr axtBest/$chr.axt -minScore=300
axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/$chr.psl
end
# MAKE 11.OOC FILE FOR BLAT (DONE 2/1/04 angie)
ssh kksilo
mkdir /cluster/data/galGal1/bed/ooc
cd /cluster/data/galGal1/bed/ooc
ls -1 /cluster/bluearc/galGal1/nib/chr*.nib > nib.lst
blat nib.lst /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/galGal1/11.ooc -repMatch=1024
#Wrote 1053 overused 11-mers to /cluster/bluearc/galGal1/11.ooc
# AUTO UPDATE GENBANK MRNA RUN (DONE 2/1/04 angie)
# Update genbank config and source in CVS:
cd ~/kent/src/hg/makeDb/genbank
cvsup .
# See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
# edits, check them in if necessary:
diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf
# Edit etc/genbank.conf and add these lines:
# galGal1 (chicken)
galGal1.genome = /cluster/bluearc/galGal1/nib/chr*.nib
galGal1.lift = /cluster/data/galGal1/jkStuff/liftAll.lft
galGal1.refseq.mrna.native.load = no
galGal1.genbank.mrna.xeno.load = yes
galGal1.genbank.est.xeno.load = no
galGal1.downloadDir = galGal1
cvs ci etc/genbank.conf
# Since chicken is a new species for us, edit src/lib/gbGenome.c.
# Pick some other browser species, & monkey-see monkey-do.
cvs diff src/lib/gbGenome.c
make
cvs ci src/lib/gbGenome.c
# Edit src/align/gbBlat to add /cluster/bluearc/galGal1/11.ooc
cvs diff src/align/gbBlat
make
cvs ci src/align/gbBlat
# Install to /cluster/data/genbank:
make install-server
ssh eieio
cd /cluster/data/genbank
# This is an -initial run, mRNA only:
nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \
-srcDb=genbank -type=mrna -verbose=1 -initial galGal1
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad galGal1
# Clean up:
rm -r /cluster/bluearc/genbank/work/initial.galGal1
ssh eieio
# -initial for ESTs:
nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \
-srcDb=genbank -type=est -verbose=1 -initial galGal1
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 galGal1
# Clean up:
rm -r /cluster/bluearc/genbank/work/initial.galGal1
# PRODUCING GENSCAN PREDICTIONS (DONE 2/3/04 angie)
ssh hgwdev
mkdir /cluster/data/galGal1/bed/genscan
cd /cluster/data/galGal1/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh kkr1u00
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
rm -f genome.list
touch genome.list
foreach f ( `ls -1S /cluster/data/galGal1/*/chr*_*/chr*_?{,?}.fa.masked` )
egrep '[ACGT]' $f > /dev/null
if ($status == 0) echo $f >> genome.list
end
wc -l genome.list
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para create jobList
para try, check, push, check, ...
#Completed: 259 of 261 jobs
#Crashed: 2 jobs
#Average job time: 2920s 48.67m 0.81h 0.03d
#Longest job: 141367s 2356.12m 39.27h 1.64d
#Submission to last job: 144447s 2407.45m 40.12h 1.67d
# If there are crashes, diagnose with "para problems".
# If a job crashes due to genscan running out of memory, re-run it
# manually with "-window=1200000" instead of "-window=2400000".
/cluster/bin/i386/gsBig /cluster/data/galGal1/5/chr5_5/chr5_5.fa.masked gtf/chr5_5.fa.gtf -trans=pep/chr5_5.fa.pep -subopt=subopt/chr5_5.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
/cluster/bin/i386/gsBig /cluster/data/galGal1/20/chr20_1/chr20_1.fa.masked gtf/chr20_1.fa.gtf -trans=pep/chr20_1.fa.pep -subopt=subopt/chr20_1.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
# Convert these to chromosome level files as so:
ssh kksilo
cd /cluster/data/galGal1/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/galGal1/bed/genscan
ldHgGene galGal1 genscan genscan.gtf
hgPepPred galGal1 generic genscanPep genscan.pep
hgLoadBed galGal1 genscanSubopt genscanSubopt.bed
# PRODUCING FUGU BLAT ALIGNMENTS (DONE 2/6/04)
ssh kk
mkdir /cluster/data/galGal1/bed/blatFr1
cd /cluster/data/galGal1/bed/blatFr1
ls -1S /cluster/bluearc/fugu/fr1/trfFa/*.fa > fugu.lst
ls -1S /cluster/bluearc/galGal1/trfFa/*.fa > chicken.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
mkdir psl
gensub2 chicken.lst fugu.lst gsub spec
para create spec
para try, check, push, check, ...
# cluster & bluearc were very busy => slow:
#Completed: 1827 of 1827 jobs
#Average job time: 1685s 28.08m 0.47h 0.02d
#Longest job: 4121s 68.68m 1.14h 0.05d
#Submission to last job: 56039s 933.98m 15.57h 0.65d
# Sort alignments:
ssh kksilo
cd /cluster/data/galGal1/bed/blatFr1
pslCat -dir psl \
| liftUp -type=.psl stdout /cluster/data/galGal1/jkStuff/liftAll.lft \
warn stdin \
| pslSortAcc nohead chrom /cluster/store2/temp stdin
# load into database:
ssh hgwdev
cd /cluster/data/galGal1/bed/blatFr1/chrom
cat *.psl | hgLoadPsl -fastLoad -table=blatFr1 galGal1 stdin
mkdir /gbdb/galGal1/fuguSeq
ln -s /cluster/data/fr1/fugu_v3.masked.fa /gbdb/galGal1/fuguSeq/
cd /cluster/data/galGal1/bed/blatFr1
hgLoadSeq galGal1 /gbdb/galGal1/fuguSeq/fugu_v3.masked.fa
# LOAD CPGISSLANDS (DONE 2/1/04 angie)
ssh hgwdev
mkdir -p /cluster/data/galGal1/bed/cpgIsland
cd /cluster/data/galGal1/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
mv cpglh.exe /cluster/data/galGal1/bed/cpgIsland/
ssh kksilo
cd /cluster/data/galGal1/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
set fout=$f:t:r:r.cpg
echo running cpglh on $f to $fout
./cpglh.exe $f > $fout.cpg
end
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
/* chr1\t1325\t3865\t754\tCpG: 183\t64.9\t0.7 */
/* Transforms to: (tab separated columns above, spaces below) */
/* chr1 1325 3865 CpG: 183 754 183 489 64.9 0.7 */
{
width = $3-$2;
printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\n",
$1,$2,$3,$5,$6,width,$6,width*$7*0.01,100.0*2*$6/($3-$2),$7);}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
# load into database:
ssh hgwdev
cd /cluster/data/galGal1/bed/cpgIsland
hgLoadBed galGal1 cpgIsland -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed
# LOAD SOFTBERRY GENES (TODO)
cd /cluster/data/galGal1/bed
mkdir softberry
cd softberry
wget \
ftp://www.softberry.com/pub/SC_CHICKEN_JUN03/Softb_fgenesh_chicken_jun03.tar.gz
tar xvzf Softb_fgenesh_chicken_jun03.tar.gz
ldHgGene galGal1 softberryGene chr*.gff
hgPepPred galGal1 softberry *.protein
hgSoftberryHom galGal1 *.protein
# BUILD BLAST DATABASES
cd /cluster/data/galGal1
mkdir blastDb
for i in `cat chrom.lst`; do for j in `echo $i/chr$i_*/chr*_*_*.fa`; do ln -s `pwd`/$j blastDb/`basename $j .fa`;
done; done
cd blastDb
for i in *; do formatdb -p F -i $i; done
# END BLAST
# TIGR GENE INDEX (TODO)
mkdir -p /cluster/data/galGal1/bed/tigr
cd /cluster/data/galGal1/bed/tigr
wget ftp://ftp.tigr.org//pub/data/tgi/TGI_track_ChickenGenome_Jun2003.tgz
tar xvfz TGI_track_ChickenGenome_Jun2003.tgz
foreach f (*cattle*)
set f1 = `echo $f | sed -e 's/cattle/cow/g'`
mv $f $f1
end
foreach o (rat cow human pig chicken)
setenv O $o
foreach f (chr*_$o*s)
tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
end
end
ldHgGene -exon=TC galGal1 tigrGeneIndex *.gff
# 139456 gene predictions
gzip *TCs *.gff
# GC 5 BASE WIGGLE TRACK (DONE - 2004-02-04 - Hiram)
# working on kksilo, the fileserver for this data.
ssh kksilo
mkdir /cluster/data/galGal1/bed/gc5Base
cd /cluster/data/galGal1/bed/gc5Base
mkdir wigData5 dataLimits5
for n in ../../nib/*.nib
do
c=`basename ${n} | sed -e "s/.nib//"`
C=`echo $c | sed -e "s/chr//"`
echo -n "working on ${c} - ${C} ... "
hgGcPercent -chr=${c} -doGaps \
-file=stdout -win=5 galGal1 ../../nib | grep -w GC | \
awk '
{
bases = $3 - $2
perCent = $5/10.0
printf "%d\t%.1f\n", $2+1, perCent
}' | wigAsciiToBinary \
-dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \
-name=${C} stdin > dataLimits5/${c}
echo "done ${c}"
done
# data is complete, load track on hgwdev
ssh hgwdev
cd /cluster/data/galGal1/bed/gc5Base
hgLoadWiggle galGal1 gc5Base wigData5/*.wig
# create symlinks for .wib files
mkdir /gbdb/galGal1/wib
ln -s `pwd`/wigData5/*.wib /gbdb/galGal1/wib
# the trackDb entry
track gc5Base
shortLabel GC Counts
longLabel GC Percent in 5 base windows
group map
priority 23.5
visibility hide
-autoScaleDefault Off
+autoScale Off
maxHeightPixels 128:36:16
graphTypeDefault Bar
gridDefault OFF
windowingFunction Mean
color 0,128,255
altColor 255,128,0
viewLimits 30:70
type wig 0 100
# SUPERCONTIG LOCATIONS TRACK (DONE 2/5/04 angie)
# Supercontig N is a collection of contained "ContigN.*".
# Extract Supercontig starts and ends from AGP into bed.
mkdir /cluster/data/galGal1/bed/supercontig
cd /cluster/data/galGal1/bed/supercontig
cat ../../chicken_agp/chr*.agp \
| perl -we 'while (<>) { \
chop; @words = split("\t"); \
next if ($words[4] eq "N"); \
if ($words[5] =~ /^(Contig\d+)\.(\d+)$/) { \
$sup = $1; $newNum = $2; \
$firstNum = $newNum if (! defined $firstNum); \
if (defined $lastSup && $lastSup ne $sup) { \
print "$chr\t$start\t$end\t$lastSup (.$firstNum-.$lastNum)\t1000\t$strand\n"; \
undef $start; undef $end; $firstNum = $newNum; \
} \
$chr = $words[0]; $cStart = $words[1]; $cEnd = $words[2]; \
$strand = $words[8]; \
$start = $cStart if (\!defined $start || $start > $cStart); \
$end = $cEnd if (\!defined $end || $end < $cEnd); \
$lastSup = $sup; $lastNum = $newNum; \
} else { die "bad 6th word $words[5]\n"; } \
} \
print "$chr\t$start\t$end\t$lastSup (.$firstNum-.$lastNum)\t1000\t$strand\n";' \
> supercontig.bed
ssh hgwdev
cd /cluster/data/galGal1/bed/supercontig
hgLoadBed -tab galGal1 supercontig supercontig.bed
# PLOT MARKER GENETIC VS. GENOMIC POS (DONE 2/5/04 angie)
ssh kksilo
mkdir /cluster/data/galGal1/bed/markerPlots
cd /cluster/data/galGal1/bed/markerPlots
# Need to find out where Terry got this file:
cp -p /cluster/bluearc/gg1/markers/genetic.map .
mkdir gnuplot
# This is a one-shot and I'm writing a perl script -- too long for inliner.
chmod a+x ./plotMarkers.pl
./plotMarkers.pl genetic.map ../markers/stsMarkersTmp.bed
mkdir gif
ssh hgwdev
cd /cluster/data/galGal1/bed/markerPlots
foreach f (gnuplot/*.gnuplot)
gnuplot < $f > tmp.ppm
ppmtogif tmp.ppm > gif/$f:t:r.gif
rm tmp.ppm
end
mkdir /usr/local/apache/htdocs/angie/chickenPlots
cp -p /cluster/data/galGal1/bed/markerPlots/gif/* \
/usr/local/apache/htdocs/angie/chickenPlots
cp -p /cluster/data/galGal1/bed/markerPlots/*.txt \
/usr/local/apache/htdocs/angie/chickenPlots
cp -p /cluster/bluearc/gg1/markers/genetic.map \
/usr/local/apache/htdocs/angie/chickenPlots/genetic_map.txt
cp -p /cluster/data/galGal1/bed/markers/stsMarkersTmp.bed \
/usr/local/apache/htdocs/angie/chickenPlots/stsMarkerPlacements.txt
# CREATING QUALITY SCORE TRACK (DONE 2/6/04 angie)
ssh kksilo
mkdir /cluster/data/galGal1/bed/qual
cd /cluster/data/galGal1/bed/qual
cat ../../chicken_agp/chr*.agp > assembly.agp
tar xvzf ../../chicken_040105.contigs.qual.tar.gz
cat chicken*.qual | qaToQac stdin stdout \
| ~/bin/i386/chimpChromQuals assembly.agp stdin chrom.qac
rm chicken*.qual
mkdir wigData dataLimits
foreach c (`cat ../../chrom.lst`)
foreach agp (../../$c/chr$c.agp ../../$c/chr${c}_random.agp)
if (-e $agp) then
set chr = $agp:t:r
set abbrev = `echo $chr | sed -e 's/^chr//; s/_random/r/;'`
echo $chr to $abbrev wiggle
qacToWig chrom.qac -name=$chr stdout \
| wigAsciiToBinary -dataSpan=1 -chrom=$chr \
-wibFile=wigData/qual_$abbrev -name=$abbrev stdin \
> dataLimits/$chr
endif
end
end
# Verify size of .wib file = chrom length
foreach f (wigData/*.wib)
set abbrev = $f:t:r
set chr = `echo $abbrev | sed -e 's/^qual_/chr/; s/r$/_random/;'`
set wibLen = `ls -l $f | awk '{print $5;}'`
set chromLen = `grep -w $chr ../../chrom.sizes | awk '{print $2;}'`
if ($wibLen != $chromLen) then
echo "ERROR: $chr size is $chromLen but wib size is $wibLen"
else
echo $chr OK.
endif
end
#ERROR: chr26 size is 4262107 but wib size is 3762107
# That's OK -- chr26 ends in a 500000bp centromere gap -- not incl'd in wib
# /gbdb & load:
ssh hgwdev
cd /cluster/data/galGal1/bed/qual
mkdir -p /gbdb/galGal1/wib
ln -s `pwd`/wigData/*.wib /gbdb/galGal1/wib
hgLoadWiggle galGal1 quality wigData/*.wig
# To speed up display for whole chrom views, need to make zoomed
# data views. Zoom to 1K points per pixel - Hiram 2004-02-17
ssh kksilo
cd /cluster/data/galGal1/bed/qual/wigData1K
mkdir -p wigData1K
mkdir -p dataLimits1K
for c in `cat ../../chrom.lst`
do
if [ -f ../../${c}/chr${c}.agp ]; then
echo "chr${c} quality 1K zoom"
qacToWig chrom.qac -name=chr${c} stdout |
wigZoom stdin | wigAsciiToBinary -dataSpan=1024 \
-chrom=chr${c} -wibFile=wigData1K/qc1K_${c} \
-name=${c} stdin > dataLimits1K/chr${c}
fi
if [ -f ../../${c}/chr${c}_random.agp ]; then
echo "chr${c}_random quality 1K zoom"
qacToWig chrom.qac -name=chr${c}_random stdout |
wigZoom stdin | wigAsciiToBinary -dataSpan=1024 \
-chrom=chr${c}_random -wibFile=wigData1K/qc1K_${c}r \
-name=${c}_random stdin > dataLimits1K/chr${c}r
fi
done
ssh hgwdev
cd /cluster/data/galGal1/bed/qual/wigData1K
# load in addition to existing data
hgLoadWiggle -oldTable galGal1 quality *.wig
# create symlinks for .wib files
ln -s `pwd`/*.wib /gbdb/galGal1/wib
# PLOT MRNA-MARKER GENETIC VS. GENOMIC POS (DONE 2/10/04 angie)
ssh hgwdev
mkdir /cluster/data/galGal1/bed/mrnaPlots
cd /cluster/data/galGal1/bed/mrnaPlots
# Need to find out where Terry got this file:
cp -p /cluster/bluearc/gg1/markers/stsAlias.bed .
echo 'select all_mrna.tName, all_mrna.tStart, all_mrna.tEnd, \
all_mrna.qName, 0, all_mrna.strand, geneName.name \
from all_mrna, mrna, geneName \
where all_mrna.qName = mrna.acc and mrna.geneName != 0 and \
mrna.geneName = geneName.id' \
| hgsql galGal1 -N > mrnaWithNames.bed
mkdir gnuplot
# This is a one-shot and I'm writing a perl script -- too long for inliner.
chmod a+x ./plotMrnaMarkers.pl
./plotMrnaMarkers.pl ../markerPlots/genetic.map stsAlias.bed \
mrnaWithNames.bed
# Too few results to plot -- just emailed summary to LaDeana et al.
# COMPARE GENBANK MRNA ANNOTATED POSITION TO ALIGNED (DONE 2/11/04 angie)
ssh hgwdev
cd /cluster/data/galGal1/bed/mrnaPlots
echo 'select tName, tStart, tEnd, qName, 0, strand from all_mrna' \
| hgsql galGal1 -N > mrnaPos.bed
# write another 1-shot perl script
chmod a+x getMrnaGBPos.pl
# eieio is the fileserver for genbank stuff...
ssh eieio
cd /cluster/data/galGal1/bed/mrnaPlots
gunzip -c /cluster/store5/genbank/data/download/genbank.139.0/gbvrt*.gz \
| ./getMrnaGBPos.pl mrnaPos.bed
# emailed report files mrnaGB*.txt