src/hg/makeDb/doc/ce8.txt 1.3
1.3 2009/09/24 20:54:24 hiram
Tried to get the sanger genes up, not complete
Index: src/hg/makeDb/doc/ce8.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/ce8.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/makeDb/doc/ce8.txt 28 Jul 2009 21:55:26 -0000 1.2
+++ src/hg/makeDb/doc/ce8.txt 24 Sep 2009 20:54:24 -0000 1.3
@@ -1,524 +1,591 @@
# for emacs: -*- mode: sh; -*-
# Caenorhabditis elegans
# Washington University School of Medicine GSC and Sanger Institute WS204
# $Id$
#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2009-07-22 - Hiram)
mkdir /hive/data/genomes/ce8
cd /hive/data/genomes/ce8
mkdir ws204
cd ws204
TOP=/hive/data/genomes/ce8/ws204
export TOP
for D in annotation genome_feature_tables/GFF2 \
genome_feature_tables/SUPPLEMENTARY_GFF sequences/dna \
sequences/protein sequences/rna
do
mkdir -p ${D}
cd ${D}
wget --timestamping \
ftp://ftp.sanger.ac.uk/pub2/wormbase/WS204/genomes/c_elegans/${D}/*.*
cd ${TOP}
done
# about 1h24m
#########################################################################
# NORMALIZE SEQUENCE NAMES TO BEGIN WITH chr (DONE - 2009-07-24 - Hiram)
mkdir /hive/data/genomes/ce8/sanger
cd /hive/data/genomes/ce8/sanger
# Fix fasta names:
zcat ../ws204/sequences/dna/CHR*[XVAI].dna.gz \
| sed -e '/^$/ d; s/^>CHROMOSOME_MtDNA/>chrM/; s/^>CHROMOSOME_/>chr/;' \
| gzip -c > UCSC.fa.gz
faSize -detailed UCSC.fa.gz
# chrI 15072421
# chrII 15279323
# chrIII 13783685
# chrIV 17493784
# chrM 13794
# chrV 20924143
# chrX 17718854
# chrII(+1) and chrII(+3) are slightly different than WS200
# Make sure we get the same sizes from this command:
zcat ../ws204/sequences/dna/CHR*[XVAI].dna.gz | sed -e '/^$/ d;' \
| faSize -detailed stdin
faCount UCSC.fa.gz
#seq len A C G T N cpg
# chrI 15072421 4835939 2695879 2692150 4848453 0 503521
# chrII 15279323 4878195 2769216 2762198 4869714 0 492149
# chrIII 13783685 4444652 2449141 2466322 4423570 0 459669
# chrIV 17493784 5711040 3034767 3017008 5730969 0 522372
# chrM 13794 4335 1225 2055 6179 0 110
# chrV 20924143 6750393 3712058 3701397 6760295 0 638983
# chrX 17718854 5747199 3119702 3117868 5734085 0 514715
# total 100286004 32371753 17781988 17758998 323732650 3131519
# WS200:
# chrII 15279324 4878196 2769216 2762198 4869714 0 492149
# chrIII 13783682 4444652 2449139 2466321 4423570 0 459669
# total 100286002 32371754 17781986 17758997 323732650 3131519
# Fix AGP names:
sed -e 's/^/chr/' ../ws204/sequences/dna/CHR*.agp > UCSC.agp
# And add a fake mitochondrial AGP entry for the sake of downstream
# tools (make sure the GenBank sequence is identical to given):
echo -e "chrM\t1\t13794\t1\tF\tNC_001328.1\t1\t13794\t+" >> UCSC.agp
#########################################################################
# run the makeGenomeDb procedure to create the db and unmasked sequence
# (DONE - 2009-07-22 - Hiram)
cd /hive/data/genomes/ce8
cat << '_EOF_' > ce8.config.ra
# Config parameters for makeGenomeDb.pl:
db ce8
clade worm
genomeCladePriority 10
scientificName Caenorhabditis elegans
commonName C. elegans
assemblyDate Jun 2009
assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS204
orderKey 824
mitoAcc none
fastaFiles /hive/data/genomes/ce8/sanger/UCSC.fa.gz
agpFiles /hive/data/genomes/ce8/sanger/UCSC.agp
# qualFiles /dev/null
dbDbSpeciesDir worm
taxId 6239
'_EOF_'
# << emacs
mkdir jkStuff
# run just to AGP to make sure things are sane first
nice -n +19 makeGenomeDb.pl ce8.config.ra -stop=agp \
> jkStuff/makeGenomeDb.agp.log 2>&1
# now, continuing to make the Db and all
time nice -n +19 makeGenomeDb.pl ce8.config.ra -continue=db \
> jkStuff/makeGenomeDb.db.log 2>&1
# real 1m33.036s
# take the trackDb business there and check it into the source tree
# fixup the description, gap and gold html page descriptions
#########################################################################
# REPEATMASKER (DONE - 2009-07-24 - Hiram)
screen # use screen to control the job
mkdir /hive/data/genomes/ce8/bed/repeatMasker
cd /hive/data/genomes/ce8/bed/repeatMasker
time nice -n +19 doRepeatMasker.pl -bigClusterHub=swarm \
-buildDir=`pwd` ce8 > do.log 2>&1 &
# real 35m58.812s
# from the do.log:
# June 4 2009 (open-3-2-8) version of RepeatMasker
# CC RELEASE 20090604;
cat faSize.rmsk.txt
# 100286004 bases (0 N's 100286004 real 87035623 upper 13250381 lower)
# in 7 sequences in 1 files
# %13.21 masked total, %13.21 masked real
#########################################################################
# SIMPLE REPEATS (DONE - 2009-07-24 - Hiram)
screen # use screen to control the job
mkdir /hive/data/genomes/ce8/bed/simpleRepeat
cd /hive/data/genomes/ce8/bed/simpleRepeat
time nice -n +19 doSimpleRepeat.pl -smallClusterHub=encodek \
-buildDir=`pwd` ce8 > do.log 2>&1 &
# real 18m30.323s
cat fb.simpleRepeat
# 4331076 bases of 100286004 (4.319%) in intersection
#########################################################################
# MASK SEQUENCE WITH RM+TRF (DONE - 2009-07-24 - Hiram)
# Since both doRepeatMasker.pl and doSimpleRepeats.pl have completed,
# now it's time to combine the masking into the final ce8.2bit,
# following the instructions at the end of doSimpleRepeat's output.
cd /hive/data/genomes/ce8
twoBitMask ce8.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ce8.2bit
# You can safely ignore the warning about extra BED columns
twoBitToFa ce8.2bit stdout | faSize stdin > faSize.ce8.2bit.txt
cat faSize.ce8.2bit.txt
# 100286004 bases (0 N's 100286004 real 86863769 upper 13422235 lower)
# in 7 sequences in 1 files
# %13.38 masked total, %13.38 masked real
# set the symlink on hgwdev to /gbdb/ce8
rm -f /gbdb/ce8/ce8.2bit
ln -s `pwd`/ce8.2bit /gbdb/ce8/ce8.2bit
#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2009-07-22 - Hiram)
# Use -repMatch=100 (based on size -- for human we use 1024, and
# worm size is ~3.4% of human judging by gapless ce4 vs. hg18 genome
# size from featureBits. So we would use 34, but that yields a very
# high number of tiles to ignore, especially for a small more compact
# genome. Bump that up a bit to be more conservative.
cd /hive/data/genomes/ce8
blat ce8.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=jkStuff/ce8.11.ooc -repMatch=100
# Wrote 8514 overused 11-mers to jkStuff/ce8.11.ooc
# copy all of this stuff to the klusters:
mkdir /hive/data/staging/data/ce8
cp -p jkStuff/ce8.11.ooc chrom.sizes ce8.2bit /hive/data/staging/data/ce8
# request push of that data to kluster nodes /scratch/data/ce8/
#########################################################################
## LASTZ caePb2 (DONE - 2009-07-28 - Hiram)
screen # use screen to control the job
mkdir /hive/data/genomes/ce8/bed/lastzCaePb2.2009-07-28
cd /hive/data/genomes/ce8/bed/lastzCaePb2.2009-07-28
cat << '_EOF_' > DEF
# ce8 vs caePb2
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce8
SEQ1_DIR=/scratch/data/ce8/ce8.2bit
SEQ1_LEN=/scratch/data/ce8/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. PB2801 caePb2
SEQ2_DIR=/scratch/data/caePb2/caePb2.2bit
SEQ2_LEN=/scratch/data/caePb2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/caePb2/caePb2.supercontigs.2bit
SEQ2_CTGLEN=/scratch/data/caePb2/caePb2.supercontigs.sizes
SEQ2_LIFT=/scratch/data/caePb2/caePb2.supercontigs.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/hive/data/genomes/ce8/bed/lastzCaePb2.2009-07-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=swarm -workhorse=hgwdev \
-qRepeats=windowmaskerSdust -noLoadChainSplit -smallClusterHub=memk \
> do.log 2>&1 &
# about 50 minutes
cat fb.ce8.chainCaePb2Link.txt
# 40793017 bases of 100286004 (40.677%) in intersection
# swap, this is also in caePb2.txt
mkdir /hive/data/genomes/caePb2/bed/blastz.ce8.swap
cd /hive/data/genomes/caePb2/bed/blastz.ce8.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-workhorse=hgwdev -qRepeats=windowmaskerSdust \
/hive/data/genomes/ce8/bed/lastzCaePb2.2009-07-28/DEF \
-bigClusterHub=swarm -smallClusterHub=encodek -swap > swap.log 2>&1 &
# real 3m16.709s
cat fb.caePb2.chainCe8Link.txt
# 55084580 bases of 170473138 (32.313%) in intersection
#########################################################################
## BLASTZ caeJap2 (DONE - 2009-07-28 - Hiram)
screen # use screen to control the job
mkdir /hive/data/genomes/ce8/bed/lastzCaeJap2.2009-07-28
cd /hive/data/genomes/ce8/bed/lastzCaeJap2.2009-07-28
cat << '_EOF_' > DEF
# ce8 vs caeJap2
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce8
SEQ1_DIR=/scratch/data/ce8/ce8.2bit
SEQ1_LEN=/scratch/data/ce8/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. japonica caeJap2
SEQ2_DIR=/scratch/data/caeJap2/caeJap2.2bit
SEQ2_LEN=/scratch/data/caeJap2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/caeJap2/caeJap2.supers.2bit
SEQ2_CTGLEN=/scratch/data/caeJap2/caeJap2.supers.sizes
SEQ2_LIFT=/scratch/data/caeJap2/caeJap2.chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/hive/data/genomes/ce8/bed/lastzCaeJap2.2009-07-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm -noLoadChainSplit -qRepeats=windowmaskerSdust \
-workhorse=hgwdev -smallClusterHub=memk > do.log 2>&1 &
# about 42 minutes
cat fb.ce8.chainCaeJap2Link.txt
# 27270052 bases of 100286004 (27.192%) in intersection
# swap, this is also in caeJap2.txt
mkdir /hive/data/genomes/caeJap2/bed/blastz.ce8.swap
cd /hive/data/genomes/caeJap2/bed/blastz.ce8.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust -bigClusterHub=swarm -noLoadChainSplit \
/hive/data/genomes/ce8/bed/lastzCaeJap2.2009-07-28/DEF \
-smallClusterHub=encodek -swap > swap.log 2>&1 &
# real 3m44.671s
cat fb.caeJap2.chainCe8Link.txt
# 26440993 bases of 129295754 (20.450%) in intersection
############################################################################
## BLASTZ cb3 (DONE - 2009-07-28 - Hiram)
screen # use screen to control the job
mkdir /hive/data/genomes/ce8/bed/lastzCb3.2009-07-28
cd /hive/data/genomes/ce8/bed/lastzCb3.2009-07-28
cat << '_EOF_' > DEF
# ce8 vs cb3
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce8
SEQ1_DIR=/scratch/data/ce8/ce8.2bit
SEQ1_LEN=/scratch/data/ce8/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. briggsae cb3
SEQ2_DIR=/hive/data/genomes/cb3/cb3.rmskTrf.2bit
SEQ2_LEN=/hive/data/genomes/cb3/chrom.sizes
SEQ2_CHUNK=1000000
SEQ2_LAP=0
BASE=/hive/data/genomes/ce8/bed/lastzCb3.2009-07-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-workhorse=hgwdev -bigClusterHub=swarm -noLoadChainSplit \
-smallClusterHub=memk > do.log 2>&1 &
# about 40 minutes
cat fb.ce8.chainCb3Link.txt
# 42421395 bases of 100286004 (42.300%) in intersection
# swap, this is also in cb3.txt
mkdir /hive/data/genomes/cb3/bed/blastz.ce8.swap
cd /hive/data/genomes/cb3/bed/blastz.ce8.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/ce8/bed/lastzCb3.2009-07-28/DEF \
-workhorse=hgwdev -bigClusterHub=swarm -noLoadChainSplit \
-smallClusterHub=encodek -swap > swap.log 2>&1 &
# real 3m46.700s
cat fb.cb3.chainCe8Link.txt
# 43115929 bases of 108433446 (39.763%) in intersection
############################################################################
## BLASTZ caeRem3 (DONE - 2009-07-28,09 - Hiram)
screen # use screen to control the job
mkdir /hive/data/genomes/ce8/bed/lastzCaeRem3.2009-07-28
cd /hive/data/genomes/ce8/bed/lastzCaeRem3.2009-07-28
cat << '_EOF_' > DEF
# ce8 vs caeRem3
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce8
SEQ1_DIR=/scratch/data/ce8/ce8.2bit
SEQ1_LEN=/scratch/data/ce8/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. remanei caeRem3
SEQ2_DIR=/scratch/data/caeRem3/caeRem3.2bit
SEQ2_LEN=/scratch/data/caeRem3/chrom.sizes
SEQ2_CTGDIR=/scratch/data/caeRem3/caeRem3.supercontigs.2bit
SEQ2_CTGLEN=/scratch/data/caeRem3/caeRem3.supercontigs.sizes
SEQ2_LIFT=/scratch/data/caeRem3/caeRem3.chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/hive/data/genomes/ce8/bed/lastzCaeRem3.2009-07-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-workhorse=hgwdev -bigClusterHub=swarm -noLoadChainSplit \
-qRepeats=windowmaskerSdust -smallClusterHub=memk > do.log 2>&1 &
XXX - running Tue Jul 28 11:14:53 PDT 2009
# real 28m14.168s
cat fb.ce8.chainCaeRem3Link.txt
# 41841184 bases of 100286004 (41.722%) in intersection
# swap, this is also in caeRem3.txt
mkdir /hive/data/genomes/caeRem3/bed/blastz.ce8.swap
cd /hive/data/genomes/caeRem3/bed/blastz.ce8.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust \
-workhorse=hgwdev -noLoadChainSplit \
/hive/data/genomes/ce8/bed/lastzCaeRem3.2009-07-28/DEF \
-bigClusterHub=swarm -smallClusterHub=encodek -swap > swap.log 2>&1 &
# real 3m10.033s
cat fb.caeRem3.chainCe8Link.txt
# 46320775 bases of 138406388 (33.467%) in intersection
############################################################################
## 5-Way multiple alignment (DONE - 2009-07-28 - Hiram)
- mkdir /cluster/data/ce8/bed/multiz5way
- cd /cluster/data/ce8/bed/multiz5way
+ mkdir /hive/data/genomes/ce8/bed/multiz5way
+ cd /hive/data/genomes/ce8/bed/multiz5way
# See notes in ce6.txt for 6-way alignment. This is the tree from
# there.
cat << '_EOF_' > 5way.nh
((C._elegans_ce8:0.003000,
(C._brenneri_caePb2:0.013000,
(C._remanei_caeRem3:0.003000,C._briggsae_cb3:0.005000):0.004000)
:0.002000):0.001000,
C._japonica_caeJap2:0.023000);
'_EOF_'
# << happy emacs
/cluster/bin/phast/x86_64/all_dists 5way.nh > 5way.distances.txt
grep -i ce8 5way.distances.txt | sort -k3,3n
# Use this output for reference, and use the calculated
# distances in the table below to order the organisms and check
# the button order on the browser.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
#
# featureBits chainLink measures
# chaince8Link chain linearGap
# distance on Ce8 on other minScore
# 1 0.0120 - remanei_caeRem3 (% 41.722) (% 33.467) 1000 loose
# 2 0.0140 - briggsae_cb3 (% 42.300) (% 39.763) 1000 loose
# 3 0.0180 - brenneri_caePb2 (% 40.677) (% 32.313) 1000 loose
# 3 0.0270 - japonica_caeJap2 (% 27.192) (% 20.450) 1000 loose
- cd /cluster/data/ce8/bed/multiz5way
+ cd /hive/data/genomes/ce8/bed/multiz5way
# bash shell syntax here ...
- export H=/cluster/data/ce8/bed
+ export H=/hive/data/genomes/ce8/bed
mkdir mafLinks
for G in caeRem3 cb3 caePb2 caeJap2
do
mkdir mafLinks/$G
if [ ! -d ${H}/lastz.${G}/mafNet ]; then
echo "missing directory lastz.${G}/mafNet"
fi
ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
done
# these are x86_64 binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn
cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn
cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn
# the autoMultiz cluster run
ssh memk
cd /hive/data/genomes/ce8/bed/multiz5way/
# create species list and stripped down tree for autoMZ
sed -e \
's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d; s/C._//g' \
5way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.list
mkdir maf run
cd run
# NOTE: set the db and pairs directories in this script
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = ce8
set c = $1
set result = $2
set run = `pwd`
set tmp = $run/tmp/$db/multiz.$c
set nway = /hive/data/genomes/ce8/bed/multiz5way
set pairs = $nway/mafLinks
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p $nway/tree.nh $nway/species.list $tmp
pushd $tmp
foreach s (`sed -e "s/ $db//" species.list`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
/bin/zcat $in.gz > $out
if (! -s $out) then
echo "##maf version=1 scoring=autoMZ" > $out
endif
else if (-e $in) then
ln -s $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($nway/penn $path); rehash
$nway/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
/bin/rm -f $result
/bin/cp -p $tmp/$c.maf $result
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db
/bin/rmdir --ignore-fail-on-non-empty $run/tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/ce8/bed/multiz5way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
- awk '{print $1}' /cluster/data/ce8/chrom.sizes > chrom.lst
+ awk '{print $1}' /hive/data/genomes/ce8/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
para -maxNode=1 push
para check ... push ... etc ...
# Completed: 7 of 7 jobs
# CPU time in finished jobs: 1517s 25.28m 0.42h 0.02d 0.000 y
# IO & Wait Time: 124s 2.07m 0.03h 0.00d 0.000 y
# Average job time: 234s 3.91m 0.07h 0.00d
# Longest finished job: 334s 5.57m 0.09h 0.00d
# Submission to last job: 348s 5.80m 0.10h 0.00d
cd /hive/data/genomes/ce8/bed/multiz5way
time nice -n +19 catDir maf > multiz5way.maf
# a few seconds to produce:
# -rw-rw-r-- 1 321149265 Jul 28 14:53 multiz5way.maf
# before getting to the annotation, load this up so we can get
# a first view of the track. This will be replaced with the annotated
# mafs
ssh hgwdev
cd /hive/data/genomes/ce8/bed/multiz5way
mkdir /gbdb/ce8/multiz5way
ln -s /hive/data/genomes/ce8/bed/multiz5way/multiz5way.maf \
/gbdb/ce8/multiz5way
# this load creates a large file, do that on local disk:
cd /scratch/tmp
time nice -n +19 hgLoadMaf ce8 multiz5way
# a few seconds to load:
# Loaded 327164 mafs in 1 files from /gbdb/ce8/multiz5way
time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
-maxSize=50000 ce8 multiz5waySummary /gbdb/ce8/multiz5way/multiz5way.maf
# a few seconds to load:
# Created 111894 summary blocks from 850001 components and 327164 mafs
# from /gbdb/ce8/multiz5way/multiz5way.maf
# remove the temporary .tab files:
rm multiz5*.tab
############################################################################
+# reset default position, same as ce4 on the ZC101 / unc-52 locus
+ ssh hgwdev
+ hgsql -e 'update dbDb set defaultPos="chrII:14646344-14667746"
+ where name="ce8";' hgcentraltest
+
+############################################################################
+## SANGER GENE TRACK (WORKING - 2009-07-29 - Hiram)
+## There is a tremendous amount of extraneous notations in the gff
+## files. Filter them down to a manageable set, change the chrom
+## names, eliminate duplicates, select only what will work in
+## ldHgGene
+ mkdir /hive/data/genomes/ce8/bed/sangerGene
+ cd /hive/data/genomes/ce8/bed/sangerGene
+for C in I II III IV V X
+do
+ echo -n "${C} "
+ cat ../../ws204/genome_feature_tables/GFF2/CHROMOSOME_${C}.gff | \
+ sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \
+ s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \
+ s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \
+ s/CHROMOSOME_M/chrM/g;" \
+ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \
+ -e 's/CDS "//' -e 's/"//' \
+ > chr${C}.gff
+done
+C=M
+echo -n "${C} "
+cat ../../ws204/genome_feature_tables/GFF2/CHROMOSOME_MtDNA.gff | \
+sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \
+ s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \
+ s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \
+ s/CHROMOSOME_M/chrM/g; s/chrMtDNA/chrM/g;" \
+ -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \
+ -e 's/CDS "//' -e 's/"//' \
+ > chr${C}.gff
+for C in I II III IV V X M
+do
+ echo "chr${C}.gff -> filtered.chr${C}.gff"
+ grep -v "^#" chr${C}.gff | awk -F'\t' '
+BEGIN { IGNORECASE=1 }
+{
+ if (match($2,"curated|Coding_transcript")) {
+ if (match($3,"intron|coding_exon|exon|cds|three_prime_UTR|five_prime_UTR")) {
+ gsub("coding_exon","CDS",$3)
+ gsub("Transcript ","",$9)
+ gsub(" .*","",$9)
+ gsub("three_prime_UTR","3utr",$3)
+ gsub("five_prime_UTR","5utr",$3)
+ for (i = 1; i < 9; ++i) {
+ printf "%s\t", $i
+ }
+ printf "%s\n", $9
+ }
+ }
+}
+' | sort -u | sort -k4n > filtered.chr${C}.gff
+done
+
+ ssh hgwdev
+ cd /hive/data/genomes/ce8/bed/sangerGene
+ nice -n +19 ldHgGene ce8 sangerGene filtered.*.gff
+ nice -n +19 ldHgGene -out=filteredGenePred.tab ce8 sangerGene filtered.*.gff
+ # Read 55287 transcripts in 1027094 lines in 7 files
+ # 55287 groups 7 seqs 3 sources 5 feature types
+ # 31064 gene predictions
+
+###############################################################################