src/hg/makeDb/doc/ce8.txt 1.3

1.3 2009/09/24 20:54:24 hiram
Tried to get the sanger genes up, not complete
Index: src/hg/makeDb/doc/ce8.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/ce8.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/makeDb/doc/ce8.txt	28 Jul 2009 21:55:26 -0000	1.2
+++ src/hg/makeDb/doc/ce8.txt	24 Sep 2009 20:54:24 -0000	1.3
@@ -1,524 +1,591 @@
 # for emacs: -*- mode: sh; -*-
 
 # Caenorhabditis elegans
 # Washington University School of Medicine GSC and Sanger Institute WS204
 
 #  $Id$
 
 #########################################################################
 # DOWNLOAD SEQUENCE (DONE - 2009-07-22 - Hiram)
     mkdir /hive/data/genomes/ce8
     cd /hive/data/genomes/ce8
     mkdir ws204
     cd ws204
 TOP=/hive/data/genomes/ce8/ws204
 export TOP
 for D in annotation genome_feature_tables/GFF2 \
         genome_feature_tables/SUPPLEMENTARY_GFF sequences/dna \
         sequences/protein sequences/rna
 do
     mkdir -p ${D}
     cd ${D}
     wget --timestamping \
 ftp://ftp.sanger.ac.uk/pub2/wormbase/WS204/genomes/c_elegans/${D}/*.*
     cd ${TOP}
 done
     #	about 1h24m
 
 #########################################################################
 # NORMALIZE SEQUENCE NAMES TO BEGIN WITH chr (DONE - 2009-07-24 - Hiram)
     mkdir /hive/data/genomes/ce8/sanger
     cd /hive/data/genomes/ce8/sanger
     # Fix fasta names:
     zcat ../ws204/sequences/dna/CHR*[XVAI].dna.gz \
     | sed -e '/^$/ d;  s/^>CHROMOSOME_MtDNA/>chrM/;  s/^>CHROMOSOME_/>chr/;' \
     | gzip -c > UCSC.fa.gz
     faSize -detailed UCSC.fa.gz
 # chrI    15072421
 # chrII   15279323
 # chrIII  13783685
 # chrIV   17493784
 # chrM    13794
 # chrV    20924143
 # chrX    17718854
 
     # chrII(+1) and chrII(+3) are slightly different than WS200
     # Make sure we get the same sizes from this command:
     zcat ../ws204/sequences/dna/CHR*[XVAI].dna.gz | sed -e '/^$/ d;' \
 	| faSize -detailed stdin
 
     faCount UCSC.fa.gz
 #seq    len     A       C       G       T       N       cpg
 # chrI    15072421        4835939 2695879 2692150 4848453 0       503521
 # chrII   15279323        4878195 2769216 2762198 4869714 0       492149
 # chrIII  13783685        4444652 2449141 2466322 4423570 0       459669
 # chrIV   17493784        5711040 3034767 3017008 5730969 0       522372
 # chrM    13794   4335    1225    2055    6179    0       110
 # chrV    20924143        6750393 3712058 3701397 6760295 0       638983
 # chrX    17718854        5747199 3119702 3117868 5734085 0       514715
 # total   100286004       32371753  17781988  17758998  323732650  3131519
 
 #  WS200:
 # chrII   15279324        4878196 2769216 2762198 4869714 0       492149
 # chrIII  13783682        4444652 2449139 2466321 4423570 0       459669
 # total   100286002       32371754  17781986  17758997  323732650  3131519
 
     # Fix AGP names:
     sed -e 's/^/chr/' ../ws204/sequences/dna/CHR*.agp > UCSC.agp
     # And add a fake mitochondrial AGP entry for the sake of downstream
     # tools (make sure the GenBank sequence is identical to given):
     echo -e "chrM\t1\t13794\t1\tF\tNC_001328.1\t1\t13794\t+" >> UCSC.agp
 
 #########################################################################
 # run the makeGenomeDb procedure to create the db and unmasked sequence
 #	(DONE - 2009-07-22 - Hiram)
     cd /hive/data/genomes/ce8
     cat << '_EOF_' > ce8.config.ra
 # Config parameters for makeGenomeDb.pl:
 db ce8
 clade worm
 genomeCladePriority 10
 scientificName Caenorhabditis elegans
 commonName C. elegans
 assemblyDate Jun 2009
 assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS204
 orderKey 824
 mitoAcc none
 fastaFiles /hive/data/genomes/ce8/sanger/UCSC.fa.gz
 agpFiles   /hive/data/genomes/ce8/sanger/UCSC.agp
 # qualFiles /dev/null
 dbDbSpeciesDir worm
 taxId 6239
 '_EOF_'
     # << emacs
 
     mkdir jkStuff
     #	run just to AGP to make sure things are sane first
     nice -n +19 makeGenomeDb.pl ce8.config.ra -stop=agp \
       > jkStuff/makeGenomeDb.agp.log 2>&1
     #	now, continuing to make the Db and all
     time nice -n +19 makeGenomeDb.pl ce8.config.ra -continue=db \
       > jkStuff/makeGenomeDb.db.log 2>&1
     #	real    1m33.036s
     #	take the trackDb business there and check it into the source tree
     #	fixup the description, gap and gold html page descriptions
 
 #########################################################################
 # REPEATMASKER (DONE - 2009-07-24 - Hiram)
     screen 	#	use screen to control the job
     mkdir /hive/data/genomes/ce8/bed/repeatMasker
     cd /hive/data/genomes/ce8/bed/repeatMasker
     time nice -n +19 doRepeatMasker.pl -bigClusterHub=swarm \
 	-buildDir=`pwd` ce8 > do.log 2>&1 &
     #	real    35m58.812s
     #	from the do.log:
     #   June 4 2009 (open-3-2-8) version of RepeatMasker
     #	CC   RELEASE 20090604;   
     cat faSize.rmsk.txt
     #	100286004 bases (0 N's 100286004 real 87035623 upper 13250381 lower)
     #	in 7 sequences in 1 files
     #	%13.21 masked total, %13.21 masked real
 
 #########################################################################
 # SIMPLE REPEATS (DONE - 2009-07-24 - Hiram)
     screen 	#	use screen to control the job
     mkdir /hive/data/genomes/ce8/bed/simpleRepeat
     cd /hive/data/genomes/ce8/bed/simpleRepeat
     time nice -n +19 doSimpleRepeat.pl -smallClusterHub=encodek \
 	-buildDir=`pwd` ce8 > do.log 2>&1 &
     #	real    18m30.323s
 
     cat fb.simpleRepeat 
     #	4331076 bases of 100286004 (4.319%) in intersection
 
 #########################################################################
 # MASK SEQUENCE WITH RM+TRF (DONE - 2009-07-24 - Hiram)
     # Since both doRepeatMasker.pl and doSimpleRepeats.pl have completed,
     # now it's time to combine the masking into the final ce8.2bit,
     # following the instructions at the end of doSimpleRepeat's output.
     cd /hive/data/genomes/ce8
     twoBitMask ce8.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ce8.2bit
     # You can safely ignore the warning about extra BED columns
     twoBitToFa ce8.2bit stdout | faSize stdin > faSize.ce8.2bit.txt
     cat faSize.ce8.2bit.txt
     #	100286004 bases (0 N's 100286004 real 86863769 upper 13422235 lower)
     #	in 7 sequences in 1 files
     #	%13.38 masked total, %13.38 masked real
     #	set the symlink on hgwdev to /gbdb/ce8
     rm -f /gbdb/ce8/ce8.2bit
     ln -s `pwd`/ce8.2bit /gbdb/ce8/ce8.2bit
 
 #########################################################################
 # MAKE 11.OOC FILE FOR BLAT (DONE - 2009-07-22 - Hiram)
     # Use -repMatch=100 (based on size -- for human we use 1024, and 
     # worm size is ~3.4% of human judging by gapless ce4 vs. hg18 genome 
     # size from featureBits. So we would use 34, but that yields a very
     # high number of tiles to ignore, especially for a small more compact 
     # genome.  Bump that up a bit to be more conservative.
     cd /hive/data/genomes/ce8
     blat ce8.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=jkStuff/ce8.11.ooc -repMatch=100
     #	Wrote 8514 overused 11-mers to jkStuff/ce8.11.ooc
     #	copy all of this stuff to the klusters:
     mkdir /hive/data/staging/data/ce8
     cp -p jkStuff/ce8.11.ooc chrom.sizes ce8.2bit /hive/data/staging/data/ce8
     #	request push of that data to kluster nodes /scratch/data/ce8/
 
 #########################################################################
 ## LASTZ caePb2 (DONE - 2009-07-28 - Hiram)
     screen 	#	use screen to control the job
     mkdir /hive/data/genomes/ce8/bed/lastzCaePb2.2009-07-28
     cd /hive/data/genomes/ce8/bed/lastzCaePb2.2009-07-28
 
     cat << '_EOF_' > DEF
 # ce8 vs caePb2
 BLASTZ_H=2000
 BLASTZ_M=50
 
 # TARGET: elegans Ce8
 SEQ1_DIR=/scratch/data/ce8/ce8.2bit
 SEQ1_LEN=/scratch/data/ce8/chrom.sizes
 SEQ1_CHUNK=1000000
 SEQ1_LAP=10000
 
 # QUERY: C. PB2801 caePb2
 SEQ2_DIR=/scratch/data/caePb2/caePb2.2bit
 SEQ2_LEN=/scratch/data/caePb2/chrom.sizes
 SEQ2_CTGDIR=/scratch/data/caePb2/caePb2.supercontigs.2bit
 SEQ2_CTGLEN=/scratch/data/caePb2/caePb2.supercontigs.sizes
 SEQ2_LIFT=/scratch/data/caePb2/caePb2.supercontigs.lift
 SEQ2_CHUNK=1000000
 SEQ2_LAP=0
 SEQ2_LIMIT=50
 
 BASE=/hive/data/genomes/ce8/bed/lastzCaePb2.2009-07-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF -verbose=2 -bigClusterHub=swarm -workhorse=hgwdev \
 	-qRepeats=windowmaskerSdust -noLoadChainSplit -smallClusterHub=memk \
 	> do.log 2>&1 &
     #	about 50 minutes
     cat fb.ce8.chainCaePb2Link.txt
     #	40793017 bases of 100286004 (40.677%) in intersection
 
     #	swap, this is also in caePb2.txt
     mkdir /hive/data/genomes/caePb2/bed/blastz.ce8.swap
     cd /hive/data/genomes/caePb2/bed/blastz.ce8.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-workhorse=hgwdev -qRepeats=windowmaskerSdust \
 	/hive/data/genomes/ce8/bed/lastzCaePb2.2009-07-28/DEF \
 	-bigClusterHub=swarm -smallClusterHub=encodek -swap > swap.log 2>&1 &
     #	real    3m16.709s
     cat fb.caePb2.chainCe8Link.txt
     #	55084580 bases of 170473138 (32.313%) in intersection
 
 #########################################################################
 ## BLASTZ caeJap2 (DONE - 2009-07-28 - Hiram)
     screen 	#	use screen to control the job
     mkdir /hive/data/genomes/ce8/bed/lastzCaeJap2.2009-07-28
     cd /hive/data/genomes/ce8/bed/lastzCaeJap2.2009-07-28
 
     cat << '_EOF_' > DEF
 # ce8 vs caeJap2
 BLASTZ_H=2000
 BLASTZ_M=50
 
 # TARGET: elegans Ce8
 SEQ1_DIR=/scratch/data/ce8/ce8.2bit
 SEQ1_LEN=/scratch/data/ce8/chrom.sizes
 SEQ1_CHUNK=1000000
 SEQ1_LAP=10000
 
 # QUERY: C. japonica caeJap2
 SEQ2_DIR=/scratch/data/caeJap2/caeJap2.2bit
 SEQ2_LEN=/scratch/data/caeJap2/chrom.sizes
 SEQ2_CTGDIR=/scratch/data/caeJap2/caeJap2.supers.2bit
 SEQ2_CTGLEN=/scratch/data/caeJap2/caeJap2.supers.sizes
 SEQ2_LIFT=/scratch/data/caeJap2/caeJap2.chrUn.lift
 SEQ2_CHUNK=1000000
 SEQ2_LAP=0
 SEQ2_LIMIT=50
 
 BASE=/hive/data/genomes/ce8/bed/lastzCaeJap2.2009-07-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-bigClusterHub=swarm -noLoadChainSplit -qRepeats=windowmaskerSdust \
 	-workhorse=hgwdev -smallClusterHub=memk > do.log 2>&1 &
     #	about 42 minutes
     cat fb.ce8.chainCaeJap2Link.txt 
     #	27270052 bases of 100286004 (27.192%) in intersection
 
     #	swap, this is also in caeJap2.txt
     mkdir /hive/data/genomes/caeJap2/bed/blastz.ce8.swap
     cd /hive/data/genomes/caeJap2/bed/blastz.ce8.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-qRepeats=windowmaskerSdust -bigClusterHub=swarm -noLoadChainSplit \
 	/hive/data/genomes/ce8/bed/lastzCaeJap2.2009-07-28/DEF \
 	-smallClusterHub=encodek -swap > swap.log 2>&1 &
     #	real    3m44.671s
     cat fb.caeJap2.chainCe8Link.txt 
     #	26440993 bases of 129295754 (20.450%) in intersection
 
 ############################################################################
 ## BLASTZ cb3 (DONE - 2009-07-28 - Hiram)
     screen 	#	use screen to control the job
     mkdir /hive/data/genomes/ce8/bed/lastzCb3.2009-07-28
     cd /hive/data/genomes/ce8/bed/lastzCb3.2009-07-28
 
     cat << '_EOF_' > DEF
 # ce8 vs cb3
 BLASTZ_H=2000
 BLASTZ_M=50
 
 # TARGET: elegans Ce8
 SEQ1_DIR=/scratch/data/ce8/ce8.2bit
 SEQ1_LEN=/scratch/data/ce8/chrom.sizes
 SEQ1_CHUNK=1000000
 SEQ1_LAP=10000
 
 # QUERY: C. briggsae cb3
 SEQ2_DIR=/hive/data/genomes/cb3/cb3.rmskTrf.2bit
 SEQ2_LEN=/hive/data/genomes/cb3/chrom.sizes
 SEQ2_CHUNK=1000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/ce8/bed/lastzCb3.2009-07-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-workhorse=hgwdev -bigClusterHub=swarm -noLoadChainSplit \
 	-smallClusterHub=memk > do.log 2>&1 &
     #	about 40 minutes
     cat fb.ce8.chainCb3Link.txt 
     #	42421395 bases of 100286004 (42.300%) in intersection
 
     #	swap, this is also in cb3.txt
     mkdir /hive/data/genomes/cb3/bed/blastz.ce8.swap
     cd /hive/data/genomes/cb3/bed/blastz.ce8.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/ce8/bed/lastzCb3.2009-07-28/DEF \
 	-workhorse=hgwdev -bigClusterHub=swarm -noLoadChainSplit \
 	-smallClusterHub=encodek -swap > swap.log 2>&1 &
     #	real   3m46.700s
 
     cat fb.cb3.chainCe8Link.txt 
     #	43115929 bases of 108433446 (39.763%) in intersection
 
 ############################################################################
 ## BLASTZ caeRem3 (DONE - 2009-07-28,09 - Hiram)
     screen 	#	use screen to control the job
     mkdir /hive/data/genomes/ce8/bed/lastzCaeRem3.2009-07-28
     cd /hive/data/genomes/ce8/bed/lastzCaeRem3.2009-07-28
 
     cat << '_EOF_' > DEF
 # ce8 vs caeRem3
 BLASTZ_H=2000
 BLASTZ_M=50
 
 # TARGET: elegans Ce8
 SEQ1_DIR=/scratch/data/ce8/ce8.2bit
 SEQ1_LEN=/scratch/data/ce8/chrom.sizes
 SEQ1_CHUNK=1000000
 SEQ1_LAP=10000
 
 # QUERY: C. remanei caeRem3
 SEQ2_DIR=/scratch/data/caeRem3/caeRem3.2bit
 SEQ2_LEN=/scratch/data/caeRem3/chrom.sizes
 SEQ2_CTGDIR=/scratch/data/caeRem3/caeRem3.supercontigs.2bit
 SEQ2_CTGLEN=/scratch/data/caeRem3/caeRem3.supercontigs.sizes
 SEQ2_LIFT=/scratch/data/caeRem3/caeRem3.chrUn.lift
 SEQ2_CHUNK=1000000
 SEQ2_LAP=0
 SEQ2_LIMIT=50
 
 BASE=/hive/data/genomes/ce8/bed/lastzCaeRem3.2009-07-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-workhorse=hgwdev -bigClusterHub=swarm -noLoadChainSplit \
 	-qRepeats=windowmaskerSdust -smallClusterHub=memk > do.log 2>&1 &
 XXX - running Tue Jul 28 11:14:53 PDT 2009
     #	real    28m14.168s
     cat fb.ce8.chainCaeRem3Link.txt 
     #	41841184 bases of 100286004 (41.722%) in intersection
 
     #	swap, this is also in caeRem3.txt
     mkdir /hive/data/genomes/caeRem3/bed/blastz.ce8.swap
     cd /hive/data/genomes/caeRem3/bed/blastz.ce8.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-qRepeats=windowmaskerSdust \
 	-workhorse=hgwdev -noLoadChainSplit \
 	/hive/data/genomes/ce8/bed/lastzCaeRem3.2009-07-28/DEF \
 	-bigClusterHub=swarm -smallClusterHub=encodek -swap > swap.log 2>&1 &
     #	real    3m10.033s
     cat fb.caeRem3.chainCe8Link.txt 
     #	46320775 bases of 138406388 (33.467%) in intersection
 
 ############################################################################
 ## 5-Way multiple alignment (DONE - 2009-07-28 - Hiram)
 
-    mkdir /cluster/data/ce8/bed/multiz5way
-    cd /cluster/data/ce8/bed/multiz5way
+    mkdir /hive/data/genomes/ce8/bed/multiz5way
+    cd /hive/data/genomes/ce8/bed/multiz5way
     #	See notes in ce6.txt for 6-way alignment.  This is the tree from
     #	there.
 
     cat << '_EOF_' > 5way.nh
 ((C._elegans_ce8:0.003000,
     (C._brenneri_caePb2:0.013000,
 	(C._remanei_caeRem3:0.003000,C._briggsae_cb3:0.005000):0.004000)
 		:0.002000):0.001000,
 	    C._japonica_caeJap2:0.023000);
 '_EOF_'
     # << happy emacs
 
     /cluster/bin/phast/x86_64/all_dists 5way.nh > 5way.distances.txt
     grep -i ce8 5way.distances.txt | sort -k3,3n
     #	Use this output for reference, and use the calculated
     #	distances in the table below to order the organisms and check
     #	the button order on the browser.
     #	And if you can fill in the table below entirely, you have
     #	succeeded in finishing all the alignments required.
     #
 #                         featureBits chainLink measures
 #                                           chaince8Link   chain   linearGap
 #    distance                       on Ce8      on other   minScore
 #  1 0.0120  - remanei_caeRem3     (% 41.722)  (% 33.467)  1000     loose
 #  2 0.0140  - briggsae_cb3        (% 42.300)  (% 39.763)  1000     loose
 #  3 0.0180  - brenneri_caePb2     (% 40.677)  (% 32.313)  1000     loose
 #  3 0.0270  - japonica_caeJap2    (% 27.192)  (% 20.450)  1000     loose
 
-    cd /cluster/data/ce8/bed/multiz5way
+    cd /hive/data/genomes/ce8/bed/multiz5way
     #	bash shell syntax here ...
-    export H=/cluster/data/ce8/bed
+    export H=/hive/data/genomes/ce8/bed
     mkdir mafLinks
     for G in caeRem3 cb3 caePb2 caeJap2
     do
 	mkdir mafLinks/$G
 	if [ ! -d ${H}/lastz.${G}/mafNet ]; then
 	    echo "missing directory lastz.${G}/mafNet"
 	fi
 	ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
     done
 
     #	these are x86_64 binaries
     mkdir penn
     cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn 
     cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn 
     cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn 
 
     # the autoMultiz cluster run
     ssh memk
     cd /hive/data/genomes/ce8/bed/multiz5way/
 
     # create species list and stripped down tree for autoMZ
     sed -e \
 's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d; s/C._//g' \
 	5way.nh > tmp.nh
     echo `cat tmp.nh` > tree-commas.nh
     echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
     sed 's/[()]//g; s/,/ /g' tree.nh > species.list
 
     mkdir maf run
     cd run
 
     #	NOTE: set the db and pairs directories in this script
     cat > autoMultiz.csh << '_EOF_'
 #!/bin/csh -ef
 set db = ce8
 set c = $1
 set result = $2
 set run = `pwd`
 set tmp = $run/tmp/$db/multiz.$c
 set nway = /hive/data/genomes/ce8/bed/multiz5way
 set pairs = $nway/mafLinks
 /bin/rm -fr $tmp
 /bin/mkdir -p $tmp
 /bin/cp -p $nway/tree.nh $nway/species.list $tmp
 pushd $tmp
 foreach s (`sed -e "s/ $db//" species.list`)
     set in = $pairs/$s/$c.maf
     set out = $db.$s.sing.maf
     if (-e $in.gz) then
         /bin/zcat $in.gz > $out
 	if (! -s $out) then
 	    echo "##maf version=1 scoring=autoMZ" > $out
 	endif
     else if (-e $in) then
         ln -s $in $out
     else
         echo "##maf version=1 scoring=autoMZ" > $out
     endif
 end
 set path = ($nway/penn $path); rehash
 $nway/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
 popd
 /bin/rm -f $result
 /bin/cp -p $tmp/$c.maf $result
 /bin/rm -fr $tmp
 /bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db
 /bin/rmdir --ignore-fail-on-non-empty $run/tmp
 '_EOF_'
 # << happy emacs
     chmod +x autoMultiz.csh
 
     cat  << '_EOF_' > template
 #LOOP
 ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/ce8/bed/multiz5way/maf/$(root1).maf}
 #ENDLOOP
 '_EOF_'
 # << happy emacs
 
-    awk '{print $1}' /cluster/data/ce8/chrom.sizes > chrom.lst
+    awk '{print $1}' /hive/data/genomes/ce8/chrom.sizes > chrom.lst
     gensub2 chrom.lst single template jobList
     para create jobList
     para -maxNode=1 push
     para check ... push ... etc ...
 # Completed: 7 of 7 jobs
 # CPU time in finished jobs:       1517s      25.28m     0.42h    0.02d  0.000 y
 # IO & Wait Time:                   124s       2.07m     0.03h    0.00d  0.000 y
 # Average job time:                 234s       3.91m     0.07h    0.00d
 # Longest finished job:             334s       5.57m     0.09h    0.00d
 # Submission to last job:           348s       5.80m     0.10h    0.00d
 
     cd /hive/data/genomes/ce8/bed/multiz5way
     time nice -n +19 catDir maf > multiz5way.maf
     # a few seconds to produce:
     #	-rw-rw-r-- 1 321149265 Jul 28 14:53 multiz5way.maf
 
     #	before getting to the annotation, load this up so we can get
     #	a first view of the track.  This will be replaced with the annotated
     #	mafs
     ssh hgwdev
     cd /hive/data/genomes/ce8/bed/multiz5way
     mkdir /gbdb/ce8/multiz5way
     ln -s /hive/data/genomes/ce8/bed/multiz5way/multiz5way.maf \
 	/gbdb/ce8/multiz5way
 
     #	this load creates a large file, do that on local disk:
     cd /scratch/tmp
     time nice -n +19 hgLoadMaf ce8 multiz5way
     #	a few seconds to load:
     #	Loaded 327164 mafs in 1 files from /gbdb/ce8/multiz5way
 
     time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
 	-maxSize=50000 ce8 multiz5waySummary /gbdb/ce8/multiz5way/multiz5way.maf
     #	a few seconds to load:
     #	Created 111894 summary blocks from 850001 components and 327164 mafs
     #	from /gbdb/ce8/multiz5way/multiz5way.maf
 
     #	remove the temporary .tab files:
     rm multiz5*.tab
 ############################################################################
+# reset default position, same as ce4 on the ZC101 / unc-52 locus
+    ssh hgwdev
+    hgsql -e 'update dbDb set defaultPos="chrII:14646344-14667746"
+	where name="ce8";' hgcentraltest
+
+############################################################################
+## SANGER GENE TRACK (WORKING - 2009-07-29 - Hiram)
+##	There is a tremendous amount of extraneous notations in the gff
+##	files.  Filter them down to a manageable set, change the chrom
+##	names, eliminate duplicates, select only what will work in
+##	ldHgGene
+    mkdir /hive/data/genomes/ce8/bed/sangerGene
+    cd /hive/data/genomes/ce8/bed/sangerGene
+for C in I II III IV V X
+do
+    echo -n "${C} "
+    cat ../../ws204/genome_feature_tables/GFF2/CHROMOSOME_${C}.gff | \
+    sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \
+        s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \
+        s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \
+        s/CHROMOSOME_M/chrM/g;" \
+        -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \
+        -e 's/CDS "//' -e 's/"//' \
+                > chr${C}.gff
+done
+C=M
+echo -n "${C} "
+cat ../../ws204/genome_feature_tables/GFF2/CHROMOSOME_MtDNA.gff | \
+sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \
+    s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \
+    s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \
+    s/CHROMOSOME_M/chrM/g; s/chrMtDNA/chrM/g;" \
+    -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \
+    -e 's/CDS "//' -e 's/"//' \
+            > chr${C}.gff
+for C in I II III IV V X M
+do
+    echo "chr${C}.gff -> filtered.chr${C}.gff"
+    grep -v "^#" chr${C}.gff | awk -F'\t' '
+BEGIN { IGNORECASE=1 }
+{
+    if (match($2,"curated|Coding_transcript")) {
+	if (match($3,"intron|coding_exon|exon|cds|three_prime_UTR|five_prime_UTR")) {
+	    gsub("coding_exon","CDS",$3)
+            gsub("Transcript ","",$9)
+            gsub(" .*","",$9)
+            gsub("three_prime_UTR","3utr",$3)
+            gsub("five_prime_UTR","5utr",$3)
+            for (i = 1; i < 9; ++i) {
+                printf "%s\t", $i
+            }
+            printf "%s\n", $9
+        }
+    }
+}
+' | sort -u | sort -k4n > filtered.chr${C}.gff
+done
+
+    ssh hgwdev
+    cd /hive/data/genomes/ce8/bed/sangerGene
+    nice -n +19 ldHgGene ce8 sangerGene filtered.*.gff
+    nice -n +19 ldHgGene -out=filteredGenePred.tab ce8 sangerGene filtered.*.gff
+    #	Read 55287 transcripts in 1027094 lines in 7 files
+    #	55287 groups 7 seqs 3 sources 5 feature types
+    #	31064 gene predictions
+
+###############################################################################