src/hg/makeDb/doc/ce2.txt 1.8

1.8 2009/11/25 21:48:38 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/ce2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/ce2.txt,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 1000000 -r1.7 -r1.8
--- src/hg/makeDb/doc/ce2.txt	17 Oct 2008 01:06:31 -0000	1.7
+++ src/hg/makeDb/doc/ce2.txt	25 Nov 2009 21:48:38 -0000	1.8
@@ -1,2825 +1,2825 @@
 # for emacs: -*- mode: sh; -*-
 
 
 # This file describes how to make the browser database for the
 # worm C. elegans
 # 2004-03-30 [DONE, 2004-05-20, hartera]
 
 #  NOTE:  this doc may have genePred loads that fail to include
 #  the bin column.  Please correct that for the next build by adding
 #  a bin column when you make any of these tables:
 #
 #  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
 #  +------------------+-------------------------+
 #  | tableName        | type                    |
 #  +------------------+-------------------------+
 #  | sangerGene       | genePred sangerPep      |
 #  | sangerGenefinder | genePred                |
 #  | refGene          | genePred refPep refMrna |
 #  | twinscan         | genePred twinscanPep    |
 #  | geneid           | genePred geneidPep      |
 #  +------------------+-------------------------+
 
 
 # DOWNLOAD SEQUENCE (DONE, 2004-03-31, hartera)
 
     # next machine
     ssh eieio
     mkdir -p /cluster/store5/worm/ce2/sanger
     mkdir -p /cluster/store5/worm/ce2/tmp
     ln -s /cluster/store5/worm/ce2 /cluster/data/ce2
     cd /cluster/data/ce2/sanger
 
     wget -o ce2.fetch.log -r -l1 --no-directories \
         ftp://ftp.sanger.ac.uk/pub/wormbase/WS120/CHROMOSOMES/
                                                                            
     # This site is now at:
     # ftp://ftp.sanger.ac.uk/pub/wormbase/FROZEN_RELEASES/WS120/CHROMOSOMES/
 
 # Takes about five minutes.
 # This current_release is updated every two weeks, although the
 #     sequence is quite stable at this time and changes very little.
 # Check that you have some valid files. Should be chroms I II III IV V X
 #     in dna, gff and agp formats, also Mitochondrial DNA, MtDNA in dna and 
 #     gff formats.
 
     ls -ogrt
 
 # Rename CHROMOSOME_MtDNA files to CHROMOSOME_M and change "CHROMOSOME_MtDNA"
 # to "CHROMOSOME_M" inside the files
     chmod u+w CHROMOSOME_M*
     zcat CHROMOSOME_MtDNA.dna.gz | sed -e "s/CHROMOSOME_MtDNA/CHROMOSOME_M/g" \
         | gzip > CHROMOSOME_M.dna.gz
     zcat CHROMOSOME_MtDNA.gff.gz | sed -e "s/CHROMOSOME_MtDNA/CHROMOSOME_M/g" \
         | gzip > CHROMOSOME_M.gff.gz
     # remove old files
     rm CHROMOSOME_MtDNA* 
 
 # CHROMOSOME_M sequence is identical to X54252.1 in GenBank
 # create a .agp file for chrM as there is none and hgGoldGapGl and other 
 # programs require a .agp file so create CHROMOSOME_M.agp:
 # M       1       13794   1       F       X54252.1	1       13794   +
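 # a minimal sketch of creating that file by hand (assuming the single
 # tab-separated line above is the entire AGP content, matching the other
 # CHROMOSOME_*.agp files in this directory):
     cd /cluster/data/ce2/sanger
     printf "M\t1\t13794\t1\tF\tX54252.1\t1\t13794\t+\n" > CHROMOSOME_M.agp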
 
 # translate to unzipped .fa, all upper case, and
 # rename the agp files so hgGoldGap can find them
     mkdir sangerFa
     foreach f (I II III IV V X M)
         echo -n "${f} "
         zcat sanger/CHROMOSOME_${f}.dna.gz | tr '[a-z]' '[A-Z]' | \
                 sed -e "s/CHROMOSOME_/chr/" > sangerFa/chr${f}.fa
         mkdir -p sangerFa/${f}
         ln -s /cluster/data/ce2/sanger/CHROMOSOME_${f}.agp sangerFa/${f}/chr${f}.agp
     end
     # hgGoldGap does not handle dir names over 2 characters:
     mv sangerFa/III sangerFa/3
 
 # CREATING DATABASE (DONE, 2004-03-31 - hartera)
 
     # Create the database.
     # next machine
 
     ssh hgwdev
     echo 'create database ce2' | hgsql ''
     # if you need to delete that database:  !!! WILL DELETE EVERYTHING !!!
     echo 'drop database ce2' | hgsql ce2
     # Use df to make sure there is at least 5 gig free on /var/lib/mysql:
     df -h /var/lib/mysql
 # Before loading data:
 # Filesystem            Size  Used Avail Use% Mounted on
 # /dev/sdc1             1.8T  280G  1.4T  17% /var/lib/mysql    
 
 # CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2004-03-31 - hartera)
     # next machine
     ssh hgwdev
     #  the following command copies all the data from the table
     #  grp in the database galGal2 to our new database ce2
     echo "create table grp (PRIMARY KEY(NAME)) select * from galGal2.grp" \
       | hgsql ce2
     # if you need to delete that table:   !!! WILL DELETE ALL grp data !!!
     echo 'drop table grp;' | hgsql ce2
     
 # MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2004-04-01, hartera)
     # This used to hold scripts -- better to keep them inline here so
     # they're in CVS.  Now it should just hold lift file(s) and
     # temporary scripts made by copy-paste from this file.
     mkdir /cluster/data/ce2/jkStuff
     # This is where most tracks will be built:
     mkdir /cluster/data/ce2/bed
 
 # SPLIT CONTIGS INTO 100,000 BP CHUNKS FOR CLUSTER RUNS
 # (DONE, 2004-04-01, hartera)
     # next machine
     ssh eieio
     cd /cluster/data/ce2
     rm -fr ./split
     mkdir split
     foreach f (I II III IV V X M)
         mkdir split/$f
         faSplit size sangerFa/chr${f}.fa 100000 split/$f/c -lift=split/chr${f}.lft
     end
                                                                           
 # 151 pieces of 151 written
 # 153 pieces of 153 written
 # 138 pieces of 138 written
 # 175 pieces of 175 written
 # 210 pieces of 210 written
 # 178 pieces of 178 written
 # 1 pieces of 1 written
 
    cat split/*.lft > liftAll.lft
    # copy them to /iscratch/i for cluster rsync
    # next machine
    ssh kkr1u00
    cd /cluster/data/ce2/split
    foreach c (I II III IV V X M)
        echo -n "${c} "
        mkdir -p /iscratch/i/worms/Celegans2/unmaskedSplit/${c}
        cp -p ${c}/c*.fa /iscratch/i/worms/Celegans2/unmaskedSplit/${c}
    end
    iSync
 
 # Run RepeatMasker on the chromosomes (DONE, 2004-04-02 - hartera)
     # next machine
     ssh kk
     cd /cluster/data/ce2
     # make run directory and job list, create the script to use 
     # for the RepeatMasker run
     cat << '_EOF_' > jkStuff/RMWorm
 #!/bin/csh -fe
 #
 #       This is a slight rearrangement of the
 #       RMChicken script used in makeGalGal2.doc
 #       The results here need to go to a different location
 #       $1 == chrom name: I II III IV V X M
 #       $2 == directory where split contig .fa is found
 #       $3 == name of contig .fa file
 cd $1
 pushd .
 cd $2
 /bin/mkdir -p /tmp/ce2/$3/$1
 /bin/cp $3 /tmp/ce2/$3/$1
 cd /tmp/ce2/$3/$1
 /cluster/bluearc/RepeatMasker/RepeatMasker -alignments -s -species elegans $3
 popd
 /bin/cp /tmp/ce2/$3/$1/$3.out ./
 if( -e /tmp/ce2/$3/$1/$3.align ) /bin/cp /tmp/ce2/$3/$1/$3.align ./
 if (-e /tmp/ce2/$3/$1/$3.tbl) /bin/cp /tmp/ce2/$3/$1/$3.tbl ./
 if (-e /tmp/ce2/$3/$1/$3.cat) /bin/cp /tmp/ce2/$3/$1/$3.cat ./
 /bin/rm -r /tmp/ce2/$3/$1
 /bin/rmdir --ignore-fail-on-non-empty /tmp/ce2/$3
 /bin/rmdir --ignore-fail-on-non-empty /tmp/ce2
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod +x jkStuff/RMWorm
     # create job list
     mkdir RMRun
     rm -f RMRun/RMJobs
     foreach c (I II III IV V X M)
         mkdir /cluster/data/ce2/RMRun/${c}
         foreach t ( /iscratch/i/worms/Celegans2/unmaskedSplit/$c/c*.fa )
             set d = $t:h
             set f = $t:t
             echo /cluster/data/ce2/jkStuff/RMWorm ${c} ${d} ${f} \
             '{'check out line+ /cluster/data/ce2/RMRun/$c/${f}.out'}' \
             >> RMRun/RMJobs
         end
     end
     # Do the run
     cd /cluster/data/ce2/RMRun
     para create RMJobs
     para try, para check, para check, para push, para check, ...
 para try:
 # para time
 # Checking finished jobs
 # Completed: 1006 of 1006 jobs
 # CPU time in finished jobs:     821747s   13695.78m   228.26h    9.51d  0.026 y
 # IO & Wait Time:                 13643s     227.39m     3.79h    0.16d  0.000 y
 # Average job time:                 830s      13.84m     0.23h    0.01d
 # Longest job:                      935s      15.58m     0.26h    0.01d
 # Submission to last job:          5837s      97.28m     1.62h    0.07d
   
     # when they are finished, liftUp and load the .out files into the database:
     # next machine
     ssh eieio
     cd /cluster/data/ce2/RMRun
     foreach c (I II III IV V X M)
         liftUp chr${c}.fa.out /cluster/data/ce2/split/chr${c}.lft warn ${c}/*.fa.out
     end
 
     # next machine
     ssh hgwdev
     cd /cluster/data/ce2/RMRun
     hgLoadOut ce2 chr*.fa.out
 
     # Noticed one error in this load (reported to Robert Hubley):
     # Processing chrV.fa.out
     # Strange perc. field -0.1 line 2196 of chrV.fa.out
 
 # SIMPLE REPEAT [TRF] TRACK  (DONE, hartera 2004-04-01)
     # ensure chr*.fa files exist on /iscratch/i
     # next machine
     ssh kkr1u00
     mkdir -p /iscratch/i/worms/Celegans2/unmaskedFa
     cp -p /cluster/data/ce2/sangerFa/*.fa \
         /iscratch/i/worms/Celegans2/unmaskedFa
     iSync
 # done iSync, 
     # Create cluster parasol job:
     # next machine
     ssh kk
     mkdir -p /cluster/data/ce2/bed/simpleRepeat
     cd /cluster/data/ce2/bed/simpleRepeat
     mkdir trf
     ls -1S /iscratch/i/worms/Celegans2/unmaskedFa/chr*.fa > genome.lst
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)}  /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp
 #ENDLOOP
 '_EOF_'
                                                                                 
     echo "" > dummy.lst
     gensub2 genome.lst dummy.lst gsub spec
     para create spec
     para try    # there are only 7, so this runs them all
     para check
 
 # para time
 # Checking finished jobs
 # Completed: 7 of 7 jobs
 # CPU time in finished jobs:       3301s      55.01m     0.92h    0.04d  0.000 y
 # IO & Wait Time:                    38s       0.64m     0.01h    0.00d  0.000 y
 # Average job time:                 477s       7.95m     0.13h    0.01d
 # Longest job:                      975s      16.25m     0.27h    0.01d
 # Submission to last job:           975s      16.25m     0.27h    0.01d
 
     #  When cluster run is done, combine into one:
     cat trf/*.bed > simpleRepeat.bed
 
     # Load into the database:
     # next machine
     ssh hgwdev
     cd /cluster/data/ce2/bed/simpleRepeat
     hgLoadBed ce2 simpleRepeat simpleRepeat.bed \
       -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
     # Loaded 28598 elements of size 16
 
 # PROCESS SIMPLE REPEATS INTO MASK (DONE,  2004-04-02 - hartera)
     # After the simpleRepeats track has been built, make a filtered version
     # of the trf output: keep trf's with period <= 12:
 
     # next machine
     ssh eieio
     cd /cluster/data/ce2/bed/simpleRepeat
     mkdir -p trfMask
     foreach f (trf/*.bed)
         awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
     end
 
 #  create soft and hard masks from RepeatMasker and TRF outputs
 #  and build the nib files using the soft-masked fa:
     # next machine
     ssh eieio
     cd /cluster/data/ce2
     mkdir softMask
     mkdir nib
     cd /cluster/data/ce2
     foreach c (I II III IV V X M)
         echo -n "masking chr${c} "
         maskOutFa sangerFa/chr${c}.fa RMRun/chr${c}.fa.out \
                 softMask/chr${c}.fa -soft
         maskOutFa softMask/chr${c}.fa \
                 bed/simpleRepeat/trfMask/chr${c}.bed \
                 softMask/chr${c}.fa -softAdd
         faToNib -softMask softMask/chr${c}.fa nib/chr${c}.nib
     end
 
 # output:
 # masking chrI Writing 15080483 bases in 7540250 bytes
 # masking chrII Writing 15279308 bases in 7639662 bytes
 # masking chrIII Writing 13783313 bases in 6891665 bytes
 # masking chrIV Writing 17493791 bases in 8746904 bytes
 # masking chrV Writing 20922231 bases in 10461124 bytes
 # masking chrX Writing 17718849 bases in 8859433 bytes
 # masking chrM Writing 13794 bases in 6905 bytes
 
 # create hard masks 
     mkdir hardMask
     foreach c (I II III IV V X M)
         echo "masking chr${c}"
         /cluster/bin/i386/maskOutFa softMask/chr${c}.fa hard \
                 hardMask/chr${c}.fa
     end
 
     ssh kkr1u00
     cd /cluster/data/ce2/softMask
     mkdir -p /iscratch/i/worms/Celegans2/bothMasksFa
     mkdir -p /iscratch/i/worms/Celegans2/nib
     cp -p *.fa /iscratch/i/worms/Celegans2/bothMasksFa
     cd /cluster/data/ce2/nib
     cp -p c*.nib /iscratch/i/worms/Celegans2/nib
     iSync
 
 # STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE, 2004-04-02 - hartera)
 
     # Make symbolic links from /gbdb/ce2/nib to the real nibs.
     # next machine
     ssh hgwdev
     mkdir -p /gbdb/ce2/nib
 
     foreach f (/cluster/data/ce2/nib/*.nib) 
       ln -s $f /gbdb/ce2/nib
     end
     cd /cluster/data/ce2/tmp
     # Load /gbdb/ce2/nib paths into database and save size info
     # hgNibSeq creates chromInfo table
     hgNibSeq -preMadeNib ce2 /gbdb/ce2/nib /cluster/data/ce2/sangerFa/chr*.fa
 
 # Typical output:
 # Processing /cluster/data/ce2/sangerFa/chrI.fa to /gbdb/ce2/nib/chrI.nib
 # Processing /cluster/data/ce2/sangerFa/chrII.fa to /gbdb/ce2/nib/chrII.nib
 # Processing /cluster/data/ce2/sangerFa/chrIII.fa to /gbdb/ce2/nib/chrIII.nib
 # Processing /cluster/data/ce2/sangerFa/chrIV.fa to /gbdb/ce2/nib/chrIV.nib
 # Processing /cluster/data/ce2/sangerFa/chrM.fa to /gbdb/ce2/nib/chrM.nib
 # Processing /cluster/data/ce2/sangerFa/chrV.fa to /gbdb/ce2/nib/chrV.nib
 # Processing /cluster/data/ce2/sangerFa/chrX.fa to /gbdb/ce2/nib/chrX.nib
 # 100291769 total bases
 
     #   Verify the hgNibSeq load functioned OK:
     hgsql -e "select chrom, size from chromInfo" ce2 > chrom.sizes
     cat chrom.sizes
 # chrom.sizes:
 # chrom   size
 # chrI    15080483
 # chrII   15279308
 # chrIII  13783313
 # chrIV   17493791
 # chrM    13794
 # chrV    20922231
 # chrX    17718849
 
 # MAKE GAP tracks AND LOAD ASSEMBLY FRAGMENTS INTO DATABASE (DONE, 2004-04-05, hartera)
     # next machine
      ssh hgwdev
      mkdir -p /cluster/data/ce2/bed/gap
      cd /cluster/data/ce2/bed/gap
      # finds motifs and finds location of gaps as part of output
      foreach c (I II III IV V X M)
         findMotif -chr=chr${c} -verbose=4 -motif=gcatg /gbdb/ce2/nib >& chr${c}Bed.stderr 
      end
      grep -h GAP *.stderr | sed -e "s/#GAP //" > gap.bed
 
     # hgGoldGap does not handle dir names over 2 characters
     #   directory III has been moved to 3 when all this was created above
     hgGoldGapGl -noGl ce2 /cluster/data/ce2 sangerFa
 
     # All the gap tables are empty
     # Load in gap.bed
     # Need to add extra fields to gap.bed file
 
     cat << '_EOF_' > /cluster/data/ce2/jkStuff/createGapFile.pl
 #!/usr/bin/perl -w
 use strict;
 
 my $oldchr = "";
 while (<STDIN>) {
   my @fields = split(/\t/);
   my $chr = $fields[0];
   if ($chr ne $oldchr) {
      open(OUT, ">$chr"."_gap.bed");
      $oldchr = $chr;
   }
   print OUT "$fields[0]\t$fields[1]\t$fields[2]\t$fields[3]\tN\t$fields[4]\tfragment\tyes\n";
 }
 '_EOF_'
  
     perl /cluster/data/ce2/jkStuff/createGapFile.pl < gap.bed
     # load into relevant tables
     foreach c (I II III IV V X M) 
         hgLoadBed -tab -oldTable ce2 chr${c}_gap chr${c}_gap.bed
     end 
 
 # CREATE gc5Base wiggle TRACK (DONE, 2004-04-05, hartera)
 
     # Perform a gc count with a 5 base window. 
     # Also compute a "zoomed" view for display efficiency.
 
     mkdir /cluster/data/ce2/bed/gc5Base
     cd /cluster/data/ce2/bed/gc5Base
 
     #   in the script below, the 'grep -w GC' selects the lines of
     #   output from hgGcPercent that are real data and not just some
     #   information from hgGcPercent.  The awk computes the number
     #   of bases that hgGcPercent claimed it measured, which is not
     #   necessarily always 5 if it ran into gaps, and then the division
     #   by 10.0 scales down the numbers from hgGcPercent to the range
     #   [0-100].  Two columns come out of the awk print statement:
     #   <position> and <value> which are fed into wigAsciiToBinary through
     #   the pipe.  It is set at a dataSpan of 5 because each value
     #   represents the measurement over five bases beginning with
     #   <position>.  The result files end up in ./wigData5.
     cat << '_EOF_' > /cluster/data/ce2/jkStuff/runGcPercent.sh
 #!/bin/sh
 mkdir -p wigData5
 mkdir -p dataLimits5
 for n in /cluster/data/ce2/nib/*.nib
 do
         c=`basename ${n} | sed -e "s/.nib//"`
         C=`echo $c | sed -e "s/chr//"`
         echo -n "working on ${c} - ${C} ... "
         hgGcPercent -chr=${c} -doGaps \
                 -file=stdout -win=5 ce2 /cluster/data/ce2/nib | grep -w GC | \
                 awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \
         wigAsciiToBinary \
         -dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \
         -name=${C} stdin 2> dataLimits5/${c}
 echo "done"
 done
 '_EOF_'
 
     chmod +x /cluster/data/ce2/jkStuff/runGcPercent.sh
 
     #   This is going to take perhaps two hours to run.  It is a lot of
     #   data.  Make sure you do it on the fileserver:
     ssh eieio
     cd /cluster/data/ce2/bed/gc5Base
     /cluster/data/ce2/jkStuff/runGcPercent.sh
     # load the .wig files back on hgwdev:
     ssh hgwdev
     cd /cluster/data/ce2/bed/gc5Base
     hgLoadWiggle ce2 gc5Base wigData5/*.wig
     # and symlink the .wib files into /gbdb
     mkdir /gbdb/ce2/wib
     ln -s `pwd`/wigData5/*.wib /gbdb/ce2/wib
 
     # to speed up display for whole chromosome views, compute a "zoomed"
     # view and load that on top of the existing table.  The savings
     # comes from the number of data table rows the browser needs to load
     # for a full chromosome view.  Without the zoomed view there are
     # over 43,000 data rows for chrom 1.  With the zoomed view there are
     # only 222 rows needed for the display.  If your original data was
     # at 1 value per base the savings would be even greater.
     #   Pretty much the same data calculation
     # situation as above, although this time note the use of the
     # 'wigZoom -dataSpan=1000 stdin' in the pipeline.  This will average
     # together the data points coming out of the awk print statement over
     # a span of 1000 bases.  Thus each <position> coming out of wigZoom
     # will represent the measurement of GC in the next 1000 bases.  Note
     # the use of -dataSpan=1000 on the wigAsciiToBinary to account for
     # this type of data.  You want your dataSpan here to be an exact
     # multiple of your original dataSpan (5*200=1000) and on the order
     # of at least 1000, doesn't need to go too high.  For data that is
     # originally at 1 base per value, a convenient span is: -dataSpan=1024
     # A new set of result files ends up in ./wigData5_1K/*.wi[gb]
 
     cat << '_EOF_' > /cluster/data/ce2/jkStuff/runZoom.sh
 #!/bin/sh
                                                                                 
 mkdir -p wigData5_1K
 mkdir -p dataLimits5_1K
                                                                                 
 for n in /cluster/data/ce2/nib/*.nib
 do
         c=`basename ${n} | sed -e "s/.nib//"`
         C=`echo $c | sed -e "s/chr//"`
         echo -n "working on ${c} - ${C} ... "
         hgGcPercent -chr=${c} -doGaps \
                 -file=stdout -win=5 ce2 /cluster/data/ce2/nib | grep -w GC | \
                 awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \
         wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \
         -dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \
         -name=${C} stdin 2> dataLimits5_1K/${c}
 echo "done"
 done
 '_EOF_'
 
     chmod +x /cluster/data/ce2/jkStuff/runZoom.sh
     #   This is going to take even longer than above, certainly do this
     #   on the fileserver
     ssh eieio
     cd /cluster/data/ce2/bed/gc5Base
     time /cluster/data/ce2/jkStuff/runZoom.sh
     # 520.000u 19.310s 6:15.09 143.7% 0+0k 0+0io 7875pf+0w
 
     #   Then load these .wig files into the same database as above
     ssh hgwdev
     hgLoadWiggle -oldTable ce2 gc5Base wigData5_1K/*.wig
     # and symlink these .wib files into /gbdb
     mkdir /gbdb/ce2/wib
     ln -s `pwd`/wigData5_1K/*.wib /gbdb/ce2/wib
 
     #   The browser needs to be fixed so it can display the assembly track 
     #   without the need for the gap tables to exist.
 
 # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CE2 (DONE, 2004-04-05, hartera)
     # next machine
     ssh hgwdev
     # Make trackDb table so browser knows what to expect:
     cd $HOME/kent/src/hg/makeDb/trackDb
     cvs up -d -P
     # Edit that makefile to add ce2 in all the right places and do
     make update
     make alpha
     cvs commit -m "Added ce2" makefile
 
     # Add dbDb and defaultDb entries:
     echo 'insert into dbDb (name, description, nibPath, organism,  \
           defaultPos, active, orderKey, genome, scientificName,  \
           htmlPath, hgNearOk)  \
           values("ce2", "March 2004", \
           "/gbdb/ce2/nib", "Worm", "chrII:14642289-14671631", 1, \
           60, "C. elegans", "Caenorhabditis elegans", \
           "/gbdb/ce2/html/description.html", 0);' \
     | hgsql -h genome-testdb hgcentraltest
     echo 'update defaultDb set name = "ce2" where genome = "C. elegans"' \
         | hgsql -h genome-testdb hgcentraltest
                                                                                
 # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2004-04-05, hartera)
     ssh hgwdev
     mkdir /cluster/data/ce2/html
     cd /cluster/data/ce2/html
     # make a symbolic link from /gbdb/ce2/html to /cluster/data/ce2/html
     ln -s /cluster/data/ce2/html /gbdb/ce2/html
     # Write a description.html - copy from /cluster/data/ce1/html/
     # with a description of the assembly and some sample position queries.
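     # a minimal sketch of that copy step (assuming ce1's page is the starting
     # template; edit the copy for the WS120 / March 2004 ce2 assembly):
     cp /cluster/data/ce1/html/description.html /cluster/data/ce2/html/description.html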
     # create ce2 dir in /trackDb/worm and commit to CVS
     mkdir ~/kent/src/hg/makeDb/trackDb/worm/ce2
     cvs add ce2
     cvs commit ce2
     # Add this also to ~/kent/src/hg/makeDb/trackDb/worm/ce2/description.html
     chmod a+r $HOME/kent/src/hg/makeDb/trackDb/worm/ce2/description.html    
     # Check it in and copy (ideally using "make alpha" in trackDb) to
     # /gbdb/ce2/html
     cvs commit description.html
   
 # MAKE SANGER GENE (WORM BASE GENES) TRACK (DONE, 2004-04-09, hartera)
 # ADD MISSING ORFS AND PROTEIN NAMES TO sangerLinks TABLE AND 
 # ADD proteinID COLUMN TO SANGERGENE TABLE FOR USE BY THE 
 # FAMILY BROWSER (DONE, 2004-05-20, hartera)
                                                                                 
    # next machine
    ssh eieio
    mkdir -p /cluster/data/ce2/bed/sangerGene
    cd /cluster/data/ce2/bed/sangerGene
     # the perl trims these files down to a reasonable size.  As they are,
     #   they cause ldHgGene to hang due to memory size.
     foreach f (I II III IV V X M)
         echo -n "${f} "
         zcat ../../sanger/CHROMOSOME_${f}.gff.gz | \
         sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \
         -e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \
         -e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \
         -e "s/CHROMOSOME_M/chrM/g" \
         -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \
         perl -ne '@a = split; \
         print if($a[1] =~ /curated|DNA|RNA/i && \
         $a[2] =~ /intron|exon|cds|sequence|transcri/i);' > chr${f}.gff
     end
     echo
     # remove the CDS prefix before the id, e.g. CDS "Y74C9A.2", and remove the quotes
     perl -pi.bak -e 's/CDS "//' chr*.gff
     perl -pi.bak -e 's/"//' chr*.gff
     # check this worked ok then cleanup
     rm *.bak
     #  check file sizes, should be reasonable
     ls -ogrt
 # total 23352
 # -rw-rw-r--    1  3430979 Apr  9 16:09 chrIII.gff
 # -rw-rw-r--    1  3885105 Apr  9 16:09 chrII.gff
 # -rw-rw-r--    1  3680753 Apr  9 16:09 chrI.gff
 # -rw-rw-r--    1  4926390 Apr  9 16:09 chrV.gff
 # -rw-rw-r--    1     3635 Apr  9 16:09 chrM.gff
 # -rw-rw-r--    1  3854560 Apr  9 16:09 chrIV.gff
 # -rw-rw-r--    1  4085489 Apr  9 16:09 chrX.gff                                                                                
     # now load database with those transformed gff files
     # next machine
     ssh hgwdev
     cd /cluster/data/ce2/bed/sangerGene
     # 2004-05-10, hartera, Reload sangerGene table using extended GenePred
     # format. 2004-05-11, hartera. Extended format frame information does not
     # look correct. Reload without the extended fields.
     ldHgGene ce2 sangerGene *.gff
 # Read 41259 transcripts in 423845 lines in 7 files
 #  41259 groups 7 seqs 11 sources 6 feature types
 # 23076 gene predictions
 
     # if you need to delete that table:
     echo 'drop table sangerGene' | hgsql ce2
 
 #    worm/ce2 trackDb.ra entry
 # track sangerGene
 # shortLabel WormBase Genes
 # longLabel WormBase Gene Annotations
 # group genes
 # priority 48
 # visibility pack
 # color 0,0,200
 # chromosomes chrI,chrII,chrIII,chrIV,chrV,chrX,chrM
 # type genePred sangerPep
 # url http://www.wormbase.org/db/gene/gene?name=$$
 # hgGene on
     
     # Add proteinID field to sangerGene table, used by Gene Sorter
     ssh hgwdev
     cd /cluster/data/ce2/bed/sangerGene
     hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce2
     # To add index on this column
     hgsql -e 'alter table sangerGene add index(proteinID);' ce2
     # Add Swiss-Prot protein IDs to this column
     # There are 23076 entries in sangerGene and 21780 of the names 
     # are in sangerLinks as orfName
     hgsql -N -e 'select count(*) from sangerGene as g,sangerLinks as l \
            where g.name = l.orfName;' ce2
     # 21780
     # get names from sangerGene and sangerLinks tables
     hgsql -N -e "select name from sangerGene;" ce2 | sort > name.sangerGene
     hgsql -N -e "select orfName from sangerLinks;" ce2 | sort > orfName.sangerLinks
     # get list of names in sangerGene not in sangerLinks
     comm -23 name.sangerGene orfName.sangerLinks > geneNames.notin.sangerLinks
     # Go to the WS120 WormBase mirror site and check SwissProt IDs
     # http://ws120.wormbase.org/db/searches/info_dump
     # Some of these genes are non-coding RNAs or have no SwissProt ID, but
     # others in the geneNames.notin.sangerLinks list do have a Swiss-Prot ID.
     # Download IDs in batches of about 400 gene names from
     # geneNames.notin.sangerLinks and add them to a file, SPIds.wormPep.WS120
     # (one way to batch the list is sketched below).
     # Create a perl script to parse out Swiss-Prot IDs for those genes that 
     # have them and create the sql statements to insert them into sangerLinks
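     # a minimal sketch of splitting the list into batches of about 400 names
     # for the web form (the batchNames. output prefix is arbitrary):
     split -l 400 geneNames.notin.sangerLinks batchNames.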
 
 cat << '_EOF_' > getSPIDsandLoad.pl
 #!/usr/bin/perl -w
 use strict;
 my %SPhash;
 my $found = "false";
 my $gene = "";
 my $SP = "";
 
 open(SP, ">addSPIds.sql") || die "Can not create addSPIds.sql: $!";
 while(<STDIN>) {
     chomp;
                                                                                     
     my @fields = split(/\t/);
     if ($fields[0] =~ /^Gene:/) {
        $gene = $fields[1];
     }
     elsif (($fields[0] =~ /^GenPep:/) && ($gene ne "n/a") ){
        # if there are 2 fields then store second as Swiss-Prot ID
        if ($#fields == 1) {
           $SP = $fields[1];
        }
                                                                                     
        # add gene and Swiss-Prot ID to hash
        $SPhash{$gene} = $SP;
        $gene = "";
        $SP = "";
     }
 }
 foreach my $g (keys(%SPhash) ) {
    my $s = $SPhash{$g};
    # print if there is a Swiss-Prot ID
    if ($s =~ /^[A-Z]{1}[0-9]+/) {
        print "Gene: $g\tSP ID: $s\n";
        # create mySQL to add this to the sangerLinks table;
        print SP "INSERT into sangerLinks (orfName, protName, description) VALUES(\"$g\",\"$s\",\"\");\n"
    }
 }
 close SP;
 '_EOF_'
 
       # run perl script
       perl getSPIDsandLoad.pl < SPIds.wormPep.WS120 > orfsandSPIds.out
       # There are 111 of these genes with SwissProt IDs
       # To insert new orfNames and SwissProt IDs into table:
       hgsql ce2 < addSPIds.sql 
       # Now 21891 names in sangerGene that match orfNames in sangerLinks
 
       # Use SwissProt IDs in sangerLinks table to fill proteinID 
       # column in the sangerGene table
       hgsql -e 'update sangerGene as g, sangerLinks as l \
           set g.proteinID = l.protName where g.name = l.orfName;' ce2
       # check there are 21891 rows with the protName field filled
 
 # MYTOUCH FIX - Jen - 2006-01-24
   sudo mytouch ce2 orfToGene 0505031600.00
   sudo mytouch ce2 sangerBlastTab 0505031600.00
   sudo mytouch ce2 sangerCanonical 0505031600.00
   sudo mytouch ce2 sangerIsoforms 0505031600.00
   sudo mytouch ce2 sangerLinks 0505031600.00
   sudo mytouch ce2 sangerPep 0505031600.00
   sudo mytouch ce2 sangerToKim 0505031600.00
   sudo mytouch ce2 sangerToPfam 0505031600.00
   sudo mytouch ce2 sangerToRefSeq 0505031600.00
 
 
 # MAKE WORMBASE GENEFINDER TRACKS (DONE, 2004-04-12, hartera)
     # next machine
     ssh eieio
     mkdir -p /cluster/data/ce2/bed/sangerGenefinder
     cd /cluster/data/ce2/bed/sangerGenefinder
     # the perl trims these files down to a reasonable size.  As they are,
     #   they cause ldHgGene to hang due to memory size.
     foreach f (I II III IV V X M)
         echo -n "${f} "
         zcat ../../sanger/CHROMOSOME_${f}.gff.gz | \
         sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \
         -e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \
         -e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \
         -e "s/CHROMOSOME_M/chrM/g" \
         -e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \
         perl -ne '@a = split; \
         print if($a[1] =~ /Genefinder/i && \
         $a[2] =~ /intron|exon|cds|sequence|transcri/i);' > chr${f}.gff
     end
     echo
     # remove the CDS prefix before the id, e.g. CDS "Y74C9A.2", and remove the quotes
     perl -pi.bak -e 's/CDS "//' chr*.gff
     perl -pi.bak -e 's/"//' chr*.gff
     # check this worked ok then cleanup
     rm *.bak
     #  check file sizes, should be reasonable
     ls -ogrt
 # total 23712
 # -rw-rw-r--    1  3984558 Apr 12 09:14 chrII.gff
 # -rw-rw-r--    1  3702836 Apr 12 09:14 chrI.gff
 # -rw-rw-r--    1        0 Apr 12 09:14 chrM.gff
 # -rw-rw-r--    1  3948579 Apr 12 09:14 chrIV.gff
 # -rw-rw-r--    1  3364561 Apr 12 09:14 chrIII.gff
 # -rw-rw-r--    1  5303266 Apr 12 09:14 chrV.gff
 # -rw-rw-r--    1  3929597 Apr 12 09:14 chrX.gff
 
     # now load database with those transformed gff files
     # next machine
     ssh hgwdev
     cd /cluster/data/ce2/bed/sangerGenefinder
     # 2004-05-10, hartera, Reload sangerGenefinder table using extended GenePred
     # format. 2004-05-11, hartera. Extended format frame information does not
     # look correct. Reload without the extended fields.
     ldHgGene ce2 sangerGenefinder *.gff
 # Read 36976 transcripts in 401688 lines in 7 files
 #   36976 groups 6 seqs 1 sources 4 feature types
 # 21180 gene predictions
 
     # if you need to delete that table:
     echo 'drop table sangerGenefinder' | hgsql ce2
 
 # RUN Waba alignment with briggsae  (WORKING - 2004-04-06 - Hiram)
     # prepare contigs from C. briggsae
     #   Assumes C. briggsae data has been downloaded according to
     #   makeCb1.doc
     # using briggsae contigs from previous work:
 	/iscratch/i/worms/Cbriggsae/contigs
     
     # next machine
     ssh kk
     mkdir -p /cluster/data/ce2/bed/waba/out
     cd /cluster/data/ce2/bed/waba
     ls -1S /iscratch/i/worms/Cbriggsae/contigs/c*.fa > briggsae.lst
     ls -1S /iscratch/i/worms/Celegans2/unmaskedFa/chr*.fa > elegans.lst
     cat elegans.lst | while read FN
     do
 	b=`basename ${FN}`
 	mkdir out/${b%%.fa}
     done
     # create scripts to be used here
     cat << '_EOF_' > wabaRun
 #!/bin/csh -fe
 #
 #	$1 - full pathname to a briggsae contig
 #	$2 - file path to an elegans chrom.fa
 #	$3 - result file full pathname
 #
 set f = $1:t
 set chr = $2:t
 set d = $chr:r
 mkdir -p /tmp/$d/$f
 cp $1 /tmp/$d/$f
 pushd .
 cd /tmp/$d/$f
 set t = $f:r
 /cluster/home/hiram/bin/i386/waba 1 $f $2 $t.1
 /cluster/home/hiram/bin/i386/waba 2 $t.1 $t.2
 /cluster/home/hiram/bin/i386/waba 3 $t.2 $t.wab
 cp $t.wab $3
 popd
 rm -f /tmp/$d/$f/$t.*
 rmdir --ignore-fail-on-non-empty /tmp/$d/$f
 rmdir --ignore-fail-on-non-empty /tmp/$d
 '_EOF_'
     chmod +x wabaRun
 
     cat << '_EOF_' > jobTemplate
 #LOOP
 /cluster/store5/worm/ce2/bed/waba/wabaRun {check in exists+ $(path1)} {check in exists+ $(path2)} {check out exists /cluster/store5/worm/ce2/bed/waba/out/$(root2)/$(root1).wab}
 #ENDLOOP
 '_EOF_'
 
     gensub2 briggsae.lst elegans.lst jobTemplate jobList
     para create jobList
     para try
     para check
     para push
     # one of the jobs takes quite a while.  Most of the others are OK:
 Completed: 6950 of 6951 jobs
 Crashed: 1 jobs
 CPU time in finished jobs:   19284472s  321407.87m  5356.80h  223.20d  0.612 y
 IO & Wait Time:                 63556s    1059.26m    17.65h    0.74d  0.002 y
 Average job time:                2784s      46.40m     0.77h    0.03d
 Longest job:                    47017s     783.62m    13.06h    0.54d
 Submission to last job:         58555s     975.92m    16.27h    0.68d
 
     #	The failed job is:
     # /cluster/store5/worm/ce2/bed/waba/wabaRun \
     #	/iscratch/i/worms/Cbriggsae/contigs/c0907.fa \
     #	/iscratch/i/worms/Celegans2/unmaskedFa/chrI.fa \
     #	/cluster/store5/worm/ce2/bed/waba/out/chrI/c0907.wab
 XXXX - running 2004-04-07 - retrying this stand-alone on kkr1u00
     # after about 1.5 hours fails with the error:
     # waba: xenbig.c:550: mergeTwo: Assertion `c->qStart == qStart &&
     #	c->qEnd == qEnd' failed.
     # Abort
 
     # Ignoring that error, and moving on
 
     # next machine
     ssh hgwdev
 
     cd /cluster/data/ce2/bed/waba
     mkdir Load
     cd Load
     #  The cat through the pipe to hgWaba will avoid making large files
     # 	that are not needed.
     cat << '_EOF_' > loadEm.sh
 #!/bin/sh
 #
 for c in I II III IV V X M
 do
 	echo -n "${c} "
         cat /cluster/store5/worm/ce2/bed/waba/out/chr${c}/c*.wab |
         /cluster/bin/i386/hgWaba ce2 Cbr chr${c} 0 stdin > proc${c}.out 2>&1
 done
 
 exit 0
 '_EOF_'
     chmod +x loadEm.sh
 
     # run it to load the waba track:
     ./loadEm.sh
     # remove garbage temp files:
     rm full_waba.tab chrom_waba.tab
 
     # worm/ce2/trackDb.ra entry:
 #   track cbrWaba
 #   shortLabel Briggsae Waba
 #   longLabel C. briggsae Waba Alignments
 #   group compGeno
 #   priority 20
 #   visibility dense
 #   color 140,0,200
 #   altColor 210,140,250
 
 # MAKE 2BIT FILE FOR BLAT (DONE, 2004-04-07, hartera)  
     # Make one big 2bit file as well, and make a link to it in
     # /gbdb/ce2/nib because hgBlat looks there:
     ssh eieio
     cd /cluster/data/ce2
     faToTwoBit softMask/chr*.fa ce2.2bit
     ssh hgwdev
     ln -s /cluster/data/ce2/ce2.2bit /gbdb/ce2/nib/
 
 # MAKE 11.OOC FILE FOR BLAT (DONE, 2004-04-07, hartera)
 # Use -repMatch=40 (based on genome size; for human, use 1024)
     ssh kkr1u00
     mkdir /cluster/bluearc/ce2
     mkdir /cluster/data/ce2/bed/ooc
     cd /cluster/data/ce2/bed/ooc
     ls -1 /cluster/data/ce2/nib/chr*.nib > nib.lst
     /cluster/bin/i386/blat nib.lst /dev/null /dev/null -tileSize=11 \
       -makeOoc=/cluster/bluearc/ce2/11.ooc -repMatch=40
 # Writing /cluster/bluearc/ce2/11.ooc
 # Wrote 43676 overused 11-mers to /cluster/bluearc/ce2/11.ooc
    cp -p /cluster/bluearc/ce2/11.ooc /iscratch/i/worms/Celegans2/
    iSync
 
 # AUTO UPDATE GENBANK MRNA AND EST RUN (DONE, 2004-04-07, hartera)
     # next machine
     ssh hgwdev
     # Update genbank config and source in CVS:
     cd ~/kent/src/hg/makeDb/genbank
     cvsup .
     # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
     # edits, check them in if necessary:
     diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf
 
     # Edit etc/genbank.conf: default includes native genbank mRNAs and ESTs, 
     # genbank xeno mRNAs but no ESTs, native RefSeq mRNAs but not xeno RefSeq
     # Add these lines:
 # ce2 (C. elegans)
 ce2.genome = /iscratch/i/worms/Celegans2/nib/chr*.nib
 ce2.lift = no
 ce2.downloadDir = ce2
 
     cvs commit -m "Added ce2" etc/genbank.conf
     make
 
     # markd added -maxIntron=100000 for ce to genbank/src/align/gbBlat
     # Edit src/align/gbBlat to add /iscratch/i/worms/Celegans2/11.ooc
     cvs diff src/align/gbBlat
     make
     cvs commit -m "Added 11.ooc for ce2" src/align/gbBlat
     # Install to /cluster/data/genbank:
     make install-server
     
     # next machine
     ssh eieio
     cd /cluster/data/genbank
     # This is an -initial run, RefSeq mRNA only:
     nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial ce2
 
     # Load results for RefSeq mRNAs:
     ssh hgwdev
     cd /cluster/data/genbank
     nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad ce2
     # To start the next run, results need to be moved out of the way
     cd /cluster/data/genbank/work
     mv initial.ce2 initial.ce2.refseq.mrna
 
     # -initial for GenBank mRNAs
     ssh eieio
     cd /cluster/data/genbank
     nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial ce2
 
     # Load results for GenBank mRNAs
     ssh hgwdev
     cd /cluster/data/genbank
     nice bin/gbDbLoadStep -verbose=1 ce2 
     cd /cluster/data/genbank/work
     mv initial.ce2 initial.ce2.genbank.mrna
 
     # -initial for ESTs
     ssh eieio
     cd /cluster/data/genbank
     nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial ce2
 
     # Load results for GenBank ESTs
     ssh hgwdev
     cd /cluster/data/genbank
     nice bin/gbDbLoadStep -verbose=1 ce2 
     cd /cluster/data/genbank/work
     mv initial.ce2 initial.ce2.genbank.est
    
     # Everything loaded ok, so clean up
     rm -r /cluster/data/genbank/work/initial.ce2*
     
 # REMOVE XENO mRNAs (DONE, 2004-04-09, hartera)
     # Remove xeno mRNAs; too many are distantly related and do
     # not have meaningful alignments using Blat
     ssh hgwdev
     # Remove all mrna.xeno.* files from
     # /cluster/data/genbank/data/aligned/genbank.140.0/ce2
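     # a minimal sketch of that removal (assuming the mrna.xeno.* files sit
     # somewhere under that directory; the find is illustrative only):
     find /cluster/data/genbank/data/aligned/genbank.140.0/ce2 \
         -name "mrna.xeno.*" -exec rm {} \;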
     # edit /cluster/data/genbank/etc/genbank.conf
     cd ~/kent/src/hg/makeDb/genbank
     # add line to ce2:
 ce2.genbank.mrna.xeno.load = no
 # so now it is:
 # ce2 (C. elegans)
 ce2.genome = /iscratch/i/worms/Celegans2/nib/chr*.nib
 ce2.lift = no
 ce2.genbank.mrna.xeno.load = no
 ce2.downloadDir = ce2
     make
     cvs update etc/genbank.conf
     cvs commit etc/genbank.conf
     make install-server
 
     cd /cluster/data/genbank/
     # reload only the native mRNAs to the database
     nice bin/gbDbLoadStep -reload -type=mrna -srcDb=genbank -verbose=1 ce2
     # drop xenoMrna table
     echo 'drop table xenoMrna' | hgsql ce2
     
 # PRODUCING FUGU BLAT ALIGNMENTS (DONE, 2004-04-07, hartera)
     # Assumes masked NIBs have been prepared as above
     # and Fugu pieces are already on kluster /iscratch/i.
     # next machine
     ssh kk
     mkdir /cluster/data/ce2/bed/blatFr1
     cd /cluster/data/ce2/bed/blatFr1
     ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
     ls -1S /iscratch/i/worms/Celegans2/nib/*.nib > worm.lst
     cat << '_EOF_' > gsub
 #LOOP
 blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     mkdir psl
     gensub2 worm.lst fugu.lst gsub spec
     para create spec
     para try, check, push, check, ...
 # para time
 # Completed: 4046 of 4046 jobs
 # CPU time in finished jobs:     263359s    4389.31m    73.16h    3.05d  0.008 y
 # IO & Wait Time:                 13317s     221.95m     3.70h    0.15d  0.000 y
 # Average job time:                  68s       1.14m     0.02h    0.00d
 # Longest job:                     1103s      18.38m     0.31h    0.01d
 # Submission to last job:          2170s      36.17m     0.60h    0.03d
 
     # When cluster run is done, sort alignments
     # next machine
     ssh eieio
     cd /cluster/data/ce2/bed/blatFr1
     foreach d (I II III IV V X M)
         echo -n "${d} "
         pslCat psl/chr${d}_*.psl | \
         pslSortAcc nohead chrom temp stdin
         rm -f chrom/chr${d}_blatFr1.psl
         mv chrom/chr${d}.psl chrom/chr${d}_blatFr1.psl
     end
 
     # next machine
     ssh hgwdev
     # Load Fugu Blat alignments
     cd /cluster/data/ce2/bed/blatFr1/chrom
     cat *.psl | hgLoadPsl -fastLoad -table=blatFr1 ce2 stdin
 
     # Make fugu /gbdb/ symlink and load Fugu sequence data.
     mkdir /gbdb/ce2/fuguSeq
     ln -s /cluster/data/fr1/fugu_v3.masked.fa /gbdb/ce2/fuguSeq
     cd /cluster/data/ce2/bed/blatFr1
     hgLoadSeq ce2 /gbdb/ce2/fuguSeq/fugu_v3.masked.fa
     # Adding /gbdb/ce2/fuguSeq/fugu_v3.masked.fa
     # 20379 sequences
 
     # worm/ce2 trackDb.ra entry
 #   track blatFr1
 #   shortLabel Fugu Blat
 #   longLabel Takifugu rubripes (fr1) Translated Blat Alignments
 #   group compGeno
 #   priority 110
 #   visibility dense
 #   color 0,60,120
 #   altColor 200,220,255
 #   spectrum on
 #   type psl xeno
 
 # BLASTZ C. Briggsae (Cb1) (DONE, 2004-04-13, hartera)
                                                                                 
     # next machine
     ssh kkr1u00
     # blastz requires lineage-specific repeats but there are none for the worms
     # so create empty files for each chromosome and iSync
     mkdir -p /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
     cd /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
     # create chrI.out.spec and cp to chrN.out.spec for chrII, chrIII, chrIV, chrV, chrX, chrM
     mkdir -p /iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans
     # create empty chrUn.out.spec file
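     # a minimal sketch of creating those placeholder files (assuming empty
     # *.out.spec files are all that is needed, as noted above):
     foreach c (I II III IV V X M)
         touch /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae/chr${c}.out.spec
     end
     touch /iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans/chrUn.out.spec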
     iSync
     
     ssh kk
     mkdir -p /cluster/data/ce2/bed/blastz.cb1.2004-04-12
     cd /cluster/data/ce2/bed
     ln -s  blastz.cb1.2004-04-12 blastz.cb1
     cd blastz.cb1
  
 cat << '_EOF_' > DEF
 # C. elegans vs. C. briggsae
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/bin/i386
 
 ALIGN=blastz-run
 BLASTZ=blastz
 BLASTZ_H=2000
 #BLASTZ_ABRIDGE_REPEATS=1
 #  when SMSK=/dev/null
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET
 SEQ1_DIR=/iscratch/i/worms/Celegans2/nib
 # RMSK not currently used
 SEQ1_RMSK=/iscratch/i/worms/Celegans2/rmsk
 SEQ1_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
 # FLAG not currently used
 SEQ1_FLAG=-worm
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY
 SEQ2_DIR=/iscratch/i/worms/Cbriggsae/bothMasksNib
 # RMSK not currently used
 SEQ2_RMSK=/iscratch/i/worms/Cbriggsae/rmsk
 # FLAG not currently used
 SEQ2_SMSK=/iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans
 SEQ2_FLAG=-worm
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/ce2/bed/blastz.cb1
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 
 #DEBUG=1
 '_EOF_'
     # << this line keeps emacs coloring happy  
     
     # Save the DEF file in the current standard place
     cp DEF ~angie/hummus/DEF.ce2-cb1.2004-04-12
     # Need shell scripts from mm4 to do cluster runs
     mv /cluster/data/mm4/jkStuff/BlastZ*.sh /cluster/data/ce2/jkStuff/
 
     # prepare first cluster run
     ssh kk
     cd /cluster/data/ce2/bed/blastz.cb1
     source DEF
     /cluster/data/ce2/jkStuff/BlastZ_run0.sh
     cd run.0
     para try, check, push, check, ....
 # para time
 # Completed: 56 of 56 jobs
 # CPU time in finished jobs:     155723s    2595.39m    43.26h    1.80d  0.005 y
 # IO & Wait Time:                  6387s     106.45m     1.77h    0.07d  0.000 y
 # Average job time:                2895s      48.25m     0.80h    0.03d
 # Longest job:                     6137s     102.28m     1.70h    0.07d
 # Submission to last job:         11078s     184.63m     3.08h    0.13d
 
     #   Second cluster run to convert the .out's to .lav's
     cd /cluster/data/ce2/bed/blastz.cb1
     source DEF
     /cluster/data/ce2/jkStuff/BlastZ_run1.sh
     cd run.1
     para try, check, push, etc ...
 # para time
 # Completed: 14 of 14 jobs
 # CPU time in finished jobs:        762s      12.70m     0.21h    0.01d  0.000 y
 # IO & Wait Time:                    62s       1.03m     0.02h    0.00d  0.000 y
 # Average job time:                  59s       0.98m     0.02h    0.00d
 # Longest job:                      115s       1.92m     0.03h    0.00d
 # Submission to last job:           293s       4.88m     0.08h    0.00d
 
     #   Third cluster run to convert lav's to axt's
     cd /cluster/data/ce2/bed/blastz.cb1
     source DEF
     /cluster/data/ce2/jkStuff/BlastZ_run2.sh
     cd run.2
     para try
     para check
     # only 7 jobs so all completed by para try
 # para time
 # Completed: 7 of 7 jobs
 # CPU time in finished jobs:        187s       3.11m     0.05h    0.00d  0.000 y
 # IO & Wait Time:                   169s       2.82m     0.05h    0.00d  0.000 y
 # Average job time:                  51s       0.85m     0.01h    0.00d
 # Longest job:                       77s       1.28m     0.02h    0.00d
 # Submission to last job:            77s       1.28m     0.02h    0.00d
 
     # translate sorted axt files into psl
     ssh eieio
     cd /cluster/data/ce2/bed/blastz.cb1
     mkdir -p pslChrom
     set tbl = "blastzCb1"
     foreach f (axtChrom/chr*.axt)
       set c=$f:t:r
       echo "Processing chr $c"
       /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
     end
 
    # Load database tables
     ssh hgwdev
     cd /cluster/data/ce2/bed/blastz.cb1/pslChrom
     foreach d (I II III IV V X M)
         /cluster/bin/i386/hgLoadPsl -noTNameIx ce2 chr${d}_blastzCb1.psl
     end
 
 # CHAIN Cb1 BLASTZ (DONE, 2004-04-30, hartera)
 # CHAINS ARE RE-DONE DUE TO BUG IN axtChain which has now been fixed by Jim
     ssh kk
     mkdir -p /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/run1
     cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/run1
     mkdir out chain
    
     ls -1S /cluster/data/ce2/bed/blastz.cb1/axtChrom/*.axt > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
 
     cat << '_EOF_' > doChain
 #!/bin/csh
     axtFilter $1 | /cluster/home/hartera/bin/i386/axtChain stdin \
         /iscratch/i/worms/Celegans2/nib \
         /iscratch/i/worms/Cbriggsae/bothMasksNib $2 >& $3
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x doChain
     gensub2 input.lst single gsub jobList
     para create jobList
     # only 7 jobs so all done by para try
     para try
     para check
 # para time
 # Completed: 7 of 7 jobs
 # CPU time in finished jobs:       1265s      21.08m     0.35h    0.01d  0.000 y
 # IO & Wait Time:                    69s       1.16m     0.02h    0.00d  0.000 y
 # Average job time:                 191s       3.18m     0.05h    0.00d
 # Longest job:                      632s      10.53m     0.18h    0.01d
 # Submission to last job:           632s      10.53m     0.18h    0.01d
 
     # now on the file server, sort chains
     ssh eieio
     cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
     time chainMergeSort run1/chain/*.chain > all.chain
     # User 37.040u
     # System 3.090s
     # Elapsed Real 0:42.54
 
     time chainSplit chain all.chain
     # User 37.310u
     # System 2.930s
     # Elapsed Real 0:43.85
 
     # Load chains into database
     # next machine
     ssh hgwdev
     cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/chain
     foreach i (*.chain)
         set c = $i:r
         hgLoadChain ce2 ${c}_chainCb1 $i
         echo done $c
     end
 
 # NET Cb1 (DONE, 2004-04-30, hartera)
 # REMAKE NET Cb1 WITH NEW CHAINS
    ssh eieio
    cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
    mkdir preNet
    mv /cluster/data/ce2/tmp/chrom.sizes /cluster/data/ce2/
    # remove header line in file so chainPreNet will work
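    # a minimal sketch of that edit (assuming the header is the "chrom size"
    # line left over from the hgsql dump used to make chrom.sizes above):
    grep -v "^chrom" /cluster/data/ce2/chrom.sizes > /cluster/data/ce2/chrom.sizes.tmp
    mv /cluster/data/ce2/chrom.sizes.tmp /cluster/data/ce2/chrom.sizes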
    cd chain
    foreach i (*.chain)
    echo preNetting $i
    /cluster/bin/i386/chainPreNet $i /cluster/data/ce2/chrom.sizes \
                         /cluster/data/cb1/chrom.sizes ../preNet/$i
     end
 
     cd ..
     mkdir n1
     cd preNet
     foreach i (*.chain)
       set n = $i:r.net
       echo primary netting $i
       /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/ce2/chrom.sizes \
                             /cluster/data/cb1/chrom.sizes ../n1/$n /dev/null
     end
 
     cd ..
     cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
     # memory usage 84246528, utime 710 s/100, stime 70
 
     ssh hgwdev
     cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
     # add classification info using db tables 
     # use -noAr option - don't look for ancient repeats
     time netClass -noAr hNoClass.net ce2 cb1 cb1.net
     # User 11.860u
     # System 2.090s 
     # Elapsed Real 0:16.24
 
     # Load the nets into database
     ssh hgwdev
     cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
     netFilter -minGap=10 cb1.net |  hgLoadNet ce2 netCb1 stdin
 
 # MAKE axtNet TRACK (DONE, 2004-04-30, hartera)
 # TRACK REMADE WITH NEW CHAINS 
     # Move back to the file server to create axt files corresponding to the net
     ssh kksilo
     cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
     # try without netFilter as not working well
     # netFilter -minGap=10 cb1.net > cb1Filt.net
     mkdir /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29
     netSplit cb1.net cb1NetSplit
     cd cb1NetSplit
     foreach i (*.net)
         set c = $i:r
         netToAxt -maxGap=300 $i \
            /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/chain/$c.chain \
            /iscratch/i/worms/Celegans2/nib \
            /iscratch/i/worms/Cbriggsae/bothMasksNib \
            /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/$c.axt
         echo done ../axtNet.2004-04-29/$c.axt
     end
     cd ..
     rm -r cb1NetSplit
     # sort axt files based on target position
     mkdir /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet
     cd /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29
     
     foreach i (*.axt) 
        set c = $i:r
        axtSort $c.axt ./sortedaxtNet/$c.axt
        echo done sorting $c.axt
     end
 
     # Load up the axtNet (alignment score wiggle) track:
     ssh hgwdev
     mkdir /gbdb/ce2/axtNetCb1
     foreach f (/cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet/chr*.axt)
        ln -s $f /gbdb/ce2/axtNetCb1
     end
     cd /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet
     ${HOME}/bin/i386/hgLoadAxt ce2 axtNetCb1
 
 # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR CE2 (DONE, 2004-04-13, hartera)
     # next machine
     ssh hgwdev
     # DNA port is "0", trans prot port is "1"
     # 8/26/04: set canPcr=1 for untranslated blat server.
     echo 'insert into blatServers values("ce2", "blat1", 17781, 0, 1); \
           insert into blatServers values("ce2", "blat1", 17780, 1, 0);' \
       | hgsql -h genome-testdb hgcentraltest
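     # a minimal sketch of the 8/26/04 canPcr change noted above, if applied
     # as an update to an already-loaded row (assumes the blatServers columns
     # are db, host, port, isTrans, canPcr):
     echo 'update blatServers set canPcr = 1 where db = "ce2" and isTrans = 0;' \
       | hgsql -h genome-testdb hgcentraltest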
         # if you need to delete those entries
         echo 'delete from blatServers where db="ce2";' \
         | hgsql -h genome-testdb hgcentraltest
         # to check the entries:
         echo 'select * from blatServers where db="ce2";' \
         | hgsql -h genome-testdb hgcentraltest
 
 # CREATE ORFTOGENE TABLE (Used by hgGene and hgNear) (DONE, 2004-04-27, hartera)
    mkdir /cluster/data/ce2/bed/orfToGene
    cd !$
    # gene_names.txt for WS120 was created and provided by todd.harris@cshl.edu
    # ORF e.g. Y110A7A.10 gene e.g. aap-1
    awk -F '\t' '$2 == "Caenorhabditis elegans" && $3 == "Gene" {printf("%s\t%s\n", $4, $1)}' gene_names.txt > orfToGene.txt
    # reformat this for use in creating Sanger Links with hgWormLinks
    awk 'NF == 2' orfToGene.txt > orfToGene.txt2
    # use perl script to move ORFs with the same gene name onto separate lines
    # and remove lines where there is a gene name with an alternate name
    # in parentheses. this output file is used for Sanger Links
    /cluster/data/ce2/jkStuff/formatorfToGene.pl < orfToGene.txt2 > orfToGene.tab2   
    hgCeOrfToGene ce2 gene_names.txt sangerGene orfToGene
                                                                                 
 # DOWNLOAD WORMBASE FOR WS120 RELEASE (DONE, 2004-04-23, hartera)
 # REMOVED acedb DIRECTORY AS NO LONGER NEEDED (DONE, 2008-08-18, hartera)
     ssh eieio
     mkdir -p /cluster/store7/acedb
     cd /cluster/store7/acedb
     wget -o ce2.fetchws120.log -r -l1 --timestamp --no-directories \
         ftp://ftp.sanger.ac.uk/pub/wormbase/FROZEN_RELEASES/WS120/
     gunzip *.gz
     tar xvf *.tar
 
     # 2008-08-18, hartera. Remove the acedb directory.
     cd /cluster/store7
     rm -rf acedb
 
 # CREATE SANGER LINKS TABLE FROM WORMBASE INFO (used by hgGene and hgNear)
 # (DONE, 2004-05-03, hartera)
    mkdir /cluster/data/ce2/bed/steinHelp
 # Mail Lincoln Stein and ask for mappings from wormPep to
 # SwissProt and from ORF to wormPep to description.  Place
 # these files in the steinHelp directory:
 #       swissprot_dump.txt  - contains ORF/WormPep/Description
 #       and contains SwissProt Ids and Accessions
     # Jorge downloaded and installed the software for using AceDB 
     # to /cluster/bin/acedb/
     # To use it:
     ssh hgwdev
     cd /cluster/store7/acedb
     # can use /cluster/bin/acedb/xace . to get X-Window query interface 
     # To use AceDB Query Language (AQL)
     cd /cluster/store7/acedb
     /cluster/bin/acedb/tace .
     # At prompt, can type ? for help, type the following queries
 #    wbGeneClass.txt - associates 3 letter first part of worm gene
 #             name with a brief description.  
     acedb> AQL -o wbGeneClass.txt select l,l->Description  from l in class Gene_Clas
 #    wbConcise.txt - associates worm gene name with a several
 #             sentence description. 
     acedb> AQL -o wbConcise.txt select l,g from l in class Locus, g in l->Gene_information[Provisional_description]
     # remove "" in wbGeneClass.txt and wbConcise.txt
     sed -e 's/"//g;' wbGeneClass.txt > wbGeneClass.new
     sed -e 's/"//g;' wbConcise.txt > wbConcise.new
     mv wbGeneClass.new wbGeneClass.txt
     mv wbConcise.new wbConcise.txt
     # first remove header lines
     sed -e 's/Format//' wbConcise.txt | sed -e 's/Locus[:]*//' > tmp
     sed -e 's/Text//' tmp > tmp2
     mv tmp2 wbConcise.txt
     rm tmp
     sed -e 's/Format//' wbGeneClass.txt | sed -e 's/Gene_class[:]*//' > tmp
     sed -e 's/Text//' tmp > tmp2
     mv tmp2 wbGeneClass.txt
     rm tmp
     
     # get SwissProt and TrEMBL Ids and accessions from ftp site
     # TrEMBL Ids missing from swissprot_dump.txt
     # Need these for details pages to work for sangerGene track
     wget -o ce2.fetch.log -r -l1 --no-directories --timestamping \
         ftp://ftp.sanger.ac.uk/pub/databases/wormpep/old_wormpep120/wormpep.table120
     
     # parse out CDS to Swiss-Prot IDs mappings
     cat << '_EOF_' > /cluster/data/ce2/jkStuff/getCDSandSPId.pl
 #!/usr/bin/perl -w
 use strict;
                                                                                 
 my $found = "false";
 while (<STDIN>) {
     chomp;
     my @fields = split(/\t/);
     foreach my $f (@fields) {
         # get CDS name
         if ($f =~ /^>([0-9A-Za-z\.]+)/) {
              print "$1\t";
              $found = "false";
         }
         elsif ($f =~ /^(TR:|SP:)([0-9A-Z]{4,})$/) {
              print $2;
              $found = "true";
         }
     }
     if ($found eq "false") {
         print "NULL";
     }
     print "\n";
 }
 '_EOF_'
     chmod a+x /cluster/data/ce2/jkStuff/getCDSandSPId.pl
     perl /cluster/data/ce2/jkStuff/getCDSandSPId.pl < wormpep.table120 > spCdsToId.txt
     # rewrote hgWormLinks to extract information from these files
     hgWormLinks spCdsToId.txt swissprot_dump.txt \
         wbConcise.txt wbGeneClass.txt \
         /cluster/data/ce2/bed/orfToGene/orfToGene.tab2 sangerLinks.txt
     # create table in ce2 database and load data
     hgsql ce2 < ~/kent/src/hg/near/hgWormLinks/sangerLinks.sql
     hgsql -e \
         'load data local infile "sangerLinks.txt" into table sangerLinks' \
         ce2
 
 # CREATE SANGERPEP TABLE (DONE, 2004-04-29, hartera)
 # RECREATED SANGERPEP TABLE BECAUSE all.joiner EXPECTS ITS CREATE TIME TO
 # BE LATER THAN THAT OF THE SANGERGENE TABLE (DONE, 2004-05-21, hartera)
     mkdir -p /cluster/data/ce2/bed/sangerPep
     cd /cluster/data/ce2/bed/sangerPep
     # Download peptide sequences from the Sanger Centre ftp site:
     wget -o ce2.fetch.log -r -l1 --no-directories --timestamping \
         ftp://ftp.sanger.ac.uk/pub/databases/wormpep/old_wormpep120/wormpep120
     # Load into database
     # 2004-05-10 and 2004-05-11 hartera dropped sangerPep table and recreated it
     # all.joiner expects sangerGene table to have an earlier
     # load time than sangerPep so recreate sangerPep after sangerGene
     hgPepPred ce2 generic sangerPep wormpep120
     # the sangerPep table is used by the sangerGene track
 
 # TWINSCAN GENE PREDICTIONS (DONE, 2004-04-16, hartera)
     # These are Iscan (new version of Twinscan) gene predictions
     # e-mailed Michael Brent at WUSTL: brent@cse.wustl.edu to obtain data
     ssh hgwdev
     mkdir /cluster/data/ce2/bed/twinscan
     cd /cluster/data/ce2/bed/twinscan
     wget --timestamping -r \
         http://genes.cs.wustl.edu/predictions/worm/C_elegans/WS120/4-13-2004/
     mv ./genes.cs.wustl.edu/predictions/worm/C_elegans/WS120/4-13-2004/* \
         /cluster/data/ce2/bed/twinscan/
     rm -r genes.cs.wustl.edu
     rm index* ./chr_gtf/index* ./chr_ptx/index* ./chr_tx/index*
 
     # Clean up chrom field of GTF files
     foreach c (I II III IV V X)
         set f = chr_gtf/chr_${c}.iscan_pred_gtf
         echo ${f}
         sed -e "s/CHROMOSOME_/chr/g" ${f} | \
         sed -e "s/_[0-9][0-9]*.seq\t/\t/" > \
                 chr_gtf/chr${c}-fixed.gtf
     end
 
     # pare down protein FASTA header to id and add missing .1:
     foreach c (I II III IV V X)
         set f = chr_ptx/CHROMOSOME_${c}.ptx
         echo ${f}
         perl -wpe 's/^\>CHROMOSOME_(\S+).*$/\>chr$1.1/;' < \
                 ${f} > chr_ptx/chr${c}-fixed.fa
     end
    # load data into database; the -gtf option is needed as there are no
    # stop codons in these predictions. Also use the new -frame, -id and
    # -geneName options
     ldHgGene -gtf -frame -id -geneName ce2 twinscan chr_gtf/chr*-fixed.gtf
     
     # Read 20990 transcripts in 147535 lines in 6 files
     # 20990 groups 6 seqs 1 sources 3 feature types
     # 20990 gene predictions
     
     # 2004-05-10, hartera dropped twinscanPep table and re-created it.
     # twinscan table was reloaded using extended options after loading 
     # twinscanPep. all.joiner expects twinscan table to have an earlier
     # load time than twinscanPep
     hgPepPred ce2 generic twinscanPep chr_ptx/chr*-fixed.fa
   
 #  MAKING BLASTZ SELF (DONE, 2004-04-20, hartera)
    # The procedure for the lineage-specific repeats with self is simply to
    # use the actual RepeatMasker output for this C. elegans assembly as
    # the lineage-specific repeats for itself.  Thus, merely make
     # symlinks to the repeat masker out files and name them as expected
     # for blastz.  In this case they are called notinCelegans but they
     # really mean inCelegans.  Yes, it is confusing, but that's just the
     # nature of the game in this case. 
 
     # next machine
     ssh kkr1u00
     mkdir -p /iscratch/i/worms/Celegans2/linSpecRep.notinCelegans
     cd /iscratch/i/worms/Celegans2/linSpecRep.notinCelegans
     foreach f (/iscratch/i/worms/Celegans2/bothMasksFa/*.fa)
         set base = $f:t:r:r
         echo $base.out.spec
         ln -s $f $base.out.spec
     end
     iSync
 
     # next machine
     ssh hgwdev
     mkdir -p /cluster/data/ce2/bed/blastzSelf
     cd /cluster/data/ce2/bed/blastzSelf
 
     cat << '_EOF_' > DEF
 # C. elegans vs. C. elegans
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
                                                                                 
 ALIGN=blastz-run
 BLASTZ=blastz
 BLASTZ_H=2000
 BLASTZ_ABRIDGE_REPEATS=1
                                                                                 
 # TARGET
 # C. elegans
 SEQ1_DIR=/iscratch/i/worms/Celegans2/nib
 # not used
 SEQ1_RMSK=
 # not used
 SEQ1_FLAG=
 SEQ1_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCelegans
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
                                                                                 
 # QUERY
 # C. elegans
 SEQ2_DIR=/iscratch/i/worms/Celegans2/nib
 # not currently used
 SEQ2_RMSK=
 # not currently used
 SEQ2_FLAG=
 SEQ2_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCelegans
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 
 BASE=/cluster/data/ce2/bed/blastzSelf
                                                                                 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 '_EOF_'
     # << this line makes emacs coloring happy
                                                                                 
 # Save the DEF file in the current standard place
     cp DEF ~angie/hummus/DEF.ce2-ce2.2004-04-14
     ssh kk
     cd /cluster/data/ce2/bed/blastzSelf
                                                                                 
     # source the DEF file to establish environment for following commands
     source DEF
     /cluster/data/mm4/jkStuff/BlastZ_run0.sh
     cd run.0
     para try, check, push, check, ....
 # para time
 # Completed: 196 of 196 jobs
 # CPU time in finished jobs:     418569s    6976.16m   116.27h    4.84d  0.013 y
 # IO & Wait Time:                  2416s      40.26m     0.67h    0.03d  0.000 y
 # Average job time:                2148s      35.80m     0.60h    0.02d
 # Longest job:                    17917s     298.62m     4.98h    0.21d
 # Submission to last job:         53181s     886.35m    14.77h    0.62d
 
     cd /cluster/data/ce2/bed/blastzSelf
     source DEF
     /cluster/data/ce2/jkStuff/BlastZ_run1.sh
     cd run.1
     para try, check, push, etc ...
 
 # para time
 # Completed: 14 of 14 jobs
 # CPU time in finished jobs:       3340s      55.67m     0.93h    0.04d  0.000 y
 # IO & Wait Time:                    98s       1.63m     0.03h    0.00d  0.000 y
 # Average job time:                 246s       4.09m     0.07h    0.00d
 # Longest job:                      439s       7.32m     0.12h    0.01d
 # Submission to last job:           892s      14.87m     0.25h    0.01d
 
     # Third cluster run to convert lav's to axt's
     # lavToAxt has a new -dropSelf option that replaces axtDropOverlap
     # to remove the alignment blocks on the diagonal
     # When this was first run, it crashed for chrV with the 
     # error: breakUpIfOnDiagonal: Too many fragmented block lists!
    # This means that when a blockList is broken up everywhere there is a
    # block along the diagonal and the pieces are added to an array of
    # fragmented sub-lists, for this chromosome the list exceeded the
    # array size of 256 for an alignment. Change the line:
    # struct block *bArr[256] to struct block *bArr[1024] in the
    # parseIntoAxt function, then make and re-run.
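    # A minimal sketch of that one-line source edit, assuming the kent tree
    # is checked out at ~/kent and that lavToAxt lives in src/hg/mouseStuff
    # (the exact path is an assumption):
    #   cd ~/kent/src/hg/mouseStuff/lavToAxt
    #   sed -i.bak 's/struct block \*bArr\[256\]/struct block *bArr[1024]/' lavToAxt.c
    #   make    # typically installs the rebuilt binary into ~/bin/$MACHTYPE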
   
     cd /cluster/data/ce2/bed/blastzSelf
     mkdir axtChrom
     mkdir run.2
     cd run.2
     cat << '_EOF_' > do.csh
 #!/bin/csh
 cd $1
 cat `ls -1 *.lav | sort -g` \
 | ~/bin/i386/lavToAxt -dropSelf stdin /iscratch/i/worms/Celegans2/nib \
     /iscratch/i/worms/Celegans2/nib stdout \
 | axtSort stdin $2
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x do.csh
     cat << '_EOF_' > gsub
 #LOOP
 ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/ce2/bed/blastzSelf/axtChrom/$(root1).axt}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     ls -1Sd ../lav/chr* > chrom.list
     gensub2 chrom.list single gsub jobList
     # check jobList
     wc -l jobList
     head jobList
     para create jobList
     para try, check ....
 
     # only 7 jobs so all completed by para try
     para try, para check ...etc.
 # para time
 # Completed: 7 of 7 jobs
 # CPU time in finished jobs:        796s      13.27m     0.22h    0.01d  0.000 y
 # IO & Wait Time:                   551s       9.18m     0.15h    0.01d  0.000 y
 # Average job time:                 192s       3.21m     0.05h    0.00d
 # Longest job:                      302s       5.03m     0.08h    0.00d
 # Submission to last job:           303s       5.05m     0.08h    0.00d
 
     # Use Blastz alignments to create Self Chain. Do not create 
     # Self Blastz track since this is redundant with the Self Chain.
 
 # CHAIN SELF BLASTZ (DONE, 2004-05-03, hartera)
 # CHAINS ARE RE-DONE DUE TO BUG IN axtChain which has now been fixed by Jim
 # The axtChain is best run on the small kluster, or the kki kluster
 #       in this case, it was run on the kk kluster
     ssh kk
     mkdir -p /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30/run1
     cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30/run1
     mkdir out chain
 
     ls -1S /cluster/data/ce2/bed/blastzSelf/axtChrom/*.axt \
       > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
# Problems running this. At first, all jobs crashed and para problems said
# that the ./out/*.out files were empty. The nib files were then also copied
# to a nib2 directory and doChain was changed so the two nib arguments were
# not the same. The jobs then crashed saying that the ./chain/*.chain files
# were empty and that the files in nib2 could not be opened (permissions ok).
# Changed doChain back to having the two nib directory arguments the same,
# and removed and recreated jobList but left ./chain/*.chain and ./out/*.out
# in place. This time the jobs ran ok. It also runs ok if axtFilter followed
# by axtChain is run separately on the command line.
     cat << '_EOF_' > doChain
 #!/bin/csh
 axtFilter $1 | /cluster/home/hartera/bin/i386/axtChain stdin \
     /iscratch/i/worms/Celegans2/nib /iscratch/i/worms/Celegans2/nib $2 >& $3
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x doChain
     gensub2 input.lst single gsub jobList
     para create jobList
     # 7 jobs so all completed by para try
     para try, check,....etc.
 # after remaking the axt files using the -dropSelf option for lavToAxt,
 # these jobs no longer crashed.
 # para time
 # Completed: 7 of 7 jobs
 # CPU time in finished jobs:       2700s      45.00m     0.75h    0.03d  0.000 y
 # IO & Wait Time:                   620s      10.34m     0.17h    0.01d  0.000 y
 # Average job time:                 474s       7.90m     0.13h    0.01d
 # Longest job:                      844s      14.07m     0.23h    0.01d
 # Submission to last job:           844s      14.07m     0.23h    0.01d
     # now on the file server, sort chains
     ssh eieio
     cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30
     chainMergeSort run1/chain/*.chain > all.chain
     chainSplit chain all.chain
 
     # take a look at score distr's
     foreach f (chain/*.chain)
       grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r
       textHistogram -binSize=10000 /tmp/score.$f:t:r
       echo ""
     end
     # for chrI
     # chrI
 # large values truncated: need 1414 bins or larger binSize than 10000
 #   0 ************************************************************ 710660
 # 10000 ******** 90988
 # 20000 ** 25092
 # 30000 * 10405
 # 40000  5912
 # 50000  2958
 # 60000  2157
 # 70000  1780
 # 80000  1336
 # 90000  900
 # 100000  636
 # 110000  579
 # 120000  451
 # 130000  403
 # 140000  351
 # 150000  337
 # 160000  287
 # 170000  210
 # 180000  88
 # 190000  129
 # 200000  147
 # 210000  80
 # 220000  60
 # 230000  73
 # 240000  42
     # trim to minScore=20000 to cut some of the fluff
     mkdir chainFilt
     foreach f (chain/*.chain)
       chainFilter -minScore=20000 $f > chainFilt/$f:t
     end
  
     # Load chains into database 
     # next machine
     ssh hgwdev
     cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30/chainFilt
     foreach i (*.chain)
         set c = $i:r
         echo loading $c
         hgLoadChain ce2 ${c}_chainSelf $i
         echo done $c
     end
 
 # MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2004-04-21, hartera)
     ssh kksilo
     cd /cluster/data/ce2
     #- Build the .zip files
     cat << '_EOF_' > jkStuff/zipAll.csh
 rm -rf zip
 mkdir zip
 # chrom AGP's
 zip -j zip/chromAgp.zip ./sangerFa/[0-9A-Z]*/chr*.agp
 # chrom RepeatMasker out files
 zip -j zip/chromOut.zip ./RMRun/chr*.fa.out
 # soft masked chrom fasta
 zip -j zip/chromFa.zip ./softMask/chr*.fa
 # hard masked chrom fasta
 zip -j zip/chromFaMasked.zip hardMask/chr*.fa
 # chrom TRF output files
 cd bed/simpleRepeat
 zip ../../zip/chromTrf.zip trfMask/chr*.bed
 cd ../..
 # get GenBank native mRNAs
 cd /cluster/data/genbank
 ./bin/i386/gbGetSeqs -db=ce2 -native GenBank mrna \
         /cluster/data/ce2/zip/mrna.fa
 # get GenBank xeno mRNAs
 ./bin/i386/gbGetSeqs -db=ce2 -xeno GenBank mrna \
         /cluster/data/ce2/zip/xenoMrna.fa
 # get native RefSeq mRNAs
 ./bin/i386/gbGetSeqs -db=ce2 -native refseq mrna \
 /cluster/data/ce2/zip/refMrna.fa
 # get native GenBank ESTs
 ./bin/i386/gbGetSeqs -db=ce2 -native GenBank est \
 /cluster/data/ce2/zip/est.fa
 
 cd /cluster/data/ce2/zip
 # zip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs
 zip -j mrna.zip mrna.fa
 zip -j xenoMrna.zip xenoMrna.fa
 zip -j refMrna.zip refMrna.fa
 zip -j est.zip est.fa
 '_EOF_'
     # << this line makes emacs coloring happy
     csh ./jkStuff/zipAll.csh |& tee zipAll.log
     cd zip
     #- Look at zipAll.log to make sure all file lists look reasonable.
 
     # Make upstream files and Copy the .zip files to 
     # hgwdev:/usr/local/apache/...
     ssh hgwdev
     cd /cluster/data/ce2/zip
     # make upstream files for C. elegans RefSeq
     featureBits ce2 refGene:upstream:1000 -fa=upstream1000.fa
     zip upstream1000.zip upstream1000.fa
     featureBits ce2 refGene:upstream:2000 -fa=upstream2000.fa
     zip upstream2000.zip upstream2000.fa
     featureBits ce2 refGene:upstream:5000 -fa=upstream5000.fa
     zip upstream5000.zip upstream5000.fa
 
     #- Check zip file integrity:
     foreach f (*.zip)
       unzip -t $f > $f.test
       tail -1 $f.test
     end
     wc -l *.zip.test
     
     set gp = /usr/local/apache/htdocs/goldenPath/ce2
     mkdir -p $gp/bigZips
     cp -p *.zip $gp/bigZips
     mkdir -p $gp/chromosomes
     foreach f ( ../sangerFa/chr*.fa )
       zip -j $gp/chromosomes/$f:t.zip $f
     end
     cd $gp/bigZips
     md5sum *.zip > md5sum.txt
     cd $gp/chromosomes
     md5sum *.zip > md5sum.txt
     # Take a look at bigZips/* and chromosomes/*, update their README.txt's
 
 # MAKE VSCB1 DOWNLOADS (DONE, 2004-05-11, hartera)
     # DO THIS LATER
     ssh kksilo
     cd /cluster/data/ce2/bed/blastz.cb1
     zip -j /cluster/data/ce2/zip/Cb1axtChrom.zip axtChrom/chr*.axt
     cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
     cp all.chain cb1.chain
     zip -j /cluster/data/ce2/zip/cb1.chain.zip cb1.chain
     rm cb1.chain
     zip -j /cluster/data/ce2/zip/cb1.net.zip cb1.net
     # add axtNets
     cd /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet
     foreach f (I II III IV V X M)
        cp chr${f}.axt chr${f}.axtNet.axt
     end
     zip -j /cluster/data/ce2/zip/axtNetCb1.zip chr*.axtNet.axt
     rm *.axtNet.axt
 
     ssh hgwdev
     mkdir -p /usr/local/apache/htdocs/goldenPath/ce2/vsCb1
     cd /usr/local/apache/htdocs/goldenPath/ce2/vsCb1
     mv /cluster/data/ce2/zip/Cb1axtChrom.zip axtChrom.zip
     mv /cluster/data/ce2/zip/cb1*.zip .
     mv /cluster/data/ce2/zip/axtNetCb1.zip .
     md5sum *.zip > md5sum.txt
 
     # Copy over & edit README.txt w/pointers to chain, net formats.
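    # A minimal sketch of that step, assuming the ce1 vsCb1 README can serve
    # as a template (the source path is an assumption):
    #   cp -p /usr/local/apache/htdocs/goldenPath/ce1/vsCb1/README.txt .
    #   then hand-edit it to refer to ce2/WS120 and the chain and net formats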
 
 # MAKE VSSELF DOWNLOADS (DONE, 2004-05-11, hartera)
     ssh kksilo
    cd /cluster/data/ce2/bed/blastzSelf
     zip -j /cluster/data/ce2/zip/SelfaxtChrom.zip axtChrom/chr*.axt
     cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30
     cp all.chain self.chain
     zip -j /cluster/data/ce2/zip/self.chain.zip self.chain
     rm self.chain
    
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/ce2/vsSelf
     cd /usr/local/apache/htdocs/goldenPath/ce2/vsSelf
     mv /cluster/data/ce2/zip/SelfaxtChrom.zip axtChrom.zip
     mv /cluster/data/ce2/zip/self.chain.zip .
     md5sum *.zip > md5sum.txt
     # Copy over & edit README.txt w/pointers to chain formats. 
 
 #  miRNA track (DONE - 2004-05-04 - Hiram)
     #	data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
     #	and Michel.Weber@ibcg.biotoul.fr
     #	notify them if this assembly updates to renew this track
     ssh hgwdev
     mkdir /cluster/data/ce2/bed/miRNA
     cd /cluster/data/ce2/bed/miRNA
     wget --timestamping \
     "ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/cel_ws120.*"
     grep -v "^track " cel_ws120.bed | sed -e "s/ /\t/g" > ce2.bed
     hgLoadBed ce2 miRNA ce2.bed
     # entry in trackDb/trackDb.ra already there
     #	Compare with Ce1:
     #	featureBits ce1 miRNA
     #	10329 bases of 100277784 (0.010%) in intersection
     #	featureBits ce2 miRNA
     #	11382 bases of 100291761 (0.011%) in intersection
 
 # MAKE ALT. SPLICING TRACK (DONE, 2004-05-13, hartera)
     # create rnaCluster table and Gene Bounds track
     ssh hgwdev
     mkdir /cluster/data/ce2/bed/rnaCluster
     cd /cluster/data/ce2/bed/rnaCluster
     foreach f (I II III IV V X)
         echo "clusterRna ce2 /dev/null chr${f}.bed -chrom=${f}"
         clusterRna ce2 /dev/null chr${f}.bed -chrom=chr${f}
     end
     hgLoadBed ce2 rnaCluster *.bed
 
     hgsql -A -e "select chrom from chromInfo" ce2 | egrep -v chrom > chrom.tab
     (perl -e 'print "#\!/bin/sh\n"; while(<>) {chomp($_); printf "hgsql -A -e =select * from rnaCluster where chrom like \"$_\" order by chromStart, name= ce2 | egrep -v chromStart | cut -f2\-13 > $_.ce2.rnaCluster.bed\n";}' | sed -e "s/=/'/g") < chrom.tab > rnaCluster.sh
 
     chmod a+x rnaCluster.sh
     rnaCluster.sh
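    # For reference, each generated line of rnaCluster.sh should look roughly
    # like this (sketch for chrI; the "=" characters printed by the perl
    # one-liner above become single quotes):
    # hgsql -A -e 'select * from rnaCluster where chrom like "chrI" order by chromStart, name' ce2 | egrep -v chromStart | cut -f2-13 > chrI.ce2.rnaCluster.bed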
     mkdir -p /cluster/data/ce2/bed/altSplice/agx
     cd /cluster/data/ce2/bed/altSplice
 
     hgsql -A -e "select chrom from chromInfo" ce2 | egrep -v chrom > chrom.tab
     perl -e 'while(<>) {chomp($_); print "~/bin/i386/altSplice -db=ce2 -beds=../rnaCluster/$_.ce2.rnaCluster.bed -agxOut=agx/$_.ce2.rnaCluster.agx -consensus -weightMrna\n";}' < chrom.tab > altSplice.para.spec
     # these jobs are no big deal on the worm, just run this file:
     chmod +x altSplice.para.spec
     mkdir agx
     ./altSplice.para.spec
 # 0 genePredictions with 2610 clusters, 0 cassette exons, 0 of are not mod 3.
     cat agx/*.agx > altSplice.agx
    # At this point, built agxToBed and used a local copy:
     ~/bin/i386/agxToBed altSplice.agx altSplice.bed
 
     cat << '_EOF_' >> altGraphX.sql
 CREATE TABLE altGraphX (
   bin smallint unsigned not null,
   tName varchar(255) NOT NULL default '',
   tStart int(11) NOT NULL default '0',
   tEnd int(11) NOT NULL default '0',
   name varchar(255) NOT NULL default '',
   id int(10) unsigned NOT NULL auto_increment,
   strand char(2) NOT NULL default '',
   vertexCount int(10) unsigned NOT NULL default '0',
   vTypes longblob NOT NULL,
   vPositions longblob NOT NULL,
   edgeCount int(10) unsigned NOT NULL default '0',
   edgeStarts longblob NOT NULL,
   edgeEnds longblob NOT NULL,
   evidence longblob NOT NULL,
   edgeTypes longblob NOT NULL,
   mrnaRefCount int(11) NOT NULL default '0',
   mrnaRefs longblob NOT NULL,
   mrnaTissues longblob NOT NULL,
   mrnaLibs longblob NOT NULL,
   PRIMARY KEY  (id),
   KEY tName (tName(16),tStart,tEnd)
 );
 '_EOF_'
 
     hgLoadBed -sqlTable=altGraphX.sql ce2 altGraphX altSplice.agx
 
     # trackDb entry:
         track altGraphX
         shortLabel Alt-Splice
         longLabel Alternative Splicing
         group rna
         priority 74.1
         visibility pack
         type bed 3
         canPack off
 
 ##################################################################
 # Create frames based page views of alt splice areas of interest
 #	(DONE - 2004-05-16 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/ce2/bed/altSplice/altAnalysis
     cd /cluster/data/ce2/bed/altSplice/altAnalysis
     altAnalysis ce2 ../altSplice.agx altSummary.txt links.html \
 	frame.html -bedName=all
    # creates these files (as well as several .bed files):
     #	-rw-rw-r--    1      282 May 16 09:19 frame.html
     #	-rw-rw-r--    1   391029 May 16 09:19 links.html
     #	Copy these to the Intronerator WS120 location, change
     #	default opening location:
 
     cp -p links.html /usr/local/apache/htdocs/IntronWS120
     sed -e \
     "s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \
 	-e "s/NM_005487/chrII:14689493-14690232/" \
 	-e "s/hgwdev-sugnet/genome-test/" \
 	 frame.html > /usr/local/apache/htdocs/IntronWS120/frame.html
 
     #	Better location is in the goldenPath itself 2006-01-12:
     mkdir /usr/local/apache/htdocs/goldenPath/ce2/altGraphX
     sed -e "s#http://hgwdev-sugnet.cse.ucsc.edu##" links.html \
 	> /usr/local/apache/htdocs/goldenPath/ce2/altGraphX/links.html
 
     sed -e \
     "s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \
 	-e "s/NM_005487/chrII:14689493-14690232/" \
 	-e "s#http://hgwdev-sugnet.cse.ucsc.edu##" \
 	 frame.html \
 	    > /usr/local/apache/htdocs/goldenPath/ce2/altGraphX/frame.html
 
 # MAKE single coverage BEST track from blastz axt's (DONE - 2004-05-11 - Hiram)
 #	This data is for use with the phyloHMM construction, below.
     ssh eieio
     cd /cluster/data/ce2
     cat << '_EOF_' > jkStuff/mkBest.sh
 #!/bin/sh
 
 mkdir axtBest
 mkdir pslBest
 
 for c in I II III IV V X M
 do
     echo "axtBesting chr$c"
     /cluster/bin/i386/axtBest axtChrom/chr${c}.axt chr${c} \
         axtBest/chr${c}.axt -minScore=300
     echo "processing chr${c}.axt -> chr${c}_blastzBestCb1.psl"
     /cluster/bin/i386/axtToPsl axtBest/chr${c}.axt \
         S1.len S2.len pslBest/chr${c}_blastzBestCb1.psl
    echo "Done: chr${c}_blastzBestCb1.psl"
 done
 '_EOF_'
     chmod +x jkStuff/mkBest.sh
     cd /cluster/data/ce2/bed/blastz.cb1
     time ../../jkStuff/mkBest.sh
     #	real    1m47.069s
     #	user    1m1.880s
     #	sys     0m14.240s
 
     #	Load this table
     ssh hgwdev
     cd /cluster/data/ce2/bed/blastz.cb1/pslBest
     time /cluster/bin/i386/hgLoadPsl ce2 -table=blastzBestCb1 \
 	chr*_blastzBestCb1.psl
     #	real    0m21.775s
     #	user    0m8.510s
     #	sys     0m1.500s
 
 # MAKE phyloHMM data
 # (redone using new phastCons, acs 9/14/04)
 # (redone using nets rather than axtBest, acs 10/10/04)
     ssh hgwdev
     mkdir /cluster/data/ce2/bed/pHMM
     cd /cluster/data/ce2/bed/pHMM
     mkdir axt
     cd axt
     ln -s /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet/*.axt .
     cd ..
     mkdir maf
    # create maf files from the axtNet axt files
 for c in I II III IV M V X
 do
 axtToMaf axt/chr${c}.axt tSizes qSizes maf/chr${c}.Tmaf
 sed -e "s/chrUn/cb1.chrUn/" maf/chr${c}.Tmaf | sed -e "s/ chr/ ce2.chr/" > \
         maf/chr${c}.maf
 rm -f maf/chr${c}.Tmaf
 echo "done chr${c}"
 done
 
     # split up alignments; no need to use cluster with worm
 mkdir -p /cluster/bluearc/ce2/phastCons/WINDOWS
 mkdir -p /scratch/phastCons.ce2.09-14-04.WINDOWS
 for i in I II III IV V X M; do \
     msa_split maf/chr${i}.maf -i MAF -M /cluster/data/ce2/sangerFa/chr${i}.fa \
 	-w 1000000,0 -r /scratch/phastCons.ce2.09-14-04.WINDOWS/chr${i} -o SS \
 	-I 1000 -B 5000 ;\
 done
 for file in /scratch/phastCons.ce2.09-14-04.WINDOWS/* ; do \
     echo $file ;\
     gzip -c $file > /cluster/bluearc/ce2/phastCons/WINDOWS/`basename $file`.gz ;\
 done
 rm -rf /scratch/phastCons.ce2.09-14-04.WINDOWS
 
     # produce rough starting model
 msa_view -i MAF --aggregate ce2,cb1 -o SS -z maf/chr*.maf > all.ss
 phyloFit -i SS all.ss -o starting-tree
     # also get average GC content of aligned regions
 msa_view -i SS all.ss --summary-only
 #   descrip.                      A          C          G          T        G+C length   all_gaps  some_gaps
 #   all.ss                   0.3130     0.1872     0.1869     0.3129     0.3741   58909311          0   11236473
 
     # now set up cluster job to estimate model parameters.  See
     # makeHg17.doc for add'l details
 
 cat << 'EOF' > doEstimate.sh
 #!/bin/sh
 zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.3741 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 25 --target-coverage 0.45 --quiet --log $2 --estimate-trees $3
 EOF
 chmod u+x doEstimate.sh
    # (the --target-coverage of 0.45 is based on a desired overall coverage of
    # 0.2 and about 45% alignment coverage: 0.2/0.45 is approx. 0.44)
 
 rm -fr LOG TREES
 mkdir -p LOG TREES
 rm -f jobs.lst
 for f in /cluster/bluearc/ce2/phastCons/WINDOWS/*.ss.gz ; do
     root=`basename $f .ss.gz` ;\
     echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;\
 done
 
     # run cluster job
 ssh kk... para create ...
     # (takes about 2 minutes)
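    # A minimal sketch of that cluster run, following the parasol idiom used
    # elsewhere in this doc (working directory assumed to be
    # /cluster/data/ce2/bed/pHMM, where jobs.lst was written):
    #   ssh kk
    #   cd /cluster/data/ce2/bed/pHMM
    #   para create jobs.lst
    #   para try, check, push, check, ...
    # (the second cluster run below, jobs2.lst, follows the same pattern)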
 
     # Now combine parameter estimates.  
 ls TREES/*.cons.mod > cons.txt
 phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
 ls TREES/*.noncons.mod > noncons.txt
 phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
 
    # Now set up cluster job for computing conservation scores and predicted elements
 cat << 'EOF' > doPhastCons.sh
 #!/bin/sh
 
 mkdir -p /cluster/bluearc/ce2/phastCons/POSTPROBS /cluster/bluearc/ce2/phastCons/ELEMENTS
 pref=`basename $1 .ss.gz`
 chr=`echo $pref | awk -F\. '{print $1}'`
 tmpfile=/scratch/phastCons.$$
 zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 25 --target-coverage 0.45 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/ce2/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
 gzip -c $tmpfile > /cluster/bluearc/ce2/phastCons/POSTPROBS/$pref.pp.gz
 rm $tmpfile
 EOF
 chmod u+x doPhastCons.sh
 
 rm -fr /cluster/bluearc/ce2/phastCons/POSTPROBS /cluster/bluearc/ce2/phastCons/ELEMENTS
 rm -f jobs2.lst
 for f in /cluster/bluearc/ce2/phastCons/WINDOWS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs2.lst ; done
 
     # run cluster job
 ssh kk... para create, ...
     # (takes about 30 sec)
 
     # set up phastConsElements track
 awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' /cluster/bluearc/ce2/phastCons/ELEMENTS/*.bed > all.raw.bed 
 /cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
 hgLoadBed ce2 phastConsElements all.bed
 featureBits ce2 phastConsElements
 #   20217282 bases of 100291761 (20.158%) in intersection
 
     # set up wiggle
 mkdir wib
 for i in I II III IV V X M; do \
     zcat `ls /cluster/bluearc/ce2/phastCons/POSTPROBS/chr${i}.*.pp.gz | sort -t\. -k2,2n` | \
 	wigAsciiToBinary -chrom=chr${i} -wibFile=wib/chr${i}_phastCons stdin ;\
 done
 hgLoadWiggle ce2 phastCons -pathPrefix=/gbdb/ce2/phastCons/wib wib/chr*_phastCons.wig
 mkdir -p /gbdb/ce2/phastCons/wib
 rm -f /gbdb/ce2/phastCons/wib/chr*phastCons.wib
 ln -s /cluster/data/ce2/bed/pHMM/wib/*.wib /gbdb/ce2/phastCons/wib
 chmod 775 . wib /gbdb/ce2/phastCons /gbdb/ce2/phastCons/wib
 chmod 664 wib/*.wib
 
     # move postprobs over and clean up bluearc 
 rsync -av /cluster/bluearc/ce2/phastCons/POSTPROBS .
     # (people sometimes want the raw scores)
 rm -r /cluster/bluearc/ce2/phastCons/ELEMENTS /cluster/bluearc/ce2/phastCons/POSTPROBS 
 
     # set up full alignment/conservation track
     # note that in this case the pairwise mafs are used both for the
     # "multiple" alignment and the pairwise alignments
 mkdir -p /gbdb/ce2/c_briggsae_pwMaf
 cd maf
 ln -s `pwd`/*.maf /gbdb/ce2/c_briggsae_pwMaf
 hgLoadMaf ce2 -warn c_briggsae_pwMaf -pathPrefix=/gbdb/ce2/c_briggsae_pwMaf
 chmod 775 /gbdb/ce2/c_briggsae_pwMaf .
 chmod 664 *.maf
 
     # trackDb entry:
 #   track c_briggsae_pwMaf
 #   shortLabel Conservation
 #   longLabel Blastz Alignments with C. Briggsae & Conservation 
 #   group compGeno
 #   priority 100
 #   visibility pack
 #   type wigMaf 0.0 1.0
 #   maxHeightPixels 100:40:11
 #   wiggle phastCons
 #   yLineOnOff Off
-#   autoScaleDefault Off
+#   autoScale Off
 #   pairwise pwMaf
 #   speciesOrder c_briggsae
 
 
 ######## MAKING GENE SORTER TABLES #### (DONE - 2004-05-27 - hartera)
# These are instructions for building the tables for the gene neighborhood
# browser (Gene Sorter).  Don't start these until there is a sangerGene
# table with a proteinID column containing Swiss-Prot protein IDs; the Kim
# lab expression data is also required (in hgFixed).
 
     # Cluster together various alt-splicing isoforms.
     # Creates the sangerIsoforms and sangerCanonical tables
     # (DONE, 2004-05-21, hartera)
     ssh hgwdev
     hgClusterGenes ce2 sangerGene sangerIsoforms sangerCanonical
    # You may need to build this binary in src/hg/near/hgClusterGenes (see sketch below)
     # Got 20713 clusters, from 23076 genes in 7 chromosomes
     # featureBits ce2 sangerCanonical
     # 54156601 bases of 100291761 (53.999%) in intersection
     # featureBits ce1 sangerCanonical 
     # 53654286 bases of 100277784 (53.506%) in intersection
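    # As noted above, hgClusterGenes may need to be built first; a minimal
    # sketch, assuming the kent source tree is checked out at ~/kent:
    #   cd ~/kent/src/hg/near/hgClusterGenes
    #   make    # kent makefiles typically install into ~/bin/$MACHTYPE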
 
     # Create Self blastp table - sangerBlastTab (DONE, 2004-05-24, hartera)
     # Reload blastp data into sangerBlastTab and drop knownBlastTab 
     # - table given the wrong name (DONE, 2004-05-28, hartera)
     # Get sangerPep peptide sequences fasta file from sangerPep dir
     # and create a blast database out of them.
     mkdir -p /cluster/data/ce2/bed/blastp
     cd /cluster/data/ce2/bed/blastp
    mv /cluster/data/ce2/bed/sangerPep/wormpep120 wormPep120.faa
     
     formatdb -i wormPep120.faa -t wormPep120 -n wormPep120
     # This command is in /projects/compbio/bin/$MACH/formatdb
 
     # Copy database over to iscratch
     ssh kkr1u00
    mkdir -p /iscratch/i/ce2/blastp
     cp /cluster/data/ce2/bed/blastp/wormPep120.* /iscratch/i/ce2/blastp
     iSync
     # Blastall and Data directory are already in /iscratch/i/blast/
    # From the dates, this looks to be the same version as in
     # /projects/compbio/bin/i686/
     # Split up fasta file into bite sized chunks for cluster
     cd /cluster/data/ce2/bed/blastp
     mkdir split
     faSplit sequence wormPep120.faa 8000 split/wp
 
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/ce2/bed/blastp/self
     cd /cluster/data/ce2/bed/blastp/self
     mkdir run
     cd run
     mkdir out
 
 # Make blast script
     cat  << '_EOF_' > blastSome
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep120 \
 -i $1 -o $2 -e 0.01 -m 8 -b 1000
 '_EOF_'
      chmod a+x blastSome
      # Make gensub2 file
      cat  << '_EOF_' > gsub
 #LOOP
 blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
 #ENDLOOP
 '_EOF_'
      # Create parasol batch
      # 'ls ../../split/*.fa' is too long an argument list, hence the echo
 
      echo ../../split/*.fa | wordLine stdin > split.lst
      gensub2 split.lst single gsub jobList
      para create jobList
      para try
 # Wait a minute, and do a para check,  if all is good
 # then do a
      para push, para check etc.
 # para time
 # Completed: 6684 of 6684 jobs
 # CPU time in finished jobs:      35657s     594.28m     9.90h    0.41d  0.001 y
 # IO & Wait Time:                 27262s     454.37m     7.57h    0.32d  0.001 y
 # Average job time:                   9s       0.16m     0.00h    0.00d
 # Longest job:                      193s       3.22m     0.05h    0.00d
 # Submission to last job:           302s       5.03m     0.08h    0.00d
 
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/ce2/bed/blastp/self/run/out
     hgLoadBlastTab ce2 sangerBlastTab *.tab
 
 # CREATE EXPRESSION DISTANCE TABLES FOR  
 # KIM LAB EXPRESSION DATA (DONE, 2004-05-24, hartera) 
 hgExpDistance ce2 hgFixed.kimWormLifeMedianRatio \
               hgFixed.kimWormLifeMedianExps kimExpDistance
 
 # CREATE TABLE TO MAP BETWEEN SANGERGENES AND REFSEQ GENES
 # sangerToRefSeq (DONE, 2004-05-24, hartera)
 hgMapToGene ce2 refGene sangerGene sangerToRefSeq
 
 # CREATE TABLE TO MAP BETWEEN SANGER GENES AND PFAM DOMAINS
 # sangerToPfam (DONE, 2004-05-24, hartera)
 # Drop table knownToPfam - wrong table name.
 # Reload data into sangerToPfam (DONE, 2004-05-28, hartera)
 hgMapViaSwissProt ce2 sangerGene name proteinID Pfam sangerToPfam
 
 # CREATE MAPPING OF SANGER GENES TO KIM LAB EXPRESSION DATA GENES 
 # sangerToKim (DONE, 2004-05-27, hartera)
# This is actually a mapping of the sangerGene table to itself -
# this is how this table was created for ce1.
# This means that many gene names (4614/19134) in kimWormLifeAllRatio
# do not appear in sangerGene, but random checks of some of these found
# they are not in WS120 WormBase and probably represent alternate names,
# names that no longer exist, or withdrawn sequences.
 hgMapToGene ce2 sangerGene sangerGene sangerToKim
 
 # SET dbDb TABLE ENTRY FOR GENE SORTER (DONE, 2004-05-25, hartera)
 # set hgNearOk to 1 on hgcentraltest to make Ce2 Gene Sorter viewable
 
 hgsql -h hgwbeta -e 'update dbDb set hgNearOk = 1 where name = "Ce2";' \
          hgcentraltest
 
 
 # MITOPRED DATA FOR HGGENE (DONE 8/10/04 angie)
     ssh hgwdev
     mkdir /cluster/data/ce2/bed/mitopred
     cd /cluster/data/ce2/bed/mitopred
     wget http://mitopred.sdsc.edu/data/cel_30.out
     perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' cel_30.out > mitopred.tab
     cat > mitopred.sql << '_EOF_'
 # Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
 CREATE TABLE mitopred (
     name varchar(10) not null,      # SwissProt ID
     confidence varchar(8) not null, # Confidence level
               #Indices
     PRIMARY KEY(name(6))
 );
 '_EOF_'
     hgsql ce2 < mitopred.sql
     hgsql ce2 -e 'load data local infile "mitopred.tab" into table mitopred'
 
 # MAKE Human Proteins track
     ssh eieio
     mkdir -p /cluster/data/ce2/blastDb
     cd /cluster/data/ce2/blastDb
     for i in ../sangerFa/*.fa; do ln -s $i .; done
     for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
     rm *.fa *.log
 
     ssh kkr1u00
     mkdir -p /iscratch/i/ce2/blastDb
     cp /cluster/data/ce2/blastDb/* /iscratch/i/ce2/blastDb
     (~kent/bin/iSync) 2>&1 > sync.out
     
     mkdir -p /cluster/data/ce2/bed/tblastn.hg16KG
     cd /cluster/data/ce2/bed/tblastn.hg16KG
     ls -1S /iscratch/i/ce2/blastDb/*.nsq | sed "s/\.nsq//" > worm.lst
     exit
 
     # back to eieio
     cd /cluster/data/ce2/bed/tblastn.hg16KG
     mkdir kgfa
     # calculate a reasonable number of jobs
     calc `wc /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl | awk "{print \\\$1}"`/\(150000/`wc worm.lst | awk "{print \\\$1}"`\)
     # 41117/(150000/578) = 158.437507
     split -l 158 /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl kgfa/kg
     cd kgfa
     for i in *; do pslxToFa $i $i.fa; rm $i; done
     cd ..
     ls -1S kgfa/*.fa > kg.lst
     mkdir blastOut
     for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
     cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
 #ENDLOOP
 '_EOF_'
     cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/iscratch/i/blast/data
 export BLASTMAT
 f=/tmp/`basename $3`
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
 if /cluster/bin/i386/blastToPsl $f.1 $f.2
 then
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg16/bed/blat.hg16KG/protein.lft warn $f.2
         mv $3.tmp $3
         rm -f $f.1 $f.2 
         exit 0
     fi
 fi
 rm -f $f.1 $f.2 $3.tmp $f.8
 exit 1
 '_EOF_'
 
     chmod +x blastSome
     gensub2 worm.lst kg.lst blastGsub blastSpec
 
     ssh kk
     cd /cluster/data/ce2/bed/tblastn.hg16KG
     para create blastSpec
     para shove   # jobs will need to be restarted, but they should all finish
 # Completed: 150858 of 150858 jobs
 # CPU time in finished jobs:   10404860s  173414.33m  2890.24h  120.43d  0.330 y
 # IO & Wait Time:               2468643s   41144.06m   685.73h   28.57d  0.078 y
 # Average job time:                  85s       1.42m     0.02h    0.00d
 # Longest job:                      411s       6.85m     0.11h    0.00d
 # Submission to last job:         26364s     439.40m     7.32h    0.31d
 
     cat << '_EOF_' > chainGsub
 #LOOP
 chainSome $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainSome
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=10000 stdin ../c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainSome
 
     ls -1dS `pwd`/blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
 
     ssh kki
     cd /cluster/data/ce2/bed/tblastn.hg16KG
     para create chainSpec
     para push
 # Completed: 261 of 261 jobs
 # CPU time in finished jobs:        828s      13.81m     0.23h    0.01d  0.000 y
 # IO & Wait Time:                 19521s     325.34m     5.42h    0.23d  0.001 y
 # Average job time:                  78s       1.30m     0.02h    0.00d
 # Longest job:                      270s       4.50m     0.07h    0.00d
 # Submission to last job:          3878s      64.63m     1.08h    0.04d
 
     exit
     # back to eieio
     cd /cluster/data/ce2/bed/tblastn.hg16KG/blastOut
     for i in kg??
     do 
 	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
 	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
 	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
 	echo $i
     done
 
     cat u.*.psl m60* | liftUp -nosort -type=".psl" -nohead stdout /cluster/data/ce2/jkStuff/liftAll.lft warn stdin |  \
     	sort -T /tmp -k 14,14 -k 16,16n -k 17,17n  | uniq  > ../preblastHg16KG.psl
     cd ..
     blatDir=/cluster/data/hg16/bed/blat.hg16KG
     protDat -kg preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames blastHg16KG.psl
 
     ssh hgwdev
     cd /cluster/data/ce2/bed/tblastn.hg16KG
     hgLoadPsl ce2 blastHg16KG.psl
     exit
 
     # back to eieio
     rm -rf blastOut
 
 # End tblastn Human
 
 # MAKE Drosophila Proteins track
     ssh eieio
     mkdir -p /cluster/data/ce2/blastDb
     cd /cluster/data/ce2/blastDb
     for i in ../sangerFa/*.fa; do ln -s $i .; done
     for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
     rm *.fa *.log
 
     ssh kkr1u00
     mkdir -p /iscratch/i/ce2/blastDb
     cp /cluster/data/ce2/blastDb/* /iscratch/i/ce2/blastDb
     (~kent/bin/iSync) 2>&1 > sync.out
     
     mkdir -p /cluster/data/ce2/bed/tblastn.dm1FB
     cd /cluster/data/ce2/bed/tblastn.dm1FB
     ls -1S /iscratch/i/ce2/blastDb/*.nsq | sed "s/\.nsq//" > worm.lst
     exit
 
     # back to eieio
     cd /cluster/data/ce2/bed/tblastn.dm1FB
     mkdir fbfa
     split -l 40 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb
     cd fbfa
     for i in *; do pslxToFa $i $i.fa; rm $i; done
     cd ..
     ls -1S fbfa/*.fa > fb.lst
     mkdir blastOut
     for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
     tcsh
     cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
 #ENDLOOP
 '_EOF_'
     cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/iscratch/i/blast/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 1 0.1 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
 if /cluster/bin/i386/blastToPsl $f.1 $f.2
 then
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.2
         mv $3.tmp $3
         rm -f $f.1 $f.2 
         exit 0
     fi
 fi
 rm -f $f.1 $f.2 $3.tmp 
 exit 1
 '_EOF_'
 
     chmod +x blastSome
     gensub2 worm.lst fb.lst blastGsub blastSpec
 
     exit
     ssh kk
     cd /cluster/data/ce2/bed/tblastn.dm1FB
     para create blastSpec
     para push
 # Completed: 3283 of 3283 jobs
 # CPU time in finished jobs:    3063561s   51059.35m   850.99h   35.46d  0.097 y
 # IO & Wait Time:                 25333s     422.22m     7.04h    0.29d  0.001 y
 # Average job time:                 941s      15.68m     0.26h    0.01d
 # Longest job:                     3435s      57.25m     0.95h    0.04d
 # Submission to last job:          9387s     156.45m     2.61h    0.11d
 
     cat << '_EOF_' > chainGsub
 #LOOP
 chainSome $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainSome
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
 '_EOF_'
     # << for emacs
     chmod +x chainSome
 
     ls -1dS `pwd`/blastOut/fb?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
 
     ssh kki
     cd /cluster/data/ce2/bed/tblastn.dm1FB
     para create chainSpec
     para push
 # Completed: 469 of 469 jobs
 # CPU time in finished jobs:        721s      12.02m     0.20h    0.01d  0.000 y
 # IO & Wait Time:                  1562s      26.03m     0.43h    0.02d  0.000 y
 # Average job time:                   5s       0.08m     0.00h    0.00d
 # Longest job:                       42s       0.70m     0.01h    0.00d
 # Submission to last job:          5546s      92.43m     1.54h    0.06d
 
     exit
     # back to eieio
     cd /cluster/data/ce2/bed/tblastn.dm1FB/blastOut
     for i in fb??
     do 
 	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
 	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
 	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
 	echo $i
     done
 
     cat u.*.psl m60* |  sort -T /tmp -k 14,14 -k 16,16n -k 17,17n  | uniq  > ../blastDm1FB.psl
     cd ..
 
     ssh hgwdev
     cd /cluster/data/ce2/bed/tblastn.dm1FB
     hgLoadPsl ce2 blastDm1FB.psl
     exit
 
     # back to eieio
     rm -rf blastOut
 
 # End tblastn
 
 
 # MAF COVERAGE FIGURES FOR ADAM (DONE 10/18/04 angie)
     # First, get ranges of target coverage:
     ssh eieio
     mkdir /cluster/data/ce2/bed/pHMM/coverage
     cd /cluster/data/ce2/bed/pHMM/maf.new.axtNet
     cat *.maf | nice mafRanges -notAllOGap stdin ce2 \
       /cluster/data/ce2/bed/pHMM/coverage/ce2.mafRanges.bed
 
     ssh hgwdev
     cd /cluster/data/ce2/bed/pHMM/coverage
     # To make subsequent intersections a bit quicker, output a bed with 
     # duplicate/overlapping ranges collapsed:
     nice featureBits ce2 ce2.mafRanges.bed \
       -bed=ce2.mafRangesCollapsed.bed
 #43967098 bases of 100291761 (43.839%) in intersection
 
     # mafCoverage barfs currently, so pass on this for now:
     #cat ../maf.new.axtNet/*.maf \
     #| nice mafCoverage -count=2 ce2 stdin > ce2.mafCoverage
 
     # Intersect maf target coverage with gene regions -- 
     # use Adam's files which are a union of sangerGene and refGene:
     nice featureBits ce2 -enrichment \
       /cluster/data/hg17/bed/var_multiz.2004-08-12/phastCons/stats2/wormCds.bed \
       ce2.mafRangesCollapsed.bed \
       -bed=ce2.mafCds.bed
 #wormCds.bed 25.701%, ce2.mafRangesCollapsed.bed 43.839%, both 20.701%, cover 80.55%, enrich 1.84x
     nice featureBits ce2 -enrichment \
       /cluster/data/hg17/bed/var_multiz.2004-08-12/phastCons/stats2/wormUtr3.bed \
       ce2.mafRangesCollapsed.bed \
       -bed=ce2.mafUtr3.bed
 #wormUtr3.bed 0.323%, ce2.mafRangesCollapsed.bed 43.839%, both 0.216%, cover 66.86%, enrich 1.53x
     nice featureBits ce2 -enrichment \
       /cluster/data/hg17/bed/var_multiz.2004-08-12/phastCons/stats2/wormUtr5.bed \
       ce2.mafRangesCollapsed.bed \
       -bed=ce2.mafUtr5.bed
 #wormUtr5.bed 0.057%, ce2.mafRangesCollapsed.bed 43.839%, both 0.041%, cover 72.37%, enrich 1.65x
 
 
 # syntenic net with cb1 (acs, 11/18/04)
 
     cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
     netFilter -syn cb1.net > cb1Syn.net
     netFilter -minGap=10 cb1Syn.net | hgLoadNet ce2 syntenyNetCb1 stdin
     # (add trackDb.ra entry)
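    # A hypothetical sketch of that trackDb.ra entry (labels, priority and
    # the chain track named in the type line are assumptions):
    #   track syntenyNetCb1
    #   shortLabel Syntenic Net
    #   longLabel C. briggsae (cb1) Syntenic Alignment Net
    #   group compGeno
    #   priority 125
    #   visibility hide
    #   type netAlign cb1 chainCb1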
 
 # [acs@hgwdev axtChain.2004-04-29]$ featureBits ce2 netCb1
 # 88355987 bases of 100291761 (88.099%) in intersection
 # [acs@hgwdev axtChain.2004-04-29]$ featureBits ce2 syntenyNetCb1
 # 73427463 bases of 100291761 (73.214%) in intersection
 # Hmmm -- looking in browser, seems a bit stringent for worm...
 
 # CLEANUP CB1 BLASTZ (DONE, 2004-02-24, hartera)
     ssh eieio
     cd /cluster/data/ce2/bed/blastz.cb1
     nice rm axtChain/run1/chain/* &
     nice rm -fr axtChain/n1 axtChain/hNoClass.net &
     nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/cb1Filt.net axtChain/*.net &
     nice rm -fr axtChain/chain axtChain/preNet &
     nice rm -rf raw &
     nice rm -rf lav &
 
 # CLEANUP SELF BLASTZ (DONE, 2004-02-24, hartera)
     ssh eieio
     cd /cluster/data/ce2/bed/blastzSelf
     # remove old chains
     nice rm -fr axtChain &
     nice rm axtChain.2004-04-30/run1/chain/* &
     nice gzip axtChrom/* pslChrom/* axtChain.2004-04-30/all.chain axtChain.2004-04-30/chainFilt/* axtChain.2004-04-30/chain/* &
     nice rm -rf raw &
     nice rm -rf lav &
 
 
 ###########################################################################
 # LOAD GENEID PREDICTIONS (DONE 3/15/05)
     ssh eieio
     mkdir /cluster/data/ce2/bed/geneid
     cd /cluster/data/ce2/bed/geneid
     foreach chr (chrI chrII chrIII chrIV chrM chrV chrX)
       wget \
   http://genome.imim.es/genepredictions/C.elegans/Ce200403/geneid_v1.2/$chr.gtf
       wget \
   http://genome.imim.es/genepredictions/C.elegans/Ce200403/geneid_v1.2/$chr.prot
     end
     # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
     perl -wpe 's/^(>\S+)/$1.1/' *.prot > geneid.fa
     ssh hgwdev
     cd /cluster/data/ce2/bed/geneid
     ldHgGene -gtf -genePredExt ce2 geneid *.gtf 
     hgPepPred ce2 generic geneidPep geneid.fa
 
 ###########################################################################
 #	DONE - 2006-01-06 - Hiram
 #	Obtained a photograph from Professor Mark Blaxter
 #	mark.blaxter@ed.ac.uk - http://nema.cap.ed.ac.uk/index.html
 #	Saved image from email to:
 #	/cluster/data/ce2/html/C.elegans.jpg
 #	-rw-rw-r--    1   227317 Jan  6 09:40 C.elegans.jpg
 #	Transform image to a size appropriate for our WEB pages:
     ssh hgwdev
     cd /cluster/data/ce2/html
     convert -normalize -sharpen 0 -geometry 300x200 \
 	C.elegans.jpg Caenorhabditis_elegans.jpg
     cp -p Caenorhabditis_elegans.jpg /usr/local/apache/htdocs/images
 #	Add IMG tag pointer to this image in the description.html page
 #	and for ce1 and ce3
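#    A hypothetical sketch of the sort of IMG tag to add to those
#    description.html pages (file name, path and sizing are assumptions):
#    <IMG SRC="../images/Caenorhabditis_elegans.jpg" WIDTH=300 ALT="C. elegans">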
 
 
 ########################################################################
 ### microRNA targets tracks  (DONE - 2006-03-29 - 2006-04-26 - Hiram)
 ### from: http://pictar.bio.nyu.edu/ Rajewsky Lab
 ### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu
 ### Yi-Lu Wang ylw205@nyu.edu
 ### dg@thp.Uni-Koeln.DE
     ssh hgwdev
     mkdir /cluster/data/ce2/bed/picTar
     cd /cluster/data/ce2/bed/picTar
     wget --timestamping \
 	'http://pictar.bio.nyu.edu/ucsc/new_single_elegans_bed' \
 	    -O newSingleElegans.bed
 
     grep -v "^track" newSingleElegans.bed \
 	| hgLoadBed -strict ce2 picTar stdin
     #	Loaded 11987 elements of size 9
 
     nice -n +19 featureBits ce2 picTar
     #	39233 bases of 100291761 (0.039%) in intersection
 
 ########################################################################
 # CLEANUP OF CB1 BLASTZ (DONE, 2007-06-25, hartera)
     ssh kkstore02
     cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/axtChain
     rm -r cb1NetSplit cb1NetSplit1 tmp1
     cd ..
     rm -r axtChain.2004-04-29test
    # Looks like new chains were made on 2004-04-29
     cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/axtChain.2004-04-29
     # remove the split net and chain directories as these can easily be
     # created from the all.chain and cb1.net files
     rm -r chain cb1NetSplit run1/chain
     # gzip files
     gzip all.chain *.net
     cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/
     rm -r axtNet1
     cd axtNet.2004-04-29
     # remove *.axt files as these are also in the sortedaxtNet directory
     rm *.axt sortedaxtNet/axtNetCb1.tab 
     cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/axtChrom/maf
     gzip sortedaxtNet/*.axt
 
 #######################################################################
 ## LIFTOVER To Ce4 (DONE - 2007-07-16 Hiram)
     ssh kkr1u00
     cd /cluster/data/ce4
     mkdir nib
     cd nib
     for C in I II III IV V X M
 do
     faToNib -softMask ../goldenPath/chromosomes/chr${C}.fa.gz chr${C}.nib
 done
     #	Writing 15072419 bases in 7536218 bytes
     #	Writing 15279316 bases in 7639666 bytes
     #	Writing 13783681 bases in 6891849 bytes
     #	Writing 17493784 bases in 8746900 bytes
     #	Writing 20919398 bases in 10459707 bytes
     #	Writing 17718852 bases in 8859434 bytes
     #	Writing 13794 bases in 6905 bytes
 
     $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
     	ce4 /cluster/data/ce4/nib
     # as it says, DO THIS NEXT:
     ssh kk
     #	if bin/scripts is not in your PATH, add it for this command:
     PATH=$PATH:/cluster/bin/scripts \
     $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
 	ce2 /cluster/data/ce2/nib ce4 /iscratch/i/ce4/split10k \
 	/cluster/data/ce4/jkStuff/11.ooc
     # as it says, DO THIS NEXT:
     cd /cluster/data/ce2/bed/blat.ce4.2007-07-16/run
     para try, check, push, check, ...
 # Completed: 49 of 49 jobs
 # CPU time in finished jobs:      14998s     249.97m     4.17h    0.17d  0.000 y
 # IO & Wait Time:                   405s       6.74m     0.11h    0.00d  0.000 y
 # Average job time:                 314s       5.24m     0.09h    0.00d
 # Longest finished job:             706s      11.77m     0.20h    0.01d
 # Submission to last job:           728s      12.13m     0.20h    0.01d
 
     # as it says, DO THIS NEXT:
     #	this does the liftUp and makes the psl files
     #	kkr1u00 is down at this time, fixup this script to work on kkr3u00
     ssh kkr3u00
     cd /cluster/data/ce2/bed
     ln -s blat.ce4.2007-07-16 ce4
     time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh ce2 ce4
     #	real    1m1.780s
     # as it says, DO THIS NEXT:
    # this prepares the batch to run for the chaining
     ssh kki
     time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
 	ce2 /cluster/data/ce2/nib ce4 /cluster/data/ce4/nib
     # as it says, DO THIS NEXT:
     #	running the chain batch
     cd /cluster/data/ce2/bed/blat.ce4.2007-07-16/chainRun
     para try, check, push, check, ...
 # Completed: 7 of 7 jobs
 # CPU time in finished jobs:         19s       0.31m     0.01h    0.00d  0.000 y
 # IO & Wait Time:                    17s       0.29m     0.00h    0.00d  0.000 y
 # Average job time:                   5s       0.09m     0.00h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:               7s       0.12m     0.00h    0.00d
 # Submission to last job:             7s       0.12m     0.00h    0.00d
 
     ssh kkstore02
     $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh ce2 ce4
     #	Created /cluster/data/ce2/bed/liftOver/ce2ToCe4.over.chain.gz
     # as it says, DO THIS NEXT:
     ssh hgwdev
     $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh ce2 ce4
     #	It says this:
     # 	Now, add link for
     #	/usr/local/apache/htdocs/goldenPath/ce2/liftOver/ce2ToCe4.over.chain
     #	to hgLiftOver
     #	But I believe that link was already done:
     cd /gbdb/ce2/liftOver
     ls -og ce2ToCe4*
     #	lrwxrwxrwx  1 53 Jun  5 16:35 ce2ToCe4.over.chain.gz ->
     #		/cluster/data/ce2/bed/liftOver/ce2ToCe4.over.chain.gz
 
 ################################################
 # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
 update genbank.conf:
 ce2.upstreamGeneTbl = refGene