src/hg/makeDb/doc/ce1.txt 1.4

1.4 2009/11/25 21:48:38 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/ce1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/ce1.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/makeDb/doc/ce1.txt	25 Nov 2009 18:29:18 -0000	1.3
+++ src/hg/makeDb/doc/ce1.txt	25 Nov 2009 21:48:38 -0000	1.4
@@ -1,1542 +1,1542 @@
 # for emacs: -*- mode: sh; -*-
 
 
 # This file describes how to make the browser database for the
 # worm C. elegans
 # Currently 2003-05-28 this file is in a state of flux as it is being
 # worked out.
 
 DOWNLOAD SEQUENCE (DONE 2003-05-12 - Hiram)
 
     # next machine
     ssh eieio
     mkdir -p /cluster/store5/worm/ce1/sanger
     mkdir -p /cluster/store5/worm/ce1/tmp
     cd /cluster/store5/worm/ce1/sanger
     wget -o ce1.fetch.log -r -l1 --no-directories \
 	ftp://ftp.sanger.ac.uk/pub/wormbase/current_release/CHROMOSOMES/
 #   Takes about five minutes
 #   This current_release is updated every two weeks.  Although the
 #	sequence is quite stable at this time and changes very little.
 #   Check that you have some valid files. Should be chroms I II III IV V X
 #	in dna, gff and agp formats
     ls -ogrt
 #   get out of this download data directory, and create your home symlink
 #	shortcut
     cd ..
     ln -s /cluster/store5/worm/ce1 ~/ce1
     cd ~/ce1
 #   There is a Mitochondria sequence.  It can be found at NCBI
 #	This search sequence at NCBI in 'nucleotide' selects one
 #	entry: X54252:
 #	Caenorhabditis Elegans[Organism] AND mitochondria AND MTCE
 #	I'd like to figure out a wget command to fetch this, but doing
 #	that manually via the WEB browser is not such a big deal.  And
 #	besides, once it is fetched, it is stable (haven't seen it
 #	change in years) so you can
 #	merely copy from a previous Ce build.  A copy has been manually
 #	placed in ~/ce1/ncbi/x54252.fa
 #	
 
 #   translate to unzipped .fa, all upper case, and
 #	rename the agp files so hgGoldGap can find them
     mkdir sangerFa
     sed -e "s/^>.*/>chrM/" ./ncbi/x54252.fa > sangerFa/chrM.fa
     foreach f (I II III IV V X)
 	echo -n "${f} "
 	zcat sanger/CHROMOSOME_${f}.dna.gz | tr '[a-z]' '[A-Z]' | \
 		sed -e "s/CHROMOSOME_/chr/" > sangerFa/chr${f}.fa
     	mkdir sangerFa/${f}
         ln -s ~/ce1/sanger/CHROMOSOME_${f}.agp sangerFa/${f}/chr${f}.agp
     end
     # hgGoldGap does not handle dir names over 2 characters:
     mv sangerFa/III sangerFa/3
 
 # CREATE AN .agp FILE FOR ChrM (DONE, 2004-04-19, hartera)
 
     ssh hgwdev
     cd /cluster/data/ce1/sanger
     cat << '_EOF_' > CHROMOSOME_M.agp
 M       1       13794   1       F       X54252.1      1       13794   +
 '_EOF_'
     mkdir -p /cluster/data/ce1/sangerFa/M
     ln -s /cluster/data/ce1/sanger/CHROMOSOME_M.agp \
            /cluster/data/ce1/sangerFa/M/chrM.agp
     
 # CREATING DATABASE (DONE 2003-05-12 - Hiram)
 
     # Create the database.
     # next machine
     ssh hgwdev
     echo 'create database ce1' | hgsql ''
     # if you need to delete that database:  !!! WILL DELETE EVERYTHING !!!
 	echo 'drop database ce1' | hgsql ce1
     # Use df to ake sure there is at least 5 gig free on 
     df -h /var/lib/mysql
 # Before loading data:
 # Filesystem            Size  Used Avail Use% Mounted on
 # /dev/sda1             472G  389G   58G  87% /var/lib/mysql
 
 
 # CREATING GRP TABLE FOR TRACK GROUPING (DONE 2003-05-12 - Hiram)
     # next machine
     ssh hgwdev
     #  the following command copies all the data from the table
     #	grp in the database rn1 to our new database ce1
     echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \
       | hgsql ce1
     # if you need to delete that table:   !!! WILL DELETE ALL grp data !!!
 	echo 'drop table grp;' | hgsql ce1
 
 # PREPARE Split contigs into 100,000 bp chunks for cluster runs
     # next machine
     ssh eieio
     cd ~/ce1
     rm -fr ./split
     mkdir split
     foreach f (I II III IV V X M)
 	mkdir split/$f
         faSplit size sangerFa/chr${f}.fa 100000 split/$f/c -lift=split/chr${f}.lft
     end
 # 151 pieces of 151 written
 # 153 pieces of 153 written
 # 138 pieces of 138 written
 # 175 pieces of 175 written
 # 210 pieces of 210 written
 # 178 pieces of 178 written
 # 1 pieces of 1 written
     cat split/*.lft > liftAll.lft
 
     # copy them to /iscratch/i for cluster rsync
     # next machine
     ssh kkr1u00
     cd ~/ce1/split
     foreach c (I II III IV V X M)
 	echo -n "${c} "
 	mkdir -p /iscratch/i/worms/Celegans/unmaskedSplit/${c}
         cp -p ${c}/c*.fa /iscratch/i/worms/Celegans/unmaskedSplit/${c}
     end
     ~kent/bin/iSync
 
 # Run RepeatMasker on the chromosomes (DONE 2003-06-04 - Hiram)
     # next machine
     ssh kk
     cd ~/ce1
     mkdir RMRun
     cd RMRun
     # create the script to use for the RepeatMasker run
     cat << '_EOF_' > RMWorm
 #!/bin/csh -fe
 #
 #       This is a slight rearrangement of the
 #       /cluster/bin/scripts/RMWorm used in makeHg15.doc
 #       The results here need to go to a different location 
 #       $1 == chrom name: I II III IV V X M
 #       $2 == directory where split contig .fa is found
 #       $3 == name of contig .fa file
 cd $1
 pushd .
 cd $2
 /bin/mkdir -p /tmp/$3/$1
 /bin/cp $3 /tmp/$3/$1
 cd /tmp/$3/$1
 /scratch/hg/RepeatMasker/RepeatMasker -s -el -ali $3
 popd
 /bin/cp /tmp/$3/$1/$3.out ./
 if( -e /tmp/$3/$1/$3.align ) /bin/cp /tmp/$3/$1/$3.align ./
 # /bin/cp /tmp/$2*.masked ../masked/
 /bin/rm -r /tmp/$3/$1
 /bin/rmdir --ignore-fail-on-non-empty /tmp/$3
 '_EOF_'
     chmod +x RMWorm
 
     # create job list
     rm -f RMJobs
     foreach c (I II III IV V X M)
 	mkdir ~/ce1/RMRun/${c}
         foreach t ( /iscratch/i/worms/Celegans/unmaskedSplit/$c/c*.fa )
 	    set d = $t:h
 	    set f = $t:t
 	    echo ./RMWorm ${c} ${d} ${f} \
 	    '{'check out line+ ~/ce1/RMRun/$c/${f}.out'}' \
 	    >> RMJobs
 	end
     end
     para create RMJobs
     para try
     para check
     # when they are finished, liftUp and load the .out files into the database:
     # next machine
     ssh eieio
     cd ~/ce1/RMRun
     foreach c (I II III IV V X M)
 	liftUp chr${c}.fa.out ../split/chr${c}.lft warn ${c}/*.fa.out
     end
     
     # next machine
     ssh hgwdev
     cd ~/ce1/RMRun
     hgLoadOut ce1 chr*.fa.out
     # Noticed one error in this load:
     # Processing chrI.fa.out
     # Strange perc. field -0.6 line 791 of chrI.fa.out
 
 
 # SIMPLE REPEAT [TRF] TRACK (DONE 2003-06-05 - Hiram)
     # ensure chr*.fa files exist on /iscratch/i
     # next machine
     ssh kkr1u00
     cp -p /cluster/store5/worm/ce1/sangerFa/*.fa \
 	/iscratch/i/worms/Celegans/unmaskedFa
     ~kent/bin/iSync
 
     # Create cluster parasol job:
     # next machine
     ssh kk
     mkdir -p ~/ce1/bed/simpleRepeat
     cd ~/ce1/bed/simpleRepeat
     mkdir trf
     ls -1S /iscratch/i/worms/Celegans/unmaskedFa/chr*.fa > genome.lst
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)}  /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp
 #ENDLOOP
 '_EOF_'
 
     echo "" > dummy.lst
     gensub2 genome.lst dummy.lst gsub spec
     para create spec
     para try    # there are only 7, so this runs them all
     para check
 Average job time:                 468s       7.80m     0.13h    0.01d
 Longest job:                      963s      16.05m     0.27h    0.01d
 
     #  When cluster run is done, combine into one:
     cat trf/*.bed > simpleRepeat.bed
     
     # Load into the database:
     # next machine
     ssh hgwdev
     cd ~/ce1/bed/simpleRepeat
     /cluster/bin/i386/hgLoadBed ce1 simpleRepeat simpleRepeat.bed \
       -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
     # Loaded 28590 elements
 
 # PROCESS SIMPLE REPEATS INTO MASK (DONE 2003-06-05 - Hiram)
     # next machine
     ssh eieio
     cd ~/ce1/bed/simpleRepeat
     mkdir -p trfMask
     foreach f (trf/*.bed)
 	awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
     end
 
 # Create Soft and Hard masks from RepeatMaster and TRF outputs:
 #  and rebuild the nib files
     # next machine
     ssh eieio
     cd ~/ce1
     mkdir softMask
     cd ~/ce1
     foreach c (I II III IV V X M)
         echo -n "masking chr${c} "
 	/cluster/bin/i386/maskOutFa sangerFa/chr${c}.fa RMRun/chr${c}.fa.out \
 		softMask/chr${c}.fa -soft
 	/cluster/bin/i386/maskOutFa softMask/chr${c}.fa \
 		bed/simpleRepeat/trfMask/chr${c}.bed \
 		softMask/chr${c}.fa -softAdd
 		faToNib -softMask softMask/chr${c}.fa nib/chr${c}.nib
     end
     # output:
     # masking chrI Writing 15080473 bases in 7540245 bytes
     # masking chrII Writing 15279303 bases in 7639660 bytes
     # masking chrIII Writing 13783268 bases in 6891642 bytes
     # masking chrIV Writing 17493790 bases in 8746903 bytes
     # masking chrV Writing 20922238 bases in 10461127 bytes
     # masking chrX Writing 17705013 bases in 8852515 bytes
     # masking chrM Writing 13794 bases in 6905 bytes
 
     # unknown if hardMasks are actually needed for anything
     mkdir hardMask
     foreach c (I II III IV V X M)
         echo "masking chr${c}"
 	/cluster/bin/i386/maskOutFa softMask/chr${c}.fa hard \
 		hardMask/chr${c}.fa
     end
 
     #  With masked nib files ready, prepare cluster for blastz runs
     # next machine
     ssh kkr1u00
     cd ~/ce1/softMask
     mkdir -p /iscratch/i/worms/Celegans/bothMasksFa
     mkdir -p /iscratch/i/worms/Celegans/bothMasksNib
     cp -p *.fa /iscratch/i/worms/Celegans/bothMasksFa
     cd ~/ce1/nib
     cp -p c*.nib /iscratch/i/worms/Celegans/bothMasksNib
     ~kent/bin/iSync
 
 STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE 2003-05-12 - Hiram)
 
     # Make symbolic links from /gbdb/ce1/nib to the real nibs.
     # next machine
     ssh hgwdev
     mkdir -p /gbdb/ce1/nib
     foreach f (/cluster/store5/worm/ce1/nib/*.nib)
       ln -s $f /gbdb/ce1/nib
     end
     cd ~/ce1/tmp
     # Load /gbdb/ce1/nib paths into database and save size info.
     hgsql ce1  < ~/kent/src/hg/lib/chromInfo.sql
     # if you need to delete that table:   !!! DELETES ALL DATA IN TABLE !!!
 	echo 'drop table chromInfo;' | hgsql ce1
     hgNibSeq -preMadeNib ce1 /gbdb/ce1/nib sangerFa/chr*.fa
 # Typical output:
 # Processing chrI.fa to /gbdb/ce1/nib/chrI.nib
 # Processing chrII.fa to /gbdb/ce1/nib/chrII.nib
 # Processing chrIII.fa to /gbdb/ce1/nib/chrIII.nib
 # Processing chrIV.fa to /gbdb/ce1/nib/chrIV.nib
 # Processing chrM.fa to /gbdb/ce1/nib/chrM.nib
 # Processing chrV.fa to /gbdb/ce1/nib/chrV.nib
 # Processing chrX.fa to /gbdb/ce1/nib/chrX.nib
 # 100277879 total bases
 
 #   Verify the hgNibSeq load functioned OK:
     echo "select chrom,size from chromInfo" | hgsql -N ce1 > chrom.sizes
     cat chrom.sizes
 #   Typical contents of chrom.sizes:
 chrI    15080473
 chrII   15279303
 chrIII  13783268
 chrIV   17493790
 chrM    13794
 chrV    20922238
 chrX    17705013
 
     # Set up relational mrna tables.
     hgLoadRna new ce1
     # that created a bunch of tables.  If you need to delete them:
 	hgLoadRna drop ce1
 
 MAKE GCPERCENT AND GAP tracks (DONE 2003-05-12 - Hiram)
     # next machine
      ssh hgwdev
      mkdir -p /cluster/store5/worm/ce1/bed/gcPercent
      cd /cluster/store5/worm/ce1/bed/gcPercent
      hgsql ce1  < ~/kent/src/hg/lib/gcPercent.sql
     #  If you need to delete that table created
 	echo 'drop table gcPercent;' | hgsql ce1;
      hgGcPercent ce1 ../../nib
     # hgGoldGap does not handle dir names over 2 characters
     #	directory III has been moved to 3 when all this was created above
      hgGoldGapGl -noGl ce1 /cluster/store5/worm/ce1 sangerFa
     # there are no gaps in these chromosomes.  They are all making
     # empty tables except for chrM.  We tried running the browser
     # with these empty tables removed, but then it complained when
     # we tried to display the assembly track.  The browser needs to
     # be fixed.  In the meantime, for completeness, and to allow
     # featureBits to function, create an empty chrM_gap:
     cat << '_EOF_' | hgsql ce1
 CREATE TABLE chrM_gap (
      bin smallint not null,
      chrom varchar(255) not null,
      chromStart int unsigned not null,
      chromEnd int unsigned not null,
      ix int not null,
      n char(1) not null,
      size int unsigned not null,
      type varchar(255) not null,
      bridge varchar(255) not null,
      UNIQUE(chromStart),
      UNIQUE(chromEnd)
 );
 '_EOF_'
     #  hgGoldGapGl needs to be fixed so it can create a gold track
     #	and not need to create a gap track.  And the browser needs to
     #	be fixed so it can display the assembly track without the need
     #	for the gap tables to exist.
 
 # REMAKE GAP TRACKS - ADD POSITIONS OF Ns TO ChrN_gap TABLES 
 # (DONE, 2004-04-30, hartera)
 # Add Ns to gap tables as for Ce2, this is described on Gap details page
     # next machine
     ssh hgwdev
     mkdir -p /cluster/data/ce1/bed/gap
     cd /cluster/data/ce1/bed/gap
     # finds motifs and finds location of gaps as part of output
     foreach c (I II III IV V X M)
         findMotif -chr=chr${c} -verbose=4 -motif=gcatg \
             /gbdb/ce1/nib >& chr${c}Bed.stderr
     end
     grep -h GAP *.stderr | sed -e "s/#GAP //" > gap.bed
     # All the gap tables are empty so 
     # Load in gap.bed
     # Need to add extra fields to gap.bed file
                                                                                 
     cat << '_EOF_' > /cluster/data/ce1/scripts/createGapFile.pl
 #!/usr/bin/perl -w
 use strict;
                                                                                 
 my $oldchr = "";
 while (<STDIN>) {
   my @fields = split(/\t/);
   my $chr = $fields[0];
   if ($chr ne $oldchr) {
      open(OUT, ">$chr"."_gap.bed");
      $oldchr = $chr;
   }
   print OUT "$fields[0]\t$fields[1]\t$fields[2]\t$fields[3]\tN\t$fields[4]\tfragment\tyes\n";
 }
 '_EOF_'
 
     perl /cluster/data/ce1/scripts/createGapFile.pl < gap.bed
     # load into relevant tables
     foreach c (I II III IV V X M)
         hgLoadBed -tab -oldTable ce1 chr${c}_gap chr${c}_gap.bed
     end
 
 # MAKE chrM_gold TABLE FROM NEW chrM.agp FILE CREATED ABOVE 
 # (DONE, 2004-04-19, hartera)
     # next machine
     ssh hgwdev
     cd /cluster/data/ce1/
     hgGoldGapGl -noGl -chrom=chrM ce1 /cluster/data/ce1 sangerFa
 
 MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CE1 (DONE 2002-05-12 - Hiram)
     # next machine
      ssh hgwdev
     echo 'insert into defaultDb values("C. elegans", "ce1");' \
       | hgsql -h genome-testdb hgcentraltest
     #  If you need to delete that entry:
 	echo 'delete from defaultDb where name="ce1";' \
         | hgsql -h genome-testdb hgcentraltest
     # Note: for next assembly, set scientificName column to 
     # "Caenorhabditis elegans"
 
     echo 'insert into dbDb values("ce1", "May 2003", \
 	"/gbdb/ce1/nib", "Worm", "chrII:14642289-14671631", 1, 10, \
 	"C. elegans");' \
 	| hgsql -h genome-testdb hgcentraltest
     #  If you need to delete that entry:
 	echo 'delete from dbDb where name="ce1";' \
 	| hgsql -h genome-testdb hgcentraltest
     # Make trackDb table so browser knows what tracks to expect:
     cd ~/kent/src/hg/makeDb/trackDb
     cvs up -d -P
     # Edit that makefile to add ce1 in all the right places and do
     make update
     make alpha
     cvs commit makefile
 
 MAKE SANGER GENE TRACK (DONE - 2003-05-19 - Hiram)
 
     # next machine
     ssh eieio
     mkdir -p /cluster/store5/worm/ce1/bed/sangerGene
     cd /cluster/store5/worm/ce1/bed/sangerGene
     # the perl trims these files down to a reasonable size.  As they are
     #	they cause ldHgGene to hangup due to memory size.
     foreach f (I II III IV V X)
 	echo -n "${f} "
 	zcat ../../sanger/CHROMOSOME_${f}.gff.gz | \
 	sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \
 	-e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \
 	-e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \
 	-e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \
 	perl -ne '@a = split; \
         print if($a[1] =~ /curated|DNA|RNA/i && \
         $a[2] =~ /intron|exon|cds|sequence|transcri/i);' > chr${f}.gff
     end
     echo
     #  check file sizes, should be reasonable
     ls -ogrt
 # [hiram@eieio sangerGene]$ ls -ogrt
 # total 22656
 # -rw-r--r--    1 hiram     3495873 May 19 11:52 chrI.gff
 # -rw-r--r--    1 hiram     3670984 May 19 11:54 chrII.gff
 # -rw-r--r--    1 hiram     3264917 May 19 11:56 chrIII.gff
 # -rw-r--r--    1 hiram     3642256 May 19 11:59 chrIV.gff
 # -rw-r--r--    1 hiram     4704564 May 19 12:02 chrV.gff
 # -rw-rw-r--    1 hiram     3787721 May 19 12:03 chrX.gff
 
     # now load database with those transformed gff files
     # next machine
     ssh hgwdev
     cd /cluster/store5/worm/ce1/bed/sangerGene
     ldHgGene ce1 sangerGene *.gff
 # Read 37330 transcripts in 411148 lines in 6 files
 #  37330 groups 6 seqs 11 sources 6 feature types
 #  22128 gene predictions
 
     # if you need to delete that table:
 	echo 'drop table sangerGene' | hgsql ce1
 
 # MAKING AND STORING mRNA AND EST ALIGNMENTS (WORKING 2003-05-28 - Hiram)
     # next machine
     ssh kkr1u00
     cd /iscratch/i/worms/Celegans
     mkdir mRNA
     # there are 1644 sequences in mrna.fa, the 1000000 makes 1644 files
     faSplit sequence /iscratch/i/mrna.134/Caenorhabditis_elegans/mrna.fa \
 		1000000 mRNA/m
     ~kent/bin/iSync
 
     # next machine
     ssh kk
     cd ~/ce1/bed
     mkdir mrna est
     cd mrna
     mkdir psl
     ls -1S /iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst
     ls -1S /iscratch/i/worms/Celegans/mRNA/m*.fa > mrna.lst
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/blat -fine -q=rna -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
 
     gensub2 worm.lst mrna.lst gsub spec
     para create spec
     para try
     para check
     para push
     ... etc ...
     #  These jobs are a bit on the short side.  I wonder if the mrna
     #  fa's could be put together in larger chunks ?
 # Average job time:                   8s       0.13m     0.00h    0.00d
 # Longest job:                       63s       1.05m     0.02h    0.00d
 
 
     # cluster run done
     pslSort dirs raw.psl /tmp psl
     pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_mrna.psl \
       /dev/null
     pslSortAcc nohead chrom /tmp all_mrna.psl
 
     # load mrna tables
     # next machine
     ssh hgwdev
     cd ~/ce1/bed/mrna/chrom
     # names must be chr*_mrna.psl
     foreach d (I II III IV V X M)
 	rm -f chr${d}_mrna.psl
 	mv chr${d}.psl chr${d}_mrna.psl
     end
     # this load command rewrites these table entirely if they exist
     hgLoadPsl ce1 *.psl
     mkdir /gbdb/ce1/mrna.134
     ln -s /cluster/store5/mrna.134/org/Caenorhabditis_elegans/mrna.fa \
 	/gbdb/ce1/mrna.134
     # ! ! !  DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
     cd ~/ce1/bed/mrna
     hgLoadRna add -type=mRNA ce1 /gbdb/ce1/mrna.134/mrna.fa \
 	/cluster/store5/mrna.134/org/Caenorhabditis_elegans/mrna.ra
     hgLoadPsl ce1 all_mrna.psl
 
     # setup /iscratch/i with the est.fa file for blat run
     # next machine
     ssh kkr1u00
     mkdir -p /iscratch/i/worms/Celegans/EST
     cd /iscratch/i/worms/Celegans/EST
     # create about a 200 files to provide about 1400 jobs to the cluster
     #	(these jobs are very small despite this small number of est files)
     faSplit sequence \
 	/cluster/store5/mrna.134/org/Caenorhabditis_elegans/est.fa 200 e
     ~kent/bin/iSync
 
     # next machine
     ssh kk
     cd ~/ce1/bed/est
     mkdir psl
     ls -1S /iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst
     ls -1S /iscratch/i/worms/Celegans/EST/e*.fa > est.lst
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/blat -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
 
     gensub2 worm.lst est.lst gsub spec
     para create spec
     para try
     para check
     para push
     # These 1358 jobs are pretty short:
 # Average job time:                  43s       0.71m     0.01h    0.00d
 # Longest job:                       77s       1.28m     0.02h    0.00d
 
     # cluster run done
     # next machine
     ssh hgwdev
     cd ~/ce1/bed/est
     pslSort dirs raw.psl /tmp psl
     pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_est.psl \
       /dev/null
     pslSortAcc nohead chrom /tmp all_est.psl
     cd chrom
     # names must be chr*_est.psl
     foreach d (I II III IV V X M)
 	rm -f chr${d}_est.psl
 	mv chr${d}.psl chr${d}_est.psl
     end
     # this load command rewrites these table entirely if they exist
     hgLoadPsl ce1 *.psl
 
     ln -s /cluster/store5/mrna.134/org/Caenorhabditis_elegans/est.fa \
 	/gbdb/ce1/mrna.134
     # ! ! !  DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
     cd ~/ce1/bed/est
     hgLoadRna add -mrnaType=EST -type=EST ce1 /gbdb/ce1/mrna.134/est.fa \
 	/cluster/store5/mrna.134/org/Caenorhabditis_elegans/est.ra
     hgLoadPsl ce1 all_est.psl -nobin
 
 MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR CE1 (DONE - 2003-05-29 - Hiram)
     # next machine
     ssh hgwdev
     # DNA port is "0", trans prot port is "1"
     echo 'insert into blatServers values("ce1", "blat10", "17784", "0"); \
           insert into blatServers values("ce1", "blat10", "17785", "1");' \
       | hgsql -h genome-testdb hgcentraltest
         # if you need to delete those entries
 	echo 'delete from blatServers where db="ce1";' \
         | hgsql -h genome-testdb hgcentraltest
         # to check the entries:
 	echo 'select * from blatServers where db="ce1";' \
         | hgsql -h genome-testdb hgcentraltest
 
 PRODUCING FUGU FISH ALIGNMENTS  (DONE 2003-06-05 - Hiram)
     # Assumes masked NIBs have been prepared as above
     # and Fugu pieces are already on kluster /iscratch/i.
     # next machine
     ssh kk
     mkdir -p ~/ce1/bed/blatFugu
     cd ~/ce1/bed/blatFugu
     mkdir psl
     ls -1S /iscratch/i/fugu/*.fa > fugu.lst
     ls -1S /iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/blat -q=dnax -t=dnax -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
 
     gensub2 worm.lst fugu.lst gsub spec
     para create spec
     para try
     para check
     para push
     para check
 # Average job time:                 237s       3.95m     0.07h    0.00d
 # Longest job:                     1326s      22.10m     0.37h    0.02d
 
     # want names chrI_blatFugu
     #  When cluster run is done, sort alignments
     # next machine
     ssh eieio
     cd ~/ce1/bed/blatFugu
     foreach d (I II III IV V X M)
         echo -n "${d} "
 	pslCat psl/chr${d}_*.psl | \
 	pslSortAcc nohead chrom temp stdin
 	rm -f chrom/chr${d}_blatFugu.psl
 	mv chrom/chr${d}.psl chrom/chr${d}_blatFugu.psl
     end
 
     # next machine
     ssh hgwdev
     cd ~/ce1/bed/blatFugu/chrom
     hgLoadPsl ce1 chr*_blatFugu.psl
 
     # Make fugu /gbdb/ symlink and load Fugu sequence data.
     mkdir /gbdb/ce1/fuguSeq
     ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta /gbdb/ce1/fuguSeq
     # ! ! !  DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
     cd ~/ce1/bed/blatFugu
     hgLoadRna addSeq ce1 /gbdb/ce1/fuguSeq/fugu_v3_mask.fasta
 
 # RUN Waba alignment with briggsae  (DONE - 2003-06-02 - Hiram)
     # prepare contigs from C. briggsae
     #   Assumes C. briggsae data has been downloaded according to
     #   makeCb1.doc
     # next machine
     ssh kkr1u00
     mkdir -p /iscratch/i/worms/Cbriggsae/waba_contigs
     cd /iscratch/i/worms/Cbriggsae/waba_contigs
     faSplit sequence ../contigs.fa 2000 c
     ~kent/bin/iSync
     
     # next machine
     ssh kk
     mkdir ~/ce1/bed/waba/out
     cd ~/ce1/bed/waba
     ls /iscratch/i/worms/Cbriggsae/contigs/c*.fa > briggsae.lst
     ls /iscratch/i/worms/Celegans/unmaskedFa/chr*.fa > elegans.lst
     echo "" > dummy.lst
     # create scripts to be used here
     cat << '_EOF_' > wabaRun
 #!/bin/csh -fe
 #
 #	$1 - full pathname to a briggsae contig
 #	$2 - file path to an elegans chrom.fa
 #	$3 - result file full pathname
 #
 set f = $1:t
 set chr = $2:t
 set d = $chr:r
 mkdir -p /tmp/$d/$f
 cp $1 /tmp/$d/$f
 pushd .
 cd /tmp/$d/$f
 set t = $f:r
 /cluster/home/hiram/bin/i386/waba 1 $f $2 $t.1
 /cluster/home/hiram/bin/i386/waba 2 $t.1 $t.2
 /cluster/home/hiram/bin/i386/waba 3 $t.2 $t.wab
 cp $t.wab $3
 popd
 rm -f /tmp/$d/$f/$t.*
 rmdir --ignore-fail-on-non-empty /tmp/$d/$f
 rmdir --ignore-fail-on-non-empty /tmp/$d
 '_EOF_'
     chmod +x wabaRun
 
     echo
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/store5/worm/ce1/bed/waba/wabaRun {check in exists+ $(path1)} {check in exists+ $(path2)} {check out exists /cluster/store5/worm/ce1/bed/waba/out/$(root2)/$(root1).wab}
 #ENDLOOP
 '_EOF_'
 
     gensub2 briggsae.lst elegans.lst gsub spec
     para create spec
     para try
     para check
     para push
     ... etc ...
     # one of the jobs takes quite a while.  Most of the others are OK:
 # Average job time:                2308s      38.46m     0.64h    0.03d
 # Longest job:                    47442s     790.70m    13.18h    0.55d
 
     # next machine
     ssh hgwdev
     # you may need to build hgWaba
     cd ~/kent/src/hg/makeDb/hgWaba
     make
     # hgWaba.c has been adjusted to avoid the squeezed assert by
     #  ignoring them.  That may want to be looked at some day.
     #  (The waba cluster run has a number of errors too.  waba is not
     #	perfect, but it is as good as the Intronerator was...)
 
     cd ~/ce1/bed/waba
     # one of the outputs produces a result that won't process properly
     # with hgWaba.  Move it out of the way
     mv out/chrII/c0911.wab out/chrII_c0911.wab.broken
  
     mkdir Load
     cd Load
     #  The cat through the pipe to hgWaba will avoid making large files
     # 	that are not needed.
     cat << '_EOF_' > loadEm.sh
 #!/bin/sh
 #
 for c in I II III IV V X M
 do
 	echo -n "${c} "
         cat /cluster/store5/worm/ce1/bed/waba/out/chr${c}/c*.wab |
         ~/bin/i386/hgWaba ce1 Cbr chr${c} 0 stdin > proc${c}.out 2>&1
 done
 
 exit 0
 '_EOF_'
 
     # run it to load the waba track:
     ./loadEm.sh
     # remove garbage temp file:
     rm full_waba.tab
     rm chrom_waba.tab
 
     # worm/ce1/trackDb.ra entry:
 #   track cbrWaba
 #   shortLabel Briggsae Waba
 #   longLabel C. briggsae Waba Alignments
 #   group genes
 #   priority 50
 #   visibility dense
 #   color 140,0,200
 #   altColor 210,140,250
 
 # BEGINNING OF Cb1 BLASTZ
 
     # next machine
     ssh kkr1u00
     #  These nibs were created in the combined RepeatMasker and TRF above
     mkdir -p /iscratch/i/worms/Celegans/rmsk
     cd /iscratch/i/worms/Celegans/rmsk
     cp -p /cluster/store5/worm/ce1/RMRun/chr*.fa.out .
     ~kent/bin/iSync
   
 #	These blastz sections usually start by saying:
 #		FIRST FOLLOW INSTRUCTIONS IN ~angie/hummus/Notes
 #	This Notes file is the actual blastz run.
 #	But I found those notes a bit obscure, for clarity I have
 #	reproduced the exact steps here.
 #	The initial DEF file has been created for this.
 #	It is in ~angie/hummus/DEF.ce1-cb1.2003-05-29
 #	which will blastz soft masked Cbriggsae ultra contigs
 #	against the soft masked Celegans chroms
 
 #	1. created the DEF file for this particular blastz run
 #	    The TARGET files are the masked NIBS created above, in:
 #	SEQ1_DIR=/iscratch/i/worms/Celegans/bothMasksNib
 #	    and the repeatmasker *.out files in:
 #	SEQ1_RMSK=/iscratch/i/worms/Celegans/rmsk
 #	    There is no lineage specific repeats for the worms as far as
 #	    I can tell, thus:
 #	SEQ1_SMSK=/dev/null
 #	    This requires:
 #	BLASTZ_ABRIDGE_REPEATS=0
 #	    The QUERY sequence are the repeat and trf soft masked chrUn.fa
 #	    nib file from the briggsae processing:
 #	SEQ2_DIR=/iscratch/i/worms/Cbriggsae/bothMasksNib
 #	    And the repeat masker run from the briggsae processing:
 #	SEQ2_RMSK=/iscratch/i/worms/Cbriggsae/rmsk
 #	    No lineage specific repeats:
 #	SEQ2_SMSK=/dev/null
 #
 #	    Our base of operations:
 #	BASE=/cluster/store5/worm/ce1/bed/blastzCb1
 #
 
 #	Reproduced here are the instructions as specified in the
 #	hummus/Notes file.
 #	Evidently historical archives of the DEF files exist in
 #	in ~angie/hummus/DEF* and they are copied out of there to
 #	to the file: $BASE/DEF for the actual run.  Don't worry, none of
 #	this seems to make any sense.
 	# next machine
 	ssh kk
 	cd ~/ce1/bed/blastzCb1
 #	source the DEF file which was carefully written as above.
 	. ./DEF
 	mkdir -p $BASE/run
 	~angie/hummus/make-joblist $DEF > $BASE/run/j
 	# that created xdir.sh and joblist run/j
 	sh $BASE/xdir.sh
 	# xdir.sh makes a bunch of result directories in $BASE/raw/
 	# based on chrom name and CHUNK size
         cd $BASE/run
 	# now edit j because it is not correct:
 	sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
 	wc -l j*
 	head j2
 	# *** make sure the j2 edits are OK, then use it:
 	mv j2 j
 	# para create will create the file: 'batch' for the cluster run
 	para create j
 	para try
 	para check
 	para push
 	... etc ...
 Average job time:                1222s      20.36m     0.34h    0.01d
 Longest job:                     3772s      62.87m     1.05h    0.04d
 
 	# When that cluster run is done, results are in $BASE/raw/chr*/*
 	# continuing with ~angie/hummus/Notes:
 	# --- normalize and single_cov
 	cd ~/ce1/bed/blastzCb1
 	#   source the DEF file again in case you are coming back to this
 	. ./DEF
 	# a new run directory
 	mkdir -p $BASE/run.1
 	# another obscure script creates a new job list:
 	~angie/hummus/do.out2lav $DEF >$BASE/run.1/j
  	cd $BASE/run.1
 	# the job list is once again incorrect, edit it:
  	sed -e 's/^/\/cluster\/home\/angie\/schwartzbin\//' j > j2
  	wc -l j*
  	head j2
 	# make sure the edited j2 is OK, then use it:
  	mv j2 j
  	para create j
  	para try
 	para push;para check; ... etc ...
 Average job time:                  65s       1.08m     0.02h    0.00d
 Longest job:                      136s       2.27m     0.04h    0.00d
 
 
     # Translate the .lav files created by the end of ~angie/hummus Notes
     # into axt files
     ssh eieio
     set base="/cluster/store5/worm/ce1/bed/blastzCb1"
     set seq1_dir="/cluster/store5/worm/ce1/nib"
     set seq2_dir="/cluster/store5/worm/cb1/nib"
     set tbl="blastzCb1"
     cd $base
     mkdir -p axtChrom
     foreach c (lav/*)
       pushd $c
       set chr=$c:t
       set out=$base/axtChrom/$chr.axt
       echo "Translating $chr lav to $out"
       cat `ls -1 *.lav | sort -g` \
         | /cluster/home/hiram/bin/i386/lavToAxt stdin $seq1_dir $seq2_dir stdout \
         | /cluster/bin/i386/axtSort stdin $out
       popd
     end
 
     # Translate the sorted axt files into psl:
     cd $base
     mkdir -p pslChrom
     foreach f (axtChrom/chr*.axt)
       set c=$f:t:r
       /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
     end
 
     # load these blastz results
     # next machine
     ssh hgwdev
     cd ~/ce1/bed/blastzCb1/pslChrom
     /cluster/bin/i386/hgLoadPsl ce1 chr*_*.psl
     # trackDb/worm/ce1/trackDb.ra entry:
 #   track blastzCb1
 #   shortLabel briggsae Blastz
 #   longLabel Blastz C briggsae
 #   group compGeno
 #   priority 159
 #   visibility dense
 #   color 0,0,0
 #   altColor 50,128,50
 #   spectrum on
 #   type psl xeno cb1
 
 # CHAINING Briggsae blastz
     # next machine, small cluster is good for these tiny jobs
     ssh kkr1u00
     cd /cluster/store5/worm/ce1/bed/blastzCb1
     mkdir axtChain
     cd axtChain
     mkdir run1
     cd run1
     ls -1S ../../axtChrom/*.axt > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
 #ENDLOOP
 '_EOF_'
     cat << '_EOF_' > doChain
 #!/bin/csh
     axtChain $1  /cluster/store5/worm/ce1/nib /cluster/store5/worm/cb1/nib $2 > $3
 '_EOF_'
     chmod a+x doChain
 
     mkdir out chain
     gensub2 input.lst single gsub spec
     para create spec
     para try
 Average job time:                 303s       5.04m     0.08h    0.00d
 Longest job:                     1006s      16.77m     0.28h    0.01d
 
     # now on the cluster server, sort chains
     # next machine
     ssh eieio
     cd ~/ce1/bed/blastzCb1/axtChain
     chainMergeSort run1/chain/*.chain > all.chain
     chainSplit chain all.chain
     # optionally: rm run1/chain/*.chain
 
     # Load chains into database
     # next machine
     ssh hgwdev
     cd ~/ce1/bed/blastzCb1/axtChain/chain
     foreach i (*.chain)
         set c = $i:r
         hgLoadChain ce1 ${c}_cb1Chain $i
         echo done $c
     end
 
     # Create the nets  XXXX STOPPED HERE 2003-06-11
     # next machine
     ssh eieio
     cd ~/ce1/bed/blastzCb1/axtChain
     mkdir preNet
     cd chain
     
 
 # MAKING AND STORING refSeq ALIGNMENTS (WORKING 2003-06-02 - Hiram)
 # MAKING AND STORING refSeq ALIGNMENTS - hgRefSeqMrna load REDONE (2004-05-18, hartera)
 # hgRefSeqStatus - refSeqStatus table rebuilt with updated loc2ref
 # (2004-06-16, hartera)
 
     # next machine
     ssh kkr1u00
     mkdir -p /iscratch/i/worms/Celegans/refSeq
     cd /iscratch/i/worms/Celegans
     # there are about 20000 sequences in the refSeq.fa file,
     #	split them into about 300 files
     faSplit sequence \
   /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.fa \
 		300 refSeq/r
     ~kent/bin/iSync
 
     # next machine
     ssh kk
     cd ~/ce1/bed
     mkdir -p refSeq/psl
     cd refSeq
     # for reference:
     ln -s \
  /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.* .
     ls -1S /cluster/bluearc/iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst
     ls -1S /cluster/bluearc/iscratch/i/worms/Celegans/refSeq/r*.fa > refSeq.lst
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/blat -fine -q=rna -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
 
     gensub2 worm.lst refSeq.lst gsub spec
     para create spec
     para try
     para check
     para push  ... etc ...
 # Average job time:                  55s       0.92m     0.02h    0.00d
 # Longest job:                      626s      10.43m     0.17h    0.01d
 
     # when cluster run done
     # next machine
     ssh eieio
     cd ~/ce1/bed/refSeq
     pslSort dirs raw.psl /tmp psl
     pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_refSeq.psl \
       /dev/null
     pslSortAcc nohead chrom /tmp all_refSeq.psl
     pslCat -dir chrom > refSeqAli.psl
 
     # load refSeq tables
     # next machine
     ssh hgwdev
     cd ~/ce1/bed/refSeq
     hgLoadPsl ce1 -tNameIx refSeqAli.psl
     ln -s \
  /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.fa \
 	/gbdb/ce1/mrna.134
     # ! ! DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
     cd ~/ce1/bed/refSeq
     hgLoadRna add -type=refSeq ce1 /gbdb/ce1/mrna.134/refSeq.fa \
  /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.ra
     # Reload refLink table as LocusLink ID column is all 0 and protein
     # accessions missing. (DONE, 2004-05-12, hartera)
     # RefSeq accessions for ce1 appear not to be in 
     # /cluster/store5/mrna.134/refSeq/041003/loc2ref so could not also
     # map the LocusLink ID to OMIM ID through mim2loc
     # Dowload the latest loc2ref and mim2loc from NCBI and put in
     # /cluster/store5/mrna.134/refSeq/051204/
     # Previous error in loading refPep was because there were duplicate
     # accessions in hs.faa once the suffix of .1 and .2 were removed
     # A search in GenBank showed that NP_002900.1 has been replaced 
     # by NP_002900.2 so removed the former from hs.faa
     # (2004-05-18, hartera) Deleted contents of refPep table since
     # hs.faa contains only human sequences. No C. elegans peptide sequences
     # available from this NCBI Build 134 so added protein translation of genomic DNA 
     # on the RefSeq genes details pages. 
     hgRefSeqMrna ce1 /gbdb/ce1/mrna.134/refSeq.fa \
  /cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.ra \
       all_refSeq.psl \
       /cluster/store5/mrna.134/refSeq/051204/loc2ref \
       /cluster/store5/mrna.134/refSeq/041003/hs.faa \
       /cluster/store5/mrna.134/refSeq/051204/mim2loc
     # Missing locusLinkIds for 184 of 20661
     # Missing protein accessions for 184 of 20661 
     # Add RefSeq status info (updated 2004-06-16 as mrnaAcc values not found
     # in refLink.mrnaAcc column). Only load in those for C. elegans (Taxonomy
     # ID is 6239)
     cd /cluster/data/ce1/bed/refSeq
     hgRefSeqStatus -taxId=6239 ce1 \
          /cluster/store5/mrna.134/refSeq/051204/loc2ref
 
     # Create precomputed join of refFlat and refGene:
     echo 'CREATE TABLE refFlat \
           (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) \
           SELECT refLink.name as geneName, refGene.* \
           FROM refLink,refGene \
           WHERE refLink.mrnaAcc = refGene.name' \
     | hgsql ce1
 
 # END - MAKING AND STORING refSeq ALIGNMENTS
 
 # BEGINNING OF Self BLASTZ
 
 #	DEF file here is the same as the blastzCb1/DEF file, with
 #	the following changes:
 #	SEQ1_DIR=/iscratch/i/worms/Celegans/bothMasksNib
 #	SEQ2_DIR=/iscratch/i/worms/Celegans/bothMasksNib
 #	SEQ2_RMSK=/iscratch/i/worms/Celegans/rmsk
 #	SEQ2_CHUNK=10000000
 #	SEQ2_LAP=10000
 #	BASE=/cluster/store5/worm/ce1/bed/blastzSelf
 
 #	Reproduced here are the instructions as specified in the
 #	hummus/Notes file.
 #	Evidently historical archives of the DEF files exist in
 #	in ~angie/hummus/DEF* and they are copied out of there to
 #	to the file: $BASE/DEF for the actual run.  Don't worry, none of
 #	this seems to make any sense.
 	# next machine
     ssh kk
     mkdir -p ~/ce1/bed/blastzSelf
     cd ~/ce1/bed/blastzSelf
     . ./DEF
     mkdir -p $BASE/run
     ~angie/hummus/make-joblist $DEF > $BASE/run/j
     # that created xdir.sh and joblist run/j
     sh $BASE/xdir.sh
     # xdir.sh makes a bunch of result directories in $BASE/raw/
     # based on chrom name and CHUNK size
     cd $BASE/run
     # now edit j because it is not correct:
     sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
     wc -l j*
     head j2
     # *** make sure the j2 edits are OK, then use it:
     mv j2 j
     # para create will create the file: 'batch' for the cluster run
     para create j
     para try
     para check
     para push
 	... etc ...
 Average job time:                2228s      37.14m     0.62h    0.03d
 Longest job:                    15281s     254.68m     4.24h    0.18d
     # That longest job time is not correct.  I was having trouble with
     # the cluster during that run.
 
     # When that cluster run is done, results are in $BASE/raw/chr*/*
     # continuing with ~angie/hummus/Notes:
     # --- normalize and single_cov
     cd ~/ce1/bed/blastzSelf
     #   source the DEF file again in case you are coming back to this
     . ./DEF
     # a new run directory
     mkdir -p $BASE/run.1
     # another obscure script creates a new job list:
     ~angie/hummus/do.out2lav $DEF >$BASE/run.1/j
     cd $BASE/run.1
     # the job list is once again incorrect, edit it:
     sed -e 's/^/\/cluster\/home\/angie\/schwartzbin\//' j > j2
     wc -l j*
     head j2
     # make sure the edited j2 is OK, then use it:
     mv j2 j
     para create j
     para try
     para push;para check; ... etc ...
 Average job time:                 241s       4.01m     0.07h    0.00d
 Longest job:                      459s       7.65m     0.13h    0.01d
 
 
     # Translate the .lav files created by the end of ~angie/hummus Notes
     # into axt files
     ssh eieio
     set base="/cluster/store5/worm/ce1/bed/blastzSelf"
     set seq1_dir="/cluster/store5/worm/ce1/nib"
     set seq2_dir="/cluster/store5/worm/ce1/nib"
     set tbl="blastzSelf"
     cd $base
     mkdir -p axtChrom
     foreach c (lav/*)
       pushd $c
       set chr=$c:t
       set out=$base/axtChrom/$chr.axt
       echo "Translating $chr lav to $out"
       cat `ls -1 *.lav | sort -g` \
         | /cluster/home/hiram/bin/i386/lavToAxt stdin $seq1_dir $seq2_dir stdout \
         | /cluster/bin/i386/axtSort stdin $out
       popd
     end
 
     #  DropOverlap cleans out a bit more than DropSelf:
     cd $base
     mkdir DropOverlap
     foreach c (M I II III IV V X)
 	echo "dropping chr${c}"
 	/cluster/bin/i386/axtDropOverlap axtChrom/chr${c}.axt \
 		DropOverlap/chr${c}.axt
     end
 
     # Translate the sorted axt files into psl:
     mkdir -p pslChrom
     foreach f (DropOverlap/chr*.axt)
       set c=$f:t:r
       /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
     end
 
     # load these blastz results  (they already have the correct names
     #		chr*_blastzSelf.psl from axtToPsl above)
     # next machine
     ssh hgwdev
     cd ~/ce1/bed/blastzSelf/pslChrom
     /cluster/bin/i386/hgLoadPsl ce1 chr*_*.psl
     # trackDb/worm/ce1/trackDb.ra entry:
 #   track blastzSelf
 #   shortLabel Blastz Self
 #   longLabel Blastz C elegans (self)
 #   group compGeno
 #   priority 159
 #   visibility dense
 #   color 0,0,0
 #   altColor 50,128,50
 #   spectrum on
 #   type psl xeno ce1
 
 # CHAINING Self blastz
     # next machine, small cluster is good for these tiny jobs
     ssh kkr1u00
     cd /cluster/store5/worm/ce1/bed/blastzSelf
     mkdir -p axtChain/run1
     cd /cluster/store5/worm/ce1/bed/blastzSelf/axtChain/run1
     ls -1S /cluster/store5/worm/ce1/bed/blastzSelf/DropOverlap/*.axt > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
 #ENDLOOP
 '_EOF_'
     #  Nibs are on /cluster/bluearc now
     cat << '_EOF_' > doChain
 #!/bin/csh
     axtChain $1  /cluster/bluearc/iscratch/i/worms/Celegans/bothMasksNib /iscratch/i/worms/Celegans/bothMasksNib $2 > $3
 '_EOF_'
     chmod a+x doChain
 
     mkdir out chain
     gensub2 input.lst single gsub spec
     para create spec
     para try
     para push ... etc ...
 Average job time:                 502s       8.37m     0.14h    0.01d
 Longest job:                      874s      14.57m     0.24h    0.01d
 
     # now on the cluster server, sort chains
     # next machine
     ssh eieio
     cd ~/ce1/bed/blastzSelf/axtChain
     chainMergeSort run1/chain/*.chain > all.chain
     chainSplit chain all.chain
     # optionally: rm run1/chain/*.chain
 
     # Load chains into database
     # next machine
     ssh hgwdev
     cd ~/ce1/bed/blastzSelf/axtChain/chain
     foreach i (*.chain)
         set c = $i:r
         hgLoadChain ce1 ${c}_chainSelf $i
         echo done $c
     end
 #  Loading 845504 chains into ce1.chrI_chainSelf
 #  Loading 845307 chains into ce1.chrII_chainSelf
 #  Loading 1152790 chains into ce1.chrIII_chainSelf
 #  Loading 859566 chains into ce1.chrIV_chainSelf
 #  Loading 2 chains into ce1.chrM_chainSelf
 #  Loading 1114939 chains into ce1.chrV_chainSelf
 #  Loading 604293 chains into ce1.chrX_chainSelf
 
 
 #  END of Blastz Self
 
 
 LOAD SOFTBERRY GENES (DONE - 2003-08-14 - Hiram)
      mkdir -p /cluster/data/ce1/bed/softberry
      cd /cluster/data/ce1/bed/softberry
 
      wget \
 	ftp://www.softberry.com/pub/SC_CEL_MAY03/Soft_fgenesh_Ce03.tar.gz
      tar xvzf Soft_fgenesh_Ce03.tar.gz
 
      ldHgGene ce1 softberryGene chr*.gff
 # Read 19112 transcripts in 276202 lines in 6 files
 #   19112 groups 6 seqs 1 sources 4 feature types
 # 19112 gene predictions
      hgPepPred ce1 softberry *.protein
      hgSoftberryHom ce1 *.protein
 
     # Add entry to worm/ce1/trackDb.ra and add specific
     # softberryGene.html file
 
 CREATE ORFTOGENE TABLE (Used by hgGene and hgNear)
    mkdir /cluster/data/ce1/bed/orfToGene
    cd !$
    wget ftp://ftp.wormbase.org/pub/wormbase/GENE_DUMPS/gene_names.txt
    awk -F '\t' '$2 == "Caenorhabditis elegans" && $3 == "Gene" {printf("%s\t%s\n", $4, $1)}' gene_names.txt > orfToGene.txt
    hgCeOrfToGene ce1 gene_names.txt sangerGene orfToGene
 
 CREATE SANGER LINKS TABLE FROM WORMBASE INFO (used by hgGene and hgNear)
    mkdir /cluster/data/ce1/bed/steinHelp
 # Mail Lincoln Stein and ask for mappings from wormPep to
 # SwissProt and from ORF to wormPep to description.  Place
 # these files in the steinHelp directory:
 #	seqPepDesc.ace  - contains ORF/WormPep/Description
 #	spWp.ace - contains WormPep/SwissProt
 # Then go to the AQL search page at
 # http://www.wormbase.org/db/searches/aql_query.
 # and create the following files in that directory:
 #    wbGeneClass.txt - associates 3 letter first part of worm gene
 #             name with a brief description.  Produced with query:
 #             select l,l->Description  from l in class Gene_Class
 #    wbConcise.txt - associates worm gene name with a several
 #             sentence description.  Produces with query:
 #             select l,g from l in class Locus, g in l->Gene_information[Provisional_description]
     hgWormLinks spWp.ace seqPepDesc.ace wbConcise.txt wbGeneClass.txt ../orfToGene/orfToGene sangerLinks.txt
     hgsql ce1 < ~/kent/src/hg/near/hgWormLinks/sangerLinks.sql
     hgsql -e \
 	'load data local infile "sangerLinks.txt" into table sangerLinks.sql' \
 	ce1
 
 TWINSCAN GENE PREDICTIONS (DONE - 2004-01-05 - Hiram)
     mkdir /cluster/data/ce1/bed/twinscan
     cd /cluster/data/ce1/bed/twinscan
     wget --timestamping \
 http://genes.cs.wustl.edu/predictions/worm/C_elegans/WS100/12-15-2003/Celegans_WS100_12-15-03.tgz
 
     tar xvzf Celegans_WS100_12-15-03.tgz
     rm -r chr_tx
     # clean up chrom field of GTF files
     foreach c (I II III IV V X)
 	set f = chr_gtf/CHROMOSOME_${c}.pred_gtf
 	echo ${f}
 	sed -e "s/CHROMOSOME_/chr/g" ${f} | \
 	sed -e "s/_[0-9][0-9]*.seq\t/\t/" > \
 		chr_gtf/chr${c}-fixed.gtf
     end
 
     # pare down protein FASTA header to id and add missing .1:
     foreach c (I II III IV V X)
 	set f = chr_ptx/CHROMOSOME_${c}.ptx
 	echo ${f}
 	perl -wpe 's/^\>CHROMOSOME_(\S+).*$/\>chr$1.1/;' < \
 		${f} > chr_ptx/chr${c}-fixed.fa
     end
     ldHgGene ce1 twinscan chr_gtf/chr*-fixed.gtf -exon=CDS
 #    Read 20889 transcripts in 146518 lines in 6 files
 #      20889 groups 206 seqs 1 sources 3 feature types
 #    20889 gene predictions
     hgPepPred ce1 generic twinscanPep chr_ptx/chr*-fixed.fa
 
 # MYTOUCH FIX - Jen - 2006-01-25
   sudo mytouch ce1 twinscanPep 0404031400.00
 
 # MRNAORIENT, creating tables for clusterRna operation
 #		(working 2004-02-10 - Hiram)
     mkdir -p /cluster/bluearc/ce1/bed/est
     mkdir -p /cluster/bluearc/ce1/bed/mrna
     cd /cluster/bluearc/ce1/bed/est
     cp -p /cluster/data/ce1/bed/est/chrom/*.psl .
     pslSortAcc nohead contig /tmp/psl *.psl
     cd /cluster/bluearc/ce1/bed/mrna
     cp -p /cluster/data/ce1/bed/mrna/chrom/*.psl .
     pslSortAcc nohead contig /tmp/psl *.psl
     mkdir mRNA
     faSplit sequence /cluster/store5/mrna.134/org/Caenorhabditis_elegans/mrna.fa 1000000 mRNA/m
     mkdir -p /cluster/data/ce1/bed/mrnaOrientInfo/oi
     cd /cluster/data/ce1/bed/mrnaOrientInfo
     ls -1S /cluster/bluearc/ce1/bed/mrna/contig/*.psl > psl.lst
     ls -1S /iscratch/i/worms/Celegans/mrna.134/mrna.fa > mrna.lst
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/polyInfo {check in line $(path1)} {check in line+ /iscratch/i/worms/Celegans/trfFa/$(root1).fa} /iscratch/i/worms/Celegans/mrna.134/mrna.fa {check out line+ oi/$(root1).tab}
 #ENDLOOP
 '_EOF_'
     ssh kk
     cd /cluster/data/ce1/bed/mrnaOrientInfo
     gensub2 psl.lst mrna.lst gsub jobList
     para create jobList
     para try			# there are only six jobs, this runs them
 
     cat oi/*.tab > mrnaOrientInfo.bed
     hgLoadBed ce1 mrnaOrientInfo mrnaOrientInfo.bed \
       -sqlTable=$HOME/kent/src/hg/lib/mrnaOrientInfo.sql
 
     mkdir -p /cluster/data/ce1/bed/estOrientInfo/oi
     cd /cluster/data/ce1/bed/estOrientInfo
     ls -1S /cluster/bluearc/ce1/bed/est/contig/*.psl > psl.lst
     ls -1S /iscratch/i/worms/Celegans/mrna.134/est.fa > est.lst
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/polyInfo {check in line $(path1)} {check in line+ /iscratch/i/worms/Celegans/trfFa/$(root1).fa} /iscratch/i/worms/Celegans/mrna.134/est.fa {check out line+ oi/$(root1).tab}
 #ENDLOOP
 '_EOF_'
     ssh kkr1u00
     cd /cluster/data/ce1/bed/estaOrientInfo
     gensub2 psl.lst est.lst gsub jobList
     para create jobList
     para try			# there are only six jobs, this runs them
 
     cat oi/*.tab > estOrientInfo.bed
     bedSort estOrientInfo.bed estOrientInfo.bed
     hgLoadBed ce1 estOrientInfo estOrientInfo.bed \
       -sqlTable=$HOME/kent/src/hg/lib/estOrientInfo.sql
 
     mkdir /cluster/data/ce1/bed/rnaCluster
     cd /cluster/data/ce1/bed/rnaCluster
     foreach f (I II III IV V X)
 	echo "clusterRna ce1 /dev/null chr${f}.bed -chrom=${f}"
 	clusterRna ce1 /dev/null chr${f}.bed -chrom=chr${f}
     end
     hgLoadBed ce1 rnaCluster *.bed
 
 
     hgsql -A -e "select chrom from chromInfo" ce1 | egrep -v chrom > chrom.tab
     (perl -e 'print "#\!/bin/sh\n"; while(<>) {chomp($_); printf "hgsql -A -e =select * from rnaCluster where chrom like \"$_\" order by chromStart, name= ce1 | egrep -v chromStart | cut -f2\-13 > $_.ce1.rnaCluster.bed\n";}' | sed -e "s/=/'/g") < chrom.tab > rnaCluster.sh
 
     mkdir -p /cluster/data/ce1/bed/altSplice/agx
     cd /cluster/data/ce1/bed/altSplice
 
     for c in I II III IV V X M
     do
     hgsql -e "create table chr${c}_intronEst as select chr${c}_est.* from chr${c}_est,estOrientInfo where (chr${c}_est.qName = estOrientInfo.name) and (estOrientInfo.intronOrientation != 0)" ce1
     done
 
     for c in I II III IV V X M
     do
     hgsql -e "alter table chr${c}_intronEst add index bin (bin);" ce1
     hgsql -e "alter table chr${c}_intronEst add index tStart (tStart);" ce1
     hgsql -e "alter table chr${c}_intronEst add index qName (qName(12));" ce1
     hgsql -e "alter table chr${c}_intronEst add index tEnd (tEnd);" ce1
     echo "done chr${c}"
     done
 
     hgsql -A -e "select chrom from chromInfo" ce1 | egrep -v chrom > chrom.tab
     perl -e 'while(<>) {chomp($_); print "~/bin/i386/altSplice -db=ce1 -beds=../rnaCluster/$_.ce1.rnaCluster.bed -agxOut=agx/$_.ce1.rnaCluster.agx -consensus -weightMrna\n";}' < chrom.tab > altSplice.para.spec
     # these jobs are no big deal on the worm, just run this file:
     chmod +x altSplice.para.spec
     mkdir agx
     ./altSplice.para.spec
     cat agx/*.agx > altSplice.agx
     agxToBed altSplice.agx altSplice.bed
 
     cat << '_EOF_' >> altGraphX.sql
 CREATE TABLE altGraphX (
   bin smallint unsigned not null, 
   tName varchar(255) NOT NULL default '',
   tStart int(11) NOT NULL default '0',
   tEnd int(11) NOT NULL default '0',
   name varchar(255) NOT NULL default '',
   id int(10) unsigned NOT NULL auto_increment,
   strand char(2) NOT NULL default '',
   vertexCount int(10) unsigned NOT NULL default '0',
   vTypes longblob NOT NULL,
   vPositions longblob NOT NULL,
   edgeCount int(10) unsigned NOT NULL default '0',
   edgeStarts longblob NOT NULL,
   edgeEnds longblob NOT NULL,
   evidence longblob NOT NULL,
   edgeTypes longblob NOT NULL,
   mrnaRefCount int(11) NOT NULL default '0',
   mrnaRefs longblob NOT NULL,
   mrnaTissues longblob NOT NULL,
   mrnaLibs longblob NOT NULL,
   PRIMARY KEY  (id),
   KEY tName (tName(16),tStart,tEnd)
 );
 '_EOF_'
     # << emacs
 
     hgLoadBed -sqlTable=altGraphX.sql ce1 altGraphX altSplice.agx
 
     # trackDb entry:
 	track altGraphX
 	shortLabel Alt-Splice
 	longLabel Alternative Splicing
 	group rna
 	priority 74.1
 	visibility pack
 	type bed 3
 	canPack off
 
     #	copy frame based view to goldenPath 2006-01-12 - Hiram
     cd /usr/local/apache/htdocs/goldenPath
     ln -s ceMay2003 ce1
     cd /cluster/data/ce1/bed/altSites
     mkdir /usr/local/apache/htdocs/goldenPath/ce1/altGraphX
     sed -e "s#http://hgwdev-sugnet.cse.ucsc.edu##" links.html \
 	> /usr/local/apache/htdocs/goldenPath/ce1/altGraphX/links.html
 
     sed -e \
     "s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \
 	-e "s/NM_005487/chrII:14689493-14690232/" \
 	-e "s#http://hgwdev-sugnet.cse.ucsc.edu##" \
 	 frame.html \
 	    > /usr/local/apache/htdocs/goldenPath/ce1/altGraphX/frame.html
 
 #  GC 5 BASE WIGGLE TRACK (DONE - 2004-02-13 - Hiram)
     #    working on eieio, the fileserver for this data.
     # was named gc20KBase to start with, became a 5 base dataset later
     #	the generic trackDb html file is named gc20KBase.html
     ssh eieio
     mkdir /cluster/data/ce1/bed/gc20KBase
     cd /cluster/data/ce1/bed/gc20KBase
     mkdir wigData5 dataLimits5
 for n in ../../nib/*.nib
 do
     c=`basename ${n} | sed -e "s/.nib//"`
     C=`echo $c | sed -e "s/chr//"`
     echo -n "working on ${c} - ${C} ... "
     hgGcPercent -chr=${c} -doGaps \
 	-file=stdout -win=5 ce1 ../../nib | grep -w GC | awk '
 {
     bases = $3 - $2
     perCent = $5/10.0
     printf "%d\t%.1f\n", $2+1, perCent
 }' | wigAsciiToBinary -dataSpan=5 -chrom=${c} \
 	-wibFile=wigData5/gc5Base_${C} -name=${C} stdin > dataLimits5/${c}
 echo "done ${c}"
 done
 
     #	data is compete, load track on hgwdev
     ssh hgwdev
     cd /cluster/data/ce1/bed/gc20KBase
     hgLoadWiggle ce1 gc20KBase wigData5/*.wig
     #	create symlinks for .wib files
     mkdir /gbdb/ce1/wib
     ln -s `pwd`/wigData5/*.wib /gbdb/ce1/wib
     #	the trackDb entry
 track gc20KBase
 shortLabel GC Percent
 longLabel GC Percent in 5 base windows
 group map
 priority 23.5
 visibility hide
-autoScaleDefault Off
+autoScale Off
 maxHeightPixels 128:36:16
 graphTypeDefault Bar
 gridDefault OFF
 windowingFunction Mean
 color 0,128,255
 altColor 255,128,0
 viewLimits 30:70
 type wig 0 100
 
 
 #  miRNA track (DONE - 2004-03-02 - Hiram)
     #	data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
     #	and Michel.Weber@ibcg.biotoul.fr
     #	notify them if this assembly updates to renew this track
     mkdir /cluster/data/ce1/bed/miRNA
     cd /cluster/data/ce1/bed/miRNA
     wget --timestamping \
     "http://www.sanger.ac.uk/Software/Rfam/mirna/data/ucsc_cel_mir_track.txt"
     grep "^chr" ucsc_cel_mir_track.txt | sed -e "s/ /\t/g" > worm.mir.bed
     hgLoadBed ce1 miRNA worm.mir.bed
     # entry in trackDb/trackDb.ra already there for same track in Hg16
     #	see makeHg16.doc for more comments on this
 
 
 #####################################################################
 # SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie)
     # File emailed from Xinwei She <xws@u.washington.edu>
     mkdir /cluster/data/ce1/bed/genomicSuperDups
     cd /cluster/data/ce1/bed/genomicSuperDups
     sed -e 's/CHROMOSOME_/chr/g; s/\t_\t/\t-\t/;' \
       ce1_genomicSuperDup.tab \
     | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
     | hgLoadBed ce1 genomicSuperDups stdin \
       -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql