src/hg/makeDb/doc/sacCer1.txt 1.10

1.10 2009/11/25 21:48:42 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/sacCer1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/sacCer1.txt,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 1000000 -r1.9 -r1.10
--- src/hg/makeDb/doc/sacCer1.txt	6 Apr 2009 05:42:22 -0000	1.9
+++ src/hg/makeDb/doc/sacCer1.txt	25 Nov 2009 21:48:42 -0000	1.10
@@ -1,1459 +1,1459 @@
 # for emacs: -*- mode: sh; -*-
 
 
 # This describes how to make the sacCer1 browser database.
 #
 # The genomic sequence was downloaded Nov. 17, 2003, and is dated
 # 10/1/2003 on the Stanford FTP site.  The previous version of the
 # genomic sequence is dated Nov. 2002.  I don't see version numbers.
 
 #  NOTE:  this doc may have genePred loads that fail to include
 #  the bin column.  Please correct that for the next build by adding
 #  a bin column when you make any of these tables:
 #
 #  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
 #  +-----------+-----------------+
 #  | tableName | type            |
 #  +-----------+-----------------+
 #  | sgdGene   | genePred sgdPep |
 #  +-----------+-----------------+
 
 
 # Create the directory structure and download sequence from the SGD
 # site at Stanford.
     mkdir /cluster/store6/sacCer1
     ln -s /cluster/store6/sacCer1 /cluster/data/sacCer1
     cd /cluster/data/sacCer1
     mkdir 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M bed download
     cd download
     mkdir chromosomes
     cd chromosomes
     foreach i (01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt)
 	wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/chromosomes/fasta/chr$i.fsa
     end
     foreach i (1 2 3 4 5 6 7 8 9)
 	echo ">chr$i" > chr$i.fa
 	grep -v '^>' chr0$i.fsa >> chr$i.fa
     end
     foreach i (chr1?.fsa)
 	echo ">$i:r" > $i:r.fa
 	grep -v '^>' $i >> $i:r.fa
     end
     echo ">chrM" > chrM.fa
     grep -v '^>' chrmt.fsa >> chrM.fa
     foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M)
 	mv chr$i.fa ../../$i
     end
     gzip -4 *.fsa
 
 # Download other info from the FTP site.
     wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature
     wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/gene_registry
     wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation
     wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/oracle_schema
     wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/protein_info
     wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence_similarity
     wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/systematic_results
     wget -nH --cut-dirs=5 -r -l 2 ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein
 
     # Check that the genome is in sync with the annotations
     cd /cluster/data/sacCer1
     checkSgdSync download
     # This showed 5 of 5788 with no ATG.  I sent these to Mike Cherry.
 
 # CREATING DATABASE, MAKE AND LOAD NIBS (DONE 2003-11-24 - Jim)
 # NOTE FOR YEAST WE DO NOT REPEAT MASK SEQUENCE.
     ssh hgwdev
     echo 'create database sacCer1' | hgsql ''
     cd /cluster/data/sacCer1
     hgNibSeq sacCer1 /cluster/data/sacCer1/nib */*.fa
     faSize -detailed */*.fa > chrom.sizes
     mkdir -p /gbdb/sacCer1/nib
     ln -s /cluster/data/sacCer1/nib/*.nib /gbdb/sacCer1/nib
     # Create a read-only alias in your .cshrc or .profile
     alias sacCer1 mysql -u hguser -phguserstuff -A sacCer1
     # Use df to make sure there is at least 5 gig of free mySql table space
     df -h /var/lib/mysql
 
 # CREATING GRP TABLE FOR TRACK GROUPING (DONE 2003-11-21 - Jim)
     ssh hgwdev
     #  the following command copies all the data from the table
     #	grp in the database hg16 to our new database sacCer1
     echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
       | hgsql sacCer1
 
 # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR YEAST (DONE 2003-11-24 Jim)
     # Warning: genome and organism fields must correspond
     # with defaultDb values
     echo 'INSERT INTO dbDb \
         (name, description, nibPath, organism, \
                 defaultPos, active, orderKey, genome, scientificName, \
                 htmlPath, hgNearOk) values \
         ("sacCer1", "Oct. 2003", "/gbdb/sacCer1/nib", "Yeast", \
                "chr2:827700-845800", 1, 65, "Yeast", \
                 "Saccharomyces cerevisiae", "/gbdb/sacCer1/html/description.html", \
                 0);' \
       | hgsql -h genome-testdb hgcentraltest
     echo 'INSERT INTO defaultDb (genome, name) values ("Yeast", "sacCer1");' \
       | hgsql -h genome-testdb hgcentraltest
 
     # Make trackDb table so browser knows what tracks to expect:
     ssh hgwdev
     cd ~/src/hg/makeDb/trackDb
     cvs up -d -P
 
     # Edit that makefile to add sacCer1 in all the right places and do
     make update
 
     # go public on genome-test
     #make alpha
     cvs commit makefile
 
     # Add trackDb directories
     mkdir sacCer
     mkdir sacCer/sacCer1
     cvs add sacCer
     cvs add sacCer/sacCer1
     cvs commit sacCer
 
 # MAKE GCPERCENT (DONE 2003-11-24 - Jim)
      ssh hgwdev
      mkdir /cluster/data/sacCer1/bed/gcPercent
      cd /cluster/data/sacCer1/bed/gcPercent
      # create and load gcPercent table
      hgsql sacCer1  < ~/src/hg/lib/gcPercent.sql
      hgGcPercent sacCer1 ../../nib
 
 # RUN TANDEM REPEAT MASKER (DONE 2003-11-24 - Jim)
     # Do tandem repeat masking - this takes about 2 minutes.
     ssh hgwdev
     mkdir -p /cluster/data/sacCer1/bed/simpleRepeat
     cd /cluster/data/sacCer1
     foreach i (? ??)
 	trfBig $i/chr$i.fa /dev/null \
 	-bedAt=/cluster/data/sacCer1/bed/simpleRepeat/chr$i.bed
     end
     # Load into the database
     cd /cluster/data/sacCer1/bed/simpleRepeat
     hgLoadBed sacCer1 simpleRepeat *.bed \
       -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
     # Loaded 1316 elements of size 16
     featureBits sacCer1 simpleRepeat
     # 82600648 bases of 2627444668 (3.144%) in intersection
     # NOTE(review): denominator matches the human genome size, not yeast (~12 Mb)
     # -- this output was likely pasted from another build; verify.
 
 # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (done)
     ssh hgwdev
     # Write ~/kent/src/hg/makeDb/trackDb/sacCer/sacCer1/description.html 
     # with a description of the assembly and some sample position queries.  
     chmod a+r ~/kent/src/hg/makeDb/trackDb/sacCer/sacCer1/description.html
     # Check it in 
     mkdir -p /gbdb/sacCer1/html
     ln -s /cluster/data/sacCer1/html/description.html /gbdb/sacCer1/html/
     # Create line in trackDb/makefile that copies description.html into
     #     /cluster/data/sacCer1/html/description.html  
     # Note, you definitely want to make the symbolic link in /gbdb/sacCer/html
     # before doing this.
 
 # MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 2003-11-24 Jim)
 # AND SET UP BLAT SERVERS
     ssh blat10
     cd /scratch
     mkdir sacCer1Nib
     scp 'hgwdev:/cluster/data/sacCer1/nib/*.nib' sacCer1Nib
     # Ask admins to set up blat servers and ask which ports they assign.
     # 8/26/04: set canPcr=1 for untranslated blat server.
     ssh hgwdev
     echo 'insert into blatServers values("sacCer1", "blat10", 17788, 0, 1); \
           insert into blatServers values("sacCer1", "blat10", 17789, 1, 0);' \
       | hgsql -h genome-testdb hgcentraltest
 
 # CREATING SGD-BASED KNOWN GENES AND OTHER FEATURES (DONE 2003-12-02 Jim)
 # Note initially the s_cerevisiae.gff3 file ended up being out of
 # sync with the genome. Mike Cherry (cherry@genome.stanford.edu)
 # regenerated it.  The format may end up changing in the future though.
     ssh hgwdev
     cd /cluster/data/sacCer1/bed
     mkdir sgdGene
     hgSgdGff3 ../download/chromosomal_feature/s_cerevisiae.gff3 sgdGene
     cd sgdGene
     ldHgGene sacCer1 sgdGene codingGenes.gff
     hgLoadBed sacCer1 sgdOther otherFeatures.bed \
         -tab -sqlTable=$HOME/kent/src/hg/lib/sgdOther.sql
     zcat ../../download/orf_protein/*.fasta.gz \
         | hgSgdPep -restrict=genePred.tab stdin sgdPep.fa symbol.txt
     hgPepPred sacCer1 generic sgdPep sgdPep.fa
     echo 'create table sgdToName ( \
           name varchar(10) not null, \
 	  value varchar(10) not null, \
 	  PRIMARY KEY(name), \
 	  INDEX (value));' | hgsql sacCer1
     echo 'load data local infile "symbol.txt" \
           into table sgdToName;' | hgsql sacCer1
     hgsql sacCer1 < $HOME/kent/src/hg/lib/sgdDescription.sql
     echo 'load data local infile "descriptions.txt" \
           into table sgdDescription;' | hgsql sacCer1
     hgsql sacCer1 < $HOME/kent/src/hg/lib/sgdOtherDescription.sql
     echo 'load data local infile "notes.txt" \
           into table sgdOtherDescription;' | hgsql sacCer1
 
 # ADDING SWISSPROT ACCESSION TO KNOWN GENES (DONE 2003-11-25 Jim)
     ssh hgwdev
     cd /cluster/data/sacCer1/bed/sgdGene
     awk '$2 == "SwissProt" {printf("%s\t%s\n", $3, $1);}' \
         ../../download/chromosomal_feature/external_id.tab \
 	> sgdToSwissProt.txt
     echo 'create table sgdToSwissProt ( \
           name varchar(10) not null, \
 	  value varchar(10) not null, \
 	  PRIMARY KEY(name), \
 	  INDEX (value));' | hgsql sacCer1
     echo 'load data local infile "sgdToSwissProt.txt" \
           into table sgdToSwissProt;' | hgsql sacCer1
     hgProtIdToGenePred sacCer1 sgdGene sgdToSwissProt name value
 
 # CREATE SGD-BASED CLONE TRACK (DONE 2003-11-25 Jim)
     ssh hgwdev
     cd /cluster/data/sacCer1/bed
     mkdir sgdClone
     cd sgdClone
     awk -F '\t' '{printf("chr%s\t%d\t%d\t%s\t%s\n", $3, $4-1, $5, $2, $1);}' \
     	../../download/chromosomal_feature/clone.tab > sgdClone.bed
     hgLoadBed sacCer1 sgdClone  sgdClone.bed -tab \
         -sqlTable=$HOME/kent/src/hg/lib/sgdClone.sql
 
 # AUTO UPDATE GENBANK MRNA RUN  (Done 2003-11-24 Jim)
 
     # Put the nib's on /cluster/bluearc:
     ssh eieio
     mkdir -p /cluster/bluearc/sacCer/sacCer1/nib
     cp -pR /cluster/data/sacCer1/nib/*.nib /cluster/bluearc/sacCer/sacCer1/nib
 
     # Instructions for setting up incremental genbank updates are here:
     # http://www.soe.ucsc.edu/~markd/genbank-update/doc/initial-load.html
 
     # Added
 static char *sacCerNames[] = {"Saccharomyces cerevisiae", NULL};
     and
 {"sacCer", sacCerNames, NULL},
     to appropriate parts of src/hg/makeDb/genbank/src/lib/gbGenome.c
     # Then make and make install
     cd src/hg/makeDb/genbank
     make PREFIX=/cluster/store5/genbank install
 
     # Edit /cluster/data/genbank/etc/genbank.conf and add:
 # sacCer1
 sacCer1.genome = /cluster/bluearc/sacCer/sacCer1/nib/chr*.nib
 sacCer1.lift = no
 sacCer1.genbank.mrna.xeno.load = no
 sacCer1.genbank.est.xeno.load = no
 sacCer1.downloadDir = sacCer1
 
     # Do the alignments
     ssh eieio
     cd /cluster/data/genbank
     nice bin/gbAlignStep -iserver=no \
       -clusterRootDir=/cluster/bluearc/genbank \
       -verbose=1 -initial sacCer1
 
     # Load the results from the above
     ssh hgwdev
     cd /cluster/data/genbank
     nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad sacCer1
 
 # DOING MULTIPLE ALIGNMENT WITH OTHER YEAST  (Done Jim)
 # Grab sequence from all six yeast and massage
 # it so that fasta records all start with sacXXX
 # in the directory /cluster/data/sacCer1/bed/otherYeast/align
        ssh kkr1u00
        cd /cluster/data/sacCer1/bed/otherYeast/align
 
        # Create directory full of size info
        mkdir sizes
        foreach i (sac*)
            faSize $i -detailed > sizes/$i
        end
 
        # Create some working directories
        mkdir lav axtAll axtBest mafIn mafRawOut mafOut
 
        # Create little shell script to do blastz alignments
        cat > bz << end
 #!/bin/csh -ef
 blastz \$1 \$2 > \$3
 end
        chmod a+x bz
 
        # Create job spec to do all blastz alignments
        ls -1 ../../../*/chr*.fa > sacCer.lst
        ls -1 sac??? > other.lst
        cat > gsub << end
 #LOOP
 bz \$(path1) \$(path2) {check out line+ lav/\$(root1).\$(root2)}
 #ENDLOOP
 end
        gensub2 sacCer.lst other.lst gsub spec
 
        # Do parasol blastz run
        para create spec
        para try
        # Do some para checks and if all is well
        para push
 #Completed: 102 of 102 jobs
 #CPU time in finished jobs:      43279s     721.31m    12.02h    0.50d  0.001 y
 #IO & Wait Time:                   540s       9.00m     0.15h    0.01d  0.000 y
 #Average job time:                 430s       7.16m     0.12h    0.00d
 #Longest job:                     1458s      24.30m     0.41h    0.02d
 #Submission to last job:          3994s      66.57m     1.11h    0.05d
 
        # Convert from lav to axt
        cd lav
        foreach i (sacPar sacMik sacKud sacBay sacCas sacKlu)
            foreach j (*.$i)
 	       lavToAxt $j ../../../../nib ../$i ../axtAll/$j -fa
 	   end
 	   echo done $i
        end
        cd ..
 
        # Run axtBest
        cd axtAll
        foreach i (sacPar sacMik sacKud sacBay sacCas sacKlu)
            foreach j (*.$i)
 	       axtBest $j $j:r ../axtBest/$j
 	   end
 	   echo done $i
        end
        cd ..
 
        # Convert to maf
        cd axtBest
        foreach i (sacPar sacMik sacKud sacBay sacCas sacKlu)
            foreach j (*.$i)
 	       axtToMaf $j ../../../../chrom.sizes ../sizes/$i ../mafIn/$j -tPrefix=sacCer1.
 	   end
 	   echo done $i
        end
        cd ..
 
 
        # Run multiz
        cd mafIn
        set mz = /cluster/bin/penn/tba.9.13/multiz
        foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M)
           set c = chr$i
 	  $mz $c.sacPar $c.sacMik - > ../mafRawOut/$c.maf
 	  echo done $c.sacMik
 	  foreach s (sacKud sacBay sacCas sacKlu)
 	      $mz $c.$s ../mafRawOut/$c.maf - > ../mafRawOut/tmp
 	      mv ../mafRawOut/tmp ../mafRawOut/$c.maf
 	      echo done $c.$s
 	  end
        end
        cd ..
 
        #Load into database
        ssh hgwdev
        cd /cluster/data/sacCer1/bed/otherYeast/align
        ln -s /cluster/store6/sacCer1/bed/otherYeast/align/mafOut /gbdb/sacCer1/multizYeast
        hgLoadMaf sacCer1 multizYeast
 
        # Clean up
        cd mafRawOut
       foreach i (*.maf)
           mafFilter -minScore=100 $i > ../mafOut/$i
       end
       cd ..
       rm -r axtAll axtBest lav err mafRawOut multizYeast.tab
 
 
 # ADDING LOCALIZATION AND ABUNDANCE INFO FROM SGD AND 
 # http://yeastgfp.ucsf.edu  				(DONE 2003-11-24 - Jim)
 	ssh hgwdev
 	cd /cluster/data/sacCer1/bed
 	mkdir sgdLocalization
 	cd sgdLocalization
 	hgSgdGfp sacCer1 ../../download/systematic_results/localization/OS*.tab .
 
 # UPDATE GO DATABASE  (DONE -Jim)
     ssh hgwdev
     cd /cluster/store1/geneOntology
     mkdir 20031202
     cd 20031202
     wget ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest/go_200311-termdb-data.gz
     wget ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_sptr.gz
     hgsql mysql <<end
     drop database go;
     create database go;
     end
     zcat go_*data.gz | hgsql go
     zcat gene_association.goa_sptr.gz | hgGoAssociation go goaPart stdin -taxon=9606,10090,10116,6239,7227,4932
 
 ######## MAKING FAMILY BROWSER TABLES #######
 
 # GETTING STARTED WITH FAMILY BROWSER (DONE 2003-11-29 Jim)
     # Making tables to cluster splice varients
     ssh hgwdev
     hgClusterGenes sacCer1 sgdGene sgdIsoforms sgdCanonical
 
     # Make self mapping table for expression. 
     echo 'create table sgdToSgd ( \
           name varchar(10) not null, \
 	  value varchar(10) not null, \
 	  PRIMARY KEY(name), \
 	  UNIQUE (value));' | hgsql sacCer1
     echo 'select name from sgdGene;' | hgsql -N sacCer1 \
          | awk '{printf("%s\t%s\n", $1, $1);}' > sgdToSgd.tab
     echo 'load data local infile "sgdToSgd.tab" into table sgdToSgd' \
          | hgsql sacCer1
 
     # Make expression similarity table. 
     cd ~/src/hg/near/hgExpDistance
     hgExpDistance sacCer1 hgFixed.yeastChoCellCycle hgFixed.yeastChoCellCycleExps choExpDistance 
 
 
 #DOING SELF-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
     # Extract peptides from genome database into fasta file
     # and create a blast database out of them. 
     mkdir -p /cluster/data/sacCer1/bed/blastp
     cd /cluster/data/sacCer1/bed/blastp
     pepPredToFa sacCer1 sgdPep sgdPep.faa
     formatdb -i sgdPep.faa -t sgdPep -n sgdPep
     cd ..
 
     # Copy over database to /scratch
     ssh kk
     mkdir -p /scratch/sacCer1/blastp
     cp /cluster/data/sacCer1/bed/blastp/sgdPep.p* /scratch/sacCer1/blastp
     sudo /cluster/install/utilities/updateLocal
 
     # Split up fasta file into bite sized chunks for cluster
     cd /cluster/data/sacCer1/bed/blastp
     mkdir split
     faSplit sequence sgdPep.faa 6000 split/kg
 
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/sacCer1/bed/blastp/self
     cd /cluster/data/sacCer1/bed/blastp/self
     mkdir run
     cd run
     mkdir out
 
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /scratch/blast/data
 /scratch/blast/blastall -p blastp -d /scratch/sacCer1/blastp/sgdPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1000
 end
     chmod a+x blastSome
 
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
 
     # Create parasol batch
     echo ../../split/*.fa | wordLine stdin > split.lst
     gensub2 split.lst single gsub spec
     para create spec
     para try
 
     # Wait a couple of minutes, and do a para check,  if all is good
     # then do a
     para push
     # This should finish in 1 minute if the cluster is free.
 #Completed: 5743 of 5743 jobs
 #CPU time in finished jobs:       3570s      59.50m     0.99h    0.04d  0.000 y
 #IO & Wait Time:                 14973s     249.55m     4.16h    0.17d  0.000 y
 #Average job time:                   3s       0.05m     0.00h    0.00d
 #Longest job:                       12s       0.20m     0.00h    0.00d
 #Submission to last job:            55s       0.92m     0.02h    0.00d
 
     # Load into database.  This takes a minute 
     ssh hgwdev
     cd /cluster/data/sacCer1/bed/blastp/self/run/out
     hgLoadBlastTab sacCer1 sgdBlastTab *.tab
 #Scanning through 5743 files
 #Loading database with 52725 rows
 
 
 #DOING C.ELEGANS-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/sacCer1/bed/blastp/ce1
     cd /cluster/data/sacCer1/bed/blastp/ce1
     mkdir run
     cd run
     mkdir out
 
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /scratch/blast/blastall -p blastp -d /iscratch/i/ce1/blastp/wormPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1
 end
     chmod a+x blastSome
 
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
 
     # Create parasol batch
     echo ../../split/*.fa | wordLine stdin > split.lst
     gensub2 split.lst single gsub spec
     para create spec
     para try
 
     # Wait a couple of minutes, and do a para check,  if all is good
     # then do a
     para push
 #Completed: 5743 of 5743 jobs
 #CPU time in finished jobs:       9397s     156.62m     2.61h    0.11d  0.000 y
 #IO & Wait Time:                 21849s     364.15m     6.07h    0.25d  0.001 y
 #Average job time:                   5s       0.09m     0.00h    0.00d
 #Longest job:                       26s       0.43m     0.01h    0.00d
 #Submission to last job:            75s       1.25m     0.02h    0.00d
 
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/sacCer1/bed/blastp/ce1/run/out
     hgLoadBlastTab sacCer1 ceBlastTab -maxPer=1 *.tab
 
 #DOING MOUSE-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
     # Make mouse ortholog column using blastp on mouse known genes.
     # First make mouse protein database and copy it to iscratch/i
     # if it doesn't exist already
     cd /cluster/data/mm4/bed
     mkdir blastp
     cd blastp
     pepPredToFa mm4 knownGenePep known.faa
     formatdb -i known.faa -t known -n known
     ssh kkr1u00
     if (-e /iscratch/i/mm4/blastp) then
        rm -r /iscratch/i/mm4/blastp
     endif
     mkdir -p /iscratch/i/mm4/blastp
     cp /cluster/data/mm4/bed/blastp/known.p?? /iscratch/i/mm4/blastp
     iSync
 
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/sacCer1/bed/blastp/mm4
     cd /cluster/data/sacCer1/bed/blastp/mm4
     mkdir run
     cd run
     mkdir out
 
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /scratch/blast/blastall -p blastp -d /iscratch/i/mm4/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
 end
 chmod a+x blastSome
 
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
 
     # Create parasol batch
     echo ../../split/*.fa | wordLine stdin > split.lst
     gensub2 split.lst single gsub spec
     para create spec
     para try
 
     # Wait a couple of minutes, and do a para check,  if all is good
     # then do a
     para push
 #Completed: 5743 of 5743 jobs
 #CPU time in finished jobs:      13913s     231.88m     3.86h    0.16d  0.000 y
 #IO & Wait Time:                 27267s     454.45m     7.57h    0.32d  0.001 y
 #Average job time:                   7s       0.12m     0.00h    0.00d
 #Longest job:                       40s       0.67m     0.01h    0.00d
 #Submission to last job:            80s       1.33m     0.02h    0.00d
 
 
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/sacCer1/bed/blastp/mm4/run/out
     hgLoadBlastTab sacCer1 mmBlastTab -maxPer=1 *.tab
 
 #DOING ZEBRAFISH-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
     # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
     # First make protein database and copy it to iscratch/i
     # if it doesn't exist already
     cd /cluster/data/dr1/bed
     mkdir blastp
     cd blastp
     wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH2.pep.fa.gz 
     zcat Dan*.pep.fa.gz > ensembl.faa
     echo "Translation:" > subs.in
     subs -e ensembl.faa > /dev/null
     formatdb -i ensembl.faa -t ensembl -n ensembl
     ssh kkr1u00
     if (-e /iscratch/i/dr1/blastp) then
        rm -r /iscratch/i/dr1/blastp
     endif
     mkdir -p /iscratch/i/dr1/blastp
     cp /cluster/data/dr1/bed/blastp/ensembl.p?? /iscratch/i/dr1/blastp
     iSync
 
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/sacCer1/bed/blastp/dr1
     cd /cluster/data/sacCer1/bed/blastp/dr1
     mkdir run
     cd run
     mkdir out
 
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /scratch/blast/blastall -p blastp -d /iscratch/i/dr1/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
 end
     chmod a+x blastSome
 
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
 
     # Create parasol batch
     echo ../../split/*.fa | wordLine stdin > split.lst
     gensub2 split.lst single gsub spec
     para create spec
     para try
 
     # Wait a couple of minutes, and do a para check,  if all is good
     # then do a
     para push
 #Completed: 5743 of 5743 jobs
 #CPU time in finished jobs:      11135s     185.58m     3.09h    0.13d  0.000 y
 #IO & Wait Time:                 23501s     391.69m     6.53h    0.27d  0.001 y
 #Average job time:                   6s       0.10m     0.00h    0.00d
 #Longest job:                       40s       0.67m     0.01h    0.00d
 #Submission to last job:            71s       1.18m     0.02h    0.00d
 
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/sacCer1/bed/blastp/dr1/run/out
     hgLoadBlastTab sacCer1 drBlastTab -maxPer=1 *.tab
 
 #DOING FRUITFLY-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
     # Make Drosophila melanagaster ortholog column using blastp on FlyBase.
     # First make SwissProt protein database and copy it to iscratch/i
     # if it doesn't exist already
     cd /cluster/data/dm1/bed
     mkdir blastp
     cd blastp
     pepPredToFa dm1 bdgpGenePep bdgp.faa
     formatdb -i bdgp.faa -t bdgp -n bdgp
     ssh kkr1u00
     if (-e /iscratch/i/dm1/blastp) then
        rm -r /iscratch/i/dm1/blastp
     endif
     mkdir -p /iscratch/i/dm1/blastp
     cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp
     iSync
 
 
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/sacCer1/bed/blastp/dm1
     cd /cluster/data/sacCer1/bed/blastp/dm1
     mkdir run
     cd run
     mkdir out
 
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /scratch/blast/blastall -p blastp -d /iscratch/i/dm1/blastp/bdgp -i \$1 -o \$2 -e 0.01 -m 8 -b 1
 end
 chmod a+x blastSome
 
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
 
     # Create parasol batch
     echo ../../split/*.fa | wordLine stdin > split.lst
     gensub2 split.lst single gsub spec
     para create spec
     para try
 
     # Wait a couple of minutes, and do a para check,  if all is good
     # then do a
     para push
 #Completed: 5743 of 5743 jobs
 #CPU time in finished jobs:       9749s     162.49m     2.71h    0.11d  0.000 y
 #IO & Wait Time:                 22247s     370.78m     6.18h    0.26d  0.001 y
 #Average job time:                   6s       0.09m     0.00h    0.00d
 #Longest job:                       28s       0.47m     0.01h    0.00d
 #Submission to last job:            64s       1.07m     0.02h    0.00d
 
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/sacCer1/bed/blastp/dm1/run/out
     hgLoadBlastTab sacCer1 dmBlastTab -maxPer=1 *.tab
 
 #DOING HUMAN-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
     # Make human ortholog column using blastp on human known genes.
     # First make human protein database and copy it to iscratch/i
     # if it doesn't exist already
     cd /cluster/data/hg16/bed
     mkdir blastp
     cd blastp
     pepPredToFa hg16 knownGenePep known.faa
     formatdb -i known.faa -t known -n known
     ssh kkr1u00
     if (-e /iscratch/i/hg16/blastp) then
        rm -r /iscratch/i/hg16/blastp
     endif
     mkdir -p /iscratch/i/hg16/blastp
     cp /cluster/data/hg16/bed/blastp/known.p?? /iscratch/i/hg16/blastp
     iSync
 
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/sacCer1/bed/blastp/hg16
     cd /cluster/data/sacCer1/bed/blastp/hg16
     mkdir run
     cd run
     mkdir out
 
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /scratch/blast/blastall -p blastp -d /iscratch/i/hg16/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
 end
 chmod a+x blastSome
 
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
     # << this line makes emacs coloring happy
 
     # Create parasol batch
     echo ../../split/*.fa | wordLine stdin > split.lst
     gensub2 split.lst single gsub spec
     para create spec
     para try
 
     # Wait a couple of minutes, and do a para check,  if all is good
     # then do a
     para push
 #Completed: 5743 of 5743 jobs
 #CPU time in finished jobs:      16096s     268.27m     4.47h    0.19d  0.001 y
 #IO & Wait Time:                 29943s     499.05m     8.32h    0.35d  0.001 y
 #Average job time:                   8s       0.13m     0.00h    0.00d
 #Longest job:                       65s       1.08m     0.02h    0.00d
 #Submission to last job:           100s       1.67m     0.03h    0.00d
 
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/sacCer1/bed/blastp/hg16/run/out
     hgLoadBlastTab sacCer1 hgBlastTab -maxPer=1 *.tab
 
 # LOADING ERAN SEGAL'S REGULATORY MODULE STUFF  (DONE 2004-9-22 -Jim)
     cd /cluster/data/sacCer1/bed
     mkdir eranSegal
     cd eranSegal
     # Copy module_assignments.tab, motif_attributes.tab, motif_modules.tab
     # from email from <eran@cs.stanford.edu> into this directory.
     echo "select name,chrom,txStart,txEnd,strand,500,0 from sgdGene;" | \
         hgsql -N sacCer1 > gene_position.tab
     echo "select name,chrom,chromStart,chromEnd,strand,500,0 from sgdOther;" | \
         hgsql -N sacCer1 >> gene_position.tab
     hgLoadEranModules sacCer1 module_assignments.tab motif_modules.tab \
     	modules_motifs.gxm modules_motif_positions.tab gene_position.tab
 
 # LOADING REGULATORY CODE STUFF (In Progress 2004-9-22 -Jim)
     cd /cluster/data/sacCer1/bed
     mkdir harbisonGordon
     cd harbisonGordon
 
     # Create growthCondition.tab by editing by hand
     # http://jura.wi.mit.edu/young_public/regulatory_code/GrowthEnvironments.html
     # and then load:
     hgsql sacCer1 < $HOME/kent/src/hg/lib/growthCondition.sql
     hgsql sacCer1 -e 'load data local infile "growthCondition.tab" into table growthCondition'
 
     # Get GFF files and motif file from downloads section of 
     # http://jura.wi.mit.edu/fraenkel/regulatory_map
     # Also get v24_probes_041004.GFF from Ben Gordon dbgordon@wi.mit.edu via email.
     # Get Conditions_Summary.txt also from Ben from email.
 
     sort v24_probes_041004.GFF | uniq > probes.GFF
     hgYeastRegCode motifGff Final_InTableS2_v24.motifs probes.GFF Conditions_Summary.txt\
     	transRegCode.bed transRegCodeMotif.tab transRegCodeProbe.bed transRegCodeCondition.tab
     hgLoadBed sacCer1 transRegCode transRegCode.bed -sqlTable=$HOME/kent/src/hg/lib/transRegCode.sql
     hgLoadBed sacCer1 transRegCodeProbe transRegCodeProbe.bed -sqlTable=$HOME/kent/src/hg/lib/transRegCodeProbe.sql -tab
     hgsql sacCer1 < $HOME/kent/src/hg/lib/transRegCodeCondition.sql
     hgsql sacCer1 -e 'load data local infile "transRegCodeCondition.tab" into table transRegCodeCondition'
     hgsql sacCer1 < $HOME/kent/src/hg/lib/transRegCodeMotif.sql
     hgsql sacCer1 -e 'load data local infile "transRegCodeMotif.tab" into table transRegCodeMotif'
     # Get rid of some crufty motif placements Ben Gordon doesn't like anymore that aren't in the
     # transRegCodeMotif table.
     hgsql sacCer1 -e 'delete from transRegCode where name="CRZ1" or name="ECM22" or name="SFL1"'
     # Trim some motif columns
     fixHarbisonMotifs sacCer1
 
 #CREATING DOWNLOADS DIRECTORY
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath
     mkdir sacCer1
     cd sacCer1
     mkdir bigZips database
     cd bigZips
     zip -j chromFa.zip /cluster/data/sacCer1/*/chr*.fa
     zip -j multizYeast.zip /gbdb/sacCer1/multizYeast/*.maf
     # Create a README.txt file here.
 
 
 # MITOPRED DATA FOR HGGENE (DONE 7/30/04 angie)
     ssh hgwdev
     mkdir /cluster/data/sacCer1/bed/mitopred
     cd /cluster/data/sacCer1/bed/mitopred
     wget http://mitopred.sdsc.edu/data/yst_30.out
     perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' yst_30.out > mitopred.tab
     cat > mitopred.sql << '_EOF_'
 # Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
 CREATE TABLE mitopred (
     name varchar(10) not null,      # SwissProt ID
     confidence varchar(8) not null, # Confidence level
               #Indices
     PRIMARY KEY(name(6))
 );
 '_EOF_'
     # << this line makes emacs coloring happy
     hgsql sacCer1 < mitopred.sql
     hgsql sacCer1 -e 'load data local infile "mitopred.tab" into table mitopred'
 
 # MAKE Human Proteins track (DONE 2004-08-25 braney)
     ssh kksilo
     mkdir -p /cluster/data/sacCer1/blastDb
     cd /cluster/data/sacCer1/blastDb
     for i in ../*/*.fa; do ln -s $i .; done
     for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
     rm *.fa *.log
 
     ssh kkr1u00
     mkdir -p /iscratch/i/sacCer1/blastDb
     cp /cluster/data/sacCer1/blastDb/* /iscratch/i/sacCer1/blastDb
     (~kent/bin/iSync) 2>&1 > sync.out
     
     mkdir -p /cluster/data/sacCer1/bed/tblastn.hg16KG
     cd /cluster/data/sacCer1/bed/tblastn.hg16KG
     ls -1S /iscratch/i/sacCer1/blastDb/*.nsq | sed "s/\.nsq//" > yeast.lst
     exit
 
     # back to kksilo
     cd /cluster/data/sacCer1/bed/tblastn.hg16KG
     mkdir kgfa
     ls -1S /iscratch/i/squirt/ci1/kgfas/*.fa > kg.lst
     mkdir blastOut
     for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
     cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
 #ENDLOOP
 '_EOF_'
     cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/iscratch/i/blast/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
 if /cluster/bin/i386/blastToPsl $f.1 $f.2
 then
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg16/bed/blat.hg16KG/protein.lft warn $f.2
 	if pslCheck -prot $3.tmp
 	then
 	    mv $3.tmp $3
 	    rm -f $f.1 $f.2 
 	    exit 0
 	fi
     fi
 fi
 rm -f $f.1 $f.2 $3.tmp $f.8
 exit 1
 '_EOF_'
 
     chmod +x blastSome
     gensub2 yeast.lst kg.lst blastGsub blastSpec
 
     ssh kk
     cd /cluster/data/sacCer1/bed/tblastn.hg16KG
     para create blastSpec
     para try, push
 
     cat << '_EOF_' > chainGsub
 #LOOP
 chainSome $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainSome
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=7500 stdin ../c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainSome
 
     ls -1dS `pwd`/blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
 
     ssh kki
     cd /cluster/data/sacCer1/bed/tblastn.hg16KG
     para create chainSpec
     para push
 
     exit
     # back to kksilo
     cd /cluster/data/sacCer1/bed/tblastn.hg16KG/blastOut
     for i in kg??
     do 
 	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
 	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
 	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
 	echo $i
     done
 
     cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n  | uniq  > ../blastHg16KG.psl
 
     ssh hgwdev
     cd /cluster/data/sacCer1/bed/tblastn.hg16KG
     hgLoadPsl sacCer1 blastHg16KG.psl
     exit
 
     # back to kksilo
     rm -rf blastOut
 
 # End tblastn
 
 # BLAT SGD predicted sacCer1 proteins against sacCer1 (DONE 2004-08-31 braney)
     ssh hgwdev
     cd /cluster/data/sacCer1/bed
     mkdir blat.sacCer1SG
     cd  blat.sacCer1SG
     pepPredToFa sacCer1 sgdPep sacCer1SG.fa
     hgPepPred sacCer1 generic blastSGPep00 sacCer1SG.fa
     ssh kk
     cd /cluster/data/sacCer1/bed/blat.sacCer1SG
     cat << '_EOF_' > blatSome
 #!/bin/csh -fe
 /cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
 '_EOF_'
     # << keep emacs happy
     chmod +x blatSome
     ls -1S /cluster/bluearc/sacCer/sacCer1/nib/*.nib > yeast.lst
     mkdir sgfas
     cd sgfas
     faSplit sequence ../sacCer1SG.fa 1000 sg
     cd ..
     ls -1S sgfas/*.fa > sg.lst
     cat << '_EOF_' > blatGsub
 #LOOP
 blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << keep emacs happy
     gensub2 yeast.lst sg.lst blatGsub blatSpec
     mkdir psl
     cd psl
     foreach i (`cat ../yeast.lst`)
 	mkdir `basename $i .nib`
     end
     cd ..
     para create blatSpec
     para push
 
 # Completed: 16490 of 16490 jobs
 # CPU time in finished jobs:      52796s     879.94m    14.67h    0.61d  0.002 y
 # IO & Wait Time:                 44955s     749.25m    12.49h    0.52d  0.001 y
 # Average job time:                   6s       0.10m     0.00h    0.00d
 # Longest job:                       22s       0.37m     0.01h    0.00d
 # Submission to last job:           714s      11.90m     0.20h    0.01d
 
     ssh kksilo
     cd /cluster/data/sacCer1/bed/blat.sacCer1SG
     pslSort dirs raw.psl /tmp psl/*
     pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cov90.psl /dev/null
     pslMaxMap -maxMap=1 cov90.psl sacCer1SG.psl
     pslxToFa sacCer1SG.psl sacCer1SG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
     sgName sacCer1 sacCer1SG.psl blastSGRef00
     ssh hgwdev
     cd /cluster/data/sacCer1/bed/blat.sacCer1SG
     hgsql sacCer1 < ~/kent/src/hg/lib/blastRef.sql
     echo "rename table blastRef to blastSGRef00" | hgsql sacCer1
     echo "load data local infile 'blastSGRef00' into table blastSGRef00" | hgsql sacCer1
 
 # PHASTCONS AND PHASTCONS ELEMENTS (acs, 9/15/04)
     ssh hgwdev
     cd /cluster/data/sacCer1/bed/otherYeast
     mkdir phastCons
     cd phastCons
 
     # split up alignments; no need to use cluster here
     MAF=/cluster/data/sacCer1/bed/otherYeast/align/mafOut
     FA=/cluster/data/sacCer1
     CHROMS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M"
     mkdir -p SS
     for i in $CHROMS ; do \
 	echo chr${i} ; \
 	msa_view $MAF/chr${i}.maf -M $FA/$i/chr${i}.fa -i MAF -o SS -O sacCer1,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu > SS/chr${i}.ss ; \
     done
 
     # produce rough starting model
     msa_view -i SS --aggregate sacCer1,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu -o SS -z SS/*.ss > all.ss
     phyloFit -i SS all.ss --tree "((((((sacCer1,sacPar),sacMik),sacKud),sacBay),sacCas),sacKlu)" -o starting-tree
     # (cheat the branch lengths up a bit, since this represents an
     # average of conserved and nonconserved sites; we want to
     # initialize for nonconserved)
     tree_doctor  --scale 2 starting-tree.mod > tmp.mod
     mv tmp.mod starting-tree.mod 
 
     # also get average GC content of aligned regions
     msa_view -i SS all.ss --summary-only
 #descrip.                      A          C          G          T        G+C  length   all_gaps  some_gaps
 #all.ss                   0.3052     0.1954     0.1951     0.3042     0.3906   13251474          0    2260153
 
     # save some I/O and space
     gzip SS/*.ss
     
     # now set up cluster job to estimate model parameters.  See
     # makeHg17.doc for add'l details
 
     cat << 'EOF' > doEstimate.sh
 #!/bin/sh
 zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.3906 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 75 --target-coverage 0.5 --quiet --log $2 --estimate-trees $3
 EOF
     chmod u+x doEstimate.sh
 
     rm -fr LOG TREES
     mkdir -p LOG TREES
     rm -f jobs.lst
     for f in SS/*.ss.gz ; do
 	root=`basename $f .ss.gz` ;\
 	echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;\
     done
 
     # run cluster job
     ssh kk9... para create ...
     # (takes about 15 minutes)
 
     # Now combine parameter estimates.  
     ls TREES/*.cons.mod > cons.txt
     phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
     ls TREES/*.noncons.mod > noncons.txt
     phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
 
    # Now set up cluster job for computing conservation scores and
     # predicted elements
 cat << 'EOF' > doPhastCons.sh
 #!/bin/sh
 
 mkdir -p POSTPROBS ELEMENTS
 pref=`basename $1 .ss.gz`
 chr=`echo $pref | awk -F\. '{print $1}'`
 tmpfile=/scratch/phastCons.$$
 zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 75 --target-coverage 0.5 --quiet --seqname $chr --idpref $pref --viterbi ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
 gzip -c $tmpfile > POSTPROBS/$pref.pp.gz
 rm $tmpfile
 EOF
     chmod u+x doPhastCons.sh
 
     rm -fr POSTPROBS ELEMENTS
     rm -f jobs2.lst
     for f in SS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs2.lst ; done
 
     # run cluster job
     ssh kk9... para create, ...
     # (takes about 30 sec)
 
     # set up phastConsElements track
     awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' ELEMENTS/*.bed > all.raw.bed 
     /cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
     hgLoadBed sacCer1 phastConsElements all.bed
     featureBits sacCer1 phastConsElements
 # 7517116 bases of 12156302 (61.837%) in intersection
 
     # set up wiggle
     mkdir -p wib
     for i in $CHROMS ; do \
 	wigAsciiToBinary -chrom=chr${i} -wibFile=wib/chr${i}_phastCons POSTPROBS/chr${i}.pp.gz ;\
     done
     #	Changed this path 2004-09-27 so a new set of data could go out
     #	under the same table name - Hiram, was simply:
     #	/gbdb/sacCer1/wib/chr*_phastCons.wib
     hgLoadWiggle sacCer1 phastCons -pathPrefix=/gbdb/sacCer1/wib/phastCons wib/chr*_phastCons.wig
     mkdir -p /gbdb/sacCer1/wib/phastCons
     rm -f /gbdb/sacCer1/wib/phastCons/chr*_phastCons.wib
     ln -s /cluster/data/sacCer1/bed/otherYeast/phastCons/wib/*.wib /gbdb/sacCer1/wib/phastCons
     chmod 775 . wib /gbdb/sacCer1/wib/phastCons /gbdb/sacCer1/wib/phastCons/*.wib
     chmod 664 wib/*.wib
 
     # set up full alignment/conservation track
     rm -rf pwMaf /gbdb/sacCer1/pwMaf
     mkdir -p pwMaf /gbdb/sacCer1/pwMaf
     cd pwMaf ;\
     for org in sacBay sacCas sacKlu sacKud sacMik sacPar ; do \
 	mkdir -p $org ; \
 	cp /cluster/data/sacCer1/bed/otherYeast/align/mafIn/chr*.$org $org ; \
 	for f in $org/* ; do chr=`basename $f .$org` ; mv $f $org/$chr.maf ; done ;\
 	ln -s /cluster/data/sacCer1/bed/otherYeast/phastCons/pwMaf/$org /gbdb/sacCer1/pwMaf/${org}_pwMaf ; \
 	hgLoadMaf sacCer1 -warn ${org}_pwMaf -pathPrefix=/gbdb/sacCer1/pwMaf/${org}_pwMaf ;\
     done
     cd ..
     chmod 755 pwMaf pwMaf/* /gbdb/sacCer1/pwMaf /gbdb/sacCer1/pwMaf/*
     chmod 644 pwMaf/*/*.maf
 
     # trackDb entries:
 #track multizYeast 
 #shortLabel Conservation
 #longLabel Seven Species of Saccharomyces, Alignments & Conservation
 #group compGeno
 #priority 100
 #visibility pack
 #type wigMaf 0.0 1.0
 #maxHeightPixels 100:40:11
 #wiggle phastCons
 #yLineOnOff Off
-#autoScaleDefault Off
+#autoScale Off
 #pairwise pwMaf
 #speciesOrder sacPar sacMik sacKud sacBay sacCas sacKlu
 
 #track phastConsElements
 #shortLabel Most Conserved
 #longLabel PhastCons Conserved Elements (Seven Species of Saccharomyces)
 #group compGeno
 #priority 105
 #visibility hide
 #spectrum on
 #exonArrows off
 #showTopScorers 200
 #type bed 5 .
 
 
 # MAF COVERAGE FIGURES FOR ADAM (DONE 10/18/04 angie)
     # First, get ranges of target coverage:
     ssh kksilo
     mkdir /cluster/data/sacCer1/bed/otherYeast/align/coverage
     cd /cluster/data/sacCer1/bed/otherYeast/align/coverage
     cat /cluster/data/sacCer1/bed/otherYeast/align/mafOut/*.maf \
     | nice mafRanges -notAllOGap stdin sacCer1 sacCer1.mafRanges.bed
     # Get pairwise coverage as well.
     foreach other (sacBay sacCas sacKlu sacKud sacMik sacPar)
       cat /cluster/data/sacCer1/bed/otherYeast/align/mafIn/chr*.$other \
       | nice mafRanges -notAllOGap stdin sacCer1 sacCer1.$other.mafRanges.bed
     end
 
     ssh hgwdev
     cd /cluster/data/sacCer1/bed/otherYeast/align/coverage
     # To make subsequent intersections a bit quicker, output a bed with 
     # duplicate/overlapping ranges collapsed:
     nice featureBits sacCer1 sacCer1.mafRanges.bed \
       -bed=sacCer1.mafRangesCollapsed.bed
 #11718319 bases of 12156302 (96.397%) in intersection
     # mafCoverage barfs on chr15 currently, so pass on this for now:
     #cat /cluster/data/sacCer1/bed/otherYeast/align/mafOut/*.maf \
     #| nice mafCoverage -count=2 sacCer1 stdin > sacCer1.mafCoverage
 
     # Intersect maf target coverage with gene regions:
     nice featureBits sacCer1 -enrichment sgdGene:cds \
       sacCer1.mafRangesCollapsed.bed \
       -bed=sacCer1.mafCds.bed
 #sgdGene:cds 69.491%, sacCer1.mafRangesCollapsed.bed 96.397%, both 69.114%, cover 99.46%, enrich 1.03x
     # No UTR info for yeast, so can't intersect.
 
     # Intersect pairwise target coverages with gene regions:
     foreach other (sacBay sacCas sacKlu sacKud sacMik sacPar)
       nice featureBits sacCer1 -enrichment sgdGene:cds \
         sacCer1.$other.mafRanges.bed -bed=sacCer1.${other}Cds.bed \
       |& grep -v "table gap doesn't exist"
     end
 #sgdGene:cds 69.491%, sacCer1.sacBay.mafRanges.bed 89.478%, both 66.581%, cover 95.81%, enrich 1.07x
 #sgdGene:cds 69.491%, sacCer1.sacCas.mafRanges.bed 63.359%, both 55.560%, cover 79.95%, enrich 1.26x
 #sgdGene:cds 69.491%, sacCer1.sacKlu.mafRanges.bed 56.086%, both 49.245%, cover 70.87%, enrich 1.26x
 #sgdGene:cds 69.491%, sacCer1.sacKud.mafRanges.bed 89.684%, both 64.873%, cover 93.35%, enrich 1.04x
 #sgdGene:cds 69.491%, sacCer1.sacMik.mafRanges.bed 92.989%, both 67.178%, cover 96.67%, enrich 1.04x
 #sgdGene:cds 69.491%, sacCer1.sacPar.mafRanges.bed 96.550%, both 68.464%, cover 98.52%, enrich 1.02x
 
 # CREATE TABLES TO SUPPORT SHOWING SAM-T02 RESULTS (DONE 1/4/05, Fan)
# Kevin just did his monthly update, so rebuild ... (REBUILT 1/7/05 Fan)
     ssh hgwdev
     cd /cluster/data/sacCer1/bed
     mkdir sam
     cd sam
 
 # create script to process 1 SAM subdirectory
     cat << '_EOF_' >do1Subdir
 ls /projects/compbio/experiments/protein-predict/yeast/$1/*/summary.html >j1.tmp
 cat j1.tmp |sed -e 's/yeast\// /g' |sed -e 's/\/summary/ /g' >j2.tmp
 cat j2.tmp | awk '{print $2}' |sed -e 's/\// /g ' | awk '{print "do1 " $1 " " $2}' >> doall
 '_EOF_'
 
     chmod +x do1Subdir
 
 # create high level script to parse all SAM results
     ls -l /projects/compbio/experiments/protein-predict/yeast | grep drw >j1
     cp j1 j2
 
 #edit j2 to remove non SAM result subdirectories
     vi j2
     
     cat j2 |awk '{print "do1Subdir " $9}' >doAllSubdir
     chmod +x doAllSubdir
     rm j1 j2
     
     rm doall
     doAllSubdir
     rm j1.tmp j2.tmp
 
 # create data needed for the samSubdir table and load them
     cat doall |awk '{print $3"\t"$2}' >samSubdir.lis
     hgsql sacCer1 -e "drop table samSubdir"
     hgsql sacCer1 < ~/src/hg/lib/samSubdir.sql
     hgsql sacCer1 -e 'load data local infile "samSubdir.lis" into table samSubdir'
 
 # create script to parse SAM output results in one subdirectory
     cat << '_EOF_' >do1
 echo processing $2
 samHit $2 /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.best-scores.rdb >$2.tab
 '_EOF_'
 
     chmod +x do1
     chmod +x doall
 
 # run the top level script to parse all SAM output results
     rm *.tab
     doall
 # collect all results    
     cat *.tab |sort -u >protHomolog.all
     
 # load the results into the protHomolog table
     hgsql sacCer1 -e "drop table protHomolog"
     hgsql sacCer1 < ~/src/hg/lib/protHomolog.sql
     hgsql sacCer1 -e 'load data local infile "protHomolog.all" into table protHomolog'
 	
 # remove all intermediate .tab files
     rm *.tab
 
 # COPY PART OF SAM-T02 RESULTS TO UCSC GB SERVER (DONE 1/10/05, Fan)
 # Kevin is going to do monthly updates on SAM-T02 results.
 # To prevent data inconsistency problems, we now will copy 
 # over some key files and host them on our own server.
     ssh hgwdev
     cd /cluster/data/sacCer1/bed/sam
     mkdir yeast050107
     cd yeast050107
 
 # create script to process 1 SAM subdirectory
     cat << '_EOF_' >do1Subdir
 mkdir -p yeast/$1
 '_EOF_'
 
     chmod +x do1Subdir
     cp -p ../doAllSubdir .
     mkdir yeast
     doAllSubdir
 
     cp -p ../doall .
     
 # create script doall to copy over needed SAM output result files
     cat << '_EOF_' >do1
 echo processing $2
 mkdir yeast/$1/$2
 cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/*.jpg yeast/$1/$2
 cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.w0.5-logo.pdf yeast/$1/$2
 cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.dssp-ehl2-logo.pdf yeast/$1/$2
 cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.undertaker-align.pdb.gz yeast/$1/$2
 '_EOF_'
 
     chmod +x do1
     doall
 
     ln -s ./yeast /usr/local/apache/htdocs/goldenPath/sacCer1/sam    
 
# REBUILD MOUSE-PROTEIN ALIGNMENTS AND LOADING (DONE 2005-12-16 Fan)
     # Update mouse ortholog column using blastp on mouse known genes.
     # First make mouse protein database and copy it to iscratch/i
     # if it doesn't exist already
     cd /cluster/data/mm7/bed
     mkdir blastp
     cd blastp
     pepPredToFa mm7 knownGenePep known.faa
     formatdb -i known.faa -t known -n known
     ssh kkr1u00
     if (-e /iscratch/i/mm7/blastp) then
        rm -r /iscratch/i/mm7/blastp
     endif
     mkdir -p /iscratch/i/mm7/blastp
     cp -p /cluster/data/mm7/bed/blastp/known.p?? /iscratch/i/mm7/blastp
     iSync
 
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/sacCer1/bed/blastp/mm7
     cd /cluster/data/sacCer1/bed/blastp/mm7
     mkdir run
     cd run
     mkdir out
 
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /scratch/blast/blastall -p blastp -d /iscratch/i/mm7/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
 end
 chmod a+x blastSome
 
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
 
     # Create parasol batch
     echo ../../split/*.fa | wordLine stdin > split.lst
     gensub2 split.lst single gsub spec
     para create spec
     para try,push,check ...
 # Completed: 5743 of 5743 jobs
 # CPU time in finished jobs:      11476s     191.27m     3.19h    0.13d  0.000 y
 # IO & Wait Time:                 15126s     252.10m     4.20h    0.18d  0.000 y
 # Average job time:                   5s       0.08m     0.00h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:              29s       0.48m     0.01h    0.00d
 # Submission to last job:           700s      11.67m     0.19h    0.01d
 
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/sacCer1/bed/blastp/mm7/run/out
     hgLoadBlastTab sacCer1 mmBlastTab -maxPer=1 *.tab
 
 
 ##########################################################################
 # HGNEAR PROTEIN BLAST TABLES (DONE 5/22/06 angie)
     ssh hgwdev
     mkdir /cluster/data/sacCer1/bed/hgNearBlastp
     cd /cluster/data/sacCer1/bed/hgNearBlastp
     # Re-fetch current sgdPep because the one in ../blastz/sgdPep.faa 
     # has been overwritten with more recent data from SGD.  That's OK 
     # for other organisms who will be linking straight to SGD instead 
# of to sacCer1, but sacCer1 *BlastTab should jibe with sgdGene.
     pepPredToFa sacCer1 sgdPep sgdPep.faa
     cat << _EOF_ > config.ra
 # Yeast vs. other Gene Sorter orgs that have recently been updated:
 # human, mouse, fly
 
 targetGenesetPrefix sgd
 targetDb sacCer1
 queryDbs hg18 mm8 dm2
 
 sacCer1Fa /cluster/data/sacCer1/bed/hgNearBlastp/sgdPep.faa
 hg18Fa /cluster/data/hg18/bed/blastp/known.faa
 mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
 dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
 
 buildDir /cluster/data/sacCer1/bed/hgNearBlastp
 scratchDir /san/sanvol1/scratch/sacCer1HgNearBlastp
 _EOF_
     doHgNearBlastp.pl -noSelf -targetOnly config.ra >& do.log & tail -f do.log
 # *** hgBlastTab mmBlastTab dmBlastTab 
 
 
 ##########################################################################
 # RELOAD GENBANK (DONE 2006-09-06 markd)
 # reloaded due to xenos track being dropped confusing joinerCheck
 
 nice bin/gbDbLoadStep -drop -initialLoad sacCer1
 
 #########################################################################
 # ORegAnno - Open Regulatory Annotations
 # update Oct 26, 2007;  update July 7, 2008
 # loaded by Belinda Giardine, in same manner as hg18 ORegAnno track
 
 #########################################################################
 # MAKE 11.OOC FILE FOR BLAT (DONE - 2009-02-04 - Hiram)
     #	repMatch = 1024 * sizeof(sacCer1)/sizeof(hg18)
     #	4.32 = 1024 * (12156302/2881515245)
     #	use 10 to be very conservative
     ssh hgwdev
     cd /hive/data/genomes/sacCer1
     blat sacCer1.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc \
 	-repMatch=10
     #	Wrote 3145 overused 11-mers to 11.ooc
     #	copy this to scratch data
     cp -p 11.ooc /hive/data/staging/data/sacCer1/sacCer1.11.ooc
     #	make push request to kluster nodes /scratch/data/
 
 #############################################################################
# LIFTOVER TO SacCer2 (DONE - 2009-02-04 - Hiram )
     mkdir /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04
     cd /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04
     # -debug run to create run dir, preview scripts...
     doSameSpeciesLiftOver.pl -debug sacCer1 sacCer2
     # Real run:
     time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
 	-bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
 	 sacCer1 sacCer2 > do.log 2>&1
     #	real    3m21.840s
 
 #############################################################################
 # uwFootprints: (2009-04-04 markd)
# obtained from the Noble lab
 #   William Stafford Noble <noble@gs.washington.edu>
 #   Xiaoyu Chen <xchen@cs.washington.edu>
 
     mkdir /hive/data/genomes/sacCer1/bed/uwFootprints
     cd /hive/data/genomes/sacCer1/bed/uwFootprints
 
 
     wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.wig
     wget http://noble.gs.washington.edu/proj/footprinting/yeast.mappability.bed
     wget http://noble.gs.washington.edu/proj/footprinting/yeast.footprints.bed
     chmod a-w yeast.*
 
     wigEncode yeast.dnaseI.tagCounts.wig uwFootprintsTagCounts.wig uwFootprintsTagCounts.wib
     # Converted yeast.dnaseI.tagCounts.wig, upper limit 13798.00, lower limit 1.00
 
     ln -sf /hive/data/genomes/sacCer1/bed/uwFootprints/uwFootprintsTagCounts.wib /gbdb/sacCer1/wib
     hgLoadWiggle -tmpDir=/data/tmp sacCer1 uwFootprintsTagCounts uwFootprintsTagCounts.wig 
 
     hgLoadBed -tab -tmpDir=/data/tmp sacCer1 uwFootprintsMappability yeast.mappability.bed
     # drop yeast.footprints.bed and truncate name to three decimal places
     tawk 'NR>1{print $1,$2,$3,sprintf("%0.3f",$4)}' yeast.footprints.bed | hgLoadBed -tmpDir=/data/tmp sacCer1 uwFootprintsPrints stdin
 
    # lift counts to sacCer2 and give back to UW
     wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.bed
     liftOver yeast.dnaseI.tagCounts.bed /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04/sacCer1ToSacCer2.over.chain.gz /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.bed  /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.nolift.bed
 
 #############################################################################