src/hg/makeDb/doc/mm5.txt 1.4

1.4 2009/11/25 21:48:41 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/mm5.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/mm5.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/makeDb/doc/mm5.txt	14 Jan 2008 23:06:14 -0000	1.3
+++ src/hg/makeDb/doc/mm5.txt	25 Nov 2009 21:48:41 -0000	1.4
@@ -1,7901 +1,7901 @@
 # This file describes how we made the browser database on the mouse
 # genome, June 2004 build. - Mm5
 #
 #
 #	NOTE:  There is a new chrMT sequence in the build 32
 #	>gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion
 #
 #   Will have to beware of this NC_ contig in the processing since
 #	all previous builds had only NT_ contigs
 #
 # NOTE: The README_PREBUILD file for this assembly mentions several
 # differences from the previous release (build 30):
 # 1. seq_contig.md - new first line is a comment containing column name
 #       Also, last two columns (group label and weight, have been swapped)
 #       Also, some lines have id with CONTIG: prepended, and upper-case
 #               feature type (CONTIG)
 # 2. contig.idmap - has an additional column "contig label"
 # This required changing the jkStuff ncbi* utilities (7/1/03 KRR)
 #
 # DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2004-06-27 - Fan)
     ssh kksilo
     mkdir -p /cluster/store6/mm5/ncbi
     ln -s /cluster/store6/mm5 /cluster/data
     cd /cluster/data/mm5/ncbi
     mkdir chrfasta contigfasta
     ftp ftp.ncbi.nih.gov
       # user hgpguest, password from /cse/faculty/kent/buildHg6.doc
       cd mouse_33
       prompt
       bin
       mget *
       quit
     gunzip *.agp.gz
 
 # compress chrY.fa (at the NCBI site, this one file somehow was not compressed)
 	cd chrfasta
 gzip chrY.fa
 cd ..
 
 #use chrMT.fa.gz from mm4 instead because its first line format is correct
  
 	cp -p /cluster/store6/mm4/ncbi/chrfasta/chrMT.fa.gz chrfasta
 cp -p /cluster/store6/mm4/ncbi/contigfasta/chrMT.fa.gz contigfasta
 
 # Fix the troubles caused by chrMT released later separately
 
 # Fixed allcontig.agp
 # add the last line of .../mm4/ncbi/allcontig.agp to allcontig.agp
 
 # Fixed allrefcontig.chr.agp
 # add the last line of .../mm4/ncbi/allrefcontig.chr.agp to allrefcontig.chr.agp
 
 # Fix contig.idmap
     cat contig.idmap chrMT/contig.idmap >new.idmap
     mv new.idmap contig.idmap
 
 # Fix seq_contig.md
 # Edit seq_contig.md to add 3 lines (from mm4) in its middle before  Un|...
 10090   MT      0       0       +       start   -1      CONTIG  C57BL/6J        10
 10090   MT      1       16299   +       NC_005089       GI:34538597     CONTIG  C57BL/6J        na
 10090   MT      16299   16299   +       end     -2      CONTIG  C57BL/6J        10
 
 # ctg_coords, contig_overlaps.agp and sequence.inf not fixed.
 
 # Check chromosome files  (DONE - 2004-06-27 - Fan)
 cd chrfasta
 
 foreach f (*.fa.gz)
 echo $f:r >> faSize.out
 gunzip $f
 /cluster/bin/i386/faSize $f:r >> faSize.out
 echo $f:r done
 end
 
 /cluster/bin/i386/faSize *.fa >> faSize.out
 grep "^>" *.fa > ../chrfasta.all.fa.headers
 
 gzip *.fa
 
 cd ../contigfasta
 gunzip *.fa.gz
 grep "^>" *.fa > ../contigfasta.all.fa.headers
 gzip *.fa
 
 # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS
 #					(DONE - 2004-06-27 - Fan)
 
     ssh kksilo
     cd /cluster/data/mm5
     gunzip ncbi/allrefcontig.chr.agp.gz
     # splitFaIntoContigs doesn't do right with agp lines arriving in a
     # different order than fasta chrom sequences.  so split up the agp
     # into one per chrom.
     foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y MT Un)
       mkdir $c
       perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
         ./ncbi/allrefcontig.chr.agp \
         > $c/chr$c.agp
       gunzip -c ./ncbi/chrfasta/chr$c.fa.gz \
         | perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' \
         | splitFaIntoContigs $c/chr$c.agp \
           stdin /cluster/data/mm5 -nSize=5000000
     end
 
 #    gzip ncbi/chrfasta/chr*.fa
 
 # CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2004-06-27 - Fan)
     ssh kksilo
     cd /cluster/data/mm5/ncbi
 
     gunzip seq_contig.md.gz
 
     # reorder random contigs in allrefcontig agp file to match seq_contig.md
     # this is required by the ncbiToRandomAgps scripts
     # had to fixup ncbiToRandomAgps from previous use to match the
     #	lines better, and to do the MT/NC_ mitochondrion thing
 
     mkdir /cluster/store6/mm5/jkStuff
 
 # copy scripts used from previous trial mm5 build
     cd /cluster/data/mm5
     cp -p ~/mm50/jkStuff/* jkStuff
     cd /cluster/data/mm5/ncbi
     ../jkStuff/ncbiFixAgp allrefcontig.chr.agp > \
                         allrefcontig.chr.ordered.agp
 
 #Edit MANUALLY ../jkStuff/ncbiToRandomAgps, to change build 32 to build 33.
 
     ../jkStuff/ncbiToRandomAgps seq_contig.md allrefcontig.chr.ordered.agp \
                         contig.idmap ..
         # creating ../mm5/1/chr1_random.agp...
         # ... creating ../mm5/Un/chrUn_random.agp...
     #  The chrUn_random.agp created by this is too large with the 5000
     #  gaps.  it will work with 1000 gaps, so fixup the chrUn_random agp:
     ../jkStuff/ncbiToRandomAgps -gapLen 1000 -chrom Un \
       seq_contig.md allrefcontig.chr.ordered.agp contig.idmap ..
 
     ssh kksilo
     cd /cluster/data/mm5
     foreach c (?{,?})
       if (-e $c/chr${c}_random.ctg.agp) then
         echo building $c/chr${c}_random.fa
         gunzip -c ./ncbi/contigfasta/chr$c.fa.gz \
           | perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' \
           > ./tmp.fa
         agpToFa -simpleMulti $c/chr${c}_random.ctg.agp chr${c}_random \
           $c/chr${c}_random.fa ./tmp.fa
         rm tmp.fa
       endif
     end
     # building 1/chr1_random.fa
     # ... etc ...
     # building Un/chrUn_random.fa
     # Writing 102265694 bases to Un/chrUn_random.fa
 
     # Clean these up to avoid confusion later... they're easily rebuilt
     #	with the ncbiToRandomAgps script above
     rm ?/*.ctg.agp ??/*.ctg.agp
 
 # BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS (DONE 2004-06-27 - Fan)
     ssh kksilo
     cd /cluster/data/mm5
     foreach c (?{,?})
       if (-e $c/chr${c}_random.agp) then
         splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . \
           -nSize=5000000
         mkdir -p $c/lift
         mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst
         mv ${c}_random/lift/ordered.lft $c/lift/random.lft
         mv ${c}_random/lift/ordered.lst $c/lift/random.lst
         rmdir ${c}_random/lift
         rm ${c}_random/chr${c}_random.{agp,fa}
         mv ${c}_random/* $c
         rmdir ${c}_random
       endif
     end
     #  This has a lot of output.  It is difficult to see if anything
     #   goes wrong.
 
 #  Fixup chrMT name to be chrM (DONE - 2004-06-27 - Fan)
 
     ssh kksilo
     cd /cluster/data/mm5
     mv MT MT.ncbi
     mkdir M
     mkdir M/chrM_1
     mkdir M/lift
     cd MT.ncbi
 
     bash
     find . -type f | while read FN
     do
 	NF=`echo $FN | sed -e "s/MT/M/g"`
 	sed -e "s/chrMT/chrM/g" $FN > ../M/$NF
     done
 
 # MAKE LIFTALL.LFT (DONE - 2004-06-27 - Fan)
 
     cd /cluster/data/mm5
     cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft
 
 # 7:40 PM 6/27/04, used dark blue color above.
 # Now changed to use dark pink color for things done. 
 
 # CREATING DATABASE (DONE 2004-06-27 - Fan)
 
 # First, clean out mm5 tables built by the previous trial build.
 # Rename all mm5.* tables to mm5_old4.*,
 # then drop database mm5
 
 o - Create the database.
     ssh hgwdev
     hgsql -e 'create database mm5;' ''
     # if you need to delete this database:  !!! WILL DELETE EVERYTHING !!!
     #	hgsql -e "drop database mm5;" mm5
 o - Use df to make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
     df -h /var/lib/mysql
     Filesystem            Size  Used Avail Use% Mounted on
     /dev/sdc1             1.8T  383G  1.3T  24% /var/lib/mysql
 
 # CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2004-06-27 - Fan)
     #	Use any of the newest databases to ensure that the organization
     #	of the grp table is up to date
     ssh hgwdev
     hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" mm5
 
 # STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE - 2004-06-27 - Fan)
     # Create (unmasked) nib files
     ssh kksilo
     cd /cluster/data/mm5
     mkdir -p unmaskedNib
     foreach f (?{,?}/chr?{,?}{,_random}.fa)
       echo $f:t:r
       faToNib $f unmaskedNib/$f:t:r.nib
     end
     # Create symbolic links from /gbdb/mm5/nib to real nib files
     #	These unmasked Nib files are temporary just to get the browser
     #	up and running immediately.  After the masking is done and masked
     #	sequence is created, these nibs will be replaced with the masked
     #	nibs
     ssh hgwdev
     mkdir -p /gbdb/mm5/nib
     cd /gbdb/mm5/nib
     ln -s /cluster/data/mm5/unmaskedNib/chr*.nib .
 
     # Load /gbdb nib paths into database and save size info.
     ssh hgwdev
     cd /cluster/data/mm5
     hgsql mm5  < ~/kent/src/hg/lib/chromInfo.sql
     hgNibSeq -preMadeNib mm5 /gbdb/mm5/nib ?{,?}/chr?{,?}{,_random}.fa
     # 3164952073 total bases
     # NOTE: mm4 was 2952612207, an increase of 212 Mb (~7.2%)
     hgsql -N -e "select chrom,size from chromInfo;" mm5 > chrom.sizes
     # check the resulting file chrom.sizes
 
     # Store o+o info in database.
     cd /cluster/data/mm5/ncbi
     gunzip sequence.inf
     cd /cluster/data/mm5
     ln -s ncbi ffa
     # remove so as not to confuse hgGoldGap -- they are easily regenerated
     rm */chr*.ctg.agp
     # to undo/redo:
     #     jkStuff/dropSplitTable.csh gap
     #     jkStuff/dropSplitTable.csh gold
     /cluster/bin/i386/hgGoldGapGl mm5 /cluster/data/mm5 .
     featureBits mm5 gold
     # 2615483787 bases of 2615483787 (100.000%) in intersection
     featureBits mm4 gold
     # 2627444668 bases of 2627444668 (100.000%) in intersection
 
     featureBits mm5 gap
     # 549468286 bases of 2615483787 (21.008%) in intersection
     featureBits mm4 gap
     # 325167539 bases of 2627444668 (12.376%) in intersection
     featureBits mm3 gap
     # 202319873 bases of 2505900260 (8.074%) in intersection
 
 
 # Make and load GC percent table	(DONE - 2004-06-27 - Fan)
 #	NOT REQUIRED, been replaced by gc5Base procedure below
      ssh hgwdev
      mkdir -p /cluster/data/mm5/bed/gcPercent
      cd /cluster/data/mm5/bed/gcPercent
      hgsql mm5  < ~/kent/src/hg/lib/gcPercent.sql
      hgGcPercent mm5 ../../unmaskedNib
 
 
 # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR MM5 (DONE - 2004-06-27 - Fan)
     #	using the Mm3 position blatted onto Mm5:
     # Enter mm5 into hgcentraltest.dbDb so test browser knows about it:
     hgsql -e 'INSERT INTO dbDb \
         (name, description, nibPath, organism, defaultPos, \
          active, orderKey, genome, scientificName, htmlPath, \
          hgNearOk, hgPbOk, sourceName) \
       VALUES("mm5", "May 2004", "/gbdb/mm5/nib", "Mouse", \
 	"chr6:121658238-121674165", \
          1, 20, "Mouse", "Mus musculus", "/gbdb/mm5/html/description.html",\
 	0, 0, "NCBI Build 33");' \
 	-h genome-testdb hgcentraltest
     #	If you need to delete that entry:
     hgsql -e 'delete from dbDb where name="mm5";' -h genome-testdb hgcentraltest
 
     # Make trackDb table so browser knows what tracks to expect:
     ssh hgwdev
     cd ~kent/src/hg/makeDb/trackDb
     cvs up -d -P
     # Edit that makefile to add mm5 in all the right places and do
     make update
     make alpha
     cvs commit makefile
 
 # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR MM5 (DONE - 2004-07-14 Fan)
     ssh hgwdev
 
     # Make one big 2bit file as well, and make a link to it in
     # /gbdb/mm5/nib because hgBlat looks there:
     cd /cluster/data/mm5
     faToTwoBit */chr*.fa mm5.2bit
     ln -s /cluster/data/mm5/mm5.2bit /gbdb/mm5/nib/
 
     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
     VALUES ("mm5", "snort", "17778", "1", "0"); \
     INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
     VALUES ("mm5", "snort", "17779", "0", "1");' \
     -h genome-testdb hgcentraltest
 
 # REPEAT MASKING (Working on 2004-06-27 Fan)
     #	TRF simpleRepeat below can be run at the same time
     # Split contigs, run RepeatMasker, lift results
     # * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make
     #   RepeatMasker runs manageable on the cluster ==> results need lifting.
     # * For the NCBI assembly we repeat mask on the sensitive mode setting
     #  (RepeatMasker -m -s -ali)
 
     #- Split contigs into 500kb chunks:
     ssh kksilo
     cd /cluster/data/mm5
     foreach d ( */chr?{,?}{,_random}_?{,?} )
 	cd $d
 	set contig = $d:t
 	faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \
 	    -maxN=500000
 	cd ../..
     end
     #	...
     #	11 pieces of 11 written
     #	1 pieces of 1 written
     #	...
 
     #- Make the run directory and job list:
 
     cd /cluster/data/mm5
     cat << '_EOF_' > jkStuff/RMMouse
 #!/bin/csh -fe
 
 cd $1
 pushd .
 /bin/mkdir -p /tmp/mm5/$2
 /bin/cp $2 /tmp/mm5/$2
 cd /tmp/mm5/$2
 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -species mus $2
 popd
 /bin/cp /tmp/mm5/$2/$2.out ./
 if (-e /tmp/mm5/$2/$2.align) /bin/cp /tmp/mm5/$2/$2.align ./
 if (-e /tmp/mm5/$2/$2.tbl) /bin/cp /tmp/mm5/$2/$2.tbl ./
 if (-e /tmp/mm5/$2/$2.cat) /bin/cp /tmp/mm5/$2/$2.cat ./
 /bin/rm -fr /tmp/mm5/$2/*
 /bin/rmdir --ignore-fail-on-non-empty /tmp/mm5/$2
 /bin/rmdir --ignore-fail-on-non-empty /tmp/mm5
 '_EOF_'
     chmod +x jkStuff/RMMouse
 
     mkdir -p RMRun
     rm -f RMRun/RMJobs
     foreach d ( ?{,?}/chr*_?{,?} )
 	foreach f ( $d/chr*_?{,?}_?{,?}.fa )
 	    set f = $f:t
 	    echo /cluster/data/mm5/jkStuff/RMMouse \
 		/cluster/data/mm5/$d $f \
 		'{'check out line+ /cluster/data/mm5/$d/$f.out'}' \
 		>> RMRun/RMJobs
 	end
     end
 
     #- Do the run
     ssh kk
     cd /cluster/data/mm5/RMRun
     para create RMJobs
     para try, para check, para check, para push, para check,...
 
 [kk:RMRun> para check
 6885 jobs in batch
 8 jobs (including everybody's) in Parasol queue.
 Checking finished jobs.
 ranOk: 6885
 total jobs in batch: 6885
 [kk:RMRun> para time
 6885 jobs in batch
 8 jobs (including everybody's) in Parasol queue.
 Checking finished jobs
 Completed: 6885 of 6885 jobs
 CPU time in finished jobs:   40084305s  668071.74m 11134.53h  463.94d  1.271 y
 IO & Wait Time:                122589s    2043.16m    34.05h    1.42d  0.004 y
 Average job time:                5840s      97.33m     1.62h    0.07d
 Longest job:                     9804s     163.40m     2.72h    0.11d
 Submission to last job:         46771s     779.52m    12.99h    0.54d
 
 # Done 11:57 AM 6/28/04
 
     #- Lift up the split-contig .out's to contig-level .out's
     ssh kksilo
     cd /cluster/data/mm5
     foreach d ( ?{,?}/chr*_?{,?} )
       cd $d
       set contig = $d:t
       liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
       cd ../..
     end
 
     #- Lift up the contig-level .out's to chr-level
     ssh kksilo
     cd /cluster/data/mm5
     ./jkStuff/liftOut5.csh
     #	This one error is OK
     #	Can not find Un/lift/ordered.lft .
 
     #- Load the .out files into the database with:
     ssh hgwdev
     cd /cluster/data/mm5
     # to redo:
     #    ./jkStuff/dropSplitTable.csh rmsk
     # make sure there's no chrUn -- rm Un/chrUn.fa.out
     hgLoadOut mm5 ?/*.fa.out ??/*.fa.out
 
 # VERIFY REPEATMASKER RESULTS (DONE - 2004-06-28 Fan)
 
     # Run featureBits on mm5 and on a comparable genome build, and compare:
     ssh hgwdev
 featureBits mm5 rmsk
 #1137310280 bases of 2615483787 (43.484%) in intersection
 featureBits mm4 rmsk
 #1130883581 bases of 2627444668 (43.041%) in intersection
 featureBits mm3 rmsk
 #1080265553 bases of 2505900260 (43.109%) in intersection
 
 #cd /cluster/data/mm5
 #awk '{print $1}' chrom.sizes | sed -e "s/chr//" | grep -v random > chrom.lst
 
 # SIMPLE REPEAT TRACK (DONE - 2004-06-29 Fan)
     # TRF can be run in parallel with RepeatMasker on the file server
     #	since it doesn't require masked input sequence.
     ssh kksilo
     mkdir /cluster/data/mm5/bed/simpleRepeat
     cd /cluster/data/mm5/bed/simpleRepeat
     mkdir trf
     rm -f jobs.csh
     echo '#\!/bin/csh -fe' > jobs.csh
     # create job list of 5MB chunks
     foreach f \
        (/cluster/data/mm5/?{,?}/chr?{,?}_[0-9]*/chr?{,?}_?{,?}.fa \
        /cluster/data/mm5/?{,?}/chr*_random_?{,?}/chr*_random_?{,?}.fa)
       set fout = $f:t:r.bed
       echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
         >> jobs.csh
     end
     chmod +x jobs.csh
     wc jobs.csh
     # 640    3836   90839 jobs.csh
 
     ./jobs.csh >&! jobs.log &
     # in bash:  ./jobs.csh > jobs.log 2>&1 &
     tail -f jobs.log
     # Done 3:07 PM 6/29/04, took about 6 hours.
 
     # When job is done lift output files
     liftUp simpleRepeat.bed /cluster/data/mm5/jkStuff/liftAll.lft warn trf/*.bed
 
     # Load into the database
     ssh hgwdev
     cd /cluster/data/mm5/bed/simpleRepeat
     hgLoadBed mm5 simpleRepeat simpleRepeat.bed \
       -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
     # Loaded 1150615 elements of size 16
 
     featureBits mm5 simpleRepeat
     # 81414259 bases of 2615483787 (3.113%) in intersection
     featureBits mm4 simpleRepeat
     # 82600648 bases of 2627444668 (3.144%) in intersection
     featureBits mm3 simpleRepeat
     # 75457193 bases of 2505900260 (3.011%) in intersection
 
 
 # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2004-06-29 - Fan)
 
     # After the simpleRepeats track has been built, make a filtered version
     # of the trf output: keep trf's with period <= 12:
     ssh kksilo
     cd /cluster/data/mm5/bed/simpleRepeat
     mkdir -p trfMask
     foreach f (trf/chr*.bed)
       awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
     end
 
     # Lift up filtered trf output to chrom coords
     cd /cluster/data/mm5
     mkdir -p bed/simpleRepeat/trfMaskChrom
     foreach c (?{,?})
       if (-e $c/lift/ordered.lst) then
 	perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
 	  $c/lift/ordered.lst > $c/lift/oTrf.lst
 	liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
 	  jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
       else
 	echo "WARNING NO FILE:  $c/lift/ordered.lst"
       endif
       if (-e $c/lift/random.lst) then
         perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
            $c/lift/random.lst > $c/lift/rTrf.lst
         liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
           jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
       endif
     end
     # NOTE: ignore the warning about non-existent Un/lift/ordered.lst
     # since there is no chrUn
 
 # MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF
 #				(Working on - 2004-06-29 Fan)
     ssh kksilo
     cd /cluster/data/mm5
     #- Soft-mask (lower-case) the contig and chr .fa's
     ./jkStuff/makeFaMasked.csh >&! maskFa.out &
     #	bash:	./jkStuff/makeFaMasked.csh > maskFa.out 2>&1 &
     tail -100f maskFa.out
 
     #- Make hard-masked .fa.masked files as well:
     ./jkStuff/makeHardMasked.csh
 
 Edited ./jkStuff/makeNib.csh to comment out "if ..." and "endif" as below:
 
 #!/bin/csh -fe
 
 mkdir -p nib mixedNib maskedNib
 foreach i (?{,?})
    cd $i
 #   foreach j (chr$i{,_random}.fa)
    foreach j (*.fa)
 #       if (-e "${j}")
         set r = $j:r
        /cluster/bin/i386/faToNib $j ../nib/$r.nib
        /cluster/bin/i386/faToNib -softMask $j ../mixedNib/$r.nib
        /cluster/bin/i386/faToNib -hardMask $j ../maskedNib/$r.nib
 #       endif
        echo done $j
    end
    cd ..
 end
 
     #- Rebuild the nib, mixedNib, maskedNib files:
     ./jkStuff/makeNib.csh
     # ignore complaints about missing chrUn
 
     # Redo symbolic links from /gbdb/mm5/nib to
     #   mixed (RM and TRF) soft-masked nib files
     ssh hgwdev
     rm -fr /gbdb/mm5/nib/*
     ln -s /cluster/data/mm5/mixedNib/chr*.nib /gbdb/mm5/nib
 
     # Copy data to /cluster/bluearc for cluster runs
     ssh kksilo
 
     # masked contigs
     rm -fr /cluster/bluearc/scratch/mus/mm5/trfFa
     mkdir -p /cluster/bluearc/scratch/mus/mm5/trfFa
     cp -p /cluster/data/mm5/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa \
 	/cluster/bluearc/scratch/mus/mm5/trfFa
 
     # masked chrom nibs
     cd /cluster/data/mm5
     rm -fr /cluster/bluearc/scratch/mus/mm5/softNib
     mkdir -p /cluster/bluearc/scratch/mus/mm5/softNib
     cp -p mixedNib/chr*.nib /cluster/bluearc/scratch/mus/mm5/softNib
     rm -fr /cluster/bluearc/scratch/mus/mm5/hardNib
     mkdir -p /cluster/bluearc/scratch/mus/mm5/hardNib
     cp -p maskedNib/chr*.nib /cluster/bluearc/scratch/mus/mm5/hardNib
 
     # fasta files
     rm -fr /cluster/bluearc/scratch/mus/mm5/fasta
     mkdir -p /cluster/bluearc/scratch/mus/mm5/fasta
     cp -p ?/*.fa ??/*.fa /cluster/bluearc/scratch/mus/mm5/fasta
 
     # RepeatMasker *.out files
     rm -rf /cluster/bluearc/scratch/mus/mm5/rmsk
     mkdir -p /cluster/bluearc/scratch/mus/mm5/rmsk
     cp -p ?{,?}/chr?{,?}{,_random}.fa.out /cluster/bluearc/scratch/mus/mm5/rmsk
 
     # lift file, for mrna processing
     cp -p jkStuff/liftAll.lft /cluster/bluearc/scratch/mus/mm5
 #above was done 6/29/04 4:50PM
 
     # also copy to iservers
     ssh kkr1u00
     #cd ~/mm5
     cd /cluster/bluearc/scratch/mus/mm5
 
     mkdir /iscratch/i/mus/mm5
     cp -p liftAll.lft /iscratch/i/mus/mm5
     mkdir -p /iscratch/i/mus/mm5/softNib
     cp -p /cluster/bluearc/scratch/mus/mm5/softNib/chr*.nib /iscratch/i/mus/mm5/softNib
 
     mkdir -p /iscratch/i/mus/mm5/trfFa
     cd /cluster/store6/mm5
     cp ?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa /cluster/bluearc/scratch/mus/mm5/trfFa
     /cluster/bin/scripts/iSync
 
 ssh kkr1u00
 mkdir /iscratch/i/mus/mm5
 cd /iscratch/i/mus
 rsync -arlv /cluster/bluearc/scratch/mus/mm5 .
 
 #wrote 8660800915 bytes  read 15380 bytes  17729409.00 bytes/sec
 #total size is 10242205742  speedup is 1.18
 
 cd /iserver/kkr1u00/i/mus/mm5
 mv trfFa maskedContigs
 cd /cluster/bluearc/scratch/mus/mm5
 mv trfFa maskedContigs
 
 # PREPARE CLUSTER FOR BLASTZ RUN (DONE - 2004-06-29 - Fan)
 
     ssh kksilo
     mkdir -p /cluster/bluearc/scratch/mus/mm5/rmsk.spec
     cd /cluster/bluearc/scratch/mus/mm5/rmsk.spec
     ln -s ../rmsk/*.out .
 
 # NOTE: Don't leave indentations in the script below.
 cat << '_EOF_' > runArian.sh
 #!/bin/sh
 for FN in *.out
 do
 echo ${FN}
 /cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \
 ${FN} -query mouse -comp human -comp rat
 done
 '_EOF_'
 
     chmod +x runArian.sh
     ./runArian.sh 
 
     cd /cluster/bluearc/scratch/mus/mm5
     mkdir linSpecRep.notInHuman
     mkdir linSpecRep.notInRat
     foreach f (rmsk.spec/*.out_hum_rat)
         set base = $f:t:r:r
         echo $base.out.spec
         /cluster/bin/scripts/extractLinSpecReps 1 $f > \
                         linSpecRep.notInHuman/$base.out.spec
     end
 
     foreach f (rmsk.spec/*.out_hum_rat)
         set base = $f:t:r:r
 	echo $base.out.spec
 	/cluster/bin/scripts/extractLinSpecReps 2 $f > \
 		linSpecRep.notInRat/$base.out.spec
 	end
 
     cp rmsk.spec /iscratch/i/mus/mm5 -Rp
     cp linSpecRep.notInRat /iscratch/i/mus/mm5 -Rp
     cp linSpecRep.notInHuman /iscratch/i/mus/mm5 -Rp
 
     /cluster/bin/scripts/iSync
 
     # Request rsync /cluster/bluearc/scratch/mus/mm5 to the KiloKluster
 
 #  GC5BASE WIGGLE TRACK (DONE - 2004-06-24 - Hiram)
     #	This previously was a script that ran through each nib.
     #	Recently transformed into a mini cluster run.
     ssh kki
     mkdir /cluster/data/mm5/bed/gc5Base
     cd /cluster/data/mm5/bed/gc5Base
 
     mkdir wigData5 dataLimits5 wigData5_1K dataLimits5_1K
 
     cat << '_EOF_' > kkRun.sh
 #!/bin/sh
 NIB=$1
 
 chr=${NIB/.nib/}
 chrom=${chr#chr}
 
 hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \
         /cluster/data/mm5/mixedNib | \
     grep -w GC | \
     awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
     wigAsciiToBinary -dataSpan=5 -chrom=${chr} \
         -wibFile=wigData5/gc5Base_${chrom} \
             -name=${chrom} stdin 2> dataLimits5/${chr}
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod +x kkRun.sh
 
     ls /cluster/data/mm5/mixedNib > nibList
     cat << '_EOF_' > gsub
 #LOOP
 ./kkRun.sh $(path1)
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
 
     gensub2 nibList single gsub jobList
     para create jobList
     para try, check, ... etc
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:       4969s      82.81m     1.38h    0.06d  0.000 y
 # IO & Wait Time:                   611s      10.19m     0.17h    0.01d  0.000 y
 # Average job time:                 130s       2.16m     0.04h    0.00d
 # Longest job:                      370s       6.17m     0.10h    0.00d
 # Submission to last job:           598s       9.97m     0.17h    0.01d
 
     # load the .wig files back on hgwdev:
     ssh hgwdev
     cd /cluster/data/mm5/bed/gc5Base
     hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base mm5 gc5Base wigData5/*.wig
     # and symlink the .wib files into /gbdb
     mkdir /gbdb/mm5/wib/gc5Base
     ln -s `pwd`/wigData5/*.wib /gbdb/mm5/wib/gc5Base
 
     #	And then the zoomed data view
     ssh kki
     cd /cluster/data/mm5/bed/gc5Base
     mkdir wigData5_1K dataLimits5_1K
 
     cat << '_EOF_' > kkRunZoom.sh
 #!/bin/sh
 NIB=$1
 
 chr=${NIB/.nib/}
 chrom=${chr#chr}
 
 hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \
         /cluster/data/mm5/mixedNib | \
     grep -w GC | \
     awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
     wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \
 	-chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \
             -name=${chrom} stdin 2> dataLimits5_1K/${chr}
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod +x kkRunZoom.sh
 
     cat << '_EOF_' > gsubZoom
 #LOOP
 ./kkRunZoom.sh $(path1)
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
 
     gensub2 nibList single gsubZoom jobListZoom
     para create jobListZoom
     para try ... check ... etc ...
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:       4878s      81.29m     1.35h    0.06d  0.000 y
 # IO & Wait Time:                   488s       8.14m     0.14h    0.01d  0.000 y
 # Average job time:                 125s       2.08m     0.03h    0.00d
 # Longest job:                      378s       6.30m     0.10h    0.00d
 # Submission to last job:           665s      11.08m     0.18h    0.01d
 
     #	Then load these .wig files into the same database as above
     ssh hgwdev
     cd /cluster/data/mm5/bed/gc5Base
     hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base \
 	-oldTable mm5 gc5Base wigData5_1K/*.wig
     # and symlink these .wib files into /gbdb
     ln -s `pwd`/wigData5_1K/*.wib /gbdb/mm5/wib/gc5Base
 
 #  GC5BASE WIGGLE TRACK (DONE - 2004-07-01 - Hiram)
     #	This previously was a script that ran through each nib.
     #	Recently transformed into a mini cluster run.
     ssh kki
     mkdir /cluster/data/mm5/bed/gc5Base
     cd /cluster/data/mm5/bed/gc5Base
 
     mkdir wigData5 dataLimits5 wigData5_1K dataLimits5_1K
 
     cat << '_EOF_' > kkRun.sh
 #!/bin/sh
 NIB=$1
 
 chr=${NIB/.nib/}
 chrom=${chr#chr}
 
 hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \
         /cluster/data/mm5/mixedNib | \
     grep -w GC | \
     awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
     wigAsciiToBinary -dataSpan=5 -chrom=${chr} \
         -wibFile=wigData5/gc5Base_${chrom} \
             -name=${chrom} stdin 2> dataLimits5/${chr}
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod +x kkRun.sh
 
     ls /cluster/data/mm5/mixedNib > nibList
     cat << '_EOF_' > gsub
 #LOOP
 ./kkRun.sh $(path1)
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
 
     gensub2 nibList single gsub jobList
     para create jobList
     para try, check, ... etc
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:       4857s      80.94m     1.35h    0.06d  0.000 y
 # IO & Wait Time:                   121s       2.02m     0.03h    0.00d  0.000 y
 # Average job time:                 116s       1.93m     0.03h    0.00d
 # Longest job:                      335s       5.58m     0.09h    0.00d
 # Submission to last job:           516s       8.60m     0.14h    0.01d
 
     # load the .wig files back on hgwdev:
     ssh hgwdev
     cd /cluster/data/mm5/bed/gc5Base
     hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base mm5 gc5Base wigData5/*.wig
     # and symlink the .wib files into /gbdb
     mkdir /gbdb/mm5/wib
     mkdir /gbdb/mm5/wib/gc5Base
     ln -s `pwd`/wigData5/*.wib /gbdb/mm5/wib/gc5Base
 
     #	And then the zoomed data view
     ssh kki
     cd /cluster/data/mm5/bed/gc5Base
     mkdir wigData5_1K dataLimits5_1K
 
     cat << '_EOF_' > kkRunZoom.sh
 #!/bin/sh
 NIB=$1
 
 chr=${NIB/.nib/}
 chrom=${chr#chr}
 
 hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 mm5 \
         /cluster/data/mm5/mixedNib | \
     grep -w GC | \
     awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
     wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \
 	-chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \
             -name=${chrom} stdin 2> dataLimits5_1K/${chr}
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod +x kkRunZoom.sh
 
     cat << '_EOF_' > gsubZoom
 #LOOP
 ./kkRunZoom.sh $(path1)
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
 
     gensub2 nibList single gsubZoom jobListZoom
     para create jobListZoom
     para try ... check ... etc ...
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:       4819s      80.31m     1.34h    0.06d  0.000 y
 # IO & Wait Time:                    82s       1.37m     0.02h    0.00d  0.000 y
 # Average job time:                 114s       1.90m     0.03h    0.00d
 # Longest job:                      336s       5.60m     0.09h    0.00d
 # Submission to last job:           500s       8.33m     0.14h    0.01d
 
     #	Then load these .wig files into the same database as above
     ssh hgwdev
     cd /cluster/data/mm5/bed/gc5Base
     hgLoadWiggle -pathPrefix=/gbdb/mm5/wib/gc5Base \
 	-oldTable mm5 gc5Base wigData5_1K/*.wig
     # and symlink these .wib files into /gbdb
     ln -s `pwd`/wigData5_1K/*.wib /gbdb/mm5/wib/gc5Base
 
 # BLASTZ HG17 (WORKING - 2004-07-06 - Hiram)
     ssh kk
     mkdir -p /cluster/data/mm5/bed/blastz.hg17.2004-07-06
     cd /cluster/data/mm5/bed
     ln -s  blastz.hg17.2004-07-06 blastz.hg17
     cd blastz.hg17
 
     cat << '_EOF_' > DEF
 # mouse vs. human
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
 
 ALIGN=blastz-run
 BLASTZ=blastz
 BLASTZ_H=2000
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET
 # Mouse
 SEQ1_DIR=/scratch/mus/mm5/softNib
 # not used
 SEQ1_RMSK=/scratch/mus/mm5/rmsk
 # not used
 SEQ1_FLAG=-rodent
 SEQ1_SMSK=/scratch/mus/mm5/linSpecRep.notInHuman
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY
 # Human
 SEQ2_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
 # RMSK not currently used
 SEQ2_RMSK=
 # FLAG not currently used
 SEQ2_FLAG=
 SEQ2_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInMouse
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm5/bed/blastz.hg17
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     # prepare first cluster run
     ssh kk
     cd /cluster/data/mm5/bed/blastz.hg17
     #	OK to use this script here, it is generic, works anywhere
     /cluster/data/hg17/jkStuff/BlastZ_run0.sh
     cd run.0
     para try, check, push, check, ....
 # Completed: 46717 of 46717 jobs
 # CPU time in finished jobs:   16171136s  269518.93m  4491.98h  187.17d  0.513 y
 # IO & Wait Time:                534501s    8908.35m   148.47h    6.19d  0.017 y
 # Average job time:                 358s       5.96m     0.10h    0.00d
 # Longest job:                     5263s      87.72m     1.46h    0.06d
 # Submission to last job:         30066s     501.10m     8.35h    0.35d
 
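     #	Second cluster run to convert the raw blastz output to lav's.
     #	Do not run this on the big cluster: it brings
     #	the file server to its knees.  Run this on the small cluster.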
     ssh kki
     cd /cluster/data/mm5/bed/blastz.hg17
     /cluster/data/hg17/jkStuff/BlastZ_run1.sh
     cd run.1
     para try, check, push, etc ...
 # Completed: 341 of 341 jobs
 # CPU time in finished jobs:       2186s      36.43m     0.61h    0.03d  0.000 y
 # IO & Wait Time:                  1804s      30.07m     0.50h    0.02d  0.000 y
 # Average job time:                  12s       0.20m     0.00h    0.00d
 # Longest job:                       82s       1.37m     0.02h    0.00d
 # Submission to last job:          3895s      64.92m     1.08h    0.05d
 
     #	Third cluster run to convert lav's to axt's
     #	Does not work on kki since /scratch on the iservers is not the
     #	same as /scratch on the other clusters.
     ssh kk
     cd /cluster/data/mm5/bed/blastz.hg17
     /cluster/data/hg17/jkStuff/BlastZ_run2.sh
     cd run.2
     para try, check, push, etc ...
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:       2099s      34.98m     0.58h    0.02d  0.000 y
 # IO & Wait Time:                  6862s     114.37m     1.91h    0.08d  0.000 y
 # Average job time:                 208s       3.47m     0.06h    0.00d
 # Longest job:                     1276s      21.27m     0.35h    0.01d
 # Submission to last job:          1291s      21.52m     0.36h    0.01d
 
     # Translate the sorted per-chromosome axt files into psl, writing
     # pslChrom/<chrom>_blastzHg17.psl for each axtChrom/<chrom>.axt
     # (tcsh syntax).
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.hg17
     #	-p: do not fail if pslChrom already exists
     mkdir -p pslChrom
     set tbl = "blastzHg17"
     foreach f (axtChrom/chr*.axt)
       #	$f:t:r = file name without its directory and .axt extension
       set c=$f:t:r
       echo "Processing chr $c"
       /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
     end
     #	This takes more than an hour.  You can shorten this by changing
     #	that command to a simple echo, put the results into a file,
     #	split the file into four parts and run the four files as shell
     #	scripts on kksilo to have four processes running at the same
     #	time.  Load on kksilo gets up to about 20 which is reasonable.
 
     # Load database tables
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.hg17/pslChrom
     bash		#	for tcsh users
     for F in chr*_blastzHg17.psl
     do
 	/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${F}
 	echo "${F} done"
     done
     # this is a 40 minute job
     # exit bash if you are tcsh
 
     # featureBits on blastzMm3 or 4 will not work on hgwdev, runs out of
     # memory.  But if you reset your ~/.hg.conf to use the read-only
     #	user and contact the hgwdev host, then use the x86_64 featureBits
     # featureBits mm5 blastzHg17
     #	1057836001 bases of 2615483787 (40.445%) in intersection
     # featureBits mm4 blastzHg16
     #	1068995521 bases of 2627444668 (40.686%) in intersection
 
 # CHAIN MM5 BLASTZ (DONE - 2004-07-02 - Hiram)
 
 # The axtChain is best run on the small kluster, or the kk9 kluster
     ssh kki
     mkdir -p /cluster/data/mm5/bed/blastz.hg17/axtChain/run1
     cd /cluster/data/mm5/bed/blastz.hg17/axtChain/run1
     mkdir out chain
 
     ls -1S /cluster/data/mm5/bed/blastz.hg17/axtChrom/*.axt > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
 
 #  May need -minScore=5000 for all chroms if chr19 won't finish on kolossus
 
     cat << '_EOF_' > doChain
 #!/bin/csh
 axtChain $1 /iscratch/i/mus/mm5/softNib \
 	/iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x doChain
 
     # 43 jobs, one per axt file listed in input.lst
     gensub2 input.lst single gsub jobList
     para create jobList
     para try
     para push # ... etc ...
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:       5354s      89.23m     1.49h    0.06d  0.000 y
 # IO & Wait Time:                 10543s     175.72m     2.93h    0.12d  0.000 y
 # Average job time:                 370s       6.16m     0.10h    0.00d
 # Longest job:                     1694s      28.23m     0.47h    0.02d
 # Submission to last job:          1694s      28.23m     0.47h    0.02d
 
     # now on the file server, sort chains
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.hg17/axtChain
     time chainMergeSort run1/chain/*.chain > all.chain
     #	real    4m53.428s
     #	user    4m3.040s
     #	sys     0m29.440s
 
     time chainSplit chain all.chain
     #	real    4m34.674s
     #	user    3m38.370s
     #	sys     0m29.990s
 
     # optionally: rm run1/chain/*.chain
 
     # Load chains into database
     # next machine
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.hg17/axtChain/chain
     bash	#	for tcsh users
     for I in *.chain
     do
         c=${I/.chain/}
         hgLoadChain mm5 ${c}_chainHg17 $I
         echo done $c
     done
     # exit bash if you are tcsh
     #	This is a 50 minute job
 
     #	featureBits mm5 chainHg17
     #	2507720521 bases of 2615483787 (95.880%) in intersection
     #	featureBits mm4 chainHg16
     #	2558968088 bases of 2627444668 (97.394%) in intersection
 
 # NET MM5 (WORKING - 2004-07-02 - Hiram)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.hg17/axtChain
     mkdir preNet
     cd chain
     bash	#	for tcsh users
     for I in *.chain
     do
       echo preNetting $I
       /cluster/bin/i386/chainPreNet $I /cluster/data/mm5/chrom.sizes \
 		/cluster/data/hg17/chrom.sizes ../preNet/$I
     done
     # exit bash if you are tcsh
     #	7 minute job
 
     cd ..
     mkdir n1
     cd preNet
     bash	#	for tcsh users
     for I in *.chain
     do
       n=${I/.chain/}.net
       echo primary netting $I $n
       /cluster/bin/i386/chainNet $I -minSpace=1 /cluster/data/mm5/chrom.sizes \
 	/cluster/data/hg17/chrom.sizes ../n1/$n /dev/null
     done
     # exit bash if you are tcsh
     #	5 minute job
 
     cd ..
     cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
     #	memory usage 2546110464, utime 16327 s/100, stime 3546
 
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.hg17/axtChain
     time netClass hNoClass.net mm5 hg17 human.net \
 	-tNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInHuman \
 	-qNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInMouse
     #	real    9m45.271s
     #	user    6m47.170s
     #	sys     1m20.440s
 
     # If things look good do
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.hg17/axtChain
     rm -r n1 hNoClass.net
     # Make a 'syntenic' subset of these with
     time netFilter -syn human.net > humanSyn.net
     #	real    12m3.701s
     #	user    8m44.180s
     #	sys     1m1.610s
 
     # Load the nets into database
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.hg17/axtChain
     netFilter -minGap=10 human.net |  hgLoadNet mm5 netHg17 stdin
     netFilter -minGap=10 humanSyn.net | hgLoadNet mm5 syntenyNetHg17 stdin
 
     # check results
     # featureBits mm5 netHg17
     #	2504056038 bases of 2615483787 (95.740%) in intersection
     # featureBits mm4 netHg16
     #	2553137690 bases of 2627444668 (97.172%) in intersection
 
     # featureBits mm5 syntenyNetHg17
     #	2460442823 bases of 2615483787 (94.072%) in intersection
     # featureBits mm4 syntenyNetHg16
     #	2495783103 bases of 2627444668 (94.989%) in intersection
 
     # Add entries for net and chain to mouse/hg17 trackDb
 
     # make net
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.hg17/axtChain
     mkdir humanNet
     time netSplit human.net humanNet
     #	real    4m46.190s
     #	user    3m27.740s
     #	sys     0m38.900s
 
     #	extract axt's from net, and convert to maf's
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.hg17/axtChain
     mkdir ../axtNet ../mafNet
 cat > makeMaf.csh << '_EOF_'
 #!/bin/csh -ef
 #	For each per-chromosome net in humanNet/, extract the alignments in
 #	the net as a sorted axt file (../axtNet/<chrom>.axt), then convert
 #	that axt to maf (../mafNet/<chrom>.maf) with the sequence names
 #	prefixed by mm5. (target) and hg17. (query).
     foreach f (humanNet/chr*.net)
         # $f:t:r = file name without its directory and .net extension
         set c = $f:t:r
         echo "netToAxt: $c.net -> $c.axt"
         # remove any previous output before rewriting it
         rm -f ../axtNet/$c.axt
         netToAxt humanNet/$c.net chain/$c.chain \
 	    /cluster/data/mm5/nib /cluster/data/hg17/nib stdout | \
 	    axtSort stdin ../axtNet/$c.axt
         axtToMaf ../axtNet/$c.axt \
             /cluster/data/mm5/chrom.sizes /cluster/data/hg17/chrom.sizes \
             ../mafNet/$c.maf -tPrefix=mm5. -qPrefix=hg17.
 	echo "Complete: $c.net -> axtNet/$c.axt -> mafNet/$c.maf"
     end
 '_EOF_'
 # << for emacs
     csh makeMaf.csh >&! makeMaf.log &
     tail -100f makeMaf.log
     #	real    39m53.316s
     #	user    20m2.530s
     #	sys     4m40.120s
 
 
     ssh hgwdev
     mkdir /cluster/data/mm5/bed/blastz.hg17/axtBest
     cd /cluster/data/mm5/bed/blastz.hg17/axtBest
     ln -s ../axtNet/chr*.axt .
 
     # copy net axt's to download area
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.hg17/axtNet
     mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtNet
     cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtNet
     cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtNet
     gzip *.axt
     # XXX - running 2004-07-13 14:18
     # add README.txt file to dir (use previous assembly's copy as template)
     #	32 minute gzip
 
     # Convert the axtBest files (symlinks to the net axt's) to psl, writing
     # pslBest/<chrom>_blastzBestHg17.psl for each chromosome (tcsh syntax).
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.hg17
     mkdir pslBest
     foreach a (axtBest/chr*.axt)
 	#	$a:t:r = file name without its directory and .axt extension
 	set c=$a:t:r
 	#	the progress message names the file actually written below
 	echo -n "processing $c.axt -> ${c}_blastzBestHg17.psl ..."
 	/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
 	    S1.len S2.len pslBest/${c}_blastzBestHg17.psl
 	echo "Done"
     end
 
     # Load tables
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.hg17/pslBest
     for I in chr*BestHg17.psl
     do
 	/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I}
 	echo "done ${I}"
     done
 
      # check results
     # featureBits mm5 blastzBestHg17
     #	1020692679 bases of 2615483787 (39.025%) in intersection
     # featureBits mm4 blastzBestHg16
     #	1030510540 bases of 2627444668 (39.221%) in intersection
 
     # Make /gbdb links and add them to the axtInfo table:
      mkdir -p /gbdb/mm5/axtBest/Hg17
      cd /gbdb/mm5/axtBest/Hg17
      ln -s /cluster/data/mm5/bed/blastz.hg17/axtNet/chr*.axt .
      cd /cluster/data/mm5/bed/blastz.hg17/axtNet
      rm -f axtInfoInserts.sql
      foreach f (/gbdb/mm5/axtBest/Hg17/chr*.axt)
        set chr=$f:t:r
        echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
                 VALUES ('hg17','Blastz Best in Genome','$chr','$f');" \
          >>! axtInfoInserts.sql
      end
     hgsql mm5 < ~/kent/src/hg/lib/axtInfo.sql
     #	table axtInfo may already exist, ignore create error.
     hgsql mm5 < axtInfoInserts.sql
 
 # MM5 TO CANFAM1 LIFTOVER CHAIN (DONE 1/7/05 Andy)
     ssh kolossus
     cd /cluster/data/mm5/bed/blastz.canFam1/axtChain
     mkdir net
     netSplit dog.net net 
     mkdir over
     for file in chain/*.chain; do
        chrom=`basename $file .chain`
        netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
        cat over/$chrom.over >> /cluster/data/mm5/bed/liftOver/mm5ToCanFam1.chain    
     done
     rm -rf over/
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath/mm5/liftOver
     cp /cluster/data/mm5/bed/liftOver/mm5ToCanFam1.chain .
     gzip mm5ToCanFam1.chain
     mkdir -p /gbdb/mm5/liftOver
     ln -s /cluster/data/mm5/bed/liftOver/mm5ToCanFam1.chain /gbdb/mm5/liftOver/mm5ToCanFam1.over.chain
     hgAddLiftOverChain -multiple mm5 canFam1
 
 # ADD CHAIN AND NET TO VSHG17 DOWNLOAD AREAS (DONE Sept. 8th, 2004, Heather)
     ssh hgwdev
     cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/all.chain.gz \
       /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.chain.gz
     cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/human.net.gz \
       /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.net.gz
     cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17
     md5sum *.gz */*.gz > md5sum.txt
     # Update the README.txt
 
 # LIFTOVER CHAIN TO MM6 (DONE 4/20/2005 Andy)
     ssh kkstore
     cd /cluster/data/mm6
     mkdir liftSplits/
     cat << _EOF_ > split.csh
 #!/bin/tcsh
 set liftDir = /cluster/data/mm6/liftSplits
 cd /cluster/data/mm6
 foreach n (\`ls ?{,?}/*.fa\`)
     set d = \$n:h
     set c = \$n:t:r
     echo \$c
     faSplit -lift=\$liftDir/lift/\$c.lft size /cluster/data/mm6/\$d/\$c.fa -oneFile 3000 \$liftDir/split/\$c
 end
 _EOF_
     chmod +x split.csh
     ./split.csh
     # kkstore not mounting /panasas ... weird.  
     ssh hgwdev
     cd /cluster/data/mm6
     cp -r liftSplits/ /panasas/store/mm6
     ssh kk 
     cd /cluster/data/mm5
     makeLoChain-align mm5 /scratch/mus/mm5/softNib \
                     mm6 /panasas/store/mm6/liftSplits/split
         # Created parasol job in bed/blat.mm6.2005-04-20/run
     cd bed/blat.mm6.2005-04-20/run/
     para create spec
     para push
     # para time was complicated by the fact I redid some hippos (mostly chrUn_random
     # alignments) on kk9.  Basically, it took about a day.
     # In the end, the chrUn_random vs. chrUn_random just took wayyyyyy too long.
     # Later, if a more rigorous chain file is desired, it can be made after rerunning 
     # that blat.
 
     # Lifting
     ssh kksilo
     cd /cluster/data/mm5/bed/blat.mm6
     makeLoChain-lift mm5 mm6 /panasas/store/mm6/liftSplits/lift \
                         > lift.log &
     tail -f lift.log
     # OK so I remember this problem with makeLoChain-lift: it always stops with chr1.
     # I'll just do it manually.
     cd raw/
     for nib in `ls /cluster/data/mm6/nib`; do
        chrom=${nib%.nib}
        echo $chrom
        liftUp -pslQ ../psl/${chrom}.psl /panasas/store/mm6/liftSplits/lift/${chrom}.lft warn chr*_${chrom}.psl
        echo done $chrom
     done    
 
     # Chaining step of the mm5 -> mm6 liftOver build, run on kk9.
     ssh kk9
     cd /cluster/data/mm5/bed
     # NOTE(review): presumably makeLoChain-chain looks for a blat.mm6.<date>
     # directory named with the date it is run on, hence this dated alias
     # for the 2005-04-20 run directory -- confirm.
     ln -s blat.mm6.2005-04-20 blat.mm6.2005-04-22
     makeLoChain-chain mm5 /cluster/data/mm5/nib mm6 /cluster/data/mm6/nib
     # NOTE(review): this path names blat.mm5.2005-02-08, which does not match
     # the blat.mm6.* directories used everywhere else in this section; it may
     # have been copied from another build's notes.  Confirm before reuse.
     cd /cluster/data/mm5/bed/blat.mm5.2005-02-08/chainRun
     para try
     para check
     para push
     para time
 #Completed: 40 of 40 jobs
 #CPU time in finished jobs:      27315s     455.25m     7.59h    0.32d  0.001 y
 #IO & Wait Time:                 67093s    1118.22m    18.64h    0.78d  0.002 y
 #Average job time:                2360s      39.34m     0.66h    0.03d
 #Longest running job:                0s       0.00m     0.00h    0.00d
 #Longest finished job:           11656s     194.27m     3.24h    0.13d
 #Submission to last job:         31329s     522.15m     8.70h    0.36d
     # That looks weird but I think it was because 8 jobs crashed because there was no disk space.
     # I freed up some space but then there wasn't much room for the netting stage.
     # It crashed twice when I tried it using the script makeLoChain-net after the 
     # chainMergeSort/split.  I figured out that it needed more memory.  So I ran it manually on
     # kolossus
     ssh kolossus
     mkdir -p /tmp/andy
     cd /tmp/andy
     cp -r /cluster/data/mm5/bed/blat.mm6/chainRaw .
     rm -rf /cluster/data/mm5/bed/blat.mm6/chainRaw
     mkdir chain
     chainMergeSort chainRaw/*.chain | chainSplit chain stdin
     mkdir net over
     cd chain
     for c in *.chain; do
        echo ${c%.chain}; 
        chainNet $c /cluster/data/mm5/chrom.sizes \
         /cluster/data/mm6/chrom.sizes ../net/${c%.chain}.net /dev/null
        echo done $c
     done
     for chain in *; do 
        c=${chain%.chain}
        netChainSubset ../net/$c.net $chain ../over/$c.over 
     done
     cd ../over/
     cat * >> ../mm5ToMm6.chain
     cd ../
     cp mm5ToMm6.chain /cluster/data/mm5/bed/liftOver/
     cd /cluster/data/mm5/bed/liftOver
     mv mm5ToMm6.chain mm5ToMm6.over.chain
     ssh hgwdev
     ln -s /cluster/data/mm5/bed/liftOver/mm5ToMm6.over.chain /gbdb/mm5/liftOver/mm5ToMm6.over.chain
     hgAddLiftOverChain mm5 mm6 /gbdb/mm5/liftOver/mm5ToMm6.over.chain
     cd /usr/local/apache/htdocs/goldenPath/mm5/liftOver
     cp /gbdb/mm5/liftOver/mm5ToMm6.over.chain .
     gzip mm5ToMm6.over.chain
 
 # MAKING HUMAN SYNTENY (DONE - 2004-07-13 - Hiram)
 
 ssh hgwdev
 mkdir /cluster/data/mm5/bed/syntenyHg17
 cd /cluster/data/mm5/bed/syntenyHg17
 
 # Copy all the needed scripts from /cluster/data/hg17/bed/syntenyRn3
 cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl .
 
 ./syntenicBest.pl -db=mm5 -table=blastzBestHg17 > synBest.out 2>&1
 ./smooth.pl > smooth.out 2>&1
 ./joinsmallgaps.pl > joingaps.out 2>&1
 ./fillgap.pl -db=mm5 -table=blastzBestHg17 > fillgap.out 2>&1
 ./synteny2bed.pl > syn2bed.out 2>&1
 
     #	The five commands above
     #	real    168m43.627s
     #	user    0m18.680s
     #	sys     0m4.990s
 
 #	Used to load this in syntenyHg17, but that type is misleading to
 #	the table browser and fails the checkTableCoords check.
 #	Better to use this ensRatMusHom type:
 #	Need a new name here for the Hg17 to not conflict with the
 #	others
 sed -e 's/ensPhusionBlast/ensRatMusHg17/g' \
       $HOME/kent/src/hg/lib/ensPhusionBlast.sql \
       > ensRatMusHg17.sql
 hgLoadBed mm5 ensRatMusHg17 ucsc100k.bed -sqlTable=ensRatMusHg17.sql
 
     # featureBits mm5 ensRatMusHg17
     #	2366463967 bases of 2615483787 (90.479%) in intersection
     # featureBits mm4 syntenyHg16
     #	2299774191 bases of 2627444668 (87.529%) in intersection
 
 # MAKING MOUSE AXTTIGHT FROM AXTBEST (DONE - 2004-07-13 - Hiram)
     # After creating axtBest alignments above, use subsetAxt to get axtTight:
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.hg17/axtNet
     mkdir -p ../axtTight
     bash	#	for tcsh users
     for I in *.axt
     do
       echo "axtNet/$I -> ../axtTight/$I"
       subsetAxt  $I ../axtTight/$I \
 	~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
     done
     # exit bash if you are tcsh
     #	An 8 minute job
 
     # translate to psl
     cd ../axtTight
     mkdir ../pslTight
     bash	#	for tcsh users
     for I in *.axt
     do
       C=${I/.axt/}
       axtToPsl $I ../S1.len ../S2.len ../pslTight/${C}_blastzTightHg17.psl
       echo "Done: $I -> ${C}_blastzTightHg17.psl"
     done
     # exit bash if you are tcsh
 
     # Load tables into database
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.hg17/pslTight
     for I in chr*TightHg17.psl
     do
 	/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I}
 	echo "done ${I}"
     done
 
     #	Compare results with previous assembly:
     #	featureBits mm5 blastzTightHg17
     #	168148800 bases of 2615483787 (6.429%) in intersection
     #	featureBits mm4 blastzTightHg16
     #	170163839 bases of 2627444668 (6.476%) in intersection
 
     # copy  axt's to download area
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.hg17/axtTight
     mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtTight
     cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtTight
     cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17/axtTight
     gzip *.axt
     # add README.txt file to dir (use previous assembly's copy as template)
     #	4 minute gzip
 
 #### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-07-13 - Fan)
 
 # PLEASE NOTE THAT THE ENSEMBLXREF3 TABLE IS BUILT USING ENSMART DATA OF MOUSE BUILD 32.
 # THIS TABLE IS NEEDED TO SUPPORT SUPERFAMILY TRACK OF THE PROTEOME BROWSER.
 # WHEN ENSEMBL FINISHES THEIR MOUSE BUILD 33 RELEASE, WE NEED TO REBUILD THIS
 # TABLE.
     # Get the ensembl gene/protein cross-reference data from
     # http://www.ensembl.org/Multi/martview?species=Mus_musculus
     # Follow this sequence through the pages:
     # Page 1) Make sure that the Mus musculus choice is selected. Hit next.
     # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
     # Page 3) Choose the "Feature" box, select Ensembl gene, transcript, and peptide IDs,
     #         SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC
     # Page 4) Choose "Text, tab separated".  choose gzip compression.  hit export.
     # Save as ensXref.tsv (gunzip it first if it was exported compressed)
 
     sed ensXref.tsv -e 's/\./\t/g' > ensemblXref3.tab
 
     hgsql mm5 -e "drop table ensemblXref3"
     hgsql mm5 < ~/src/hg/lib/ensemblXref3.sql
 
     hgsql mm5 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines'
 
 # CPGISLANDS (DONE - 2004-07-13 - Fan)
     ssh hgwdev
     mkdir -p /cluster/data/mm5/bed/cpgIsland
     cd /cluster/data/mm5/bed/cpgIsland
 
     # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
     cvs co hg3rdParty/cpgIslands
     cd hg3rdParty/cpgIslands
     make
     #	gcc readseq.c cpg_lh.c -o cpglh.exe
     mv cpglh.exe /cluster/data/mm5/bed/cpgIsland/
     
     # cpglh.exe requires hard-masked (N) .fa's.  
     # There may be warnings about "bad character" for IUPAC ambiguous 
     # characters like R, S, etc.  Ignore the warnings.  
     ssh kksilo
     cd /cluster/data/mm5/bed/cpgIsland
     foreach f (../../*/chr*.fa.masked)
       set fout=$f:t:r:r.cpg
       echo running cpglh on $f to $fout
       ./cpglh.exe $f > $fout
     end
     #	the warnings:
     # Bad char 0x52 = 'R' at line 117472, base 5873535, sequence chr14
     # Bad char 0x53 = 'S' at line 120651, base 6032462, sequence chr14
     # Bad char 0x53 = 'S' at line 120652, base 6032546, sequence chr14
     #	real    21m47.823s
     #	user    18m30.810s
     #	sys     1m13.420s
 
     # Transform cpglh output to bed +
     cat << '_EOF_' > filter.awk
 {
 $2 = $2 - 1;
 width = $3 - $2;
 printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
        $1, $2, $3, $5,$6, width,
        $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
 }
 '_EOF_'
     # << this line makes emacs coloring happy
     awk -f filter.awk chr*.cpg > cpgIsland.bed
 
     ssh hgwdev
     cd /cluster/data/mm5/bed/cpgIsland
     hgLoadBed mm5 cpgIslandExt -tab -noBin \
       -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
     
     # Reading cpgIsland.bed
     # Loaded 16238 elements of size 10
     # Sorted
     # Saving bed.tab
     # Loading mm5
 
 # MAKE DOWNLOADABLE SEQUENCE FILES (DONE 2004-07-14 Fan)
     ssh kksilo
     cd /cluster/data/mm5
 
     # Build the .zip files
     cp /cluster/data/rn3/jkStuff/zipAll.sh jkStuff
     # edit this zipAll.sh to produce output to /cluster/data/mm5/bigZips
     jkStuff/zipAll.sh > zipAll.log
     #	bash:	./jkStuff/zipAll.sh > zipAll.log 2>&1 &
     tail -f zipAll.log
 
     mkdir zip
     mv *.zip zip
     cd zip
     # Look at zipAll.log to make sure all file lists look reasonable.
     # Check zip file integrity:
     foreach f (*.zip)
       unzip -t $f > $f.test
       tail -1 $f.test
     end
 
     wc -l *.zip.test
     # 46 chromAgp.zip.test
     # 45 chromFa.zip.test
     # 45 chromFaMasked.zip.test
     # 45 chromOut.zip.test
     # 45 chromTrf.zip.test
     # 641 contigAgp.zip.test
     # 641 contigFa.zip.test
     # 641 contigFaMasked.zip.test
     # 641 contigOut.zip.test
     # 641 contigTrf.zip.test
     #3431 total
 
     ssh hgwdev
     cd /cluster/data/mm5/jkStuff
     # create generic copy program
     cat << '_EOF_' > cpToWeb.sh
 #!/bin/sh
 #	cpToWeb.sh - publish sequence files to a goldenPath download directory
 #	usage: cpToWeb.sh <goldenPath download directory>
 #	Run from the directory that holds the big *.zip files; the
 #	per-chromosome directories are read as ../?/ and ../??/ .
 #	Zips each chromosome .fa into <GP>/chromosomes and copies every
 #	*.zip in the current directory into <GP>/bigZips.
 #	Exits 255 with a usage message unless given exactly one argument.
 if [ $# -ne 1 ]; then
 	echo "usage: cpToWeb.sh <goldenPath download directory>"
 	#	printf rather than "echo -e": not every /bin/sh echo accepts -e
 	printf '\texample: cpToWeb.sh mm5\n'
 	exit 255
 fi
 GP="/usr/local/apache/htdocs/goldenPath/$1"
 mkdir -p "${GP}"
 mkdir -p "${GP}/chromosomes"
 for f in ../?/*.fa ../??/*.fa
 do
     #	a pattern that matched nothing is left as a literal; skip it
     [ -f "${f}" ] || continue
     BN=$(basename "${f}")
     zip -j "${GP}/chromosomes/${BN}.zip" "${f}"
     echo "zipped: ${BN}"
 done
 mkdir -p "${GP}/bigZips"
 for Z in *.zip
 do
 	#	same guard for the case where no *.zip exists here
 	[ -f "${Z}" ] || continue
 	cp -p "${Z}" "${GP}/bigZips"
 	echo "copied: ${Z}"
 done
 '_EOF_'
     # << this line keeps emacs coloring happy
     chmod +x cpToWeb.sh
     cd /cluster/data/mm5/zip
     ../jkStuff/cpToWeb.sh mm5
     cd /usr/local/apache/htdocs/goldenPath/mm5
     # Take a look at bigZips/* and chromosomes/*, update their README.txt's
 
     # Make the upstream sequence files.
     # NOTE: must be redone due to bad gap track
     cd bigZips
     featureBits mm5 refGene:upstream:1000 -fa=upstream1000.fa
     zip upstream1000.zip upstream1000.fa
     rm upstream1000.fa
     featureBits mm5 refGene:upstream:2000 -fa=upstream2000.fa
     zip upstream2000.zip upstream2000.fa
     rm upstream2000.fa
     featureBits mm5 refGene:upstream:5000 -fa=upstream5000.fa
     zip upstream5000.zip upstream5000.fa
     rm upstream5000.fa
     # mrna zips -- auto dump process takes care of this
 
 
 # MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN (DONE 7/15/04 angie)
     # In an email 2/13/04, Arian said we could treat all human repeats as 
     # lineage-specific for human-chicken blastz.  Do the same for mouse.  
     # Scripts expect *.out.spec filenames, so set that up:
     ssh kkr1u00
     cd /cluster/data/mm5
     mkdir /iscratch/i/mus/mm5/linSpecRep.notInChicken
     foreach f (/iscratch/i/mus/mm5/rmsk/chr*.fa.out)
       cp -p $f /iscratch/i/mus/mm5/linSpecRep.notInChicken/$f:t:r:r.out.spec
     end
     iSync
     # Use these the next time we run mouse-chicken blastz.
 
 
 # BLASTZ CHICKEN (GALGAL2) (DONE 7/19/04 angie)
     ssh kk
     mkdir /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
     ln -s blastz.galGal2.2004-07-15 /cluster/data/mm5/bed/blastz.galGal2
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
     # Use human-chicken params: set L=10000 (higher threshold on blastz's 
     # outer loop) and abridge repeats.
     cat << '_EOF_' > DEF
 # mouse vs. chicken
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
 
 ALIGN=blastz-run
 BLASTZ=blastz
 
 # Specific settings for chicken (per Webb email to Brian Raney)
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=10000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse
 SEQ1_DIR=/scratch/mus/mm5/softNib
 SEQ1_RMSK=
 SEQ1_FLAG=
 SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInChicken
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Chicken
 SEQ2_DIR=/iscratch/i/galGal2/nib
 SEQ2_RMSK=
 SEQ2_FLAG=
 SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm5/bed/blastz.galGal2.2004-07-15
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     # first cluster run: raw blastz alignments
     ssh kk
     bash # if a csh/tcsh user
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
     source DEF
     mkdir $RAW run.0
     /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
     sh ./xdir.sh
     cd run.0
     sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
     para create jobList
     para try, check, push, check, ....
 #Completed: 51491 of 51491 jobs
 #Average job time:                 357s       5.95m     0.10h    0.00d
 #Longest job:                     1015s      16.92m     0.28h    0.01d
 #Submission to last job:         89841s    1497.35m    24.96h    1.04d
 
     # second cluster run: lift raw alignments -> lav dir
     ssh kki
     bash # if a csh/tcsh user
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
     source DEF
     mkdir run.1 lav
     /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
     cd run.1
     wc -l jobList
     para create jobList
     para try, check, push, etc ...
 #Completed: 341 of 341 jobs
 #Average job time:                  11s       0.18m     0.00h    0.00d
 #Longest job:                       55s       0.92m     0.02h    0.00d
 #Submission to last job:           245s       4.08m     0.07h    0.00d
 
     # third run: lav -> axt
     # NOTE: use axtRescore here because we used a non-default BLASTZ_Q matrix 
     # and abridged repeats (Penn State's restore_rpts program rescores with 
     # default matrix, oops).
     ssh kki
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
     mkdir axtChrom pslChrom run.2
     cd run.2
     cat << '_EOF_' > do.csh
 #!/bin/csh -ef
 cd $1
 set chr = $1:t
 set path = (/cluster/bin/x86_64 $path)
 cat `ls -1 *.lav | sort -g` \
 | lavToAxt stdin \
     /iscratch/i/mus/mm5/softNib /iscratch/i/galGal2/nib stdout \
 | axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q stdin stdout \
 | axtSort stdin ../../axtChrom/$chr.axt 
 axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
   ../../pslChrom/$chr.psl
 '_EOF_'
     # << this line keeps emacs coloring happy
     chmod a+x do.csh
     cp /dev/null jobList
     foreach d (../lav/chr*)
       echo "do.csh $d" >> jobList
     end
     para create jobList
     para try, check, push, check
 #Completed: 43 of 43 jobs
 #Average job time:                  38s       0.63m     0.01h    0.00d
 #Longest job:                      160s       2.67m     0.04h    0.00d
 #Submission to last job:           233s       3.88m     0.06h    0.00d
 
 
 # CHAIN CHICKEN BLASTZ (DONE 7/19/04 angie)
     # Run axtChain on little cluster
     ssh kki
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
     mkdir -p axtChain/run1
     cd axtChain/run1
     mkdir out chain
     ls -1S /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChrom/*.axt \
       > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
 
     cat << '_EOF_' > doChain
 #!/bin/csh
 axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
          -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \
          -minScore=5000 $1 \
     /iscratch/i/mus/mm5/softNib \
     /iscratch/i/galGal2/nib $2 > $3
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x doChain
     gensub2 input.lst single gsub jobList
     para create jobList
     para try, check, push, check...
 #Completed: 43 of 43 jobs
 #Average job time:                  60s       1.00m     0.02h    0.00d
 #Longest job:                      355s       5.92m     0.10h    0.00d
 #Submission to last job:           355s       5.92m     0.10h    0.00d
 
     # now on the file server, sort chains
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
     chainMergeSort run1/chain/*.chain > all.chain
     chainSplit chain all.chain
     rm run1/chain/*.chain
 
     # take a look at score distr's
     foreach f (chain/*.chain)
       grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r
       textHistogram -binSize=5000 /tmp/score.$f:t:r
       echo ""
     end
 
     # Load chains into database
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain/chain
     foreach i (*.chain)
         set c = $i:r
         echo loading $c
         hgLoadChain mm5 ${c}_chainGalGal2 $i
     end
     featureBits mm5 chainGalGal2Link
 #78951466 bases of 2615483787 (3.019%) in intersection
     featureBits hg17 chainGalGal2Link
 #103882699 bases of 2866216770 (3.624%) in intersection
 
 
 # NET CHICKEN BLASTZ (DONE 7/19/04 angie)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
     chainPreNet all.chain ../S1.len ../S2.len stdout \
     | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
     | netSyntenic stdin noClass.net
 
     # Add classification info using db tables:
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
     netClass -noAr noClass.net mm5 galGal2 chicken.net
 
     # Make a 'syntenic' subset:
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
     rm noClass.net
     # Make a 'syntenic' subset of these with
     netFilter -syn chicken.net > chickenSyn.net
 
     # Load the nets into database 
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
     netFilter -minGap=10 chicken.net |  hgLoadNet mm5 netGalGal2 stdin
     netFilter -minGap=10 chickenSyn.net | hgLoadNet mm5 syntenyNetGalGal2 stdin
     # Add entries for chainGalGal2, netGalGal2, syntenyNetGalGal2 to 
     # mouse/mm5 trackDb
 
 
 # GENERATE GALGAL2 MAF FOR MULTIZ FROM NET (DONE 7/19/04 angie)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
     netSplit chicken.net net
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
     mkdir axtNet
     foreach f (axtChain/net/*)
       set chr = $f:t:r
       netToAxt $f axtChain/chain/$chr.chain /cluster/data/mm5/mixedNib \
         /cluster/data/galGal2/nib stdout \
       | axtSort stdin axtNet/$chr.axt
     end
     mkdir mafNet
     foreach f (axtNet/chr*.axt)
       set maf = mafNet/$f:t:r.maf
       axtToMaf $f \
             /cluster/data/mm5/chrom.sizes /cluster/data/galGal2/chrom.sizes \
             $maf -tPrefix=mm5. -qPrefix=galGal2.
     end
 
 # XENOPUS BLASTZ/CHAIN/NET (DONE 9/24/04 jk)
 # see makeXenTro1.doc and search for zb.mm5
 # The results of this are also symlinked under mm5/bed
 
 # MAKE VSGALGAL2 DOWNLOADABLES (DONE 7/19/04 angie)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
     gzip axtNet/*.axt
     cd /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChain
     ln all.chain chicken.chain
     zip /cluster/data/mm5/zip/chicken.chain.zip chicken.chain
     rm chicken.chain
     zip /cluster/data/mm5/zip/chicken.net.zip chicken.net
     zip /cluster/data/mm5/zip/chickenSyn.net.zip chickenSyn.net
 
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsGalGal2
     cd /usr/local/apache/htdocs/goldenPath/mm5/vsGalGal2
     mv /cluster/data/mm5/zip/chicken*.zip .
     cp -pR /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtNet .
     md5sum *.zip axtNet/* > md5sum.txt
     # Copy over & edit README.txt w/pointers to chain, net formats.
 
 
 # EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 7/15/04 angie)
     ssh kkr1u00
     cd /cluster/bluearc/scratch/mus/mm5/rmsk
     # Run Arian's DateRepsinRMoutput.pl to add extra columns telling 
     # whether repeats in -query are also expected in -comp species.  
     # Even though we already have the mouse-human linSpecReps,
     # extractLinSpecReps requires two columns of DateRepsinRMoutput.pl
     # additions.  So add human, then ignore it.  
     # Dog in extra column 1, Human in extra column 2
     foreach outfl ( *.out )
         echo "$outfl"
         /cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \
           ${outfl} -query mouse -comp dog -comp human
     end
     # Now extract dog (extra column 1), ignore human.
     cd /iscratch/i/mus/mm5
     mkdir linSpecRep.notInDog
     foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/*.out_dog_hum)
         set base = $f:t:r:r
         echo $base.out.spec
         /cluster/bin/scripts/extractLinSpecReps 1 $f > \
                         linSpecRep.notInDog/$base.out.spec
     end
     # Clean up.
     rm /cluster/bluearc/scratch/mus/mm5/rmsk/*.out_dog_hum
     iSync
 
 
 # BLASTZ DOG (CANFAM1) (DONE 7/16/04 angie)
     ssh kk
     mkdir /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
     ln -s blastz.canFam1.2004-07-15 /cluster/data/mm5/bed/blastz.canFam1
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
     # Use default (Human-Mouse) settings for starters.
     cat << '_EOF_' > DEF
 # mouse vs. dog
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
 
 ALIGN=blastz-run
 BLASTZ=blastz
 
 # Default
 BLASTZ_H=2000
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse
 SEQ1_DIR=/scratch/mus/mm5/softNib
 SEQ1_RMSK=
 SEQ1_FLAG=
 SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInDog
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Dog
 SEQ2_DIR=/scratch/hg/canFam1/nib
 SEQ2_RMSK=
 SEQ2_FLAG=
 SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInMouse
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm5/bed/blastz.canFam1.2004-07-15
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     # first cluster run: raw blastz alignments
     ssh kk
     bash # if a csh/tcsh user
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
     source DEF
     mkdir $RAW run.0
     /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
     sh ./xdir.sh
     cd run.0
     sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
     para create jobList
     para try, check, push, check, ....
     # cluster was mobbed...
 #Completed: 93775 of 93775 jobs
 #Average job time:                 187s       3.11m     0.05h    0.00d
 #Longest job:                     3907s      65.12m     1.09h    0.05d
 #Submission to last job:         76763s    1279.38m    21.32h    0.89d
 
     # second cluster run: lift raw alignments -> lav dir
     ssh kki
     bash # if a csh/tcsh user
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
     source DEF
     mkdir run.1 lav
     /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
     cd run.1
     wc -l jobList
     para create jobList
     para try, check, push, etc ...
 #Completed: 341 of 341 jobs
 #Average job time:                  98s       1.63m     0.03h    0.00d
 #Longest job:                      281s       4.68m     0.08h    0.00d
 #Submission to last job:          2102s      35.03m     0.58h    0.02d
 
     # third run: lav -> axt
     # (if non-default BLASTZ_Q is used in the future, put axtRescore in 
     # the pipe after lavToAxt)
     ssh kki
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
     mkdir axtChrom pslChrom run.2
     cd run.2
     cat << '_EOF_' > do.csh
 #!/bin/csh -ef
 cd $1
 set chr = $1:t
 cat `ls -1 *.lav | sort -g` \
 | $HOME/bin/x86_64/lavToAxt stdin \
     /iscratch/i/mus/mm5/softNib /iscratch/i/canFam1/nib stdout \
 | $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt 
 $HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
   ../../pslChrom/$chr.psl
 '_EOF_'
     # << this line keeps emacs coloring happy
     chmod a+x do.csh
     cp /dev/null jobList
     foreach d (../lav/chr*)
       echo "do.csh $d" >> jobList
     end
     para create jobList
     para try, check, push, check
 #Completed: 43 of 43 jobs
 #Average job time:                 671s      11.18m     0.19h    0.01d
 #Longest job:                     2398s      39.97m     0.67h    0.03d
 #Submission to last job:          2417s      40.28m     0.67h    0.03d
 
 
 # CHAIN DOG BLASTZ (DONE 7/16/04 angie)
     # Run axtChain on little cluster
     ssh kki
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
     mkdir -p axtChain/run1
     cd axtChain/run1
     mkdir out chain
     ls -1S /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChrom/*.axt \
       > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
 
     cat << '_EOF_' > doChain
 #!/bin/csh
 axtChain $1 \
     /iscratch/i/mus/mm5/softNib \
     /iscratch/i/canFam1/nib $2 > $3
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x doChain
     gensub2 input.lst single gsub jobList
     para create jobList
     para try, check, push, check...
 #Completed: 43 of 43 jobs
 #Average job time:                 537s       8.96m     0.15h    0.01d
 #Longest job:                     2071s      34.52m     0.58h    0.02d
 #Submission to last job:          2071s      34.52m     0.58h    0.02d
     # now on the cluster server, sort chains
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
     chainMergeSort run1/chain/*.chain > all.chain
     chainSplit chain all.chain
     rm run1/chain/*.chain
 
     # take a look at score distr's
     foreach f (chain/*.chain)
       grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r
       textHistogram -binSize=5000 /tmp/score.$f:t:r
       echo ""
     end
 
     # Lots of chaff with scores in the 3000's.  Many very-high-scoring 
     # chains.  So filter the chain down somewhat...
     mv all.chain all.chain.unfiltered
     chainFilter -minScore=5000 all.chain.unfiltered > all.chain
     rm chain/*
     chainSplit chain all.chain
     gzip all.chain.unfiltered
 
     # Load chains into database
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain/chain
     foreach i (*.chain)
         set c = $i:r
         hgLoadChain mm5 ${c}_chainCanFam1 $i
     end
     # mouse-dog gets significantly less coverage than human-dog:
     featureBits mm5 -chrom=chr1 chainCanFam1Link
 #63386139 bases of 185739816 (34.126%) in intersection
     featureBits hg17 -chrom=chr1 chainCanFam1Link
 #123999291 bases of 222827847 (55.648%) in intersection
     # mouse-dog isn't a whole lot less than mouse-human though:
     featureBits mm5 -chrom=chr1 chainHg17Link
 #75492250 bases of 185739816 (40.644%) in intersection
 
 
 # NET DOG BLASTZ (DONE 7/16/04 angie)
     ssh kolossus
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
     chainPreNet all.chain ../S1.len ../S2.len stdout \
     | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
     | netSyntenic stdin noClass.net
 
     # Add classification info using db tables:
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
     netClass -noAr noClass.net mm5 canFam1 dog.net
 
     # Make a 'syntenic' subset:
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
     rm noClass.net
     # Keep only the syntenic nets, using netFilter -syn:
     netFilter -syn dog.net > dogSyn.net
 
     # Load the nets into database 
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
     netFilter -minGap=10 dog.net |  hgLoadNet mm5 netCanFam1 stdin
     netFilter -minGap=10 dogSyn.net | hgLoadNet mm5 syntenyNetCanFam1 stdin
     # Add entries for chainCanFam1, netCanFam1 to mouse/mm5 trackDb
 
 
 # MAKE VSCANFAM1 DOWNLOADABLES (DONE 7/19/04 angie)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
     gzip axtNet/chr*.axt
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
     ln all.chain dog.chain
     zip /cluster/data/mm5/zip/dog.chain.zip dog.chain
     rm dog.chain
     zip /cluster/data/mm5/zip/dog.net.zip dog.net
     zip /cluster/data/mm5/zip/dogSyn.net.zip dogSyn.net
 
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsCanFam1
     cd /usr/local/apache/htdocs/goldenPath/mm5/vsCanFam1
     mv /cluster/data/mm5/zip/dog*.zip .
     cp -pR /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtNet .
     md5sum *.zip axtNet/* > md5sum.txt
     # Copy over & edit README.txt w/pointers to chain, net formats.
 
 
 # GENERATE CANFAM1 MAF FOR MULTIZ FROM NET (DONE 7/19/04 angie)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15/axtChain
     netSplit dog.net net
     cd /cluster/data/mm5/bed/blastz.canFam1.2004-07-15
     mkdir axtNet
     foreach f (axtChain/net/*)
       set chr = $f:t:r
       netToAxt $f axtChain/chain/$chr.chain /cluster/data/mm5/nib \
         /cluster/data/canFam1/nib stdout \
       | axtSort stdin axtNet/$chr.axt
     end
     mkdir mafNet
     foreach f (axtNet/chr*.axt)
       set maf = mafNet/$f:t:r.maf
       axtToMaf $f \
             /cluster/data/mm5/chrom.sizes /cluster/data/canFam1/chrom.sizes \
             $maf -tPrefix=mm5. -qPrefix=canFam1.
     end
 
 
 ### MAKE THE affyU74 TRACK - needed for the Gene Sorter 
 #                              (DONE - 2004-07-16 - Fan)
 # MAKE THE affyU74 TRACK using Affy consensus sequences instead of 
 # target sequences. Recalculate alignments and load data
 ----------------------------------
 # Load up semi-local disk with target sequences for Affy mouse U74 chips.
 ssh kkr1u00
 mkdir -p /iscratch/i/affy
 #	This /projects filesystem is not available on kkr1u00
 #	but it is on kk
 ssh kk
 cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy
 
 ssh kkr1u00
 iSync
 
 # Run cluster job to do alignments
 ssh kk
 mkdir /cluster/data/mm5/bed/affyU74.2004-07-16
 cd /cluster/data/mm5/bed/affyU74.2004-07-16
 mkdir run
 cd run
 mkdir psl
 echo /scratch/mus/mm5/maskedContigs/*.fa | wordLine stdin > genome.lst
 ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst
 cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc  {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
 # << this line makes emacs coloring happy
 
 gensub2 genome.lst affy.lst gsub jobList
 para create jobList
 para try
 # do usual para check/para push etc. until the job is done. 
 
 # Completed: 1917 of 1917 jobs
 # CPU time in finished jobs:      14240s     237.34m     3.96h    0.16d  0.000 y
 # IO & Wait Time:                  7946s     132.43m     2.21h    0.09d  0.000 y
 # Average job time:                  12s       0.19m     0.00h    0.00d
 # Longest job:                       40s       0.67m     0.01h    0.00d
 # Submission to last job:           307s       5.12m     0.09h    0.00d
 
 # Do sort, best in genome filter, and convert to chromosome coordinates
 # to create affyU74.psl.
 ssh kksilo
 cd /cluster/data/mm5/bed/affyU74.2004-07-16/run
 pslSort dirs raw.psl tmp psl
 
 # change filter parameters for these sequences. only use alignments that
 # cover 30% of sequence and have at least minAli = 0.95.
 # minAli = 0.97 too high. low minCover as a lot of n's in these sequences
 pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
 
 # Processed 44630 alignments
 liftUp ../all_affyU74.psl ../../../jkStuff/liftAll.lft warn contig.psl
 
 # Sort by chromosome and load into database.
 ssh hgwdev
 cd /cluster/data/mm5/bed/affyU74.2004-07-16
 pslSortAcc nohead chrom temp all_affyU74.psl
 cat chrom/*.psl > affyU74.psl
 # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
 # and reload data into table
 hgLoadPsl mm5 affyU74.psl
 rm -fr chrom temp run
 
 ##   MAKE THE affyGnfU74 TRACKs (DONE - 2004-07-18 - Fan)
 # Make bed files and load consensus sequences for Affy U74 chip set.
 # Fix broken symlinks to microarray data after directory structure changed
 # (DONE, 2005-05-03, hartera)
 ----------------------------------
 #This needs to be done after affyU74 is already made.
 ssh hgwdev
 mkdir -p /cluster/data/mm5/bed/affyGnf.2004-07-16
 cd /cluster/data/mm5/bed/affyGnf.2004-07-16
 #	may need to build this command in src/hg/affyGnf
 affyPslAndAtlasToBed ../affyU74.2004-07-16/affyU74.psl \
 	/projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \
 	affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2
 affyPslAndAtlasToBed ../affyU74.2004-07-16/affyU74.psl \
 	/projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \
 	affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2
 affyPslAndAtlasToBed ../affyU74.2004-07-16/affyU74.psl \
 	/projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \
 	affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2
 
 # edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
 
 # and reload data into table
 hgLoadBed mm5 affyGnfU74A affyGnfU74A.bed
 hgLoadBed mm5 affyGnfU74B affyGnfU74B.bed
 hgLoadBed mm5 affyGnfU74C affyGnfU74C.bed
 
 # Add in sequence data for U74 tracks.
 # Copy consensus sequence to /gbdb if it isn't already
 # [THE SYM LINKS WERE ALREADY DONE.]
     mkdir -p /gbdb/hgFixed/affyProbes
     cd /gbdb/hgFixed/affyProbes
     # fix broken symlinks after directory structure changed
     # /projects/compbiodata ----> /projects/compbio/data
     rm U74*
     # make correct symlinks (hartera, 2005-05-03)
     ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa .
     ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa .
     ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa .
 
     # used perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
     # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4.
     # reload sequences with prefix removed so acc matches name used in
     # other dependent tables
                                                     
     hgLoadSeq -abbr=U74Av2: mm5 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa
     hgLoadSeq -abbr=U74Bv2: mm5 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa
     hgLoadSeq -abbr=U74Cv2: mm5 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa
 
 ### GNF ATLAS 2  [DONE Fan 7/18/2004]
     # Align probes from GNF1M chip.
     ssh kk
     cd /cluster/data/mm5/bed
     mkdir -p geneAtlas2/run/psl
     cd geneAtlas2/run
     mkdir -p /cluster/bluearc/geneAtlas2
     cp /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /cluster/bluearc/geneAtlas2
     ls -1 /scratch/mus/mm5/maskedContigs/ > genome.lst
     ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst
     echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc  /scratch/mus/mm5/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub
     gensub2 genome.lst mrna.lst gsub spec
     para create spec
     para try
     para check
     para push
     para time
 # Completed: 639 of 639 jobs
 # CPU time in finished jobs:      58174s     969.57m    16.16h    0.67d  0.002 y
 # IO & Wait Time:                  4833s      80.55m     1.34h    0.06d  0.000 y
 # Average job time:                  99s       1.64m     0.03h    0.00d
 # Longest job:                      189s       3.15m     0.05h    0.00d
 # Submission to last job:          1749s      29.15m     0.49h    0.02d
     # Do sort, best in genome filter, and convert to chromosome coordinates
     # to create affyGnf1m.psl.
     pslSort dirs raw.psl tmp psl
     pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
     liftUp ../affyGnf1m.psl ../../../jkStuff/liftAll.lft warn contig.psl
     rm -r contig.psl raw.psl psl
 
     # Load probes and alignments from GNF1M into database.
     ssh hgwdev
     cd /cluster/data/mm5/bed/geneAtlas2
     ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes
     hgLoadPsl mm5 affyGnf1m.psl
     hgLoadSeq mm5 /gbdb/hgFixed/affyProbes/gnf1m.fa
 
     # Load up track
     hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \
     	affyGnf1m.psl
     # Note that the unmapped 5000 records are from all-N sequences.
     hgLoadBed mm5 gnfAtlas2 gnfAtlas2.bed
 
 # MOUSE AFFYMETRIX MOE430 TRACK (DONE, 2004-07-19, Fan)
     mkdir -p /projects/compbio/data/microarray/affyMouse
     # Download MOE430A and MOE430B consensus sequences from Affymetrix web site
     # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430
     unzip MOE430*_consensus.zip
 
     # check for duplicate probes: there are none, all have unique names
     # check for duplicate probes: 100 from 136745_at to 1367551_a_at
     # remove "consensus:" and ";" from FASTA headers to shorten probeset
     # names for database
 
     sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa
     sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa
  
     cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
        /cluster/bluearc/affy/
 
     # THE ABOVE WAS ALREADY DONE BY RACHEL 4/16/04.
 
     # Set up cluster job to align MOE430 consensus sequences to mm5
     ssh kkr1u00
     cd /cluster/data/mm5/bed
     mkdir -p affyMOE430
     cd affyMOE430
     mkdir -p /iscratch/i/affy
     cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy
     iSync
 
     ssh kk
     cd /cluster/data/mm5/bed/affyMOE430
     ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst
     ls -1 /scratch/mus/mm5/maskedContigs/ > allctg.lst
 
     echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc  /scratch/mus/mm5/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
 
     gensub2 allctg.lst affy.lst template.sub para.spec
     mkdir psl
     para create para.spec
     # Actually do the job with usual para try/check/push/time etc.
 # para time
 # Completed: 639 of 639 jobs
 # CPU time in finished jobs:      24369s     406.14m     6.77h    0.28d  0.001 y
 # IO & Wait Time:                  2263s      37.72m     0.63h    0.03d  0.000 y
 # Average job time:                  42s       0.69m     0.01h    0.00d
 # Longest job:                       63s       1.05m     0.02h    0.00d
 # Submission to last job:           671s      11.18m     0.19h    0.01d
 
 
     # Do sort, best in genome filter, and convert to chromosome coordinates
     # to create affyMOE430.psl
     pslSort dirs raw.psl tmp psl
 
     # only use alignments that cover 30% of sequence and have at least
     # 95% identity in aligned region. 
     # low minCover as a lot of n's in these sequences
     pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
     liftUp affyMOE430.psl ../../jkStuff/liftAll.lft warn contig.psl
 
     # Load alignments and sequences into database
     ssh hgwdev
     cd /cluster/data/mm5/bed/affyMOE430
     # shorten names in psl file
     sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
     mv affyMOE430.psl.bak affyMOE430.psl
 
     # load track into database
 
     hgLoadPsl mm5 affyMOE430.psl
     # 1 warning on loading: Blat error so that 1449824_at has a 
     # negative entry (-195) in the qBaseInsert field. 
     # Loading into the database forces this to 0.
  
     # Add consensus sequences for MOE430
     # Copy sequences to gbdb if they are not there already
     mkdir -p /gbdb/hgFixed/affyProbes
     ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
        /gbdb/hgFixed/affyProbes
 
     hgLoadSeq -abbr=MOE430 mm5 /gbdb/hgFixed/affyProbes/MOE430_all.fa
     
     # Clean up
     rm batch.bak contig.psl raw.psl 
     
     # BELOW TWO THINGS WERE DONE BY RACHEL ALREADY FOR MM4
     # add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/
     # add affyMOE430.html file and then do make alpha to add to trackDb table
 
 
 ######## MAKING GENE SORTER TABLES #######  (STARTED - 2004-07-15 - Hiram)
 # These are instructions for building the
 # Gene Sorter.  Don't start these until
 # there is a knownGene track and the affy tracks have been built.
 
 # Cluster together various alt-splicing isoforms.
 #	Creates the knownIsoforms and knownCanonical tables
 ssh hgwdev
 cd /tmp
 hgClusterGenes mm5 knownGene knownIsoforms knownCanonical
 #	You may need to build this binary in src/hg/near/hgClusterGenes
 #	Got 24603 clusters, from 41208 genes in 43 chromosomes
 #	featureBits mm5 knownCanonical
 #	853516995 bases of 2615483787 (32.633%) in intersection
 #	featureBits mm4 knownCanonical
 #	840021165 bases of 2627444668 (31.971%) in intersection
 #	featureBits mm3 knownCanonical
 #	825943052 bases of 2505900260 (32.960%) in intersection
 #	! ! ! Can not do featureBits on knownIsoforms
 
 # Extract peptides from knownGenes into fasta file
 # and create a blast database out of them.
 ssh hgwdev
 mkdir -p  /cluster/data/mm5/bed/geneSorter/blastp
 cd /cluster/data/mm5/bed/geneSorter/blastp
 pepPredToFa mm5 knownGenePep known.faa
 #	You may need to build this binary in src/hg/near/pepPredToFa
 /cluster/bluearc/blast229/formatdb -i known.faa -t known -n known
 
 # Copy over database to bluearc scratch
 mkdir /cluster/bluearc/scratch/mus/mm5/blastp
 cp -p /cluster/data/mm5/bed/geneSorter/blastp/known.* \
 	/cluster/bluearc/scratch/mus/mm5/blastp
 
 # Split up fasta file into bite sized chunks for cluster
 cd /cluster/data/mm5/bed/geneSorter/blastp
 mkdir split
 faSplit sequence known.faa 8000 split/kg
 
 # Make parasol run directory 
 ssh kk
 mkdir /cluster/data/mm5/bed/geneSorter/blastp/self
 cd /cluster/data/mm5/bed/geneSorter/blastp/self
 mkdir run
 cd run
 mkdir out
 
 # Make blast script
 cat  << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
 	-p blastp -d /cluster/bluearc/scratch/mus/mm5/blastp/known \
 	-i $1 -o $2 -e 0.01 -m 8 -b 1000
 '_EOF_'
     # << keep emacs happy
 chmod a+x blastSome
 
 # Make gensub2 file
 cat  << '_EOF_' > gsub
 #LOOP
 blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
 #ENDLOOP
 '_EOF_'
     # << keep emacs happy
 
 # Create parasol batch
 #	'ls ../../split/*.fa' is too much, hence the echo
 echo ../../split/*.fa | wordLine stdin > split.lst
 gensub2 split.lst single gsub jobList
 para create jobList
 para try
 para check
 para push ... etc ...
 # Completed: 7739 of 7739 jobs
 # CPU time in finished jobs:     120685s    2011.42m    33.52h    1.40d  0.004 y
 # IO & Wait Time:                 22722s     378.69m     6.31h    0.26d  0.001 y
 # Average job time:                  19s       0.31m     0.01h    0.00d
 # Longest job:                      147s       2.45m     0.04h    0.00d
 # Submission to last job:           705s      11.75m     0.20h    0.01d
 
 # Load into database.  This takes about an hour.
 ssh hgwdev
 cd /cluster/data/mm5/bed/geneSorter/blastp/self/run/out
 hgLoadBlastTab mm5 knownBlastTab *.tab
 # Scanning through 7739 files
     #	Loading database with 8017562 rows
     #	real    17m9.104s
     #	user    3m8.980s
     #	sys     0m28.800s
 
 # Create known gene mapping table and expression distance tables
 # for GNF Atlas 2.  (The hgExpDistance takes an hour.)
 # DONE (04-07-18 Fan)
 
 hgMapToGene mm5 affyGnf1m knownGene knownToGnf1m
 hgExpDistance mm5 hgFixed.gnfMouseAtlas2MedianRatio \
 	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance \
 	-lookup=knownToGnf1m
 
 
 # Create table that maps between known genes and RefSeq
 hgMapToGene mm5 refGene knownGene knownToRefSeq
 #	may need to build this command in src/hg/near/hgMapToGene
 
 # Create a table that maps between known genes and 
 # the nice affy expression data.
 hgMapToGene mm5 affyU74  knownGene knownToU74
 hgMapToGene mm5 affyMOE430 knownGene knownToMOE430
 hgMapToGene mm5 affyMOE430 -prefix=A: knownGene knownToMOE430A
 
 # Format and load Rinn et al sex expression data
 mkdir /cluster/data/mm5/bed/rinnSex
 cd !$
 hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \
     ../affyMOE430/affyMOE430.psl
 hgLoadBed mm5 rinnSex rinnSex.bed
 
 # Format and load the GNF data
 mkdir /cluster/data/mm5/bed/affyGnf95
 cd /cluster/data/mm5/bed/affyGnf95
 affyPslAndAtlasToBed -newType ../affyU95.psl \
 	/projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \
 	affyGnfU95.tab affyGnfU95Exps.tab -shortOut
 
 #	this .sql load was in preceding instructions, but this .sql file
 #	appears to not exist and it doesn't seem to be needed anyway.
 #	Everything below this seems to create tables OK.
 #  hgsql mm5 < ~/kent/src/hg/affyGnf/affyGnfU95.sql
 
 # Create table that gives distance in expression space between 
 # GNF genes.  These commands take about 15 minutes each
 #	The affyGnfU74?Exps arguments appear to be unused in hgExpDistance
 hgExpDistance mm5 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \
 	-lookup=knownToU74
 # Got 13593 unique elements in affyGnfU74A
 hgExpDistance mm5 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \
 	-lookup=knownToU74
 # Got 8512 unique elements in affyGnfU74B
 hgExpDistance mm5 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \
 	-lookup=knownToU74
 # Got 2318 unique elements in affyGnfU74C
 
 
 # C.ELEGANS BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
     # Make C. elegans ortholog column using blastp on wormpep.
     # First make C. elegans protein database and copy it to iscratch/i
     # if it doesn't exist already:
     ssh eieio
     mkdir /cluster/data/ce2/bed/blastp
     cd /cluster/data/ce2/bed/blastp
     # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
     # to find out the latest version.  Then use that in place of 128 below.
     wget -O wormPep128.faa \
       ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep128/wormpep128
     formatdb -i wormPep128.faa -t wormPep128 -n wormPep128
     ssh kkr1u00
     if (-e /iscratch/i/ce2/blastp) then
       rm -r /iscratch/i/ce2/blastp
     endif
     mkdir -p /iscratch/i/ce2/blastp
     cp /cluster/data/ce2/bed/blastp/wormPep128.p?? /iscratch/i/ce2/blastp
     iSync
 
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/mm5/bed/blastp/ce2/run/out
     cd /cluster/data/mm5/bed/blastp/ce2/run
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep128 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
 end
     chmod a+x blastSome
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
     # Create parasol batch
     ls -1S /cluster/store6/mm5/bed/geneSorter/blastp/split >split.lst
     #ls -1S ../../split/*.fa > split.lst
     #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
     gensub2 split.lst single gsub spec
     para create spec
     para try, check, push, check, ...
 # Completed: 7739 of 7739 jobs
 # CPU time in finished jobs:      54871s     914.51m    15.24h    0.64d  0.002 y
 # IO & Wait Time:                 26157s     435.95m     7.27h    0.30d  0.001 y
 # Average job time:                  10s       0.17m     0.00h    0.00d
 # Longest job:                       41s       0.68m     0.01h    0.00d
 # Submission to last job:           210s       3.50m     0.06h    0.00d
 
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastp/ce2/run/out
     hgLoadBlastTab mm5 ceBlastTab -maxPer=1 *.tab
 
 # HUMAN BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
     # Make human ortholog column using blastp on human known genes.
     # First make human protein database and copy it to iscratch/i
     # if it doesn't exist already:
     mkdir /cluster/data/hg17/bed/blastp
     cd /cluster/data/hg17/bed/blastp
     pepPredToFa hg17 knownGenePep known.faa
     formatdb -i known.faa -t known -n known
     ssh kkr1u00
     if (-e /iscratch/i/hg17/blastp) then
       rm -r /iscratch/i/hg17/blastp
     endif
     mkdir -p /iscratch/i/hg17/blastp
     cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp
     iSync
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/mm5/bed/blastp/hg17/run/out
     cd /cluster/data/mm5/bed/blastp/hg17/run
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /iscratch/i/blast/blastall -p blastp -d /iscratch/i/hg17/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
 end
     chmod a+x blastSome
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
     # Create parasol batch
     ls -1S /cluster/store6/mm5/bed/geneSorter/blastp/split >split.lst
     #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
     gensub2 split.lst single gsub spec
     para create spec
     para try, check, push, check, ...
 
 # Completed: 7739 of 7739 jobs
 # CPU time in finished jobs:     125830s    2097.17m    34.95h    1.46d  0.004 y
 # IO & Wait Time:                 22740s     379.00m     6.32h    0.26d  0.001 y
 # Average job time:                  19s       0.32m     0.01h    0.00d
 # Longest job:                      137s       2.28m     0.04h    0.00d
 # Submission to last job:           301s       5.02m     0.08h    0.00d
 
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastp/hg17/run/out
     hgLoadBlastTab mm5 hgBlastTab -maxPer=1 *.tab
 
 
 # ZEBRAFISH BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
     # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
     # First make protein database and copy it to iscratch/i
     # if it doesn't exist already:
     ssh kkstore
     mkdir /cluster/data/danRer1/bed/blastp
     cd /cluster/data/danRer1/bed/blastp
     wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH3.may.pep.fa.gz
     zcat Dan*.pep.fa.gz > ensembl.faa
     formatdb -i ensembl.faa -t ensembl -n ensembl
     ssh kkr1u00
     if (-e /iscratch/i/danRer1/blastp) then
       rm -r /iscratch/i/danRer1/blastp
     endif
     mkdir -p /iscratch/i/danRer1/blastp
     cp /cluster/data/danRer1/bed/blastp/ensembl.p?? /iscratch/i/danRer1/blastp
     iSync
     # THE ABOVE IS ALREADY DONE BY ANGIE
 
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/mm5/bed/blastp/danRer1/run/out
     cd /cluster/data/mm5/bed/blastp/danRer1/run
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer1/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
 end
     chmod a+x blastSome
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
     # Create parasol batch
     ls -1S /cluster/store6/mm5/bed/geneSorter/blastp/split >split.lst
     #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
     gensub2 split.lst single gsub spec
     para create spec
     para try, check, push, check, ...
 # Completed: 7739 of 7739 jobs
 # CPU time in finished jobs:      96773s    1612.89m    26.88h    1.12d  0.003 y
 # IO & Wait Time:                 29356s     489.26m     8.15h    0.34d  0.001 y
 # Average job time:                  16s       0.27m     0.00h    0.00d
 # Longest job:                       73s       1.22m     0.02h    0.00d
 # Submission to last job:           282s       4.70m     0.08h    0.00d
 
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastp/danRer1/run/out
     hgLoadBlastTab mm5 drBlastTab -maxPer=1 *.tab
 
 
 # YEAST BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
     # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on 
     # RefSeq.  First make protein database and copy it to iscratch/i
     # if it doesn't exist already:
     mkdir /cluster/data/sacCer1/bed/blastp
     cd /cluster/data/sacCer1/bed/blastp
     wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
     zcat orf_trans.fasta.gz > sgdPep.faa
     formatdb -i sgdPep.faa -t sgdPep -n sgdPep
     #ABOVE WAS ALREADY DONE BY JIM
 
     ssh kkr1u00
     # Note: sacCer1 is a name conflict with SARS coronavirus... oh well, 
     # fortunately we won't be looking for homologs there.  :)
     if (-e /iscratch/i/sacCer1/blastp) then
       rm -r /iscratch/i/sacCer1/blastp
     endif
     mkdir -p /iscratch/i/sacCer1/blastp
     cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp
     iSync
 
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/mm5/bed/blastp/sacCer1/run/out
     cd /cluster/data/mm5/bed/blastp/sacCer1/run
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /iscratch/i/blast/blastall -p blastp -d /iscratch/i/sacCer1/blastp/sgdPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1
 end
     chmod a+x blastSome
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
     # Create parasol batch
     ls -1S /cluster/store6/mm5/bed/geneSorter/blastp/split >split.lst
     #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
     gensub2 split.lst single gsub spec
     para create spec
     para try, check, push, check, ...
 
 # Completed: 7739 of 7739 jobs
 # CPU time in finished jobs:      16348s     272.46m     4.54h    0.19d  0.001 y
 # IO & Wait Time:                 23063s     384.39m     6.41h    0.27d  0.001 y
 # Average job time:                   5s       0.08m     0.00h    0.00d
 # Longest job:                       14s       0.23m     0.00h    0.00d
 # Submission to last job:           203s       3.38m     0.06h    0.00d
 
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastp/sacCer1/run/out
     hgLoadBlastTab mm5 scBlastTab -maxPer=1 *.tab
 
 # DM1 BLASTP FOR GENE SORTER (DONE 7/20/04 Fan)
     # Make Drosophila melanogaster ortholog column using blastp on FlyBase.
     # First make protein database and copy it to iscratch/i
     # if it doesn't exist already:
     # This is already done, see makeMm3.doc for procedure
     # the directory: /cluster/bluearc/dm1/blastp should have data
 
     ssh kkr1u00
     if (-e /iscratch/i/dm1/blastp) then
       rm -r /iscratch/i/dm1/blastp
     endif
     mkdir -p /iscratch/i/dm1/blastp
     cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp
     iSync
     # THE ABOVE IS ALREADY DONE BY ANGIE
 
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/mm5/bed/blastp/dm1/run/out
     cd /cluster/data/mm5/bed/blastp/dm1/run
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /iscratch/i/blast/blastall -p blastp -d /iscratch/i/dm1/blastp/bdgp -i \$1 -o \$2 -e 0.001 -m 8 -b 1
 end
     chmod a+x blastSome
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
     # Create parasol batch
     ls -1S /cluster/store6/mm5/bed/geneSorter/blastp/split >split.lst
     #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
     gensub2 split.lst single gsub spec
     para create spec
     para try, check, push, check, ...
 # Completed: 7739 of 7739 jobs
 # CPU time in finished jobs:      64033s    1067.22m    17.79h    0.74d  0.002 y
 # IO & Wait Time:                 20868s     347.79m     5.80h    0.24d  0.001 y
 # Average job time:                  11s       0.18m     0.00h    0.00d
 # Longest job:                       45s       0.75m     0.01h    0.00d
 # Submission to last job:           351s       5.85m     0.10h    0.00d
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastp/dm1/run/out
     hgLoadBlastTab mm5 dmBlastTab -maxPer=1 *.tab
 
 # Create table that maps between known genes and LocusLink (DONE 7/20/04 Fan)
 hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm5 \
         > refToLl.txt
 hgMapToGene mm5 refGene knownGene knownToLocusLink -lookup=refToLl.txt
 #       row count is 30303
 
 # Create table that maps between known genes and Pfam domains
 hgMapViaSwissProt mm5 knownGene name proteinID Pfam knownToPfam
 # row count is 29069
 
 # Create table to map between known genes and GNF Atlas2
 # expression data.
     hgMapToGene mm5 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
 
 # Create table that maps between known genes and genePix database (DONE 3/15/05 JK)
     knownToGenePix mm5
 
 # ENABLE GENE SORTER FOR mm5 IN HGCENTRALTEST (DONE 7/20/04 Fan)
     echo "update dbDb set hgNearOk = 1 where name = 'mm5';" \
       | hgsql -h genome-testdb hgcentraltest
 
 
 # RAT BLASTP FOR GENE SORTER (DONE 4/20/05 Fan)
     # Make RAT ortholog column using blastp on RAT known genes.
     # First make RAT protein database and copy it to iscratch/i
     # if it doesn't exist already:
     mkdir /cluster/data/rn3/bed/blastp
     cd /cluster/data/rn3/bed/blastp
     pepPredToFa rn3 knownGenePep known.faa
     formatdb -i known.faa -t known -n known
 
     ssh kkr1u00
     if (-e /iscratch/i/rn3/blastp) then
       rm -r /iscratch/i/rn3/blastp
     endif
     mkdir -p /iscratch/i/rn3/blastp
     cp /cluster/data/rn3/bed/blastp/known.p?? /iscratch/i/rn3/blastp
     iSync
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/mm5/bed/blastp/rn3/run/out
     cd /cluster/data/mm5/bed/blastp/rn3/run
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /iscratch/i/blast/blastall -p blastp -d /iscratch/i/rn3/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
 end
     chmod a+x blastSome
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
     # Create parasol batch
     ls -1S /cluster/data/mm5/bed/geneSorter/blastp/split >split.lst
     #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
     gensub2 split.lst single gsub spec
     para create spec
     para try, check, push, check, ...
 
 # Completed: 7739 of 7739 jobs
 # CPU time in finished jobs:      24369s     406.14m     6.77h    0.28d  0.001 y
 # IO & Wait Time:                 21867s     364.46m     6.07h    0.25d  0.001 y
 # Average job time:                   6s       0.10m     0.00h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:              25s       0.42m     0.01h    0.00d
 # Submission to last job:           276s       4.60m     0.08h    0.00d
 
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastp/rn3/run/out
     hgLoadBlastTab mm5 rnBlastTab -maxPer=1 *.tab
 
 # END OF GENE SORTER STUFF
 #############################################################################
 
 #  BLASTZ RAT RN3 (DONE - 2004-07-15 - Fan)
 #  NOTE: THIS IS RE-DONE.  SEE THE SAME SECTION OF 2004-08-30. Fan.
     ssh kk
     mkdir -p /cluster/data/mm5/bed/blastz.rn3.2004-07-14
     cd /cluster/data/mm5/bed
     ln -s  blastz.rn3.2004-07-14 blastz.rn3
     cd blastz.rn3
 
     cat << '_EOF_' > DEF
 # rat vs. mouse
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
 
 ALIGN=blastz-run
 BLASTZ=blastz
 BLASTZ_H=2000
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET
 # Mouse
 SEQ1_DIR=/scratch/mus/mm5/softNib
 # not used
 SEQ1_RMSK=
 # not used
 SEQ1_FLAG=
 SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInRat
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY
 # Rat
 SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
 # not currently used
 SEQ2_RMSK=
 # not currently used
 SEQ2_FLAG=
 SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm5/bed/blastz.rn3
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 '_EOF_'
     # << this line makes emacs coloring happy
 
     # prepare first cluster run
     ssh kk
     cd /cluster/data/mm5/bed/blastz.rn3
     bash
     source ./DEF
     # script copied over from /cluster/data/hg17/jkStuff/BlastZ_run0.sh
     #	it is a generic script and works for any assembly
 
     cp /cluster/data/hg17/jkStuff/BlastZ_run0.sh \
        /cluster/data/mm5/jkStuff/BlastZ_run0.sh
     /cluster/data/mm5/jkStuff/BlastZ_run0.sh
     cd run.0
     para try, check, push, check, ....
 
 # Completed: 41943 of 41943 jobs
 # CPU time in finished jobs:   16854319s  280905.31m  4681.76h  195.07d  0.534 y
 # IO & Wait Time:                448464s    7474.41m   124.57h    5.19d  0.014 y
 # Average job time:                 413s       6.88m     0.11h    0.00d
 # Longest job:                     9358s     155.97m     2.60h    0.11d
 # Submission to last job:         73416s    1223.60m    20.39h    0.85d
 
     #	Second cluster run to convert the .out's to .lav's
     #	You do NOT want to run this on the big cluster.  It brings
     #	the file server to its knees.  Run this on the small cluster.
     ssh kki
     cd /cluster/data/mm5/bed/blastz.rn3
     # script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh
     #	fixup machine check, should be kki, not kk
     cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh \
        /cluster/data/mm5/jkStuff/BlastZ_run1.sh
     vi /cluster/data/mm5/jkStuff/BlastZ_run1.sh
 
     /cluster/data/mm5/jkStuff/BlastZ_run1.sh
     cd run.1
     para try, check, push, etc ...
 # Completed: 341 of 341 jobs
 # CPU time in finished jobs:       7859s     130.98m     2.18h    0.09d  0.000 y
 # IO & Wait Time:                104771s    1746.19m    29.10h    1.21d  0.003 y
 # Average job time:                 330s       5.50m     0.09h    0.00d
 # Longest job:                     1625s      27.08m     0.45h    0.02d
 # Submission to last job:          8535s     142.25m     2.37h    0.10d
 
     #	Third cluster run to convert lav's to axt's
     ssh kki
     cd /cluster/data/mm5/bed/blastz.rn3
     bash
     source ./DEF
     #	The copy of this in mm4 was broken, use the hg17 one instead
     cp /cluster/data/hg17/jkStuff/BlastZ_run2.sh \
        /cluster/data/mm5/jkStuff/BlastZ_run2.sh
     # vi /cluster/data/mm5/jkStuff/BlastZ_run2.sh
     /cluster/data/mm5/jkStuff/BlastZ_run2.sh
     cd run.2
     #edited gsub to change /scratch/mus/mm5 to /cluster/bluearc/scratch/mus/mm5
     # and recreated jobList by:
     gensub2 chrom.list single gsub jobList
     para create jobList
     para try, check, push, etc ...
 
 # Completed: 42 of 43 jobs
 # Crashed: 1 jobs
 # CPU time in finished jobs:       2050s      34.17m     0.57h    0.02d  0.000 y
 # IO & Wait Time:                143135s    2385.58m    39.76h    1.66d  0.005 y
 # Average job time:                3457s      57.61m     0.96h    0.04d
 # Longest job:                    14460s     241.00m     4.02h    0.17d
 # Submission to last job:         14849s     247.48m     4.12h    0.17d
 
 # Note: previous numbers were:
 
 # Completed: 46 of 46 jobs
 # CPU time in finished jobs:        426s       7.09m     0.12h    0.00d  0.000 y
 # IO & Wait Time:                  7283s     121.39m     2.02h    0.08d  0.000 y
 # Average job time:                 168s       2.79m     0.05h    0.00d
 # Longest job:                      642s      10.70m     0.18h    0.01d
 # Submission to last job:           642s      10.70m     0.18h    0.01d
 # probably due to data on bluearc instead of on kki nodes.
 
 # One job failed consistently because of an out-of-memory error, so
 # went to kkr4u00 to run it there instead.
 # Per Angie's advice, created /cluster/bin/scripts/blastz-chromlav2axtLargeMem
 # from /cluster/bin/scripts/blastz-chromlav2axt, changed /cluster/bin/i386
 # to /cluster/bin/x86_64, and then ran:
 
 /cluster/bin/scripts/blastz-chromlav2axtLargeMem \
   /cluster/data/mm5/bed/blastz.rn3/lav/chr2 \
   /cluster/data/mm5/bed/blastz.rn3/axtChrom/chr2.axt \
   /cluster/bluearc/scratch/mus/mm5/softNib /iscratch/i/rn3/bothMaskedNibs
 
 # It worked!
 
     # translate sorted axt files into psl
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3
     mkdir pslChrom
     set tbl = "blastzRn3"
     foreach f (axtChrom/chr*.axt)
       set c=$f:t:r
       echo "Processing chr $c"
       /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
     end
     #	That takes about 2 hours
 
     # Load database tables
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.rn3/pslChrom
     bash
 for I in *.psl
 do
 /cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I}
 echo "done: ${I}"
 done
     # Check results
     #	featureBits hg16 blastzRn3
     # 1013603401 bases of 2865248791 (35.376%) in intersection
 
     #	featureBits mm5 blastzRn3 ran out of memory.  
     # So check a few specific chromosomes
 
     # featureBits mm5 blastzRn3 -chrom=chr17
     # 61029084 bases of 86658738 (70.425%) in intersection
     # featureBits mm4 blastzRn3 -chrom=chr17
     # 62824556 bases of 89616841 (70.104%) in intersection
 
     # featureBits mm5 blastzRn3 -chrom=chr18
     # 61442155 bases of 86685738 (70.879%) in intersection
     # featureBits mm4 blastzRn3 -chrom=chr18
     # 57158006 bases of 81388777 (70.228%) in intersection
     
 # CHAIN RN3 BLASTZ (DONE - 2004-07-22 - Fan)
 #  NOTE: THIS IS RE-DONE.  SEE THE SAME SECTION OF 2004-08-30. Fan.
 
 # The axtChain is best run on the small kluster, or the kk9 kluster
     ssh kki
     mkdir -p /cluster/data/mm5/bed/blastz.rn3/axtChain/run1
     cd /cluster/data/mm5/bed/blastz.rn3/axtChain/run1
     mkdir out chain
 
     ls -1S /cluster/data/mm5/bed/blastz.rn3/axtChrom/*.axt > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
 
     cat << '_EOF_' > doChain
 #!/bin/csh
     axtChain $1 \
 	/iscratch/i/mus/mm5/softNib \
 	/iscratch/i/rn3/bothMaskedNibs $2 > $3
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x doChain
 
     # 46 jobs
     gensub2 input.lst single gsub jobList
     para create jobList
     para try
     para push # ... etc ...
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:      18318s     305.30m     5.09h    0.21d  0.001 y
 # IO & Wait Time:                 41906s     698.44m    11.64h    0.49d  0.001 y
 # Average job time:                1401s      23.34m     0.39h    0.02d
 # Longest job:                     5598s      93.30m     1.55h    0.06d
 # Submission to last job:          5635s      93.92m     1.57h    0.07d
     # now on the file server, sort chains
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3/axtChain
     time chainMergeSort run1/chain/*.chain > all.chain &
     # real    26m14.694s
     # user    16m16.190s
     # sys     2m19.520s
 
     time chainSplit chain all.chain &
     # real    26m29.801s
     # user    15m40.780s
     # sys     2m40.610s
 
     # optionally: rm run1/chain/*.chain
 
     # Load chains into database
     # next machine
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.rn3/axtChain/chain
     foreach i (*.chain)
         set c = $i:r
         hgLoadChain mm5 ${c}_chainRn3 $i
         echo done $c
     end
 
     # featureBits mm4 chainRn3Link -chrom=chr16
     # 67474802 bases of 95076222 (70.969%) in intersection
     # featureBits mm5 chainRn3Link -chrom=chr16
     # 66703715 bases of 92679592 (71.972%) in intersection
 
     # featureBits mm4 chainRn3Link -chrom=chr17
     # 61932430 bases of 89616841 (69.108%) in intersection
     # featureBits mm5 chainRn3Link -chrom=chr17
     # 60676019 bases of 86658738 (70.017%) in intersection
 
 # NET RN3 (DONE - 2004-07-23 - Fan)
 #  NOTE: THIS IS RE-DONE.  SEE THE SAME SECTION OF 2004-08-31. Fan.
 
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3/axtChain
     mkdir preNet
     cd chain
     foreach i (*.chain)
       echo preNetting $i
       /cluster/bin/i386/chainPreNet $i /cluster/data/mm5/chrom.sizes \
                         /cluster/data/rn3/chrom.sizes ../preNet/$i
     end
 
     cd ..
     mkdir n1
     cd preNet
     foreach i (*.chain)
       set n = $i:r.net
       echo primary netting $i
       /cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/mm5/chrom.sizes \
                             /cluster/data/rn3/chrom.sizes ../n1/$n /dev/null
     end
 
     cd ..
     cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
     # memory usage 1850904576, utime 9294 s/100, stime 2079
 
     # The netClass operations requires an "ancientRepeat" table to exist
     # in either mm5 or rn3.  So, create the table:
 
     ssh hgwdev
     mkdir -p /cluster/data/mm5/bed/ancientRepeat
     cd /cluster/data/mm5/bed/ancientRepeat
     # mysqldump needs write permission to this directory
     # and you need to use your read/write enabled user with password
     chmod 777 .
     hgsqldump --all --tab=. mm4 ancientRepeat
     chmod 775 .
     hgsql mm5 < ancientRepeat.sql
     mysqlimport -u<r/w user> -p<r/w pass> mm5 ancientRepeat.txt
     # This is a hand curated table obtained from Arian.
 
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.rn3/axtChain
     time netClass hNoClass.net mm5 rn3 rat.net \
 	-tNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInRat \
 	-qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
     # 508.060u 89.340s 12:10.36 81.7% 0+0k 0+0io 201pf+0w
     
     # If things look good do
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3/axtChain
     rm -r n1 hNoClass.net
     # Make a 'syntenic' subset of these with
     time netFilter -syn rat.net > ratSyn.net
 
     # real    5m5.494s
     # user    3m52.710s
     # sys     0m32.670s
 
     # Load the nets into database
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.rn3/axtChain
     netFilter -minGap=10 rat.net |  hgLoadNet mm5 netRn3 stdin
     netFilter -minGap=10 ratSyn.net | hgLoadNet mm5 syntenyNetRn3 stdin
 
     # real    8m50.781s
     # user    4m59.660s
     # sys     0m52.840s
     
     # check results
     # featureBits mm4 netRn3
     # 96806381 bases of 95076222 (101.820%) in intersection
     # featureBits mm5 netRn3
     # 2638255333 bases of 2615483787 (100.871%) in intersection
 
     # featureBits mm4 syntenyNetRn3
     # 96760405 bases of 95076222 (101.771%) in intersection
     # featureBits mm5 syntenyNetRn3
     # 2600849289 bases of 2615483787 (99.440%) in intersection
 
     # Add entries for net and chain to mouse/mm5 trackDb
 
     # make net
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3/axtChain
     mkdir ratNet
     time netSplit rat.net ratNet
     # real    5m28.037s
     # user    3m58.150s
     # sys     0m37.870s
 
     # extract axts from net 
     mkdir ../axtNet 
     foreach n (ratNet/chr*.net)
 	set c=$n:t:r
 	echo "netToAxt: $c.net -> $c.axt"
 	rm -f ../axtNet/$c.axt
 	netToAxt ratNet/$c.net chain/$c.chain \
 		/cluster/data/mm5/nib \
 		/cluster/data/rn3/nib ../axtNet/$c.axt
 	echo "Complete: $c.net -> axtNet/$c.axt"
     end
     # sort axt's and convert to maf format
     mkdir ../mafNet
 cat << 'EOF' > makeMaf.csh
     foreach f (../axtNet/chr*.axt)
         set c=$f:t:r
         echo $c.axt
         mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt
         axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt
         rm ../axtNet/$c.unsorted.axt
         axtToMaf ../axtNet/$c.axt \
             /cluster/data/mm5/chrom.sizes /cluster/data/rn3/chrom.sizes \
                 ../mafNet/$c.maf -tPrefix=mm5. -qPrefix=rn3.
     end
 'EOF'
     #csh makeMaf.csh >&! makeMaf.log &
     csh makeMaf.csh > makeMaf.log &
     tail -100f makeMaf.log
     # THE ABOVE DID NOT WORK.  TRIED THE FOLLOWING:
     foreach f (../axtNet/chr*.axt)
         set c=$f:t:r
         echo $c.axt
         mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt
         axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt
         rm ../axtNet/$c.unsorted.axt
         axtToMaf ../axtNet/$c.axt \
             /cluster/data/mm5/chrom.sizes /cluster/data/rn3/chrom.sizes \
                 ../mafNet/$c.maf -tPrefix=mm5. -qPrefix=rn3.
     end
 
     ssh hgwdev
     mkdir -p /cluster/data/mm5/bed/blastz.rn3/axtBest
     cd /cluster/data/mm5/bed/blastz.rn3/axtBest
     ln -s ../axtNet/chr*.axt .
 
     # copy net axt's to download area
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.rn3/axtNet
     mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet
     cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet
     cd /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet
     gzip *.axt
     # add README.txt file to dir (use previous assembly's copy as template)
 
     #  Convert those axt files to psl
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3
     mkdir pslBest
     foreach a (axtBest/chr*.axt)
 	set c=$a:t:r
 	echo "processing $c.axt -> ${c}_blastzBestRn3.psl"
     /cluster/bin/i386/axtToPsl axtBest/${c}.axt \
 	S1.len S2.len pslBest/${c}_blastzBestRn3.psl
 	echo "Done: ${c}_blastzBestRn3.psl"
     end
 
     # Load tables
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.rn3/pslBest
     bash
     for I in chr*BestRn3.psl
     do
 	/cluster/bin/i386/hgLoadPsl -noTNameIx mm5 ${I}
 	echo "done ${I}"
     done
 
      # check results
     # featureBits mm5 blastzBestRn3
     # 1778653886 bases of 2615483787 (68.005%) in intersection
     # featureBits mm4 blastzBestRn3
     # 1780774716 bases of 2627444668 (67.776%) in intersection
 
     # Make /gbdb links and add them to the axtInfo table:
      mkdir -p /gbdb/mm5/axtBest/Rn3
      cd /gbdb/mm5/axtBest/Rn3
      ln -s /cluster/data/mm5/bed/blastz.rn3/axtNet/chr*.axt .
      cd /cluster/data/mm5/bed/blastz.rn3/axtNet
      rm -f axtInfoInserts.sql
      foreach f (/gbdb/mm5/axtBest/Rn3/chr*.axt)
        set chr=$f:t:r
        echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
                 VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \
          >> axtInfoInserts.sql
      end
     hgsql mm5 < ~/kent/src/hg/lib/axtInfo.sql
     #	table axtInfo may already exist, ignore create error.
     hgsql mm5 < axtInfoInserts.sql
 
 # BLASTZ RN3 CLEAN UP (DONE - 2004-07-26 - Fan)
 #  NOTE: THIS IS RE-DONE.  SEE THE SAME SECTION OF 2004-08-31. Fan.
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3
     nice rm -rf raw &
     nice rm axtChain/run1/chain/* &
     nice gzip {axt,psl}Chrom/* lav/*/* axtChain/{all.chain,*.net} &
 
 # MAKE VSRN3 DOWNLOADABLES (DONE 9/14/04 Fan)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3/axtChain
     ln all.chain rat.chain
     foreach f (rat.chain rat.net)
       gzip -c $f > $f.gz
     end
     rm rat.chain
 
     # Make chain-format of raw alignments
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3
     mkdir blastzECF
     foreach f (axtChrom/chr*.axt)
       set chr = $f:t:r
       axtToChain $f S1.len S2.len stdout \
       | gzip -c - > blastzECF/$chr.ecf.gz
     end
 
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsRn3
     cd /usr/local/apache/htdocs/goldenPath/mm5/vsRn3
     mv /cluster/data/mm5/bed/blastz.rn3/axtChain/rat*.gz .
     cp -p /cluster/data/mm5/bed/blastz.rn3/axtChain/all.chain.gz \
           /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/rat.chain.gz
     md5sum *.gz > md5sum.txt
     # Copy over & edit README.txt w/pointers to chain, net formats.
     # Not for pushing -- handle separately.
     mv /cluster/data/mm5/bed/blastz.rn3/blastzECF .
     cd blastzECF
     md5sum *.gz > md5sum.txt
 
 # BLASTZ ZEBRAFISH (DANRER1) (DONE, 2004-07-29, hartera)
 
     ssh kkr1u00
     # blastz requires lineage-specific repeats
     # Treat all repeats as lineage-specific.
 
     mkdir -p /iscratch/i/mm5/linSpecRep.notInZebrafish
     foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/chr*.fa.out)
       cp -p $f /iscratch/i/mm5/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
     end
 
     mkdir -p /iscratch/i/danRer1/linSpecRep.notInMouse
     foreach f (/iscratch/i/danRer1/rmsk/chr*.fa.out)
       cp -p $f /iscratch/i/danRer1/linSpecRep.notInMouse/$f:t:r:r.out.spec
     end
     iSync
 
     ssh kk
     mkdir -p /cluster/data/mm5/bed/blastz.danRer1.2004-07-27
     ln -s /cluster/data/mm5/bed/blastz.danRer1.2004-07-27 \
           /cluster/data/mm5/bed/blastz.danRer1
     cd /cluster/data/mm5/bed/blastz.danRer1
     # use same parameters as for danRer1-hg17
     cat << '_EOF_' > DEF
 # mouse (mm5) vs zebrafish (danRer1)
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
 
 ALIGN=blastz-run
 BLASTZ=blastz
 
 # Reuse parameters from hg16-fr1 and danRer1-hg17.
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse (mm5)
 SEQ1_DIR=/cluster/bluearc/scratch/mus/mm5/softNib
 SEQ1_RMSK=
 SEQ1_FLAG=
 SEQ1_SMSK=/iscratch/i/mm5/linSpecRep.notInZebrafish
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Zebrafish (danRer1)
 SEQ2_DIR=/iscratch/i/danRer1/nib
 SEQ2_RMSK=
 SEQ2_FLAG=
 SEQ2_SMSK=/iscratch/i/danRer1/linSpecRep.notInMouse
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm5/bed/blastz.danRer1
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 
 #DEBUG=1
 '_EOF_'
     # << this line keeps emacs coloring happy
     # Save the DEF file in the current standard place
     chmod +x DEF
     cp DEF ~angie/hummus/DEF.mm5-danRer1.2004-07-27
     # setup cluster run
     # copy shell scripts for blastz runs if not there already
     cp -p /cluster/data/danRer1/jkStuff/BlastZ* /cluster/data/mm5/jkStuff/
     # edit BlastZ_run0.sh
     # replace line 22: /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/
     # this is the directory for the latest version of blastz-run
 
         # source the DEF file
     bash
     . ./DEF
     /cluster/data/mm5/jkStuff/BlastZ_run0.sh
     cd run.0
     # check batch looks ok then
     para try, check, push, check, ....
 # para time
 # Completed: 57970 of 57970 jobs
 # CPU time in finished jobs:   18228826s  303813.77m  5063.56h  210.98d  0.578 y
 # IO & Wait Time:               1019215s   16986.92m   283.12h   11.80d  0.032 y
 # Average job time:                 332s       5.53m     0.09h    0.00d
 # Longest job:                     2211s      36.85m     0.61h    0.03d
 # Submission to last job:         45422s     757.03m    12.62h    0.53d
     # Took about 12 hours to run and output is 1.7G
     # second cluster run to convert the .out's to .lav's
     cd /cluster/data/mm5/bed/blastz.danRer1
     bash # if a csh/tcsh user
     . ./DEF
     /cluster/data/mm5/jkStuff/BlastZ_run1.sh
     cd run.1
     para try, check, push, etc ...
 # para time
 # Checking finished jobs
 # Completed: 341 of 341 jobs
 # CPU time in finished jobs:       4536s      75.60m     1.26h    0.05d  0.000 y
 # IO & Wait Time:                 65931s    1098.85m    18.31h    0.76d  0.002 y
 # Average job time:                 207s       3.44m     0.06h    0.00d
 # Longest job:                      636s      10.60m     0.18h    0.01d
 # Submission to last job:          1282s      21.37m     0.36h    0.01d
 
     #   Third cluster run to convert lav's to axt's
     ssh kki
     cd /cluster/data/mm5/bed/blastz.danRer1
     mkdir axtChrom
     # a new run directory
     mkdir run.2
     cd run.2
 cat << '_EOF_' > do.csh
 #!/bin/csh
 cd $1
 cat `ls -1 *.lav | sort -g` \
 | lavToAxt stdin /cluster/bluearc/scratch/mus/mm5/softNib \
 /iscratch/i/danRer1/nib stdout \
 | axtSort stdin $2
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x do.csh
     cat << '_EOF_' > gsub
 #LOOP
 ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/mm5/bed/blastz.danRer1/axtChrom/$(root1).axt}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     \ls -1Sd ../lav/chr* > chrom.list
     gensub2 chrom.list single gsub jobList
     wc -l jobList
     head jobList
     para create jobList
     para try, check, push, check,...
 # para time
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:        246s       4.10m     0.07h    0.00d  0.000 y
 # IO & Wait Time:                  4985s      83.08m     1.38h    0.06d  0.000 y
 # Average job time:                 122s       2.03m     0.03h    0.00d
 # Longest job:                      446s       7.43m     0.12h    0.01d
 # Submission to last job:           653s      10.88m     0.18h    0.01d
 
     # translate sorted axt files into psl
     ssh kolossus
     cd /cluster/data/mm5/bed/blastz.danRer1
     mkdir -p pslChrom
     set tbl = "blastzDanRer1"
     foreach f (axtChrom/chr*.axt)
       set c=$f:t:r
       echo "Processing chr $c"
       /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
     end
     # Load database tables
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.danRer1/pslChrom
 
     foreach f (./*.psl)
       /cluster/bin/i386/hgLoadPsl mm5 $f
     end
 # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1 -enrichment
 #refGene:cds 0.763%,blastzDanRer1 2.918%,both 0.512%,cover 67.12%,enrich 23.00x
 # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L4000 -enrichment
 # refGene:cds 0.763%, blastzDanRer1L4000 17.878%, both 0.581%, cover 76.18%, 
 # enrich 4.26x
 # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L5000 -enrichment
 # refGene:cds 0.763%,blastzDanRer1L5000 6.013%,both 0.540%,cover 70.81%,
 # enrich 11.78x
 # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L6500 -enrichment
 # refGene:cds 0.763%, blastzDanRer1L6500 2.386%, both 0.495%, cover 64.91%, 
 # enrich 27.20x
 # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1L7000 -enrichment
 # refGene:cds 0.763%, blastzDanRer1L7000 2.062%, both 0.480%, cover 62.87%, 
 # enrich 30.50x
 # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1HumanParams -enrichment
 # refGene:cds 0.763%,blastzDanRer1HumanParams 1.661%,both 0.502%, cover 65.82%,
 # enrich 39.64x
 
 # row counts:   172167 blastzDanRer1, 
 #               2288714 blastzDanRer1HumanParams,
 #               3373525 blastzDanRer1L4000
 #               700927 blastzDanRer1L5000
 #               13719318 blastzDanRer1L3000
 #               103190 blastzDanRer1L6500
 #               76758 blastzDanRer1L7000 
 # Do test runs - repeat above using L=4000 and then try the mm5-hg17 parameters
 # also L=2000, L=3000 and L=5000. Use only mm5 chr1 for tests.
 # L=2000 and L=3000 lavToAxt crashed so re-do on kolossus. L2000 crashed again
 # probably ran out of memory. 
 # The original blastzDanRer1 with L=6000 looks best: good coverage and
 # enrichment without too many alignments in the database table.
 
 # RESCORE DANRER1 BLASTZ ALIGNMENTS (DONE, 2004-08-02, hartera)
 
     # Low scores can occur with repeats abridged and using the
     # HoxD55.q matrix. PSU's restore_rpts program rescored alignments
     # with the default matrix instead of the BLASTZ_Q matrix.
     # Rescore them here so the chainer sees the higher scores:
     ssh kolossus
     cd /cluster/data/mm5/bed/blastz.danRer1
     mkdir axtChrom.rescore
     foreach f (axtChrom/chr*.axt)
         axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
         $f axtChrom.rescore/$f:t
     end
     mv axtChrom axtChrom.orig
     mv axtChrom.rescore axtChrom
 
 #   psl files and blastz tables will be the same regardless of score so
 #   no need to reload
 
 # CHAIN DANRER1 BLASTZ (DONE, 2004-08-03, hartera)
 # FILTERED WITH A HIGHER MINSCORE THRESHOLD (DONE, 2004-08-04, hartera)
 # RELOADED TABLES (DONE, 2004-08-18, hartera)
 # removed all chainDanRer1 and chainDanRer1Link tables, some extra tables had 
 # been accidentally loaded with this name from a different genome so there
 # were duplicate chain ids causing joinerCheck to complain.
 
     # Re do chains with rescored blastz danRer1
     # Run axtChain on little cluster
     ssh kki
     cd /cluster/data/mm5/bed/blastz.danRer1
     mkdir -p axtChain/run1
     cd axtChain/run1
     mkdir out chain
     ls -1S /cluster/data/mm5/bed/blastz.danRer1/axtChrom/*.axt \
         > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     # Make our own linear gap file with reduced gap penalties, 
     # in hopes of getting longer chains:
     cat << '_EOF_' > ../../chickenHumanTuned.gap
 tablesize^V     11
 smallSize^V     111
 position^V      1^V     2^V     3^V     11^V    111^V   2111^V  12111^V 32111^V 72111^V 152111^V        252111
 qGap^V  325^V   360^V   400^V   450^V   600^V   1100^V  3600^V  7600^V  15600^V 31600^V 56600
 tGap^V  325^V   360^V   400^V   450^V   600^V   1100^V  3600^V  7600^V  15600^V 31600^V 56600
 bothGap^V       625^V   660^V   700^V   750^V   900^V   1400^V  4000^V  8000^V  16000^V 32000^V 57000
 '_EOF_'
     # << this line makes emacs coloring happy
 cat << '_EOF_' > doChain
 #!/bin/csh
 axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
                       -linearGap=../../chickenHumanTuned.gap \
                       -minScore=5000 $1 \
     /cluster/bluearc/scratch/mus/mm5/softNib \
     /iscratch/i/danRer1/nib $2 >& $3
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x doChain
     gensub2 input.lst single gsub jobList
     para create jobList
     para try, check, push, check...
 # para time
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:       2260s      37.67m     0.63h    0.03d  0.000 y
 # IO & Wait Time:                   863s      14.38m     0.24h    0.01d  0.000 y
 # Average job time:                  73s       1.21m     0.02h    0.00d
 # Longest job:                      342s       5.70m     0.10h    0.00d
 # Submission to last job:         36951s     615.85m    10.26h    0.43d
 
    # now on the cluster server, sort chains
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.danRer1/axtChain
     chainMergeSort run1/chain/*.chain > all.chain
     # filter again to use minScore of 7500 (see featureBits below) (2004-08-04)
     # all.chain already reflects the axtChain -minScore=5000 run, so it is kept
     # as all.chain.filt5k and that file is the input to the stricter filter.
     mv all.chain all.chain.filt5k
     chainFilter -minScore=7500 all.chain.filt5k > all.chain
     # remove old chains
     rm -r chain
     chainSplit chain all.chain
     gzip all.chain.filt5k
 
 # take a look at score distr's,try also with smaller bin size.
     foreach f (chain/*.chain)
       grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r >> hist.out
       textHistogram -binSize=10000 /tmp/score.$f:t:r >> hist.out
       echo ""
     end
     # also hist5000.out has bin size 5000. looks good so load into database
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.danRer1/axtChain/chain
     foreach i (*.chain)
         set c = $i:r
         hgLoadChain mm5 ${c}_chainDanRer1 $i
         echo done $c
     end
 # featureBits still shows good coverage and enrichment
 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Link -enrichment
 # refGene:cds 0.763%, chainDanRer1Link 2.246%, both 0.508%, cover 66.61%, 
 # enrich 29.65x
 # Human Parameters Blastz Chain with minScore = 5,000 filter:
 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1HPLink -enrichment
 # refGene:cds 0.763%, chainDanRer1HPLink 1.208%, both 0.484%, cover 63.43%, 
 # enrich 52.49x
 # L=5000 Blastz Chain with minScore = 5,000 filter:
 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1L5kLink -enrichment
 # refGene:cds 0.763%, chainDanRer1L5kLink 4.137%, both 0.534%, cover 69.96%, 
 # enrich 16.91x
 # L=5000 Blastz Chain with minScore =10,000 filter:
 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1L5k10kLink -enrichment
 # refGene:cds 0.763%, chainDanRer1L5k10kLink 1.038%, both 0.448%, cover 58.69%,
 # enrich 56.54x
 # filter too stringent, coverage has dropped a lot
 # with less filtering of blastzDanRer1 where minScore =3000
 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Filt3kLink -enrichment
 # refGene:cds 0.763%, chainDanRer1Filt3kLink 2.487%, both 0.509%, cover 66.78%,
 # enrich 26.86x
 # with more filtering, minScore = 6000
 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Filt6kLink -enrichment
 # refGene:cds 0.763%, chainDanRer1Filt6kLink 2.172%, both 0.508%, cover 66.54%, # enrich 30.64x
 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Filt7500Link -enrichment
 # refGene:cds 0.763%, chainDanRer1Filt75kLink 2.022%, both 0.504%, cover 66.10%,# enrich 32.70x
 
 # rows in database table:
 # chr1_blastzDanRer1Link: 515119
 # chr1_chainDanRer1L5kLink: 1241480
 # chr1_chainDanRer1L5k10kLink: 74963
 # chr1_chainDanRer1HPLink: 309740
 # chr1_chainDanRer1Filt3k: 594057
 # chr1_chainDanRer1Filt6kLink: 479368 
 # chr1_chainDanRer1Filt7500Link: 378954
 # Using the original parameters is a good compromise between high coverage
 # and high enrichment but a filter of 7500 on the score produces only a tiny
 # reduction in coverage with higher enrichment as there are a lot less 
 # alignments of low score of the same regions or other low scoring alignments.
 
 # NET DANRER1 BLASTZ (DONE, 2004-08-04, hartera)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.danRer1/axtChain
     mkdir preNet
     cd chain
     foreach i (*.chain)
        echo preNetting $i
        /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
                                      ../preNet/$i
     end
     cd ..
     mkdir n1
     cd preNet
     foreach i (*.chain)
       set n = $i:r.net
       echo primary netting $i
       /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
                                  ../n1/$n /dev/null
     end
     cd ..
     cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
     # memory usage 103493632, utime 668 s/100, stime 127
 
 # Add classification info using db tables:
     cd /cluster/data/mm5/bed/blastz.danRer1/axtChain
     # netClass looks for ancient repeats in one of the databases
     # hg17 has this table - hand-curated by Arian but this is for
     # human-rodent comparisons so do not use here, use -noAr option
     mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish
     mkdir -p /cluster/bluearc/danRer1/linSpecRep.notInMouse
     cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \
        /cluster/bluearc/mm5/linSpecRep.notInZebrafish
     cp /iscratch/i/danRer1/linSpecRep.notInMouse/* \
        /cluster/bluearc/danRer1/linSpecRep.notInMouse
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.danRer1/axtChain
     time netClass noClass.net mm5 danRer1 danRer1.net \
          -tNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish \
          -qNewR=/cluster/bluearc/danRer1/linSpecRep.notInMouse -noAr
     # 77.700u 46.610s 3:05.75 66.9%   0+0k 0+0io 215pf+0w
     netFilter -minGap=10 danRer1.net |  hgLoadNet mm5 netDanRer1 stdin
 
 # EXTRACT AXTs AND MAFs FROM ZEBRAFISH (danRer1) NET
 # (DONE, 2004-08-04, hartera)
     ssh eieio
     # create axts
     cd /cluster/data/mm5/bed/blastz.danRer1/axtChain
     netSplit danRer1.net danRer1Net
     mkdir -p ../axtNet
 cat > axtNet.csh << 'EOF'
     foreach f (danRer1Net/chr*.net)
         set c = $f:t:r
         echo "axtNet on $c"
         netToAxt danRer1Net/$c.net chain/$c.chain \
                  /cluster/data/mm5/mixedNib \
                  /cluster/data/danRer1/nib ../axtNet/$c.axt
     echo "Complete: $c.net -> $c.axt"
     end
 'EOF'
                                                                                 
     chmod +x axtNet.csh
     csh axtNet.csh >&! axtNet.log &
     tail -100f axtNet.log
 
     # sort axts before making mafs - must be sorted for multiz
     cd /cluster/data/mm5/bed/blastz.danRer1
     mv axtNet axtNet.unsorted
     mkdir axtNet
     foreach f (axtNet.unsorted/*.axt)
         set c = $f:t:r
         echo "Sorting $c"
         axtSort $f axtNet/$c.axt
     end
     # create maf
     ssh eieio
     cd /cluster/data/mm5/bed/blastz.danRer1
     cd axtNet
     mkdir ../mafNet
 cat > makeMaf.csh << 'EOF'
     foreach f (chr*.axt)
       set maf = $f:t:r.danRer1.maf
       echo translating $f to $maf
       axtToMaf $f \
             /cluster/data/mm5/chrom.sizes /cluster/data/danRer1/chrom.sizes \
             ../mafNet/$maf -tPrefix=mm5.  -qPrefix=danRer1.
     end
 'EOF'
     chmod +x makeMaf.csh
     csh makeMaf.csh >&! makeMaf.log &
     tail -100f makeMaf.log
  
 # BLASTZ DANRER1 CLEAN UP (DONE, 2004-08-04, hartera)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.danRer1
     nice rm -rf raw &
     nice rm -rf lav &
     nice rm -rf axtChrom.orig &
     nice rm axtChain/run1/chain/* &
     nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} &
 # unzip all.chain.gz and danRer1.net.gz to make vsDanRer1 downloadables
 # then zip these again (hartera, 2004-09-10)
 
 # UPDATE BACEND SEQUENCES (DONE - 2004-07-20 - Fan)
 
     # Download new files
     ssh kksilo
     # -p creates the intermediate bacends/ directory as well
     mkdir -p /cluster/data/mm5/bed/bacends/ncbi
     cd /cluster/data/mm5/bed/bacends/ncbi
     wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/AllBACends.mfa.gz
     wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/cl_acc_gi_len.gz
     wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/BACENDS/README
     gunzip AllBACends.mfa.gz
     gunzip cl_acc_gi_len.gz
 
     # Convert fa file
     cat << '_EOF_' > convert.pl
 #!/usr/local/bin/perl -w
 
 use strict;
 
 while (my $line = <>) {
     if (substr($line,0,1) ne ">") {
         print $line;
     } else {
         my @fields = split(/\|/, $line);
         my $printed = 0;
         for (my $i = 0; $i < $#fields; $i++) {
                 if ($fields[$i] eq "gb") {
                         (my $name, my $vers) = split(/\./,$fields[$i+1]);
                         print ">$name\n";
                         $i= $#fields;
                         $printed = 1;
                 }
         }
         if (!$printed) {
                 die("Failed for $line\n");
         }
     }
 }
 '_EOF_'
     chmod +x convert.pl
     ./convert.pl < AllBACends.mfa > BACends.fa
 
     # Create new pairs files
     convertBacEndPairInfo cl_acc_gi_len
 
     # Split file into pieces and copy to cluster to propagate
     ssh kksilo
     cd /cluster/data/mm5/bed/bacends/ncbi
     /cluster/bin/i386/faSplit sequence BACends.fa 100 BACends
     rm -rf /cluster/bluearc/scratch/mus/mm5/bacEnds
     mkdir /cluster/bluearc/scratch/mus/mm5/bacEnds
     mv BACends???.fa /cluster/bluearc/scratch/mus/mm5/bacEnds
     cp -p BACends.fa /cluster/bluearc/scratch/mus/mm5/bacEnds
 
     # Ask for propagation from sysadmin
 
     # Load the sequences (change bacends.# to match correct location)
     ssh hgwdev
     mkdir /gbdb/mm5/bacends
     cd /gbdb/mm5/bacends
     ln -s /cluster/data/mm5/bed/bacends/ncbi/BACends.fa .
     cd /tmp
     hgLoadSeq mm5 /gbdb/mm5/bacends/BACends.fa
     #Adding /gbdb/mm5/bacends/BACends.fa
     #452237 sequences
     #Updating seq table
     
     # One additional step 9/10/04 Fan.
     # Create a composite index to speed up hgTracks display when BAC Ends track selected.
     hgsql mm5 -e 'create index bacIndex2 on all_bacends(bin, qName(8));'
     # This will take hours.
     
     #All done
 
 # BACEND SEQUENCE ALIGNMENTS (DONE - 2004-07-23 - Fan)
     # (alignments done without RepeatMasking)
     #	We need an ooc file for this genome
     ssh kksilo
     mkdir /cluster/data/mm5/ooc
     cd /cluster/data/mm5/ooc
     ls ../unmaskedNib/chr*.nib > nib.list
     blat -makeOoc=11.ooc -repMatch=1024 nib.list nib.list output.psl
     # Wrote 26077 overused 11-mers to 11.ooc
     # Did not end up using this.  Used an old one instead.
 
     # Create full sequence alignments
     ssh kk
     cd /cluster/data/mm5/bed/bacends
 
     /cluster/bin/scripts/splitContigList -scratch /iscratch/i/mus/mm5/maskedContigs 1
 
     # allow blat to run politely in /tmp while it writes output, then
     # copy results to results file:
 
     cat << '_EOF_' > runBlat.sh
 #!/bin/sh
 # Usage: runBlat.sh path1 path2 root1 root2 result
 # Runs blat on path1 vs path2 inside a private /tmp directory named after
 # the two roots, then moves the finished psl to ${result}.
 path1=$1
 path2=$2
 root1=$3
 root2=$4
 result=$5
 tmpDir=/tmp/${root1}_${root2}
 rm -fr ${tmpDir}
 mkdir ${tmpDir}
 # pushd/popd are bash builtins and not part of POSIX sh; a subshell with cd
 # runs blat in the scratch directory without changing our own working
 # directory, which matters because ${result} may be a relative path.
 (cd ${tmpDir} && \
 /cluster/bin/i386/blat ${path1} ${path2} -ooc=/scratch/hg/h/mouse11.ooc \
 	${root1}.${root2}.psl)
 rm -f ${result}
 mv ${tmpDir}/${root1}.${root2}.psl ${result}
 rm -fr ${tmpDir}
 '_EOF_'
     # << this line keeps emacs coloring happy
     chmod +x runBlat.sh
 
     # gensub2 template: the job between #LOOP and #ENDLOOP is kept on a single
     # line so that one command is generated per (contig, bacEnds) pair.
     cat << '_EOF_' > template
 #LOOP
 ./runBlat.sh {check in exists $(path1)} {check in exists $(path2)} $(root1) $(root2) {check out line+ bacEnds.out/$(root2)/$(root1).$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << this line keeps emacs coloring happy
     #ls -1S /iscratch/i/mm5/bacEnds/BACends???.fa > bacEnds.lst
     ls -1S /scratch/mus/mm5/bacEnds/BACends???.fa > bacEnds.lst
     mkdir bacEnds.out
     #	create results directories for each to avoid the all result files in
     #	one directory problem
     foreach f (`cat bacEnds.lst`)
 	set b = $f:t:r
 	echo $b
 	mkdir bacEnds.out/$b
     end
 
     gensub2 contig.lst bacEnds.lst template jobList
     para create jobList
     # 62622 jobs written to batch
     para try, check, push, etc ...
 
 # Completed: 62622 of 62622 jobs
 # CPU time in finished jobs:    3760354s   62672.57m  1044.54h   43.52d  0.119 y
 # IO & Wait Time:               3216480s   53608.00m   893.47h   37.23d  0.102 y
 # Average job time:                 111s       1.86m     0.03h    0.00d
 # Longest job:                     2841s      47.35m     0.79h    0.03d
 # Submission to last job:          9395s     156.58m     2.61h    0.11d
 
 # Compile alignments and lift the files.
 # First attempt failed due to /cluster/store6 ran out of space.
 # Redoing it 7/22/04.
 
     ssh kksilo
     cd /cluster/data/mm5/bed/bacends
 
     mkdir /cluster/store8/fanTemp
     time pslSort dirs raw.psl /cluster/store8/fanTemp bacEnds.out/* \
       > time.out &
 
     # This may take over 14 hours!
 
     ssh kolossus
     cd /cluster/data/mm5/bed/bacends
 
     time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons raw.psl \
         bacEnds.psl /dev/null
 # Processed 562840490 alignments
 
     rmdir temp
     #	You will want to keep this file around until later processing is
     #	proven correct
     rm raw.psl		# 72 Gb !  It takes a while even to remove it.
 
     ssh kksilo
     cd /cluster/data/mm5/bed/bacends
 
     time /cluster/bin/scripts/lifter -psl -mouse /cluster/data/mm5 bacEnds.psl 
     # real    130m36.149s
     # user    82m38.180s
     # sys     10m59.580s
 
     cp -p ~booch/clusterJobs/bacends/split.pl .
     cp -p ~booch/clusterJobs/bacends/header .
     time ./split.pl header < bacEnds.psl.lifted
     # real    2m16.354s
     # user    0m36.390s
     # sys     0m42.290s
 
     cp -p bacEnds.psl.lifted bacEnds.psl.lifted.save
     time pslSort dirs bacEnds.psl.lifted temp split
 
     # real    17m2.353s
     # user    14m17.040s
     # sys     1m38.560s
 
     rmdir temp
     rm -r split
 
     # Copy files to final destination and remove
     mkdir /cluster/data/mm5/bacends
     cp -p bacEnds.psl.lifted /cluster/data/mm5/bacends
 
 # BACEND PAIRS TRACK (DONE  2004-07-27 - Fan)
 
     ssh kolossus
     cd /cluster/data/mm5/bacends
 
 bash
 time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
 -mismatch -verbose bacEnds.psl.lifted \
 	../bed/bacends/ncbi/bacEndPairs.txt all_bacends bacEnds
 
     # create header required by "rdb" tools
     echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' \
         > header
     echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
     # edit header to make sure \t is/become tab character
 
     # good pairs: keep score >= 300, sort by chrom/start, strip the rdb header
     cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \
         | headchg -del > bacEndPairs.bed
     # problem pairs (slop/short/long/mismatch/orphan) go to a separate bed file
     cat header  bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
         bacEnds.orphan \
         | row score ge 300 | sorttbl chr start | headchg -del \
         > bacEndPairsBad.bed
 
 # The following took too long, break it into 3 steps.
 #    extractPslLoad -noBin bacEnds.psl.lifted bacEndPairs.bed \
 #    bacEndPairsBad.bed | sorttbl tname tstart | headchg -del > bacEnds.load.psl
 
     extractPslLoad -noBin bacEnds.psl.lifted bacEndPairs.bed   \
     bacEndPairsBad.bed >j1.out
     cat j1.out| sorttbl tname tstart >j2.out
     cat j2.out | headchg -del > bacEnds.load.psl
 
     rm j1.out j2.out
 
     # load into database
     ssh hgwdev
     cd /cluster/data/mm5/bacends
 
     # edit bacEndPairs.bed to fix one ID that has a blank character in it.
     hgLoadBed mm5 bacEndPairs bacEndPairs.bed \
                  -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql
         # Loaded 168535
     # note - this track isn't pushed to RR, just used for assembly QA
     hgLoadBed mm5 bacEndPairsBad bacEndPairsBad.bed \
                  -sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql
         # Loaded 43182
     #hgLoadPsl mm5 -nobin -table=all_bacends bacEnds.load.psl
     # NOTE: truncates file to 0 if -nobin is used
     hgLoadPsl mm5 -table=all_bacends bacEnds.load.psl
     # load of all_bacends did not go as planned: 14426473 record(s), 0 row(s)
     # skipped, 4519 warning(s) loading psl.tab
     
 # featureBits mm5 all_bacends
 # 268502414 bases of 2615483787 (10.266%) in intersection
 # featureBits mm4 all_bacends
 # 243096171 bases of 2627444668 (9.252%) in intersection
 
 # featureBits mm5 bacEndPairs
 # 2567958504 bases of 2615483787 (98.183%) in intersection
 # featureBits mm4 bacEndPairs
 # 2549945356 bases of 2627444668 (97.050%) in intersection
 
 # featureBits mm5 bacEndPairsBad
 # 541027882 bases of 2615483787 (20.686%) in intersection
 # featureBits mm4 bacEndPairsBad
 # 1074505863 bases of 2627444668 (40.895%) in intersection
 
 
 # BLASTZ FUGU (FR1) (WORKING 7/28/04 kate)
     # Using Angie's hg17/fugu as a model
 
     # Treat all mouse repeats as lineage-specific (same as chicken, so just
     # reuse linSpecRep.notInChicken).
     ssh kkr1u00
     ln -s /iscratch/i/mus/mm5/linSpecRep.notInChicken \
                 /iscratch/i/mus/mm5/linSpecRep.notInFugu
     iSync
 
     ssh kk
     cd /cluster/data/mm5/bed
     mkdir blastz.fr1.2004-07-28
     ln -s blastz.fr1.2004-07-28 blastz.fr1
     cd blastz.fr1
 
     cat << '_EOF_' > DEF
 # mouse vs. fugu
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
 
 ALIGN=blastz-run
 BLASTZ=blastz
 
 # Reuse parameters from human-chicken, except L=6000 (more relaxed)
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse
 SEQ1_DIR=/iscratch/i/mus/mm5/softNib
 SEQ1_RMSK=
 SEQ1_FLAG=
 SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInFugu
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Fugu
 SEQ2_DIR=/iscratch/i/fr1/nib
 SEQ2_RMSK=
 SEQ2_FLAG=
 SEQ2_SMSK=/iscratch/i/fr1/linSpecRep
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm5/bed/blastz.fr1.2004-07-28
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     # first cluster run: raw blastz alignments
     ssh kk
     bash # if a csh/tcsh user
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28
     source DEF
     mkdir $RAW run.0
     /cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
     sh ./xdir.sh
     cd run.0
     sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
     para create jobList
     para try, check, push, check, ....
     
 # GOT HERE
 
 #Completed: 93775 of 93775 jobs
 #Average job time:                 187s       3.11m     0.05h    0.00d
 #Longest job:                     3907s      65.12m     1.09h    0.05d
 #Submission to last job:         76763s    1279.38m    21.32h    0.89d
 
     # second cluster run: lift raw alignments -> lav dir
     ssh kki
     bash # if a csh/tcsh user
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28
     source DEF
     mkdir run.1 lav
     /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
     cd run.1
     wc -l jobList
     para create jobList
     para try, check, push, etc ...
 #Completed: 341 of 341 jobs
 #Average job time:                  98s       1.63m     0.03h    0.00d
 #Longest job:                      281s       4.68m     0.08h    0.00d
 #Submission to last job:          2102s      35.03m     0.58h    0.02d
 
     # third run: lav -> axt
     # (if non-default BLASTZ_Q is used in the future, put axtRescore in 
     # the pipe after lavToAxt)
     ssh kki
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28
     mkdir axtChrom pslChrom run.2
     cd run.2
     cat << '_EOF_' > do.csh
 #!/bin/csh -ef
 cd $1
 set chr = $1:t
 cat `ls -1 *.lav | sort -g` \
 | $HOME/bin/x86_64/lavToAxt stdin \
     /iscratch/i/mus/mm5/softNib /iscratch/i/fr1/nib stdout \
 | $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt 
 $HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
   ../../pslChrom/$chr.psl
 '_EOF_'
     # << this line keeps emacs coloring happy
     chmod a+x do.csh
     cp /dev/null jobList
     foreach d (../lav/chr*)
       echo "do.csh $d" >> jobList
     end
     para create jobList
     para try, check, push, check
 #Completed: 43 of 43 jobs
 #Average job time:                 671s      11.18m     0.19h    0.01d
 #Longest job:                     2398s      39.97m     0.67h    0.03d
 #Submission to last job:          2417s      40.28m     0.67h    0.03d
 
 
 # CHAIN FUGU BLASTZ (WORKING 7/16/04 kate)
     # Run axtChain on little cluster
     ssh kki
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28
     mkdir -p axtChain/run1
     cd axtChain/run1
     # the gsub template below writes to chain/$(root1).chain and out/$(root1).out
     mkdir out chain
     ls -1S /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChrom/*.axt \
       > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
 
     cat << '_EOF_' > doChain
 #!/bin/csh
 axtChain $1 \
     /iscratch/i/mus/mm5/softNib \
     /iscratch/i/fr1/nib $2 > $3
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x doChain
     gensub2 input.lst single gsub jobList
     para create jobList
     para try, check, push, check...
 #Completed: 43 of 43 jobs
 #Average job time:                 537s       8.96m     0.15h    0.01d
 #Longest job:                     2071s      34.52m     0.58h    0.02d
 #Submission to last job:          2071s      34.52m     0.58h    0.02d
     # now on the cluster server, sort chains
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
     chainMergeSort run1/chain/*.chain > all.chain
     chainSplit chain all.chain
     rm run1/chain/*.chain
 
     # take a look at score distr's
     foreach f (chain/*.chain)
       grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r
       textHistogram -binSize=5000 /tmp/score.$f:t:r
       echo ""
     end
 
     # Lots of chaff with scores in the 3000's.  Many very-high-scoring 
     # chains.  So filter the chain down somewhat...
     mv all.chain all.chain.unfiltered
     chainFilter -minScore=5000 all.chain.unfiltered > all.chain
     rm chain/*
     chainSplit chain all.chain
     gzip all.chain.unfiltered
 
     # Load chains into database
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain/chain
     foreach i (*.chain)
         set c = $i:r
         hgLoadChain mm5 ${c}_chainFr1 $i
     end
     # mouse-fugu gets significantly less coverage than human-fugu:
     featureBits mm5 -chrom=chr1 chainFr1Link
 #63386139 bases of 185739816 (34.126%) in intersection
     featureBits hg17 -chrom=chr1 chainFr1Link
 #123999291 bases of 222827847 (55.648%) in intersection
     # mouse-fugu isn't a whole lot less than mouse-human though:
     featureBits mm5 -chrom=chr1 chainHg17Link
 #75492250 bases of 185739816 (40.644%) in intersection
     featureBits mm5 -chrom=chr1 chainCanFam1Link
 #63386139 bases of 185739816 (34.126%) in intersection
 
 
 # NET FUGU BLASTZ (WORKING 7/16/04 kate)
     ssh kolossus
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
     chainPreNet all.chain ../S1.len ../S2.len stdout \
     | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
     | netSyntenic stdin noClass.net
 
     # Add classification info using db tables:
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
     netClass -noAr noClass.net mm5 fr1 fugu.net
 
     # Make a 'syntenic' subset:
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
     rm noClass.net
     # Make a 'syntenic' subset of the classified nets with netFilter -syn:
     netFilter -syn fugu.net > fuguSyn.net
 
     # Load the nets into database 
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
     netFilter -minGap=10 fugu.net |  hgLoadNet mm5 netFr1 stdin
     netFilter -minGap=10 fuguSyn.net | hgLoadNet mm5 syntenyNetFr1 stdin
     # Add entries for chainFr1, netFr1 to mouse/mm5 trackDb
 
 
 # MAKE VSFR1 DOWNLOADABLES (WORKING 7/19/04 kate)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28
     gzip axtNet/chr*.axt
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
     ln all.chain fugu.chain
     zip /cluster/data/mm5/zip/fugu.chain.zip fugu.chain
     rm fugu.chain
     zip /cluster/data/mm5/zip/fugu.net.zip fugu.net
     zip /cluster/data/mm5/zip/fuguSyn.net.zip fuguSyn.net
 
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/mm5/vsFr1
     cd /usr/local/apache/htdocs/goldenPath/mm5/vsFr1
     mv /cluster/data/mm5/zip/fugu*.zip .
     cp -pR /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtNet .
     md5sum *.zip axtNet/* > md5sum.txt
     # Copy over & edit README.txt w/pointers to chain, net formats.
 
 
 # GENERATE FR1 MAF FOR MULTIZ FROM NET (WORKING 7/19/04 kate)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28/axtChain
     netSplit fugu.net net
     cd /cluster/data/mm5/bed/blastz.fr1.2004-07-28
     mkdir axtNet
     foreach f (axtChain/net/*)
       set chr = $f:t:r
       netToAxt $f axtChain/chain/$chr.chain /cluster/data/mm5/nib \
         /cluster/data/fr1/nib stdout \
       | axtSort stdin axtNet/$chr.axt
     end
     mkdir mafNet
     foreach f (axtNet/chr*.axt)
       set maf = mafNet/$f:t:r.mc.maf
       axtToMaf $f \
             /cluster/data/mm5/chrom.sizes /cluster/data/fr1/chrom.sizes \
             $maf -tPrefix=mm5. -qPrefix=fr1.
     end
 
 
 # BLASTZ FR1 CLEAN UP (WORKING - 2004-07-28 - kate)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.fr1
     nice rm -rf raw &
     nice rm axtChain/run1/chain/* &
     nice gzip {axt,psl}Chrom/* lav/*/* axtChain/{all.chain,*.net} &
 
 
 # CONSERVATION TRACK - MULTIZ AND PHASTCONS (WORKING 2004-07-29 kate)
 
     ssh kksilo
     set multizDir = multiz.2004-07-29
     set workingDir = /cluster/bluearc/mm5/$multizDir
     ln -s $workingDir /cluster/bluearc/mm5/multiz5way
     mkdir -p $workingDir
     mkdir -p /cluster/data/mm5/bed/$multizDir
     cd /cluster/data/mm5/bed/$multizDir
 
 # wrapper script for multiz
     # NOTE: first arg is pairwise, 2nd arg is multiple (to add to) 
     # NOTE: next time, modify script so it only needs one arg -- saves the
     # multiple dirname in a file for use by the next run
     cat << 'EOF' > doMultiz.csh
 #!/bin/csh -fe
 mkdir -p $3:h
 /cluster/bin/penn/multiz $1 $2 - > $3
 'EOF'
 # << for emacs
     cat << 'EOF' > gsub
 #LOOP
 ../doMultiz.csh {check in line /cluster/bluearc/mm5/multiz.2004-07-29/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/mm5/multiz.2004-07-29/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/mm5/multiz.2004-07-29/$(root1)$(dir1)/$(root2).maf}
 #ENDLOOP
 'EOF'
 # << for emacs
     chmod +x doMultiz.csh
 
     # copy mafs to bluearc -- rat
     ssh kksilo
     set workingDir = /cluster/bluearc/mm5/multiz.2004-07-29
     mkdir $workingDir/rn3
     cp /cluster/data/mm5/bed/blastz.rn3/mafNet/chr*.maf $workingDir/rn3
     ls $workingDir/rn3/*.maf > chrom.lst
 
     # human
     mkdir $workingDir/hg17
     cp /cluster/data/mm5/bed/blastz.hg17/mafNet/chr*.maf $workingDir/hg17
 
     # dog
     mkdir $workingDir/canFam1
     cp /cluster/data/mm5/bed/blastz.canFam1/mafNet/chr*.maf $workingDir/canFam1
 
     # chicken
     mkdir $workingDir/galGal2
     cp /cluster/data/mm5/bed/blastz.galGal2/mafNet/chr*.maf $workingDir/galGal2
 
     # first multiz - add in human to mouse/rat
     # 
     ssh kki
     set multizDir = multiz.2004-07-29
     set workingDir = /cluster/bluearc/mm5/$multizDir
     cd /cluster/data/mm5/bed/$multizDir
     mkdir run.hg17
     cd run.hg17
     echo "hg17/rn3" > species.lst
     gensub2 species.lst ../chrom.lst ../gsub jobList
     para create jobList
         # 43 jobs
     para try, check, push, check
     cd ..
 
     # dog
     mkdir run.canFam1
     cd run.canFam1
     echo "canFam1/rn3hg17" > species.lst
     gensub2 species.lst ../chrom.lst ../gsub jobList
     para create jobList
     para try, check, push, check
     cd ..
 
     # chicken
     mkdir run.galGal2
     cd run.galGal2
     echo "galGal2/rn3hg17canFam1" > species.lst
     gensub2 species.lst ../chrom.lst ../gsub jobList
     # no alignment file for chr18_random -- create one so we can create jobList
     para create jobList
     para try, check, push, check
     cd ..
 
     # copy 5-way mafs to build directory
     ssh kksilo
     set multizDir = multiz.2004-07-29
     set workingDir = /cluster/bluearc/mm5/$multizDir
     ln -s $workingDir/rn3hg17canFam1galGal2 $workingDir/maf
     cd /cluster/data/mm5/bed/multiz.2004-07-29
     mkdir maf
     cp $workingDir/maf/*.maf maf
 
 
 # PHYLO-HMM CONSERVATION FOR 5-WAY MULTIZ (DONE 2004-07-29 kate)
 # updated 09-13-04 acs
 
     ssh kksilo
     set path = ($path /cluster/bin/phast)
     cd /cluster/data/mm5/bed/multiz.2004-07-29
     mkdir cons
     cd cons
 
     #break up the genome-wide MAFs into pieces
     mkdir /cluster/bluearc/mm5/chrom
     cd /cluster/data/mm5
     foreach f (?{,?}/*.fa)
         echo $f
         cp $f /cluster/bluearc/mm5/chrom
     end
 
     ssh kki
     cd /cluster/data/mm5/bed/multiz.2004-07-29/cons
     mkdir run.split
     cd run.split
     set WINDOWS = /cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS
     rm -fr $WINDOWS
     mkdir -p $WINDOWS
     # doSplit.sh: split one chromosome's MAF ($1) into windows with msa_split
     # on local /scratch, gzip every resulting .ss window into $WINDOWS, then
     # append "Done" to $WINDOWS/<chrom>.done.  Any failing step exits 1 so
     # that the .done marker (checked by parasol) is never written for a
     # partially copied chromosome.
     cat << 'EOF' > doSplit.sh
 #!/bin/sh
 
 PHAST=/cluster/bin/phast
 FA_SRC=/cluster/bluearc/mm5/chrom
 WINDOWS=/cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS
 
 maf=$1
 c=`basename $maf .maf`
 echo $c
 mkdir -p /scratch/msa_split
 ${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O mm5,rn3,hg17,canFam1,galGal2 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -B 5000
 [ $? -eq 0 ] || exit 1
 echo "Copying..."
 cd /scratch/msa_split || exit 1
 # test each gzip inside the loop: the exit status of a for loop is only
 # that of the last command run, so a check after "done" misses earlier failures
 for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz || exit 1 ; done
 rm -f /scratch/msa_split/$c.*.ss
 echo "Done copying"
 echo "Done" >> ${WINDOWS}/$c.done
 'EOF'
 # << for emacs
     chmod +x doSplit.sh
     rm -f jobList
     foreach file (/cluster/bluearc/mm5/multiz.2004-07-29/maf/*.maf) 
         set c = $file:t:r
 	echo "doSplit.sh $file {check out line+ $WINDOWS/$c.done}" >> jobList
     end
     
     para create jobList
         # 43 jobs
     para try
     para check
     para push
 #CPU time in finished jobs:       4354s      72.57m     1.21h    0.05d  0.000 y
 #IO & Wait Time:                  6102s     101.70m     1.70h    0.07d  0.000 y
 #Average job time:                 243s       4.05m     0.07h    0.00d
 #Longest job:                      728s      12.13m     0.20h    0.01d
 #Submission to last job:          1300s      21.67m     0.36h    0.02d
     cd ..
 
     # generate conservation scoring using phastCons
     ssh kk
     cd /cluster/data/mm5/bed/multiz.2004-07-29/cons
     mkdir run.cons
     cd run.cons
 
     # skip parameter estimation step: use parameters already estimated for
     # hg17 (see makeHg17.doc)
     cp /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements/ave.cons.mod /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements/ave.noncons.mod .
 
     # doPhastCons.sh: run phastCons on one gzipped SS window ($1) using the
     # ave.cons.mod/ave.noncons.mod models in the current directory.  The
     # --viterbi predictions go to ELEMENTS/<window>.bed and the scores
     # written to stdout are gzipped into POSTPROBS/<window>.pp.gz.
     # The phastCons invocation is ONE command: keep the trailing backslash,
     # otherwise the option line is run as a separate (failing) command.
     cat << 'EOF' > doPhastCons.sh
 #!/bin/sh
 
 mkdir -p /cluster/bluearc/mm5/phastCons/POSTPROBS /cluster/bluearc/mm5/phastCons/ELEMENTS
 pref=`basename $1 .ss.gz`
 chr=`echo $pref | awk -F\. '{print $1}'`
 tmpfile=/scratch/phastCons.$$
 zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod \
   --expected-lengths 12 --target-coverage 0.15 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/mm5/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
 gzip -c $tmpfile > /cluster/bluearc/mm5/phastCons/POSTPROBS/$pref.pp.gz
 rm $tmpfile
 EOF
     chmod u+x doPhastCons.sh
 
     rm -fr /cluster/bluearc/mm5/phastCons/POSTPROBS /cluster/bluearc/mm5/phastCons/ELEMENTS
     rm -f jobs.lst
     for f in /cluster/bluearc/mm5/multiz.2004-07-29/cons/WINDOWS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs.lst ; done
 
     # run cluster job
     para create, ...
     # took about 10 minutes
 
     # combine predictions and transform scores to be in 0-1000 interval
     # do in a way that avoids limits on numbers of args
     find /cluster/bluearc/mm5/phastCons/ELEMENTS -name "*.bed" > files
     rm -f splitfiles* all.raw.bed
     split files splitfiles
     for s in splitfiles* ; do awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' `cat $s` >> all.raw.bed ; done
     /cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
     rm files splitfiles* 
 
     hgLoadBed mm5 phastConsElements all.bed
 
     # check coverage
     featureBits mm5 phastConsElements
 #135605549 bases of 2615483787 (5.185%) in intersection
     # This should be close enough.  If necessary, you can rerun the
     # steps above with a different target coverage.  When hitting the
     # target is important, you may want to perform several iterations
     # using a representative subset of the entire dataset (in human, chr1
     # seems to work pretty well)
 
     # set up wiggle
     # doWigAsciiToBinary.sh takes one chromosome name ($1), concatenates that
     # chromosome's POSTPROBS/*.pp.gz files ordered numerically by the second
     # dot-separated field of the file name (presumably the window start --
     # confirm against the msa_split output naming), and pipes the stream to
     # wigAsciiToBinary, which writes to the ${chr}_phastCons -wibFile prefix.
     mkdir -p /cluster/bluearc/mm5/phastCons/wib
     cat << 'EOF' > doWigAsciiToBinary.sh
 #!/bin/sh
 chr=$1
 zcat `ls /cluster/bluearc/mm5/phastCons/POSTPROBS/$chr.*.pp.gz | sort -t\. -k2,2n` | wigAsciiToBinary -chrom=$chr -wibFile=/cluster/bluearc/mm5/phastCons/wib/${chr}_phastCons stdin 
 EOF
     chmod u+x doWigAsciiToBinary.sh
 
     rm -f jobs2.lst
     for chr in `ls /cluster/bluearc/mm5/phastCons/POSTPROBS | awk -F\. '{print $1}' | sort -u` ; do echo doWigAsciiToBinary.sh $chr >> jobs2.lst ; done
 
     # run a little wigAsciiToBinary cluster job
     ssh kk, etc.
 
     # copy wibs and wigs from bluearc
     rsync -av /cluster/bluearc/mm5/phastCons/wib .
 
     # load track
     hgLoadWiggle mm5 phastCons -pathPrefix=/gbdb/mm5/phastCons/wib \
                 wib/chr*_phastCons.wig
     mkdir -p /gbdb/mm5/phastCons/wib
     rm -f /gbdb/mm5/phastCons/wib/chr*phastCons.wib
     ln -s /cluster/data/mm5/bed/multiz.2004-07-29/cons/run.cons/wib/*.wib /gbdb/mm5/phastCons/wib
     chmod 775 . wib /gbdb/mm5/phastCons /gbdb/mm5/phastCons/wib
     chmod 664 wib/*.wib
 
     # move postprobs over and clean up bluearc 
     rsync -av /cluster/bluearc/mm5/phastCons/POSTPROBS .
     # (people sometimes want the raw scores)
     rm -r /cluster/bluearc/mm5/phastCons/ELEMENTS /cluster/bluearc/mm5/phastCons/POSTPROBS /cluster/bluearc/mm5/phastCons/wib
 
     # load data for track name "multiz5way"    
 
     # load multiz maf tables 
     ssh hgwdev
     cd /cluster/data/mm5/bed/multiz.2004-07-29
     set mafDir = /gbdb/mm5/multiz5way/maf
     set table = multiz5way
     mkdir -p $mafDir/$table
     ln -s `pwd`/maf/*.maf $mafDir/$table
     cd maf
     hgLoadMaf mm5 -warn multiz5way -pathPrefix=$mafDir/$table
 
    # load blastz maf tables
     # TODO: change mafWiggle to use db names instead of species names
     # in speciesOrder
     # loadMaf.csh: for each aligned db, look up its genome name in
     # hgcentraltest.dbDb, apply csh's :l (lowercase) modifier to it, and use
     # the result as the prefix of a <name>_netBlastz table; the mafNet files
     # are symlinked under $mafDir/<table> and loaded with hgLoadMaf.
     # NOTE: the script resolves blastz.$s via `pwd`, so it must be run from
     # /cluster/data/mm5/bed (as done below), not from multiz5way itself.
     ssh hgwdev
     cd /cluster/data/mm5/bed
     ln -s multiz.2004-07-29 multiz5way
 cat > multiz5way/loadMaf.csh << 'EOF'
     set mafDir = /gbdb/mm5/multiz5way/maf
     foreach s (rn3 hg17 canFam1 galGal2)
         set O = `echo "select genome from dbDb where name='$s'" | \
                 hgsql -s -h genome-testdb hgcentraltest`
         set o = $O:l
         set table = ${o}_netBlastz
         mkdir -p $mafDir/$table
         ln -s `pwd`/blastz.$s/mafNet/*.maf $mafDir/$table
         echo $o
         hgLoadMaf mm5 -warn ${o}_netBlastz -pathPrefix=$mafDir/$table
     end
 'EOF'
     # <<EOF for emacs
     csh multiz5way/loadMaf.csh >&! multiz5way/loadMaf.log &
     
 # track multiz5way
 # shortLabel Conservation
 # longLabel Rat/Human/Dog/Chicken Multiz Alignments & PhyloHMM Cons
 # group compGeno
 # priority 149
 # visibility pack
 #color 0, 10, 100
 # type wigMaf 0.0 1.0
 # maxHeightPixels 100:40:11
 # wiggle phastCons
 # yLineOnOff Off
-# autoScaleDefault Off
+# autoScale Off
 # pairwise netBlastz
 # speciesOrder rat human dog chicken
 
 
 # MULTIZ DOWNLOAD FILES (DONE kate 2004-08-03)
     ssh kksilo
     cd /cluster/data/mm5/bed/multiz5way
 
     # multiz
     mkdir gzMaf
     foreach f (maf/*.maf)
       gzip -c $f > gzMaf/$f:t.gz
       echo $f
     end
     ssh hgwdev
     mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/multiz5way
     cd /usr/local/apache/htdocs/goldenPath/mm5/multiz5way
     mv /cluster/data/mm5/bed/multiz5way/gzMaf/* .
     rmdir /cluster/data/mm5/bed/multiz5way/gzMaf
     md5sum *.gz > md5sum.txt
     # make a README.txt file
 
 
 # PHASTCONS SCORES DOWNLOADABLES (DONE 10/11/04 angie)
     ssh kksilo
     mkdir /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2
     cd /cluster/data/mm5/bed/multiz5way/cons/run.cons/POSTPROBS
     foreach chr (`awk '{print $1;}' /cluster/data/mm5/chrom.sizes`)
       echo $chr
       zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` \
       | gzip -c \
       > /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2/$chr.gz
     end
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/mm5/phastCons
     # Doh!  /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2 is 8.6G now -- too much 
     # to dump on hgwdev's / which is at 94%.  Instead of doing this:
     #mv /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2 .
     # make symbolic links:
     mkdir /usr/local/apache/htdocs/goldenPath/mm5/phastCons/mzRn3Hg17Cf1Gg2
     cd /usr/local/apache/htdocs/goldenPath/mm5/phastCons/mzRn3Hg17Cf1Gg2
     ln -s /cluster/data/mm5/zip/mzRn3Hg17Cf1Gg2/* .
     md5sum *.gz > md5sum.txt
     # make a README.txt.
 
 
 # PREP FOR LIFTOVER CHAINS TO THIS ASSEMBLY (2004-08-02 kate)
     
     # split into 3K chunks
     ssh kksilo
     cd /cluster/data/mm5
     set liftDir = /iscratch/i/mm5/liftOver/liftSplit
     mkdir -p $liftDir
     cd $liftDir
     mkdir -p split lift
 cat > split.csh << 'EOF'
     set liftDir = /iscratch/i/mm5/liftOver/liftSplit
     cd /cluster/data/mm5
     foreach n (`ls ?{,?}/*.fa`)
         set d = $n:h
         set c = $n:t:r
         echo $c
         faSplit -lift=$liftDir/lift/$c.lft size \
             /cluster/data/mm5/$d/$c.fa -oneFile 3000 $liftDir/split/$c
     end
 'EOF'
 # << for emacs
     csh split.csh >&! split.log &
     tail -100f split.log
     ssh kkr1u00
     iSync
 
 
 # LOAD GENEID GENES (DONE 8/2/04 Fan)
     # reloaded 3/16/04 with -gtf instead of -exon=CDS (nec. now! for stop_codon)
     mkdir -p /cluster/data/mm5/bed/geneid/download
     cd /cluster/data/mm5/bed/geneid/download
     foreach f (/cluster/data/mm5/*/chr*.fa)
       set chr = $f:t:r
       wget \
 http://genome.imim.es/genepredictions/M.musculus/mmMay2004/geneid_v1.2/$chr.gtf
       wget \
 http://genome.imim.es/genepredictions/M.musculus/mmMay2004/geneid_v1.2/$chr.prot
     end
     # Add missing .1 to protein id's
     foreach f (*.prot)
       perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
     end
     cd ..
     ldHgGene -genePredExt -gtf mm5 geneid download/*.gtf
     hgPepPred mm5 generic geneidPep download/*-fixed.prot
 
 
 # PRODUCING GENSCAN PREDICTIONS (DONE 08-03-04 Fan)
     ssh hgwdev
     mkdir /cluster/data/mm5/bed/genscan
     cd /cluster/data/mm5/bed/genscan
     # Check out hg3rdParty/genscanlinux to get latest genscan:
     cvs co hg3rdParty/genscanlinux
     # Run on small cluster (more mem than big cluster).
     ssh kki
     cd /cluster/data/mm5/bed/genscan
     # Make 3 subdirectories for genscan to put their output files in
     mkdir gtf pep subopt
     # Generate a list file, genome.list, of all the hard-masked contigs that 
     # *do not* consist of all-N's (which would cause genscan to blow up)
     rm -f genome.list
     touch genome.list
     foreach f ( `ls -1S /cluster/data/mm5/*/chr*_*/chr*_?{,?}.fa.masked` )
       egrep '[ACGT]' $f > /dev/null
       if ($status == 0) echo $f >> genome.list
     end
     wc -l genome.list
     # Create template file, gsub, for gensub2.  For example (3-line file).
     # The gsBig command between #LOOP and #ENDLOOP must stay on ONE line:
     # gensub2 turns each template line into one jobList line, so a wrapped
     # command would yield broken jobs (and split options such as -par=).
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     gensub2 genome.list single gsub jobList
     para create jobList
     para try, check, push, check, ...
 # Completed: 638 of 639 jobs
 # Crashed: 1 jobs
 # CPU time in finished jobs:     386282s    6438.03m   107.30h    4.47d  0.012 y
 # IO & Wait Time:                  3735s      62.25m     1.04h    0.04d  0.000 y
 # Average job time:                 611s      10.19m     0.17h    0.01d
 # Longest job:                    22687s     378.12m     6.30h    0.26d
 # Submission to last job:         33710s     561.83m     9.36h    0.39d
 
     # If there are crashes, diagnose with "para problems".  
     # If a job crashes due to genscan running out of memory, re-run it 
     # manually with "-window=1200000" instead of "-window=2400000".
     # (One command; the backslashes are line continuations, and each
     # option such as -exe=, -par= and -tmp= must stay in one piece.)
 /cluster/bin/i386/gsBig /cluster/data/mm5/19/chr19_1/chr19_1.fa.masked \
     gtf/chr19_1.fa.gtf -trans=pep/chr19_1.fa.pep \
     -subopt=subopt/chr19_1.fa.bed \
     -exe=hg3rdParty/genscanlinux/genscan \
     -par=hg3rdParty/genscanlinux/HumanIso.smat \
     -tmp=/tmp -window=1200000
 
     # Convert these to chromosome level files as so:
     ssh kksilo
     cd /cluster/data/mm5/bed/genscan
     liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
     liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
     cat pep/*.pep > genscan.pep
 
     # Load into the database as so:
     ssh hgwdev
     cd /cluster/data/mm5/bed/genscan
     # Reloaded without -genePredExt 1/6/05:
     ldHgGene mm5 -gtf genscan genscan.gtf
     hgPepPred mm5 generic genscanPep genscan.pep
     hgLoadBed mm5 genscanSubopt genscanSubopt.bed
 
 
 # MITOPRED DATA FOR HGGENE (DONE 8/10/04 angie)
     ssh hgwdev
     mkdir /cluster/data/mm5/bed/mitopred
     cd /cluster/data/mm5/bed/mitopred
     wget http://mitopred.sdsc.edu/data/mus_30.out
     perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' mus_30.out > mitopred.tab
     cat > mitopred.sql << '_EOF_'
 # Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
 CREATE TABLE mitopred (
     name varchar(10) not null,      # SwissProt ID
     confidence varchar(8) not null, # Confidence level
               #Indices
     PRIMARY KEY(name(6))
 );
 '_EOF_'
     # << this line makes emacs coloring happy
     hgsql mm5 < mitopred.sql
     hgsql mm5 -e 'load data local infile "mitopred.tab" into table mitopred'
 
 # STS MARKERS TRACK (RE-BUILT - 2004-08-24- Fan)
 
     ssh kksilo
     mkdir -p /cluster/data/mm5/bed/STSmarkers/downloads
     cd /cluster/data/mm5/bed/STSmarkers/downloads
     # these files appear to be new almost every day
     wget --timestamping \
 	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts
     wget --timestamping \
 	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
 
     # these map files appear to be old, 2002 Data
     wget --timestamping \
 ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/*
     #	Picks up files:
     #	345184 Feb 20  2002 10090.MGD.txt
     #	173294 Jun 27  2002 10090.WI_Mouse_Genetic.txt
     #	240637 Jun 27  2002 10090.WI_Mouse_YAC.txt
     #	390088 Jun 27  2002 10090.Whitehead-MRC_RH.txt
     # If these files have not been changing, then no need to worry about
     #	them.  We are just picking them up to see if they have changed
     #	since the last time we worked on this.
 
     # these reports from jax.org appear to be changing daily
     wget --timestamping \
 	ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt
     wget --timestamping \
 	ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt
     wget --timestamping \
 	ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt
 
     # compare them with previous versions.  Before this these were
     #	in /cluster/store5/mouseMarker/orig
     # these newly picked up files:
     sum -r 10090*
 # 48882   338 10090.MGD.txt
 # 24176   381 10090.Whitehead-MRC_RH.txt
 # 62367   170 10090.WI_Mouse_Genetic.txt
 # 50616   235 10090.WI_Mouse_YAC.txt
     sum -r *.rpt
 # 21267  4442 MRK_Dump2.rpt
 # 51274  3743 MRK_Sequence.rpt
 # 35293  2315 PRB_PrimerSeq.rpt
     sum -r UniSTS*
 # 40884 10502 UniSTS.aliases
 # 14407  2931 UniSTS_mouse.sts
 
 # the previous copies
     cd /cluster/store5/mouseMarker/orig
     sum -r 10090*
 # 48882   338 10090.MGD.txt
 # 24176   381 10090.Whitehead-MRC_RH.txt
 # 62367   170 10090.WI_Mouse_Genetic.txt
 # 50616   235 10090.WI_Mouse_YAC.txt
     sum -r *.rpt
 # 36880  4160 MRK_Dump2.rpt
 # 02447  3132 MRK_Sequence.rpt
 # 57914  2220 PRB_PrimerSeq.rpt
     sum -r UniSTS*
 # 36201  8843 UniSTS.aliases
 # 58524   970 UniSTS_mouse.alias
 # 42464  2291 UniSTS_mouse.sts
 
     # back to our work area, update the bed file
     #	to do this we need a new UniSTS_mouse.alias file
     # it is created by a combination of information from several
     # of the above files ! AND ! the previous stsInfoMouse.bed file
 
     cp /cluster/data/mm4/bed/STSmarkers/downloads/*.sh . -p
     cp /cluster/data/mm4/bed/STSmarkers/downloads/*.pl . -p
 
     #	This process has been captured in the script:
     #	/cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh
     # which uses a couple of perl scripts in that same directory.
     # briefly it is:
     
     # cd /cluster/data/mm5/bed/STSmarkers/downloads
     # ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0
     # grep MGI: UniSTS.aliases > MGI.aliases
     # ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \
     #	stsInfoAliases.txt
     # ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases
     # cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \
     #    | sort -n > UniSTS_mouse.alias
 
     /cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh
 
     # with that, we can create a new stsInfoMouse.bed file:
     bash
     cd /cluster/data/mm5/bed/STSmarkers
     /cluster/store5/mouseMarker/code/updateBed.pl \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed \
 	downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \
 	downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \
 	downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile
 
     # Yontao updated /cluster/store5/mouseMarker/code/cleanInfo.pl 8/10/04
     /cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed
 	
     # copy the stsInfoMouse.bed file from the working dir to the marker info storage folder.
     # added 2 new steps by Yontao	
 	mv /cluster/store5/mouseMarker/stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed_mm3
 	cp -p stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed
 
     # comparing to Mm4, this file was used there:
     # /cluster/store6/mm4/bed/STSmarkers
     # a wc of it shows:
     # 56406  786036 6425721 stsInfoMouse.bed
     # Now we have:
     # 58488  790056 6602318 stsInfoMouse.bed
 
     # and from that, create new primer fa, epcr, etc:
     /cluster/store5/mouseMarker/code/luConvertPrimerToFa \
 	stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info
     # the mouseC.fa file will be empty
     wc mouse?.*
     #      0       0       0 mouseC.fa
     # 286740  286686 6474893 mouseP.fa
     # 32232  161234 2044810 mouseP.info
     # 318972  447920 8519703 total
 
     # the equivalent Mm4 versions:
     #      0       0       0 mouseC.fa
     # 258307  258245 5815248 mouseP.fa
     # 29906  149545 1890926 mouseP.info
 
     #	copy the primers over to the bluearc for the kluster run
     cp -p mouseP.fa /cluster/bluearc/scratch/mus/mm5
     cp -p mouseP.info /cluster/bluearc/scratch/mus/mm5
 
 #  CLUSTER RUN FOR THE STS PRIMERS
 
     ssh kk
     mkdir -p /cluster/data/mm5/bed/STSmarkers/primer
     mkdir -p /cluster/data/mm5/bed/STSmarkers/ePCR
     cd /cluster/data/mm5/bed/STSmarkers/primer
     # the mouseP.fa comes from above
     echo "/cluster/bluearc/scratch/mus/mm5/mouseP.fa" > primers.lst
     # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. 
 cat << '_EOF_' > template
 #LOOP
 /cluster/bin/i386/blat.2 $(path1) $(path2) -ooc=/scratch/hg/h/mouse11.ooc  -minMatch=1 -minScore=0 -minIdentity=80 -oneOff {check out line+ primers.out/$(root1).psl}
 #ENDLOOP
 '_EOF_'
     mkdir primers.out
     /cluster/bin/scripts/splitContigList -mouse -scratch \
 	/cluster/bluearc/scratch/mus/mm5/maskedContigs 1
     /cluster/bin/i386/gensub2 contig.lst primers.lst template jobList
     para create jobList
     para try
     para check
     para push
     ... etc ...
 # Completed: 639 of 639 jobs
 # CPU time in finished jobs:     334066s    5567.76m    92.80h    3.87d  0.011 y
 # IO & Wait Time:                 72565s    1209.42m    20.16h    0.84d  0.002 y
 # Average job time:                 636s      10.61m     0.18h    0.01d
 # Longest job:                      800s      13.33m     0.22h    0.01d
 # Submission to last job:          1090s      18.17m     0.30h    0.01d
 
     # on the file server
     ssh kksilo
     cd /cluster/data/mm5/bed/STSmarkers/primer
     /cluster/bin/i386/pslSort dirs primers.psl temp primers.out
     rmdir temp
 
     # comparing results to Mm4:
     wc primers.psl
     # 5719969 120119288 590806241 primers.psl
     # Mm4 wc primers.psl /cluster/data/mm4/bed/STSmarkers/primer/primers.psl
     # 5745617 120657896 592135728 primers.psl
 
     # another kluster run
     ssh kk
     cd /cluster/data/mm5/bed/STSmarkers/ePCR
     ls -1S /cluster/bluearc/scratch/mus/mm5/maskedContigs > contig.lst
 # Edit this list to get full path names!
     mkdir epcr.out
     cat << '_EOF_' > template
 #LOOP
 /cluster/bin/scripts/luRunEpcr $(path1) $(path2) epcr.out/$(num2).epcr
 #ENDLOOP
 '_EOF_'
     # the mouseP.info was created above
     echo "/cluster/bluearc/scratch/mus/mm5/mouseP.info" > epcr.lst
     gensub2 epcr.lst contig.lst template jobList
     para create jobList
     para try
     para check
     para push
     ... etc ...
 # Completed: 639 of 639 jobs
 # CPU time in finished jobs:     146365s    2439.41m    40.66h    1.69d  0.005 y
 # IO & Wait Time:                 67691s    1128.19m    18.80h    0.78d  0.002 y
 # Average job time:                 335s       5.58m     0.09h    0.00d
 # Longest job:                      427s       7.12m     0.12h    0.00d
 # Submission to last job:           485s       8.08m     0.13h    0.01d
     ssh hgwdev
     cd /cluster/data/mm5/bed/STSmarkers/ePCR
     # all those results become all.epcr
     cat epcr.out/*.epcr > all.epcr
     # comparing results to Mm4:
 
     wc *.epcr
     # 55677  222708 2945623 all.epcr
     wc /cluster/store6/mm4/bed/STSmarkers/ePCR/*.epcr
     # 74705  298820 3971712 /cluster/store6/mm4/bed/STSmarkers/ePCR/all.epcr
 
     cd /cluster/data/mm5/bed/STSmarkers/primer
 
     /cluster/bin/scripts/filterSTSPrimers \
     -mouse ../stsInfoMouse.bed primers.psl \
         ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat
 
     #  The output should show an increasing count:
     #	Reading name info
     #	Reading primer info
     #	Processing file
     #	100000
     #	200000
     #	300000
     #	...
     #	5700000
     #	Determining ePCR not found
     #
     wc primers.psl.filter.blat
     # 33476  702996 3442402 primers.psl.filter.blat
 
     # Mm4:  wc primers.psl.filter.blat
     # 32729  687309 3331894 primers.psl.filter.blat
 
     # create accession_info.rdb  (chrM added to Terry's script for mouse)
     touch empty_sequence.inf
     /cluster/bin/scripts/compileAccInfo -mouse \
 	/cluster/data/mm5 empty_sequence.inf
     # works, but prints two apparent errors:
     # cat: /cluster/data/mm5/11/chr11_random.agp: No such file or directory
     # cat: /cluster/data/mm5/M/chrM_random.agp: No such file or directory
     mv accession_info.rdb accession_info.rdb.tmp
     /cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > \
 accession_info.rdb
     rm accession_info.rdb.tmp
     # comparing results to Mm4:
     #  Mm5 wc accession_info.rdb
     # 131845 1450299 9681940 accession_info.rdb
     #  Mm4 wc accession_info.rdb
     #  86935  956289 6374930 accession_info.rdb
 
     # 
     # 219652 1885501 11875772 total
     # wc /cluster/data/mm5/?/*.agp /cluster/data/mm5/??/*.agp
     # 252515 2152346 13568720 total
 
     # creates epcr.not.found.nomatch and epcr.not.found.psl
     /cluster/bin/scripts/epcrToPsl -mouse \
 	epcr.not.found ../mouseP.info \
 	accession_info.rdb /cluster/data/mm5
     # Comparing results to Mm4:
     # Mm5 wc epcr*
     # 463    1852   17080 epcr.not.found
     #  61     732    5845 epcr.not.found.nomatch
     # 402    8442   39011 epcr.not.found.psl
 
     # Mm4 wc epcr*
     # 328    1312   12011 epcr.not.found
     #  57     684    5474 epcr.not.found.nomatch
     # 266    5586   25711 epcr.not.found.psl
 
     # there is a single error being propagated here from the file
     # /cluster/store5/mouseMarker/stsInfoMouse.bed which has an error
     # at line 53958:
 62943   D2J3    91947   D2J3                            CAACCAGCTCAC    
 CAACCAGCTCAC    1825, 1025BP    0       MUS MUSCULUS
     # The value '1825,' is incorrect.  Should be a small integer here.
     # To work around this, I'm manually removing the resulting bad records
     # from the epcr.not.found.psl file, where the error has now become
     # four bad lines:
 # 24    0       0       0       1       1801    1       1789    +       27119   
 1825    0       1825chr11_16 0       1115413 1117226 2       12,12,  0,1813, 
 1115413,1117214,
 # 24    0       0       0       1       1801    1       1789    +       27119   
 1825    0       1825chr11_16 0       1115413 1117226 2       12,12,  0,1813, 
 1115413,1117214,
 216a219,220
 # 24    0       0       0       1       1801    1       1789    +       62943   
 1825,   0       1825,chr11_16        0       1115413 1117226 2       12,12,  
 0,1813, 1115413,1117214,
 # 24    0       0       0       1       1801    1       1789    +       62943   
 1825,   0       1825,chr11_16        0       1115413 1117226 2       12,12,  
 0,1813, 1115413,1117214,
     # taking those four lines out. 
     
     cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
 
     # lift those primers  (added chrM to this lifter script for mouse)
     # creates primers.psl.filter.lifted
     /cluster/bin/scripts/lifter -mouse -psl \
 	/cluster/data/mm5 primers.psl.filter
     # wc primers.psl.filter.lifted
     # 33691  707511 3601164 primers.psl.filter.lifted
 
     # create primers.psl.filter.lifted.initial
       bash
 	PATH=/cluster/bin/scripts:$PATH 
 	/cluster/bin/scripts/extractPslInfo primers.psl.filter.lifted
     #	wc primers.psl.filter.lifted.initial
     # 33689  202134 1799016 primers.psl.filter.lifted.initial
 
     # create primers.psl.filter.lifted.initial.acc
     /cluster/bin/scripts/findAccession -agp \
 	-mouse primers.psl.filter.lifted.initial /cluster/data/mm5
     # wc primers.psl.filter.lifted.initial.acc
     # 33689  235823 2158029 primers.psl.filter.lifted.initial.acc
 
     # this needs to be -rat as that specifies how to scan the
     # stsInfoMouse.bed file and it does not work if you use -mouse
     /cluster/bin/scripts/getStsId -rat \
 	../stsInfoMouse.bed  primers.psl.filter.lifted.initial.acc \
 	> primers.initial.acc.trans
     # wc primers.initial.acc.trans
     # 33689  235823 1834889 primers.initial.acc.trans
 
     sort -k 4n primers.initial.acc.trans > primers.final
     rm primers.psl.filter.lifted.initial.acc primers.initial.acc.trans
     # comparing results to Mm4:
     # Mm5 wc primers.final
     # 33689  235823 1834889 primers.final
     # Mm4 wc primers.final
     # 32983  230881 1771293 primers.final
 
     cd /cluster/data/mm5/bed/STSmarkers
     # stsMarkers.final is empty for mouse
     touch stsMarkers.final dummy
     bash
     PATH=/cluster/bin/scripts:$PATH \
     /cluster/bin/scripts/combineSeqPrimerPos \
 	stsMarkers.final primer/primers.final > stsMarkers_pos.rdb
     # Comparing results to Mm4
     # Mm5 wc stsMarkers_pos.rdb
     # 32085  224595 1862816 stsMarkers_pos.rdb
     # Mm4 wc stsMarkers_pos.rdb
     # 31270  218890 1869417 stsMarkers_pos.rdb
 
     /projects/cc/hg/ytlu/bin/script/perl/createStsBed \
 	stsInfoMouse.bed  stsMarkers_pos.rdb 500 > stsMapMouse.bed
     # wc stsMapMouse.bed
     # 29069  301535 2123622 stsMapMouse.bed
 
 #  loading STS markers tables
     ssh hgwdev
     cd /cluster/data/mm5/bed/STSmarkers
     cp -p /cluster/store6/mm4/bed/STSmarkers/ucscAlias.pl .
     bash
     ./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
      
     # wc ucscStsAlias.tab
     # 126624  379859 3037850 ucscStsAlias.tab
     hgsql -e "drop table stsAlias;" mm5
     hgsql mm5 < ~/kent/src/hg/lib/stsAlias.sql
     hgsql -e \
 	'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm5
     hgsql -e "drop table stsMapMouseNew;" mm5
     hgsql mm5 < ~/kent/src/hg/lib/stsMapMouseNew.sql
     hgsql -e \
 'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm5
     hgsql -e "drop table stsInfoMouseNew;" mm5
     hgsql mm5 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
     hgsql -e \
      'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm5
 
     hgLoadPsl -nobin -table=all_sts_primer mm5 primer/primers.psl.filter.lifted
 
     # load primer sequences	
     mkdir /gbdb/mm5/stsMarker
     ln -s /cluster/data/mm5/bed/STSmarkers/mouseP.fa \
 	/gbdb/mm5/stsMarker/mouseP.fa
 # PLEASE NOTE THAT THE -replace option is used because this is a rebuild,
 # otherwise there will be a problem that the seq and extFile tables 
 # will be out of sync. 
     hgLoadSeq -replace mm5 /gbdb/mm5/stsMarker/mouseP.fa
 #  Adding /gbdb/mm5/stsMarker/mouseP.fa
 #  32232 sequences
    # DONE - 2004-08-24 17:02
 
 # QA repush 2006-02-08 seq table to remove old STS sequences with no extFile reference (Jen)
 #   Heather found the problem on rr. RR table matched dev and beta was correct, so no
 #   joinerCheck errors for the mismatch were flagged for review.
 
 #  BLASTZ RAT RN3 (RE-DONE - 2004-08-30 - Fan)
 
 #  !!! PLEASE NOTE AS OF 9/2/04, THE 8/30/04-8/31/04 REBUILD OF BLASTZ, CHAIN, AND NET 
 #  FOR MM5-RN3 IS NO LONGER USED FOR MM5.  THE OLD MM5-RN3 CHAIN AND NET BUILD OF 7/14/04
 #  IS REVERSE PUSHED FROM RR BACK TO HGWDEV.
 
 # Reason for rebuild is to use more stringent blastz parameters to reduce size
 # of output files.
 
 # BLASTZ_H=2000
 # BLASTZ_Y=3400
 # BLASTZ_L=50000
 # scoring matrix
 # BLASTZ_Q=/cluster/data/blastz/mus_rat.q
 
 # MAKE SURE TO INCLUDE THE RESCORE STEP TO CORRECT A BLASTZ PROBLEM.
 # (axtRescore -scoreScheme=/cluster/data/blastz/mus_rat.q ...)
 
     ssh kk
     mkdir -p /cluster/data/mm5/bed/blastz.rn3.2004-08-29
     # cd by absolute path: a fresh login on kk does not start in
     # /cluster/data/mm5/bed, so a relative cd would not find the new directory
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
 
     cat << '_EOF_' > DEF
 # rat vs. mouse
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
 
 ALIGN=blastz-run
 BLASTZ=blastz
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=50000
 BLASTZ_T=2
 # scoring matrix
 BLASTZ_Q=/cluster/data/blastz/mus_rat.q 
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET
 # Mouse
 SEQ1_DIR=/scratch/mus/mm5/softNib
 # not used
 SEQ1_RMSK=
 # not used
 SEQ1_FLAG=
 SEQ1_SMSK=/iscratch/i/mus/mm5/linSpecRep.notInRat
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY
 # Rat
 SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
 # not currently used
 SEQ2_RMSK=
 # not currently used
 SEQ2_FLAG=
 SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm5/bed/blastz.rn3.2004-08-29
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 '_EOF_'
     # << this line makes emacs coloring happy
 
     # prepare first cluster run
     ssh kk
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
     bash
     source ./DEF
     # script copied over from /cluster/data/hg17/jkStuff/BlastZ_run0.sh
     #	it is a generic script and works for any assembly
 
     cp -p /cluster/data/hg17/jkStuff/BlastZ_run0.sh \
        /cluster/data/mm5/jkStuff/BlastZ_run0.sh
     /cluster/data/mm5/jkStuff/BlastZ_run0.sh
     cd run.0
     para try, check, push, check, ....
 
 # Completed: 41943 of 41943 jobs
 # CPU time in finished jobs:    4656727s   77612.11m  1293.54h   53.90d  0.148 y
 # IO & Wait Time:                460782s    7679.70m   128.00h    5.33d  0.015 y
 # Average job time:                 122s       2.03m     0.03h    0.00d
 # Longest job:                     2042s      34.03m     0.57h    0.02d
 # Submission to last job:          8307s     138.45m     2.31h    0.10d
 
     #	Second cluster run to convert the .out's to .lav's
     #	You do NOT want to run this on the big cluster.  It brings
     #	the file server to its knees.  Run this on the small cluster.
     ssh kki
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
     # script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh
     #	fixup machine check, should be kki, not kk
     cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh \
        /cluster/data/mm5/jkStuff/BlastZ_run1.sh
     vi /cluster/data/mm5/jkStuff/BlastZ_run1.sh
 
     /cluster/data/mm5/jkStuff/BlastZ_run1.sh
     cd run.1
     para try, check, push, etc ...
 # Completed: 341 of 341 jobs
 # CPU time in finished jobs:       1293s      21.54m     0.36h    0.01d  0.000 y
 # IO & Wait Time:                  2113s      35.22m     0.59h    0.02d  0.000 y
 # Average job time:                  10s       0.17m     0.00h    0.00d
 # Longest job:                       54s       0.90m     0.01h    0.00d
 # Submission to last job:           719s      11.98m     0.20h    0.01d
 
 # NOTE: BlastZ_run2.sh is not used here.  Instead Angie's approach 
 # (using Rescore) is adopted here.
 
     # third run: lav -> axt
     # NOTE: use axtRescore here because we used a non-default BLASTZ_Q matrix 
     # and abridged repeats (Penn State's restore_rpts program rescores with 
     # default matrix, oops).
     ssh kki
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
     # mv old subdirectories
     mv axtChrom axtChrom.old
     mv run.2 run.2.old
     mkdir axtChrom pslChrom run.2
     cd run.2
     # do.csh: one job per lav chromosome directory (passed as $1).
     #   - cd into that directory and take its last path component as the
     #     chromosome name
     #   - concatenate the *.lav files in "sort -g" order, convert them to axt
     #     against the mm5 and rn3 nib directories, rescore with the
     #     mus_rat.q matrix, and write the sorted result to axtChrom/$chr.axt
     #   - convert that axt to pslChrom/$chr.psl using S1.len / S2.len
     # Output paths are relative to the lav/chrN directory, hence "../../".
     cat << '_EOF_' > do.csh
 #!/bin/csh -ef
 cd $1
 set chr = $1:t
 set path = (/cluster/bin/x86_64 $path)
 cat `ls -1 *.lav | sort -g` \
 | lavToAxt stdin \
     /iscratch/i/mus/mm5/softNib /iscratch/i/rn3/bothMaskedNibs stdout \
 | axtRescore -scoreScheme=/cluster/data/blastz/mus_rat.q stdin stdout \
 | axtSort stdin ../../axtChrom/$chr.axt 
 axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
   ../../pslChrom/$chr.psl
 '_EOF_'
     # << this line keeps emacs coloring happy
     chmod a+x do.csh
     cp /dev/null jobList
     foreach d (../lav/chr*)
       echo "do.csh $d" >> jobList
     end
     para create jobList
     para try, check, push, check
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:        498s       8.31m     0.14h    0.01d  0.000 y
 # IO & Wait Time:                  3367s      56.11m     0.94h    0.04d  0.000 y
 # Average job time:                  90s       1.50m     0.02h    0.00d
 # Longest job:                      299s       4.98m     0.08h    0.00d
 # Submission to last job:           685s      11.42m     0.19h    0.01d
 
 # CHAIN RAT BLASTZ (RE-DONE 8/30/04 Fan)
     # Run axtChain on little cluster
     ssh kki
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
     mkdir -p axtChain/run1
     cd axtChain/run1
     mkdir out chain
     ls -1S /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChrom/*.axt \
       > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
 
     cat << '_EOF_' > doChain
 #!/bin/csh
 axtChain -scoreScheme=/cluster/data/blastz/mus_rat.q \
          -minScore=5000 $1 \
     /iscratch/i/mus/mm5/softNib \
     /iscratch/i/rn3/bothMaskedNibs $2 > $3
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x doChain
     gensub2 input.lst single gsub jobList
     para create jobList
     para try, check, push, check...
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:       3145s      52.42m     0.87h    0.04d  0.000 y
 # IO & Wait Time:                   989s      16.48m     0.27h    0.01d  0.000 y
 # Average job time:                  96s       1.60m     0.03h    0.00d
 # Longest job:                      280s       4.67m     0.08h    0.00d
 # Submission to last job:          1219s      20.32m     0.34h    0.01d
 
     # now on the cluster server, sort chains
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain
     chainMergeSort run1/chain/*.chain > all.chain
     chainSplit chain all.chain
     rm run1/chain/*.chain
 
     # take a look at score distr's
     foreach f (chain/*.chain)
       grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r
       textHistogram -binSize=5000 /tmp/score.$f:t:r
       echo ""
     end
 
     # Load chains into database
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain/chain
     foreach i (*.chain)
         set c = $i:r
         echo loading $c
         hgLoadChain mm5 ${c}_chainRn3 $i
     end
     featureBits mm5 chainRn3Link
 # 1677291680 bases of 2615483787 (64.129%) in intersection
     nice featureBits hg17 chainRn3Link
 # 982059013 bases of 2866216770 (34.263%) in intersection
 
 # NET RAT BLASTZ (RE-DONE 8/31/04 Fan)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain
     chainPreNet all.chain ../S1.len ../S2.len stdout \
     | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
     | netSyntenic stdin hNoClass.net
 # memory usage 1710399488, utime 7360 s/100, stime 1891
 
 # The above adapted from Angie's approach 
 
     # The netClass operation requires an "ancientRepeat" table to exist
     # in either mm5 or rn3.  So, create the table:
 
     ssh hgwdev
     mkdir -p /cluster/data/mm5/bed/ancientRepeat
     cd /cluster/data/mm5/bed/ancientRepeat
     # mysqldump needs write permission to this directory
     # and you need to use your read/write enabled user with password
     chmod 777 .
     hgsqldump --all --tab=. mm4 ancientRepeat
     chmod 775 .
     hgsql mm5 < ancientRepeat.sql
     mysqlimport -u<r/w user> -p<r/w pass> mm5 ancientRepeat.txt
     # This is a hand curated table obtained from Arian.
 
 # The ancientRepeat table was loaded during the first build of NET RAT BLASTZ.
 
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain
     time netClass hNoClass.net mm5 rn3 rat.net \
 	-tNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInRat \
 	-qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInMouse
     # 491.210u 96.250s 12:27.37 78.6% 0+0k 0+0io 249pf+0w
     
     # If things look good do
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain
     rm -r hNoClass.net
     # Make a 'syntenic' subset of these with
     time netFilter -syn rat.net > ratSyn.net
     # 216.290u 34.220s 4:27.60 93.6%  0+0k 0+0io 119pf+0w
 
     # Load the nets into database
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain
     netFilter -minGap=10 rat.net |  hgLoadNet mm5 netRn3 stdin
     netFilter -minGap=10 ratSyn.net | hgLoadNet mm5 syntenyNetRn3 stdin
 
     # check results
     # featureBits mm4 netRn3
     # 96806381 bases of 95076222 (101.820%) in intersection
     # featureBits mm5 netRn3
     # 2601384082 bases of 2615483787 (99.461%) in intersection
 
     # featureBits mm4 syntenyNetRn3
     # 96760405 bases of 95076222 (101.771%) in intersection
     # featureBits mm5 syntenyNetRn3
     # 2575035774 bases of 2615483787 (98.454%) in intersection
 
     # Add entries for net and chain to mouse/mm5 trackDb
 
     # make net
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtChain
     mkdir ratNet
     time netSplit rat.net ratNet
     # 218.990u 29.290s 4:27.86 92.6%  0+0k 0+0io 190pf+0w
 
 
     # extract axts from net 
     mkdir ../axtNet 
     foreach n (ratNet/chr*.net)
 	set c=$n:t:r
 	echo "netToAxt: $c.net -> $c.axt"
 	rm -f ../axtNet/$c.axt
 	netToAxt ratNet/$c.net chain/$c.chain \
 		/cluster/data/mm5/nib \
 		/cluster/data/rn3/nib ../axtNet/$c.axt
 	echo "Complete: $c.net -> axtNet/$c.axt"
     end
     # sort axt's and convert to maf format
     mkdir ../mafNet
     foreach f (../axtNet/chr*.axt)
         set c=$f:t:r
         echo $c.axt
         mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt
         axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt
         rm ../axtNet/$c.unsorted.axt
         axtToMaf ../axtNet/$c.axt \
             /cluster/data/mm5/chrom.sizes /cluster/data/rn3/chrom.sizes \
                 ../mafNet/$c.maf -tPrefix=mm5. -qPrefix=rn3.
     end
 
     ssh hgwdev
     mkdir -p /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtBest
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtBest
     ln -s ../axtNet/chr*.axt .
 
     # copy net axt's to download area
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtNet
     mkdir -p /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet
     cp -p *.axt /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet
     cd /usr/local/apache/htdocs/goldenPath/mm5/vsRn3/axtNet
     nice gzip *.axt
     # add README.txt file to dir (use previous assembly's copy as template)
 
     #  Convert those axt files to psl
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
     mkdir pslBest
     foreach a (axtBest/chr*.axt)
 	set c=$a:t:r
 	echo "processing $c.axt -> ${c}_blastzBestRn3.psl"
     /cluster/bin/i386/axtToPsl axtBest/${c}.axt \
 	S1.len S2.len pslBest/${c}_blastzBestRn3.psl
 	echo "Done: ${c}_blastzBestRn3.psl"
     end
 
     # Load tables
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/pslBest
     bash
     # Load every per-chromosome best-alignment psl found in this directory.
     # NOTE(review): the table name is presumably taken from each file name
     # (chrN_blastzBestRn3) -- confirm against hgLoadPsl usage.
     for I in chr*BestRn3.psl; do
         /cluster/bin/i386/hgLoadPsl -noTNameIx mm5 "${I}"
         echo "done ${I}"
     done
 
      # check results
     # featureBits mm5 blastzBestRn3
     # 1674716868 bases of 2615483787 (64.031%) in intersection
     # featureBits mm4 blastzBestRn3
     # 1780774716 bases of 2627444668 (67.776%) in intersection
 
     # Make /gbdb links and add them to the axtInfo table:
      mkdir -p /gbdb/mm5/axtBest/Rn3
      cd /gbdb/mm5/axtBest/Rn3
      rm *
      ln -s /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtNet/chr*.axt .
 
      ssh hgwdev
      cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29/axtNet
      rm -f axtInfoInserts.sql
      foreach f (/gbdb/mm5/axtBest/Rn3/chr*.axt)
        set chr=$f:t:r
        echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
                 VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \
          >> axtInfoInserts.sql
      end
     
     # these axtInfo file entries should be appended to the table,
     # not replacing it.  The previous hg17 entries are needed  --  bob kuhn
     hgsql mm5 -e 'drop table mm5.axtInfo;'
     hgsql mm5 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql mm5 < axtInfoInserts.sql
 
     cd /cluster/data/mm5/bed
     rm blastz.rn3
     ln -s  blastz.rn3.2004-08-29 blastz.rn3
     
 # BLASTZ RN3 CLEAN UP (RE-DONE - 2004-08-31 - Fan)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.rn3.2004-08-29
     nice rm -rf raw 
     nice rm axtChain/run1/chain/* 
 
 # do the following later, after rn3-mm5 net and chain done.
     nice gzip {axt,psl}Chrom/* lav/*/* axtChain/{all.chain,*.net} &
 # The above line done on 9/7/04. Fan.
 :
 # CREATE CYTOBAND TRACK (DONE - 2004-09-7 - Fan)
     # Should be done after NCBI updated their MapViewer to the latest release.
     ssh hgwdev
     cd /cluster/data/mm5
     mkdir cytoBand
     cd cytoBand
     # Get file from NCBI
     wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/maps/mapview/BUILD.33/ideogram.gz
     gunzip ideogram
     # Create bed file
     /cluster/bin/scripts/createNcbiCytoBand ideogram
     # Load the bed file
     hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm5 cytoBand cytoBand.bed
 
 # Make cytoBandIdeo track for ideogram gif on hgTracks page.
     # For mouse cytoBandIdeo is just a replicate of the cytoBand track.
     # Make the cytoBand track (above) and then:
     echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" | hgsql mm5
 
 # REBUILD CYTOBAND TRACK (DONE - 2004-09-15 - Fan)
     # NCBI updated the ideogram.gz file and also changed its format,
     # added a new density field after stain.
     ssh hgwdev
     cd /cluster/data/mm5
     mv cytoBand cytoBand.old
     mkdir cytoBand
     cd cytoBand
     # Get file from NCBI
     wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/maps/mapview/BUILD.33/ideogram.gz
     gunzip ideogram
     # Create bed file
     /cluster/bin/scripts/createNcbiCytoBand ideogram
     # Load the bed file
     hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm5 cytoBand cytoBand.bed
 
 # Make cytoBandIdeo track for ideogram gif on hgTracks page.
     # For mouse cytoBandIdeo is just a replicate of the cytoBand track.
     # First, drop the cytoBandIdeo table in mm5.
     # Make the cytoBand track (above) and then:
     echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;"|hgsql mm5
 
 # ADD MAP CONTIGS TRACK (DONE - 2004-09-07 - Fan)
     ssh hgwdev
     mkdir -p /cluster/data/mm5/bed/ctgPos
     cd /cluster/data/mm5/bed/ctgPos
     # hgCtgPos uses the lift files... but mouse lift files are for the
     # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs
     # from the assembly.  (In the future, we should go with the NT's!)
     # So... just for this release, go straight from the seq_contig.md
     # to the table def'n: contig, size, chrom, chromStart, chromEnd
     # parseSeqContig.pl reads seq_contig.md lines whose feature-type column
     # is "contig" (matched case-insensitively) and whose id is an NT_ or NC_
     # accession.  It captures the chromosome, start, end and contig id,
     # converts start to 0-based, and prints tab-separated:
     #   contig, size (end - start), chr<chrom>, chromStart, chromEnd
     # Rows whose chromosome field contains an "N" are skipped
     # (presumably unplaced/random placements -- TODO confirm).
     cat << '_EOF_' > parseSeqContig.pl
 #!/usr/local/bin/perl -w
 
 use strict;
 
 while (<>) {
     if (/^\d+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(N[TC]_\d+)\s+(\S+)\s+contig\s+\S+\s+\S+\s*$/i) {
         my $chr=$1; my $start=$2; $start -= 1; my $end=$3; my $ctg=$5;
         if ($chr !~ /N/ ) {
         print "$ctg\t" . ($end-$start) . "\tchr$chr\t$start\t$end\n";
         }
     }
 }
 '_EOF_'
     chmod +x parseSeqContig.pl
       ./parseSeqContig.pl ../../ncbi/seq_contig.md > ctgPos.tab
     hgsql mm5 < ~/kent/src/hg/lib/ctgPos.sql
     echo "load data local infile 'ctgPos.tab' into table ctgPos" | hgsql mm5
     # Note: the info is there in seq_contig.md to also do the _random's,
     # but we'd have to do some more work: duplicate the gaps of 50000 between
     # contigs for all _random's except chrUn_random (1000 between).
 
     # featureBits mm5 ctgPos
     # 2557516950 bases of 2615483787 (97.784%) in intersection
     # featureBits mm4 ctgPos 
     # 2554101163 bases of 2627444668 (97.209%) in intersection
     # featureBits mm3 ctgPos
     # 2500661074 bases of 2505900260 (99.791%) in intersection
 
 # RELOAD MAP CONTIGS TRACK (DONE - 2005-Mar-03 - Heather)
 # /cluster/data/mm5/ncbi/seq_contig.md contains more than just C57BL/6J.
 # Filter those out.
 
     ssh hgwdev
     cd /cluster/data/mm5/bed/ctgPos
     cp /cluster/data/mm5/ncbi/seq_contig.md .
     grep C57BL seq_contig.md > contig.C57BL
     # contig.C57BL has 41061 lines (252 lines fewer than seq_contig.md)
     ./parseSeqContig.pl contig.C57BL > ctgPosFiltered.tab
     # ctgPosFiltered.tab has 302 rows (227 fewer than ctgPos.tab)
     echo "delete from ctgPos" | hgsql mm5
     echo "load data local infile 'ctgPosFiltered.tab' into table ctgPos" | hgsql mm5
     # echo "update ctgPos set chrom = 'chrM' where chrom = 'chrMT'" | hgsql mm5
     # featureBits mm5 ctgPos
     # 2557064874 bases of 2615483787 (97.766%) in intersection
 
   
 
 # FUGU BLAT ALIGNMENTS (DONE 2004-09-08 Fan)
     ssh kk
     mkdir /cluster/data/mm5/bed/blatFr1
     cd /cluster/data/mm5/bed/blatFr1
     ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
     ls -1S /scratch/mus/mm5/softNib/*.nib > mouse.lst
     cat << '_EOF_' > gsub
 #LOOP
 blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     mkdir psl
     gensub2 mouse.lst fugu.lst gsub spec
     para create spec
     para try, check, push, check, ...
 # Completed: 24854 of 24854 jobs
 # CPU time in finished jobs:    8215774s  136929.56m  2282.16h   95.09d  0.261 y
 # IO & Wait Time:               1415723s   23595.39m   393.26h   16.39d  0.045 y
 # Average job time:                 388s       6.46m     0.11h    0.00d
 # Longest job:                    46761s     779.35m    12.99h    0.54d
 # Submission to last job:         46761s     779.35m    12.99h    0.54d
 
     # Sort alignments:
     ssh kksilo
     cd /cluster/data/mm5/bed/blatFr1
     pslCat -dir psl | pslSortAcc nohead chrom temp stdin
     # Processed 1116383 lines into 5 temp files
     # lift query side to Fugu browser chrUn coordinates
     liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl
 
     # load into database:
     ssh hgwdev
     cd /cluster/data/mm5/bed/blatFr1
     hgLoadPsl -fastLoad -table=blatFr1 mm5 all.psl
     # Processing all.psl
     # load of blatFr1 did not go as planned: 1116383 record(s), 0 row(s) skipped, 1 warning(s) loading psl.tab
     # a record is already in trackDb as type xeno psl fr1, with colorChromDefault off
 
 # BLASTZ TETRAODON (tetNig1) (DONE, 2004-09-08, hartera)
 
     ssh kkr1u00
     # blastz requires lineage-specific repeats
     # Treat all repeats as lineage-specific.
 
     mkdir -p /iscratch/i/mm5/linSpecRep.notInTetraodon
     foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/chr*.fa.out)
       cp -p $f /iscratch/i/mm5/linSpecRep.notInTetraodon/$f:t:r:r.out.spec
     end
 
     mkdir -p /iscratch/i/tetNig1/linSpecRep.notInMouse
     foreach f (/iscratch/i/tetNig1/rmsk/chr*.fa.out)
       cp -p $f /iscratch/i/tetNig1/linSpecRep.notInMouse/$f:t:r:r.out.spec
     end
     iSync
 
     ssh kksilo
     # more space on store8 than store6
     mkdir -p /cluster/store8/mm5/blastz.tetNig1.2004-09-02
     ln -s /cluster/store8/mm5/blastz.tetNig1.2004-09-02 \
           /cluster/data/mm5/bed
     ln -s /cluster/data/mm5/bed/blastz.tetNig1.2004-09-02 \
           /cluster/data/mm5/bed/blastz.tetNig1
     ssh kk
     cd /cluster/data/mm5/bed/blastz.tetNig1
     # use same parameters as for danRer1-mm5
     cat << '_EOF_' > DEF
 # mouse (mm5) vs Tetraodon (tetNig1)
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
 
 ALIGN=blastz-run
 BLASTZ=blastz
 
 # Reuse parameters from hg16-fr1 and danRer1-hg17.
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse (mm5)
 SEQ1_DIR=/iscratch/i/mus/mm5/test
 SEQ1_RMSK=
 SEQ1_FLAG=
 SEQ1_SMSK=/iscratch/i/mm5/linSpecRep.notInTetraodon
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Tetraodon (tetNig1)
 SEQ2_DIR=/iscratch/i/tetNig1/nib
 SEQ2_RMSK=
 SEQ2_FLAG=
 SEQ2_SMSK=/iscratch/i/tetNig1/linSpecRep.notInMouse
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm5/bed/blastz.tetNig1
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 
 #DEBUG=1
 '_EOF_'
     # << this line keeps emacs coloring happy
     # Save the DEF file in the current standard place
     chmod +x DEF
     cp DEF ~angie/hummus/DEF.mm5-tetNig1.2004-09-02
     # setup cluster run
     # copy shell scripts for blastz runs if not there already
     cp -p /cluster/data/danRer1/jkStuff/BlastZ* /cluster/data/mm5/jkStuff/
     # edit BlastZ_run0.sh
     # replace line 22: /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/
     # this is the directory for the latest version of blastz-run
 
     # source the DEF file
     bash
     . ./DEF
     /cluster/data/mm5/jkStuff/BlastZ_run0.sh
     cd run.0
     # check batch looks ok then
     para try, check, push, check, ....
 # para time
 # Completed: 19437 of 19437 jobs
 # CPU time in finished jobs:    4681483s   78024.71m  1300.41h   54.18d  0.148 y
 # IO & Wait Time:                176260s    2937.67m    48.96h    2.04d  0.006 y
 # Average job time:                 250s       4.17m     0.07h    0.00d
 # Longest job:                      790s      13.17m     0.22h    0.01d
 # Submission to last job:          5475s      91.25m     1.52h    0.06d
     
     # second cluster run to convert the .out's to .lav's
     ssh kki
     cd /cluster/data/mm5/bed/blastz.tetNig1
     bash # if a csh/tcsh user
     . ./DEF
     /cluster/data/mm5/jkStuff/BlastZ_run1.sh
     cd run.1
     para try, check, push, etc ...
 # para time
 # Completed: 341 of 341 jobs
 # CPU time in finished jobs:        262s       4.37m     0.07h    0.00d  0.000 y
 # IO & Wait Time:                   981s      16.35m     0.27h    0.01d  0.000 y
 # Average job time:                   4s       0.06m     0.00h    0.00d
 # Longest job:                        9s       0.15m     0.00h    0.00d
 # Submission to last job:           108s       1.80m     0.03h    0.00d
 
     #   Third cluster run to convert lav's to axt's
     ssh kki
     cd /cluster/data/mm5/bed/blastz.tetNig1
     mkdir axtChrom
     # a new run directory
     mkdir run.2
     cd run.2
 cat << '_EOF_' > do.csh
 #!/bin/csh
 cd $1
 cat `ls -1 *.lav | sort -g` \
 | lavToAxt stdin /iscratch/i/mus/mm5/softNib \
 /iscratch/i/tetNig1/nib stdout \
 | axtSort stdin $2
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x do.csh
     cat << '_EOF_' > gsub
 #LOOP
 ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/mm5/bed/blastz.tetNig1/axtChrom/$(root1).axt}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     \ls -1Sd ../lav/chr* > chrom.list
     gensub2 chrom.list single gsub jobList
     wc -l jobList
     head jobList
     para create jobList
     para try, check, push, check,...
 # para time
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:         41s       0.68m     0.01h    0.00d  0.000 y
 # IO & Wait Time:                   414s       6.90m     0.12h    0.00d  0.000 y
 # Average job time:                  11s       0.18m     0.00h    0.00d
 # Longest job:                       28s       0.47m     0.01h    0.00d
 # Submission to last job:           396s       6.60m     0.11h    0.00d
 
     # translate sorted axt files into psl
     ssh kolossus
     cd /cluster/data/mm5/bed/blastz.tetNig1
     mkdir -p pslChrom
     set tbl = "blastzTetNig1"
     foreach f (axtChrom/chr*.axt)
       set c=$f:t:r
       echo "Processing chr $c"
       /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
     end
     # Load database tables
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.tetNig1/pslChrom
 
     foreach f (./*.psl)
       /cluster/bin/i386/hgLoadPsl mm5 $f
     end
 
 # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1 -enrichment
 # refGene:cds 0.765%, blastzTetNig1 1.709%, both 0.519%, cover 67.80%, 
 # enrich 39.67x
 # default with H=2000
 # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1H2000 -enrichment
 # refGene:cds 0.765%, blastzTetNig1H2000 1.239%, both 0.502%, cover 65.59%, 
 # enrich 52.92x
 # blastzDanRer1 with L=8000
 # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1L8k -enrichment
 # refGene:cds 0.765%, blastzTetNig1L8k 1.333%, both 0.444%, cover 58.05%, 
 # enrich 43.56x
 # too much drop in coverage
 # H=2000, L=4000
 # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1H2kL4k -enrichment
 # refGene:cds 0.765%, blastzTetNig1H2kL4k 1.166%, both 0.489%, cover 63.91%, 
 # enrich 54.81x
 # H=2000, L=6000
 # featureBits -chrom=chr1 mm5 refGene:cds blastzTetNig1H2kL6k -enrichment
 # refGene:cds 0.765%, blastzTetNig1H2kL6k 1.014%, both 0.437%, cover 57.15%, 
 # enrich 56.36x
 # too much drop in coverage
 
 # number of rows in table
 # blastzTetNig1 	38196
 # blastzTetNig1H2000	38314
 # blastzTetNig1L8k	24749
 # blastzTetNig1H2kL4k	31433
 # blastzTetNig1H2kL6k	21389
 
 # use blastzTetNig1 as this has the best coverage. enrich is quite high too.
 # featureBits -chrom=chr1 hg17 refGene:cds blastzFr1 -enrichment
 # refGene:cds 1.246%, blastzFr1 2.319%, both 0.833%, cover 66.87%, enrich 28.83x
 # similar coverage to blastzFr1 for hg17
 
 # RESCORE TETNIG1 BLASTZ (DONE, 2004-09-08, hartera)
     # Low scores can occur with repeats abridged and using the
     # HoxD55.q matrix. PSU's restore_rpts program rescored alignments
     # with the default matrix instead of the BLASTZ_Q matrix.
     # Rescore them here so the chainer sees the higher scores:
                                           
     ssh kolossus
     cd /cluster/data/mm5/bed/blastz.tetNig1
     mkdir axtChrom.rescore
     foreach f (axtChrom/chr*.axt)
         axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
         $f axtChrom.rescore/$f:t
     end
     mv axtChrom axtChrom.orig
     mv axtChrom.rescore axtChrom
 
 # CHAIN TETRAODON (TETNIG1) BLASTZ (DONE, 2004-09-08, hartera)
     # Redo chains with rescored blastz tetNig1
     # Run axtChain on little cluster
     ssh kki
     cd /cluster/data/mm5/bed/blastz.tetNig1
     mkdir -p axtChain/run1
     cd axtChain/run1
     mkdir out chain
     ls -1S /cluster/data/mm5/bed/blastz.tetNig1/axtChrom/*.axt \
         > input.lst
     # Reuse gap penalties from hg16 vs chicken run.
     cat << '_EOF_' > ../../chickenHumanTuned.gap
 tablesize^V     11
 smallSize^V     111
 position^V      1^V     2^V     3^V     11^V    111^V   2111^V  12111^V 32111^V
 72111^V 152111^V        252111
 qGap^V  325^V   360^V   400^V   450^V   600^V   1100^V  3600^V  7600^V  15600^V
 31600^V 56600
 bothGap^V       625^V   660^V   700^V   750^V   900^V   1400^V  4000^V  8000^V
 16000^V 32000^V 57000
 '_EOF_'
     # << this line makes emacs coloring happy
 
  cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
 #ENDLOOP
 '_EOF_'
 
  cat << '_EOF_' > doChain
 #!/bin/csh
 axtChain -linearGap=../../chickenHumanTuned.gap $1 \
     /iscratch/i/mus/mm5/softNib \
     /iscratch/i/tetNig1/nib $2 >& $3
 '_EOF_'
     # << this line makes emacs coloring happy
 
     chmod a+x doChain
     gensub2 input.lst single gsub jobList
     para create jobList
     para try, check, push, check...
 
 # para time
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:        524s       8.74m     0.15h    0.01d  0.000 y
 # IO & Wait Time:                   140s       2.33m     0.04h    0.00d  0.000 y
 # Average job time:                  15s       0.26m     0.00h    0.00d
 # Longest job:                       25s       0.42m     0.01h    0.00d
 # Submission to last job:           632s      10.53m     0.18h    0.01d
 
     # now on the cluster server, sort chains
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
     chainMergeSort run1/chain/*.chain > all.chain
     chainSplit chain all.chain
     # take a look at score distr's, try also with larger bin size.
     foreach f (chain/*.chain)
       grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r >> hist5000.out
       textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
       echo ""
     end
     # not a large amount of changes with score < 5000
     # load chr1 into database to check
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain/chain
     hgLoadChain mm5 chr1_chainTetNig1 chr1.chain
 # featureBits -chrom=chr1 mm5 refGene:cds chainTetNig1Link -enrichment
 # refGene:cds 0.765%, chainTetNig1Link 1.563%, both 0.512%, cover 66.84%, 
 # enrich 42.76x
    # try filtering with minScore=5000
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
     mv all.chain all.chain.unfiltered
     chainFilter -minScore=5000 all.chain.unfiltered > all.chain
     chainSplit chainFilt5k all.chain
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain/chainFilt5k
     hgLoadChain mm5 chr1_chainTetNig1Filt5k chr1.chain
 # featureBits -chrom=chr1 mm5 refGene:cds chainTetNig1Filt5kLink -enrichment
 # refGene:cds 0.765%, chainTetNig1Filt5kLink 1.398%, both 0.504%, cover 65.91%,
 # enrich 47.13x
 # chr1_chainTetNig1 21782
 # chr1_chainTetNig1Filt5k 9670
 
     # loses very little in coverage so use filtering with minScore=5000
     # remove chain
     rm -r chain
     mv chainFilt5k chain
     rm all.chain.unfiltered
 
     ssh hgwdev
     # remove test tables
     hgsql -e "drop table chr1_chainTetNig1Filt5k;" mm5
     hgsql -e "drop table chr1_chainTetNig1Filt5kLink;" mm5
     # load chains into database
     cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain/chain
     foreach i (*.chain)
         set c = $i:r
         hgLoadChain mm5 ${c}_chainTetNig1 $i
         echo done $c
     end
 
 # NET TETRAODON (tetNig1) BLASTZ (DONE, 2004-09-08, hartera)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
     mkdir preNet
     cd chain
     foreach i (*.chain)
        echo preNetting $i
        /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
                                      ../preNet/$i
     end
     cd ..
     mkdir n1
     cd preNet
     foreach i (*.chain)
       set n = $i:r.net
       echo primary netting $i
       /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
                                  ../n1/$n /dev/null
     end
     cd ..
     cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
     # memory usage 69083136, utime 402 s/100, stime 37
     # Add classification info using db tables:
     cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
     # netClass looks for ancient repeats in one of the databases
     # hg17 has this table - hand-curated by Arian but this is for
     # human-rodent comparisons so do not use here, use -noAr option
     mkdir -p /cluster/bluearc/mm5/linSpecRep.notInTetraodon
     mkdir -p /cluster/bluearc/tetNig1/linSpecRep.notInMouse
     cp /iscratch/i/mm5/linSpecRep.notInTetraodon/* \
        /cluster/bluearc/mm5/linSpecRep.notInTetraodon
     cp /iscratch/i/tetNig1/linSpecRep.notInMouse/* \
        /cluster/bluearc/tetNig1/linSpecRep.notInMouse
 
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
     # there is no ancient repeats table for rodent vs fish so use -noAr flag
     time netClass noClass.net mm5 tetNig1 tetNig1.net \
           -tNewR=/cluster/bluearc/mm5/linSpecRep.notInTetraodon \
           -qNewR=/cluster/bluearc/tetNig1/linSpecRep.notInMouse -noAr
     # 59.490u 37.630s 2:41.82 60.0%   0+0k 0+0io 216pf+0w
 
     netFilter -minGap=10 tetNig1.net |  hgLoadNet mm5 netTetNig1 stdin
     # featureBits mm5 refGene:cds netTetNig1 -enrichment
     # refGene:cds 0.921%, netTetNig1 23.633%, both 0.725%, cover 78.70%, 
     # enrich 3.33x
 
 # MAKE VSTETNIG1 DOWNLOADABLES (DONE, 2004-09-10, hartera)
     ssh kksilo
     # zip chains and nets
     cd /cluster/data/mm5/bed/blastz.tetNig1/axtChain
     cp all.chain tetNig1.chain
     zip -j /cluster/data/mm5/zip/tetNig1.chain.zip tetNig1.chain
     rm tetNig1.chain
     zip -j /cluster/data/mm5/zip/tetNig1.net.zip tetNig1.net
 
     ssh hgwdev
     # copy chains and nets to downloads area
     set gp = /usr/local/apache/htdocs/goldenPath/mm5
     mkdir -p $gp/vsTetNig1
     cd $gp/vsTetNig1
     mv /cluster/data/mm5/zip/tetNig1*.zip .
     md5sum *.zip > md5sum.txt
 
     # move axt files to downloads area and zip
     cd /cluster/data/mm5/bed/blastz.tetNig1/axtChrom
     mkdir -p $gp/vsTetNig1/axtChrom
     cp -p *.axt $gp/vsTetNig1/axtChrom
     cd $gp/vsTetNig1/axtChrom
     gzip *.axt
     md5sum *.gz > md5sum.txt
 
     # Copy over & edit README.txt w/pointers to chain, net formats.
 
 # MAKE VSDANRER1 DOWNLOADABLES (DONE, 2004-09-10, hartera)
     ssh kksilo
     # zip chains and nets
     cd /cluster/data/mm5/bed/blastz.danRer1/axtChain
     gunzip all.chain.gz
     cp all.chain danRer1.chain
     zip -j /cluster/data/mm5/zip/danRer1.chain.zip danRer1.chain
     rm danRer1.chain
     gunzip danRer1.net.gz
     zip -j /cluster/data/mm5/zip/danRer1.net.zip danRer1.net
 
     ssh hgwdev
     # copy chains and nets to downloads area
     set gp = /usr/local/apache/htdocs/goldenPath/mm5
     mkdir -p $gp/vsDanRer1
     cd $gp/vsDanRer1
     mv /cluster/data/mm5/zip/danRer1*.zip .
     md5sum *.zip > md5sum.txt
 
     # move axt files to downloads area and zip
     cd /cluster/data/mm5/bed/blastz.danRer1/axtChrom
     mkdir -p $gp/vsDanRer1/axtChrom
     cp -p *.axt $gp/vsDanRer1/axtChrom
     cd $gp/vsDanRer1/axtChrom
     gzip *.axt
     md5sum *.gz > md5sum.txt
   
     # add the axtNet *.axt in blastz.danRer1/axtNet
     cd /cluster/data/mm5/bed/blastz.danRer1/axtNet
     set gp = /usr/local/apache/htdocs/goldenPath/mm5
     mkdir -p $gp/vsDanRer1/axtNet
     nice cp -p *.axt $gp/vsDanRer1/axtNet
     cd $gp/vsDanRer1/axtNet
     nice gzip *.axt
     md5sum *.gz > md5sum.txt
 
     # Copy over & edit README.txt w/pointers to chain, net formats.
 
 # BLASTZ TETNIG1 CLEAN UP (DONE, 2004-09-10, hartera)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.tetNig1
     nice rm -rf raw &
     nice rm -rf lav &
     nice rm -rf axtChrom.orig &
     nice rm axtChain/run1/chain/* &
     nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} &
 
 
 # SGP GENES (REDONE 5/24/05 angie)
     # Originally loaded 9/17/04; user noticed chrX was missing; IMIM folks 
     # regenerated & we reloaded.
     ssh kksilo
     mkdir /cluster/data/mm5/bed/sgp
     cd /cluster/data/mm5/bed/sgp
     foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
       wget http://genome.imim.es/genepredictions/M.musculus/mmMay2004/SGP/humangp200405/$chr.gtf
       wget http://genome.imim.es/genepredictions/M.musculus/mmMay2004/SGP/humangp200405/$chr.prot
     end
     # Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
     cp /dev/null sgpPep.fa
     foreach f (chr*.prot)
       nice perl -wpe 's/^(>chr\S+)/$1.1/' $f >> sgpPep.fa
     end
     ssh hgwdev
     cd /cluster/data/mm5/bed/sgp
     ldHgGene -gtf -genePredExt mm5 sgpGene chr*.gtf
     hgPepPred mm5 generic sgpPep sgpPep.fa
 
 # SGP GENES (UPDATE 1/18/2006)
     # sgpPep table dropped; replaced by hgc-generated protein sequence in browser
 
 # MAKE mm5-hg17 OVER.CHAIN FOR LIFTOVER  (DONE 2004-09-24 braney)
     ssh kolossus
     mkdir -p /cluster/data/mm5/bed/bedOver/mm5Tohg17
     cd /cluster/data/mm5/bed/bedOver/mm5Tohg17
     set chainDir = /cluster/data/mm5/bed/blastz.hg17/axtChain
     netSplit $chainDir/human.net net
     mkdir subset
     foreach f ($chainDir/chain/*.chain)
       echo subsetting $f:t:r
       netChainSubset net/$f:t:r.net $f subset/$f:t
     end
     cat subset/*.chain > /cluster/data/mm5/bed/bedOver/mm5Tohg17.chain
     hgAddLiftOverChain -multiple mm5 hg17
 
 #  miRNA track (DONE - 2004-09-30 - Fan)
     #   data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
     #   and Michel.Weber@ibcg.biotoul.fr
     #   notify them when done.
     cd /cluster/data/mm5/bed
     mkdir miRNA
     cd miRNA
     wget --timestamping \
     ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/mmu.bed
     grep -v "tion" mmu.bed | sed -e "s/ /\t/g" > mm5.bed
     # check previous release track before update
     nice featureBits mm4 miRNA
     # 17782 bases of 2627444668 (0.001%) in intersection
     hgLoadBed mm5 miRNA mm5.bed
     # entry in trackDb/trackDb.ra already there
     # and verify similar numbers after:
     nice featureBits mm5 miRNA
     # 17957 bases of 2615483787 (0.001%) in intersection
  
 # BLASTZSELF Done (Tue Oct 19 18:06:45 PDT 2004) sugnet
 
     # blastzSelf run for mm5. This took about a week due to
     # being busy with other things and some crashed jobs in a
     # few places. Think all of the instructions ended up here.
     # based off of Hiram's instructions for blastzSelf in hg16 & hg17
     mkdir -p /cluster/store6/mm5/bed/blastzSelf
     cd /cluster/store6/mm5/bed/blastzSelf
     
     # Create the definitions file.
     cat << '_EOF_' > DEF
     # mouse vs. mouse
     export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
     
     ALIGN=blastz-run
     BLASTZ=blastz
     BLASTZ_H=2000
     BLASTZ_ABRIDGE_REPEATS=1
     
     # TARGET
     # Mouse
     SEQ1_DIR=/scratch/mus/mm5/softNib
     # RMSK not currently used
     SEQ1_RMSK=/scratch/mus/mm5/rmsk
     # FLAG not currently used
     SEQ1_FLAG=-rodent
     SEQ1_SMSK=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInMouse
     SEQ1_IN_CONTIGS=0
     SEQ1_CHUNK=10000000
     SEQ1_LAP=10000
     
     # QUERY
     # Mouse
     SEQ2_DIR=/scratch/mus/mm5/softNib
     # RMSK not currently used
     SEQ2_RMSK=/scratch/mus/mm5/rmsk
     # FLAG not currently used
     SEQ2_FLAG=-rodent
     SEQ2_SMSK=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInMouse
     SEQ2_IN_CONTIGS=0
     SEQ2_CHUNK=30000000
     SEQ2_LAP=0
     
     BASE=/cluster/data/mm5/bed/blastzSelf
     
     DEF=$BASE/DEF
     RAW=$BASE/raw
     CDBDIR=$BASE
     SEQ1_LEN=$BASE/S1.len
     SEQ2_LEN=$BASE/S2.len
     '_EOF_'
         # << this line makes emacs coloring happy
     
     ssh kk
     cd /cluster/store6/mm5/bed/blastzSelf
     /cluster/data/hg17/jkStuff/BlastZ_run0.sh
     cd run.0
     para try, push, check
     
     # on mini-cluster, otherwise I/O gets very demanding....
     ssh kki
     cd /cluster/store6/mm5/bed/blastzSelf
     mkdir -p run.1
     /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > run.1/jobList
     cd run.1
     wc -l jobList 
     #    341 jobList
     head jobList 
     para create jobList 
     para try
     
     #	Third cluster run to convert lav's to axt's
     mkdir run.2
     cd run.2
     cat << '_EOF_' > gsub
     #LOOP
     /cluster/bin/scripts/blastz-chromlav2axt /cluster/data/mm5/bed/blastzSelf/lav/$(root1) {check out line+ /cluster/data/mm5/bed/blastzSelf/axtChrom/$(root1).axt} /scratch/mus/mm5/softNib /scratch/mus/mm5/softNib
     #ENDLOOP
     '_EOF_'
     ls -1S /cluster/data/mm5/bed/blastzSelf/lav > chrom.list
     gensub2 chrom.list single gsub jobList
     para create
     para push
     # This seems to beat up on the file server a little, load up to 56 on kksilo
     
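     # A number of jobs died, unsure why. Try them on kksilo:
     # NOTE(review): the script below sets base to the hg16 blastzSelf
     # directory, so its output path ($base/$out) lands under hg16 rather
     # than mm5 -- looks like a copy/paste leftover; confirm before reusing.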
     ssh kksilo
     cat << '_EOF_' > doStragglers.csh
     #!/bin/tcsh
     
     cd /cluster/store6/mm5/bed/blastzSelf
     set base=/cluster/data/hg16/bed/blastzSelf
     set seq1_dir=/cluster/data/mm5/nib
     set seq2_dir=/cluster/data/mm5/nib
     foreach c (lav/chr17  lav/chr2  lav/chr3  lav/chr7  lav/chrUn_random  lav/chrX  lav/chrY)
       echo "Doing $c"
       pushd $c
       set chr=$c:t
       set out=axtChrom/$chr.axt
       echo "Translating $chr lav to $out"
       foreach d (*.lav)
         set smallout=$d.axt
         lavToAxt $d $seq1_dir $seq2_dir stdout \
         | axtDropSelf stdin stdout \
         | axtSort stdin $smallout
       end
       cat `ls -1 *.lav.axt | sort -g` > $base/$out
       popd
     end
     '_EOF_'
     
     #  Need to drop overlaps to eliminate diagonals
     foreach f (axtChrom/chr*.axt)
       set c=$f:t:r
       echo "doing $c"
       /cluster/bin/i386/axtDropOverlap axtChrom/$c.axt chromSizes.tab chromSizes.tab \
 	 	/cluster/store6/mm5/bed/blastzSelf/axtChromDropped/$c.axt
       echo "Done: $c"
     end
     cd axtChromDropped
     gzip *.axt
 
     # Translate to psls
     cd /cluster/data/mm5/bed/blastzSelf
     mkdir pslChrom
     set tbl = "blastzSelf"
     foreach f (axtChrom/chr*.axt)
       set c=$f:t:r
       echo "Processing chr $c"
       zcat /cluster/data/mm5/bed/blastzSelf/axtChromDropped/${c}.axt.gz | \
        /cluster/bin/i386/axtToPsl stdin S1.len S2.len pslChrom/${c}_${tbl}.psl
     end
 
     # Load files into the database
     /cluster/bin/i386/hgLoadPsl -noTNameIx mm5  *_blastzSelf.psl
 
 # end BLASTZSELF
 
 # CREATE kgSpAlias TABLE FOR PB (Done 10/20/04)
 
     hgsql mm5 -e \
     'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
     hgsql mm5 -e \
     'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
     >>j.tmp
     cat j.tmp|sort -u |grep -v 'kgID' >mm5.kgSpAlias.tab
     rm j.tmp
 
     hgsql mm5 -e 'drop table kgSpAlias';
     hgsql mm5 < ~/src/hg/lib/kgSpAlias.sql
     hgsql mm5 -e 'load data local infile "mm5.kgSpAlias.tab" into table kgSpAlias'
 
 
 # ECGENE TRACK (DONE, 2004-10-29, hartera)
     ssh kksilo
     mkdir -p /cluster/data/mm5/bed/ECgene.2004-10-29
     ln -s /cluster/data/mm5/bed/ECgene.2004-10-29 \
           /cluster/data/mm5/bed/ECgene
     cd /cluster/data/mm5/bed/ECgene
     wget \
 "http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_mm5_low_gene.txt.gz"
     wget \
 "http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_mm5_low_pep.txt.gz"
     gunzip *.gz
     # load database
     ssh hgwdev
     cd /cluster/data/mm5/bed/ECgene
     ldHgGene -predTab mm5 ECgene v1.2_mm5_low_gene.txt
     # 343337 gene predictions
     hgPepPred mm5 tab ECgenePep v1.2_mm5_low_pep.txt
     rm *.tab
     nice gzip *.txt
 
 
 ## NIA Mouse Gene Index - (DONE - 2004-11-16 Fan)
 #       requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
 
 #       pick up data
     ssh hgwdev 
     mkdir -p /cluster/data/mm5/bed/NIAGene
     cd /cluster/data/mm5/bed/NIAGene
     wget --timestamp http://lgsun.grc.nia.nih.gov/temp/NIA-Mouse-GeneIndex4-Transcript-to-Genome.psl
 
     wget --timestamping \
     http://lgsun.grc.nia.nih.gov/temp/NIA-Mouse-GeneIndex4-Transcripts.fasta
     
     hgLoadPsl mm5 -table=NIAGene NIA-Mouse-GeneIndex4-Transcript-to-Genome.psl
 
     mkdir /gbdb/mm5/NIAGene
     ln -s /cluster/data/mm5/bed/NIAGene/NIA-Mouse-GeneIndex4-Transcripts.fasta \
         /gbdb/mm5/NIAGene/NIA-Mouse-GeneIndex4-Transcripts.fasta
     
     hgLoadSeq mm5 /gbdb/mm5/NIAGene/NIA-Mouse-GeneIndex4-Transcripts.fasta
 
     # Added and edited NIAGene.html and trackDb.ra under
     #
     #     kent/src/hg/makeDb/trackDb/mouse/mm5
 
 
 # CREATE jaxQTL3 (MOUSE QTL) TRACK (DONE - 2004-11-18 Fan)
 
     cd /cluster/data/mm5/bed
     mkdir qtl.2004-11-08
     ln -s qtl.2004-11-08 qtl
     cd qtl
 
 # Get the raw data file, mouse_qtl_100804.txt, sent by Carol Bult [cjb@informatics.jax.org].
 
     hgsql mm5 -e 'drop table jaxQtlRaw'
     hgsql mm5 < ~/src/hg/lib/jaxQtlRaw.sql
     hgsql mm5 -e 'load data local infile "mouse_qtl_100804.txt" into table jaxQtlRaw ignore 1 lines'
 
 # Make sure the hgJaxQtl binary executable exists.  hgJaxQtl is under ~/src/hg/hgJaxQtl
 
     hgJaxQtl mm5
     wc jaxQTL3.tab
 #    981   15310  105164 jaxQTL3.tab
 
     hgLoadBed -nobin -tab -sqlTable=$HOME/src/hg/lib/jaxQTL3.sql mm5 jaxQTL3 jaxQTL3.tab
 
 	
 # TWINSCAN (DONE 11/29/04 angie)
     ssh kksilo
     mkdir /cluster/data/mm5/bed/twinscan
     cd /cluster/data/mm5/bed/twinscan
     foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
       wget http://genes.cs.wustl.edu/predictions/mouse/mm5_11-24-04/chr_gtf/$chr.gtf
       wget http://genes.cs.wustl.edu/predictions/mouse/mm5_11-24-04/chr_ptx/$chr.ptx
     end
     # Add '.a' to end of protein fasta id's, to match gtf transcript_id's:
     perl -wpe 's/^(>\S+).*/$1.a/' *.ptx > twinscanPep.fa
     # load.
     ssh hgwdev
     cd /cluster/data/mm5/bed/twinscan
     ldHgGene -gtf -genePredExt mm5 twinscan chr*.gtf
     hgPepPred mm5 generic twinscanPep twinscanPep.fa
     featureBits -enrichment mm5 refGene twinscan
 #refGene 1.551%, twinscan 1.245%, both 0.783%, cover 50.46%, enrich 40.52x
 
 # Create mm5GeneList.html (to be used by Google).
 # This step was done 12/08/04.
     
     cd /cluster/data/mm5/bed
     mkdir geneList
     cd geneList
     wget -O mm5GeneList.html "http://hgwdev-fanhsu.cse.ucsc.edu/cgi-bin/hgGeneList?db=mm5"
     cp -p mm5GeneList.html /usr/local/apache/htdocs/goldenPath
 # Check this html file into CVS.
 
 
 # BLASTZ ZEBRAFISH (danRer2) (DONE, 2004-12-12, hartera)
 
     ssh kkr1u00
     # blastz requires lineage-specific repeats
     # Treat all repeats as lineage-specific.
     # this directory of mouse repeats exists already
     mkdir -p /iscratch/i/mm5/linSpecRep.notInZebrafish
     foreach f (/cluster/bluearc/scratch/mus/mm5/rmsk/chr*.fa.out)
       cp -p $f /iscratch/i/mm5/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
     end
     
     mkdir -p /iscratch/i/danRer2/linSpecRep.notInMouse
     foreach f (/iscratch/i/danRer2/rmsk/chr*.fa.out)
       cp -p $f /iscratch/i/danRer2/linSpecRep.notInMouse/$f:t:r:r.out.spec
     end
     iSync
 
     ssh kk
     mkdir -p /cluster/data/mm5/bed/blastz.danRer2.2004-12-10
     ln -s /cluster/data/mm5/bed/blastz.danRer2.2004-12-10 \
           /cluster/data/mm5/bed/blastz.danRer2
     cd /cluster/data/mm5/bed/blastz.danRer2
  # use same parameters as for danRer[1|2]-hg17 and for hg16-fr1 and mm5-danRer1
  # and similar to those originally used for hg17-galGal2
     cat << '_EOF_' > DEF
 # mouse (mm5) vs zebrafish (danRer2)
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
 
 ALIGN=blastz-run
 BLASTZ=blastz
 
 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer1
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse (mm5)
 SEQ1_DIR=/cluster/bluearc/scratch/mus/mm5/softNib
 SEQ1_RMSK=
 SEQ1_FLAG=
 SEQ1_SMSK=/iscratch/i/mm5/linSpecRep.notInZebrafish
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Zebrafish (danRer2)
 SEQ2_DIR=/iscratch/i/danRer2/nib
 SEQ2_RMSK=
 SEQ2_FLAG=
 SEQ2_SMSK=/iscratch/i/danRer2/linSpecRep.notInMouse
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm5/bed/blastz.danRer2
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 
 #DEBUG=1
 '_EOF_'
     # << this line keeps emacs coloring happy
     # Save the DEF file in the current standard place
     chmod +x DEF
     cp DEF ~angie/hummus/DEF.mm5-danRer2.2004-12-10
     # setup cluster run
     # copy shell scripts for blastz runs if not there already
     cp -p /cluster/data/danRer1/jkStuff/BlastZ* /cluster/data/mm5/jkStuff/
     # edit BlastZ_run0.sh
     # replace line 22: /cluster/home/angie/schwartzbin/ with /cluster/bin/penn/
     # this is the directory for the latest version of blastz-run
 
     # source the DEF file
     bash
     . ./DEF
     /cluster/data/mm5/jkStuff/BlastZ_run0.sh
     cd run.0
     # check batch looks ok then
     para try, check, push, check, ....
 # para time
 # Completed: 58993 of 58993 jobs
 # CPU time in finished jobs:   17513361s  291889.35m  4864.82h  202.70d  0.555 y
 # IO & Wait Time:               1506128s   25102.13m   418.37h   17.43d  0.048 y
 # Average job time:                 322s       5.37m     0.09h    0.00d
 # Longest job:                     2552s      42.53m     0.71h    0.03d
 # Submission to last job:         50001s     833.35m    13.89h    0.58d
 
     # output is 864M
     # second cluster run to convert the .out's to .lav's
     ssh kki
     cd /cluster/data/mm5/bed/blastz.danRer2
     bash # if a csh/tcsh user
     . ./DEF
     /cluster/data/mm5/jkStuff/BlastZ_run1.sh
     cd run.1
     para try, check, push, etc ...
 # para time
 # Checking finished jobs
 # Completed: 341 of 341 jobs
 # CPU time in finished jobs:        689s      11.48m     0.19h    0.01d  0.000 y
 # IO & Wait Time:                  1305s      21.76m     0.36h    0.02d  0.000 y
 # Average job time:                   6s       0.10m     0.00h    0.00d
 # Longest job:                       14s       0.23m     0.00h    0.00d
 # Submission to last job:           250s       4.17m     0.07h    0.00d
 
     #   Third cluster run to convert lav's to axt's
     ssh kki
     cd /cluster/data/mm5/bed/blastz.danRer2
     mkdir axtChrom
     # a new run directory
     mkdir run.2
     cd run.2
 cat << '_EOF_' > do.csh
 #!/bin/csh
 cd $1
 cat `ls -1 *.lav | sort -g` \
 | lavToAxt stdin /cluster/bluearc/scratch/mus/mm5/softNib \
 /iscratch/i/danRer2/nib stdout \
 | axtSort stdin $2
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x do.csh
     cat << '_EOF_' > gsub
 #LOOP
 ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/mm5/bed/blastz.danRer2/axtChrom/$(root1).axt}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     \ls -1Sd ../lav/chr* > chrom.list
     gensub2 chrom.list single gsub jobList
     wc -l jobList
     head jobList
     para create jobList
     para try, check, push, check,...
 # para time
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:         82s       1.37m     0.02h    0.00d  0.000 y
 # IO & Wait Time:                  1429s      23.82m     0.40h    0.02d  0.000 y
 # Average job time:                  35s       0.59m     0.01h    0.00d
 # Longest job:                       91s       1.52m     0.03h    0.00d
 # Submission to last job:          1421s      23.68m     0.39h    0.02d
 
     # translate sorted axt files into psl
     ssh kolossus
     cd /cluster/data/mm5/bed/blastz.danRer2
     mkdir -p pslChrom
     set tbl = "blastzDanRer2"
     foreach f (axtChrom/chr*.axt)
       set c=$f:t:r
       echo "Processing chr $c"
       /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
     end
     # Load database tables
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.danRer2/pslChrom
 
     foreach f (./*.psl)
       /cluster/bin/i386/hgLoadPsl mm5 $f
     end
 # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer1 -enrichment
 #refGene:cds 0.763%,blastzDanRer1 2.918%,both 0.512%,cover 67.12%,enrich 23.00x
 # featureBits -chrom=chr1 mm5 refGene:cds blastzDanRer2 -enrichment
 # refGene:cds 0.780%, blastzDanRer2 2.816%, both 0.529%, cover 67.89%, 
 # enrich 24.11x
  
 # RESCORE DANRER2 BLASTZ ALIGNMENTS (DONE, 2004-12-12, hartera)
 
     # Low scores can occur with repeats abridged and using the
     # HoxD55.q matrix. PSU's restore_rpts program rescored alignments
     # with the default matrix instead of the BLASTZ_Q matrix.
     # Rescore them here so the chainer sees the higher scores:
     ssh kolossus
     cd /cluster/data/mm5/bed/blastz.danRer2
     mkdir axtChrom.rescore
     foreach f (axtChrom/chr*.axt)
         axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
         $f axtChrom.rescore/$f:t
     end
     mv axtChrom axtChrom.orig
     mv axtChrom.rescore axtChrom
 
 #   psl files and blastz tables will be the same regardless of score so
 #   no need to reload
 
 # CHAIN ZEBRAFISH (danRer2) BLASTZ (DONE, 2004-12-13, hartera)
 # APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE PRIMARILY THE RESULTS OF
 # REPEATS AND DEGENERATE DNA (DONE, 2004-12-22, hartera)
     # Make chains with rescored blastz danRer2
     # Run axtChain on little cluster
     ssh kki
     cd /cluster/data/mm5/bed/blastz.danRer2
     mkdir -p axtChain/run1
     cd axtChain/run1
     mkdir out chain
     ls -1S /cluster/data/mm5/bed/blastz.danRer2/axtChrom/*.axt \
         > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     # Make our own linear gap file with reduced gap penalties, 
     # in hopes of getting longer chains:
     cat << '_EOF_' > ../../chickenHumanTuned.gap
 tablesize^V     11
 smallSize^V     111
 position^V      1^V     2^V     3^V     11^V    111^V   2111^V  12111^V 32111^V 72111^V 152111^V        252111
 qGap^V  325^V   360^V   400^V   450^V   600^V   1100^V  3600^V  7600^V  15600^V 31600^V 56600
 tGap^V  325^V   360^V   400^V   450^V   600^V   1100^V  3600^V  7600^V  15600^V 31600^V 56600
 bothGap^V       625^V   660^V   700^V   750^V   900^V   1400^V  4000^V  8000^V  16000^V 32000^V 57000
 '_EOF_'
     # << this line makes emacs coloring happy
 cat << '_EOF_' > doChain
 #!/bin/csh
 axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
          -linearGap=../../chickenHumanTuned.gap $1 \
     /cluster/bluearc/scratch/mus/mm5/softNib \
     /iscratch/i/danRer1/nib $2 >& $3
 '_EOF_'
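     # << this line makes emacs coloring happy
     # NOTE(review): doChain above passes /iscratch/i/danRer1/nib as the query
     # nib directory although this run is against danRer2 (the lavToAxt and
     # chainAntiRepeat steps in this section use danRer2 nibs) -- possibly a
     # leftover from the mm5-danRer1 run; confirm which nibs were actually used.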
     chmod a+x doChain
     gensub2 input.lst single gsub jobList
     para create jobList
     para try, check, push, check...
 # para time
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:       1797s      29.95m     0.50h    0.02d  0.000 y
 # IO & Wait Time:                   575s       9.59m     0.16h    0.01d  0.000 y
 # Average job time:                  55s       0.92m     0.02h    0.00d
 # Longest job:                      133s       2.22m     0.04h    0.00d
 # Submission to last job:           514s       8.57m     0.14h    0.01d
 
    # now on the cluster server, sort chains
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
     chainMergeSort run1/chain/*.chain > all.chain
     chainSplit chain all.chain
 
 # take a look at score distributions; try also with a smaller bin size.
     foreach f (chain/*.chain)
       grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r >> hist5000.out
       textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
       echo ""
     end
     # filter on minScore = 5000
     mv all.chain all.chain.unfiltered
     chainFilter -minScore=5000 all.chain.unfiltered > all.chain.filt5k
     # remove old chains
     rm -r chain
     chainSplit chain all.chain.filt5k
    
     # remove repeats from chains and reload into database
     # (2004-12-22, hartera)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
     mv chain chainRaw
     mkdir chain
     cd chainRaw
     foreach f (*.chain)
        set c = $f:r
        echo $c
        nice chainAntiRepeat /cluster/bluearc/scratch/mus/mm5/softNib \
                             /cluster/bluearc/danRer2/nib $f \
                             ../chain/$c.chain
     end
     cd ..
     chainMergeSort ./chain/*.chain > all.chain.antirepeat
     chainSplit chainAR all.chain.antirepeat
     # load filtered chains with chains removed that are mostly due to repeats 
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.danRer2/axtChain/chainAR
     foreach i (*.chain)
         set c = $i:r
         hgLoadChain mm5 ${c}_chainDanRer2 $i
         echo done $c
     end
 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2 -enrichment
 # refGene:cds 0.780%, chainDanRer2 22.478%, both 0.604%, cover 77.48%, 
 # enrich 3.45x
 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2Link -enrichment
 # refGene:cds 0.780%, chainDanRer2Link 2.164%, both 0.526%, cover 67.43%, 
 # enrich 31.17x
 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1 -enrichment
 # refGene:cds 0.780%, chainDanRer1 20.053%, both 0.593%, cover 75.99%, 
 # enrich 3.79x
 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer1Link -enrichment
 # refGene:cds 0.780%, chainDanRer1Link 2.022%, both 0.512%, cover 65.64%, 
 # enrich 32.47x
 # after chainAntiRepeat:
 # featureBits -chrom=chr1 mm5 refGene:cds chainDanRer2Link -enrichment
 # refGene:cds 0.785%, chainDanRer2Link 2.058%, both 0.530%, cover 67.53%, 
 # enrich 32.81x
  
 # NET ZEBRAFISH (danRer2) BLASTZ (DONE, 2004-12-13, hartera)
 # RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-22,hartera)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
     rm -r preNet
     mkdir preNet
     cd chainAR
     foreach i (*.chain)
        echo preNetting $i
        /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
                                      ../preNet/$i
     end
     cd ..
     mkdir n1
     cd preNet
     foreach i (*.chain)
       set n = $i:r.net
       echo primary netting $i
       /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
                                  ../n1/$n /dev/null
     end
     cd ..
     cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
     # memory usage 105357312, utime 632 s/100, stime 117
 # Add classification info using db tables:
     cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
     # netClass looks for ancient repeats in one of the databases
     # hg17 has this table - hand-curated by Arian but this is for
     # human-rodent comparisons so do not use here, use -noAr option
     mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish
     mkdir -p /cluster/bluearc/danRer2/linSpecRep.notInMouse
     cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \
        /cluster/bluearc/mm5/linSpecRep.notInZebrafish
     cp /iscratch/i/danRer2/linSpecRep.notInMouse/* \
        /cluster/bluearc/danRer2/linSpecRep.notInMouse
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
     time netClass noClass.net mm5 danRer2 zfishdanRer2.net \
          -tNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish \
          -qNewR=/cluster/bluearc/danRer2/linSpecRep.notInMouse -noAr
     # 87.010u 56.100s 5:15.16 45.4%   0+0k 0+0io 207pf+0w
     netFilter -minGap=10 zfishdanRer2.net |  hgLoadNet mm5 netDanRer2 stdin
 # featureBits mm5 refGene:cds netDanRer2 -enrichment
 # refGene:cds 0.938%, netDanRer2 21.447%, both 0.714%, cover 76.17%, 
 # enrich 3.55x
 # featureBits mm5 refGene:cds netDanRer1 -enrichment
 # refGene:cds 0.938%, netDanRer1 19.993%, both 0.702%, cover 74.87%, 
 # enrich 3.74x
 # after chainAntiRepeat:
 # featureBits mm5 refGene:cds netDanRer2 -enrichment
 # refGene:cds 0.942%, netDanRer2 21.161%, both 0.717%, cover 76.14%, 
 # enrich 3.60x
 # add trackDb.ra entries and html for details pages
 
 # TIGR GENE INDEX (DONE 2004-12-13 Fan)
     mkdir -p /cluster/data/mm5/bed/tigr
     cd /cluster/data/mm5/bed/tigr
     wget ftp://ftp.tigr.org/pub/data/tgi/Mus_musculus/TGI_track_MouseGenome_mm5_05-2004.tgz
     
     tar xvzf TGI*.tgz
     foreach f (*cattle*)
       set f1 = `echo $f | sed -e 's/cattle/cow/g'`
       mv $f $f1
     end
 
     foreach o (mouse cow human pig rat)
       echo $o
       setenv O $o
       foreach f (chr*_$o*s)
         tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
       end
     end
 
     ssh hgwdev
     cd /cluster/data/mm5/bed/tigr
     hgsql mm5 -e "drop table tigrGeneIndex"
     hgsql mm5 < ~/kent/src/hg/lib/tigrGeneIndex.sql
 
     foreach f (*.gff)
         echo Processing $f ...
         /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC mm5 tigrGeneIndex $f
         hgsql mm5 -e "select count(*) from tigrGeneIndex"
     end
     # Total of 354491 entries created in tigrGeneIndex table.
 
     hgsql mm5 -e "update tigrGeneIndex set cdsStart = txStart;"
     hgsql mm5 -e "update tigrGeneIndex set cdsEnd = txEnd;"
 
     checkTableCoords mm5 tigrGeneIndex
     
     gzip *.gff *TCs
 
 # TIGR GENE INDEX (RE-DONE 2004-12-21 Fan)
 # This track is re-done due to an error (no strand info) in the original files provided by TIGR.
     cd /cluster/data/mm5/bed
     mv tigr tigr_old_wrong
     mkdir -p /cluster/data/mm5/bed/tigr
     cd /cluster/data/mm5/bed/tigr
     wget --timestamp ftp://ftp.tigr.org/pub/data/tgi/Mus_musculus/TGI_track_MouseGenome_mm5_12-2004.tgz 
     tar xvzf TGI*.tgz
 
     foreach f (*cattle*)
       set f1 = `echo $f | sed -e 's/cattle/cow/g'`
       mv $f $f1
     end
 
     foreach o (mouse cow human pig rat)
       echo $o
       setenv O $o
       foreach f (chr*_$o*s)
         tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
       end
     end
 
     ssh hgwdev
     cd /cluster/data/mm5/bed/tigr
     hgsql mm5 -e "drop table tigrGeneIndex"
     hgsql mm5 < ~/kent/src/hg/lib/tigrGeneIndex.sql
 
     foreach f (*.gff)
         echo Processing $f ...
         /cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC mm5 tigrGeneIndex $f
         hgsql mm5 -e "select count(*) from tigrGeneIndex"
     end
     # Total of 385814 entries created in tigrGeneIndex table.
 
     hgsql mm5 -e "update tigrGeneIndex set cdsStart = txStart;"
     hgsql mm5 -e "update tigrGeneIndex set cdsEnd = txEnd;"
 
     checkTableCoords mm5 tigrGeneIndex
     
     gzip *.gff *TCs
 
 #### LOAD ENSEMBL GENES (DONE - 2004-12-17 Fan)
 # ADDED STABLE URL TO TRACKDB BLOCK (V27, DEC 2004) (2008-01-11, rhead)
 #	needed for Gene Sorter procedure below
 #	Ensembl released Mouse build 33 the week of Dec 4 2004
      mkdir /cluster/data/mm5/bed/ensembl
      cd /cluster/data/mm5/bed/ensembl
 
         Get the ensembl gene data from http://www.ensembl.org/
         Go to the EnsMart link
         Choose Mus musculus as the organism
         Follow this sequence through the pages:
         Page 1) Choose the Ensembl Genes choice. Hit next.
         Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.        
         Page 3) Choose the "Structures" tab. 
         Page 4) Choose GTF as the output, choose gzip compression, name the
 	output file ensGeneMm5.gtf.gz and then hit Export
 
 # Ensembl handles random chromosomes differently than us, so we
 # strip this data.  Fortunately it just loses a couple of genes.
      zcat ensGene.gtf.gz | grep -v ^6_DR51 | grep -v _NT_ > unrandom.gtf
 #	Let's see how much it loses:
 #  	None.
 
 # Add "chr" to front of each line in the gene data gtf file to make 
 # it compatible with ldHgGene
     sed -e "s/^/chr/" unrandom.gtf > ensGene.gtf
 #	(should also fixup chrMT name here too - 2005-02-28 - Hiram)
 #    sed -e "s/^/chr/" unrandom.gtf | sed -e "s/chrMT/chrM/" > ensGene.gtf
     ldHgGene mm5 ensGene ensGene.gtf
 # Read 31035 transcripts in 551352 lines in 1 files
 #   31035 groups 22 seqs 1 sources 4 feature types
 # 31035 gene predictions
 #	save space, gzip them:
     gzip unrandom.gtf
     gzip ensGene.gtf
 #  The name on chrM was incorrect, fixed (2005-02-28 - Hiram)
     hgsql mm5 -e 'update ensGene set chrom="chrM" where chrom="chrMT";'
 
 # Load Ensembl peptides:
         Get the ensembl protein data from http://www.ensembl.org/
         Go to the EnsMart link
         Choose Mus musculus as the organism
         Follow this sequence through the pages:
         Page 1) Choose the Ensembl Genes choice. Hit next.
         Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
         Page 3) Choose the "Sequences" tab. 
         Page 4) Choose Transcripts/Proteins and peptide Only as the output,
 		choose text/fasta and gzip compression,
 		name the file ensGeneMm5.pep.gz and then hit export.
 
 #delete * at end of each protein
      bash
      zcat ensGeneMm5.pep.gz | sed "s/\*$//" > ensembl.pep
     ~matt/bin/fixPep.pl ensembl.pep fixPep_ensembl.pep
      hgPepPred mm5 generic ensPep fixPep_ensembl.pep
 #
 #	The chrMT (chrM) peptides as obtained via EnsMart have only
 #	aa's of: X				(2005-02-28 - Hiram)
 #	These 13 peptides were fixed up manually by fetching each
 #	one individually by following the 13 links from our browser
 #	to the Ensembl protein, asking it to dump the protein
 #	sequence, cut and paste that answer to a local file.
 #	The 13 peptides were dropped from ensPep table via:
 hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082392.1";'
 hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082396.1";'
 hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082402.1";'
 hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082405.1";'
 hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082407.1";'
 hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082408.1";'
 hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082409.1";'
 hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082411.1";'
 hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082413.1";'
 hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082414.1";'
 hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082418.1";'
 hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082419.1";'
 hgsql mm5 -e 'delete from ensPep where name="ENSMUST00000082421.1";'
 #	Then explicitly reloaded with SQL statements such as:
 INSERT into ensPep (name, seq) VALUES ('ENSMUST00000082407.1', 'MPQLDTSTWFITIISSMITLFILFQLKVSSQTFPLAPSPKSLTTMKVKTPWELKWTKIYLPHSLPQQ');
 #	The 13 SQL statements were left in the file:
 #	/cluster/data/mm5/bed/ensembl/chrMPep.sql
 #	loaded via:
     hgsql mm5 < chrMPep.sql
 #	The following files were "touched" on the RR/MGC after the chrMT/M 
 #	change to prevent false errors with joinerCheck. J.Jackson 2005-03-01
 #	mm5.superfamily.name 
 #	mm5.ensGtp.transcript 
 #	mm5.ensPep.name 
 #	mm5.knownToEnsembl.value 
 #	mm5.sfDescription.name 
 
 
 # Load ensGtp table.
     # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
     # hgKnownToSuper.  Use ensMart to create it as above, except:
     # Page 3) Choose the "Features" tab. In "Ensembl Attributes", check 
     # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
     # Choose Text, tab-separated as the output format, gzip.
     #  Result name file as ensGtpMm5.tab.gz
     gunzip ensGtpMm5.tab.gz
     hgsql mm5 < ~/kent/src/hg/lib/ensGtp.sql
     hgsql -N -e 'load data local infile "ensGtpMm5.tab" into table ensGtp ignore 1 lines;' mm5
 
 # Create knownToEnsembl column
     hgMapToGene mm5 ensGene knownGene knownToEnsembl
 # Compress everything to save space
     gzip *.tab
     gzip *.pep
 
 #### RE-BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-11-17 - Fan)
 
 # PLEASE NOTE THAT THE ENSEMBLXREF3 TABLE IS RE-BUILT USING ENSMART DATA OF MOUSE BUILD 33.
 # THIS TABLE IS NEEDED TO SUPPORT SUPERFAMILY TRACK OF THE PROTEOME BROWSER.
     # Get the ensembl gene/protein cross-reference data from
     # http://www.ensembl.org/Multi/martview?species=Mus_musculus
     # Follow this sequence through the pages:
     # Page 1) Make sure that the Mus musculus choice is selected. Hit next.
     # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
     # Page 3) Choose the "Feature" box, select Ensembl gene, transcript, and peptide IDs,
     #         SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC
     # Page 4) Choose "Text, tab separated".  choose gzip compression.  hit export.
     # Save as ensXref.tsv.gz (the file name read by the zcat command below)
 
     zcat ensXref.tsv.gz|sed -e 's/\./\t/g' > ensemblXref3.tab
 
     hgsql mm5 -e "drop table ensemblXref3"
     hgsql mm5 < ~/src/hg/lib/ensemblXref3.sql
 
     hgsql mm5 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines'
 
 # CREATE SUPERFAMILY TRACK (DONE 2004-12-17 - Fan)
    mkdir /cluster/data/mm5/bed/superfamily
    cd    /cluster/data/mm5/bed/superfamily
    hgSuperfam mm5 superfam041128 > sf.log
    wc *
 # It is normal that many proteins do not have corresponding Superfamily entries.
 
 # Load the sfDescription table.
    hgsql mm5 < ~/src/hg/lib/sfDescription.sql
    hgsql mm5 -e 'LOAD DATA local INFILE "sfDescription.tab"  into table mm5.sfDescription;'
 
 # Finally, load the superfamily table.
    hgLoadBed mm5 superfamily superfamily.tab -tab
 
 # Create knownToSuperfamily table
    
    cat /cluster/data/superfamily/041128/ass_28-Nov-2004.tab | hgKnownToSuper mm5 mm stdin
 # created 21899 records output
 
 # MAKE VSDANRER2 DOWNLOADABLES (DONE, 2004-12-14, hartera)
 # REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat
 # (DONE, 2004-12-22, hartera)
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastz.danRer2/axtChrom
     set gp = /usr/local/apache/htdocs/goldenPath/mm5
     mkdir -p $gp/vsDanRer2/axtChrom
     cp -p *.axt $gp/vsDanRer2/axtChrom
     cd $gp/vsDanRer2/axtChrom
     gzip *.axt
     md5sum *.gz > md5sum.txt
                                                                                
     # copy chains and nets to downloads area
     # re-make chains and net downloadables (2004-12-22, hartera)
     rm $gp/vsDanRer2/zebrafish*.gz $gp/vsDanRer2/md5sum.txt
     cd /cluster/data/mm5/bed/blastz.danRer2/axtChain
     gzip -c all.chain.antirepeat > \
             /cluster/data/mm5/zip/zebrafishDanRer2.chain.gz
     gzip -c zfishdanRer2.net > /cluster/data/mm5/zip/zebrafishDanRer2.net.gz
     cd $gp/vsDanRer2
     mv /cluster/data/mm5/zip/zebrafish*.gz .
     md5sum *.gz > md5sum.txt
     # Copy over & edit README.txt w/pointers to chain, net formats.
 
 # BLASTZ DANRER2 CLEANUP (DONE, 2004-12-14, hartera)
 # RE-DONE (DONE, 2004-12-22, hartera)
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.danRer2
     nice rm axtChain/run1/chain/* &
     nice rm -fr axtChain/n1 axtChain/noClass.net &
     nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net &
     nice gzip axtChain/all.chain.antirepeat axtChain/all.chain.filt5k axtChain/chainAR/*.chain &
     nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet &
 
 #  MOUSE PHOTOGRAPH added to gateway page
 #	Obtained from Jackson Labs press office via email:
 #
 #	Subject: Re: mouse press photographs
 #	Date: Wed, 29 Dec 2004 14:26:15 -0500
 #	From: Joyce Peterson <joyce@jax.org>
 #	To: Hiram Clawson <hiram@soe.ucsc.edu>
 #	References: <41D2FF0B.3090207@soe.ucsc.edu>
 
 #	Hi, Hiram. You may use the attached photo, noting credit to "The 
 #	Jackson Laboratory."
 #	
 #	Cheers,
 #	--Joyce
 #
 #	Joyce Peterson
 #	Public Information Manager
 #	The Jackson Laboratory
 #	610 Main Street, Mailbox 664
 #	Bar Harbor, ME 04609-1526
 #	Tel. 207-288-6058
 #	Mobile 207-266-5745
 #	E-mail joyce@jax.org
 #	http://www.jax.org/news
 #
 #  Original from this email placed into /cluster/data/mm5/html/C57BL_6J.JPG
 
     ssh hgwdev
     cd /cluster/data/mm5/html
     #	view that image in 'display' to determine crop edges, then:
     convert -crop 890x690+330+70 -quality 80 -sharpen 0 \
 	-normalize C57BL_6J.JPG mm.jpg
     convert -geometry 300x200 -quality 80 mm.jpg Mus_musculus.jpg
     rm -f mm.jpg
 
     cp -p Mus_musculus.jpg /usr/local/apache/htdocs/images
     #	add links to this image in the description.html page, request push
 
 
 # ANDY LAW CPGISLANDS (DONE 1/14/05 angie)
     # See notes about this in makeGalGal2.doc.
     # Running only on masked sequence.  
     ssh kksilo
     mkdir /cluster/data/mm5/bed/cpgIslandGgfAndy
     cd /cluster/data/mm5/bed/cpgIslandGgfAndy
     cp /dev/null cpgIslandGgfAndyMasked.bed
     foreach f (../../?{,?}/chr*.fa.masked)
       set chr = $f:t:r:r
       echo preproc masked $chr
       /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.masked.preproc
       echo running on $chr masked
       /cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \
       | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                    $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                    $pGc = (100.0 * $gc / $n); \
                    $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                         "$pCpG\t$pGc\t$oE\n";' \
       >> cpgIslandGgfAndyMasked.bed
     end
     # load into database:
     ssh hgwdev
     cd /cluster/data/mm5/bed/cpgIslandGgfAndy
     sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
       $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
     hgLoadBed mm5 cpgIslandGgfAndyMasked -tab -noBin \
       -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
     featureBits mm5 cpgIslandExt
 #10422989 bases of 2615483787 (0.399%) in intersection
     featureBits mm5 cpgIslandGgfAndyMasked
 #38305840 bases of 2615483787 (1.465%) in intersection
     wc -l ../cpgIsland/cpgIsland.bed cpgIslandGgfAndyMasked.bed
 #  16238 ../cpgIsland/cpgIsland.bed
 #  67737 cpgIslandGgfAndyMasked.bed
     # 1/26/05: Make better island names in cpgIslandGgfAndyMasked,
     # for Dave Burt's cross-species island comparisons.
     ssh kksilo
     cd /cluster/data/mm5/bed/cpgIslandGgfAndy
     mv cpgIslandGgfAndyMasked.bed cpgIslandGgfAndyMasked.bed.orig
     perl -wpe '@w=split("\t"); $w[3] = "mm5.$w[0]." . ($w[1]+1) . ".$w[2]"; \
                $_ = join("\t", @w);' \
       cpgIslandGgfAndyMasked.bed.orig \
     > cpgIslandGgfAndyMasked.bed
     ssh hgwdev
     cd /cluster/data/mm5/bed/cpgIslandGgfAndy
     hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \
       mm5 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed
 
 
 # MAKE MM5-RN3 OVER.CHAIN FOR LIFTOVER  (DONE 1/25/05 angie)
     ssh kolossus
     set chainDir = /cluster/data/mm5/bed/blastz.rn3/axtChain
     mkdir -p /cluster/data/mm5/bed/bedOver
     mkdir /tmp/mm5ToRn3
     foreach f ($chainDir/ratNet/chr*.net.gz)
       set chr = $f:t:r:r
       echo $chr
       netChainSubset $f $chainDir/chain/$chr.chain.gz \
         /tmp/mm5ToRn3/$chr.chain
     end
     cat /tmp/mm5ToRn3/*.chain \
       > /cluster/data/mm5/bed/bedOver/mm5ToRn3.over.chain
     rm -r /tmp/mm5ToRn3
     
 
 # MAKE MM5-GALGAL2 OVER.CHAIN FOR LIFTOVER  (DONE 1/25/05 angie)
     ssh kolossus
     set chainDir = /cluster/data/mm5/bed/blastz.galGal2/axtChain
     mkdir -p /cluster/data/mm5/bed/bedOver
     netChainSubset $chainDir/chicken.net $chainDir/all.chain \
       /cluster/data/mm5/bed/bedOver/mm5ToGalGal2.over.chain
     
 
 
 # UPDATE kgSpAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan)
 # Add new mm5 protein display IDs to the alias table to support user search
     
     ssh hgwdev
     mkdir -p /cluster/data/mm5/bed/pb/newDisplayId
     cd /cluster/data/mm5/bed/pb/newDisplayId
  
     hgsql proteome -e 'select mm5.kgSpAlias.kgID, mm5.kgSpAlias.SpID, spOldNew.newDisplayId from spOldNew, mm5.kgSpAlias where spOldNew.acc=mm5.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >mm5.tab
    
 # get rid of the header line at the end of the file
     vi mm5.tab
 
     hgsql mm5 -e 'load data local infile "mm5.tab" into table mm5.kgSpAlias'
 
 # UPDATE kgProtAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan)
 # Add new mm5 protein display IDs to the alias table to support user search
     
     ssh hgwdev
     cd /cluster/data/mm5/bed/pb/newDisplayId
 
      hgsql proteome -e 'select mm5.kgSpAlias.kgID,spOldNew.oldDisplayId,spOldNew.newDisplayId from spOldNew, mm5.kgSpAlias where spOldNew.acc=mm5.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >mm5.kgProtAlias.tab
 
 # get rid of the header line at the end of the file
     vi mm5.kgProtAlias.tab 
 
     hgsql mm5 -e 'load data local infile "mm5.kgProtAlias.tab" into table mm5.kgProtAlias'
 
 
 # BLASTZ/CHAIN/NET BOSTAU1 (DONE 2/21/05 angie)
     ssh kksilo
     mkdir /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19
     cd /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19
     cat << '_EOF_' > DEF
 # mouse vs. cow
 
 # TARGET
 # Mouse
 SEQ1_DIR=/scratch/mus/mm5/softNib
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/mm5/chrom.sizes
 
 # QUERY
 # Cow
 SEQ2_DIR=/iscratch/i/bosTau1/nib/bosTau1.2bit
 SEQ2_CHUNK=5000000
 SEQ2_LAP=0
 SEQ2_LEN=/iscratch/i/bosTau1/chrom.sizes
 
 BASE=/cluster/data/mm5/bed/blastz.bosTau1.2005-02-19
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /cluster/bluearc/mouseVsCow >& do.log &
     tail -f do.log
     # kksilo was rebooted so original invocation of doBlastzChainNet.pl
     # was killed in the middle of the cluster run.  I watched the job
     # progress and restarted 70 failed jobs like this:
     ssh kk
     cd /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19/run.blastz
     para check
     para push
     para check ...
     # When the batch was complete:
     para time > run.time
     # (doBlastzChainNet.pl uses run.time as a checkpoint)
     # Then to continue the run:
     ssh kksilo
     cd /cluster/data/mm5/bed/blastz.bosTau1.2005-02-19
     doBlastzChainNet.pl -continue=cat DEF \
       -blastzOutRoot /cluster/bluearc/mouseVsCow >>& do.log &
     tail -f do.log
     # For some reason the script got hung waiting for tty input; I 
     # foregrounded it, hit return a few times, and it eventually completed.
     # That should be fixed in a future version of doBlastzChainNet.pl.  
     ln -s blastz.bosTau1.2005-02-19 /cluster/data/mm5/bed/blastz.bosTau1
     # Add chainBosTau1 and netBosTau1 to mm5/trackDb.ra
     # Add /usr/local/apache/htdocs/goldenPath/mm5/vsBosTau1/README.txt
 
 
 # LOAD SNPS (Done; March 3, 2005; Heather)
 
 
   # directory structure
   ssh hgwdev
   cd /cluster/bluearc/snp
   mkdir mm5.heather
   cd mm5.heather
   mkdir det loc seq str xml
 
   # get data
   ftp ftp.ncbi.nih.gov
   cd snp/mouse/XML
   prompt
   mget ds_ch*.xml.gz
 
   # make sure script is current (should add makefile so general build does this)
   cp -f /cluster/home/heather/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts
 
   # build jobList for parsing
   touch jobList
   foreach file ( ds_ch*.xml.gz )
     set out = $file:t:r
     echo /cluster/bin/scripts/parseDbSnpXML $file /cluster/bluearc/snp/mm5.heather $out.contig >> jobList
   end
 
   # do the parsing
   ssh kk
   cd /cluster/bluearc/snp/mm5.heather
   para create jobList
   para try
   para check
   para push
 
   # output goes to det, loc, seq, str and xml directories
 
   # concatenate details
   ssh hgwdev
   zcat det/ds_ch*.xml.contig.det.gz > in.bed
 
   # couldn't find contig-based lift file from mm5
   # generate from ctgPos
   echo "select chromStart, chrom, contig, size, chrom from ctgPos;" > ctgPos.sql
   hgsql mm5 < ctgPos.sql > ctgPos.out
   # edit ctgPos.out to put in proper format -- next time write script for this
 
   # lift
   # expect warnings from non-reference assemblies (limited to first 10)
   liftUp out.bed ctgPos.out warn in.bed
 
   # load (exception column will be empty for all rows)
   hgLoadBed mm5 snp out.bed -sqlTable=/cluster/home/heather/kent/src/hg/lib/snp.sql
 
   # generate exceptions 1-20; drop 7 and 9 as they will be changing
   cd /usr/local/apache/htdocs/qa/test-results/snpException
   mkdir mm5
   cd mm5
   snpException mm5 0 mm5snpException
 
   # Invariant 1 has 0 exceptions, written to this file: mm5snpException.01.bed
   # Invariant 2 has 0 exceptions, written to this file: mm5snpException.02.bed
   # Invariant 3 has 0 exceptions, written to this file: mm5snpException.03.bed
   # Invariant 4 has 0 exceptions, written to this file: mm5snpException.04.bed
   # Invariant 5 has 0 exceptions, written to this file: mm5snpException.05.bed
   # Invariant 6 has 3 exceptions, written to this file: mm5snpException.06.bed
   # Invariant 7 has 1 exceptions, written to this file: mm5snpException.07.bed
   # Invariant 8 has 0 exceptions, written to this file: mm5snpException.08.bed
   # Invariant 9 has 22 exceptions, written to this file: mm5snpException.09.bed
   # Invariant 10 has 0 exceptions, written to this file: mm5snpException.10.bed
   # Invariant 11 has 0 exceptions, written to this file: mm5snpException.11.bed
   # Invariant 12 has 0 exceptions, written to this file: mm5snpException.12.bed
   # Invariant 13 has 0 exceptions, written to this file: mm5snpException.13.bed
   # Invariant 14 has 0 exceptions, written to this file: mm5snpException.14.bed
   # Invariant 15 has 0 exceptions, written to this file: mm5snpException.15.bed
   # Invariant 16 has 0 exceptions, written to this file: mm5snpException.16.bed
   # Invariant 17 has 0 exceptions, written to this file: mm5snpException.17.bed
   # Invariant 18 has 3634 exceptions, written to this file: mm5snpException.18.bed
   # Invariant 19 has 0 exceptions, written to this file: mm5snpException.19.bed
   # Invariant 20 has 0 exceptions, written to this file: mm5snpException.20.bed
   # Invariant 21 has no query string
   # Invariant 22 has no query string
   # Invariant 23 has no query string
   # Invariant 24 has no query string
 
   mv mm5snpException.07.bed mm5snpException.07.bed.notused
   mv mm5snpException.09.bed mm5snpException.09.bed.notused
 
 
   # snpValid
   cd /cluster/bluearc/snp/mm5.heather/seq
   nice snpValid mm5 . > & snpValid.out &
   tail -20 snpValid.out
 
   # Grand Totals:
   # matches: 494545
   # mismatches: 246 (exceptionId #22)
   # missing from flanks: 0 (exceptionId #23)
   # rev compl matches: 56285
   # not rptd strand : 1 (exceptionId #24)
   # assembly = -: 0
   # nib in gap : 0 (must be 0)
   # Total rows in snp: 494791
   # no dna found for : 0
   # Total goodExact: 493886
   # Total  badExact: 534 (exceptionId #21)
 
   # copy 21-24 exceptions to location of 1-20
   cp *bed /usr/local/apache/htdocs/qa/test-results/snpException/mm5
 
   # add exception data to snp table
   cp ../build124/updateExceptionList.pl .
 
   # Build exceptionList.txt from the exception bed files: start at line 3 of
   # each file, keep only lines matching /rs/, and print columns 4, 2 and 5
   # tab-separated before sorting.
   # "tail -n +3" replaces the obsolete "tail +3" spelling, which newer tail
   # implementations may interpret as a file named "+3".
   tail -n +3 mm5snpException.* | awk '/rs/ {printf "%s\t%d\t%d\n",$4,$2,$5}' | sort -k1,2n > exceptionList.txt
   updateExceptionList.pl < exceptionList.txt > updateExceptionList.sql
   hgsql mm5 < updateExceptionList.sql
 
 # HUMAN BLASTP FOR GENE SORTER (RE-DONE 7/28/05 Fan)
     # Make human ortholog column using blastp on human known genes.
     # First make human protein database and copy it to iscratch/i
     # if it doesn't exist already:
 # NOTE: THE SECTION BELOW WAS ALREADY DONE.
     cd /cluster/data/hg17/bed/blastp
     pepPredToFa hg17 knownGenePep known.faa
     formatdb -i known.faa -t known -n known
 
     ssh kkr1u00
     if (-e /iscratch/i/hg17/blastp) then
       rm -r /iscratch/i/hg17/blastp
     endif
     mkdir -p /iscratch/i/hg17/blastp
     cp /cluster/data/hg17/bed/blastp/known.p?? /iscratch/i/hg17/blastp
     iSync
 
 # THE SECTION ABOVE WAS ALREADY DONE PREVIOUSLY.
 
     # Make parasol run directory 
     ssh kk
     mkdir -p /cluster/data/mm5/bed/blastp/hg17/run/out
     cd /cluster/data/mm5/bed/blastp/hg17/run
     # Make blast script
     cat > blastSome <<end
 #!/bin/csh
 setenv BLASTMAT /iscratch/i/blast/data
 /iscratch/i/blast/blastall -p blastp -d /iscratch/i/hg17/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
 end
     chmod a+x blastSome
     # Make gensub2 file
     cat > gsub <<end
 #LOOP
 blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
 #ENDLOOP
 end
     # Create parasol batch
     ls -1S /cluster/data/mm5/bed/geneSorter/blastp/split >split.lst
     #EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
     gensub2 split.lst single gsub spec
     para create spec
     para try, check, push, check, ...
 Completed: 7739 of 7739 jobs
 CPU time in finished jobs:     113019s    1883.65m    31.39h    1.31d  0.004 y
 IO & Wait Time:                 22145s     369.08m     6.15h    0.26d  0.001 y
 Average job time:                  17s       0.29m     0.00h    0.00d
 Longest running job:                0s       0.00m     0.00h    0.00d
 Longest finished job:             124s       2.07m     0.03h    0.00d
 Submission to last job:           495s       8.25m     0.14h    0.01d
     # Load into database.  
     ssh hgwdev
     cd /cluster/data/mm5/bed/blastp/hg17/run/out
     hgLoadBlastTab mm5 hgBlastTab -maxPer=1 *.tab
 
 # KNOWN GENES 
 # This was built using ~/kent/src/hg/protein/KGprocess.sh
 # and it was not documented. 
 
 # CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
 # This depends on the go and uniProt databases as well as 
 # the kgAlias and kgProtAlias tables.  The hgKgGetText takes
 # about 5 minutes when the database is not too busy.  The rest
 # is real quick.
      ssh hgwdev
      cd /cluster/data/mm5/bed/
      mkdir -p kgMm5/index
      cd kgMm5/index
      hgKgGetText mm5 knownGene.text
      ixIxx knownGene.text knownGene.ix knownGene.ixx
      ln -s /cluster/data/mm5/bed/kgMm5/index/knownGene.ix /gbdb/mm5/knownGene.ix
      ln -s /cluster/data/mm5/bed/kgMm5/index/knownGene.ixx /gbdb/mm5/knownGene.ixx
 
 # RE-BUILD cgapAlias TABLE 
 # ORIGINALLY TABLE WAS BUILT BY THE KNOWN GENES PROCESS
 # cgapAlias table has replicate rows so remove (DONE, 2005-07-26, hartera)
 # RELOADED cgapAlias AGAIN AS TOO MANY ROWS REMOVED BEFORE (hartera, 2005-10-06)
 
     ssh hgwdev
     cd /cluster/store6/kgDB/bed/kgMm5B
     # DO TABLE RELOAD AGAIN AS sort -nu REMOVES MORE ROWS THAN sort -u
     # OR sort -n | uniq.
     # USE sort -n then uniq TO SORT ON THE IDs AND THEN UNIQ 
     # (hartera, 2005-10-06)
 
     sort -n cgapAlias.tab | uniq > cgapAliasSorted.tab
     hgsql mm5 -e "drop table cgapAlias"
     hgsql mm5 < ~/kent/src/hg/lib/cgapAlias.sql
     hgsql mm5 -e 'load data local infile "cgapAliasSorted.tab" \
           into table cgapAlias'
 
 # Create table that maps between known genes and visiGene database (DONE 2005-10-10 galt)
     knownToVisiGene mm5
     #Made hashes of image: geneImageHash 2117, locusLinkImageHash 780, refSeqImageHash 780, 
     #genbankImageHash 1301
     #knownToLocusLink 30303, knownToRefSeq 30291, knownToGene 266841
 
 # RIKEN CAGE STUFF (DONE 11-16-2005 Andy)
     # Make download area.
     ssh hgwdev
     cd /cluster/data/mm5/bed
     mkdir rikenCageCtss
     cd rikenCageCtss/
     wget -r http://fantom31p.gsc.riken.jp/cage_analysis/export/mm5/
     # stupid thing didn't work.  Tried tinkering with wget almost every way possible.
     # Finally just did it the hard way.
     # Fetch the directory index to stdout and keep only the href targets that
     # match .sql or .bz2, one name per line, in files.lst.
     # The trailing backslash is required: without it the pipeline ends after
     # wget and the line starting with "|" is a syntax error.
     wget -O /dev/stdout http://fantom31p.gsc.riken.jp/cage_analysis/export/mm5/ 2> /dev/null \
           | egrep ".sql|.bz2" | grep href | sed 's/^.*href=\"//;s/\".*$//' > files.lst
     rm -rf fantom*
     for f in `cat files.lst`; do
        wget http://fantom31p.gsc.riken.jp/cage_analysis/export/mm5/$f; 
     done
     bunzip2 *.bz2
 
     # Make the simple table of the CAGE-related TSSs.
     awk 'BEGIN{FS="\t"};{printf("%s\t%s\t%s\t%s\t%s\t1000\t%s\n",$9,$4,$7,$8,$1,($6 == "F") ? "+" : "-")}' \
        tss_summary.tsv | grep "^CAGE" | cut -f2- > basicCAGE.bed 
     # Make CAGE wiggle tracks for plus and minus strands
     awk 'BEGIN{FS="\t"}; {if ($4=="F") printf("%s\t%s\t%d\t%s\n", $2, $5, $5+1, $6)}' \
         ctss_summary.tsv | wigEncode stdin ctssForward.wig ctssForward.wib
     awk 'BEGIN{FS="\t"}; {if ($4=="R") printf("%s\t%s\t%d\t%s\n", $2, $5-1, $5, $6)}' \
         ctss_summary.tsv | wigEncode stdin ctssReverse.wig ctssReverse.wib
     mkdir wiggle
     mv ctss*.wi{g,b} wiggle/
  
     # Load stuff up:
     # (this wiggle load is undone further down, where the data is reloaded
     # as bedGraph tables instead)
     hgLoadBed mm5 rikenCageTc basicCAGE.bed
     # NOTE(review): the link targets name bed/rikenCAGE, but this section
     # works in /cluster/data/mm5/bed/rikenCageCtss -- confirm which
     # directory actually held wiggle/.
     ln -s /cluster/data/mm5/bed/rikenCAGE/wiggle /gbdb/mm5/wib/ctssForward
     ln -s /cluster/data/mm5/bed/rikenCAGE/wiggle /gbdb/mm5/wib/ctssReverse
     # NOTE(review): the .wig files were moved into wiggle/ just above, so
     # the hgLoadWiggle commands presumably ran from inside that directory
     # (an unrecorded "cd wiggle") -- confirm.
     hgLoadWiggle mm5 ctssForward ctssForward.wig
     hgLoadWiggle mm5 ctssReverse ctssReverse.wig    
 
     # OK make them bedGraphs instead.
     cd ../
     rm -rf wiggle/
     rm /gbdb/mm5/wib/ctss*
     hgsql mm5 -e 'drop table ctssForward'
     hgsql mm5 -e 'drop table ctssReverse'
     awk 'BEGIN{FS="\t"}; {if ($4=="F") printf("%s\t%s\t%d\t%s\n", $2, $5, $5+1, $6)}' \
        ctss_summary.tsv | hgLoadBed -strict -bedGraph=4 mm5 rikenCageCtssPlus stdin
     awk 'BEGIN{FS="\t"}; {if ($4=="R") printf("%s\t%s\t%d\t%s\n", $2, $5-1, $5, $6)}' \
        ctss_summary.tsv | hgLoadBed -strict -bedGraph=4 mm5 rikenCageCtssMinus stdin
 
     # track html:
     cp rikenCageCtss.html ~/kent/src/hg/makeDb/trackDb/mouse/
     # trackDb:
 track rikenCageTc
 shortLabel Riken CAGE TC
 longLabel Riken CAGE - Associated Transcript Clusters
 group genes
 priority 47.5
 visibility hide
 type bed 6 .
 
 track rikenCageCtss
 compositeTrack on
 shortLabel Riken CAGE
 longLabel Riken CAGE - Predicted Gene Start Sites
 group genes
 priority 47.51
 visibility hide
 type bedGraph 4
 maxHeightPixels 128:16:16
 minLimit 1
 maxLimit 4316
 viewLimits 1.0:10.0
 windowingFunction mean
 autoScale Off
 origAssembly hg16
 
     track rikenCageCtssPlus
     subTrack rikenCageCtss
     shortLabel Riken CAGE +
     longLabel Riken CAGE Plus Strand - Predicted Gene Start Sites
     priority 1
     color 109,51,43
 
     track rikenCageCtssMinus
     subTrack rikenCageCtss
     shortLabel Riken CAGE -
     longLabel Riken CAGE Minus Strand - Predicted Gene Start Sites
     priority 2
     color 43,51,109
 
 # MYTOUCH FIX - jen - 2006-01-24
   sudo mytouch mm5 geneidPep 0408071900.00
   sudo mytouch mm5 genscanPep 0501071300.00
   sudo mytouch mm5 superfamily 0503011100.00
   sudo mytouch mm5 ensGtp 0503011100.00
   sudo mytouch mm5 knownToEnsembl 0503011100.00
   sudo mytouch mm5 sfDescription 0503011100.00
 
 ############################################################################
 #	Mm7 to Mm5 liftOver creation (DONE - 2006-02-22 - 2006-02-24 - Hiram)
 #	instructions lifted from Andy's sequence in makeMm7.doc
 ######## LIFTOVER PREPARATION
     # Split up mm5
     ssh kkr1u00
     cd /iscratch/i/mm5
     mkdir liftSplits
     mkdir liftSplits/split
     mkdir liftSplits/lift
     for fa in /cluster/data/mm5/?/*.fa /cluster/data/mm5/??/*.fa
     do
       c=`basename $fa .fa`
       echo $c
       faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 \
 	liftSplits/split/$c
     done
     mkdir biggerSplits
     mkdir biggerSplits/split
     cd biggerSplits/
     ln -s ../liftSplits/lift
     cd split/
     ln -s ../../liftSplits/split/* .
     faSplit sequence chr1.fa 5 chr1_
     faSplit sequence chrX.fa 10 chrX_
     rm chr{1,X}.fa
     for R in 2 3 4 5 6 7 8
     do
 	rsync -a --progress /iscratch/i/mm5/ kkr${R}u00:/iscratch/i/mm5/
     done
 
     ######## LIFTOVER BLATING    
     # MM7
     ssh kk
     cd /cluster/data/mm7
     /cluster/bin/scripts/makeLoChain-align mm7 /scratch/hg/mm7/nib mm5 \
 	/iscratch/i/mm5/biggerSplits/split
     cd bed/blat.mm5.2006-02-22/run
 
     #	target is Mm7
     #	query is Mm5
     # blat.csh: per-job wrapper written into the run directory.
     # Arguments: $1 target file, $2 query file, $3 output psl path,
     # $4 chain name (used only to build the temporary directory name).
     # It runs blat into a job-specific directory under /scratch/tmp,
     # copies the result to the requested output path (creating its parent
     # directory first), then removes the temporary file and, if empty,
     # the temporary directory.
     # NOTE(review): the -ooc file is the mm5 one although the target here
     # is mm7 -- confirm this was intended.
     cat << '_EOF_' > blat.csh
 #!/bin/csh -fe
 
 set target=$1
 set query=$2
 set output=$3
 set chain=$4
 
 set tPart=$target:t:r
 set qPart=$query:t:r
 
 set tmpDir=/scratch/tmp/${chain}.${tPart}_${qPart}
 set tmpOutput=$tmpDir/$output:t
 
 mkdir -p $tmpDir
 sleep 2
 /cluster/bin/$MACHTYPE/blat $target $query $tmpOutput \
         -tileSize=11 -minScore=100 -minIdentity=98 -fastMap \
 	-ooc=/iscratch/i/mm5/11.ooc
 mkdir -p `dirname $output`
 cp $tmpOutput $output
 rm $tmpOutput
 rmdir --ignore-fail-on-non-empty $tmpDir
 '_EOF_'
     #	happy emacs
     chmod +x blat.csh
 
     sed 's#^blat#./blat.csh#; s/\}.*$/}/; s/$/ mm7ToMm5/' spec > jobList
 
     para create jobList
     para -maxNode=200 -priority=25 push
     para time
 # Completed: 2451 of 2451 jobs
 # CPU time in finished jobs:    1266001s   21100.02m   351.67h   14.65d  0.040 y
 # IO & Wait Time:                 13972s     232.87m     3.88h    0.16d  0.000 y
 # Average job time:                 522s       8.70m     0.15h    0.01d
 # Longest finished job:            6769s     112.82m     1.88h    0.08d
 # Submission to last job:         26506s     441.77m     7.36h    0.31d
 
     ######## LIFTOVER CHAINING
     # LIFTING
     ssh kki
     cd /cluster/data/mm7/bed/blat.mm5.2006-02-22
     # mm5SplitLift.sh: for chr1 and chrX (the two query chromosomes that
     # were cut into pieces with "faSplit sequence" when biggerSplits was
     # made), merge the per-piece psl files of each target into a single
     # <target>_<chrom>.psl, put the psl header file back on top of each,
     # then liftUp the psl files of every mm5 chromosome listed in
     # chrom.sizes into ../psl/<chrom>.psl.
     cat << '_EOF_' > mm5SplitLift.sh
 #!/bin/bash
 for C in chr1 chrX
 do
     echo joining $C
     for P in `ls *_${C}_[0-9]*.psl | sed -e "s/_chr.*//" | sort -u`
     do
 	echo "${P}_${C}.psl"
 	# Skip the first five lines of every piece (presumably the psl
 	# header, which is re-added below).  The glob has to stay outside
 	# the double quotes; quoted, it is never expanded and tail is
 	# handed one literal, non-existent file name.
 	tail --lines=+6 -q "${P}_${C}"_[0-9]*.psl  > "${P}_${C}.psl"
    done
    for f in *_${C}.psl; do
       cat /san/sanvol1/scratch/andy/psl.header $f > tmp
       mv tmp $f
    done
 done
 echo Lifting...
 for C in `awk '{print $1}' /cluster/data/mm5/chrom.sizes`; do
    echo "lifting $C ... "
    liftUp -pslQ ../psl/${C}.psl \
 	/iscratch/i/mm5/biggerSplits/lift/${C}.lft error chr*_${C}.psl
    echo done $C
 done    
 '_EOF_'
     #	happy emacs
     chmod +x mm5SplitLift.sh
 
     # mm5ChainMergeSplit.sh: copies chainRaw/ to /scratch/andy/mm5Lifts,
     # runs chainMergeSort | chainSplit there to produce a chain/ directory,
     # copies chain/ back to the directory the script was started from
     # (`dirs +1` after the pushd) and removes the scratch copies.
     # NOTE(review): this script is not invoked anywhere in the steps that
     # follow; the merge/split is run directly on kkstore02 further down.
     cat << "EOF" > mm5ChainMergeSplit.sh
 #!/bin/bash
 cp -r chainRaw/ /scratch/andy/mm5Lifts
 pushd /scratch/andy/mm5Lifts
 mkdir chain
 /cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
 cp -r chain `dirs +1`
 rm -rf chain chainRaw
 EOF
     chmod +x mm5ChainMergeSplit.sh
 
     cd /cluster/data/mm7/bed/blat.mm5.2006-02-22/raw
     ../mm5SplitLift.sh
 
     cd ../    
     mkdir chainRun chainRaw
     cd chainRun
     cat << '_EOF_' > template
 #LOOP
 axtChain -linearGap=medium -verbose=0 -psl $(path1) /scratch/hg/mm7/nib /cluster/data/mm5/nib {check out line+ ../chainRaw/$(root1).chain}
 #ENDLOOP
 '_EOF_'
     ls -1S ../psl/*.psl > in.lst
     gensub2 in.lst single template jobList
     para create jobList
     para push
     para time
 # Completed: 43 of 43 jobs
 # CPU time in finished jobs:       7259s     120.98m     2.02h    0.08d  0.000 y
 # IO & Wait Time:                  1086s      18.10m     0.30h    0.01d  0.000 y
 # Average job time:                 194s       3.23m     0.05h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            1088s      18.13m     0.30h    0.01d
 # Submission to last job:          2289s      38.15m     0.64h    0.03d
 
     ssh kkstore02
     cd /cluster/data/mm7/bed/blat.mm5.2006-02-22
     mkdir chain
     time chainMergeSort chainRaw/* | chainSplit chain stdin
     #	real    29m42.365s
 
     mkdir net over
     cd chain
     for c in *.chain
     do
        echo ${c%.chain}; 
        nice chainNet $c /cluster/data/mm7/chrom.sizes \
         /cluster/data/mm5/chrom.sizes ../net/${c%.chain}.net /dev/null
        echo done $c
     done
     #	real    15m33.593s
     for chain in *.chain
     do 
        c=${chain%.chain}
        nice netChainSubset ../net/$c.net $chain ../over/$c.over 
     done
     #	real    10m48.898s
 
     ########## FINISHING
     ssh kkstore02
     cd /cluster/data/mm7/bed/blat.mm5.2006-02-22/over
     cat * > ../mm7ToMm5.over.chain
     cd ..
     gzip mm7ToMm5.over.chain
     rm -rf psl net chain chainRaw over
     ssh hgwdev
     cd /cluster/data/mm7/bed
     ln -s blat.mm5.2006-02-22 blat.mm5
     ln -s `pwd`/blat.mm5/mm7ToMm5.over.chain.gz liftOver/mm7ToMm5.over.chain.gz
     ln -s `pwd`/liftOver/mm7ToMm5.over.chain.gz \
 	/gbdb/mm7/liftOver/mm7ToMm5.over.chain.gz
     ln -s `pwd`/liftOver/mm7ToMm5.over.chain.gz \
 	/usr/local/apache/htdocs/goldenPath/mm7/liftOver/mm7ToMm5.over.chain.gz
     hgAddLiftOverChain mm7 mm5 /gbdb/mm7/liftOver/mm7ToMm5.over.chain.gz
 
 ############################################################################
 # UPDATED mm5.knownToVisiGene (2006-03-21 galt)
 ssh hgwdev
 knownToVisiGene mm5
 
 #######################################################################
 ## LIFTOVER To Mm8 (DONE - 2006-05-15 - 2006-06-05 - Hiram)
     ssh kkr1u00
     #	do not need to run this command since /cluster/data/mm8/split10k
     #	already exists from previous liftOver jobs (mm7 to mm8)
     # $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
     #	mm8 /cluster/data/mm8/nib
     # as it says, DO THIS NEXT:
     ssh kk
     #	if bin/scripts is not in your PATH, add it for this command:
     PATH=$PATH:/cluster/bin/scripts \
     $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
 	mm5 /cluster/data/mm5/nib mm8 /iscratch/i/mm8/split10k \
 	/cluster/data/mm8/11.ooc
     # as it says, DO THIS NEXT:
     cd /cluster/data/mm5/bed/blat.mm8.2006-05-15/run
     para try, check, push, check, ...
 # Completed: 1462 of 1462 jobs
 # CPU time in finished jobs:    3990246s   66504.10m  1108.40h   46.18d  0.127 y
 # IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
 # Average job time:                2371s      39.51m     0.66h    0.03d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:           24307s     405.12m     6.75h    0.28d
 # Submission to last job:       1474509s   24575.15m   409.59h   17.07d
 
     # as it says, DO THIS NEXT:
     #	this does the liftUp and makes the psl files
     #	kkr1u00 is down these days
     ssh kkr3u00
     cd /cluster/data/mm5/bed
     ln -s blat.mm8.2006-05-15 blat.mm8
     #	edit this script to allow use on kkr3u00
     time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh mm5 mm8
     #	real    16m5.091s
     # as it says, DO THIS NEXT:
     #	this prepares the batch to run for the chaining
     ssh kki
     time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
 	mm5 /cluster/data/mm5/nib mm8 /cluster/data/mm8/nib
 
     # as it says, DO THIS NEXT:
     #	running the chain batch
     cd /cluster/data/mm5/bed/blat.mm8.2006-05-15/chainRun
     para try, check, push, check, ...
 # Completed: 34 of 34 jobs
 # CPU time in finished jobs:       6893s     114.88m     1.91h    0.08d  0.000 y
 # IO & Wait Time:                  7183s     119.72m     2.00h    0.08d  0.000 y
 # Average job time:                 414s       6.90m     0.12h    0.00d
 # Longest finished job:            1130s      18.83m     0.31h    0.01d
 # Submission to last job:          1130s      18.83m     0.31h    0.01d
 
     ssh kkstore03
     $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh mm5 mm8
 
     #	Created /cluster/data/mm5/bed/liftOver/mm5ToMm8.over.chain.gz
     # as it says, DO THIS NEXT:
     ssh hgwdev
     $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh mm5 mm8
     #	It says this:
     # 	Now, add link for
     #	/usr/local/apache/htdocs/goldenPath/mm5/liftOver/mm5ToMm8.over.chain
     #	to hgLiftOver
     #	But I believe that link was already done:
     cd /gbdb/mm5/liftOver
     ls -og mm5ToMm8*
     #	lrwxrwxrwx  1 53 Jun  5 16:10 mm5ToMm8.over.chain.gz ->
     #		/cluster/data/mm5/bed/liftOver/mm5ToMm8.over.chain.gz
 
 
 #####################################################################
 # SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie)
     # File emailed from Xinwei She <xws@u.washington.edu>
     mkdir /cluster/data/mm5/bed/genomicSuperDups
     cd /cluster/data/mm5/bed/genomicSuperDups
     sed -e 's/\t_\t/\t-\t/' mm5_genomicSuperDup.tab \
     | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
     | hgLoadBed mm5 genomicSuperDups stdin \
       -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
 
 
 ##########################################################################
 # GenBank gbMiscDiff table (markd 2007-01-10)
 # Supports `NCBI Clone Validation' section of mgcGenes details page
 
    # GenBank release 157.0 now contains misc_diff fields for MGC clones;
    # reloading the mRNAs causes the gbMiscDiff table to be created.
    ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna mm5