src/hg/makeDb/doc/canFam2.txt 1.22
1.22 2009/08/10 16:38:13 hartera
Documented production of zebrafish danRer5 chains, nets and downloads.
Index: src/hg/makeDb/doc/canFam2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/canFam2.txt,v
retrieving revision 1.21
retrieving revision 1.22
diff -b -B -U 1000000 -r1.21 -r1.22
--- src/hg/makeDb/doc/canFam2.txt 21 Jul 2009 21:01:41 -0000 1.21
+++ src/hg/makeDb/doc/canFam2.txt 10 Aug 2009 16:38:13 -0000 1.22
@@ -1,2912 +1,2956 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on the
# Dog (Canis familiaris) May 2005 release.
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+---------------------------------+
# | tableName | type |
# +-------------+---------------------------------+
# | refGene | genePred refPep refMrna |
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | nscanGene | genePred nscanPep |
# | genscan | genePred genscanPep |
# +-------------+---------------------------------+
# CREATE BUILD DIRECTORY (DONE 6/1/05 angie)
ssh kkstore01
mkdir /cluster/store9/canFam2
ln -s /cluster/store9/canFam2 /cluster/data/canFam2
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 6/1/05 angie)
mkdir /cluster/data/canFam2/M
cd /cluster/data/canFam2/M
# go to http://www.ncbi.nlm.nih.gov/ and search Nucleotide for
# "canis familiaris mitochondrion genome". That shows the gi number:
# 17737322
# Use that number in the entrez linking interface to get fasta:
wget -O chrM.fa \
'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=17737322&dopt=FASTA'
# Edit chrM.fa: make sure the long fancy header line says it's the
# Canis familiaris mitochondrion complete genome, and then replace the
# header line with just ">chrM".
# MAKE JKSTUFF AND BED DIRECTORIES (DONE 6/1/05 angie)
# This used to hold scripts -- better to keep them inline in the .doc
# so they're in CVS. Now it should just hold lift file(s) and
# temporary scripts made by copy-paste from this file.
mkdir /cluster/data/canFam2/jkStuff
# This is where most tracks will be built:
mkdir /cluster/data/canFam2/bed
# DOWNLOAD AGP, FASTA & QUAL (DONE 6/2/05 angie)
ssh kkstore01
mkdir /cluster/data/canFam2/broad
cd /cluster/data/canFam2/broad
ftp ftp.broad.mit.edu
prompt
bin
cd pub/assemblies/mammals/canFam2
mget contigs.bases.gz AAEX02.full_AGP Dog2.0.agp supercontigs
mget assembly.agp assembly.format contigs.quals.gz supercontigs.summary
bye
# Jean Chang is going to remove AAEX02.full_AGP to avoid confusion,
# so make sure we can tell people how to quickly regenerate it from
# Dog2.0.agp:
perl -wpe 'if (/contig_(\d+)/) { \
$a = sprintf "AAEX020%05d", $1+1; s/contig_\d+/$a/; }' \
Dog2.0.agp > /tmp/1.agp
diff /tmp/1.agp AAEX02.full_AGP | wc -l
#0
# Meanwhile, the AGP has chr01, chr02 instead of chr1, chr2... yecch.
# Substitute those out:
sed -e 's/^chr0/chr/' Dog2.0.agp > UCSC_Dog2.0.agp
# BUILD CHROM FA (DONE 6/28/05 angie)
ssh kkstore01
cd /cluster/data/canFam2/broad
awk '{print $1;}' UCSC_Dog2.0.agp | uniq > ../chrom.lst
nice gunzip contigs.bases.gz
foreach chr (`cat ../chrom.lst`)
set c = `echo $chr | sed -e 's/chr//'`
mkdir ../$c
awk '$1 == "'$chr'" {print;}' UCSC_Dog2.0.agp > ../$c/$chr.agp
agpToFa -simpleMultiMixed ../$c/$chr.agp $chr ../$c/$chr.fa contigs.bases
end
faSize ../*/chr*.fa
#2531673953 bases (146677410 N's 2384996543 real 2384996543 upper 0 lower) in 41 sequences in 41 files
#Total size: mean 61748145.2 sd 25246010.8 min 16727 (chrM) max 126883977 (chrX) median 61280721
#N count: mean 3577497.8 sd 1401887.8
#U count: mean 58170647.4 sd 24664411.9
#L count: mean 0.0 sd 0.0
# checkAgpAndFa prints out way too much info -- keep the end/stderr only:
cd /cluster/data/canFam2
foreach agp (?{,?}/chr*.agp)
set fa = $agp:r.fa
echo checking consistency of $agp and $fa
checkAgpAndFa $agp $fa | tail -1
end
nice gzip broad/contigs.bases
echo "chrM" >> chrom.lst
# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 6/28/05 angie)
ssh kkstore01
cd /cluster/data/canFam2
foreach agp (?{,?}/chr*.agp)
set fa = $agp:r.fa
echo splitting $agp and $fa
cp -p $agp $agp.bak
cp -p $fa $fa.bak
nice splitFaIntoContigs $agp $fa . -nSize=5000000
end
# No _random's in this assembly, so no need to clean up after them.
# Make a "pseudo-contig" for processing chrM too:
mkdir M/chrM_1
sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
mkdir M/lift
echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
echo "chrM_1" > M/lift/ordered.lst
set msize = `faSize M/chrM.fa | awk '{print $1;}'`
echo "0\tM/chrM_1\t$msize\tchrM\t$msize" > M/lift/ordered.lft
foreach f ( ?{,?}/chr*.fa.bak )
nice faCmp $f $f:r
end
# MAKE LIFTALL.LFT (DONE 6/28/05 angie)
ssh kkstore01
cd /cluster/data/canFam2
cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft
# CREATING DATABASE (DONE 6/28/05 angie)
ssh hgwdev
hgsql '' -e 'create database canFam2'
# Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
#/dev/sdc1 1.8T 940G 720G 57% /var/lib/mysql
# CREATING GRP TABLE FOR TRACK GROUPING (DONE 6/28/05 angie)
ssh hgwdev
hgsql canFam2 -e \
"create table grp (PRIMARY KEY(NAME)) select * from hg17.grp"
# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) 2BIT (DONE 6/28/05 angie)
# Make .2bit, unmasked until RepeatMasker and TRF steps are done.
# Do this now so we can load up RepeatMasker and run featureBits;
# can also load up other tables that don't depend on masking.
ssh kkstore01
cd /cluster/data/canFam2
nice faToTwoBit ?{,?}/chr*.fa canFam2.2bit
mkdir bed/chromInfo
twoBitInfo canFam2.2bit stdout \
| awk '{print $1 "\t" $2 "\t/gbdb/canFam2/canFam2.2bit";}' \
> bed/chromInfo/chromInfo.tab
# Make symbolic links from /gbdb/canFam2/ to the real .2bit.
ssh hgwdev
mkdir /gbdb/canFam2
ln -s /cluster/data/canFam2/canFam2.2bit /gbdb/canFam2/
# Load /gbdb/canFam2/canFam2.2bit paths into database and save size info.
cd /cluster/data/canFam2
hgsql canFam2 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql canFam2 -e 'load data local infile \
"/cluster/data/canFam2/bed/chromInfo/chromInfo.tab" \
into table chromInfo;'
echo "select chrom,size from chromInfo" | hgsql -N canFam2 > chrom.sizes
# take a look at chrom.sizes, should be 41 lines
wc chrom.sizes
# 41 82 603 chrom.sizes
# GOLD AND GAP TRACKS (DONE 6/28/05 angie)
ssh hgwdev
cd /cluster/data/canFam2
hgGoldGapGl -noGl -chromLst=chrom.lst canFam2 /cluster/data/canFam2 .
# featureBits fails if there's no chrM_gap, so make one:
# echo "create table chrM_gap like chr1_gap" | hgsql canFam2
# oops, that won't work until v4.1, so do this for the time being:
hgsql canFam2 -e "create table chrM_gap select * from chr1_gap where 0=1"
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CANFAM2 (DONE 8/1/05 angie)
ssh hgwdev
cd $HOME/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add canFam2 in all the right places and do
make update
cvs commit makefile
mkdir -p dog/canFam2
cvs add dog dog/canFam2
cvs ci -m "trackDb dir for dog genome(s)" dog/canFam2
# Do this in a clean (up-to-date, no edits) tree:
make alpha
# Add dbDb entry (not a new organism so defaultDb and genomeClade already
# have entries):
hgsql -h genome-testdb hgcentraltest \
-e 'insert into dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
values("canFam2", "May 2005", \
"/gbdb/canFam2/nib", "Dog", "chr14:11072309-11078928", 1, \
18, "Dog", "Canis familiaris", \
"/gbdb/canFam2/html/description.html", 0, 0, \
"Broad Institute v. 2.0");'
# REPEAT MASKING (DONE braney/angie 7-10-05)
#- Split contigs into 500kb chunks, at gaps if possible:
ssh kkstore01
cd /cluster/data/canFam2
foreach chr (`cat chrom.lst`)
set c = `echo $chr | sed -e 's/chr//'`
foreach d ($c/chr${c}*_?{,?})
cd $d
echo "splitting $d"
set contig = $d:t
faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
cd ../..
end
end
#- Make the run directory and job list:
cd /cluster/data/canFam2
cat << '_EOF_' > jkStuff/RMDog
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/canFam2/$2
/bin/cp $2 /tmp/canFam2/$2/
cd /tmp/canFam2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec dog $2
popd
/bin/cp /tmp/canFam2/$2/$2.out ./
if (-e /tmp/canFam2/$2/$2.align) /bin/cp /tmp/canFam2/$2/$2.align ./
if (-e /tmp/canFam2/$2/$2.tbl) /bin/cp /tmp/canFam2/$2/$2.tbl ./
if (-e /tmp/canFam2/$2/$2.cat) /bin/cp /tmp/canFam2/$2/$2.cat ./
/bin/rm -fr /tmp/canFam2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/canFam2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/canFam2
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMDog
mkdir RMRun
cp /dev/null RMRun/RMJobs
foreach chr (`cat chrom.lst`)
set c = `echo $chr | sed -e 's/chr//'`
foreach d ($c/chr${c}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/canFam2/jkStuff/RMDog \
/cluster/data/canFam2/$d $f \
'{'check out line+ /cluster/data/canFam2/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
end
#- Do the run
ssh kk
cd /cluster/data/canFam2/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
# Completed: 6149 of 6149 jobs
# CPU time in finished jobs: 32138805s 535646.75m 8927.45h 371.98d 1.019 y
# IO & Wait Time: 346449s 5774.15m 96.24h 4.01d 0.011 y
# Average job time: 5283s 88.05m 1.47h 0.06d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 8642s 144.03m 2.40h 0.10d
# Submission to last job: 147094s 2451.57m 40.86h 1.70d
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kkstore01
cd /cluster/data/canFam2
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out > /dev/null
end
#- Lift pseudo-contigs to chromosome level
foreach c (`cat chrom.lst`)
echo lifting $c
set dir=`echo $c | sed "s/chr//" `
cd $dir
liftUp $c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` > /dev/null
cd ..
end
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/canFam2
hgLoadOut canFam2 */chr*.fa.out
# VERIFY REPEATMASKER RESULTS (DONE 2005-07-11 braney)
# Eyeball some repeat annotations in the browser, compare to lib seqs.
# Run featureBits on canFam2 and on a comparable genome build, and compare:
ssh hgwdev
featureBits canFam2 rmsk
# 968054174 bases of 2384996543 (40.589%) in intersection
#canFam1 is
#896773874 bases of 2359845093 (38.001%) in intersection
# SIMPLE REPEATS (TRF) (DONE 2005-07-11 braney)
# PARTIALLY REDONE 2005-12-02 angie -- See partial redo notes below...
ssh kkstore01
mkdir /cluster/data/canFam2/bed/simpleRepeat
cd /cluster/data/canFam2/bed/simpleRepeat
mkdir trf
cp /dev/null jobs.csh
foreach d (/cluster/data/canFam2/*/chr*_?{,?})
set ctg = $d:t
foreach f ($d/${ctg}.fa)
set fout = $f:t:r.bed
echo $fout
echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" >> jobs.csh
end
end
csh -ef jobs.csh >&! jobs.log &
# check on this with
tail -f jobs.log
wc -l jobs.csh
ls -1 trf | wc -l
endsInLf trf/*
# When job is done do:
liftUp simpleRepeat.bed /cluster/data/canFam2/jkStuff/liftAll.lft warn trf/*.bed
# Load into the database:
ssh hgwdev
hgLoadBed canFam2 simpleRepeat \
/cluster/data/canFam2/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# load of simpleRepeat did not go as planned: 637318 record(s), 0 row(s) skipped, 1172 warning(s) loading bed.tab
featureBits canFam2 simpleRepeat
# 52855902 bases of 2384996543 (2.216%) in intersection
# canFam1 is
#36509895 bases of 2359845093 (1.547%) in intersection
# 2005-12-02 Error with simpleRepeats found during QA. 1622 entries have what
# appears to be a parsing error. The "sequence" value is a float number of
# format X.XX instead of an ATCG sequence. Lines removed from active tables.
# Copy of original data on hgwdev.canFam2.simpleRepeat_1202problem
# Partial redo 12/2/05 -- Jen found some lines with invalid values in
# the database. All were from trf/chr10_1.bed which I moved aside to
# trf/chr10_1.bed.bad. When I reran the chr10_1 job from jobs.csh,
# it produced a valid output file. So I did the rest of the steps and
# reloaded.
ssh kolossus
cd /cluster/data/canFam2/bed/simpleRepeat
mv trf/chr10_1.bed{,.bad}
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf /cluster/data/canFam2/10/chr10_1/chr10_1.fa /dev/null -bedAt=trf/chr10_1.bed -tempDir=/tmp
mv simpleRepeat.bed simpleRepeat.bed.bad
liftUp simpleRepeat.bed /cluster/data/canFam2/jkStuff/liftAll.lft warn trf/*.bed
# on hgwdev:
featureBits canFam2 simpleRepeat
#52885863 bases of 2384996543 (2.217%) in intersection
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
ssh hgwdev
cd /cluster/data/canFam2/bed
mkdir microsat
cd microsat
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
/cluster/bin/i386/hgLoadBed canFam2 microsat microsat.bed
# PROCESS SIMPLE REPEATS INTO MASK (DONE 2005-07-11 braney)
# 2005-12-02 angie: continuing redo of chr10_1, see below.
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kkstore01
cd /cluster/data/canFam2/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/chr*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/canFam2
mkdir bed/simpleRepeat/trfMaskChrom
foreach c (`cat chrom.lst`)
set dir=`echo $c | sed "s/chr//" `
if (-e $dir/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' $dir/lift/ordered.lst > $dir/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$dir.bed jkStuff/liftAll.lft warn `cat $dir/lift/oTrf.lst`
endif
end
# Here's the coverage for the filtered TRF:
ssh hgwdev
cat /cluster/data/canFam2/bed/simpleRepeat/trfMaskChrom/*.bed > /tmp/filtTrf.bed
featureBits canFam2 /tmp/filtTrf.bed
#23111877 bases of 2384996543 (0.969%) in intersection
# canFam1 was
#23017541 bases of 2359845093 (0.975%) in intersection
featureBits canFam2 /tmp/filtTrf.bed \!rmsk
# 1205611 bases of 2384996543 (0.051%) in intersection
#canFam1 was
#1275941 bases of 2359845093 (0.054%) in intersection
# 12/2/2005 -- since I regenerated trf/chr10_1.bed above, continue here...
ssh kolossus
cd /cluster/data/canFam2/bed/simpleRepeat
mv trfMask/chr10_1.bed{,.orig}
awk '{if ($5 <= 12) print;}' trf/chr10_1.bed > trfMask/chr10_1.bed
mv trfMaskChrom/chr10.bed{,.orig}
cd ../..
liftUp bed/simpleRepeat/trfMaskChrom/chr10.bed jkStuff/liftAll.lft warn `cat 10/lift/oTrf.lst`
ssh hgwdev
cat /cluster/data/canFam2/bed/simpleRepeat/trfMaskChrom/*.bed > /tmp/filtTrf.bed
featureBits canFam2 /tmp/filtTrf.bed
#23133898 bases of 2384996543 (0.970%) in intersection
featureBits canFam2 /tmp/filtTrf.bed \!rmsk
#1206965 bases of 2384996543 (0.051%) in intersection
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 2005-07-11 braney)
# REDONE 2005-12-02 angie (to pick up the chr10_1 simpleRepeat redo)
ssh kkstore01
cd /cluster/data/canFam2
# Soft-mask (lower-case) the contig and chr .fa's,
# then make hard-masked versions from the soft-masked.
set trfCtg=bed/simpleRepeat/trfMask
set trfChr=bed/simpleRepeat/trfMaskChrom
foreach f (*/chr*.fa)
echo "repeat- and trf-masking $f"
maskOutFa -soft $f $f.out $f
set chr = $f:t:r
maskOutFa -softAdd $f $trfChr/$chr.bed $f
echo "hard-masking $f"
maskOutFa $f hard $f.masked
end
# Tons of warnings like this, mostly for L1M*:
#WARNING: negative rEnd: -189 chrX:117586389-117586475 L1M3e
foreach c (`cat chrom.lst`)
set c=`echo $c | sed "s/chr//" `
echo "repeat- and trf-masking contigs of chr$c"
foreach d ($c/chr*_?{,?})
set ctg=$d:t
set f=$d/$ctg.fa
maskOutFa -soft $f $f.out $f
maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
maskOutFa $f hard $f.masked
end
end
#- Rebuild the nib files, using the soft masking in the fa:
foreach f (*/chr*.fa)
faToNib -softMask $f nib/$f:t:r.nib
end
# Make one big 2bit file as well, and make a link to it in
# /gbdb/canFam2/nib because hgBlat looks there:
faToTwoBit */chr*.fa canFam2.2bit
ssh hgwdev
ln -s /cluster/data/canFam2/canFam2.2bit /gbdb/canFam2/nib/
# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE 8/1/05 angie)
ssh hgwdev
mkdir /gbdb/canFam2/html
# Write ~/kent/src/hg/makeDb/trackDb/dog/canFam2/description.html
# with a description of the assembly and some sample position queries.
chmod a+r $HOME/kent/src/hg/makeDb/trackDb/dog/canFam2/description.html
# Check it in and copy (ideally using "make alpha" in trackDb) to
# /gbdb/canFam2/html
# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 8/1/05 angie)
# pitakluster (blastz etc):
ssh pk
mkdir /san/sanvol1/scratch/canFam2
rsync -av /cluster/data/canFam2/nib /san/sanvol1/scratch/canFam2/
mkdir /san/sanvol1/scratch/canFam2/rmsk
cp -p /cluster/data/canFam2/*/chr*.fa.out \
/san/sanvol1/scratch/canFam2/rmsk
# small cluster (chaining etc):
ssh kkr1u00
mkdir -p /iscratch/i/canFam2/nib
rsync -av /cluster/data/canFam2/nib /iscratch/i/canFam2/
iSync
# big cluster (genbank):
ssh kkstore01
mkdir /cluster/bluearc/scratch/hg/canFam2
rsync -av /cluster/data/canFam2/nib /cluster/bluearc/scratch/hg/canFam2/
# ask cluster-admin to rsync that to all big cluster nodes' /scratch/...
# MAKE LINEAGE-SPECIFIC REPEATS VS. HUMAN, MOUSE (DONE 8/1/05 angie)
ssh kolossus
cd /san/sanvol1/scratch/canFam2/rmsk
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Human in extra column 1, Mouse in extra column 2
foreach outfl ( *.out )
echo "$outfl"
/cluster/bluearc/RepeatMasker/DateRepeats \
${outfl} -query dog -comp human -comp mouse
end
# Now extract human (extra column 1), mouse (extra column 2).
cd ..
mkdir linSpecRep.notInHuman
mkdir linSpecRep.notInMouse
foreach f (rmsk/*.out_homo-sapiens_mus-musculus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInHuman/$base.out.spec
/cluster/bin/scripts/extractLinSpecReps 2 $f > \
linSpecRep.notInMouse/$base.out.spec
end
wc -l rmsk/*.out
#4533630 total
wc -l linSpecRep.notInHuman/*
#1542788 total
wc -l linSpecRep.notInMouse/*
#1546408 total
# Clean up.
rm rmsk/*.out_h*
# PRODUCING GENSCAN PREDICTIONS (DONE 8/11/05 angie)
ssh hgwdev
mkdir /cluster/data/canFam2/bed/genscan
cd /cluster/data/canFam2/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh kki
cd /cluster/data/canFam2/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
rm -f genome.list
touch genome.list
foreach f ( `ls -1S /cluster/data/canFam2/*/chr*_*/chr*_?{,?}.fa.masked` )
egrep '[ACGT]' $f > /dev/null
if ($status == 0) echo $f >> genome.list
end
wc -l genome.list
#495
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para make jobList
#Completed: 493 of 495 jobs
#Crashed: 2 jobs
#Average job time: 918s 15.30m 0.25h 0.01d
#Longest finished job: 30383s 506.38m 8.44h 0.35d
#Submission to last job: 132236s 2203.93m 36.73h 1.53d
# If there are crashes, diagnose with "para problems" and "para crashed".
# If a job crashes due to genscan running out of memory, re-run it
# manually with "-window=1200000" instead of "-window=2400000".
ssh kkr7u00
cd /cluster/data/canFam2/bed/genscan
/cluster/bin/x86_64/gsBig /cluster/data/canFam2/1/chr1_23/chr1_23.fa.masked gtf/chr1_23.fa.gtf -trans=pep/chr1_23.fa.pep -subopt=subopt/chr1_23.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
/cluster/bin/x86_64/gsBig /cluster/data/canFam2/27/chr27_1/chr27_1.fa.masked gtf/chr27_1.fa.gtf -trans=pep/chr27_1.fa.pep -subopt=subopt/chr27_1.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
ls -1 gtf | wc -l
# 495
endsInLf gtf/*
# Convert these to chromosome level files as so:
ssh kkstore01
cd /cluster/data/canFam2/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/canFam2/bed/genscan
ldHgGene -gtf canFam2 genscan genscan.gtf
hgPepPred canFam2 generic genscanPep genscan.pep
hgLoadBed canFam2 genscanSubopt genscanSubopt.bed
# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE 8/1/05 angie)
ssh kolossus
# numerator is canFam2 gapless bases as reported by featureBits,
# denominator is hg17 gapless bases as reported by featureBits,
# 1024 is threshold used for human -repMatch:
calc \( 2384996543 / 2867328468 \) \* 1024
# ( 2384996543 / 2867328468 ) * 1024 = 851.746316
# ==> use -repMatch=852 according to size scaled down from 1024 for human.
mkdir /cluster/bluearc/canFam2
mkdir /cluster/data/canFam2/bed/ooc
cd /cluster/data/canFam2/bed/ooc
ls -1 /cluster/data/canFam2/nib/chr*.nib > nib.lst
blat nib.lst /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/canFam2/11.ooc -repMatch=852
#Wrote 27388 overused 11-mers to /cluster/bluearc/canFam2/11.ooc
# AUTO UPDATE GENBANK MRNA RUN (DONE 8/8/05 angie)
# 2/22/06: added refGene
ssh hgwdev
# Update genbank config and source in CVS:
cd ~/kent/src/hg/makeDb/genbank
cvsup .
# Edit etc/genbank.conf and add these lines:
# canFam2 (dog)
canFam2.genome = /scratch/hg/canFam2/nib/chr*.nib
canFam2.lift = /cluster/data/canFam2/jkStuff/liftAll.lft
canFam2.refseq.mrna.native.load = no
canFam2.refseq.mrna.xeno.load = yes
canFam2.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005
canFam2.genbank.mrna.xeno.load = yes
canFam2.genbank.est.xeno.load = no
canFam2.downloadDir = canFam2
cvs ci etc/genbank.conf
# Edit src/align/gbBlat to add /cluster/bluearc/canFam2/11.ooc
cvs diff src/align/gbBlat
make
cvs ci src/align/gbBlat
# Install to /cluster/data/genbank:
make install-server
ssh eieio
cd /cluster/data/genbank
# This is an -initial run, (xeno) refseq only:
nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial canFam2 &
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam2
featureBits canFam2 xenoRefGene
#41278562 bases of 2384996543 (1.731%) in intersection
# Clean up:
rm -rf work/initial.canFam2
ssh eieio
cd /cluster/data/genbank
# This is an -initial run, mRNA only:
nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial canFam2 &
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam2
featureBits canFam2 mrna
#2061533 bases of 2384996543 (0.086%) in intersection
featureBits canFam2 xenoMrna
#63057460 bases of 2384996543 (2.644%) in intersection
# Clean up:
rm -rf work/initial.canFam2
ssh eieio
# -initial for ESTs:
nice bin/gbAlignStep -srcDb=genbank -type=est -initial canFam2 &
# Load results:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 canFam2 &
featureBits canFam2 intronEst
#16241242 bases of 2384996543 (0.681%) in intersection
featureBits canFam2 est
#41719045 bases of 2384996543 (1.749%) in intersection
# Clean up:
rm -rf work/initial.canFam2
# 2/22/06: add native refseq
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
# genbank.conf: canFam2.refseq.mrna.native.load = yes
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=refseq -type=mrna -orgCat=native -initial \
canFam2 &
tail -f var/build/logs/2006.02.22-21:03:48.canFam2.initalign.log
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad canFam2 &
tail -f var/dbload/hgwdev/logs/2006.02.22-21:19:53.dbload.log
featureBits canFam2 refGene
#1222436 bases of 2384996543 (0.051%) in intersection
# SWAP CHAINS FROM HG17, BUILD NETS ETC. (DONE 8/3/05 angie - REDONE 12/13/05 - REDONE 2/8/06)
# hg17-canFam2 was redone 12/12/05 (makeHg17.doc) so re-running this...
# and again 2/8/06, sheesh...
mkdir /cluster/data/canFam2/bed/blastz.hg17.swap
cd /cluster/data/canFam2/bed/blastz.hg17.swap
doBlastzChainNet.pl -swap /cluster/data/hg17/bed/blastz.canFam2/DEF \
>& do.log &
tail -f do.log
# Add {chain,net}Hg17 to trackDb.ra if necessary.
# RE-RUN NETTOAXT, AXTTOMAF FOR HG17 (DONE 10/31/05 angie)
# Obsoleted 12/13/05 by re-run of canFam2-hg17 above.
# Kate fixed netToAxt to avoid duplicated blocks, which is important
# for input to multiz. Regenerate maf using commands from sub-script
# netChains.csh generated by doBlastzChainNet.pl above.
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.hg17.swap/axtChain
netSplit canFam2.hg17.net.gz net
chainSplit chain canFam2.hg17.all.chain.gz
cd ..
mv axtNet axtNet.orig
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/cluster/data/canFam2/nib /iscratch/i/hg17/nib stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.canFam2.hg17.net.axt.gz
end
rm -r mafNet
mkdir mafNet
foreach f (axtNet/*.canFam2.hg17.net.axt.gz)
axtToMaf -tPrefix=canFam2. -qPrefix=hg17. $f \
/cluster/data/canFam2/chrom.sizes /cluster/data/hg17/chrom.sizes \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
rm -r axtChain/{chain,net}/ axtNet.orig
# QUALITY SCORES (DONE 8/11/05 angie)
ssh kkstore01
mkdir /cluster/data/canFam2/bed/quality
cd /cluster/data/canFam2/bed/quality
qaToQac ../../broad/contigs.quals.gz stdout \
| qacAgpLift ../../broad/UCSC_Dog2.0.agp stdin chrom.qac
mkdir wigData
# Build .wig, .wib files in current directory so that "wigData/" doesn't
# appear in the .wig's:
cd wigData
foreach agp (../../../*/chr*.agp)
set chr = $agp:t:r
set abbrev = `echo $chr | sed -e 's/^chr//; s/_random/r/;'`
echo $chr to qual_$abbrev wiggle
qacToWig -fixed ../chrom.qac -name=$chr stdout \
| wigEncode stdin qual_$abbrev.{wig,wib}
end
# Verify size of .wib file = chrom length
foreach f (*.wib)
set abbrev = $f:t:r
set chr = `echo $abbrev | sed -e 's/^qual_/chr/; s/r$/_random/;'`
set wibLen = `ls -l $f | awk '{print $5;}'`
set chromLen = `grep -w $chr ../../../chrom.sizes | awk '{print $2;}'`
if ($wibLen != $chromLen) then
echo "ERROR: $chr size is $chromLen but wib size is $wibLen"
else
echo $chr OK.
endif
end
# /gbdb & load:
ssh hgwdev
cd /cluster/data/canFam2/bed/quality/wigData
mkdir -p /gbdb/canFam2/wib
ln -s `pwd`/*.wib /gbdb/canFam2/wib
hgLoadWiggle canFam2 quality *.wig
# GC 5 BASE WIGGLE TRACK (DONE 8/11/05 angie)
ssh kki
mkdir /cluster/data/canFam2/bed/gc5Base
cd /cluster/data/canFam2/bed/gc5Base
cat > doGc5Base.csh <<'_EOF_'
#!/bin/csh -fe
set chr = $1
set c = `echo $chr | sed -e 's/^chr//; s/_random/r/;'`
/cluster/bin/x86_64/hgGcPercent \
-chr=${chr} -wigOut -doGaps -file=stdout -win=5 canFam2 \
/iscratch/i/canFam2/nib \
| /cluster/bin/x86_64/wigEncode stdin gc5Base_${c}{.wig,.wib}
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doGc5Base.csh
cp /dev/null spec
foreach c (`cat ../../chrom.lst`)
echo "./doGc5Base.csh $c" >> spec
end
para make spec
para time
#Completed: 41 of 41 jobs
#Average job time: 23s 0.38m 0.01h 0.00d
#Longest finished job: 44s 0.73m 0.01h 0.00d
#Submission to last job: 259s 4.32m 0.07h 0.00d
# /gbdb and load track on hgwdev
ssh hgwdev
cd /cluster/data/canFam2/bed/gc5Base
mkdir -p /gbdb/canFam2/wib
ln -s `pwd`/*.wib /gbdb/canFam2/wib
hgLoadWiggle canFam2 gc5Base *.wig
# EXTRACT LINEAGE-SPECIFIC REPEATS FOR RAT (DONE 8/12/05 angie)
ssh kolossus
cd /panasas/store/canFam2/rmsk
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
foreach outfl ( *.out )
echo "$outfl"
/cluster/bluearc/RepeatMasker/DateRepeats \
${outfl} -query dog -comp rat
end
# Now extract rat (extra column 1):
cd ..
mkdir linSpecRep.notInRat
foreach f (rmsk/*.out_rattus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractRepeats 1 $f > \
linSpecRep.notInRat/$base.out.spec
end
# Clean up.
rm rmsk/*.out_rat*
# LOAD CPGISLANDS (DONE 8/12/05 angie)
ssh hgwdev
mkdir -p /cluster/data/canFam2/bed/cpgIsland
cd /cluster/data/canFam2/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
mv cpglh.exe /cluster/data/canFam2/bed/cpgIsland/
ssh kolossus
cd /cluster/data/canFam2/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
set fout=$f:t:r:r.cpg
echo running cpglh on $f to $fout
./cpglh.exe $f > $fout
end
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
# load into database:
ssh hgwdev
cd /cluster/data/canFam2/bed/cpgIsland
hgLoadBed canFam2 cpgIslandExt -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
wc -l cpgIsland.bed
# 47804 cpgIsland.bed
featureBits canFam2 cpgIslandExt
#38974374 bases of 2384996543 (1.634%) in intersection
# ANDY LAW CPGISLANDS (DONE 8/12/05 angie)
# See notes in makeGalGal2.doc.
ssh kolossus
mkdir /cluster/data/canFam2/bed/cpgIslandGgfAndy
cd /cluster/data/canFam2/bed/cpgIslandGgfAndy
# Use masked sequence since this is a mammal...
cp /dev/null cpgIslandGgfAndyMasked.bed
foreach f (../../*/chr*.fa.masked)
set chr = $f:t:r:r
echo preproc and run on masked $chr
/cluster/home/angie/bin/i386/preProcGgfAndy $f \
| /cluster/home/angie/ggf-andy-cpg-island.pl \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndyMasked.bed
end
# load into database:
ssh hgwdev
cd /cluster/data/canFam2/bed/cpgIslandGgfAndy
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
hgLoadBed canFam2 cpgIslandGgfAndyMasked -tab -noBin \
-sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
featureBits canFam2 cpgIslandExt
#38974374 bases of 2384996543 (1.634%) in intersection
featureBits canFam2 cpgIslandGgfAndyMasked
#99187178 bases of 2384996543 (4.159%) in intersection
wc -l ../cpgIsland/cpgIsland.bed *bed
# 47804 ../cpgIsland/cpgIsland.bed
# 138037 cpgIslandGgfAndyMasked.bed
# MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN & BEYOND (DONE 8/15/05 angie)
# In an email 2/13/04, Arian said we could treat all human repeats as
# lineage-specific for human-chicken blastz. Do the same for dog.
# Scripts expect *.out.spec filenames, so set that up:
ssh kkstore01
cd /cluster/data/canFam2
mkdir /panasas/store/canFam2/linSpecRep.notInNonMammal
foreach f (/panasas/store/canFam2/rmsk/chr*.fa.out)
ln $f /panasas/store/canFam2/linSpecRep.notInNonMammal/$f:t:r:r.out.spec
end
# SWAP CHAINS FROM MM6, BUILD NETS ETC. (DONE 8/15/05 angie)
# REDONE by hiram 12/7/05 -- tables loaded as {chain,net}Mm6u1 then
# renamed to {chain,net}Mm6 12/13/05 angie
mkdir /cluster/data/canFam2/bed/blastz.mm6.swap
cd /cluster/data/canFam2/bed/blastz.mm6.swap
doBlastzChainNet.pl -swap /cluster/data/mm6/bed/blastz.canFam2/DEF \
>& do.log
echo "check /cluster/data/canFam2/bed/blastz.mm6.swap/do.log" \
| mail -s "check do.log" $USER
# Add {chain,net}Mm6 to trackDb.ra if necessary.
# RE-RUN NETTOAXT, AXTTOMAF FOR MM6 (DONE 11/1/05 angie)
# Obsoleted by re-run of canFam2-mm6 above.
# Kate fixed netToAxt to avoid duplicated blocks, which is important
# for input to multiz. Regenerate maf using commands from sub-script
# netChains.csh generated by doBlastzChainNet.pl above.
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.mm6.swap/axtChain
netSplit canFam2.mm6.net.gz net
chainSplit chain canFam2.mm6.all.chain.gz
cd ..
mv axtNet axtNet.orig
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/cluster/data/canFam2/nib /cluster/data/mm6/nib stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.canFam2.mm6.net.axt.gz
end
rm -r mafNet
mkdir mafNet
foreach f (axtNet/*.canFam2.mm6.net.axt.gz)
axtToMaf -tPrefix=canFam2. -qPrefix=mm6. $f \
/cluster/data/canFam2/chrom.sizes /cluster/data/mm6/chrom.sizes \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
rm -r axtChain/{chain,net}/ axtNet.orig
# SWAP CHAINS FROM RN3, BUILD NETS ETC. (DONE 8/16/05 angie)
mkdir /cluster/data/canFam2/bed/blastz.rn3.swap
cd /cluster/data/canFam2/bed/blastz.rn3.swap
doBlastzChainNet.pl -swap /cluster/data/rn3/bed/blastz.canFam2/DEF \
>& do.log
echo "check /cluster/data/canFam2/bed/blastz.rn3.swap/do.log" \
| mail -s "check do.log" $USER
# Add {chain,net}Rn3 to trackDb.ra if necessary.
    # Found that downloads in rn3 were missing 2005-12-21 - remake - Hiram
cd /cluster/data/rn3/bed/blastz.canFam2
/cluster/bin/scripts/doBlastzChainNet.pl -continue=download \
/cluster/data/rn3/bed/blastz.canFam2/DEF \
-chainMinScore=1000 -chainLinearGap=loose > downloads.out 2>&1
# RE-RUN NETTOAXT, AXTTOMAF FOR RN3 (DONE 11/2/05 angie)
# Kate fixed netToAxt to avoid duplicated blocks, which is important
# for input to multiz. Regenerate maf using commands from sub-script
# netChains.csh generated by doBlastzChainNet.pl above.
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.rn3.swap/axtChain
netSplit canFam2.rn3.net.gz net
chainSplit chain canFam2.rn3.all.chain.gz
cd ..
mv axtNet axtNet.orig
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/cluster/data/canFam2/nib /cluster/data/rn3/nib stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.canFam2.rn3.net.axt.gz
end
rm -r mafNet
mkdir mafNet
foreach f (axtNet/*.canFam2.rn3.net.axt.gz)
axtToMaf -tPrefix=canFam2. -qPrefix=rn3. $f \
/cluster/data/canFam2/chrom.sizes /cluster/data/rn3/chrom.sizes \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
rm -r axtChain/{chain,net}/ axtNet.orig
# MAKE THIS THE DEFAULT ASSEMBLY WHEN THERE ARE ENOUGH TRACKS (DONE 11/28/05 angie)
hgsql -h genome-testdb hgcentraltest \
-e 'update defaultDb set name = "canFam2" where genome = "Dog";'
# MAKE Human Proteins track (DONE braney 2005-12-10)
ssh kkstore01
cd /cluster/data/canFam2
cat */chr*/*.lft > jkStuff/subChr.lft
mkdir blastDb
for i in */*/*_*_*.fa; do ln `pwd`/$i blastDb; done
cd blastDb
for i in *.fa; do /cluster/bluearc/blast229/formatdb -p F -i $i; rm $i; done
ssh pk
destDir=/san/sanvol1/scratch/canFam2/blastDb
mkdir -p $destDir
cd /cluster/data/canFam2/blastDb
for i in nin nsq nhr; do cp *.$i $destDir; done
mkdir -p /cluster/data/canFam2/bed/tblastn.hg17KG
cd /cluster/data/canFam2/bed/tblastn.hg17KG
find $destDir -name "*.nsq" | sed "s/\.nsq//" > target.lst
mkdir kgfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(500000/`wc target.lst | awk "{print \\\$1}"`\)
# 37365/(500000/6149) = 459.514770
split -l 460 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
rm -rf /cluster/bluearc/canFam2/bed/tblastn.hg17KG/blastOut
mkdir -p /cluster/bluearc/canFam2/bed/tblastn.hg17KG/blastOut
ln -s /cluster/bluearc/canFam2/bed/tblastn.hg17KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
# Parasol job template: one job per (blastDb partition, protein batch) pair.
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
# blastSome <blastDb partition> <protein batch fa> <output psl>:
# tblastn one batch of hg17KG proteins against one canFam2 blastDb
# partition, then lift the hits up to canFam2 chromosome coordinates.
# The e-value loop retries with successively stricter cutoffs until
# blastall succeeds; the three liftUp calls go query (protein) coords,
# then sub-chunk, then contig -> chromosome on the target side.
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast2211x86_64/bin/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/canFam2/jkStuff/subChr.lft carry $f.3
liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/canFam2/jkStuff/liftAll.lft carry $f.4
mv $3.tmp $3
rm -f $f.1 $f.2
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
exit
chmod +x blastSome
gensub2 target.lst kg.lst blastGsub blastSpec
para create blastSpec
para push
# Completed: 504218 of 504218 jobs
# CPU time in finished jobs: 28863259s 481054.31m 8017.57h 334.07d 0.915 y
# IO & Wait Time: 4255045s 70917.42m 1181.96h 49.25d 0.135 y
# Average job time: 66s 1.09m 0.02h 0.00d
# Longest finished job: 443s 7.38m 0.12h 0.01d
# Submission to last job: 166648s 2777.47m 46.29h 1.93d
ssh kki
cd /cluster/data/canFam2/bed/tblastn.hg17KG
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1) $(path2)
#ENDLOOP
'_EOF_'
# chainSome <blastOut/kgNN dir> <chrom>: chain the per-chrom tblastn hits
# for one protein batch into ../c.<chrom>.<batch>.psl.
# NOTE(fix): in the original doc the simpleChain command line was wrapped
# mid-word ("bas"/"ename"), breaking the `basename $1` substitution;
# restored to a single line here.
cat << '_EOF_' > chainSome
(cd $1; cat q.$2_*.psl | /cluster/home/braney/bin/i386/simpleChain -chrom=$2 -prot -outPsl -maxGap=250000 stdin ../c.$2.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec ../../chrom.lst
para create chainSpec
para push
# Completed: 3362 of 3362 jobs
# CPU time in finished jobs: 1078406s 17973.43m 299.56h 12.48d 0.034 y
# IO & Wait Time: 85905s 1431.75m 23.86h 0.99d 0.003 y
# Average job time: 346s 5.77m 0.10h 0.00d
# Longest finished job: 22556s 375.93m 6.27h 0.26d
# Submission to last job: 106376s 1772.93m 29.55h 1.23d
ssh kkstore01
cd /cluster/data/canFam2/bed/tblastn.hg17KG/blastOut
for i in kg??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.*.$i.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -u -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/canFam2/bed/tblastn.hg17KG/blastHg17KG.psl
cd ..
wc blastHg17KG.psl
# 64343 1351203 10913814 blastHg17KG.psl
    # (doc fix: original said blastDm2FB.psl, a copy-paste from the dm2 doc)
    pslUniq blastHg17KG.psl stdout | wc
# 18827 395367 2992653
cat kgfa/*fa | grep ">" | wc
# 309494 309494 4810327
ssh hgwdev
cd /cluster/data/canFam2/bed/tblastn.hg17KG
hgLoadPsl canFam2 blastHg17KG.psl
featureBits canFam2 blastHg17KG
# 35781547 bases of 2384996543 (1.500%) in intersection
exit
# back to kksilo
rm -rf blastOut
# End tblastn
# BLASTZ SELF (DONE braney 8-29-2005)
# For future reference -- doBlastzChainNet.pl should have been used
ssh pk
mkdir -p /cluster/data/canFam2/bed/blastz.canFam2
cd /cluster/data/canFam2/bed/blastz.canFam2
# Blastz DEF parameter file for the dog self-alignment: target and query
# are both the canFam2 nibs, split into 10 Mb chunks with 10 kb overlap.
# Repeats are not abridged (BLASTZ_ABRIDGE_REPEATS=0).
cat << '_EOF_' > DEF
# dog vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0
# TARGET
# Dog
SEQ1_DIR=/scratch/hg/canFam2/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Dog
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/data/canFam2/bed/blastz.canFam2
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
cp /cluster/data/canFam2/chrom.sizes S1.len
cp /cluster/data/canFam2/chrom.sizes S2.len
mkdir run
cd run
partitionSequence.pl 10000000 10000 /cluster/data/canFam2/nib ../S1.len \
-xdir xdir.sh -rawDir ../lav \
> canFam2.lst
csh -ef xdir.sh
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-run-ucsc $(path1) $(path2) ../DEF {check out line ../lav/$(file1)/$(file1)_$(file2).lav}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 canFam2.lst canFam2.lst gsub jobList
para create jobList
para try, check, push, check, ...
# Completed: 70756 of 70756 jobs
# CPU time in finished jobs: 5743034s 95717.24m 1595.29h 66.47d 0.182 y
# IO & Wait Time: 199800s 3330.00m 55.50h 2.31d 0.006 y
# Average job time: 84s 1.40m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2758s 45.97m 0.77h 0.03d
# Submission to last job: 17852s 297.53m 4.96h 0.21d
# convert lav files to psl
# First, combine all files per partition (too many files per chrom
# to do in one swoop!). Then do another round to collect all files
# per chrom.
ssh kki
cd /cluster/data/canFam2/bed/blastz.canFam2
mkdir pslChrom pslParts run.lavToPsl
cd run.lavToPsl
ls -1d ../lav/* | sed -e 's@/$@@' > parts.lst
# For self alignments, we need lavToAxt's -dropSelf behavior,
# so go lav->axt->psl... could add -dropSelf to lavToPsl, but that
# might be somewhat invasive and perhaps not really much faster (?)
# because the sequence must be dug up in order to rescore...
# do.csh <lav partition dir> <output .psl.gz>:
# combine one partition's lav files, convert lav -> axt (-dropSelf
# discards trivial self-alignments) -> psl, and sort by target
# position ready for the per-chromosome merge below.
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cat $1/*.lav \
| lavToAxt -dropSelf stdin /iscratch/i/canFam2/nib \
/iscratch/i/canFam2/nib stdout \
| axtToPsl stdin ../S1.len ../S2.len stdout \
| sort -k 14,14 -k 16n,17n \
| gzip -c > $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh $(path1) {check out exists ../pslParts/$(file1).psl.gz }
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 parts.lst single gsub jobList
para create jobList
para try, check, push, check, ...
# Completed: 266 of 275 jobs
# Crashed: 9 jobs
# CPU time in finished jobs: 1228s 20.47m 0.34h 0.01d 0.000 y
# IO & Wait Time: 7117s 118.61m 1.98h 0.08d 0.000 y
# Average job time: 31s 0.52m 0.01h 0.00d
# Longest finished job: 74s 1.23m 0.02h 0.00d
# Submission to last job: 641s 10.68m 0.18h 0.01d
awk '{print $1;}' /cluster/data/canFam2/chrom.sizes > chroms.lst
# doChrom.csh <chrom> <output .psl.gz>: merge all per-partition psl
# files for one chromosome into a single target-sorted psl.
cat << '_EOF_' > doChrom.csh
#!/bin/csh -ef
zcat ../pslParts/$1.*.psl.gz \
| sort -k 14,14 -k 16n,17n \
| gzip -c > $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChrom.csh
# Parasol job template: one doChrom.csh job per chromosome.
cat << '_EOF_' > gsub.chroms
#LOOP
./doChrom.csh $(root1) {check out exists ../pslChrom/$(root1).psl.gz }
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 chroms.lst single gsub.chroms jobList
para create jobList
para try, check, push, check, ...
# Completed: 40 of 40 jobs
# CPU time in finished jobs: 463s 7.71m 0.13h 0.01d 0.000 y
# IO & Wait Time: 178s 2.97m 0.05h 0.00d 0.000 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest finished job: 38s 0.63m 0.01h 0.00d
# Submission to last job: 61s 1.02m 0.02h 0.00d
# CHAIN SELF BLASTZ
# For future reference -- doBlastzChainNet.pl should have been used
# REDONE partially, see below (from the filtering step onward, to get rid
# of duplicates, 12/6/05 angie)
# CHAINS REMERGED to reassign IDs so that they're unique genome-wide, and
# nets deleted to avoid confusion, 12/14/05 angie. The IDs were not unique
# genome-wide (only per-chrom) because the filtering was done directly on
# run1/chain files which had not yet had their IDs reassigned by
# chainMergeSort, and after filtering, chainMergeSort -saveId was used... doh.
# It would have been OK if the filtering had been done post-merging,
# or if -saveId had been omitted after filtering. caught by joinerCheck.
# Also could have been avoided by just using doBlastzChainNet.pl the first time.
# Run axtChain on little cluster
ssh kki
cd /cluster/data/canFam2/bed/blastz.canFam2
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S ../../pslChrom/*.psl.gz > input.lst
# Parasol job template: one doChain job per per-chrom psl file.
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# doChain <psl.gz> <output chain>: chain the self-alignment psl for one
# chromosome, then filter out chains that chainAntiRepeat judges to be
# repeat-induced artifacts.
cat << '_EOF_' > doChain
#!/bin/csh -ef
setenv PATH /cluster/bin/x86_64:$PATH
axtChain -verbose=0 -psl $1 /iscratch/i/canFam2/nib \
/iscratch/i/canFam2/nib stdout \
| chainAntiRepeat /iscratch/i/canFam2/nib /iscratch/i/canFam2/nib \
stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 40 of 41 jobs
#Crashed: 1 jobs
#Average job time: 185s 3.09m 0.05h 0.00d
#Longest job: 490s 8.17m 0.14h 0.01d
#Submission to last job: 74789s 1246.48m 20.77h 0.87d
# now on the cluster server, sort chains
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain
# Had these steps actually been followed it would have saved some
# trouble, but instead some custom filtering was performed directly
# on run1/chain files and they never got their IDs reassigned:
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=10000 /tmp/score.$f:t:r
echo ""
end
# trim to minScore=20000 to cut some of the fluff
mkdir chainFilt
foreach f (chain/*.chain)
chainFilter -minScore=20000 $f > chainFilt/$f:t
end
gzip -c all.chain > all.chain.unfiltered.gz
rm -r chain
mv chainFilt chain
chainMergeSort -saveId chain/*.chain > all.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain canFam2 ${c}_chainSelf $i
end
# 12/6/05 angie -- Jen found that all chains were duplicated --
# turns out that axtChain/run1/chain/ contained some chr*.filt.chain
# in addition to chr*.psl.chain, so *.chain resulted in duplicates.
# Redo the filtering and chain merge, then re-run
# doBlastzChainNet.pl -continue net
# to redo the subsequent steps.
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain
chainMergeSort run1/chain/chr*.psl.chain \
| gzip -c > all.chain.unfiltered.gz
rm -r chain
chainSplit chain all.chain.unfiltered.gz
mkdir chainFilt
foreach f (run1/chain/chr*.psl.chain)
chainFilter -minScore=20000 $f > chainFilt/$f:t:r:r.chain
end
rm -r chain
mv chainFilt chain
chainMergeSort -saveId chain/*.chain \
| gzip -c > canFam2.canFam2.all.chain.gz
ssh hgwdev
cd /cluster/data/canFam2/bed/blastz.canFam2
rm -r axtNet mafNet
rm -r /usr/local/apache/htdocs/goldenPath/canFam2/vsSelf/
doBlastzChainNet.pl -continue net DEF >& redo.log &
tail -f redo.log
# 12/14/05 angie -- Jen ran joinerCheck and it complained about
# non-unique IDs in the chain tables. fixing...
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain
chainSplit chain canFam2.canFam2.all.chain.gz
mv canFam2.canFam2.all.chain.gz canFam2.canFam2.all.chain.badIDs.gz
chainMergeSort chain/*.chain \
| gzip -c > canFam2.canFam2.all.chain.gz
rm -r chain
chainSplit chain canFam2.canFam2.all.chain.gz
# Load chains into database
ssh hgwdev
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain canFam2 ${c}_chainSelf $i
end
cd ..
rm -r chain
# I removed the self net in order to avoid confusion, so no need to
# regenerate it.
# NET SELF BLASTZ
# For future reference -- doBlastzChainNet.pl should have been used
# REDONE 12/6/05 angie as part of doBlastzChainNet.pl command above
ssh kolossus
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain
chainPreNet all.chain.gz ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain
netClass -noAr noClass.net canFam2 canFam2 self.net
rm noClass.net
# Load the nets into database
netFilter -minGap=10 self.net | hgLoadNet canFam2 netSelf stdin
# Add entries for chainSelf, netSelf to dog/canFam2 trackDb
# ADD SELF DOWNLOADABLE FILES (DONE 11/21/05 angie)
# For future reference -- doBlastzChainNet.pl should have been used for
# all steps from blastz up to this point.
# REDONE 12/6/05 angie as part of doBlastzChainNet.pl command above
cd /cluster/data/canFam2/bed/blastz.canFam2/axtChain
mv self.net canFam2.canFam2.net
gzip canFam2.canFam2.net
cd ..
doBlastzChainNet.pl -continue download -stop download DEF \
>& doDownload.log
tail -f doDownload.log
# MULTIZ.V10 4WAY (DOG/H/M/R) (DONE 11/7/05 angie - REDONE 12/13/05)
# hg17 and mm6 alignments were redone -- re-run this to pick up improvements.
# Tree: ((canFam2 hg17) (mm6 rn3))
ssh kkstore01
mkdir /cluster/data/canFam2/bed/multiz4way.2005-12-13
rm -f /cluster/data/canFam2/bed/multiz4way
ln -s /cluster/data/canFam2/bed/multiz4way.2005-12-13 \
/cluster/data/canFam2/bed/multiz4way
cd /cluster/data/canFam2/bed/multiz4way
# Setup: Copy pairwise MAF to /san/sanvol1/scratch:
mkdir /san/sanvol1/scratch/dogMultiz4way
foreach db (hg17 mm6 rn3)
echo $db
cp -pR /cluster/data/canFam2/bed/blastz.$db/mafNet \
/san/sanvol1/scratch/dogMultiz4way/$db
end
ls -lLR /san/sanvol1/scratch/dogMultiz4way
# Make output dir and script to use /cluster/bin/scripts/runMultizV10.csh:
mkdir maf
# doMultizAll.csh <chrom>: build the 4-way MAF for one chromosome in
# tree order ((canFam2 hg17) (mm6 rn3)) — first combine mm6 and rn3,
# then add hg17 — using a node-local scratch dir for intermediates.
cat << '_EOF_' > doMultizAll.csh
#!/bin/csh -fex
set chr = $1
set tmpDir = /scratch/dogMultiz4way.$chr
mkdir $tmpDir
set mafScratch = /san/sanvol1/scratch/dogMultiz4way
# Really should write a perl script to take a tree like this and generate
# commands like the ones below:
# ((canFam2 hg17) (mm6 rn3))
/cluster/bin/scripts/runMultizV10.csh \
$mafScratch/mm6/$chr.maf.gz \
$mafScratch/rn3/$chr.maf.gz \
0 canFam2.$chr $tmpDir $tmpDir/$chr.MusRat.maf
/cluster/bin/scripts/runMultizV10.csh \
$mafScratch/hg17/$chr.maf.gz \
$tmpDir/$chr.MusRat.maf \
1 canFam2.$chr $tmpDir maf/$chr.maf
rm -f $tmpDir/*.maf
rmdir $tmpDir
'_EOF_'
# << for emacs
chmod 775 doMultizAll.csh
awk '{print "./doMultizAll.csh " $1;}' /cluster/data/canFam2/chrom.sizes \
> jobs.lst
# Run on brawny cluster
ssh pk
cd /cluster/data/canFam2/bed/multiz4way
para make jobs.lst
para time
#Completed: 41 of 41 jobs
#Average job time: 1296s 21.61m 0.36h 0.02d
#Longest finished job: 2879s 47.98m 0.80h 0.03d
#Submission to last job: 2879s 47.98m 0.80h 0.03d
ls -1 missing*
#ls: No match.
# make /gbdb/ links to 4way maf files:
ssh hgwdev
mkdir -p /gbdb/canFam2/multiz4way/maf/multiz4way
ln -s /cluster/data/canFam2/bed/multiz4way/maf/chr*.maf \
/gbdb/canFam2/multiz4way/maf/multiz4way/
# load into database
cd /tmp
hgLoadMaf -warn canFam2 multiz4way \
-pathPrefix=/gbdb/canFam2/multiz4way/maf/multiz4way
# load summary table to replace pairwise
cat /cluster/data/canFam2/bed/multiz4way/maf/chr*.maf \
| nice hgLoadMafSummary canFam2 multiz4waySummary stdin
# Dropped unused indexes (2006-05-09 kate)
# NOTE: this is not required in the future, as the loader
# has been fixed to not generate these indexes
hgsql canFam2 -e "alter table multiz4waySummary drop index chrom_2"
hgsql canFam2 -e "alter table multiz4waySummary drop index chrom_3"
# put 4way MAF out for download
ssh kolossus
cd /cluster/data/canFam2/bed/multiz4way
mkdir mafDownload
foreach f (maf/*.maf)
nice gzip -c $f > mafDownload/$f:t.gz
end
cd mafDownload
md5sum *.maf.gz > md5sum.txt
# make a README.txt
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/canFam2/multiz4way
ln -s /cluster/data/canFam2/bed/multiz4way/mafDownload/{*.maf.gz,*.txt} \
/usr/local/apache/htdocs/goldenPath/canFam2/multiz4way
# Cleanup
rm -rf /san/sanvol1/scratch/dogMultiz4way/
# PHASTCONS 4WAY WITH METHODS FROM PAPER (DONE 11/9/05 angie - REDONE 12/13/05)
# ((canFam2,hg17),(mm6,rn3))
# redo of hg17, mm6 -> multiz -> phastCons.
ssh kkstore01
mkdir -p /san/sanvol1/scratch/canFam2/chrom
cp -p /cluster/data/canFam2/?{,?}/chr*.fa /san/sanvol1/scratch/canFam2/chrom/
# Split chrom fa into smaller windows for phastCons:
ssh pk
mkdir /cluster/data/canFam2/bed/multiz4way/phastCons
mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.split
cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.split
set WINDOWS = /san/sanvol1/scratch/canFam2/phastCons/WINDOWS
rm -fr $WINDOWS
mkdir -p $WINDOWS
# doSplit.sh <chrom maf>: msa_split one chromosome's MAF into ~1 Mb
# SS-format windows for phastCons, gzipping the results into the shared
# WINDOWS dir; intermediate files live in a node-local tmp dir.
cat << 'EOF' > doSplit.sh
#!/bin/csh -ef
set PHAST=/cluster/bin/phast
set FA_SRC=/san/sanvol1/scratch/canFam2/chrom
set WINDOWS=/san/sanvol1/scratch/canFam2/phastCons/WINDOWS
set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
-O canFam2,hg17,mm6,rn3 \
-w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
chmod a+x doSplit.sh
rm -f jobList
foreach file (/cluster/data/canFam2/bed/multiz4way/maf/*.maf)
if (-s $file) then
echo "doSplit.sh {check in exists+ $file}" >> jobList
endif
end
para make jobList
para time
#Completed: 41 of 41 jobs
#Average job time: 132s 2.21m 0.04h 0.00d
#Longest finished job: 232s 3.87m 0.06h 0.00d
#Submission to last job: 232s 3.87m 0.06h 0.00d
############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
# Use consEntropy --NH to make it suggest a --expected-lengths param
# that we should try next. Adam ran this on hg17 to find out the
# total entropy of the hg17 model:
# consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
#Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
#Relative entropy: H=0.608216 bits/site
#Required length: N=16.085437 sites
#Total entropy: NH=9.783421 bits
# Our target is that NH result: 9.7834 bits.
# Use phyloFit to make an initial model:
ssh kolossus
cd /cluster/data/canFam2/bed/multiz4way/phastCons
/cluster/bin/phast/msa_view ../maf/chr{2,20,37}.maf \
--aggregate canFam2,hg17,mm6,rn3 \
-i MAF -o SS > all.ss
/cluster/bin/phast/phyloFit all.ss \
--tree "((canFam2,hg17),(mm6,rn3))" \
-i SS --out-root starting-tree
cat starting-tree.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -378601835.277927
#BACKGROUND: 0.285684 0.213599 0.213743 0.286974
#RATE_MAT:
# -0.889874 0.169851 0.563150 0.156874
# 0.227172 -1.149929 0.168661 0.754097
# 0.752694 0.168547 -1.148977 0.227736
# 0.156169 0.561285 0.169622 -0.887076
#TREE: ((canFam2:0.161521,hg17:0.166335):0.114597,(mm6:0.071038,rn3:0.076403):0.114597);
# also get GC content from model -- if similar enough, no need to extract it
# separately above.
awk '$1 == "BACKGROUND:" {print $3 + $4;}' starting-tree.mod
#0.427342
# OK, use .427 for --gc below.
# Use values of --target-coverage and --expected-lengths sort of like
# those in the latest run on human, 0.25 and 12, just as a starting point.
# Multiply each subst rate on the TREE line by 3.75 which is roughly the
# ratio of noncons to cons in
# /cluster/data/canFam1/bed/multiz.canFam1hg17mm5/phastCons/run.estimate/ave.*
/cluster/bin/phast/tree_doctor -s 3.75 starting-tree.mod \
> starting-tree.noncons.mod
/cluster/bin/phast/consEntropy --NH 9.7834 0.25 12 \
starting-tree{,.noncons}.mod
#( Solving for new omega: 12.000000 13.069907 13.002421 13.002166 )
#Transition parameters: gamma=0.250000, omega=12.000000, mu=0.083333, nu=0.027778
#Relative entropy: H=0.803151 bits/site
#Expected min. length: L_min=11.957635 sites
#Expected max. length: L_max=9.271001 sites
#Phylogenetic information threshold: PIT=L_min*H=9.603785 bits
#Recommended expected length: omega=13.002166 sites (for L_min*H=9.783400)
# OK, use --expected-lengths 13.
############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ###########
# We're here because the actual target coverage was not satisfactory,
# so we're changing the --target-coverage param. Given that we're
# changing that, take a guess at how we should change --expected-lengths
# in order to also hit the total entropy target.
cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate
# SECOND ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.18 13 ave.{cons,noncons}.mod
#ERROR: too many iterations, not converging; try without --NH.
# -- it gets that error unless I raise coverage to 0.34. Well,
# stick with 13 for now...
# THIRD ITERATION:
/cluster/bin/phast/consEntropy --NH 9.7834 0.15 13 ave.{cons,noncons}.mod
#ERROR: too many iterations, not converging; try without --NH.
# -- it gets that error unless I raise coverage to 0.31. Well,
# stick with 13 for now...
# Now set up cluster job to estimate model parameters given free params
# --target-coverage and --expected-lengths and the data.
ssh pk
mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate
cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate
# REDO ONLY
cp ../../../multiz4way.2005-11-07/phastCons/run.estimate/ave.*.mod .
# REDO ONLY -- skip the first iteration stuff from here on out, and
# do subsequent iteration stuff using params from 11-07 run.
# FIRST ITERATION: Use ../starting-tree.mod:
# REDO ONLY
cp ../../../multiz4way.2005-11-07/phastCons/run.estimate/ave.*.mod .
# REDO ONLY -- skip the First iteration stuff from here on out, and
# do subsequent iteration stuff using params from 11-07 run.
# doEstimate.sh <ss.gz> <log> <tree out root>: estimate cons/noncons
# tree models for one alignment window with phastCons.
# NOTE: the second heredoc below overwrites the first; create only the
# version matching the current iteration (first vs. subsequent).
cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.427 --nrates 1,1 \
--no-post-probs --ignore-missing \
--expected-lengths 13 --target-coverage 0.25 \
--quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
# SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.427 --nrates 1,1 \
--no-post-probs --ignore-missing \
--expected-lengths 13 --target-coverage 0.15 \
--quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
chmod a+x doEstimate.sh
rm -fr LOG TREES
mkdir -p LOG TREES
rm -f jobList
foreach f (/san/sanvol1/scratch/canFam2/phastCons/WINDOWS/*.ss.gz)
set root = $f:t:r:r
echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
end
para make jobList
para time
#Completed: 2434 of 2434 jobs
#Average job time: 29s 0.48m 0.01h 0.00d
#Longest finished job: 59s 0.98m 0.02h 0.00d
#Submission to last job: 214s 3.57m 0.06h 0.00d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ssh kolossus
cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.estimate
ls -1 TREES/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
--output-average ave.cons.mod > cons_summary.txt
ls -1 TREES/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
--output-average ave.noncons.mod > noncons_summary.txt
grep TREE ave*.mod
# FIRST ITERATION:
#ave.cons.mod:TREE: ((canFam2:0.063241,hg17:0.067179):0.047808,(mm6:0.029756,rn3:0.031924):0.047808);
#ave.noncons.mod:TREE: ((canFam2:0.201135,hg17:0.213229):0.151937,(mm6:0.094208,rn3:0.101249):0.151937);
# SECOND ITERATION:
#ave.cons.mod:TREE: ((canFam2:0.056147,hg17:0.059693):0.042487,(mm6:0.026610,rn3:0.028533):0.042487);
#ave.noncons.mod:TREE: ((canFam2:0.192186,hg17:0.203958):0.145480,(mm6:0.090727,rn3:0.097451):0.145480);
# THIRD ITERATION:
#ave.cons.mod:TREE: ((canFam2:0.052533,hg17:0.055873):0.039758,(mm6:0.024967,rn3:0.026765):0.039758);
#ave.noncons.mod:TREE: ((canFam2:0.188439,hg17:0.200074):0.142743,(mm6:0.089242,rn3:0.095829):0.142743);
# REDO:
#ave.cons.mod:TREE: ((canFam2:0.052883,hg17:0.056022):0.039955,(mm6:0.024815,rn3:0.026742):0.039955);
#ave.noncons.mod:TREE: ((canFam2:0.188924,hg17:0.199837):0.142839,(mm6:0.088391,rn3:0.095457):0.142839);
cat cons_summary.txt
# look over the files cons_summary.txt and noncons_summary.txt.
# The means and medians should be roughly equal and the stdevs
# should be reasonably small compared to the means, particularly
# for rate matrix parameters (at bottom) and for branches to the
# leaves of the tree. The stdevs may be fairly high for branches
# near the root of the tree; that's okay. Some min values may be
# 0 for some parameters. That's okay, but watch out for very large
# values in the max column, which might skew the mean. If you see
# any signs of bad outliers, you may have to track down the
# responsible .mod files and throw them out. I've never had to do
# this; the estimates generally seem pretty well behaved.
# NOTE: Actually, a random sample of several hundred to a thousand
# alignment fragments (say, a number equal to the number of
# available cluster nodes) should be more than adequate for
# parameter estimation. If pressed for time, use this strategy.
# FIRST ITERATION ONLY:
# Check the total entropy figure to see if we're way off.
# This takes an hour for 4way (exponential in #species) and has never
# produced a different answer from the input after the first iteration,
# so do this for the first iteration only:
/cluster/bin/phast/consEntropy --NH 9.7834 0.25 13 ave.{cons,noncons}.mod
#ERROR: too many iterations, not converging; try without --NH
# Dang. That is a new one. Oh well, I'll proceed with 13.
# Now we are ready to set up the cluster job for computing the
# conservation scores and predicted elements. The we measure the
# conserved elements coverage, and if that's not satisfactory then we
# adjust parameters and repeat.
ssh pk
mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.phast
cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.phast
# doPhastCons.sh <window ss.gz>: score one window with the averaged
# cons/noncons models, writing conserved elements (--viterbi bed) to
# ELEMENTS/<chrom>/ and gzipped posterior probabilities to
# POSTPROBS/<chrom>/.
cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
--expected-lengths 13 --target-coverage 0.15 \
--quiet --seqname $chr --idpref $pref \
--viterbi /san/sanvol1/scratch/canFam2/phastCons/ELEMENTS/$chr/$pref.bed --score \
--require-informative 0 \
> $tmpfile
gzip -c $tmpfile > /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS/$chr/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
chmod a+x doPhastCons.sh
rm -fr /san/sanvol1/scratch/canFam2/phastCons/{POSTPROBS,ELEMENTS}
mkdir -p /san/sanvol1/scratch/canFam2/phastCons/{POSTPROBS,ELEMENTS}
foreach chr (`awk '{print $1;}' /cluster/data/canFam2/chrom.sizes`)
mkdir /san/sanvol1/scratch/canFam2/phastCons/{POSTPROBS,ELEMENTS}/$chr
end
rm -f jobList
foreach f (/san/sanvol1/scratch/canFam2/phastCons/WINDOWS/*.ss.gz)
echo doPhastCons.sh $f >> jobList
end
para make jobList
para time
#Completed: 2434 of 2434 jobs
#Average job time: 11s 0.18m 0.00h 0.00d
#Longest finished job: 39s 0.65m 0.01h 0.00d
#Submission to last job: 74s 1.23m 0.02h 0.00d
# back on kolossus:
# combine predictions and transform scores to be in 0-1000 interval
cd /cluster/data/canFam2/bed/multiz4way/phastCons
cp /dev/null all.bed
foreach d (/san/sanvol1/scratch/canFam2/phastCons/ELEMENTS/*)
echo $d:t
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
$d/*.bed \
| /cluster/bin/scripts/lodToBedScore >> all.bed
end
ssh hgwdev
# Now measure coverage of CDS by conserved elements.
# We want the "cover" figure to be close to 68.9%.
# However we don't have dog gene annotations -- we just have xenoRefGene
# which is of course biased towards conserved genes. So cover will be
# artificially high. Shoot for "all.bed 5%" and make sure cover is
# somewhat higher than 68.9%.
cd /cluster/data/canFam2/bed/multiz4way/phastCons
featureBits -enrichment canFam2 xenoRefGene:cds all.bed
# FIRST ITERATION: too high, reduce target-coverage:
#xenoRefGene:cds 1.268%, all.bed 6.812%, both 1.091%, cover 86.06%, enrich 12.63x
# SECOND ITERATION: still too high.
#xenoRefGene:cds 1.268%, all.bed 5.432%, both 1.048%, cover 82.67%, enrich 15.22x
# THIRD ITERATION: close enough.
#xenoRefGene:cds 1.268%, all.bed 4.960%, both 1.025%, cover 80.83%, enrich 16.29x
# REDO: close enough.
#xenoRefGene:cds 1.268%, all.bed 4.960%, both 1.025%, cover 80.85%, enrich 16.30x
# Having met the CDS coverage target, load up the results.
hgLoadBed canFam2 phastConsElements4way all.bed
# Create wiggle
ssh pk
mkdir /cluster/data/canFam2/bed/multiz4way/phastCons/run.wib
cd /cluster/data/canFam2/bed/multiz4way/phastCons/run.wib
rm -rf /san/sanvol1/scratch/canFam2/phastCons/wib
mkdir -p /san/sanvol1/scratch/canFam2/phastCons/wib
# Generate doWigEncode: a per-chromosome cluster job script. For chromosome
# $1, it concatenates the phastCons posterior-probability windows in genomic
# order (numeric sort on the window-start field between the dots of the
# .pp.gz file names) and encodes the stream into a wiggle index (.wig) plus
# binary data (.wib) pair named ${chr}_phastCons.wig / .wib.
cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /san/sanvol1/scratch/canFam2/phastCons/wib
zcat `ls -1 /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS/$chr/*.pp.gz \
| sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
chmod a+x doWigEncode
rm -f jobList
foreach chr (`ls -1 /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS \
| sed -e 's/\/$//'`)
echo doWigEncode $chr >> jobList
end
para make jobList
para time
#Completed: 41 of 41 jobs
#Average job time: 35s 0.59m 0.01h 0.00d
#Longest finished job: 67s 1.12m 0.02h 0.00d
#Submission to last job: 67s 1.12m 0.02h 0.00d
# back on kkstore01, copy wibs, wigs and POSTPROBS (people sometimes want
# the raw scores) from san/sanvol1
cd /cluster/data/canFam2/bed/multiz4way/phastCons
rm -rf wib POSTPROBS
rsync -av /san/sanvol1/scratch/canFam2/phastCons/wib .
rsync -av /san/sanvol1/scratch/canFam2/phastCons/POSTPROBS .
# load wiggle component of Conservation track
ssh hgwdev
mkdir /gbdb/canFam2/multiz4way/wib
cd /cluster/data/canFam2/bed/multiz4way/phastCons
chmod 775 . wib
chmod 664 wib/*.wib
ln -s `pwd`/wib/*.wib /gbdb/canFam2/multiz4way/wib/
hgLoadWiggle canFam2 phastCons4way \
-pathPrefix=/gbdb/canFam2/multiz4way/wib wib/*.wig
rm wiggle.tab
# and clean up san/sanvol1.
rm -r /san/sanvol1/scratch/canFam2/phastCons/{ELEMENTS,POSTPROBS,wib}
rm -r /san/sanvol1/scratch/canFam2/chrom
# Offer raw scores for download since researchers are likely to be interested:
# back on kolossus
cd /cluster/data/canFam2/bed/multiz4way/phastCons/POSTPROBS
mkdir ../postprobsDownload
foreach chr (`awk '{print $1;}' ../../../../chrom.sizes`)
zcat `ls -1 $chr/$chr.*.pp.gz | sort -t\. -k2,2n` | gzip -c \
> ../postprobsDownload/$chr.pp.gz
end
cd ../postprobsDownload
md5sum *.gz > md5sum.txt
# Make a README.txt there too.
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/canFam2/phastCons4way
cd /usr/local/apache/htdocs/goldenPath/canFam2/phastCons4way
ln -s /cluster/data/canFam2/bed/multiz4way/phastCons/postprobsDownload/* .
# UNCERTIFIED ASSEMBLY REGIONS (DONE 11/9/05 angie)
ssh hgwdev
mkdir /cluster/data/canFam2/bed/uncertified
cd /cluster/data/canFam2/bed/uncertified
mv ../../broad/canFam2.0uncertifiedRegions.txt .
tail +2 canFam2.0uncertifiedRegions.txt \
| awk -F"\t" '{print "chr" $1 "\t" $2 "\t" $3 "\t" $7 "\t" $6;}' \
| sed -e 's/Missing read partners/MRP/; s/Haplotype inconsistency/HI/; \
s/Negative gap/NG/; s/Linking inconsistency/LI/; \
s/Illogical links/IL/; s/; /+/g;' \
> uncertified.bed
hgLoadBed -tab canFam2 uncertified uncertified.bed
# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 11/21/05 angie)
# REDONE parts: Sequence and simpleRepeat. 12/5/05 angie
ssh kolossus
cd /cluster/data/canFam2
#- Build the .tar.gz files -- no genbank for now.
# Generate jkStuff/zipAll.csh: rebuilds the bigZips/ download tarballs from
# the per-chromosome directories (? and ?? match chrom dirs like 1, 12, Un):
# AGP files, RepeatMasker .out files, soft-masked and hard-masked FASTA, and
# the TRF simpleRepeat per-chromosome beds.
cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf bigZips
mkdir bigZips
tar cvzf bigZips/chromAgp.tar.gz ?{,?}/chr*.agp
tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out
tar cvzf bigZips/chromFa.tar.gz ?{,?}/chr*.fa
tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}/chr*.fa.masked
cd bed/simpleRepeat
tar cvzf ../../bigZips/chromTrf.tar.gz trfMaskChrom/chr*.bed
cd ../..
'_EOF_'
# << this line makes emacs coloring happy
csh -efx ./jkStuff/zipAll.csh |& tee zipAll.log
#- Look at zipAll.log to make sure all file lists look reasonable.
cd bigZips
md5sum *.gz > md5sum.txt
# Make a README.txt
cd ..
mkdir chromGz
foreach f ( ?{,?}/chr*.fa )
echo $f:t:r
gzip -c $f > chromGz/$f:t.gz
end
cd chromGz
md5sum *.gz > md5sum.txt
# Make a README.txt
#- Link the .gz and .txt files to hgwdev:/usr/local/apache/...
ssh hgwdev
set gp = /usr/local/apache/htdocs/goldenPath/canFam2
mkdir -p $gp/bigZips
ln -s /cluster/data/canFam2/bigZips/{chrom*.tar.gz,*.txt} $gp/bigZips
mkdir -p $gp/chromosomes
ln -s /cluster/data/canFam2/chromGz/{chr*.gz,*.txt} $gp/chromosomes
# Take a look at bigZips/* and chromosomes/*
# Can't make refGene upstream sequence files - no refSeq for dog.
mkdir database
# Create README.txt files in database/ to explain the files.
# CHORI BAC END PAIRS (DONE 11/22/05 angie)
# Rachel downloaded and parsed BAC end sequences and pair info from
# CHORI and NCBI -- see makeCanFam1.doc "BAC END PAIRS". Use those
# files and align to canFam2.
# Do BLAT alignments of BAC ends to the genome on the pitakluster.
# copy over masked contigs to the san
ssh kkstore01
cd /cluster/data/canFam2
mkdir /san/sanvol1/scratch/canFam2/maskedContigs
foreach d (?{,?}/chr*_?{,?})
echo $d:t
cp -p $d/$d:t.fa /san/sanvol1/scratch/canFam2/maskedContigs/
end
# copy over 11.ooc file to the san
cp -p /cluster/bluearc/canFam2/11.ooc /san/sanvol1/scratch/canFam2/
# make output directory, run directory and bed/ directory
mkdir -p /san/sanvol1/scratch/canFam2/bacendsRun/psl
mkdir /cluster/data/canFam2/bed/bacends
ssh pk
cd /san/sanvol1/scratch/canFam2/bacendsRun
echo '#LOOP\n/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/canFam2/11.ooc {check out line+ /san/sanvol1/scratch/canFam2/bacendsRun/psl/$(root1).$(root2).psl}\n#ENDLOOP' > gsub
ls -1S /san/sanvol1/scratch/canFam2/maskedContigs/*.fa > contigs.lst
ls -1S /san/sanvol1/scratch/canFam1/bacends/bacends*.fa > bacends.lst
gensub2 contigs.lst bacends.lst gsub jobList
para make jobList
para time
#Completed: 49005 of 49005 jobs
#Average job time: 7s 0.12m 0.00h 0.00d
#Longest finished job: 105s 1.75m 0.03h 0.00d
#Submission to last job: 1884s 31.40m 0.52h 0.02d
# back on kkstore01, retrieve pitakluster results
cd /cluster/data/canFam2/bed/bacends
rsync -av /san/sanvol1/scratch/canFam2/bacendsRun/{*.lst,batch*,g*,j*,p*} .
# lift alignments
ssh kolossus
cd /cluster/data/canFam2/bed/bacends
pslSort dirs raw.psl tmp psl
# started 08:15, PID: 16561
pslCheck raw.psl >& check.log
wc -l check.log
#4379 check.log
grep '< previous block' check.log | wc -l
#2194
grep -v 'Error: invalid PSL' check.log | grep -v '< previous block' | wc -l
#0
wc -l raw.psl
#27352955 raw.psl -- 2194 / 27352955 = 0.000080 = 0.008% overlapping
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl bacEnds.psl /dev/null
# Processed 27147454 alignments
wc -l bacEnds.psl
#769149 bacEnds.psl
liftUp bacEnds.lifted.psl /cluster/data/canFam2/jkStuff/liftAll.lft \
warn bacEnds.psl
awk '{print $10}' bacEnds.lifted.psl | sort | uniq | wc -l
#317042
faSize /san/sanvol1/scratch/canFam1/bacends/bacends*.fa
#290482800 bases (10817785 N's 279665015 real 279665015 upper 0 lower) in 393408 sequences in 99 files
calc 317042 / 393408
#317042 / 393408 = 0.805886
# Make BAC end pairs track:
mkdir pairs
cd pairs
set ncbiDir = /cluster/data/ncbi/bacends/dog/bacends.1
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose ../bacEnds.lifted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds
wc -l *
# 40 bacEnds.long
# 460 bacEnds.mismatch
# 44366 bacEnds.orphan
# 133248 bacEnds.pairs
# 478 bacEnds.short
# 816 bacEnds.slop
# 179408 total
# Filter by score and sort by {chrom,chromStart}:
awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
cat bacEnds.{slop,short,long,mismatch,orphan} \
| awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
wc -l *.bed
# 133241 bacEndPairs.bed
# 46057 bacEndPairsBad.bed
# 179298 total
extractPslLoad -noBin ../bacEnds.lifted.psl bacEndPairs.bed \
bacEndPairsBad.bed \
| sorttbl tname tstart | headchg -del \
> bacEnds.load.psl
# load into database
ssh hgwdev
cd /cluster/data/canFam2/bed/bacends/pairs
hgLoadBed canFam2 bacEndPairs bacEndPairs.bed -notItemRgb \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
#Loaded 133241 elements of size 11
# note - this next track isn't pushed to RR, just used for assembly QA
hgLoadBed canFam2 bacEndPairsBad bacEndPairsBad.bed -notItemRgb \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
#Loaded 46057 elements of size 11
hgLoadPsl canFam2 -table=all_bacends bacEnds.load.psl
#load of all_bacends did not go as planned: 748544 record(s), 0 row(s) skipped, 928 warning(s) loading psl.tab
# Diagnose...
echo select \* from all_bacends | hgsql -N canFam2 > /tmp/1
diff psl.tab /tmp/1 | less
# Looks like some rows of psl.tab have negative numbers in the
# qBaseInsert column!
# load BAC end sequences into seq table so alignments may be viewed
# symlink to FASTA sequence file in ncbi directory
mkdir -p /gbdb/canFam2/bacends
ln -s /cluster/data/ncbi/bacends/dog/bacends.1/canFamBacends.fa \
/gbdb/canFam2/bacends/canFamBacends.fa
hgLoadSeq canFam2 /gbdb/canFam2/bacends/canFamBacends.fa
#393408 sequences
# featureBits canFam1 all_bacends
#211644790 bases of 2359845093 (8.969%) in intersection
featureBits canFam2 all_bacends
#218709219 bases of 2384996543 (9.170%) in intersection
# featureBits canFam1 bacEndPairs
#2334084046 bases of 2359845093 (98.908%) in intersection
featureBits canFam2 bacEndPairs
#2353239742 bases of 2384996543 (98.668%) in intersection
# featureBits canFam1 bacEndPairsBad
# 548130287 bases of 2359845093 (23.227%) in intersection
featureBits canFam2 bacEndPairsBad
#534195657 bases of 2384996543 (22.398%) in intersection
# add trackDb entry and html
# Clean up
ssh pk
rm -r /san/sanvol1/scratch/canFam2/bacendsRun
# ACCESSIONS FOR CONTIGS (DONE 2006-01-04 kate)
# Found WGS project entry in Genbank for canFam2:
# LOCUS AAEX02000000 35696 rc DNA linear MAM 13-DEC-2005
# DEFINITION Canis familiaris whole genome shotgun sequencing project.
# Project has a total of 35696 contigs, accessioned as
# AAEX02000001-AAEX02035696, also named cont2.0--cont2.35695
# Poked around after submitting "canis[ORGN] AND WGS[KYWD]" search
# to Entrez Nucleotide
cd /cluster/data/canFam2/broad
grep contig Dog2.0.agp | wc -l
# 35696
grep contig Dog2.0.agp | head -1
# chr01 3000001 3024025 2 W contig_30884 ...
hgsql canFam2 -Ns -e "select frag from chr1_gold limit 1"
# contig_30884
grep contig_0 Dog2.0.agp
# chrUn 57343783 57365839 7071 W contig_0
grep contig_35695 Dog2.0.agp
# chrUn 73893245 73902697 11515 W contig_35695
grep contig_35696 Dog2.0.agp
# nothing
#
# (checked that the contig names and counts match what is in Genbank entries)
# Looked at summary file of contig entries from Genbank by
# clicking the project entry (AAEX02000000), then clicking on
# the WGS contig accession list at the end of the page
# the AGP and our Gap tables frag names contig_* correspond to the
# cont2.* names and AAEX02* names (-1) in the Genbank entries,
# so we don't need to parse out the comments from the summary
# entries.
# create file mapping our contig names to Genbank accessions
cd /cluster/data/canFam2/bed
mkdir -p contigAcc
cd contigAcc
sed -n 's/.*contig_\([0-9][0-9]*\).*/\1/p' \
/cluster/data/canFam2/broad/Dog2.0.agp | \
sort -un | \
awk '{printf ("contig_%d\tAAEX020%05d.1\n", $1, $1+1)}' > contigAcc.tab
wc -l contigAcc.tab
# 35696 contigAcc.tab
# load into database
hgsql canFam2 < $HOME/kent/src/hg/lib/contigAcc.sql
echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
hgsql canFam2
hgsql canFam2 -Nse "SELECT COUNT(*) FROM contigAcc"
# 35696
############################################################################
# QA NOTE (2007-08-16 brooke):
# rhesus nets on canFam2 were requested by Merck (see the following
# thread: http://www.soe.ucsc.edu/pipermail/genome/2007-August/014414.html).
# We may want to consider adding rhesus nets/chains to next dog release.
############################################################################
# BLASTZ BOSTAU2 (DONE - 2006-01-17 - 2006-01-18 - Hiram)
ssh kk
mkdir /cluster/data/canFam2/bed/blastzBosTau2.2006-01-17
cd /cluster/data/canFam2/bed/blastzBosTau2.2006-01-17
cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
BLASTZ_M=50
# dog vs. cow
# TARGET: Dog
SEQ1_DIR=/iscratch/i/canFam2/nib
SEQ1_LEN=/iscratch/i/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow (bosTau2)
SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit
SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=300
BASE=/cluster/data/canFam2/bed/blastzBosTau2.2006-01-17
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
# running 2006-01-17 13:58
    # done 2006-01-18 03:40
# real 823m27.169s
ssh hgwdev
cd /cluster/data/canFam2/bed/blastzBosTau2.2006-01-17
time featureBits canFam2 chainBosTau2Link \
> fb.canFam2.chainBosTau2Link 2>&1 &
# real 32m3.460s
# 1376066425 bases of 2384996543 (57.697%) in intersection
# SWAP CHAINS FROM RN4, BUILD NETS ETC. (DONE 2/25/06 angie)
mkdir /cluster/data/canFam2/bed/blastz.rn4.swap
cd /cluster/data/canFam2/bed/blastz.rn4.swap
doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.canFam2/DEF \
>& do.log
echo "check /cluster/data/canFam2/bed/blastz.rn4.swap/do.log" \
| mail -s "check do.log" $USER
ln -s blastz.rn4.swap /cluster/data/canFam2/bed/blastz.rn4
############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram)
ssh pk
cd /cluster/data/mm8/bed/blastzCanFam2.2006-02-18
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits canFam2 chainMm8Link
# 816262344 bases of 2384996543 (34.225%) in intersection
############################################################################
## BLASTZ swap from panTro2 alignments (DONE 2006-05-04 markd)
# FINISHED (2006-07-21 kate)
ssh hgwdev
mkdir /cluster/data/canFam2/bed/blastz.panTro2.swap
ln -s blastz.panTro2.swap /cluster/data/canFam2/bed/blastz.panTro2
cd /cluster/data/canFam2/bed/blastz.panTro2.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
/cluster/data/panTro2/bed/blastz.canFam2/DEF >& swap.out&
# create the net files
ssh hgwdev
cd /cluster/data/canFam2/bed/blastz.panTro2.swap/axtChain
nice netClass -verbose=0 -noAr noClass.net canFam2 panTro2 canFam2.panTro2.net
nice gzip canFam2.panTro2.net
# Mark stopped here
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -continue=load \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
/cluster/data/panTro2/bed/blastz.canFam2/DEF >& swap.2.out&
##########################################################################
# NSCAN track - (2006-06-01 markd)
cd /cluster/data/canFam2/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
wget -nv http://genes.cs.wustl.edu/jeltje/canFam2/canFam2.nscan.gtf
gzip canFam2.nscan.gtf
wget -nv -r -np http://genes.cs.wustl.edu/jeltje/canFam2/chr_ptx/
cat genes.cs.wustl.edu/jeltje/canFam2/chr_ptx/*.fa >canFam2.nscan.pep.fa
gzip canFam2.nscan.pep.fa
rm -rf genes.cs.wustl.edu
chmod a-w *.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt canFam2 nscanGene canFam2.nscan.gtf.gz
# add .a suffix to match transcript id
WRONG, see below: hgPepPred -suffix=.a canFam2 generic nscanPep canFam2.nscan.pep.fa.gz
rm *.tab
# update trackDb; need a canFam2-specific page to describe informants
dog/canFam2/nscanGene.html
dog/canFam2/trackDb.ra
# 2006-07-24 markd: reloaded without -suffix
hgPepPred canFam2 generic nscanPep canFam2.nscan.pep.fa.gz
##########################################################################
# MAKE Human Proteins (hg18) track (DONE braney 2006-08-17)
ssh pk
destDir=/san/sanvol1/scratch/canFam2/blastDb
mkdir -p /cluster/data/canFam2/bed/tblastn.hg18KG
cd /cluster/data/canFam2/bed/tblastn.hg18KG
find $destDir -name "*.nsq" | sed "s/\.nsq//" > target.lst
mkdir kgfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(500000/`wc target.lst | awk "{print \\\$1}"`\)
# 36727/(500000/6149) = 451.668646
split -l 452 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
rm -rf /cluster/bluearc/canFam2/bed/tblastn.hg18KG/blastOut
mkdir -p /cluster/bluearc/canFam2/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/canFam2/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
# Generate blastGsub: parasol gensub2 template pairing each blast DB chunk
# (path1, from target.lst) with each split protein query file (path2, from
# kg.lst); each job must produce a non-empty per-query PSL in blastOut/.
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
# Generate blastSome: the per-job cluster script. Arguments:
#   $1 - blast database chunk;  $2 - protein query FASTA;  $3 - output PSL.
# It runs tblastn, stepping the e-value cutoff from loose (0.01) to strict
# (1E-11) until a blastall invocation succeeds (works around jobs that blow
# up at looser cutoffs), converts the blast report to PSL, then lifts the
# result through protein -> subChr -> chromosome coordinates into $3.
# Exits 0 on success; cleans up /tmp intermediates and exits 1 on failure.
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast2211x86_64/bin/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2
liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/canFam2/jkStuff/subChr.lft carry $f.3
liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/canFam2/jkStuff/liftAll.lft carry $f.4
mv $3.tmp $3
rm -f $f.1 $f.2
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
exit
chmod +x blastSome
gensub2 target.lst kg.lst blastGsub blastSpec
para create blastSpec
para push
# Completed: 504218 of 504218 jobs
# CPU time in finished jobs: 28268903s 471148.38m 7852.47h 327.19d 0.896 y
# IO & Wait Time: 5565118s 92751.97m 1545.87h 64.41d 0.176 y
# Average job time: 67s 1.12m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 710s 11.83m 0.20h 0.01d
# Submission to last job: 91362s 1522.70m 25.38h 1.06d
ssh kki
cd /cluster/data/canFam2/bed/tblastn.hg18KG
tcsh
# Generate chainGsub: gensub2 template pairing each blastOut/kg?? directory
# (path1, from chain.lst) with each chromosome (path2, from chrom.lst).
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1) $(path2)
#ENDLOOP
'_EOF_'
# Generate chainSome: for one blastOut/kg?? directory ($1) and one chromosome
# ($2), concatenate that chromosome's per-query PSLs and chain them with
# simpleChain into ../c.$2.<dirname>.psl.
# NOTE: the original doc had the command line wrapped mid-word
# ("bas / ename $1"), which would not run; rejoined to `basename $1` here.
cat << '_EOF_' > chainSome
(cd $1; cat q.$2_*.psl | /cluster/home/braney/bin/i386/simpleChain -chrom=$2 -prot -outPsl -maxGap=250000 stdin ../c.$2.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst ../../chrom.lst chainGsub chainSpec
para create chainSpec
para push
# Completed: 3335 of 3335 jobs
# CPU time in finished jobs: 1056945s 17615.76m 293.60h 12.23d 0.034 y
# IO & Wait Time: 75548s 1259.13m 20.99h 0.87d 0.002 y
# Average job time: 340s 5.66m 0.09h 0.00d
# Longest finished job: 25716s 428.60m 7.14h 0.30d
# Submission to last job: 61450s 1024.17m 17.07h 0.71d
ssh kkstore04
cd /cluster/data/canFam2/bed/tblastn.hg18KG/blastOut
for i in kg??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.*.$i.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -k 14,14 -k 16,16n -k 18,18n u.*.psl m60* | uniq > /cluster/data/canFam2/bed/tblastn.hg18KG/blastHg18KG.psl
cd ..
wc blastHg18KG.psl
# 62741 1317561 12491782 blastHg18KG.psl
# hg17 64343 1351203 10913814 blastHg18KG.psl
pslUniq blastHg18KG.psl stdout | wc
# 36272 761712 8427975
# hg17 18827 395367 2992653
cat kgfa/*fa | grep ">" | wc
# 303516 303516 4739313
# hg17 309494 309494 4810327
ssh hgwdev
cd /cluster/data/canFam2/bed/tblastn.hg18KG
hgLoadPsl canFam2 blastHg18KG.psl
featureBits canFam2 blastHg18KG
# 32565727 bases of 2384996543 (1.365%) in intersection
# hg17 35781547 bases of 2384996543 (1.500%) in intersection
exit
# back to kksilo
rm -rf blastOut
# End tblastn
#########################################################################
# BLASTZ/CHAIN/NET FELCAT3 (Done Nov 16 2006 heather)
# working in /cluster/data/felCat3 because /cluster/data/canFam2 is 94% full
mkdir /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16
ln -s /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16 /cluster/data/canFam2/bed/blastz.felCat3
cd /cluster/data/felCat3/bed/blastz.canFam2.2006-11-16
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/hg/canFam2/nib
SEQ1_LEN=/scratch/hg/canFam2/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=200000000
SEQ1_LAP=0
# QUERY: Cat felCat3
SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit
SEQ2_LEN=/san/sanvol1/scratch/felCat3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/felCat3/bed/blastz.canFam2.2006-11-16
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk
-chainMinScore=3000 -chainLinearGap=medium
-blastzOutRoot /cluster/bluearc/felCat3/blastz.canFam2 >& do.log &
tail -f do.log
nice featureBits -chrom=chr1 canFam2 chainFelCat3Link
# 65223887 bases of 121613690 (53.632%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET DOG AND HORSE (DONE 2/19/07 Fan)
ssh kkstore05
mkdir /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18
cd /cluster/data/equCab1/bed/blastz.canFam2.2007-02-18
cat << '_EOF_' > DEF
# Horse vs. Dog
BLASTZ_M=50
# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500
SEQ1_CHUNK=30000000
SEQ1_LAP=10000
# QUERY: Dog canFam2
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_LEN=/cluster/data/canFam2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/equCab1/bed/blastz.canFam2.2007-02-18
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub pk \
-blastzOutRoot /cluster/bluearc/equCab1CanFam2 >& do.log &
tail -f do.log
# 6 jobs failed.
# Robert re-ran those 6 jobs. (2/22/07)
doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \
-continue cat -bigClusterHub pk \
-blastzOutRoot /cluster/bluearc/equCab1CanFam2 >& do2.log &
tail -f do2.log
ln -s blastz.canFam2.2007-02-18 /cluster/data/equCab1/bed/blastz.canFam2
ssh hgwdev
nice featureBits equCab1 -chrom=chrI chainCanFam2Link
# 128747357 bases of 177498097 (72.534%) in intersection
bash
time nice -n 19 featureBits equCab1 chainCanFam2Link \
> fb.equCab1.chainCanFam2Link.txt 2>&1
# 1717664968 bases of 2421923695 (70.922%) in intersection
ssh kkstore04
mkdir /cluster/data/canFam2/bed/blastz.equCab1.swap
cd /cluster/data/canFam2/bed/blastz.equCab1.swap
bash
time doBlastzChainNet.pl \
/cluster/data/equCab1/bed/blastz.canFam2.2007-02-18/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 &
# real 115m55.880s
cat *.txt
# 1673177547 bases of 2384996543 (70.154%) in intersection
#########################################################################
#########################################################################
# BLASTZ OPOSSUM monDom4 (DONE 2007-05-27 braney)
ssh kolossus
mkdir /cluster/data/canFam2/bed/blastz.monDom4
cd /cluster/data/canFam2/bed/blastz.monDom4
cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Dog (canFam2)
SEQ1_DIR=/san/sanvol1/scratch/canFam2/nib
SEQ1_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
#SEQ2_DIR=/iscratch/i/monDom4/monDom4RMExtra.2bit
SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
#SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes
SEQ2_LEN=/cluster/data/monDom4/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/canFam2/bed/blastz.monDom4
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
>& blastz.out 2>&1 &
#########################################################################
# Blastz Marmoset calJac1 (DONE - 2007-11-28 - 30 - Hiram)
ssh kkstore04
screen # use screen to control this job
# managing disk space on full filesystem
mkdir -p /cluster/store8/canFam2/bed/blastzCalJac1.2007-11-28
ln -s /cluster/store8/canFam2/bed/blastzCalJac1.2007-11-28 \
/cluster/data/canFam2/bed
cd /cluster/data/canFam2/bed/blastzCalJac1.2007-11-28
cat << '_EOF_' > DEF
# Dog vs. Marmoset
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/cluster/data/canFam2/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Marmoset calJac1, largest contig 2,551,648, 49,724 contigs
SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=2000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/cluster/data/canFam2/bed/blastzCalJac1.2007-11-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
-chainLinearGap=medium -bigClusterHub=kk -verbose=2 \
> do.log 2>&1 &
# real 864m49.290s
# Completed: 266999 of 267000 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 17058860s 284314.33m 4738.57h 197.44d 0.541 y
# IO & Wait Time: 1031855s 17197.59m 286.63h 11.94d 0.033 y
# Average job time: 68s 1.13m 0.02h 0.00d
# Longest finished job: 1302s 21.70m 0.36h 0.02d
# Submission to last job: 47998s 799.97m 13.33h 0.56d
# one job ran out of memory on the kk nodes. It was finished on kolossus
/cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
/scratch/data/canFam2/nib/chr21.nib:chr21:20000000-40010000 qParts/part238.lst \
../DEF \
../psl/chr21.nib:chr21:20000000-40010000/chr21.nib:chr21:20000000-40010000_part238.lst.psl
# continuing
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
-continue=cat -chainLinearGap=medium -bigClusterHub=kk -verbose=2 \
> cat.log 2>&1 &
# had a problem with files missing from /scratch/data/ on
# the Iservers - rsync that directory /iscratch/data/ full on
# kkr1u00 from /cluster/bluearc/scratch/data/ and push it to
# the other Iservers
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
-continue=chainMerge -chainLinearGap=medium -bigClusterHub=kk \
-verbose=2 > chainMerge.log 2>&1 &
# real 79m39.909s
cat fb.canFam2.chainCalJac1Link.txt
# 1369690756 bases of 2384996543 (57.429%) in intersection
mkdir /cluster/data/calJac1/bed/blastz.canFam2.swap
cd /cluster/data/calJac1/bed/blastz.canFam2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/canFam2/bed/blastzCalJac1.2007-11-28/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > swap.log 2>&1 &
# encountered difficulties with /scratch/data/ on kolossus
# had to finish the netChains.csh script manually, then continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/canFam2/bed/blastzCalJac1.2007-11-28/DEF \
-continue=load -chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > load.log 2>&1 &
# real 56m44.375s
cat fb.calJac1.chainCanFam2Link.txt
# 1451345669 bases of 2929139385 (49.549%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2008-03-11 - Hiram)
ssh kkstore04
screen # use a screen to manage this multi-day job
mkdir /cluster/data/canFam2/bed/blastzBosTau4.2008-03-11
cd /cluster/data/canFam2/bed/blastzBosTau4.2008-03-11
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/cluster/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau4
SEQ2_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit
SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/canFam2/bed/blastzBosTau4.2008-03-11
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
`pwd`/DEF -verbose=2 \
-bigClusterHub=memk -chainMinScore=3000 -chainLinearGap=medium \
-syntenicNet > do.log 2>&1
# the pk became free as this job had 3 days to go. So, chill out
# the batch on memk, let it finish its running jobs, then finish
# the batch manually on pk. Continuing:
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
`pwd`/DEF -verbose=2 -smallClusterHub=memk \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=cat -syntenicNet > cat.log 2>&1
# real 175m41.674s
cat fb.canFam2.chainBosTau4Link.txt
# 1367017426 bases of 2384996543 (57.317%) in intersection
mkdir /cluster/data/bosTau4/bed/blastz.canFam2.swap
cd /cluster/data/bosTau4/bed/blastz.canFam2.swap
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
/cluster/data/canFam2/bed/blastzBosTau4.2008-03-11/DEF \
-verbose=2 -smallClusterHub=memk \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet > swap.log 2>&1
# real 388m46.213s
cat fb.bosTau4.chainCanFam2Link.txt
# 1447088347 bases of 2731830700 (52.971%) in intersection
#############################################################################
# BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-18 - larrym)
ssh kkstore04
screen # use screen to control this multi-day job
mkdir /cluster/data/canFam2/bed/blastz.equCab2.2008-04-18
cd /cluster/data/canFam2/bed/blastz.equCab2.2008-04-18
cat << '_EOF_' > DEF
# Dog vs. Horse
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/cluster/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/canFam2/bed/blastz.equCab2.2008-04-18
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab2/blastz.canFam2 >& do.log &
# 0.117u 0.073s 9:20:00.22 0.0% 0+0k 0+0io 1pf+0w
# six parasol jobs failed; retried with:
para -retries=5 push
# disabled bad servers:
#   1 kkr10u31.kilokluster.ucsc.edu
#   1 kkr11u16.kilokluster.ucsc.edu
#   1 kkr12u29.kilokluster.ucsc.edu
pushd /cluster/bluearc/equCab2/blastz.canFam2/psl
find . -name '*.psl' | wc
#   64625   64625 6657995
# This is the correct number of psl files, so continue with the cat step:
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -continue=cat \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab2/blastz.canFam2 >>& do.log &
# 0.212u 0.173s 2:49:45.35 0.0% 0+0k 0+0io 62pf+0w
ln -s blastz.equCab2.2008-04-18 /cluster/data/canFam2/bed/blastz.equCab2
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
# lastz Poodle canFamPoodle1 (DONE - 2009-06-06,22 - Hiram)
mkdir /hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06
cd /hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06
cat << '_EOF_' > DEF
# Tasha boxer dog vs Shadow poodle
# parameters for very close alignment from human-chimp example
BLASTZ_M=254
BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_K=4500
BLASTZ_Y=15000
BLASTZ_T=2
# TARGET: Tasha, canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/scratch/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Shadow poodle, canFamPoodle1
SEQ2_DIR=/scratch/data/canFamPoodle1/canFamPoodle1.2bit
SEQ2_LEN=/scratch/data/canFamPoodle1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LAP=0
SEQ2_LIMIT=800
BASE=/hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# Full pipeline run: memk cluster for the small steps, pk for the big
# lastz batch.  NOTE(review): -noDbNameCheck presumably because target
# (canFam2) and query (canFamPoodle1) names share the canFam prefix --
# confirm against doBlastzChainNet.pl usage; -noLoadChainSplit loads
# the chain track as one table instead of per-chromosome tables.
time nice -n +19 doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-noDbNameCheck -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium > do.log 2>&1
# real 8250m12.188s
# Completed: 374825 of 374825 jobs
# CPU time in finished jobs: 187350504s 3122508.40m 52041.81h 2168.41d 5.941 y
# IO & Wait Time: 4127960s 68799.33m 1146.66h 47.78d 0.131 y
# Average job time: 511s 8.51m 0.14h 0.01d
# Longest finished job: 2339s 38.98m 0.65h 0.03d
# Submission to last job: 494836s 8247.27m 137.45h 5.73d
# the lastz run thought it failed, but it didn't, continuing:
time nice -n +19 doBlastzChainNet.pl \
-verbose=2 \
`pwd`/DEF \
-continue=cat -noDbNameCheck -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium > cat.log 2>&1
# real 4841m48.581s <- this is actually a fb time on cavPor3
# this finish step was less than an hour
cat fb.canFam2.chainCanFamPoodle1Link.txt
# 1405528799 bases of 2384996543 (58.932%) in intersection
# Swap the alignments to build the canFam2 chain/net tracks on the
# poodle assembly (canFamPoodle1 becomes the target).
mkdir /hive/data/genomes/canFamPoodle1/bed/blastz.canFam2.swap
cd /hive/data/genomes/canFamPoodle1/bed/blastz.canFam2.swap
time nice -n +19 doBlastzChainNet.pl \
-verbose=2 \
/hive/data/genomes/canFam2/bed/lastzCanFamPoodle1.2009-06-06/DEF \
-swap -noDbNameCheck -noLoadChainSplit \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1
# real 8585m58.789s
# featureBits coverage summary for the swapped chains:
cat fb.canFamPoodle1.chainCanFam2Link.txt
# 1377478896 bases of 1517497798 (90.773%) in intersection
############################################################################
# Re-Run equCab2 alignment (DONE - 2009-06-29 - Hiram)
# Re-run of the dog (canFam2) vs. horse (equCab2) alignment.  Same
# parameters as the 2008-04-18 run above, but the chrUn scaffold files
# (SEQ2_CTGDIR/SEQ2_CTGLEN) now live under /hive instead of /san, and
# the big-cluster hub is swarm instead of pk.
mkdir /hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29
cd /hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29
cat << '_EOF_' > DEF
# Dog vs. Horse
BLASTZ_M=50
# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/nib
SEQ1_LEN=/scratch/data/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 338m57.973s
cat fb.canFam2.chainEquCab2Link.txt
# 1676663178 bases of 2384996543 (70.300%) in intersection
# this is identical to what went down before ?
# Swap to build the canFam2 chain/net tracks on the horse assembly:
mkdir /hive/data/genomes/equCab2/bed/blastz.canFam2.swap
cd /hive/data/genomes/equCab2/bed/blastz.canFam2.swap
time doBlastzChainNet.pl \
/hive/data/genomes/canFam2/bed/lastzEquCab2.2009-06-29/DEF \
-swap -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 286m51.658s
cat fb.equCab2.chainCanFam2Link.txt
# 1721407500 bases of 2428790173 (70.875%) in intersection
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
# BLASTZ FOR ZEBRAFISH (danRer5) (DONE, 2009-08-07, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
    mkdir /hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07
    cd /hive/data/genomes/canFam2/bed
    ln -s blastz.danRer5.2009-08-07 blastz.danRer5
    cd /hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07

    cat << '_EOF_' > DEF
# dog (canFam2) vs zebrafish (danRer5)
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50

# TARGET: Dog canFam2
SEQ1_DIR=/scratch/data/canFam2/canFam2.2bit
SEQ1_LEN=/hive/data/genomes/canFam2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - zebrafish (danRer5)
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
SEQ2_LEN=/hive/data/genomes/danRer5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    chmod +x DEF
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    `pwd`/DEF \
    -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
    -chainMinScore=5000 -chainLinearGap=loose \
    >& doBlastz.log &
# 0.806u 0.488s 3:58:52.14 0.0% 0+0k 0+0io 0pf+0w

cat fb.canFam2.chainDanRer5Link.txt
# 29234486 bases of 2384996543 (1.226%) in intersection

# DO BLASTZ SWAP TO CREATE CANFAM2 CHAINS, NETS, AXNET, MAFNET AND
# ALIGNMENT DOWNLOADS ON DANRER5 - documented in danRer5.txt