src/hg/makeDb/doc/galGal3.txt 1.28

1.28 2009/07/02 22:21:43 hiram
Rerun all the equCab2 chains and nets to clean up problems
Index: src/hg/makeDb/doc/galGal3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/galGal3.txt,v
retrieving revision 1.27
retrieving revision 1.28
diff -b -B -U 1000000 -r1.27 -r1.28
--- src/hg/makeDb/doc/galGal3.txt	13 Mar 2009 23:55:40 -0000	1.27
+++ src/hg/makeDb/doc/galGal3.txt	2 Jul 2009 22:21:43 -0000	1.28
@@ -1,2676 +1,2727 @@
 
 # for emacs: -*- mode: sh; -*-
 
 
 # This file describes how we made the browser database on the 
 # Chicken (Gallus gallus) May 2006 release.
 
 #  NOTE:  this doc may have genePred loads that fail to include
 #  the bin column.  Please correct that for the next build by adding
 #  a bin column when you make any of these tables:
 #
 #  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
 #  +-------------+---------------------------------+
 #  | tableName   | type                            |
 #  +-------------+---------------------------------+
 #  | refGene     | genePred refPep refMrna         |
 #  | xenoRefGene | genePred xenoRefPep xenoRefMrna |
 #  | genscan     | genePred genscanPep             |
 #  +-------------+---------------------------------+
 
 
 
 #########################################################################
 # CREATE BUILD DIRECTORY (DONE 5/14/06 angie)
     ssh kkstore03
     mkdir /cluster/store6/galGal3
     ln -s /cluster/store6/galGal3 /cluster/data/
 
 
 #########################################################################
 # DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 5/14/06 angie)
     mkdir /cluster/data/galGal3/M
     cd /cluster/data/galGal3/M
     # go to http://www.ncbi.nih.gov/ and search Nucleotide for 
     # "gallus mitochondrion genome".  That shows the gi number:
     # 5834843 
     # Use that number in the entrez linking interface to get fasta:
     wget -O chrM.fa \
       'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5834843&dopt=FASTA'
     # Edit chrM.fa: make sure the long fancy header line says it's the 
     # Gallus gallus mitochondrion complete genome, and then replace the 
     # header line with just ">chrM".
 
 
 #########################################################################
 # DOWNLOAD, UNPACK AND CHECK CHROM FASTA & AGP (DONE 5/14/06 angie)
     mkdir /cluster/data/galGal3/downloads
     cd /cluster/data/galGal3/downloads
     wget ftp://genome.wustl.edu/pub/user/lhillier/private/chicken_060412.tar.gz
     tar xvzf chicken_060412.tar.gz
     cp /dev/null ../chrom.lst
     foreach agp (*.agp)
       set chr = $agp:r
       echo $chr
       set fa = $chr.fa
       if (! -e $fa) then
         echo "*** No fasta for $agp"
         break
       endif
       set c = `echo $chr | sed -e 's/^chr//; s/_random$//;'`
       if (! -d ../$c) mkdir ../$c
       mv $agp $fa ../$c
     end
     cd ..
     # checkAgpAndFa prints out way too much info -- keep the end/stderr only:
     ls -1d ?{,?} E* | sed -e 's@/$@@' > chrom.lst
     foreach c (`cat chrom.lst`)
       foreach agp ($c/chr$c{,_random}.agp)
         if (-e $agp) then
           set fa = $agp:r.fa
           echo checking consistency of $agp and $fa
           checkAgpAndFa $agp $fa | tail -1
         endif
       end
     end
     faSize */chr*.fa
 #Total size: mean 19306674.4 sd 39157213.7 min 1028 (chr32) max 200994015 (chr1) median 2031799
 
 
 #########################################################################
 # BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 5/14/06 angie)
     ssh kkstore03
     cd /cluster/data/galGal3
     foreach c (`cat chrom.lst`)
       foreach agp ($c/chr$c{,_random}.agp)
         if (-e $agp) then
           set fa = $agp:r.fa
           echo splitting $agp and $fa
           cp -p $agp $agp.bak
           cp -p $fa $fa.bak
           splitFaIntoContigs $agp $fa . -nSize=5000000
         endif
       end
     end
     # splitFaIntoContigs makes new dirs for _randoms.  Move their contents 
     # back into the main chrom dirs and get rid of the _random dirs.
     foreach d (*_random)
       set base = `echo $d | sed -e 's/_random$//'`
       mv $d/lift/oOut.lst $base/lift/rOut.lst
       mv $d/lift/ordered.lft $base/lift/random.lft
       mv $d/lift/ordered.lst $base/lift/random.lst
       rmdir $d/lift
       mv $d/* $base
       rmdir $d
     end
     # Un/ has Un_random but for some reason o* files were created.  Rename:
     foreach f (Un/lift/*)
       set g = `echo $f | sed -e 's/ordered/random/; s@/o@/r@;'`
       mv $f $g
     end
     # Make a "pseudo-contig" for processing chrM too:
     mkdir M/chrM_1
     sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
     mkdir M/lift
     echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
     echo "chrM_1" > M/lift/ordered.lst
     set mSize = `faSize M/chrM.fa | awk '{print $1;}'`
     echo "0	M/chrM_1	$mSize	chrM	$mSize" > M/lift/ordered.lft
 
 
 #########################################################################
 # REPEAT MASKING (DONE 5/15/06 angie)
 
     # RepeatMasker version and library version:
 #    March 20 2006 (open-3-1-5) version of RepeatMasker
     grep RELEASE /cluster/bluearc/RepeatMasker060320/Libraries/RepeatMaskerLib.embl
 #CC   RELEASE 20060315;                                            *
 
     #- Split contigs into 500kb chunks, at gaps if possible:
     ssh kkstore03
     cd /cluster/data/galGal3
     foreach c (`cat chrom.lst`)
       foreach d ($c/chr${c}*_?{,?})
         cd $d
         echo "splitting $d"
         set contig = $d:t
         faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
             -minGapSize=100
         cd ../..
       end
     end
 
     #- Make the run directory and job list:
     cd /cluster/data/galGal3
     mkdir jkStuff
     cat << '_EOF_' > jkStuff/RMChicken
 #!/bin/csh -fe
 
 cd $1
 pushd .
 /bin/mkdir -p /tmp/galGal3/$2
 /bin/cp $2 /tmp/galGal3/$2/
 cd /tmp/galGal3/$2
 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec chicken $2
 popd
 /bin/cp /tmp/galGal3/$2/$2.out ./
 if (-e /tmp/galGal3/$2/$2.align) /bin/cp /tmp/galGal3/$2/$2.align ./
 if (-e /tmp/galGal3/$2/$2.tbl) /bin/cp /tmp/galGal3/$2/$2.tbl ./
 if (-e /tmp/galGal3/$2/$2.cat) /bin/cp /tmp/galGal3/$2/$2.cat ./
 /bin/rm -fr /tmp/galGal3/$2/*
 /bin/rmdir --ignore-fail-on-non-empty /tmp/galGal3/$2
 /bin/rmdir --ignore-fail-on-non-empty /tmp/galGal3
 '_EOF_'
     # << emacs
     chmod +x jkStuff/RMChicken
     mkdir RMRun
     cp /dev/null RMRun/RMJobs
     foreach c (`cat chrom.lst`)
       foreach d ($c/chr${c}{,_random}_?{,?})
           set ctg = $d:t
           foreach f ( $d/${ctg}_?{,?}.fa )
             set f = $f:t
             echo /cluster/data/galGal3/jkStuff/RMChicken \
                  /cluster/data/galGal3/$d $f \
                '{'check out line+ /cluster/data/galGal3/$d/$f.out'}' \
               >> RMRun/RMJobs
           end
       end
     end
 
     #- Do the run
     ssh pk
     cd /cluster/data/galGal3/RMRun
     para make RMJobs
     para time
 #Completed: 2492 of 2492 jobs
 #CPU time in finished jobs:    2613579s   43559.64m   725.99h   30.25d  0.083 y
 #IO & Wait Time:                 17626s     293.77m     4.90h    0.20d  0.001 y
 #Average job time:                1056s      17.60m     0.29h    0.01d
 #Longest finished job:            1335s      22.25m     0.37h    0.02d
 #Submission to last job:          8741s     145.68m     2.43h    0.10d
 
     #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
     ssh kkstore03
     cd /cluster/data/galGal3
     foreach d (*/chr*_?{,?})
       set contig = $d:t
       echo $contig
       liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
         > /dev/null
     end
 
     #- Lift pseudo-contigs to chromosome level
     foreach c (`cat chrom.lst`)
       echo lifting $c
       cd $c
       if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
         liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
         > /dev/null
       endif
       if (-e lift/random.lft && ! -z lift/random.lft) then
         liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \
         > /dev/null
       endif
       cd ..
     end
 
     #- Load the .out files into the database with:
     ssh hgwdev
     cd /cluster/data/galGal3
     hgLoadOut galGal3 */chr*.fa.out
 
     # Compare coverage to previous release:
     featureBits galGal3 rmsk
 #102214417 bases of 1042591351 (9.804%) in intersection
     # galGal2 coverage:
 #104249260 bases of 1054197620 (9.889%) in intersection
     # Uh-oh -- in the past, a drop in coverage has meant an RM problem.
     # However, spot-checking the per-chrom coverage of galGal2 vs. galGal3,
     # it seems like many small or random chroms have simply had a lot
     # of repetitive sequence cut from them (significant drop in size as well 
     # as drop in coverage), while for many of the large chroms, the coverage 
     # has gone up.  Some chroms grew but the rmsk coverage did not keep up.
     # chrUn is quite a bit smaller (121M --> 53M).  I saved the per-chrom 
     # coverage figures in /cluster/data/galGal3/RMCompare/ .
 #galGal2:chr1    24219568        183744490       13.181
 #galGal3:chr1    25804441        195192300       13.220
 #galGal2:chr1_random     186317  1261352 14.771
 #galGal3:chr1_random     12930   213918  6.044
 #galGal2:chr10   807202  18954178        4.259
 #galGal3:chr10   858376  20484937        4.190
 #galGal2:chr10_random    525320  3515812 14.942
 #galGal3:chr10_random    644     13679   4.708
 #galGal2:chr2    16579726        143798269       11.530	[exception.
 #galGal3:chr2    17182793        150358687       11.428	[
 #galGal2:chr2_random     4670    53846   8.673		[
 #galGal3:chr2_random     16589   127706  12.990		[
 #galGal2:chr18   461248  8797585  5.243
 #galGal3:chr18   507614  10513347 4.828
 
 #galGal2:chrE22C19W28    2628    47202   5.568
 #galGal2:chrE50C23       690     10171   6.784
 #galGal3:chrE22C19W28_E50C23     42486   822662  5.164
 
 #galGal2:chrUn   18697113        121198700       15.427
 #galGal3:chrUn_random    11075578        53400422        20.741
 #galGal2:chrW    530067  4135691 12.817
 #galGal3:chrW    133707  233885  57.168
 #galGal2:chrW_random     109531  229903  47.642
 #galGal3:chrW_random     311315  625108  49.802
 #galGal2:chrZ    4629721 30832492        15.016
 #galGal3:chrZ    10599358        67536383        15.694
 #galGal2:chrZ_random     2172389 14348615 15.140
 #galGal3:chrZ_random     115568  309836   37.300
     # So I'm inclined to think that it's most likely not a RepeatMasker bug
     # but a change in the underlying assembly.  Something to run by LaDeana 
     # after reading the release notes...
 
 
 #########################################################################
 # MAKE LIFTALL.LFT (DONE 5/15/06 angie)
     ssh kkstore03
     cd /cluster/data/galGal3
     cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft
 
 
 #########################################################################
 # CREATING DATABASE (DONE 5/15/06 angie)
     ssh hgwdev
     echo 'create database galGal3' | hgsql ''
     # old hgwdev 5/15:
     # We are awfully tight here but at least not out of space:
     df -h /var/lib/mysql
 #/dev/sdc1             1.8T  1.6T   65G  97% /var/lib/mysql
     # New hgwdev 5/18:
     df -h /data/mysql
 #/dev/sdc1             1.8T  685G  1.1T  40% /data/mysql
     # Copy over the grp table from previous release:
     echo "create table grp (PRIMARY KEY(NAME)) select * from galGal2.grp" \
       | hgsql galGal3
 
 
 #########################################################################
 # GOLD AND GAP TRACKS (DONE 5/15/06 angie)
     ssh hgwdev
     cd /cluster/data/galGal3
     hgGoldGapGl -noGl -chromLst=chrom.lst galGal3 /cluster/data/galGal3 .
     # featureBits complains if there's no chrM_gap, so make one:
     # echo "create table chrM_gap like chr1_gap" | hgsql galGal3
     # oops, that won't work until v4.1, so do this for the time being:
     echo "create table chrM_gap select * from chr1_gap where 0=1" \
     | hgsql galGal3
 
 
 #########################################################################
 # SIMPLE REPEATS (TRF) (DONE 5/15/06 angie)
     # TRF runs pretty quickly now... it takes a few hours total runtime, 
     # so instead of binrsyncing and para-running, just do this on the
     # local fileserver
     ssh kkstore03
     mkdir /cluster/data/galGal3/bed
     mkdir /cluster/data/galGal3/bed/simpleRepeat
     cd /cluster/data/galGal3/bed/simpleRepeat
     mkdir trf
     cp /dev/null jobs.csh
     foreach d (/cluster/data/galGal3/*/chr*_?{,?})
       set ctg = $d:t
       foreach f ($d/${ctg}.fa)
         set fout = $f:t:r.bed
         echo $fout
         echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
         >> jobs.csh
       end
     end
     csh -efx jobs.csh >&! jobs.log &
     # check on this with
     tail -f jobs.log
     wc -l jobs.csh
 #259 jobs.csh
     ls -1 trf | wc -l
 #259
     endsInLf trf/*
 #trf/chr12_random_1.bed zero length
 #trf/chr13_random_1.bed zero length
     # That's OK.
     # When job is done do:
     liftUp simpleRepeat.bed /cluster/data/galGal3/jkStuff/liftAll.lft warn \
       trf/*.bed
 
     # Load into the database:
     ssh hgwdev
     hgLoadBed galGal3 simpleRepeat \
       /cluster/data/galGal3/bed/simpleRepeat/simpleRepeat.bed \
       -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
     featureBits galGal3 simpleRepeat
 #9650062 bases of 1042591351 (0.926%) in intersection
     # galGal2 coverage:
 #8434365 bases of 1054197620 (0.800%) in intersection
 
 ###########################################################################
 # CREATE MICROSAT TRACK (done 2006-7-5 JK)
      ssh hgwdev
      cd /cluster/data/galGal3/bed
      mkdir microsat
      cd microsat
      awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed 
     /cluster/bin/i386/hgLoadBed galGal3 microsat microsat.bed
 
 
 #########################################################################
 # PROCESS SIMPLE REPEATS INTO MASK (DONE 5/15/06 angie)
     # After the simpleRepeats track has been built, make a filtered version 
     # of the trf output: keep trf's with period <= 12:
     ssh kkstore03
     cd /cluster/data/galGal3/bed/simpleRepeat
     mkdir -p trfMask
     foreach f (trf/chr*.bed)
       awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
     end
     # Lift up filtered trf output to chrom coords as well:
     cd /cluster/data/galGal3
     mkdir bed/simpleRepeat/trfMaskChrom
     foreach c (`cat chrom.lst`)
       if (-e $c/lift/ordered.lst) then
         perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/ordered.lst > $c/lift/oTrf.lst
         liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
           jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
       endif
       if (-e $c/lift/random.lst) then
         perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
            $c/lift/random.lst > $c/lift/rTrf.lst
         liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
           jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
       endif
     end
     # Here's the coverage for the filtered TRF:
     ssh hgwdev
     cat /cluster/data/galGal3/bed/simpleRepeat/trfMaskChrom/*.bed \
       > /tmp/filtTrf.bed
     featureBits galGal3 /tmp/filtTrf.bed
 #4110365 bases of 1042591351 (0.394%) in intersection
     # galGal2 coverage:
 #4510381 bases of 1054197620 (0.428%) in intersection
 
 
 #########################################################################
 # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 5/15/06 angie)
     # Note: just to keep things consistent, redid chr1 and chr2 2/26 with 
     # the ProcessRepeats-only rerun results (only masking changes were 
     # 5bp in chr1 and 3bp in chr2)
     ssh kkstore03
     cd /cluster/data/galGal3
     # Soft-mask (lower-case) the contig and chr .fa's, 
     # then make hard-masked versions from the soft-masked.  
     set trfCtg=bed/simpleRepeat/trfMask
     set trfChr=bed/simpleRepeat/trfMaskChrom
     foreach f (*/chr*.fa)
       echo "repeat- and trf-masking $f"
       maskOutFa -soft $f $f.out $f
       set chr = $f:t:r
       maskOutFa -softAdd $f $trfChr/$chr.bed $f
       echo "hard-masking $f"
       maskOutFa $f hard $f.masked
     end
     foreach c (`cat chrom.lst`)
       echo "repeat- and trf-masking contigs of chr$c, chr${c}_random"
       foreach d ($c/chr*_?{,?})
         set ctg=$d:t
         set f=$d/$ctg.fa
         maskOutFa -soft $f $f.out $f
         maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
         maskOutFa $f hard $f.masked
       end
     end
     # Make 2bit for blat/browser usage:
     faToTwoBit */chr*.fa galGal3.2bit
     # Make soft-masked nib for blastz:
     mkdir nib
     foreach f (*/chr*.fa)
       faToNib -softMask $f nib/$f:t:r.nib
     end
 
 
 #########################################################################
 # MAKE CHROMINFO TABLE WITH 2BIT (DONE 5/15/06 angie)
     ssh kkstore03
     cd /cluster/data/galGal3
     mkdir bed/chromInfo
     twoBitInfo galGal3.2bit stdout \
     | awk '{print $1 "\t" $2 "\t/gbdb/galGal3/galGal3.2bit";}' \
       > bed/chromInfo/chromInfo.tab
 
     # Link to 2bit from /gbdb/galGal3/:
     ssh hgwdev
     mkdir /gbdb/galGal3
     ln -s /cluster/data/galGal3/galGal3.2bit /gbdb/galGal3/
     # Load /gbdb/galGal3/galGal3.2bit paths into database and save size info.
     # Make a special chromInfo.sql with large index size so that 
     # chrE22C19W28_E50C23 and chrE22C19W28_E50C23_random don't get collapsed.
     cd /cluster/data/galGal3/bed/chromInfo
     perl -wpe 's/chrom\([0-9]+/chrom\(21/' \
       $HOME/kent/src/hg/lib/chromInfo.sql > chromInfo.sql
     hgLoadSqlTab galGal3 chromInfo chromInfo.sql chromInfo.tab
 
     hgsql -N galGal3 -e "select chrom,size from chromInfo" \
       > /cluster/data/galGal3/chrom.sizes
     # take a look at chrom.sizes size
     wc chrom.sizes
 #     57     114     947 ../../chrom.sizes
 
 
 
 #########################################################################
 # MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE 5/15/06 angie)
     # Use -repMatch=380 (based on size -- for human we use 1024, and 
     # chicken size is ~37% of human)
     ssh kkr1u00
     cd /cluster/data/galGal3
     mkdir /cluster/bluearc/galGal3
     blat galGal3.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=/cluster/bluearc/galGal3/11.ooc -repMatch=380
 #Wrote 13061 overused 11-mers to /cluster/bluearc/galGal3/11.ooc
     blat galGal3.2bit /dev/null /dev/null -tileSize=10 \
       -makeOoc=/cluster/bluearc/galGal3/10.ooc -repMatch=380
 #Wrote 166633 overused 10-mers to /cluster/bluearc/galGal3/10.ooc
     mkdir /iscratch/i/galGal3
     cp -p /cluster/bluearc/galGal3/*.ooc /iscratch/i/galGal3/
     iSync
 
 
 #########################################################################
 # PUT NIBS ON /SCRATCH (DONE 5/15/06 angie)
     ssh kkstore03
     mkdir /cluster/bluearc/scratch/hg/galGal3
     rsync -av /cluster/data/galGal3/nib/* /cluster/bluearc/scratch/hg/galGal3/nib/
     cp -p /cluster/data/galGal3/galGal3.2bit /cluster/bluearc/scratch/hg/galGal3/
     # Ask cluster-admin to distribute to /scratch on big & small cluster
 
 
 #########################################################################
 # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 5/15/06 angie)
     # Make trackDb table so browser knows what tracks to expect:
     ssh hgwdev
     cd ~/kent/src/hg/makeDb/trackDb
     cvsup
 
     # Add trackDb directories and a description.html
     mkdir chicken/galGal3
     cvs add chicken/galGal3
     cvs add chicken/galGal3/description.html
     cvs ci -m "Initial description for galGal3." chicken/galGal3
     # Edit that makefile to add galGal3 in all the right places and do
     make update DBS=galGal3
 
     mkdir /gbdb/galGal3/html
     mkdir /cluster/data/galGal3/html
     ln -s /cluster/data/galGal3/html/description.html /gbdb/galGal3/html/
     cvs ci -m "Added galGal3." makefile
     # Go public on genome-test.  In a clean tree (no mods, up-to-date):
     cvs up makefile
     make alpha
     # Note: hgcentral*.genome values must correspond
     # with defaultDb.genome values
     hgsql -h genome-testdb hgcentraltest \
       -e 'INSERT INTO dbDb \
         (name, description, nibPath, organism, \
                 defaultPos, active, orderKey, genome, scientificName, \
                 htmlPath, hgNearOk, hgPbOk, sourceName) values \
         ("galGal3", "May 2006", "/gbdb/galGal3", "Chicken", \
                "chr5:57710001-57780000", 1, 30, "Chicken", \
                 "Gallus Gallus", "/gbdb/galGal3/html/description.html", \
                0, 0, "Chicken Genome Sequencing Consortium May 2006 release");'
 
 
 #########################################################################
 # PRODUCING GENSCAN PREDICTIONS (DONE 5/16/06 angie)
     ssh hgwdev
     mkdir /cluster/data/galGal3/bed/genscan
     cd /cluster/data/galGal3/bed/genscan
     # Check out hg3rdParty/genscanlinux to get latest genscan:
     cvs co hg3rdParty/genscanlinux
     # Run on small cluster (more mem than big cluster).
     ssh kki
     cd /cluster/data/galGal3/bed/genscan
     # Make 3 subdirectories for genscan to put their output files in
     mkdir gtf pep subopt
     # Generate a list file, genome.list, of all the hard-masked contigs that 
     # *do not* consist of all-N's (which would cause genscan to blow up)
     cp /dev/null genome.list
     foreach f ( `ls -1S /cluster/data/galGal3/*/chr*_*/chr*_?{,?}.fa.masked` )
       egrep '[ACGT]' $f > /dev/null
       if ($status == 0) echo $f >> genome.list
     end
     wc -l genome.list
 #    259 genome.list
     # Create template file, gsub, for gensub2.  For example (3-line file):
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     gensub2 genome.list single gsub jobList
     para make jobList
     para time
 #Completed: 255 of 259 jobs
 #Crashed: 4 jobs
 #CPU time in finished jobs:      82258s    1370.97m    22.85h    0.95d  0.003 y
 #IO & Wait Time:                  1053s      17.55m     0.29h    0.01d  0.000 y
 #Average job time:                 327s       5.45m     0.09h    0.00d
 #Longest finished job:           20131s     335.52m     5.59h    0.23d
 #Submission to last job:         23171s     386.18m     6.44h    0.27d
 
     # If there are crashes, diagnose with "para problems" / "para crashed".  
     # If a job crashes due to genscan running out of memory, re-run it 
     # manually with "-window=1200000" instead of "-window=2400000".
     ssh kkr5u00
     cd /cluster/data/galGal3/bed/genscan
     /cluster/bin/x86_64/gsBig /cluster/data/galGal3/4/chr4_13/chr4_13.fa.masked gtf/chr4_13.fa.gtf -trans=pep/chr4_13.fa.pep -subopt=subopt/chr4_13.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
     /cluster/bin/x86_64/gsBig /cluster/data/galGal3/3/chr3_14/chr3_14.fa.masked gtf/chr3_14.fa.gtf -trans=pep/chr3_14.fa.pep -subopt=subopt/chr3_14.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
     /cluster/bin/x86_64/gsBig /cluster/data/galGal3/2/chr2_22/chr2_22.fa.masked gtf/chr2_22.fa.gtf -trans=pep/chr2_22.fa.pep -subopt=subopt/chr2_22.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
     /cluster/bin/x86_64/gsBig /cluster/data/galGal3/Un/chrUn_random_3/chrUn_random_3.fa.masked gtf/chrUn_random_3.fa.gtf -trans=pep/chrUn_random_3.fa.pep -subopt=subopt/chrUn_random_3.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
 
     # Convert these to chromosome level files as so:
     ssh kkstore03
     cd /cluster/data/galGal3/bed/genscan
     liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
     liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
     cat pep/*.pep > genscan.pep
 
     # Load into the database as so:
     ssh hgwdev
     cd /cluster/data/galGal3/bed/genscan
     ldHgGene galGal3 genscan genscan.gtf
     hgPepPred galGal3 generic genscanPep genscan.pep
     hgLoadBed galGal3 genscanSubopt genscanSubopt.bed
 
 
 #########################################################################
 # MAKE GCPERCENT (DONE 5/15/06 angie)
     ssh kolossus
     mkdir /cluster/data/galGal3/bed/gc5Base
     cd /cluster/data/galGal3/bed/gc5Base
     hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 galGal3 \
        /cluster/data/galGal3 \
     | wigEncode stdin gc5Base.wig gc5Base.wib
     ssh hgwdev
     mkdir /gbdb/galGal3/wib
     cd /cluster/data/galGal3/bed/gc5Base
     ln -s `pwd`/gc5Base.wib /gbdb/galGal3/wib
     hgLoadWiggle -pathPrefix=/gbdb/galGal3/wib galGal3 gc5Base gc5Base.wig
 
 
 #########################################################################
 # CPGISSLANDS (WUSTL) (DONE 5/15/06 angie)
     ssh hgwdev
     mkdir /cluster/data/galGal3/bed/cpgIsland
     cd /cluster/data/galGal3/bed/cpgIsland
     # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
     cvs co hg3rdParty/cpgIslands
     cd hg3rdParty/cpgIslands
     make
     mv cpglh.exe /cluster/data/galGal3/bed/cpgIsland/
     
     ssh kolossus
     cd /cluster/data/galGal3/bed/cpgIsland
     foreach f (../../*/chr*.fa.masked)
       set fout=$f:t:r:r.cpg
       echo running cpglh on $f to $fout
       nice ./cpglh.exe $f > $fout
     end
     # Transform cpglh output to bed +
     cat << '_EOF_' > filter.awk
 /* Input columns: */
 /* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
 /* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
 /* Output columns: */
 /* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
 /* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
 {
 $2 = $2 - 1;
 width = $3 - $2;
 printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
        $1, $2, $3, $5,$6, width,
        $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
 }
 '_EOF_'
     # << this line makes emacs coloring happy
     awk -f filter.awk chr*.cpg > cpgIsland.bed
 
     # load into database:
     ssh hgwdev
     cd /cluster/data/galGal3/bed/cpgIsland
     hgLoadBed galGal3 cpgIslandExt -tab \
       -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
 
 
 #########################################################################
 # CPGISLANDS (ANDY LAW) (DONE 5/15/06 angie)
     # See notes in makeGalGal2.doc
     ssh kolossus
     mkdir /cluster/data/galGal3/bed/cpgIslandGgfAndy
     cd /cluster/data/galGal3/bed/cpgIslandGgfAndy
     #	Build the preProcGgfAndy program in
     #	kent/src/oneShot/preProcGgfAndy into your ~/bin/
     # Use unmasked sequence since this is not a mammal...
     cp /dev/null cpgIslandGgfAndy.bed
     foreach f (../../*/chr*.fa)
       set chr = $f:t:r:r
       echo preproc and run on unmasked $chr
       ~/bin/x86_64/preProcGgfAndy $f \
       | /cluster/home/angie/ggf-andy-cpg-island.pl \
       | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                    $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                    $pGc = (100.0 * $gc / $n); \
                    $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                         "$pCpG\t$pGc\t$oE\n";' \
       >> cpgIslandGgfAndy.bed
     end
     wc -l ../cpgIsland/cpgIsland.bed *bed
 #  22806 ../cpgIsland/cpgIsland.bed
 #  76488 cpgIslandGgfAndy.bed
     # load into database:
     ssh hgwdev
     cd /cluster/data/galGal3/bed/cpgIslandGgfAndy
     sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \
       $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql
     hgLoadBed galGal3 cpgIslandGgfAndy -tab \
       -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed
     featureBits galGal3 cpgIslandExt
 #15533065 bases of 1042591351 (1.490%) in intersection
     featureBits galGal3 cpgIslandGgfAndy
 #62026174 bases of 1042591351 (5.949%) in intersection
 
 
 #########################################################################
 # MAKE CHICKEN LINEAGE-SPECIFIC REPEATS (DONE 5/22/06 angie)
     # In an email 2/13/04, Arian said we could treat all human repeats as 
     # lineage-specific, but could exclude these from chicken as ancestral:
     # L3, L3a, L3b, MIR, MIR3, MIRb, MIRm
     ssh kkstore03
     cd /cluster/data/galGal3
     mkdir -p /san/sanvol1/galGal3/linSpecRep
     foreach f (*/chr*.fa.out)
       awk '$10 !~/^(L3|L3a|L3b|MIR|MIR3|MIRb|MIRm)$/ {print;}' $f \
       > /san/sanvol1/galGal3/linSpecRep/$f:t:r:r.out.spec
     end
 
     #	rebuild these as they appear to have become lost
     mkdir /hive/data/staging/data/galGal3/linSpecRep
     foreach f (*/chr*.fa.out)
       awk '$10 !~/^(L3|L3a|L3b|MIR|MIR3|MIRb|MIRm)$/ {print;}' $f \
 	> /hive/data/staging/data/galGal3/linSpecRep/$f:t:r:r.out.spec
     end
 
 #########################################################################
 # SWAP CHAINS/NET HG18 (DONE 5/23/06 angie)
     ssh kkstore03
     mkdir /cluster/data/galGal3/bed/blastz.hg18.swap
     cd /cluster/data/galGal3/bed/blastz.hg18.swap
     doBlastzChainNet.pl -swap /cluster/data/hg18/bed/blastz.galGal3/DEF \
       >& do.log & tail -f do.log
     ln -s blastz.hg18.swap /cluster/data/galGal3/bed/blastz.hg18
     
 
 #########################################################################
 # SWAP CHAINS/NET MM8 (DONE 5/24/06 angie)
     ssh kkstore03
     mkdir /cluster/data/galGal3/bed/blastz.mm8.swap
     cd /cluster/data/galGal3/bed/blastz.mm8.swap
     doBlastzChainNet.pl -swap /cluster/data/mm8/bed/blastz.galGal3/DEF \
       >& do.log & tail -f do.log
     ln -s blastz.mm8.swap /cluster/data/galGal3/bed/blastz.mm8
     
 
 #########################################################################
 # SWAP CHAINS/NET RN4 (DONE 5/25/06 angie)
     ssh kkstore03
     mkdir /cluster/data/galGal3/bed/blastz.rn4.swap
     cd /cluster/data/galGal3/bed/blastz.rn4.swap
     doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.galGal3/DEF \
       >& do.log & tail -f do.log
     ln -s blastz.rn4.swap /cluster/data/galGal3/bed/blastz.rn4
     
 
 #########################################################################
 # SWAP CHAINS/NET MONDOM4 (DONE 5/26/06 angie)
     ssh kkstore03
     mkdir /cluster/data/galGal3/bed/blastz.monDom4.swap
     cd /cluster/data/galGal3/bed/blastz.monDom4.swap
     doBlastzChainNet.pl -swap /cluster/data/monDom4/bed/blastz.galGal3/DEF \
       >& do.log & tail -f do.log
     ln -s blastz.monDom4.swap /cluster/data/galGal3/bed/blastz.monDom4
     
 
 #########################################################################
 # BLASTZ/CHAIN/NET XENTRO2 (DONE 5/27/06 angie)
     ssh pk
     mkdir /cluster/data/galGal3/bed/blastz.xenTro2.2006-05-26
     cd /cluster/data/galGal3/bed/blastz.xenTro2.2006-05-26
     cat << '_EOF_' > DEF
 # chicken vs. frog
 BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
 
 # Use same params as used for galGal3-xenTro1 (see makeXenTro1.doc)
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=8000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Chicken galGal3
 SEQ1_DIR=/san/sanvol1/galGal3/nib
 SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Frog xenTro2 - single chunk big enough to run two of the
 #               largest scaffolds in one job
 SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
 SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=100
 
 BASE=/cluster/data/galGal3/bed/blastz.xenTro2.2006-05-26
 '_EOF_'
     # << emacs
     doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/scratch/galGal3XenTro2 \
       -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
       >& do.log & tail -f do.log
     ln -s blastz.xenTro2.2006-05-26 /cluster/data/galGal3/bed/blastz.xenTro2
 
 
 #########################################################################
 # SWAP CHAINS/NET DANRER4 (DONE 5/31/06 angie)
     ssh kkstore03
     mkdir /cluster/data/galGal3/bed/blastz.danRer4.swap
     cd /cluster/data/galGal3/bed/blastz.danRer4.swap
     doBlastzChainNet.pl -swap /cluster/data/danRer4/bed/blastz.galGal3/DEF \
       >& do.log & tail -f do.log
     ln -s blastz.danRer4.swap /cluster/data/galGal3/bed/blastz.danRer4
     
 
 #########################################################################
 # GENBANK AUTO UPDATE (DONE 6/9/06 angie)
     # align with revised genbank process. drop xeno ESTs.
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # edit etc/genbank.conf to add galGal3
 
 # galGal3
 galGal3.serverGenome = /cluster/data/galGal3/galGal3.2bit
 galGal3.clusterGenome = /scratch/hg/galGal3/galGal3.2bit
 galGal3.ooc = /cluster/bluearc/galGal3/11.ooc
 galGal3.align.unplacedChroms = chr*_random
 galGal3.lift = /cluster/data/galGal3/jkStuff/liftAll.lft
 galGal3.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
 galGal3.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
 galGal3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
 galGal3.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
 galGal3.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
 galGal3.refseq.mrna.native.load = yes
 galGal3.refseq.mrna.xeno.load = yes
 galGal3.genbank.mrna.xeno.load = yes
 galGal3.downloadDir = galGal3
 
     cvs ci etc/genbank.conf
     # update /cluster/data/genbank/
     make etc-update
 
     ssh kkstore02
     cd /cluster/data/genbank
     nice bin/gbAlignStep -initial galGal3 &
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     nice ./bin/gbDbLoadStep -drop -initialLoad galGal3 &
 
     # enable daily alignment and update of hgwdev
     cd ~/kent/src/makeDb/genbank
     cvsup
     # add galGal3 to:
         etc/align.dbs
         etc/hgwdev.dbs 
     cvs commit
     make etc-update
 
 
 ###########################################################################
 # HUMAN (hg18) PROTEINS TRACK FOR hg18 (DONE,  2006-06-16, braney)
     ssh kkstore03
     bash # if not using bash shell already
     # make Blast database for non-random chrom sequences
     mkdir -p /cluster/data/galGal3/blastDb
     cd /cluster/data/galGal3/blastDb
     ls ../*/*/*.lft | grep -v lift | sed 's/lft/fa/' > list
     ln -s `cat list` .
     for i in *.fa
     do
 	/cluster/bluearc/blast229/formatdb -i $i -p F
     done
     rm *.log *.fa list
 
     cd /cluster/data/galGal3
     for i in */*/*.lft
        do cat  $i  ; 
     done > jkStuff/subChr.lft
 
     mkdir -p  /san/sanvol1/scratch/galGal3/blastDb; 
     cd /cluster/data/galGal3/blastDb
     for i in nhr nin nsq; 
     do 
 	cp *.$i /san/sanvol1/scratch/galGal3/blastDb; 
     done
 
    mkdir /cluster/data/galGal3/bed/tblastnHg18KG
    cd  /cluster/data/galGal3/bed/tblastnHg18KG
    echo  /san/sanvol1/scratch/galGal3/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
 # 259 query.lst
 
    # we want around 250000 jobs
    calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
 # 36727/(250000/259) = 38.049172
 
    mkdir -p /cluster/bluearc/galGal3/bed/tblastn.hg18KG/kgfa
    split -l 60 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/galGal3/bed/tblastn.hg18KG/kgfa/kg
    ln -s /cluster/bluearc/galGal3/bed/tblastn.hg18KG/kgfa kgfa
    cd kgfa
    for i in *; do 
      nice pslxToFa $i $i.fa; 
      rm $i; 
      done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut
    ln -s /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cd /cluster/data/galGal3/bed/tblastn.hg18KG
    cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
 #ENDLOOP
 '_EOF_'
 
    cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/cluster/bluearc/blast229/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
         liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/galGal3/jkStuff/liftAll.lft carry $f.2
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.4
 
         if pslCheck -prot $3.tmp                                                  
         then                                                                      
             mv $3.tmp $3                                                          
             rm -f $f.1 $f.2 $f.3 $f.4
         fi
         exit 0                                                                    
     fi                                                                            
 fi                                                                                
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
     # << happy emacs
     chmod +x blastSome
     gensub2 query.lst kg.lst blastGsub blastSpec
     
     # then run the Blast cluster jobs
     ssh kk
     cd /cluster/data/galGal3/bed/tblastn.hg18KG
     para create blastSpec
     para try, check, push, check etc.
     # pushed 100,000 jobs at a time so need to do para push again later
     para time
 # Completed: 158767 of 158767 jobs
 # CPU time in finished jobs:    9352133s  155868.88m  2597.81h  108.24d  0.297 y
 # IO & Wait Time:                526347s    8772.45m   146.21h    6.09d  0.017 y
 # Average job time:                  62s       1.04m     0.02h    0.00d
 # Longest finished job:             221s       3.68m     0.06h    0.00d
 # Submission to last job:        104854s    1747.57m    29.13h    1.21d
 
     ssh kkstore03
     cd /cluster/data/galGal3/bed/tblastn.hg18KG
     tcsh
     mkdir chainRun
     cd chainRun
     cat << '_EOF_' > chainGsub
 #LOOP
 chainOne $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainOne
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainOne
     ls -1dS \
       /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
     # do the cluster run for chaining
     ssh pk
     cd /cluster/data/galGal3/bed/tblastn.hg18KG/chainRun
     para create chainSpec
     para try, check, push, check etc.
 # Completed: 613 of 613 jobs
 # CPU time in finished jobs:      26409s     440.15m     7.34h    0.31d  0.001 y
 # IO & Wait Time:                  7787s     129.79m     2.16h    0.09d  0.000 y
 # Average job time:                  56s       0.93m     0.02h    0.00d
 # Longest finished job:             553s       9.22m     0.15h    0.01d
 # Submission to last job:          1834s      30.57m     0.51h    0.02d
 
     ssh kkstore03
     cd /cluster/data/galGal3/bed/tblastn.hg18KG/blastOut
     bash # if using another shell
     for i in kg??
     do
        cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
        echo $i
     done
     sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/galGal3/bed/tblastn.hg18KG/blastHg18KG.psl
     pslCheck /cluster/data/galGal3/bed/tblastn.hg18KG/blastHg18KG.psl
     # this is ok.
     # load table 
     ssh hgwdev
     cd /cluster/data/galGal3/bed/tblastn.hg18KG
     hgLoadPsl galGal3 blastHg18KG.psl
     # check coverage
     featureBits galGal3 blastHg18KG
 # 19638830 bases of 1042591351 (1.884%) in intersection
     
     featureBits galGal2 blastHg16KG
 # 17057637 bases of 1054197620 (1.618%) in intersection
 
     featureBits galGal3 refGene:cds blastHg18KG -enrichment
 # refGene:cds 0.504%, blastHg18KG 1.884%, both 0.417%, cover 82.71%, enrich 43.91x
 
     featureBits galGal2 refGene:cds blastHg16KG -enrichment
 # refGene:cds 0.497%, blastHg16KG 1.618%, both 0.335%, cover 67.42%, enrich 41.66x
 
     ssh kkstore04
     rm -rf /cluster/data/galGal3/bed/tblastn.hg18KG/blastOut
     rm -rf /cluster/bluearc/galGal3/bed/tblastn.hg18KG/blastOut
 
 #end human proteins
 
 
 #########################################################################
 # BACEND PAIRS TRACK
 # DOWNLOAD CLONEENDS (BACENDS) FROM NCBI (DONE 6/20/06 angie)
     ssh kkstore03
     cd /cluster/data/galGal3
     # Plenty of room at the moment:
     df -h .
 #                      2.0T  1.6T  280G  86% /cluster/store6
     mkdir -p bed/cloneend/ncbi
     cd bed/cloneend/ncbi
     wget --timestamping \
 	ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/gallus_gallus/\*
     cd ..
     # Extract unversioned accessions from headers and combine into one 
     # uncompressed file which we will link to from /gbdb/:
     zcat ncbi/*ends*.mfa.gz \
     | perl -wpe 'if (/^>/) { \
        s/^>.*\|(gb|dbj)\|(\w+)\.\d+\|.*/>$2/ || die "parse line $.:$_\n"; }' \
       > cloneEnds.fa
     # Make sure the sequences are intact after the header-munging:
     faSize ncbi/*.mfa.gz
 #146777127 bases (3202228 N's 143574899 real 131081897 upper 12493002 lower) in 135327 sequences in 8 files
 #Total size: mean 1084.6 sd 183.8 min 64 (gi|30716935|gb|CC322877.1|) max 2431 (gi|30609044|gb|CC263290.1|) median 1095
     faSize cloneEnds.fa
 #146777127 bases (3202228 N's 143574899 real 131081897 upper 12493002 lower) in 135327 sequences in 1 files
 #Total size: mean 1084.6 sd 183.8 min 64 (CC322877) max 2431 (CC263290) median 1095
 
     # Extract pairings from info files:
     zcat ncbi/*info*.txt.gz \
     | /cluster/bin/scripts/convertCloneEndInfo stdin
 #55099 pairs and 22655 singles
 
     # split for cluster run
     mkdir /cluster/bluearc/galGal3/cloneEnds
     faSplit sequence cloneEnds.fa 100 /cluster/bluearc/galGal3/cloneEnds/cloneEnds
     #	Check to ensure no breakage:
     faSize /cluster/bluearc/galGal3/cloneEnds/*.fa
 #146777127 bases (3202228 N's 143574899 real 131081897 upper 12493002 lower) in 135327 sequences in 99 files
 #Total size: mean 1084.6 sd 183.8 min 64 (CC322877) max 2431 (CC263290) median 1095
     #	same numbers as before
 
     # load sequences
     ssh hgwdev
     mkdir /gbdb/galGal3/cloneend
     ln -s /cluster/data/galGal3/bed/cloneend/cloneEnds.fa /gbdb/galGal3/cloneend/
     cd /tmp
     hgLoadSeq galGal3 /gbdb/galGal3/cloneend/cloneEnds.fa
 #135327 sequences
 
 
 # BACEND SEQUENCE ALIGNMENTS (DONE 7/17/06 angie)
     ssh kkstore03
     # Make unmasked fasta.
     cd /cluster/data/galGal3
     mkdir /cluster/bluearc/galGal3/noMask
     foreach f (?{,?}/chr*.fa)
       echo $f:t:r
       perl -wpe 'tr/a-z/A-Z/ if (! /^>/);' $f \
         > /cluster/bluearc/galGal3/noMask/$f:t
     end
 
     # kluster run  [san is down today so use kk this time]
     ssh kk
     mkdir /cluster/data/galGal3/bed/bacends
     cd /cluster/data/galGal3/bed/bacends
     mkdir out
     # allow blat to run politely in /tmp while it writes output, then
     # copy results to results file:
     cat << '_EOF_' > runBlat.csh
 #!/bin/csh -ef
 set root1 = $1
 set root2 = $2
 set result = $3
 rm -fr /scratch/tmp/${root1}_${root2}
 mkdir /scratch/tmp/${root1}_${root2}
 pushd /scratch/tmp/${root1}_${root2}
 /cluster/bin/i386/blat /cluster/bluearc/galGal3/noMask/${root1}.fa \
   /cluster/bluearc/galGal3/cloneEnds/${root2}.fa ${root1}.${root2}.psl \
   -ooc=/cluster/bluearc/galGal3/10.ooc -tileSize=10
 popd
 mkdir -p out/${root2}
 rm -f ${result}
 mv /scratch/tmp/${root1}_${root2}/${root1}.${root2}.psl ${result}
 rm -fr /scratch/tmp/${root1}_${root2}
 '_EOF_'
     #	<< emacs
     chmod +x runBlat.csh
 
     cat << '_EOF_' > template
 #LOOP
 ./runBlat.csh $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << emacs
     ls -1S /cluster/bluearc/galGal3/cloneEnds/cloneEnds???.fa > bacEnds.lst
     ls -1S /cluster/bluearc/galGal3/noMask/chr*.fa > contig.lst
     gensub2 contig.lst bacEnds.lst template jobList
     para make jobList
 #Completed: 5247 of 5247 jobs
 #CPU time in finished jobs:      69864s    1164.39m    19.41h    0.81d  0.002 y
 #IO & Wait Time:                614130s   10235.51m   170.59h    7.11d  0.019 y
 #Average job time:                 130s       2.17m     0.04h    0.00d
 #Longest finished job:            3306s      55.10m     0.92h    0.04d
 #Submission to last job:          3674s      61.23m     1.02h    0.04d
 
     ssh kkstore03
     cd /cluster/data/galGal3/bed/bacends
     mkdir temp
     time pslSort dirs raw.psl temp out/*
 #9.349u 1.074s 0:10.54 98.7%     0+0k 0+0io 2pf+0w
     time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons raw.psl \
       bacEnds.psl /dev/null
 #4.838u 0.158s 0:05.09 97.8%     0+0k 0+0io 2pf+0w
 
 
 # BACEND PAIRS TRACK (DONE 7/17/06 angie)
     ssh kolossus
     cd /cluster/data/galGal3/bed/bacends
     time /cluster/bin/x86_64/pslPairs -tInsert=10000 \
       -minId=0.91 -noBin -min=25000 \
       -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
       -mismatch -verbose bacEnds.psl \
         ../cloneend/cloneEndPairs.txt all_bacends bacEnds
 #1.076u 0.292s 0:01.68 80.9%     0+0k 0+0io 3pf+0w
 
     # Filter by score and sort by {chrom,chromStart}:
     awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
     cat bacEnds.{slop,short,long,mismatch,orphan} \
     | awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
 
     #	CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
     awk '{print $5}' bacEndPairs.bed | sort -u
 
     /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
       bacEndPairsBad.bed \
     | sorttbl tname tstart | headchg -del \
     > bacEnds.load.psl
     wc -l bacEnds.*
 #  136054 bacEnds.psl
 #  106769 bacEnds.load.psl
 #   29879 bacEnds.pairs
 #       5 bacEnds.long
 #      99 bacEnds.lst
 #      56 bacEnds.mismatch
 #   20125 bacEnds.orphan
 #     106 bacEnds.short
 #      78 bacEnds.slop
 
     # load into database
     ssh hgwdev
     cd /cluster/data/galGal3/bed/bacends
     hgLoadBed -strict -notItemRgb galGal3 bacEndPairs bacEndPairs.bed \
         -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
 #Loaded 29879 elements of size 11
     # note - the "Bad" track isn't pushed to RR, just used for assembly QA.
     hgLoadBed -strict -notItemRgb galGal3 bacEndPairsBad bacEndPairsBad.bed \
       -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
 #Loaded 20360 elements of size 11
     time hgLoadPsl galGal3 -table=all_bacends bacEnds.load.psl
 #1.123u 0.104s 0:04.23 28.8%     0+0k 0+0io 3pf+0w
 
     # Compares favorably to previous release (good cov up, bad cov down):
     featureBits galGal3 all_bacends
 #58629720 bases of 1042591351 (5.623%) in intersection
     featureBits galGal2 all_bacends
 #52184234 bases of 1054197620 (4.950%) in intersection
     featureBits galGal3 bacEndPairs
 #43710407 bases of 1042591351 (4.192%) in intersection
     featureBits galGal2 bacEndPairs
 #39206208 bases of 1054197620 (3.719%) in intersection
     featureBits galGal3 bacEndPairsBad
 #15784430 bases of 1042591351 (1.514%) in intersection
     featureBits galGal2 bacEndPairsBad
 #27538232 bases of 1054197620 (2.612%) in intersection
 
     # Clean up.
     rm psl.tab psl.tab.loaded raw.psl bed.tab
     rm -r out
     rmdir temp
     rm -r /cluster/bluearc/galGal3/noMask/ /cluster/bluearc/galGal3/cloneEnds/
 
 # end BACEND PAIRS TRACK
 
 
 #########################################################################
 # MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 7/13/06 angie - REDONE 8/4)
 # REDONE 8/4 after makeDownloads.pl was fixed to not try to include complete
 # paths in the tar files.
     ssh hgwdev
     cd /cluster/data/galGal3
     ~/kent/src/utils/makeDownloads.pl galGal3 -verbose=2 \
       >& jkStuff/downloads.log & tail -f jkStuff/downloads.log
     # Edit README.txt files when done:
 # *** Edit each README.txt to resolve any notes marked with "***":
 #     /cluster/data/galGal3/goldenPath/database/README.txt
 #     /cluster/data/galGal3/goldenPath/bigZips/README.txt
 #     /cluster/data/galGal3/goldenPath/chromosomes/README.txt
 #     (The htdocs/goldenPath/galGal3/*/README.txt "files" are just links to those.)
 
 
 
 #########################################################################
 # MAKE EMPTY CHRM_GOLD (DONE 7/14/06 angie)
 # Kayla found that hgTracks was complaining about missing chrM_gold.
 # We can use hgFakeAgp to make a fake chrM.agp, and load that.
 # However, then the chrM Assembly track shows faked contents that do not
 # match the track/assembly description.  So remove the faked contents,
 # leaving an empty table which is fine -- chrM is not part of the assembly.
     ssh hgwdev
     cd /cluster/data/galGal3
     hgFakeAgp M/chrM.fa M/chrM.agp
     hgGoldGapGl -noGl -chromLst=chrom.lst galGal3 /cluster/data/galGal3 .
     hgsql galGal3 -e 'delete from chrM_gold'
 
 
 #########################################################################
 # MULTIZ7WAY (DONE 7/18/06 angie)
 # (((galGal3 ((hg18 (mm8 rn4)) monDom4)) xenTro2) danRer4)
     ssh kkstore03
     mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18
 
     # copy MAF's to cluster-friendly server
     mkdir /san/sanvol1/scratch/galGal3/mafNet
     foreach s (rn4 mm8 hg18 monDom4 xenTro2 danRer4)
       echo $s
       rsync -av /cluster/data/galGal3/bed/blastz.$s/mafNet/* \
         /san/sanvol1/scratch/galGal3/mafNet/$s/
     end
 
     # Prune the hg17 17way tree to just these 9 and update db names:
     /cluster/bin/phast/tree_doctor \
       --prune-all-but=rat_rn3,mouse_mm7,human_hg17,monodelphis_monDom2,chicken_galGal2,xenopus_xenTro1,zebrafish_danRer3 \
       --rename="rat_rn3 -> rat_rn4 ; mouse_mm7 -> mouse_mm8 ; human_hg17 -> human_hg18 ; monodelphis_monDom2 -> monodelphis_monDom4 ; chicken_galGal2 -> chicken_galGal3 ; zebrafish_danRer3 -> zebrafish_danRer4 ; xenopus_xenTro1 -> xenopus_xenTro2" \
       /cluster/data/hg17/bed/multiz17way/17way.nh > 7way.nh
     # *carefully* edit 7way.nh to put galGal3 first.
 
     /cluster/bin/phast/all_dists 7way.nh > 7way.distances.txt
     grep galGal3 7way.distances.txt | sort -k3,3n | \
         awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
     cat distances.txt
 #0.9847  human_hg18
 #1.0149  monodelphis_monDom4
 #1.3425  mouse_mm8
 #1.3472  rat_rn4
 #1.3604  xenopus_xenTro2
 #1.6727  zebrafish_danRer4
     # the order in the browser display will be by tree topology,
     # not by distance, but in this case (unlike human) they happen to match.
 
     # create species list and stripped down tree for autoMZ
     sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 7way.nh \
       > tree-commas.nh
     sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
     sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst
 
     ssh pk
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18
     mkdir maf run
     cd run
 
     # stash binaries 
     mkdir penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
 
 cat > autoMultiz.csh << 'EOF'
 #!/bin/csh -ef
     set db = galGal3
     set c = $1
     set maf = $2
     set run = `pwd`
     set tmp = /scratch/tmp/$db/multiz.$c
     set pairs = /san/sanvol1/scratch/$db/mafNet
     rm -fr $tmp
     mkdir -p $tmp
     cp ../{tree.nh,species.lst} $tmp
     pushd $tmp
     foreach s (`cat species.lst`)
         set in = $pairs/$s/$c.maf
         set out = $db.$s.sing.maf
         if ($s == $db) then
             continue
         endif
         if (-e $in.gz) then
             zcat $in.gz > $out
         else if (-e $in) then
             cp $in $out
         else
             echo "##maf version=1 scoring=autoMZ" > $out
         endif
     end
     set path = ($run/penn $path); rehash
     $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
     popd
     cp $tmp/$c.maf $maf
     rm -fr $tmp
 'EOF'
     # << emacs
     chmod +x autoMultiz.csh
 
 cat  << 'EOF' > spec
 #LOOP
 ./autoMultiz.csh $(root1) {check out line+ /cluster/data/galGal3/bed/multiz7way.2006-07-18/maf/$(root1).maf}
 #ENDLOOP
 'EOF'
     # << emacs
 
     awk '{print $1}' /cluster/data/galGal3/chrom.sizes > chrom.lst
     gensub2 chrom.lst single spec jobList
     para make jobList
     para time
 #Completed: 57 of 57 jobs
 #CPU time in finished jobs:       6545s     109.08m     1.82h    0.08d  0.000 y
 #IO & Wait Time:                   206s       3.43m     0.06h    0.00d  0.000 y
 #Average job time:                 118s       1.97m     0.03h    0.00d
 #Longest finished job:             890s      14.83m     0.25h    0.01d
 #Submission to last job:           891s      14.85m     0.25h    0.01d
 
     # Make .gif for tree by pasting the contents of 7way.nh into Galt's 
     # cgi-bin/phyloGif tool.  Save the .gif in your checkout of the browser/ 
     # CVS tree as browser/images/phylo/galGal3_7way.gif .
     ssh hgwdev
     cd ~/browser/images/phylo
     cvs add galGal3_7way.gif
     # note:  markd says it is best to use "cvs add -kb" for binary files - b0b
     cvs ci -m "phyloGif-generated galGal3 7way tree image." galGal3_7way.gif
     # Do a "make alpha" to install the file in htdocs/images/phylo/ :
     cd ~/browser
     cvs up -d -P
     make alpha
     # Include the file in the push request, and include
     # "treeImage phylo/galGal3_7way.gif" in the multiz7way trackDb.ra entry.
 
 
 #########################################################################
 # ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE 7/18/2006 angie - REDONE 7/28)
     # REDONE 7/28/06 after Kayla found overlaps and Brian fixed mafAddIRows.
     ssh kolossus
     mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno
     mkdir maf run
     cd run
     rm -f sizes nBeds
     foreach db (`cat /cluster/data/galGal3/bed/multiz7way.2006-07-18/species.lst`)
       ln -s  /cluster/data/$db/chrom.sizes $db.len
       if (! -e /cluster/data/$db/$db.N.bed) then
         twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
       endif
       ln -s  /cluster/data/$db/$db.N.bed $db.bed
       echo $db.bed  >> nBeds
       echo $db.len  >> sizes
     end
     echo date > jobs.csh
     # do smaller jobs first:
     foreach f (`ls -1rS ../../maf/*.maf`)
       echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
         /cluster/data/galGal3/galGal3.2bit ../maf/$f:t  >> jobs.csh
     end 
     echo date >> jobs.csh
     csh -efx jobs.csh >&! jobs.log & tail -f jobs.log
     # 23min
     # Run mafFilter -overlap to make sure everything is cool before loading:
     mafFilter -overlap -reject=/tmp/chr1.rej.maf ../maf/chr1.maf > /dev/null
 #rejected 0 blocks
 
     # Load anno/maf
     ssh hgwdev
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf
     mkdir -p /gbdb/galGal3/multiz7way/anno/maf
     ln -s /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf/*.maf \
       /gbdb/galGal3/multiz7way/anno/maf
     nice hgLoadMaf -pathPrefix=/gbdb/galGal3/multiz7way/anno/maf \
       galGal3 multiz7way
 #Loaded 1488826 mafs in 57 files from /gbdb/galGal3/multiz7way/anno/maf
     # Do the computation-intensive part of hgLoadMafSummary on a workhorse 
     # machine and then load on hgwdev:
     ssh kolossus
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf
     cat *.maf \
     | nice hgLoadMafSummary galGal3 -minSize=30000 -mergeGap=1500 -maxSize=200000 \
              -test multiz7waySummary stdin
 #Created 518561 summary blocks from 3179814 components and 1488826 mafs from stdin
     ssh hgwdev
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/anno/maf
     sed -e 's/mafSummary/multiz7waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
       > /tmp/multiz7waySummary.sql
     time nice hgLoadSqlTab galGal3 multiz7waySummary /tmp/multiz7waySummary.sql \
       multiz7waySummary.tab
 #0.000u 0.002s 0:08.00 0.0%      0+0k 0+0io 3pf+0w
     rm *.tab
     ln -s multiz7way.2006-07-18 /cluster/data/galGal3/bed/multiz7way
 
 
 #########################################################################
 # MULTIZ7WAY DOWNLOADABLES (DONE 7/18/2006 angie - REDONE 7/28)
     # Annotated MAF is now documented, so use anno/maf for downloads.
     ssh hgwdev
     mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads
     # upstream mafs 
 cat > mafFrags.csh << 'EOF'
     date
     foreach i (1000 2000 5000)
         echo "making upstream$i.maf"
         nice featureBits galGal3 refGene:upstream:$i -fa=/dev/null -bed=up.bad
         awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
         rm up.bad
         nice mafFrags galGal3 multiz7way up.bed upstream$i.maf \
                 -orgs=../species.lst
         rm up.bed
     end
     date
 'EOF'
     # << emacs
     time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
 #10.144u 15.154s 0:52.22 48.4%   0+0k 0+0io 5pf+0w
 
     ssh kkstore03
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads
 cat > downloads.csh << 'EOF'
     date
     foreach f (../anno/maf/chr*.maf)
 	set c = $f:t:r
         echo $c
 	nice gzip -c $f > $c.maf.gz
     end
     md5sum *.gz > md5sum.txt
     date
 'EOF'
     # << emacs
     time csh -efx downloads.csh >&! downloads.log
 #355.340u 3.718s 5:59.55 99.8%   0+0k 0+0io 1pf+0w
     nice gzip up*.maf
     nice md5sum up*.maf.gz >> md5sum.txt
     # Make a README.txt
 
     ssh hgwdev
     set dir = /usr/local/apache/htdocs/goldenPath/galGal3/multiz7way
     mkdir $dir
     ln -s /cluster/data/galGal3/bed/multiz7way.2006-07-18/mafDownloads/{*.gz,*.txt} $dir
 
 
 
 #########################################################################
 # MULTIZ7WAY MAF FRAMES (DONE 7/18/2006 angie - REDONE 7/31)
     ssh hgwdev
     mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/frames
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/frames
     # The following is adapted from MarkD's Makefile used for mm7...
 
     #------------------------------------------------------------------------
     # get the genes for all genomes
     # mRNAs with CDS.  single select to get cds+psl, then split that up and
     # create genePred
     # using mrna table as genes: danRer4
     mkdir genes
     foreach queryDb (danRer4)
       set tmpExt = `mktemp temp.XXXXXX`
       set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
       set tmpMrna = ${queryDb}.mrna.${tmpExt}
       set tmpCds = ${queryDb}.cds.${tmpExt}
       echo $queryDb
       hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
                    from all_mrna,gbCdnaInfo,cds \
                    where (all_mrna.qName = gbCdnaInfo.acc) and \
                      (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
        ${queryDb} > ${tmpMrnaCds}
       cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
       cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
       mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
         stdout \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/$queryDb.tmp.gz
       rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
       mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
       rm -f $tmpExt
     end
     # using knownGene for rn4 mm8 hg18
     # using refGene for galGal3
     # using mgcGenes for xenTro2
     # no genes for monDom4
     # genePreds; (must keep only the first 10 columns for knownGene)
     foreach queryDb (galGal3 rn4 mm8 hg18 xenTro2)
       unset geneTbl range
       if ($queryDb == "xenTro2") then
         set geneTbl = mgcGenes
         set range = 1-10
       else if ($queryDb == "galGal3") then
         set geneTbl = refGene
         set range = 2-11
       else
         set geneTbl = knownGene
         set range = 1-10
       endif
       echo $queryDb
       hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f $range \
       | genePredSingleCover stdin stdout \
       | gzip -2c \
         > /scratch/tmp/$queryDb.tmp.gz
       mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
     end
 
     #------------------------------------------------------------------------
     # create frames
     ssh kkstore03
     nice tcsh # easy way to get process niced
     cd /cluster/data/galGal3/bed/multiz7way/frames
     (cat  ../anno/maf/*.maf \
      | time genePredToMafFrames galGal3 stdin stdout \
          danRer4 genes/danRer4.gp.gz \
          rn4 genes/rn4.gp.gz \
          hg18 genes/hg18.gp.gz \
          mm8 genes/mm8.gp.gz \
          xenTro2 genes/xenTro2.gp.gz \
          galGal3 genes/galGal3.gp.gz \
      | gzip > multiz7way.mafFrames.gz) >& mafFrames.log & tail -f mafFrames.log
 
     #------------------------------------------------------------------------
     # load the database
     ssh hgwdev
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/frames
     hgLoadMafFrames galGal3 multiz7wayFrames multiz7way.mafFrames.gz
 
 
 #########################################################################
 # PHASTCONS (DONE 7/20/2006 angie)
 # Using Kate's process from makeHg17.doc.
 # This process is distilled from Hiram and Adam's experiments
 # on mouse (mm7) 17way track.  Many parameters are now fixed, without
 # being experimentally derived, either because the experiments
 # were lengthy and produced similar results, or because they
 # weren't runnable given the alignment size.
 # These parameters are:
 # --rho
 # --expected-length
 # --target-coverage
 # Also, instead of generating cons and noncons tree models,
 # we use a single, pre-existing tree model -- Elliot Margulies' model 
 # from the (37-way) ENCODE alignments.
     ssh kkstore03
     cd /cluster/data/galGal3
     foreach f (`cat chrom.lst`)
       echo $f
       cp -p $f/*.fa /san/sanvol1/scratch/galGal3/chrom/
     end
 
     # Split chromosome MAF's into windows and use to generate
     # "sufficient statistics" (ss) files for phastCons input
     # NOTE: as the SAN fs has lotsa space, we're leaving these
     # big (temp) files unzipped, to save time during phastCons run.
     # Note also the larger chunk sizes from previous runs -- this
     # reduces run-time on the split, slows down the actual phastCons
     # enough so jobs don't crash (jobs are very quick, just a minute
     # or so), and according to Adam, will produce better results.
     # The previous small chunks were probably required by
     # the phyloFit step, which we are no longer using for the
     # human alignments.
     ssh pk
     mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
     /cluster/bin/phast/tree_doctor \
       --prune-all-but=rn3,mm7,hg17,monDom2,galGal2,xenTro1,danRer3 \
       --rename="galGal2 -> galGal3 ; rn3 -> rn4 ; hg17 -> hg18 ; mm7 -> mm8 ; monDom2 -> monDom4 ; xenTro1 -> xenTro2 ; danRer3 -> danRer4" \
       /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod \
       > elliotsEncodePruned.mod
     mkdir run.split
     cd run.split
     set WINDOWS = /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss
     rm -fr $WINDOWS
     mkdir -p $WINDOWS
 
     cat << 'EOF' > doSplit.csh
 #!/bin/csh -ef
     set MAFS = /cluster/data/galGal3/bed/multiz7way.2006-07-18/maf
     set WINDOWS = /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss
     cd $WINDOWS
     set c = $1
     echo $c
     rm -fr $c
     mkdir $c
     /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
         -M /san/sanvol1/scratch/galGal3/chrom/$c.fa \
         -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
     echo "Done" >> $c.done
 'EOF'
 # << emacs
     chmod +x doSplit.csh
 
     rm -f jobList
     foreach f (../../maf/*.maf) 
       set c = $f:t:r
       echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
     end
     
     para make jobList
     para time
 #Completed: 54 of 57 jobs
 #Crashed: 3 jobs
 #CPU time in finished jobs:        570s       9.49m     0.16h    0.01d  0.000 y
 #IO & Wait Time:                   229s       3.82m     0.06h    0.00d  0.000 y
 #Average job time:                  15s       0.25m     0.00h    0.00d
 #Longest finished job:             112s       1.87m     0.03h    0.00d
 #Submission to last job:           223s       3.72m     0.06h    0.00d
     para crashed
 #chr12_random, chr17_random and chr32 crashed because their maf's contain
 #nothing but comments -- no alignments.  That's OK, no windows for them.
 
     # check tree model on 5MB chunk, using params recommended by Adam,
     # (to verify branch lengths on 2X species)
     ssh kolossus
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
     /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
         --tree "`cat ../tree-commas.nh`" \
         /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss/chr7/chr7.10000001-19997442.ss \
         -o phyloFit.tree
     /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s REV \
         --tree "`cat ../tree-commas.nh`" \
         /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/ss/chr7/chr7.10000001-19997442.ss \
         -o phyloFit.rev.tree
     # Comment from makeHg17.doc:
     # # he ok'ed the results -- not necessary for next human run
     # TODO: maybe run these by Adam... the numbers in the RATE_MAT and 
     # TREE are different between the phyloFit and elliotsEncode models,
     # not sure if the differences are significant.  I will proceed with 
     # elliotsEncode as in makeHg17.doc.
 
     # Run phastCons
     #	This job is I/O intensive in its output files, thus it is all
     #	working over in /scratch/tmp/
     mkdir /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons
     cat > doPhast.csh << 'EOF'
 #!/bin/csh -fe
 set c = $1
 set f = $2
 set len = $3
 set cov = $4
 set rho = $5
 set tmp = /scratch/tmp/$f
 mkdir -p $tmp
 set san = /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons
 cp -p $san/ss/$c/$f.ss ../elliotsEncodePruned.mod $tmp
 pushd $tmp > /dev/null
 /cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncodePruned.mod \
   --rho $rho --expected-length $len --target-coverage $cov --quiet \
   --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
 popd > /dev/null
 mkdir -p $san/pp/$c $san/bed/$c
 sleep 1
 mv $tmp/$f.pp $san/pp/$c
 mv $tmp/$f.bed $san/bed/$c
 rm -fr $tmp
 'EOF'
     # << emacs
     chmod a+x doPhast.csh
 
     #	root1 == chrom name, file1 == ss file name without .ss suffix
     # Create gsub file
     cat > template << 'EOF'
 #LOOP
 doPhast.csh $(root1) $(file1) 12 .05 .20
 #ENDLOOP
 'EOF'
     # << emacs
 
     # Create parasol batch and run it
     ssh pk
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons
     pushd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons
     ls -1S ss/chr*/chr*.ss \
     | sed 's/.ss$//' \
       > /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/run.cons/in.list
     popd
 
     gensub2 in.list single template jobList
     para make jobList
     para time
 #Completed: 151 of 151 jobs
 #CPU time in finished jobs:       2832s      47.20m     0.79h    0.03d  0.000 y
 #IO & Wait Time:                   644s      10.74m     0.18h    0.01d  0.000 y
 #Average job time:                  23s       0.38m     0.01h    0.00d
 #Longest finished job:              34s       0.57m     0.01h    0.00d
 #Submission to last job:           181s       3.02m     0.05h    0.00d
 
     # create Most Conserved track
     ssh kolossus
     cd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons
     #	The sed's and the sort get the file names in chrom,start order
     # (Hiram tricks -- split into columns on [.-/] with 
     #    identifying x,y,z, to allow column sorting and
     #    restoring the filename.  Warning: the sort column
     # will depend on how deep you are in the dir
     find ./bed -name "chr*.bed" \
     | sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \
     | sort -k7,7 -k9,9n \
     | sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \
     | xargs cat \
     | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
     | /cluster/bin/scripts/lodToBedScore /dev/stdin > phastConsElements7way.bed
     cp -p phastConsElements7way.bed /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
 
     # Measure coverage.  If good, load elements into database and proceed with wiggle.
     # Try for 5% overall cov, and 70% CDS cov.
     ssh hgwdev
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
     featureBits galGal3 -enrichment refGene:cds phastConsElements7way.bed
     featureBits galGal3 -enrichment xenoRefGene:cds phastConsElements7way.bed
 
     # FIRST ITERATION: doPhast (len cov rho) = (12 .1 .27)
 #refGene:cds 0.504%, phastConsElements7way.bed 7.891%, both 0.414%, cover 82.18%, enrich 10.41x
 #xenoRefGene:cds 1.785%, phastConsElements7way_12_100_27.bed 7.891%, both 1.610%, cover 90.22%, enrich 11.43x
     mv phastConsElements7way.bed phastConsElements7way_12_100_27.bed
     # SECOND ITERATION: doPhast (len cov rho) = (12 .1 .3)
 #refGene:cds 0.504%, phastConsElements7way.bed 8.336%, both 0.421%, cover 83.61%, enrich 10.03x
 #xenoRefGene:cds 1.785%, phastConsElements7way.bed 8.336%, both 1.635%, cover 91.59%, enrich 10.99x
     mv phastConsElements7way.bed phastConsElements7way_12_100_30.bed
     # THIRD ITERATION: doPhast (len cov rho) = (12 .05 .27)
 #refGene:cds 0.504%, phastConsElements7way.bed 7.650%, both 0.415%, cover 82.26%, enrich 10.75x
 #xenoRefGene:cds 1.785%, phastConsElements7way.bed 7.650%, both 1.615%, cover 90.49%, enrich 11.83x
     mv phastConsElements7way.bed phastConsElements7way_12_050_27.bed
     # FOURTH ITERATION: doPhast (len cov rho) = (12 .05 .20)
 #refGene:cds 0.504%, phastConsElements7way.bed 6.480%, both 0.392%, cover 77.82%, enrich 12.01x
 #xenoRefGene:cds 1.785%, phastConsElements7way.bed 6.480%, both 1.535%, cover 86.02%, enrich 13.28x
     mv phastConsElements7way.bed phastConsElements7way_12_050_20.bed
     # The coverage doesn't want to drop too much and maybe it shouldn't --
     # I don't like the idea of throwing extreme parameters in order to get
     # the desired coverage figure from a model that doesn't want to go there.
     # Load this up and see how it looks...
 
 
     # When happy:
     hgLoadBed -strict galGal3 phastConsElements7way \
       phastConsElements7way_12_050_20.bed
 
     # Create merged posterier probability file and wiggle track data files
     ssh kolossus
     cd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/
     # sort by chromName, chromStart so that items are in numerical order 
     #  for wigEncode
     #next time try Angie's simpler sort, below
     time find ./pp -name "chr*.pp" | \
         sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
         sort -k7,7 -k9,9n | \
         sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
         xargs cat | \
         nice wigEncode stdin phastCons7way.wig phastCons7way.wib
     cp -p phastCons7way.wi? \
       /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
 
     # Load gbdb and database with wiggle.
     ssh hgwdev
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons
     ln -s `pwd`/phastCons7way.wib /gbdb/galGal3/multiz7way/phastCons7way.wib
     hgLoadWiggle -pathPrefix=/gbdb/galGal3/multiz7way galGal3 \
         phastCons7way phastCons7way.wig
 
 
 #########################################################################
 # PHASTCONS SCORES DOWNLOADABLES FOR 7WAY (DONE 7/20/2006 angie)
     ssh kolossus
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18
     mkdir phastConsDownloads
     cd phastConsDownloads
     cat > downloads.csh << 'EOF'
     date
     cd /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons/pp
     foreach chr (`awk '{print $1}' /cluster/data/galGal3/chrom.sizes`)
       echo $chr
       cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \
         | nice gzip -c \
             > /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastConsDownloads/$chr.gz
     end
     date
 'EOF'
     # << emacs
     csh -efx downloads.csh >&! downloads.log & tail -f downloads.log
     # ~3min
     md5sum *.gz > md5sum.txt
     # Make a README.txt
 
     ssh hgwdev
     cd /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastConsDownloads
     set dir = /usr/local/apache/htdocs/goldenPath/galGal3/phastCons7way
     mkdir $dir
     ln -s /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastConsDownloads/{*.gz,*.txt} $dir
     # Clean up after phastCons run.
     ssh kkstore03
     rm /cluster/data/galGal3/bed/multiz7way.2006-07-18/phastCons/*.tab
     rm -r /san/sanvol1/scratch/galGal3/multiz7way.2006-07-18/phastCons
 
 
 #########################################################################
 # SELF CHAINS (DONE 9/11/06 angie)
     # requested by user
     ssh kk
     mkdir /cluster/data/galGal3/bed/blastz.galGal3.2006-09-08
     cd /cluster/data/galGal3/bed/blastz.galGal3.2006-09-08
     cat << '_EOF_' > DEF
 # chicken vs chicken
 
 BLASTZ=/cluster/bin/penn/i386/blastz
 BLASTZ_H=2000
 BLASTZ_M=200
 
 # TARGET: chicken galGal3
 SEQ1_DIR=/scratch/hg/galGal3/nib
 SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: chicken galGal3
 SEQ2_DIR=/scratch/hg/galGal3/nib
 SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/galGal3/bed/blastz.galGal3.2006-09-08
 '_EOF_'
     # << emacs
     doBlastzChainNet.pl -chainMinScore=10000 -chainLinearGap=medium \
       -blastzOutRoot=/cluster/bluearc/galGal3Self \
       -bigClusterHub=kk -workhorse=kkr7u00 DEF \
       >& do.log & tail -f do.log
     ln -s blastz.galGal3.2006-09-08 /cluster/data/galGal3/bed/blastz.galGal3
 
 
 #########################################################################
 # SNP & GENES FROM BEIJING GENOME INST. (DONE 12/4/06 angie)
     ssh kkstore03
     mkdir /cluster/data/galGal3/bed/bgi
     cd /cluster/data/galGal3/bed/bgi
     wget --timestamping \
       'ftp://61.50.158.101/BGI/snps_by_strain/2006-11-28/*'
     # (username & password in email from Yong Zhang 11/29)
     # Each .tar.gz unpacks into identical sets of files so they will 
     # overwrite each other if unpacked into the same directory.  So create
     # a subdir for each strain, and unpack its files in the subdir:
     foreach f (*.tar.gz)
       set strain = `echo $f:r:r:r | perl -wpe 's/^(\w)/\u$1/'`
       mkdir -p $strain
       pushd $strain
       tar -xvzf ../$f
       popd
     end
     # Make coverage bed4:
     cp /dev/null bgiCov.bed
     foreach strain (Broiler Layer Silkie)
       foreach f ($strain/ContigCov/ContigCov-chr*.txt)
         set chr = `echo $f:t:r | sed -e 's/ContigCov-//'`
         grep -v ^Covered $f \
         | perl -wpe 'if (/^(\d+)\s+(\d+)\s*$/) { \
               $s = $1-1;  $_ = "'$chr'\t$s\t$2\t'$strain'.'$chr'.$1.$2\n"; \
             } else {die};' \
           >> bgiCov.bed
       end
     end
     # Make bgiSnp bed+:
     ~/kent/src/hg/snp/bgi/bgiSnp.pl */SNPtables/*.txt
     # The Layer files had the wrong strain ID code (last digit) --  the
     # script substituted in the correct strain ID (Yong Zhang's suggestion).
 
     # Load tables
     ssh hgwdev
     cd /cluster/data/galGal3/bed/bgi
     hgLoadBed galGal3 bgiCov bgiCov.bed
     hgLoadBed galGal3 bgiSnp bgiSnp.bed \
       -sqlTable=$HOME/kent/src/hg/lib/bgiSnp.sql
     # Got this warning but I think it's because the final column is always 
     # empty so bed.tab doesn't have the final column... table looks OK.
 #load of bgiSnp did not go as planned: 3306633 record(s), 0 row(s) skipped, 3306633 warning(s) loading bed.tab
 
 
 
 #########################################################################
 # STS?
 #########################################################################
 #########################################################################
 # SWAP STICKLEBACK GASACU1
     ssh pk
     mkdir /cluster/data/galGal3/bed/blastz.gasAcu1.swap
     cd /cluster/data/galGal3/bed/blastz.gasAcu1.swap
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \
 	-smallClusterHub=pk \
         -chainMinScore=2000 -chainLinearGap=loose \
         -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
         -swap -bigClusterHub=pk > swap.log 2>&1 &
     #	failed on the net step:
 # HgStepManager: executing step 'net'.
 # netChains: looks like previous stage was not successful
 #	(can't find [galGal3.gasAcu1.]all.chain[.gz]).
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \
 	-smallClusterHub=pk \
         -chainMinScore=2000 -chainLinearGap=loose \
         -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
         -continue=net -swap -bigClusterHub=pk > net_swap.log 2>&1 &
     #	real    7m11.286s
 
     featureBits galGal3 chainGasAcu1Link > fb.galGal3.chainGasAcu1Link.txt
     #	32781658 bases of 1042591351 (3.144%) in intersection
 
 #######################################################################
 ## BLASTZ Chicken chroms vs Stickleback chroms and random contigs
 ##  The above swap of gasAcu1 does *not* include the stickleback chrUn
 ##  To get alignments on the stickleback chrUn, do this alignment of
 ##  stickleback with chrUn to chicken, and swap them back to stickleback.
 ##  Don't do any Db loading, pick out the stickleback randoms after the swap
 ##	(WORKIN - 2007-01-11 - Hiram)
     ssh kkstore03
     mkdir /cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11
     cd /cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11
     cat << '_EOF_' > DEF
 # Chicken chroms vs. Stickleback chroms and chrUn in contigs
 
 # Try "human-fugu" (more distant, less repeat-killed than mammal) params
 # +M=50:
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
     
 # TARGET: Chicken galGal3 chroms only, no random sequence
 #       The largest is 200 million bases, there are 34 of them
 SEQ1_DIR=/san/sanvol1/scratch/galGal3/galGal3.noRandoms.sdTrf.2bit
 SEQ1_LEN=/san/sanvol1/scratch/galGal3/galGal3.noRandoms.sdTrf.sizes
 SEQ1_CHUNK=40000000
 SEQ1_LAP=10000
 
 # QUERY: Stickleback gasAcu1 chroms and chrUn in contigs
 #       The largest chrom is 32M bases, the largest contig 418,670 bases
 #       The smallest chrom chrM is 15,742 bases, smallest contig 60 bases
 #       there are 5,115 chroms and contig pieces
 #       total size 1,089,690,673 bases
 #       47107538 N's 1042583135 real 834274605 upper 208308530 lower
 SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.2bit
 SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.sizes
 SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit
 SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes
 SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
 SEQ2_CHUNK=1000000
 SEQ2_LIMIT=20
 SEQ2_LAP=0
 
 BASE=/cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time doBlastzChainNet.pl -verbose=2 DEF \
 	-smallClusterHub=pk \
 	-chainMinScore=2000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-blastzOutRoot /cluster/bluearc/galGal3ChrGasAcu1Un \
 	-stop=net -bigClusterHub=pk > do.log 2>&1 &
     #	real    116m59.888s
     #	And swapping:
     mkdir /cluster/data/gasAcu1/bed/blastz.galGal3.swap
     cd /cluster/data/gasAcu1/bed/blastz.galGal3.swap
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/galGal3/bed/blastz.gasAcu1.2007-01-11/DEF \
 	-smallClusterHub=pk \
 	-chainMinScore=2000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-swap -stop=net -bigClusterHub=pk > swap.log 2>&1 &
 
 #######################################################################
 ## BLASTZ LIZARD ANOCAR1 - (DONE - 2007-02-18 - Hiram)
     ssh kkstore05
     mkdir /cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18
     cd /cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18
 
     cat << '_EOF_' > DEF
 # chicken vs. lizard
 
 # Use same params as used for galGal3-xenTro2
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=8000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Chicken galGal3
 SEQ1_DIR=/san/sanvol1/galGal3/nib
 SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold
 SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
 SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=30
 SEQ2_LAP=0
 
 BASE=/cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time doBlastzChainNet.pl -verbose=2 DEF \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-blastzOutRoot=/san/sanvol1/scratch/galGal3AnoCar1 > do.log 2>&1
 
     time doBlastzChainNet.pl -verbose=2 DEF \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=net \
 	-blastzOutRoot=/san/sanvol1/scratch/galGal3AnoCar1 > net.log 2>&1 &
 
     time nice -n +19 featureBits galGal3 chainAnoCar1Link \
 	> fb.galGal3.chainAnoCar1Link.txt 2>&1
     #	real    0m43.752s
     #	106743952 bases of 1042591351 (10.238%) in intersection
 
     ##	swap documented in anoCar1.txt
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18/DEF \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap > swap.log 2>&1 &
 
 #######################################################################
 ## BLASTZ FUGU FR2 - (DONE - 2007-03-09 - 2007-03-12 - Hiram)
     ssh kkstore03
     mkdir /cluster/data/galGal3/bed/blastz.fr2.2007-03-09
     cd /cluster/data/galGal3/bed/blastz.fr2.2007-03-09
 
     cat << '_EOF_' > DEF
 # chicken vs. fugu 
 
 # Use same params as used for galGal3-xenTro1 (see makeXenTro1.doc)
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=8000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Chicken galGal3
 SEQ1_DIR=/san/sanvol1/galGal3/nib
 SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Fugu fr2 
 SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
 SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
 SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
 SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
 SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=30
 SEQ2_LAP=0
 
 BASE=/cluster/data/galGal3/bed/blastz.fr2.2007-03-09
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time doBlastzChainNet.pl -verbose=2 DEF \
     -qRepeats=windowmaskerSdust \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-blastzOutRoot=/cluster/bluearc/galGal3Fr2 > do.log  2>&1
     cat fb.galGal3.chainFr2Link.txt
     #	30935323 bases of 1042591351 (2.967%) in intersection
 
     mkdir /cluster/data/fr2/bed/blastz.galGal3.swap
     cd /cluster/data/fr2/bed/blastz.galGal3.swap
     time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
 	/cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap > swap.log  2>&1 &
     time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
 	/cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=net -swap > swap_net.log  2>&1 &
     #	real    3m1.239s
     cat fb.fr2.chainGalGal3Link.txt
     #	36175581 bases of 393312790 (9.198%) in intersection
 
 #######################################################################
 ## BLASTZ FUGU FR1 - (DONE - 2007-03-09 - 2007-03-12 - Hiram)
     ssh kkstore03
     mkdir /cluster/data/galGal3/bed/blastz.fr1.2007-03-09
     cd /cluster/data/galGal3/bed/blastz.fr1.2007-03-09
 
     cat << '_EOF_' > DEF
 # chicken vs. fugu 
 
 # Use same params as used for galGal3-xenTro1 (see makeXenTro1.doc)
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=8000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Chicken galGal3
 SEQ1_DIR=/san/sanvol1/galGal3/nib
 SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Fugu fr1 
 SEQ2_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit
 SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
 SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit
 SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes
 SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=30
 SEQ2_LAP=0
 
 BASE=/cluster/data/galGal3/bed/blastz.fr1.2007-03-09
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time doBlastzChainNet.pl -verbose=2 DEF \
     -qRepeats=windowmaskerSdust \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-blastzOutRoot=/cluster/bluearc/galGal3Fr1 > do.log  2>&1
     cat fb.galGal3.chainFr1Link.txt
     #	29604522 bases of 1042591351 (2.840%) in intersection
 
     mkdir /cluster/data/fr1/bed/blastz.galGal3.swap
     cd /cluster/data/fr1/bed/blastz.galGal3.swap
     time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
 	/cluster/data/galGal3/bed/blastz.fr1.2007-03-09/DEF \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap > swap.log  2>&1 &
     #	real    3m14.494s
     cat fb.fr1.chainGalGal3Link.txt
     #	33536400 bases of 315518167 (10.629%) in intersection
 
 
 #######################################################################
 # BLASTZ PLATYPUS ORNANA1 (DONE 4/9/07 angie)
     ssh kkstore03
     mkdir /cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06
     cd /cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06
     cat << '_EOF_' > DEF
 # chicken vs. platypus
 
 # Use same params as used for galGal3-xenTro2 but back off L to 6000.
 # If that causes us to get swamped, we can go back up to 8000.
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Chicken galGal3
 SEQ1_DIR=/iscratch/i/galGal3/galGal3.2bit
 SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Platypus ornAna1 - largest chunk big enough for largest scaffold
 SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
 SEQ2_LEN=/iscratch/i/ornAna1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/cluster/data/galGal3/bed/blastz.ornAna1.2007-04-06
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << emacs
     doBlastzChainNet.pl DEF \
       >& do.log & tail -f do.log
     ln -s blastz.ornAna1.2007-04-06 /cluster/data/galGal3/bed/blastz.ornAna1
 
 #########################################################################
 # BLASTZ/CHAIN/NET HORSE AND CHICKEN (DONE 2/27/07 Fan)
     ssh kkstore05
     mkdir /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
     cd /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
     cat << '_EOF_' > DEF
 # Horse vs. Chicken
 
 BLASTZ_M=50
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=8000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Horse equCab1
 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
 SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes       
 # Maximum number of scaffolds that can be lumped together
 SEQ1_LIMIT=500     
 SEQ1_CHUNK=30000000
 SEQ1_LAP=10000
 
 # QUERY: Chicken galGal3
 SEQ2_DIR=/scratch/hg/galGal3/galGal3.2bit
 SEQ2_LEN=/cluster/data/galGal3/chrom.sizes 
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
 TMPDIR=/scratch/tmp
 '_EOF_'
 
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -bigClusterHub pk \
       -chainMinScore=5000 -chainLinearGap=loose \
       -blastzOutRoot /cluster/bluearc/equCab1/blastz.galGal3 >& do.log &
     tail -f do.log
 
     ssh hgwdev
     cd /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22
     ln -s blastz.galGal3.2007-02-22 /cluster/data/equCab1/bed/blastz.galGal3
     nice featureBits equCab1 -chrom=chr1 chainGalGal3Link
 # 9136777 bases of 177498097 (5.148%) in intersection
 
     bash
     time nice -n 19 featureBits equCab1 chainGalGal3Link \
 	> fb.equCab1.chainGalGal3Link.txt 2>&1
 # 114216918 bases of 2421923695 (4.716%) in intersection
 
     ssh kkstore05
     mkdir /cluster/data/galGal3/bed/blastz.equCab1.swap
     cd /cluster/data/galGal3/bed/blastz.equCab1.swap
     bash
     time doBlastzChainNet.pl \
 	/cluster/data/equCab1/bed/blastz.galGal3.2007-02-22/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 &
     tail -f swap.log
 
 # the above script stopped with the following lines in the .log file:
 
 # netToAxt axtChain/net/chrM.net axtChain/chain/chrM.chain /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout
 # axtSort stdin stdout
 # gzip -c
 # Processing chrM
 # end
 # netToAxt axtChain/net/chrUn_random.net axtChain/chain/chrUn_random.chain /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout
 # axtSort stdin stdout
 # gzip -c
 # Processing chrUn_random
 
 # Hiram helped to created the following temp .csh:
 
     cat finiNet.csh
 #!/bin/csh -efx
 # This script was automatically generated by /cluster/bin/scripts/doBlastzChainNet.pl
 # from /cluster/data/equCab1/bed/blastz.galGal3.2007-02-22/DEF
 # It is to be executed on kolossus in /cluster/data/galGal3/bed/blastz.equCab1.swap/axtChain .
 # It generates nets (without repeat/gap stats -- those are added later on
 # hgwdev) from chains, and generates axt, maf and .over.chain from the nets.
 # This script will fail if any of its commands fail.
 
 cd /cluster/data/galGal3/bed/blastz.equCab1.swap
 
 foreach f (axtChain/net/*.net)
 netToAxt $f axtChain/chain/$f:t:r.chain \
   /scratch/hg/galGal3/galGal3.2bit /san/sanvol1/scratch/equCab1/equCab1.2bit stdout \
   | axtSort stdin stdout \
   | gzip -c > axtNet/$f:t:r.galGal3.equCab1.net.axt.gz
 end
 
 # Make mafNet for multiz: one .maf per galGal3 seq.
 mkdir mafNet
 foreach f (axtNet/*.galGal3.equCab1.net.axt.gz)
   axtToMaf -tPrefix=galGal3. -qPrefix=equCab1. $f \
         /cluster/data/galGal3/chrom.sizes  /san/sanvol1/scratch/equCab1/chrom.sizes        \
         stdout \
   | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
 end
 
     ssh kolossus
     cd /cluster/data/galGal3/bed/blastz.equCab1.swap
     fininet.csh
 
     ssh hgwdev
     cd /cluster/data/galGal3/bed/blastz.equCab1.swap
     bash
     time nice -n 19 featureBits galGal3 chainEquCab1Link \
 	> fb.galGal3.chainEquCab1Link.txt 2>&1
     # 104418042 bases of 1042591351 (10.015%) in intersection
     
 ###########################################################################
 # ENSEMBL GENES (DONE 11/2/07 angie)
     ssh hgwdev
     mkdir /cluster/data/galGal3/bed/ensembl
     cd /cluster/data/galGal3/bed/ensembl
     # Go to www.ensembl.org and click around their evolving interface 
     # to get the following types of data:
     # 1. a tab-separated file that relates Ensembl gene, transcript and
     #    protein IDs.  Save as ensGtp.txt.gz  
     # 2. peptide fasta with only transcript IDs in header -> ensPep.fa.gz
     
     # They have started dumping GTF files so we can download that directly:
     # wget
     # 'ftp://ftp.ensembl.org/pub/current_gallus_gallus/data/gtf/*'
     # -- but I forgot to do that and just grabbed the GTF from Ensembl
     # BioMart along with the GTP and Pep.
 
     # Add "chr" to chrom names in the gene data gtf file to make 
     # it compatible with our software.  Also change MT --> M.
     zcat ensembl.gff.gz | sed -e 's/^MT/M/; s/^/chr/;' > ensGene.gtf
     ldHgGene -gtf -genePredExt galGal3 ensGene ensGene.gtf
     nice featureBits galGal3 ensGene
 #30850267 bases of 1042591351 (2.959%) in intersection
 
     # strip header line:
     zcat ensGtp.txt.gz | tail +2 \
     | hgLoadSqlTab galGal3 ensGtp ~/kent/src/hg/lib/ensGtp.sql \
       /dev/stdin -notOnServer
 
     zcat ensPep.fa.gz | sed -e 's/^>.*ENSGALT/>ENSGALT/' \
     | hgPepPred galGal3 generic ensPep stdin
 
 ############################################################################
 #  BLASTZ petMar1 Lamprey (WORKING - 2008-04-11 - Hiram)
     ssh kkstore03
     screen	# use a screen to control this multi-day job
     mkdir /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11
     cd /cluster/data/galGal3/bed/blastzPetMar1.2008-04-11
 
     cat << '_EOF_' > DEF
 # Chicken vs Lamprey
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Chicken galGal3
 SEQ1_DIR=/scratch/data/galGal3/nib
 SEQ1_LEN=/cluster/data/galGal3/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Lamprey petMar1
 SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
 SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=200
 SEQ2_LAP=0
 
 BASE=/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl \
 	/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=memk -verbose=2 > do.log 2>&1 &
 # Completed: 83468 of 83468 jobs
 # CPU time in finished jobs:    2199234s   36653.91m   610.90h   25.45d  0.070 y
 # IO & Wait Time:                426454s    7107.56m   118.46h    4.94d  0.014 y
 # Average job time:                  31s       0.52m     0.01h    0.00d
 # Longest finished job:             414s       6.90m     0.12h    0.00d
 # Submission to last job:        103466s    1724.43m    28.74h    1.20d
     #	continuing after some kluster interruptions
     time nice -n +19 doBlastzChainNet.pl \
 	/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-continue=cat -bigClusterHub=memk -verbose=2 > cat.log 2>&1 &
     #	real    13m54.957s
     cat fb.galGal3.chainPetMar1Link.txt
     #	20134896 bases of 1042591351 (1.931%) in intersection
 
     #	and the swap
     mkdir /cluster/data/petMar1/bed/blastz.galGal3.swap
     cd /cluster/data/petMar1/bed/blastz.galGal3.swap
     time nice -n +19 doBlastzChainNet.pl \
 	/cluster/data/galGal3/bed/blastzPetMar1.2008-04-11/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-swap -bigClusterHub=memk -verbose=2 > swap.log 2>&1 &
     #	real    33m41.587s
     cat fb.petMar1.chainGalGal3Link.txt
     #	22308118 bases of 831696438 (2.682%) in intersection
 
 ############################################################################
 #  BLASTZ petMar1 Lanclet (WORKING - 2008-04-15 - Hiram)
     ssh kkstore03
     screen	# use a screen to control this multi-day job
     mkdir /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15
     cd /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15
 
     cat << '_EOF_' > DEF
 # Chicken vs Lanclet
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Chicken galGal3
 SEQ1_DIR=/scratch/data/galGal3/nib
 SEQ1_LEN=/scratch/data/galGal3/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Lancelet braFlo1 - largest chunk big enough for largest scaffold
 #       Largest scaffold 7,200,735 - 3032 scaffolds + chrM
 SEQ2_DIR=/scratch/data/braFlo1/braFlo1.2bit
 SEQ2_LEN=/scratch/data/braFlo1/chrom.sizes
 SEQ2_CTGDIR=/scratch/data/braFlo1/braFlo1UnScaffolds.2bit
 SEQ2_CTGLEN=/scratch/data/braFlo1/braFlo1UnScaffolds.sizes
 SEQ2_LIFT=/scratch/data/braFlo1/braFlo1.lift
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=30
 SEQ2_LAP=0
 
 BASE=/cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl \
 	/cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=pk -verbose=2 > do.log 2>&1 &
     #	real    168m49.357s
     cat fb.galGal3.chainBraFlo1Link.txt
     #	19795687 bases of 1042591351 (1.899%) in intersection
 
     #	and the swap
     mkdir /cluster/data/braFlo1/bed/blastz.galGal3.swap
     cd /cluster/data/braFlo1/bed/blastz.galGal3.swap
     time nice -n +19 doBlastzChainNet.pl \
 	/cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-swap -bigClusterHub=pk -verbose=2 > swap.log 2>&1 &
     #	real    4m45.054s
     cat fb.braFlo1.chainGalGal3Link.txt
     #	30287175 bases of 923355587 (3.280%) in intersection
 
 #############################################################################
 # BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-18 - larrym)
     ssh kkstore04
     screen #	use screen to control this multi-day job
     mkdir /cluster/data/galGal3/bed/blastz.equCab2.2008-04-18
     cd /cluster/data/galGal3/bed/blastz.equCab2.2008-04-18
     cat << '_EOF_' > DEF
 # Chicken vs. Horse
 
 BLASTZ_M=50
 
 # TARGET: Chicken calGal3
 SEQ1_DIR=/scratch/data/galGal3/nib
 SEQ1_LEN=/cluster/data/galGal3/chrom.sizes 
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Horse
 SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
 SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
 SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit
 SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes
 SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=100
 SEQ2_LAP=0
 
 BASE=/cluster/data/galGal3/bed/blastz.equCab2.2008-04-18
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time doBlastzChainNet.pl `pwd`/DEF \
 	-verbose=2 -bigClusterHub=pk  \
       -chainMinScore=3000 -chainLinearGap=medium \
       -blastzOutRoot /cluster/bluearc/equCab2/blastz.galGal3 >& do.log &
 
 failed b/c para.results got currupted:
 
     para recover jobList recoverJobList
     para create recoverJobList
     Checking input files
     0 jobs written to batch
 
 so all jobs had successfully completed.
 
 Generate run.time file manually:
 
     time doBlastzChainNet.pl `pwd`/DEF -continue=cat \
 	-verbose=2 -bigClusterHub=pk  \
       -chainMinScore=3000 -chainLinearGap=medium \
       -blastzOutRoot /cluster/bluearc/equCab2/blastz.galGal3 >>& do.log &
 
 but the batch file has disappeared ... I just decided to remove the psl
 files and re-run from scratch:
 
     time doBlastzChainNet.pl `pwd`/DEF \
 	-verbose=2 -bigClusterHub=pk  \
       -chainMinScore=3000 -chainLinearGap=medium \
       -blastzOutRoot /cluster/bluearc/equCab2/blastz.galGal3 >& do.log &
 0.248u 0.190s 6:10:43.78 0.0%   0+0k 0+0io 26pf+0w
 
     ln -s blastz.equCab2.2008-04-18 /cluster/data/galGal3/bed/blastz.equCab2
 ############################################################################
 # TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
 
 see doc/builds.txt for specific details.
 ############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
 
 see doc/builds.txt for specific details.
 
 
 #####################################################
 # build 2bit with contigs from random chroms and chrUn (DONE braney 2008-08-30)
     cd /cluster/data/galGal3
     fgrep -h 'random 
 Un' */*.agp | awk '{if ($5 == "W") print $1, $2-1, $3, $6}' \
     | while read chr start stop contig; \
     do \
 	echo ">$contig"; twoBitToFa galGal3.2bit:$chr:$start-$stop stdout \
 	| tail -n +2; \
     done > unRandom.fa
 
     fgrep -v 'random 
 Un' chrom.sizes | awk '{print $1}' | \
     twoBitToFa galGal3.2bit -seqList=stdin noUn.fa
     cat noUn.fa unRandom.fa | faToTwoBit stdin galGal3.blastz.2bit
 
     twoBitInfo galGal3.blastz.2bit galGal3.blastz.sizes
 
 #########################################################################
 # BLASTZ/CHAIN/NET TAEGUT1 (DONE braney 2008-09-04)
     ssh swarm
     screen
     mkdir /cluster/data/galGal3/bed/blastz.taeGut1.2008-09-06
     cd /cluster/data/galGal3/bed/blastz.taeGut1.2008-09-06
     cat << _EOF_ > DEF
 # chicken vs. zebra finch
 BLASTZ_T=2
 BLASTZ_M=50
 
 # TARGET: Chicken galGal3
 # SEQ1_DIR=/scratch/data/galGal3/galGal3.2bit
 # SEQ1_LEN=/scratch/data/galGal3/chrom.sizes
 SEQ1_DIR=/hive/data/genomes/galGal3/galGal3.2bit
 SEQ1_LEN=/hive/data/genomes/galGal3/chrom.sizes
 # SEQ1_CTGDIR=/hive/data/genomes/galGal3/galGal3.blastz.2bit
 # SEQ1_CTGLEN=/hive/data/genomes/galGal3/galGal3.blastz.sizes
 # SEQ1_LIFT=/hive/data/genomes/galGal3/jkStuff/liftAll.lft
 # one chrom at a time
 SEQ1_CHUNK=200000000
 SEQ1_LAP=0
 
 # QUERY: Zebra finch taeGut1
 # SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit
 # SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes
 SEQ2_DIR=/hive/data/genomes/taeGut1/taeGut1.2bit
 SEQ2_LEN=/hive/data/genomes/taeGut1/chrom.sizes
 SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit
 SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes
 SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=100
 
 BASE=/hive/data/genomes/galGal3/bed/blastz.taeGut1.2008-09-06
 _EOF_
     # << emacs
      doBlastzChainNet.pl -syntenicNet \
      -bigClusterHub=swarm -chainMinScore=3000 -chainLinearGap=medium \
      -smallClusterHub=swarm DEF -workhorse=swarm \
      -qRepeats=windowmaskerSdust > do.log 2>&1 &
 
     cd /cluster/data/galGal3/bed
     rm -f blastz.taeGut1
     ln -s blastz.taeGut1.2008-09-06 /cluster/data/galGal3/bed/blastz.taeGut1
 
 ############
 
 ################################################
 # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
 update genbank.conf:
 galGal3.upstreamGeneTbl = refGene
 galGal3.upstreamMaf = multiz7way /hive/data/genomes/galGal3/bed/multiz7way/species.lst
 
 ############################################################################
 # QUALITY TRACK (DONE - 2008-11-24 - Hiram)
     cd /hive/data/genomes/galGal3
     # create an agp file
 for C in `cut -f1 chrom.sizes | grep -v random | sed -e "s/chr//" | sort`
 do
     cat $C/chr${C}.agp
 done > galGal3.agp
 
 for C in `cut -f1 chrom.sizes | grep -v random | sed -e "s/chr//" | sort`
 do
     F=$C/chr${C}_random.agp
     if [ -s $F ]; then
         cat $F
     fi
 done >> galGal3.agp
 
     cd /hive/data/genomes/galGal3/downloads
     qaToQac contigs.fa.qual.gz assembly.qac
     qacAgpLift -mScore=99 -verbose=2 ../galGal3.agp assembly.qac scaffolds.qac
 
     mkdir /hive/data/genomes/galGal3/bed/qual
     cd /hive/data/genomes/galGal3/bed/qual
     qacToWig -fixed ../../downloads/scaffolds.qac stdout \
         | wigEncode stdin qual.wig qual.wib
     ln -s `pwd`/qual.wib /gbdb/galGal3/wib
     hgLoadWiggle -pathPrefix=/gbdb/galGal3/wib galGal3 quality qual.wig
 
 #############################################################################
 # N-SCAN gene predictions (nscanGene) - (2009-03-13 markd)
 
     # obtained NSCAN predictions from michael brent's group
     # at WUSTL
     cd /cluster/data/galGal3/bed/nscan/
     http://mblab.wustl.edu/predictions/chicken/galGal3/
 
     wget -nv http://mblab.wustl.edu/predictions/chicken/galGal3/galGal3.gtf
     wget -nv http://mblab.wustl.edu/predictions/chicken/galGal3/galGal3.fa
     wget -nv http://mblab.wustl.edu/predictions/chicken/galGal3/README
     bzip2 galGal3.*
     chmod a-w *
     # load track
     ldHgGene -bin -gtf -genePredExt galGal3 nscanGene galGal3.gtf.bz2
     hgPepPred galGal3 generic nscanPep  galGal3.fa.bz2
     rm *.tab
 
     # chicken/galGal3/trackDb.ra, add:
     track nscanGene override
     informant Chicken N-SCAN uses Zerba Finch (taeGut1) as the informant, updated with PASA clusters of chicken cDNAs.
 
     searchTable nscanGene
     searchType genePred
     termRegex chr[0-9a-zA-Z_]+\.([0-9]+|pasa)\.[0-9]+(\.[0-9a-z]+)?
     searchPriority 50
 
+############################################################################
+# Re-Run equCab2 alignment (DONE - 2009-06-29,07-01 - Hiram)
+    mkdir /hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29
+    cd /hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29
+
+    cat << '_EOF_' > DEF
+# Chicken vs. Horse
+
+BLASTZ_M=50
+
+# TARGET: Human galGal3
+SEQ1_DIR=/scratch/data/galGal3/bothMaskedNibs
+SEQ1_LEN=/scratch/data/galGal3/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=10000
+
+# QUERY: Horse
+SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
+SEQ2_LEN=/scratch/data/equCab2/chrom.sizes
+SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit
+SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes
+SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift
+SEQ2_CHUNK=20000000
+SEQ2_LIMIT=100
+SEQ2_LAP=0
+
+BASE=/hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29
+TMPDIR=/scratch/tmp
+'_EOF_'
+    # << happy emacs
+
+    time doBlastzChainNet.pl `pwd`/DEF \
+	-noLoadChainSplit -verbose=2 -bigClusterHub=pk \
+	-workhorse=hgwdev \
+	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
+    #	real    220m54.623s
+    #	 cat fb.galGal3.chainEquCab2Link.txt 
+    #	68476046 bases of 1042591351 (6.568%) in intersection
+
+    mkdir /hive/data/genomes/equCab2/bed/blastz.galGal3.swap
+    cd /hive/data/genomes/equCab2/bed/blastz.galGal3.swap
+    time doBlastzChainNet.pl \
+	/hive/data/genomes/galGal3/bed/lastzEquCab2.2009-06-29/DEF \
+	-noLoadChainSplit -verbose=2 -bigClusterHub=pk \
+	-swap -workhorse=hgwdev \
+	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
+    #	real    10m1.381s
+    cat fb.equCab2.chainGalGal3Link.txt 
+    #	73336799 bases of 2428790173 (3.019%) in intersection
+
+############################################################################