src/hg/makeDb/doc/rn4.txt 1.38
1.38 2010/02/12 20:40:05 fanhsu
Added RGD Genes build pipeline.
Index: src/hg/makeDb/doc/rn4.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/rn4.txt,v
retrieving revision 1.37
retrieving revision 1.38
diff -b -B -U 1000000 -r1.37 -r1.38
--- src/hg/makeDb/doc/rn4.txt 9 Feb 2010 23:17:59 -0000 1.37
+++ src/hg/makeDb/doc/rn4.txt 12 Feb 2010 20:40:05 -0000 1.38
@@ -1,5105 +1,5222 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on the Rattus
# Norvegicus genome, November 2004 update (Rnor3.4) from Baylor.
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+-------------------------------------+
# | tableName | type |
# +-------------+-------------------------------------+
# | knownGene | genePred knownGenePep knownGeneMrna |
# | refGene | genePred refPep refMrna |
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | mgcGenes | genePred |
# | ensGene | genePred ensPep |
# | genscan | genePred genscanPep |
# +-------------+-------------------------------------+
# CREATE BUILD DIRECTORY (DONE 1/27/06 angie)
# df -h /cluster/store*, choose the one with the most space...
ssh kkstore01
mkdir /cluster/store9/rn4
ln -s /cluster/store9/rn4 /cluster/data/rn4
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 1/27/06 angie)
mkdir /cluster/data/rn4/M
cd /cluster/data/rn4/M
# go to http://www.ncbi.nih.gov/ and search Nucleotide for
# "Rattus norvegicus mitochondrion complete genome".
# There is more than one of those... I picked NC_001665 whose gi # is
# 5835177
# Use that number in the entrez linking interface to get fasta:
wget -O chrM.fa \
'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5835177&dopt=FASTA'
# Edit chrM.fa: make sure the long fancy header line says it's the
# Rattus norvegicus mitochondrion complete genome, and then replace the
# header line with just ">chrM".
# DOWNLOAD SEQUENCE (DONE 1/27/06 angie)
ssh kkstore01
cd /cluster/data/rn4
mkdir downloads
cd downloads
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/README
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Un X)
echo chr$c
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/chromosomes/Rnor3.4chr${c}.fa.gz
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/chromosomes/Rnor3.4chr${c}.fa.qual.gz
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/chromosomes/Rnor3.4chr${c}-random.fa.gz
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/chromosomes/Rnor3.4chr${c}-random.fa.qual.gz
echo ""
end
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Un X)
echo chr$c
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/contigs/chr{$c}.agp
end
mkdir bacs
cd bacs
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Un X)
echo chr$c
wget ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Rnorvegicus/Rnor3.4/contigs/chr${c}.contig_bacs.fa.gz
end
cd ..
# Distribute into chrom dirs.
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 Un X)
echo chr$c
mkdir ../$c
zcat Rnor3.4chr${c}.fa.gz \
| sed -e 's/^>gnl[|].*[|]/>/' > ../$c/chr${c}.fa
zcat Rnor3.4chr${c}-random.fa.gz \
| sed -e 's/^>gnl[|].*[|]/>/' > ../$c/chr${c}_random.fa
mv chr${c}.agp ../$c/
end
#mv: cannot stat `chrUn.agp': No such file or directory
# No agp for Un, nor *_random.... guess we'll have to use hgFakeAgp
# to at least get gaps for those.
# checkAgpAndFa prints out way too much info -- keep the end/stderr only:
foreach agp (*/chr*.agp)
set fa = $agp:r.fa
echo checking consistency of $agp and $fa
checkAgpAndFa $agp $fa | tail -1
end
faSize */chr*.fa
#2834127293 bases (267832528 N's 2566294765 real 2566294765 upper 0 lower) in 45 sequences in 45 files
#Total size: mean 62980606.5 sd 75162894.7 min 16300 (chrM) max 267910886 (chr1) median 6862066
#N count: mean 5951834.0 sd 6940811.4
#U count: mean 57028772.6 sd 68565220.7
#L count: mean 0.0 sd 0.0
# MAKE FAKE AGP WHERE NECESSARY (DONE 1/27/06 angie)
# In the chromosomal AGP, all gaps are marked "fragment" *except* for
# gaps of exactly 50000 which are "clone". There are gaps of >>50000
# marked "fragment"! However, in the chr*_random and chrUn here,
# that just seems wrong... chrUn has a gap of 2602000, how could that
# possibly be bridged? (Why are they wasting that kind of space?)
# So just count >= 50000 as "clone no", even though that is not what
# they do in their AGP for the chroms.
ssh kkstore01
cd /cluster/data/rn4
foreach f (?{,?}/chr*.fa)
set agp = $f:r.agp
if (! -e $agp) then
echo Faking missing AGP $agp
hgFakeAgp -minContigGap=50 -minScaffoldGap=50000 $f stdout \
| sed -e 's/contig/fragment/; s/scaffold/clone/' \
> $agp
endif
end
# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 1/27/06 angie)
ssh kkstore01
cd /cluster/data/rn4
foreach agp (*/chr*.agp)
set fa = $agp:r.fa
echo splitting $agp and $fa
cp -p $agp $agp.bak
cp -p $fa $fa.bak
splitFaIntoContigs $agp $fa . -nSize=5000000
end
# splitFaIntoContigs makes new dirs for _randoms. Move their contents
# back into the main chrom dirs and get rid of the _random dirs.
foreach d (*_random)
set base = `echo $d | sed -e 's/_random$//'`
mv $d/lift/oOut.lst $base/lift/rOut.lst
mv $d/lift/ordered.lft $base/lift/random.lft
mv $d/lift/ordered.lst $base/lift/random.lst
rmdir $d/lift
mv $d/* $base
rmdir $d
end
# checkAgpAndFa again to get a warm-fuzzy
foreach agp (*/chr*.agp)
set fa = $agp:r.fa
echo checking consistency of $agp and $fa
checkAgpAndFa $agp $fa | tail -1
end
# Make a "pseudo-contig" for processing chrM too:
mkdir M/chrM_1
sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
mkdir M/lift
set mSize=`faSize M/chrM.fa | awk '{print $1;}'`
echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
echo "chrM_1" > M/lift/ordered.lst
echo "0 M/chrM_1 "$mSize" chrM "$mSize"" > M/lift/ordered.lft
# MAKE JKSTUFF AND BED DIRECTORIES (DONE 1/27/06 angie)
# This used to hold scripts -- better to keep them inline here so
# they're in CVS. Now it should just hold lift file(s) and
# temporary scripts made by copy-paste from this file.
mkdir /cluster/data/rn4/jkStuff
# This is where most tracks will be built:
mkdir /cluster/data/rn4/bed
# REPEAT MASKING (DONE 3/16/06 angie)
# Originally run 1/30/06. Coverage was a bit low (43.020%, compared to
# 43.463% for rn3) so I made a test case (see below) for Robert Hubley,
# who fixed a bug and re-released RepeatMasker 3/14/06. So I'm
# re-running... original chr*.fa.out have been moved to .bak and
# lower-level .fa.out's have been removed. Added -ali at Robert H's
# request.
# Record RM version used:
ls -l /cluster/bluearc/RepeatMasker
#lrwxrwxrwx 1 angie protein 18 Mar 14 16:56 /cluster/bluearc/RepeatMasker -> RepeatMasker060314/
grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl
#CC RELEASE 20060314; *
# Run RepeatMasker on a dummy input, just to make it initialize its
# rat libraries once before the cluster run:
/cluster/bluearc/RepeatMasker/RepeatMasker -spec rat /dev/null
#Building species libraries in: /cluster/bluearc/RepeatMasker060314/Libraries/20060314/rattus
#- Split contigs into 500kb chunks, at gaps if possible:
ssh kkstore01
cd /cluster/data/rn4
foreach c (?{,?})
foreach d ($c/chr${c}*_?{,?})
cd $d
echo "splitting $d"
set contig = $d:t
faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
cd ../..
end
end
#- Make the run directory and job list:
cd /cluster/data/rn4
cat << '_EOF_' > jkStuff/RMRat
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/rn4/$2
/bin/cp $2 /tmp/rn4/$2/
cd /tmp/rn4/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -ali -spec rat $2
popd
/bin/cp /tmp/rn4/$2/$2.out ./
if (-e /tmp/rn4/$2/$2.tbl) /bin/cp /tmp/rn4/$2/$2.tbl ./
if (-e /tmp/rn4/$2/$2.cat) /bin/cp /tmp/rn4/$2/$2.cat ./
/bin/rm -fr /tmp/rn4/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/rn4/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/rn4
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMRat
mkdir RMRun
cp /dev/null RMRun/RMJobs
foreach c (?{,?})
foreach d ($c/chr${c}{,_random}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/rn4/jkStuff/RMRat \
/cluster/data/rn4/$d $f \
'{'check out line+ /cluster/data/rn4/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
end
#- Do the run
ssh pk
cd /cluster/data/rn4/RMRun
para create RMJobs
para make RMJobs; para time | mail -s 'RM cluster run finished' $USER
#Completed: 6487 of 6488 jobs
#Crashed: 1 jobs
#CPU time in finished jobs: 27797186s 463286.43m 7721.44h 321.73d 0.881 y
#IO & Wait Time: 41276s 687.94m 11.47h 0.48d 0.001 y
#Average job time: 4291s 71.52m 1.19h 0.05d
#Longest finished job: 6361s 106.02m 1.77h 0.07d
#Submission to last job: 133586s 2226.43m 37.11h 1.55d
# The one crash was a divide-by-zero error on chr1_24_04 -- the patch
# that Robert sent for open-3-1-3 was missing from open-3-1-4. So I
# applied it to open-3-1-4 and manually ran ProcessRepeats on the .cat.
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kkstore01
cd /cluster/data/rn4
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
> /dev/null
end
#- Lift pseudo-contigs to chromosome level
foreach c (?{,?})
echo lifting $c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
> /dev/null
endif
if (-e lift/random.lft && ! -z lift/random.lft) then
liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \
> /dev/null
endif
cd ..
end
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/rn4
hgLoadOut rn4 ?{,?}/chr*.fa.out
# Many warnings like this one:
#Strange perc. field -7282.5 line 396666 of 1/chr1.fa.out
#Strange perc. field -143.1 line 396666 of 1/chr1.fa.out
# That's from chr1_49, not chr1_24 so it is independent of the tweak to
# ProcessRepeats.
# There was also this warning at the end:
#note: 249 records dropped due to repStart > repEnd
# run with -verbose=2 for details
# Ran with -verbose=2 on kolossus... these are some of the warnings:
#bad rep range [4533, 4532] line 955 of 14/chr14.fa.out
#bad rep range [6936, 6935] line 15888 of 14/chr14.fa.out
#bad rep range [458, 457] line 30645 of 14/chr14.fa.out
#bad rep range [5798, 5797] line 85282 of 14/chr14.fa.out
#TODO: send those on to Robert too.
nice featureBits rn4 rmsk
#1137387810 bases of 2571531505 (44.230%) in intersection
# Good -- it's now higher than rn3:
#1117483165 bases of 2571104688 (43.463%) in intersection
# CREATING DATABASE (DONE 1/27/06 angie)
ssh hgwdev
hgsql '' -e 'create database rn4'
# Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
#/dev/sdc1 1.8T 1.6T 88G 95% /var/lib/mysql
# Wow... we sure have filled that sucker up.
# CREATING GRP TABLE FOR TRACK GROUPING (DONE 1/27/06 angie)
ssh hgwdev
hgsql rn4 -e "create table grp (PRIMARY KEY(NAME)) select * from rn3.grp"
# MAKE REPEATMASKER LOW-COVERAGE TEST CASE (DONE 3/10/06 angie)
# Eyeball some repeat annotations in the browser, compare to lib seqs.
# Run featureBits on rn4 and on previous genome build, and compare:
ssh hgwdev
nice featureBits rn4 rmsk
# Hmmmm... seems wrong for this to decrease...!
#1106283290 bases of 2571531505 (43.020%) in intersection
nice featureBits rn3 rmsk
#1117483165 bases of 2571104688 (43.463%) in intersection
# Do some comparisons on the smallest chrom, chr20.
featureBits rn3 -chrom=chr20 rmsk
#20269238 bases of 49432254 (41.004%) in intersection
# liftOver rn3 chr20 rmsk to rn4, find non-overlap and compare:
ssh kolossus
awk '$1 ~ /[0-9]/ {print $5 "\t" $6 "\t" $7;}' \
/cluster/data/rn3/20/chr20.fa.out > /tmp/rn3.chr20_rmsk.bed
liftOver /tmp/rn3.chr20_rmsk.bed \
/cluster/data/rn3/bed/blat.rn4.2006-02-07/rn3ToRn4.over.chain.gz \
/tmp/rn4.chr20_rmskLifted.bed /tmp/rn4.chr20.unmapped
featureBits rn4 -chrom=chr20 rmsk
#19994056 bases of 49409453 (40.466%) in intersection
featureBits rn4 -chrom=chr20 /tmp/rn4.chr20_rmskLifted.bed
#20117851 bases of 49409453 (40.717%) in intersection
# Wow, even after some small liftOver losses, the rn3 rmsk run is
# still bigger than the rn4 run! Look for large missing pieces:
featureBits rn4 -chrom=chr20 /tmp/rn4.chr20_rmskLifted.bed \!rmsk \
-minSize=500 -bed=stdout
#chr20 107997 108538 chr20.1
#chr20 15410534 15411405 chr20.2
#chr20 26448452 26448976 chr20.3
#chr20 34984144 34984704 chr20.4
#chr20 41862299 41862843 chr20.5
#chr20 49189694 49190470 chr20.6
#chr20 50960151 50960656 chr20.7
#chr20 53898688 53899203 chr20.8
#chr20 54856700 54857268 chr20.9
#634331 bases of 49409453 (1.284%) in intersection
# For the first such region above, the rn3 location is the same
# (chr20:107998-108538) and is covered by a LINE... 34.1% divergence.
# Well, that should make for a handy little test case because the
# sequence has not changed at all -- only the RM version.
mkdir /cluster/data/rn4/RMTest
cd /cluster/data/rn4/RMTest
twoBitToFa ../rn4.2bit -seq=chr20 -start=100000 -end=110000 chr20_frag.fa
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec rat chr20_frag.fa
mkdir open-3-1-3
mv chr20_frag.fa.* open-3-1-3/
# The version used for rn3:
/cluster/bluearc/RepeatMasker030619/RepeatMasker -s -spec rat chr20_frag.fa
mkdir 030619
mv chr20_frag.fa.* 030619/
# The next-latest version:
/cluster/bluearc/RepeatMasker051101/RepeatMasker -s -spec rat chr20_frag.fa
mkdir open-3-1-2
mv chr20_frag.fa.* open-3-1-2/
# 3-1-2 makes exactly the same .out as 3-1-3. 030619 has one more LINE...
# The first open- version that we have -- .out identical to 3-1-3:
/cluster/bluearc/RepeatMasker040909/RepeatMasker -s -spec rat chr20_frag.fa
mkdir open-3-0-5
mv chr20_frag.fa.* open-3-0-5/
# The only other version between 030619 and 040909:
/cluster/bluearc/RepeatMasker040130/RepeatMasker -s -spec rat chr20_frag.fa
mkdir 040130
mv chr20_frag.fa.* 040130/
# Sent the test case to Robert Hubley -- he found a rodent-specific bug.
# MAKE LIFTALL.LFT (DONE 1/27/06 angie)
ssh kkstore01
cd /cluster/data/rn4
cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft
# GOLD AND GAP TRACKS (DONE 1/27/06 angie)
ssh hgwdev
cd /cluster/data/rn4
hgGoldGapGl -noGl rn4 /cluster/data/rn4 .
# Wow, tons of warnings like this:
#unexpected coords (6960, 6960) for frag chrX_random_2 in chrom chrX_random
# -- There really are little single non-N bases between large blocks
# of N in the random fasta! Sheesh, what a mess. But randoms are
# the dregs...
# SIMPLE REPEATS (TRF) (DONE 1/30/06 angie)
ssh kolossus
mkdir /cluster/data/rn4/bed/simpleRepeat
cd /cluster/data/rn4/bed/simpleRepeat
mkdir trf
cp /dev/null jobs.csh
foreach d (/cluster/data/rn4/*/chr*_?{,?})
set ctg = $d:t
foreach f ($d/${ctg}.fa)
set fout = $f:t:r.bed
echo $fout
echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
>> jobs.csh
end
end
csh -ef jobs.csh >&! jobs.log &
# check on this with
tail -f jobs.log
wc -l jobs.csh
ls -1 trf | wc -l
#591
endsInLf trf/*
#trf/chrM_1.bed zero length
# When job is done do:
liftUp simpleRepeat.bed /cluster/data/rn4/jkStuff/liftAll.lft warn \
trf/*.bed
# Load into the database:
ssh hgwdev
hgLoadBed rn4 simpleRepeat \
/cluster/data/rn4/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
nice featureBits rn4 simpleRepeat
#72859247 bases of 2571531505 (2.833%) in intersection
# Compare to rn3:
nice featureBits rn3 simpleRepeat
#70073656 bases of 2571104688 (2.725%) in intersection
# SET UP DB ON KOLOSSUS (DONE 1/30/06 angie)
# to spare hgwdev, make an rn4 on kolossus so that we have the
# option of loading there and pushing to hgwdev. It will need the
# gap tables and also chromInfo (added below) so that we can run
# featureBits on it.
hgsql '' -e 'create database rn4'
cd /cluster/data/rn4
hgGoldGapGl -noGl rn4 /cluster/data/rn4 .
# CYTOBAND (DONE 1/30/06 angie)
ssh hgwdev
mkdir /cluster/data/rn4/bed/cytoBand
cd /cluster/data/rn4/bed/cytoBand
wget ftp://ftp.ncbi.nih.gov/genomes/R_norvegicus/mapview/ideogram.gz
zcat ideogram.gz | sort -k1,1 -k6n,6n > ideogram.sorted
/cluster/bin/scripts/createNcbiCytoBand ideogram.sorted
# Load the bed file into both cytoBand and cytoBandIdeo:
hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \
rn4 cytoBand cytoBand.bed
hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBandIdeo.sql \
rn4 cytoBandIdeo cytoBand.bed
# Doh, the file does not have stain information! All of the bands
# are the same as in rn3, so grab stain info from there.
hgsql rn4 -e '\
create table bandToStain select name,gieStain from rn3.cytoBand; \
update cytoBand,bandToStain \
set cytoBand.gieStain = bandToStain.gieStain \
where cytoBand.name = bandToStain.name; \
update cytoBandIdeo,bandToStain \
set cytoBandIdeo.gieStain = bandToStain.gieStain \
where cytoBandIdeo.name = bandToStain.name; \
drop table bandToStain;'
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
ssh hgwdev
cd /cluster/data/rn4/bed
mkdir microsat
cd microsat
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
/cluster/bin/i386/hgLoadBed rn4 microsat microsat.bed
# PROCESS SIMPLE REPEATS INTO MASK (DONE 1/30/06 angie)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kkstore01
cd /cluster/data/rn4/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/chr*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/rn4
mkdir bed/simpleRepeat/trfMaskChrom
foreach c (?{,?})
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# Here's the coverage for the filtered TRF:
ssh kolossus
cat /cluster/data/rn4/bed/simpleRepeat/trfMaskChrom/*.bed \
> /tmp/filtTrf.bed
featureBits rn4 /tmp/filtTrf.bed
#50551402 bases of 2571531505 (1.966%) in intersection
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 3/17/06 angie)
# Originally done 1/30 -- redoing after re-run of RepeatMasker.
ssh kkstore01
cd /cluster/data/rn4
# Soft-mask (lower-case) the contig and chr .fa's,
# then make hard-masked versions from the soft-masked.
set trfCtg=bed/simpleRepeat/trfMask
set trfChr=bed/simpleRepeat/trfMaskChrom
foreach f (?{,?}/chr*.fa)
echo "repeat- and trf-masking $f"
maskOutFa -soft $f $f.out $f
set chr = $f:t:r
maskOutFa -softAdd $f $trfChr/$chr.bed $f
echo "hard-masking $f"
maskOutFa $f hard $f.masked
end
foreach c (?{,?})
echo "repeat- and trf-masking contigs of chr$c, chr${c}_random"
foreach d ($c/chr*_?{,?})
set ctg=$d:t
set f=$d/$ctg.fa
maskOutFa -soft $f $f.out $f
maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
maskOutFa $f hard $f.masked
end
end
# Make 2bit (for hgBlat, browser):
faToTwoBit ?{,?}/chr*.fa rn4.2bit
# Make nib (for blastz w/linSpecRep, OK for genbank too):
mkdir nib
foreach f (?{,?}/chr*.fa)
echo $f:t:r
faToNib -softMask $f nib/$f:t:r.nib
end
# PUT NIBS ON /SCRATCH (DONE 3/17/06 angie)
# Originally done 1/30 -- redoing after re-run of RepeatMasker.
ssh kkstore01
mkdir /cluster/bluearc/scratch/hg/rn4
rsync -av /cluster/data/rn4/nib/* /cluster/bluearc/scratch/hg/rn4/nib/
cp -p /cluster/data/rn4/rn4.2bit /cluster/bluearc/scratch/hg/rn4/
# Ask cluster-admin to distribute to /scratch on big & small cluster
# MAKE CHROMINFO TABLE WITH 2BIT (DONE 1/30/06 angie)
ssh kkstore01
cd /cluster/data/rn4
mkdir bed/chromInfo
twoBitInfo rn4.2bit stdout \
| awk '{print $1 "\t" $2 "\t/gbdb/rn4/rn4.2bit";}' \
> bed/chromInfo/chromInfo.tab
# Link to 2bit from /gbdb/rn4/:
ssh hgwdev
mkdir /gbdb/rn4
ln -s /cluster/data/rn4/rn4.2bit /gbdb/rn4/
# Load /gbdb/rn4/rn4.2bit paths into database and save size info.
hgsql rn4 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql rn4 -e 'load data local infile \
"/cluster/data/rn4/bed/chromInfo/chromInfo.tab" \
into table chromInfo;'
echo "select chrom,size from chromInfo" | hgsql -N rn4 > chrom.sizes
# take a look at chrom.sizes size
wc chrom.sizes
# 45 90 789 chrom.sizes
# Load chromInfo on kolossus too (required for featureBits):
ssh kolossus
hgsql rn4 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql rn4 -e 'load data local infile \
"/cluster/data/rn4/bed/chromInfo/chromInfo.tab" \
into table chromInfo;'
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 1/30/06 angie)
# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd ~/kent/src/hg/makeDb/trackDb
cvsup
# Add trackDb directories and a description.html
mkdir rat/rn4
cvs add rat/rn4
cvs add rat/rn4/description.html
cvs ci rat/rn4
# Edit that makefile to add rn4 in all the right places and do
make update DBS=rn4
mkdir /gbdb/rn4/html
cvs ci makefile
# Go public on genome-test. In a clean tree (no mods, up-to-date):
cvs up makefile
make alpha
# Note: hgcentral*.genome values must correspond
# with defaultDb.genome values
hgsql -h genome-testdb hgcentraltest \
-e 'INSERT INTO dbDb \
(name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) values \
("rn4", "Nov. 2004", "/gbdb/rn4", "Rat", \
"chr2:22845558-23004134", 1, 30, "Rat", \
"Rattus norvegicus", "/gbdb/rn4/html/description.html", \
0, 0, "Baylor HGSC v. 3.4");'
# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 1/30/06 angie)
# chrom{Fa*,Out}, md5sum.txt REDONE, 3/17/06, after RepeatMasker re-run.
# upstream*.fa.gz added 6/23/06
ssh kolossus
cd /cluster/data/rn4
#- Build the .tar.gz files -- no genbank for now.
cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf bigZips
mkdir bigZips
tar cvzf bigZips/chromAgp.tar.gz ?{,?}/chr*.agp
tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out
tar cvzf bigZips/chromFa.tar.gz ?{,?}/chr*.fa
tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}/chr*.fa.masked
cd bed/simpleRepeat
tar cvzf ../../bigZips/chromTrf.tar.gz trfMaskChrom/chr*.bed
cd ../..
'_EOF_'
# << this line makes emacs coloring happy
csh -ef ./jkStuff/zipAll.csh >& zipAll.log &
tail -f zipAll.log
#- Look at zipAll.log to make sure all file lists look reasonable.
cd bigZips
md5sum *.gz > md5sum.txt
# Make a README.txt
cd ..
mkdir chromGz
foreach f ( ?{,?}/chr*.fa )
echo $f:t:r
gzip -c $f > chromGz/$f:t.gz
end
cd chromGz
md5sum *.gz > md5sum.txt
# Make a README.txt
#- Link the .gz and .txt files to hgwdev:/usr/local/apache/...
ssh hgwdev
set gp = /usr/local/apache/htdocs/goldenPath/rn4
mkdir -p $gp/bigZips
ln -s /cluster/data/rn4/bigZips/{chrom*.tar.gz,*.txt} $gp/bigZips
mkdir -p $gp/chromosomes
ln -s /cluster/data/rn4/chromGz/{chr*.gz,*.txt} $gp/chromosomes
# Take a look at bigZips/* and chromosomes/*
mkdir $gp/database
# Create README.txt files in database/ to explain the files.
# 6/23/06 -- Add upstream*.fa.gz.
cd /cluster/data/rn4/bigZips
foreach size (1000 2000 5000)
echo $size
featureBits rn4 refGene:upstream:$size -fa=stdout \
| nice gzip -c > upstream$size.fa.gz
end
nice md5sum up*.gz >> md5sum.txt
ln -s /cluster/data/rn4/bigZips/up*.gz $gp/bigZips/
# MAKE 11.OOC (DONE 2/10/06 angie)
ssh kolossus
cd /cluster/data/rn4
mkdir /cluster/bluearc/rn4
blat rn4.2bit /dev/null /dev/null \
-tileSize=11 -makeOoc=/cluster/bluearc/rn4/11.ooc -repMatch=1024
#Wrote 25608 overused 11-mers to /cluster/bluearc/rn4/11.ooc
# and for use in other contexts:
cp -p /cluster/bluearc/rn4/11.ooc /cluster/data/rn4
# GENBANK AUTO UPDATE (DONE 2/10/06 angie)
# align with revised genbank process. drop xeno ESTs.
cd ~/kent/src/makeDb/genbank
cvsup
# edit etc/genbank.conf to add rn4
# rn4
rn4.serverGenome = /cluster/data/rn4/rn4.2bit
rn4.clusterGenome = /iscratch/i/rn4/rn4.2bit
rn4.ooc = /cluster/bluearc/rn4/11.ooc
rn4.align.unplacedChroms = chrUn,chr*_random
rn4.lift = /cluster/data/rn4/jkStuff/liftAll.lft
rn4.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
rn4.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
rn4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
rn4.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
rn4.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
rn4.downloadDir = rn4
rn4.refseq.mrna.xeno.load = yes
rn4.refseq.mrna.xeno.loadDesc = yes
rn4.mgcTables.default = full
rn4.mgcTables.mgc = all
cvs ci etc/genbank.conf
# update /cluster/data/genbank/
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial rn4 &
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad rn4 &
# enable daily alignment and update of hgwdev
cd ~/kent/src/makeDb/genbank
cvsup
# add rn4 to:
etc/align.dbs
etc/hgwdev.dbs
cvs commit
make etc-update
# PRODUCING GENSCAN PREDICTIONS (DONE 1/31/06 angie)
ssh hgwdev
mkdir /cluster/data/rn4/bed/genscan
cd /cluster/data/rn4/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh kki
cd /cluster/data/rn4/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
cp /dev/null genome.list
foreach f ( `ls -1S /cluster/data/rn4/*/chr*_*/chr*_?{,?}.fa.masked` )
egrep '[ACGT]' $f > /dev/null
if ($status == 0) echo $f >> genome.list
end
wc -l genome.list
#591 genome.list
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para make jobList
para time
#Completed: 589 of 591 jobs
#Crashed: 2 jobs
#CPU time in finished jobs: 110813s 1846.88m 30.78h 1.28d 0.004 y
#IO & Wait Time: 8627s 143.79m 2.40h 0.10d 0.000 y
#Average job time: 203s 3.38m 0.06h 0.00d
#Longest finished job: 4230s 70.50m 1.18h 0.05d
#Submission to last job: 11260s 187.67m 3.13h 0.13d
# If there are crashes, diagnose with "para problems" / "para crashed".
# If a job crashes due to genscan running out of memory, re-run it
# manually with "-window=1200000" instead of "-window=2400000".
ssh kkr5u00
cd /cluster/data/rn4/bed/genscan
/cluster/bin/x86_64/gsBig /cluster/data/rn4/10/chr10_3/chr10_3.fa.masked gtf/chr10_3.fa.gtf -trans=pep/chr10_3.fa.pep -subopt=subopt/chr10_3.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
/cluster/bin/x86_64/gsBig /cluster/data/rn4/9/chr9_21/chr9_21.fa.masked gtf/chr9_21.fa.gtf -trans=pep/chr9_21.fa.pep -subopt=subopt/chr9_21.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
# Convert these to chromosome level files as so:
ssh kkstore01
cd /cluster/data/rn4/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/rn4/bed/genscan
ldHgGene rn4 genscan genscan.gtf
hgPepPred rn4 generic genscanPep genscan.pep
hgLoadBed rn4 genscanSubopt genscanSubopt.bed
# MAKE GCPERCENT (DONE 1/30/06 angie)
ssh kolossus
mkdir /cluster/data/rn4/bed/gc5Base
cd /cluster/data/rn4/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 rn4 \
/cluster/data/rn4 \
| wigEncode stdin gc5Base.wig gc5Base.wib
ssh hgwdev
mkdir /gbdb/rn4/wib
cd /cluster/data/rn4/bed/gc5Base
ln -s `pwd`/gc5Base.wib /gbdb/rn4/wib
hgLoadWiggle -pathPrefix=/gbdb/rn4/wib rn4 gc5Base gc5Base.wig
# ENSEMBL GENES (DONE 1/30/06 angie - UPDATED 6/13/06)
mkdir /cluster/data/rn4/bed/ensembl
cd /cluster/data/rn4/bed/ensembl
# Get the ensembl gene data from
# http://www.ensembl.org/Rattus_norvegicus/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Rattus_norvegicus choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. choose gzip compression. hit export.
# Save as ensembl.gff.gz
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
gunzip -c ensembl.gff.gz \
| perl -wpe 's/^([0-9]+|X|Y|M|Un(_random)?)/chr$1/ \
|| die "Line $. doesnt start with rat chrom:\n$_"; \
s/^chrMT/chrM/;' \
> ensGene.gtf
ssh hgwdev
ldHgGene -gtf -genePredExt rn4 ensGene \
/cluster/data/rn4/bed/ensembl/ensGene.gtf
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgLoadSqlTab rn4 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.txt
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
gunzip -c ensemblPep.fa.gz \
| perl -wpe 's/^>.*\|(ENSRNOT\d+\.\d+).*/>$1/' \
> ensPep.fa
hgPepPred rn4 generic ensPep ensPep.fa
# CPGISLANDS (WUSTL) (DONE 1/30/06 angie)
ssh hgwdev
mkdir /cluster/data/rn4/bed/cpgIsland
cd /cluster/data/rn4/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
mv cpglh.exe /cluster/data/rn4/bed/cpgIsland/
ssh kolossus
cd /cluster/data/rn4/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
set fout=$f:t:r:r.cpg
echo running cpglh on $f to $fout
nice ./cpglh.exe $f > $fout
end
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
# load into database:
ssh hgwdev
cd /cluster/data/rn4/bed/cpgIsland
# Reloaded without -noBin (bin added to .sql) 4/7/06.
hgLoadBed rn4 cpgIslandExt -tab \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# CPGISLANDS (ANDY LAW) (DONE 1/30/06 angie)
# See notes in makeGalGal2.doc
ssh kolossus
mkdir /cluster/data/rn4/bed/cpgIslandGgfAndy
cd /cluster/data/rn4/bed/cpgIslandGgfAndy
# Build the preProcGgfAndy program in
# kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE
# Use masked sequence since this is a mammal...
cp /dev/null cpgIslandGgfAndyMasked.bed
foreach f (../../*/chr*.fa.masked)
set chr = $f:t:r:r
echo preproc and run on masked $chr
~/bin/x86_64/preProcGgfAndy $f \
| /cluster/home/angie/ggf-andy-cpg-island.pl \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndyMasked.bed
end
# load into database:
ssh hgwdev
cd /cluster/data/rn4/bed/cpgIslandGgfAndy
# Reloaded without -noBin (bin added to .sql) 4/7/06.
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
hgLoadBed rn4 cpgIslandGgfAndyMasked -tab \
-sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
#Loaded 91097 elements of size 10
featureBits rn4 cpgIslandExt
#9629809 bases of 2571531505 (0.374%) in intersection
featureBits rn4 cpgIslandGgfAndyMasked
#43899646 bases of 2571531505 (1.707%) in intersection
wc -l ../cpgIsland/cpgIsland.bed *bed
# 15809 ../cpgIsland/cpgIsland.bed
# 91097 cpgIslandGgfAndyMasked.bed
# LIFTOVER RGD (DONE 2/9/06 angie)
# ftp://rgd.mcw.edu/pub/RGD_genome_annotations/ still has 3.1 (rn3) not
# 3.4 (rn4). Jim said these are interesting enough that we should lift
# over at least the QTLs.
ssh kolossus
mkdir /cluster/data/rn4/bed/rgdLiftover
cd /cluster/data/rn4/bed/rgdLiftover
wget ftp://rgd.mcw.edu/pub/RGD_genome_annotations/V3.1/GFF_files/rgd_rat_qtl_12052005.gff
awk '{print $1"\t"$4-1"\t"$5"\t"$10}' rgd_rat_qtl_12052005.gff \
| sed -e 's/Chr/chr/g; s/"//g; s/RGD://g; s/;//g' \
> rgdQtl_rn3.bed
awk '{printf "%s\t%s\t", $12, $10; \
for (i = 14;i <= NF; ++i ) {printf "%s ", $i} printf "\n"} ' \
rgd_rat_qtl_12052005.gff \
| sed -e 's/"//g; s/RGD://g; s/;\t/\t/g' \
> rgdQtlLink.tab
# liftOver rgdQtl_rn3.bed \
# /cluster/data/rn3/bed/blat.rn4.2006-02-07/rn3ToRn4.over.chain.gz \
# rgdQtl.bed rgdQtl.unmapped
# liftOver of rgdQtl_rn3.bed did not go well -- out of 903 qtls,
# only 95 were successfully mapped. 772 got "Partially split in new."
# Since these are enormous, presumably fuzzy-edged regions, just lift
# start and end. If the start and end of each feature seems to be
# liftOver'd coherently, "chain" those back up into big regions in rn4.
perl -we 'while (<>) { \
chomp; ($chr, $start, $end, $name) = split; \
print "$chr\t$start\t" . ($start+100) . "\tstart_$name\n"; \
print "$chr\t" . ($end-100) . "\t$end\tend_$name\n"; \
}' \
rgdQtl_rn3.bed > rgdQtl_rn3_endpoints.bed
liftOver rgdQtl_rn3_endpoints.bed \
/cluster/data/rn3/bed/blat.rn4.2006-02-07/rn3ToRn4.over.chain.gz \
rgdQtl_endpoints.bed rgdQtl_endpoints.unmapped
# Now 1706 out of 1806 survive... but 100 are deleted in new?!
# I manually checked 10 of them and they all are either in a gap
# or off the end of a chrom. Makes me wonder if these really are
# rn3 coords... oh well, I guess we can load them up and ask
# RGD about them in the meantime. I submitted a question using their
# online form -- my message ID is 439 and I can email rgd@rgd.mcw.edu
# with that ID if I don't hear from them in 3 biz days.
perl -we 'while (<>) { \
chomp; ($chr, $start, $end, $name) = split; \
if ($name =~ /^start_(\S+)/) { \
$starts{$1} = [$chr, $start]; \
} elsif ($name =~ /^end_(\S+)/) { \
$ends{$1} = [$chr, $end]; \
} else { die "parse error line $.: name $name\n"; } \
} \
foreach my $name (keys %starts) { \
if (defined $ends{$name}) { \
my ($sChr, $start) = @{$starts{$name}}; \
my ($eChr, $end) = @{$ends{$name}}; \
if (($sChr eq $eChr) && $end > $start) { \
print "$sChr\t$start\t$end\t$name\n"; \
} else { \
print STDERR "reject: [$sChr, $start] / [$eChr, $end] / $name\n"; \
} \
} \
}' \
rgdQtl_endpoints.bed \
| sort -k 1,1 -k 2n,2n > rgdQtl.bed
wc -l rgdQtl.bed
#806 rgdQtl.bed
ssh hgwdev
cd /cluster/data/rn4/bed/rgdLiftover
hgLoadBed rn4 rgdQtl rgdQtl.bed
hgsql rn4 < ~/kent/src/hg/lib/rgdQtlLink.sql
hgsql rn4 -e \
'load data local infile "rgdQtlLink.tab" into table rn4.rgdQtlLink;'
# MAKE LINEAGE-SPECIFIC REPEATS VS. HUMAN, MOUSE (DONE 2/8/06 angie)
ssh kolossus
mkdir /cluster/data/rn4/rmsk
cd /cluster/data/rn4/rmsk
ln -s ../?{,?}/chr*.fa.out .
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Human in extra column 1, Mouse in extra column 2
foreach outfl ( *.out )
echo "$outfl"
nice /cluster/bluearc/RepeatMasker/DateRepeats \
${outfl} -query rat -comp human -comp mouse
end
# Now extract human (extra column 1), mouse (extra column 2).
cd ..
mkdir linSpecRep.notInHuman
mkdir linSpecRep.notInMouse
foreach f (rmsk/*.out_homo-sapiens_mus-musculus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInHuman/$base.out.spec
/cluster/bin/scripts/extractLinSpecReps 2 $f > \
linSpecRep.notInMouse/$base.out.spec
end
wc -l rmsk/*.out
# 4417757 total
wc -l linSpecRep.notInHuman/*
# 2676056 total
wc -l linSpecRep.notInMouse/*
# 471417 total
# Clean up.
rm -r rmsk
# Distribute linSpecRep.* for cluster run
ssh kkstore01
mkdir /san/sanvol1/scratch/rn4
rsync -av /cluster/data/rn4/linSpecRep.notInHuman/* \
/san/sanvol1/scratch/rn4/linSpecRep.notInHuman/
rsync -av /cluster/data/rn4/linSpecRep.notInMouse/* \
/san/sanvol1/scratch/rn4/linSpecRep.notInMouse/
# SWAP/CHAIN/NET HG17 (DONE 2/10/06 angie)
mkdir /cluster/data/rn4/bed/blastz.hg17.swap
cd /cluster/data/rn4/bed/blastz.hg17.swap
doBlastzChainNet.pl -swap /cluster/data/hg17/bed/blastz.rn4/DEF \
-workhorse=kkr5u00 >& do.log & tail -f do.log
ln -s blastz.hg17.swap /cluster/data/rn4/bed/blastz.hg17
# MAKE LINEAGE-SPECIFIC REPEATS FOR NON-MAMMALS (DONE 2/13/06 angie)
# In an email 2/13/04 to Angie, Arian said we could treat all
# human repeats as lineage-specific for human-chicken blastz.
# Extrapolate to any mammal vs. anything at least as distant as chicken.
ssh kkr1u00
mkdir /iscratch/i/rn4/linSpecRep.notInNonMammal
foreach f (/cluster/data/rn4/*/chr*.fa.out)
cp -p $f /iscratch/i/rn4/linSpecRep.notInNonMammal/$f:t:r:r.out.spec
end
iSync
# BLASTZ CHICKEN (GALGAL2) (DONE 2/14/06 angie)
ssh kk
mkdir /cluster/data/rn4/bed/blastz.galGal2.2006-02-13
cd /cluster/data/rn4/bed/blastz.galGal2.2006-02-13
# Set L=10000 (higher threshold on blastz's outer loop) and abridge
# repeats.
cat << '_EOF_' > DEF
#rat vs. chicken
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Rat
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_SMSK=/iscratch/i/rn4/linSpecRep.notInNonMammal
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
# QUERY: Chicken
SEQ2_DIR=/iscratch/i/galGal2/nib
SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LEN=/cluster/data/galGal2/chrom.sizes
BASE=/cluster/data/rn4/bed/blastz.galGal2.2006-02-13
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot=/cluster/bluearc/blastzRn4GalGal2Out \
> do.log & tail -f do.log
cd /cluster/data/rn4/bed
ln -s /cluster/data/rn4/bed/blastz.galGal2.2006-02-13 blastz.galGal2
##########################################################################
# BACEND PAIRS TRACK
# DOWNLOAD CLONEENDS (BACENDS) FROM NCBI (DONE 2/23/06 angie)
ssh kkstore01
cd /cluster/data/rn4
# Plenty of room at the moment:
df -h .
# 1.3T 1009G 234G 82% /cluster/store9
mkdir -p bed/cloneend/ncbi
cd bed/cloneend/ncbi
wget --timestamping \
ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/rattus_norvegicus/\*
cd ..
# Extract unversioned accessions from headers and combine into one
# uncompressed file which we will link to from /gbdb/:
zcat ncbi/*ends*.mfa.gz \
| perl -wpe 'if (/^>/) { \
s/^>.*\|(gb|dbj)\|(\w+)\.\d+\|.*/>$2/ || die "parse line $.:$_\n"; }' \
> cloneEnds.fa
# Make sure the sequences are intact after the header-munging:
faSize ncbi/*.mfa.gz
#188448559 bases (4451 N's 188444108 real 102020307 upper 86423801 lower) in 307557 sequences in 18 files
#Total size: mean 612.7 sd 198.9 min 83 (gi|30387889|gb|CC181697.1|) max 1158 (gi|24002262|gb|BZ277934.1|) median 634
faSize cloneEnds.fa
#188448559 bases (4451 N's 188444108 real 102020307 upper 86423801 lower) in 307557 sequences in 1 files
#Total size: mean 612.7 sd 198.9 min 83 (CC181697) max 1158 (BZ277934) median 634
# Extract pairings from info files:
zcat ncbi/*info*.txt.gz \
| /cluster/bin/scripts/convertCloneEndInfo stdin
#134351 pairs and 38410 singles
# split for cluster run
mkdir /cluster/bluearc/rn4/cloneEnds
faSplit sequence cloneEnds.fa 100 /cluster/bluearc/rn4/cloneEnds/cloneEnds
# Check to ensure no breakage:
faSize /cluster/bluearc/rn4/cloneEnds/*.fa
#188448559 bases (4451 N's 188444108 real 102020307 upper 86423801 lower) in 307557 sequences in 98 files
#Total size: mean 612.7 sd 198.9 min 83 (CC181697) max 1158 (BZ277934) median 634
# same numbers as before
# load sequences
ssh hgwdev
mkdir /gbdb/rn4/cloneend
ln -s /cluster/data/rn4/bed/cloneend/cloneEnds.fa /gbdb/rn4/cloneend/
cd /tmp
hgLoadSeq rn4 /gbdb/rn4/cloneend/cloneEnds.fa
#307557 sequences
# BACEND SEQUENCE ALIGNMENTS (DONE 2/23/06 angie)
ssh kkstore01
# Make unmasked fasta.
cd /cluster/data/rn4
mkdir /cluster/bluearc/rn4/noMask
foreach f (?{,?}/chr*.fa)
echo $f:t:r
perl -wpe 'tr/a-z/A-Z/ if (! /^>/);' $f \
> /cluster/bluearc/rn4/noMask/$f:t
end
# kluster run
ssh pk
mkdir /cluster/data/rn4/bed/bacends
cd /cluster/data/rn4/bed/bacends
mkdir out
# allow blat to run politely in /tmp while it writes output, then
# copy results to results file:
cat << '_EOF_' > runBlat.csh
#!/bin/csh -ef
# Cluster job wrapper: blat one unmasked chromosome ($1) against one
# clone-end chunk ($2), leaving the PSL result at $3.
# blat runs inside a private /scratch/tmp dir so its output is written
# to local disk; the result is then moved back to the job directory.
set root1 = $1
set root2 = $2
set result = $3
# Start from a clean per-job scratch directory.
rm -fr /scratch/tmp/${root1}_${root2}
mkdir /scratch/tmp/${root1}_${root2}
pushd /scratch/tmp/${root1}_${root2}
/cluster/bin/x86_64/blat /cluster/bluearc/rn4/noMask/${root1}.fa \
/cluster/bluearc/rn4/cloneEnds/${root2}.fa \
-ooc=/cluster/bluearc/rn4/11.ooc ${root1}.${root2}.psl
popd
# Ensure the destination dir exists (relative to the job's cwd),
# then move the result into place and clean up scratch.
mkdir -p out/${root2}
rm -f ${result}
mv /scratch/tmp/${root1}_${root2}/${root1}.${root2}.psl ${result}
rm -fr /scratch/tmp/${root1}_${root2}
'_EOF_'
# << happy emacs
chmod +x runBlat.csh
cat << '_EOF_' > template
#LOOP
./runBlat.csh $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
# << emacs happy
ls -1S /cluster/bluearc/rn4/cloneEnds/cloneEnds???.fa > bacEnds.lst
ls -1S /cluster/bluearc/rn4/noMask/chr*.fa > contig.lst
gensub2 contig.lst bacEnds.lst template jobList
para make jobList
#Completed: 4410 of 4410 jobs
#CPU time in finished jobs: 450309s 7505.15m 125.09h 5.21d 0.014 y
#IO & Wait Time: 812287s 13538.12m 225.64h 9.40d 0.026 y
#Average job time: 286s 4.77m 0.08h 0.00d
#Longest finished job: 1834s 30.57m 0.51h 0.02d
#Submission to last job: 6982s 116.37m 1.94h 0.08d
ssh kkstore01
cd /cluster/data/rn4/bed/bacends
mkdir temp
time pslSort dirs raw.psl temp out/*
#538.065u 55.510s 14:46.80 66.9% 0+0k 0+0io 3pf+0w
time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons \
raw.psl bacEnds.psl /dev/null
#203.262u 8.877s 3:57.34 89.3% 0+0k 0+0io 2pf+0w
# BACEND PAIRS TRACK (DONE 2/23/06 angie)
ssh kolossus
cd /cluster/data/rn4/bed/bacends
time /cluster/bin/x86_64/pslPairs -tInsert=10000 \
-minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
../cloneend/cloneEndPairs.txt all_bacends bacEnds
#19.229u 4.720s 0:26.34 90.8% 0+0k 0+0io 0pf+0w
# Filter by score and sort by {chrom,chromStart}:
awk '$5 >= 300 {print;}' bacEnds.pairs | sort -k1,2n > bacEndPairs.bed
cat bacEnds.{slop,short,long,mismatch,orphan} \
| awk '$5 >= 300 {print;}' | sort -k1,2n > bacEndPairsBad.bed
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $5}' bacEndPairs.bed | sort -u
/cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed \
| sorttbl tname tstart | headchg -del \
> bacEnds.load.psl
wc -l bacEnds.*
# 5590477 bacEnds.psl
# 4462768 bacEnds.load.psl
# 109315 bacEnds.pairs
# 705 bacEnds.long
# 98 bacEnds.lst
# 3925 bacEnds.mismatch
# 88292 bacEnds.orphan
# 558 bacEnds.short
# 687 bacEnds.slop
# load into database
ssh hgwdev
cd /cluster/data/rn4/bed/bacends
# Reloaded with updated .sql (no chromEnd index) 4/7/06.
hgLoadBed -strict -notItemRgb rn4 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
#Loaded 109139 elements of size 11
# note - the "Bad" track isn't pushed to RR, just used for assembly QA.
hgLoadBed -strict -notItemRgb rn4 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
#Loaded 38981 elements of size 11
time hgLoadPsl rn4 -table=all_bacends bacEnds.load.psl
#load of all_bacends did not go as planned: 4462768 record(s), 0 row(s) skipped, 1 warning(s) loading psl.tab
#165.730u 24.690s 10:57.43 28.9% 0+0k 0+0io 258pf+0w
# To diagnose:
echo select \* from all_bacends | hgsql -N rn4 > psl.tab.loaded
diff psl.tab*
#1491015c1491015
#< 120 908 82 0 0 3 -124 6 278579 - BZ232973 866 0 866 chr17 97296363 49307517 49587086 8 21,5,71,86,327,250,116,114, 0,21,27,98,184,382,632,752, 49307517,49307539,49307544,49335694,49336333,49470455,49586851,49586972,
#---
#> 120 908 82 0 0 3 0 6 278579 - BZ232973 866 0 866 chr17 97296363 49307517 49587086 8 21,5,71,86,327,250,116,114, 0,21,27,98,184,382,632,752, 49307517,49307539,49307544,49335694,49336333,49470455,49586851,49586972,
# Q gap bases was -124 in... clipped to 0 in the db.
# Identify and save the out/*.psl file that contains the -124:
grep '^>BZ232973' /cluster/bluearc/rn4/cloneEnds/*.fa
#/cluster/bluearc/rn4/cloneEnds/cloneEnds079.fa:>BZ232973
grep BZ232973 out/cloneEnds079/*.psl | grep .-124
#out/cloneEnds079/chr17.cloneEnds079.psl:908 82 0 0 3 -124 6 278579 - BZ232973 866 0 866 chr17 97296363 49307517 49587086 8 21,5,71,86,327,250,116,114, 0,21,27,98,184,382,632,752, 49307517,49307539,49307544,49335694,49336333,49470455,49586851,49586972,
cp -p out/cloneEnds079/chr17.cloneEnds079.psl .
featureBits rn4 all_bacends
#241203668 bases of 2571531505 (9.380%) in intersection
featureBits rn3 all_bacends
#232313031 bases of 2571104688 (9.036%) in intersection
featureBits rn4 bacEndPairs
#2687958438 bases of 2571531505 (104.528%) in intersection
featureBits rn3 bacEndPairs
#2717444119 bases of 2571104688 (105.692%) in intersection
featureBits rn4 bacEndPairsBad
#664080718 bases of 2571531505 (25.824%) in intersection
featureBits rn3 bacEndPairsBad
#table bacEndPairsBad not found for any chroms
# Clean up - this recovers >10G!
rm psl.tab psl.tab.loaded raw.psl bed.tab
rm -r out
rmdir temp
rm -r /cluster/bluearc/rn4/noMask/ /cluster/bluearc/rn4/cloneEnds/
# end BACEND PAIRS TRACK
##########################################################################
# SWAP/CHAIN/NET MM7 (DONE 2/24/06 angie)
mkdir /cluster/data/rn4/bed/blastz.mm7.swap
cd /cluster/data/rn4/bed/blastz.mm7.swap
doBlastzChainNet.pl -swap /cluster/data/mm7/bed/blastz.rn4/DEF \
>& do.log & tail -f do.log
ln -s blastz.mm7.swap /cluster/data/rn4/bed/blastz.mm7
# SWAP/CHAIN/NET HG18 (DONE 2/24/06 angie)
mkdir /cluster/data/rn4/bed/blastz.hg18.swap
cd /cluster/data/rn4/bed/blastz.hg18.swap
doBlastzChainNet.pl -swap /cluster/data/hg18/bed/blastz.rn4/DEF \
>& do.log & tail -f do.log
ln -s blastz.hg18.swap /cluster/data/rn4/bed/blastz.hg18
# MAKE LINEAGE-SPECIFIC REPEATS VS. DOG, COW, RABBIT (DONE 2/24/06 angie)
ssh kolossus
mkdir /cluster/data/rn4/rmsk
cd /cluster/data/rn4/rmsk
ln -s ../?{,?}/chr*.fa.out .
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Dog in extra column 1, cow in 2, rabbit in 3
foreach outfl ( *.out )
echo "$outfl"
nice /cluster/bluearc/RepeatMasker/DateRepeats \
${outfl} -query rat -comp dog -comp cow -comp rabbit
end
# Now extract dog (extra column 1), cow (column 2), rabbit (column 3).
cd ..
mkdir linSpecRep.notIn{Dog,Cow,Rabbit}
foreach f (rmsk/*.out_*)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractRepeats 1 $f > \
linSpecRep.notInDog/$base.out.spec
/cluster/bin/scripts/extractRepeats 2 $f > \
linSpecRep.notInCow/$base.out.spec
/cluster/bin/scripts/extractRepeats 3 $f > \
linSpecRep.notInRabbit/$base.out.spec
end
wc -l rmsk/*.out
# 4417757 total
wc -l linSpecRep.notInDog/*
# 2676056 total
wc -l linSpecRep.notInCow/*
# 2676056 total
wc -l linSpecRep.notInRabbit/*
# 2676056 total
# These all look identical to each other (and to human except for header):
foreach f (linSpecRep.notInDog/*)
cmp $f linSpecRep.notInCow/$f:t
cmp $f linSpecRep.notInRabbit/$f:t
end
# Consolidate to save space (probably could have done this for human too):
mv linSpecRep.notInDog linSpecRep.notInNonRodentMammal
# Clean up.
rm -r rmsk linSpecRep.notInCow linSpecRep.notInRabbit
# Distribute linSpecRep.* for cluster run
ssh kkstore01
rsync -av /cluster/data/rn4/linSpecRep.notInNonRodentMammal/* \
/cluster/bluearc/rn4/linSpecRep.notInNonRodentMammal/
# BLASTZ/CHAIN/NET CANFAM2 (DONE 2/24/06 angie)
ssh kkstore01
mkdir /cluster/data/rn4/bed/blastz.canFam2.2006-02-24
cd /cluster/data/rn4/bed/blastz.canFam2.2006-02-24
cat << '_EOF_' > DEF
# rat vs. dog
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Rat
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_SMSK=/cluster/bluearc/rn4/linSpecRep.notInNonRodentMammal
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_SMSK=/cluster/bluearc/canFam2/linSpecRep.notInRat
SEQ2_LEN=/cluster/data/canFam2/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.canFam2.2006-02-24
'_EOF_'
# << for emacs
doBlastzChainNet.pl DEF -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/blastzRn4CanFam2Out >& do.log &
tail -f do.log
ln -s blastz.canFam2.2006-02-24 /cluster/data/rn4/bed/blastz.canFam2
##########################################################################
# STS MARKERS
# NOTE: Instead of using UniSTS_rat.sts as usual, I made a local version
# which uses the name given in UniSTS.sts if none is given in UniSTS_rat.sts
# and "UniSTS:$id" if no name is given in UniSTS.sts. Some inconsistencies
# between UniSTS.sts and UniSTS_rat.sts reported 2/21/06; haven't heard
# back from them yet so I'll just forge ahead with latest UniSTS files.
# MAKE STSINFORAT (DONE 3/6/06 angie)
# This method was developed by Yontao. ytlu/script is under CVS control
# so I'm using ~/ytlu/script/perl/script/match but have copied several
# other scripts from /cluster/store4/ratMarker/code into
# kent/src/hg/stsMarkers/ to get them in CVS too.
ssh kkstore01
mkdir /cluster/data/rn4/bed/stsMarkers
mkdir /cluster/data/rn4/bed/stsMarkers/info
cd /cluster/data/rn4/bed/stsMarkers/info
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Rattus_norvegicus/10116.FHH_x_ACI.7.txt
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Rattus_norvegicus/10116.RH_map.3.4.txt
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Rattus_norvegicus/10116.SHRSP_x_BN.7.txt
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_rat.sts
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
wget --timestamping ftp://rgd.mcw.edu/pub/rhmap/3.4/RHv3_4_MapData.txt
wget --timestamping ftp://rgd.mcw.edu/pub/rhmap/3.4/RAW_MARKER_SEQINFO.txt
# UniSTS_rat.sts is missing a lot of names that can be found in UniSTS.sts
# so load that up and make a fixed-up UniSTS_rat.rat_extracted.sts:
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
awk -F"\t" '$5 != "-" {print $1 "\t" $5 "\t" $8;}' UniSTS.sts \
> UniSTS.names
cat > extractRat.pl <<'_EOF_'
#!/usr/bin/perl -w
use strict;
# Extract Rattus norvegicus records from a UniSTS .sts file (read on
# stdin / as args), filling in missing marker names from UniSTS.names
# (tab-separated: id, name, ...; built from UniSTS.sts upstream).
# Warns on stderr when the two sources disagree about a marker's name.
open(NAMES, "<UniSTS.names") || die "Can't open UniSTS.names: $!\n";
print STDERR "Hashing UniSTS.names...\n";
my %id2name;
# Build id -> name lookup from UniSTS.names.
while (<NAMES>) {
chomp;
my @words = split("\t");
$id2name{$words[0]} = $words[1];
}
print STDERR "Extracting rat and adding in names...\n";
while (<>) {
chomp;
my @words = split("\t");
# Column 7 is the organism; keep rat rows only.
next if ($words[7] ne "Rattus norvegicus");
if ($words[4] eq '-') {
# Name missing: take it from UniSTS.names, or synthesize
# "UniSTS:<id>" when no name is known there either.
$words[4] = $id2name{$words[0]};
$words[4] = "UniSTS:$words[0]" if (! defined $words[4]);
} elsif ((defined $id2name{$words[0]}) &&
($words[4] ne $id2name{$words[0]})) {
# Both sources name this id but differ -- report, keep input's name.
print STDERR "Disagreement: name of $words[0] given as " .
$id2name{$words[0]} . " and as $words[4]\n";
}
print join("\t", @words) . "\n";
}
'_EOF_'
# << emacs
chmod a+x extractRat.pl
./extractRat.pl UniSTS_rat.sts > UniSTS_rat.rat_extracted.sts
# Many warnings about different name given in UniSTS_rat.sts vs.
# UniSTS.sts -- reported to NCBI.
# Make UniSTS_rat.alias by joining the first columns of
# UniSTS_rat.rat_extracted.sts and UniSTS.aliases so we get just the
# aliases of items in UniSTS_rat.rat_extracted.sts:
/cluster/home/ytlu/ytlu/script/perl/script/match \
UniSTS_rat.rat_extracted.sts 1 UniSTS.aliases 1 \
> UniSTS_rat.alias
# Extract these columns from RHv3_4_MapData.txt:
# RGD_Sym TEMPL_SEQ FORWARD_PRIMER REVERSE_PRIMER EXP_SIZE
awk '{print $2,$1,$17,$8,$9,$10}' RHv3_4_MapData.txt | sed s/" "/"\t"/g \
> inMapData.txt
# Find entries not in RAW_MARKER_SEQINFO.txt but in inMapData.txt,
/cluster/home/ytlu/ytlu/script/perl/script/match -M \
RAW_MARKER_SEQINFO.txt 1 inMapData.txt 1 > inMapDataOnly.txt
cat inMapDataOnly.txt RAW_MARKER_SEQINFO.txt > marker_seqinfo.txt
# Combine sources into a "draft" stsInfoRat.tab file:
~/kent/src/hg/stsMarkers/createStsInfoRat \
UniSTS_rat.rat_extracted.sts marker_seqinfo.txt UniSTS_rat.alias \
10116.RH_map.3.4.txt 10116.FHH_x_ACI.7.txt 10116.SHRSP_x_BN.7.txt \
> stsInfoRat-draft
# clean some redundant and conflicting entries --> final stsInfoRat.tab:
~/kent/src/hg/stsMarkers/cleanInfo.pl -rat stsInfoRat-draft \
> ../stsInfoRat.withSeq.tab
# Create the primer.fa, cloneseq.fa, and primerinfo files for alignment:
cd ..
~/kent/src/hg/stsMarkers/luConvertPrimerToFa stsInfoRat.withSeq.tab \
ratPrimer.fa ratSeq.fa ratPrimer.info -a 200
# Strip the sequence column off so that it isn't loaded into the db
# (causing mysql warnings due to truncation). The seq column is not
# used from here on out:
perl -wpe '@w=split("\t"); $w[24] = ""; $_ = join("\t", @w) . "\n";' \
stsInfoRat.withSeq.tab > stsInfoRat.tab
# BLAT CLONE SEQUENCES (DONE 3/8/06 angie)
# Based on what was done for hg17, I think the thing to do is not use
# ooc, but filter out the results with tGapSize > 1000.
ssh kkstore01
cd /cluster/data/rn4
mkdir -p /cluster/bluearc/rn4/ctgFa
foreach d (*/chr*_?{,?})
cp -p $d/$d:t.fa /cluster/bluearc/rn4/ctgFa/
end
cp -p /cluster/data/rn4/bed/stsMarkers/ratSeq.fa \
/cluster/bluearc/rn4/
ssh pk
mkdir /cluster/data/rn4/bed/stsMarkers/clone
cd /cluster/data/rn4/bed/stsMarkers/clone
# A run with ooc, and then a run without ooc only on the unaligned seqs,
# would probably save time next time around...
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat {check in line+ $(path1)} {check in line+ /cluster/bluearc/rn4/ratSeq.fa} -stepSize=5 {check out line+ clone.out/$(root1).psl}
#ENDLOOP
'_EOF_'
# << emacs
mkdir clone.out
ls -1S /cluster/bluearc/rn4/ctgFa/chr*.fa > ctg.lst
gensub2 ctg.lst single gsub jobList
para make jobList
para time
#Completed: 591 of 591 jobs
#CPU time in finished jobs: 1628303s 27138.38m 452.31h 18.85d 0.052 y
#IO & Wait Time: 26091s 434.85m 7.25h 0.30d 0.001 y
#Average job time: 2799s 46.66m 0.78h 0.03d
#Longest finished job: 6138s 102.30m 1.71h 0.07d
#Submission to last job: 8714s 145.23m 2.42h 0.10d
ssh kolossus
cd /cluster/data/rn4/bed/stsMarkers/clone
# filter alignments for (qEnd-qStart) vs. (tEnd-tStart)
# should not be more than 100 bases different.
pslSort dirs stdout temp clone.out \
| pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons stdin \
clones.unlifted.tmp.psl /dev/null
awk '($8 < 1000) {print;}' clones.unlifted.tmp.psl > clones.unlifted.psl
wc -l clones.un*
# 19097 clones.unlifted.psl
# 22275 clones.unlifted.tmp.psl
rmdir temp
rm /cluster/bluearc/rn4/ratSeq.fa
# BLAT.2 STS PRIMERS (DONE 3/7/06 angie)
cp -p /cluster/data/rn4/bed/stsMarkers/ratPrimer.fa \
/cluster/bluearc/rn4/
ssh kk
mkdir /cluster/data/rn4/bed/stsMarkers/primer
cd /cluster/data/rn4/bed/stsMarkers/primer
# PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE.
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat.2 {check in line+ $(path1)} {check in line+ /cluster/bluearc/rn4/ratPrimer.fa} -ooc=/cluster/bluearc/rn4/11.ooc -minMatch=1 -minScore=0 -minIdentity=80 -oneOff {check out line+ primers.out/$(root1).psl}
#ENDLOOP
'_EOF_'
# << emacs
mkdir primers.out
ls -1S /cluster/bluearc/rn4/ctgFa/chr*.fa > ctg.lst
gensub2 ctg.lst single gsub jobList
para make jobList
para time
#Completed: 591 of 591 jobs
#CPU time in finished jobs: 466140s 7769.00m 129.48h 5.40d 0.015 y
#IO & Wait Time: 63525s 1058.75m 17.65h 0.74d 0.002 y
#Average job time: 896s 14.94m 0.25h 0.01d
#Longest finished job: 1756s 29.27m 0.49h 0.02d
#Submission to last job: 5330s 88.83m 1.48h 0.06d
ssh kkstore01
cd /cluster/data/rn4/bed/stsMarkers/primer
# filter alignments for (qEnd-qStart) vs. (tEnd-tStart)
# should not be more than 100 bases different.
pslSort dirs stdout temp primers.out \
| awk -F"\t" ' \
{ if (((($13 - $12) - ($17 - $16)) > -100) && \
((($13 - $12) - ($17 - $16)) < 100)) {print} \
}' \
> primers.psl.100.unlifted
rmdir temp
rm /cluster/bluearc/rn4/ratPrimer.fa
# Would be very nice to compare wc output with rn3 run, but I cannot
# find the rn3 run.
wc primers.psl.100.unlifted
# 8050086 169051724 799770587 primers.psl.100.unlifted
# E-PCR STS PRIMERS (DONE 3/7/06 angie)
ssh kkstore01
cp -p /cluster/data/rn4/bed/stsMarkers/ratPrimer.info \
/cluster/bluearc/rn4/
ssh kk
mkdir /cluster/data/rn4/bed/stsMarkers/ePCR
cd /cluster/data/rn4/bed/stsMarkers/ePCR
ls -1S /cluster/bluearc/rn4/ctgFa/chr*.fa > ctg.lst
# Hiram built latest e-PCR into /cluster/bin/{i386,x86_64}/e-PCR,
# see notes in makeMm7.doc.
mkdir epcr.out
cat << '_EOF_' > runPCR.csh
#!/bin/csh -fe
# e-PCR cluster job: $1 = primer info file, $2 = sequence fasta,
# $3 = output file.  N/M/W are e-PCR options (mismatches, margin,
# wordsize -- see e-PCR usage).
/cluster/bin/$MACHTYPE/e-PCR $1 $2 N=1 M=50 W=5 > $3
'_EOF_'
chmod +x runPCR.csh
cat << '_EOF_' > gsub
#LOOP
./runPCR.csh {check in line+ /cluster/bluearc/rn4/ratPrimer.info} {check in line+ $(path1)} {check out line+ epcr.out/$(num1).epcr}
#ENDLOOP
'_EOF_'
gensub2 ctg.lst single gsub jobList
para make jobList
para time
# Actually ran on pk -- kk is swamped at the moment.
#Completed: 591 of 591 jobs
#CPU time in finished jobs: 103485s 1724.75m 28.75h 1.20d 0.003 y
#IO & Wait Time: 29018s 483.63m 8.06h 0.34d 0.001 y
#Average job time: 224s 3.74m 0.06h 0.00d
#Longest finished job: 341s 5.68m 0.09h 0.00d
#Submission to last job: 482s 8.03m 0.13h 0.01d
ssh kkstore01
cd /cluster/data/rn4/bed/stsMarkers/ePCR
cat epcr.out/*.epcr > all.epcr
rm /cluster/bluearc/rn4/ratPrimer.info
rm -r /cluster/bluearc/rn4/ctgFa
# Would compare to rn3 if I could find it...
wc all.epcr
# 73885 295540 3915703 all.epcr
# COMBINE AND FILTER BLAT.2 AND E-PCR RESULTS (DONE 3/8/06 angie -- stsMapRat.bed REDONE 6/15/06)
ssh kolossus
cd /cluster/data/rn4/bed/stsMarkers/primer
# create accession_info.rdb: AGP info.
/cluster/bin/scripts/compileAccInfo -rat /cluster/data/rn4 /dev/null
#0 processed
# "0 processed" sounds a little scary but it's 0 because we didn't pass
# in a -pre rdb.
mv accession_info.rdb accession_info.rdb.tmp
/cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp \
> accession_info.rdb
# some lines have "chrN_random_M" in the Contig column but "chrN"
# (not chrN_random) in the chrom column.... looks weird, but reading
# compileAccInfo it seems intentional to lump ordered and random.
rm accession_info.rdb.tmp
# Would compare to rn3 if I could find it...
wc accession_info.rdb
# 138571 1524285 9921966 accession_info.rdb
# filterSTSPrimers combines blat.2 and epcr results.
# For human there is a program pslFilterPrimers that combines isPcr and
# epcr results... might be interesting to try that next time around.
/cluster/bin/scripts/filterSTSPrimers -rat \
../stsInfoRat.tab primers.psl.100.unlifted \
../ratPrimer.info ../ePCR/all.epcr \
| liftUp -nohead -type=.psl primers.psl.filter.blat \
../../../jkStuff/liftAll.lft warn stdin
# Would compare to rn3 if I could find it...
wc primers.psl.100.unlifted
# 8050086 169051724 799770587 primers.psl.100.unlifted
wc primers.psl.filter.blat
# 38707 812847 4100619 primers.psl.filter.blat
# filterSTSPrimers creates epcr.not.found which actually contains
# items that *were* found by epcr but not by blat. Translate to PSL:
/cluster/bin/scripts/epcrToPsl -rat epcr.not.found ../ratPrimer.info \
accession_info.rdb /cluster/data/rn4
# epcrToPsl creates epcr.not.found.nomatch and epcr.not.found.psl
# Would compare to rn3 if I could find it...
wc epcr*
# 748 2992 27112 epcr.not.found
# 0 0 0 epcr.not.found.nomatch
# 748 15708 72452 epcr.not.found.psl
# Lift the PSL:
mv epcr.not.found.psl epcr.not.found.unlifted.psl
liftUp -nohead epcr.not.found.psl \
../../../jkStuff/liftAll.lft warn epcr.not.found.unlifted.psl
wc epcr.not.found.psl
# 748 15708 80899 epcr.not.found.psl
cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
# create primers.psl.filter.lifted.initial
/cluster/bin/scripts/extractPslInfo primers.psl.filter
# Would compare to rn3 if I could find it...
wc primers.psl.filter.initial
# 39451 236706 2071215 primers.psl.filter.initial
# create primers.psl.filter.lifted.initial.acc
/cluster/bin/scripts/findAccession -agp \
-rat primers.psl.filter.initial /cluster/data/rn4
# it warns about missing AGP for M_random -- OK.
# Would compare to rn3 if I could find it...
wc primers.psl.filter.initial.acc
# 39451 276157 2579345 primers.psl.filter.initial.acc
/cluster/bin/scripts/getStsId -rat \
../stsInfoRat.tab primers.psl.filter.initial.acc \
| sort -k 4n primers.initial.acc.trans > primers.final
# Would compare to rn3 if I could find it...
wc primers.final
# 39451 276157 2232848 primers.final
# Lift clone sequence alignments
cd /cluster/data/rn4/bed/stsMarkers/clone
liftUp -nohead clones.psl \
../../../jkStuff/liftAll.lft warn clones.unlifted.psl
/cluster/bin/scripts/extractPslInfo clones.psl
# extractPslInfo creates clones.psl.initial
/cluster/bin/scripts/findAccession -agp \
-rat clones.psl.initial /cluster/data/rn4
# findAccession creates clones.psl.initial.acc
sort -k 4n clones.psl.initial.acc > clones.final
# Combine clone sequence alignments with primer alignments
cd /cluster/data/rn4/bed/stsMarkers
/cluster/bin/scripts/combineSeqPrimerPos \
clone/clones.final primer/primers.final
# creates stsMarkers_pos.rdb
# Would compare to rn3 if I could find it...
wc stsMarkers_pos.rdb
# 39291 275037 1893500 stsMarkers_pos.rdb
# 6/15/06: re-running from here after fixing createStsMapRat to make
# empty chroms instead of "0" when the chrom is not specified --
# hgTracks.c code depends on this.
~/kent/src/hg/stsMarkers/createStsMapRat \
stsInfoRat.tab stsMarkers_pos.rdb > stsMapRat.bed
# Would compare to rn3 if I could find it...
wc stsMapRat.bed
# 38802 490471 3205607 stsMapRat.bed
~/kent/src/hg/stsMarkers/createStsAlias stsInfoRat.tab \
| sort -u > stsAlias.tab
# Warnings about aliases with multiple trueNames... would be good to
# pass that on to NCBI, if we ever get a resolution for the previous
# questions.
# Would compare to rn3 if I could find it...
wc stsAlias.tab
# 69051 207153 1687764 stsAlias.tab
# compare old and new name lists:
ssh -x hgwdev 'hgsql -N rn3 -e "select name from stsMapRat" | sort -u' \
> rn3.nameList
awk '{print $4}' stsMapRat.bed | sort -u > rn4.nameList
comm -12 rn?.nameList | wc
# 32693 32693 290774 [in common]
comm -23 rn3.nameList rn4.nameList | wc
# 1208 1208 10659 [in rn3 only]
comm -13 rn3.nameList rn4.nameList | wc
# 5234 5234 48832 [in rn4 only]
# LOAD STS MARKER TABLES (DONE 3/8/06 angie - ratSeq.fa added 6/14/06, stsMapRat reloaded 6/15/06)
ssh hgwdev
cd /cluster/data/rn4/bed/stsMarkers
# Add primer sequences to /gbdb and load:
mkdir /gbdb/rn4/stsMarker
ln -s /cluster/data/rn4/bed/stsMarkers/ratPrimer.fa \
/gbdb/rn4/stsMarker/ratPrimer.fa
ln -s /cluster/data/rn4/bed/stsMarkers/ratSeq.fa \
/gbdb/rn4/stsMarker/ratSeq.fa
# PLEASE NOTE: if re-running hgLoadSeq, you MUST add
# -replace
# hgLoadSeq -replace rn4 /gbdb/rn4/stsMarker/ratPrimer.fa
# otherwise there will be a problem that the seq and extFile tables
# will be out of sync.
hgLoadSeq rn4 /gbdb/rn4/stsMarker/ratPrimer.fa
hgLoadSeq rn4 /gbdb/rn4/stsMarker/ratSeq.fa
hgLoadSqlTab rn4 stsAlias \
~/kent/src/hg/lib/stsAlias.sql stsAlias.tab
hgLoadSqlTab rn4 stsInfoRat \
~/kent/src/hg/lib/stsInfoRat.sql stsInfoRat.tab
# After some work on the stsInfoRat creation process which should help
# next time around (do not do this part next time, just for the record):
hgLoadSqlTab rn4 stsInfoRat \
~/kent/src/hg/lib/stsInfoRat.sql stsInfoRat.tab.new
#Warning: load of stsInfoRat did not go as planned: 48048 record(s), 0 row(s) skipped, 5 warning(s) loading stsInfoRat.tab
echo 'select * from stsInfoRat' | hgsql -N rn4 > /tmp/tmp
# compare /tmp/tmp and stsInfoRat.tmp -- looks like 5 alias fields are
# truncated at 255 chars, not so terrible because we have stsAlias.
# -- But since the IDs are off-by-one for a large stretch of that due
# -- to getting rid of a garbage line, I reloaded and manually deleted
# -- the unused and often truncated clone sequence values:
hgLoadSqlTab rn4 stsInfoRat \
~/kent/src/hg/lib/stsInfoRat.sql stsInfoRat.tab
#Warning: load of stsInfoRat did not go as planned: 48049 record(s), 0 row(s) skipped, 187463 warning(s) loading stsInfoRat.tab
hgsql rn4 -e 'update stsInfoRat set clone = "";'
# Again, next time around the above should not be necessary.
cat primer/primers.psl.filter clone/clones.psl \
| hgLoadPsl -table=all_sts_primer rn4 stdin
# Reloaded 6/15/06
hgLoadBed rn4 stsMapRat -tab \
-sqlTable=$HOME/kent/src/hg/lib/stsMapRat.sql stsMapRat.bed
#Loaded 38802 elements of size 15
nice featureBits rn4 all_sts_primer
#10103096 bases of 2571531505 (0.393%) in intersection
nice featureBits rn3 all_sts_primer
#9722909 bases of 2571104688 (0.378%) in intersection
nice featureBits rn4 stsMapRat
#11755453 bases of 2571531505 (0.457%) in intersection
nice featureBits rn3 stsMapRat
#10119869 bases of 2571104688 (0.394%) in intersection
# Clean up.
rm -r primer/primers.out/ primer/primers.psl.100.unlifted clone/clone.out/
# end STS MARKERS
##########################################################################
###############################################################################
# BUILD KNOWN GENES TABLES (DONE 3/2/06 angie)
# Use protein databases built by Fan: sp060115 and proteins060115
# See makeProteins060115.doc for details.
# Ask cluster-admin to rsync sp060115 and proteins060115 to kolossus
# so that database work can be done there; then final results will be
# rsync'd back to hgwdev.
# Create working subdirectories and temporary databases (kgRn4A)
ssh kolossus
mkdir /cluster/data/rn4/bed/kgRn4A
ln -s /cluster/data/rn4/bed/kgRn4A /cluster/store6/kgDB/bed/kgRn4A
ln -s /cluster/data/rn4/bed/kgRn4A /cluster/store11/kg/kgRn4A
hgsql rn4 -e "create database kgRn4A"
hgsql rn4 -e "create database kgRn4ATemp"
mkdir /cluster/bluearc/kgDB/kgRn4A
mkdir /cluster/bluearc/kgDB/kgRn4A/protBlat
ln -s /cluster/bluearc/kgDB/kgRn4A/protBlat \
/cluster/data/rn4/bed/kgRn4A/protBlat
# Save a copy of several genbank tables so that we can produce consistent
# results if we need to add more tables later. First, ask cluster-admin
# to rsync the tables from hgwdev rn4 --> kolossus. Save files too:
mkdir /cluster/data/rn4/bed/kgRn4A/genbankBackup
cd /cluster/data/rn4/bed/kgRn4A/genbankBackup
chmod 777 .
foreach t (all_mrna cds gbCdnaInfo gbExtFile gbLoaded gbSeq gbStatus \
mgcGenes \
refFlat refGene refLink refSeqAli refSeqStatus refSeqSummary \
xenoMrna xenoRefFlat xenoRefGene xenoRefSeqAli)
echo $t
hgsqldump --tab=. rn4 $t
end
gzip *.txt
chmod 775 .
# Get all rat protein sequences
cd /cluster/data/rn4/bed/kgRn4A/protBlat
hgsql -N sp060115 -e \
'select p.acc, p.val from protein p, accToTaxon x \
where x.taxon=10116 and p.acc=x.acc'\
| awk '{print ">" $1; print $2;}' \
> ratProt.fa
hgsql -N sp060115 -e \
'select v.varAcc, p.val from varAcc v, protein p, accToTaxon x \
where v.parAcc = p.acc and x.taxon=10116 and v.parAcc=x.acc'\
| awk '{print ">" $1; print $2;}' \
>> ratProt.fa
# Prepare and perform cluster run for protein/genome alignment
ssh kolossus
cd /cluster/bluearc/kgDB/kgRn4A/protBlat
mkdir prot
faSplit sequence ratProt.fa 2000 prot/prot
ls /cluster/bluearc/kgDB/kgRn4A/protBlat/prot/* > prot.lis
hgsql rn4 -N -e 'select chrom from chromInfo' > chrom.lis
ssh pk
cd /cluster/bluearc/kgDB/kgRn4A/protBlat
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -t=dnax -q=prot /scratch/hg/rn4/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgRn4A/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << emacs
mkdir result
gensub2 chrom.lis prot.lis gsub jobList
para make jobList
#Completed: 89100 of 89100 jobs
#CPU time in finished jobs: 4215176s 70252.93m 1170.88h 48.79d 0.134 y
#IO & Wait Time: 230997s 3849.96m 64.17h 2.67d 0.007 y
#Average job time: 50s 0.83m 0.01h 0.00d
#Longest finished job: 5794s 96.57m 1.61h 0.07d
#Submission to last job: 16257s 270.95m 4.52h 0.19d
# collect BLAT results
ssh kolossus
cd /cluster/bluearc/kgDB/kgRn4A/protBlat
pslSort -nohead dirs raw.psl temp result
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 raw.psl \
protBlat.psl /dev/null
hgLoadPsl -fastLoad rn4 protBlat.psl
#load of protBlat did not go as planned: 14768 record(s), 0 row(s) skipped, 48 warning(s) loading psl.tab
# Some of the lines had negative qGapSizes. ask Jim...
# create all_mrna.psl and tight_mrna.psl
zcat /cluster/data/rn4/bed/kgRn4A/genbankBackup/all_mrna.txt.gz \
| cut -f 2-22 > all_mrna.psl
pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 all_mrna.psl \
tight_mrna.psl /dev/null
# Use overlapSelect to get protein and mRNA alignment overlaps
overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \
-selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.stat
overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 \
-selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.out
# Create protein/mRNA pair and protein lists
cut -f 10,31 protMrna.out | sort -u > spMrna.tab
cut -f 10 protMrna.out | sort -u > protein.lis
# Load spMrna.tab into spMrna table in temp DB.
hgLoadSqlTab -notOnServer kgRn4ATemp spMrna ~/kent/src/hg/lib/spMrna.sql \
spMrna.tab
hgsql kgRn4ATemp -e 'create index mrnaID on spMrna(mrnaID)'
# Prepare and perform cluster run of protein/mRNA alignment
# Get mRNA fa file.
cd /cluster/data/rn4/bed/kgRn4A
/cluster/data/genbank/bin/i386/gbGetSeqs -native -db=rn4 \
-gbRoot=/cluster/data/genbank genbank mrna mrna.fa
# Create mrnaSeq table in kgRn4ATemp DB.
hgPepPred kgRn4ATemp generic mrnaSeq mrna.fa
# Prepare files for cluster run
cd /cluster/bluearc/kgDB/kgRn4A
~/kent/src/hg/protein/KG2.sh kgRn4A rn4 060115
# Perform cluster run of protein/mRNA alignment
~/kent/src/hg/protein/KG3.sh kgRn4A rn4 060115
# Collect cluster run results
cd kgBestMrna
cp /dev/null protMrnaRaw.psl
foreach d (out/*)
echo $d
cat $d/*.out >> protMrnaRaw.psl
end
# Filter out low quality alignments
pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl \
protMrnaBlat.psl /dev/null
#Processed 33623 alignments
cut -f 10,14 protMrnaBlat.psl | sort -u > protMrna.lis
wc protMrna.lis
# 21430 42860 334656 protMrna.lis
# Load BLAT results into temp DB.
ssh kolossus
cd /cluster/data/rn4/bed/kgRn4A/kgBestMrna
hgsql kgRn4ATemp -e 'create table chromInfo select * from rn4.chromInfo'
hgLoadPsl -fastLoad kgRn4ATemp protMrnaBlat.psl
#load of protMrnaBlat did not go as planned: 21432 record(s), 0 row(s) skipped, 97 warning(s) loading psl.tab
#-- tGapBases (tGapInsert) this time -- ask Jim
# Create CDS files from protein/mRNA alignment results.
hgsql kgRn4ATemp -N -e \
'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat \
order by qName,tName,tEnd-tStart desc'\
| sed 's/\t_\t/_/g; s/\t:\t/../g' > protMrna.cds
# Create protMrna.psl with proteinID_mrnaID as query ID.
cut -f 22-30 ../protBlat/protMrna.out > j1.tmp
cut -f 32-42 ../protBlat/protMrna.out > j2.tmp
cut -f 10,31 ../protBlat/protMrna.out | sed -e 's/\t/_/g' > j3.tmp
paste j1.tmp j3.tmp j2.tmp > protMrna.psl
rm j1.tmp j2.tmp j3.tmp
# Run mrnaToGene to create protMrna.gp
bash
mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp \
2>protMrna.err >protMrna.log
exit
# No need to move kgBestMrna anywhere else for now:
du -sh
#240M .
df -h .
#kkstore01-10:/export/cluster/store9
# 1.3T 1.1T 199G 84% /cluster/store9
# Prepare refGene and all_mrna gp files.
cd ..
zcat genbankBackup/refGene.txt.gz > ref.gp
hgsql rn4 -N -e \
'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna \
where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \
| sort -u > all_mrna.cds
bash
mrnaToGene -cdsFile=all_mrna.cds \
/cluster/bluearc/kgDB/kgRn4A/protBlat/all_mrna.psl all_mrna.gp \
2>all_mrna.err > all_mrna.log
exit
# Align proteins to RefSeq.
overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl \
-inFmt=genePred protBlat/protBlat.psl ref.gp ref.stat
overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 \
-selectFmt=psl -inFmt=genePred protBlat/protBlat.psl ref.gp protRef.gp
overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl \
-overlapThreshold=0.80 -inFmt=psl \
-selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out
cut -f 10,22 protRef.out | sort -u >spRef.tab
cut -f 10 protRef.out | sort -u >protRef.lis
hgLoadSqlTab kgRn4ATemp spRef ~/kent/src/hg/lib/spRef.sql spRef.tab
# Prepare and perform cluster runs for protein/RefSeq alignments
~/kent/src/hg/protein/KGRef2.sh kgRn4A rn4 060115
~/kent/src/hg/protein/KGRef3.sh kgRn4A rn4 060115
cd kgBestRef
cp /dev/null protRefRaw.psl
foreach d (out/*)
echo $d
cat $d/*.out >> protRefRaw.psl
end
# Filter out low quality alignments.
pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl \
protRefBlat.psl /dev/null
#Processed 15266 alignments
cut -f 10,14 protRefBlat.psl | sort -u > protRef.lis
wc protRef.lis
# 11957 23914 216980 protRef.lis
hgLoadPsl -fastLoad kgRn4ATemp protRefBlat.psl
#load of protRefBlat did not go as planned: 11958 record(s), 0 row(s) skipped, 45 warning(s) loading psl.tab
# Negative tGapSizes again.
# Run gene-check to filter out invalid gp entries
cd /cluster/data/rn4/bed/kgRn4A
cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp
/cluster/bin/i386/gene-check -incl-ok -ok-genepred-out \
kgCandidate0.passed.gp \
-nib-dir /cluster/data/rn4/nib kgCandidate0.gp kgCandidate0.check
ldHgGene -predTab kgRn4ATemp kgCandidate0 kgCandidate0.gp
tail +3 kgCandidate0.check > geneCheck.tab
hgLoadSqlTab kgRn4ATemp geneCheck ~/kent/src/hg/lib/geneCheck.sql \
geneCheck.tab
# Run kgCheck to get all KG candidates that pass the KG gene check criteria
kgCheck kgRn4ATemp rn4 kgCandidate0 geneCheck kgCandidate.tab
hgLoadSqlTab kgRn4ATemp kgCandidate ~/kent/src/hg/lib/kgCandidate.sql \
kgCandidate.tab
hgsql kgRn4ATemp -e 'create index alignID on kgCandidate(alignID)'
# Construct the kgCandidateX table that has alignID in the name field.
cut -f 2-10 kgCandidate.tab > j2.tmp
cut -f 11 kgCandidate.tab > j1.tmp
paste j1.tmp j2.tmp > kgCandidateX.tab
ldHgGene -predTab kgRn4ATemp kgCandidateX kgCandidateX.tab
# Score protein/mRna and protein/RefSeq alignments
ln -s /cluster/bluearc/kgDB/kgRn4A/protBlat/protein.lis .
kgResultBestMrna2 060115 kgRn4ATemp rn4 protMrnaBlat | sort -u \
> protMrnaBlatScore.tab
kgResultBestRef2 060115 kgRn4ATemp rn4 protRefBlat | sort -u \
> protRefScore.tab
# Combine scoring results and load them into temp DB.
cat protMrnaBlatScore.tab protRefScore.tab > protMrnaScore.tab
hgLoadSqlTab kgRn4ATemp protMrnaScore ~/kent/src/hg/lib/protMrnaScore.sql \
protMrnaScore.tab
hgsql kgRn4ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'
# Run kgGetCds to get CDS structure of each gene
kgGetCds kgRn4ATemp 060115 kgCandidateX stdout \
| sort -u > kgCandidateY.tab
hgLoadSqlTab kgRn4ATemp kgCandidateY ~/kent/src/hg/lib/kgCandidateY.sql \
kgCandidateY.tab
# Run kgPickPrep to replace long cds structure string with cdsId.
kgPickPrep kgRn4ATemp kgCandidateZ.tab
hgLoadSqlTab kgRn4ATemp kgCandidateZ ~/kent/src/hg/lib/kgCandidateZ.sql \
kgCandidateZ.tab
hgsql kgRn4ATemp -e 'create index cdsId on kgCandidateZ(cdsId)'
    # Run kgPick to pick the representative mRNA/protein pair for each
    # unique CDS structure.
kgPick kgRn4ATemp rn4 sp060115 kg3.tmp stdout \
| sort -u > dupSpMrna.tab
# Create put back list
/cluster/data/genbank/bin/i386/gbGetSeqs \
    -gbRoot=/cluster/data/genbank -db=rn4 -get=ra -native RefSeq mrna stdout \
| perl -wpe 's/^(\w+) (.*)/$1\t$2/ || next; $acc = $2 if ($1 eq "acc"); \
s/^/$acc\t/;' \
| sort -u | tail +2 > refRa.tab
hgLoadSqlTab rn4 refRa ~/kent/src/hg/lib/refRa.sql refRa.tab
hgsql rn4 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \
where r.attr="selenocysteine" and r.acc=r2.acc and r2.attr="rss" and \
r2.val="rev" and r3.acc=r.acc and r3.attr="org" and \
r3.val="Rattus norvegicus"' \
> kgPutBack2.tab
hgsql rn4 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \
where r.attr="cno" and r.val like "%ribosomal frameshift%" and \
r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and \
r3.acc=r.acc and r3.attr="org" and r3.val="Rattus norvegicus"' \
>> kgPutBack2.tab
hgsql rn4 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \
where r.attr="cno" and r.val like "%non-AUG%" and r.acc=r2.acc and \
r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and \
r3.attr="org" and r3.val="Rattus norvegicus"' \
>> kgPutBack2.tab
hgsql rn4 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \
where r.attr="translExcept" and r.acc=r2.acc and r2.attr="rss" and \
r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and \
r3.val="Rattus norvegicus"' \
>> kgPutBack2.tab
hgsql rn4 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 \
where r.attr="exception" and r.acc=r2.acc and r2.attr="rss" and \
r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and \
r3.val="Rattus norvegicus"' \
>> kgPutBack2.tab
wc -l kgPutBack2.tab
#10 kgPutBack2.tab
hgLoadSqlTab kgRn4ATemp kgPutBack2 ~/kent/src/hg/lib/kgPutBack2.sql \
kgPutBack2.tab
kgPutBack kgRn4ATemp rn4 sp060115 kgPutBack2 kgPutBack2.gp
# Sort KG genes to make the kg4.gp table file.
cat kgPutBack2.gp kg3.tmp > kg4.tmp
~/kent/src/hg/protein/sortKg.pl kg4.tmp \
| cut -f 1-12 > knownGene.tab
# Load data into knownGene table in both kgRn4ATemp and rn4.
hgLoadSqlTab kgRn4ATemp knownGene ~/kent/src/hg/lib/knownGene.sql \
knownGene.tab
hgLoadSqlTab rn4 knownGene ~/kent/src/hg/lib/knownGene.sql \
knownGene.tab
# Load dupSpMrna table after knownGene table is loaded so that \
# joinerCheck does not complain.
hgLoadSqlTab rn4 dupSpMrna ~/kent/src/hg/lib/dupSpMrna.sql dupSpMrna.tab
# Perform analysis on KG
nice featureBits rn4 -enrichment refGene knownGene
#refGene 0.722%, knownGene 0.582%, both 0.539%, cover 74.72%, enrich 128.33x
nice featureBits rn4 -enrichment refGene:cds knownGene:cds
#refGene:cds 0.500%, knownGene:cds 0.377%, both 0.352%, cover 70.41%, enrich 186.83x
nice featureBits rn3 -enrichment refGene knownGene
#refGene 0.721%, knownGene 0.523%, both 0.411%, cover 56.96%, enrich 108.94x
nice featureBits rn3 -enrichment refGene:cds knownGene:cds
#refGene:cds 0.500%, knownGene:cds 0.392%, both 0.294%, cover 58.91%, enrich 150.16x
# Build knownGeneMrna and knownGenePep tables.
kgPepMrna kgRn4ATemp rn4 060115
hgPepPred rn4 tab knownGeneMrna knownGeneMrna.tab
hgLoadSqlTab rn4 knownGenePep ~/kent/src/hg/lib/knownGenePep.sql knownGenePep.tab
# Use hgwdev's entrez database (loaded by Fan, see makeHg18.doc)
    # to build the mrnaRefseq table.
ssh hgwdev
hgsql entrez -N -e \
'select mrna, refseq from entrezRefseq, entrezMrna \
where entrezRefseq.geneID=entrezMrna.geneID' \
> /cluster/data/rn4/bed/kgRn4A/mrnaRefseq.tmp
hgsql rn4 -N -e 'select name, name from refGene' \
>> /cluster/data/rn4/bed/kgRn4A/mrnaRefseq.tmp
ssh kolossus
cd /cluster/data/rn4/bed/kgRn4A
sort -u mrnaRefseq.tmp > mrnaRefseq.tab
hgLoadSqlTab rn4 mrnaRefseq ~/kent/src/hg/lib/mrnaRefseq.sql mrnaRefseq.tab
# Build kgXref table
kgXref2 kgRn4ATemp 060115 rn4
hgLoadSqlTab rn4 kgXref ~/kent/src/hg/lib/kgXref.sql kgXref.tab
# Build spMrna table
hgsql rn4 -N -e 'select proteinID, name from knownGene' > kgSpMrna.tab
hgLoadSqlTab rn4 spMrna ~/kent/src/hg/lib/spMrna.sql kgSpMrna.tab
# Build kgProtMap table
ln -s /cluster/bluearc/kgDB/kgRn4A/protBlat/tight_mrna.psl .
~/kent/src/hg/protein/kgProtMap2.sh kgRn4A rn4 060115
#####################################
# Build alias tables.
kgAliasM rn4 proteins060115
kgAliasKgXref rn4
kgAliasRefseq rn4
hgsql sp060115 -N -e \
'select name,gene.val from rn4.knownGene,displayId,gene \
where displayId.val=proteinID and displayId.acc=gene.acc' \
| sort -u > kgAliasP.tab
hgsql rn4 -N -e 'select name, name from knownGene' > kgAliasDup.tab
hgsql rn4 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >> kgAliasDup.tab
sort -u kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab \
kgAliasDup.tab > kgAlias.tab
hgLoadSqlTab rn4 kgAlias ~/kent/src/hg/lib/kgAlias.sql kgAlias.tab
kgProtAlias rn4 060115
hgsql rn4 -N -e \
'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""' \
| sort -u > kgProtAliasNCBI.tab
# include variant splice protein IDs
hgsql rn4 -N -e \
'select name, proteinID, parAcc from knownGene,sp060115.varAcc \
where varAcc=proteinID' \
| sort -u > kgProtAliasDup.tab
# include duplicate protein IDs from dupSpMrna table
hgsql rn4 -N -e \
'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna \
where name=mrnaID' \
| sort -u >> kgProtAliasDup.tab
# catch parent acc from dupProteinID too
hgsql rn4 -N -e\
'select name, knownGene.proteinID, parAcc \
from knownGene,dupSpMrna,sp060115.varAcc \
where name=mrnaID and dupProteinID=varAcc.varAcc' \
| sort -u >> kgProtAliasDup.tab
sort -u kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab \
> kgProtAliasAll.tab
hgLoadSqlTab rn4 kgProtAlias ~/kent/src/hg/lib/kgProtAlias.sql \
kgProtAliasAll.tab
# Build kgSpAlias table
hgsql rn4 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias \
where kgXref.kgID=kgAlias.kgID' > j.tmp
hgsql rn4 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias \
where kgXref.kgID=kgProtAlias.kgID' >> j.tmp
sort -u j.tmp | grep -v 'kgID' > rn4.kgSpAlias.tab
rm j.tmp
hgLoadSqlTab rn4 kgSpAlias ~/kent/src/hg/lib/kgSpAlias.sql \
rn4.kgSpAlias.tab
# Ask cluster-admin to rsync the following tables from kolossus rn4
# back to hgwdev rn4:
dupSpMrna
kgAlias
kgProtAlias
kgProtMap
kgSpAlias
kgXref
knownGene
knownGeneMrna
knownGenePep
mrnaRefseq
protBlat
refRa
spMrna
# Copy history comments from kolossus to hgwdev's rn4.history table.
hgsqldump rn4 history \
| egrep 'dupSpMrna|kgAlias|kgProtAlias|kgProtMap|kgSpAlias|kgXref|knownGene|knownGeneMrna|knownGenePep|mrnaRefseq|protBlat|refRa|spMrna' \
| perl -wpe 's/\([0-9]+,/(NULL,/' \
> kgHistory.sql
ssh hgwdev
hgsql rn4 < /cluster/data/rn4/bed/kgRn4A/kgHistory.sql
# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 3/6/06 angie)
# This depends on the go and uniProt databases as well as
    # the kgAlias and kgProtAlias tables.
ssh hgwdev
mkdir /cluster/data/rn4/bed/kgRn4A/index
cd /cluster/data/rn4/bed/kgRn4A/index
hgKgGetText rn4 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/data/rn4/bed/kgRn4A/index/knownGene.ix /gbdb/rn4/
ln -s /cluster/data/rn4/bed/kgRn4A/index/knownGene.ixx /gbdb/rn4/
# end KNOWN GENES
###############################################################################
# BLASTZ/CHAIN/NET DANRER3 - NO LINSPECREP (DONE 3/2/06 angie)
ssh kkstore01
mkdir /cluster/data/rn4/bed/blastz.danRer3.2006-03-02
cd /cluster/data/rn4/bed/blastz.danRer3.2006-03-02
cat << '_EOF_' > DEF
# rat vs zebrafish
BLASTZ=blastz.v7.x86_64
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Rat Rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer3)
# large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes
SEQ2_CHUNK=100000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.danRer3.2006-03-02
'_EOF_'
# << for emacs
doBlastzChainNet.pl DEF -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/blastzRn4DanRer3Out >& do.log &
tail -f do.log
ln -s blastz.danRer3.2006-03-02 /cluster/data/rn4/bed/blastz.danRer3
# BLASTZ/CHAIN/NET DANRER3 - WITH LINSPECREP (DONE 3/3/06 angie)
# The coverage decreased so I ended up not using this run.
# It would be interesting to try a run that uses all repeats
# *except for simple* as lineage-specific too sometime.
ssh kkstore01
# Set up rn4/linSpecRep.notInNonMammal
mkdir /cluster/bluearc/rn4/linSpecRep.notInNonMammal
cd /cluster/data/rn4
foreach f (?{,?}/chr*.fa.out)
echo $f:t:r:r
cp -p $f /cluster/bluearc/rn4/linSpecRep.notInNonMammal/$f:t:r:r.out.spec
end
# Rename tables from previous run; remove download dir
ssh hgwdev
foreach t (netDanRer3 \
`hgsql rn4 -N -e 'show tables like "%chainDanRer3%"'`)
set newT = `echo $t | sed -e 's/Rer3/Rer3NoLSR/'`
hgsql rn4 -e 'rename table '$t' to '$newT
end
rm -r /usr/local/apache/htdocs/goldenPath/rn4/vsDanRer3
ssh pk
mkdir /cluster/data/rn4/bed/blastz.danRer3.2006-03-03
cd /cluster/data/rn4/bed/blastz.danRer3.2006-03-03
cat << '_EOF_' > DEF
# rat vs zebrafish
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Rat Rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_SMSK=/cluster/bluearc/rn4/linSpecRep.notInNonMammal
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer3)
# large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes
SEQ2_CHUNK=100000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.danRer3.2006-03-03
'_EOF_'
# << for emacs
doBlastzChainNet.pl DEF -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/blastzRn4DanRer3Out >& do.log &
tail -f do.log
ln -s blastz.danRer3.2006-03-03 /cluster/data/rn4/bed/blastz.danRer3
# COMPARE danRer3 runs, without vs. with LSR (DONE 3/13/06 angie)
ssh kolossus
cd /cluster/data/rn4/bed
# have to use redirect because vprintf doesn't like being called
# twice on x86_64... should fix that sometime.
netStats /dev/null blastz.danRer3.2006-03-02/axtChain/rn4.danRer3.net.gz \
> blastz.danRer3.2006-03-02/axtChain/netStats.out
netStats /dev/null blastz.danRer3.2006-03-03/axtChain/rn4.danRer3.net.gz \
> blastz.danRer3.2006-03-03/axtChain/netStats.out
sdiff blastz.danRer3.2006-03-0[23]/axtChain/netStats.out
# A few points of comparison from that:
# Without: With:
# max depth: 11 7
# gap count: 618046 487550
# gap average size T: 809.1 1024.5
# gap average size Q: 529.2 688.0
# fill count: 60347 58415
# top count: 38000 37787
# top percent of total: 84.9% 87.5%
# nonSyn percent of total: 11.1% 9.6%
# syn percent of total: 2.5% 1.8%
# Coverage:
netToBed blastz.danRer3.2006-03-02/axtChain/{rn4.danRer3.net.gz,net.bed}
netToBed blastz.danRer3.2006-03-03/axtChain/{rn4.danRer3.net.gz,net.bed}
featureBits rn4 blastz.danRer3.2006-03-02/axtChain/net.bed
#513894027 bases of 2571531505 (19.984%) in intersection
featureBits rn4 blastz.danRer3.2006-03-03/axtChain/net.bed
#509076601 bases of 2571531505 (19.797%) in intersection
# OK, so coverage drops with LSR. Maybe some repeats are being snipped
# that could be aligned (ancestral)?
# Eyeball the one place where level == 11 in the 02 set
# ("select * from netDanRer3NoLSR where level >= 11;" on hgwdev):
# chr3 | 97071076 | 97071655, zoom out 1.5
# NoLSR has a lot more chains... more levels filled.
# with LSR gets a "Level 2" chain that NoLSR does not get at all.
# Who's to say which is right??? NoLSR looks messier to me but that
# is a very subjective judgment.
# Interestingly, there are no repeats on rn4 in that region.
# Panning to the right a ways, just long gaps for NoLSR but a few
# blocks for with. NoLSR gets some sizable alignments where with
# does not... a stretch of simple/low-complexity repeats there.
# Those should probably be excluded when we are using all repeats
# as lin-spec! Hopping over to a danRer3 region aligned only by
# NoLSR I see some more low-complexity. Should run that by Arian.
# Could do yet another run with all repeats except simple/low-complexity
# (what is DNA? maybe that too?) as linSpec.
# Panning to the right some more, a long dry spell for with LSR.
# There are olfactory receptor genes around which could indicate a
# tougher job than usual. Not much agreement between the nets though,
# yikes. High precision, but at what accuracy?
# I just worry about the reliability of what comes out of the pipeline...
# so sensitive to parameters.
# chr3:97,345,533-97,360,012 there is a withLSR chain but the net is empty.
# Since the coverage is higher without LSR, go with that for now.
# BLASTZ/CHAIN/NET XENTRO1 (DONE 3/20/06 angie)
ssh kkstore01
mkdir /cluster/data/rn4/bed/blastz.xenTro1.2006-03-20
cd /cluster/data/rn4/bed/blastz.xenTro1.2006-03-20
cat << '_EOF_' > DEF
# rat vs. frog
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
# Mammal to non-mammal, with threshold of 8000 as in mm8-xenTro1
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Rat rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Frog xenTro1 - single chunk big enough to run two of the
# largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit
SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/rn4/bed/blastz.xenTro1.2006-03-20
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/rn4XenTro1 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
>& do.log & tail -f do.log
ln -s blastz.xenTro1.2006-03-20 /cluster/data/rn4/bed/blastz.xenTro1
# BLASTZ/CHAIN/NET RHEMAC2 (DONE 3/23/06 angie)
# Won't put this in Conservation -- special request for ancestor recon.
ssh pk
mkdir /cluster/data/rn4/bed/blastz.rheMac2.2006-03-22
cd /cluster/data/rn4/bed/blastz.rheMac2.2006-03-22
cat << '_EOF_' > DEF
# Rat vs. macaque
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Rat (rn4)
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Macaque (rheMac2)
SEQ2_DIR=/san/sanvol1/scratch/rheMac2/nib
SEQ2_SMSK=/cluster/bluearc/rheMac2/linSpecRep/notInRodent
SEQ2_LEN=/cluster/data/rheMac2/chrom.sizes
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.rheMac2.2006-03-22
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot /san/sanvol1/rn4RheMac2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium DEF \
>& do.log & tail -f do.log
ln -s blastz.rheMac2.2006-03-22 /cluster/data/rn4/bed/blastz.rheMac2
#######################################################################
# BLASTZ/CHAIN/NET MONDOM4 (DONE 3/24/06 angie)
ssh pk
mkdir /cluster/data/rn4/bed/blastz.monDom4.2006-03-23
cd /cluster/data/rn4/bed/blastz.monDom4.2006-03-23
cat << '_EOF_' > DEF
# Rat vs. opossum
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
# Mammal to non-mammal -- well, I guess non-placental-mammal, with
# high L like chicken, and low M to guard against pileups w/monDom4:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=20
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Rat (rn4)
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
SEQ2_DIR=/cluster/bluearc/scratch/hg/monDom4/monDom4.2bit
SEQ2_LEN=/cluster/bluearc/scratch/hg/monDom4/chrom.sizes
SEQ2_CHUNK=50000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.monDom4.2006-03-23
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot /san/sanvol1/rn4MonDom4 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
-workhorse kkr7u00 >& do.log & tail -f do.log
ln -s blastz.monDom4.2006-03-23 /cluster/data/rn4/bed/blastz.monDom4
ssh kolossus
cd /cluster/data/rn4/bed/blastz.monDom4
time nice -n +19 featureBits rn4 chainMonDom4Link \
> fb.rn4.chainMonDom4Link 2>&1
cat fb.rn4.chainMonDom4Link
# 200166664 bases of 2571531505 (7.784%) in intersection
#######################################################################
# BLASTZ/CHAIN/NET BOSTAU2 (DONE 3/24/06 angie)
ssh pk
mkdir /cluster/data/rn4/bed/blastz.bosTau2.2006-03-23
cd /cluster/data/rn4/bed/blastz.bosTau2.2006-03-23
cat << '_EOF_' > DEF
# rat vs cow
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
# TARGET: Rat (rn4)
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Cow (bosTau2)
# large enough chunk to do chroms in one piece
SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit
SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes
SEQ2_CHUNK=150000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.bosTau2.2006-03-23
'_EOF_'
# << emacs
doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=pk -workhorse=kkr8u00 DEF \
>& do.log & tail -f do.log
ln -s blastz.bosTau2.2006-03-23 /cluster/data/rn4/bed/blastz.bosTau2
# SELF CHAINS (DONE - NOT LOADED INTO DB 3/27/06 angie)
# Won't put this in Conservation -- special request for ancestor recon.
ssh pk
mkdir /cluster/data/rn4/bed/blastz.rn4.2006-03-24
cd /cluster/data/rn4/bed/blastz.rn4.2006-03-24
cat << '_EOF_' > DEF
# rat vs rat
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=200
# TARGET: rat rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: rat rn4
SEQ2_DIR=/scratch/hg/rn4/nib
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.rn4.2006-03-24
'_EOF_'
# << emacs
doBlastzChainNet.pl -chainMinScore=10000 -chainLinearGap=medium \
-bigClusterHub=pk -workhorse=kkr7u00 DEF \
>& do.log & tail -f do.log
# It died during loading because hgwdev /var/lib/mysql ran out of room!
# There is enough space now but I deleted the tables anyway -- can load
# later if anyone wants to browse. Did -continue download so that the
# ancestor-reconstruction folks who requested this can find the chains
# in the usual place.
ln -s blastz.rn4.2006-03-24 /cluster/data/rn4/bed/blastz.rn4
# BLASTZ/CHAIN/NET PANTRO2 (DONE 3/27/06 angie)
# Won't put this in Conservation -- special request for ancestor recon.
ssh pk
mkdir /cluster/data/rn4/bed/blastz.panTro2.2006-03-27
cd /cluster/data/rn4/bed/blastz.panTro2.2006-03-27
cat << '_EOF_' > DEF
# Rat vs. chimp
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Rat (rn4)
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chimp (panTro2)
SEQ2_DIR=/scratch/hg/panTro2/nib
SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent
SEQ2_LEN=/cluster/data/panTro2/chrom.sizes
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.panTro2.2006-03-27
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot /san/sanvol1/rn4PanTro2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium DEF \
-workhorse=kkr5u00 >& do.log & tail -f do.log
ln -s blastz.panTro2.2006-03-27 /cluster/data/rn4/bed/blastz.panTro2
############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram)
ssh kk
cd /cluster/data/mm8/bed/blastzRn4.2006-02-16
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits rn4 chainMm8Link
# 1791093685 bases of 2571531505 (69.651%) in intersection
# BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED 2006-03-13, Fan)
# This should be done after KG tables are complete from known genes build
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/rn4/bed/geneSorter.2006-03-13
# remove old symbolic link
rm /cluster/data/rn4/bed/geneSorter
ln -s /cluster/data/rn4/bed/geneSorter.2006-03-13 /cluster/data/rn4/bed/geneSorter
cd /cluster/data/rn4/bed/geneSorter
hgClusterGenes rn4 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/rn4/bed/geneSorter/blastp
ln -s /cluster/data/rn4/bed/geneSorter/blastp /cluster/data/rn4/bed/blastp
cd /cluster/data/rn4/bed/geneSorter/blastp
pepPredToFa rn4 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
#TODO: swap rn4 to: xenTro1
# MULTIZ9WAY (DONE 3/30/06 angie)
# (((((((rn4 mm8) hg18) (canFam2 bosTau2)) monDom4) galGal2) xenTro1) danRer3)
ssh kkstore01
mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30
cd /cluster/data/rn4/bed/multiz9way.2006-03-30
# copy MAF's to cluster-friendly server
# These MAF's already on bluearc:
# canFam2, fr1, galGal2, panTro1, rn4
mkdir /san/sanvol1/scratch/rn4/mafNet
foreach s (mm8 hg18 canFam2 bosTau2 monDom4 galGal2 xenTro1 danRer3)
echo $s
rsync -av /cluster/data/rn4/bed/blastz.$s/mafNet/* \
/san/sanvol1/scratch/rn4/mafNet/$s/
end
# Prune the hg17 17way tree to just these 9 and update db names:
/cluster/bin/phast/tree_doctor \
--prune-all-but=rat_rn3,mouse_mm7,human_hg17,dog_canFam2,cow_bosTau2,monodelphis_monDom2,chicken_galGal2,xenopus_xenTro1,zebrafish_danRer3 \
--rename="rat_rn3 -> rat_rn4 ; mouse_mm7 -> mouse_mm8 ; human_hg17 -> human_hg18 ; monodelphis_monDom2 -> monodelphis_monDom4" \
/cluster/data/hg17/bed/multiz17way/17way.nh > 9way.nh
# *carefully* edit 9way.nh to move hg18 to appear after (rat mouse),
# save as 9way_ratFirst.nh, and make a tree picture:
/cluster/bin/phast/draw_tree 9way_ratFirst.nh > 9way.ps
ps2pdf 9way.ps > 9way.pdf
/cluster/bin/phast/all_dists 9way_ratFirst.nh > 9way.distances.txt
grep rn4 9way.distances.txt | sort -k3,3n | \
awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
cat distances.txt
#0.1587 mouse_mm8
#0.4724 human_hg18
#0.6277 dog_canFam2
#0.6392 cow_bosTau2
#1.0745 monodelphis_monDom4
#1.3472 chicken_galGal2
#1.7983 xenopus_xenTro1
#2.1105 zebrafish_danRer3
# the order in the browser display will be by tree topology,
# not by distance, but in this case (unlike human) they happen to match.
# create species list and stripped down tree for autoMZ
sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 9way_ratFirst.nh \
> tree-commas.nh
sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst
ssh pk
cd /cluster/data/rn4/bed/multiz9way.2006-03-30
mkdir maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = rn4
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/mafNet
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/rn4/bed/multiz9way.2006-03-30/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << emacs
awk '{print $1}' /cluster/data/rn4/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para make jobList
para time
#Completed: 45 of 45 jobs
#CPU time in finished jobs: 60564s 1009.39m 16.82h 0.70d 0.002 y
#IO & Wait Time: 2996s 49.94m 0.83h 0.03d 0.000 y
#Average job time: 1412s 23.54m 0.39h 0.02d
#Longest finished job: 6172s 102.87m 1.71h 0.07d
#Submission to last job: 6173s 102.88m 1.71h 0.07d
# Make .jpg for tree and install in htdocs/images/phylo/... don't forget
# to request a push of that file. The treeImage setting in trackDb.ra
# is phylo/rn4_9way.jpg (relative to htdocs/images).
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way.2006-03-30
pstopnm -stdout 9way.ps | pnmtojpeg > rn4_9way.jpg
ln -s /cluster/data/rn4/bed/multiz9way.2006-03-30/rn4_9way.jpg \
/usr/local/apache/htdocs/images/phylo/rn4_9way.jpg
# ANNOTATE MULTIZ9WAY MAF AND LOAD TABLES (DONE 4/4/2006 angie)
ssh kolossus
mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/anno
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/anno
mkdir maf run
cd run
rm -f sizes nBeds
foreach db (`cat /cluster/data/rn4/bed/multiz9way.2006-03-30/species.lst`)
ln -s /cluster/data/$db/chrom.sizes $db.len
if (! -e /cluster/data/$db/$db.N.bed) then
twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
endif
ln -s /cluster/data/$db/$db.N.bed $db.bed
echo $db.bed >> nBeds
echo $db.len >> sizes
end
echo date > jobs.csh
# do smaller jobs first:
foreach f (`ls -1rS ../../maf/*.maf`)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
/cluster/data/rn4/rn4.2bit ../maf/`basename $f` >> jobs.csh
echo "echo $f" >> jobs.csh
end
echo date >> jobs.csh
csh -efx jobs.csh >&! jobs.log & tail -f jobs.log
# 1.5 hrs.
# Load anno/maf
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/anno/maf
mkdir -p /gbdb/rn4/multiz9way/anno/maf
ln -s /cluster/data/rn4/bed/multiz9way.2006-03-30/anno/maf/*.maf \
/gbdb/rn4/multiz9way/anno/maf
cat > loadMaf.csh << 'EOF'
date
nice hgLoadMaf -pathPrefix=/gbdb/rn4/multiz9way/anno/maf rn4 multiz9way
date
'EOF'
# << emacs
csh -efx loadMaf.csh >&! loadMaf.log & tail -f loadMaf.log
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kkr7u00
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/anno/maf
cat *.maf \
| nice hgLoadMafSummary rn4 -minSize=30000 -mergeGap=1500 -maxSize=200000 \
-test multiz9waySummary stdin
#Created 1408247 summary blocks from 22055250 components and 8577575 mafs from stdin
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/anno/maf
sed -e 's/mafSummary/multiz9waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
> /tmp/multiz9waySummary.sql
time nice hgLoadSqlTab rn4 multiz9waySummary /tmp/multiz9waySummary.sql \
multiz9waySummary.tab
#0.000u 0.000s 4:04.62 0.0% 0+0k 0+0io 217pf+0w
rm *.tab
ln -s multiz9way.2006-03-30 /cluster/data/rn4/bed/multiz9way
# MULTIZ9WAY DOWNLOADABLES (ANNOTATED) (DONE 6/9/2006 angie)
# Annotated MAF is now documented, so use anno/maf for downloads.
ssh hgwdev
mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/mafDownloads
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/mafDownloads
# upstream mafs
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits rn4 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags rn4 multiz9way up.bed upstream$i.maf \
-orgs=../species.lst
rm up.bed
end
date
'EOF'
# << emacs
time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
#old hgwdev:
#75.280u 480.470s 17:07.43 54.0% 0+0k 0+0io 4640pf+0w
#new hgwdev:
#85.814u 171.368s 7:37.96 56.1% 0+0k 0+0io 0pf+0w
ssh kkstore04
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../anno/maf/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
# << emacs
time csh -efx downloads.csh >&! downloads.log
#1897.874u 25.317s 32:58.19 97.2% 0+0k 0+0io 2pf+0w
nice gzip up*.maf
nice md5sum up*.maf.gz >> md5sum.txt
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/rn4/multiz9way
mkdir $dir
ln -s /cluster/data/rn4/bed/multiz9way.2006-03-30/mafDownloads/{*.gz,md5sum.txt} $dir
cp /usr/local/apache/htdocs/goldenPath/mm7/multiz17way/README.txt $dir
# edit README.txt
# MULTIZ9WAY MAF FRAMES (DONE 4/5/2006 angie)
# rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
ssh hgwdev
mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/frames
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/frames
# The following is adapted from MarkD's Makefile used for mm7...
#------------------------------------------------------------------------
# get the genes for all genomes
# mRNAs with CDS. single select to get cds+psl, then split that up and
# create genePred
# using mrna table as genes: canFam2 bosTau2 danRer3
mkdir genes
foreach queryDb (canFam2 bosTau2 danRer3)
set tmpExt = `mktemp temp.XXXXXX`
set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
set tmpMrna = ${queryDb}.mrna.${tmpExt}
set tmpCds = ${queryDb}.cds.${tmpExt}
echo $queryDb
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
${queryDb} > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
stdout \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
# using knownGene for rn4 mm8 hg18
# using refGene for galGal2
# using mgcGenes for xenTro1
# no genes for monDom4
# genePreds; (must keep only the first 10 columns for knownGene)
foreach queryDb (rn4 mm8 hg18 galGal2 xenTro1)
if ($queryDb == "xenTro1") then
set geneTbl = mgcGenes
else if ($queryDb == "galGal2") then
set geneTbl = refGene
else
set geneTbl = knownGene
endif
hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f 1-10 \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
#------------------------------------------------------------------------
# create frames
set clusterDir = /cluster/bluearc/rn4/multiz9wayFrames
set multizDir = /cluster/data/rn4/bed/multiz9way.2006-03-30
set mafDir = $multizDir/mafDownloads
set geneDir = $multizDir/frames/genes
set clusterMafDir = ${clusterDir}/maf
set clusterGeneDir = ${clusterDir}/genes
set clusterFramesDir = ${clusterDir}/mafFrames.kki
# copy mafs to cluster storage
mkdir $clusterDir
ssh -x kkstore04 "rsync -av $mafDir/*.maf.gz $clusterMafDir/"
# copy genes to cluster storage
ssh -x kkstore04 "rsync -av $geneDir/*.gp.gz $clusterGeneDir/"
# run cluster jobs
set tmpExt = `mktemp temp.XXXXXX`
set paraDir = $multizDir/frames/para.${tmpExt}
mkdir mafFrames $paraDir
rm -f $paraDir/jobList
mkdir ${clusterFramesDir}
foreach queryDb (`cat /cluster/data/rn4/bed/multiz9way.2006-03-30/species.lst`)
mkdir ${clusterFramesDir}/${queryDb}
foreach c (`awk '{print $1;}' /cluster/data/rn4/chrom.sizes`)
if (-e ${clusterGeneDir}/${queryDb}.gp.gz) then
echo /cluster/bin/scripts/mkMafFrames.pl ${queryDb} rn4 \
${clusterGeneDir}/${queryDb}.gp.gz ${clusterMafDir}/$c.maf.gz \
${clusterFramesDir}/${queryDb}/$c.mafFrames \
>> $paraDir/jobList
endif
end
end
rm -f $tmpExt
ssh -x kki "cd ${paraDir} && para make jobList && para time"
#Completed: 360 of 360 jobs
#CPU time in finished jobs: 2027s 33.78m 0.56h 0.02d 0.000 y
#IO & Wait Time: 3816s 63.60m 1.06h 0.04d 0.000 y
#Average job time: 16s 0.27m 0.00h 0.00d
#Longest finished job: 124s 2.07m 0.03h 0.00d
#Submission to last job: 493s 8.22m 0.14h 0.01d
# combine results from cluster
foreach queryDb (`cat ../species.lst`)
echo $queryDb
ssh -x kolossus "cat ${clusterFramesDir}/${queryDb}/*.mafFrames | gzip -2c > ${multizDir}/frames/mafFrames/${queryDb}.mafFrames.gz"
end
#------------------------------------------------------------------------
# load the database
hgLoadMafFrames rn4 multiz9wayFrames mafFrames/*.mafFrames.gz
#------------------------------------------------------------------------
# clean up
rm -rf ${clusterDir}
# rebuild frames to get bug fix, using 1-pass maf methodology
# (2006-06-09 markd)
ssh kkstore04
cd /cluster/data/rn4/bed/multiz9way/frames
mv mafFrames/ mafFrames.old
nice tcsh # easy way to get process niced
(cat ../anno/maf/*.maf | time genePredToMafFrames rn4 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer3 genes/danRer3.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz rn4 genes/rn4.gp.gz xenTro1 genes/xenTro1.gp.gz | gzip >multiz9way.mafFrames.gz)>&log&
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way/frames
hgLoadMafFrames rn4 multiz9wayFrames multiz9way.mafFrames.gz >&log&
# PHASTCONS (DONE 4/2/2006 angie)
# Using Kate's process from makeHg17.doc.
# This process is distilled from Hiram and Adam's experiments
# on mouse (mm7) 17way track. Many parameters are now fixed, without
# being experimentally derived, either because the experiments
# were lengthy and produced similar results, or because they
# weren't runnable given the alignment size.
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliot Margulies' model
# from the (37-way) ENCODE alignments.
#
# NOTE: reusing cluster-friendly chrom fasta files created earlier
#cd /cluster/data/rn4
#foreach f (`cat chrom.lst`)
#echo $f
#cp $f/*.fa /cluster/bluearc/rn4/chrom
#end
# Split chromosome MAF's into windows and use to generate
# "sufficient statistics" (ss) files for phastCons input
# NOTE: as the SAN fs has lotsa space, we're leaving these
# big (temp) files unzipped, to save time during phastCons run.
# Note also the larger chunk sizes from previous runs -- this
# reduces run-time on the split, slows down the actual phastCons
# enough so jobs don't crash (jobs are very quick, just a minute
# or so), and according to Adam, will produce better results.
# The previous small chunks were probably required by
# the phyloFit step, which we are no longer using for the
# human alignments.
ssh pk
mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
/cluster/bin/phast/tree_doctor \
--prune-all-but=rn3,mm7,hg17,canFam2,bosTau2,monDom2,galGal2,xenTro1,danRer3 \
--rename="rn3 -> rn4 ; hg17 -> hg18 ; mm7 -> mm8 ; monDom2 -> monDom4" \
/san/sanvol1/scratch/mm7/cons/elliotsEncode.mod \
> elliotsEncodePruned.mod
mkdir run.split
cd run.split
set WINDOWS = /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/ss
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/rn4/bed/multiz9way.2006-03-30/maf
set WINDOWS = /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M /cluster/bluearc/rn4/chrom/$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
echo "Done" >> $c.done
'EOF'
# << emacs
chmod +x doSplit.csh
rm -f jobList
foreach f (../../maf/*.maf)
set c = $f:t:r
echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
end
para make jobList
para time
#Completed: 45 of 45 jobs
#CPU time in finished jobs: 2102s 35.03m 0.58h 0.02d 0.000 y
#IO & Wait Time: 8185s 136.42m 2.27h 0.09d 0.000 y
#Average job time: 229s 3.81m 0.06h 0.00d
#Longest finished job: 661s 11.02m 0.18h 0.01d
#Submission to last job: 661s 11.02m 0.18h 0.01d
# check tree model on 5MB chunk, using params recommended by Adam,
# (to verify branch lengths on 2X species)
ssh kolossus
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
--tree "`cat ../tree-commas.nh`" \
/san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/ss/chr7/chr7.119999202-129997313.ss \
-o phyloFit.tree
# Comment from makeHg17.doc:
# # he ok'ed the results -- not necessary for next human run
# TODO: maybe run these by Adam... the numbers in the RATE_MAT and
# TREE are different between the phyloFit and elliotsEncode models,
# not sure if the differences are significant. I will proceed with
# elliotsEncode as in makeHg17.doc.
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
mkdir /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/run.cons
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/run.cons
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons
cp -p $san/ss/$c/$f.ss ../elliotsEncodePruned.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncodePruned.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$c $san/bed/$c
sleep 1
mv $tmp/$f.pp $san/pp/$c
mv $tmp/$f.bed $san/bed/$c
rm -fr $tmp
'EOF'
# << emacs
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 14 .007 .27
#ENDLOOP
'EOF'
# << emacs
# Create parasol batch and run it
ssh pk
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/run.cons
pushd /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons
ls -1S ss/chr*/chr*.ss \
| sed 's/.ss$//' \
> /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/run.cons/in.list
popd
gensub2 in.list single template jobList
para make jobList
para time
#Completed: 314 of 313 jobs (sic -- likely a transcription typo for 313 of 313)
#CPU time in finished jobs: 9623s 160.38m 2.67h 0.11d 0.000 y
#IO & Wait Time: 12815s 213.59m 3.56h 0.15d 0.000 y
#Average job time: 72s 1.19m 0.02h 0.00d
#Longest finished job: 108s 1.80m 0.03h 0.00d
#Submission to last job: 148s 2.47m 0.04h 0.00d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir
find ./bed -name "chr*.bed" \
| sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" \
| sort -k7,7 -k9,9n \
| sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" \
| xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin > phastConsElements9way.bed
cp -p phastConsElements9way.bed /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
# Measure coverage. If good, load elements into database and proceed with wiggle.
# Try for 5% overall cov, and 70% CDS cov.
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
featureBits rn4 -enrichment refGene:cds phastConsElements9way.bed
# FIRST ITERATION: doPhast (len cov rho) = (14 .008 .28)
#refGene:cds 0.499%, phastConsElements9way.bed 5.101%, both 0.359%, cover 71.97%, enrich 14.11x
mv phastConsElements9way.bed phastConsElements9way_14_008_28.bed
# SECOND ITERATION: doPhast (len cov rho) = (13 .007 .27)
#refGene:cds 0.499%, phastConsElements9way.bed 4.866%, both 0.355%, cover 71.07%, enrich 14.60x
mv phastConsElements9way.bed phastConsElements9way_13_007_27.bed
# THIRD ITERATION: doPhast (len cov rho) = (14 .007 .27)
#refGene:cds 0.499%, phastConsElements9way.bed 4.982%, both 0.356%, cover 71.32%, enrich 14.31x
mv phastConsElements9way.bed phastConsElements9way_14_007_27.bed
# FOURTH ITERATION: doPhast (len cov rho) = (14 .007 .28)
#refGene:cds 0.499%, phastConsElements9way.bed 5.073%, both 0.359%, cover 71.96%, enrich 14.19x
mv phastConsElements9way.bed phastConsElements9way_14_007_28.bed
# FIFTH ITERATION: doPhast (len cov rho) = (14 .006 .28)
#refGene:cds 0.499%, phastConsElements9way.bed 5.042%, both 0.359%, cover 71.95%, enrich 14.27x
mv phastConsElements9way.bed phastConsElements9way_14_006_28.bed
# Wow, the CDS coverage doesn't move much. Go with 14 .007 .27.
# When happy:
hgLoadBed -strict rn4 phastConsElements9way phastConsElements9way.bed
# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
# (NOTE: kolossus was actually used here, per the ssh below)
ssh kolossus
cd /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
#next time try Angie's simpler sort, below
time find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastCons9way.wig phastCons9way.wib
# about 23 minutes for above
cp -p phastCons9way.wi? /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons
ln -s `pwd`/phastCons9way.wib /gbdb/rn4/multiz9way/phastCons9way.wib
hgLoadWiggle -pathPrefix=/gbdb/rn4/multiz9way rn4 \
phastCons9way phastCons9way.wig
# ~ 3 minute load
# PHASTCONS SCORES DOWNLOADABLES FOR 9WAY (DONE 4/3/2006 angie)
ssh kolossus
cd /cluster/data/rn4/bed/multiz9way.2006-03-30
mkdir phastConsDownloads
cd phastConsDownloads
cat > downloads.csh << 'EOF'
date
cd /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons/pp
foreach chr (`awk '{print $1}' /cluster/data/rn4/chrom.sizes`)
echo $chr
cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> /cluster/data/rn4/bed/multiz9way.2006-03-30/phastConsDownloads/$chr.gz
end
date
'EOF'
# << emacs
csh -efx downloads.csh >&! downloads.log & tail -f downloads.log
# ~20min
md5sum *.gz > md5sum.txt
ssh hgwdev
cd /cluster/data/rn4/bed/multiz9way.2006-03-30/phastConsDownloads
set dir = /usr/local/apache/htdocs/goldenPath/rn4/phastCons9way
mkdir $dir
ln -s /cluster/data/rn4/bed/multiz9way.2006-03-30/phastConsDownloads/{*.gz,md5sum.txt} $dir
cp /usr/local/apache/htdocs/goldenPath/mm7/phastCons17Scores/README.txt $dir
# edit README.txt
# Clean up after phastCons run.
ssh kkstore04
rm /cluster/data/rn4/bed/multiz9way.2006-03-30/phastCons/*.tab
rm -r /san/sanvol1/scratch/rn4/multiz9way.2006-03-30/phastCons
# MIRNA (DONE 4/10/06 angie)
ssh hgwdev
mkdir /cluster/data/rn4/bed/miRNA
cd /cluster/data/rn4/bed/miRNA
wget ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/8.0/database_files/tables.sql
foreach t (mirna mirna_species mirna_mature mirna_pre_mature \
mirna_chromosome_build)
wget ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/8.0/database_files/$t.dat.gz
end
nice gunzip *.gz
hgsql '' -e 'create database miRNAtmp'
hgsql miRNAtmp < tables.sql
foreach t (mirna mirna_species mirna_mature mirna_pre_mature \
mirna_chromosome_build)
hgsql miRNAtmp -e 'load data local infile "'$t.dat'" into table '$t
end
set species = `hgsql miRNAtmp -N -e 'select auto_id from mirna_species where name = "Rattus norvegicus"'`
hgsql miRNAtmp -e "create table mirna_rat select * from mirna where auto_species = $species"
hgsql miRNAtmp -N -e "select count(*) from mirna_rat"
#| 234 |
# This is a tiny database so this query takes almost no time at all:
time nice hgsql miRNAtmp -N -e \
"select mirna_chromosome_build.xsome, \
mirna_chromosome_build.contig_start - 1, \
mirna_chromosome_build.contig_end, \
mirna_rat.mirna_id, 0, \
mirna_chromosome_build.strand, \
(mirna_chromosome_build.contig_start + mirna_mature.mature_from - 2), \
(mirna_chromosome_build.contig_start + mirna_mature.mature_to - 1) \
from mirna_rat, mirna_chromosome_build, mirna_mature, mirna_pre_mature \
where mirna_rat.auto_mirna = mirna_chromosome_build.auto_mirna and \
mirna_rat.auto_mirna = mirna_pre_mature.auto_mirna and \
mirna_pre_mature.auto_mature = mirna_mature.auto_mature" \
> mirna.almostBed
#0.000u 0.010s 0:00.03 33.3% 0+0k 0+0io 211pf+0w
awk '{$5 = 960;} $6 == "-" {$5 = 480;} {$1 = "chr" $1; print;}' \
mirna.almostBed > miRNA.bed
hgLoadBed rn4 miRNA miRNA.bed
hgsql miRNAtmp -e 'drop database miRNAtmp'
# AFFYRAE230 (DONE 4/11/06 angie)
# Rachel already set up consensus sequences in
# /iscratch/i/affy/RAE230_all.fa - see makeRn3.doc.
# Align consensus sequences to genome.
ssh kk
mkdir /cluster/data/rn4/bed/affyRat.2006-04-11
cd /cluster/data/rn4/bed/affyRat.2006-04-11
ls -1 /iscratch/i/affy/RAE230_all.fa > affy.lst
ls -1S /panasas/store/rn4/ctgFa/* > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/rn4/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para make para.spec
para time
#Completed: 591 of 591 jobs
#CPU time in finished jobs: 10704s 178.40m 2.97h 0.12d 0.000 y
#IO & Wait Time: 9584s 159.73m 2.66h 0.11d 0.000 y
#Average job time: 34s 0.57m 0.01h 0.00d
#Longest finished job: 83s 1.38m 0.02h 0.00d
#Submission to last job: 270s 4.50m 0.07h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyRAE230.psl
pslSort dirs raw.psl tmp psl
# only use alignments that cover 30% of sequence and have at least
# 95% identity in aligned region. minAli = 0.97 too high.
# low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl \
/dev/null
liftUp affyRAE230.psl ../../jkStuff/liftAll.lft warn contig.psl
# shorten names in psl file
sed -e 's/RAE230//' affyRAE230.psl > affyRAE230.psl.bak
mv affyRAE230.psl.bak affyRAE230.psl
ssh hgwdev
cd /cluster/data/rn4/bed/affyRat.2006-04-11
# Rachel already set up /gbdb/hgFixed/affyProbes/RAE230_all.fa -
# see makeRn3.doc.
hgLoadSeq -abbr=RAE230 rn4 /gbdb/hgFixed/affyProbes/RAE230_all.fa
# load track into database
hgLoadPsl rn4 affyRAE230.psl
# Load knownToRAE230:
hgMapToGene rn4 affyRAE230 knownGene knownToRAE230
# Clean up
rm -r psl tmp err batch.bak contig.psl raw.psl seq.tab
# AFFYRGU34A (DONE 4/11/06 angie)
# RG-U34A is the chip used for the GNF Atlas 2 expression data for Rat
# Rachel already set up consensus sequences in
# /iscratch/i/affy/RG-U34A_all.fa -see makeRn3.doc.
# Align consensus sequences to genome.
ssh kk
mkdir /cluster/data/rn4/bed/affyU34A
cd /cluster/data/rn4/bed/affyU34A
ls -1 /iscratch/i/affy/RG-U34A_all.fa > affy.lst
ls -1 /panasas/store/rn4/ctgFa/* > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/rn4/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para make para.spec
#Completed: 591 of 591 jobs
#CPU time in finished jobs: 4911s 81.85m 1.36h 0.06d 0.000 y
#IO & Wait Time: 10019s 166.99m 2.78h 0.12d 0.000 y
#Average job time: 25s 0.42m 0.01h 0.00d
#Longest finished job: 64s 1.07m 0.02h 0.00d
#Submission to last job: 388s 6.47m 0.11h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU34A.psl
pslSort dirs raw.psl tmp psl
# only use alignments that cover 30% of sequence and have at least
# 95% identity in aligned region. minAli = 0.97 too high.
# low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl \
/dev/null
liftUp affyU34A.psl ../../jkStuff/liftAll.lft warn contig.psl
# shorten names in psl file
sed -e 's/RG-U34A://' affyU34A.psl > affyU34A.psl.bak
mv affyU34A.psl.bak affyU34A.psl
ssh hgwdev
cd /cluster/data/rn4/bed/affyU34A
# Rachel already set up /gbdb/hgFixed/affyProbes/RG-U34A_all.fa -
# see makeRn3.doc.
hgLoadSeq -abbr=RG-U34A: rn4 /gbdb/hgFixed/affyProbes/RG-U34A_all.fa
# load track into database
hgLoadPsl rn4 affyU34A.psl
# create and load knownToU34A
hgMapToGene rn4 affyU34A knownGene knownToU34A
# Clean up
rm -r psl tmp err batch.bak contig.psl raw.psl seq.tab
# MERGE GNF ATLAS 2 EXPRESSION DATA AND ALIGNMENT (DONE 4/13/06 angie)
ssh hgwdev
cd /cluster/data/rn4/bed/affyU34A
hgMapMicroarray gnfRatAtlas2.bed hgFixed.gnfRatAtlas2MedianRatio \
affyU34A.psl
#Loaded 8022 rows of expression data from hgFixed.gnfRatAtlas2MedianRatio
#Mapped 7447, multiply-mapped 216, missed 695, unmapped 575
hgLoadBed rn4 gnfAtlas2 gnfRatAtlas2.bed
rm bed.tab
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene rn4 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
hgExpDistance rn4 hgFixed.gnfRatAtlas2MedianRatio \
hgFixed.gnfRatAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2
# HGNEAR PROTEIN BLAST TABLES -- TARGET (DONE 4/12/06 angie)
ssh hgwdev
mkdir /cluster/data/rn4/bed/hgNearBlastp
cd /cluster/data/rn4/bed/hgNearBlastp
cat << _EOF_ > config.ra
# Latest rat vs. other Gene Sorter orgs:
# human, mouse, zebrafish, fly, worm, yeast
targetGenesetPrefix known
targetDb rn4
queryDbs hg18 mm8 danRer3 dm2 ce2 sacCer1
rn4Fa /cluster/data/rn4/bed/blastp/known.faa
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
ce2Fa /cluster/data/ce2/bed/blastp/wormPep151.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
buildDir /cluster/data/rn4/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/rn4HgNearBlastp
_EOF_
# Do -targetOnly until rn4 has been released -- then it will be OK to
# update rnBlastTab in the other species.
doHgNearBlastp.pl -targetOnly config.ra >& do.log & tail -f do.log
# *** Check these tables in rn4:
# *** knownBaseBlastTab hgBlastTab mmBlastTab drBlastTab dmBlastTab ceBlastTab scBlastTab
# HGNEAR PROTEIN BLAST TABLES -- QUERIES (DONE 6/26/06 angie)
ssh hgwdev
cd /cluster/data/rn4/bed/hgNearBlastp
# Do -queryOnly after rn4 has been released:
doHgNearBlastp.pl -queryOnly config.ra >& do.queries.log &
tail -f do.queries.log
# *** Check rnBlastTab in these databases:
# *** hg18 mm8 danRer3 dm2 ce2 sacCer1
# MORE HGNEAR TABLES (DONE 4/13/06 angie)
ssh hgwdev
cd /tmp
# Create table that maps between known genes and RefSeq
hgMapToGene rn4 refGene knownGene knownToRefSeq
# Create knownToEnsembl column -- redone 6/13/06 after ensGene update
hgMapToGene rn4 ensGene knownGene knownToEnsembl
# MGI?
# Recomb rate - no doc, but ls -l /cluster/bin/scripts/*ecomb* and see
# ftp://ftp.ncbi.nih.gov/genomes/R_norvegicus/Mapping_data/
# SNPS - TBD by Heather's new process.
#TODO after SNPS:
# Make knownToCdsSnp column.
hgMapToGene rn4 snpMap knownGene knownToCdsSnp -all -cds
# BLASTZ/CHAIN/NET XENTRO2 (DONE 4/19/06 angie)
ssh kkstore04
mkdir /cluster/data/rn4/bed/blastz.xenTro2.2006-04-19
cd /cluster/data/rn4/bed/blastz.xenTro2.2006-04-19
cat << '_EOF_' > DEF
# rat vs. frog
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
# Mammal to non-mammal, with threshold of 8000 as in mm8-xenTro2
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Rat rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Frog xenTro2 - single chunk big enough to run two of the
# largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/rn4/bed/blastz.xenTro2.2006-04-19
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/rn4XenTro2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
>& do.log & tail -f do.log
ln -s blastz.xenTro2.2006-04-19 /cluster/data/rn4/bed/blastz.xenTro2
#########################################################################
# BLASTZ CHICKEN galGal3 (DONE 5/25/06 angie)
ssh pk
mkdir /cluster/data/rn4/bed/blastz.galGal3.2006-05-24
cd /cluster/data/rn4/bed/blastz.galGal3.2006-05-24
cat << '_EOF_' > DEF
# rat vs chicken
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Rat rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInNonMammal
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/san/sanvol1/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.galGal3.2006-05-24
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/gg3vsrn4 \
-bigClusterHub=pk -smallClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose \
>& do.log & tail -f do.log
ln -s blastz.galGal3.2006-05-24 /cluster/data/rn4/bed/blastz.galGal3
#########################################################################
###### ADD gdbPdb ENTRY FOR rn4 INTO THE CENTRAL DB. (DONE, 6/1/06, Fan)
mysql -u hgcat -p$HGPSWD -h genome-testdb -A hgcentraltest
delete from gdbPdb where genomeDb='rn4';
insert into gdbPdb values('rn4', 'proteins060115');
###########################################################
#BUILD KEGG PATHWAY TABLES. DONE 6/12/06. Fan.
ssh hgwdev
cd /cluster/store11/kg/kgRn4A
mkdir kegg
cd kegg
~/src/hg/protein/KGpath.sh kgRn4A rn4 060115
# This script exited early due to SOAP module of perl seems missing
# So go to hgwdevold and manually ran the following:
~/src/hg/protein/getKeggList2.pl rno > keggList.tab
hgsql rn4 < ~/kent/src/hg/lib/keggList.sql
hgsql -e 'LOAD DATA local INFILE "keggList.tab" into table keggList;' rn4
# go back to hgwdev
hgKegg3 rn4 rn4
hgsql rn4 -e "drop table keggMapDesc"
hgsql rn4 -e "drop table keggPathway"
hgsql rn4 <~/src/hg/lib/keggMapDesc.sql
hgsql rn4 <~/src/hg/lib/keggPathway.sql
hgsql rn4 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
hgsql rn4 -e 'load data local infile "keggPathway.tab" into table keggPathway'
#########################################################################
# BUILD SOME KNOWNTO... TABLES, DONE 6/12/06 Fan.
cd /cluster/data/rn4/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene rn4 refGene knownGene knownToRefSeq
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" rn4 > refToLl.txt
hgMapToGene rn4 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt rn4 knownGene name proteinID Pfam knownToPfam
hgsql -e "select count(*) from knownToPfam;" rn4
#########################################################################
# BLASTZ/CHAIN/NET DANRER4 - NO LINSPECREP (DONE, 2006-06-19, hartera)
ssh pk
mkdir /cluster/data/rn4/bed/blastz.danRer4.2006-06-19
cd /cluster/data/rn4/bed/blastz.danRer4.2006-06-19
# There are no lineage-specific repeats for these two species. All repeats
# were not used as this was found to lower coverage with rn4 and danRer3
# so use dynamic masking instead (M parameter for Blastz).
cat << '_EOF_' > DEF
# rat (rn4) vs zebrafish (danRer4)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=0
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Rat Rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer4)
# large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer4/nib
SEQ2_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
SEQ2_CHUNK=250000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.danRer4.2006-06-19
'_EOF_'
# << for emacs
chmod +x DEF
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF >& do.log &
tail -f do.log
# Took 2 hours and 50 minutes to run.
ln -s blastz.danRer4.2006-06-19 /cluster/data/rn4/bed/blastz.danRer4
# Do swap to create danRer4 vs rn4:
ssh pk
cd /cluster/data/rn4/bed/blastz.danRer4.2006-06-19
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& swap.log &
# Took about 10 minutes to run.
rm *.log
# Check README.txt is correct for alignment downloads.
featureBits rn4 chainDanRer4Link
# 67765280 bases of 2571531505 (2.635%) in intersection
featureBits rn4 refGene:cds chainDanRer4Link -chrom=chr1 -enrichment
# refGene:cds 0.731%, chainDanRer4Link 3.337%, both 0.518%,
# cover 70.86%, enrich 21.24x
featureBits rn4 refGene:cds chainDanRer3Link -chrom=chr1 -enrichment
# refGene:cds 0.731%, chainDanRer3Link 2.699%, both 0.475%,
# cover 64.89%, enrich 24.04x
featureBits rn4 refGene:cds netDanRer4 -chrom=chr1 -enrichment
# refGene:cds 0.731%, netDanRer4 29.999%, both 0.575%, cover 78.65%,
# enrich 2.62x
featureBits rn4 refGene:cds netDanRer3 -chrom=chr1 -enrichment
# refGene:cds 0.731%, netDanRer3 25.934%, both 0.530%, cover 72.52%,
# enrich 2.80x
#########################################################################
#### REBUILD kgProtMap TABLE (DONE. 6/19/06 Fan)
# Existing kgProtMap table only had 33 entries.
ssh kolossus
cd /cluster/store9/rn4/bed/kgRn4A
~/kent/src/hg/protein/kgProtMap2.sh kgRn4A rn4 060115
cd kgProtMap
hgLoadPsl -tNameIx rn4 psl.tmp/kgProtMap.psl
########################################################################
#### BUILD SUPERFAMILY RELATED TABLES (DONE - 2006-06-19 - Fan)
# Download latest Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk
mkdir /cluster/store10/superfamily/060619
ln -s /cluster/store10/superfamily/060619 /cluster/data/superfamily/060619
cd /cluster/data/superfamily/060619
# ftp over the following two files:
ass_18-Jun-2006.tab.gz
supfam_18-Jun-2006.sql.gz
gzip -d *.gz
# Load the Superfamily database
hgsql rn4 -e "create database superfam060619"
nice hgsql superfam060619 < supfam_18-Jun-2006.sql &
# This may take about an hour.
# Make sure to add an index on id of the des table of superfam060619.
hgsql superfam060619 -e "create index id on des(id);"
hgsql superfam060619 < ~/src/hg/lib/sfAssign.sql
hgsql superfam060619 -e 'load data local infile "ass_18-Jun-2006.tab" into table superfam060619.sfAssign;'
# Build or rebuild Superfamily track and create sf tables needed for PB
hgsql rn4 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/060619
hgsql rn4 -e 'load data local infile "ass_18-Jun-2006.tab" into table rn4.sfAssign;'
# If rn4.sfDes already exists, drop it.
hgsql superfam060619 -N -e "select * from des" >sfDes.tab
hgsql rn4 < ~/src/hg/lib/sfDes.sql
hgsql rn4 -e 'load data local infile "sfDes.tab" into table sfDes'
# Build ensemblXref3
# Get the ensembl gene/protein cross-reference data from Ensembl BioMart
# http://www.ensembl.org/Multi/martview
# Follow this sequence through the pages:
# Page 1) Select Ensembl39 and Rat. Hit next.
# Page 2) Do not select anything. Hit next.
# Page 3) Choose the "Feature" box, select Ensembl gene ID, transcript ID, peptide ID,
UniProt/TrEMBL ID, UniProt/SWISSPROT ID, and UniProt/SWISSPROT Accession
# Page 4) Choose "Text, tab separated". choose gzip compression. hit export.
# Save as ensXref.tsv.gz
ssh hgwdev
cd /cluster/data/rn4/bed/sf
hgsql rn4 < ~/src/hg/lib/ensemblXref3Temp.sql
hgsql rn4 -e \
'load data local infile "ensXref.tsv" into table ensemblXref3Temp ignore 1 lines'
hgsql rn4 -N -e \
'select gene, "0", transcript, "0", protein, "0", tremblAcc, swissDisplayId, swissAcc from ensemblXref3Temp' \
> ensemblXref3.tab
hgsql rn4 -e 'drop table ensemblXref3'
hgsql rn4 <~/src/hg/lib/ensemblXref3.sql
hgsql rn4 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3'
# If rn4.superfamily already exists, drop it.
cd /cluster/data/rn4/bed
mkdir /cluster/data/rn4/bed/sf.20060619
ln -s sf.20060619 sf
cd sf
hgSuperfam rn4 superfam060619 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If rn4.sfDescription exists, drop it.
hgsql rn4 < ~/src/hg/lib/sfDescription.sql
hgsql rn4 -e 'LOAD DATA local INFILE "sfDescription.tab" into table rn4.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed rn4 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/060619/ass_18-Jun-2006.tab \
| hgKnownToSuper rn4 rn stdin
# created 25287 rows in knownToSuper
# Build tables needed by pbGlobal in proteins060115
cd /cluster/data/superfamily/060619
hgsql proteins060115 -e 'delete from sfAssign'
hgsql proteins060115 -e 'delete from sfDes'
hgsql proteins060115 -e 'load data local infile "ass_18-Jun-2006.tab" into table sfAssign'
hgsql proteins060115 -e 'load data local infile "sfDes.tab" into table sfDes'
########################################################################
# Build rn4 PROTEOME BROWSER TABLES (DONE 6/20/06, Fan)
# These are instructions for building tables
# needed for the Proteome Browser.
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.
# This build is based on proteins DBs dated 060115.
# Create the working directory
ssh hgwdev
mkdir /cluster/store11/kg/kgRn4A/pb-2006-06-19
cd /cluster/data/rn4/bed
rm pb
ln -s /cluster/store11/kg/kgRn4A/pb-2006-06-19 pb
cd pb
# Define pep* tables in rn4 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql rn4 < pepAll.sql
# Build the pepMwAa table
hgsql proteins060115 -N -e \
"select info.acc, molWeight, aaSize from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10116 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql rn4 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'
# Build the pepPi table
hgsql proteins060115 -e \
"select info.acc from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10116 and accToTaxon.acc = info.acc" > protAcc.lis
hgsql rn4 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis
pbCalPi protAcc.lis sp060115 pepPi.tab
hgsql rn4 -e 'delete from pepPi'
hgsql rn4 -e 'load data local infile "pepPi.tab" into table rn4.pepPi'
# Calculate and load pep distributions
pbCalDist sp060115 proteins060115 10116 rn4 >pbCalDist.out
wc pbCalDist.out
hgsql rn4
load data local infile "pepExonCntDist.tab" into table rn4.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table rn4.pepCCntDist;
load data local infile "pepHydroDist.tab" into table rn4.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table rn4.pepMolWtDist;
load data local infile "pepResDist.tab" into table rn4.pepResDist;
load data local infile "pepIPCntDist.tab" into table rn4.pepIPCntDist;
load data local infile "pepPiDist.tab" into table rn4.pepPiDist;
quit
# Calculate frequency distributions
pbCalResStd sp060115 10116 rn4
# Create pbAnomLimit and pbResAvgStd tables
hgsql rn4 -e "drop table pbAnomLimit"
hgsql rn4 -e "drop table pbResAvgStd"
hgsql rn4 < ~/src/hg/lib/pbAnomLimit.sql
hgsql rn4 < ~/src/hg/lib/pbResAvgStd.sql
hgsql rn4 -e 'load data local infile "pbResAvgStd.tab" into table rn4.pbResAvgStd;'
hgsql rn4 -e 'load data local infile "pbAnomLimit.tab" into table rn4.pbAnomLimit;'
# Create pbStamp table for PB
hgsql rn4 -e "drop table pbStamp"
hgsql rn4 < ~/src/hg/lib/pbStamp.sql
hgsql rn3 -N -e 'select * from pbStamp' > pbStamp.tab
hgsql rn4 -e 'delete from pbStamp'
hgsql rn4 -e 'load data local infile "pbStamp.tab" into table rn4.pbStamp'
# Turn on Proteome Browser for rn4.
hgsql -e 'update dbDb set hgPbOk=1 where name="rn4"' \
-h genome-testdb hgcentraltest
# Adjust drawing parameters for Proteome Browser stamps
# Now invoke Proteome Browser and adjust various drawing parameters
# (mostly the ymax of each stamp) if necessary, by updating the
# pbStamp.tab file and then delete and reload the pbStamp table.
hgsql rn4 -e "drop table pbStamp"
hgsql rn4 < ~/src/hg/lib/pbStamp.sql
hgsql rn4 -e 'load data local infile "pbStamp.tab" into table rn4.pbStamp'
# Perform preliminary review of Proteome Browser for rn4, then
# notify QA for formal review.
################################################################################
# BUILD KNOWN GENE LIST FOR GOOGLE. (DONE, 6/6/06, Fan)
# make knownGeneLists.html rn4GeneList.html mm5GeneList.html rn3GeneList.html
cd /cluster/data/rn4/bed
rm -rf knownGeneList/rn4
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/rn4
hgKnownGeneList rn4
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/rn4
mkdir -p /usr/local/apache/htdocs/knownGeneList/rn4
cp -Rfp knownGeneList/rn4/* /usr/local/apache/htdocs/knownGeneList/rn4
################################################################################
##########################################################################
# N-SCAN gene predictions (nscanGene) - (2006-09-11 markd)
cd /cluster/data/rn4/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
wget -nv -r -l 1 -np --accept=gtf http://ardor.wustl.edu/predictions/rat/rn4/chr_gtf
wget -nv -r -l 1 -np --accept=fa http://ardor.wustl.edu/predictions/rat/rn4/chr_ptx
# clean up downloaded directories and compress files.
mv ardor.wustl.edu/predictions/rat/rn4/* .
rm -rf ardor.wustl.edu
gzip chr_*/*
chmod a-w chr_*/*.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt rn4 nscanGene chr_gtf/chr*.gtf.gz
# load protein, add .a suffix to match transcript id
hgPepPred -suffix=.a rn4 generic nscanPep chr_ptx/chr*.fa.gz
rm *.tab
# update trackDb; need a rn4-specific page to describe informants
#   rat/rn4/nscanGene.html (copy from mm8 and edit)
#   rat/rn4/trackDb.ra
###########################################################################
# dbSNP BUILD 125 (Heather, September 2006)
# We didn't have ctgPos.
# dbSNP build 125 uses chrom coords (phys_pos_from and phys_pos) in ContigLoc
# but not for chrUn. chrUn has contig coords only, and without ctgPos
# for supercontigs, I can't translate the chrUn contig coords into chrom coords.
# Also note: rn4 has no chrY, and there were no SNPs for chrM.
# Not having ctgPos also meant I couldn't process the new locTypes
# (4,5,6: rangeInsertion, rangeSubstitution, rangeDeletion)
# because those are also only given in contig coords
# These were a small number.
# Set up directory structure
ssh kkstore02
cd /cluster/data/dbSNP/125/organisms/rat_10116
mkdir data
mkdir schema
mkdir rs_fasta
# Get data from NCBI (anonymous FTP)
cd /cluster/data/dbSNP/125/organisms/rat_10116/data
ftp ftp.ncbi.nih.gov
cd snp/organisms/rat_10116/database/organism_data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
get b125_SNPContigLoc_3_1.bcp.gz
# ContigLocusId has function
get b125_SNPContigLocusId_3_1.bcp.gz
get b125_ContigInfo_3_1.bcp.gz
# MapInfo has alignment weights
get b125_SNPMapInfo_3_1.bcp.gz
# SNP has univar_id, validation status and heterozygosity
get SNP.bcp.gz
# Get schema from NCBI
cd /cluster/data/dbSNP/125/organisms/rat_10116/schema
ftp ftp.ncbi.nih.gov
cd snp/organisms/rat_10116/database/organism_schema
get rat_10116_table.sql.gz
# Get fasta files from NCBI
# using headers of fasta files for molType
cd /cluster/data/dbSNP/125/organisms/rat_10116/rs_fasta
ftp ftp.ncbi.nih.gov
cd snp/organisms/rat_10116/rs_fasta
prompt
mget *.gz
# add rs_fasta to seq/extFile
# 2 edits first: strip header to just rsId, and remove duplicates
# work on /cluster/store12 (kkstore05) which has more disk space
cp rs_ch*.fas.gz /cluster/store12/snp/125/rat/rs_fasta
ssh kkstore05
cd /cluster/store12/snp/125/rat/rs_fasta
# concat into rsAll.fas
cat << '_EOF_' > concat.csh
#!/bin/csh -ef
rm -f rsAll.fas
foreach file (rs_ch*.fas)
echo $file
zcat $file >> rsAll.fas
end
'_EOF_'
# << emacs
#snpCleanSeq strips the header and skips duplicates
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa
rm rsAll.fas
# load on hgwdev
ssh hgwdev
mkdir /gbdb/rn4/snp
ln -s /cluster/store12/snp/125/rat/rs_fasta/snp.fa /gbdb/rn4/snp/snp.fa
cd /cluster/store12/snp/125/rat/rs_fasta
hgLoadSeq rn4 /gbdb/rn4/snp/snp.fa
# look up id in extFile
hgsql rn4 < snpSeq.sql
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 420007' rn4
hgsql -e 'delete from seq where extFile = 420007' rn4
hgsql -e 'alter table snpSeq add index acc (acc)' rn4
# clean up after hgLoadSeq
rm seq.tab
# Simplify names of data files
cd /cluster/data/dbSNP/125/organisms/rat_10116/data
mv b125_ContigInfo_3_1.bcp.gz ContigInfo.gz
mv b125_SNPContigLoc_3_1.bcp.gz ContigLoc.gz
mv b125_SNPContigLocusId_3_1.bcp.gz ContigLocusId.gz
mv b125_SNPMapInfo_3_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz
ls -1 *.gz > filelist
# edit table descriptions
cd /cluster/data/dbSNP/125/organisms/rat_10116/schema
# get CREATE statements from rat_10116_table.sql for our 5 tables
# store in table.tmp
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp
# get header lines from rs_fasta
cd /cluster/data/dbSNP/125/organisms/rat_10116/rs_fasta
/bin/csh gnl.csh
# load on kkr5u00
ssh kkr5u00
hgsql -e 'create database rn4snp125'
cd /cluster/data/dbSNP/125/organisms/rat_10116/schema
hgsql rn4snp125 < table.sql
cd ../data
/bin/csh load.csh
# note rowcount
# ContigLoc 94945
# SNP 41658
# MapInfo 41658
# ContigLocusId 32954
# Get UniVariation from 126 (recently downloaded for mm8snp126)
cd /cluster/data/dbSNP/126/shared
hgsql rn4snp125 < UniVariation.sql
zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' rn4snp125
# create working /scratch dir
cd /scratch/snp/125
mkdir rat
cd rat
# no rn4.ctgPos table to compare against
# get gnl files
cp /cluster/data/dbSNP/125/organisms/rat_10116/rs_fasta/*.gnl .
# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_assembly"
# filter ContigLoc into ContigLocFilter
# this lifts from contig coords to chrom coords
# phys_pos_from is used to check coords for non-random chroms
# errors reported to stdout
# this gets rid of alternate assemblies (using ContigInfo)
# this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo)
# assumes all contigs are positively oriented; will abort if not true
mysql> desc ContigLocFilter;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromName | varchar(32) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | start | int(11) | NO | | | |
# | end | int(11) | YES | | NULL | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter125 rn4snp125 ref_assembly RGSC_v3.4.0
# note rowcount
# ContigLocFilter 45735
# how many are positive strand? hopefully 90%
# we get less than half here!
mysql> select count(*) from ContigLocFilter where orientation = 0;
# 23353
# note count by loc_type
mysql> select count(*), loc_type from ContigLocFilter group by loc_type;
# +----------+----------+
# | count(*) | loc_type |
# +----------+----------+
# | 57 | 1 |
# | 45482 | 2 |
# | 196 | 3 |
# +----------+----------+
# filter ContigLocusId into ContigLocusIdFilter
# need to filter on contig_acc because ctg_id = 0 for all rows in ContigLocusId
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter125 rn4snp125 ref_assembly
# note rowcount
# 32954 (same as ContigLocusId)
# condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions)
# assumes SNPs are in numerical order; will errAbort if not true
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense rn4snp125
# note rowcount; expect about 50% (ascertainment bias for SNPs within genes)
# actually we get about 80% here
# ContigLocusIdCondense 23754
# could delete ContigLocusIdFilter table here
# create chrN_snpFasta tables from *.gnl files
# we are just using molType, but also storing class and observed
# need chromInfo for this
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta rn4snp125
# (could start using pipeline.csh here)
# (pipeline.csh takes about 5 minutes to run)
# split ContigLocFilter by chrom
# create the first chrN_snpTmp
# we will reuse this table name, adding/changing columns as we go
# at this point chrN_snpTmp will have the same description as ContigLocFilter
# this opens a file handle for every chrom, so will not scale to scaffold-based assemblies
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom rn4snp125 ref_assembly
# check coords using loc_type
# possible errors logged to snpLocType.error:
# Unknown locType
# Between with allele != '-'
# Exact with end != start + 1
# Range with end < start
# possible exceptions logged to snpLocType.exceptions:
# RefAlleleWrongSize
# This run no errors, no exceptions
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype125 rn4snp125 ref_assembly
# expand allele as necessary
# report syntax errors to snpExpandAllele.errors
# possible exceptions logged to snpExpandAllele.exceptions:
# RefAlleleWrongSize
# This run just 1 error
# 0 alleles expanded
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele rn4snp125 ref_assembly
# the next few steps prepare for working in UCSC space
# sort by position
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort rn4snp125 ref_assembly
# get nib files
ssh hgwdev
cd /cluster/data/rn4
cp rn4.2bit /cluster/home/heather/transfer/snp/rn4
ssh kkr5u00
cd /scratch/snp/125/rat
cp /cluster/home/heather/transfer/snp/rn4/rn4.2bit .
# edit path in chromInfo, load into rn4snp125 with edited path
# lookup reference allele in nibs
# keep reverse complement to use in error checking (snpCheckAlleles)
# check here for SNPs larger than 1024
# errAbort if detected
# check for coords that are too large, log to snpRefUCSC.error and skip
# This run no errors
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC rn4snp125
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +--------------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# +--------------------+-------------+------+-----+---------+-------+
# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAllele.exceptions
# if SNP is positive strand, expect allele == refUCSC
# log RefAlleleMismatch if not
# if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp
# If allele == refUCSCRevComp, log RefAlleleNotRevComp
# If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
# 0 RefAlleleMismatch
# 1830 RefAlleleNotRevComp
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles rn4snp125
# add class and observed using univar_id from SNP table
# to get class (subsnp_class) and observed (var_str) from UniVariation
# log errors to snpClassAndObserved.errors
# errors detected:
# class = 0 in UniVariation
# class > 8 in UniVariation
# univar_id = 0 in SNP
# no row in SNP for snp_id in chrN_snpTmp
# This run we got:
# 3 class = 0 in UniVariation
# 0 class > 8 in UniVariation
# 0 univar_id = 0 in SNP
# 0 no row in SNP for snp_id in chrN_snpTmp
# dbSNP has class = 'in-del'
# we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved rn4snp125
# morph chrN_snpTmp
# +--------------------+---------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+---------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | class | varchar(255) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# | observed | blob | YES | | NULL | |
# +--------------------+---------------+------+-----+---------+-------+
# generate exceptions for class and observed
# SingleClassBetweenLocType
# SingleClassRangeLocType
# NamedClassWrongLocType
# ObservedWrongFormat
# ObservedWrongSize
# ObservedMismatch
# RangeSubstitutionLocTypeExactMatch
# SingleClassTriAllelic
# SingleClassQuadAllelic
# This will also detect IUPAC symbols in allele
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved rn4snp125
# add function
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction rn4snp125
# add validation status and heterozygosity
# log error if validation status > 31 or missing
# no errors this run
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP rn4snp125
# add molType
# errors detected: missing or duplicate molType
# this run no errors
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype rn4snp125
# generate chrN_snp125 and snp125Exceptions tables
cp snpCheckAlleles.exceptions snpCheckAlleles.tab
cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
cp snpExpandAllele.exceptions snpExpandAllele.tab
cp snpLocType.exceptions snpLocType.tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable rn4snp125 125
# concat into snp125.tab
# cat chr*_snp125.tab >> snp125.tab
/bin/sh concat.sh
# check for multiple alignments
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple rn4snp125 125
mysql> load data local infile 'snpMultiple.tab' into table snp125Exceptions;
# load on hgwdev
cp snp125.tab /cluster/home/heather/transfer/snp/rn4
hgsql rn4snp125 -e 'select * from snp125Exceptions' > /cluster/home/heather/transfer/snp/rn4/snp125Exceptions.tab
ssh hgwdev
cd transfer/snp/rn4
mysql> load data local infile 'snp125.tab' into table snp125;
mysql> load data local infile 'snp125Exceptions.tab' into table snp125Exceptions;
# create indexes
mysql> alter table snp125 add index name (name);
mysql> alter table snp125 add index chrom (chrom, bin);
mysql> alter table snp125Exceptions add index name(name);
# create snp125ExceptionDesc table
cd /cluster/data/dbSNP
hgsql rn4 < snp125ExceptionDesc.sql
# add counts to exception.human.125, can start with exception.template
hgsql -e 'select count(*), exception from snp125Exceptions group by exception' rn4snp125
mysql> load data local infile 'exception.rn4.snp125' into table snp125ExceptionDesc;
mysql> select count(*), exception from snp125Exceptions group by exception;
+----------+---------------------------+
| count(*) | exception |
+----------+---------------------------+
| 15252 | MultipleAlignments |
| 517 | ObservedMismatch |
| 1 | ObservedWrongFormat |
| 1 | ObservedWrongSize |
| 1830 | RefAlleleNotRevComp |
| 1 | RefAlleleWrongSize |
| 194 | SingleClassBetweenLocType |
| 1 | SingleClassQuadAllelic |
| 48 | SingleClassRangeLocType |
| 92 | SingleClassTriAllelic |
+----------+---------------------------+
##########################################################################
# xxBlastTab - Help filter out unwanted paralogs (Galt 2007-01-11)
#
# We are starting with xxBlastTab tables already built in the usual way with
# blastall/blastp, probably with doHgNearBlastp.pl script.
#
# we want to update rn4 for human and mouse,
# so check ./hgGeneData/Rat/rn4/otherOrgs.ra for current settings
ssh hgwdev
synBlastp.csh rn4 hg18
#rn4.hgBlastTab:
#new number of unique query values:
#7379
#new number of unique target values
#6373
#old number of unique query values:
#7977
#old number of unique target values
#6657
synBlastp.csh rn4 mm8
#rn4.mmBlastTab:
#new number of unique query values:
#7397
#new number of unique target values
#6502
#old number of unique query values:
#7988
#old number of unique target values
#6788
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna rn4
###########################################################################
# REVERSE LIFTOVER CHAINS RN4 -> RN3 (DONE 2007-02-09 kate)
# copy rn3 2bit to cluster-friendly dir
ssh kkstore06
cp -p /cluster/data/rn3/rn3.2bit /san/sanvol1/scratch/rn3
# create the directory and scripts
ssh kkstore04
cd /cluster/data/rn4
doSameSpeciesLiftOver.pl rn4 rn3 -debug -bigClusterHub pk
# go to created dir and run for real
cd /cluster/data/rn4/bed/blat.rn3.2007-02-09
doSameSpeciesLiftOver.pl rn4 rn3 -bigClusterHub pk >& do.log & tail -f do.log
# failed for unknown region during chainNet of chr4. Error message
# indicated the file /cluster/data/rn3/chrom.sizes couldn't be found.
# I extracted the remaining steps from the doNet.csh file and ran those.
doNet.csh >&! doNet.log
doSameSpeciesLiftOver.pl rn4 rn3 -continue load -bigClusterHub pk >& doLoad.log
###########################################################################
# QUALITY SCORES (DONE 2/22/07 angie)
ssh kolossus
nice tcsh
mkdir /cluster/data/rn4/bed/qual
cd /cluster/data/rn4/bed/qual
zcat ../../downloads/*.qual.gz \
| sed -re 's/^>gnl\|Rnor3.[0-9_]+\|/>/' \
| qaToQac stdin stdout \
| qacToWig -fixed stdin stdout \
| wigEncode stdin qual.{wig,wib}
#Made 1 .wig files in stdout
#Converted stdin, upper limit 99.00, lower limit 0.00
ssh hgwdev
cd /cluster/data/rn4/bed/qual
rm -f /gbdb/rn4/wib/qual.wib
ln -s /cluster/data/rn4/bed/qual/qual.wib /gbdb/rn4/wib/
hgLoadWiggle -pathPrefix=/gbdb/rn4/wib rn4 quality qual.wig
##########################################################################
# BLASTZ/CHAIN/NET HORSE AND RAT (DONE 2/18/07 Fan)
ssh kkstore05
mkdir /cluster/data/equCab1/bed/blastz.rn4.2007-02-18
cd /cluster/data/equCab1/bed/blastz.rn4.2007-02-18
cat << '_EOF_' > DEF
# Horse vs. Rat
BLASTZ_M=50
# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500
SEQ1_CHUNK=30000000
SEQ1_LAP=10000
# QUERY: Rat rn4
SEQ2_DIR=/scratch/hg/rn4/nib
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/equCab1/bed/blastz.rn4.2007-02-18
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub pk \
-blastzOutRoot /cluster/bluearc/equCab1Rn4 >& do.log &
tail -f do.log
ssh hgwdev
cd /cluster/data/equCab1/bed
ln -s blastz.rn4.2007-02-18 /cluster/data/equCab1/bed/blastz.rn4
nice featureBits equCab1 -chrom=chr1 chainRn4Link
# 67408374 bases of 177498097 (37.977%) in intersection
bash
time nice -n 19 featureBits equCab1 chainRn4Link \
> fb.equCab1.chainRn4Link.txt 2>&1
# 859116859 bases of 2421923695 (35.472%) in intersection
ssh kkstore05
mkdir /cluster/data/rn4/bed/blastz.equCab1.swap
cd /cluster/data/rn4/bed/blastz.equCab1.swap
bash
time doBlastzChainNet.pl \
/cluster/data/equCab1/bed/blastz.rn4.2007-02-18/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-verbose=2 -swap -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/equCab1Rn4 > swap.log 2>&1 &
ssh hgwdev
cd /cluster/data/rn4/bed/blastz.equCab1.swap
bash
time nice -n 19 featureBits rn4 chainEquCab1Link \
> fb.rn4.chainEquCab1Link.txt 2>&1
# 867029914 bases of 2571531505 (33.716%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET BOSTAU3 (Done March 2007 heather)
mkdir /cluster/data/rn4/bed/blastz.bosTau3.2007-03-25
ln -s /cluster/data/rn4/bed/blastz.bosTau3.2007-03-25 /cluster/data/rn4/bed/blastz.bosTau3
cd /cluster/data/rn4/bed/blastz.bosTau3
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Rat rn4
SEQ1_DIR=/scratch/hg/rn4/nib
SEQ1_LEN=/scratch/hg/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau3
SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit
SEQ2_LEN=/san/sanvol1/scratch/bosTau3/chrom.sizes
SEQ2_LIMIT=500
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastz.bosTau3.2007-03-25
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/bosTau3/blastz.rn4 >& do.log &
tail -f do.log
nice featureBits -chrom=chr1 rn4 chainBosTau3Link
# 61426292 bases of 243058842 (25.272%) in intersection
#########################################################################
# BLASTZ Mouse mm9 swap (DONE - 2007-09-11 - Hiram)
# the original
ssh kkstore06
cd /cluster/data/mm9/bed/blastz.rn4
cat fb.mm9.chainRn4Link.txt
# 1713186474 bases of 2620346127 (65.380%) in intersection
# and the swap
mkdir /cluster/data/rn4/bed/blastz.mm9.swap
cd /cluster/data/rn4/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzRn4.2007-08-31/DEF \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
-swap -syntenicNet > swap.out 2>&1 &
#########################################################################
# RGD QTL (DONE 9/24/07 angie -- RELOADED 9/26/07)
ssh kkstore04
mkdir /cluster/data/rn4/bed/rgdQtl
cd /cluster/data/rn4/bed/rgdQtl
wget ftp://rgd.mcw.edu:21/pub/data_release/QTLS
# Make bed4 and rgdQtlLink:
perl -we 'open(BED, ">rgdQtl.bed") || die; \
open(LINK, ">rgdQtlLink.txt") || die; \
while (<>) { \
chomp; my @w = split("\t"); \
next unless ($w[1] =~ /^rat/ && $w[15]); \
$w[5] =~ s/^/chr/; \
$w[15] =~ s/^([-\d]+).*$/$1/ || die "parse start pos"; \
$w[16] =~ s/^(\d+).*$/$1/ || die "parse end pos"; \
if ($w[15] > $w[16]) { \
$tmp = $w[15]; $w[15] = $w[16]; $w[16] = $tmp; \
} \
$w[15]--; \
$w[15] = 0 if ($w[15] < 0); \
print BED "$w[5]\t$w[15]\t$w[16]\t$w[2]\n"; \
print LINK "$w[0]\t$w[2]\t$w[3]\n"; \
} \
close(BED); close(LINK);' \
QTLS
ssh hgwdev
cd /cluster/data/rn4/bed/rgdQtl
# 9/26/07: RGD said that the duplicate Gluco13 was supposed to be
# Gluco13 + Gluco31. Edited rgdQtl.bed to change Gluco13 -> 31 for
# the chr15 copy, and rgdQtlLink.txt for id 631516.
hgLoadBed rn4 rgdQtl rgdQtl.bed
hgLoadSqlTab rn4 rgdQtlLink ~/kent/src/hg/lib/rgdQtlLink.sql rgdQtlLink.txt
# Fix coords > chromSize:
perl -wpe 's/^(\w+)\t(\d+)$/ \
delete from rgdQtl where chrom="$1" and chromStart >= $2; \
update rgdQtl set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \
../../chrom.sizes \
| hgsql rn4
checkTableCoords -verbose=2 rn4 rgdQtl
runJoiner.csh rn4 rgdQtl
#############################################################################
# CONTRAST GENES (2007-10-02 markd)
# received predictions from Sam Gross <ssgross@stanford.edu>
cd /cluster/data/rn4/bed/contrastGene/
wget http://www.stanford.edu/~ssgross/contrast.rn4.bed
# this is a custom track, not a pure BED
tail +2 contrast.rn4.bed | hgLoadBed -tab rn4 contrastGene stdin
# verify
# load track db (ra and contrastGene.html are global
# request push of contrastGene
###########################################################################
# loading affy rat Exon probes and transcripts (DONE - 2007-10-04 - Hiram)
# data was supplied from Venu Valmeekam Venu_Valmeekam@affymetrix.com
# dropped via FTP to genome-test
ssh hgwdev
mkdir /cluster/data/rn4/bed/affyRaEx1
cd /cluster/data/rn4/bed/affyRaEx1
# the files received:
# -rw-r--r-- 1 10151819 Oct 3 10:53 transcript_cluster_rn.bed.gz
# -rw-r--r-- 1 42240033 Oct 4 13:32 probe_rn_score.bed.gz
# loading:
hgLoadBed -tmpDir=/scratch/tmp rn4 affyRaEx1Probe probe_rn_score.bed.gz
# Loaded 4025442 elements of size 6
hgLoadBed -tmpDir=/scratch/tmp rn4 affyRaEx1Transcript \
transcript_cluster_rn.bed.gz
# Loaded 360668 elements of size 12
# working on description pages for these with Venu.
# I manually set the scores in the affyRaEx1Transcript track to
# 1000 so it would work OK (not color) with the useScore 1 so that
# the affyRaEx1Probe would color itself on the score
#########################################################################
# EXONIPHY Rn4, lifted from hg18 (DONE - 2007-11-19 - Hiram)
# needed for ucscGenes10 building
# create a syntenic liftOver chain file
ssh kolossus
cd /cluster/data/hg18/bed/blastz.rn4/axtChain
time nice -n +19 netFilter -syn hg18.rn4.net.gz \
| netChainSubset -verbose=0 stdin hg18.rn4.all.chain.gz stdout \
| chainStitchId stdin stdout | gzip -c > hg18.rn4.syn.chain.gz
# real 5m10.947s
# slightly smaller than the ordinary liftOver chain file:
# -rw-rw-r-- 1 73768012 Feb 17 2006 hg18.rn4.over.chain.gz
# -rw-rw-r-- 1 69198780 Nov 19 14:13 hg18.rn4.syn.chain.gz
# exoniphyRn4.gp is prepared as follows
ssh hgwdev
mkdir /cluster/data/rn4/bed/exoniphy
cd /cluster/data/rn4/bed/exoniphy
hgsql hg18 -N -e "select * from exoniphy" > exoniphyHg18.gp
time nice -n +19 liftOver -genePred exoniphyHg18.gp \
/cluster/data/hg18/bed/blastz.rn4/axtChain/hg18.rn4.syn.chain.gz \
exoniphyRn4.gp unmapped
# real 32m0.405s
wc -l *
# 178162 exoniphyHg18.gp
# 166859 exoniphyRn4.gp
# 22606 unmapped
ssh hgwdev
cd /cluster/data/rn4/bed/exoniphy
nice -n +19 hgLoadGenePred -genePredExt rn4 exoniphy exoniphyRn4.gp
nice -n +19 featureBits rn4 exoniphy
# 24959589 bases of 2571531505 (0.971%) in intersection
nice -n +19 featureBits mm9 exoniphy
# 25931742 bases of 2620346127 (0.990%) in intersection
nice -n +19 featureBits mm8 exoniphy
# 25952211 bases of 2567283971 (1.011%) in intersection
############################################################################
# SGP GENES (DONE - 2007-12-20 - Hiram)
ssh kkstore04
mkdir /cluster/data/rn4/bed/sgp
cd /cluster/data/rn4/bed/sgp
mkdir gtf
for C in `awk '{print $1}' /cluster/data/rn4/chrom.sizes`
do
wget --timestamping \
"http://genome.imim.es/genepredictions/R.norvegicus/rnNov2004/SGP/humangp200603/${C}.gtf" \
-O "gtf/${C}.gtf"
done
ssh hgwdev
cd /cluster/data/rn4/bed/sgp
ldHgGene -gtf -genePredExt rn4 sgpGene gtf/chr*.gtf
# Read 36569 transcripts in 286299 lines in 45 files
# 36569 groups 44 seqs 1 sources 3 feature types
# 36569 gene predictions
featureBits rn4 sgpGene
# 36219236 bases of 2571531505 (1.408%) in intersection
featureBits rn4 -enrichment refGene:CDS sgpGene
# refGene:CDS 0.770%, sgpGene 1.408%, both 0.672%, cover 87.35%, enrich 62.02x
#####################################################################
# LOAD GENEID GENES (DONE - 2007-12-20 - Hiram)
ssh kkstore04
mkdir -p /cluster/data/rn4/bed/geneid
cd /cluster/data/rn4/bed/geneid
mkdir gtf prot
for C in `awk '{print $1}' /cluster/data/rn4/chrom.sizes`
do
echo $C
wget --timestamping \
"http://genome.imim.es/genepredictions/R.norvegicus/rnNov2004/geneid_v1.2/${C}.gtf" \
-O "gtf/${C}.gtf"
wget --timestamping \
"http://genome.imim.es/genepredictions/R.norvegicus/rnNov2004/geneid_v1.2/${C}.prot" \
-O "prot/${C}.prot"
done
# Add missing .1 to protein id's
cd prot
foreach f (*.prot)
perl -wpe 's/^(>chr\S+)$/$1.1/' $f > $f:r-fixed.prot
end
ssh hgwdev
cd /cluster/data/rn4/bed/geneid
ldHgGene -genePredExt -gtf rn4 geneid gtf/*.gtf
# Read 38143 transcripts in 287158 lines in 45 files
# 38143 groups 45 seqs 1 sources 3 feature types
# 38143 gene predictions
hgPepPred rn4 generic geneidPep prot/chr*-fixed.prot
featureBits rn4 geneid
# 40255245 bases of 2571531505 (1.565%) in intersection
featureBits rn4 -enrichment refGene:CDS geneid
# refGene:CDS 0.770%, geneid 1.565%, both 0.615%, cover 79.92%, enrich 51.05x
##########################################################################
# AGILENT CGH PROBES (Done 2008-05-13, Andy)
# (see hg18.txt)
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
#############################################################################
# RAT TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08)
# (to build the affyExonTissues track, see the steps outlined in hg18.txt)
#############################################################################
########################################################################
## AFFY ALL EXON PROBESETS (HG18/MM9/RN4) (DONE 2009-01-29, Andy)
## (instructions are in hg18.txt)
########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
rn4.upstreamGeneTbl = refGene
rn4.upstreamMaf = multiz9way /hive/data/genomes/rn4/bed/multiz9way/species.lst
#############################################################################
# TEST LASTZ Mouse Mm9 (DONE - 2008-11-26,12-01 - Hiram)
# to compare like with like for Rn5 lastz
screen # use screen to manage this job
mkdir /hive/data/genomes/rn4/bed/blastzMm9.2008-11-26
cd /hive/data/genomes/rn4/bed/blastzMm9.2008-11-26
cat << '_EOF_' > DEF
# rat vs mouse
# Specially tuned blastz parameters from Webb Miller
BLASTZ=lastz
BLASTZ_M=254
BLASTZ_ABRIDGE_REPEATS=0
BLASTZ_O=600
BLASTZ_E=55
BLASTZ_Y=15000
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/hive/data/genomes/rn4/bed/lastzMm9.2008-11-26/mouse_rat.q
# TARGET: Rat Rn4
SEQ1_DIR=/scratch/data/rn4/nib
SEQ1_LEN=/scratch/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Mouse Mm9
SEQ2_DIR=/scratch/data/mm9/nib
SEQ2_LEN=/scratch/data/mm9/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/hive/data/genomes/rn4/bed/lastzMm9.2008-11-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium \
-stop=net `pwd`/DEF > do.log 2>&1 &
# real 313m17.777s
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium \
-continue=cat -stop=net `pwd`/DEF >cat.log 2>&1
# create a loadUp.csh file so it can be modified to change
# the table names to not clash with existing Mm9 tables
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium \
-debug -continue=load -stop=load `pwd`/DEF >load.log 2>&1
# real 79m48.305s
cd /hive/data/genomes/rn4/bed/lastzMm9.2008-11-26/axtChain
time ./load.csh
# real 23m39.747s
cat fb.rn4.chainMm9LastzLink.txt
# 1710794258 bases of 2571531505 (66.528%) in intersection
mkdir /hive/data/genomes/mm9/bed/blastz.rn4.swap
cd /hive/data/genomes/mm9/bed/blastz.rn4.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/rn4/bed/lastzMm9.2008-11-26/DEF \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium \
-swap -stop=net > swap.log 2>&1 &
# real 120m41.998s
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/rn4/bed/lastzMm9.2008-11-26/DEF \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=medium \
-debug -swap -continue=load -stop=load > load.log 2>&1 &
# real 27m24.819s
cat fb.mm9.chainRn4LastzLink.txt
# 1713323126 bases of 2620346127 (65.385%) in intersection
#############################################################################
# LIFTOVER TO Rn5 (DONE - 2008-12-14 - Hiram )
mkdir /hive/data/genomes/rn4/bed/blat.rn5.2008-12-14
cd /hive/data/genomes/rn4/bed/blat.rn5.2008-12-14
# -debug run to create run dir, preview scripts...
doSameSpeciesLiftOver.pl -debug rn4 rn5
# Real run:
time nice -n +19 doSameSpeciesLiftOver.pl rn4 rn5 > do.log 2>&1
#############################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
############################################################################
# ADD LINK TO GENENETWORK (DONE. 11/06/09 Fan).
# Received geneNetwork ID list file, GN_rat_RefSeq.txt, for rn4 from GeneNetwork, Zhou Xiaodong [xiaodong.zhou@gmail.com].
ssh hgwdev
mkdir -p /cluster/data/rn4/bed/geneNetwork
cd /cluster/data/rn4/bed/geneNetwork
hgsql rn4 < ~/src/hg/lib/geneNetworkId.sql
hgsql rn4 -e \
'load data local infile "GN_rat_RefSeq.txt" into table geneNetworkId'
#########################################################################
# BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2010-01-19,22 - Hiram)
ssh hgwdev
screen # use a screen to manage this multi-day job
mkdir /hive/data/genomes/rn4/bed/blastzBosTau4.2010-01-19
cd /hive/data/genomes/rn4/bed/blastzBosTau4.2010-01-19
cat << '_EOF_' > DEF
# Rat vs. Cow
BLASTZ_M=50
# TARGET: Rat Rn4
SEQ1_DIR=/scratch/data/rn4/nib
SEQ1_LEN=/cluster/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau4
SEQ2_DIR=/scratch/data/bosTau4/bosTau4.2bit
SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=100
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/rn4/bed/blastzBosTau4.2010-01-19
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-workhorse=hgwdev -bigClusterHub=pk -chainMinScore=3000 \
-chainLinearGap=medium -noLoadChainSplit -syntenicNet > do.log 2>&1 &
# interrupted by storm-induced power failures; finished the lastz batch,
# then continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -workhorse=hgwdev -bigClusterHub=pk -chainMinScore=3000 \
-smallClusterHub=memk -chainLinearGap=medium -noLoadChainSplit \
-syntenicNet > cat.log 2>&1 &
# real 107m32.779s
cat fb.rn4.chainBosTau4Link.txt
# 649931321 bases of 2571531505 (25.274%) in intersection
mkdir /cluster/data/bosTau4/bed/blastz.rn4.swap
cd /cluster/data/bosTau4/bed/blastz.rn4.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/rn4/bed/blastzBosTau4.2010-01-19/DEF \
-noLoadChainSplit -workhorse=hgwdev -smallClusterHub=memk \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet > swap.log 2>&1 &
# real 77m18.645s
cat fb.bosTau4.chainRn4Link.txt
# 664253901 bases of 2731830700 (24.315%) in intersection
#########################################################################
# ailMel1 Panda alignment (DONE - 2010-01-21 - Hiram)
mkdir /hive/data/genomes/rn4/bed/lastzAilMel1.2010-01-21
cd /hive/data/genomes/rn4/bed/lastzAilMel1.2010-01-21
cat << '_EOF_' > DEF
# Rat vs. Panda
# parameters from the Panda paper supplemental where they describe
# their lastz parameters
BLASTZ_K=2200
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_H=2000
BLASTZ_C=2
BLASTZ_T=2
# our usual M
BLASTZ_M=50
# TARGET: Rat Rn4
SEQ1_DIR=/scratch/data/rn4/nib
SEQ1_LEN=/scratch/data/rn4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Panda
SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit
SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/rn4/bed/lastzAilMel1.2010-01-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 129m23.300s
cat fb.rn4.chainAilMel1Link.txt
# 707335282 bases of 2571531505 (27.506%) in intersection
mkdir /hive/data/genomes/ailMel1/bed/blastz.rn4.swap
cd /hive/data/genomes/ailMel1/bed/blastz.rn4.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/rn4/bed/lastzAilMel1.2010-01-21/DEF \
-swap -noLoadChainSplit -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 65m7.728s
cat fb.ailMel1.chainRn4Link.txt
# 693099721 bases of 2225124764 (31.149%) in intersection
############################################################################
# tRNAs track (2010-01-15, Fan DONE)
#
ssh hgwdev
cd /hive/data/genomes/rn4/bed
mkdir tRNAs
cd tRNAs
# Get data files from /projects/lowelab/users/lowe/Browser/vertebrates/
cp -p /projects/lowelab/users/lowe/Browser/vertebrates/rn4-tRNAs.bed .
cp -p \
/projects/lowelab/users/lowe/Browser/vertebrates/rn4_tRNAs_images.tar.gz\
.
hgsql rn4 -e 'drop table if exists tRNAs'
hgLoadBed -tab rn4 tRNAs rn4-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql
mkdir gif
cd gif
gzip -d ../rn4_tRNAs_images.tar.gz
tar -xvf rn4_tRNAs_images.tar
mkdir /hive/data/gbdb/rn4/RNA-img
cp -p * /hive/data/gbdb/rn4/RNA-img
#####################################################################
+# INITIAL RGD GENES BUILD (2010-02-12, Fan, DONE)
+#
+# NOTE: THE BUILD PIPELINE IS DEVELOPED. SEVERAL MAJOR PROBLEMS WERE
+# DISCOVERED IN THE ORIGINAL SOURCE DATA FILE. RGD IS LOOKING INTO THEM.
+#
+# THIS PIPELINE SHOULD BE ADJUSTED/MODIFIED AND RE-RUN AFTER RGD FIXES THOSE PROBLEMS
+# AND PROVIDES AN UPDATED VERSION OF THEIR DATA FILE.
+ssh hgwdev
+cd /hive/data/genomes/rn4/bed
+mkdir rgdGene
+cd rgdGene
+
+# download raw data from RGD
+wget --timestamping ftp://rgd.mcw.edu/pub/data_release/GFF/rgd_genes_gff.zip
+
+unzip rgd_genes_gff.zip
+
+cat gff/RGDgenes_chr*.gff3 >RGDgenes_all.gff3
+
+hgsql rn4 -e 'drop table rgdGeneRaw0'
+hgsql rn4 < ~/kent/src/hg/lib/rgdGeneRaw0.sql
+hgsql rn4 -e 'load data local infile "RGDgenes_all.gff3" into table rgdGeneRaw0'
+
+
+# build the rgdId.tab file
+hgsql rn4 -N -e 'select "RGD:",seqname from rgdGeneRaw0 '|cut -f 1 >j.rgd
+hgsql rn4 -N -e 'select * from rgdGeneRaw0 '|cut -f 9 >j.0
+
+cat j.0 |sed -e 's/RGD:/\t/'|cut -f 2 >j.1
+cat j.1 |sed -e 's/,/\t/'|sed -e 's/;/\t/'|cut -f 1 >j.2
+
+paste j.rgd j.2 |sed -e 's/\t//' >rgdId.tab
+rm j.*
+
+# fix some currently known problems of RGD Genes
+
+# First, remove gene records that contain exons of different strands.
+# create two temp tables rgdGeneRaw2 and rgdGeneRaw3
+
+cut -f 1-8 RGDgenes_all.gff3 >j.temp8
+paste j.temp8 rgdId.tab >rgdGeneRaw2.tab
+
+hgsql rn4 -e 'drop table rgdGeneRaw2'
+hgsql rn4 -e 'drop table rgdGeneRaw3'
+hgsql rn4 < ~/kent/src/hg/lib/rgdGeneRaw2.sql
+hgsql rn4 < ~/kent/src/hg/lib/rgdGeneRaw3.sql
+hgsql rn4 -e 'load data local infile "rgdGeneRaw2.tab" into table rgdGeneRaw2'
+hgsql rn4 -e 'load data local infile "rgdGeneRaw2.tab" into table rgdGeneRaw3'
+
+# create the one line script del1 containing the following line:
+
+hgsql rn4 -N -e "delete from rgdGeneRaw2 where rgdId='${2}' and feature='exon' and seqname='${1}' and strand = '${3}'"
+
+chmod +x del1
+hgsql rn4 -N -e \
+"select 'del1', x.seqname, x.rgdId, x.strand from rgdGeneRaw2 x, rgdGeneRaw3 y where x.rgdId=y.rgdId and x.seqname=y.seqname and x.feature='exon' and y.feature='gene' and x.strand!=y.strand;" |\
+sort -u > delAll
+chmod +x delAll
+
+./delAll
+
+hgsql rn4 -N -e 'select "chr" from rgdGeneRaw2 where feature != "CDS"' >j.chr
+hgsql rn4 -N -e 'select * from rgdGeneRaw2 where feature != "CDS"' >j.1
+paste j.chr j.1 |sed -e 's/chr\t/chr/'>j.2
+
+# remove one problematic gene
+cat j.2|grep -v "RGD:3683" >rgdGeneTemp.gff
+
+ldHgGene -gtf rn4 rgdGeneTemp rgdGeneTemp.gff
+
+hgsql rn4 -N -e 'select x.*, y.start-1, y.end from rgdGeneTemp x, rgdGeneRaw2 y where x.name=y.rgdId and y.feature="CDS"' >j.0
+
+cut -f 2-6 j.0 >j.1
+
+cut -f 12-13 j.0 >j.2
+
+cut -f 9-11 j.0 >j.3
+
+#paste j.1 j.2 j.3 >rgdGene2.tab
+paste j.1 j.2 j.3 >j.10
+
+cut -f 1-3 j.10 >j.1-3
+
+cut -f 9 j.10 |sed -e 's/,/\t/'|cut -f 1 >j.start
+
+cut -f 5-10 j.10 >j.5-10
+
+paste j.1-3 j.start j.5-10 >rgdGene2.tab
+
+hgsql rn4 -e 'drop table rgdGene2'
+hgsql rn4 < ~/kent/src/hg/lib/rgdGene2.sql
+
+hgsql rn4 -e 'load data local infile "rgdGene2.tab" into table rgdGene2'
+
+hgsql rn4 -e 'drop table rgdGeneTemp'
+
+rm j.*
+
+####### temp fix to make txEnd the same as the last exonEnd (via getLastExonEnd)
+
+getLastExonEnd rn4 |sed -e 's/,//' >j.out
+cut -f 1 j.out >j.1
+cut -f 1 rgdGene2.tab >j.0
+diff j.0 j.1
+
+cut -f 2 j.out >j.last
+
+cut -f 1-4 rgdGene2.tab > j.1-4
+cut -f 6-10 rgdGene2.tab > j.6-10
+
+paste j.1-4 j.last j.6-10 >rgdGene2.tab.new
+
+hgsql rn4 -e 'drop table rgdGene2'
+hgsql rn4 < ~/kent/src/hg/lib/rgdGene2.sql
+
+hgsql rn4 -e 'load data local infile "rgdGene2.tab.new" into table rgdGene2'
+rm j.*