src/hg/makeDb/doc/rheMac2.txt 1.20

1.20 2010/02/12 23:51:36 hiram
last runs to Marmoset calJac3
Index: src/hg/makeDb/doc/rheMac2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/rheMac2.txt,v
retrieving revision 1.19
retrieving revision 1.20
diff -b -B -U 1000000 -r1.19 -r1.20
--- src/hg/makeDb/doc/rheMac2.txt	25 Nov 2009 21:48:42 -0000	1.19
+++ src/hg/makeDb/doc/rheMac2.txt	12 Feb 2010 23:51:36 -0000	1.20
@@ -1,1613 +1,1674 @@
 # for emacs: -*- mode: sh; -*-
 
 
 # Macaca Mulatta  (rheMac2)
 # Venter Institute final split merge assembly using washU and baylor
 
 #  NOTE:  this doc may have genePred loads that fail to include
 #  the bin column.  Please correct that for the next build by adding
 #  a bin column when you make any of these tables:
 #
 #  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
 #  +-------------+---------------------------------+
 #  | tableName   | type                            |
 #  +-------------+---------------------------------+
 #  | xenoRefGene | genePred xenoRefPep xenoRefMrna |
 #  | nscanGene   | genePred nscanPep               |
 #  +-------------+---------------------------------+
 
 
 # DOWNLOAD SEQUENCE (DONE jan27  2006 Robert)
     ssh kkstore01
     mkdir /cluster/store2/rheMac2
     mkdir /cluster/store2/rheMac2/final
     cd /cluster/data
     ln -s /cluster/store2/rheMac2/final rheMac2
     cd /cluster/data/rheMac2
     mkdir downloads
     cd downloads
 
 
 The macaque genome assembly (v. 1.0, Mmul051212) has been posted to the macaque genome ftp site:
 ftp://macftp@ftp.macaquegenome.hgsc.bcm.tmc.edu/
 
 The pw is analysis .
 
 
 mkdir final
 cd final
 mkdir download
 cd download
 
 # A.  The assembly itself is represented in three groups, 1) the main assembly, 2) the Chr#_random and ChrU set, and 3) the ChrUr set (a late addition to the ChrU set).
 
 #AGP and Fasta chromosome files for these three sets are:
 #   1) Main Assembly - these are scaffolds, not chromosomes.
 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1-edit4.trimNoContam.agp.bz2
 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1_edit4.scf.fasta.bz2
 #   2) Mapped to chromosomes but unplaced Chr#_random, and unmapped contigs ChrU
 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.random.ctgs.agp.bz2 
 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.random.scfs.agp.bz2 
 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1_edit4.random.chrm.fasta.bz2
 #   3) Late addition of 353 contigs to ChrU
 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.ChrUr.chrm.fasta.bz2  
 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.ChrUr.ctgs.agp.bz2     
 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.ChrUr.scfs.agp.bz2 
 
 # B.  The contigs fasta and quality files that go into these are in two sets, the bulk of the files (those in parts 1 and 2 of the assembly above) and the few that were added back at the last minute (part 3 above).  
 #    1)
 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.noContam.ctg.fasta.bz2
 #    2)
 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.noContam.ctg.qv.bz2
 #    3) The 353 contigs that were added back at the last minute are included in the following files (other contigs in these files are not included in the assembly)
 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.degen.noContam.fasta.bz2
 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/NewMergeSplit/v1.edit3.degen.noContam.qv.bz2
 
 
 # C.  The final assembly file in the VI internal assembly format is:
 wget ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.asm.bz2
 
 #D.  The chromosome files are below, note, all gaps are listed as "fragment yes" in the agp files, although some are spanned by clones and others are not (making use of human synteny or macaque map information instead).  These files will be corrected to list the uncaptured gaps as "clone no", but the intervals and other information will not change, so use these for other information now.
 
 wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.chrm.fasta.corr.bz2
 wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.chrome.ctgs.agp.corr.bz2
 wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu/Assemblies/VI/FinalMergeSplit/v1.edit4.chrome.scfs.agp.corr.bz2
 
 bunzip2 *.bz2
 #wrap extra long lines in fasta
 fold v1_edit4.random.chrm.fasta |sed -e 's/ChrUr/chrUr/' > rheMac2_random.fa
 fold v1.edit4.chrm.fasta.corr |sed -e 's/ChrUr/chrUr/' >rheMac2.fa
 fold v1_edit4.scf.fasta |sed -e 's/ChrUr/chrUr/' > rheMac2.scf.fa
 mkdir ../Ur
 fold v1.edit4.ChrUr.chrm.fasta |sed -e 's/ChrUr/chrUr/' > ../Ur/chrUr.fa
 mv v1.edit4.chrome.ctgs.agp.corr v1.edit4.chrome.ctgs.corr.agp
 mv v1.edit4.chrome.scfs.agp.corr v1.edit4.chrome.scfs.corr.agp
 
 #*** NOTE FOR NEXT TIME *** use makeGenomeDb.pl instead of the following
 #                           several sections:
 
 # Check that IDs in .fa match IDs in .qual
 
   ssh kkstore01
   cd /cluster/data/rheMac2/downloads
   # foreach Bin0 contigs linearScaffolds reads
   grep ">" *.fasta > id.fa
   grep ">" *.qv > id.qual
   diff id.fa id.qual
 
 # change chromosomes to lower case
 
 sed -e 's/Chr/chr/' v1.edit4.chrome.ctgs.agp.final > v1.edit4.chrome.ctgs.final.fix2.agp
 cat v1.edit4.chrome.scfs.agp.final |sed -e 's/Chr/chr/' > v1.edit4.final.fix2.agp
 hgGoldGapGl -noGl rheMac2 /cluster/data/rheMac2/downloads/v1.edit4.final.fix2.agp
 
   /cluster/bin/i386/checkAgpAndFa v1.edit4.final.fix.agp rheMac2.fa > check.out   
 
 # SPLIT UP chromosome and make 500k chunks for RM
 
 
   ssh kkstore02
   cd /cluster/data/rheMac2
 
   faSplit byname downloads/rheMac2.fa chrom
   for i in `awk '{print $1}' chrom.sizes` ; do echo $i ; mkdir -p ${i##chr} ; mv $i.fa ${i##chr} ; pushd ${i##chr} ; faSplit -lift=$i.lft size $i.fa 500000 split ; popd; done 
   # generate list
   find scaffolds -print > scaffolds.list.0
   grep fa scaffolds.list.0 > scaffolds.list
 
 #### MAKE 2BIT NIB FILE 
 
   ssh kkstore01
   nice /cluster/bin/i386/faToTwoBit rheMac2.fa rheMac2.2bit
   nice /cluster/bin/i386/twoBitToFa rheMac2.2bit check.fa
   mv rheMac2.2bit ..
   # could also use cmp
   # note: size match
   faSize -detailed=on rheMac2.fa >mac.len 
   faSize -detailed=on check.fa >check.len 
   diff mac.len check.len
   wc -l *.len
 
   ssh hgwdev
   mkdir /gbdb/rheMac2
   ln -s /cluster/data/rheMac2/rheMac2.2bit /gbdb/rheMac2/rheMac2.2bit
 
 
 #### CREATE DATABASE (DONE Jan 27 , 2006 Robert)
 
   ssh hgwdev
   hgsql hg17
   create database rheMac2;
   use rheMac2;
   create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;
   # add rows to hgcentraltest on hgwbeta in dbDb and defaultDb
   # set orderKey past dog, before chimp
 
 # LOAD GAP & GOLD TABLES FROM AGP 
     ssh hgwdev
     hgGoldGapGl -noGl rheMac2 /cluster/data/rheMac2/downloads/v1.edit4.chrome.scfs.final.fix.agp
 # CREATE GAP TABLE  from v1.edit4.chrome.ctgs.final.fix.agp to get gaps
 
   ssh hgwdev
   cd kent/src/hg/lib
   # remove bin column for now
   hgsql rheMac2 < gap2.sql
   cd /cluster/data/rheMac2/downloads
   grep fragment v1.edit4.chrome.ctgs.final.fix.agp > fragment.txt
   grep clone v1.edit4.chrome.ctgs.final.fix.agp >> fragment.txt 
   hgsql rheMac2
   load data local infile 'fragment.txt' into table gap
   # create trackDb/monkey/rheMac2/gap.html
 
 # CREATE CONTIG TABLE  (DONE Feb 15,2006 Robert)
 
   ssh hgwdev
   cd kent/src/hg/lib
   cp ~/kent/src/hg/lib/ctgPos.sql .
   # edit ctgPos.sql to increase size of primary key to 20
   # this doesn't include strand or contig start/end
   hgsql rheMac2 < ctgPos.sql
   cd /cluster/data/rheMac2/downloads
   grep -v clone v1.edit4.chrome.ctgs.final.fix.agp |grep -v fragment > Contig.out
 
   ssh kkstore02
   cd /cluster/data/rheMac2/downloads
   getContig.pl < Contig.out > ctgPos
   ssh hgwdev
   load data local infile 'ctgPos' into table ctgPos
   #convert to half open and fix bug in agp file
   hgsql rheMac2 -e "update ctgPos set chromStart = chromStart -1 "
   hgsql rheMac2 -e "update ctgPos set size = size -1 "
 # CREATE CHROMINFO TABLE 
 # an improvement here would be to have getMaxCoord output the 2bit file name
   ssh hgwdev
   cd /cluster/data/rheMac2
   # get coordinates from agp and put in database table
   getcoords.pl < downloads/v1.edit4.chrome.scfs.final.fix.agp > scaffolds.coords
   hgsql rheMac2 < /cluster/data/bosTau1/coords.sql
   load data local infile 'scaffolds.coords' into table scaffoldCoords
   faSize rheMac2.fa -detailed=on > chrom.sizes
 
   # calculate maximum coords; check that start coord is always 1
 ##  /cluster/home/heather/bin/i386/GetMaxCoord > getMaxCoord.rheMac2
 
    edit chromInfo.sql; allow fileName to be null
    cp ~baertsch/kent/src/hg/lib/chromInfo.sql .
    hgsql rheMac2 < chromInfo.sql
    load data local infile 'chrom.sizes' into table chromInfo
    update chromInfo set fileName = "/gbdb/rheMac2/rheMac2.2bit"
 
 
 
 # REPEATMASKER 
 # using the split scaffold fa files generated earlier
 
 #*** NOTE FOR NEXT TIME *** use doRepeatMasker.pl instead.
 
 # note the version of RepeatMasker in use; will want this for download README
   cat /cluster/bluearc/RepeatMasker/Libraries/version
 # RM database version 20060120
 
 # do a trial run
   ssh hgwdev
   cd /cluster/data/rheMac2/scaffolds/0/0/0
   /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec macaca Contig1000.fa
 # configure
   cd /cluster/data/rheMac2
   mkdir jkStuff
   cp /cluster/data/anoGam1/jkStuff/RMAnopheles jkStuff/RMRhesus
   # change references anoGam1 --> rheMac2
   mkdir RMRun
 
   # /bin/csh makeJoblist-RM.csh
 
   # do the run  (DONE Jan 30, 2006 Robert)
   ssh pk
   cd /cluster/data/rheMac2/RMRun
   para create RMJobs.2
   para try
   para push
 
   # concatenate into one output file; took about 90 minutes
   ssh kkstore01
   cd /cluster/data/rheMac2
   mkdir repeats
   /bin/tcsh concatRM.csh
   cd repeats
   # use grep -v to get rid of all the headers
   # then, add back in an initial header
   # this is so hgLoadOut is happy
   grep chr repeats.all > repeats.clean
   cat repeats.header repeats.clean > repeats.final
   grep "There were no repetitive sequences " repeats.final > repeats.notfound
   grep -v "There were no repetitive sequences " repeats.final > repeat.out
   ssh hgwdev
   hgLoadOut rheMac2 repeat.out 2>errors 
   # Strange perc. fields:
 
 Processing repeats.out
 Strange perc. field -6.4 line 652513 of repeat.out
 Strange perc. field -2236.0 line 749163 of repeat.out
 Strange perc. field -2637.9 line 749163 of repeat.out
 Strange perc. field -99.3 line 749163 of repeat.out
 Strange perc. field -0.3 line 1386112 of repeat.out
 Strange perc. field -0.2 line 2045133 of repeat.out
 Strange perc. field -0.4 line 2429164 of repeat.out
 Strange perc. field -4.8 line 2778536 of repeat.out
 Strange perc. field -0.4 line 2912076 of repeat.out
 Strange perc. field -5.5 line 3498109 of repeat.out
 Strange perc. field -1.0 line 3642974 of repeat.out
 Strange perc. field -2.7 line 3642974 of repeat.out
 Strange perc. field -263.9 line 3711627 of repeat.out
 Strange perc. field -187.0 line 3711627 of repeat.out
 Strange perc. field -3.4 line 3795344 of repeat.out
 Strange perc. field -0.1 line 3932239 of repeat.out
 Strange perc. field -0.2 line 4092279 of repeat.out
 Strange perc. field -0.1 line 4240026 of repeat.out
 Strange perc. field -0.1 line 4440809 of repeat.out
 Strange perc. field -7915.8 line 4506386 of repeat.out
 Strange perc. field -0.1 line 4514494 of repeat.out
 Strange perc. field -4361.2 line 4667281 of repeat.out
 Strange perc. field -840.4 line 4667281 of repeat.out
 
 Loading up table repeats_rmsk
 note: 997 records dropped due to repStart > repEnd
 
   hgsql rheMac2
   rename table repeats_rmsk to rmsk
 
   # select count(*) from rmsk;
 
 # SIMPLE REPEATS (DONE Jan 30, 2006 Robert)
 # put the results throughout the scaffold 0/0/0 directories,
 # same as RepeatMasker, to avoid too many files in the same directory
 # *** BUG: FILTERING WAS NOT DONE --> OVERMASKING ***
 
 #*** NOTE FOR NEXT TIME *** use doSimpleRepeat.pl instead.
 
   ssh kkstore01
   cd /cluster/data/rheMac2
   mkdir bed
   cd bed
   mkdir simpleRepeat
   cd simpleRepeat
 
   /bin/csh makeJoblist-trf.csh
   tcsh trf-run.csh > & ! trf.log &
 
   # concatenate into one output file; took about an hour
     cat chr*.bed > simpleRepeat.bed
 
 
   # load
   /cluster/bin/i386/hgLoadBed rheMac2 simpleRepeat simpleRepeat.bed \
     -sqlTable = /cluster/home/heather/kent/src/hg/lib/simpleRepeat.sql
 
 #Reading trf.bed
 #Loaded 628275 elements of size 16
 #Sorted
 #Saving bed.tab
 #Loading rheMac2
 
   create index chrom_2 on simpleRepeat (chrom(16), chromStart);
   create index chrom_3 on simpleRepeat (chrom(16), chromEnd);
 
 
 # CREATE MASKED FA USING REPEATMASKER AND FILTERED TRF FILES 
 
   ssh kkstore02
   cd /cluster/data/rheMac2
   /cluster/bin/i386/maskOutFa -soft downloads/rheMac2.fa repeats/repeat.out rheMac2.softmask.fa
 
 WARNING: negative rEnd: -11 chr1:17040504-17040634 MLT2B4
 WARNING: negative rEnd: -15 chr1:39400221-39400370 MLT2B4
 WARNING: negative rEnd: -28 chr1:75866521-75866637 MLT2B4
 WARNING: negative rEnd: -104 chr1:102866722-102866763 MLT2B4
 WARNING: negative rEnd: -52 chr1:132930605-132930673 MLT2B4
 WARNING: negative rEnd: -686 chr1:133186227-133186372 L1MCc
 WARNING: negative rEnd: -203 chr1:133186415-133186749 L1MCc
 WARNING: negative rEnd: -455 chr1:171379505-171379760 L1M3e
 WARNING: negative rEnd: -17 chr1:216018086-216018189 L1MCb
 WARNING: negative rEnd: -198 chr1:226920721-226920777 L1M4
 WARNING: negative rEnd: -131 chr1:226921141-226921200 L1M4
 WARNING: negative rEnd: -3 chr1:226921979-226922094 L1M4
 WARNING: negative rEnd: -335 chr1:227071405-227071650 L1MCc
 WARNING: negative rEnd: -278 chr2:16216076-16216253 L1P4b
 WARNING: negative rEnd: -72 chr2:18621605-18622180 L1PBa
 WARNING: negative rEnd: -285 chr2:24878971-24879214 L1MCc
 WARNING: negative rEnd: -419 chr2:25206834-25207017 L1MCc
 WARNING: negative rEnd: -183 chr2:25207033-25207203 L1MCc
 WARNING: negative rEnd: -3 chr2:48529930-48529991 MER6
 WARNING: negative rEnd: -426 chr2:80341345-80341639 L1M2c
 WARNING: negative rEnd: -314 chr2:82400541-82400763 L1MDa
 WARNING: negative rEnd: -1015 chr2:82731009-82731348 L1ME1
 WARNING: negative rEnd: -282 chr2:82731554-82731973 L1ME1
 WARNING: negative rEnd: -129 chr2:113802567-113802946 L1MCc
 WARNING: negative rEnd: -528 chr2:129549508-129549693 L1M3e
 WARNING: negative rEnd: -437 chr2:160075806-160076108 L1M2
 WARNING: negative rEnd: -21 chr3:39033062-39033110 MER6
 WARNING: negative rEnd: -271 chr3:58615645-58615852 L1MEb
 WARNING: negative rEnd: -202 chr3:70911642-70911963 L1M5
 WARNING: negative rEnd: -33 chr3:71817441-71818111 L1M3e
 WARNING: negative rEnd: -316 chr3:112797698-112798023 L1PB3
 WARNING: negative rEnd: -17 chr3:113476319-113476407 FRAM
 WARNING: negative rEnd: -27 chr3:116802770-116802992 L1MB7
 WARNING: negative rEnd: -4 chr3:126585635-126585722 FRAM
 WARNING: negative rEnd: -632 chr3:183268961-183269045 L1M3b
 WARNING: negative rEnd: -1132 chr4:16324742-16324819 L1MEb
 WARNING: negative rEnd: -930 chr4:38188841-38189000 L1MEd
 WARNING: negative rEnd: -448 chr4:122440460-122440750 L1M3e
 WARNING: negative rEnd: -39 chr5:7580689-7581026 L1MEe
 WARNING: negative rEnd: -1956 chr5:27414234-27414442 L1MB8
 WARNING: negative rEnd: -1682 chr5:27414453-27414671 L1MB8
 WARNING: negative rEnd: -513 chr5:66093155-66093311 L1M4b
 WARNING: negative rEnd: -35 chr5:97364737-97364827 L1MCa
 WARNING: negative rEnd: -1077 chr5:109355971-109356130 L1MEb
 WARNING: negative rEnd: -20 chr5:109357351-109357727 L1MEb
 WARNING: negative rEnd: -102 chr5:119072794-119072945 L1MB4
 WARNING: negative rEnd: -9 chr5:144123391-144123455 L1M3de
 WARNING: negative rEnd: -59 chr5:150681576-150681827 L1M4c
 WARNING: negative rEnd: -461 chr5:170348753-170349060 L1M3e
 WARNING: negative rEnd: -138 chr5:170614438-170615156 L1M1
 WARNING: negative rEnd: -103 chr5:180911655-180912016 L1MEd
 WARNING: negative rEnd: -38 chr5:181275921-181276086 L1M4
 WARNING: negative rEnd: -219 chr6:326885-327048 L1MCc
 WARNING: negative rEnd: -243 chr6:18237822-18238057 L1MC
 WARNING: negative rEnd: -458 chr6:24563135-24563511 L1M4b
 WARNING: negative rEnd: -14 chr6:27838867-27838984 MLT2B4
 WARNING: negative rEnd: -92 chr6:33082868-33082962 L1M4
 WARNING: negative rEnd: -505 chr6:60951240-60951389 L1PB3
 WARNING: negative rEnd: -9 chr6:76795983-76796095 MLT2B4
 WARNING: negative rEnd: -1039 chr6:153171051-153171255 L1MEb
 WARNING: negative rEnd: -448 chr6:156038512-156038675 L1M4b
 WARNING: negative rEnd: -1797 chr6:172500974-172501240 L1M4b
 WARNING: negative rEnd: -717 chr7:6530435-6530561 L1M4b
 WARNING: negative rEnd: -180 chr7:6530567-6530816 L1M4b
 WARNING: negative rEnd: -395 chr7:17573119-17573355 L1M4b
 WARNING: negative rEnd: -713 chr7:34841140-34841243 L1PA12
 WARNING: negative rEnd: -917 chr7:36602478-36602794 L1MEd
 WARNING: negative rEnd: -993 chr7:49724690-49725009 L1MEc
 WARNING: negative rEnd: -519 chr7:64924143-64924287 L1PA15-16
 WARNING: negative rEnd: -170 chr7:64924684-64924986 L1PA15-16
 WARNING: negative rEnd: -1526 chr7:130257854-130258027 L1MDa
 WARNING: negative rEnd: -78 chr8:18037533-18037781 L1M3e
 WARNING: negative rEnd: -643 chr8:25868692-25868877 L1MCc
 WARNING: negative rEnd: -5 chr8:37458013-37458057 FRAM
 WARNING: negative rEnd: -420 chr8:40430422-40430587 L1MCc
 WARNING: negative rEnd: -73 chr8:53713425-53713503 MLT2B4
 WARNING: negative rEnd: -38 chr8:58045714-58046896 L1MEd
 WARNING: negative rEnd: -46 chr8:70060490-70060930 L1M4
 WARNING: negative rEnd: -3 chr8:71535410-71535460 FRAM
 WARNING: negative rEnd: -781 chr8:71840470-71840637 L1MCc
 WARNING: negative rEnd: -613 chr8:71841566-71841717 L1MCc
 WARNING: negative rEnd: -17 chr9:17870182-17870240 FRAM
 WARNING: negative rEnd: -1077 chr9:36754551-36754681 L1MEb
 WARNING: negative rEnd: -354 chr9:36754801-36755003 L1MEb
 WARNING: negative rEnd: -189 chr9:53706555-53706958 L1MCc
 WARNING: negative rEnd: -28 chr9:55631220-55631257 MER6
 WARNING: negative rEnd: -259 chr9:83699632-83700159 L1MA4A
 WARNING: negative rEnd: -412 chr10:93305328-93305380 L1MDb
 WARNING: negative rEnd: -290 chr10:93305684-93305806 L1MDb
 WARNING: negative rEnd: -232 chr11:93725539-93725691 L1MCc
 WARNING: negative rEnd: -3 chr11:102814963-102815011 FRAM
 WARNING: negative rEnd: -614 chr12:24319914-24321380 L1M4b
 WARNING: negative rEnd: -557 chr12:24323027-24323094 L1M4b
 WARNING: negative rEnd: -259 chr12:100959333-100959494 L1M1
 WARNING: negative rEnd: -529 chr12:102350218-102350466 L1M2
 WARNING: negative rEnd: -654 chr13:53814747-53815048 L1M4b
 WARNING: negative rEnd: -7 chr13:58060931-58061578 L1M4c
 WARNING: negative rEnd: -1125 chr13:72166501-72166711 L1ME1
 WARNING: negative rEnd: -6 chr13:72540001-72540050 FRAM
 WARNING: negative rEnd: -397 chr13:82523197-82523664 L1M3e
 WARNING: negative rEnd: -119 chr13:120528068-120528107 MLT2B3
 WARNING: negative rEnd: -1934 chr14:27100172-27100323 L1MCc
 WARNING: negative rEnd: -158 chr14:27101192-27101535 L1MCc
 WARNING: negative rEnd: -281 chr14:60812899-60813046 L1MDa
 WARNING: negative rEnd: -376 chr15:22274805-22275047 L1M4b
 WARNING: negative rEnd: -669 chr15:23733251-23733307 L1M3e
 WARNING: negative rEnd: -422 chr15:24246692-24246910 L1MCc
 WARNING: negative rEnd: -290 chr15:24247095-24247225 L1MCc
 WARNING: negative rEnd: -252 chr15:24247564-24247617 L1MCc
 WARNING: negative rEnd: -11 chr15:24247667-24247964 L1MCc
 WARNING: negative rEnd: -261 chr15:77304256-77304526 L1M4c
 WARNING: negative rEnd: -121 chr15:80339141-80339347 L1M4b
 WARNING: negative rEnd: -488 chr15:83773572-83773877 L1MEd
 WARNING: negative rEnd: -101 chr15:83773914-83774074 L1MEd
 WARNING: negative rEnd: -92 chr16:40096072-40096144 MLT2B4
 WARNING: negative rEnd: -76 chr17:44749449-44749518 MLT2B4
 WARNING: negative rEnd: -97 chr17:85835577-85836019 L1MCc
 WARNING: negative rEnd: -112 chr18:12965293-12965685 L1MCc
 WARNING: negative rEnd: -879 chr18:41292895-41293389 L1MEc
 WARNING: negative rEnd: -322 chr18:41294544-41295033 L1MEc
 WARNING: negative rEnd: -1137 chr18:59335523-59335630 L1MEb
 WARNING: negative rEnd: -102 chr19:5934741-5934887 L1M4
 WARNING: negative rEnd: -6 chr19:11007926-11008075 FRAM
 WARNING: negative rEnd: -24 chr19:44501149-44501305 L1M3b
 WARNING: negative rEnd: -145 chr20:45957350-45957401 L1MDa
 WARNING: negative rEnd: -1106 chr20:74584188-74584318 L1ME1
 WARNING: negative rEnd: -348 chr20:74584451-74584806 L1ME1
 WARNING: negative rEnd: -711 chr20:80285832-80285915 L1MCc
 WARNING: negative rEnd: -37 chrX:4743643-4743679 MER6
 WARNING: negative rEnd: -1941 chrX:48147060-48147184 L1M4b
 WARNING: negative rEnd: -99 chrX:54535186-54535349 L1M4
 WARNING: negative rEnd: -606 chrX:80315181-80315657 L1P4b
 WARNING: negative rEnd: -65 chrX:97720783-97720963 L1M4
 WARNING: negative rEnd: -24 chrX:120523759-120523982 L1ME1
 WARNING: negative rEnd: -13 chrX:121193467-121193615 MLT2B4
 WARNING: negative rEnd: -939 chrX:153527918-153528288 L1M4
 
   # matches select count(*) from rmsk where repEnd < 0;
 +----------+
 | count(*) |
 +----------+
 |      131 |
 +----------+
 
   # DOH -- THIS MASKS TOO MUCH!  SHOULD HAVE USED FILTERED TRF!
   /cluster/bin/i386/maskOutFa -softAdd rheMac2.softmask.fa bed/simpleRepeat/simpleRepeat.bed rheMac2.softmask2.fa
 
   # hard masking (Ns instead of lower case) for download files
   # split for use by genscan
   /cluster/bin/i386/maskOutFa rheMac2.softmask2.fa hard rheMac2.hardmask.fa
   mkdir hardmask-split
   /cluster/bin/i386/faSplit about rheMac2.hardmask.fa 2000000 hardmask-split
 
 # DOWNLOAD FILES
 # REMAKE SOME BIGZIPS DOWNLOADS IN SAME FORMAT AS FOR OTHER ASSEMBLIES 
 # WITH ONE FILE PER CHROMOSOME (DONE, 2006-06-12, hartera)
 
 #*** NOTE FOR NEXT TIME *** use makeDownloads.pl instead.
 
   ssh hgwdev
   cd /cluster/data/rheMac2
   cp rheMac2.softmask.fa /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips
   cp rheMac2.softmask2.fa /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips
   cp rheMac2.hardmask.fa /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips
 
   cd /usr/local/apache/htdocs/goldenPath/rheMac2/bigZips
   nice gzip rheMac2.softmask.fa
   nice gzip rheMac2.softmask2.fa
   nice gzip rheMac2.hardmask.fa
   # Remake some of the bigZips downlods in the same format as for 
   # other assemblies with one file per chrom (2006-06-12, hartera)
   ssh kkstore01
   cd /cluster/data/rheMac2
   # use rheMac2.softmask2.fa as they contain masking from RepeatMasker
   # and from trf.
   cat << '_EOF_' > jkStuff/zipAll.csh
 mkdir -p bigZips
 # RepeatMasker files
 tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out
 tar cvzf bigZips/chromTrf.tar.gz bed/simpleRepeat/chr*.bed
 # get soft masked and hard masked sequences, one per chrom
 mkdir softMask hardMask
 foreach c (`awk '{print$1}' /cluster/data/rheMac2/chrom.sizes`)
    faOneRecord /cluster/data/rheMac2/rheMac2.softmask2.fa \
         $c > softMask/${c}.fa
    faOneRecord /cluster/data/rheMac2/rheMac2.hardmask.fa \
         $c > hardMask/${c}.fa.masked
 end
 tar cvzf bigZips/chromFa.tar.gz softMask/chr*.fa
 tar cvzf bigZips/chromFaMasked.tar.gz hardMask/chr*.fa.masked
 '_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/zipAll.csh
    csh -ef ./jkStuff/zipAll.csh >& zipAll.log &
    # Took about 50 minutes.
    rm repeats.all.gz  rheMac2.hardmask.fa.gz  rheMac2.softmask.fa.gz \
       rheMac2.chrom.fa.gz
    md5sum *.gz > md5sum.txt
    # link the *.gz and *.txt files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    set gp = /usr/local/apache/htdocs/goldenPath/rheMac2
    mkdir -p $gp/bigZips
    rm repeats.all.gz  rheMac2.hardmask.fa.gz  rheMac2.softmask.fa.gz \
       rheMac2.chrom.fa.gz md5sum.txt
    ln -s /cluster/data/rheMac2/bigZips/{chrom*.gz,*.txt} $gp/bigZips
    # Update README.txt to give correct file names.
 
 # REGENERATE 2BIT NIB (DONE Jan 30, 2006 Robert)
   ssh kkstore02
   cd /cluster/data/rheMac2
   /cluster/bin/i386/faToTwoBit rheMac2.softmask2.fa rheMac2.softmask.2bit
   /cluster/bin/i386/twoBitToFa rheMac2.softmask.2bit check.softmask.fa
   diff -q rheMac2.softmask2.fa check.softmask.fa
   mv rheMac2.2bit rheMac2.2bit.unmasked
   mv rheMac2.softmask.2bit rheMac2.2bit
   # note: size match
 
 # GENERATE 2bit for blastz 
   mkdir -p /san/sanvol1/scratch/rheMac2/ 
   cd /cluster/data/rheMac2
   cp rheMac2.2bit /san/sanvol1/scratch/rheMac2/ -p
 
 
 ssh kkstore02
 cd /cluster/data/hg17/bed/blastz.rheMac2
 # edit doBlastzChainNet.pl to add lines for hg17 glitch
 # in function getDbFromPath
 #  if ($db eq "hg") {
 #      $db = "hg17";
 #  }
 #doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/rheMac2/rheMacOut/ >& do.log &
 
 ############################################################################
 ##  BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram)
     ssh pk
     cd /cluster/data/mm8/bed/blastz.rheMac2.2006-02-17
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
         -swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
         `pwd`/DEF > swap.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
         -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
         -swap -continue=net `pwd`/DEF > swap.net.out 2>&1 &
     #   failed during a san hiccup,  finish that off, then:
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
         -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
         -swap -continue=load `pwd`/DEF > swap.load.out 2>&1 &
 
     time nice -n +19 featureBits rheMac2 chainMm8Link
     #   877906099 bases of 2646704109 (33.170%) in intersection
 
     #	This was done before the loadUp script was improved to load
     #	empty tables on chroms that have no chains, so sometime later:
     ssh hgwdev
     hgLoadChain rheMac2 chrUr_chainMm8 /dev/null
     #	Loading 0 chains into rheMac2.chrUr_chainMm8
     #	2006-04-24
 
 ############################################################################
 # QUALITY SCORES (Done March 9, 2006 Robert)
     ssh kkstore01
     wget -q ftp://macftp:analysis@ftp.macaquegenome.hgsc.bcm.tmc.edu:21/Assemblies/VI/FinalMergeSplit/v1.edit4.chrm.qv.bz2
     cd /cluster/data/rheMac2/downloads/
     bunzip2 v1.edit4.chrm.qv.bz2 &
     bunzip2 v1.edit4.chrUr.bz2 &
     nice fold --spaces v1.edit4.chrm.qv | sed -e 's/Chr/chr/' > rheMac2.chrm.qual.qv 
     nice fold --spaces v1.edit4.chrUr.qv | sed -e 's/Chr/chr/' > rheMac2.chrUr.qv 
     cat rheMac2.chrm.qual.qv rheMac2.chrUr.qv > rheMac2.qual.qv
     qaToQac rheMac2.qual.qv temp.qac 
     gzip rheMac2.qual.qv
     qacToWig -fixed temp.qac rheMac2.wig 
     rm temp.qac
     wigEncode rheMac2.wig rheMac2.qual.wig rheMac2.qual.wib 
     mv all.qv rheMac2.qual.qv
     qaToQac rheMac2.qual.qv temp.qac
     qacToWig -fixed temp.qac rheMac2.wig
     wigEncode rheMac2.wig rheMac2.qual.wig rheMac2.qual.wib
     ssh hgwdev
     hgLoadWiggle rheMac2 quality rheMac2.qual.wig
     cp -p rheMac2.qual.qv.gz /usr/local/apache/htdocs/rheMac2/bigZips
 
 
 # RUN DATEREPEATS TO GET LINEAGE-SPECIFIC (DONE 3/22/06 angie)
     ssh kkstore01
     cd /cluster/data/rheMac2
 
     # First make per-chrom .fa.out files as should have been done:
     tail +4 repeats/repeat.out \
     | perl -we '$prevChr = ""; \
                 while (<>) { \
                   @w = split; $chr = $w[4]; $c = $chr; $c =~ s/^chr//; \
                   if ($chr ne $prevChr) { \
                     print "creating $c/$chr.fa.out\n"; \
                     system("cat repeats/repeats.header > $c/$chr.fa.out"); \
                     close(OUT);  open(OUT, ">>$c/$chr.fa.out") || die; \
                   } \
                   print OUT; \
                   $prevChr = $chr; \
                 }'
 
     # Run DateRepeats to get outputs on rodent, non-rodent mammal:
     mkdir linSpecRep
     cd linSpecRep
     ln -s ../*/chr*.fa.out .
     foreach f (*.fa.out)
       echo $f:r:r
       /cluster/bluearc/RepeatMasker/DateRepeats $f -query human \
         -comp mouse -comp dog
     end
     mkdir /cluster/bluearc/rheMac2/linSpecRep
     mkdir /cluster/bluearc/rheMac2/linSpecRep/notIn{Rodent,NonRodentMammal}
     foreach f (*.out_mus*)
       set chr = $f:r:r
       echo $chr
       /cluster/bin/scripts/extractRepeats 1 $f \
        > /cluster/bluearc/rheMac2/linSpecRep/notInRodent/$chr.out.spec
       /cluster/bin/scripts/extractRepeats 2 $f \
        > /cluster/bluearc/rheMac2/linSpecRep/notInNonRodentMammal/$chr.out.spec
     end
 
 
 # BLASTZ SWAP FOR HUMAN (hg17) (DONE, 2006-03-22, hartera)
 # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS
     # Documented in makeHg17.doc in the section:
     # SWAP CHAIN AND NET ALIGNMENTS OVER TO RHESUS (rheMac2)
     # alignments are in: /cluster/data/rheMac2/bed/blastz.hg17.swap
 
 
 # SWAP CHAINS/NET RN4 (DONE 3/23/06 angie)
     ssh kkstore01
     mkdir /cluster/data/rheMac2/bed/blastz.rn4.swap
     cd /cluster/data/rheMac2/bed/blastz.rn4.swap
     doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.rheMac2/DEF \
       >& do.log & tail -f do.log
     ln -s blastz.rn4.swap /cluster/data/rheMac2/bed/blastz.rn4
 
 
 ###########################################################################
 # 5 way multiz WITH RheMac2 (robert, 5/3/06) -- MM8, RN4, CANFAM2, HG18
 # rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
 
     # first build 5-way multiz alignment from syntenic nets (helps reduce
     # false positive predictions due to paralogous alignments)
     # (prepare mafNet files from syntenic nets and copy to
     # /san/sanvol1/scratch/rheMac2/mafNetSyn; do this for mm8, rn4, canFam2, rheMac2
 
 ssh kkstore02
 cd /cluster/data/rheMac2/bed
 mkdir exoniphy
 cd exoniphy
     cat > makeSynMaf.sh << 'EOF'
 #!/bin/bash
 fromdb=$1
 todb=$2
 DIR=$3
 outdir=$4/$2
 
 base=`pwd`
 cut -f1 /cluster/data/$fromdb/chrom.sizes > chrom.lst
 echo making syntenic mafs for $todb in $DIR from $base
 cd $DIR
 cd axtChain
 nice netFilter -syn $fromdb.$todb.net.gz > $fromdb.$todb.syn.net
 mkdir -p netSynSplit
 echo splitting nets
 nice netSplit $fromdb.$todb.syn.net netSynSplit
 mkdir -p chain
 echo splitting chains $fromdb.$todb.all.chain.gz
 nice chainSplit chain $fromdb.$todb.all.chain.gz
 echo making mafs
 mkdir -p $outdir
 for i in `cat $base/chrom.lst` ; do echo "netToAxt netSynSplit/$i.net chain/$i.chain /cluster/data/$fromdb/nib /cluster/data/$todb/nib stdout | axtToMaf stdin -tPrefix=$fromdb. -qPrefix=$todb. /cluster/data/$fromdb/chrom.sizes /cluster/data/$todb/chrom.sizes $outdir/$i.maf " ; nice netToAxt netSynSplit/$i.net chain/$i.chain /cluster/data/$fromdb/nib /cluster/data/$todb/nib stdout | axtToMaf stdin -tPrefix=$fromdb. -qPrefix=$todb. /cluster/data/$fromdb/chrom.sizes /cluster/data/$todb/chrom.sizes $outdir/$i.maf ; done
 echo removing temporary files
 rm -rf axtChain/chain
 rm -rf axtChain/netSynSplit/
 # makes syntenic net from net 
 #   with $1 as reference and $2 as query 
 #   uses blastz alignment in $3 then converts to maf written to $4
 # sample usage
 # makeSynMaf.sh rheMac2 canFam2 /cluster/data/rheMac2/bed/blastz.canFam2 /san/sanvol1/scratch/rheMac2/mafNetSyn
 'EOF'
 # << for emacs
 
 makeSynMaf.sh rheMac2 canFam2 /cluster/data/rheMac2/bed/blastz.canFam2 /san/sanvol1/scratch/rheMac2/mafNetSyn
 makeSynMaf.sh rheMac2 hg18 /cluster/data/rheMac2/bed/blastz.hg18 /san/sanvol1/scratch/rheMac2/mafNetSyn
 makeSynMaf.sh rheMac2 mm8 /cluster/data/rheMac2/bed/blastz.mm8 /san/sanvol1/scratch/rheMac2/mafNetSyn
 makeSynMaf.sh rheMac2 rn4 /cluster/data/rheMac2/bed/blastz.rn4 /san/sanvol1/scratch/rheMac2/mafNetSyn
 
     # make output dir and run dir
     ssh pk
     mkdir -p /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2
     cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2
 
     mkdir -p mafSyn runSyn
     cd runSyn
 
     # create scripts to run multiz on cluster
 cat > autoMultiz.csh << 'EOF'
 #!/bin/csh -ef
 
     set path=(/cluster/bin/penn/multiz.v11.x86_64/ $path )
     set db = rheMac2
     set c = $1
     set maf = $2
     set run = /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/
     set tmp = /scratch/tmp/$db/multiz.syn.$c
     set pairs = /san/sanvol1/scratch/$db/mafNetSyn
     rm -fr $tmp
     mkdir -p $tmp
     cp ../{tree.nh,species.lst} $tmp
     pushd $tmp
     foreach s (`cat species.lst`)
         set in = $pairs/$s/$c.maf
         set out = $db.$s.sing.maf
         if ($s == rheMac2) then
             continue
         endif
         if (-e $in.gz) then
             zcat $in.gz > $out
         else if (-e $in) then
             cp $in $out
         else
             echo "##maf version=1 scoring=autoMZ" > $out
         endif
     end
     set path = ($run $path); rehash
     $run/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
     popd
     cp $tmp/$c.maf $maf
     rm -fr $tmp
 'EOF'
 # << for emacs
     chmod +x autoMultiz.csh
 
 cat > gsub << 'EOF'
 #LOOP
 ./autoMultiz.csh $(root1) {check out line+ /cluster/data/rheMac2/bed/multiz.rheMac2Hg18Mm8Rn4CanFam2/mafSyn/$(root1).maf}
 #ENDLOOP
 'EOF'
 # << for emacs
     cut -f 1 /cluster/data/rheMac2/chrom.sizes > chrom.lst
     set path = (/parasol/bin $path);rehash
     gensub2 chrom.lst single gsub jobList
     para create jobList
         # 46 jobs
     para try; para check
     para push
 
     # Load into database
     ssh hgwdev
     cd /cluster/data/rheMac2/bed/multiz5waySyn/mafSyn
     mkdir -p /gbdb/rheMac2/multiz5waySyn/mafSyn
     ln -s /cluster/data/rheMac2/bed/multiz5waySyn/mafSyn/*.maf \
         /gbdb/rheMac2/multiz5waySyn/maf
 cat > loadMaf.csh << 'EOF'
     time hgLoadMaf -pathPrefix=/gbdb/rheMac2/multiz5waySyn/maf rheMac2 multiz5waySyn
     cat *.maf | \
         nice hgLoadMafSummary rheMac2 -minSize=30000 -mergeGap=1500 -maxSize=200000  multiz5waySummary stdin
 'EOF'
 #<< happy emacs
 
     # Dropped unused indexes (2006-05-09 kate)
     # NOTE: this is not required in the future, as the loader
     # has been fixed to not generate these indexes
     hgsql rheMac2 -e "alter table multiz5waySummary drop index chrom_2"
     hgsql rheMac2 -e "alter table multiz5waySummary drop index chrom_3"
 
     # expect lengthy load time for this -- a few hours ?
     csh loadMaf.csh >&! loadMaf.log &
 #Loading multiz5waySyn into database
 #Loaded 6207706 mafs in 21 files from /gbdb/rheMac2/multiz5waySyn/maf
 #Advisory lock created
 #Advisory lock has been released
 #455.360u 156.210s 13:51.54 73.5%        0+0k 0+0io 237pf+0w
 #Indexing and tabulating stdin
 #Created 795772 summary blocks from 15566976 components and 6207706 mafs from stdin
 #Loading into rheMac2 table multiz5waySummary...
 #Loading completeAdvisory lock has been released
 
         #drop index to improve performance
     hgsql rheMac2 -e " alter table multiz5waySummary drop index chrom_3"
 
     # build chromosome-by-chromosome SS files
 
     cd /cluster/data/rheMac2/bed/multiz.rheMac2Hg18Mm8Rn4CanFam2
     mkdir run-ss-syn
     cd run-ss-syn
     mkdir -p /san/sanvol1/scratch/rheMac2/multiz.rheMac2Hg18Mm8Rn4CanFam2/ssSyn
     cat > makeSS.csh << 'EOF'
 #!/bin/csh -fe
 set c = $1
 /cluster/bin/phast/msa_view -i MAF -o SS /cluster/data/rheMac2/bed/multiz.rheMac2Hg18Mm8Rn4CanFam2/mafSyn/$c.maf --refseq /cluster/bluearc/rheMac2/chrom/$c.fa | gzip -c >  /san/sanvol1/scratch/rheMac2/multiz.rheMac2Hg18Mm8Rn4CanFam2/ssSyn/$c.ss.gz
 'EOF'
 # << for emacs
     chmod +x makeSS.csh
 
     foreach chr (`cut -f 1 /cluster/data/rheMac2/chrom.sizes`)
 	echo "makeSS.csh $chr" >> jobList.ss
     end
 
     para create jobList.ss
         # 20 jobs
     para try; para check
     para push
 
     # gzip mafs for downloads area
     ssh kkstore01
     cd cluster/data/hg17/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/mafDownloads
 cat > downloads.csh << 'EOF'
     date
     foreach f (../mafSyn/chr*.maf)
 	set c = $f:t:r
         echo $c
 	nice gzip -c $f > $c.maf.gz
     end
     md5sum *.gz > md5sum.txt
     date
 'EOF'
     time csh downloads.csh >&! downloads.log
         # ~2 hours
     ssh hgwdev
     set dir = /usr/local/apache/htdocs/goldenPath/rheMac2/multiz5waySyn
     mkdir $dir
     ln -s /cluster/data/rheMac2/bed/multiz5waySyn/mafDownloads/{*.gz,md5sum.txt} $dir
     cp README.txt $dir
 
     #make multi5wayFrames table for browser track
     ssh kkstore01
     cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/
     set sanDir = /san/sanvol1/scratch/rheMac2/multiz5waySyn/frames
     mkdir -p $sanDir/maf
     cp -rp maf/* $sanDir/maf
     mkdir frames
     cd frames
     cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
     cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .
     #edit Makefile to correct species names and set and sanDir
 
     ssh hgwdev
     cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/frames
     make getGenes >&! getGenes.log &
         # ~1 minute
     make getFrames >&! getFrames.log &
     # NOTE: if jobs get hung up (e.g. running for hours, when
     #   they should run for minutes, do 'para stop' so that
     #   the 'para make' can restart the job
     make loadDb >&! loadDb.log &
 
 
 #trackDb entry
 #track multiz5waySyn
 #shortLabel Conservation 
 #longLabel Human/Mouse/Rat/Dog Multiz Alignment & Conservation using syntenic alignments
 #group compGeno
 #priority 104.1
 #visibility pack
 #color 0, 10, 100
 #altColor 0,90,10
 #type wigMaf 0.0 1.0
 #maxHeightPixels 100:40:11
 #wiggle phastCons5way
 #pairwiseHeight 12
 #spanList 1
 #yLineOnOff Off
 #autoScale Off
 #summary multiz5waySummary
 #frames multiz5wayFrames
 #irows on
 #speciesGroups mammal 
 #sGroup_mammal hg18 mm8 rn4 canFam2 
 #speciesDefaultOff 
 
     # request push to hgwdev
 
     ###
     # rebuild frames to get bug fix, using 1-pass maf methodology
     # (2006-06-09 markd)
     ssh kkstore01
     cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/frames
     mv mafFrames/ mafFrames.old
     nice tcsh # easy way to get process niced
     (cat  ../mafSyn/*.maf | time genePredToMafFrames rheMac2 stdin stdout canFam2 genes/canFam2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz | gzip >multiz5wayFrames.mafFrames.gz)>&log
     ssh hgwdev
     cd /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/frames
     hgLoadMafFrames rheMac2 multiz5wayFrames multiz5wayFrames.mafFrames.gz >&log
     
 
 ###############################################################
 # PHASTCONS CONSERVATION (NOT DONE yet, 2006-05-06 Robert)
 # coverage is only 3% and needs more tweaking
 # These parameters are:
 # --rho
 # --expected-length
 # --target-coverage
 # Also, instead of generating cons and noncons tree models,
 # we use a single, pre-existing tree model -- Elliot Margulies' model 
 # from the (37-way) ENCODE alignments.
 
     # NOTE: reusing cluster-friendly chrom fasta files created earlier
 
     ssh kkstore01    
     mkdir /cluster/bluearc/rheMac2/chrom
     cd /cluster/data/rheMac2
     foreach f (`cat chrom.lst`)
     echo $f
     cp $f/*.fa /cluster/bluearc/rheMac2/chrom
     end
 
     # Split chromosome MAF's into windows and use to generate
     # "sufficient statistics" (ss) files for phastCons input
     # NOTE: as the SAN fs has lotsa space, we're leaving these
     # big (temp) files unzipped, to save time during phastCons run.
     # Note also the larger chunk sizes from previous runs -- this
     # reduces run-time on the split, slows down the actual phastCons
     # enough so jobs don't crash (jobs are very quick, just a minute
     # or so), and according to Adam, will produce better results.
     # The previous small chunks were probably required by
     # the phyloFit step, which we are no longer using for the
     # human alignments.
     ssh pk
     mkdir /cluster/data/rheMac2/bed/multiz5waySyn/cons
     cd /cluster/data/rheMac2/bed/multiz5waySyn/cons
     cp /cluster/store5/gs.18/build35/bed/multiz17way.2005-12-20/cons/elliotsEncode.mod .
     # edit, change to hg18,  mm8, and rn4.
     mkdir run.split
     cd run.split
     set WINDOWS = /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss
     rm -fr $WINDOWS
     mkdir -p $WINDOWS
 
     cat << 'EOF' > doSplit.csh
 #!/bin/csh -ef
     set MAFS = /cluster/data/rheMac2/bed/multiz5waySyn/mafSyn
     set WINDOWS = /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss
     cd $WINDOWS
     set c = $1
     echo $c
     rm -fr $c
     mkdir $c
     /cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
         -M /cluster/bluearc/rheMac2/chrom/$c.fa \
         -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
     echo "Done" >> $c.done
 'EOF'
 # << happy emacs
     chmod +x doSplit.csh
 
 rm -f jobList
 foreach f (../../mafSyn/*.maf) 
 set c = $f:t:r
 echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
 end
     
     para create jobList
         # 49 jobs
     para try
     para check
     para push
 
    # estimate model parameters
     # estimate avg. cross-species avg. GC content from chr1 maf's
 
     ssh kolossus
     set path = ($path /cluster/bin/phast); rehash
     cd /cluster/data/rheMac2/bed/multiz5waySyn/cons
     msa_view --aggregate rheMac2,hg18,rn4,mm8,canFam2 \
         -i MAF \
         --summary-only /cluster/data/rheMac2/bed/multiz.syn.rheMac2Hg18Mm8Rn4CanFam2/mafSyn/chr1.maf \
            > maf_summary.txt
     awk '$1 == "[aggregate]" {printf "%0.3f\n", $3 + $4}' maf_summary.txt
     # 0.424
    
     # Be sure to substitute in the right G+C content. Also, notice the
     # target coverage of 0.17.  We actually want 5% coverage here but
     # the final (posterior) coverage is only indirectly related to the
     # expected (prior) coverage.  One thing to consider is that we
     # only have about 40% alignment coverage (excluding chimp, which
     # doesn't help us much in identifying conserved regions).  As far
     # as phastCons is concerned, we want to aim for about 0.05 / 0.4 =
     # 0.125 coverage.  In this case, though, --target-coverage
     # 0.125 resulted in only about 4.1% coverage.  I had to iterate
     # a couple of times (using only chromosome 1) to find a value that
     # got me close to the target of 5%
 
    ssh pk
    mkdir -p /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.elements
    cd /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.elements
    mkdir -p TREES
    mkdir -p LOG
    ls /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss/chr*/*ss > all.windows
    for f in `cat all.windows` ; do         root=`basename $f .ss.gz` ;        echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;    done
    para create jobs.lst
    para push
 
     # check tree model on 5MB chunk, using params recommended by Adam,
     # he ok'ed the results -- not necessary for next human run
     ssh kolossus
     cd /cluster/data/rheMac2/bed/multiz5waySyn/cons
     /cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
         --tree "`cat ../tree-commas.nh`" \
         /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/ss/chr7/chr7.110000001-120000000.ss \
         -o phyloFit.tree
 
     # Run phastCons
     #	This job is I/O intensive in its output files, thus it is all
     #	working over in /scratch/tmp/
     # cd ..
     mkdir run.cons
     cd run.cons
     cat > doPhast.csh << 'EOF'
 #!/bin/csh -fe
 set c = $1
 set f = $2
 set len = $3
 set cov = $4
 set rho = $5
 set tmp = /scratch/tmp/$f
 mkdir -p $tmp
 set san = /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons
 cp -p $san/ss/$c/$f.ss ../elliotsEncode.mod $tmp
 pushd $tmp > /dev/null
 /cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncode.mod \
 --rho $rho --expected-length $len --target-coverage $cov --quiet \
 --not-informative panTro1,rheMac2 \
 --seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
 popd > /dev/null
 mkdir -p $san/pp/$c $san/bed/$c
 sleep 1
 mv $tmp/$f.pp $san/pp/$c
 mv $tmp/$f.bed $san/bed/$c
 rm -fr $tmp
 'EOF'
     # emacs happy
     chmod a+x doPhast.csh
 
     #	root1 == chrom name, file1 == ss file name without .ss suffix
     # Create gsub file
     cat > template << 'EOF'
 #LOOP
 doPhast.csh $(root1) $(file1) 14 .008 .28
 #ENDLOOP
 'EOF'
     #	happy emacs
 
     # Create parasol batch and run it
     pushd /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons
     # mkdir /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.cons
     ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
         /cluster/data/rheMac2/bed/multiz5waySyn/cons/run.cons/in.list
 
     ssh pk
     cd /cluster/store11/gs.19/build36/bed/multiz5waySyn/cons/run.cons
 
     gensub2 in.list single template jobList
     para create jobList
         # 295 jobs
     para try
     para check
     para push
 #Completed: 295 of 295 jobs
 #CPU time in finished jobs:       9160s     152.67m     2.54h    0.11d  0.000 y
 #IO & Wait Time:                  2969s      49.48m     0.82h    0.03d  0.000 y
 #Average job time:                  41s       0.69m     0.01h    0.00d
 #Longest running job:                0s       0.00m     0.00h    0.00d
 #Longest finished job:              78s       1.30m     0.02h    0.00d
 #Submission to last job:           276s       4.60m     0.08h    0.00d
 
     # create Most Conserved track
     ssh kolossus
     cd /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons
     #	The sed's and the sort get the file names in chrom,start order
     # (Hiram tricks -- split into columns on [.-/] with 
     #    identifying x,y,z, to allow column sorting and
     #    restoring the filename.  Warning: the sort column
     # will depend on how deep you are in the dir
     find ./bed -name "chr*.bed" | \
         sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
 	sort -k7,7 -k9,9n | \
 	sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
 	xargs cat | \
 	awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
 	/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
     #	~ 1 minute
     cp -p mostConserved.bed /cluster/data/rheMac2/bed/multiz5waySyn/cons
 
     # load into database
     ssh hgwdev
     cd /cluster/data/rheMac2/bed/multiz5waySyn/cons
     hgLoadBed -strict rheMac2 phastConsElements5way mostConserved.bed
      #Loaded 602351 elements of size 5
     # compare with previous tracks
     hgsql rheMac2 -e "select count(*) from phastConsElements5way"
         # 2260575
     # hgsql hg18 -e "select count(*) from phastConsElements"
     # hg18 does not have phastConsElements table
         # 1601903
     # Try for 5% overall cov, and 70% CDS cov (used elen=13, tcov=.007, rho=.27)
     featureBits hg18 -enrichment refGene:cds phastConsElements17way
     # refGene:cds 1.072%, phastConsElements17way 5.510%, both 0.759%, cover 70.83%, enrich 12.86x
     featureBits hg17 -enrichment refGene:cds phastConsElements17way
     # refGene:cds 1.064%, phastConsElements17way 5.104%, both 0.748%, cover 70.29%, enrich 13.77x
 
     # compare with previous tracks
     featureBits hg18 -enrichment refGene:cds phastConsElements10way
         # refGene:cds 1.062%, phastConsElements10way 5.003%, both 0.734%, cover 69.18%, enrich 13.83x
     featureBits hg18 -enrichment refGene:cds phastConsElements
         # refGene:cds 1.062%, phastConsElements 4.810%, both 0.771%, cover 72.65%, enrich 15.11x
 
     # Create merged posterier probability file and wiggle track data files
     #	pk is currently closer to the san than any other machine
     ssh pk
     cd /san/sanvol1/scratch/rheMac2/multiz5waySyn/cons/
     # sort by chromName, chromStart so that items are in numerical order 
     #  for wigEncode
     find ./pp -name "chr*.pp" | \
         sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
 	sort -k7,7 -k9,9n | \
 	sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
 	xargs cat | \
         nice wigEncode stdin phastCons5way.wig phastCons5way.wib
     # about 23 minutes for above
 
     cp -p phastCons17way.wi? /cluster/data/hg18/bed/multiz17way/cons
 
     # Load gbdb and database with wiggle.
     ssh hgwdev
     cd /cluster/data/hg18/bed/multiz17way/cons
     ln -s `pwd`/phastCons17way.wib /gbdb/hg18/multiz17way/phastCons17way.wib
     hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz17way hg18 \
         phastCons17way phastCons17way.wig
     #  ~ 3 minute load
 
     # Downloads  (2006-02-22 Fan)
     ssh hgwdev
     cd /cluster/data/hg18/bed/multiz17way
     mkdir mafDownloads
     cd mafDownloads
     # upstream mafs (mafFrags takes a while)
 cat > mafFrags.csh << 'EOF'
     date
     foreach i (1000 2000 5000)
         echo "making upstream$i.maf"
         nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad
         awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
         rm up.bad
         nice mafFrags hg18 multiz17way up.bed upstream$i.maf \
                 -orgs=../species.lst
         rm up.bed
     end
     date
 'EOF'
 
     time csh mafFrags.csh > mafFrags.log 
     nice gzip up*.maf
 
     ssh kkstore02
     cd /cluster/data/hg18/bed/multiz17way/mafDownloads
 cat > downloads.csh << 'EOF'
     date
     foreach f (../maf/chr*.maf)
 	set c = $f:t:r
         echo $c
 	nice gzip -c $f > $c.maf.gz
     end
     md5sum *.gz > md5sum.txt
     date
 'EOF'
     time csh downloads.csh > downloads.log
 
     ssh hgwdev
     set dir = /usr/local/apache/htdocs/goldenPath/hg18/multiz17way
     mkdir $dir
     ln -s /cluster/data/hg18/bed/multiz17way/mafDownloads/{*.gz,md5sum.txt} $dir
 
 
 #####################################################################
 # SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie)
     # File emailed from Xinwei She <xws@u.washington.edu>
     mkdir /cluster/data/rheMac2/bed/genomicSuperDups
     cd /cluster/data/rheMac2/bed/genomicSuperDups
     sed -e 's/\t_\t/\t-\t/' rheMac2_genomicSuperDup.tab \
     | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
     | hgLoadBed rheMac2 genomicSuperDups stdin\
       -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
 
 
 ######
 # ENSEMBL (done  6/20/06 robert)
 wget http://www.sanger.ac.uk/Users/lec/macaque_data/gtf.tar.gz
 wget http://www.sanger.ac.uk/Users/lec/macaque_data/macaque_ensembl_cdnas.fa.gz
 wget -q http://www.sanger.ac.uk/Users/lec/macaque_data/macaque_ensembl_peptides.fa.gz 
 gunzip *.gz
 for i in `ls ensembl_macaque_gtf_data/` ; do echo ${i%%.gtf} ; awk '{print "chr"$0}' ensembl_macaque_gtf_data/$i > chr$i ; done
 rm -f chrscaffold.gtf
 cp ensembl_macaque_gtf_data/scaffold.gtf scaffold.gtf 
 cat chr*.gtf > ensGene.gtf
 grep -v pseudo ensAll.gtf > ensGene.gtf
 ldHgGene rheMac2 ensGene ensGene.gtf -gtf -genePredExt
 Reading ensGene.gtf
 Read 38561 transcripts in 741623 lines in 1 files
   38561 groups 22 seqs 8 sources 4 feature types
 38561 gene predictions
 hgsql rheMac2 < ~/kent/src/hg/lib/ensGtp.sql
 awk '{print $10,$12,$16}' ensGene.gtf |grep ENSMMUP|awk 'NF==3{print $0}'|sed -e 's/\"//g' | sed -e 's/\;//g'|sed -e 's/ /\t/g' |sort -u > ensGtp.tab
 echo "load data local infile 'ensGtp.tab' into table ensGtp ignore 1 lines" | hgsql -N rheMac2
 sed -e 's/_/ /' macaque_ensembl_peptides.fa  > ensPep.fa
 hgPepPred rheMac2  ensembl ensPep.fa
 grep pseudogene ensAll.gtf > ensPseudo.gtf
 grep -v protein_coding ensAll.gtf |grep -v pseudo > ensRna.gtf
 
 ##########################################################################
 # N-SCAN gene predictions (nscanGene) - (2006-08-04 markd)
     cd /cluster/data/rheMac2/bed/nscan/
 
     # obtained NSCAN predictions from michael brent's group
     # at WUSTL
     wget -nv -r -np http://ardor.wustl.edu/jeltje/rheMac2/chr_gtf
     wget -nv -r -np http://ardor.wustl.edu/jeltje/rheMac2/proteins
     # clean up and rename downloaded directorys:
     mv ardor.wustl.edu/jeltje/rheMac2/chr_gtf .
     mv ardor.wustl.edu/jeltje/rheMac2/proteins chr_ptx
     rm -rf ardor.wustl.edu
     rm chr_*/index.html*
     gzip chr_*/*
     chmod a-w chr_*/*.gz
 
     # load tracks.  Note that these have *utr features, rather than
     # exon features.  currently ldHgGene creates separate genePred exons
     # for these.
     ldHgGene -bin -gtf -genePredExt rheMac2 nscanGene chr_gtf/chr*.gtf.gz
     # add .a suffix to match transcript id
 
     # add .a suffix to match transcript id
     hgPepPred -suffix=.a rheMac2 generic nscanPep chr_ptx/chr*.fa.gz
     rm *.tab
 
     # update trackDb; need a rheMac2-specific page to describe informants
     rhesus/rheMac2/nscanGene.html   (copy from panTro2 and edit)
     rhesus/rheMac2/trackDb.ra
 
     # QA found:
      Checking keys on database rheMac2
      rheMac2.nscanPep.name - hits 22192 of 24608
      Error: 2416 of 24608 elements of rheMac2.nscanPep.name are not in key nscanGene.name line 524 of all.joiner
      Example miss: CHR1.5.052.A
      # due to WUStL giving us a mismatched protein file.
 
     # 2006-08-11 markd - obtained new protein sequences
     wget -nv -r -np http://ardor.wustl.edu/jeltje/rheMac2/proteins
     hgPepPred -suffix=.a rheMac2 generic nscanPep proteins/chr*.fa.gz
 
     joinerCheck -keys -database=rheMac2 -identifier=nscanName  ~/compbio/kent/src/hg/makeDb/schema/all.joiner
     Checking keys on database rheMac2
     rheMac2.nscanPep.name - hits 22003 of 22003 ok
 
     # 2006-08-11 markd - N-SCAN track has UTR only predictions, reload discarding
     # these
     ldHgGene -bin -gtf -requireCDS -genePredExt rheMac2 nscanGene chr_gtf/chr*.gtf.gz
 # CPGISLANDS (DONE - 2006-09-12 - Robert)
     ssh hgwdev
     mkdir -p /cluster/data/rheMac2/bed/cpgIsland
     cd /cluster/data/rheMac2/bed/cpgIsland
 
     # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
     cvs co hg3rdParty/cpgIslands
     cd hg3rdParty/cpgIslands
     make
     #	gcc readseq.c cpg_lh.c -o cpglh.exe
     mv cpglh.exe /cluster/data/rheMac2/bed/cpgIsland/
     
     # cpglh.exe requires hard-masked (N) .fa's.  
     # There may be warnings about "bad character" for IUPAC ambiguous 
     # characters like R, S, etc.  Ignore the warnings.  
     ssh kkstore02
     cd /cluster/data/rheMac2/bed/cpgIsland
     foreach f (../../*/chr*.fa.masked)
       set fout=$f:t:r:r.cpg
       echo running cpglh on $f to $fout
       ./cpglh.exe $f > $fout
     end
     # Transform cpglh output to bed +
     cat << '_EOF_' > filter.awk
 /* Input columns: */
 /* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
 /* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
 /* Output columns: */
 /* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
 /* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\to0.94 */
 {
 $2 = $2 - 1;
 width = $3 - $2;
 printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
        $1, $2, $3, $5,$6, width,
        $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
 }
 '_EOF_'
     # << this line makes emacs coloring happy
     awk -f filter.awk chr*.cpg > cpgIsland.bed
 
     ssh hgwdev
     cd /cluster/data/rheMac2/bed/cpgIsland
     hgLoadBed rheMac2 cpgIslandExt -tab -noBin \
       -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
 #Loaded 24226 elements of size 10
 #Sorted
 #Saving bed.tab
 #Loading rheMac2
 
 ##########################################################################
 # Hg 18 Reciprocal Best Nets  (8-13-07, kpollard)
 
 ssh kolossus
 cd /cluster/data/hg18/bed/blastz.rheMac2/axtChain
 mkdir rBestNet
 cp /cluster/data/rheMac2/chrom.sizes /cluster/bluearc/rheMac2/.
 cat > rbest.csh << 'EOF'
     set swapChain = rheMac2ToHg18.swap.chain
     echo "merge and swap $swapChain"
     zcat /cluster/data/rheMac2/bed/liftOver/rheMac2ToHg18.over.chain.gz \ 
          | chainSwap stdin stdout | chainMergeSort stdin > $swapChain
     echo "split chain"
     chainSplit swapChain/ $swapChain
     echo "run netter"
     chainPreNet $swapChain /cluster/bluearc/hg18/chrom.sizes \
          /cluster/bluearc/rheMac2/chrom.sizes stdout | \
          chainNet stdin -minSpace=1 /cluster/bluearc/hg18/chrom.sizes \
          /cluster/bluearc/rheMac2/chrom.sizes stdout /dev/null | \
          netSyntenic stdin rbest.noClass.net
     echo "split net"
     netSplit rbest.noClass.net rBestNet
     echo "extract chains from net"
     netChainSubset -verbose=0 rbest.noClass.net $swapChain stdout | \ 
          chainMergeSort stdin | gzip -c > hg18.rheMac2.rbest.chain.gz
     echo "split extracted chain"
     chainSplit rBestChain/ hg18.rheMac2.rbest.chain.gz 
     cd ..
     mkdir axtRBestNet
     echo "extract axts"
     foreach f (axtChain/rBestNet/*.net)
         echo $f:t:r
         netToAxt $f axtChain/swapChain/$f:t:r.chain \
             /san/sanvol1/scratch/hg18/hg18.2bit \
             /cluster/bluearc/rheMac2/rheMac2.2bit stdout | \
             axtSort stdin stdout | gzip -c \
             > axtRBestNet/$f:t:r.hg18.rheMac2.net.axt.gz
     end
 'EOF'
 csh rbest.csh >&! rbest.log &
 #Not yet loaded
 
 ############################################################################
 ##  BLASTZ swap from monDom4 alignments (2007-11-11 - markd)
     ssh hgwdev
     mkdir /cluster/data/rheMac2/bed/blastz.monDom4.swap
     cd /cluster/data/rheMac2/bed/blastz.monDom4.swap
     ln -s blastz.monDom4.swap ../blastz.monDom4
     /cluster/bin/scripts/doBlastzChainNet.pl \
         -swap /cluster/data/monDom4/bed/blastz.rheMac2/DEF >& swap.out&
 
     # fb.rheMac2.chainMonDom4Link.txt:
     # 611785347 bases of 2646704109 (23.115%) in intersection
 
 #########################################################################
 # Blastz Marmoset calJac1 (DONE - 2007-11-15 - Hiram)
     ssh kkstore01
     screen # use screen to control this job
     mkdir /cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16
     cd /cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16
 
     cat << '_EOF_' > DEF
 # Rhesus vs marmoset
 BLASTZ_M=50
 
 # TARGET: Rhesus RheMac2
 SEQ1_DIR=/scratch/hg/rheMac2/rheMac2.2bit
 SEQ1_LEN=/cluster/data/rheMac2/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Marmoset calJac1
 SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
 SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-chainMinScore=3000 -chainLinearGap=medium \
 	-bigClusterHub=kk > blastz.log 2>&1 &
     #	real    2115m6.616s
     # failed during the load due to MySQL problems on hgwdev
     #	run loadUp.csh manually on hgwdev
     #	real    28m35.926s
     cat fb.rheMac2.chainCalJac1Link.txt
     #	2055107003 bases of 2646704109 (77.648%) in intersection
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-continue=download -chainMinScore=3000 -chainLinearGap=medium \
 	-bigClusterHub=kk > download.log 2>&1 &
 
     mkdir /cluster/data/calJac1/bed/blastz.rheMac2.swap
     cd /cluster/data/calJac1/bed/blastz.rheMac2.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16/DEF \
 	-swap -chainMinScore=3000 -chainLinearGap=medium \
 	-bigClusterHub=kk > swap.log 2>&1 &
     #	real    349m36.073s
     cat fb.calJac1.chainRheMac2Link.txt
     #	2191300051 bases of 2929139385 (74.810%) in intersection
 
 ###########################################################################
 # Blastz Orangutan ponAbe2 (DONE - 2007-11-16 - Hiram)
     ssh kkstore01
     screen # use screen to control this job
     mkdir /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16
     cd /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16
 
     cat << '_EOF_' > DEF
 # Rhesus vs marmoset
 BLASTZ_M=50
 
 # TARGET: Rhesus RheMac2
 SEQ1_DIR=/scratch/hg/rheMac2/rheMac2.2bit
 SEQ1_LEN=/cluster/data/rheMac2/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Orangutan ponAbe2
 SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
 SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-chainMinScore=3000 -chainLinearGap=medium \
 	-bigClusterHub=kk > blastz.log 2>&1 &
     # something got lost during the para make.  A couple of jobs couldn't
     #	finish, due to memory limits.  Finish them off on hgwdev
     #	real    1911m11.906s
 # Completed: 108924 of 108928 jobs
 # Crashed: 4 jobs
 # CPU time in finished jobs:   20658708s  344311.80m  5738.53h  239.11d  0.655 y
 # IO & Wait Time:               1801215s   30020.25m   500.34h   20.85d  0.057 y
 # Average job time:                 206s       3.44m     0.06h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            5401s      90.02m     1.50h    0.06d
 # Submission to last job:        111924s    1865.40m    31.09h    1.30d
     #	continuing after those were done:
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-continue=cat -chainMinScore=3000 -chainLinearGap=medium \
 	-bigClusterHub=kk > cat.log 2>&1 &
 
     # some jobs failed due to memory limits on kk nodes
     #	finished them manually on kolossus and hgwdev, then continuing
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-chainMinScore=3000 -chainLinearGap=medium \
 	-continue=cat -bigClusterHub=kk > cat.log 2>&1 &
     #	real    322m54.734s
     #	failed during load due to MySQL problems on hgwdev
     #	run loadUp.csh manually on hgwdev
     #	real    22m22.225s
     cat fb.rheMac2.chainPonAbe2Link.txt
     #	2333264093 bases of 2646704109 (88.157%) in intersection
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-chainMinScore=3000 -chainLinearGap=medium \
 	-continue=download -bigClusterHub=kk > download.log 2>&1 &
 
     #	and, for the swap
     mkdir /cluster/data/ponAbe2/bed/blastz.rheMac2.swap
     cd /cluster/data/ponAbe2/bed/blastz.rheMac2.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16/DEF \
 	-swap -chainMinScore=3000 -chainLinearGap=medium \
 	-bigClusterHub=kk > swap.log 2>&1 &
     #	real    289m9.479s
     cat fb.ponAbe2.chainRheMac2Link.txt
     #	2556010573 bases of 3093572278 (82.623%) in intersection
 
 ############################################################################
 # SGP GENES (DONE - 2007-12-20 - Hiram)
     ssh kkstore01
     mkdir  /cluster/data/rheMac2/bed/sgp
     cd  /cluster/data/rheMac2/bed/sgp
 
     mkdir gtf
 for C in `awk '{print $1}' /cluster/data/rheMac2/chrom.sizes`
 do
     wget --timestamping \
 "http://genome.imim.es/genepredictions/M.mulatta/rmJan2006/SGP/200603mm8/${C}.gtf" \
         -O "gtf/${C}.gtf"
 done
 
     ssh hgwdev
     cd /cluster/data/rheMac2/bed/sgp
     ldHgGene -gtf -genePredExt rheMac2 sgpGene gtf/chr*.gtf
     # Read 31416 transcripts in 264479 lines in 22 files
     # 31416 groups 22 seqs 1 sources 3 feature types
     # 31416 gene predictions
 
     featureBits rheMac2 sgpGene
     #	32556064 bases of 2646704109 (1.230%) in intersection
 
     featureBits rheMac2 -enrichment refGene:CDS sgpGene
 # refGene:CDS 0.017%, sgpGene 1.230%, both 0.014%, cover 83.53%, enrich 67.91x
 
 #####################################################################
 # LOAD GENEID GENES (DONE - 2007-12-20 - Hiram)
     ssh kkstore01
     mkdir -p /cluster/data/rheMac2/bed/geneid
     cd /cluster/data/rheMac2/bed/geneid
     mkdir gtf prot
 
 for C in `awk '{print $1}' /cluster/data/rheMac2/chrom.sizes`
 do
   echo $C
   wget --timestamping \
 "http://genome.imim.es/genepredictions/M.mulatta/rmJan2006/geneid_v1.2/${C}.gtf" \
 	-O "gtf/${C}.gtf"
   wget --timestamping \
 "http://genome.imim.es/genepredictions/M.mulatta/rmJan2006/geneid_v1.2/${C}.prot" \
 	-O "prot/${C}.prot"
 done
 
     # Add missing .1 to protein id's
     cd prot
     foreach f (*.prot)
       perl -wpe 's/^(>chr\S+)$/$1.1/' $f > $f:r-fixed.prot
     end
 
     ssh hgwdev
     cd /cluster/data/rheMac2/bed/geneid
     ldHgGene -genePredExt -gtf rheMac2 geneid gtf/*.gtf
 # Read 30202 transcripts in 248094 lines in 22 files
 # 30202 groups 22 seqs 1 sources 3 feature types
 # 30202 gene predictions
 
     hgPepPred rheMac2 generic geneidPep prot/chr*-fixed.prot
     featureBits rheMac2 geneid
     #	33978840 bases of 2646704109 (1.284%) in intersection
 
     featureBits rheMac2 -enrichment refGene:CDS geneid
 # refGene:CDS 0.017%, geneid 1.284%, both 0.013%, cover 77.14%, enrich 60.09x
 
 ##########################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
 
 see doc/builds.txt for specific details.
 ############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
 
 see doc/builds.txt for specific details.
 ############################################################################
 
 ################################################
 # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
 update genbank.conf:
 rheMac2.upstreamGeneTbl = refGene
 ############################################################################
 # TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
 
 see doc/builds.txt for specific details.
 ############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
 
 see doc/builds.txt for specific details.
+
 ############################################################################
+# calJac3 Marmoset BLASTZ/CHAIN/NET (DONE - 2010-02-11 - Hiram)
+    screen # use a screen to manage this multi-day job
+    mkdir /hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11
+    cd /hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11
+
+    #	same kind of parameters as used in human<->marmoset
+    cat << '_EOF_' > DEF
+# Rhesus vs. marmoset
+BLASTZ=lastz
+# maximum M allowed with lastz is only 254
+BLASTZ_M=254
+BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
+# and place those items here
+BLASTZ_O=600
+BLASTZ_E=150
+BLASTZ_K=4500
+BLASTZ_Y=15000
+BLASTZ_T=2
+
+# TARGET: Rhesus RheMac2
+SEQ1_DIR=/scratch/data/rheMac2/rheMac2.2bit
+SEQ1_LEN=/scratch/data/rheMac2/chrom.sizes
+SEQ1_CHUNK=20000000
+SEQ1_LAP=10000
+SEQ1_LIMIT=5
+
+# QUERY: Marmoset (calJac3)
+SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
+SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
+SEQ2_LIMIT=50
+SEQ2_CHUNK=10000000
+SEQ2_LAP=0
+
+BASE=/hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11
+TMPDIR=/scratch/tmp
+'_EOF_'
+    # << this line keeps emacs coloring happy
+
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	`pwd`/DEF \
+	-syntenicNet \
+	-chainMinScore=5000 -chainLinearGap=medium \
+	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+	> do.log 2>&1 &
+    #	real    248m5.651s
+    cat fb.rheMac2.chainCalJac3Link.txt 
+    #	1871513554 bases of 2646704109 (70.711%) in intersection
+
+    mkdir /hive/data/genomes/calJac3/bed/blastz.rheMac2.swap
+    cd /hive/data/genomes/calJac3/bed/blastz.rheMac2.swap
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	/hive/data/genomes/rheMac2/bed/lastzCalJac3.2010-02-11/DEF \
+	-swap -syntenicNet \
+	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+	-chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
+    #	real    142m34.894s
+    cat fb.calJac3.chainHg19Link.txt 
+    #	1916431926 bases of 2752505800 (69.625%) in intersection
+############################################################################
+