src/hg/makeDb/doc/hg17.txt 1.120
1.120 2009/03/11 18:31:05 angie
Updated DGV (v7).
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on
# NCBI build 35 (May 2004 freeze)
# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES
# ---------------------------------------
# Make gs.18 directory, gs.18/build35 directory, and gs.18/ffa directory.
ssh eieio
mkdir /cluster/store5/gs.18
mkdir /cluster/store5/gs.18/build35
mkdir /cluster/store5/gs.18/agp
mkdir /cluster/store5/gs.18/ffa
# Make a symbolic link from /cluster/store1 to this location
# (I assume there is some use for this later ?)
cd /cluster/store1
ln -s /cluster/store5/gs.18 ./gs.18
ln -s /cluster/store5/gs.18/build35 /cluster/data/hg17
# Make a symbolic link from your home directory to the build dir:
# (Investigate what this is used for, may no longer be necessary)
ln -s /cluster/store5/gs.18/build35 ~/oo
# NCBI download site, fetch everything into this one directory:
# with the machine and password in your $HOME/.netrc file, this
# wget command will require no login.  Make sure your $HOME/.netrc
# file is mode 600 ('chmod 600 ~/.netrc') so that no one else can
# read the credentials.  (There were some early files that later
# moved into an OLD subdirectory.  They were broken.)
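# For reference, a $HOME/.netrc entry for this fetch looks like the
# following (standard .netrc syntax; the login/password shown are
# placeholders, not the real credentials):
#   machine ftp.ncbi.nlm.nih.gov
#   login anonymous
#   password your_email@example.com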
mkdir /cluster/store5/gs.18/ncbi
cd /cluster/store5/gs.18/ncbi
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/build_35/*
# FYI: agp file format documented at:
# http://www.ncbi.nlm.nih.gov/Genbank/WGS.agpformat.html
# Fixup a couple of names for our own purposes here
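# For orientation, each AGP line has nine tab-separated columns:
# object, object_beg, object_end, part_number, component_type,
# component_id, component_beg, component_end, orientation.
# A made-up component line for illustration (not from this assembly):
#   chr1  1  36731  1  F  AC000001.1  1  36731  +
# component_type N marks a gap line rather than a clone.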
cd /cluster/store5/gs.18/agp
ln -s ../ncbi/chr*.agp ../ncbi/chr*.fa.gz .
sed -e "s#MT/NC_001807.4#NC_001807#" ../ncbi/chrMT.agp > chrM.agp
sed -e "s/NG_002392.2/NG_002392/" ../ncbi/DR52.agp > chr6_hla_hap1.agp
sed -e "s/NG_002433.1/NG_002433/" ../ncbi/DR53.agp > chr6_hla_hap2.agp
zcat ../ncbi/DR52.fa.gz | \
sed -e "s/gi|29124352|ref|NG_002392.2/ref|NG_002392/" | \
gzip > chr6_hla_hap1.fa.gz
zcat ../ncbi/DR53.fa.gz | \
sed -e "s/gi|28212470|ref|NG_002433.1/ref|NG_002433/" | \
gzip > chr6_hla_hap2.fa.gz
zcat ../ncbi/chrMT.fa.gz | \
sed -e "s/gi|17981852|ref|NC_001807.4/ref|NC_001807/" | \
gzip > chrM.fa.gz
# Put all the agp files together into one.
cd /cluster/store5/gs.18/build35
# The chrM sequence now has its own agp, remove it from
# ref_placed.agp
sed -e "/^NC_001807/d" ../ncbi/ref_placed.agp > ref_placed.agp
cat ref_placed.agp ../agp/chrM.agp ../ncbi/ref_unplaced.agp \
../agp/chr6_hla_hap1.agp ../agp/chr6_hla_hap2.agp \
../ncbi/PAR.agp > ncbi_build35.agp
# and into ffa
cd /cluster/store5/gs.18/ffa
# There is a single bogus line at the end of ref_placed.fa.gz
# declaring the NC_001807 MT sequence; it was later replaced by
# chrMT.fa.gz, so remove that one line:
zcat ../ncbi/ref_placed.fa.gz | sed -e "/^>ref|NC_001807/d" | \
gzip > ref_placed.fa.gz
# (That's a 40 minute job)
# sequence.inf is usually here, symlink it
ln -s ../ncbi/sequence.inf
# put all the fa.gz files together in one big fa.gz
time zcat ref_placed.fa.gz ../agp/chrM.fa.gz ../ncbi/ref_unplaced.fa.gz \
../agp/chr6_hla_hap?.fa.gz ../ncbi/PAR.fa.gz | gzip \
> ncbi_build35.fa.gz
# real 37m42.208s
# user 37m3.490s
# sys 0m31.430s
# Make a listing of all the fasta record headers, just FYI:
cd /cluster/store5/gs.18/ffa
zcat ncbi_build35.fa.gz | grep "^>" > ncbi.fa.headers
# New to this build is the sequence NC_001807, which is the
# mitochondrial sequence.  The NC_ prefix is new to this process
# and will have to be accounted for below.  The other two special
# prefixes are similar to what was seen before:
# from DR52.agp NG_002392
# Homo sapiens major histocompatibility complex, class II,
# DR52 haplotype (DR52) on chromosome 6
# and from DR53.agp NG_002433
# Homo sapiens major histocompatibility complex, class II,
# DR53 haplotype (DR53) on chromosome 6
# Fixup seq_contig.md
#
# It has a bunch of stuff belonging to the Celera
# genome assembly. Filter those out. I don't know what the
# NT_07959[0-7] items are, but there are no definitions for them
# in the agp files and no sequence in any fa.gz file.
# Fixup the names for the NG_ items, and change chrom MT to be M
cd /cluster/store5/gs.18/build35
egrep -v "Celera|NT_07959[0-7]" ../ncbi/seq_contig.md | \
sed -e "s/6|NG_002392/6_hla_hap1/" \
-e "s/6|NG_002433/6_hla_hap2/" \
-e "s/^9606\tMT|NC_001807/9606\tM/" \
> temp_contig.md
# Get the randoms sorted in proper order.  createNcbiLifts
# does not work correctly if the randoms are not grouped together
# by chromosome.
grep -v "|" temp_contig.md > seq_contig.md
# This pulls out all the randoms and groups them within the
# same chrom, leaving them in the same order as they originally
# were (warning: this is BASH code ...)
grep "|" temp_contig.md | awk -F"|" '{print $1}' | \
awk '{print $2}' | sort -n -u | while read CHR
do
grep "[^0-9]${CHR}|" temp_contig.md
done >> seq_contig.md
# Sanity check, checkYbr was updated to handle the NC_ identifier
time zcat ../ffa/ncbi_build35.fa.gz | $HOME/bin/i386/checkYbr \
ncbi_build35.agp stdin seq_contig.md > check.seq_contig
# real 2m34.143s
# user 2m24.970s
# sys 0m8.900s
# result should be clean:
cat check.seq_contig
# Read 380 contigs from ncbi_build35.agp
# Verifying sequence sizes in stdin
# 0 problems detected
# Convert fa files into UCSC style fa files and place in "contigs"
# directory inside the gs.18/build35 directory
# (a check that can be done here is make a list of the contigs
# in this ./contigs directory before and compare it with the
# list of distributed contigs created after they have been
# disbursed.)
# faNcbiToUcsc was fixed to handle the NC_ identifier
cd /cluster/store5/gs.18/build35
# We've been through this often
mv contigs contigs.0
time zcat ../ffa/ncbi_build35.fa.gz | $HOME/bin/i386/faNcbiToUcsc \
-split -ntLast stdin contigs
# real 5m10.938s
# user 2m20.070s
# sys 0m51.020s
# If you want to compare anything to previous work, check now, then:
rm -fr contigs.0
# Determine the chromosome sizes from agps
# Watch carefully how chrY gets constructed. I'm not sure
# this chrom_sizes represents the whole length of chrY with
# the PAR added. We will see about that.
# Script updated to handle new chrom names:
# my @chroms = (1 .. 22, 'X', 'Y', 'M', '6_hla_hap1', '6_hla_hap2');
cd /cluster/store5/gs.18/build35
/cluster/bin/scripts/getChromSizes ../agp
# Create chrom.lst list for use in foreach() loops
awk '{print $1}' chrom_sizes | sed -e "s/chr//" > chrom.lst
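# chrom.lst now holds one bare name per line (no "chr" prefix),
# e.g. 1 .. 22, X, Y, M, 6_hla_hap1, 6_hla_hap2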
# Create lift files (this will create chromosome directory structure) and
# inserts file
/cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md .
# Create contig agp files (will create contig directory structure)
/cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build35.agp .
# Create chromosome random agp files.
/cluster/bin/scripts/createNcbiChrAgp -randomonly .
# Copy the original chrN.agp files from the gs.18/agp directory
# into each of the chromosome directories since they contain better
# gap information. Delete the comments at top from these.
cd /cluster/store5/gs.18/build35
foreach c ( `cat chrom.lst` )
sed -e "/^#.*/d" ../agp/chr${c}.agp > ./${c}/chr${c}.agp
end
# chrM needs a name fixup
sed -e "s#NC_001807#chrM#" ../agp/chrM.agp > M/chrM.agp
# Distribute contig .fa to appropriate directory (assumes all files
# are in "contigs" directory).
# Create inserts file from agp and lift files (new - added by Terry, 2004-07-12)
/cluster/bin/scripts/createInserts /cluster/data/hg17 > /cluster/data/hg17/inserts
# create global data link for everyone. No more home directory
# links required.
ln -s /cluster/store5/gs.18/build35 /cluster/data/hg17
cd /cluster/data/hg17
/cluster/bin/scripts/distNcbiCtgFa contigs .
# Verify that everything was moved properly, the contigs directory
# should be empty:
ls contigs
# Nothing there, then remove it
rmdir contigs
# Make a list of the contigs for use later
rm -f contig.lst
touch contig.lst
foreach chrom ( `cat chrom.lst` )
foreach c ( $chrom/N{C,G,T}_?????? )
set contig = $c:t
echo "${chrom}/${contig}/${contig}.fa" >> contig.lst
end
end
# For later comparisons, this is how many contigs we have:
wc -l contig.lst
# 380
# Note 2004-06-30 - there are some clone numbers left in some of
# the NCBI files that are incorrect. Due to version number
# changes, more than one version is listed. Namely for accession
# numbers: AC004491 AC004921 AC004983 AC005088 AC006014 AC099654
# The AGP files are correct, the sequence.inf file lists these
# twice: AC004491.1 AC004491.2
# AC004921.1 AC004921.2 AC004983.2 AC004983.3
# AC005088.2 AC005088.3 AC006014.2 AC006014.3
# AC099654.4 AC099654.5
# FILES ARE NOW READY FOR REPEAT MASKING - start that process as
# other steps here can proceed in parallel.
# Previous practice used to copy everything over for jkStuff from a
# previous build. Rather than do that, pick up whatever is needed
# at the time it is needed and verify that it is going to do what
# you expect.
cd /cluster/data/hg17
mkdir jkStuff
# Create the contig.gl files - XXX - NCBI doesn't deliver
# contig_overlaps.agp - 2004-06-18 - this is beginning to come
# together and there is now a contig_overlaps.agp file
# This is properly done below with a combination of psLayout
# alignments to create the contig_overlaps.agp file
# /cluster/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md
# Create chromosome gl files
# jkStuff/liftGl.csh contig.gl
# CREATING DATABASE (DONE - 2004-05-20 - Hiram)
# RE-DONE for new NIBS - 2004-06-03
ssh hgwdev
# Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
# Filesystem Size Used Avail Use% Mounted on
# /dev/sdc1 1.8T 303G 1.4T 19% /var/lib/mysql
# Create the database.
hgsql -e 'create database hg17' mysql
# Copy over grp table (for track grouping) from another database:
hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" hg17
# ENCODE groups
# Added 2005-08-16 kate
echo 'UPDATE grp SET priority=7 WHERE name="varRep"'| hgsql hg17
echo 'UPDATE grp SET priority=8 WHERE name="encode"'| hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeGenes", "ENCODE Regions and Genes", 8.1)' | hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeTxLevels", "ENCODE Transcript Levels", 8.2)' | hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChip", "ENCODE Chromatin Immunoprecipitation", 8.3)' | hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChrom", "ENCODE Chromosome, Chromatin and DNA Structure", 8.4)' | hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeCompGeno", "ENCODE Comparative Genomics", 8.5)' | hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeVariation", "ENCODE Variation", 8.6)' | hgsql hg17
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeAnalysis", "ENCODE Analysis", 8.9)' | hgsql hg17
# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS
# (DONE - 2004-05-21 - Hiram)
# RE-DONE with new NIBS - 2004-06-03
# Make nib/, unmasked until RepeatMasker and TRF steps are done.
# Do this now so that the chromInfo table will exist and thus the
# trackDb tables can be built in the next step.
# These unmasked nibs will be replaced by the masked nibs after
# repeat mask and trf are done.
ssh eieio
cd /cluster/data/hg17
# Make chr*.fa from contig .fa
# Copied chrFa.sh from hg16/jkStuff, renamed it to chrFa.csh
time ./jkStuff/chrFa.csh
# real 13m24.710s
# user 9m0.360s
# sys 1m15.820s
mkdir nib
foreach c (`cat chrom.lst`)
foreach f ($c/chr${c}{,_random}.fa)
if (-e $f) then
echo "nibbing $f"
/cluster/bin/i386/faToNib $f nib/$f:t:r.nib
endif
end
end
# Make symbolic links from /gbdb/hg17/nib to the real nibs.
ssh hgwdev
mkdir -p /gbdb/hg17/nib
ln -s /cluster/data/hg17/nib/chr*.nib /gbdb/hg17/nib
# Load /gbdb/hg17/nib paths into database and save size info.
cd /cluster/data/hg17
hgsql hg17 < $HOME/kent/src/hg/lib/chromInfo.sql
hgNibSeq -preMadeNib hg17 /gbdb/hg17/nib */chr*.fa
hgsql -N -e "select chrom,size from chromInfo order by chrom" hg17 \
> chrom.sizes
# You can compare this chrom.sizes with the previously created
# chrom_sizes. Should be no difference
sort chrom_sizes > s0
sort chrom.sizes | grep -v random > s1
diff s0 s1
rm s0 s1
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE - 2004-05-21 - Hiram)
# dbDb orderKey updated 2004-06-08 - Hiram
ssh hgwdev
# reset dbDb orderKey - these have never been ordered properly
# before; this will get them into a sensible order.
hgsql -e 'update dbDb set orderKey=11 where name = "hg16";' \
-h genome-testdb hgcentraltest
hgsql -e 'update dbDb set orderKey=12 where name = "hg15";' \
-h genome-testdb hgcentraltest
hgsql -e 'update dbDb set orderKey=13 where name = "hg13";' \
-h genome-testdb hgcentraltest
# Enter hg17 into hgcentraltest.dbDb so test browser knows about it:
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
VALUES("hg17", "May 2004", "/gbdb/hg17/nib", "Human", \
"chr4:56214201-56291736", 1, 10, "Human", "Homo sapiens", \
"/gbdb/hg17/html/description.html", 0, 0, "NCBI Build 35");' \
-h genome-testdb hgcentraltest
# Make trackDb table so browser knows what tracks to expect:
cd ~/kent/src/hg/makeDb/trackDb
cvs up -d -P .
# Edit the makefile to add hg17 in all the right places and do
make update
make alpha
cvs commit makefile
# MAKE LIFTALL.LFT, NCBI.LFT (DONE - 2004-05-21 - Hiram)
# Re-DONE with new randoms - 2004-06-03 - Hiram)
cd /cluster/data/hg17
mkdir -p jkStuff
cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft
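# (Each liftAll.lft line has five columns: offset of the contig in
# the chrom, contig name, contig size, chrom name, chrom size; this
# is what liftUp uses to shift contig coords to chrom coords.)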
# Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly.
# Note: this ncbi.lift will not lift floating contigs to chr_random coords,
# but it will show the strand orientation of the floating contigs
# (grep for '|').
# mdToNcbiLift seq_contig.md jkStuff/ncbi.lft
# XXXX - appears to be unused, not done - Hiram
# REPEAT MASKING (DONE - 2004-05-24 - Hiram)
# The randoms were rearranged after this was first done,
# they are re-made below 2004-06-02)
# Record the RM version here:
# RepBase Update 8.12, RM database version 20040130
# as this changes over time and there is no record in the results
# Split contigs, run RepeatMasker, lift results
# This split takes about 8 minutes
ssh eieio
cd /cluster/data/hg17
foreach chrom ( `cat chrom.lst` )
foreach c ( $chrom/N{C,G,T}_?????? )
set contig = $c:t
echo "splitting ${chrom}/${contig}/${contig}.fa"
faSplit size ${chrom}/${contig}/$contig.fa 500000 \
${chrom}/${contig}/${contig}_ \
-lift=${chrom}/${contig}/$contig.lft -maxN=500000
end
end
#- Make the run directory and job list:
cd /cluster/data/hg17
mkdir -p jkStuff
# According to RepeatMasker help file, no arguments are required to
# specify species because its default is set for primate (human)
# This run script saves the .tbl file to be sent to Arian.  He uses
# those for his analysis.  Sometimes he needs the .cat and .align
# files for checking problems.  Krish needs the .align files; they
# are large.
cat << '_EOF_' > jkStuff/RMHuman
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/hg17/$2
/bin/cp $2 /tmp/hg17/$2/
cd /tmp/hg17/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/hg17/$2/$2.out ./
if (-e /tmp/hg17/$2/$2.align) /bin/cp /tmp/hg17/$2/$2.align ./
if (-e /tmp/hg17/$2/$2.tbl) /bin/cp /tmp/hg17/$2/$2.tbl ./
# if (-e /tmp/hg17/$2/$2.cat) /bin/cp /tmp/hg17/$2/$2.cat ./
/bin/rm -fr /tmp/hg17/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg17/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg17
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMHuman
ssh eieio
cd /cluster/data/hg17
mkdir RMRun
rm -f RMRun/RMJobs
touch RMRun/RMJobs
foreach d ( `cat chrom.lst` )
foreach c ( ${d}/N{C,G,T}_*/N{C,G,T}_*_*.fa )
set f = $c:t
set cc = $c:h
set contig = $cc:t
echo /cluster/store5/gs.18/build35/jkStuff/RMHuman \
/cluster/store5/gs.18/build35/${d}/${contig} $f \
'{'check out line+ /cluster/store5/gs.18/build35/${d}/${contig}/$f.out'}' \
>> RMRun/RMJobs
end
end
# We have 5970 jobs in RMJobs:
wc RMRun/RMJobs
# 5970 41790 1105804 RMRun/RMJobs
#- Do the run
ssh kk
cd /cluster/data/hg17/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
#- While that is running, you can run TRF (simpleRepeat) on the small
# cluster. See SIMPLE REPEAT section below
# Completed: 5970 of 5970 jobs
# CPU time in finished jobs: 45189516s 753158.60m 12552.64h 523.03d 1.433 y
# IO & Wait Time: 141333s 2355.55m 39.26h 1.64d 0.004 y
# Average job time: 7593s 126.55m 2.11h 0.09d
# Longest job: 10268s 171.13m 2.85h 0.12d
# Submission to last job: 81484s 1358.07m 22.63h 0.94d
# Lift up the split-contig .out's to contig-level .out's
#
# If a mistake is made in the following it would be possible to
# destroy all the RM output. So, just to be paranoid, save all
# the RM output in bluearc for the time being:
ssh eieio
cd /cluster/data/hg17
mkdir /cluster/bluearc/hg17/RMOutput
foreach c ( `cat chrom.lst` )
foreach d ( ${c}/N{C,G,T}_* )
set T = /cluster/bluearc/hg17/RMOutput/${d}
mkdir -p ${T}
cd ${d}
set contig = $d:t
cp -p ${contig}_?{,?,??}.fa.out ${T}
cd ../..
echo "${d} done"
end
end
# Make sure we got them all:
# (this count doesn't work later, since there are more *.fa.out
# files after the lifting; to find just these at that point, use:
# find . -name "N?_*_*.fa.out" -print | wc -l )
find . -name "*.fa.out" -print | wc -l
# 5970
find /cluster/bluearc/hg17/RMOutput -type f | wc -l
# 5970
# same count
# OK, now you can try this operation, do it in a script like this
# and save the output of the script for a record of what happened.
cat << '_EOF_' > jkStuff/liftRM.csh
#!/bin/csh -fe
foreach c ( `cat chrom.lst` )
foreach d ( ${c}/N{C,G,T}_* )
cd $d
set contig = $d:t
liftUp $contig.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out
cd ../..
end
end
'_EOF_'
chmod +x jkStuff/liftRM.csh
mkdir scriptsOutput
time jkStuff/liftRM.csh > scriptsOutput/liftRM.1 2>&1
# real 4m37.572s
# user 1m19.130s
# sys 0m32.950s
# Check that they all were done:
grep "fa.out" scriptsOutput/liftRM.1 | wc -l
# 5959
# (note: 5959 here vs. the 5970 .fa.out files counted above)
#- Lift up RepeatMasker .out files to chromosome coordinates.
# Picked up jkStuff/liftOut2.sh from the hg16 build, renamed it to
# liftOut2.csh, and changed the line that does the chrom listing.
time ./jkStuff/liftOut2.csh > scriptsOutput/liftOut2 2>&1
# real 9m46.780s
# user 1m18.900s
# sys 7m33.990s
#- By this point, the database should have been created (above):
ssh hgwdev
cd /cluster/data/hg17
time hgLoadOut hg17 ?/*.fa.out ??/*.fa.out 6_hla_hap?/*.fa.out > \
scriptsOutput/hgLoadOut 2>&1
# real 5m59.137s
# user 1m47.550s
# sys 0m15.410s
# errors during this load: (there are always a couple of these)
# Strange perc. field -6.1 line 243543 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243543 of 2/chr2.fa.out
# Strange perc. field -6.1 line 243545 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243545 of 2/chr2.fa.out
# Strange perc. field -0.2 line 30322 of 17/chr17.fa.out
# Strange perc. field -0.2 line 30324 of 17/chr17.fa.out
# Strange perc. field -0.2 line 30326 of 17/chr17.fa.out
# Strange perc. field -0.2 line 30328 of 17/chr17.fa.out
# Strange perc. field -18.6 line 77034 of 19/chr19.fa.out
# Verify we have similar results to previous assembly:
# featureBits hg17 rmsk
# 1391378842 bases of 2867328468 (48.525%) in intersection
# featureBits hg16 rmsk
# 1388770568 bases of 2865248791 (48.469%) in intersection
# Now proceed to MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF
# following the SIMPLE REPEAT sections below
# Re-Running REPEAT_MASKER on the new Randoms (DONE - 2004-06-02 - Hiram)
ssh eieio
cd /cluster/data/hg17
grep "|" seq_contig.md | awk '{print $2}' | sed -e "s#|#/#" > randoms.lst
mkdir /cluster/data/hg17/RMRandoms
foreach r ( `cat randoms.lst` )
set d = $r:h
set contig = $r:t
foreach c ( ${r}/N{C,G,T}_*_*.fa )
set f = $c:t
echo /cluster/store5/gs.18/build35/jkStuff/RMHuman \
/cluster/store5/gs.18/build35/${d}/${contig} $f \
'{'check out line+ /cluster/store5/gs.18/build35/${d}/${contig}/$f.out'}' \
>> RMRandoms/RMJobs
end
end
ssh kk
cd /cluster/data/hg17/RMRandoms
para create RMJobs
para try, para check, para check, para push, para check,...
# Completed: 94 of 94 jobs
# CPU time in finished jobs: 221454s 3690.91m 61.52h 2.56d 0.007 y
# IO & Wait Time: 866s 14.43m 0.24h 0.01d 0.000 y
# Average job time: 2365s 39.42m 0.66h 0.03d
# Longest job: 9062s 151.03m 2.52h 0.10d
# Submission to last job: 9106s 151.77m 2.53h 0.11d
# Continuing with the paranoia theme, let's backup all the RM output
#
ssh eieio
cd /cluster/data/hg17
mkdir /cluster/bluearc/hg17/RMRandoms
foreach c ( `cat chrom.lst` )
foreach d ( ${c}/N{C,G,T}_* )
set T = /cluster/bluearc/hg17/RMRandoms/${d}
mkdir -p ${T}
cd ${d}
set contig = $d:t
cp -p ${contig}_?{,?,??}.fa.out ${T}
cd ../..
echo "${d} done"
end
end
# Make sure we got them all:
find . -name "N?_*_*.fa.out" -print | wc -l
# 5959
find /cluster/bluearc/hg17/RMRandoms -type f | wc -l
# 5959
# same count
time jkStuff/liftRM.csh > scriptsOutput/liftRM2.1 2>&1
# real 4m46.302s
# user 1m18.260s
# sys 0m18.000s
# Check that they all were done:
grep "fa.out" scriptsOutput/liftRM2.1 | wc -l
# 5959
# same count as above
#- Lift up RepeatMasker .out files to chromosome coordinates.
# Picked up jkStuff/liftOut2.sh from the hg16 build, renamed it to
# liftOut2.csh, and changed the line that does the chrom listing.
time ./jkStuff/liftOut2.csh > scriptsOutput/liftOut2.1 2>&1
# real 2m46.347s
# user 1m18.650s
# sys 0m15.990s
#- By this point, the database should have been created (above):
ssh hgwdev
cd /cluster/data/hg17
time hgLoadOut hg17 ?/*.fa.out ??/*.fa.out 6_hla_hap?/*.fa.out > \
scriptsOutput/hgLoadOut 2>&1
# real 5m59.137s
# user 1m47.550s
# sys 0m15.410s
# errors during this load: (there are always a couple of these)
# Strange perc. field -6.1 line 243543 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243543 of 2/chr2.fa.out
# Strange perc. field -6.1 line 243545 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243545 of 2/chr2.fa.out
# Strange perc. field -0.2 line 30322 of 17/chr17.fa.out
# Strange perc. field -0.2 line 30324 of 17/chr17.fa.out
# Strange perc. field -0.2 line 30326 of 17/chr17.fa.out
# Strange perc. field -0.2 line 30328 of 17/chr17.fa.out
# Strange perc. field -18.6 line 77034 of 19/chr19.fa.out
# Verify we have similar results to previous assembly:
# featureBits hg17 rmsk
# 1390952984 bases of 2866216770 (48.529%) in intersection
# featureBits hg17 rmsk #with previous randoms:
# 1391378842 bases of 2867328468 (48.525%) in intersection
# featureBits hg16 rmsk
# 1388770568 bases of 2865248791 (48.469%) in intersection
# Now proceed to MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF
# following the SIMPLE REPEAT sections below
# SIMPLE REPEAT [TRF] TRACK (DONE - 2004-05-21 - Hiram)
# Re-done with new randoms, 2004-06-02 - Hiram
# Copy the contigs, first to the bluearc, then to /iscratch/i
ssh eieio
mkdir /cluster/bluearc/hg17
mkdir /cluster/bluearc/hg17/contigs
cd /cluster/data/hg17
foreach ctg ( `cat contig.lst` )
set c = $ctg:t
echo "$ctg > /cluster/bluearc/hg17/contigs/$c"
cp -p $ctg /cluster/bluearc/hg17/contigs/$c
end
# Check how much is there:
# du -hsc /cluster/bluearc/hg17/contigs
# 2.8G /cluster/bluearc/hg17/contigs
# Distribute contigs to /iscratch/i
ssh kkr1u00
mkdir /iscratch/i/gs.18/build35/unmaskedContigs
cd /iscratch/i/gs.18/build35/unmaskedContigs
cp -p /cluster/bluearc/hg17/contigs/* .
# Verify same amount made it there:
# du -hsc /iscratch/i/gs.18/build35/unmaskedContigs
# 2.8G /iscratch/i/gs.18/build35/unmaskedContigs
# Then send them to the other 7 Iservers
/cluster/bin/iSync
# Go to the small cluster for this business:
ssh kki
mkdir -p /cluster/data/hg17/bed/simpleRepeat
cd /cluster/data/hg17/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << this line makes emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /iscratch/i/gs.18/build35/unmaskedContigs/*.fa > genome.lst
gensub2 genome.lst single gsub jobList
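# (gensub2 crosses genome.lst against the one-line "single"
# placeholder file and expands the gsub template once per contig;
# $(path1) is the full path and $(root1) is the file name minus
# directory and extension.)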
para create jobList
para try
para check
para push
para check
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 13230s 220.49m 3.67h 0.15d 0.000 y
# IO & Wait Time: 2078s 34.64m 0.58h 0.02d 0.000 y
# Average job time: 40s 0.67m 0.01h 0.00d
# Longest job: 1590s 26.50m 0.44h 0.02d
# Submission to last job: 2504s 41.73m 0.70h 0.03d
liftUp simpleRepeat.bed /cluster/data/hg17/jkStuff/liftAll.lft \
warn trf/*.bed > lu.out 2>&1
# Load into the database:
ssh hgwdev
cd /cluster/data/hg17/bed/simpleRepeat
/cluster/bin/i386/hgLoadBed hg17 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 629076 elements of size 16
# Compare with previous assembly
featureBits hg17 simpleRepeat
# 54952425 bases of 2866216770 (1.917%) in intersection
# with previous randoms
featureBits hg17 simpleRepeat
# 54964044 bases of 3096628158 (1.775%) in intersection
featureBits hg16 simpleRepeat
# 54320136 bases of 2865248791 (1.896%) in intersection
# GAPS weren't in hg17 yet at this point; after gaps were added:
# featureBits hg17 simpleRepeat
# 54964044 bases of 2867328468 (1.917%) in intersection
# featureBits -countGaps hg17 simpleRepeat
# 54964044 bases of 3096628158 (1.775%) in intersection
###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir microsat
cd microsat
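# Filter simpleRepeat.bed down to perfect microsatellites.  Field
# guide for the awk below (simpleRepeat.bed, 16 columns, no bin):
# $5=period, $6=copyNum, $8=perMatch, $9=perIndel, $16=repeat motif.
# Keep di- and tri-nucleotide repeats with at least 15 copies, 100%
# match and no indels, and name them like "15xCA".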
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
/cluster/bin/i386/hgLoadBed hg17 microsat microsat.bed
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2004-05-21 - Hiram)
# re-done with new randoms - 2004-06-03 - Hiram
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh eieio
cd /cluster/data/hg17/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# EXPERIMENT, at a filter of <= 12, we have coverage:
# 20904399 bases of 2867328468 (0.729%) in intersection
# at a filter of <= 9, we have coverage:
# 19271270 bases of 2867328468 (0.672%) in intersection
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/hg17
mkdir bed/simpleRepeat/trfMaskChrom
foreach c ( `cat chrom.lst` )
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE - 2004-05-25)
# -Hiram
# re-done with new randoms - 2004-06-03 - Hiram
# This used to be done right after RepeatMasking. Now, we mask with
# TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above,
# and after Repeat Masker is complete.
ssh eieio
cd /cluster/data/hg17
# copied these scripts from hg16 - reset the lines that make
# the chrom list to work on, reset the wild cards that find all the
# contig .fa's
# Make chr*.fa from contig .fa
# Copied chrFa.sh from hg16/jkStuff, renamed it to chrFa.csh
time ./jkStuff/chrFa.csh > scriptsOutput/chrFa.out 2>&1 &
# real 13m18.512s
# user 9m1.670s
# sys 1m7.290s
#- Soft-mask (lower-case) the contig and chr .fa's
time ./jkStuff/makeFaMasked.csh > scriptsOutput/maFaMasked.out 2>&1
# real 29m31.623s
# user 13m49.700s
# sys 5m58.750s
#- Make hard-masked .fa.masked files as well:
time ./jkStuff/makeHardMasked.csh > scriptsOutput/maHardMasked.out 2>&1
#- Create the bothMasksNib/ directory
time ./jkStuff/makeNib.csh > scriptsOutput/maNib.out 2>&1
# real 14m41.694s
# user 6m28.000s
# sys 1m42.500s
# Make symbolic links from /gbdb/hg17/nib to the real nibs.
ssh hgwdev
cd /cluster/data/hg17
mv nib nib.raw
mv bothMasksNib nib
rm /gbdb/hg17/nib/*.nib
ln -s `pwd`/nib/* /gbdb/hg17/nib
# Load /gbdb/hg17/nib paths into database and save size info.
hgsql hg17 < ~/kent/src/hg/lib/chromInfo.sql
cd /cluster/data/hg17
hgNibSeq -preMadeNib hg17 /gbdb/hg17/nib */chr*.fa
# 3096628158 total bases
# Should be the same size as before
hgsql -N -e "select chrom,size from chromInfo order by chrom" hg17 \
> chrom.sizes.masked
diff chrom.sizes chrom.sizes.masked
# should be no output at all, thus:
rm chrom.sizes.masked
# Copy the masked contig fa to /scratch and /iscratch
# And everything else we will need for blastz runs, etc ...
# Best to do this sequence first to /cluster/bluearc/scratch,
# which is going to be the source for the /scratch copy.
# And then from there to the /iscratch
# Make sure you are on the fileserver for the original source:
ssh eieio
mkdir -p /cluster/bluearc/scratch/hg/gs.18/build35
cd /cluster/bluearc/scratch/hg/gs.18/build35
# these copies take less than 2 minutes each
mkdir bothMaskedNibs
cp -p /cluster/data/hg17/nib/*.nib ./bothMaskedNibs
mkdir maskedContigs
foreach chrom ( `cat /cluster/data/hg17/chrom.lst` )
cp -p /cluster/data/hg17/${chrom}/N{C,G,T}_*/N{C,G,T}_??????.fa \
./maskedContigs
echo "done ${chrom}"
end
# make sure you have them all:
ls maskedContigs | wc -l
# 380
wc -l /cluster/data/hg17/contig.lst
# 380
mkdir rmsk
foreach chrom ( `cat /cluster/data/hg17/chrom.lst` )
cp -p /cluster/data/hg17/${chrom}/*.out ./rmsk
echo "done ${chrom}"
end
# Now, go to the destination for /iscratch and copy from the
# bluearc
ssh kkr1u00
mkdir -p /iscratch/i/gs.18/build35
cd /iscratch/i/gs.18/build35
# This takes about 5 minutes
rsync -arlv /cluster/bluearc/scratch/hg/gs.18/build35/ .
time /cluster/bin/iSync
# real 7m27.649s
# request rsync of /cluster/bluearc/scratch to the KiloKluster /scratch
# LOAD ctgPos table - Contig position track (DONE - 2004-06-08 - Hiram)
# After fixing up hgCtgPos to accept the -chromLst argument, simply:
cd /cluster/data/hg17
hgCtgPos -chromLst=chrom.lst hg17 .
# GOLD AND GAP TRACKS (DONE - 2004-05-21 - Hiram)
# RE-DONE with new randoms - 2004-06-03 - Hiram
ssh hgwdev
cd /cluster/data/hg17
hgGoldGapGl -noGl -chromLst=chrom.lst hg17 /cluster/data/hg17 .
# Disappointing to see this create so many tables ...
# _gap and _gold for each chrom
# Create the contig.gl files - XXX - NCBI doesn't deliver
# contig_overlaps.agp - 2004-06-18 - this is beginning to come
# together and there is now a contig_overlaps.agp file
cd /cluster/store5/gs.18/build35
# combine the various psLayout attempts on different sections of
# clones
./combineContigOverlaps.sh
# Turn contig_overlaps.agp into gl files
~hiram/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md
# Create chromosome gl files (had to fix liftUp to do the NC_ properly)
jkStuff/liftGl.csh contig.gl
#
# Need to remove these PAR clone names from chrY.gl
rm -f /tmp/chrY.gl
sed -e "/^AL954722.18/d; /^BX537334.4/d; /^BX000483.7/d; \
/^BX908402.3/d; /^BX649635.3/d; /^BX119919.5/d; \
/^AC079176.15/d; /^AC097314.27/d; /^AC006209.25/d; \
/^AJ271735.1/d; /^AJ271736.1/d" Y/chrY.gl > /tmp/chrY.gl
rm -f Y/chrY.gl
mv /tmp/chrY.gl Y/chrY.gl
# After contig.gl files have been made from contig_overlaps.agp
# The sed fixes the Celera clones that are marked phase W
# Call that phase 3 instead,
# Delete the Celera AACC clones, they are not in this assembly,
# And fix the line of AC018743 to add it to the assembly, it was a
# last minute addition by Terry that didn't get carried into the
# NCBI sequence.inf file. And remove the older versions of five
# clones that got left in by mistake at NCBI
# AC004491.1=AC004491.2 AC004921.1=AC004921.2 AC004983.2=AC004983.3
# AC005088.2=AC005088.3 AC006014.2=AC006014.3 AC099654.4=AC099654.5
# And finally the grep selects only those things for_assembly
cd /cluster/data/hg17
egrep "for_assembly|AC018743" /cluster/store5/gs.18/ncbi/sequence.inf | \
sed -e "s/\tW\t/\t3\t/; /^AACC010000.*/d; /^AC004491.1.*/d; \
/^AC004921.1.*/d; /^AC004983.2.*/d; /^AC005088.2.*/d; \
/^AC006014.2.*/d; /^AC099654.4.*/d; \
s/AC018743.27\t31791062\t466818\t1\tD\tUn\t-\tBCM\tRP11-289M22\tSIZE:2big/AC018743.27\t31791062\t466818\t1\t-\t(12)\t-\tBCM\tRP11-289M22\tfor_assembly/" \
> sequence.inf
cd /cluster/data/hg17
hgGoldGapGl -chromLst=chrom.lst hg17 /cluster/store5/gs.18 build35
$HOME/bin/i386/hgClonePos -chromLst=chrom.lst hg17 \
/cluster/data/hg17 ./sequence.inf /cluster/store5/gs.18 -maxErr=3 \
-maxWarn=2000 2> clone.pos.errors
# Extract all the PAR clones for chrX from clonePos, change the X
# to Y, fixup the coordinates on the last three, and load this
# data in on the clonePos table in addition to what is there
# already.
cat << '_EOF_' > chrY.par.clonePos
BX640545.2 34821 3 chrY 0 34250 F
AL954722.18 37771 3 chrY 84821 122592 F
BX537334.4 36872 3 chrY 120592 157464 F
BX000483.7 15918 3 chrY 155466 171384 F
AL954664.17 39200 3 chrY 251384 290307 F
BX000476.5 33340 3 chrY 282188 315528 F
AL732314.18 218723 3 chrY 313528 532251 F
BX004827.18 119555 3 chrY 479050 600112 F
AL683871.15 175765 3 chrY 598112 773877 F
AL672311.26 115998 3 chrY 771877 887875 F
AL672277.20 131682 3 chrY 885875 1017557 F
BX908402.3 36556 3 chrY 1067557 1104113 F
BX649635.3 43709 3 chrY 1154113 1197822 F
BX649553.5 90286 3 chrY 1347822 1438108 F
BX296563.3 21008 3 chrY 1488108 1509117 F
BX119906.16 35666 3 chrY 1507116 1542782 F
AL683870.15 162377 3 chrY 1541782 1704175 F
AL691415.17 45085 3 chrY 1702175 1747265 F
AL683807.22 189825 3 chrY 1745260 1935086 F
AL672040.10 117297 3 chrY 1933086 2050383 F
BX004859.8 63432 3 chrY 2048380 2111815 F
BX119919.5 55442 3 chrY 2261815 2317257 F
AC079176.15 186278 3 chrY 2311674 2497952 F
AC097314.27 80501 3 chrY 2495948 2576449 F
AC006209.25 141759 3 chrY 2551122 2692881 F
AJ271735.1 240000 3 chrY 57302979 57543030 F
AJ271736.1 158661 3 chrY 57543030 57701691 F
'_EOF_'
hgsql -e 'load data local infile "chrY.par.clonePos" into table clonePos;' hg17
# We have the following errors
# Processing /cluster/data/hg17/Y/chrY.gl
# Clone BX640545 is on chromosomes chrX and chrY. Ignoring chrY
# Clone AL954722 is on chromosomes chrX and chrY. Ignoring chrY
# ... etc for all the PAR clones
# ... And there are an unknown number of these:
# AB000359 is in ./sequence.inf but not in ooDir/*/*.gl
# AB000360 is in ./sequence.inf but not in ooDir/*/*.gl
# gc5Base wiggle TRACK (DONE - 2004-05-22 - Hiram)
# This previously was a script that ran through each nib
# Recently transformed into a mini cluster run.
# Re-DONE with the new randoms - 2004-06-04
ssh kki
mkdir /cluster/data/hg17/bed/gc5Base
cd /cluster/data/hg17/bed/gc5Base
mkdir wigData5 dataLimits5 wigData5_1K dataLimits5_1K
cat << '_EOF_' > kkRun.sh
#!/bin/sh
NIB=$1
chr=${NIB/.nib/}
chrom=${chr#chr}
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 hg17 \
/iscratch/i/gs.18/build35/bothMaskedNibs | \
grep -w GC | \
awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
wigAsciiToBinary -dataSpan=5 -chrom=${chr} \
-wibFile=wigData5/gc5Base_${chrom} \
-name=${chrom} stdin 2> dataLimits5/${chr}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x kkRun.sh
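# (hgGcPercent reports GC as parts per thousand in column 5; the awk
# in kkRun.sh divides by 10 to get a percentage and shifts $2 to a
# 1-based start position for wigAsciiToBinary.)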
ls /iscratch/i/gs.18/build35/bothMaskedNibs > nibList
cat << '_EOF_' > gsub
#LOOP
./kkRun.sh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 nibList single gsub jobList
para create jobList
para try, check, ... etc
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 5251s 87.51m 1.46h 0.06d 0.000 y
# IO & Wait Time: 130s 2.17m 0.04h 0.00d 0.000 y
# Average job time: 117s 1.95m 0.03h 0.00d
# Longest job: 413s 6.88m 0.11h 0.00d
# Submission to last job: 475s 7.92m 0.13h 0.01d
# load the .wig files back on hgwdev:
ssh hgwdev
cd /cluster/data/hg17/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/gc5Base hg17 gc5Base wigData5/*.wig
# and symlink the .wib files into /gbdb
mkdir /gbdb/hg17/wib/gc5Base
ln -s `pwd`/wigData5/*.wib /gbdb/hg17/wib/gc5Base
# And then the zoomed data view
ssh kki
cd /cluster/data/hg17/bed/gc5Base
mkdir -p wigData5_1K dataLimits5_1K
cat << '_EOF_' > kkRunZoom.sh
#!/bin/sh
NIB=$1
chr=${NIB/.nib/}
chrom=${chr#chr}
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 hg17 \
/iscratch/i/gs.18/build35/bothMaskedNibs | \
grep -w GC | \
awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \
-chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \
-name=${chrom} stdin 2> dataLimits5_1K/${chr}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x kkRunZoom.sh
cat << '_EOF_' > gsubZoom
#LOOP
./kkRunZoom.sh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 nibList single gsubZoom jobListZoom
para create jobListZoom
para try ... check ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 5216s 86.93m 1.45h 0.06d 0.000 y
# IO & Wait Time: 34s 0.57m 0.01h 0.00d 0.000 y
# Average job time: 114s 1.90m 0.03h 0.00d
# Longest job: 415s 6.92m 0.12h 0.00d
# Submission to last job: 469s 7.82m 0.13h 0.01d
# Then load these .wig files into the same database as above
ssh hgwdev
hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/gc5Base \
-oldTable hg17 gc5Base wigData5_1K/*.wig
# and symlink these .wib files into /gbdb
mkdir -p /gbdb/hg17/wib/gc5Base
ln -s `pwd`/wigData5_1K/*.wib /gbdb/hg17/wib/gc5Base
# AUTO UPDATE GENBANK MRNA RUN (DONE - 2004-06-08 - Hiram)
ssh eieio
cd /cluster/data/genbank
# This is a new organism, edit the etc/genbank.conf file and add:
# hg17
hg17.genome = /scratch/hg/gs.18/build35/bothMaskedNibs/chr*.nib
hg17.lift = /cluster/store5/gs.18/build35/jkStuff/liftAll.lft
hg17.genbank.est.xeno.load = yes
hg17.mgcTables.default = full
hg17.mgcTables.mgc = all
hg17.downloadDir = hg17
# Do the refseqs first; they are the quick ones
ssh eieio
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial hg17
# logFile: var/build/logs/2004.05.25-13:41:07.hg17.initalign.log
# checking that log, or watching the batch on kk, you can find
# where the batch is running; after it is done, get the time:
cd /cluster/store6/genbank/work/initial.hg17/align
para time > time
cat time
# Completed: 9500 of 9500 jobs
# CPU time in finished jobs: 62241s 1037.35m 17.29h 0.72d 0.002 y
# IO & Wait Time: 33719s 561.98m 9.37h 0.39d 0.001 y
# Average job time: 10s 0.17m 0.00h 0.00d
# Longest job: 1062s 17.70m 0.29h 0.01d
# Submission to last job: 1063s 17.72m 0.30h 0.01d
# Load the results from the above
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg17
# To get the genbank run started, the above results need to be
# moved out of the way.  These can be removed if there are no
# problems to debug.
ssh eieio
cd /cluster/data/genbank/work
mv initial.hg17 initial.hg17.refseq.mrna
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial hg17
# logFile: var/build/logs/2004.06.04-10:47:21.hg17.initalign.log
# One job was hung up, after killing it on its node, the batch
# finished in a few minutes.
# Completed: 35720 of 35720 jobs
# CPU time in finished jobs: 5161424s 86023.74m 1433.73h 59.74d 0.164 y
# IO & Wait Time: 144149s 2402.48m 40.04h 1.67d 0.005 y
# Average job time: 149s 2.48m 0.04h 0.00d
# Longest job: 18306s 305.10m 5.08h 0.21d
# Submission to last job: 35061s 584.35m 9.74h 0.41d
ssh hgwdev
cd /cluster/data/genbank
# some kind of error happened here, had to remove a lock file to
# get this to proceed (this same thing happened again the second
# time around)
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg17
ssh eieio
cd /cluster/data/genbank/work
mv initial.hg17 initial.hg17.genbank.mrna
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial hg17
# Completed: 189240 of 189240 jobs
# CPU time in finished jobs: 97172120s 1619535.33m 26992.26h 1124.68d 3.081 y
# IO & Wait Time: 1507789s 25129.82m 418.83h 17.45d 0.048 y
# Average job time: 521s 8.69m 0.14h 0.01d
# Longest job: 33165s 552.75m 9.21h 0.38d
# Submission to last job: 126988s 2116.47m 35.27h 1.47d
ssh hgwdev
cd /cluster/data/genbank
time nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg17
# real 440m42.750s
# user 69m7.810s
# sys 23m18.640s
# This is ~7.5 hours
# If the above is all OK, ask Mark to put this assembly on
# the daily updates.
# CPGISLANDS (DONE - 2004-05-25 - Hiram)
# Re-DONE with new randoms - 2004-06-04 - Hiram
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/cpgIsland
cd /cluster/data/hg17/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
# gcc readseq.c cpg_lh.c -o cpglh.exe
mv cpglh.exe /cluster/data/hg17/bed/cpgIsland/
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
ssh eieio
cd /cluster/data/hg17/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
set fout=$f:t:r:r.cpg
echo running cpglh on $f to $fout
./cpglh.exe $f > $fout
end
# the warnings:
# Bad char 0x52 = 'R' at line 2046, base 102229, sequence chr16_random
# Bad char 0x4d = 'M' at line 1216113, base 60805573, sequence chr3
# Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3
# Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3
# real 21m47.823s
# user 18m30.810s
# sys 1m13.420s
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
ssh hgwdev
cd /cluster/data/hg17/bed/cpgIsland
hgLoadBed hg17 cpgIslandExt -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 27801 elements of size 10
# Sorted
# Saving bed.tab
# Loading hg17
# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE - 2004-05-25 - Heather)
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans) \
VALUES("hg17", "blat12", "17778", "1"); \
INSERT INTO blatServers (db, host, port, isTrans) \
VALUES("hg17", "blat12", "17779", "0");' \
-h genome-testdb hgcentraltest
# PREPARE CLUSTER FOR BLASTZ RUNS (DONE - 2004-05-26 - Hiram)
# Re-DONE with new randoms - 2004-06-03 - Hiram
ssh eieio
mkdir /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
cd /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
ln -s ../rmsk/*.out .
# This takes 40 minutes when run as a script; to hurry it up it has
# been converted to a mini cluster run
cat << '_EOF_' > runArian.sh
#!/bin/sh
for FN in *.out
do
echo /cluster/bluearc/RepeatMasker030619/DateRepsinRMoutput.pl \
${FN} -query human -comp rat -comp mouse
done
'_EOF_'
chmod +x runArian.sh
ssh kki
cd /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
./runArian.sh > jobList
para create jobList
para try, ... check ... push ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 668s 11.14m 0.19h 0.01d 0.000 y
# IO & Wait Time: 514s 8.56m 0.14h 0.01d 0.000 y
# Average job time: 26s 0.43m 0.01h 0.00d
# Longest job: 86s 1.43m 0.02h 0.00d
# Submission to last job: 108s 1.80m 0.03h 0.00d
# Now extract each one, 1 = Rat, 2 = Mouse
ssh eieio
cd /cluster/bluearc/scratch/hg/gs.18/build35
mkdir linSpecRep.notInRat linSpecRep.notInMouse
foreach f (rmsk.spec/*.out_rat_mus)
set base = $f:t:r:r
echo "$f -> $base.out.spec"
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInRat/$base.out.spec
/cluster/bin/scripts/extractLinSpecReps 2 $f > \
linSpecRep.notInMouse/$base.out.spec
end
# There is actually no difference at all between these two results.
# copy to iscratch
ssh kkr1u00
cd /iscratch/i/gs.18/build35
rsync -arlv /cluster/bluearc/scratch/hg/gs.18/build35/ .
/cluster/bin/iSync
# request rsync of /cluster/bluearc/scratch to the KiloKluster /scratch
# COPY DATA TO GOLDEN PATH LOCATIONS (DONE - 2004-06-04 - Hiram)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/chromosomes
cd /cluster/data/hg17
# Beware, this backgrounding of the gzips can be hard on hgwdev.
# You could wait until after the copy then run one gzip to do them all
foreach chrom ( `cat chrom.lst` )
cp -p ${chrom}/*.fa /usr/local/apache/htdocs/goldenPath/hg17/chromosomes
gzip \
/usr/local/apache/htdocs/goldenPath/hg17/chromosomes/chr${chrom}*.fa &
echo "done ${chrom}"
end
cd /usr/local/apache/htdocs/goldenPath/hg17/chromosomes
gzip *.fa
# FOSMID END PAIRS TRACK (2004-06-09 kate)
# Corrected upper size limit to 50Kbp, reran pslPairs,
# and reloaded (2004-07-15 kate)
# Use latest fosmid ends data prepared by Terry Furey.
# He says there is no on-going work on fosmid ends, so this
# should suffice indefinitely ? Move/link this stuff into
# central data area.
ssh eieio
cd /cluster/data/ncbi
mkdir -p fosends/human
ln -s /cluster/store1/fosends.3 fosends/human
cd fosends/human/fosends.3
faSize fosEnds.fa
# 579735181 bases (369769 N's 579365412 real) in 1087670 sequences
# 580M bases in 1M sequences
# create link in /gbdb/ncbi/fosends/human ?
# use pre-split fosend files, and associated list for cluster run
# Sequences are in /cluster/bluearc/hg/fosEnds
cp /cluster/bluearc/booch/fosends/fosEnds.lst /cluster/bluearc/hg/fosEnds
# run on rack9 since kilokluster is busy
ssh kk9
cd /cluster/data/hg17
mkdir -p bed/fosends
cd bed/fosends
mkdir -p run
cd run
ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa \
> contigs.lst
cp /cluster/bluearc/hg/fosEnds/fosEnds.lst fosEnds.lst
# 380 contigs vs 97 fosEnd files -> 40K jobs
# send output to kksilo, as it can better handle the NFS load
mkdir -p /cluster/store7/kate/hg17/fosends/out
ln -s /cluster/store7/kate/hg17/fosends/out ../out
cat > gsub << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/scratch/hg/h/11.ooc {check out line+ /cluster/data/hg17/bed/fosends/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
gensub2 contigs.lst fosEnds.lst gsub jobList
foreach f (`cat fosEnds.lst`)
set d = $f:r:t
echo $d
mkdir -p /cluster/data/hg17/bed/fosends/out/$d
end
para create jobList
# 36860 jobs
para try
para check
para push
# CPU time in finished jobs: 1655943s 27599.05m 459.98h 19.17d 0.053 y
# IO & Wait Time: 101145s 1685.75m 28.10h 1.17d 0.003 y
# Average job time: 48s 0.79m 0.01h 0.00d
# Longest job: 1294s 21.57m 0.36h 0.01d
# Submission to last job: 19269s 321.15m 5.35h 0.22d
# sort, filter, and lift alignments
ssh eieio
cd /cluster/data/hg17/bed/fosends
pslSort dirs raw.psl temp out/fosEnds*
pslReps -nearTop=0.01 -minCover=0.70 -minAli=0.85 -noIntrons raw.psl \
fosEnds.psl /dev/null
# Processed 84096767 alignments
# cleanup
rm -r temp
rm raw.psl
rm -fr out /cluster/store7/kate/hg17/fosends
mkdir lifted
liftUp lifted/fosEnds.lifted.psl \
/cluster/data/hg17/jkStuff/liftAll.lft warn fosEnds.psl
pslSort dirs fosEnds.sorted.psl temp lifted
rmdir temp
wc -l *.sorted.psl
# 1693693 fosEnds.sorted.psl
set ncbiDir = /cluster/data/ncbi/fosends/human/fosends.3
~/bin/i386/pslPairs -tInsert=5000 -minId=0.94 -noBin -min=30000 -max=50000 -slop -short -long -orphan -mismatch -verbose fosEnds.sorted.psl $ncbiDir/fosEnds.pairs all_fosends fosEnds
# create header required by "rdb" tools
# TODO: replace w/ awk & sort
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
cat header fosEnds.pairs | row score ge 300 | sorttbl chr start | headchg -del > fosEndPairs.bed
cat header fosEnds.slop fosEnds.short fosEnds.long fosEnds.mismatch \
fosEnds.orphan \
| row score ge 300 | sorttbl chr start | headchg -del > fosEndPairsBad.bed
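# An untested awk/sort equivalent of the rdb filtering above
# (score is bed column 5; the awk output needs no header/headchg):
#   awk -F'\t' '$5 >= 300' fosEnds.pairs | sort -k1,1 -k2,2n \
#     > fosEndPairs.bed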
extractPslLoad -noBin fosEnds.sorted.psl fosEndPairs.bed \
fosEndPairsBad.bed | \
sorttbl tname tstart | headchg -del > fosEnds.load.psl
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/fosends
hgLoadBed hg17 fosEndPairs fosEndPairs.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/fosEndPairs.sql
# Loaded 384558 elements
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed hg17 fosEndPairsBad fosEndPairsBad.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/fosEndPairsBad.sql
# Loaded 30830 elements
#hgLoadPsl hg17 -nobin -table=all_fosends fosEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg17 -table=all_fosends fosEnds.load.psl
# load of all_fosends did not go as planned: 1526991 record(s), 0 row(s) skipped, 156 warning(s) loading psl.tab
# load sequences
mkdir -p /gbdb/hg17/fosends
ln -s /cluster/data/ncbi/fosends/human/fosends.3/fosEnds.fa \
/gbdb/hg17/fosends/fosEnds.fa
hgLoadSeq hg17 /gbdb/hg17/fosends/fosEnds.fa
# 1087670 sequences
# NOTE: extFile ID is 832625 (shouldn't be so large ??)
# may want to reset this.
# BAC END PAIRS TRACK (DONE - 2004-06-09 kate)
# Re-ran pslPairs with updated pairs file (2004-10-04 booch)
# Use latest BAC ends data from NCBI
# Checked ftp.ncbi.nih.gov/genomes/BACENDS/homo_sapiens,
# and files were unchanged from Terry's last download
# (to /cluster/store1/bacends.4)
# Link this stuff into central data area.
ssh eieio
cd /cluster/data/ncbi
mkdir -p bacends/human
ln -s /cluster/store1/bacends.4 bacends/human
cd bacends/human/bacends.4
faSize BACends.fa
# 400230494 bases (2743171 N's 397487323 real) in 832614 sequences
# 400M bases in 800K sequences
# use pre-split bacends files, and associated list for cluster run
ssh kk
cd /cluster/data/hg17
mkdir -p bed/bacends
cd bed/bacends
mkdir run
cd run
ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > contigs.lst
ls -1S /cluster/bluearc/hg/bacEnds/hs/*.fa > bacends.lst
# 380 contigs vs 98 bacends files -> 40K jobs
# send output to kksilo, as it can better handle the NFS load
# (these are quick jobs)
mkdir -p /cluster/store7/kate/hg17/bacends/out
ln -s /cluster/store7/kate/hg17/bacends/out ../out
cat > gsub << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/scratch/hg/h/11.ooc {check out line+ /cluster/data/hg17/bed/bacends/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
gensub2 contigs.lst bacends.lst gsub jobList
foreach f (`cat bacends.lst`)
set d = $f:r:t
echo $d
mkdir -p /cluster/data/hg17/bed/bacends/out/$d
end
para create jobList
# 37240 jobs written to batch
para try
para check
para push
# CPU time in finished jobs: 1573932s 26232.19m 437.20h 18.22d 0.050 y
# IO & Wait Time: 122751s 2045.86m 34.10h 1.42d 0.004 y
# Average job time: 46s 0.76m 0.01h 0.00d
# Longest job: 3312s 55.20m 0.92h 0.04d
# Submission to last job: 7148s 119.13m 1.99h 0.08d
cd ../out/BACends000
pslCheck *.psl
#Error: invalid PSL: AZ519021:1-575 NT_004559:1306426-1608347 - NT_004559.BACends000.psl:1101
#AZ519021 query block 3 start 283 < previous block end 575
# NOTE: inquired with JK regarding these results
# lift alignments
ssh eieio
cd /cluster/data/hg17/bed/bacends
pslSort dirs raw.psl temp out/BACends*
# takes hours ?
# 37240 files in 98 dirs
# Got 37240 files 193 files per mid file
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl bacEnds.psl /dev/null
# Processed 52291246 alignments
mkdir lifted
liftUp lifted/bacEnds.lifted.psl \
/cluster/data/hg17/jkStuff/liftAll.lft warn bacEnds.psl
pslSort dirs bacEnds.sorted.psl temp lifted
# cleanup
rmdir temp
rm -fr out /cluster/store7/kate/hg17/bacends
wc -l *.sorted.psl
# 2497227 bacEnds.sorted.psl
set ncbiDir = /cluster/data/ncbi/bacends/human/bacends.4
~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose bacEnds.sorted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds
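# (Roughly: -min/-max bound the implied clone insert size at
# 25-350 kb and -hardMax=500000 is an absolute cutoff; pairs that
# fall outside the limits or pair up wrongly land in the
# bacEnds.slop/short/long/mismatch/orphan files catted below.)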
# create header required by "rdb" tools
# TODO: replace w/ awk & sort
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
cat header bacEnds.pairs | row score ge 300 | sorttbl chr start | headchg -del > bacEndPairs.bed
cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch bacEnds.orphan \
| row score ge 300 | sorttbl chr start | headchg -del > bacEndPairsBad.bed
extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \
bacEndPairsBad.bed | \
sorttbl tname tstart | headchg -del > bacEnds.load.psl
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/bacends
hgLoadBed hg17 bacEndPairs bacEndPairs.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql
# Loaded 201380
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed hg17 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 81773
#hgLoadPsl hg17 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg17 -table=all_bacends bacEnds.load.psl
#load of all_bacends did not go as planned: 441072 record(s), 0 row(s) skipped, 30 warning(s) loading psl.tab
# Reloaded table, 2004-07-21, and got more rows:
# load of all_bacends did not go as planned: 1698790 record(s),
# 0 row(s) skipped, 63 warning(s) loading psl.tab
# load BAC end sequences
mkdir -p /gbdb/hg17/bacends
ln -s /cluster/data/ncbi/bacends/human/bacends.4/BACends.fa \
/gbdb/hg17/bacends/BACends.fa
hgLoadSeq hg17 /gbdb/hg17/bacends/BACends.fa
# 158588 sequences
# Re-ran pslPairs with an updated pairs file to take advantage of a
# new feature allowing comma-separated lists of end accessions for
# each end of a clone
# First, create new pairs file (bacEndPairs.txt, bacEndSingles.txt)
mkdir /cluster/data/ncbi/bacends/human/bacends.5
cd /cluster/data/ncbi/bacends/human/bacends.5
cp ../bacends.4/cl_ac_gi_len .
/cluster/bin/scripts/convertEndPairInfo cl_ac_gi_len
# Next, re-create the bed file
mkdir /cluster/data/hg17/bed/bacends.update
cd /cluster/data/hg17/bed/bacends.update
ln -s /cluster/data/hg17/bed/bacends/bacEnds.sorted.psl ./bacEnds.sorted.psl
set ncbiDir = /cluster/data/ncbi/bacends/human/bacends.5
~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose bacEnds.sorted.psl $ncbiDir/bacEndPairs.txt all_bacends bacEnds
# create header required by "rdb" tools
# TODO: replace w/ awk & sort
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
cat header bacEnds.pairs | row score ge 300 | sorttbl chr start | headchg -del > bacEndPairs.bed
cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch bacEnds.orphan \
| row score ge 300 | sorttbl chr start | headchg -del > bacEndPairsBad.bed
# wc *.bed
# 204884 2253724 20612402 bacEndPairs.bed
# 79401 873411 6527559 bacEndPairsBad.bed
# previous
# wc ../bacends/*.bed
# 201380 2215180 20280578 ../bacends/bacEndPairs.bed
# 81773 899503 6712402 ../bacends/bacEndPairsBad.bed
extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \
bacEndPairsBad.bed | \
sorttbl tname tstart | headchg -del > bacEnds.load.psl
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/bacends.update
hgLoadBed hg17 bacEndPairs bacEndPairs.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairs.sql
# Loaded 204884
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed hg17 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=/cluster/home/kate/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 79401
#hgLoadPsl hg17 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg17 -table=all_bacends bacEnds.load.psl
# load of all_bacends did not go as planned: 1729146 record(s), 0 row(s) skipped, 70 warning(s) loading psl.tab
# PLACE ASSEMBLY CLONES - misc instructions, only somewhat relevant
# See PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE below
# A SECOND ATTEMPT AT CLONE ALIGNMENT
# Split the clones into 3K pieces into about 1000 fa files
# Example:
zcat Z99916.1.fa.gz Z99774.1.fa.gz Z99756.7.fa.gz | faSplit size stdin 3000 /tmp/name.fa -lift=/tmp/name.lft -oneFile
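# the 3K pieces can later be lifted back to whole-clone coordinates
# with the lift file, along these lines (a sketch; file names here
# are placeholders):
#   liftUp -pslQ lifted.psl /tmp/name.lft warn pieces.psl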
# Trying this idea in unPlacedBatch
ssh kk0
mkdir /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
cd /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs > nibList
ls -1S /cluster/data/hg17/bed/contig_overlaps/blatClones > cloneList
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fastMap -ooc=/scratch/hg/h/11.ooc -q=dna -t=dna {check in exists /scratch/hg/gs.18/build35/bothMaskedNibs/$(path1)} {check in exists+ /cluster/data/hg17/bed/contig_overlaps/blatClones/$(path2)} {check out line+ psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
mkdir psl
cat nibList | sed -e "s/.nib//" | while read D
do
mkdir psl/$D
done
gensub2 nibList cloneList gsub jobList
para create jobList
# PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE (DONE - 2004-07-12 - Hiram)
ssh eieio
mkdir /cluster/data/hg17/bed/contig_overlaps
cd /cluster/data/hg17/bed/contig_overlaps
# find all the clones that were used in the assembly
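# (in AGP format, column 5 is the component type -- "N" marks a gap
# line -- and column 6 is the component accession)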
sed -e "/^#.*/d" /cluster/data/hg17/ncbi_build35.agp | \
awk '{if (!match($5,"N")) {print $6}}' | \
sort -u > placed_in_assembly.list
wc -l placed_in_assembly.list
# 26872 placed_in_assembly.list
# These may be available from the phases files at:
# ftp://ftp.ncbi.nih.gov/genbank/genomes/H_sapiens
# which are easily fetched with wget. However, I took a look
# at those and could not find all the clones in them. There may
# be a versioning problem, because these phases files are often
# updated.
# Fetch them from Genbank with the following three Perl scripts:
# [hiram@hgwdev /cluster/data/hg17/bed/contig_overlaps] ls -og *.pl
# -rwxrwxr-x 1 3047 May 24 18:43 bioPerlFetch.pl
# -rwxrwxr-x 1 2370 Jun 4 15:21 fetchGenbank.pl
# -rwxrwxr-x 1 700 May 24 21:47 foldEm.pl
# Which takes about 4 days ...
# Example,
cat << '_EOF_' > terrys.list
AC011841.7
AC018692.9
AC018743.27
AC037482.14
AL163540.11
'_EOF_'
# << this line makes emacs coloring happy
# only works on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/contig_overlaps
mkdir fasta
time ./fetchGenbank.pl terrys.list > fetchResult.out 2>&1
# There is a bit of behind-the-scenes hocus pocus going on here.
# This is a tedious task of comparing various lists with each
# other and making sure everything matches. Manual fixups are
# done for the newly named 6_hla_hap* items, and the PAR clones
# are duplicated so that X and Y both carry the same set of
# clones. The end result should be a directory hierarchy here
# with a directory for each chrom, each random, and the
# 6_hla_hap? items, where each directory contains the clones
# that belong to that chromosome. The leftovers are the unplaced
# clones, which end up in the directory called unPlaced. The
# instructions here are merely a guideline of possibilities.
# Care should be taken to make sure all listings are correct and
# everything gets in the right place.
ssh eieio
# And then make a list of all clones considered for assembly:
sed -e "/^#.*/d" /cluster/store5/gs.18/ncbi/sequence.inf | \
grep for_assembly | awk '{print $1}' | sort -u > sequence.list
wc -l sequence.list
# 46733 sequence.list
# Verify overlaps are correct:
comm -12 placed_in_assembly.list sequence.list > inBoth
comm -23 placed_in_assembly.list sequence.list > inAssemblyNotSequence
comm -13 placed_in_assembly.list sequence.list > inSequenceNotAssembly
wc in*
# 1 1 12 inAssemblyNotSequence
# 26871 26871 301709 inBoth
# 19862 19862 219050 inSequenceNotAssembly
# 46734 46734 520771 total
# This stray one is from Terry's five additions in the final fixup
# phase with Greg:
cat inAssemblyNotSequence
# AC018743.27
# Terry added: AC011841.7 AC018692.9 AC018743.27 AC037482.14 AL163540.11
#
# Generate a listing that relates clones to their contigs
sed -e "/^#.*/d" /cluster/store5/gs.18/build35/ncbi_build35.agp | \
./contigAcc.pl > disburseEm.list
#
# Using that list, sort the downloaded clones into their
# respective chrom directories:
./disburse.sh
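# disburse.sh is a local helper script; roughly, it does something
# along these lines (a sketch -- assumes disburseEm.list holds
# "chrom clone.fa.gz contig" per line, as used below):
#   grep -v "^#" disburseEm.list | while read CHR CLONE CTG
#   do
#       mkdir -p ./${CHR}
#       mv fasta/${CLONE} ./${CHR}/
#   done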
# Check the number of sequences obtained:
find ./? ./?? ./*_random ./6_hla* -type f | wc -l
# 26872
# So, why is this number one more than the inBoth list ?
# Because, the official NCBI sequence.inf file is missing one of
# the clones that Terry added: AC018743.27
# And it shows up in our check list above as inAssemblyNotSequence
# It isn't exactly missing, it just isn't marked "for_assembly"
# OK, with everything in place, we are ready to try to find
# all these items in the assembly by running a cluster job on
# each of the chroms, matching the items that are supposed to be
# included in that chrom. We need to get things set up on the
# Iservers: psLayout is heavy on disk I/O and it brings everything
# down if allowed to work on any NFS filesystems for input.
# It appears that psLayout wants an ooc file of tile size 10
# I tried making one for the whole assembly but it seemed to
# include too much for some contigs and it caused a lot of
# alignments to be missed. Thus, create an ooc file for each
# contig
ssh eieio
mkdir /cluster/bluearc/scratch/hg/gs.18/build35/contigOoc10
cd /cluster/bluearc/scratch/hg/gs.18/build35/contigOoc10
ls ../maskedContigs | sed -e "s/.fa//" | while read CONTIG
do
blat -repMatch=256 -makeOoc=${CONTIG}.10.ooc -tileSize=10 \
../maskedContigs/${CONTIG}.fa \
../maskedContigs/${CONTIG}.fa /dev/null
echo "done: ${CONTIG}"
done
# Copy that result to the Iservers:
ssh kkr1u00
mkdir /iscratch/i/gs.18/build35/contigOoc10
cd /iscratch/i/gs.18/build35/contigOoc10
rsync -arlv /cluster/bluearc/scratch/hg/gs.18/build35/contigOoc10/ .
# And, copy the clone sequences:
mkdir /iscratch/i/gs.18/build35/clones
cd /cluster/store5/gs.18/build35/bed/contig_overlaps
for D in ? ?? *_random 6_hla_hap?
do
rsync -arlv `pwd`/${D} /iscratch/i/gs.18/build35/clones
done
/cluster/bin/iSync
ssh kk
cd /cluster/data/hg17/bed/contig_overlaps
mkdir psl
cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
# runPsLayout.sh <chrom> <clone> <contig>
# where <chrom> is the chrom this contig is on
# <clone> is one of the .fa.gz files in
# /cluster/data/hg17/bed/contig_overlaps/*/<clone>.fa.gz
# without the .fa.gz extension
# This stuff has been mirrored to:
# /iscratch/i/gs.18/build35/clones/*/<clone>.fa.gz
# <contig> is one of the contigs found in:
# /cluster/store5/gs.18/build35/<chrom>/<contig>/<contig>.fa
#
CHROM=$1
CLONE=$2
CONTIG=$3
TARGET=/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}.fa
FAZ=/iscratch/i/gs.18/build35/clones/${CHROM}/${CLONE}.fa.gz
OOC=/iscratch/i/gs.18/build35/contigOoc10/${CONTIG}.10.ooc
mkdir -p psl/${CONTIG}
if [ ! -s ${FAZ} ]; then
echo "Can not find: ${FAZ}"
exit 255
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}"
exit 255
fi
if [ ! -s ${OOC} ]; then
echo "Can not find: ${OOC}"
exit 255
fi
zcat ${FAZ} > /tmp/${CLONE}.fa
$HOME/bin/i386/psLayout ${TARGET} \
/tmp/${CLONE}.fa genomic ${OOC} psl/${CONTIG}/${CLONE}.psl
RET=$?
rm -f /tmp/${CLONE}.fa
exit ${RET}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x runPsLayout.sh
# make up a listing of chrom, clone, contig from:
grep -v "^#" disburseEm.list | sed -e "s/.fa.gz//" > chr.clone.contig.list
wc -l chr.clone.contig.list
# 26872 chr.clone.contig.list
awk '{
printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s.psl}\n",
$1, $2, $3, $3, $2
}' chr.clone.contig.list > jobList
# << this line makes emacs coloring happy
# To do a quick test, run just chr22:
grep -v "^22" chr.clone.contig.list | awk '{
printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s.psl}\n",
$1, $2, $3, $3, $2
}' > jobList
para create jobList
para try ... check ... etc ...
# One run on chr22 took:
# Completed: 561 of 561 jobs
# CPU time in finished jobs: 927068s 15451.14m 257.52h 10.73d 0.029 y
# IO & Wait Time: 6295s 104.91m 1.75h 0.07d 0.000 y
# Average job time: 1664s 27.73m 0.46h 0.02d
# Longest job: 69745s 1162.42m 19.37h 0.81d
# Submission to last job: 69780s 1163.00m 19.38h 0.81d
# put the results together, filter, lift and load:
cd /cluster/data/hg17/bed/contig_overlaps/psl
pslSort dirs raw.psl tmp N*
pslReps -singleHit raw.psl repsSingle.psl /dev/null
liftUp chr22.psl /cluster/data/hg17/jkStuff/liftAll.lft \
warn repsSingle.psl
hgLoadPsl -table=cloneTest hg17 chr22.psl
# There are a number of clones listed in the sequence.inf file
# as status W with names beginning AACC AADB AADC AADD
# These are the whole-genome shotgun assemblies from the Celera genome.
# A few of them were used in the assembly of the NCBI genome, namely:
./11/AADB01066164.1.fa.gz
./11/AADC01095577.1.fa.gz
./11/AADD01116830.1.fa.gz
./11/AADD01118406.1.fa.gz
./11/AADD01116787.1.fa.gz
./11/AADD01112371.1.fa.gz
./11/AADD01116788.1.fa.gz
./11/AADD01115518.1.fa.gz
./11/AADD01118410.1.fa.gz
./11/AADD01117999.1.fa.gz
./21/AADD01172789.1.fa.gz
./21/AADD01172788.1.fa.gz
./21/AADD01209098.1.fa.gz
./21/AADD01172902.1.fa.gz
# These have been distributed properly into their corresponding
# chromosome directories. The rest of them, 26, all with names
# starting AACC, are in the directory here: celeraOnly
# To run the unPlaced alignments.
# Prepare scratch and iscratch
ssh eieio
mkdir /cluster/bluearc/scratch/hg/gs.18/build35/clones/unPlaced
rsync -arlv /cluster/data/hg17/bed/contig_overlaps/unPlaced/ \
/cluster/bluearc/scratch/hg/gs.18/build35/clones/unPlaced
# request scratch sync to cluster admins
ssh kkr1u00
mkdir /iscratch/i/gs.18/build35/clones/unPlaced
rsync -arlv /cluster/data/hg17/bed/contig_overlaps/unPlaced/ \
/iscratch/i/gs.18/build35/clones/unPlaced
/cluster/bin/iSync
ssh hgwdev
cd /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
# There are too many to try them all; obtain guidelines from the
# hg16 clone-to-contig mapping:
hgsql -N -e "select name,chrom from clonePos;" hg16 > hg16.clone.chrom
hgsql -N -e "select contig,chrom from ctgPos;" hg16 > hg16.contig.chrom
ssh kk
mkdir /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
cd /cluster/data/hg17/bed/contig_overlaps/unPlacedBatch
ls ../unPlaced | sed -e "s/.fa.gz//" > unPlaced.clone.list
wc -l unPlaced.clone.list
# 19836 unPlaced.clone.list
ls -1S /scratch/hg/gs.18/build35/maskedContigs > contig.list
wc -l contig.list
# 380 contig.list
cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
# kkiPsLayout.sh <clone> <contig>
# <clone> is one of the .fa.gz files in
# /scratch/hg/gs.18/build35/clones/unPlaced
# without the .fa.gz extension
# <contig> is one of the contigs found in:
# /iscratch/i/gs.18/build35/maskedContigs
#
CLONE=$1
CONTIG=$2
TARGET=/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}.fa
FAZ=/scratch/hg/gs.18/build35/clones/unPlaced/${CLONE}.fa.gz
OOC=/iscratch/i/gs.18/build35/contigOoc10/${CONTIG}.10.ooc
mkdir -p psl/${CONTIG}
if [ ! -s ${FAZ} ]; then
echo "Can not find: ${FAZ}"
exit 255
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}"
exit 255
fi
if [ ! -s ${OOC} ]; then
echo "Can not find: ${OOC}"
exit 255
fi
zcat ${FAZ} > /tmp/${CLONE}.fa
$HOME/bin/i386/psLayout ${TARGET} \
/tmp/${CLONE}.fa genomic ${OOC} psl/${CONTIG}/${CLONE}.psl
RET=$?
rm -f /tmp/${CLONE}.fa
exit ${RET}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x runPsLayout.sh
cat << '_EOF_' > gsub
#LOOP
./runPsLayout.sh $(path1) $(path2) {check out line+ psl/$(path2)/$(path1).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 unPlaced.clone.list contig.list gsub jobList
# XXXX - some time later ... 2004-07-12
# Bringing this sequence to a close. Difficulties encountered:
# Placed clones that did not survive the psLayout filter:
# AC006040.3 AC006328.5 AC007039.6 AC007241.3 AC007965.3
# AC009947.2 AC010682.2 AC012005.4 AC016707.2 AC016728.4
# AC016752.2 AC017005.7 AC025226.4 AC025246.6 AC055713.29
# AC068541.7 AC068601.8 AC068704.4 AC073649.3 AC073962.5
# AC091175.11 AC095381.1 AC104597.3 AC130223.2 AC130814.3
# AC133883.6 AC139103.3 AF003627.3 AF135405.3 AL021878.2
# AL137064.6 AL356803.2 AL390801.4 AL591480.8 AL901608.1
# AP005814.2 BX322790.2 Z84489.1 Z84814.1
# And placed clones that broke into two pieces during their
# psLayout alignment:
# AC006982.3 AC007742.4 AC023342.3 AC024183.4 AC025735.4
# AC095380.1 AL646104.4 BX293536.4
# For the above clones, their assignments in ref_placed.agp were
# used instead of trying to adjust the psLayout process.
# The PAR clones are a problem. They were placed properly, but
# during their load with hgClonePos there was a warning issued
# about their dual existence. hgClonePos said they were only
# going to be placed on chrX and not on chrY. However in the
# browser when chrY is viewed it issues errors about these not
# having proper coordinates in the clonePos table. These were
# removed from the coverage track to eliminate that error.
# AL954722.18 BX537334.4 BX000483.7 BX908402.3 BX649635.3 BX119919.5
# AC079176.15 AC097314.27 AC006209.25 AJ271735.1 AJ271736.1
#
# And finally, after many different types of alignment attempts,
# there remain 1489 un-placed clones that could not be located.
# While trying to figure out which contigs many clones belonged
# to, the following cluster run script was used to take a survey
# using blat:
#!/bin/sh
# runBlat.sh <clone> <contig>
# <clone> is one of the .fa.gz files in
# /scratch/hg/gs.18/build35/clones/
# without the .fa.gz extension
# <contig> is one of the contigs found in:
# /iscratch/i/gs.18/build35/maskedContigs
#
# ./runBlat.sh unPlaced/AB000876.1.fa.gz NT_005612.fa {check out line+
# psl/NT_005612.fa/unPlaced/AB000876.1.fa.gz.psl}
#
HERE=`pwd`
CLONE=$1
CLONEDIR=`dirname ${CLONE}`
CLONENAME=`basename ${CLONE}`
CLONESRC=/iscratch/i/gs.18/build35/clones/${CLONE}.fa.gz
CONTIG=$2
CONTIGBASE=${CONTIG/.fa/}
TARGET=/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}
if [ ! -s ${CLONESRC} ]; then
echo "Can not find: ${CLONESRC}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}" 1>/dev/stderr
exit 255
fi
mkdir -p /tmp/${CLONEDIR}/${CLONENAME}
zcat ${CLONESRC} > /tmp/${CLONEDIR}/${CLONENAME}/${CLONENAME}.fa
cd /tmp/${CLONEDIR}
/cluster/data/hg17/bed/contig_overlaps/FfaSplit/faToFfa ${CLONENAME}
ECOUNT=`cat error.convert | wc -l`
if [ "${ECOUNT}" -ne 0 ]; then
echo "Error during faToFfa, error.convert not empty" 1>/dev/stderr
exit 255
fi
rm -f error.convert
B=${CLONENAME/\.*/}
cd /tmp/${CLONEDIR}/${CLONENAME}
faSplit byname ${CLONENAME}.fa .
RET=0
export RET
for F in ${CLONENAME}_*.fa
do
FA=${F/_*.fa/}
A=${FA/.[0-9]*/}
P=${F/.fa/}
N=${P##*_}
rm -f t.fa
mv ${F} t.fa
cat t.fa | faSplit -oneFile size stdin 3000 ${A}_${N}
rm -f t.fa
blat ${TARGET} ${A}_${N}.fa -ooc=/scratch/hg/h/11.ooc ${A}_${N}.psl \
-t=dna -q=dna -fastMap -noHead
RET=$?
if [ "$RET" -ne 0 ]; then
echo "Error during blat ${TARGET} ${A}_${N}.fa" 1>/dev/stderr
break
fi
done
rm -f ${CLONENAME}.fa
rm -f ${B}_*.fa
cd ${HERE}
mkdir -p psl/${CONTIGBASE}
sed -e "s/${A}/${CLONENAME}/" /tmp/${CLONEDIR}/${CLONENAME}/*.psl > \
psl/${CONTIGBASE}/${CLONENAME}.psl
rm -f /tmp/${CLONEDIR}/${CLONENAME}/*.psl
rmdir --ignore-fail-on-non-empty /tmp/${CLONEDIR}/${CLONENAME}
rmdir --ignore-fail-on-non-empty /tmp/${CLONEDIR}
exit ${RET}
# The alignments with psLayout were done with the following cluster
# run script:
#!/bin/sh
# kkiPsLayout.sh <clone> <contig>
# <clone> is one of the .fa.gz files in
# /scratch/hg/gs.18/build35/clones/unPlaced
# without the .fa.gz extension
# <contig> is one of the contigs found in:
# /iscratch/i/gs.18/build35/maskedContigs
#
# ./runPsLayout.sh unPlaced/AP001966.2 NT_016354 {check out exists
# psl/NT_016354/AP001966.2.psl}
#
HERE=`pwd`
CLONE=$1
CONTIG=$2
CLONEDIR=`dirname ${CLONE}`
CLONENAME=`basename ${CLONE}`
RESULT=psl/${CONTIG}/${CLONENAME}.psl
CLONESRC=/iscratch/i/gs.18/build35/clones/${CLONE}.fa.gz
TARGET=/iscratch/i/gs.18/build35/maskedContigs/${CONTIG}.fa
OOC=/iscratch/i/gs.18/build35/contigOoc10/${CONTIG}.10.ooc
if [ ! -s ${CLONESRC} ]; then
echo "Can not find: ${CLONESRC}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${OOC} ]; then
echo "Can not find: ${OOC}" 1>/dev/stderr
exit 255
fi
mkdir -p /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}
zcat ${CLONESRC} > /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}.fa
cd /tmp/${CONTIG}
/cluster/data/hg17/bed/contig_overlaps/FfaSplit/faToFfa ${CLONEDIR}
cd ${HERE}
mkdir -p psl/${CONTIG}
$HOME/bin/i386/psLayout ${TARGET} /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}.fa genomic ${OOC} ${RESULT}
RET=$?
rm -f /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}.fa /tmp/${CONTIG}/error.convert
rmdir --ignore-fail-on-non-empty /tmp/${CONTIG}/${CLONEDIR}/${CLONENAME}
rmdir --ignore-fail-on-non-empty /tmp/${CONTIG}/${CLONEDIR}/
rmdir --ignore-fail-on-non-empty /tmp/${CONTIG}
exit ${RET}
# BUILD KNOWN GENES TABLES (DONE 6/8/04 Fan)
# Build sp040515 and proteins040515 DBs first.
hgsql hg17 -e "create database kgHg17"
cd /cluster/store6/kgDB/bed
mkdir kgHg17
cd /cluster/store6/kgDB/bed/kgHg17
~/src/hg/protein/KGprocess.sh kgHg17 hg17 040515
# The script was run successfully with the last message:
# Tue Jun 8 15:36:52 PDT 2004 DONE
# After initial inspection of tables in kgHg17, do the following
# from the mySql prompt:
alter table kgHg17.cgapAlias rename as hg17.cgapAlias;
alter table kgHg17.cgapBiocDesc rename as hg17.cgapBiocDesc;
alter table kgHg17.cgapBiocPathway rename as hg17.cgapBiocPathway;
alter table kgHg17.dupSpMrna rename as hg17.dupSpMrna;
alter table kgHg17.keggMapDesc rename as hg17.keggMapDesc;
alter table kgHg17.keggPathway rename as hg17.keggPathway;
alter table kgHg17.kgAlias rename as hg17.kgAlias;
alter table kgHg17.kgProtAlias rename as hg17.kgProtAlias;
alter table kgHg17.kgXref rename as hg17.kgXref;
alter table kgHg17.knownGene rename as hg17.knownGene;
alter table kgHg17.knownGeneLink rename as hg17.knownGeneLink;
alter table kgHg17.knownGeneMrna rename as hg17.knownGeneMrna;
alter table kgHg17.knownGenePep rename as hg17.knownGenePep;
alter table kgHg17.mrnaRefseq rename as hg17.mrnaRefseq;
alter table kgHg17.spMrna rename as hg17.spMrna;
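# Equivalently, these renames can be generated with a shell loop:
#   for T in cgapAlias cgapBiocDesc cgapBiocPathway dupSpMrna \
#       keggMapDesc keggPathway kgAlias kgProtAlias kgXref knownGene \
#       knownGeneLink knownGeneMrna knownGenePep mrnaRefseq spMrna
#   do
#       hgsql hg17 -e "alter table kgHg17.${T} rename as hg17.${T};"
#   done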
# hg17.knownGene has 43,401 entries and hg16.knownGene has 43,232 entries,
# and running featureBits shows:
featureBits hg17 knownGene
# 63983072 bases of 2866216770 (2.232%) in intersection
featureBits hg16 knownGene
# 63781799 bases of 2865248791 (2.226%) in intersection
# Connect to genome-testdb and use hgcentraltest DB.
# Add a new entry in gdbPdb table:
insert into gdbPdb values('hg17', 'proteins040515');
# CREATE LINEAGE-SPECIFIC REPEATS FOR BLASTZ WITH ZEBRAFISH
# (DONE, 2004-06-08, hartera)
# Treat all repeats as lineage-specific
mkdir /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish
foreach f (/iscratch/i/gs.18/build35/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
end
iSync
# PREP FOR LIFTOVER CHAINS TO THIS ASSEMBLY (2004-06-10 kate)
# split into 3K chunks
ssh eieio
set liftDir = /cluster/data/hg17/bed/liftOver/liftSplit
mkdir -p $liftDir
cd $liftDir
cat > split.csh << 'EOF'
set splitDir = /iscratch/i/hg17/liftOver/split
mkdir -p $splitDir
set liftDir = /cluster/data/hg17/bed/liftOver/liftSplit
foreach n (`ls /cluster/data/hg17/nib`)
set c = $n:r
# chrom fa files live in build dirs named after the bare chromosome
# (e.g. /cluster/data/hg17/1/chr1.fa), so strip "chr" and "_random"
set d = `echo $c | sed -e 's/^chr//' -e 's/_random$//'`
echo $c
faSplit -lift=$liftDir/$c.lft size \
/cluster/data/hg17/$d/$c.fa -oneFile 3000 $splitDir/$c
end
'EOF'
# << for emacs
csh split.csh >&! split.log &
tail -100f split.log
ssh kkr1u00
iSync
# STS MARKERS (DONE 2004-07-21 kate)
# MANUAL UPDATE OF D21S168 and D21S167 (DONE, 2005-02-11, hartera)
# FILTERED OUT noOoc ALIGNMENTS WITH tBaseInsert >=1000
# (DONE, 2005-02-17, hartera) AND RELOADED stsMap, stsInfo2 and all_sts_seq
# DATABASE TABLES AFTER ADDING FILTERED ALIGNMENTS TO all_sts_seq AND
# REMOVING DATA FROM stsMap and stsInfo2 FOR THE MARKERS REMOVED FROM THE
# FILTERED SET (DONE, 2005-02-18, hartera)
# UPDATE PSL ALIGNMENTS FOR D21S167 and D21S168 AND RELOAD INTO all_sts_seq
# (DONE, 2005-02-23, hartera)
# UPDATED stsAlias TABLE, REMOVING IDs OF FILTERED ALIGNMENTS (2005-02-24, hartera)
# Terry's sts.9 dir is in /cluster/store5/sts.2004-07.old
# remove this after verifying the newer version
# update from NCBI (booch)
ssh eieio
# use store5 for space
mkdir -p /cluster/store5/sts.2004-07
ln -s /cluster/store5/sts.2004-07 /cluster/data/ncbi
ln -s /cluster/data/ncbi/sts.2004-07 sts.9
cd /cluster/data/ncbi/sts.2004-07
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
gunzip sts.gz
mv sts dbSTS.fa
# incremental update from previous build
# NOTE: could mysql dump this, unless hand-updated (like hg16)
# First - copy from Terry's dir
ssh eieio
ln -s /cluster/store1/sts.8 /cluster/data/ncbi
cd /cluster/data/ncbi/sts.9
# this time, snag from Terry's dir
cd /cluster/data/ncbi/sts.9
cp -p ~booch/tracks/update/all.STS.fa.prev .
cp -p ~booch/tracks/update/stsInfo2.bed stsInfo2.bed.prev
# Convert dbSTS.fa file to easier reading format, and get accessions
/cluster/bin/scripts/convertGbFaFile dbSTS.fa > dbSTS.convert.fa
grep ">" dbSTS.convert.fa | cut -f 2 -d ">" > dbSTS.acc
# NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
# all.STS.fa, stsAlias.bed files
updateStsInfo -verbose=1 -gb=dbSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
dbSTS.sts dbSTS.aliases dbSTS.convert.fa new
# 129991 SWXD2599 99622 (0) not in dbSTS anymore
# 166473 D3S3812 154523 (0) not in dbSTS anymore
# 185776 RH83562 209614 (0) not in dbSTS anymore
mv new.info stsInfo2.bed
mv new.primers all.primers
mv new.alias stsAlias.bed
mv new.fa all.STS.fa
# get list of all STS id's in the fasta file
sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n > all.STS.id
wc -l all.STS.id
# 92674 total sequences
/cluster/bin/scripts/convertPrimerToFA all.primers > all.primers.fa
# Copy stsInfo2.bed and stsAlias.bed to the data directory because
# these will be loaded into the database later
mkdir -p /cluster/data/hg17/bed/sts
cp stsInfo2.bed /cluster/data/hg17/bed/sts/
cp stsAlias.bed /cluster/data/hg17/bed/sts/
# Create sts sequence alignments
mkdir -p /cluster/bluearc/sts.9/sts.split
faSplit sequence all.STS.fa 50 /cluster/bluearc/sts.9/sts.split/sts
cp /cluster/data/ncbi/sts.9/all.STS.fa /cluster/bluearc/sts.9
# create small ooc file to use with alignments (if not existing)
# NOTE: these were just used for experimenting; weren't used in
# final runs
ssh kolossus
cd /cluster/data/hg17/bed/sts
ls /cluster/bluearc/hg17/bothMaskedNibs/chr*.nib > nib.lst
blat nib.lst /dev/null /dev/null \
-tileSize=11 -makeOoc=/cluster/bluearc/hg/h/11.4096.ooc -repMatch=4096
blat nib.lst /dev/null /dev/null \
-tileSize=11 -makeOoc=/cluster/bluearc/hg/h/11.16384.ooc -repMatch=16384
ssh kk
cd /cluster/data/hg17/bed/sts
mkdir run
cd run
ls -1S /scratch/hg/hg17/maskedContigs/*.fa > contigs.lst
ls -1S /cluster/bluearc/sts.9/sts.split/sts*.fa > sts.lst
mkdir -p /cluster/bluearc/hg17/sts/sts/out
foreach f (`cat sts.lst`)
set d = $f:t:r
mkdir /cluster/bluearc/hg17/sts/sts/out/$d
end
# create alignments
cat > template << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -ooc=/cluster/bluearc/hg/h/11.ooc -stepSize=5 {check out line+ /cluster/bluearc/hg17/sts/sts/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
gensub2 contigs.lst sts.lst template jobList
para create jobList
# 17860 jobs
para try
para check
para push
# CPU time in finished jobs: 216985s 3616.41m 60.27h 2.51d 0.007 y
# IO & Wait Time: 48790s 813.17m 13.55h 0.56d 0.002 y
# Average job time: 15s 0.25m 0.00h 0.00d
# Longest job: 267s 4.45m 0.07h 0.00d
# Submission to last job: 2228s 37.13m 0.62h 0.03d
# Compile sts sequence results
ssh kolossus
cd /cluster/bluearc/hg17/sts/sts
pslSort dirs raw.psl temp out/*
rm -rf temp
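# pslReps flags: -nearTop keeps alignments scoring within that
# fraction of the best, -minCover is the minimum fraction of the
# query covered, -minAli the minimum alignment identity, and
# -noIntrons disallows intron-style gaps (markers are genomic)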
pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons raw.psl \
stsMarkers.psl /dev/null
# Processed 7121016 alignments
#cp stsMarkers.psl /cluster/data/hg17/bed/sts/run
# Lift them and get them ready to combine with primer alignments
#cd /cluster/data/hg17/bed/sts/run
#liftUp -nohead /cluster/data/hg17/bed/sts/run/stsMarkers.lifted.psl \
liftUp -nohead stsMarkers.lifted.psl \
/cluster/data/hg17/jkStuff/liftAll.lft warn stsMarkers.psl
# missing some utilities for kolossus, so switch to fileserver
# NOTE: probably no longer true -- try on kolossus next time
ssh kksilo
cd /cluster/bluearc/hg17/sts/sts
/cluster/bin/scripts/extractPslInfo stsMarkers.lifted.psl
# creates <file>.initial
/cluster/bin/scripts/findAccession -agp stsMarkers.lifted.psl.initial \
/cluster/data/hg17
# "Could not open /cluster/data/hg17/Y/chrY_random.agp" etc.
# Looks like it trys all _randoms (even one's that don't
# exist/aren't needed
# creates <file>.acc
#rm stsMarkers.lifted.psl.initial
sort -k 4n stsMarkers.lifted.psl.initial.acc > stsMarkers.final
#rm stsMarkers.lifted.psl.initial.acc
#cp stsMarkers.final stsMarkers.lifted.psl.initial /cluster/data/hg17/bed/sts
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
wc -l stsMarkers.found
# 89532 stsMarkers.found
# out of 92674 total sequences
# extract sequences for markers not yet found, and
# blat w/o ooc to try to place more
comm -1 -3 stsMarkers.found /cluster/data/ncbi/sts.9/all.STS.id \
> stsMarkers.notFound
wc -l stsMarkers.notFound
# 3142 stsMarkers.notFound
faSomeRecords /cluster/data/ncbi/sts.9/all.STS.fa stsMarkers.notFound \
notFound.STS.fa
mkdir /cluster/bluearc/sts.9/sts.splitNotFound
faSplit sequence notFound.STS.fa 20 \
/cluster/bluearc/sts.9/sts.splitNotFound/sts
# blat with 11.ooc misses alignments, so reblat w/o the
# sequences that aren't found
# NOTE: after filtering, this step yields only 149 markers placed
# (out of 3142); not enough to justify this step next time
ssh kk
cd /cluster/data/hg17/bed/sts
mkdir run.noOoc
cd run.noOoc
ls -1S /scratch/hg/hg17/maskedContigs/*.fa > contigs.lst
ls -1S /cluster/bluearc/sts.9/sts.splitNotFound/sts*.fa > sts.lst
mkdir -p /cluster/bluearc/hg17/sts/sts/out.noOoc
foreach f (`cat sts.lst`)
set d = $f:t:r
mkdir /cluster/bluearc/hg17/sts/sts/out.noOoc/$d
end
cat > template << 'EOF'
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -stepSize=5 {check out line+ /cluster/bluearc/hg17/sts/sts/out.noOoc/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
gensub2 contigs.lst sts.lst template jobList
para create jobList
# 7220 jobs written to batch
para try
para check
# process this set of alignments
ssh kolossus
cd /cluster/bluearc/hg17/sts/sts
pslSort dirs raw.noOoc.psl temp out.noOoc/*
rm -rf temp
pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons \
raw.noOoc.psl stsMarkers.noOoc.psl /dev/null
# Processed 4254094 alignments
#cp stsMarkers.psl /cluster/data/hg17/bed/sts/run
# Lift them and get them ready to combine with primer alignments
liftUp -nohead stsMarkers.noOoc.lifted.psl \
/cluster/data/hg17/jkStuff/liftAll.lft warn stsMarkers.noOoc.psl
/cluster/bin/scripts/extractPslInfo stsMarkers.noOoc.lifted.psl
# creates <file>.initial
/cluster/bin/scripts/findAccession -agp \
stsMarkers.noOoc.lifted.psl.initial /cluster/data/hg17
# "Could not open /cluster/data/hg17/Y/chrY_random.agp" etc.
# Looks like it trys all _randoms (even one's that don't
# exist/aren't needed
# creates <file>.acc
#rm stsMarkers.lifted.psl.initial
mv stsMarkers.final stsMarkers.ooc.final
sort -k 4n stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.extra
sort -k 4n stsMarkers.lifted.psl.initial.acc \
stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.final
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
wc -l stsMarkers.found
# 89681 stsMarkers.found
cut -f 4 stsMarkers.extra | sort -n -u > stsMarkers.extra.found
wc -l stsMarkers.extra.found
# 149 out of 3142 attempted
# out of 92674 total sequences
cp stsMarkers.final stsMarkers.lifted.psl stsMarkers.*lifted.psl.initial* stsMarkers.found \
/cluster/data/hg17/bed/sts
# Alignments from noOoc set were not added to all_sts_seq but info for the markers
# is in stsMap and stsInfo2. Some of the alignments are bad so filter by removing
# all alignments from noOoc psl file where tBaseInsert >=1000. Add the remaining
# alignments to the set of final alignments for stsMarkers. The information for the
# removed markers from the filtered set was also removed from stsMap and stsInfo2.
# (DONE, 2005-02-17, hartera)
ssh eieio
cd /cluster/data/hg17/bed/sts/fix
cp /cluster/bluearc/hg17/sts/sts/stsMarkers.noOoc.lifted.psl .
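# (tBaseInsert is field 8 of a psl line, hence the $8 below)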
awk '{if ($8 < 1000) print;}' stsMarkers.noOoc.lifted.psl > stsMarkers.noOoc.lifted.filt1000.psl
wc -l *.filt*.psl
# 254 5334 26384 stsMarkers.noOoc.lifted.filt1000.psl
sort -k 4n /cluster/bluearc/hg17/sts/sts/stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.extra
awk '{print $4;}' stsMarkers.extra | sort -n | uniq > extra.ids
# in psl file, the ids are the 10th field
awk '{print $10;}' stsMarkers.noOoc.lifted.psl | sort -n | uniq \
> noOoc.ids
diff extra.ids noOoc.ids
# there is no difference as expected
# get list of IDs from filtered file, filter < 1000
awk '{print $10;}' stsMarkers.noOoc.lifted.filt1000.psl \
| sort -n | uniq > filt1000.ids
foreach i (`cat filt1000.ids`)
awk 'BEGIN {OFS="\t"} \
{if ($4 == "'$i'") print $1, $2, $3, $4, $5, $6, $7}' \
stsMarkers.extra >> stsMarkers.extra.filt1000
end
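# a single-pass equivalent of the loop above (sketch): read the id
# list into awk first, then filter stsMarkers.extra once
#   awk 'BEGIN {OFS="\t"} NR==FNR {ids[$1]=1; next} \
#       ($4 in ids) {print $1,$2,$3,$4,$5,$6,$7}' \
#       filt1000.ids stsMarkers.extra > stsMarkers.extra.filt1000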
cp ../stsMarkers.final stsMarkers.final
# cat stsMarkers.extra.filt1000 >> stsMarkers.final2
# need to filter stsMarkers.final not just cat this on the end
# get list of alignments with tBaseInsert >= 1000 and remove these
cd /cluster/data/hg17/bed/sts/fix
awk '{if ($8 >= 1000) print;}' stsMarkers.noOoc.lifted.psl > stsMarkers.noOoc.lifted.filtToRemove.psl
wc -l *.filt*.psl
# 254 stsMarkers.noOoc.lifted.filt1000.psl
# 249 stsMarkers.noOoc.lifted.filt500.psl
# 448 stsMarkers.noOoc.lifted.filtToRemove.psl
# get list of IDs that need to be removed
awk '{print $10;}' stsMarkers.noOoc.lifted.filtToRemove.psl | sort -n \
| uniq > noOoc.IdsToRemove.txt
# get chrom and co-ordinates for IDs to be removed
awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
stsMarkers.noOoc.lifted.filtToRemove.psl | sort | uniq \
> sts.noOoc.filtToRemove.coords
# checked that the stsMarkers.final contain the noOoc alignments
# wrote perl script to remove lines with these IDs from stsMarkers.final
cat << '_EOF_' > removeIds.pl
#!/usr/bin/perl -w
use strict;
my $ids = $ARGV[0];
my $file = $ARGV[1];
# list of IDs with chrom and coords to remove
open(IDS, $ids) || die "Can not open $ids: $!\n";
# file for removal of IDs
open(FILE, $file) || die "Can not open $file: $!\n";
open(OUT, ">removed.txt") || die "Can not create removed.txt: $!\n";
my %idsHash;
while (<IDS>) {
chomp;
my @a = split(/\t/);
my $chr = $a[0];
my $st = $a[1];
my $end = $a[2];
my $id = $a[3];
my $key = $id."_".$chr . "_" . $st . "_" . $end;
$idsHash{$key}->{chrom} = $chr;
$idsHash{$key}->{start} = $st;
$idsHash{$key}->{end} = $end;
}
close IDS;
while (<FILE>) {
chomp;
my $l = $_;
my $found = "FALSE";
my @f = split(/\t/, $l);
foreach my $k (keys(%idsHash)) {
# if the key begins with this id (keys are id_chrom_start_end)
if ($k =~ /^$f[3]_/) {
my $c = $idsHash{$k}->{chrom};
my $s = $idsHash{$k}->{start};
my $e = $idsHash{$k}->{end};
if ($f[0] eq $c && $f[1] == $s && $f[2] == $e) {
print OUT "$c\t$s\t$e\t$f[3]\n";
$found = "TRUE";
}
}
}
if ($found eq "FALSE") {
print "$l\n";
}
}
'_EOF_'
chmod +x removeIds.pl
perl removeIds.pl sts.noOoc.filtToRemove.coords stsMarkers.final \
> stsMarkers.final.new
wc -l stsMarkers.final*
# 92338 stsMarkers.final
# 91890 stsMarkers.final.new
# There are 448 ids and sets of co-ordinates in list of Ids to remove
# check that stsMarkers.final.new contains all the alignments that
# are in filtered set: stsMarkers.noOoc.lifted.filt1000.psl
awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
stsMarkers.noOoc.lifted.filt1000.psl | sort | uniq \
> sts.noOoc.filt1000.coords
awk 'BEGIN {OFS = "\t"} {print $1,$2,$3,$4}' \
stsMarkers.final.new | sort | uniq \
> sts.finalnew.coords
diff sts.finalnew.coords sts.noOoc.filt1000.coords > finalnewvsfilt1000
grep '>' finalnewvsfilt1000
# there is nothing in sts.noOoc.filt1000.coords not found in the
# sts.finalnew.coords file therefore this contains all the alignments
# from the filtered noOoc file.
cp ../primers/primers.final .
awk '{print $4}' primers.final | sort | uniq > primers.ids
awk '{print $4}' stsMarkers.final.new | sort | uniq > stsfinal.new.ids
# primers
ssh eieio
cd /cluster/data/ncbi/sts.9
# strip out N's and wobbles (KS) from primers, as isPcr
# can't currently handle them
# strip out primers of 10 bases or fewer, as isPcr can't handle them
awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
all.primers > all.primers.ispcr
mkdir -p /cluster/bluearc/sts.9/primers
cd /cluster/bluearc/sts.9/primers
split -l 2000 /cluster/data/ncbi/sts.9/all.primers.ispcr primers_
ssh kk
cd /cluster/data/hg17/bed/sts
mkdir primers
cd primers
mkdir run
cd run
ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > contigs.lst
ls -1S /cluster/bluearc/sts.9/primers/primers_* > primers.lst
mkdir -p /cluster/bluearc/hg17/sts/primers/out
cat > template << 'EOF'
#LOOP
/cluster/home/kate/bin/i386/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 -ooc=/scratch/hg/h/10.ooc -stepSize=5 $(path1) $(path2) {check out line /cluster/bluearc/hg17/sts/primers/out/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << for emacs
gensub2 contigs.lst primers.lst template jobList
para create jobList
# 26980 jobs
para try
para check
para push
#Completed: 26953 of 26980 jobs
#Crashed: 27 jobs
#CPU time in finished jobs: 1130353s 18839.22m 313.99h 13.08d 0.036 y
#IO & Wait Time: 86067s 1434.44m 23.91h 1.00d 0.003 y
#Average job time: 45s 0.75m 0.01h 0.00d
#Longest job: 1255s 20.92m 0.35h 0.01d
#Submission to last job: 2762s 46.03m 0.77h 0.03d
# 27 jobs seg faulted due to -minPerfect=2.
# Looks like a bug in isPcr -- till it's fixed,
# we'll rerun with -minPerfect=5. Terry determined they
# all complete with this (he used 3, 4, or 5, tuned individually
# for each job, but just using 5 should be adequate and
# less labor-intensive).
# NOTE: isPcr bug is fixed -- this shouldn't be necessary for
# next run
para crashed | grep isPcr | sed 's/minPerfect=2/minPerfect=5/' \
> jobList.minPerfect5
para create jobList.minPerfect5
# 28 jobs
# repeat with increasing minPerfect, till all complete succesfully
# Filter output file quickly based on simple parameters
ssh kolossus
cd /cluster/bluearc/hg17/sts/primers/
mkdir -p filter
pslQuickFilter -minMatch=26 -maxMismatch=5 -maxTinsert=5000 -verbose out/ filter/
# Note: there will be many messages saying files are empty - this is OK
pslSort dirs primers.psl.unlifted temp filter
# filter primer alignments and create not found primer file for ePCR run (booch)
pslFilterPrimers /cluster/bluearc/hg17/sts/primers/primers.psl.unlifted \
/cluster/data/ncbi/sts.9/all.primers primers.filter.unlifted.psl
# creates $3.notfound.primers
wc -l primers.filter.unlifted.psl.notfound.primers
# 21919 primers.filter.unlifted.psl.notfound.primers
# use Greg Schuler's ePCR to attempt alignment of primers missed
# by isPcr
mkdir -p /cluster/data/hg17/bed/sts/primers/run.epcr
mkdir -p /cluster/bluearc/hg17/sts/primers/epcr
cd /cluster/bluearc/hg17/sts/primers/epcr
split -l 2500 /cluster/data/hg17/bed/sts/primers/primers.filter.unlifted.psl.notfound.primers primers_
cd /cluster/data/hg17/bed/sts/primers/run.epcr
ls -1S /cluster/bluearc/hg17/sts/primers/epcr/primers_* > primers.lst
# create contig.lst based on split in build dir
# NOTE: should probably replace this with something more standard
# and faster. Also, this appears to cause load spikes on fileservers.
# Should get contigs from bluearc, iservers, or cluster local disk
# At least it's over pretty quick!
ssh eieio
cd /cluster/data/hg17/bed/sts/primers/run.epcr
/cluster/bin/scripts/splitContigList -ncbi /cluster/data/hg17 1
# next time... ls -1S /cluster/bluearc/hg17/contigs/* > contig.lst (?)
mkdir -p /cluster/bluearc/hg17/sts/primers/epcr/out
ssh kk
cd /cluster/data/hg17/bed/sts/primers/run.epcr
cat > template << 'EOF'
#LOOP
/cluster/bin/scripts/runEpcr $(path1) $(path2) {check out line /cluster/bluearc/hg17/sts/primers/epcr/out/$(root1).$(root2).epcr}
#ENDLOOP
'EOF'
# << for emacs
gensub2 primers.lst contig.lst template jobList
para create jobList
# 3420 jobs
para try
para check
para push
# CPU time in finished jobs: 78897s 1314.95m 21.92h 0.91d 0.003 y
# IO & Wait Time: 254582s 4243.03m 70.72h 2.95d 0.008 y
# Average job time: 98s 1.63m 0.03h 0.00d
# Longest job: 647s 10.78m 0.18h 0.01d
# Submission to last job: 1112s 18.53m 0.31h 0.01d
# merge output
ssh eieio
cd /cluster/bluearc/hg17/sts/primers/epcr
cat out/*.epcr > all.epcr
wc -l all.epcr
# 3573
# use all.epcr file to re-filter alignments and determine which
# ePCR records to keep
cp all.epcr /cluster/data/hg17/bed/sts/primers
cd /cluster/data/hg17/bed/sts/primers
pslFilterPrimers -epcr=all.epcr -verbose=1 \
/cluster/bluearc/hg17/sts/primers/primers.psl.unlifted \
/cluster/data/ncbi/sts.9/all.primers primers.unlifted.epcr.psl
# convert to PSL and combine with other psl file (this takes a couple hours)
/cluster/bin/scripts/epcrToHgPsl epcr.not.found \
/cluster/data/ncbi/sts.9/all.primers /cluster/data/hg17
cat primers.unlifted.epcr.psl epcr.not.found.psl \
| sort -k 10n > primers.final.unlifted.psl
# Fix the query gap lengths so that they match the all.primers.fa
# file lengths
/cluster/bin/scripts/fixPrimersQueryGaps \
/cluster/data/ncbi/sts.9/all.primers primers.final.unlifted.psl \
> primers.final.unlifted.fix.psl
# lift results from contigs to chrom coordinates, and create final file
liftUp -nohead /cluster/data/hg17/bed/sts/primers/primers.psl \
/cluster/data/hg17/jkStuff/liftAll.lft warn \
primers.final.unlifted.fix.psl
# Extract relevant info, make alignments unique, and create final file to be merged
# with full sequence alignments
/cluster/bin/scripts/extractPslInfo primers.psl
/cluster/bin/scripts/findAccession -agp primers.psl.initial \
/cluster/data/hg17
#rm primers.psl.initial
/cluster/bin/scripts/getStsId /cluster/data/ncbi/sts.9/stsInfo2.bed \
primers.psl.initial.acc \
| sort -k 4n > primers.final
#rm primers.psl.initial.acc
wc -l primers.final
# 314713 primers.final
# Merge primer and sequence files to create final bed file
# Merge (combineSeqPrimerPos) takes about an hour to run
ssh kolossus
cd /cluster/data/hg17/bed/sts
/cluster/bin/scripts/combineSeqPrimerPos stsMarkers.final primers/primers.final
# creates *_pos.rdb
/cluster/bin/scripts/createSTSbed /cluster/data/ncbi/sts.9/stsInfo2.bed \
stsMarkers_pos.rdb > stsMap.bed
# Set up sequence files
ssh hgwdev
mkdir -p /gbdb/hg17/sts.9/
ln -s /cluster/data/ncbi/sts.9/all.STS.fa /gbdb/hg17/sts.9/all.STS.fa
ln -s /cluster/data/ncbi/sts.9/all.primers.fa \
/gbdb/hg17/sts.9/all.primers.fa
# Load all files
cd /cluster/data/hg17/bed/sts
hgLoadSeq hg17 /gbdb/hg17/sts.9/all.STS.fa /gbdb/hg17/sts.9/all.primers.fa
hgsql hg17 < ~kent/src/hg/lib/stsInfo2.sql
hgsql hg17 < ~kent/src/hg/lib/stsAlias.sql
cp /cluster/data/ncbi/sts.9/{stsInfo2.bed,stsAlias.bed} .
hgsql hg17 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
hgsql hg17 -e 'load data local infile "stsAlias.bed" into table stsAlias'
hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/stsMap.sql \
hg17 stsMap stsMap.bed
hgLoadPsl -nobin -table=all_sts_primer hg17 primers/primers.psl
hgLoadPsl -nobin -table=all_sts_seq hg17 stsMarkers.lifted.psl
# update of information for D21S167 and D21S168 (2005-02-11, hartera)
# currently X52289 associated with D21S168
# and X53367 associated with D21S167 - these need to be switched as they
# are causing incorrect positioning
# On Terry's advice,
# first manually update the accession field stsInfo2.bed so that the
# corrected version is carried through to the next version
cd /cluster/data/hg17/bed/sts
# manually change accessions in this file so now X52289 is associated
# with D21S167 and X53367 is now associated with D21S168
# manually update the chromStart and chromEnd fields for these
# records in stsMap.bed
# this change was not carried through after filtering, so stsMap.bed
# was changed again and the table reloaded (DONE, 2005-02-18, hartera)
chr21 39867340 39867513 D21S167 1000 7888 AF064860
# becomes
chr21 37117635 37117858 D21S167 1000 7888 AF064860
chr21 37117635 37117858 D21S168 1000 103256 AP000699
# becomes
chr21 39867340 39867513 D21S168 1000 103256 AP000699
# then reload the stsMap.bed and stsInfo2.bed files
# copy this updated bed file back to ncbi directory
cp stsInfo2.bed /cluster/data/ncbi/sts.9/
# delete previous data before reloading tables
hgsql hg17 -e 'delete from stsInfo2'
hgsql hg17 -e 'drop table stsMap'
hgsql hg17 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/stsMap.sql \
hg17 stsMap stsMap.bed
# (2005-02-19, hartera)
# also need to update the psl alignment file and reload into all_sts_seq
# for D21S168, the id is 103256, this is qName in the psl file
# for D21S167, the id is 7888
cd /cluster/data/hg17/bed/sts
# manually update the stsMarkers.lifted.psl file with the new
# co-ordinates as above.
# (2005-02-23) Correct alignments.
# need to swap the names for the alignments not just the start and end
# coords as before as now the rest of the alignment data fields in the
# table are incorrect. Change the start and end co-ordinates and just swap
# the names for D21S167 and D21S168 in the psl file then reload the table.
# sort on the ID field (qName)
sort -k 10n stsMarkers.lifted.psl > sts.lifted.sort
mv sts.lifted.sort stsMarkers.lifted.psl
hgsql hg17 -e 'drop table all_sts_seq'
hgLoadPsl -nobin -table=all_sts_seq hg17 stsMarkers.lifted.psl
# Add new information after filtering the noOoc files
# (DONE, 2005-02-17, hartera)
# latest psl file: stsMarker.lifted.new.psl is in fix dir
# Merge primer and sequence files to create final bed file
ssh kolossus
cd /cluster/data/hg17/bed/sts/fix
nice /cluster/bin/scripts/combineSeqPrimerPos stsMarkers.final.new \
../primers/primers.final
# creates *_pos.rdb
/cluster/bin/scripts/createSTSbed /cluster/data/ncbi/sts.9/stsInfo2.bed \
stsMarkers_pos.rdb > stsMap.bed
awk '{print $6;}' stsMap.bed | sort -n | uniq > stsMap.ids
diff stsMap.ids filt1000.ids
# There is only 1 id that does not make it into this set (109375)
# There are 38 of the IDs to remove that do not appear in stsMap.ids
# therefore 65 of them do appear in stsMap.bed: noOoctoremoveinStsMap
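# noOoctoremoveinStsMap was presumably built along these lines (a
# sketch; comm needs lexicographically sorted input, so re-sort):
#   sort noOoc.IdsToRemove.txt > toRemove.lex
#   sort stsMap.ids > stsMap.ids.lex
#   comm -12 toRemove.lex stsMap.ids.lex > noOoctoremoveinStsMap
#   comm -23 toRemove.lex stsMap.ids.lex > noOocnotinstsMap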
foreach i (`cat noOoctoremoveinStsMap`)
awk 'BEGIN {OFS = "\t"} {if ($10 == "'$i'" && $8 >= 1000) \
print $14, $16, $17, $10;}' stsMarkers.noOoc.lifted.filtToRemove.psl \
>> stsMap.noOoc.toRemove.coords
end
sort stsMap.noOoc.toRemove.coords > stsMap.noOoc.toRemove.coords.sort
wc -l stsMap.noOoc.toRemove.coords.sort
# 122
# get the equivalent co-ordinates from stsMap.bed
foreach i (`cat noOoctoremoveinStsMap`)
awk 'BEGIN {OFS = "\t"} {if ($6 == "'$i'") print $1,$2,$3,$6;}' \
stsMap.bed >> stsMap.toRemove.coords
end
sort stsMap.toRemove.coords > stsMap.toRemove.coords.sort
wc -l stsMap.toRemove.coords.sort
# 68
diff stsMap.noOoc.toRemove.coords stsMap.toRemove.coords.sort
# They are different co-ordinates in each set although the same ID
# is represented.
# check whether any noOoc alignments are already in stsMarkers.lifted.psl:
cp ../stsMarkers.lifted.psl stsMarkers.lifted.psl
awk '{print $10}' stsMarkers.lifted.psl | sort -n | uniq > sts.liftedpsl.ids
# none of the noOoc alignments are in stsMarkers.lifted.psl so add
# the filtered version
cp stsMarkers.lifted.psl stsMarkers.lifted.new.psl
cat stsMarkers.noOoc.lifted.filt1000.psl >> stsMarkers.lifted.new.psl
wc -l stsMarkers.lifted.new.psl
# 91890
awk '{print $1;}' ../stsInfo2.bed | sort -n | uniq > stsInfo2.ids
# diff with filt1000.ids and noOoc.IdsToRemove.txt
# all of these are in stsInfo2.bed
# need to remove info for the filtered-out set, but only for the 38
# ids that were removed from stsMap.bed (list: noOocnotinstsMap)
# create removeById.pl (below) and use it to remove those ids:
cat << '_EOF_' > removeById.pl
#!/usr/bin/perl -w
use strict;
my $ids = $ARGV[0];
my $file = $ARGV[1];
# list of ids to remove
open(IDS, $ids) || die "Can not open $ids: $!\n";
# file from which lines with those ids are removed
open(FILE, $file) || die "Can not open $file: $!\n";
open(OUT, ">removedIds.txt") || die "Can not create removedIds.txt: $!\n";
my %idsHash;
while (<IDS>) {
chomp;
my @a = split(/\t/);
my $id = $a[0];
$idsHash{$id} = 1;
}
close IDS;
while (<FILE>) {
my $l = $_;
my $found = "FALSE";
my @f = split(/\t/, $l);
foreach my $k (keys(%idsHash)) {
# if the id is contained in the key
if ($k eq $f[0]) {
$found = "TRUE";
print OUT "$f[0]\n";
}
}
if ($found eq "FALSE") {
print $l;
}
}
'_EOF_'
# << emacs
chmod +x removeById.pl
perl removeById.pl noOocnotinstsMap stsInfo2.bed > stsInfo2.new.bed
# this removed data for all 38 of these Ids from stsInfo2.bed
# need to reload database tables (2005-02-18, hartera)
ssh hgwdev
cd /cluster/data/hg17/bed/sts/fix
hgsql hg17 -e 'drop table stsMap'
hgsql hg17 -e 'drop table all_sts_seq'
hgsql hg17 -e 'drop table stsInfo2'
mv stsInfo2.new.bed stsInfo2.bed
cp stsInfo2.bed /cluster/data/ncbi/sts.9/stsInfo2.bed
mv stsMap.new.bed stsMap.bed
mv stsMarkers.lifted.new.psl stsMarkers.lifted.psl
hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/stsMap.sql \
hg17 stsMap stsMap.bed
hgLoadPsl -nobin -table=all_sts_seq hg17 stsMarkers.lifted.psl
hgsql hg17 < ~kent/src/hg/lib/stsInfo2.sql
hgsql hg17 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
cd ..
mkdir old
mv stsMap.bed stsInfo2.bed stsMarkers.lifted.psl ./old
mv ./fix/stsMap.bed ./fix/stsInfo2.bed ./fix/stsMarkers.lifted.psl .
# Update of stsAlias table (DONE, 2005-02-24, hartera)
# remove the ids of filtered-out alignments from stsAlias; it
# should then have the same IDs as stsInfo2
ssh eieio
cd /cluster/data/hg17/bed/sts/fix
awk '{print $2;}' ../stsAlias.bed | sort -n | uniq > alias.ids
# 145985 alias.ids
awk '{print $6;}' ../stsMap.bed | sort -n | uniq > stsMap.new.ids.sort
awk '{print $1;}' ../stsInfo2.bed | sort -n | uniq > stsInfo.new.ids.sort
# 16678 ids in stsInfo2 that are not in stsMap
# 16717 ids in stsAlias that are not in stsMap
# 38 ids in stsAlias that are not in stsInfo2
cat stsMap.new.ids.sort stsInfo.new.ids.sort | sort -n | uniq \
> stsMapandInfo.ids.sort
diff stsMapandInfo.ids.sort alias.ids | grep '>' > idstoremoveAlias
# there are 38 of these IDs to remove
perl -pi.bak -e 's/> //' idstoremoveAlias
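# the diff/grep/perl cleanup above could also be done with comm (a
# sketch; comm wants plain lexicographic sort, so re-sort first;
# temp file names are placeholders):
#   sort stsMapandInfo.ids.sort > a.lex
#   sort alias.ids > b.lex
#   comm -13 a.lex b.lex > idstoremoveAlias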
cp ../stsAlias.bed .
foreach i (`cat idstoremoveAlias`)
awk '{if ($2 != "'$i'") print;}' stsAlias.bed > stsAlias.tmp
mv stsAlias.tmp stsAlias.bed
end
# check that ids are removed from file and that they are the correct ones
# all looks good
cd /cluster/data/hg17/bed/sts
# save old stsAlias file and copy new one to sts dir and to ncbi sts dir
mv stsAlias.bed ./old
cp ./fix/stsAlias.bed .
cp stsAlias.bed /cluster/data/ncbi/sts.9/stsAlias.bed
ssh hgwdev
# remove old table data and reload
hgsql hg17 -e 'delete from stsAlias'
hgsql hg17 -e 'load data local infile "stsAlias.bed" into table stsAlias'
# PRUNE stsMap RECORDS (DONE 3/3/06)
hgsql hg17 -e 'delete from stsMap where chromEnd-chromStart > 5000'
# RECOMBINATION RATES (2004-07-13 Terry)
# (2004-07-21 kate)
# The STS Markers track must be completed prior to creating this track
ssh eieio
cd /cluster/data/hg17/bed
mv recombRate recombRate.terry
mkdir -p recombRate
cd recombRate
# Copy other necessary files here (in future, can take from previous version)
# NOTE: these are stable, and could be saved in a permanent spot
cp /projects/hg2/booch/psl/info/decode_all .
cp /projects/hg2/booch/psl/info/marshfield_all .
cp /projects/hg2/booch/psl/info/genethon_all .
# Determine maximum concordant set of markers for each of the maps
/cluster/bin/scripts/assignGPsts -full -maxcon \
/cluster/data/ncbi/sts.9/stsAlias.bed \
/cluster/data/hg17/bed/sts/stsMarkers_pos.rdb \
decode_all > decode.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
/cluster/data/ncbi/sts.9/stsAlias.bed \
/cluster/data/hg17/bed/sts/stsMarkers_pos.rdb \
marshfield_all > marshfield.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
/cluster/data/ncbi/sts.9/stsAlias.bed \
/cluster/data/hg17/bed/sts/stsMarkers_pos.rdb \
genethon_all > genethon.marker.rdb
# Determine the rates for each of the maps
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl decode.marker.rdb \
/cluster/data/hg17/chrom.sizes 1000000 1000000 \
> decode_1mb_slide_1mb
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl genethon.marker.rdb \
/cluster/data/hg17/chrom.sizes 1000000 1000000 \
> genethon_1mb_slide_1mb
# Marker number 2 at position 120005974 on chr9 is out of genetic distance order. DISCARDING
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl marshfield.marker.rdb \
/cluster/data/hg17/chrom.sizes 1000000 1000000 \
> marshfield_1mb_slide_1mb
# Marker number 1 at position 124276104 on chr9 is out of genetic distance order. DISCARDING
# Convert files to proper format
/cluster/bin/scripts/convertRecombRate decode_1mb_slide_1mb \
/cluster/data/hg17/inserts \
/cluster/data/hg17 1000 > decode_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate marshfield_1mb_slide_1mb \
/cluster/data/hg17/inserts \
/cluster/data/hg17 1000 > marshfield_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate genethon_1mb_slide_1mb \
/cluster/data/hg17/inserts \
/cluster/data/hg17 1000 > genethon_1mb_slide_1mb_conv
# Create bed file and load
/cluster/bin/scripts/createRRbed decode_1mb_slide_1mb_conv \
marshfield_1mb_slide_1mb_conv genethon_1mb_slide_1mb_conv \
> recombRate.bed
hgLoadBed -noBin -tab \
-sqlTable=/cluster/home/kent/src/hg/lib/recombRate.sql \
hg17 recombRate recombRate.bed
# FISH CLONES (DONE 2004-07-22 Kate)
# Reloaded 2004-09-36 after Terry Furey reworked fishClones.c
# to improve scoring
# The STS Marker, Coverage, and BAC End Pairs tracks must be completed prior to
# creating this track
ssh eieio
mkdir -p /cluster/data/ncbi/fishClones/fishClones.2004-07/
cd /cluster/data/ncbi/fishClones/fishClones.2004-07/
# Download information from NCBI
# point browser at http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as /cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt
chmod 664 /cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt
# Get current clone/accession information
wget http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# Create initial Fish Clones bed file
mkdir -p /cluster/data/hg17/bed/fishClones
cd /cluster/data/hg17/bed/fishClones
# Copy previous sts info from fhcrc (take from previous build in future)
cp ~booch/tracks/fish/fhcrc.sts .
fishClones -verbose=1 -fhcrc=fhcrc.sts -noBin hg17 \
/cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt \
/cluster/data/ncbi/fishClones/fishClones.2004-07/clac.out \
/cluster/data/ncbi/bacends/human/bacends.4/cl_acc_gi_len \
/cluster/data/hg17/bed/bacends/lifted/bacEnds.lifted.psl \
fishClones_initial
# Get sequences for accessions not in genome
ssh eieio
mkdir -p /cluster/bluearc/hg17/fishClones/
cd /cluster/bluearc/hg17/fishClones/
# goto http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Nucleotide
# select file "/cluster/data/hg17/bed/fishClones/fishClones_initial.acc"
# change output to FASTA format
# download results to "/cluster/bluearc/hg17/fishClones/notFound.fa"
# Align these using blat
cp ~booch/tracks/gs.17/build34/fish/convert.pl .
cp ~booch/tracks/gs.17/build34/fish/blatAll.pl .
# edited to use ooc file on bluearc, so can run on kolossus
convert.pl < notFound.fa > notFound.convert.fa
mkdir out
blatAll.pl /cluster/data/hg17 notFound.convert.fa out
# creates raw.psl, not.found.psl
# Make final fishClones file with this new clone placement info
cd /cluster/data/hg17/bed/fishClones
fishClones -verbose=1 -fhcrc=fhcrc.sts -noBin \
-psl=/cluster/bluearc/hg17/fishClones/not.found.psl hg17 \
/cluster/data/ncbi/fishClones/fishClones.2004-07/hbrc.txt \
/cluster/data/ncbi/fishClones/fishClones.2004-07/clac.out \
/cluster/data/ncbi/bacends/human/bacends.4/cl_acc_gi_len \
/cluster/data/hg17/bed/bacends/lifted/bacEnds.lifted.psl fishClones
# Load the track
ssh hgwdev
cd /cluster/data/hg17/bed/fishClones
hgLoadBed -noBin -tab \
-sqlTable=/cluster/home/kent/src/hg/lib/fishClones.sql \
hg17 fishClones fishClones.bed
# Loaded 10601 elements of size 16
# fixed bad table entry (2004-08-12 kate)
# NOTE: this won't be necessary in the future, as the fishClones program
# will now accommodate more bad input data.
hgsql hg17 -e "update fishClones set bandEnds='1q43,Yp' where name='RP11-188A4' and placeCount=2"
# CHROMOSOME BANDS TRACK (2004-07-13 Terry)
# This must wait until the Fish Clones track is done
mkdir -p /cluster/data/hg17/bed/cytoband
cd /cluster/data/hg17/bed/cytoband
# Copy in some necessary files (usually from previous version)
cp /projects/hg2/booch/psl/cytobands/pctSetBands.txt .
cp /projects/hg2/booch/psl/cytobands/ISCN800.txt .
# Create some preliminary information files
/cluster/bin/scripts/createSetBands pctSetBands.txt \
/cluster/data/hg17/inserts /cluster/data/hg17 100 > setBands.txt
/cluster/bin/scripts/makeBands ISCN800.txt /cluster/data/hg17 > cytobands.pct.bed
/cluster/bin/scripts/makeBandRanges cytobands.pct.bed > cytobands.pct.ranges
# Reformat fishClones file
/cluster/bin/scripts/createBanderMarkers \
/cluster/data/hg17/bed/fishClones/fishClones.bed > fishClones.txt
# Create bed file
/cluster/bin/scripts/runBander fishClones.txt \
ISCN800.txt setBands.txt /cluster/data/hg17
# Should be 862 bands
wc cytobands.bed
# 862 4310 30748 cytobands.bed
# Load track
hgLoadBed -noBin -tab -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql \
hg17 cytoBand cytobands.bed
# Load ideogram table
hgLoadBed -noBin -tab -sqlTable=/cluster/home/booch/src/hg/lib/cytoBandIdeo.sql \
hg17 cytoBandIdeo cytobands.bed
# CHROMOSOME BANDS TRACK REDO (2004-07-22 Kate)
# Just to make sure we know the proper steps.
# The tables were not reloaded, as Terry has already
# sent the data to NCBI
# This must wait until the Fish Clones track is done
ssh kolossus
mkdir -p /cluster/data/hg17/bed/cytoband.kate
cd /cluster/data/hg17/bed/cytoband.kate
# Copy in some necessary files (usually from previous version)
cp /projects/hg2/booch/psl/cytobands/pctSetBands.txt .
cp /projects/hg2/booch/psl/cytobands/ISCN800.txt .
# Create some preliminary information files
/cluster/bin/scripts/createSetBands pctSetBands.txt \
/cluster/data/hg17/inserts /cluster/data/hg17 100 > setBands.txt
/cluster/bin/scripts/makeBands ISCN800.txt \
/cluster/data/hg17 > cytobands.pct.bed
/cluster/bin/scripts/makeBandRanges cytobands.pct.bed \
> cytobands.pct.ranges
# Reformat fishClones file
/cluster/bin/scripts/createBanderMarkers \
/cluster/data/hg17/bed/fishClones/fishClones.bed > fishClones.txt
# Create bed file
ssh eieio
cd /cluster/data/hg17/bed/cytoband.kate
/cluster/bin/scripts/runBander fishClones.txt \
ISCN800.txt setBands.txt /cluster/data/hg17
# NOTE: fails on kolossus (C++ compiler different ??)
# Should be 862 bands
wc -l cytobands.bed
# 862 cytobands.bed
# NOTE - don't load tracks, as Terry has already sent his
# versions to NCBI
# Load track
#hgLoadBed -noBin -tab \
# -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql \
# hg17 cytoBand cytobands.bed
# Load ideogram table
#hgLoadBed -noBin -tab \
# -sqlTable=/cluster/home/booch/src/hg/lib/cytoBandIdeo.sql \
# hg17 cytoBandIdeo cytobands.bed
# LOAD AFFYRATIO (DONE - 2004-07-14 - Hiram)
# Copied from Hg16 doc
# Set up cluster job to align consensus/exemplars to hg17
ssh eieio
mkdir /cluster/bluearc/hg17/affyGnf
cp -p /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa /cluster/bluearc/hg17/affyGnf
ssh kkr1u00
mkdir -p /iscratch/i/affyGnf
cp -p /cluster/bluearc/hg17/affyGnf/* /iscratch/i/affyGnf
/cluster/bin/iSync
ssh kki
mkdir /cluster/data/hg17/bed/affyGnf.2004-06-09
cd /cluster/data/hg17/bed/affyGnf.2004-06-09
ls -1 /iscratch/i/affyGnf/* > affy.lst
ls -1 /iscratch/i/gs.18/build35/maskedContigs/* > allctg.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/gs.18/build35/hg17.11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 allctg.lst affy.lst template.sub jobList
mkdir psl
para create jobList
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 2922s 48.70m 0.81h 0.03d 0.000 y
# IO & Wait Time: 1146s 19.10m 0.32h 0.01d 0.000 y
# Average job time: 11s 0.18m 0.00h 0.00d
# Longest job: 80s 1.33m 0.02h 0.00d
# Submission to last job: 333s 5.55m 0.09h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU95.psl
ssh eieio
cd /cluster/data/hg17/bed/affyGnf.2004-06-09
pslSort dirs raw.psl tmp psl
# change filter parameters for these sequences: only use alignments
# that cover 30% of the sequence and have at least 95% identity in
# the aligned region.
# minAli=0.97 is too high; use a low minCover since there are a lot
# of N's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 \
raw.psl contig.psl /dev/null
liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl
# Eliminate the long names
sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl
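    # (optional check: the seds above should have removed all "U95Av2:"
    #  prefixes and semicolons, so expect zero matches)
    grep -c "U95Av2:" affyU95shortQname.psl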
# Merge with spot data and load into database. added -chip flag to
# affyPslAndAtlasToBed to allow correct parsing
ssh hgwdev
cd /cluster/data/hg17/bed/affyGnf.2004-06-09
/cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 \
affyU95shortQname.psl \
/projects/compbiodata/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt \
affyRatio.bed affyRatio.exr > affyPslAndAtlasToBed.log 2>&1
hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg17 \
affyRatio affyRatio.bed
# Loaded 12740 elements of size 15
mkdir affyU95
hgLoadPsl hg17 -table=affyU95 affyU95shortQname.psl
# sequences loaded 2004-08-06
hgLoadSeq -abbr=U95Av2: hg17 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
# 12386 sequences
# Updating seq table
# Advisory lock has been released
# All done
# Load AFFYUCLANORM, extended version of affyUcla track. Hopefully
# final freeze of data set. (DONE - 2004-07-14 - Hiram)
ssh kk
mkdir /cluster/data/hg17/bed/affyUclaNorm
cd /cluster/data/hg17/bed/affyUclaNorm
cp /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa .
ls -1 /scratch/hg/gs.18/build35/maskedContigs/* > contig.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
mkdir psl
ls HG-U133AB_all.fa > affy.lst
gensub2 contig.lst affy.lst gsub jobList
para create jobList
para try
para check
para push ... etc
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 20070s 334.51m 5.58h 0.23d 0.001 y
# IO & Wait Time: 162784s 2713.06m 45.22h 1.88d 0.005 y
# Average job time: 481s 8.02m 0.13h 0.01d
# Longest job: 735s 12.25m 0.20h 0.01d
# Submission to last job: 771s 12.85m 0.21h 0.01d
ssh eieio
cd /cluster/data/hg17/bed/affyUclaNorm
pslSort dirs hg17.affyU133AB_all.psl tmp psl
wc hg17.affyU133AB_all.psl
# 61022 1281401 12934919 hg17.affyU133AB_all.psl
liftUp hg17.affyU133AB_all.lifted.psl \
/cluster/data/hg17/jkStuff/liftAll.lft warn hg17.affyU133AB_all.psl
pslReps -minCover=0.5 -sizeMatters -minAli=0.97 \
-nearTop=0.005 hg17.affyU133AB_all.lifted.psl \
hg17.affyU133AB_all.lifted.pslReps.psl out.psr
# Processed 61017 alignments
affyUclaMergePslData -pslFile=hg17.affyU133AB_all.lifted.pslReps.psl \
-affyFile=/projects/compbio/data/microarray/affyUcla/data/feature_biomaterial_chip_logratios_formatForTrack.txt \
-bedOut=hg17.affyUcla.bed \
-expRecordOut=hg17.affyUcla.expRecords \
-expFile=/projects/compbio/data/microarray/affyUcla/data/expNames.sorted.txt
~/kent/src/hg/affyGnf/addUclaAnnotations.pl hg17.affyUcla.expRecords \
/projects/compbio/data/microarray/affyUcla/data/normal_tissue_database_annotations2.txt > hg17.affyUcla.annotations.expRecords
# Load the databases
ssh hgwdev
cd /cluster/data/hg17/bed/affyUclaNorm
sed -e 's/affyRatio/affyUclaNorm/' ~/kent/src/hg/lib/affyRatio.sql \
> affyUclaNorm.sql
hgLoadBed hg17 affyUclaNorm hg17.affyUcla.bed -sqlTable=affyUclaNorm.sql
# MAKE AFFY U133 - made after above affyUclaNorm (DONE - 2004-07-15 - Hiram)
# Someday the names can be fixed.
ssh hgwdev
mkdir /cluster/data/hg17/bed/affyU133
cd /cluster/data/hg17/bed/affyU133
ln -s ../affyUclaNorm/hg17.affyU133AB_all.lifted.pslReps.psl affyU133.psl
hgLoadPsl hg17 affyU133.psl
# hgsql -e "select count(*) from affyU133;" hg17
# row count in hg16: 45693, in hg17: 44620
hgLoadSeq hg17 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa
# 44792 sequences
# MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN & FUGU (DONE 2004-06-10 kate)
# In an email 2/13/04 to Angie, Arian said we could treat all
# human repeats as
# lineage-specific for human-chicken blastz.
# and Angie did the same for fugu.
# Lacking input from Arian, and using blastzSelf as a model,
# I'm also using all human repeats for the human/chimp blastz.
# Scripts expect *.out.spec filenames.
ssh kkr1u00
cd /cluster/data/hg17
mkdir /iscratch/i/hg17/linSpecRep.chicken
foreach f (/iscratch/i/hg17/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/hg17/linSpecRep.chicken/$f:t:r:r.out.spec
end
ln -s /iscratch/i/hg17/linSpecRep.chicken \
/iscratch/i/hg17/linSpecRep.fugu
ln -s /iscratch/i/hg17/linSpecRep.chicken \
/iscratch/i/hg17/linSpecRep.chimp
iSync
# BLASTZ FUGU (FR1) (DONE 2004-06-24 kate)
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.fr1.2004-06-10
ln -s /cluster/data/hg17/bed/blastz.fr1.2004-06-10 \
/cluster/data/hg17/bed/blastz.fr1
cd /cluster/data/hg17/bed/blastz.fr1
# Set L=6000 (more relaxed than chicken) and abridge repeats.
# Treat all repeats as lineage-specific (reuse linSpecRep.Chicken).
cat << '_EOF_' > DEF
# human vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from human-chicken.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/hg17/linSpecRep.fugu
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Fugu
SEQ2_DIR=/iscratch/i/fr1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/fr1/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.fr1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
bash # if a csh/tcsh user
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
# 11935 jobs
para try
para check
para push
# Completed: 11935 of 11935 jobs
# CPU time in finished jobs: 4673316s 77888.60m 1298.14h 54.09d 0.148 y
# IO & Wait Time: 329249s 5487.48m 91.46h 3.81d 0.010 y
# Average job time: 419s 6.99m 0.12h 0.00d
# Longest job: 714s 11.90m 0.20h 0.01d
# Submission to last job: 5575s 92.92m 1.55h 0.06d
# second cluster run: lift raw alignments -> lav dir
ssh kki
cd /cluster/data/hg17/bed/blastz.fr1
bash # if a csh/tcsh user
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
# 341 jobs
para try
para check
para push
# CPU time in finished jobs: 315s 5.26m 0.09h 0.00d 0.000 y
# IO & Wait Time: 4451s 74.18m 1.24h 0.05d 0.000 y
# Average job time: 14s 0.23m 0.00h 0.00d
# Longest job: 107s 1.78m 0.03h 0.00d
# Submission to last job: 368s 6.13m 0.10h 0.00d
# third run: lav -> axt
ssh kki
cd /cluster/data/hg17/bed/blastz.fr1
mkdir axtChrom pslChrom run.2
cd run.2
cat << 'EOF' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin \
/iscratch/i/hg17/bothMaskedNibs /iscratch/i/fr1/nib stdout \
| axtSort stdin ../../axtChrom/$chr.axt
axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'EOF'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
# 41 jobs
para try
para check
para push
# CHAIN FUGU BLASTZ (2004-06-11 kate)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.fr1
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.fr1/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Reuse gap penalties from chicken run.
cat << '_EOF_' > temp.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line makes emacs coloring happy
sed 's/ */\t/g' temp.gap > ../../fuguHumanTuned.gap
rm -f temp.gap
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../fuguHumanTuned.gap \
-minScore=5000 $1 \
/iscratch/i/hg17/bothMaskedNibs \
/iscratch/i/fr1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
# 46 jobs
para try
para check
para push
# 1 crashed job -- chr6_hla_hap1.chain is empty
# CPU time in finished jobs: 610s 10.16m 0.17h 0.01d 0.000 y
# IO & Wait Time: 1644s 27.40m 0.46h 0.02d 0.000 y
# Average job time: 50s 0.83m 0.01h 0.00d
# Longest job: 233s 3.88m 0.06h 0.00d
# Submission to last job: 339s 5.65m 0.09h 0.00d
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
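    # (optional consistency check: the split files should together hold
    #  the same number of chain records as all.chain)
    grep -c "^chain" all.chain
    cat chain/*.chain | grep -c "^chain"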
rm run1/chain/*.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.fr1/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg17 ${c}_chainFr1 $i
end
featureBits hg16 chainFr1Link
# 50709290 bases of 2865248791 (1.770%) in intersection
# ANCIENT REPEAT TABLE (2004-06-11 kate)
# The netClass operations requires an "ancientRepeat" table in one
# of the databases.
# This is a hand curated table obtained from Arian.
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/ancientRepeat
cd /cluster/data/hg17/bed/ancientRepeat
# mysqldump needs write permission to this directory
chmod 777 .
hgsqldump --all --tab=. hg15 ancientRepeat
chmod 775 .
hgsql hg17 < ancientRepeat.sql
echo "LOAD DATA LOCAL INFILE 'ancientRepeat.txt' into table ancientRepeat"\
| hgsql hg17
# NET FUGU BLASTZ (2004-06-11 kate)
ssh eieio
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
netClass noClass.net hg17 fr1 human.net
# Make a 'syntenic' subset:
ssh eieio
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg17 netFr1 stdin
#netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 netSyntenyFr1 stdin
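    # (optional: measure net coverage with featureBits, as done for the
    #  chains above)
    featureBits hg17 netFr1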
# EXTRACT AXT'S AND MAF'S FROM THE NET (kate)
# NOTE: Redo 2005-08-16 to fix overlap problem (use 8/05 netToAxt)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
ssh kkstore2
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
netSplit human.net humanNet
mkdir -p ../axtNet ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (humanNet/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt humanNet/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/fr1/nib stdout | axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/fr1/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=fr1.
end
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/fr1
# FUGU FR1 DOWNLOADS (DONE 2004-09-17 kate)
# REDO axtNet downloads for fix, above (2005-09-12 kate)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.fr1/axtChain
ln -s all.chain fugu.chain
mkdir gz
gzip -c fugu.chain > gz/fugu.chain.gz
gzip -c human.net > gz/fugu.net.gz
cd ../axtNet
nice gzip *.axt
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.fr1/axtNet
gzip *.axt
md5sum *.gz > md5sum.txt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p vsFr1
cd vsFr1
# Copy and edit README
cp /cluster/data/hg17/bed/blastz.fr1/axtChain/gz/*.gz .
md5sum *.gz > md5sum.txt
mv axtNet axtNet.old
ln -s /cluster/data/hg17/bed/blastz.fr1/axtNet .
# PRODUCE FUGU BLAT ALIGNMENT (DONE - 2004-07-07 - Hiram)
# Use masked scaffolds from fr1 assembly (same sequence as
# previous BlatFugu, however it's repeat and TRF-masked).
ssh kk
mkdir /cluster/data/hg17/bed/blatFr1
cd /cluster/data/hg17/bed/blatFr1
mkdir psl
# next time, use N?_?????? (to pick up NG_ contigs)
foreach f ( `cat /cluster/data/hg17/contig.lst` )
set c=$f:t:r
echo $c
mkdir psl/$c
end
# create cluster job
mkdir run
cd run
ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > human.lst
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/hg17/bed/blatFr1/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << keep emacs happy
gensub2 human.lst fugu.lst gsub jobList
    para create jobList
# 219640 jobs
para try
para check
para push -maxQueue=300000 -maxPush=220000
para check
# Completed: 219640 of 219640 jobs
# CPU time in finished jobs: 5206945s 86782.41m 1446.37h 60.27d 0.165 y
# IO & Wait Time: 797791s 13296.52m 221.61h 9.23d 0.025 y
# Average job time: 27s 0.46m 0.01h 0.00d
# Longest job: 951s 15.85m 0.26h 0.01d
# Submission to last job: 7553s 125.88m 2.10h 0.09d
# cd psl
    # count files with alignments
# find . -not -size 427c | wc -l
# 44558
    # count files with no alignments
# find . -size 427c | wc -l
# 175463
# When cluster run is done, sort alignments
# into chrom directory
ssh eieio
cd /cluster/data/hg17/bed/blatFr1
pslCat -dir psl/N?_?????? | \
liftUp -type=.psl stdout \
/cluster/data/hg17/jkStuff/liftAll.lft warn stdin | \
pslSortAcc nohead chrom temp stdin
# 65 minutes ?
# Processed 216595 lines into 1 temp files
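    # (optional check: pslSortAcc nohead writes no headers, so a line
    #  count of the per-chrom files should match the 216595 rows above)
    cat chrom/*.psl | wc -l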
# Rename to correspond with tables and load into database:
ssh hgwdev
cd /cluster/data/hg17/bed/blatFr1/chrom
foreach i (chr*.psl)
set r = $i:r
echo mv $i ${r}_blatFr1.psl
mv $i ${r}_blatFr1.psl
end
# lift fugu scaffolds to Fugu browser chrUn,
# so you can link to other browser. And don't need to load sequence
cd /cluster/data/hg17/bed/blatFr1
liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl
hgLoadPsl -table=blatFr1 hg17 all.psl
# load of blatFr1 did not go as planned: 216595 record(s),
# 0 row(s) skipped, 3 warning(s) loading psl.tab
# featureBits hg17 blatFr1 refGene:CDS
# 13563544 bases of 2866216770 (0.473%) in intersection
# featureBits hg16 blatFr1 refGene:CDS
# 13547219 bases of 2865248791 (0.473%) in intersection
# featureBits hg15 blatFugu refGene:CDS
# 12427544 bases of 2866466359 (0.434%) in intersection
# BLASTZ RAT RN3 (DONE - 2004-06-14 - Hiram)
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.rn3.2004-06-11
cd /cluster/data/hg17/bed
ln -s blastz.rn3.2004-06-11 blastz.rn3
cd blastz.rn3
cat << '_EOF_' > DEF
# rat vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Rat
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/store5/gs.18/build35/bed/blastz.rn3
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastz.rn3
source DEF
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run0.sh
# it is a generic script and works for any assembly
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
    # Completed: 41943 of 41943 jobs
    # CPU time in finished jobs: 15330421s 255507.02m 4258.45h 177.44d 0.486 y
    # IO & Wait Time: 673809s 11230.15m 187.17h 7.80d 0.021 y
    # Average job time: 382s 6.36m 0.11h 0.00d
    # Longest job: 4651s 77.52m 1.29h 0.05d
    # Submission to last job: 169197s 2819.95m 47.00h 1.96d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/hg17/bed/blastz.rn3
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh
# fixup machine check, should be kki, not kk
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 1894s 31.56m 0.53h 0.02d 0.000 y
# IO & Wait Time: 6271s 104.52m 1.74h 0.07d 0.000 y
# Average job time: 24s 0.40m 0.01h 0.00d
# Longest job: 131s 2.18m 0.04h 0.00d
# Submission to last job: 590s 9.83m 0.16h 0.01d
# Third cluster run to convert lav's to axt's
cd /cluster/data/hg17/bed/blastz.rn3
# The copy of this in mm4 was broken, fixed here
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 426s 7.09m 0.12h 0.00d 0.000 y
# IO & Wait Time: 7283s 121.39m 2.02h 0.08d 0.000 y
# Average job time: 168s 2.79m 0.05h 0.00d
# Longest job: 642s 10.70m 0.18h 0.01d
# Submission to last job: 642s 10.70m 0.18h 0.01d
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3
mkdir pslChrom
set tbl = "blastzRn3"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 30 minutes
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/pslChrom
for I in *.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done: ${I}"
done
# this is a 55 minute job
# Check results
# featureBits hg16 blastzRn3
# 1013603401 bases of 2865248791 (35.376%) in intersection
# featureBits hg17 blastzRn3
# 1013003285 bases of 2866216770 (35.343%) in intersection
# CHAIN RN3 BLASTZ (DONE - 2004-06-14 - Hiram)
# re-worked with no 'axtFilter -notQ_random' on the axtChain step - 2004-06-23
# used to be: axtFilter -notQ_random $1 | axtChain stdin
# The axtChain is best run on the small kluster, or the kk9 kluster
ssh kki
mkdir -p /cluster/data/hg17/bed/blastz.rn3/axtChain/run1
cd /cluster/data/hg17/bed/blastz.rn3/axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.rn3/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
/iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/rn3/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 46 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 4645s 77.41m 1.29h 0.05d 0.000 y
# IO & Wait Time: 6840s 114.00m 1.90h 0.08d 0.000 y
# Average job time: 250s 4.16m 0.07h 0.00d
# Longest job: 1539s 25.65m 0.43h 0.02d
# Submission to last job: 3761s 62.68m 1.04h 0.04d
# now on the file server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# real 36m42.170s
# user 4m55.970s
# sys 1m49.840s
time chainSplit chain all.chain
# real 13m54.860s
# user 4m50.370s
# sys 1m3.260s
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg17 ${c}_chainRn3 $i
echo done $c
end
# featureBits hg17 chainRn3
# 2827052992 bases of 2866216770 (98.634%) in intersection
# (with filter:) 2826192649 bases of 2866216770 (98.604%) in intersection
# featureBits hg16 chainRn3
# 2830563493 bases of 2865248791 (98.789%) in intersection
# NET RN3 (DONE - 2004-06-15 - Hiram)
# Re-done due to Chain being re-done 2004-06-23
# NOTE: Redo net axt's and net maf's to fix overlaps,
# (using 8/05 netToAxt). (2005-08-16 kate)
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg17/chrom.sizes \
/cluster/data/rn3/chrom.sizes ../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg17/chrom.sizes \
/cluster/data/rn3/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2510467072, utime 19307 s/100, stime 3181
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/axtChain
time netClass hNoClass.net hg17 rn3 rat.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInRat \
-qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
# real 34m29.829s
# user 11m30.440s
# sys 1m52.730s
# If things look good do
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
time netFilter -syn rat.net > ratSyn.net
# real 16m25.640s
# user 7m41.330s
# sys 1m1.150s
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/axtChain
netFilter -minGap=10 rat.net | hgLoadNet hg17 netRn3 stdin
netFilter -minGap=10 ratSyn.net | hgLoadNet hg17 syntenyNetRn3 stdin
# real 37m0.199s
# user 15m13.770s
# sys 1m41.540s
# check results
# featureBits hg17 netRn3
# 2817656275 bases of 2866216770 (98.306%) in intersection
# (with axtFilter) 2816623107 bases of 2866216770 (98.270%) in intersection
# featureBits hg16 netRn3
# 2820958389 bases of 2865248791 (98.454%) in intersection
# featureBits hg17 syntenyNetRn3
# 2781748096 bases of 2866216770 (97.053%) in intersection
# (with axtFilter) 2780883450 bases of 2866216770 (97.023%) in intersection
# featureBits hg16 syntenyNetRn3
# 2784011730 bases of 2865248791 (97.165%) in intersection
# Add entries for net and chain to rat/hg17 trackDb
# make net
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3/axtChain
mkdir ratNet
time netSplit rat.net ratNet
# real 12m1.478s
# user 8m35.050s
# sys 1m7.230s
# extract axts from net
mkdir ../axtNet ../mafNet
cat << 'EOF' > makeMaf.csh
foreach n (ratNet/chr*.net)
set c=$n:t:r
echo $c
netToAxt ratNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib /cluster/data/rn3/nib stdout | \
axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/rn3/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=rn3.
end
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/rn3
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/blastz.rn3/axtBest
cd /cluster/data/hg17/bed/blastz.rn3/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/axtNet
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtNet
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtNet
cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtNet
gzip *.axt
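    # (optional: add an md5sum.txt as done for the other vs* download
    #  directories)
    md5sum *.axt.gz > md5sum.txt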
# add README.txt file to dir (use previous assembly's copy as template)
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestRn3.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestRn3.psl
echo "Done: ${c}_blastzBestRn3.psl"
end
# Load tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/pslBest
for I in chr*BestRn3.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# check results
# featureBits hg17 blastzBestRn3
# 975533772 bases of 2866216770 (34.036%) in intersection
# (with axtFilter) 970005525 bases of 2866216770 (33.843%) in intersection
# featureBits hg16 blastzBestRn3
# 976121391 bases of 2865248791 (34.068%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg17/axtBest/Rn3
cd /gbdb/hg17/axtBest/Rn3
ln -s /cluster/data/hg17/bed/blastz.rn3/axtNet/chr*.axt .
cd /cluster/data/hg17/bed/blastz.rn3/axtNet
rm -f axtInfoInserts.sql
foreach f (/gbdb/hg17/axtBest/Rn3/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
hgsql hg17 < ~/kent/src/hg/lib/axtInfo.sql
# table axtInfo may already exist, ignore create error.
hgsql hg17 < axtInfoInserts.sql
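    # (optional check: expect one rn3 row per axt file linked above)
    hgsql -e "select count(*) from axtInfo where species='rn3';" hg17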
# MAKING RAT SYNTENY (DONE - 2004-06-30 - Hiram)
# Re-Done after above done without the axtFilter
ssh hgwdev
mkdir /cluster/data/hg17/bed/syntenyRn3
cd /cluster/data/hg17/bed/syntenyRn3
# Copy all the needed scripts from /cluster/data/hg16/bed/syntenyMm3
cp -p /cluster/data/hg16/bed/syntenyMm3/*.pl .
cp -p /cluster/data/hg16/bed/syntenyMm3/*.sh .
./syntenicBest.pl -db=hg17 -table=blastzBestRn3
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg17 -table=blastzBestRn3
./synteny2bed.pl
# The five commands above
# real 196m2.565s
# user 0m21.170s
# sys 0m4.690s
# Used to load this in syntenyRn3, but that type is misleading to
# the table browser and fails the checkTableCoords check.
    # Better to use this ensRn3MusHom type:
sed -e 's/ensPhusionBlast/ensRn3MusHom/g' \
$HOME/kent/src/hg/lib/ensPhusionBlast.sql \
> ensRn3MusHom.sql
hgLoadBed hg17 ensRn3MusHom ucsc100k.bed -sqlTable=ensRn3MusHom.sql
# featureBits hg17 ensRn3MusHom
# 2592164486 bases of 2866216770 (90.439%) in intersection
# featureBits hg16 syntenyRn3
# 2595919851 bases of 2865248791 (90.600%) in intersection
# MAKING RAT AXTTIGHT FROM AXTBEST (DONE - 2004-06-15 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3/axtNet
mkdir -p ../axtTight
foreach i (*.axt)
echo $i
subsetAxt $i ../axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn3.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/pslTight
for I in chr*TightRn3.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# Compare results with previous assembly
# featureBits hg17 blastzTightRn3
# 153936720 bases of 2866216770 (5.371%) in intersection
# featureBits hg16 blastzTightRn3
# 153151903 bases of 2865248791 (5.345%) in intersection
# copy axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.rn3/axtTight
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtTight
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtTight
cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3/axtTight
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)
    # REDO downloads with fixed axtNet's (2005-09-13 kate)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3
mv axtNet axtNet.old
nice cp -rp /cluster/data/hg17/bed/blastz.rn3/axtNet .
cd axtNet
nice gzip *.axt
md5sum *.axt.gz > md5sum.txt
# BLASTZ RN3 CLEAN UP (DONE - 2004-07-02 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastz.rn3
nice rm -rf raw &
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &
# BLASTZ CHICKEN (GALGAL2) (DONE - 2004-06-14 - Fan)
ssh kk
mkdir /cluster/data/hg17/bed/blastz.galGal2.2004-06-14
cd /cluster/data/hg17/bed
ln -s /cluster/data/hg17/bed/blastz.galGal2.2004-06-14 blastz.galGal2
cd blastz.galGal2
# Set L=10000 (higher threshold on blastz's outer loop) and abridge
# repeats.
cat << '_EOF_' > DEF
# human vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.chicken
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken
SEQ2_DIR=/iscratch/i/galGal2/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/store5/gs.18/build35/bed/blastz.galGal2
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastz.galGal2
bash
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run0.sh
# it is a generic script and works for any assembly
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
    # Completed: 41943 of 41943 jobs
    # CPU time in finished jobs: 15330421s 255507.02m 4258.45h 177.44d 0.486 y
    # IO & Wait Time: 673809s 11230.15m 187.17h 7.80d 0.021 y
    # Average job time: 382s 6.36m 0.11h 0.00d
    # Longest job: 4651s 77.52m 1.29h 0.05d
    # Submission to last job: 169197s 2819.95m 47.00h 1.96d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/hg17/bed/blastz.galGal2
bash
# script copied over from /cluster/data/mm4/jkStuff/BlastZ_run1.sh
# fixup machine check, should be kki, not kk
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 1894s 31.56m 0.53h 0.02d 0.000 y
# IO & Wait Time: 6271s 104.52m 1.74h 0.07d 0.000 y
# Average job time: 24s 0.40m 0.01h 0.00d
# Longest job: 131s 2.18m 0.04h 0.00d
# Submission to last job: 590s 9.83m 0.16h 0.01d
# Third cluster run to convert lav's to axt's
cd /cluster/data/hg17/bed/blastz.galGal2
# The copy of this in mm4 was broken, fixed here
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 426s 7.09m 0.12h 0.00d 0.000 y
# IO & Wait Time: 7283s 121.39m 2.02h 0.08d 0.000 y
# Average job time: 168s 2.79m 0.05h 0.00d
# Longest job: 642s 10.70m 0.18h 0.01d
# Submission to last job: 642s 10.70m 0.18h 0.01d
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.galGal2
mkdir pslChrom
set tbl = "blastzGalGal2"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 30 minutes
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.galGal2/pslChrom
bash
for I in *.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done: ${I}"
done
    # GNF ATLAS 2 (DONE - 2004-07-14 - Hiram)
# Align probes from GNF1H chip.
ssh kk
cd /cluster/data/hg17/bed
mkdir -p geneAtlas2/run/psl
cd geneAtlas2/run
# This bluearc/geneAtlas2 directory already exists
# mkdir -p /cluster/bluearc/geneAtlas2
# cp /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /cluster/bluearc/geneAtlas2
ls -1 /scratch/hg/gs.18/build35/maskedContigs > genome.lst
ls -1 /cluster/bluearc/geneAtlas2/gnf1h.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
blat -fine -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.18/build35/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst mrna.lst gsub jobList
para create jobList
para try
para check
para push
para time
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 10599s 176.65m 2.94h 0.12d 0.000 y
# IO & Wait Time: 3893s 64.88m 1.08h 0.05d 0.000 y
# Average job time: 38s 0.64m 0.01h 0.00d
# Longest job: 649s 10.82m 0.18h 0.01d
# Submission to last job: 663s 11.05m 0.18h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyGnf1h.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl \
contig.psl /dev/null
# Processed 80818 alignments
liftUp ../affyGnf1h.psl ../../../jkStuff/liftAll.lft warn contig.psl
rm -r contig.psl raw.psl psl
# Load probes and alignments from GNF1H into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneAtlas2
# Already symlinked
# ln -s /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa \
# /gbdb/hgFixed/affyProbes
hgLoadPsl hg17 affyGnf1h.psl
hgLoadSeq hg17 /gbdb/hgFixed/affyProbes/gnf1h.fa
grep -v U133B ../affyUclaNorm/hg17.affyU133AB_all.lifted.pslReps.psl \
| sed -e "s/exemplar://; s/consensus://; s/U133A://" \
| sed -e "s/;//" > affyU133A.psl
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
affyU133A.psl /cluster/data/hg17/bed/geneAtlas2/affyGnf1h.psl
# Loaded 44696 rows of expression data from hgFixed.gnfHumanAtlas2MedianRatio
# Mapped 32857, multiply-mapped 1462, missed 49, unmapped 11839
hgLoadBed hg17 gnfAtlas2 gnfAtlas2.bed
# Loaded 34319 elements of size 15
    # LOAD SNPS (Daryl Thomas; November 7, 2004;
    #    snpExceptions added January 8, 2005;
    #    updated to build 124 on January 13, 2005;
    #    added affy snps March 5, 2005)
set db = hg17
set org = human
set build = 124
set dir = /cluster/bluearc/snp/$db/build$build
# ssh to some quiet machine with fast access to the bluearc
# it takes ~4.5 hours to download the data
# (build 124 directly to /cluster/bluearc/... from eieio)
# Check to make sure the chrMT file is included
mkdir -p $dir $dir/ds_ch.xml $dir/det $dir/str $dir/loc $dir/seq
cd $dir
ln -s /cluster/data/$db/jkStuff/liftAll.lft .
screen
ftp ftp.ncbi.nih.gov
cd snp/$org/XML
prompt
mget ds_ch*.xml.gz
exit # screen
exit # machine
# TODO: check chromStart for each locType
cp -f {$HOME}/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts
chmod 775 /cluster/bin/scripts/parseDbSnpXML
#ssh kk
touch jobList
foreach file ( /cluster/bluearc/snp/$db/build$build/ds_ch*.xml.gz )
set out = $file:t:r
echo /cluster/bin/scripts/parseDbSnpXML $file /cluster/bluearc/snp/$db/build$build $out.contig >> jobList
end
# I removed ds_chMulti.xml.gz and ds_chNotOn.xml.gz from the job list
# para create jobList; para push; para check ...
#Completed: 25 of 25 jobs
#CPU time in finished jobs: 30120s 502.01m 8.37h 0.35d 0.001 y
#IO & Wait Time: 2533s 42.21m 0.70h 0.03d 0.000 y
#Average job time: 1306s 21.77m 0.36h 0.02d
#Longest job: 2611s 43.52m 0.73h 0.03d
#Submission to last job: 2611s 43.52m 0.73h 0.03d
exit # kk
    mv $dir /cluster/data/$db/bed/snp/build$build
set dir = /cluster/data/$db/bed/snp/build$build
cd $dir
# concatenate the details files to make it easier to lift (and load)
time zcat det/ds_ch*.xml.contig.det.gz > $db.build$build.contig.bed
# 33.380u 24.470s 1:54.79 50.3% 0+0k 0+0io 86pf+0w (hgwdev)
time gzip $db.build$build.contig.bed
# 251.160u 16.770s 12:40.77 35.2% 0+0k 0+0io 83pf+0w (hgwdev/bluearc - should have done it on eieio/store5)
# some of the NT contigs are not in the liftSpec - this is expected as snps that map to
# alternate assemblies (Celera) are in the original files, but we disregard their mappings.
time liftUp $db.build$build.bed liftAll.lft warn $db.build$build.contig.bed.gz
# 232.260u 30.050s 5:09.04 84.8% 0+0k 0+0io 379pf+0w (hgwdev/store5)
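    # (optional check: if liftUp skipped the alternate-assembly mappings
    #  noted above, the lifted file will have fewer rows than the input)
    zcat $db.build$build.contig.bed.gz | wc -l
    wc -l $db.build$build.bed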
time gzip hg17.build124.bed
# 141.980u 8.180s 2:34.43 97.2% 0+0k 0+0io 83pf+0w
# hgLoadBed is the important step - check to make sure there are no warnings
time hgLoadBed $db snp $db.build$build.bed.gz -sqlTable=${HOME}/kent/src/hg/lib/snp.sql
# Loaded 9131054 elements of size 16
# 225.040u 37.030s 35:20.45 12.3% 0+0k 0+0io 308pf+0w
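    # (optional check: the table row count should match the element
    #  count reported by hgLoadBed)
    hgsql $db -e "select count(*) from snp"
    # expect 9131054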
# basic snp table is now loaded, but exception column needs to be updated
# ~ 3 hours wall clock time from here to end
# run queries from snpException.query against snp table
mkdir -p /usr/local/apache/htdocs/qa/test-results/snpException/build$build
cd /usr/local/apache/htdocs/qa/test-results/snpException/build$build
time snpException hg17 0 ${db}snpException > ${db}snpException.log
chmod o+rx .
chmod o+r *
# 10.610u 19.200s 53:59.98 0.9% 0+0k 0+0io 264pf+0w
# check alignment of flanking sequences
time snpValid $db /cluster/data/$db/bed/snp/build$build/seq > ${db}snpValid.log
# 5205.860u 216.570s 1:55:10.27 78.4% 0+0k 0+0io 72408pf+0w (hgwdev)
### NOTE: the pseudoautosomal snps are reported in the chrX files
### only, which causes problems for snpValid when checking the
### chrY snp mappings. I got around this by confirming that all
### of the 'missing flank' errors (#23) were in pseudoautosomal
### regions and ignoring them. I manually truncated the
### hg17snpException.23.bed file before continuing with the next
### step. This could/should be fixed in the next iteration.
# create list of statements to update the snp table and run them
time tail +3 ${db}snpException.* | awk '/rs/ {printf "%s\t%d\t%d\n",$4,$2,$5}' | sort -k1,2n > exceptionList.txt
# ~10 seconds
time updateExceptionList.pl < exceptionList.txt > updateExceptionList.sql
# 7.250u 0.390s 0:07.87 97.0% 0+0k 0+0io 337pf+0w
time hgsql hg17 < updateExceptionList.sql
# 8.420u 10.370s 11:58.44 2.6% 0+0k 0+0io 413pf+0w build123 (this is mostly a mysql process)
# 6.550u 9.370s 14:34.17 1.8% 0+0k 0+0io 413pf+0w build124
# > wc -l build12*/updateExceptionList.sql
# 387166 build123/updateExceptionList.sql
# 383759 build124/updateExceptionList.sql
# Add Affy SNPs from new submission
#!/bin/csh -fe
# rm -f log ; date ; ./loadAffySnps.csh > & log ; date ; cat log
set db = hg17
cd /cluster/data/$db/bed/snp/affy/latest
touch affy.txt affy.bed Affy.bed bed.tab
rm -f affy*.txt affy*.bed Affy.bed* bed.tab
# datafile was provided by Valmeekam, Venu [Venu_Valmeekam@affymetrix.com]
tar xfz affyhg17maps_withstrand_alleles.tgz
wc -l affy*txt
awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10K\t0\n", $1,$2,$3,$4,$6,$7);}' < affy10K.txt > affy10K.bed
awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10Kv2\t0\n", $1,$2,$3,$4,$6,$7);}' < affy10Kv2.txt > affy10Kv2.bed
awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_HindIII\t0\n",$1,$2,$3,$4,$6,$7);}' < affy50K_HindIII.txt > affy50K_HindIII.bed
awk '$1 !~ /^chrom/ {printf("chr%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_XbaI\t0\n", $1,$2,$3,$4,$6,$7);}' < affy50K_XbaI.txt > affy50K_XbaI.bed
# this is a temporary kluge to fix some bad input data.
cat affy*.bed | sed 's/_par//' > Affy.bed
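    # (optional check, assuming "_par" occurs only in the bad names being
    #  fixed: expect zero matches after the sed)
    grep -c "_par" Affy.bed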
# the source enum for 'dbSnp' is 2; all of the affy* values are higher.
hgsql $db -e "delete from snp where source > 2 "
hgLoadBed $db snp Affy.bed -oldTable -tab
rm -f affy*.txt affy*.bed bed.tab
gzip Affy.bed
#mysql> select source, count(*) from hg17.snp group by source;
#+-----------------+----------+
#| source | count(*) |
#+-----------------+----------+
#| dbSnp | 9131054 |
#| Affy10K | 11344 |
#| Affy10Kv2 | 10032 |
#| Affy50K_HindIII | 56859 |
#| Affy50K_XbaI | 58494 |
#+-----------------+----------+
#March 7, 2005: fix pseudoautosomal snps:
#SNP_A-1606360
#SNP_A-1606329
#SNP_A-1666553
#SNP_A-1715750
#SNP_A-1726331
#SNP_A-1685712
#SNP_A-1735899
#SNP_A-1726272
#SNP_A-1660936
#SNP_A-1662285
#SNP_A-1680848
#SNP_A-1671440
#SNP_A-1719355
#SNP_A-1716499
#SNP_A-1643847
#SNP_A-1646007
#SNP_A-1715285
#SNP_A-1657714
#SNP_A-1725038
#SNP_A-1713938
#SNP_A-1708565
#SNP_A-1510243
#SNP_A-1510197
#SNP_A-1606356
delete from snp
where chrom = 'chrY'
and name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356');
update snp
set chrom = 'chrX'
where name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356');
insert into snp
select bin, 'chrY' as chrom, chromStart, chromEnd, name, score, strand,
observed, molType, class, valid, avHet, avHetSE, func, locType, source, exception
from snp
where name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356');
select chrom, count(*)
from snp
where name in ('SNP_A-1606360','SNP_A-1606329','SNP_A-1666553','SNP_A-1715750','SNP_A-1726331','SNP_A-1685712','SNP_A-1735899','SNP_A-1726272','SNP_A-1660936','SNP_A-1662285','SNP_A-1680848','SNP_A-1671440','SNP_A-1719355','SNP_A-1716499','SNP_A-1643847','SNP_A-1646007','SNP_A-1715285','SNP_A-1657714','SNP_A-1725038','SNP_A-1713938','SNP_A-1708565','SNP_A-1510243','SNP_A-1510197','SNP_A-1606356')
    group by chrom;
## LS-SNP links [load data only] (Daryl Thomas; November 3, 2005)
# Data from Rachel Karchin in the Andrej Sali lab at UCSF
# /cluster/data/hg17/bed/lssnp
hgsql hg17 < ${HOME}/kent/src/hg/lib/lsSnpFunction.sql
hgsql hg17 < ${HOME}/kent/src/hg/lib/lsSnpStructure.sql
mysql> load data local infile "snp-human3-function-predictions.txt" into table lsSnpFunction;
Query OK, 24337 rows affected (1.27 sec)
mysql> load data local infile "snp-human3-structure-predictions.txt" into table lsSnpStructure;
Query OK, 34764 rows affected (2.36 sec)
# Tajima's D (DONE -- 2005-09-20 -- Daryl)
# Data from Chris Carlson in Debbie Nickerson's lab
# Chris Carlson [csc47<AT>u<DOT>washington<DOT>edu]
# get data from ftp site, unpack in $dir:
# tar tvfz *gz | more
# -rw-r--r-- chris/admin 34405061 2005-06-03 13:22:15 AD.SNP.track
# -rw-r--r-- chris/admin 29869512 2005-06-03 13:22:30 ED.SNP.track
# -rw-r--r-- chris/admin 27154049 2005-06-03 13:22:41 XD.SNP.track
# -rw-r--r-- chris/admin 10948753 2005-06-02 21:12:27 AD.tajd.track
# -rw-r--r-- chris/admin 10928630 2005-06-02 21:12:39 ED.tajd.track
# -rw-r--r-- chris/admin 10926122 2005-06-02 21:12:51 XD.tajd.track
set db=hg17
set dir=/cluster/data/$db/bed/tajdpoly/latest
cd $dir
tar xvfz TajDtracks.tar.gz
mac2unix < AD.SNP.track | grep -v track | sed 's/1\.02e+08/102000000/;s/8\.8e+07/88000000/;s/1\.5e+07/15000000/' > hg17.tajdSnpAd.bed
mac2unix < ED.SNP.track | grep -v track | sed 's/1\.02e+08/102000000/;s/8\.8e+07/88000000/;s/1\.5e+07/15000000/' > hg17.tajdSnpEd.bed
mac2unix < XD.SNP.track | grep -v track | sed 's/1\.02e+08/102000000/;s/8\.8e+07/88000000/;s/1\.5e+07/15000000/' > hg17.tajdSnpXd.bed
mac2unix < AD.tajd.track | grep -v track | awk '{printf"%s\t%s\t%d\t%.3f\n",$1,$2,$3,$4}' > hg17.tajdAd.bedGraph
mac2unix < ED.tajd.track | grep -v track | awk '{printf"%s\t%s\t%d\t%.3f\n",$1,$2,$3,$4}' > hg17.tajdEd.bedGraph
mac2unix < XD.tajd.track | grep -v track | awk '{printf"%s\t%s\t%d\t%.3f\n",$1,$2,$3,$4}' > hg17.tajdXd.bedGraph
set chain = /cluster/data/hg17/bed/bedOver/hg17ToHg16.over.chain
foreach pop (Ad Ed Xd)
liftOver hg17.tajdSnp$pop.bed $chain hg16.tajdSnp$pop.bed hg17ToHg16.tajdSnp$pop.unmapped
liftOver hg17.tajd$pop.bedGraph $chain hg16.tajd$pop.bedGraph hg17ToHg16.tajd$pop.unmapped
foreach db (hg16 hg17)
hgLoadBed -bedGraph=4 $db tajd$pop $db.tajd$pop.bedGraph
hgLoadBed $db tajdSnp$pop $db.tajdSnp$pop.bed
end
end
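    # (optional check: liftOver writes records it could not map to the
    #  .unmapped files; count them to see how much was lost)
    wc -l hg17ToHg16.tajd*.unmapped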
set where1 = "where t.bin=g.bin and t.chrom=g.chrom and (t.chromStart between g.chromStart and g.chromEnd or t.chromEnd between g.chromStart and g.chromEnd)"
set where2 = "t, chromInfo c where t.chromStart < 0 or (t.chrom=c.chrom and t.chromEnd > c.size)"
set list = "as pop, t.chrom, t.chromStart from"
foreach db (hg16 hg17)
rm -f $db.delete.sql
touch $db.delete.sql
foreach p (Ad Ed Xd SnpAd SnpEd SnpXd)
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo "select 'tajd$p' $list tajd${p} t,chr${c}_gap g $where1" | \
hgsql $db | \
grep -v pop | \
awk '{printf "delete from %s where chrom=\"%s\" and chromStart=%d;\n",$1,$2,$3}' \
>> $db.delete.sql
end
echo "select 'tajd$p' $list tajd${p} $where2" | \
hgsql $db | \
grep -v pop | \
awk '{printf "delete from %s where chrom=\"%s\" and chromStart=%d;\n",$1,$2,$3}'\
>> $db.delete.sql
end
hgsql $db < $db.delete.sql
end
# GENE SORTER (AKA: FAMILY BROWSER) (DONE - 2004-06-16 - Hiram)
# Added knownToU133Plus2 track (2004-10-14)
# to be done after knownGene tables are complete from known gene
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneSorter.2004-06-15
ln -s /cluster/data/hg17/bed/geneSorter.2004-06-15 \
/cluster/data/hg17/bed/geneSorter
cd /cluster/data/hg17/bed/geneSorter
hgClusterGenes hg17 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg17/bed/geneSorter/blastp
cd /cluster/data/hg17/bed/geneSorter/blastp
pepPredToFa hg17 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg17/blastp
mkdir -p /cluster/bluearc/hg17/blastp
cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* \
/cluster/bluearc/hg17/blastp
# Had to pick up a new blastall binary (2004-06-15)
# Our old one would no longer run on our systems that have
# updated Linux versions
mkdir /cluster/bluearc/blast229
cd /cluster/bluearc/blast229
wget --timestamping \
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.9/blast-2.2.9-ia32-linux.tar.gz
wget --timestamping \
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.9/ChangeLog.txt
wget --timestamping \
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.9/ReleaseNotes.txt
tar xvzf blast-2.2.9-ia32-linux.tar.gz
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
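    # (optional check: faSplit aims for about 8000 pieces; the actual
    #  count, 7749 here, shows up in the para stats below)
    ls split | wc -l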
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/self
cd /cluster/data/hg17/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
    # a plain 'ls ../../split/*.fa' would overflow the command line,
    # hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# This should finish in ~15 minutes if the cluster is free.
    # Completed: 7749 of 7749 jobs
    # CPU time in finished jobs: 182148s 3035.81m 50.60h 2.11d 0.006 y
    # IO & Wait Time: 22954s 382.56m 6.38h 0.27d 0.001 y
    # Average job time: 26s 0.44m 0.01h 0.00d
    # Longest job: 372s 6.20m 0.10h 0.00d
    # Submission to last job: 871s 14.52m 0.24h 0.01d
# Load into database. This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7749 files
# Loading database with 11799667 rows
# Hg16 was: 11376875 rows
# real 30m10.761s
# user 5m25.490s
# sys 1m0.630s
cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# hgsql -e "select count(*) from knownToRefSeq;" hg17
# row count changed from 36078 in Hg16 to 36082
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 \
> refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# hgsql -e "select count(*) from knownToLocusLink;" hg17
# row count went from 36078 in Hg16 to 36082
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
# hgsql -e "select count(*) from knownToPfam;" hg17
# row count dropped from 30467 in Hg16 to 29725
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
    # row count dropped from 35817 in Hg16 to 35739
# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2
# Got 35739 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count went from 35,817,000 in Hg16 to 35,739,000
# real 108m1.671s
# user 89m30.680s
# sys 3m6.800s
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
# hgsql -e "select count(*) from knownToU133;" hg17
# row count went from 37,634 in Hg16 to 36,795
# Create expression distance table. This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
-weights=affyUcla.weight -lookup=knownToU133
    # 211 genes, 42 weights, 26.500000 total weight
# Got 36795 unique elements in affyUclaNorm
# real 154m1.058s
# user 134m45.000s
# sys 3m1.990s
# Create table that maps between known genes and
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
# row count went from 18780 in Hg16 to 18796
    # the hgFixed.gnfHumanU95Exps argument is unused and need not exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95
# row count went from 17711000 in Hg16 to 17710000
# real 21m37.703s
# user 13m35.110s
# sys 0m28.470s
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.)
hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnf1h
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 9756 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# create table mapping knownGenes to affyU133Plus2 table (2004-10-14, hartera)
cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
# Make sure that GO database is up to date.
    # See README in /cluster/store1/geneOntology.
# I update this GO database very carefully, checking that all
# structures in it remain the same from release to release and
# backing up the current go DB in a backup database. In this case
# the backup is go040107 - when it was loaded for Mm4, and the new
# go database is based on data from Dec 17th 2003 and Feb 2004 according
# to the time stamp on the fetched data. This build was done in
# /cluster/store1/geneOntology/20040217
cd /cluster/data/hg17/bed/geneSorter
    # XXX - DO NOT YET HAVE ensGene table - must wait on Ensembl to release that
    # XXX - have not created the knownToEnsembl table yet - 2004-07-15 - Hiram
# Create knownToEnsembl column
hgMapToGene hg17 ensGene knownGene knownToEnsembl
# table row count went from previous version: 36068 to 38251
# Make knownToCdsSnp table (DONE Nov 11, 2004, Heather)
ssh hgwdev
nice hgMapToGene hg17 snp knownGene knownToCdsSnp -all -cds
# row count 165728
# unique 34013
# approx. 5 minutes running time
# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/ce1/blastp should have data
# The blast jobs below can be run on the kk or kk9 clusters
# Create the ceBlastTab
ssh kk9
mkdir /cluster/data/hg17/bed/geneSorter/blastp/ce1
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/ce1/blastp/wormPep \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Only takes 10 minutes on an idle cluster
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs: 32023s 533.72m 8.90h 0.37d 0.001 y
# IO & Wait Time: 20643s 344.05m 5.73h 0.24d 0.001 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 110s 1.83m 0.03h 0.00d
# Submission to last job: 1911s 31.85m 0.53h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1/run/out
hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab
# row count went from 27620 to 27616
# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeMm5.doc for procedure
# the directory: /cluster/bluearc/scratch/mus/mm5/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm5
cd /cluster/data/hg17/bed/geneSorter/blastp/mm5
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/scratch/mus/mm5/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs: 139041s 2317.34m 38.62h 1.61d 0.004 y
# IO & Wait Time: 21227s 353.79m 5.90h 0.25d 0.001 y
# Average job time: 21s 0.34m 0.01h 0.00d
# Longest job: 260s 4.33m 0.07h 0.00d
# Submission to last job: 1137s 18.95m 0.32h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm5/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# row count went from 36471 to 36638
# Make rat ortholog column using blastp on rat known genes.
# First make rat protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeRn3.doc for procedure.
# Files were put in this directory: /cluster/bluearc/rn3/blastp/
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/rn3/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
#Completed: 7749 of 7749 jobs
#CPU time in finished jobs: 31035s 517.25m 8.62h 0.36d 0.001 y
#IO & Wait Time: 38472s 641.20m 10.69h 0.45d 0.001 y
#Average job time: 9s 0.15m 0.00h 0.00d
#Longest job: 75s 1.25m 0.02h 0.00d
#Submission to last job: 169s 2.82m 0.05h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out
hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab
# Scanning through 7749 files
#Loading database with 25574 rows
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dr1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dr1
cd /cluster/data/hg17/bed/geneSorter/blastp/dr1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/dr1/blastp/ensembl \
-i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs: 100217s 1670.28m 27.84h 1.16d 0.003 y
# IO & Wait Time: 23697s 394.95m 6.58h 0.27d 0.001 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest job: 233s 3.88m 0.06h 0.00d
# Submission to last job: 1667s 27.78m 0.46h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dr1/run/out
hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
# row count went from 32971 to 33023
# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs: 20738s 345.64m 5.76h 0.24d 0.001 y
# IO & Wait Time: 22018s 366.96m 6.12h 0.25d 0.001 y
# Average job time: 6s 0.09m 0.00h 0.00d
# Longest job: 39s 0.65m 0.01h 0.00d
# Submission to last job: 572s 9.53m 0.16h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# row count went from 18286 to 18265
# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dm1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7749 of 7749 jobs
# CPU time in finished jobs: 82022s 1367.03m 22.78h 0.95d 0.003 y
# IO & Wait Time: 21982s 366.37m 6.11h 0.25d 0.001 y
# Average job time: 13s 0.22m 0.00h 0.00d
# Longest job: 174s 2.90m 0.05h 0.00d
# Submission to last job: 1439s 23.98m 0.40h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
# row count went from 29322 to 29341
#### Blat knownGene proteins to determine exons (braney 2004-06-20 DONE)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir blat.hg17KG.2004-06-20
    rm -f blat.hg17KG
ln -s blat.hg17KG.2004-06-20 blat.hg17KG
cd blat.hg17KG
pepPredToFa hg17 knownGenePep known.fa
hgPepPred hg17 generic blastKGPep00 known.fa
grep ">" known.fa | sed "s/>//" > kgName.lst
kgName hg17 kgName.lst blastKGRef00
    hgsql hg17 < ~/kent/src/hg/lib/blastRef.sql
echo "rename table blastRef to blastKGRef00" | hgsql hg17
echo "load data local infile 'blastKGRef00' into table blastKGRef00" | hgsql hg17
ssh kk
cd /cluster/data/hg17/bed/blat.hg17KG
cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
# << keep emacs happy
chmod +x blatSome
ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs/*.nib > human.lst
mkdir kgfa
cd kgfa
faSplit sequence ../known.fa 3000 kg
cd ..
ls -1S kgfa/*.fa > kg.lst
cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
gensub2 human.lst kg.lst blatGsub blatSpec
mkdir psl
cd psl
foreach i (`cat ../human.lst`)
mkdir `basename $i .nib`
end
cd ..
para create blatSpec
para push
# Completed: 133676 of 133676 jobs
# CPU time in finished jobs: 29661130s 494352.16m 8239.20h 343.30d 0.941 y
# IO & Wait Time: 2181179s 36352.99m 605.88h 25.25d 0.069 y
# Average job time: 238s 3.97m 0.07h 0.00d
# Longest job: 105972s 1766.20m 29.44h 1.23d
ssh eieio
cd /cluster/data/hg17/bed/blat.hg17KG
pslSort dirs raw.psl /tmp psl/*
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
pslUniq cooked.psl hg17KG.psl
pslxToFa hg17KG.psl hg17KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
# BLASTZ MM4 (DONE - 2004-06-22 - Hiram)
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.mm4.2004-06-21
cd /cluster/data/hg17/bed
ln -s blastz.mm4.2004-06-21 blastz.mm4
cd blastz.mm4
cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm4/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm4/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm4/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.mm4
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastz.mm4
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# Completed: 43648 of 43648 jobs
# CPU time in finished jobs: 16448001s 274133.36m 4568.89h 190.37d 0.522 y
# IO & Wait Time: 751666s 12527.76m 208.80h 8.70d 0.024 y
# Average job time: 394s 6.57m 0.11h 0.00d
# Longest job: 8323s 138.72m 2.31h 0.10d
# Submission to last job: 44244s 737.40m 12.29h 0.51d
    # Second cluster run to lift the raw alignments into lav files.
    # Running it on the big cluster brings the file server to its
    # knees. Run this on the small cluster.
ssh kki
cd /cluster/data/hg17/bed/blastz.mm4
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 3925s 65.42m 1.09h 0.05d 0.000 y
# IO & Wait Time: 6208s 103.46m 1.72h 0.07d 0.000 y
# Average job time: 30s 0.50m 0.01h 0.00d
# Longest job: 289s 4.82m 0.08h 0.00d
# Submission to last job: 2800s 46.67m 0.78h 0.03d
# Third cluster run to convert lav's to axt's
# Does not work on kki since /scratch on the iservers is not the
# same as /scratch on the other clusters.
ssh kk
cd /cluster/data/hg17/bed/blastz.mm4
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 2389s 39.82m 0.66h 0.03d 0.000 y
# IO & Wait Time: 13374s 222.90m 3.71h 0.15d 0.000 y
# Average job time: 350s 5.84m 0.10h 0.00d
# Longest job: 1426s 23.77m 0.40h 0.02d
# Submission to last job: 1440s 24.00m 0.40h 0.02d
# chr19 failing due to out of memory. Run this job individually
# on kolossus, adjusting the location of the nib directories:
ssh kolossus
cd /cluster/data/hg17/bed/blastz.mm4
sed -e "s/i386/x86_64/g" /cluster/bin/scripts/blastz-chromlav2axt > \
x86_64-chromlav2axt
chmod +x x86_64-chromlav2axt
time ./x86_64-chromlav2axt \
/cluster/data/hg17/bed/blastz.mm4/lav/chr19 \
/cluster/data/hg17/bed/blastz.mm4/axtChrom/chr19.axt \
/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs \
/cluster/bluearc/scratch/mus/mm4/softNib
# real 24m28.955s
# user 6m40.990s
# sys 1m16.500s
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4
mkdir -p pslChrom
set tbl = "blastzMm4"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
    # This takes more than an hour. To shorten it, change the command
    # above to a simple echo, collect the output in a file, split that
    # file into four parts, and run the four parts as shell scripts on
    # eieio so four processes run at once (a sketch follows). Load on
    # eieio gets up to about 20, which is reasonable.
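    # A sketch of that four-way split (job file names are arbitrary):
    foreach f (axtChrom/chr*.axt)
        set c=$f:t:r
        echo "/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl" \
            >> axtToPsl.jobs
    end
    split -l 12 axtToPsl.jobs axtToPslJob.
    foreach j (axtToPslJob.*)
        csh $j >& $j.log &
    end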
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/pslChrom
bash
for F in chr*_blastzMm4.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${F}
echo "${F} done"
done
# this is a 55 minute job
# exit bash if you are tcsh
    # featureBits on blastzMm3 or blastzMm4 will not work on hgwdev; it
    # runs out of memory. But if you reset your ~/.hg.conf to use the
    # read-only user and point it at the hgwdev database host, you can
    # run the x86_64 featureBits elsewhere (a sketch of such a
    # ~/.hg.conf follows the numbers below):
# featureBits hg16 blastzMm4
# 1056761609 bases of 2865248791 (36.882%) in intersection
# featureBits hg17 blastzMm4
# 1056201417 bases of 2866216770 (36.850%) in intersection
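    # A minimal ~/.hg.conf sketch for that read-only setup (host, user,
    # and password below are placeholders, not real credentials):
    #   db.host=hgwdev
    #   db.user=hguser
    #   db.password=XXXXXXXX
    # keep it private: chmod 600 ~/.hg.conf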
# CHAIN MM4 BLASTZ (DONE - 2004-06-29 - Hiram)
# redone with the 'axtFilter -notQ_random' removed - 2004-06-23
# The axtChain is best run on the small kluster, or the kk9 kluster
ssh kk9
mkdir -p /cluster/data/hg17/bed/blastz.mm4/axtChain/run1
cd /cluster/data/hg17/bed/blastz.mm4/axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.mm4/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# May need -minScore=5000 for all chroms if chr19 won't finish on kolossus
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/mm4/softNib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 46 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 45 of 46 jobs
# CPU time in finished jobs: 6575s 109.58m 1.83h 0.08d 0.000 y
# IO & Wait Time: 9274s 154.57m 2.58h 0.11d 0.000 y
# Average job time: 352s 5.87m 0.10h 0.00d
# Longest job: 3121s 52.02m 0.87h 0.04d
# Submission to last job: 3121s 52.02m 0.87h 0.04d
# one job wouldn't finish due to memory usage
# run the chr19 job on kolossus, takes an hour, gets up to 4 Gb
# memory usage
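    # A sketch of that manual chr19 run (nib paths follow the kolossus
    # convention used for the chr19 lav->axt job above; illustration
    # only, not the recorded command):
    ssh kolossus
    cd /cluster/data/hg17/bed/blastz.mm4/axtChain/run1
    axtChain ../../axtChrom/chr19.axt \
        /cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs \
        /cluster/bluearc/scratch/mus/mm4/softNib \
        chain/chr19.chain > out/chr19.out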
# now on the file server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# real 17m17.639s
# user 9m54.240s
# sys 1m31.210s
# (1.9 Gb result file !)
time chainSplit chain all.chain
# real 27m32.278s
# user 9m46.970s
# sys 2m45.960s
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg17 ${c}_chainMm4 $i
echo done $c
end
# featureBits hg17 chainMm4
# 2829135227 bases of 2866216770 (98.706%) in intersection
# featureBits hg16 chainMm4
# 2828363353 bases of 2865248791 (98.713%) in intersection
# NET MM4 (DONE - 2004-06-29 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg17/chrom.sizes \
/cluster/data/mm4/chrom.sizes ../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg17/chrom.sizes \
/cluster/data/mm4/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2504171520, utime 19373 s/100, stime 5906
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
time netClass hNoClass.net hg17 mm4 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman
# real 19m33.421s
# user 10m37.130s
# sys 1m45.630s
# If things look good do
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
time netFilter -syn mouse.net > mouseSyn.net
# real 13m24.885s
# user 7m37.100s
# sys 1m5.760s
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg17 netMm4 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg17 syntenyNetMm4 stdin
# real 44m20.735s
# user 15m58.620s
# sys 1m58.720s
# check results
# featureBits hg17 netMm4
# 2824272033 bases of 2866216770 (98.537%) in intersection
# featureBits hg16 netMm4
# 2823565051 bases of 2865248791 (98.545%) in intersection
# featureBits hg17 syntenyNetMm4
# 2785830955 bases of 2866216770 (97.195%) in intersection
# featureBits hg16 syntenyNetMm4
# 2786960572 bases of 2865248791 (97.268%) in intersection
# Add entries for net and chain to mouse/hg17 trackDb
# make net
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
mkdir mouseNet
time netSplit mouse.net mouseNet
# real 12m1.478s
# user 8m35.050s
# sys 1m7.230s
# extract axt's from net, and convert to maf's (DONE - Kate - 2004-06-24)
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtChain
mkdir ../axtNet ../mafNet
cat > makeMaf.csh << '_EOF_'
foreach f (mouseNet/chr*.net)
set c = $f:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt mouseNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib /cluster/data/mm4/nib stdout | \
axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/mm4/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=mm4.
echo "Complete: $c.net -> axtNet/$c.axt -> mafNet/$c.maf"
end
'_EOF_'
# << for emacs
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/blastz.mm4/axtBest
cd /cluster/data/hg17/bed/blastz.mm4/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/axtNet
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtNet
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtNet
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtNet
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestMm4.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestMm4.psl
echo "Done: ${c}_blastzBestMm4.psl"
end
# Load tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/pslBest
for I in chr*BestMm4.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# check results
# featureBits hg17 blastzBestMm4
# 1017319919 bases of 2866216770 (35.493%) in intersection
# featureBits hg16 blastzBestMm4
# 996722004 bases of 2865248791 (34.787%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg17/axtBest/Mm4
cd /gbdb/hg17/axtBest/Mm4
ln -s /cluster/data/hg17/bed/blastz.mm4/axtNet/chr*.axt .
cd /cluster/data/hg17/bed/blastz.mm4/axtNet
rm -f axtInfoInserts.sql
foreach f (/gbdb/hg17/axtBest/Mm4/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('mm4','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
hgsql hg17 < ~/kent/src/hg/lib/axtInfo.sql
# table axtInfo may already exist, ignore create error.
hgsql hg17 < axtInfoInserts.sql
# MAKING MOUSE SYNTENY (DONE - 2004-06-29 - Hiram)
ssh hgwdev
mkdir /cluster/data/hg17/bed/syntenyMm4
cd /cluster/data/hg17/bed/syntenyMm4
    # Copy all the needed scripts from /cluster/data/hg17/bed/syntenyRn3
    # (they originally came from /cluster/data/hg16/bed/syntenyMm3)
cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl .
./syntenicBest.pl -db=hg17 -table=blastzBestMm4
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg17 -table=blastzBestMm4
./synteny2bed.pl
# The five commands above
# real 220m16.227s
# user 0m22.940s
# sys 0m3.960s
    # Used to load this as syntenyMm4, but that type is misleading to
    # the table browser and fails the checkTableCoords check.
    # Better to use the ensPhusionBlast type, renamed here to
    # ensRatMm4Hom so the Mm4 table does not conflict with Rn3:
sed -e 's/ensPhusionBlast/ensRatMm4Hom/g' \
$HOME/kent/src/hg/lib/ensPhusionBlast.sql \
> ensRatMm4Hom.sql
hgLoadBed hg17 ensRatMm4Hom ucsc100k.bed -sqlTable=ensRatMm4Hom.sql
# featureBits hg17 ensRatMm4Hom
# 2549307611 bases of 2866216770 (88.943%) in intersection
# featureBits hg16 syntenyMm4
# 2560252977 bases of 2865248791 (89.355%) in intersection
# MAKING MOUSE AXTTIGHT FROM AXTBEST (DONE - 2004-06-29 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4/axtNet
mkdir -p ../axtTight
foreach i (*.axt)
echo $i
subsetAxt $i ../axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm4.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/pslTight
for I in chr*TightMm4.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# Compare results with previous assembly:
# featureBits hg17 blastzTightMm4
# 166569246 bases of 2866216770 (5.811%) in intersection
# featureBits hg16 blastzTightMm4
# 162641577 bases of 2865248791 (5.676%) in intersection
# copy axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm4/axtTight
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtTight
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtTight
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm4/axtTight
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)
# BLASTZ MM4 CLEAN UP (DONE - 2004-07-02 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm4
nice rm -rf raw &
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &
# BLASTZ CHIMP panTro1 (DONE 2004-06-22 kate)
# NOTE: Ran with BLASTZ_ABRIDGE_REPEATS=0, although SMSK was set.
# This looked better than running with abridge=1, which produced
# very chopped-up alignments.
ssh kk
cd /cluster/data/hg17/bed
mkdir -p blastz.panTro1.2004-06-22
    rm -f blastz.panTro1
    ln -s blastz.panTro1.2004-06-22 blastz.panTro1
    cd blastz.panTro1.2004-06-22
cat << 'EOF' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0
# Specific settings for chimp
BLASTZ_Y=3400
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/penn/human_chimp.q
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/hg17/linSpecRep.chimp
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chimp
SEQ2_DIR=/scratch/chimp/panTro1/nib
# not currently used
SEQ2_RMSK=/iscratch/i/chimp/panTro1/linSpecRep.human
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.panTro1.2004-06-22
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'EOF'
# << this line keeps emacs coloring happy
# first cluster run: raw blastz alignments
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
    # 160270 jobs written to batch
para try, check, push, check, ....
#CPU time in finished jobs: 2399227s 39987.11m 666.45h 27.77d 0.076 y
#IO & Wait Time: 503100s 8385.00m 139.75h 5.82d 0.016 y
#Average job time: 18s 0.30m 0.01h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 2073s 34.55m 0.58h 0.02d
#Submission to last job: 10843s 180.72m 3.01h 0.13d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
# 341 jobs
para try, check, push, etc ...
# CPU time in finished jobs: 3458s 57.63m 0.96h 0.04d 0.000 y
# IO & Wait Time: 57996s 966.60m 16.11h 0.67d 0.002 y
# Average job time: 180s 3.00m 0.05h 0.00d
# Longest job: 483s 8.05m 0.13h 0.01d
# Submission to last job: 1498s 24.97m 0.42h 0.02d
# third run: lav -> axt -> psl
ssh kki
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| /cluster/bin/x86_64/lavToAxt stdin \
/iscratch/i/hg17/bothMaskedNibs /iscratch/i/chimp/panTro1/nib stdout \
| /cluster/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
/cluster/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
for d in ../lav/chr*; do
echo "do.csh $d" >> jobList
done
para create jobList
# 46 jobs
para try, check, push, check
#Completed: 42 of 42 jobs
#Average job time: 38s 0.64m 0.01h 0.00d
#Longest job: 147s 2.45m 0.04h 0.00d
#Submission to last job: 147s 2.45m 0.04h 0.00d
# Load database tables (takes an hour or so)
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/pslChrom
cat > load.csh << 'EOF'
foreach f (chr*.psl)
set table = $f:r_blastzPanTro1
echo "loading ${table}"
    /cluster/bin/i386/hgLoadPsl -noTNameIx hg17 -table=${table} $f
end
'EOF'
# << for emacs
csh load.csh >&! load.log &
tail -100f load.log
# CHAIN CHIMP BLASTZ (6/23/04 kate)
# Run axtChain on little cluster
    # first copy input to bluearc, as eieio bogs down if even the
    # mini-cluster pulls input from it
ssh eieio
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
cp -rp axtChrom /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom
ssh kki
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh -fe
set c = $1:r:t
axtChain $1 -scoreScheme=/cluster/data/blastz/human_chimp.q \
/iscratch/i/hg17/bothMaskedNibs \
/iscratch/i/chimp/panTro1/nib /tmp/$c.chain.$$ > /tmp/$c.out.$$
set ret = $status
mv -f /tmp/$c.chain.$$ $2
mv -f /tmp/$c.out.$$ $3
exit $ret
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# TODO
    rm -fr /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom/*
    echo "remove after 7/1/04" > /cluster/bluearc/hg17/blastz.panTro1.2004-06-22/axtChrom/README
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# TODO
rm run1/chain/*.chain
echo "remove after 7/1/04" > run1/chain/README
# Load chains into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg17 ${c}_chainPanTro1 $i
end
# TODO
featureBits hg16 chainPanTro1Link
#2627280557 bases of 2865248791 (91.695%) in intersection
featureBits hg17 chainPanTro1Link
# 2633869032 bases of 2866216770 (91.894%) in intersection
# NET CHIMP (DONE 2004-6-24 kate)
# Redone to make chimp.net on 2004-10-11 kate (other files have
# new times, but are the same as 6-24 versions)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
#chainPreNet all.chain ../S1.len ../S2.len stdout \
#| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
#| netSyntenic stdin noClass.net
time chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=10 ../S1.len ../S2.len human.net chimp.net
# 42.860u 2.080s 2:11.11 34.2%
netSyntenic human.net noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
netClass noClass.net hg17 panTro1 human.net
rm noClass.net
# Make a 'syntenic' subset:
ssh eieio
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
# TODO
#rm noClass.net
# Make a 'syntenic' subset of these with
# NOTE: we used -chimpSyn filtering for the reciprocal best nets
# on hg16 -- perhaps should use for nets here as well
netFilter -chimpSyn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg17 netPanTro1 stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 netSyntenyPanTro1 stdin
# Add entries for chainPanTro1, netPanTro1 to
# human/hg17 trackDb
# save chimp net to downloads area
ssh eieio
    cd /cluster/data/hg17/bed/blastz.panTro1/axtChain
nice gzip chimp.net
cp chimp.net.gz /usr/local/apache/htdocs/goldenPath/panTro1/vsHg17
cd /usr/local/apache/htdocs/goldenPath/panTro1/vsHg17
md5sum *.gz > md5sum.txt
# RECIPROCAL BEST CHAINS FOR ENSEMBL GENE BUILD (DONE 2004-10-11 kate)
# Starting with the chimp-reference net, which contains the best human
# alignments to chimp, extract the subset of chains in the net.
# (these are the "best" chains of human alignments to chimp).
# Net these chains and use the resulting human-reference net (the
# "reciprocal best" net). Extract the chains from this net to
# obtain "reciprocal best" chains of chimp alignments to human.
ssh kolossus
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
mkdir rBest
grep chain all.chain | wc -l
# extract "best" chains from the chimp-reference net
time chainSwap all.chain stdout | \
netChainSubset chimp.net stdin stdout | \
chainSort stdin rBest/chimp.best.chain
grep chain rBest/chimp.best.chain | wc -l
# 64396
# for comparison later, extract "best" chains from human-reference net
netChainSubset human.net all.chain stdout | \
chainSort stdin rBest/human.best.chain
cd rBest
# net the best chains from the chimp net and pull the human-ref net
# (Daryl accidentally deleted human.rbest.net and rebuilt it with the
# same command on 8/14/2005, resulting in a file of the same size)
time chainPreNet chimp.best.chain ../../S2.len ../../S1.len stdout | \
chainNet stdin -minSpace=10 ../../S2.len ../../S1.len \
/dev/null human.rbest.net
# extract "reciprocal best" chains from the "best" human-reference net
netChainSubset human.rbest.net ../all.chain stdout | \
chainSort stdin human.rbest.chain
# take a look
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
cd rBest
mkdir rBestChain
chainSplit rBestChain human.rbest.chain
hgLoadChain hg17 chr7_rBestChainPanTro1 rBestChain/chr7.chain
# Loading 1639 chains into hg17.chr7_rBestChainPanTro1
mkdir bestChain
chainSplit bestChain human.best.chain
hgLoadChain hg17 chr7_bestChainPanTro1 bestChain/chr7.chain
# Loading 6516 chains into hg17.chr7_bestChainPanTro1
# compare
hgsql hg16 -s -e "select count(*) from chr7_rBestChainPanTro1"
# 2416
# spot-checked by comparing chr7 best and rbest:
    # 1. for a chain appearing in rBest, click thru to human browser,
# then via chimp net back to human browser at same region
# 2. for a chain in "best", but not rBest, do the same, verify
# that it produces a different region in the human browser
# post pre-Q/A file for ensembl download
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain/rBest
gzip human.rbest.chain
cp human.rbest.chain.gz \
/usr/local/apache/htdocs/kate/ensembl/hg17-panTro1.rbest.chain.gz
cd /usr/local/apache/htdocs/kate/ensembl
md5sum *.gz > md5sum.txt
    mv hg17-panTro1.rbest.chain.gz /usr/local/apache/htdocs/goldenPath/hg17/vsPanTro1/hg17.panTro1.rbest.chain.gz
# save as reciprocal best liftover chain (2005-02-22 kate)
gunzip -c human.rbest.chain.gz > \
/cluster/data/hg17/bed/liftOver/hg17ToPanTro1.rbest.chain
# cleanup (TODO -- after QA)
ssh hgwdev
hgsql hg17 -e "drop table chr7_rBestChainPanTro1"
hgsql hg17 -e "drop table chr7_bestChainPanTro1"
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
mv rBest/human.rbest.chain.gz ..
rm -fr rBest
# RECIPROCAL BEST AXT'S FROM RECIPROCAL BEST CHAIN (2005-08-16 kate)
# (requested by Daryl)
cd /cluster/data/hg17/bed/blastz.panTro1
mkdir -p axtRBestNet
cat > makeRbestAxt.csh << 'EOF'
foreach f (axtChain/rBest/rBestChain/*.chain)
set c = $f:t:r
echo $c
chainToAxt $f /cluster/data/hg17/nib /cluster/data/panTro1/nib stdout \
| axtSort stdin axtRBestNet/$c.axt
end
'EOF'
# << for emacs
csh makeRbestAxt.csh >&! makeRbestAxt.log &
# GENERATE CHIMP MAF FOR MULTIZ FROM NET (DONE 2004-06-24 kate)
# Redo to fix overlap problem using 8/05 netToAxt (2005-08-16 kate)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
# There was apparently a bad chr5 nib for a while...
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.panTro1.2004-06-22/axtChain
netSplit human.net net
mkdir axtNet mafNet
cat > makeMaf.csh << 'EOF'
foreach f (axtChain/net/*.net)
set c = $f:t:r
netToAxt $f axtChain/chain/$c.chain /cluster/data/hg17/nib \
/cluster/data/panTro1/nib stdout | axtSort stdin axtNet/$c.axt
axtToMaf axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/panTro1/chrom.sizes \
mafNet/$c.maf -tPrefix=hg17. -qPrefix=panTro1.
end
'EOF'
# << for emacs
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp mafNet /cluster/bluearc/hg17/mafNet/panTro1
# MAKE PANTRO1 DOWNLOADABLES (DONE 2004-09-14 kate)
# Redo panTro1.net.gz (it was truncated) 2004-10-07 kate
# Redo axtNets with non-overlapped versions (2005-08-29 kate)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
ssh eieio
cd /cluster/data/hg17/bed/blastz.panTro1
# gzip chains and nets
mkdir gz
cd gz
nice gzip -c ../axtChain/all.chain > panTro1.chain.gz
nice gzip -c ../axtChain/human.net > panTro1.net.gz
wc -l *.gz
cd ../axtNet
time nice gzip *.axt
# 46 mins.
ssh hgwdev
# copy chains and nets to downloads area
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p vsPanTro1
cd vsPanTro1
mv /cluster/data/hg17/bed/blastz.panTro1/gz/*.gz .
md5sum *.gz > md5sum.txt
# copy in README and edit
rmdir /cluster/data/hg17/bed/blastz.panTro1/gz
mkdir -p axtNet
cd axtNet
cp /cluster/data/hg17/bed/blastz.panTro1/axtNet/*.axt.gz .
md5sum *.gz > md5sum.txt
# RESCORE CHICKEN BLASTZ (DONE 6/23/04 angie)
# Webb noticed low scores when using non-default BLASTZ_Q scoring matrix
# and repeats abridged --
# PSU's restore_rpts program rescored alignments with default matrix
# instead of BLASTZ_Q matrix. Rescore them here so the chainer sees
# the higher scores:
ssh kolossus
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14
mkdir axtChrom.rescore
foreach f (axtChrom/chr*.axt)
axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
$f axtChrom.rescore/$f:t
end
mv axtChrom axtChrom.preRescore
mv axtChrom.rescore axtChrom
# CHAIN CHICKEN BLASTZ (DONE 6/23/04 angie)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=/cluster/data/blastz/chickenHumanTuned.gap \
-minScore=5000 $1 \
/iscratch/i/hg17/bothMaskedNibs \
/iscratch/i/galGal2/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
    # axtChrom/chr18_random.axt is empty, so the {check out line+} check failed:
#Completed: 45 of 46 jobs
#Crashed: 1 jobs
#Average job time: 46s 0.76m 0.01h 0.00d
#Longest job: 273s 4.55m 0.08h 0.00d
#Submission to last job: 519s 8.65m 0.14h 0.01d
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg17 ${c}_chainGalGal2 $i
end
# NET CHICKEN BLASTZ (DONE 6/23/04 angie)
ssh eieio
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
netClass noClass.net hg17 galGal2 human.net
# Make a 'syntenic' subset:
ssh eieio
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg17 netGalGal2 stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 netSyntenyGalGal2 stdin
# Add entries for chainGalGal2, netGalGal2, syntenyGalGal2 to
# human/hg17 trackDb
# XENOPUS BLASTZ/CHAIN/NET (DONE 9/24/04 jk)
# see makeXenTro1.doc and search for zb.hg17
# The results of this are also symlinked under hg17/bed
# GENERATE GALGAL2 MAF FOR MULTIZ FROM NET (DONE 6/23/04 angie)
# Redo net axt's and maf's to fix overlap problem (use 8/05 netToAxt)
# (2005-08-16 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
netSplit human.net net
cd ..
mkdir axtNet mafNet
cat > makeMaf.csh << 'EOF'
foreach f (axtChain/net/*)
set chr = $f:t:r
netToAxt $f axtChain/chain/$chr.chain /cluster/data/hg17/nib \
/cluster/data/galGal2/nib stdout \
| axtSort stdin axtNet/$chr.axt
axtToMaf axtNet/$chr.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/galGal2/chrom.sizes \
mafNet/$chr.maf -tPrefix=hg17. -qPrefix=galGal2.
end
'EOF'
# << for emacs
csh makeMaf.csh >&! makeMaf.log &
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp mafNet /cluster/bluearc/hg17/mafNet/galGal2
# MAKE VSGALGAL2 DOWNLOADABLES (REDONE 9/13/04 angie)
# REDO axtNet's to fix overlaps (2005-09-12 kate)
ssh eieio
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain
gzip -c all.chain > /cluster/data/hg17/zip/chicken.chain.gz
gzip -c human.net > /cluster/data/hg17/zip/chicken.net.gz
mkdir /cluster/data/hg17/zip/axtNet
foreach f (axtNet/chr*axt)
gzip -c $f > /cluster/data/hg17/zip/$f.gz
end
# Doh! above for loop didn't work because all axt's have been removed
# from this dir! :| Just this once, regenerate compressed axtNet on
# the fly:
ssh kolossus
cd /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChain/net
foreach f (*.net)
set chr = $f:t:r
echo $chr
netToAxt $f ../chain/$chr.chain /cluster/data/hg17/nib \
/cluster/data/galGal2/nib stdout \
| axtSort stdin stdout \
| gzip -c > /cluster/data/hg17/zip/axtNet/$chr.axt.gz
end
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsGalGal2
cd /usr/local/apache/htdocs/goldenPath/hg17/vsGalGal2
mv /cluster/data/hg17/zip/chicken*.gz .
mv /cluster/data/hg17/zip/axtNet .
md5sum *.gz */*.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# REDO axtNet downloads to fix overlaps (2005-09-13 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.galGal2/axtNet
nice gzip *.axt
md5sum *.axt.gz > md5sum.txt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/vsGalGal2
mv axtNet axtNet.old
ln -s /cluster/data/hg17/bed/blastz.galGal2/axtNet .
# 8-WAY MULTIZ MULTIPLE ALIGNMENT WITH MM5 (DONE 2004-07-13 kate)
# Redo, below to fix overlapping alignments (2005-08-16 kate)
ssh eieio
set multizDir = multiz.2004-07-13
set workingDir = /cluster/bluearc/hg17/$multizDir
ln -s $workingDir /cluster/bluearc/hg17/multiz8way
mkdir -p $workingDir
mkdir -p /cluster/data/hg17/bed/$multizDir
cd /cluster/data/hg17/bed/$multizDir
# wrapper script for multiz
    # NOTE: first arg is pairwise, 2nd arg is multiple (to add to)
    # NOTE: next time, modify the script so it needs only one arg --
    # save the multiple dirname in a file for use by the next run
    # (a sketch of such a wrapper follows below)
cat << 'EOF' > doMultiz.csh
#!/bin/csh -fe
mkdir -p $3:h
/cluster/bin/penn/multiz $1 $2 - > $3
'EOF'
# << for emacs
cat << 'EOF' > gsub
#LOOP
../doMultiz.csh {check in line /cluster/bluearc/hg17/multiz.2004-07-13/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/hg17/multiz.2004-07-13/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/hg17/multiz.2004-07-13/$(root1)$(dir1)/$(root2).maf}
#ENDLOOP
'EOF'
# << for emacs
chmod +x doMultiz.csh
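    # A sketch of that one-arg wrapper (hypothetical and untested;
    # prevDir.txt is an invented state file holding the current
    # multiple-alignment dirname, seeded with the first pairwise db):
cat << 'EOF' > doMultiz1.csh
#!/bin/csh -fe
# $1 is a pairwise maf, e.g. rn3/chr1.maf
set pair = $1
set prev = `cat prevDir.txt`        # e.g. panTro1mm5
set out = $prev$pair:h              # e.g. panTro1mm5rn3
mkdir -p $out
/cluster/bin/penn/multiz $pair $prev/$pair:t - > $out/$pair:t
'EOF'
    # << for emacs
    # after each species' run completes, echo the new dirname into
    # prevDir.txt for the next round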
ssh eieio
set workingDir = /cluster/bluearc/hg17/multiz.2004-07-13
# copy mafs to bluearc -- chimp
mkdir $workingDir/panTro1
cp /cluster/data/hg17/bed/blastz.panTro1/mafNet/*.maf \
$workingDir/panTro1
ls $workingDir/panTro1/*.maf > chrom.lst
# mouse
mkdir $workingDir/mm5
cp /cluster/data/hg17/bed/blastz.mm5/mafNet/chr*.maf $workingDir/mm5
# rat
mkdir $workingDir/rn3
cp /cluster/data/hg17/bed/blastz.rn3/mafNet/chr*.maf $workingDir/rn3
# dog
mkdir $workingDir/canFam1
foreach f (/cluster/data/hg17/bed/blastz.canFam1.2004-07-08/mafNet/chr*.maf)
set c = $f:r:r:t
echo $c
cp $f $workingDir/canFam1/$c.maf
end
# chicken
mkdir $workingDir/galGal2
foreach f (/cluster/data/hg17/bed/blastz.galGal2/mafNet/chr*.maf)
set c = $f:r:r:t
cp $f $workingDir/galGal2/$c.maf
end
# fugu
mkdir $workingDir/fr1
cp /cluster/data/hg17/bed/blastz.fr1/mafNet/chr*.maf $workingDir/fr1
# zebrafish
mkdir $workingDir/danRer1
cp /cluster/data/hg17/bed/blastz.danRer1.swap/mafNet/chr*.maf \
$workingDir/danRer1
# first multiz - add in mm5 mouse to human/chimp
#
ssh kki
set multizDir = multiz.2004-07-13
set workingDir = /cluster/bluearc/hg17/$multizDir
cd /cluster/data/hg17/bed/$multizDir
mkdir run.mm5
cd run.mm5
echo "mm5/panTro1" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 46 jobs
para try, check, push, check
# CPU time in finished jobs: 6620s 110.33m 1.84h 0.08d 0.000 y
# IO & Wait Time: 3685s 61.42m 1.02h 0.04d 0.000 y
# Average job time: 224s 3.73m 0.06h 0.00d
# Longest job: 819s 13.65m 0.23h 0.01d
# Submission to last job: 1474s 24.57m 0.41h 0.02d
cd ..
# rat
mkdir run.rn3
cd run.rn3
echo "rn3/panTro1mm5" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 46 jobs
para try, check, push, check
cd ..
# dog
mkdir run.canFam1
cd run.canFam1
echo "canFam1/panTro1mm5rn3" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 46 jobs
para try, check, push, check
cd ../
# chicken
mkdir run.galGal2
cd run.galGal2
echo "galGal2/panTro1mm5rn3canFam1" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
# no alignment file for chr18_random -- create one so we can create jobList
touch $workingDir/galGal2/chr18_random.maf
para create jobList
# 46 jobs
para try, check, push, check
# 1 crashed job for empty file chr18_random
cd ..
# fugu
mkdir run.fr1
cd run.fr1
echo "fr1/panTro1mm5rn3canFam1galGal2" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
# create empty alignment file for missing one (no alignments)
touch /cluster/bluearc/hg17/multiz.2004-07-13/fr1/chr6_hla_hap1.maf
para create jobList
# 46 jobs
para try, check, push, check
# 1 crashed job for empty file chr6_hla_hap1
cd ..
# zebrafish
mkdir run.danRer1
cd run.danRer1
echo "danRer1/panTro1mm5rn3canFam1galGal2fr1" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 46 jobs
para try, check, push, check
cd ..
# copy 8-way mafs to build directory
ssh eieio
set multizDir = multiz.2004-07-13
set workingDir = /cluster/bluearc/hg17/$multizDir
ln -s $workingDir/panTro1mm5rn3canFam1galGal2fr1danRer1 $workingDir/maf
cd /cluster/data/hg17/bed/multiz.2004-07-13
mkdir maf
cp $workingDir/maf/*.maf maf
# copy to download area (2004-07-27 angie)
# moved gzipped files to mafDownload dir and recreated symlinks
# (2006-04-23 kate)
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
# gzipped & copied maf files from /cluster/data/hg17/bed/multiz8way/maf
    # dumped table and gzipped for download (user request, after the
    # files were removed when the track was replaced by 18way).
cd /cluster/data/hg17/bed/multiz8way/mafDownloads
hgsqldump --all -c --tab=. hg17 multiz8way
ssh kkstore02 \
'gzip /cluster/data/hg17/bed/multiz8way/mafDownloads/multiz8way.{sql,txt}'
ln -s /cluster/data/hg17/bed/multiz8way/mafDownloads/multiz8way.{sql,txt}.gz \
/usr/local/apache/htdocs/goldenPath/hg17/multiz8way
# load summary table (2005-09-27)
cd /cluster/data/hg17/bed/multiz.2004-07-13/maf
time cat chr*.maf | hgLoadMafSummary hg17 multiz8waySummary stdin
# 30 minutes ?
# NOTE: this didn't improve track display time at 5MB, so
# I'm leaving out of trackDb (sticking with pairwise maf's) for now
# It may be that this helps performance only with larger numbers
# of species.
# Create upstream files for download (2004-09-13 kate)
ssh hgwdev
cd /cluster/data/hg17/bed/multiz8way
echo hg17 panTro1 mm5 rn3 canFam1 galGal2 fr1 danRer1 > org.txt
# mafFrags takes a while
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
featureBits hg17 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
mafFrags hg17 multiz8way up.bed upstream$i.maf -orgs=org.txt
rm up.bed
end
ssh eieio
cd /cluster/data/hg17/bed/multiz8way
nice gzip upstream{1000,2000,5000}.maf
# 6 mins.
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
ln -s mzPt1Mm5Rn3Cf1Gg2Fr1Dr1 multiz8way
mv /cluster/data/hg17/bed/multiz8way/upstream*.maf.gz multiz8way
# PHYLO-HMM (PHASTCONS) CONSERVATION FOR 8-WAY WITH MM5 (DONE 2004-07-20 kate)
# (this was partially redone by acs using the new phastCons, 08-28;
# I've tried to merge the two sets of docs into one cohesive
# description)
# More revisions, acs, 09-13
ssh eieio
set path = ($path /cluster/bin/phast)
cd /cluster/data/hg17/bed/multiz.2004-07-13
mkdir cons
cd cons
#break up the genome-wide MAFs into pieces
mkdir /cluster/bluearc/hg17/chrom
cd /cluster/data/hg17
foreach f (`cat chrom.lst`)
echo $f
cp $f/*.fa /cluster/bluearc/hg17/chrom
end
ssh kki
cd /cluster/data/hg17/bed/multiz.2004-07-13/cons
mkdir run.split
cd run.split
set WINDOWS = /cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.sh
#!/bin/sh
PHAST=/cluster/bin/phast
FA_SRC=/cluster/bluearc/hg17/chrom
WINDOWS=/cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS
maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg17,panTro1,mm5,rn3,canFam1,galGal2,fr1,danRer1 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -B 5000
[ $? -eq 0 ] || exit 1
echo "Copying..."
cd /scratch/msa_split
for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
[ $? -eq 0 ] || exit 1
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
echo "Done" >> ${WINDOWS}/$c.done
'EOF'
# << for emacs
chmod +x doSplit.sh
rm -f jobList
foreach file (/cluster/bluearc/hg17/multiz.2004-07-13/maf/*.maf)
set c = $file:t:r
echo "doSplit.sh $file {check out line+ $WINDOWS/$c.done}" >> jobList
end
para create jobList
# 46 jobs
para try
para check
para push
# 2 crashed jobs -- due to no alignments in input maf
# chr18_random, chr6_hla_hap1
cd ..
# now generate conservation scores and predicted elements
ssh hgwdev
cd /cluster/data/hg17/bed/multiz.2004-07-13/cons
mkdir run.elements
# despite the name, I've put the elements and the new conservation
# scores here
# first produce a rough starting model; in this case, we can just
# use the model previously estimated (see the entry below on PHYLOFIT/PHASTCONS)
cp /cluster/bluearc/hg17/multiz.2004-07-13/panTro1mm5rn3canFam1/hg17panTro1rn3mm5canFam1galGal2fr1danRer1.mod starting-tree.mod
# In other cases, it would be sufficient to choose an arbitrary
# input file from the WINDOWS directory (choose one with plenty of
# data, i.e., large NTUPLES) and run phyloFit on it with the
# correct tree topology, e.g.,
# phyloFit -i SS datafile.ss --tree \
# "(((((hg17,panTro1),(mm5,rn3)),canFam1),galGal2),(fr1,danRer1))" \
# --out-root starting-tree
# Get genome-wide average GC content (for all species together,
# not just the reference genome). If you have a globally
# estimated tree model, as above, you can get this from the
# BACKGROUND line in the .mod file. E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.294633 0.205082 0.205189 0.295097
# This implies a GC content of 0.205 + 0.205 = 0.410
# If you do *not* have a global tree model and you do not know
# your GC content, you can get it directly from the MAFs with
# a command like:
# msa_view --aggregate hg17,panTro1,rn3,mm5,canFam1,galGal2,danRer1,fr1 \
# -i MAF --summary-only /cluster/data/hg17/bed/multiz.2004-07-13/maf/chr*.maf\
# > maf_summary.txt
# This will take a little while (30-60 min). Run on eieio.
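    # A quick check of that arithmetic (a sketch; assumes the
    # BACKGROUND frequencies are ordered A C G T as shown above):
    awk '/^BACKGROUND:/ {printf "GC: %.3f\n", $3 + $4}' starting-tree.mod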
# now set up cluster job to estimate model parameters. Parameters
# will be estimated separately for each alignment fragment then
# will be combined across fragments
cat << 'EOF' > doEstimate.sh
#!/bin/sh
zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.410 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 12 --target-coverage 0.17 --quiet --log $2 --estimate-trees $3
EOF
# Be sure to substitute in the right G+C content. Also, notice the
# target coverage of 0.17. We actually want 5% coverage here but
# the final (posterior) coverage is only indirectly related to the
# expected (prior) coverage. One thing to consider is that we
# only have about 40% alignment coverage (excluding chimp, which
# doesn't help us much in identifying conserved regions). As far
# as phastCons is concerned, we want to aim for about 0.05 / 0.4 =
# 0.125 coverage. In this case, though, --target-coverage
# 0.125 resulted in only about 4.1% coverage. I had to iterate
# a couple of times (using only chromosome 1) to find a value that
# got me close to the target of 5%
chmod u+x doEstimate.sh
rm -fr LOG TREES
mkdir -p LOG TREES
rm -f jobs.lst
# watch out: bash assumed below in a few places
for f in /cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS/*.ss.gz ; do \
root=`basename $f .ss.gz` ;\
echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;\
done
# run cluster job
ssh kk, cd /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements, para create, ...
# takes about an hour
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ls TREES/*.cons.mod > cons.txt
phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
ls TREES/*.noncons.mod > noncons.txt
phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
# look over the files cons_summary.txt and noncons_summary.txt.
# The means and medians should be roughly equal and the stdevs
# should be reasonably small compared to the means, particularly
# for rate matrix parameters (at bottom) and for branches to the
# leaves of the tree. The stdevs may be fairly high for branches
# near the root of the tree; that's okay. Some min values may be
# 0 for some parameters. That's okay, but watch out for very large
# values in the max column, which might skew the mean. If you see
# any signs of bad outliers, you may have to track down the
# responsible .mod files and throw them out. I've never had to do
# this; the estimates generally seem pretty well behaved.
# NOTE: Actually, a random sample of several hundred to a thousand
# alignment fragments (say, a number equal to the number of
# available cluster nodes) should be more than adequate for
# parameter estimation. If pressed for time, use this strategy.
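    # A sketch of that sampling strategy (awk keeps roughly 1 fragment
    # in 10; adjust the fraction to get the sample size you want):
    awk 'BEGIN{srand()} rand() < 0.1' jobs.lst > jobs.sample.lst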
# Now we are ready to set up the cluster job for computing the
# conservation scores and predicted elements. It's all downhill
# from here.
cat << 'EOF' > doPhastCons.sh
#!/bin/sh
mkdir -p /cluster/bluearc/hg17/phastCons/POSTPROBS /cluster/bluearc/hg17/phastCons/ELEMENTS
pref=`basename $1 .ss.gz`
chr=`echo $pref | awk -F\. '{print $1}'`
tmpfile=/scratch/phastCons.$$
zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 12 --target-coverage 0.17 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/hg17/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/hg17/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
EOF
chmod u+x doPhastCons.sh
rm -fr /cluster/bluearc/hg17/phastCons/POSTPROBS /cluster/bluearc/hg17/phastCons/ELEMENTS
rm -f jobs2.lst
for f in /cluster/bluearc/hg17/multiz.2004-07-13/cons/WINDOWS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs2.lst ; done
# run cluster job
ssh kk, cd /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements, para create, ...
logout
# takes about 20 minutes
# combine predictions and transform scores to be in 0-1000 interval
# do in a way that avoids limits on numbers of args
find /cluster/bluearc/hg17/phastCons/ELEMENTS -name "*.bed" > files
rm -f splitfiles* all.raw.bed
split files splitfiles
for s in splitfiles* ; do awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' `cat $s` >> all.raw.bed ; done
/cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
rm files splitfiles*
hgLoadBed hg17 phastConsElements all.bed
hgLoadBed -chrom=chr1 hg17 phastConsElements all.bed
# check coverage
featureBits hg17 phastConsElements
#137850739 bases of 2866216770 (4.810%) in intersection
# This should be close enough. If necessary, you can rerun the
# steps above with a different target coverage. When hitting the
# target is important, you may want to perform several iterations
# using a representative subset of the entire dataset (human chr1
# seems to work pretty well)
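    # A sketch of one such chr1-only iteration (list name is arbitrary;
    # rerun doPhastCons.sh with an adjusted --target-coverage, reload
    # the chr1 elements, then measure):
    grep '/chr1\.' jobs2.lst > jobs2.chr1.lst
    # ... run on the cluster, hgLoadBed -chrom=chr1 as above, then:
    featureBits -chrom=chr1 hg17 phastConsElements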
# set up wiggle
mkdir -p /cluster/bluearc/hg17/phastCons/wib
cat << 'EOF' > doWigAsciiToBinary.sh
#!/bin/sh
chr=$1
zcat `ls /cluster/bluearc/hg17/phastCons/POSTPROBS/$chr.*.pp.gz | sort -t\. -k2,2n` | wigAsciiToBinary -chrom=$chr -wibFile=/cluster/bluearc/hg17/phastCons/wib/${chr}_phastCons stdin
EOF
chmod u+x doWigAsciiToBinary.sh
rm -f jobs3.lst
for chr in `ls /cluster/bluearc/hg17/phastCons/POSTPROBS | awk -F\. '{print $1}' | sort -u` ; do echo doWigAsciiToBinary.sh $chr >> jobs3.lst ; done
# run a little wigAsciiToBinary cluster job
ssh kk, etc.
# copy wibs and wigs from bluearc
rsync -av /cluster/bluearc/hg17/phastCons/wib .
# load track
hgLoadWiggle hg17 phastCons -pathPrefix=/gbdb/hg17/phastCons/wib \
wib/chr*_phastCons.wig
mkdir -p /gbdb/hg17/phastCons/wib
rm -f /gbdb/hg17/phastCons/wib/chr*phastCons.wib
ln -s /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements/wib/*.wib /gbdb/hg17/phastCons/wib
chmod 775 . wib /gbdb/hg17/phastCons /gbdb/hg17/phastCons/wib
chmod 664 wib/*.wib
# move postprobs over and clean up bluearc
rsync -av /cluster/bluearc/hg17/phastCons/POSTPROBS .
# (people sometimes want the raw scores)
rm -r /cluster/bluearc/hg17/phastCons/ELEMENTS /cluster/bluearc/hg17/phastCons/POSTPROBS /cluster/bluearc/hg17/phastCons/wib
# set up full alignment/conservation track ("multiz8way")
# load multiz maf tables
ssh hgwdev
cd /cluster/data/hg17/bed/multiz.2004-07-13
set mafDir = /gbdb/hg17/multiz8way/maf
set table = multiz8way
mkdir -p $mafDir/$table
ln -s `pwd`/maf/*.maf $mafDir/$table
cd maf
    hgLoadMaf hg17 -warn multiz8way -pathPrefix=$mafDir/$table
    # someone dropped this table from hgwdev
# reload (2007-03-19 kate)
nice hgLoadMaf hg17 -warn multiz8way -pathPrefix=/gbdb/hg17/multiz8wayFixed
cat /gbdb/hg17/multiz8wayFixed/*.maf | \
nice hgLoadMafSummary hg17 -minSize=30000 -mergeGap=1500 -maxSize=200000 \
multiz8waySummary stdin
# load blastz maf tables
# TODO: change mafWiggle to use db names instead of species names
# in speciesOrder
# link files into /gbdb table dir
ln -s /cluster/data/hg17/bed/blastz.panTro1/mafNet $mafDir/chimp_netBlastz
ln -s /cluster/data/hg17/bed/blastz.mm5/mafNet $mafDir/mouse_netBlastz
ln -s /cluster/data/hg17/bed/blastz.rn3/mafNet $mafDir/rat_netBlastz
ln -s /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/mafNet $mafDir/dog_netBlastz
ln -s /cluster/data/hg17/bed/blastz.galGal2/mafNet $mafDir/chicken_netBlastz
ln -s /cluster/data/hg17/bed/blastz.fr1/mafNet $mafDir/fugu_netBlastz
ln -s /cluster/data/hg17/bed/blastz.danRer1.swap/mafNet $mafDir/zebrafish_netBlastz
# remove empty file, disliked by hgLoadMaf
# NOTE: these shouldn't be empty -- next time, make sure previous
# alignments are copied over to output maf (multiz won't if there's
# an empty input file).
rm chicken/chr18_random.maf
rm fugu/chr6_hla_hap1.maf
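    # A sketch of that suggested guard (hypothetical): pass the
    # multiple alignment through untouched when the pairwise maf is
    # empty, instead of writing an empty output.
cat << 'EOF' > doMultizGuard.csh
#!/bin/csh -fe
mkdir -p $3:h
if (-z $1) then
    cp $2 $3
else
    /cluster/bin/penn/multiz $1 $2 - > $3
endif
'EOF'
    # << for emacs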
# load tables
foreach s (chimp mouse rat dog chicken fugu zebrafish)
set table = ${s}_netBlastz
echo "$s $mafDir/$table"
~kate/bin/i386/hgLoadMaf hg17 -warn ${s}_netBlastz -pathPrefix=$mafDir/$table
end
# trackDb entry:
# track multiz8way
# shortLabel Conservation
# longLabel Chimp/Mouse/Rat/Dog/Chicken/Fugu/Zebrafish Multiz Alignments & Conservation
# group compGeno
# priority 149
# visibility pack
    # color 0, 10, 100
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# wiggle phastCons
# yLineOnOff Off
# autoScaleDefault Off
# pairwise netBlastz
# speciesOrder chimp mouse rat dog chicken fugu zebrafish
# PHASTCONS SCORES DOWNLOADABLES (REDONE 6/15/05 angie)
# Initially done 10/11/04, but using scores from run.cons -- which
# had been replaced by scores in run.elements, where I did not think
# to look for scores. :( !
ssh eieio
mkdir /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
cd /cluster/data/hg17/bed/multiz8way/cons/run.elements/POSTPROBS
foreach chr (`awk '{print $1;}' /cluster/data/hg17/chrom.sizes`)
echo $chr
nice zcat `ls -1 $chr.*.pp.gz | sort -t\. -k2,2n` \
| nice gzip -c \
> /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1/$chr.gz
end
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg17/phastCons
# Doh! /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1 is 11G now --
# too much to dump on hgwdev's / which is at 94%. So don't do this:
#mv /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1 .
# make symbolic links instead:
mkdir /usr/local/apache/htdocs/goldenPath/hg17/phastCons/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
cd /usr/local/apache/htdocs/goldenPath/hg17/phastCons/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1
ln -s /cluster/data/hg17/zip/mzPt1Mm5Rn3Cf1Gg2Fr1Dr1/* .
md5sum *.gz > md5sum.txt
# make a README.txt.
# PHYLOFIT AND TREE-DOCTOR FOR 8-WAY: ESTIMATE PHYLOGENETIC TREE (acs)
# (This was originally done for phastCons but is not necessary with
# the new version. However, it may be useful for other purposes, so
# I'm leaving it in as a separate entry.)
# first estimate a model for the mammals
ssh eieio
cd /cluster/bluearc/hg17/multiz.2004-07-13/panTro1mm5rn3canFam1
# collect sufficient stats (takes maybe an hour)
for file in *.maf ; do echo $file ; msa_view -i MAF $file -o SS --order hg17,panTro1,rn3,mm5,canFam1 > `basename $file .maf`.ss ; done
ls *.ss | grep -v chr6_hla_hap2 > files
msa_view '*files' --aggregate hg17,panTro1,rn3,mm5,canFam1 -i SS -o SS > all.ss
# BTW, this can now be done in one step using something like:
# msa_view --aggregate hg17,panTro1,rn3,mm5,canFam1 -i MAF -o SS *.maf > all.ss
# (modify to exclude certain files if necessary)
# estimate model, with rate variation (takes about a minute)
phyloFit all.ss --nrates 10 --tree "(((hg17,panTro1),(rn3,mm5)),canFam1)" --alpha 4.4 --EM --log log -i SS --out-root hprmc-rev-dg
# (Actually, --nrates 4 should be more than adequate for most purposes)
cat hprmc-rev-dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.658942
#TRAINING_LNL: -6889216721.159384
#BACKGROUND: 0.294633 0.205082 0.205189 0.295097
#RATE_MAT:
# -0.865237 0.159990 0.554805 0.150442
# 0.229851 -1.194646 0.168269 0.796526
# 0.796651 0.168182 -1.194919 0.230086
# 0.150205 0.553556 0.159985 -0.863747
#TREE: (((1:0.006523,2:0.007997):0.103779,(3:0.104867,4:0.078911):0.265676):0.112364,5:0.112364);
# now extrapolate to fish and chicken using tree_doctor and the CFTR 25 tree
# (replace numbers with names in hprmc-rev-dg.mod; this won't be necessary in the future)
tree_doctor --rename "1->hg17;2->panTro1;3->rn3;4->mm5;5->canFam1" hprmc-rev-dg.mod > hprmc-rev-dg.names.mod
# (obtain 8-way subtree from cftr25_hybrid.nh; also map names as necessary to match above)
tree_doctor /cluster/data/nisc/targets/cftr/phyloHMMcons25/cftr25_hybrid.nh --prune-all-but hg16,chimp,mm3,rn3,dog,chicken,fr1,zfish --rename "hg16->hg17;mm3->mm5;chimp->panTro1;dog->canFam1;chicken->galGal2;zfish->danRer1" > cftr8way.nh
# now merge (see tree_doctor help page for explanation)
tree_doctor hprmc-rev-dg.names.mod --merge cftr8way.nh > hg17panTro1rn3mm5canFam1galGal2fr1danRer1.mod
cat hg17panTro1rn3mm5canFam1galGal2fr1danRer1.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.658942
#BACKGROUND: 0.294633 0.205082 0.205189 0.295097
#RATE_MAT:
# -0.865237 0.159990 0.554805 0.150442
# 0.229851 -1.194646 0.168269 0.796526
# 0.796651 0.168182 -1.194919 0.230086
# 0.150205 0.553556 0.159985 -0.863747
#TREE: (((((hg17:0.006523,panTro1:0.007997):0.103779,(rn3:0.104867,mm5:0.078911):0.265676):0.019461,canFam1:0.205267):0.377150,galGal2:0.511134):0.536627,(danRer1:0.905323,fr1:0.922995):0.536627);
# CONSERVED NON-CODING (CNS) TRACK (acs 08/29/04)
# (depends on phastConsElements)
cd /cluster/data/hg17/bed/multiz.2004-07-13/cons/run.elements
featureBits hg17 -bed=possibleCoding.bed -or twinscan:exon xenoMrna mrna intronEst
# (add SGP, exoniphy, possib. others if available)
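# For example, if the sgpGene and exoniphy tables are available,
# the command would become something like (a sketch, not what was
# actually run):
# featureBits hg17 -bed=possibleCoding.bed \
#     -or twinscan:exon sgpGene:exon exoniphy xenoMrna mrna intronEst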
# now filter out all phastCons elements that overlap possible coding regions
overlapSelect -nonOverlapping possibleCoding.bed all.bed cns.bed
hgLoadBed hg17 cns cns.bed
# track cns
# shortLabel CNS
# longLabel Conserved Non-Coding (Cons Elements Minus Predicted Coding)
# priority 109.11
# group compGeno
# visibility hide
# type bed 5 .
# PRODUCING GENSCAN PREDICTIONS (DONE - 2004-07-08 - Hiram)
# Needed to download a new binary for this run -- our Linux systems
# have been updated since the last time and the old binary would
# not run.
# XXX - actually, it turned out a new binary was not needed. It was
# already here in our hg3rdParty CVS project. All of this can be
# simply fetched from cvs: cvs co hg3rdParty/genscanlinux
# To fetch it from the source instead, go to: http://genes.mit.edu/GENSCAN.html
# and then to: http://genes.mit.edu/license.html
# Fill in the license agreement and you can then pick up the
# README and the Linux version: genscanlinux.tar.uue.tgz
# To uudecode that file, go to one of the Solaris home machines
# and use the uudecode command:
# uudecode genscanlinux.tar.uue.tgz
# That produces the file: genscanlinux.tar
# Which contains the files:
# drwxr-xr-x chris/burgelab 0 2003-02-17 11:48:44 ./
# -rw-r--r-- chris/burgelab 219056 2000-09-07 12:39:26 ./Arabidopsis.smat
# -rw-r--r-- chris/burgelab 6622 2000-09-07 12:39:26 ./HUMRASH
# -rw-r--r-- chris/burgelab 849 2000-09-07 12:39:26 ./HUMRASH.sample
# -rw-r--r-- chris/burgelab 219050 2000-09-07 12:39:26 ./HumanIso.smat
# -rw-r--r-- chris/burgelab 155735 2000-09-07 12:39:26 ./Maize.smat
# -rw-r--r-- chris/burgelab 24465 2000-09-07 12:39:26 ./README
# -rw-r--r-- chris/burgelab 6344 2000-09-07 12:39:27 ./HUMRASH.ps
# -rwxr-xr-x chris/burgelab 126365 2003-02-17 11:48:44 ./genscan
#
# I placed these currently in: /cluster/home/hiram/GENSCAN/
# I'll check with Angie where it should properly live ...
# XXX - it already lives in 'cvs co hg3rdParty/genscanlinux'
# These instructions should simply check it out right here in
# bed/genscan and make the gsub command refer to these copies.
ssh hgwdev
mkdir /cluster/data/hg17/bed/genscan
cd /cluster/data/hg17/bed/genscan
cvs co hg3rdParty/genscanlinux
ssh eieio
cd /cluster/data/hg17/bed/genscan
# Make 3 subdirectories for genscan to put its output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the contigs
# *that are not pure Ns* (due to heterochromatin and other
# unsequenceable stuff), which would cause genscan to run forever.
rm -f genome.list
bash
for f in `cat /cluster/data/hg17/contig.lst`
do
egrep '[ACGT]' /cluster/data/hg17/$f.masked > /dev/null
if [ $? = 0 ]; then
echo /cluster/data/hg17/$f.masked >> genome.list
fi
done
# exit your bash shell if you are [t]csh ...
# This egrep matched all the contigs in hg17. I guess none of
# them are complete Ns* at this point.
# Log into kki (not kk!). kki is the driver node for the small
# cluster (kkr2u00-kkr8u00). Genscan has problems running on the
# big cluster due to the limited memory and swap space on each
# processing node.
ssh kki
cd /cluster/data/hg17/bed/genscan
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 379 of 380 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 79998s 1333.30m 22.22h 0.93d 0.003 y
# IO & Wait Time: 2989s 49.82m 0.83h 0.03d 0.000 y
# Average job time: 219s 3.65m 0.06h 0.00d
# Longest job: 2999s 49.98m 0.83h 0.03d
# Submission to last job: 8324s 138.73m 2.31h 0.10d
# Running the single failed job on kolossus with a smaller window:
/cluster/bin/x86_64/gsBig /cluster/data/hg17/5/NT_006576/NT_006576.fa.masked \
gtf/NT_006576.fa.gtf -trans=pep/NT_006576.fa.pep \
-subopt=subopt/NT_006576.fa.bed -exe=hg3rdParty/genscanlinux/genscan \
-par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000
# If there were out-of-memory problems (run "para problems"), then
# re-run those jobs by hand but change the -window arg from 2400000 to
# something lower. In build33, this was 22/NT_011519
# In build34 there were NO failures !
# Convert these to chromosome level files as so:
ssh eieio
cd /cluster/data/hg17/bed/genscan
$HOME/bin/i386/liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/N*.gtf
$HOME/bin/i386/liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft \
warn subopt/N*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/hg17/bed/genscan
ldHgGene hg17 genscan genscan.gtf
# 35 minute job
# Read 42807 transcripts in 325994 lines in 1 files
# 42807 groups 46 seqs 1 sources 1 feature types
hgPepPred hg17 generic genscanPep genscan.pep
# Processing genscan.pep
hgLoadBed hg17 genscanSubopt genscanSubopt.bed
# Reading genscanSubopt.bed
# Loaded 517157 elements of size 6
# Sorted
# Creating table definition for
# Saving bed.tab
# Loading hg17
# featureBits hg17 genscan
# 55323340 bases of 2866216770 (1.930%) in intersection
# featureBits hg16 genscan
# 55333689 bases of 2865248791 (1.931%) in intersection
# featureBits hg17 genscanSubopt
# 55986178 bases of 2866216770 (1.953%) in intersection
# featureBits hg16 genscanSubopt
# 56082952 bases of 2865248791 (1.957%) in intersection
# Should be essentially zero intersection with rmsk
# featureBits -chrom=chr1 hg17 genscan rmsk
# 794 bases of 222827847 (0.000%) in intersection
# EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 8/1/05 angie)
# Originally done 7/1/04 for canFam1 -- redone 8/1/05 for canFam2.
ssh kolossus
cd /san/sanvol1/scratch/hg17/rmsk
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Even though we already have the human-mouse linSpecReps,
# extractLinSpecReps requires two columns of DateRepsinRMoutput.pl
# additions. So add mouse, then ignore it.
# Dog in extra column 1, Mouse in extra column 2
foreach outfl ( *.out )
echo "$outfl"
/cluster/bluearc/RepeatMasker/DateRepeats \
${outfl} -query human -comp dog -comp mouse
end
# Now extract dog (extra column 1), ignore mouse.
cd ..
mkdir linSpecRep.notInDog
foreach f (rmsk/*.out_canis-familiaris_mus-musculus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInDog/$base.out.spec
end
# Clean up.
rm rmsk/*.out_canis*
rsync -av /san/sanvol1/scratch/hg17/linSpecRep.notInDog \
/cluster/bluearc/scratch/hg/gs.18/build35/
# Ask cluster-admin for an rsync.
# BLASTZ DOG (CANFAM1) (DONE 7/8/04 angie)
ssh kk
# space is awful tight on store4 -- use store7.
mkdir -p /cluster/store7/hg17/bed/blastz.canFam1.2004-07-08
ln -s /cluster/store7/hg17/bed/blastz.canFam1.2004-07-08 \
/cluster/data/hg17/bed/
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
# Use default (Human-Mouse) settings for starters.
cat << '_EOF_' > DEF
# human vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog
SEQ2_DIR=/scratch/hg/canFam1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.canFam1.2004-07-08
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# first cluster run: raw blastz alignments
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, check, push, check, ....
# Moving the human chr19 jobs up to the top of the jobList probably
# would have shaved 4 hours off the total time! It was almost done
# after 6 hours, except for a few chr19 stragglers.
#Completed: 93775 of 93775 jobs
#Average job time: 202s 3.37m 0.06h 0.00d
#Longest job: 17806s 296.77m 4.95h 0.21d
#Submission to last job: 35523s 592.05m 9.87h 0.41d
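# Next time, front-loading the chr19 jobs would be something like this
# (a sketch; parasol works through the batch roughly in list order):
# grep chr19 jobList > jobList.sorted
# grep -v chr19 jobList >> jobList.sorted
# mv jobList.sorted jobList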
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 341 of 341 jobs
#Average job time: 36s 0.61m 0.01h 0.00d
#Longest job: 302s 5.03m 0.08h 0.00d
#Submission to last job: 1143s 19.05m 0.32h 0.01d
# third run: lav -> axt
# (if non-default BLASTZ_Q is used in the future, put axtRescore in
# the pipe after lavToAxt)
ssh kki
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
/iscratch/i/gs.18/build35/bothMaskedNibs /iscratch/i/canFam1/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 46 of 46 jobs
#Average job time: 300s 5.00m 0.08h 0.00d
#Longest job: 1669s 27.82m 0.46h 0.02d
#Submission to last job: 1689s 28.15m 0.47h 0.02d
# CHAIN DOG BLASTZ (DONE 7/9/04 angie)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
/iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/canFam1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 46 of 46 jobs
#Average job time: 266s 4.43m 0.07h 0.00d
#Longest job: 3578s 59.63m 0.99h 0.04d
#Submission to last job: 3578s 59.63m 0.99h 0.04d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=10000 /tmp/score.$f:t:r
echo ""
end
# Lots of chaff with scores in the 3000's. Many very-high-scoring
# chains. So filter the chain down somewhat...
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
rm chain/*
chainSplit chain all.chain
gzip all.chain.unfiltered
# Load chains into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg17 ${c}_chainCanFam1 $i
end
# Coverage is significantly higher than mouse:
featureBits hg17 -chrom=chr1 chainCanFam1Link
#123999291 bases of 222827847 (55.648%) in intersection
# before filtering: 124750124 bases of 222827847 (55.985%) in intersection
featureBits hg17 -chrom=chr1 chainMm5Link
#83773012 bases of 222827847 (37.595%) in intersection
# NET DOG BLASTZ (DONE 7/9/04 angie)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
netClass noClass.net hg17 canFam1 dog.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn dog.net > dogSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
netFilter -minGap=10 dog.net | hgLoadNet hg17 netCanFam1 stdin
netFilter -minGap=10 dogSyn.net | hgLoadNet hg17 syntenyNetCanFam1 stdin
# Add entries for chainCanFam1, netCanFam1 to human/hg17 trackDb
# MAKE VSCANFAM1 DOWNLOADABLES (DONE 9/17/04 kate)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
ln -s all.chain dog.chain
mkdir gz
cd gz
gzip -c ../dog.chain > dog.chain.gz
gzip -c ../dog.net > dog.net.gz
gzip ../dogSyn.net > dogSyn.net.gz
# Angie's notes...
# Mike Zody asked for raw blastz in chain format, so figure out some
# way to translate axt or psl to chain and put it out there.
# Actually, it's probably just hg16-canFam1 that he wants for now -- ?
# Ask when we get to this point.
cd ../axtNet
time gzip *.axt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p vsCanFam1
cd vsCanFam1
mv /cluster/data/hg17/bed/blastz.canFam1/axtChain/gz/*.gz .
md5sum *.gz > md5sum.txt
mkdir -p axtNet
cd axtNet
cp /cluster/data/hg17/bed/blastz.canFam1/axtNet/*.axt.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# REDO downloads of axtNet's to fix overlaps (2005-09-13 kate)
# Finally, replace bad chr5 files (2006-01-05 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.canFam1/axtNet
nice gzip *.axt
md5sum *.axt.gz > md5sum.txt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/vsCanFam1
mv axtNet axtNet.old
ln -s /cluster/data/hg17/bed/blastz.canFam1/axtNet .
# GENERATE CANFAM1 MAF FOR MULTIZ FROM NET (DONE 7/9/04 angie)
# Redo net axt's and maf's to fix overlaps (use 8/5 netToAxt)
# (2005-08-16 kate)
# and replace bad chr5 files (2006-01-05 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08/axtChain
netSplit dog.net net
cd /cluster/data/hg17/bed/blastz.canFam1.2004-07-08
mkdir axtNet mafNet
cat > makeMaf.csh << 'EOF'
foreach f (axtChain/net/*)
set chr = $f:t:r
echo $chr
netToAxt $f axtChain/chain/$chr.chain /cluster/data/hg17/nib \
/cluster/data/canFam1/nib stdout \
| axtSort stdin axtNet/$chr.axt
axtToMaf axtNet/$chr.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/canFam1/chrom.sizes \
mafNet/$chr.maf -tPrefix=hg17. -qPrefix=canFam1.
end
'EOF'
csh makeMaf.csh >&! makeMaf.log &
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp mafNet /cluster/bluearc/hg17/mafNet/canFam1
# BLASTZ MM5 (DONE - 2004-06-22 - Hiram)
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.mm5.2004-07-01
cd /cluster/data/hg17/bed
ln -s blastz.mm5.2004-07-01 blastz.mm5
cd blastz.mm5
cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInRat
# notInRat OK as it is identical to notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm5/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm5/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm5/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.mm5
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastz.mm5
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# Completed: 44330 of 44330 jobs
# CPU time in finished jobs: 16250628s 270843.80m 4514.06h 188.09d 0.515 y
# IO & Wait Time: 387936s 6465.60m 107.76h 4.49d 0.012 y
# Average job time: 375s 6.26m 0.10h 0.00d
# Longest job: 4417s 73.62m 1.23h 0.05d
# Submission to last job: 43754s 729.23m 12.15h 0.51d
# Second cluster run to convert the .out's to .lav's. You do NOT want
# to run this on the big cluster -- it brings the file server to its
# knees. Run this on the small cluster.
ssh kki
cd /cluster/data/hg17/bed/blastz.mm5
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 2189s 36.48m 0.61h 0.03d 0.000 y
# IO & Wait Time: 7714s 128.57m 2.14h 0.09d 0.000 y
# Average job time: 29s 0.48m 0.01h 0.00d
# Longest job: 165s 2.75m 0.05h 0.00d
# Submission to last job: 830s 13.83m 0.23h 0.01d
# Third cluster run to convert lav's to axt's
# Does not work on kki since /scratch on the iservers is not the
# same as /scratch on the other clusters.
ssh kk
cd /cluster/data/hg17/bed/blastz.mm5
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 1638s 27.30m 0.46h 0.02d 0.000 y
# IO & Wait Time: 12068s 201.13m 3.35h 0.14d 0.000 y
# Average job time: 305s 5.08m 0.08h 0.00d
# Longest job: 1124s 18.73m 0.31h 0.01d
# Submission to last job: 2519s 41.98m 0.70h 0.03d
# chr19 takes too long, the axtSort becomes too large and the poor
# node ends up swapping forever. When you are down to that last
# job running, stop it and go to kolossus.
# Adjusting the location of the nib directories, and fixing the
# MACHTYPE on the commands in the blastz script:
ssh kolossus
cd /cluster/data/hg17/bed/blastz.mm5
sed -e "s/i386/x86_64/g" /cluster/bin/scripts/blastz-chromlav2axt > \
x86_64-chromlav2axt
chmod +x x86_64-chromlav2axt
time ./x86_64-chromlav2axt \
/cluster/data/hg17/bed/blastz.mm5/lav/chr19 \
/cluster/data/hg17/bed/blastz.mm5/axtChrom/chr19.axt \
/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs \
/cluster/bluearc/scratch/mus/mm5/softNib
# real 7m41.719s
# user 2m2.850s
# sys 0m23.070s
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5
mkdir -p pslChrom
set tbl = "blastzMm5"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# This takes more than an hour. You can shorten it by changing the
# axtToPsl command to a simple echo, putting the results into a file,
# splitting the file into four parts, and running the four parts as
# shell scripts on eieio to have four processes running at the same
# time (see the sketch below). Load on eieio gets up to about 20,
# which is reasonable.
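# That four-way split would look something like this (a sketch, not
# what was actually run):
# bash
# for f in axtChrom/chr*.axt; do
#     c=`basename $f .axt`
#     echo "/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_blastzMm5.psl"
# done > allJobs.sh
# split -l 12 allJobs.sh part.
# for p in part.*; do
#     sh $p > $p.log 2>&1 &
# done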
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/pslChrom
bash # for tcsh users
for F in chr*_blastzMm5.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${F}
echo "${F} done"
done
# this is a 40 minute job
# exit bash if you are tcsh
# featureBits on blastzMm3 or 4 will not work on hgwdev -- it runs
# out of memory. But if you reset your ~/.hg.conf to use the
# read-only user and point it at the hgwdev database host, then you
# can run the x86_64 featureBits (e.g. on kolossus).
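# A minimal read-only ~/.hg.conf for that would look something like
# this (a sketch; fill in the actual read-only account):
# db.host=hgwdev
# db.user=<read-only user>
# db.password=<read-only password>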
# featureBits hg16 blastzMm5
# 1056761609 bases of 2865248791 (36.882%) in intersection
# featureBits hg17 blastzMm5
# 1052077141 bases of 2866216770 (36.706%) in intersection
# featureBits hg17 blastzMm4
# 1056201417 bases of 2866216770 (36.850%) in intersection
# CHAIN MM5 BLASTZ (DONE - 2004-07-02 - Hiram)
# The axtChain is best run on the small kluster, or the kk9 kluster
ssh kki
mkdir -p /cluster/data/hg17/bed/blastz.mm5/axtChain/run1
cd /cluster/data/hg17/bed/blastz.mm5/axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.mm5/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# May need -minScore=5000 for all chroms if chr19 won't finish on kolossus
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/mus/mm5/softNib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
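# If -minScore=5000 becomes necessary, the axtChain line in doChain
# would change to something like (a sketch):
# axtChain -minScore=5000 $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
#     /iscratch/i/mus/mm5/softNib $2 > $3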
chmod a+x doChain
# 46 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 4856s 80.94m 1.35h 0.06d 0.000 y
# IO & Wait Time: 20083s 334.71m 5.58h 0.23d 0.001 y
# Average job time: 542s 9.04m 0.15h 0.01d
# Longest job: 2929s 48.82m 0.81h 0.03d
# Submission to last job: 2929s 48.82m 0.81h 0.03d
# now on the file server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# real 8m42.853s
# user 5m59.100s
# sys 0m40.320s
time chainSplit chain all.chain
# real 10m52.224s
# user 5m52.360s
# sys 0m34.870s
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/axtChain/chain
bash # for tcsh users
for i in *.chain
do
c=${i/.chain/}
hgLoadChain hg17 ${c}_chainMm5 $i
echo done $c
done
# exit bash if you are tcsh
# This is a 50 minute job
# featureBits hg17 chainMm5
# 2834490112 bases of 2866216770 (98.893%) in intersection
# featureBits hg17 chainMm4
# 2829135227 bases of 2866216770 (98.706%) in intersection
# featureBits hg16 chainMm4
# 2828363353 bases of 2865248791 (98.713%) in intersection
# NET MM5 (DONE - 2004-07-02 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
mkdir preNet
cd chain
bash # for tcsh users
for i in *.chain
do
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg17/chrom.sizes \
/cluster/data/mm5/chrom.sizes ../preNet/$i
done
# exit bash if you are tcsh
# 15 minute job
cd ..
mkdir n1
cd preNet
bash # for tcsh users
for i in *.chain
do
n=${i/.chain/}.net
echo primary netting $i $n
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg17/chrom.sizes \
/cluster/data/mm5/chrom.sizes ../n1/$n /dev/null
done
# exit bash if you are tcsh
# 9 minute job
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2546110464, utime 16327 s/100, stime 3546
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
time netClass hNoClass.net hg17 mm5 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/scratch/mus/mm5/linSpecRep.notInHuman
# real 16m38.098s
# user 11m38.490s
# sys 1m48.470s
# If things look good do
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
time netFilter -syn mouse.net > mouseSyn.net
# real 12m3.701s
# user 8m44.180s
# sys 1m1.610s
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg17 netMm5 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg17 syntenyNetMm5 stdin
# check results
# featureBits hg17 netMm5
# 2830625630 bases of 2866216770 (98.758%) in intersection
# featureBits hg17 netMm4
# 2824272033 bases of 2866216770 (98.537%) in intersection
# featureBits hg16 netMm5
# 2823565051 bases of 2865248791 (98.545%) in intersection
# featureBits hg17 syntenyNetMm5
# 2799194300 bases of 2866216770 (97.662%) in intersection
# featureBits hg17 syntenyNetMm4
# 2785830955 bases of 2866216770 (97.195%) in intersection
# featureBits hg16 syntenyNetMm5
# 2786960572 bases of 2865248791 (97.268%) in intersection
# Add entries for net and chain to mouse/hg17 trackDb
# make net
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
mkdir mouseNet
time netSplit mouse.net mouseNet
# real 11m45.243s
# user 8m48.490s
# sys 1m13.490s
# extract axt's from net, and convert to maf's
# NOTE: Redo the net axt's and maf's using 8/05 netToAxt
# in order to remove overlaps (2005-08-16 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
mkdir ../axtNet ../mafNet
cat > makeMaf.csh << '_EOF_'
#!/bin/csh -ef
foreach f (mouseNet/chr*.net)
set c = $f:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt mouseNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib /cluster/data/mm5/nib stdout | \
axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/mm5/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=mm5.
echo "Complete: $c.net -> axtNet/$c.axt -> mafNet/$c.maf"
end
'_EOF_'
# << for emacs
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/mm5
ssh hgwdev
mkdir /cluster/data/hg17/bed/blastz.mm5/axtBest
cd /cluster/data/hg17/bed/blastz.mm5/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/axtNet
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtNet
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtNet
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtNet
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)
# 32 minute gzip
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo -n "processing $c.axt -> ${c}_blastzBestMm5.psl ..."
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestMm5.psl
echo "Done"
end
# Load tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/pslBest
for I in chr*BestMm5.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# check results
# featureBits hg17 blastzBestMm5
# 1013348528 bases of 2866216770 (35.355%) in intersection
# featureBits hg17 blastzBestMm4
# 1017319919 bases of 2866216770 (35.493%) in intersection
# featureBits hg16 blastzBestMm5
# 996722004 bases of 2865248791 (34.787%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg17/axtBest/Mm5
cd /gbdb/hg17/axtBest/Mm5
ln -s /cluster/data/hg17/bed/blastz.mm5/axtNet/chr*.axt .
cd /cluster/data/hg17/bed/blastz.mm5/axtNet
rm -f axtInfoInserts.sql
foreach f (/gbdb/hg17/axtBest/Mm5/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('mm5','Blastz Best in Genome','$chr','$f');" \
>>! axtInfoInserts.sql
end
hgsql hg17 < ~/kent/src/hg/lib/axtInfo.sql
# table axtInfo may already exist, ignore create error.
hgsql hg17 < axtInfoInserts.sql
# REDO: replace downloadable axtNet's to remove overlaps (2005-09-12 kate)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5
mv axtNet axtNet.old
mkdir axtNet
cd axtNet
cp /cluster/data/hg17/bed/blastz.mm5/axtNet/*.axt .
nice gzip *.axt
md5sum *.axt.gz > md5sum.txt
# HG17 TO MM5 LIFTOVER CHAIN (DONE 1/6/05 Andy)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.mm5/axtChain
mkdir over
for file in chain/*.chain; do
chrom=`basename $file .chain`
netChainSubset mouseNet/$chrom.net chain/$chrom.chain over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain
done
rm -rf over/
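# The netChainSubset-and-concatenate pattern above repeats for each
# liftOver target below; a generic helper would look something like
# this (a sketch with a hypothetical script name, not what was run):
cat > makeOverChain.sh << '_EOF_'
#!/bin/bash
# usage: makeOverChain.sh netDir chainDir out.chain
netDir=$1; chainDir=$2; out=$3
rm -f $out
for net in $netDir/*.net; do
    chrom=`basename $net .net`
    # keep only the parts of each chrom's chains used in the net
    netChainSubset $net $chainDir/$chrom.chain stdout >> $out
done
_EOF_
chmod +x makeOverChain.sh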
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain .
gzip hg17ToMm5.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToMm5.chain /gbdb/hg17/liftOver/hg17ToMm5.over.chain
hgAddLiftOverChain -multiple hg17 mm5
# HG17 TO CANFAM1 LIFTOVER CHAIN (DONE 1/7/05 Andy)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.canFam1/axtChain
mkdir over
for file in chain/*.chain; do
chrom=`basename $file .chain`
netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain
done
rm -rf over/
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain .
gzip hg17ToCanFam1.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToCanFam1.chain /gbdb/hg17/liftOver/hg17ToCanFam1.over.chain
hgAddLiftOverChain -multiple hg17 canFam1
# HG17 TO PANTRO1 LIFTOVER CHAIN (DONE 1/20/05 Andy)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.panTro1/axtChain
mkdir over
for file in chain/*.chain; do
chrom=`basename $file .chain`
netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain
done
rm -rf over/
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain .
gzip hg17ToPanTro1.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToPanTro1.chain /gbdb/hg17/liftOver/hg17ToPanTro1.over.chain
hgAddLiftOverChain -multiple hg17 panTro1
# HG17 TO RN3 LIFTOVER CHAIN (DONE 3/1/05 Andy)
#ssh kolossus
#cd /cluster/data/hg17/bed/blastz.rn3/axtChain
#mkdir over
#for file in chain/*.chain; do
# chrom=`basename $file .chain`
# netChainSubset ratNet/$chrom.net chain/$chrom.chain over/$chrom.over
# cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToRn3.chain
#done
#rm -rf over/
# Oh fancy that, there's already a hg17ToRn3.over.chain in the /cluster/data/hg17/bed/liftOver
# directory generated by Angie.
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToRn3.over.chain .
gzip hg17ToRn3.over.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToRn3.over.chain /gbdb/hg17/liftOver/hg17ToRn3.over.chain
hgAddLiftOverChain -multiple hg17 rn3
# HG17 TO GALGAL2 LIFTOVER CHAIN (DONE 3/1/05 Andy)
# OK there's already a /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain file generated
# by Angie.
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain .
gzip hg17ToGalGal2.over.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToGalGal2.over.chain /gbdb/hg17/liftOver/hg17ToGalGal2.over.chain
hgAddLiftOverChain -multiple hg17 galGal2
# HG17 TO MONDOM1 LIFTOVER CHAIN (DONE 3/1/05 Andy)
ssh kksilo
cd /cluster/data/monDom1/bed/zb.hg17/axtChain
netSplit human.net.gz net
ssh kolossus
cd /cluster/data/monDom1/bed/zb.hg17/axtChain
mkdir over
for file in chain/*.chain.gz; do
chrom=`basename $file .chain.gz`
netChainSubset net/$chrom.net chain/$chrom.chain.gz over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain
done
rm -rf over/ net/
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain .
gzip hg17ToMonDom1.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToMonDom1.chain /gbdb/hg17/liftOver/hg17ToMonDom1.over.chain
hgAddLiftOverChain -multiple hg17 monDom1
# HG17 TO DANRER2 LIFTOVER CHAIN (DONE 3/2/05 Andy)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
chainSplit chain all.chain.gz
netSplit zfishdanRer2.net.gz net
mkdir over
# FAILED STEPS:
#for file in chain/*.chain; do
# chrom=`basename $file .chain`
# netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
# cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain
#done
# Error:
#read 28019 of 28019 chains in chain/chr1.chain
#Processing chr1
#netChainSubset: netChainSubset.c:55: writeChainPart: Assertion `subChain != ((void *)0)' failed.
# OK instead of using the ones in the chain/ subdir, I'm using the ones in
# the chainAR/ subdir. These chain files had an additional step in the process of making
# them: Rachel used the chainAntiRepeat program.
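# (For reference, the chainAR files would have been made roughly like
# this -- a sketch; see the danRer2 make doc for the real commands:)
# chainAntiRepeat /cluster/data/hg17/nib /cluster/data/danRer2/nib \
#     chain/chr1.chain chainAR/chr1.chain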
for file in chain/*.chain; do
chrom=`basename $file .chain`
if [ $chrom = "chr1" ]; then
netChainSubset net/$chrom.net chainAR/$chrom.chain over/$chrom.over
else
netChainSubset net/$chrom.net chainAR/$chrom.chain.gz over/$chrom.over
fi
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain
done
rm -rf over/
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain .
gzip hg17ToDanRer2.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToDanRer2.chain /gbdb/hg17/liftOver/hg17ToDanRer2.over.chain
hgAddLiftOverChain -multiple hg17 danRer2
# HG17 TO TETNIG1 LIFTOVER CHAIN (DONE 3/1/05 Andy)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
mkdir over
for file in chain/*.chain; do
chrom=`basename $file .chain`
netChainSubset tetraodonNet/$chrom.net chain/$chrom.chain over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain
done
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain .
gzip hg17ToTetNig1.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToTetNig1.chain /gbdb/hg17/liftOver/hg17ToTetNig1.over.chain
hgAddLiftOverChain -multiple hg17 tetNig1
# HG17 TO BOSTAU1 LIFTOVER CHAIN (DONE Mar. 18, 2004, Heather)
ssh kolossus
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
mkdir over
for file in chain/*.chain; do
chrom=`basename $file .chain`
netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain
done
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain .
gzip hg17ToBosTau1.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToBosTau1.chain /gbdb/hg17/liftOver/hg17ToBosTau1.over.chain
hgAddLiftOverChain -multiple hg17 bosTau1
# HG17 TO XENTRO1 LIFTOVER CHAIN (DONE 7/5/05 Andy)
ssh kolossus
cd /cluster/data/xenTro1/bed/zb.hg17/axtChain
mkdir chain net over
chainSplit chain all.chain
netSplit human.net net
for file in chain/*.chain; do
chrom=`basename $file .chain`
netChainSubset net/$chrom.net chain/$chrom.chain over/$chrom.over
cat over/$chrom.over >> /cluster/data/hg17/bed/liftOver/hg17ToXenTro1.chain
done
rm -rf over/ chain/ net/
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /cluster/data/hg17/bed/liftOver/hg17ToXenTro1.chain .
gzip hg17ToXenTro1.chain
ln -s /cluster/data/hg17/bed/liftOver/hg17ToXenTro1.chain /gbdb/hg17/liftOver/hg17ToXenTro1.over.chain
hgAddLiftOverChain -multiple hg17 xenTro1
# ADD CHAIN AND NET TO VSMM5 AND VSRN3 DOWNLOAD AREAS (DONE 8/5/04 angie)
ssh hgwdev
cp -p /cluster/data/hg17/bed/blastz.mm5/axtChain/all.chain.gz \
/usr/local/apache/htdocs/goldenPath/hg17/vsMm5/mouse.chain.gz
cp -p /cluster/data/hg17/bed/blastz.mm5/axtChain/mouse.net.gz \
/usr/local/apache/htdocs/goldenPath/hg17/vsMm5/mouse.net.gz
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5
md5sum *.gz */*.gz > md5sum.txt
# Update the README.txt
cp -p /cluster/data/hg17/bed/blastz.rn3/axtChain/all.chain.gz \
/usr/local/apache/htdocs/goldenPath/hg17/vsRn3/rat.chain.gz
cp -p /cluster/data/hg17/bed/blastz.rn3/axtChain/rat.net.gz \
/usr/local/apache/htdocs/goldenPath/hg17/vsRn3/rat.net.gz
cd /usr/local/apache/htdocs/goldenPath/hg17/vsRn3
md5sum *.gz */*.gz > md5sum.txt
# Update the README.txt
# ADD CHAIN AND NET TO VSHG17 DOWNLOAD AREAS (DONE Sept. 8th, 2004, heather)
ssh hgwdev
cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/all.chain.gz \
/usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.chain.gz
cp -p /cluster/data/mm5/bed/blastz.hg17/axtChain/human.net.gz \
/usr/local/apache/htdocs/goldenPath/mm5/vsHg17/human.net.gz
cd /usr/local/apache/htdocs/goldenPath/mm5/vsHg17
md5sum *.gz */*.gz > md5sum.txt
# Update the README.txt
# SWAP BLASTZ ZEBRAFISH-HUMAN (danRer1-hg17) to HUMAN-ZEBRAFISH (hg17-danRer1)
# USE RESCORED ALIGNMENTS (see makeDanRer1.doc)
# (DONE, 2004-06-22, hartera)
# CONVERT AXTs TO PSL AND LOAD INTO DATABASE (DONE, 2004-07-08, hartera)
ssh kolossus
mkdir /cluster/data/hg17/bed/blastz.danRer1.swap
cd /cluster/data/hg17/bed/blastz.danRer1.swap
# use rescored axtChrom from blastzHg17 on danRer1
set aliDir = /cluster/data/danRer1/bed/blastz.hg17
# swap the length files: hg17 (the query in aliDir) is the target here
cp $aliDir/S1.len S2.len
cp $aliDir/S2.len S1.len
mkdir unsorted axtChrom
cat $aliDir/axtChrom/chr*.axt \
| axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
| axtSplitByTarget stdin unsorted
# Sort the shuffled .axt files.
foreach f (unsorted/*.axt)
echo sorting $f:t:r
axtSort $f axtChrom/$f:t
end
du -sh $aliDir/axtChrom unsorted axtChrom
# 19G /cluster/data/danRer1/bed/blastz.hg17/axtChrom
# 19G unsorted
rm -r unsorted
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/hg17/bed/blastz.danRer1.swap
mkdir -p pslChrom
set tbl = "blastzDanRer1"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer1.swap/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl hg17 $f
echo "$f Done"
end
# CHAIN ZEBRAFISH (danRer1) BLASTZ (DONE, 2004-06-23, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.danRer1.swap
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.danRer1.swap/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Reuse gap penalties from hg16 vs chicken run.
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize^V 11
smallSize^V 111
position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111
qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000
'_EOF_'
# << this line makes emacs coloring happy
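# NOTE: the ^V in the gap file above marks a literal tab character
# (typed as Ctrl-V Tab); the gap table fields must be tab-separated.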
cat << '_EOF_' > doChain
#!/bin/csh
axtFilter $1 \
| axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../chickenHumanTuned.gap \
-minScore=5000 stdin \
/iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/danRer1/nib $2 > $3
'_EOF_'
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 3559s 59.32m 0.99h 0.04d 0.000 y
# IO & Wait Time: 934s 15.56m 0.26h 0.01d 0.000 y
# Average job time: 100s 1.66m 0.03h 0.00d
# Longest job: 502s 8.37m 0.14h 0.01d
# Submission to last job: 2969s 49.48m 0.82h 0.03d
# chr19.axt crashed - out of memory so try again on kolossus
ssh kolossus
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/run1
# need to use nibs on bluearc as iscratch not accessible to kolossus
cat << '_EOF_' > doChain2
#!/bin/csh
axtFilter $1 \
| axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../chickenHumanTuned.gap \
-minScore=5000 stdin \
/cluster/bluearc/hg17/bothMaskedNibs \
/cluster/bluearc/danRer1/nib $2 >& $3
'_EOF_'
chmod +x doChain2
doChain2 \
/cluster/data/hg17/bed/blastz.danRer1.swap/axtChrom/chr19.axt \
chain/chr19.chain out/chr19.out
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg17 ${c}_chainDanRer1 $i
echo done $c
end
# tried minScore = 1000 and minScore = 10000 for axtChain;
# minScore = 5000 was best for reducing low-scoring chains without
# reducing the overlap with refGene CDS too much
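# The overlap checks were along these lines (a sketch, not the
# recorded commands):
# featureBits hg17 chainDanRer1Link refGene:cds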
# NET ZEBRAFISH (danRer1) BLASTZ (DONE, 2004-06-24, hartera)
# REMAKE NET WITHOUT ANCIENT REPEATS (DONE, 2004-07-07, hartera)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 149086208, utime 868 s/100, stime 173
# Add classification info using db tables:
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
# netClass looks for ancient repeats in one of the databases;
# hg17 has this table - hand-curated by Arian - but it is only for
# human-rodent comparisons, so use the -noAr option
mkdir -p /cluster/bluearc/danRer1/linSpecRep.notInHuman
mkdir -p /cluster/bluearc/hg17/linSpecRep.notInZebrafish
cp /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish/* \
/cluster/bluearc/hg17/linSpecRep.notInZebrafish
cp /iscratch/i/danRer1/linSpecRep.notInHuman/* \
/cluster/bluearc/danRer1/linSpecRep.notInHuman
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
# add -noAr option
# mkdir old
# mv zebrafish.net ./old/zebrafish.net.old
time netClass noClass.net hg17 danRer1 zebrafish.net \
-tNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish \
-qNewR=/cluster/bluearc/danRer1/linSpecRep.notInHuman -noAr
# 83.410u 43.650s 3:09.94 66.8% 0+0k 0+0io 198pf+0w
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
netFilter -minGap=10 zebrafish.net | hgLoadNet hg17 netDanRer1 stdin
# EXTRACT AXT'S AND MAF'S FROM ZEBRAFISH (danRer1) NET
# (DONE, 2004-06-24, hartera) used the net where the hg17 ancient
# repeat table was used
# sorted the axts and remade the mafs, as multiz needs sorted axts
# (DONE, 2004-06-25, kate)
# Redone to fix overlaps using 8/05 axtToNet (2005-08-16 kate)
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
ssh eieio
# create axts
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
netSplit zebrafish.net zebrafishNet
mkdir -p ../axtNet ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (zebrafishNet/chr*.net)
set c = $f:t:r
echo $c
netToAxt zebrafishNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib /cluster/data/danRer1/nib stdout | \
axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/danRer1/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=danRer1.
end
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
mkdir -p /cluster/bluearc/hg17/mafNet
cp -rp ../mafNet /cluster/bluearc/hg17/mafNet/danRer1
# BLASTZ ZEBRAFISH (danRer1) CLEAN UP (DONE, 2004-07-19, hartera)
# FURTHER CLEANUP (DONE, 2006-09-01, hartera)
ssh eieio
cd /cluster/data/hg17/bed/blastz.danRer1.swap
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net &
# further cleanup (2006-09-01, hartera)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.danRer1.swap
rm -r axtNet.old axtNet.unsorted mafNet
cd axtChain
rm hist*
# remove chains and nets directories. These can be reconstructed with
# all.chain.gz and zebrafish.net.gz
rm -r old chain zebrafishNet preNet
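# reconstructing them later would be something like (a sketch):
# zcat all.chain.gz | chainSplit chain stdin
# zcat zebrafish.net.gz | netSplit stdin zebrafishNet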
rm noClass.net.gz
cd ..
rm pslChrom/psl.tab.gz
# ZEBRAFISH DANRER1 DOWNLOADS (WORKING 2004-09-17 kate)
ssh eieio
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtNet
gzip *.axt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p vsDanRer1
cd vsDanRer1
cp /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/all.chain.gz zebrafish.chain.gz
cp /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain/zebrafish.net.gz .
md5sum *.gz > md5sum.txt
mkdir -p axtNet
cd axtNet
cp /cluster/data/hg17/bed/blastz.danRer1.swap/axtNet/*.axt.gz .
md5sum *.gz > md5sum.txt
# Copy and edit README.txt
# MAKING MOUSE SYNTENY (DONE - 2004-07-03 - Hiram)
ssh hgwdev
mkdir /cluster/data/hg17/bed/syntenyMm5
cd /cluster/data/hg17/bed/syntenyMm5
# Copy all the needed scripts (originally from
# /cluster/data/hg16/bed/syntenyMm3, now in hg17/bed/syntenyRn3)
cp -p /cluster/data/hg17/bed/syntenyRn3/*.pl .
./syntenicBest.pl -db=hg17 -table=blastzBestMm5
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg17 -table=blastzBestMm5
./synteny2bed.pl
# The five commands above
# real 209m28.161s
# user 0m21.040s
# sys 0m4.100s
# Used to load this as syntenyMm5, but that type is misleading to
# the table browser and fails the checkTableCoords check.
# Better to use an ensPhusionBlast-style type like ensRatMusHom.
# Need a new name here for the Mm5 version so it won't conflict with Rn3:
sed -e 's/ensPhusionBlast/ensRatMm5Hom/g' \
$HOME/kent/src/hg/lib/ensPhusionBlast.sql \
> ensRatMm5Hom.sql
hgLoadBed hg17 ensRatMm5Hom ucsc100k.bed -sqlTable=ensRatMm5Hom.sql
# featureBits hg17 ensRatMm5Hom
# 2649530748 bases of 2866216770 (92.440%) in intersection
# featureBits hg17 ensRatMm4Hom
# 2549307611 bases of 2866216770 (88.943%) in intersection
# featureBits hg16 syntenyMm5
# 2560252977 bases of 2865248791 (89.355%) in intersection
# MAKING MOUSE AXTTIGHT FROM AXTBEST (DONE - 2004-07-02 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5/axtNet
mkdir -p ../axtTight
bash # for tcsh users
for I in *.axt
do
echo $I
subsetAxt $I ../axtTight/$I \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
done
# exit bash if you are tcsh
# An 8 minute job
# translate to psl
cd ../axtTight
mkdir ../pslTight
bash # for tcsh users
for I in *.axt
do
C=${I/.axt/}
axtToPsl $I ../S1.len ../S2.len ../pslTight/${C}_blastzTightMm5.psl
echo "Done: $I -> ${C}_blastzTightMm5.psl"
done
# exit bash if you are tcsh
# Load tables into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/pslTight
for I in chr*TightMm5.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# Compare results with previous assembly:
# featureBits hg17 blastzTightMm5
# 165862935 bases of 2866216770 (5.787%) in intersection
# featureBits hg17 blastzTightMm4
# 166569246 bases of 2866216770 (5.811%) in intersection
# featureBits hg16 blastzTightMm5
# 162641577 bases of 2865248791 (5.676%) in intersection
# copy axt's to download area
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.mm5/axtTight
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtTight
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtTight
cd /usr/local/apache/htdocs/goldenPath/hg17/vsMm5/axtTight
gzip *.axt
# add README.txt file to dir (use previous assembly's copy as template)
# 4 minute gzip
# BLASTZ MM5 CLEAN UP (DONE 2004-07-02 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastz.mm5
nice rm -rf raw &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice rm axtChain/run1/chain/* &
nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &
##############################################################################
# MAKING BLASTZ SELF (DONE - 2004-07-14 - Hiram)
# The procedure for lineage spec business with self is to simply
# use the actual repeat masker output for this human assembly as
# the lineage specific repeats for itself. Thus, merely make
# symlinks to the repeat masker out files and name them as expected
# for blastz. In this case they are called notInHuman but they
# really mean InHuman. Yes, it is confusing, but that's just the
# nature of the game in this case.
ssh eieio
mkdir /cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
cd /cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
foreach f (../rmsk/*.fa.out)
set base = $f:t:r:r
echo $base.out.spec
ln -s $f $base.out.spec
end
# Same thing done on iscratch
# Not worried about pushing this scratch yet, it will get done
# sometime later. Using the actual /cluster/bluearc/scratch/
# location below.
ssh kk
mkdir /cluster/data/hg17/bed/blastzSelf.2004-07-01
cd /cluster/data/hg17/bed
ln -s blastzSelf.2004-07-01 blastzSelf
cd blastzSelf
cat << '_EOF_' > DEF
# human vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Human
SEQ2_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/data/hg17/bed/blastzSelf
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastzSelf
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# you need a -maxPush=200000 on this one; it is more than the default
# push limit of 100000 jobs. Also be aware of maxQueue limits
# on the KK, may need something more than the default of 200000 if
# the KK is busy.
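# For example (a sketch):
# para push -maxPush=200000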
XXX - running 2004-07-01 11:26
##############################################################################
# LIFTOVER (DROP) CHAINS TO HG16 (IN PROGRESS 2005-01-03 kate)
# swap hg16->hg17 chains
# LIFTOVER (DROP) CHAINS TO HG16 (IN PROGRESS 2004-07-07 kate)
# run alignment
# NOTE: split hg16 to /iscratch/i is doc'ed in makeHg16.doc
ssh kk
cd /cluster/data/hg17
makeLoChain-align hg17 /scratch/hg/gs.18/build35/bothMaskedNibs \
hg16 /iscratch/i/gs.17/build34/liftOver/split
# Created parasol job in bed/blat.hg16.2004-07-07/run
# 1150 jobs
cd bed/blat.hg16.2004-07-07/run
para try
para check
para push
# GOT HERE
# lift results (use bash)
cd /cluster/data/hg17/bed/blat.hg16
for file in /cluster/data/hg16/nib/*.nib; do
chrom=`basename $file .nib`
liftUp -pslQ psl/$chrom.psl /cluster/bluearc/hg/gs.17/build34/liftOver/lift/$chrom.lft warn raw/chr*_${chrom}.psl
done
# There were some errors from not finding .lft files for the chr_random ones.
ssh kk9
cd ../liftOver
ln -s blat.hg16 blat.hg16.2005-01-22
makeLoChain-chain hg17 /cluster/data/hg17/nib hg16 /cluster/data/hg16/nib 2>chain.error.log >chain.log
ssh eieio
makeLoChain-net hg17 hg16
ssh hgwdev
makeLoChain-load hg17 hg16
# DROPUNDER CHAIN TO HG15 (DONE 2005-07-21 Andy)
# Split things up
ssh eieio
cd /cluster/bluearc
mkdir -p hg15/liftOver/split
cd hg15/liftOver/split/
mkdir ../lift
for c in `cut -f1 /cluster/data/hg15/chrom.sizes`; do
echo $c
# strip the "_random" suffix and "chr" prefix to find the hg15
# subdirectory that holds this chrom's fa
num=${c%_random}
num=${num#chr}
faSplit -lift=../lift/${c}.lft size /cluster/data/hg15/${num}/${c}.fa -oneFile 3000 ${c}
done
# Move files to santest
ssh hgwdev
cd /santest/scratch
mkdir hg15
cd hg15/
cp -r /cluster/bluearc/hg15/liftOver .
# run alignment
ssh kk
cd /cluster/data/hg17
makeLoChain-align hg17 /scratch/hg/gs.18/build35/bothMaskedNibs \
hg15 /santest/scratch/hg15/liftOver/split
# Created parasol job in bed/blat.hg15.2005-07-21/run
# 2024 jobs written to batch
# *** IGNORE the batch created by the script.
ln -s bed/blat.hg15.2005-07-21 bed/blat.hg15
cd bed/blat.hg15/
mv run run.kk
mkdir run.kk9 run.kki
cd run.kk/
sed 's/\.fa\./\./g' spec > tmp; mv tmp spec
grep Un_random spec > ../run.kki/spec
grep -v Un_random spec > newspec
mv newspec spec
egrep "chr(1|19|X)(\.|_)" spec | grep -v random > ../run.kk9/spec
grep -Fv -f ../run.kk9/spec spec > newspec
mv newspec spec
wc -l spec ../run.kk9/spec ../run.kki/spec
# 1831 spec
# 147 ../run.kk9/spec
# 46 ../run.kki/spec
# 2024 total
# Checks out
# Run the thing on all 3 clusters.
para create spec
para push
#Completed: 1831 of 1831 jobs
#CPU time in finished jobs: 8556066s 142601.10m 2376.69h 99.03d 0.271 y
#IO & Wait Time: 60428s 1007.13m 16.79h 0.70d 0.002 y
#Average job time: 4706s 78.43m 1.31h 0.05d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 46724s 778.73m 12.98h 0.54d
#Submission to last job: 46725s 778.75m 12.98h 0.54d
ssh kk9
cd /cluster/data/hg17/bed/blat.hg15/run.kk9
para create spec
para push
#Completed: 147 of 147 jobs
#CPU time in finished jobs: 1698424s 28307.07m 471.78h 19.66d 0.054 y
#IO & Wait Time: 874s 14.56m 0.24h 0.01d 0.000 y
#Average job time: 11560s 192.66m 3.21h 0.13d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 31413s 523.55m 8.73h 0.36d
#Submission to last job: 31413s 523.55m 8.73h 0.36d
ssh kki
cd /cluster/data/hg17/bed/blat.hg15/run.kki
para create spec
para push
# OK I don't have para time stuff for this one, but it was the shortest
# by far.
# lift results
cd /cluster/data/hg17/bed/blat.hg15/lift.run
for chrom in `cut -f1 /cluster/data/hg15/chrom.sizes`; do
liftUp -pslQ /cluster/bluearc/hg15/liftOver/psl/${chrom}.psl /cluster/bluearc/hg15/liftOver/lift/${chrom}.lft warn raw/chr*_${chrom}.psl
done
# Chain
# There's been some problems with store5.
ssh kk9
cd /cluster/store12/store5/gs.18/build35/bed/blat.hg15.2005-07-21
mkdir chainRun
mkdir -p /panasas/store/hg15/chainRaw
ln -s /panasas/store/hg15/chainRaw chainRaw
cd chainRun/
ls -1S ../psl/*.psl > in.lst
cat > chain.sh << "_EOF_"
#!/bin/bash
# chain the psl ($1) using the target ($2) and query ($3) nib dirs,
# writing to node-local /scratch first and copying the result ($4)
# back, to go easy on the file server
tmp=/scratch/`basename $4`
axtChain -psl $1 $2 $3 $tmp
cp $tmp $4
rm $tmp
_EOF_
chmod +x chain.sh
cat > gsub << "_EOF_"
#LOOP
./chain.sh $(path1) /scratch/hg/gs.18/build35/bothMaskedNibs /scratch/hg/gs.16/build33/chromTrfMixedNib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
_EOF_
# <<
gensub2 in.lst single gsub spec
para create spec
para push
#Completed: 44 of 44 jobs
#CPU time in finished jobs: 7448s 124.13m 2.07h 0.09d 0.000 y
#IO & Wait Time: 9591s 159.85m 2.66h 0.11d 0.000 y
#Average job time: 387s 6.45m 0.11h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1906s 31.77m 0.53h 0.02d
#Submission to last job: 1906s 31.77m 0.53h 0.02d
ssh kolossus
cd /panasas/store/hg15/chainRaw
chainMergeSort *.chain | chainSplit /scratch/andy/chain stdin
cd /scratch/andy
mkdir net over
cd chain/
for chain in *; do
c=${chain%.chain}
echo $c
chainNet $chain /cluster/store12/store5/gs.18/build35/chrom.sizes \
/cluster/store12/store5/gs.16/build33/chrom.sizes \
../net/${c}.net /dev/null
netChainSubset ../net/${c}.net $chain ../over/${c}.over
done
cd ../over/
cat * >> ../hg17ToHg15.over.chain
cd ../
cp -r hg17* over/ /cluster/store12/store5/gs.18/build35/bed/blat.hg15.2005-07-21/
cd ../
rm -rf andy/
rm -rf /panasas/store/hg15
cd /cluster/bluearc/hg15/liftOver/psl
for psl in *; do
gzip $psl
done
cd ../
# Completed: 116281 of 116281 jobs
# CPU time in finished jobs: 21807388s 363456.46m 6057.61h 252.40d 0.692 y
# IO & Wait Time: 2319383s 38656.39m 644.27h 26.84d 0.074 y
# Average job time: 207s 3.46m 0.06h 0.00d
# Longest job: 22063s 367.72m 6.13h 0.26d
# Submission to last job: 83402s 1390.03m 23.17h 0.97d
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kki
cd /cluster/data/hg17/bed/blastzSelf
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 6344s 105.73m 1.76h 0.07d 0.000 y
# IO & Wait Time: 5413s 90.22m 1.50h 0.06d 0.000 y
# Average job time: 34s 0.57m 0.01h 0.00d
# Longest job: 505s 8.42m 0.14h 0.01d
# Submission to last job: 4521s 75.35m 1.26h 0.05d
# Third cluster run to convert lav's to axt's
# These self alignments do not work well as the usual third cluster job.
# Instead, a specialized job here that includes a DropSelf
# operation, and in individual lav pieces to avoid out of memory
# problems during axtSort
ssh kki
cd /cluster/data/hg17/bed/blastzSelf
mkdir axtChrom run.2
cd run.2
cat << '_EOF_' > runLavToAxt.sh
#!/bin/sh
BASE=/cluster/data/hg17/bed/blastzSelf
SEQ1_DIR=/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ2_DIR=/cluster/bluearc/scratch/hg/gs.18/build35/bothMaskedNibs
CHR=$1
OUT=axtChrom/$CHR.axt
cd ${BASE}/lav/${CHR}
for D in *.lav
do
smallout=$D.axt
lavToAxt $D $SEQ1_DIR $SEQ2_DIR stdout \
| axtDropSelf stdin stdout \
| axtSort stdin $smallout
done
cat `ls -1 *.lav.axt | sort -g` > $BASE/$OUT
'_EOF_'
# << keep emacs coloring happy
chmod +x runLavToAxt.sh
cat << '_EOF_' > gsub
#LOOP
./runLavToAxt.sh $(path1) {check out line ../axtChrom/$(path1).axt}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
ls ../lav > chrList
gensub2 chrList single gsub jobList
para create jobList
para try
para push
# This is a tough load on eieio. Manageable, but the load should
# be monitored to make sure it isn't severe. I saw load averages of
# about 100 to 150.
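# A minimal way to watch it from another terminal (assuming ssh
# access to eieio):
# while true; do ssh eieio uptime; sleep 60; done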
# The chr19 job will not finish: even in parts it takes up too
# much memory, and the node it runs on ends up swapping endlessly.
# Need to go to kolossus to do chr19
para stop
para recover jobList chr19JobList
ssh kolossus
cd /cluster/data/hg17/bed/blastzSelf/run.2
time ./runLavToAxt.sh chr19
# real 43m14.797s
# user 12m56.670s
# sys 3m13.590s
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf
mkdir pslChrom
set tbl = "blastzSelf"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 70 minutes
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastzSelf/pslChrom
bash # if a csh/tcsh user
for I in *.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done: ${I}"
done
# exit bash if you are tcsh
# This is an 80 minute job
# Check results
# featureBits hg17 blastzSelf
# 252256266 bases of 2866216770 (8.801%) in intersection
# real 40m49.573s
# user 21m14.200s
# sys 2m10.420s
# featureBits hg16 blastzSelf
# 254410837 bases of 2865248791 (8.879%) in intersection
# CHAIN SELF BLASTZ (DONE - 2004-07-07 - Hiram)
# The axtChain is best run on the small kluster, or the kk9 kluster
ssh kki
mkdir -p /cluster/data/hg17/bed/blastzSelf/axtChain/run1
cd /cluster/data/hg17/bed/blastzSelf/axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastzSelf/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} out/$(root1).out
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# May need -minScore=5000 for all chroms if chr19 won't finish on kolossus
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 46 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 8519s 141.98m 2.37h 0.10d 0.000 y
# IO & Wait Time: 4795s 79.92m 1.33h 0.06d 0.000 y
# Average job time: 296s 4.93m 0.08h 0.00d
# Longest job: 2407s 40.12m 0.67h 0.03d
# Submission to last job: 3540s 59.00m 0.98h 0.04d
# chr19 did fail; retry it on kolossus:
ssh kolossus
cd /cluster/data/hg17/bed/blastzSelf/axtChain/run1
time axtChain /cluster/data/hg17/bed/blastzSelf/axtChrom/chr19.axt \
/cluster/data/hg17/nib \
/cluster/data/hg17/nib \
chain/chr19.chain > out/chr19.out
# 80 minute job, 1.5 Gb result:
# -rw-rw-r-- 1 1588795432 Jul 7 21:54 chr19.chain
# now on the file server, sort chains
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# real 27m38.935s
# user 23m18.540s
# sys 2m39.300s
# A 5 Gb file:
# -rw-rw-r-- 1 5267202936 Jul 7 22:23 all.chain
time chainSplit chain all.chain
# real 29m27.062s
# user 22m48.250s
# sys 1m57.910s
# optionally: rm run1/chain/*.chain
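# Optional sanity check before removing: the merged all.chain should
# have the same number of chain records as the inputs, e.g.:
# grep -c '^chain' all.chain
# cat run1/chain/*.chain | grep -c '^chain'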
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg17/bed/blastzSelf/axtChain/chain
bash # for tcsh users
for I in *.chain
do
c=${I/.chain/}
$HOME/bin/i386/hgLoadChain -normScore hg17 ${c}_chainSelf $I
echo done $c
done
# exit bash if you are tcsh
# This takes almost 3 hours to load
ssh kolossus
cd /cluster/data/hg17/bed/blastzSelf.2004-07-01
time HGDB_CONF=~/.hg.conf.read-only featureBits \
-noRandom -noHap hg17 chainSelfLink > fb.chainSelfLink 2>&1 &
# real 56m34.802s
# 240976607 bases of 2851352871 (8.451%) in intersection
# featureBits hg17 chainSelf
# 682833453 bases of 2866216770 (23.824%) in intersection
# featureBits hg16 chainSelf
# 626345319 bases of 2865248791 (21.860%) in intersection
# DELIVER these chain files to hgdownload (2005-01-27 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf/axtChain/chain
gzip chr*.chain
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
cd /cluster/data/hg17/bed/blastzSelf/axtChain/chain
cp -p *.chain.gz /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
# fixup README file, request push
# NET SELF (DONE - 2004-07-13 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf/axtChain
mkdir preNet
cd chain
bash # for tcsh users
for I in *.chain
do
echo preNetting $I
/cluster/bin/i386/chainPreNet $I /cluster/data/hg17/chrom.sizes \
/cluster/data/hg17/chrom.sizes ../preNet/$I
done
# 23 minutes
cd ..
mkdir n1
cd preNet
for I in *.chain
do
N=${I/.chain/}.net
echo primary netting $I
/cluster/bin/i386/chainNet $I -minSpace=10 \
/cluster/data/hg17/chrom.sizes /cluster/data/hg17/chrom.sizes \
../n1/$N /dev/null
done
# exit bash if you are tcsh
# 5 minute job
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 206442496, utime 3009 s/100, stime 252
# memory usage 2510467072, utime 19307 s/100, stime 3181
ssh hgwdev
cd /cluster/data/hg17/bed/blastzSelf/axtChain
time netClass hNoClass.net hg17 hg17 human.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman \
-qNewR=/cluster/bluearc/scratch/hg/gs.18/build35/linSpecRep.notInHuman
# real 9m32.951s
# user 2m42.840s
# sys 1m23.460s
# If things look good do
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
time netFilter -syn human.net > humanSyn.net
# real 0m29.851s
# user 0m27.200s
# sys 0m2.120s
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg17/bed/blastzSelf/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg17 netSelf stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg17 syntenyNetSelf stdin
# check results
# featureBits hg17 netSelf
# 620827374 bases of 2866216770 (21.660%) in intersection
# featureBits hg16 netSelf
# 563788850 bases of 2865248791 (19.677%) in intersection
# featureBits hg15 selfNet
# 749177799 bases of 2866466359 (26.136%) in intersection
# featureBits hg17 syntenyNetSelf
# 404535376 bases of 2866216770 (14.114%) in intersection
# featureBits hg16 syntenyNetSelf
# 340871322 bases of 2865248791 (11.897%) in intersection
# Add entries for net and chain to human/hg17 trackDb
# make net
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf/axtChain
mkdir humanNet
time netSplit human.net humanNet
# real 0m52.106s
# user 0m43.350s
# sys 0m5.170s
# extract axts from net - this should be combined with the sort and
# maf conversion below
mkdir ../axtNet
foreach n (humanNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt humanNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib \
/cluster/data/hg17/nib stdout > ../axtNet/$c.axt
echo "Complete: $c.net -> axtNet/$c.axt"
end
# sort axt's and convert to maf format
mkdir ../mafNet
foreach f (../axtNet/chr*.axt)
set c=$f:t:r
echo $c.axt
mv ../axtNet/$c.axt ../axtNet/$c.unsorted.axt
axtSort ../axtNet/$c.unsorted.axt ../axtNet/$c.axt
rm ../axtNet/$c.unsorted.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/hg17/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=hg17.
end
# a 3 minute job
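# Both -tPrefix and -qPrefix are "hg17." because this is a self
# alignment; the target and query rows in the maf are distinguished
# only by their coordinates.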
XXXX - ! ! ! WE DO NOT NEED the Best and Tight tracks for Self ! ! !
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/blastzSelf/axtBest
cd /cluster/data/hg17/bed/blastzSelf/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area - XXX Do we need this for Self ?
ssh hgwdev
cd /cluster/data/hg17/bed/blastzSelf/axtNet
mkdir /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
cd /usr/local/apache/htdocs/goldenPath/hg17/vsSelf
nice gzip *.axt
nice md5sum *.gz > md5sum.txt
# add README.txt file to dir (use previous assembly's copy as template)
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestSelf.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestSelf.psl
echo "Done: ${c}_blastzBestSelf.psl"
end
# Load tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastzSelf/pslBest
bash # if a csh/tcsh user
for I in chr*BestSelf.psl
do
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 ${I}
echo "done ${I}"
done
# exit bash if you are tcsh
# check results
# featureBits hg17 blastzBestSelf
# 233978156 bases of 2866216770 (8.163%) in intersection
# featureBits hg16 blastzBestSelf
# 225819219 bases of 2865248791 (7.881%) in intersection
# MAKING HUMAN AXTTIGHT FROM AXTBEST (NOT TO BE DONE - 2004-07-13 - Hiram)
# XXXX - ! ! ! DO NOT NEED axtBest for Self alignments
# Been done anyway, Robert and Gill like to see it.
# BLASTZ SELF CLEAN UP (DONE - 2004-07-15 - Hiram)
ssh eieio
cd /cluster/data/hg17/bed/blastzSelf
nice rm -rf raw &
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice gzip axtChrom/* pslChrom/* lav/*/* axtChain/all.chain axtChain/*.net &
# CREATING BIG ZIPS (DONE - 2004-07-23 - Hiram)
ssh eieio
cd /cluster/data/hg17/jkStuff
time ./zipAll.sh > zipAll.out 2>&1
ssh hgwdev
# This has to be handled differently because these files
# update on a daily basis.
cd /usr/local/apache/htdocs/goldenPath/hg17/bigZips
featureBits hg17 refGene:upstream:1000 -fa=upstream1000.fa
zip upstream1000.zip upstream1000.fa
rm upstream1000.fa
featureBits hg17 refGene:upstream:2000 -fa=upstream2000.fa
zip upstream2000.zip upstream2000.fa
rm upstream2000.fa
featureBits hg17 refGene:upstream:5000 -fa=upstream5000.fa
zip upstream5000.zip upstream5000.fa
rm upstream5000.fa
# ENCODE REGIONS (DONE 2004-07-28 kate)
ssh eieio
cd /cluster/data/hg17/bed
mkdir encodeRegions
cd encodeRegions
liftOver /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \
/cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
encodeRegions.bed encodeRegions.unmapped
wc -l encodeRegions.*
# 44 encodeRegions.bed
# 0 encodeRegions.unmapped
ssh hgwdev
cd /cluster/data/hg17/bed/encodeRegions
hgLoadBed hg17 encodeRegions encodeRegions.bed -noBin
# H-INVITATIONAL GENE ANNOTATION DATABASE (WORKING 2004-07-28 kate)
# http://www.jbirc.aist.go.jp/hinv/top.html
# Create knownGene table to reference HINV gene ID's
# for link on knownGenes details page
# Also, create an HINV gene track
# download CDNA file release 1.5 (got release # from downloads page).
ssh kksilo
mkdir -p /cluster/data/hinv
cd /cluster/data/hinv
wget http://www.jbirc.aist.go.jp/hinv/download/alldata/flatfile/FCDNA.gz
gunzip FCDNA.gz
mv FCDNA FCDNA.1.5
# set up assembly work area
ssh eieio
cd /cluster/data/hg17
mkdir -p bed/hinv
cd bed/hinv
# extract H-INV ID's and Genbank accessions of mRNAs
awk '/CDNA_ACCESSION-NO:/ {print $2}' < /cluster/data/hinv/FCDNA.1.5 \
> accessions.txt
awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}' < /cluster/data/hinv/FCDNA.1.5 \
> ids.txt
paste accessions.txt ids.txt > queries.txt
wc -l ids.txt
# 41118 ids.txt
# create PSL file from alignments for these mRNA's, extracted from the
# table of all aligned mRNA's
ssh hgwdev
cd /cluster/data/hg17/bed/hinv
hgsql hg17 -s -e "SELECT * FROM all_mrna" | cut -f 2- > all_mrna.tab
ssh eieio
cd /cluster/data/hg17/bed/hinv
pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl
# using pslReps to generate the PSL file header
~kate/bin/i386/pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl
# NOTE: generated with pslSelect.c v1.3 (1.4 is broken -- a test is
# set up in hg/pslSelect/tests & I requested Robert take a look)
# load track of mrna alignments
ssh hgwdev
cd /cluster/data/hg17/bed/hinv
hgLoadPsl hg17 -table=HInvGeneMrna hinv_mrna.psl
hgsql hg17 -s -e \
"select distinct(qName) from HInvGeneMrna order by qName" > hg17.mrna
hgsql hg16 -s -e \
"select distinct(qName) from HInvGeneMrna order by qName" > hg16.mrna
wc -l hg*.mrna
# 40998 hg16.mrna
# 41023 hg17.mrna
comm -1 -3 *.mrna > hg17.aligned
wc -l hg17.aligned
# 29 (transcripts newly aligned in hg17)
comm -2 -3 *.mrna > hg16.aligned
wc -l hg16.aligned
# 4 (transcripts no longer aligned in hg17)
comm -2 -3 ids.txt hg17.mrna > hg17.notaligned
wc -l hg17.notaligned
# 95 (transcripts not aligned in hg17 -- checking on why...)
# also make a table with various useful items for each transcript
ssh hgwdev
hgsql hg17 < ~/kent/src/hg/lib/HInv.sql
cd /cluster/data/hg17/bed/hinv
/cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/FCDNA.1.5 > HInv.tab
echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg17
hgsql hg16 -s -e "select count(*) from HInv"
# 41118
hgsql hg17 -s -e "select count(*) from HInv"
# 41118
# create table for knownGenes detail page
ssh hgwdev
cd /cluster/data/hg17/bed/hinv
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv
# GENEID GENE PREDICTIONS (DONE 7/30/04 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneid
cd /cluster/data/hg17/bed/geneid
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/geneid_v1.2/$chr.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/geneid_v1.2/$chr.prot
end
# Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
cp /dev/null geneid.fa
foreach f (chr*.prot)
perl -wpe 's/^(>chr\S+)/$1.1/' $f >> geneid.fa
end
ldHgGene -gtf -genePredExt hg17 geneid *.gtf
hgPepPred hg17 generic geneidPep geneid.fa
# MITOPRED DATA FOR HGGENE (DONE 7/30/04 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/mitopred
cd /cluster/data/hg17/bed/mitopred
wget http://mitopred.sdsc.edu/data/hum_30.out
perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' hum_30.out > mitopred.tab
cat > mitopred.sql << '_EOF_'
# Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
CREATE TABLE mitopred (
name varchar(10) not null, # SwissProt ID
confidence varchar(8) not null, # Confidence level
#Indices
PRIMARY KEY(name(6))
);
'_EOF_'
# << this line makes emacs coloring happy
hgsql hg17 < mitopred.sql
hgsql hg17 -e 'load data local infile "mitopred.tab" into table mitopred'
# NUCLEAR PROTEIN DATABASE (IN PROGRESS 7/30/04 angie)
ssh eieio
mkdir /cluster/data/hg17/bed/npd
cd /cluster/data/hg17/bed/npd
wget ftp://ftp.hgu.mrc.ac.uk/pub/npd/database.zip
unzip database.zip
# OK, it's one big .mdb (Microsoft Access DB) file.
# Googling... can buy a converter for $40... free trial .exe...
# CREATING REFFULL - DBTSS MRNA (DONE - 2004-08-02 - Hiram)
ssh eieio
mkdir /cluster/data/hg17/bed/refFull
cd /cluster/data/hg17/bed/refFull
wget --timestamping "ftp://ftp.hgc.jp/pub/hgc/db/dbtss/ref-full.fa.gz" .
wget --timestamping "ftp://ftp.hgc.jp/pub/hgc/db/dbtss/readme" .
# See also: http://dbtss.hgc.jp/index.html
# gunzip it and split the ref-full.fa file into about 50 pieces
# (faSplit can't do this job reading from a pipe, i.e. this does NOT work:
# zcat ref-full.fa.gz | faSplit sequence stdin 50 splitRefFull)
gunzip ref-full.fa.gz
faSplit sequence ref-full.fa 50 splitRefFull
gzip ref-full.fa
# copy to Iservers
ssh kkr1u00
cd /cluster/data/hg17/bed/refFull
mkdir /iscratch/i/gs.18/build35/refFull
cp -p split*.fa /iscratch/i/gs.18/build35/refFull
/cluster/bin/iSync
# no longer need these split files here
rm -f split*.fa
# run alignments on kluster
ssh kk
cd /cluster/data/hg17/bed/refFull
ls -1S /scratch/hg/gs.18/build35/maskedContigs > genome.lst
ls -1S /iscratch/i/gs.18/build35/refFull > refFull.lst
# Use BLAT to generate refFull alignments as so:
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -ooc=/scratch/hg/h/11.ooc -q=dna -t=dna {check in exists /scratch/hg/gs.18/build35/maskedContigs/$(path1)} {check in exists+ /iscratch/i/gs.18/build35/refFull/$(path2)} {check out line+ psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
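# Note: -ooc=11.ooc makes blat skip 11-mers that are overrepresented
# in the genome, which speeds up the run and suppresses repeat hits.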
bash # if a csh/tcsh user
mkdir psl
cat genome.lst | sed -e "s/.fa//" | while read C
do
mkdir psl/${C}
done
# exit bash if you are tcsh
gensub2 genome.lst refFull.lst gsub jobList
para create jobList
# 18240 jobs written to batch
para try
para check
para push ... etc ...
# Completed: 18240 of 18240 jobs
# CPU time in finished jobs: 37011s 616.85m 10.28h 0.43d 0.001 y
# IO & Wait Time: 62630s 1043.84m 17.40h 0.72d 0.002 y
# Average job time: 5s 0.09m 0.00h 0.00d
# Longest job: 51s 0.85m 0.01h 0.00d
# Submission to last job: 850s 14.17m 0.24h 0.01d
# Process refFull alignments into near best in genome.
ssh eieio
cd /cluster/data/hg17/bed/refFull
pslSort dirs raw.psl tmp psl/*
pslReps -minCover=0.2 -sizeMatters -minAli=0.965 \
-nearTop=0.001 raw.psl contig.psl /dev/null
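# pslReps keeps near-best-in-genome alignments: -minCover=0.2 and
# -minAli=0.965 drop fragmentary or low-identity hits, while
# -nearTop=0.001 keeps only alignments within 0.1% of the best
# score for each query.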
liftUp -nohead all_refFull.psl ../../jkStuff/liftAll.lft warn contig.psl
pslSortAcc nohead chrom tmp all_refFull.psl
pslCat -dir chrom > refFullAli.psl
# Load refFull alignments into database
ssh hgwdev
cd /cluster/data/hg17/bed/refFull
hgLoadPsl hg17 -tNameIx refFullAli.psl
# VAR_MULTIZ HG17/MM5/RN3/GALGAL2/FR1 (acs 2004-08-12)
# This is a new, experimental version of multiz written by Minmei at
# PSU and sent by e-mail from Webb. This version allows for a
# progressive alignment strategy (i.e., alignment construction in a
# post-order traversal of the tree) using only pairwise alignments of
# each sequence with the reference sequence and without any need for
# "staging". Here's a little blurb about it from the header of
# var_multiz.v3.c.
# var_multiz.v3.c
#
# Variant to multiz program. It aligns two files of
# alignment blocks where top row is always the reference,
# assuming blocks are increasing ordered based on the
# start position on the reference sequence. Single-coverage
# on reference is required at this stage.
#
# Four arguments are required: char* arg1, char* arg2,
# int arg3, int arg4. arg1 and arg2 are two files need
# to be aligned together. The alignment of reference in
# two files are either fixed or not, determined from
# arguments arg3 and arg4. arg3 and arg4 are either 1
# or 0, but cannot be 1 at the same time. 1 means
# reference is fixed. v1 and v2 cannot be both 1.
# ...
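# For example, the first step in doVarMultiz.csh below combines the
# human-referenced rat and mouse pairwise mafs with neither
# reference fixed (arg3=0 arg4=0), roughly:
# var_multiz rn3/chr1.maf mm5/chr1.maf 0 0 > chr1.tmp1.maf
# (chr1 is illustrative; the script does this per chromosome.)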
mkdir /cluster/data/hg17/bed/var_multiz.2004-08-12
# unpack source and compile
cp /cluster/home/acs/var_multiz.tar.gz /cluster/data/hg17/bed/var_multiz.2004-08-12
cd /cluster/data/hg17/bed/var_multiz.2004-08-12
tar xfz var_multiz.tar.gz
cd var_multiz_source
make
# NOTE (8/14): this version of the source is already out of date!
# Source is now checked in under hg3rdParty and updated binaries
# are being kept under /cluster/bin/penn/var_multiz
# script for creating the 5-way alignments for a given chromosome
# (acs, 8/20/04) below revised after e-mail exchange with Minmei
cat << '_EOF_' > doVarMultiz.csh
#!/bin/csh -fe
set chr = $1 # may include _random or _hla_hap[12]
set REF = hg17.$chr
set RAT = /cluster/bluearc/hg17/multiz8way/rn3/$chr.maf
set MOUSE = /cluster/bluearc/hg17/multiz8way/mm5/$chr.maf
set CHICKEN = /cluster/bluearc/hg17/multiz8way/galGal2/$chr.maf
set FISH = /cluster/bluearc/hg17/multiz8way/fr1/$chr.maf
set DEST = /cluster/data/hg17/bed/var_multiz.2004-08-12/maf/$chr.maf
set VMZ = /cluster/bin/penn/var_multiz
set PROJECT = /cluster/bin/penn/var_multiz.2004.08.12/maf_project
mkdir -p $DEST:h
if ( -s $RAT && -s $MOUSE ) then
echo "Aligning $RAT $MOUSE..."
$VMZ $RAT $MOUSE 0 0 > /scratch/$chr.tmp1.maf
echo "Projecting on $REF..."
$PROJECT /scratch/$chr.tmp1.maf $REF > /scratch/$chr.hrm.maf
else if ( -s $RAT ) then
cp $RAT /scratch/$chr.hrm.maf
else if ( -s $MOUSE ) then
cp $MOUSE /scratch/$chr.hrm.maf
endif
if ( -s $CHICKEN && -s /scratch/$chr.hrm.maf ) then
echo "Adding $CHICKEN..."
$VMZ /scratch/$chr.hrm.maf $CHICKEN 1 0 > /scratch/$chr.tmp2.maf
echo "Projecting on $REF..."
$PROJECT /scratch/$chr.tmp2.maf $REF > /scratch/$chr.hrmc.maf
else if ( -s $CHICKEN ) then
cp $CHICKEN /scratch/$chr.hrmc.maf
else if ( -s /scratch/$chr.hrm.maf ) then
cp /scratch/$chr.hrm.maf /scratch/$chr.hrmc.maf
endif
if ( -s $FISH && -s /scratch/$chr.hrmc.maf ) then
echo "Adding $FISH..."
$VMZ /scratch/$chr.hrmc.maf $FISH 1 0 > /scratch/$chr.tmp3.maf
echo "Projecting on $REF..."
$PROJECT /scratch/$chr.tmp3.maf $REF > $DEST
else if ( -s $FISH ) then
cp $FISH $DEST
else if ( -s /scratch/$chr.hrmc.maf ) then
cp /scratch/$chr.hrmc.maf $DEST
endif
echo "Done."
rm /scratch/$chr.tmp[123].maf /scratch/$chr.hrm.maf /scratch/$chr.hrmc.maf
'_EOF_'
# << keep emacs coloring happy
chmod 755 doVarMultiz.csh
for file in `find /cluster/bluearc/hg17/multiz8way/rn3 /cluster/bluearc/hg17/multiz8way/mm5 /cluster/bluearc/hg17/multiz8way/galGal2 /cluster/bluearc/hg17/multiz8way/fr1 -name "chr*.maf"` ; do echo `basename $file .maf` ; done | sort -u > chrlist
rm -f jobs.lst
for chr in `cat chrlist` ; do echo "doVarMultiz.csh $chr" >> jobs.lst ; done
# run cluster job
ssh kk ; cd /cluster/data/hg17/bed/var_multiz.2004-08-12; para create jobs.lst ; para try ; para push
# (etc.)
# Completed: 46 of 46 jobs
# CPU time in finished jobs:      71302s    1188.36m    19.81h    0.83d  0.002 y
# IO & Wait Time:                  1162s      19.37m     0.32h    0.01d  0.000 y
# Average job time:                1575s      26.26m     0.44h    0.02d
# Longest job:                     6353s     105.88m     1.76h    0.07d
# Submission to last job:          6362s     106.03m     1.77h    0.07d
# for now just create an ordinary maf track (conservation later)
rm -rf /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1
mkdir -p /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1
ln -s /cluster/data/hg17/bed/var_multiz.2004-08-12/maf/*.maf /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1
/cluster/bin/i386/hgLoadMaf hg17 -warn varMultizMm5Rn3GalGal2Fr1 -pathPrefix=/gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1
chmod 775 /gbdb/hg17/varMultiz /gbdb/hg17/varMultiz/maf /gbdb/hg17/varMultiz/maf/varMultizMm5Rn3GalGal2Fr1 /cluster/data/hg17/bed/var_multiz.2004-08-12 /cluster/data/hg17/bed/var_multiz.2004-08-12/maf
chmod 664 /cluster/data/hg17/bed/var_multiz.2004-08-12/maf/*.maf
# trackDb entry
# track varMultizMm5Rn3GalGal2Fr1
# shortLabel varMultiz5Way
# longLabel Human/Mouse/Rat/Chicken/Fugu Var-Multiz
# group compGeno
# priority 190
# visibility hide
# type maf
# elephant human blastz alignment by Robert Aug 11 2004
mkdir /cluster/bluearc/elephant
cd /cluster/bluearc/elephant
#get reads and qual scores from trace repository
for i in `cat trace.lst`; do echo $i ; wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/loxodonta_africana/fasta.loxodonta_africana.$i.gz ; done
for i in `cat trace.lst`; do echo $i ; wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/loxodonta_africana/qual.loxodonta_africana.$i.gz ; done
for i in `cat trace.lst` ; do zcat fasta.loxodonta_africana.$i.gz > loxodonta_africana.$i.fa ; done
#trim reads
for i in `cat trace.lst` ; do nice gunzip -c qual.loxodonta_africana.$i.gz > qual.loxodonta_africana.$i; faTrimRead loxodonta_africana.$i.fa qual.loxodonta_africana.$i tmp.$i.fa lift.$i.lft; mv -f tmp.$i.fa loxodonta_africana.$i.fa ; rm -f qual.loxodonta_africana.$i ; done
for i in `cat trace.lst`; do faSize -detailed=on loxodonta_africana.$i.fa > mac.$i.len ; done
cat mac.0*.len > S2.len
for i in `cat trace.lst`; do sed -e s/S2.len/mac.$i.len/ < DEF > DEF.$i ; done
#split fa reads into 10mb chunks for blastz run and distribute to i-servers.
ssh kkr1u00
for i in `cat trace.lst`; do nice faSplit about loxodonta_africana.$i.fa 10000000 /iscratch/i/elephant/${i}.mac. ; done
cd /iscratch/i/elephant
find split -name \*.fa > /cluster/bluearc/elephant/mac.lst
cd /cluster/bluearc/elephant
hgsql hg17 -N < chromLen.sql > S1.len
cd /iscratch/i/elephant
iSync
#setup cluster run to blastz reads to human genome
ssh kk
cd /cluster/bluearc/elephant
BlastZ_run0.sh
cd run.0
para create jobList
para push
#94798 jobs in batch
#149 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 94797 of 94798 jobs
#Crashed: 1 jobs
#CPU time in finished jobs: 14183153s 236385.89m 3939.76h 164.16d 0.450 y
#IO & Wait Time: 310938s 5182.30m 86.37h 3.60d 0.010 y
#Average job time: 153s 2.55m 0.04h 0.00d
#Longest job: 1770s 29.50m 0.49h 0.02d
#Submission to last job: 52186s 869.77m 14.50h 0.60d
ssh kk9
BlastZ_run1.sh
cd run.1
para create jobList
para push
#341 jobs in batch
#151 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 341 of 341 jobs
#CPU time in finished jobs: 142914s 2381.91m 39.70h 1.65d 0.005 y
#IO & Wait Time: 14078s 234.63m 3.91h 0.16d 0.000 y
#Average job time: 460s 7.67m 0.13h 0.01d
#Longest job: 782s 13.03m 0.22h 0.01d
#Submission to last job: 1954s 32.57m 0.54h 0.02d
#generate lst and fa files for each chromosome for faster lavToAxt
cd /cluster/bluearc/elephant
echo "select chrom from chromInfo;" > chrom.sql
hgsql hg17 -B -N < chrom.sql > chrom.lst
for i in `cat chrom.lst` ; do grep -h '>' lav/$i/* | awk '{print $1}' | sed -e 's/"//g' | sed -e 's/>//g' > mac.$i.lst ; echo $i ; done
ssh kki
cd /cluster/bluearc/elephant
mkdir -p splitChrom
/bin/rm splitChrom/*
gensub2 trace.lst chrom.lst gsub.split spec.split
para create spec.split
para push
#322 jobs in batch
#Checking finished jobs
#Completed: 322 of 322 jobs
#CPU time in finished jobs: 819s 13.65m 0.23h 0.01d 0.000 y
#IO & Wait Time: 3278s 54.63m 0.91h 0.04d 0.000 y
#Average job time: 13s 0.21m 0.00h 0.00d
#Longest job: 51s 0.85m 0.01h 0.00d
#Submission to last job: 462s 7.70m 0.13h 0.01d
cd /cluster/bluearc/elephant/splitChrom
for i in `cat /cluster/bluearc/elephant/chrom.lst` ; do cat mac.*.$i.fa > mac.$i.fa ; echo $i ; done
#lav to axt run
ssh kk
cd /cluster/bluearc/elephant
mkdir -p run.2
#NOTE: chr19 must be run on kolossus with 64bit executables
#change SEQ1_DIR from /scratch to /iscratch for mini cluster
. DEF
echo "#LOOP" > run.2/gsub
echo '/cluster/bin/scripts/blastz-contiglav2axt '${BASE}'/lav/$(root1) {check out line+ '${BASE}'/axtChrom/$(root1).axt} '${SEQ1_DIR}' /cluster/bluearc/elephant/splitChrom/mac.$(root1).fa' >> run.2/gsub
echo "#ENDLOOP" >> run.2/gsub
cd run.2
gensub2 ../chrom.lst single gsub jobList
para create jobList
para push
#chrM has no data and crashed
#46 jobs in batch
#Checking finished jobs
#Completed: 45 of 46 jobs
#Crashed: 1 jobs
#CPU time in finished jobs: 249970s 4166.17m 69.44h 2.89d 0.008 y
#IO & Wait Time: 5407s 90.11m 1.50h 0.06d 0.000 y
#Average job time: 5675s 94.58m 1.58h 0.07d
#Longest job: 27065s 451.08m 7.52h 0.31d
#Submission to last job: 47744s 795.73m 13.26h 0.55d
#split reads by prefix so axtBest will fit in memory
mkdir axtByQ
cat mac*.lst | awk -F\| '{print substr($3,1,3)}' | sort -nu >prefix.lst
for i in `cat prefix.lst` ; do cat axtChrom/*.axt | axtFilter -qStartsWith=gnl\|ti\|$i stdin | axtSwap stdin S1.len S2.len stdout | axtSort stdin axtByQ/q$i.axt ; done
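# Read names look like gnl|ti|123456789; keying on the first three
# digits of the trace id (third |-separated field) splits the
# alignments into per-prefix files small enough for axtBest.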
mkdir axtByQBest
#lots of memory needed for reciprocal best
ssh kolossus
cd /cluster/bluearc/elephant/axtByQ
for i in `ls *.axt` ; do axtBest -quiet $i all stdout | axtSwap stdin ../S2.len ../S1.len ../axtByQBest/$i ; echo $i done ; done
cd /cluster/bluearc/elephant/axtByQBest
cat q*.axt | axtSplitByTarget stdin .
cd ..
mkdir axtRecipBest
for i in `cat chrom.lst` ; do axtSort axtByQBest/$i.axt stdout | axtBest stdin $i axtRecipBest/$i.axt ; echo $i ;done
for i in `cat chrom.lst` ; do axtToMaf axtRecipBest/$i.axt S1.len S2.len maf/$i.maf -tPrefix=hg17. -qPrefix=rm1. -scoreZero ; done
for i in `cat chrom.lst` ; do mafFilter -minScore=1000 maf/$i.maf > mafFilter/$i.maf ; done
# record coverage
cd mafFilter
for i in `cat ../chrom.lst` ; do nice mafCoverage hg17 $i.maf -count=2 > $i.cov ; echo done $i ; done
# CHIMP DELS FROM HG16 (DONE 2004-08-17 kate)
# NOTE: this track just for development -- it should be regenerated from the latest
# alignments instead of lifted.
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir -p chimpDels
cd chimpDels
hgsql -s hg16 -e "SELECT * FROM chimpDels" | cut -f 2- > chimpDels.hg16.bed
liftOver chimpDels.hg16.bed \
/cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
chimpDels.bed chimpDels.unmapped
wc -l chimpDels.bed chimpDels.unmapped
# 27662 chimpDels.bed
# 132 chimpDels.unmapped
# 27794 total
hgLoadBed hg17 chimpDels chimpDels.bed -noBin
### CREATE chimpFixedDiff -- panTro1 (Daryl, August 18, 2005)
# Convert chimp quality scores from uncompressed to compressed
# chromosome format. This took 22 minutes on crow.
## previously done for hg16
# cd /cluster/data/panTro1
# cat */chr*.qa | qaToQac stdin chrom.qac
# Make single base pair high quality differences into a bed file
# and load into database
cd /cluster/data/hg17/bed
mkdir chimpFixedDiff
cd chimpFixedDiff
sed 's/simpleNucDiff/chimpFixedDiff/' ~/kent/src/hg/lib/simpleNucDiff.sql > chimpFixedDiffs.sql
# chimpHiQualDiffs was changed to allow different
# quality parameters as command line options
set axtDir = /cluster/data/hg17/bed/blastz.panTro1/axtRBestNet
# This crashed twice at the same place, but ran successfully when
# each chromosome was run separately.
## time chimpFixedDiffs /$axtDir /cluster/data/panTro1/chrom.qac chimpFixedDiffs.bed >& chimpFixedDiffs.log
mkdir chroms; cd chroms
ls -1 $axtDir | grep chr | grep axt | sed 's/.axt//' | xargs mkdir
# rmdir chr*random
touch cfd.log
foreach f (chr*)
echo -n $f " "
ln -s $axtDir/$f.axt $f/$f.axt
time nice chimpFixedDiffs $f /cluster/data/panTro1/chrom.qac $f.chimpFixedDiffs.bed >>& cfd.log
end
rm ../chimpFixedDiffs.bed
cat chr*bed > ../chimpFixedDiffs.bed
## The load (sort) ran out of memory on hgwdev, so I sorted the
## file first on kolossus (3 minutes) and then loaded it on hgwdev
ssh kolossus
hgLoadBed -strict -sqlTable=chimpFixedDiffs.sql -noLoad hg17 chimpFixedDiff chimpFixedDiffs.bed
exit
## hgwdev (37 minutes)
hgLoadBed -hasBin -noSort -sqlTable=chimpFixedDiffs.sql hg17 chimpFixedDiff bed.tab
# TODO: need to filter out polymorphic sites (SNPs)
# Load firstEF track (DONE 2004-08-18 braney)
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/firstEF
cd /cluster/data/hg17/bed/firstEF
wget "http://bioinformatics.med.ohio-state.edu/downloads/firstEFMay04.bed.gz"
cat << '_EOF_' > sedScript
s/chr23/chrX/g
s/chr24/chrY/g
/^>/d
/^$/d
/^No/d
'_EOF_'
zcat firstEFMay04.bed.gz | sed -f sedScript | awk "{OFS=\"\t\"} {\$3 +=1; print \$0}" > firstEF.bed
hgLoadBed hg17 firstEF firstEF.bed
rm firstEF.tab
gzip *.bed
#done firstEF
# GENE BOUNDS (RNACLUSTER) (DONE 08-18-2004 Chuck)
# Create rnaCluster table (depends on {est,mrna}OrientInfo)
cd ~sugnet/store1/altSplice/hg17/
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Create a list of accessions that come from RAGE libraries and need to be excluded.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg17 rage.libs
foreach f (/cluster/data/hg17/nib/chr*.nib)
set c = $f:t:r
set out = chrom/$c.bed
# Exclude accessions in the RAGE file
echo clusterRna -mrnaExclude=hg17.rage.libs hg17 /dev/null $out -chrom=$c
clusterRna -mrnaExclude=hg17.rage.libs hg17 /dev/null $out -chrom=$c
end
hgLoadBed hg17 rnaCluster chrom/*.bed
mkdir /cluster/data/hg17/bed/rnaCluster
cp -r chrom /cluster/data/hg17/bed/rnaCluster
# miRNA track (CORRECTION 2004-12-09 - Hiram)
# Received the following correction from Michel Weber:
# Could you please replace the two lines:
# chr6 72169974 72170045 hsa-mir-30a 480 - 72169977 72169999
# chr6 72170017 72170040 hsa-mir-30a-5p 480 - 72170017 72170040
# by:
# chr6 72169974 72170045 hsa-mir-30a 480 - 72169977 72169999
# chr6 72169974 72170045 hsa-mir-30a 480 - 72170017 72170040
# (The first line remains identical, only the second is changed. The
# repetition of the hsa-mir-30a entry means that both strands of its
# hairpin structure are matured into microRNAs, named hsa-miR-30a-3p and
# hsa-miR-30a-5p in Rfam database).
ssh hgwdev
cd /cluster/data/hg17/bed/miRNA
mv miRNA_hg17_1.bed miRNA_hg17_1.bed.0
cp miRNA_hg17_1.bed.0 miRNA_hg17_1.bed
# edit miRNA_hg17_1.bed to change the single line. Then:
mv hg17.bed hg17.bed.0
egrep -v "^track |^browser " miRNA_hg17_1.bed | \
sed -e "s/miR/mir/g; s/ sa-mir/ hsa-mir/g; s/ /\t/g;" > hg17.bed
# Check that the edit is in place properly:
diff hg17.bed.0 hg17.bed
# and load it
hgLoadBed hg17 miRNA hg17.bed
# Loaded 221 elements of size 8
# featureBits remains the same:
featureBits hg17 miRNA
# 18052 bases of 2866216770 (0.001%) in intersection
# miRNA track (DONE 2004-09-03 - Hiram)(CORRECTED, see above 2004-12-09)
# The source data for this was received via email from Sam
# Griffiths-Jones to Donna 16 August 2004. In other email Michel
# Weber asked to add one more data line to that file.
# data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
# and Michel.Weber@ibcg.biotoul.fr
# notify them if this assembly updates to renew this track
cd /cluster/data/hg17/bed
mkdir miRNA
cd miRNA
# one name was missing the h in hsa-mir and one was miR instead of
# mir
egrep -v "^track |^browser " miRNA_hg17_1.bed | \
sed -e "s/miR/mir/g; s/ sa-mir/ hsa-mir/g; s/ /\t/g;" > hg17.bed
hgLoadBed hg17 miRNA hg17.bed
# compare with previous results, should be relatively similar
# featureBits hg16 miRNA
# 16923 bases of 2865248791 (0.001%) in intersection
# featureBits hg17 miRNA
# 18052 bases of 2866216770 (0.001%) in intersection
# entry is already in trackDb/trackDb.ra
## blastz mRNA track for internal use - Robert 8/12/04
mkdir /cluster/bluearc/hg17/mrnaBlastz
cd /cluster/bluearc/hg17/mrnaBlastz
/cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank genbank mrna mrna.fa -db=hg -native
mkdir -p split
faTrimPolyA mrna.fa trim.fa
faSplit about trim.fa 1000000 split/mrna
cp -ip trim.fa /panfs/ucsc.edu/home/scratch/hg17/mrnaBlastz
faSize trim.fa -detailed=on > S2.len
hgsql hg17 < chromInfo.sql > S1.len
BlastZ_run0.sh
cd run.0
para push
para time
#113894 jobs in batch
#207911 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 113894 of 113894 jobs
#CPU time in finished jobs: 14423845s 240397.41m 4006.62h 166.94d 0.457 y
#IO & Wait Time: 334352s 5572.54m 92.88h 3.87d 0.011 y
#Average job time: 130s 2.16m 0.04h 0.00d
#Longest job: 38301s 638.35m 10.64h 0.44d
#Submission to last job: 59841s 997.35m 16.62h 0.69d
mkdir run.1
~angie/hummus/do.out2lav DEF > run.1/j
cd run.1
para create j
para push
para time
#341 jobs in batch
#208550 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 341 of 341 jobs
#CPU time in finished jobs: 28990s 483.17m 8.05h 0.34d 0.001 y
#IO & Wait Time: 43139s 718.98m 11.98h 0.50d 0.001 y
#Average job time: 212s 3.53m 0.06h 0.00d
#Longest job: 2015s 33.58m 0.56h 0.02d
#Submission to last job: 2187s 36.45m 0.61h 0.03d
#!/bin/tcsh
set base="/cluster/bluearc/hg17/mrnaBlastz"
cd $base
mkdir -p pslRaw
foreach c (lav/*)
pushd $c
set chr=$c:t
set out=$base/pslRaw/$chr.psl
echo "Translating $chr lav to $out"
cat `ls -1 *.lav | sort -g` \
| lavToPsl stdin stdout \
| sed -e 's@scratch/hg/gs.18/build35/bothMaskedNibs//@@' | sed -e 's/\.nib:[0-9]*-[0-9]*//' > $out
popd
end
mkdir run.2
for i in `awk '{print $1}' S1.len` ; do echo doSortFilter.sh ../pslRaw/$i.psl ../pslFilter/$i.psl >> run.2/spec.dup ; done
cd run.2
para create spec.dup
para push
#46 jobs in batch
#3 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 46 of 46 jobs
#CPU time in finished jobs: 4409s 73.48m 1.22h 0.05d 0.000 y
#IO & Wait Time: 1082s 18.04m 0.30h 0.01d 0.000 y
#Average job time: 119s 1.99m 0.03h 0.00d
#Longest job: 3842s 64.03m 1.07h 0.04d
#Submission to last job: 3842s 64.03m 1.07h 0.04d
cd ..
for i in `awk '{print $1}' S1.len` ; do echo axtChain -linearGap=linearGap.txt -psl pslFilter/$i.psl /scratch/hg/gs.18/build35/bothMaskedNibs/ -faQ /panfs/ucsc.edu/home/scratch/hg17/mrnaBlastz/trim.fa chain/$i.chain >> spec.chain ; done
para create spec.chain
para push
cd run.3
para create spec.filter
para push
cd ..
ls /cluster/data/hg17/nib/*.nib > S1.lst
#Skip chainPreNet it is not good for mrna
#mkdir -p preNet
#
#cd chainFilter
#foreach i ( *.chain)
#chainPreNet $i ../S1.len ../S2.len ../preNet/$i
#end
mkdir run.4
cd run.4
for i in `awk '{print $1}' ../S1.len`; do echo "chainToPsl ../chainFilter/$i.chain ../S1.len ../S2.len ../S1.lst /panfs/ucsc.edu/home/scratch/hg17/mrnaBlastz/trim.fa ../psl/$i.psl" >> spec.chain2psl.new ; done
pslCat psl/*psl > mrnaBlastz.psl
hgLoadPsl hg17 mrnaBlastz.psl
cp trim.fa /cluster/data/hg17/bed/mrnaBlastz/hg17Mrna.fa
ln /cluster/data/hg17/bed/mrnaBlastz/hg17Mrna.fa /gbdb/hg17/mrnaBlastz/ -s
hgLoadSeq -prefix=bz hg17 /gbdb/hg17/mrnaBlastz/hg17Mrna.fa
## end of blastz Mrna track
#### BUILD RETROGENE TRACK (done Robert 8/26/2004)
#### REBUILD RETROGENE TRACK (done Robert 12/24/2004 - but no notes - kuhn)
# diffs before push to beta:
# 1640 hg17.pseudoGeneLink.devOnly
# 9639 hg17.pseudoGeneLink.betaOnly
# 15091 hg17.pseudoGeneLink.common
# RETROGENE TRACK data update - Robert - 2005-04-08
# (added by Jen 2006-01-31)
# - pushQ entry did not include pseudoMrna table. Old table is still
#   present on RR. New data has since been lost on dev.
#   User impact: ~1000 sequences missing links in browser
# - new all.joiner rule needed to link pseudoMrna to pseudoGeneLink table
# - current all.joiner rule between knownGene and pseudoGeneLink gives errors.
#   the data types appear to be mismatched. pseudoGeneLink.kgName is
#   a gene symbol, not the same identifier as in knownGene.name
# - data is to be regenerated soon and errors corrected at that time
mkdir /cluster/data/hg17/bed/pseudo
cd /cluster/data/hg17/bed/pseudo
ls /cluster/data/hg17/nib/*.nib > S1.lst
hgsql hg17 -N -B < allMrna.sql > allMrna.lst
cp /cluster/data/genbank/data/aligned/genbank.142.0/hg17/full/mrna.native.psl.gz .
gunzip mrna.native.psl.gz
awk '{OFS="\t";print $1,$2,$3,$4,$5,$6,$7,$8,$9,substr($10,1,index($10,".")-1),$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23}' mrna.native.rawPsl > mrnaBlat.psl
hgsql hg17 -N -B < refGene.sql > refGene.tab
hgsql hg17 -B -N < mgcGene.sql > mgcGene.tab
cat ../../*/*.fa.out | awk '$5~/^chr/{OFS="\t";print $5,$6,$7}' >rmsk.bed
cd /cluster/bluearc/hg17/mrnaBlastz/
zcat /cluster/data/hg17/bed/blastz.mm5/axtChain/mouseSyn.net.gz | netToBed stdin mouseSyn.bed
hgsql hg17 < mrna.sql | grep -v matches | awk '{OFS="\t"; print $2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22}' > all_mrna.psl
ssh eieio
pslCat -nohead -check all_mrna.psl /cluster/bluearc/hg17/mrnaBlastz/psl/*.psl |awk '{print $0, $1*3-$2}' | sort -k 10,10 -k 22nr -T /tmp | awk '{OFS="\t"; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21}' > blatBlastzHg17.psl
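# The first awk appends a crude score column (3*matches - misMatches,
# psl fields $1 and $2) so alignments sort best-first within each
# qName group (-k 10,10 -k 22nr); the second awk strips the extra
# column back off.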
ssh hgwdev
cp blatBlastzHg17.psl /scratch/
tawk '$12 > 1 && $12 < 9999999{x=$11;$11=$12;$12=x;print $0}' /cluster/data/kgDB/bed/hg17/kgBestMrna/sortedKnownGene.tab > sortedKnownGene.tab
#copy files to iServers for cluster run
ssh kkr1u00
/cluster/home/baertsch/bin/i386/pslSplit nohead -chunkSize=121 /iscratch/i/gs.18/build35/pseudo blatBlastzHg17.psl
cd /cluster/data/hg17/bed/pseudo
cp refGene.tab /iscratch/i/gs.18/build35/pseudo
cp /cluster/data/hg17/bed/simpleRepeat/simpleRepeat.bed /iscratch/i/gs.18/build35/pseudo
cp mrnaHg17.fa /iscratch/i/gs.18/build35/pseudo
cp sortedKnownGene.tab /iscratch/i/gs.18/build35/pseudo
cp rmsk.bed /iscratch/i/gs.18/build35/pseudo
cp all_mrna.psl /iscratch/i/gs.18/build35/pseudo
cp mouseSyn.bed /iscratch/i/gs.18/build35/pseudo
for i in `ls tmp*` ; do echo "doBuildkk.sh ${i%%.psl}" ; done | sed -e 's/tmp//g' > ~/hg17/pseudo/spec.kk
cd /iscratch/i/gs.18/build35/pseudo
iSync
para create spec.kk
para push
#post process
# run from eieio
BLUE=/cluster/bluearc/hg17/pseudo
echo catting output
cat $BLUE/pseudoGeneLink[0-9]*.bed | sort -k1,1 -k2,3n >pseudoGeneLinkSort.bed ; /bin/rm $BLUE/pseudoGeneLink[0-9]*.bed
cat $BLUE/pseudo[0-9]*.psl > pseudo.psl ; /bin/rm $BLUE/pseudo[0-9]*.psl &
echo Filtering pseudoGeneLinkSort.bed
tawk '$5 > 10 && $15 > 10000 && $35 > 650 {OFS="\t";print $0}' pseudoGeneLinkSort.bed > pseudoGeneLinkSortFilter.bed
echo Removing Overlaps
doSplit
cd /cluster/bluearc/hg17/pseudo/run.o
spec.overlap
cd ~/hg17/pseudo
cat /cluster/bluearc/hg17/pseudo/chr*pseudoNoOverlap.bed > pseudoGeneLinkNoOverlap.bed
echo Making psl
awk '{printf("%s\t%s\t%s\n", $5,$2,$3)}' pseudoGeneLinkNoOverlap.bed > pseudoGeneLinkSelect.tab
## 350 is the sacred magic number and will probably change
tawk '$6>=350{print $0}' pseudoGeneLinkNoOverlap.bed > pseudoGeneLinkNoOverlapFilter.bed
pslSelect -qtStart=pseudoGeneLinkSelect.tab pseudo.psl pseudoMrna.psl
echo Loading Bed
hgLoadBed hg17 pseudoGeneLink pseudoGeneLinkNoOverlapFilter.bed -hasBin -sqlTable=/cluster/home/baertsch/kent/src/hg/lib/pseudoGeneLink.sql
echo Loading Psl
hgLoadPsl hg17 pseudoMrna.psl
## end of retroGene track
# 3-WAY MULTIZ MULTIPLE ALIGNMENT (MM5, RN3) (DONE 2004-08-27 kate)
# HMR Maf's needed for regulatory potential track
ssh eieio
set multizDir = multiz.2004-08-27
set workingDir = /cluster/bluearc/hg17/$multizDir
mkdir -p $workingDir
mkdir -p /cluster/data/hg17/bed/$multizDir
cd /cluster/data/hg17/bed
ln -s $workingDir /cluster/bluearc/hg17/multiz3way
ln -s $multizDir multiz3way
cd $multizDir
# wrapper script for multiz
# NOTE: first arg is pairwise, 2nd arg is multiple (to add to)
# NOTE: next time, modify script so it only needs one arg -- saves the
# multiple dirname in a file for use by the next run
cat << 'EOF' > doMultiz.csh
#!/bin/csh -fe
mkdir -p $3:h
/cluster/bin/penn/multiz $1 $2 - > $3
'EOF'
# << for emacs
cat << 'EOF' > gsub
#LOOP
../doMultiz.csh {check in line /cluster/bluearc/hg17/multiz.2004-08-27/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/hg17/multiz.2004-08-27/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/hg17/multiz.2004-08-27/$(root1)$(dir1)/$(root2).maf}
#ENDLOOP
'EOF'
# << for emacs
chmod +x doMultiz.csh
ssh eieio
set workingDir = /cluster/bluearc/hg17/multiz.2004-08-27
# copy mafs to bluearc -- mouse
mkdir $workingDir/mm5
cp /cluster/data/hg17/bed/blastz.mm5/mafNet/*.maf \
$workingDir/mm5
ls $workingDir/mm5/*.maf > chrom.lst
# rat
mkdir $workingDir/rn3
cp /cluster/data/hg17/bed/blastz.rn3/mafNet/chr*.maf $workingDir/rn3
# multiz - add in rn3 rat to human/mouse
#
ssh kki
set multizDir = multiz.2004-08-27
set workingDir = /cluster/bluearc/hg17/$multizDir
cd /cluster/data/hg17/bed/$multizDir
mkdir run.rn3
cd run.rn3
echo "rn3/mm5" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 47 jobs
para try, check, push, check
# copy 3-way mafs to build directory
ssh eieio
set multizDir = multiz.2004-08-27
set workingDir = /cluster/bluearc/hg17/$multizDir
ln -s $workingDir/mm5rn3 $workingDir/maf
cd /cluster/data/hg17/bed/multiz.2004-08-27
mkdir maf
cp $workingDir/maf/*.maf maf
# BLASTZ TETRAODON (tetNig1) (DONE, 2004-08-26, hartera)
# blastz requires lineage-specific repeats
# Treat all repeats as lineage-specific.
ssh kkr1u00
mkdir /iscratch/i/gs.18/build35/linSpecRep.notInTetraodon
foreach f (/iscratch/i/gs.18/build35/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/gs.18/build35/linSpecRep.notInTetraodon/$f:t:r:r.out.spec
end
mkdir /iscratch/i/tetNig1/linSpecRep.notInHuman
foreach f (/iscratch/i/tetNig1/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/tetNig1/linSpecRep.notInHuman/$f:t:r:r.out.spec
end
iSync
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.tetNig1.2004-08-20
ln -s /cluster/data/hg17/bed/blastz.tetNig1.2004-08-20 \
/cluster/data/hg17/bed/blastz.tetNig1
cd /cluster/data/hg17/bed/blastz.tetNig1
# abridge repeats.
# Treat all repeats as lineage-specific.
cat << '_EOF_' > DEF
# human vs. Tetraodon
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
# use same parameters as for danRer1-fr1
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/hg17/linSpecRep.notInTetraodon
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tetraodon
SEQ2_DIR=/iscratch/i/tetNig1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/tetNig1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.tetNig1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
# save the DEF file in the current standard place
chmod +x DEF
cp DEF ~angie/hummus/DEF.hg17-tetNig1.2004-08-20
# make sure BlastZ_run0.sh, BlastZ_run1.sh and BlastZ_run2.sh scripts
# are in /cluster/data/hg17/jkStuff
# edit BlastZ_run0.sh so directory for blastz is /cluster/bin/penn
bash # if a csh/tcsh user
. ./DEF
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
# check batch looks ok then
para try, check, push, check, ....
# para time
# Completed: 19437 of 19437 jobs
# CPU time in finished jobs: 3225816s 53763.60m 896.06h 37.34d 0.102 y
# IO & Wait Time: 174096s 2901.60m 48.36h 2.01d 0.006 y
# Average job time: 175s 2.92m 0.05h 0.00d
# Longest job: 709s 11.82m 0.20h 0.01d
# Submission to last job: 5324s 88.73m 1.48h 0.06d
# second cluster run: lift raw alignments -> lav dir
ssh kki
cd /cluster/data/hg17/bed/blastz.tetNig1
bash # if a csh/tcsh user
. ./DEF
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, check etc.
# para time
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 280s 4.66m 0.08h 0.00d 0.000 y
# IO & Wait Time: 2183s 36.39m 0.61h 0.03d 0.000 y
# Average job time: 7s 0.12m 0.00h 0.00d
# Longest job: 41s 0.68m 0.01h 0.00d
# Submission to last job: 469s 7.82m 0.13h 0.01d
# third run: lav -> axt
ssh kki
cd /cluster/data/hg17/bed/blastz.tetNig1
mkdir axtChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/tetNig1/nib stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg17/bed/blastz.tetNig1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
\ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
para try, check, push, check,...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 52s 0.87m 0.01h 0.00d 0.000 y
# IO & Wait Time: 256s 4.27m 0.07h 0.00d 0.000 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 36s 0.60m 0.01h 0.00d
# Submission to last job: 275s 4.58m 0.08h 0.00d
# one job crashed because chr6_hla_hap1.axt is empty. Checked by running
# this again and then looking at the lav file, which has no alignments in it.
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/hg17/bed/blastz.tetNig1
mkdir -p pslChrom
set tbl = "blastzTetNig1"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.tetNig1/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 $f
echo "$f Done"
end
# original blastzTetNig1:
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# BLASTZ_ABRIDGE_REPEATS=1
# featureBits -chrom=chr1 hg17 blastzTetNig1
# 6378680 bases of 222827847 (2.863%) in intersection
# featureBits -chrom=chr1 hg17 refGene:cds blastzTetNig1 -enrichment
# refGene:cds 1.246%, blastzTetNig1 2.863%, both 0.856%, cover 68.70%,
# enrich 24.00x
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer1 -enrichment
# refGene:cds 1.246%, blastzDanRer1 3.934%, both 0.831%, cover 66.72%,
# enrich 16.96x
# comparable to zebrafish so good
# try same parameters with L=8000
# featureBits -chrom=chr1 hg17 refGene:cds blastzTetNig1L8k -enrichment
# refGene:cds 1.246%, blastzTetNig1L8k 2.095%, both 0.753%, cover 60.47%,
# enrich 28.87x
# load chr1 with blastz using just H=2000 and default parameters
# featureBits -chrom=chr1 hg17 refGene:cds blastzTetNig1Default -enrichment
# refGene:cds 1.246%, blastzTetNig1Default 1.630%, both 0.808%, cover 64.87%,
# enrich 39.80x
# rows in chr1_blastzTetNig1 tables
# blastzTetNig1 95156
# blastzTetNig1L8k 58015
# blastzTetNig1Default 71342
# The default values also used for danRer1 vs fugu give good coverage and
# higher enrichment than blastzTetNig1 with fewer alignments, so this will
# be used for the blastz track - now called blastzTetNig1.
# CHAIN TETRAODON (tetNig1) BLASTZ (DONE, 2004-08-26, hartera)
# Make chains with rescored blastz
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.tetNig1
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.tetNig1/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Make our own linear gap file with reduced gap penalties,
# in hopes of getting longer chains - works well for species at
# chicken-human distance or greater
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line makes emacs coloring happy
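# In this table the position row gives gap-length breakpoints, and
# qGap/tGap/bothGap give the gap penalties at each breakpoint; the
# penalties grow slowly for long gaps, which is what encourages the
# longer chains mentioned above.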
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -linearGap=../../chickenHumanTuned.gap $1 \
/iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/tetNig1/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 553s 9.22m 0.15h 0.01d 0.000 y
# IO & Wait Time: 102s 1.69m 0.03h 0.00d 0.000 y
# Average job time: 15s 0.24m 0.00h 0.00d
# Longest job: 56s 0.93m 0.02h 0.00d
# Submission to last job: 985s 16.42m 0.27h 0.01d
# one job crashed since chr6_hla_hap1.axt is empty - no alignments
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist5000.out
textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
echo ""
end
# only chr19 has a very large number of chains with score < 5000
# load chr 1 into table
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain/chain
hgLoadChain hg17 chr1_chainTetnig1 chr1.chain
# featureBits -chrom=chr1 hg17 refGene:cds chainTetnig1Link -enrichment
# refGene:cds 1.246%, chainTetnig1Link 1.582%, both 0.805%, cover 64.59%,
# enrich 40.83x
# try filtering with minScore of 5000
ssh kksilo
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
rm -r chain
chainSplit chain all.chain
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain/chain
hgLoadChain hg17 chr1_chainTetNig1Filt5k chr1.chain
# featureBits -chrom=chr1 hg17 refGene:cds chainTetNig1Filt5kLink -enrichment
# refGene:cds 1.246%, chainTetNig1Filt5kLink 1.487%, both 0.789%, cover 63.33%,
# enrich 42.58x
# this cleans it up a lot with little reduction in coverage.
# check in browser - filtered version looks good.
# add all chains for minScore=5000 filtered chains
# remove test chain tables for chr1
ssh hgwdev
hgsql -e "drop table chr1_chainTetnig1;" hg17
hgsql -e "drop table chr1_chainTetnig1Link;" hg17
hgsql -e "drop table chr1_chainTetNig1Filt5k;" hg17
hgsql -e "drop table chr1_chainTetNig1Filt5kLink;" hg17
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg17 ${c}_chainTetNig1 $i
echo done $c
end
# NET TETRAODON (tetNig1) BLASTZ (DONE, 2004-08-26, hartera)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 55373824, utime 415 s/100, stime 45
# Add classification info using db tables:
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
# netClass looks for ancient repeats in one of the databases
# hg17 has this table - hand-curated by Arian but this is for
# human-rodent comparisons so do not use here, use -noAr option
mkdir -p /cluster/bluearc/hg17/linSpecRep.notInTetraodon
mkdir -p /cluster/bluearc/tetNig1/linSpecRep.notInHuman
cp /iscratch/i/hg17/linSpecRep.notInTetraodon/* \
/cluster/bluearc/hg17/linSpecRep.notInTetraodon
cp /iscratch/i/tetNig1/linSpecRep.notInHuman/* \
/cluster/bluearc/tetNig1/linSpecRep.notInHuman
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
time netClass noClass.net hg17 tetNig1 tetNig1.net \
-tNewR=/cluster/bluearc/hg17/linSpecRep.notInTetraodon \
-qNewR=/cluster/bluearc/tetNig1/linSpecRep.notInHuman -noAr
# 54.100u 31.890s 2:20.01 61.4% 0+0k 0+0io 197pf+0w
netFilter -minGap=10 tetNig1.net | hgLoadNet hg17 netTetNig1 stdin
# featureBits hg17 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.978%, netTetNig1 25.095%, both 0.778%, cover 79.53%,
# enrich 3.17x
# TWINSCAN 1.3 GENE PREDICTIONS (Done, 2004-Aug-26, heather)
cd /cluster/data/hg17/bed
mkdir twinscan
tarFile=hg17_TS13_pseudomasked.tar.gz
wget http://genes.cs.wustl.edu/predictions/human/NCBI35/hg17_TS13_pseudomasked.tar.gz
wget http://genes.cs.wustl.edu/predictions/human/NCBI35/md5sum.txt
# check file transferred correctly
grep gz md5sum.txt > gz.sum
md5sum $tarFile | diff - gz.sum
# extract
tar xvfz $tarFile
unset tarFile
# check that files unzipped and untarred correctly
# expect no differences
cd chr_gtf
grep gtf ../md5sum.txt > md5sum.txt
cd ../chr_ptx
grep ptx ../md5sum.txt > md5sum.txt
cd ../chr_tx
grep tx ../md5sum.txt > md5sum.txt
cd ..
md5sum chr_gtf/* > gtf.sum
diff gtf.sum chr_gtf/md5sum.txt
md5sum chr_ptx/* > ptx.sum
diff ptx.sum chr_ptx/md5sum.txt
md5sum chr_tx/* > tx.sum
diff tx.sum chr_tx/md5sum.txt
# pare down protein FASTA header to id and add missing .a:
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo chr$c
perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
end
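# e.g. a header like ">chr1.1 some description" would become
# ">chr1.1.a" (id shown here is illustrative), matching the
# transcript_id values in the gtf.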
ldHgGene hg17 twinscan chr_gtf/chr*.gtf -gtf -genePredExt
hgPepPred hg17 generic twinscanPep chr_ptx/chr*-fixed.fa
# MAKE VSTETNIG1 DOWNLOADABLES (DONE, 2004-09-08, hartera)
# Replace with gzipped versions (DONE 2004-09-14 kate)
ssh kksilo
# zip chains and nets
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
cp all.chain tetNig1.chain
zip -j /cluster/data/hg17/zip/tetNig1.chain.zip tetNig1.chain
rm tetNig1.chain
zip -j /cluster/data/hg17/zip/tetNig1.net.zip tetNig1.net
ssh hgwdev
# copy chains and nets to downloads area
set gp = /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p $gp/vsTetNig1
cd $gp/vsTetNig1
mv /cluster/data/hg17/zip/tetNig1*.zip .
md5sum *.zip > md5sum.txt
# move axt files to downloads area and zip
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChrom
mkdir -p $gp/vsTetNig1/axtChrom
cp -p *.axt $gp/vsTetNig1/axtChrom
cd $gp/vsTetNig1/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# BLASTZ TETRAODON (tetNig1) CLEANUP (DONE, 2004-09-10, hartera)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.tetNig1
nice rm -rf raw &
nice rm -rf lav &
nice rm axtChain/run1/chain/* &
nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} &
# regulatory potential 2X track (WORKING - 2004-09-14 - Hiram)
ssh eieio
mkdir /cluster/store3/gs.18/build35/bed/regPotential2X
mkdir /cluster/store3/gs.18/build35/bed/regPotential3X
cd /cluster/data/hg17/bed
ln -s /cluster/store3/gs.18/build35/bed/regPotential2X .
ln -s /cluster/store3/gs.18/build35/bed/regPotential3X .
cd regPotential2X
    wget --timestamping 'http://www.bx.psu.edu/~james/stuff/rp_kit.tgz'
tar xvzf rp_kit.tgz
    # fixup the hmr_rp_score.sh and hm_rp_score.sh scripts to change
    # RP_DIR=. to read: RP_DIR=/cluster/data/hg17/bed/regPotential2X/rp_kit
    # and to fix the usage of SHIFT and WINDOW; the following diff shows
    # the changes (a sed sketch follows the diff):
# 5c5
# < RP_DIR=/cluster/data/hg17/bed/regPotential2X/rp_kit
# ---
# > RP_DIR=.
# 8,9c8,9
# < MAPPING=rp_kit/hm_5a_mapping.txt
# < MATRIX=rp_kit/hm_5a+3_scoreMatrix.dat
# ---
# > MAPPING=hm_5a_mapping.txt
# > MATRIX=hm_5a+3_scoreMatrix.dat
# 12c12
# < SHIFT=1
# ---
# > SHIFT=5
# 24,25c24,25
# < --shiftAmount $SHIFT \
# < --windowSize $WINDOW \
# ---
# > --shiftAmount 5 \
# > --windowSize 100 \
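    # A sketch of applying those edits to hm_rp_score.sh with sed (assumes
    # GNU sed's -i; hand-editing works just as well, and hmr_rp_score.sh
    # needs the same treatment, though its MAPPING/MATRIX file names differ):
    sed -i.orig \
        -e 's#^RP_DIR=.*#RP_DIR=/cluster/data/hg17/bed/regPotential2X/rp_kit#' \
        -e 's#^MAPPING=#MAPPING=rp_kit/#' \
        -e 's#^MATRIX=#MATRIX=rp_kit/#' \
        -e 's/^SHIFT=5/SHIFT=1/' \
        -e 's/--shiftAmount 5/--shiftAmount $SHIFT/' \
        -e 's/--windowSize 100/--windowSize $WINDOW/' \
        rp_kit/hm_rp_score.sh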
mkdir maf
for A in `(cd /cluster/data/hg17/bed/blastz.mm5/axtNet; ls chr*.axt)`
do
C=${A/.axt}
echo "/cluster/data/hg17/bed/blastz.mm5/axtNet/${A} -> maf/${C}.maf.gz"
axtToMaf /cluster/data/hg17/bed/blastz.mm5/axtNet/${A} \
/cluster/data/hg17/chrom.sizes /cluster/data/mm5/chrom.sizes \
stdout | gzip > maf/${C}.maf.gz
done
# Replace bad chr5 axtNet and mafNet (2006-01-05 kate)
# a valid java runtime is only on hgwdev. This is a java procedure
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential2X
mkdir rp_scores
# WARNING - the following loop takes almost 12 hours !
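    # Note on the redirection below: hm_rp_score.sh takes input and output
    # file arguments; with /dev/stderr as the output file, the subshell's
    # "2>&1 >/dev/null" routes the scores into the pipe while the script's
    # ordinary stdout is discarded.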
for M in maf/chr*.maf.gz
do
C=${M/.maf.gz}
C=${C#maf/}
echo "$M -> rp_scores/$C.score.gz"
(zcat ${M} | ./rp_kit/hm_rp_score.sh /dev/stdin /dev/stderr 2>&1 >/dev/null) | sort -n | \
gzip > rp_scores/${C}.score.gz
done
# real 709m55.805s
# user 754m51.030s
# sys 20m11.000s
# Back to the file server to create the wiggle data
ssh eieio
cd /cluster/data/hg17/bed/regPotential2X
mkdir wigData dataLimits
for S in rp_scores/chr*.score.gz
do
C=${S/.score.gz}
C=${C#rp_scores/}
echo "$S -> wigData/$C.wig"
zcat $S | sort -n | \
wigAsciiToBinary -chrom=$C -dataSpan=1 \
-wibFile=wigData/$C stdin 2> dataLimits/$C.limits
done
# real 313m0.567s
# user 285m37.319s
# sys 23m8.301s
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential2X/wigData
mkdir /gbdb/hg17/wib/regPotential2X
ln -s `pwd`/*.wib /gbdb/hg17/wib/regPotential2X
time hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/regPotential2X \
hg17 regPotential2X chr*.wig
# real 2m29.668s
# user 0m33.380s
# sys 0m8.200s
# regulatory potential 3X track (WORKING - 2004-09-14 - Hiram)
# Expects groundwork done above in the 2X track
# a valid java runtime is only on hgwdev. This is a java procedure
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential3X
ln -s ../regPotential2X/rp_kit/hmr_rp_score.sh .
mkdir rp_scores
# WARNING - the following loop takes almost 12 hours !
for M in maf/chr*.maf.gz
do
C=${M/.maf.gz}
C=${C#maf/}
echo "$M -> rp_scores/$C.score.gz"
    (zcat ${M} | ./hmr_rp_score.sh /dev/stdin /dev/stderr 2>&1 >/dev/null) | sort -n | \
gzip > rp_scores/${C}.score.gz
done
# real 613m8.230s
# user 623m7.110s
# sys 20m24.550s
# Back to the file server to create the wiggle data
ssh eieio
cd /cluster/data/hg17/bed/regPotential3X
mkdir wigData dataLimits
for S in rp_scores/chr*.score.gz
do
C=${S/.score.gz}
C=${C#rp_scores/}
echo "$S -> wigData/$C.wig"
zcat $S | sort -n | \
wigAsciiToBinary -chrom=$C -dataSpan=1 \
-wibFile=wigData/$C stdin 2> dataLimits/$C.limits
done
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential3X/wigData
mkdir /gbdb/hg17/wib/regPotential3X
ln -s `pwd`/*.wib /gbdb/hg17/wib/regPotential3X
time hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/regPotential3X \
hg17 regPotential3X chr*.wig
# real 1m45.568s
# user 0m32.740s
# sys 0m6.140s
# regulatory potential 5X track (DONE - 2005-09-19 - Daryl)
ssh kkstore02
mkdir -p /cluster/data/hg17/bed/regPotential5X/rp_scores
cd /cluster/data/hg17/bed/regPotential5X/rp_scores
wget -r -l 1 -nH http://www.bx.psu.edu/~james/rp/hg17panTro1mm5rn3canFam1/all_truncate.tar
tar xvf all_truncate.tar
cd /cluster/data/hg17/bed/regPotential5X
mkdir -p wigData dataLimits
cd wigData
## 8 minutes
for S in ../rp_scores/chr*.scores.truncated.gz
do
C=${S/.scores.truncated.gz}
C=${C#../rp_scores/}
echo "$S -> wigData/$C.wig"
zcat $S | wigEncode stdin $C.wig $C.wib 2> ../dataLimits/$C.limits
done
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential5X/wigData
mkdir -p /gbdb/hg17/wib/regPotential5X
chmod o+rx /gbdb/hg17/wib/regPotential5X
ln -s /cluster/data/hg17/bed/regPotential5X/wigData/*.wib /gbdb/hg17/wib/regPotential5X
chmod o+r /gbdb/hg17/wib/regPotential5X/ch*wib
time hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/regPotential5X hg17 regPotential5X chr*.wig
# 57.720u 9.960s 2:26.05 46.3% 0+0k 0+0io 213pf+0w
# SGP GENES (DONE 9/17/04 angie)
ssh eieio
mkdir /cluster/data/hg17/bed/sgp
cd /cluster/data/hg17/bed/sgp
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/SGP/$chr.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200405/SGP/$chr.prot
end
# Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
cp /dev/null sgpPep.fa
foreach f (chr*.prot)
perl -wpe 's/^(>chr\S+)/$1.1/' $f >> sgpPep.fa
end
ssh hgwdev
cd /cluster/data/hg17/bed/sgp
ldHgGene -gtf -genePredExt hg17 sgpGene chr*.gtf
hgPepPred hg17 generic sgpPep sgpPep.fa
# SGP GENES (UPDATE 1/18/2006)
    # sgpPep table dropped, replaced by hgc-generated protein seq in browser
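    # e.g. (a sketch; the exact command was not recorded):
    # hgsql hg17 -e 'drop table sgpPep'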
# LIFTOVER RNAGENE FROM HG16 (09/29/04, acs)
cd /cluster/data/hg17/bed
mkdir rnaGene
cd rnaGene
liftOver -gff /cluster/data/hg16/bed/rnaGene/all.gff \
/cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain \
rnaGeneLift.gff rnaGeneMiss.gff
# 7204 records passed, 16 failed
hgsql hg17 < ~/kent/src/hg/lib/rnaGene.sql
hgRnaGenes hg17 rnaGeneLift.gff
# BUILD BioCyc TABLES (DONE 10/1/04 Fan)
    # Create bioCycMapDesc table:
CREATE TABLE bioCycMapDesc (
mapID varchar(40) NOT NULL default '',
description varchar(255) NOT NULL default '',
KEY mapID (mapID)
) TYPE=MyISAM;
    # Create bioCycPathway table:
CREATE TABLE bioCycPathway (
kgID varchar(40) NOT NULL default '',
geneID varchar(40) NOT NULL default '',
mapID varchar(40) NOT NULL default '',
KEY kgID (kgID),
KEY geneID (geneID),
KEY mapID (mapID)
) TYPE=MyISAM;
    # Using data files sent by Peter Karp from SRI; per Peter's email of
    # 10/1/04, they don't have a more recent update, so the data files
    # received last year are used.
    # Save the BioCyc Pathway name and description table as names.txt.
    # Save the pathway data file as gene-pathway.dat.
    # Make sure there is no extra ^M at the end of the lines.
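    # e.g., one way to strip stray carriage returns (a sketch):
    tr -d '\r' < names.txt > names.clean && mv names.clean names.txt
    tr -d '\r' < gene-pathway.dat > gp.clean && mv gp.clean gene-pathway.dat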
    hgsql hg17 -e 'LOAD DATA local INFILE "names.txt" into table bioCycMapDesc'
    # Run hgBioCyc program to generate the file bioCycPathway.tab.
hgBioCyc gene-pathway.dat hg17
    # Load into hg17.
hgsql hg17 -e 'LOAD DATA local INFILE "bioCycPathway.tab" into table bioCycPathway'
# MAKING FOLDUTR TABLES (DONE - 2004-10-4 Fan)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/rnaStruct
cd /cluster/data/hg17/bed/rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg17 knownGene utr3 utr3/utr.fa
utrFa hg17 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh kk
cd /cluster/data/hg17/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
cp gsub ../utr5
# Do cluster run for 3' UTRs
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 37244 of 37244 jobs
# CPU time in finished jobs: 1036479s 17274.64m 287.91h 12.00d 0.033 y
# IO & Wait Time: 112286s 1871.44m 31.19h 1.30d 0.004 y
# Average job time: 31s 0.51m 0.01h 0.00d
# Longest job: 3370s 56.17m 0.94h 0.04d
# Submission to last job: 4355s 72.58m 1.21h 0.05d
# Do cluster run for 5' UTRs
cd ../utr5
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 29817 of 29817 jobs
# CPU time in finished jobs: 98143s 1635.72m 27.26h 1.14d 0.003 y
# IO & Wait Time: 105763s 1762.71m 29.38h 1.22d 0.003 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 2133s 35.55m 0.59h 0.02d
# Submission to last job: 2465s 41.08m 0.68h 0.03d
# Load database
ssh hgwdev
cd /cluster/data/hg17/bed/rnaStruct/utr5
hgLoadRnaFold hg17 foldUtr5 fold
cd ../utr3
hgLoadRnaFold hg17 foldUtr3 fold
# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak
####### BUILD RGD HUMAN QTL TRACKS (DONE 10/7/04 Fan) ##############
mkdir -p /cluster/store8/rgd/human041007
ln -s /cluster/store8/rgd/human041007 /cluster/data/hg17/bed/rgdQtl
cd /cluster/data/hg17/bed/rgdQtl
# download data files from RGD
wget --timestamp ftp://rgd.mcw.edu/pub/RGD_genome_annotations/human/human_QTL.gff
# remove extra line feed character at the end of lines
rmLf human_QTL.gff > rgdQtl.gff
# create rgdQtl.tab
awk '{print $1"\t"$4"\t"$5"\t"$10}' rgdQtl.gff |sed -e 's/Chr/chr/g'| \
sed -e 's/"//g' |sed -e 's/RGD://g' | sed -e 's/;//g' > rgdQtl.tab
# create rgdQtlLink.tab
awk '{printf "%s\t%s\t", $12, $10; for (i = 14;i <= NF; ++i ) {printf "%s ", $i} printf "\n"} ' rgdQtl.gff | \
sed -e 's/"//g'| sed -e 's/RGD://g' | sed -e 's/;//g'| sed -e 's/Note//g' > rgdQtlLink.tab
# load rgdQtl table
hgLoadBed hg17 rgdQtl rgdQtl.tab
# check rgdQtl table
checkTableCoords hg17 rgdQtl
# load rgdQtlLink table
hgsql hg17 -e "drop table hg17.rgdQtlLink;"
hgsql hg17 <~/kent/src/hg/lib/rgdQtlLink.sql
hgsql hg17 -e 'load data local infile "rgdQtlLink.tab" into table hg17.rgdQtlLink;'
# updated trackDb.ra under /kent/src/hg/makeDb/trackDb/human/hg17 and
# added rgdQtl.html.
#### AFFYMETRIX HG-U133 PLUS TRACK (DONE, 2004-10-11, hartera)
ssh hgwdev
mkdir -p /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
# Go to
#http://www.affymetrix.com/support/technical/byproduct.affx?product=hg-u133-plus
# and download the consensus and exemplar sequences to this directory
cd /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
unzip HG-U133_Plus_2_consensus.zip
unzip HG-U133_Plus_2_exemplar.zip
cat HG-U133_Plus_2_consensus HG-U133_Plus_2_exemplar >> U133Plus2_all.fa
perl -pi.bak -e "s/(consensus|exemplar):HG-U133_Plus_2:/U133+2:/" \
U133Plus2_all.fa
# remove ";" from probe set names
perl -pi.bak -e "s/;//" U133Plus2_all.fa
# clean up
rm *.zip *.bak
mkdir -p /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
cp U133Plus2_all.fa /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
    # Set up cluster job to align consensus/exemplars to hg17
ssh kkr1u00
mkdir -p /iscratch/i/affy
mv /cluster/data/hg17/bed/affyU133Plus2.2004-10-11/U133Plus2_all.fa \
/iscratch/i/affy
iSync
ssh kk
cd /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
ls -1 /iscratch/i/affy/U133Plus2_all.fa > affy.lst
ls -1 /iscratch/i/gs.18/build35/maskedContigs/* > allctg.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/gs.18/build35/hg17.11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << for emacs
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
para try, para check, para push .....
# para time
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 24533s 408.88m 6.81h 0.28d 0.001 y
# IO & Wait Time: 2180s 36.34m 0.61h 0.03d 0.000 y
# Average job time: 70s 1.17m 0.02h 0.00d
# Longest job: 751s 12.52m 0.21h 0.01d
# Submission to last job: 2425s 40.42m 0.67h 0.03d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU133Plus2.psl
pslSort dirs raw.psl tmp psl
    # use filter parameters for these sequences: only use alignments that
    # cover 30% of the sequence and have at least 95% identity in the
    # aligned region. minAli=0.97 was too high; use a low minCover since
    # these sequences contain many Ns
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU133Plus2.psl ../../jkStuff/liftAll.lft warn contig.psl
perl -pi.bak -e "s/U133\+2://" affyU133Plus2.psl
# load into the database
ssh hgwdev
cd /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
hgLoadPsl hg17 affyU133Plus2.psl
# Add sequence data to database
# Copy probe sequence to /gbdb if it isn't already
mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyHuman/HG-U133Plus2/U133Plus2_all.fa .
cd /cluster/data/hg17/bed/affyU133Plus2.2004-10-11
hgLoadSeq -abbr=U133+2: hg17 /gbdb/hgFixed/affyProbes/U133Plus2_all.fa
# clean up
rm -r psl tmp err contig.psl raw.psl *.bak psl.tab seq.tab
# Added knownToU133Plus2 track (2004-10-14) - see GeneSorter section
#### MAF COVERAGE FIGURES FOR ADAM (DONE 10/18/04 angie)
# First, get ranges of target coverage:
ssh eieio
mkdir /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage
cd /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage/
cat /cluster/data/hg17/bed/var_multiz.2004-08-12/maf.09-12-04/*.maf \
| nice mafRanges -notAllOGap stdin hg17 hg17.mafRanges.bed
# Get pairwise coverage as well.
ssh kolossus
cd /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage/
cat /cluster/bluearc/hg17/multiz8way/rn3/*.maf \
| nice mafRanges -notAllOGap stdin hg17 hg17.rn3.mafRanges.bed
cat /cluster/bluearc/hg17/multiz8way/mm5/*.maf \
| nice mafRanges -notAllOGap stdin hg17 hg17.mm5.mafRanges.bed
cat /cluster/bluearc/hg17/multiz8way/galGal2/*.maf \
| nice mafRanges -notAllOGap stdin hg17 hg17.galGal2.mafRanges.bed
cat /cluster/bluearc/hg17/multiz8way/fr1/*.maf \
| nice mafRanges -notAllOGap stdin hg17 hg17.fr1.mafRanges.bed
ssh hgwdev
cd /cluster/data/hg17/bed/var_multiz.2004-08-12/coverage
# To make subsequent intersections a bit quicker, output a bed with
# duplicate/overlapping ranges collapsed:
nice featureBits hg17 hg17.mafRanges.bed \
-bed=hg17.mafRangesCollapsed.bed
#1147548420 bases of 2866216770 (40.037%) in intersection
foreach other (mm5 rn3 galGal2 fr1)
nice featureBits hg17 hg17.$other.mafRanges.bed \
-bed=hg17.${other}.mafRangesCollapsed.bed
end
#1013348528 bases of 2866216770 (35.355%) in intersection
#975533772 bases of 2866216770 (34.036%) in intersection
#101623034 bases of 2866216770 (3.546%) in intersection
#46737824 bases of 2866216770 (1.631%) in intersection
# mafCoverage barfs currently, so pass on this for now:
#cat ../maf.09-12-04/*.maf \
#| nice mafCoverage -count=2 hg17 stdin > hg17.mafCoverage
# Intersect maf target coverage with gene regions --
# use Adam's knownGene region files:
nice featureBits hg17 -enrichment \
../phastCons/stats2/knownGenesCds.bed \
hg17.mafRangesCollapsed.bed \
-bed=hg17.mafCds.bed
#knownGenesCds.bed 1.166%, hg17.mafRangesCollapsed.bed 40.037%, both 1.111%, cover 95.36%, enrich 2.38x
nice featureBits hg17 -enrichment \
../phastCons/stats2/knownGenesUtr3.bed \
hg17.mafRangesCollapsed.bed \
-bed=hg17.mafUtr3.bed
#knownGenesUtr3.bed 0.918%, hg17.mafRangesCollapsed.bed 40.037%, both 0.662%, cover 72.18%, enrich 1.80x
nice featureBits hg17 -enrichment \
../phastCons/stats2/knownGenesUtr5.bed \
hg17.mafRangesCollapsed.bed \
-bed=hg17.mafUtr5.bed
#knownGenesUtr5.bed 0.266%, hg17.mafRangesCollapsed.bed 40.037%, both 0.198%, cover 74.42%, enrich 1.86x
# Intersect pairwise target coverages with gene regions:
foreach other (mm5 rn3 galGal2 fr1)
nice featureBits hg17 -enrichment \
../phastCons/stats2/knownGenesCds.bed \
hg17.$other.mafRangesCollapsed.bed -bed=hg17.${other}Cds.bed
nice featureBits hg17 -enrichment \
../phastCons/stats2/knownGenesUtr3.bed \
hg17.$other.mafRangesCollapsed.bed -bed=hg17.${other}Utr3.bed
nice featureBits hg17 -enrichment \
../phastCons/stats2/knownGenesUtr5.bed \
hg17.$other.mafRangesCollapsed.bed -bed=hg17.${other}Utr5.bed
end
#knownGenesCds.bed 1.166%, hg17.mm5.mafRangesCollapsed.bed 35.355%, both 1.093%, cover 93.74%, enrich 2.65x
#knownGenesUtr3.bed 0.918%, hg17.mm5.mafRangesCollapsed.bed 35.355%, both 0.618%, cover 67.37%, enrich 1.91x
#knownGenesUtr5.bed 0.266%, hg17.mm5.mafRangesCollapsed.bed 35.355%, both 0.186%, cover 69.81%, enrich 1.97x
#knownGenesCds.bed 1.166%, hg17.rn3.mafRangesCollapsed.bed 34.036%, both 1.071%, cover 91.85%, enrich 2.70x
#knownGenesUtr3.bed 0.918%, hg17.rn3.mafRangesCollapsed.bed 34.036%, both 0.597%, cover 65.09%, enrich 1.91x
#knownGenesUtr5.bed 0.266%, hg17.rn3.mafRangesCollapsed.bed 34.036%, both 0.179%, cover 67.33%, enrich 1.98x
#knownGenesCds.bed 1.166%, hg17.galGal2.mafRangesCollapsed.bed 3.546%, both 0.779%, cover 66.84%, enrich 18.85x
#knownGenesUtr3.bed 0.918%, hg17.galGal2.mafRangesCollapsed.bed 3.546%, both 0.194%, cover 21.12%, enrich 5.96x
#knownGenesUtr5.bed 0.266%, hg17.galGal2.mafRangesCollapsed.bed 3.546%, both 0.056%, cover 21.03%, enrich 5.93x
#knownGenesCds.bed 1.166%, hg17.fr1.mafRangesCollapsed.bed 1.631%, both 0.714%, cover 61.26%, enrich 37.57x
#knownGenesUtr3.bed 0.918%, hg17.fr1.mafRangesCollapsed.bed 1.631%, both 0.073%, cover 7.92%, enrich 4.86x
#knownGenesUtr5.bed 0.266%, hg17.fr1.mafRangesCollapsed.bed 1.631%, both 0.039%, cover 14.82%, enrich 9.09x
# ALTERNATIVE CPG ISLANDS (DONE 10/14/04 angie)
ssh eieio
nice tcsh
mkdir /cluster/data/hg17/bed/cpgIslandAlt
cd /cluster/data/hg17/bed/cpgIslandAlt
# Try cpg_ash (WUSTL program modified to not chop islands in half before
# scoring) with default params:
cp /dev/null cpg_ash.default.cpg
foreach f (../../?{,?}/chr*.fa.masked)
echo running on $f:t:r:r
~angie/cb/hg3rdParty/cpgIslands/cpg_ash.exe $f >> cpg_ash.default.cpg
end
awk -f ../cpgIsland/filter.awk cpg_ash.default.cpg > cpgIslandAlt.bed
# Run Andy Law's script on masked seq:
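    # (The perl step below shifts the script's 1-based starts to 0-based BED
    # starts and derives the %CpG and %GC columns from the raw counts.)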
cp /dev/null cpgIslandGgfAndyMasked.bed
foreach f (../../?{,?}/chr*.fa.masked)
set chr = $f:t:r:r
echo running on $chr
/cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f \
| /cluster/home/angie/ggf-andy-cpg-island.pl \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndyMasked.bed
end
    # Compare enrichment for knownGene upstream regions -- an uphill battle
    # for programs that come closer to meeting the stated length, GC, and
    # O/E params!
ssh hgwdev
nice featureBits hg17 -enrichment knownGene:upstream:1000 \
/cluster/data/hg17/bed/cpgIsland/cpgIsland.bed
#knownGene:upstream:1000 0.857%, cpgIsland.bed 0.741%, both 0.166%, cover 19.37%, enrich 26.13x
nice featureBits hg17 -enrichment knownGene:upstream:1000 \
/cluster/data/hg17/bed/cpgIslandAlt/cpgIslandAlt.bed
#knownGene:upstream:1000 0.857%, cpgIslandAlt.bed 1.075%, both 0.200%, cover 23.38%, enrich 21.76x
nice featureBits hg17 -enrichment knownGene:upstream:1000 \
/cluster/data/hg17/bed/cpgIslandAlt/cpgIslandGgfAndyMasked.bed
#knownGene:upstream:1000 0.857%, cpgIslandGgfAndyMasked.bed 1.964%, both 0.292%, cover 34.06%, enrich 17.34x
cd /cluster/data/hg17/bed/cpgIslandAlt
sed -e 's/cpgIslandExt/cpgIslandAlt/g' \
~/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandAlt.sql
hgLoadBed -noBin -tab -sqlTable=cpgIslandAlt.sql \
hg17 cpgIslandAlt cpgIslandAlt.bed
#Loaded 29998 elements of size 10
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
~/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \
hg17 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed
#Loaded 80555 elements of size 10
# Quick length stats:
hgsql hg17 -e 'select min(length), avg(length), max(length) from cpgIslandExt'
#| 201 | 764.1913 | 40058 |
hgsql hg17 -e 'select min(length), avg(length), max(length) from cpgIslandAlt'
#| 200 | 1026.9194 | 32440 |
hgsql hg17 -e 'select min(length), avg(length), max(length) from cpgIslandGgfAndyMasked'
#| 200 | 698.8257 | 100308 |
# 1/26/05: Make better island names in cpgIslandGgfAndyMasked,
# for Dave Burt's cross-species island comparisons.
ssh eieio
cd /cluster/data/hg17/bed/cpgIslandAlt
mv cpgIslandGgfAndyMasked.bed cpgIslandGgfAndyMasked.bed.orig
perl -wpe '@w=split("\t"); $w[3] = "hg17.$w[0]." . ($w[1]+1) . ".$w[2]"; \
$_ = join("\t", @w);' \
cpgIslandGgfAndyMasked.bed.orig \
> cpgIslandGgfAndyMasked.bed
# Now liftOver islands from mm5, rn3, galGal2:
ssh kolossus
cd /cluster/data/hg17/bed/cpgIslandAlt
foreach match (50 95)
liftOver /cluster/data/mm5/bed/cpgIslandGgfAndy/cpgIslandGgfAndyMasked.bed \
/cluster/data/mm5/bed/bedOver/mm5Tohg17.chain -minMatch=0.$match \
cpgIslandGAMFromMm5_$match.bed cpgIslandGAMFromMm5_$match.unmapped
liftOver /cluster/data/rn3/bed/cpgIslandGgfAndy/cpgIslandGgfAndyMasked.bed \
/cluster/data/rn3/bed/bedOver/rn3ToHg17.over.chain -minMatch=0.$match \
cpgIslandGAMFromRn3_$match.bed cpgIslandGAMFromRn3_$match.unmapped
liftOver /cluster/data/galGal2/bed/cpgIslandGgfAndy/cpgIslandGgfAndyMasked.bed \
/cluster/data/galGal2/bed/bedOver/galGal2ToHg17.over.chain -minMatch=0.$match \
cpgIslandGAMFromGalGal2_$match.bed cpgIslandGAMFromGalGal2_$match.unmapped
end
    # Load up the renamed islands as well:
ssh hgwdev
cd /cluster/data/hg17/bed/cpgIslandAlt
hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \
hg17 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed
# MAKE UNIGENE/SAGE TRACK (DONE - 2004-10-15 Fan)
# First get SAGE data and determine which version of UniGene to use first
ssh hgwdev
cd ~/kent/src/hg/sage
make
# XXX = uniGene build for which SAGE was built -- not necessarily current!
# Figure out the build number by peeking at this file:
wget -O - ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt 2> /dev/null
# UniGene Build #44 Arabidopsis thaliana
# UniGene Build #61 Bos taurus
# UniGene Build #16 Caenorhabditis elegans
# UniGene Build #171 Homo sapiens
# UniGene Build #19 Medicago truncatula
# UniGene Build #138 Mus musculus
# UniGene Build #52 Oryza sativa
# UniGene Build #14 Pinus taeda
# UniGene Build #132 Rattus norvegicus
# UniGene Build #27 Sus scrofa
# UniGene Build #38 Triticum aestivum
# UniGene Build #11 Vitis vinifera
# UniGene Build #41 Zea mays
# From above info, set Version 171 for hg17
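    # e.g., the human build number can be pulled out directly (a sketch):
    wget -O - ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt 2> /dev/null \
        | awk '/Homo sapiens/ {print $3}' | tr -d '#'
    # 171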
ls /projects/cc/hg/sugnet/uniGene
# set Version = XXX
set Version=171
mkdir /projects/cc/hg/sugnet/sage/sage.$Version
cd /projects/cc/hg/sugnet/sage/sage.$Version
wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/map/Hs
wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/map/readme.txt
wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt
wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/extr
wget -r -nH --cut-dirs=2 --timestamp ftp://ftp.ncbi.nih.gov/pub/sage/info
# That downloaded about 1 GB of data
cd map/Hs/NlaIII
unzip -j SAGEmap_tag_ug-rel.zip
cd ../../../extr/
../../scripts/summarizeCounts.pl expCounts.tab ./SAGE_*
../../scripts/countGenesPerTag.pl expCounts.tab allTags.count.tab
../../scripts/createArraysForTags.pl allTags.count.tab tagExpArrays.tab \
./SAGE_*
../../scripts/countsPerExp.pl expCounts.tab expList.tab
cd ../map/Hs/NlaIII/
cat << '_EOF_' > /tmp/t.pl
#!/usr/local/bin/perl
while (<>) {
chomp($_);
@p = split(/\t/, $_);
print "$p[2]\t$p[3]\t$p[0]\n";
}
'_EOF_'
chmod +x /tmp/t.pl
cat SAGEmap_tag_ug-rel | /tmp/t.pl | sort | sed -e 's/ /_/g' \
> SAGEmap_ug_tag-rel_Hs
cd ../../../extr
createSageSummary ../map/Hs/NlaIII/SAGEmap_ug_tag-rel_Hs \
tagExpArrays.tab sageSummary.sage
# Create the uniGene alignments
# /cluster/data/hg17/uniGene/hg17.uniGene.lifted.pslReps.psl
# Download of the latest UniGene version is now automated by a
# cron job -- see /cluster/home/angie/crontab ,
# /cluster/home/angie/unigeneVers/unigene.csh .
# If hgwdev gets rebooted, that needs to be restarted... maybe there's
# a more stable place to set up that cron job.
# substitute XXX -> the uniGene version used by SAGE.
# set Version = XXX
    set Version = 171   # (bash: export Version=171)
cd /projects/cc/hg/sugnet/uniGene/uniGene.$Version
gunzip Hs.seq.uniq.gz Hs.data.gz
../countSeqsInCluster.pl Hs.data counts.tab
../parseUnigene.pl Hs.seq.uniq Hs.seq.uniq.simpleHeader.fa leftoverData.tab
# Distribute UniGene sequence to /iscratch/i/ (kkstore can see /projects)
ssh kkstore
set Version = 171 # same as above
mkdir -p /iscratch/i/uniGene.$Version
cp -p \
/projects/cc/hg/sugnet/uniGene/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
/iscratch/i/uniGene.$Version
ssh kkr1u00
iSync
ssh kk
set Version = 171 # same as above
mkdir -p /cluster/data/hg17/bed/uniGene.$Version
cd /cluster/data/hg17/bed/uniGene.$Version
ls -1S /scratch/hg/gs.18/build35/maskedContigs/*.fa > allctg.lst
ls -1S /iscratch/i/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
> uniGene.lst
cat << '_EOF_' > template.sub
#LOOP
    /cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 allctg.lst uniGene.lst template.sub para.spec
para create para.spec
mkdir psl
para try
para check
para push
# Completed: 380 of 380 jobs
# CPU time in finished jobs: 35994s 599.91m 10.00h 0.42d 0.001 y
# IO & Wait Time: 1812s 30.19m 0.50h 0.02d 0.000 y
# Average job time: 99s 1.66m 0.03h 0.00d
# Longest job: 1497s 24.95m 0.42h 0.02d
# Submission to last job: 1551s 25.85m 0.43h 0.02d
ssh eieio
set Version = 171 # same as above
cd /cluster/data/hg17/bed/uniGene.$Version
pslSort dirs raw.psl tmp psl >& pslSort.log
liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
| pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
stdin hg17.uniGene.lifted.pslReps.psl /dev/null
# Processed 141416 alignments
# use hg17.uniGene.lifted.pslReps.psl for building UNIGENE/SAGE track.
ssh hgwdev
set Version = 171
cd /projects/cc/hg/sugnet/sage/sage.$Version/extr
addAveMedScoreToPsls \
/cluster/data/hg17/bed/uniGene.$Version/hg17.uniGene.lifted.pslReps.psl \
sageSummary.sage uniGene.wscores.bed
hgLoadBed hg17 uniGene_2 uniGene.wscores.bed
hgsql hg17 < ~kent/src/hg/lib/sage.sql
echo "load data local infile 'sageSummary.sage' into table sage" \
| hgsql hg17
cd ../info
../../scripts/parseRecords.pl ../extr/expList.tab > sageExp.tab
hgsql hg17 < ~/kent/src/hg/lib/sageExp.sql
echo "load data local infile 'sageExp.tab' into table sageExp" | hgsql hg17
# update ~/kent/src/hg/makeDb/trackDb/human/hg17/uniGene_2.html
# with current uniGene date.
# CREATE kgSpAlias TABLE FOR PB (Done 10/20/04)
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab
rm j.tmp
hgsql hg17 -e 'drop table kgSpAlias';
hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'
# SEGMENTAL DUPLICATIONS (DONE 10/21/04 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/genomicSuperDups
cd /cluster/data/hg17/bed/genomicSuperDups
# A tar file containing files for both hg16 and hg17 was downloaded into
# /cluster/data/hg16/bed/genomicSuperDups; move over the hg17 part.
mv /cluster/data/hg16/bed/genomicSuperDups/bd35 .
cd bd35
# A note from Xinwei She about the contents:
#Build35 contains only 2 tables: genomicSuperDups and celeraDupPositive.
# use tail +2 to skip past the header line:
# actually, celeraDupPositive.tab.gz has one extra bogus line so +3 for it:
zcat celeraDupPositive.tab.gz | tail +3 \
| hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/celeraDupPositive.sql \
hg17 celeraDupPositive stdin
zcat genomicSuperDups.tab.gz | tail +2 \
| hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql \
hg17 genomicSuperDups stdin
# clean up
rm bed.tab
# ECGENE TRACK (DONE, 2004-10-29, hartera)
ssh eieio
mkdir -p /cluster/data/hg17/bed/ECgene.2004-10-27
ln -s /cluster/data/hg17/bed/ECgene.2004-10-27 \
/cluster/data/hg17/bed/ECgene
cd /cluster/data/hg17/bed/ECgene
wget \
"http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_hg17_low_gene.txt.gz"
wget \
"http://genome.ewha.ac.kr/ECgene/download/v1.2_ECgene/v1.2_hg17_low_pep.txt.gz"
gunzip *.gz
# load database
ssh hgwdev
cd /cluster/data/hg17/bed/ECgene
ldHgGene -predTab hg17 ECgene v1.2_hg17_low_gene.txt
# 646778 gene predictions
hgPepPred hg17 tab ECgenePep v1.2_hg17_low_pep.txt
rm *.tab
nice gzip *.txt
# LOAD ENSEMBL GENES (DONE, 2004-11-19, hartera)
mkdir /cluster/data/hg17/bed/ensembl
cd /cluster/data/hg17/bed/ensembl
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the ouput. choose gzip compression. hit export.
# Save as ensemblGene.gtf.gz
# Ensembl handles random chromosomes differently than us, so we
# strip this data. Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
gunzip -c ensemblGene.gtf.gz \
| grep -v ^6_DR51 \
| grep -v ^DR51 \
| grep -v ^DR52 \
| grep -v ^DR53 \
| grep -v _NT_ \
| perl -wpe 's/^([0-9]|X|Y|Un|MT)/chr$1/ \
|| die "Line $. doesnt start with human chrom:\n$_"' \
| sed -e 's/\..\"/\"/g' \
| sed -e 's/chrMT_NC_001807/chrM/' \
> ensGene.gtf
ssh hgwdev
    cd /cluster/data/hg17/bed/ensembl
/cluster/bin/i386/ldHgGene hg17 ensGene ensGene.gtf
# Read 33666 transcripts in 696579 lines in 1 files
# 33666 groups 25 seqs 1 sources 4 feature types
# 33666 gene predictions
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg17 < ~/kent/src/hg/lib/ensGtp.sql
# remove header line from ensGtp.txt
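    # e.g. (a sketch): tail +2 ensGtp.txt > ensGtp.tmp && mv ensGtp.tmp ensGtp.txt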
echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg17
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
gunzip ensemblPep.fa.gz
hgPepPred hg17 ensembl ensemblPep.fa
# UPDATE KNOWN GENES TABLES (DONE 11/22/04 Fan)
# Make sure the protein databases (sp041115 and proteins041115) were built first.
hgsql hg17 -e "create database kgHg17B"
mkdir -p /cluster/store8/kg/kgHg17B
cd /cluster/store6/kgDB/bed
ln -s /cluster/store8/kg/kgHg17B kgHg17B
cd kgHg17B
~/src/hg/protein/KGprocess.sh kgHg17B hg17 041115
# Found gbGetSeqs changed the format of mrna.fa output file
    # (extra version number). Updated KGprocess.sh and manually
    # re-ran the following:
grep "^>" mrna.fa |awk '{print $1}' > mrna.lis
kgGetPep 041115 > mrnaPep.fa
hgKgMrna kgH17BTemp mrna.fa mrna.ra tight_mrna.psl ll/loc2ref \
mrnaPep.fa ll/mim2loc ${PDB} > kgHg17BKgMrna.out 2> kgHg17BKgMrna.err
    # then run KGprocess.sh again to continue processing.
~/src/hg/protein/KGprocess.sh kgHg17B hg17 041115
hgsql hg17 -e "select * from chromInfo" > chromInfo.tab
getDbTabDef hg17 chromInfo >chromInfo.sql
hgsql kgHg17B <chromInfo.sql
hgsql kgHg17B -e 'load data local infile "chromInfo.tab" into table chromInfo ignore 1 lines'
    # Build kgProtMap table. This table is needed by the Proteome Browser and
    # it should be built before all the KG tables are moved from kgHg17B to hg17.
~/src/hg/protein/kgProtMap.sh kgHg17B hg17 041115
# Completed: 7923 of 7923 jobs
# CPU time in finished jobs: 2502923s 41715.39m 695.26h 28.97d 0.079 y
# IO & Wait Time: 175358s 2922.63m 48.71h 2.03d 0.006 y
# Average job time: 338s 5.63m 0.09h 0.00d
# Longest job: 2403s 40.05m 0.67h 0.03d
# Submission to last job: 7164s 119.40m 1.99h 0.08d
# The script ran successfully with the last message:
# Mon Nov 22 17:11:59 PST 2004 DONE =========================
# Create database hg17Kg1 to store the old KG tables, just in case.
hgsql hg17
create database hg17Kg1;
alter table cgapAlias rename as hg17Kg1.cgapAlias;
alter table cgapBiocDesc rename as hg17Kg1.cgapBiocDesc;
alter table cgapBiocPathway rename as hg17Kg1.cgapBiocPathway;
alter table dupSpMrna rename as hg17Kg1.dupSpMrna;
alter table keggMapDesc rename as hg17Kg1.keggMapDesc;
alter table keggPathway rename as hg17Kg1.keggPathway;
alter table kgAlias rename as hg17Kg1.kgAlias;
alter table kgProtAlias rename as hg17Kg1.kgProtAlias;
alter table kgXref rename as hg17Kg1.kgXref;
alter table knownGene rename as hg17Kg1.knownGene;
alter table knownGeneLink rename as hg17Kg1.knownGeneLink;
alter table knownGeneMrna rename as hg17Kg1.knownGeneMrna;
alter table knownGenePep rename as hg17Kg1.knownGenePep;
alter table mrnaRefseq rename as hg17Kg1.mrnaRefseq;
alter table spMrna rename as hg17Kg1.spMrna;
alter table kgProtMap rename as hg17Kg1.kgProtMap;
# After initial inspection of tables in kgHg17B, do the following
# from mySql prompt:
alter table kgHg17B.cgapAlias rename as hg17.cgapAlias;
alter table kgHg17B.cgapBiocDesc rename as hg17.cgapBiocDesc;
alter table kgHg17B.cgapBiocPathway rename as hg17.cgapBiocPathway;
alter table kgHg17B.dupSpMrna rename as hg17.dupSpMrna;
alter table kgHg17B.keggMapDesc rename as hg17.keggMapDesc;
alter table kgHg17B.keggPathway rename as hg17.keggPathway;
alter table kgHg17B.kgAlias rename as hg17.kgAlias;
alter table kgHg17B.kgProtAlias rename as hg17.kgProtAlias;
alter table kgHg17B.kgXref rename as hg17.kgXref;
alter table kgHg17B.knownGene rename as hg17.knownGene;
alter table kgHg17B.knownGeneLink rename as hg17.knownGeneLink;
alter table kgHg17B.knownGeneMrna rename as hg17.knownGeneMrna;
alter table kgHg17B.knownGenePep rename as hg17.knownGenePep;
alter table kgHg17B.mrnaRefseq rename as hg17.mrnaRefseq;
alter table kgHg17B.spMrna rename as hg17.spMrna;
alter table kgHg17B.kgProtMap rename as hg17.kgProtMap;
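    # For the record, the same renames can be generated with a small loop
    # (a sketch; it just echoes statements to paste into the mySql prompt):
    for t in cgapAlias cgapBiocDesc cgapBiocPathway dupSpMrna keggMapDesc \
        keggPathway kgAlias kgProtAlias kgXref knownGene knownGeneLink \
        knownGeneMrna knownGenePep mrnaRefseq spMrna kgProtMap
    do
        echo "alter table kgHg17B.$t rename as hg17.$t;"
    done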
# Old hg17.knownGene has 43,401 entries and the new one has 44,338 entries.
    # Now:
featureBits hg17 knownGene
# 65728598 bases of 2866216770 (2.293%) in intersection
# Previously, was:
# 63983072 bases of 2866216770 (2.232%) in intersection
# Connect to genome-testdb and use hgcentraltest DB.
# Update the entry in gdbPdb table:
delete from gdbPdb where genomeDb='hg17';
insert into gdbPdb values('hg17', 'proteins041115');
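    # Equivalently, from a shell (a sketch; assumes hgsql can reach the
    # hgcentraltest database on genome-testdb):
    # hgsql -h genome-testdb hgcentraltest -e \
    #     "delete from gdbPdb where genomeDb='hg17'; \
    #      insert into gdbPdb values('hg17', 'proteins041115');"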
# UPDATE KGSPALIAS TABLE TO BE USED BY PB (Done 12/20/04)
cd /cluster/data/hg17/bed/pb
mkdir new
cd new
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab
rm j.tmp
hgsql hg17 -e 'drop table kgSpAlias';
hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'
    gzip hg17.kgSpAlias.tab
# Create hg17GeneList.html (to be used by Google).
# This step was done 12/08/04.
cd /cluster/data/hg17/bed
mkdir geneList
cd geneList
wget -O hg17GeneList.html "http://hgwdev-fanhsu.cse.ucsc.edu/cgi-bin/hgGeneList?db=hg17"
cp -p hg17GeneList.html /usr/local/apache/htdocs/goldenPath
# Check this html file into CVS.
# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
    # This depends on the go and uniProt databases as well as
    # the kgAlias and kgProtAlias tables. The hgKgGetText takes
# about 5 minutes when the database is not too busy. The rest
# is real quick.
ssh hgwdev
cd /cluster/data/hg17/bed/kgHg17F
mkdir index
cd index
hgKgGetText hg17 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/data/hg17/bed/kgHg17F/index/knownGene.ix /gbdb/hg17/knownGene.ix
ln -s /cluster/data/hg17/bed/kgHg17F/index/knownGene.ixx /gbdb/hg17/knownGene.ixx
# UPDATE TABLES NEEDED BY hgGene (DONE 11/30/04 Fan)
# UPDATE BioCyc TABLES
hgsql hg17 -e 'delete from bioCycPathway'
hgsql hg17 -e 'delete from bioCycMapDesc'
    # Using data files sent by Peter Karp from SRI; per Peter's email of
    # 10/1/04, they don't have a more recent update, so the data files
    # received last year are used.
    # Save the BioCyc Pathway name and description table as pathway-names.dat.
    # Save the pathway data file as gene-pathway.dat.
    # Make sure there is no extra ^M at the end of the lines.
# Run hgBioCyc program to generate the file bioCycPathway.tab.
hgBioCyc gene-pathway.dat hg17
    # Load results into hg17 (from the hgsql hg17 prompt):
LOAD DATA local INFILE 'pathway-names.dat' into table bioCycMapDesc;
LOAD DATA local INFILE 'bioCycPathway.tab' into table bioCycPathway;
    # REBUILD FOLDUTR TABLES (DONE - 2004-11-30 Fan)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed
mv rnaStruct rnaStruct.2004-10-04
mkdir -p /cluster/data/hg17/bed/rnaStruct.2004-11-30
ln -s rnaStruct.2004-11-30 rnaStruct
cd /cluster/data/hg17/bed/rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg17 knownGene utr3 utr3/utr.fa
utrFa hg17 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh kk
cd /cluster/data/hg17/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
cp gsub ../utr5
# Do cluster run for 3' UTRs
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 38115 of 38115 jobs
# CPU time in finished jobs: 1101680s 18361.33m 306.02h 12.75d 0.035 y
# IO & Wait Time: 100275s 1671.25m 27.85h 1.16d 0.003 y
# Average job time: 32s 0.53m 0.01h 0.00d
# Longest job: 3645s 60.75m 1.01h 0.04d
# Submission to last job: 7007s 116.78m 1.95h 0.08d
# Do cluster run for 5' UTRs
cd ../utr5
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 30524 of 30524 jobs
# CPU time in finished jobs: 116647s 1944.12m 32.40h 1.35d 0.004 y
# IO & Wait Time: 80477s 1341.28m 22.35h 0.93d 0.003 y
# Average job time: 6s 0.11m 0.00h 0.00d
# Longest job: 2449s 40.82m 0.68h 0.03d
# Submission to last job: 3386s 56.43m 0.94h 0.04d
# Load database
ssh hgwdev
cd /cluster/data/hg17/bed/rnaStruct/utr5
hgLoadRnaFold hg17 foldUtr5 fold
# Parsed 30525 files
    # Warning: load of foldUtr5 did not go as planned: 30525 record(s), 2 row(s) skipped, 0 warning(s) loading ./foldUtr5.tab
cd ../utr3
hgLoadRnaFold hg17 foldUtr3 fold
# Parsed 38115 files
    # Warning: load of foldUtr3 did not go as planned: 38115 record(s), 2 row(s) skipped, 0 warning(s) loading ./foldUtr3.tab
# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak
# UPDATE GENE SORTER TABLES (AKA: FAMILY BROWSER) (DONE - 2004-11-29 - Fan)
# This should be done after knownGene tables are complete from known gene
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneSorter.2004-11-24
# remove old symbolic link
rm /cluster/data/hg17/bed/geneSorter
ln -s /cluster/data/hg17/bed/geneSorter.2004-11-24 \
/cluster/data/hg17/bed/geneSorter
cd /cluster/data/hg17/bed/geneSorter
hgClusterGenes hg17 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg17/bed/geneSorter/blastp
cd /cluster/data/hg17/bed/geneSorter/blastp
pepPredToFa hg17 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg17/blastp
mkdir -p /cluster/bluearc/hg17/blastp
cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* \
/cluster/bluearc/hg17/blastp
# Had to pick up a new blastall binary (2004-06-15)
# Our old one would no longer run on our systems that have
# updated Linux versions
mkdir /cluster/bluearc/blast2210
cd /cluster/bluearc/blast2210
wget --timestamping \
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.10/blast-2.2.10-ia32-linux.tar.gz
wget --timestamping \
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.10/ChangeLog.txt
wget --timestamping \
ftp://ftp.ncbi.nlm.nih.gov/blast/executables/release/2.2.10/ReleaseNotes.txt
tar xvzf blast-2.2.10-ia32-linux.tar.gz
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/self
cd /cluster/data/hg17/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# 'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# This should finish in ~15 minutes if the cluster is free.
    # Completed: 7748 of 7748 jobs
    # CPU time in finished jobs: 191136s 3185.59m 53.09h 2.21d 0.006 y
    # IO & Wait Time: 66703s 1111.72m 18.53h 0.77d 0.002 y
    # Average job time: 33s 0.55m 0.01h 0.00d
    # Longest job: 370s 6.17m 0.10h 0.00d
    # Submission to last job: 747s 12.45m 0.21h 0.01d
# Load into database. This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7748 files
# Loading database with 12810133 rows
# 306.480u 54.190s 26:35.50 22.6% 0+0k 0+0io 206pf+0w
cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# hgsql -e "select count(*) from knownToRefSeq;" hg17
    # row count changed to 37611
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 \
> refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# hgsql -e "select count(*) from knownToLocusLink;" hg17
# row count changed to 37611
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
# hgsql -e "select count(*) from knownToPfam;" hg17
# row count changed to 36302
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
# row count changed to 36373
# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2 &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 36373 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count changed to 36373000
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
# hgsql -e "select count(*) from knownToU133;" hg17
# row count changed to 37299
# Create expression distance table. This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
-weights=affyUcla.weight -lookup=knownToU133 &
# Have 43039 elements in affyUclaNorm
# 211 genes, 42 weights, 26.500000 total wieght
# Got 37299 unique elements in affyUclaNorm
# 8212.320u 217.310s 2:38:07.84 88.8% 0+0k 0+0io 267pf+0w
# Create table that maps between known genes and
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
# row count changed to 18791
    # the hgFixed.gnfHumanU95Exps argument is unused, so the table need not exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 &
# Have 11545 elements in hgFixed.gnfHumanU95MedianRatio
# Got 17682 unique elements in hgFixed.gnfHumanU95MedianRatio
# row count changed to 17682000
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.)
hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnf1h &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 10273 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
# row count changed to 40015
# Make sure that GO database is up to date.
# UPDATE GO DATABASE (DONE 11/24/04 Fan)
# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20041124
cd /cluster/store1/geneOntology/20041124
wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200411-assocdb-data.gz
hgsql mysql <<end
create database go041124;
end
zcat go_*data.gz | hgsql go041124
    wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_sptr.gz
wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_uniprot.gz
zcat gene_association.goa_uniprot.gz | hgGoAssociation go041124 goaPart stdin
# Passed 4502016 of 5291097 of 5291097, 85.09%
# Ask sys-admin to switch the database pointer go to point to go041124.
cd /cluster/data/hg17/bed/geneSorter
    # (Stale notes from 2004-07-15 - Hiram: at that time the ensGene table
    # was not yet available and knownToEnsembl had not been created; ensGene
    # was loaded above on 2004-11-19.)
# Create knownToEnsembl column
hgMapToGene hg17 ensGene knownGene knownToEnsembl
# table row count went from previous version: 36068 to 38251
# Make knownToCdsSnp table (Heather did this table, Nov 29, 2004)
ssh hgwdev
nice hgMapToGene hg17 snp knownGene knownToCdsSnp -all -cds
# row count 168336
# approx. 5 minutes running time
# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to cluster/bluearc
# There was no /cluster/bluearc/ce1/blastp, so get the latest wormpep from Sanger
cd /cluster/data/ce1/bed/blastp
mkdir old
cp -p * old
wget --timestamp ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep134/wormpep134
mv wormpep134 wormPep.faa
formatdb -i wormPep.faa -t wormPep -n wormPep
#copy them to /cluster/bluearc
ssh kkr1u00
mkdir -p /cluster/bluearc/ce1/blastp
cp /cluster/data/ce1/bed/blastp/wormPep.p?? /cluster/bluearc/ce1/blastp
# The blast jobs below can be run on the kk or kk9 clusters
# Create the ceBlastTab
ssh kk9
mkdir /cluster/data/hg17/bed/geneSorter/blastp/ce1
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/ce1/blastp/wormPep \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Only takes 10 minutes on an idle cluster
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 33235s 553.91m 9.23h 0.38d 0.001 y
# IO & Wait Time: 19891s 331.52m 5.53h 0.23d 0.001 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 68s 1.13m 0.02h 0.00d
# Submission to last job: 653s 10.88m 0.18h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/ce1/run/out
hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab
# row count changed to 28252
# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeMm5.doc for procedure
# the directory: /cluster/bluearc/scratch/mus/mm5/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm5
cd /cluster/data/hg17/bed/geneSorter/blastp/mm5
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/scratch/mus/mm5/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 141842s 2364.04m 39.40h 1.64d 0.004 y
# IO & Wait Time: 52251s 870.85m 14.51h 0.60d 0.002 y
# Average job time: 25s 0.42m 0.01h 0.00d
# Longest job: 254s 4.23m 0.07h 0.00d
# Submission to last job: 540s 9.00m 0.15h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm5/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# row count changed to 37549
# Make rat ortholog column using blastp on rat known genes.
# First make rat protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeRn3.doc for procedure.
# Files were put in this directory: /cluster/bluearc/rn3/blastp/
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/rn3/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 31786s 529.77m 8.83h 0.37d 0.001 y
# IO & Wait Time: 25795s 429.91m 7.17h 0.30d 0.001 y
# Average job time: 7s 0.12m 0.00h 0.00d
# Longest job: 75s 1.25m 0.02h 0.00d
# Submission to last job: 157s 2.62m 0.04h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out
hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
#Loading database with 26133 rows
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# NOTE: data used to reside in /cluster/bluearc/dr1/blastp
mv /cluster/bluearc/dr1/blastp /cluster/bluearc/danRer1/blastp
# the directory: /cluster/bluearc/danRer1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/danRer1
cd /cluster/data/hg17/bed/geneSorter/blastp/danRer1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/danRer1/blastp/ensembl \
-i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 102324s 1705.39m 28.42h 1.18d 0.003 y
# IO & Wait Time: 47203s 786.72m 13.11h 0.55d 0.001 y
# Average job time: 19s 0.32m 0.01h 0.00d
# Longest job: 230s 3.83m 0.06h 0.00d
# Submission to last job: 427s 7.12m 0.12h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/danRer1/run/out
hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
# Loading database with 33852 rows
# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 20983s 349.72m 5.83h 0.24d 0.001 y
# IO & Wait Time: 25513s 425.21m 7.09h 0.30d 0.001 y
# Average job time: 6s 0.10m 0.00h 0.00d
# Longest job: 37s 0.62m 0.01h 0.00d
# Submission to last job: 106s 1.77m 0.03h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# Loading database with 18489 rows
# Make Drosophila melanagaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dm1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 83377s 1389.62m 23.16h 0.97d 0.003 y
# IO & Wait Time: 39913s 665.21m 11.09h 0.46d 0.001 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest job: 167s 2.78m 0.05h 0.00d
# Submission to last job: 365s 6.08m 0.10h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
# Loading database with 30067 rows
# update knownToHInv table
# Verified that there is now a new release of HInv data.
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv
# count changed to 33236
#### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-11-30 - Fan)
# Get the ensembl gene/protein cross-reference data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Feature" box, select gene, transcript, protein,
#         SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC
# Page 4) Choose "Text, tab separated". Choose gzip compression. Hit export.
# Save as ensXref.txt
sed ensXref.txt -e 's/\./\t/g' > ensemblXref3.tab
hgsql hg17 -e "drop table ensemblXref3"
hgsql hg17 < ~/src/hg/lib/ensemblXref3.sql
hgsql hg17 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines'
#### BUILD SUPERFAMILY RELATED TABLES (DONE - 2004-11-30 - Fan)
# Download Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk
mkdir /cluster/store8/superfamily/041128
ln -s /cluster/store8/superfamily/041128 /cluster/data/superfamily/041128
cd /cluster/data/superfamily/041128
# ftp over the following two files:
ass_28-Nov-2004.tab.gz
supfam_28-Nov-2004.sql.gz
gzip -d *.gz
# Load the Superfamily database
hgsql hg17 -e "create database superfam041128"
hgsql superfam041128 < supfam_28-Nov-2004.sql
# This may take about an hour.
# Make sure to add an index on id of the des table of superfam041128.
hgsql superfam041128 -e "create index id on des(id);"
hgsql superfam041128 < ~/src/hg/lib/sfAssign.sql
    hgsql superfam041128 -e 'load data local infile "ass_28-Nov-2004.tab" into table superfam041128.sfAssign;'
# Build or rebuild Superfamily track and create sf tables needed for PB
hgsql hg17 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/041128
hgsql hg17 -e 'load data local infile "ass_28-Nov-2004.tab" into table hg17.sfAssign;'
# If hg17.sfDes already exists, drop it.
hgsql superfam041128 -e "select * from des" >sfDes.tab
hgsql hg17 < ~/src/hg/lib/sfDes.sql
hgsql hg17 -e 'load data local infile "sfDes.tab" into table hg17.sfDes ignore 1 lines;'
# If hg17.superfamily already exists, drop it.
cd /cluster/data/hg17/bed
mkdir /cluster/data/hg17/sf.2004-1128
ln -s sf.2004-1128 sf
hgSuperfam hg17 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If hg17.sfDescription exists, drop it.
hgsql hg17 < ~/src/hg/lib/sfDescription.sql
hgsql hg17 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg17.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg17 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/041128/ass_28-Nov-2004.tab \
| hgKnownToSuper hg17 hs stdin
# created 25287 rows in knownToSuper
### HG17 PROTEOME BROWSER TABLES RE-BUILD #### (DONE - 2004-12-01 - Fan)
# These are instructions for rebuilding tables
# needed for the Proteome Browser.
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.
# This update is based on proteins DBs dated 041115.
# Create the working directory
ssh hgwdev
mv /cluster/data/hg17/bed/pb /cluster/data/hg17/bed/pb.2004-06-11
mkdir /cluster/data/hg17/bed/pb.2004-12-01
cd /cluster/data/hg17/bed
ln -s /cluster/data/hg17/bed/pb.2004-12-01 pb
# Move the existing PB tables by:
hgsql hg17
create database hg17Sav2;
alter table hg17.pepCCntDist rename as hg17Sav2.pepCCntDist;
alter table hg17.pepExonCntDist rename as hg17Sav2.pepExonCntDist;
alter table hg17.pepHydroDist rename as hg17Sav2.pepHydroDist;
alter table hg17.pepIPCntDist rename as hg17Sav2.pepIPCntDist;
alter table hg17.pepMolWtDist rename as hg17Sav2.pepMolWtDist;
alter table hg17.pepMwAa rename as hg17Sav2.pepMwAa;
alter table hg17.pepPi rename as hg17Sav2.pepPi;
alter table hg17.pepPiDist rename as hg17Sav2.pepPiDist;
alter table hg17.pepResDist rename as hg17Sav2.pepResDist;
alter table hg17.pbAaDistA rename as hg17Sav2.pbAaDistA;
alter table hg17.pbAaDistC rename as hg17Sav2.pbAaDistC;
alter table hg17.pbAaDistD rename as hg17Sav2.pbAaDistD;
alter table hg17.pbAaDistE rename as hg17Sav2.pbAaDistE;
alter table hg17.pbAaDistF rename as hg17Sav2.pbAaDistF;
alter table hg17.pbAaDistG rename as hg17Sav2.pbAaDistG;
alter table hg17.pbAaDistH rename as hg17Sav2.pbAaDistH;
alter table hg17.pbAaDistI rename as hg17Sav2.pbAaDistI;
alter table hg17.pbAaDistK rename as hg17Sav2.pbAaDistK;
alter table hg17.pbAaDistL rename as hg17Sav2.pbAaDistL;
alter table hg17.pbAaDistM rename as hg17Sav2.pbAaDistM;
alter table hg17.pbAaDistN rename as hg17Sav2.pbAaDistN;
alter table hg17.pbAaDistP rename as hg17Sav2.pbAaDistP;
alter table hg17.pbAaDistQ rename as hg17Sav2.pbAaDistQ;
alter table hg17.pbAaDistR rename as hg17Sav2.pbAaDistR;
alter table hg17.pbAaDistS rename as hg17Sav2.pbAaDistS;
alter table hg17.pbAaDistT rename as hg17Sav2.pbAaDistT;
alter table hg17.pbAaDistV rename as hg17Sav2.pbAaDistV;
alter table hg17.pbAaDistW rename as hg17Sav2.pbAaDistW;
alter table hg17.pbAaDistY rename as hg17Sav2.pbAaDistY;
alter table hg17.pbAnomLimit rename as hg17Sav2.pbAnomLimit;
alter table hg17.pbResAvgStd rename as hg17Sav2.pbResAvgStd;
alter table hg17.pbStamp rename as hg17Sav2.pbStamp;
quit
# Define pep* tables in hg17 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql hg17 < pepAll.sql
# Build the pepMwAa table
hgsql proteins041115 -e "select info.acc, molWeight, aaSize from sp041115.info, sp041115.accToTaxon where
accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql hg17 <<end
load data local infile "pepMwAa.tab" into table hg17.pepMwAa ignore 1 lines;
end
# Build the pepPi table
hgsql proteins041115 -e "select info.acc from sp041115.info, sp041115.accToTaxon where
accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > protAcc.lis
pbCalPi protAcc.lis sp041115 pepPi.tab
hgsql hg17 <<end
load data local infile "pepPi.tab" into table hg17.pepPi;
end
# Calculate and load pep distributions
pbCalDist sp041115 proteins041115 9606 hg17 >pbCalDist.out
cat pbCalDist.out
wc pbCalDist.out
hgsql hg17
load data local infile "pepExonCntDist.tab" into table hg17.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table hg17.pepCCntDist;
load data local infile "pepHydroDist.tab" into table hg17.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table hg17.pepMolWtDist;
load data local infile "pepResDist.tab" into table hg17.pepResDist;
load data local infile "pepIPCntDist.tab" into table hg17.pepIPCntDist;
load data local infile "pepPiDist.tab" into table hg17.pepPiDist;
quit
# Calculate frequency distributions
pbCalResStd 041115 9606 hg17
# Create pbAnomLimit and pbResAvgStd tables
hgsql hg17 < ~/src/hg/lib/pbAnomLimit.sql
hgsql hg17 < ~/src/hg/lib/pbResAvgStd.sql
hgsql hg17 -e 'load data local infile "pbResAvgStd.tab" into table hg17.pbResAvgStd;'
hgsql hg17 -e 'load data local infile "pbAnomLimit.tab" into table hg17.pbAnomLimit;'
# Create pbStamp table for PB
hgsql hg17 < ~/src/hg/lib/pbStamp.sql
hgsql hg17Sav2 -e 'select * from pbStamp' > pbStamp.tab
hgsql hg17 -e 'load data local infile "pbStamp.tab" into table hg17.pbStamp ignore 1 lines;'
# Adjust drawing parameters for Proteome Browser stamps
# Now invoke Proteome Browser and adjust various drawing parameters
# (mostly the ymax of each stamp) if necessary, by updating the
# pbStamp.tab file and then deleting and reloading the pbStamp table.
# Perform preliminary review of Proteome Browser for hg17, then
# notify QA for formal review.
#### Blat knownGene proteins to determine exons (braney DONE 12/11/04)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir blat.hg17KG.2004-12-08
rm blat.hg17KG
    ln -s blat.hg17KG.2004-12-08 blat.hg17KG
cd blat.hg17KG
pepPredToFa hg17 knownGenePep known.fa
ssh kk
cd /cluster/data/hg17/bed/blat.hg17KG
cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
chmod +x blatSome
ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs/*.nib > human.lst
mkdir kgfa
cd kgfa
faSplit sequence ../known.fa 3010 kg
cd ..
ls -1S kgfa/*.fa > kg.lst
gensub2 human.lst kg.lst blatGsub blatSpec
mkdir psl
cd psl
foreach i (`cat ../human.lst`)
mkdir `basename $i .nib`
end
cd ..
para create blatSpec
para push
# Completed: 134130 of 134136 jobs
# Crashed: 6 jobs
# CPU time in finished jobs: 29801114s 496685.23m 8278.09h 344.92d 0.945 y
# IO & Wait Time: 1983513s 33058.55m 550.98h 22.96d 0.063 y
# Average job time: 237s 3.95m 0.07h 0.00d
# Longest job: 63306s 1055.10m 17.59h 0.73d
# Submission to last job: 169384s 2823.07m 47.05h 1.96d
# did 6 crashed jobs on small cluster
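    # Job-count arithmetic as a sanity check: gensub2 crosses human.lst
    # with kg.lst, so assuming one nib per hg17 chromosome (46 of them),
    # 134136 jobs implies faSplit wrote 134136 / 46 = 2916 kgfa files --
    # faSplit treats the requested 3010 as an approximate target.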
ssh eieio
cd /cluster/data/hg17/bed/blat.hg17KG
pslSort dirs raw.psl /tmp psl/*
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
pslUniq cooked.psl hg17KG.psl
pslxToFa hg17KG.psl hg17KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
kgName hg17 hg17KG.psl blastKGRef01
cut -f 10 hg17KG.psl > kgName.lst
faSomeRecords known.fa kgName.lst hg17KG.fa
hgPepPred hg17 generic blastKGPep01 hg17KG.fa
ssh hgwdev
cd /cluster/data/hg17/bed/blat.hg17KG
hgsql hg17 < ~/kent/src/hg/lib/blastRef.sql
echo "rename table blastRef to blastKGRef01" | hgsql hg17
echo "load data local infile 'blastKGRef01' into table blastKGRef01" | hgsql hg17
#### TIGR GENE INDEX (DONE 2004-12-04 Fan)
mkdir -p /cluster/data/hg17/bed/tigr
cd /cluster/data/hg17/bed/tigr
    wget --timestamping ftp://ftp.tigr.org/pub/data/tgi/Homo_sapiens/TGI_track_HumanGenome_build35.tgz
tar xvzf TGI*.tgz
foreach f (*cattle*)
set f1 = `echo $f | sed -e 's/cattle/cow/g'`
mv $f $f1
end
foreach o (mouse cow human pig rat)
echo $o
setenv O $o
foreach f (chr*_$o*s)
tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
end
end
ssh hgwdev
cd /cluster/data/hg17/bed/tigr
hgsql hg17 -e "drop table tigrGeneIndex"
hgsql hg17 < ~/kent/src/hg/lib/tigrGeneIndex.sql
foreach f (*.gff)
echo Processing $f ...
/cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC hg17 tigrGeneIndex $f
hgsql hg17 -e "select count(*) from tigrGeneIndex"
end
# Total of 401322 entries created in tigrGeneIndex table.
hgsql hg17 -e "update tigrGeneIndex set cdsStart = txStart;"
hgsql hg17 -e "update tigrGeneIndex set cdsEnd = txEnd;"
checkTableCoords hg17 tigrGeneIndex
gzip *.gff *TCs
# BLASTZ FOR ZEBRAFISH (danRer2) (DONE, 2004-12-09, hartera)
ssh kkr1u00
# blastz requires lineage-specific repeats
# Treat all repeats as lineage-specific.
# /iscratch/i/gs.18/build35/linSpecRep.notInZebrafish exists (makeDanRer1.doc)
mkdir -p /iscratch/i/danRer2/linSpecRep.notInHuman
foreach f (/iscratch/i/danRer2/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/danRer2/linSpecRep.notInHuman/$f:t:r:r.out.spec
end
iSync
ssh kk
mkdir -p /cluster/data/hg17/bed/blastz.danRer2.2004-12-08
ln -s /cluster/data/hg17/bed/blastz.danRer2.2004-12-08 \
/cluster/data/hg17/bed/blastz.danRer2
cd /cluster/data/hg17/bed/blastz.danRer2
# Set L=6000 and abridge repeats - these are the same parameters used
# for hg16 and Fugu and similar to those for hg16-galgal2
cat << '_EOF_' > DEF
# human (hg17) vs zebrafish (danRer2)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from hg16-fr1.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human (hg17)
SEQ1_DIR=/iscratch/i/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.18/build35/linSpecRep.notInZebrafish
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: zebrafish (danRer2)
SEQ2_DIR=/iscratch/i/danRer2/nib/
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/danRer2/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.danRer2
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
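    # A rough gloss of the blastz knobs above, from the blastz docs (treat
    # as a reminder, not a spec): K = score threshold for ungapped seed
    # hits (MSPs), L = threshold for keeping gapped alignments, H = run a
    # higher-sensitivity interpolation pass between alignments scoring
    # over H, Y = gap-extension penalty, Q = substitution matrix file
    # (HoxD55.q here). BLASTZ_ABRIDGE_REPEATS=1 lifts out the
    # lineage-specific repeats named by SEQ?_SMSK before aligning and
    # restores them in the output coordinates afterwards.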
# Save the DEF file in the current standard place
cp DEF ~angie/hummus/DEF.hg17-danRer2.2004-12-08
# prepare first cluster run
ssh kk
cd /cluster/data/hg17/bed/blastz.danRer2
bash # if a csh/tcsh user
. ./DEF
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check ...etc.
# para time
# Completed: 58993 of 58993 jobs
# CPU time in finished jobs: 19583036s 326383.93m 5439.73h 226.66d 0.621 y
# IO & Wait Time: 471090s 7851.50m 130.86h 5.45d 0.015 y
# Average job time: 340s 5.67m 0.09h 0.00d
# Longest job: 885s 14.75m 0.25h 0.01d
# Submission to last job: 78245s 1304.08m 21.73h 0.91d
ssh kki
cd /cluster/data/hg17/bed/blastz.danRer2
bash # if a csh/tcsh user
. ./DEF
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# para time
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 789s 13.14m 0.22h 0.01d 0.000 y
# IO & Wait Time: 2992s 49.87m 0.83h 0.03d 0.000 y
# Average job time: 11s 0.18m 0.00h 0.00d
# Longest job: 34s 0.57m 0.01h 0.00d
# Submission to last job: 391s 6.52m 0.11h 0.00d
# Third cluster run to convert lav's to axt's
ssh kki
cd /cluster/data/hg17/bed/blastz.danRer2
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/danRer2/nib stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg17/bed/blastz.danRer2/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
\ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
para try, check, push, check,...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 99s 1.64m 0.03h 0.00d 0.000 y
# IO & Wait Time: 862s 14.37m 0.24h 0.01d 0.000 y
# Average job time: 21s 0.36m 0.01h 0.00d
# Longest job: 92s 1.53m 0.03h 0.00d
# Submission to last job: 456s 7.60m 0.13h 0.01d
# crashed job: chr6_hla_hap1.axt is empty - has no alignments
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/hg17/bed/blastz.danRer2
mkdir -p pslChrom
set tbl = "blastzDanRer2"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer2/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl -noTNameIx hg17 $f
echo "$f Done"
end
# try different parameters for blastz with chr1 of hg17
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer1 -enrichment
# refGene:cds 1.301%, blastzDanRer1 3.934%, both 0.874%, cover 67.23%,
# enrich 17.09x
# H=2000, Y=3400, L=6000, K=2200 and HoxD55.q scoring matrix
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2 -enrichment
# refGene:cds 1.301%, blastzDanRer2 3.845%, both 0.879%, cover 67.55%,
# enrich 17.57x
# same parameters as above but L=8000
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2L8k -enrichment
# refGene:cds 1.301%, blastzDanRer2L8k 2.309%, both 0.778%, cover 59.81%,
# enrich 25.91x
# enrichment went up but coverage dropped quite a bit.
# Default parameters with H=2000
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2Default -enrichment
# refGene:cds 1.301%, blastzDanRer2Default 1.701%, both 0.846%, cover 65.04%,
# enrich 38.24x
# same as first run but with no Y option set (default Y)
# featureBits -chrom=chr1 hg17 refGene:cds blastzDanRer2NoY -enrichment
# refGene:cds 1.301%, blastzDanRer2NoY 3.980%, both 0.877%, cover 67.47%,
# enrich 16.95x
# row count:
# danRer2 122160
# danRer2L8k 62815
# danRer2Default 75818
# danRer2NoY 124129
# can be pruned at the chaining step.
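    # Reading the featureBits -enrichment lines above: enrichment is
    # coverage divided by the track's share of the sequence, e.g. for the
    # first danRer2 run 67.55% / 3.845% ~= 17.57x. Raising L trades
    # coverage for enrichment (the L=8000 row), while the default-parameter
    # run gives the best ratio at slightly lower coverage.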
# trackDb - change Zebrafish Blastz to danRer1 Blastz and display this track
# for danRer2 as Zebrafish Blastz
# RESCORE DANRER2 BLASTZ (DONE, 2004-12-09, hartera)
# Low scores can occur with repeats abridged and using the
# HoxD55.q matrix. PSU's restore_rpts program rescored alignments
# with the default matrix instead of the BLASTZ_Q matrix.
# Rescore them here so the chainer sees the higher scores:
ssh kolossus
cd /cluster/data/hg17/bed/blastz.danRer2
mkdir axtChrom.rescore
foreach f (axtChrom/chr*.axt)
axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
$f axtChrom.rescore/$f:t
end
mv axtChrom axtChrom.orig
mv axtChrom.rescore axtChrom
# psl files and blastz tables will be the same regardless of score so
# no need to reload
# CHAIN DANRER2 BLASTZ (DONE, 2004-12-09, hartera)
# RELOAD CHAINS WITH FILTERING (DONE, 2004-12-10, hartera)
# APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS
# AND DEGENERATE DNA (DONE, 2004-12-22, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.danRer2
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
# create input list
ls -1S /cluster/data/hg17/bed/blastz.danRer2/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Reuse gap penalties from hg16 vs chicken run.
    # (fields in the gap file are tab-separated)
    cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../chickenHumanTuned.gap $1 \
/iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/danRer2/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 45 of 46 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 1837s 30.62m 0.51h 0.02d 0.000 y
# IO & Wait Time: 441s 7.35m 0.12h 0.01d 0.000 y
# Average job time: 51s 0.84m 0.01h 0.00d
# Longest job: 106s 1.77m 0.03h 0.00d
# Submission to last job: 419s 6.98m 0.12h 0.00d
# crashed job is chr6_hla_hap1 which has no alignments
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist5000.out
textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
echo ""
end
# apart from chr19 not too many with chains with scores < 5000
# load chr1 chain into table and check
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain/chain
hgLoadChain hg17 chr1_chainDanRer2 chr1.chain
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 1.301%, chainDanRer2Link 3.676%, both 0.877%, cover 67.42%,
# enrich 18.34x
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2 -enrichment
# refGene:cds 1.301%, chainDanRer2 32.611%, both 1.034%, cover 79.52%,
# enrich 2.44x
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
chainSplit chainFilt5k all.chain
# load chr1 filtered chains and check
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain/chainFilt5k
hgLoadChain hg17 chr1_chainDanRer2Filt5k chr1.chain
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Filt5kLink -enrichment
# refGene:cds 1.301%, chainDanRer2Filt5kLink 2.907%, both 0.870%, cover 66.86%,
# enrich 23.00x
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Filt5k -enrichment
# refGene:cds 1.301%, chainDanRer2Filt5k 31.343%, both 1.028%, cover 79.02%,
# enrich 2.52x
# checked browser - when filtered on minScore=5000, the low scoring
# alignments removed are small and/or poor alignments so use this version.
# remove repeats from filtered chains and reload into database
# (2004-12-22, hartera)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
mv chainFilt5k chainRaw
mkdir chain
cd chainRaw
foreach f (*.chain)
set c = $f:r
echo $c
nice chainAntiRepeat /cluster/bluearc/hg17/bothMaskedNibs \
/cluster/bluearc/danRer2/nib $f \
../chain/$c.chain
end
cd ..
chainMergeSort ./chain/*.chain > all.chain.antirepeat
chainSplit chainAR all.chain.antirepeat
# load filtered chains and check
ssh hgwdev
echo 'drop table chr1_chainDanRer2Filt5k;' | hgsql hg17
echo 'drop table chr1_chainDanRer2Filt5kLink;' | hgsql hg17
# reload filtered chains instead of unfiltered (2004-12-10, hartera)
# reload filtered chains with repeats removed (2004-12-22, hartera)
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain/
cd chainAR
foreach i (*.chain)
set c = $i:r
hgLoadChain hg17 ${c}_chainDanRer2 $i
echo done $c
end
# trackDb - change Zebrafish Chain to danRer1 Chain and display this track
# for danRer2 as Zebrafish Chain.
# after chainAntiRepeat
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 1.304%, chainDanRer2Link 2.742%, both 0.872%, cover 66.81%,
# enrich 24.36x
# NET DANRER2 BLASTZ (DONE, 2004-12-09, hartera)
# RE-CREATE NET WITH FILTERED CHAINS (DONE, 2004-12-10, hartera)
# RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-22, hartera)
ssh kksilo
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
rm -r preNet
mkdir preNet
cd chainAR
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 133443584, utime 905 s/100, stime 139
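    # In outline: chainPreNet drops chains that cannot contribute to the
    # net, chainNet builds target- and query-side nets (the query net is
    # discarded to /dev/null above), and netSyntenic annotates the merged
    # net with synteny info. For a single chrom this could be piped in one
    # line (untested sketch):
    #   chainPreNet chr1.chain ../../S1.len ../../S2.len stdout \
    #     | chainNet stdin -minSpace=1 ../../S1.len ../../S2.len stdout /dev/null \
    #     | netSyntenic stdin chr1.syn.net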
# Add classification info using db tables:
# netClass looks for ancient repeats in one of the databases
# hg17 has this table - hand-curated by Arian but this is for
# human-rodent comparisons so do not use here, use -noAr option
mkdir -p /cluster/bluearc/danRer2/linSpecRep.notInHuman
# linSpecRep.notInZebrafish exists for hg17
cp /iscratch/i/danRer2/linSpecRep.notInHuman/* \
/cluster/bluearc/danRer2/linSpecRep.notInHuman
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
time netClass noClass.net hg17 danRer2 zfishdanRer2.net \
-tNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish \
-qNewR=/cluster/bluearc/danRer2/linSpecRep.notInHuman -noAr
# 97.230u 54.290s 5:37.50 44.8% 0+0k 0+0io 217pf+0w
# load net into database
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
netFilter -minGap=10 zfishdanRer2.net | hgLoadNet hg17 netDanRer2 stdin
# trackDb - change Zebrafish Net to danRer1 Net and display this track
# for danRer2 as Zebrafish Net.
# after chainAntiRepeat:
# featureBits hg17 refGene:cds netDanRer2 -enrichment
# refGene:cds 1.015%, netDanRer2 22.898%, both 0.783%, cover 77.15%,
# enrich 3.37x
# index had NULL cardinality, analyze table to fix (2005-1-18, Heather)
hgsql hg17
analyze table netDanRer2
# LOAD ACEMBLY TRACK (DONE, 2005-01-24, hartera)
# ACEMBLY TABLE RELOADED AND FINISHED COLOR CODING CODE IN
# hgTracks (2005-01-28, hartera)
# FINISHED CODE FOR FILTERING BY GENE CLASS (2005-02-03, hartera)
mkdir -p /cluster/data/hg17/bed/acembly
cd /cluster/data/hg17/bed/acembly
# Data is obtained from
    # Danielle and Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.genes.proteins.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.genes.gff.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.mrnas.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35.human.genes/acembly.ncbi_35.genes.pfamhits.tar.gz
tar xvzf acembly.ncbi_35.genes.gff.tar.gz
tar xvzf acembly.ncbi_35.genes.proteins.fasta.tar.gz
cd acembly.ncbi_35.genes.gff
# the acembly dataset for hg16 had problems with reverse blocks so
# check for these
cat << '_EOF_' > checkReversedBlocks
for i in x1*.gff
do
echo -n "$i working ..."
awk -F"\t" '
{
if ($4 > $5) {
printf "reverse blocks problem for $1"
printf "\n"
}
}
' $i > $i.fixed
echo " done"
done
'_EOF_'
# << this line makes emacs coloring happy
chmod +x checkReversedBlocks
./checkReversedBlocks
ls -l *.fixed
# all *.fixed files are empty so remove - there is no reversing of blocks
rm *.fixed
foreach f (x1.acemblygenes.*.gff)
set c=$f:r:e
egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
if (-e ../../../$c/lift/random.lft) then
liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
ctg-chr${c}_random.gff
endif
grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
grep -v "^chr//" > chr$c.gff
echo "done $c"
end
#- Load into database - use extended genePred
ssh hgwdev
cd /cluster/data/hg17/bed/acembly
# Reloaded without -genePredExt 1/6/05:
ldHgGene -gtf hg17 acembly acembly.ncbi_35.genes.gff/chr*.gff
# for entry with 28212470 from chr6.gff, change to chr6
# and for 29124352 in chr6.gff, change to chr6 (1/13/05)
echo 'update acembly set chrom = "chr6" where chrom = "chr28212470";' \
| hgsql hg17
echo 'update acembly set chrom = "chr6" where chrom = "chr29124352";' \
| hgsql hg17
# checkTableCoords and runGeneCheck to check data
# a number of errors so leave on hgwdev for the moment
# checkTableCoords:
# rah.acembly has 16 records with chrom not described in chromInfo.
# rah.acembly item RPL10A.sNov04 chr6:35544172-35546520: end of last block (35546519) is not the same as chromEnd (35546520).
# rah.acembly has 1 records with blockEnd[n-1] != end.
# rah.acembly has 1 records with end > chromSize.
# chr6 acembly exon 35545934 35546101 . + 0
# gene_id RPL10A; transcript_id RPL10A.sNov04; exon_number 5
# chr6 acembly intron 35546102 35546520 . + 0
# gene_id RPL10A; transcript_id RPL10A.sNov04; intron_type fuzzy
# chr6 acembly CDS 35546335 35546384 . + 0
# gene_id RPL10A; transcript_id RPL10A.sNov04; exon_number 6
# chr6 acembly exon 35546335 35546519 . + 0
# gene_id RPL10A; transcript_id RPL10A.sNov04; exon_number 6
# chr6 acembly stop_codon 35546382 35546384 . +
# 0 gene_id RPL10A; transcript_id RPL10A.sNov04;
# here the intron overlaps exon 6 so take 35546519 to be txEnd
echo 'update acembly set txEnd = 35546519 where name = "RPL10A.sNov04";'\
| hgsql hg17
# for record where end > chromSize
    echo 'select * from acembly as a, chromInfo as c where c.chrom = a.chrom and c.size < a.txEnd;' | hgsql hg17
# KIR2DL5.bNov04 on chr19_random, chr19_random size is 301858,
# txEnd is 305266 delete this record
echo 'delete from acembly where name = "KIR2DL5.bNov04";' | hgsql hg17
# from runGeneCheck:
# 5780 inFrameStop
# 110664 noStart
# 23085 badCdsSplice
# 23848 noStop
# 14957 badUtrSplice
# 3661 gap
# 4726 badFrame
# 261066 lines in genePred.tab
# e-mailed authors of data (2004-12-21, hartera)
# notiri.aNov04 - has ctg instead of atg at start. others have no start specified: sirora.nNov04
# sirora.zmNov04 - chr1:19389-19392 is AAC (gtt) (-)
# sirora.sNov04 - chr1:8925-8928 CAA (ttg) (-)
# sirora.rNov04 - chr1:8925-8928 CAA (ttg) (-)
# for entries with 28212470 and 29124352 instead of chr6 change to chr6
# Re-process this x1 file to chr6.gff (2005-01-24)
mv x1.acemblygenes.6.gff x1.acemblygenes.6.gff.broken
sed -e "s/^28212470/6/" x1.acemblygenes.6.gff.broken | sed -e \
"s/^29124352/6/" > x1.acemblygenes.6.gff
grep -v ^6\| x1.acemblygenes.6.gff | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
grep -v "^chr//" > chr6.gff
# Received a list of genes from Jean and Danielle Mieg
# showing genes that are "main", "putative" or "cloud" - there should be
# no "cloud" genes in our data set (2005-01-11)
# download acembly_gene_lists.tar.gz from e-mail
cd /cluster/data/hg17/bed/acembly
tar xvzf acembly_gene_lists.tar.gz
cd acembly_gene_lists
cat << '_EOF_' > getIDs.pl
#!/usr/bin/perl -w
use strict;
while (<STDIN>) {
my @f = split(/\s+/);
for (my $i =0; $i <= $#f; $i++) {
if ($f[$i] =~ /gene_id$/) {
# if field is ID type then next value is the ID
my $id = $f[$i+1];
# remove ";" at end and print ID
chop $id;
print "$id\n";
}
}
}
'_EOF_'
chmod +x getIDs.pl
# get gene IDs from gff files
foreach f (../acembly.ncbi_35.genes.gff/chr*.gff)
echo "Processing $f"
perl getIDs.pl < $f >> genesGffs.ids
end
# remove back slash from some names
sort genesGffs.ids | uniq > genesGffs.ids.uniq
# reformat gene list to get just the genes and remove first 2 lines and sort
foreach g (*.list)
sed -e 's/"//g;' $g | sed -e 's/Gene : //;' | sed -e '1,2d' \
| sort | uniq > $g.IDsort
end
# remove back slash from some names
perl -pi.bak -e 's/\\//' *.IDsort
# check if cloud genes appear in gff files list of genes
# list of genes in cloud but not in gff
comm -13 genesGffs.ids.uniq cloud_gene.list.IDsort > gffvscloud.out
diff gffvscloud.out cloud_gene.list.IDsort
# there is no difference so none of the cloud genes are in the gff files
# check if all the other genes in the main and putative lists are in gffs
comm -13 genesGffs.ids.uniq main_gene.list.IDsort > gffvsmain.out
comm -13 genesGffs.ids.uniq putative_gene.list.IDsort > gffvsputative.out
wc -l *.out
# 14 gffvsmain.out
# 0 gffvsputative.out
# there are 14 genes in the main set not in the gff files
# actually there are 12, as FCA/MR and SLA/LP are in the gff files
# all putative genes are in the gff set
wc -l main_gene.list.IDsort putative_gene.list.IDsort
# 52467 main_gene.list.IDsort
# 43978 putative_gene.list.IDsort
# 96445 total
wc -l genesGffs.ids.uniq
# 97042 genesGffs.ids.uniq
# check discrepancy
cat main_gene.list.IDsort putative_gene.list.IDsort > mp.ids
sort mp.ids > mp.sort
comm -23 genesGffs.ids.uniq mp.sort > gffNotMP.out
wc -l gffNotMP.out
# 609 gffNotMP.out
# create table of Acembly gene classifications
# see http://www.ncbi.nlm.nih.gov/IEB/Research/Acembly/index.html?human
# in FAQ, describes main, putative and cloud genes. The cloud genes are not
# well confirmed and so they are not in this data set.
# NEED TO FILTER GENES AND RELOAD TABLES:
# authors Jean and Danielle Mieg e-mailed back. The 12 genes in the
# main list that are not in the gff files were not exported
# as they did not find a single putative protein to describe so they
# were not added to the gffs. They will be added at a later date.
# Remove these from the acemblyClass table (2005-01-21, hartera)
# Reload acemblyClass table as problems with the gene names
# the class table has gene IDs and the acembly table has transcript IDs
# it is hard to look up class in the class table since just removing the
# transcript ID suffixes (e.g. "aNov04" after a ".") does not work as
# some gene IDs have a "." in them anyway.
ssh kksilo
cd /cluster/data/hg17/bed/acembly/acembly_gene_lists
comm -13 gffvsmain.out main_gene.list.IDsort > main_gene.list.filt
wc -l main_gene.list.filt
# 52455 main_gene.list.filt
ssh hgwdev
cd /cluster/data/hg17/bed/acembly/acembly_gene_lists
# drop acemblyClass table and recreate (2005-01-27, hartera)
echo 'drop table acemblyClass;' | hgsql hg17
# prepare a file of genes and classification
# use transcript IDs - get these and corresponding gene IDs from gff files
# if gene IDs were used, it would be hard to parse them out of the
# transcript ID (name column) of the acembly genePred table: e.g. for
# transcript ID notiri.aNov04 one could strip the suffix after "." to
# get the gene ID, but some gene names contain a "." themselves and
# not all names carry the suffix.
# 260446 transcript IDs (use allFiltered.gff - see below)
perl getClass.pl main_gene.list.filt putative_gene.list.IDsort \
../acembly.ncbi_35.genes.gff/allFiltered.gff
foreach f (main_gene.list.filt putative_gene.list.IDsort)
if ($f == "main_gene.list.filt") then
set t = "main"
endif
if ($f == "putative_gene.list.IDsort") then
set t = "putative"
endif
awk 'BEGIN {OFS="\t"} {print $1, "'$t'"}' $f >> class.txt
end
sort classes.txt | uniq > geneIDtxID.class
# get transcript ID and class fields for acemblyClass table
awk 'BEGIN {OFS="\t"} {print $2,$3}' geneIDtxID.class > acemblyClass.tab
wc -l acemblyClass.tab
# 260446 acemblyClass.tab
# make change to acemblyClass.as and check in:
# change name to be transcript ID instead of gene ID
cat << '_EOF_' > $HOME/kent/src/hg/lib/acemblyClass.as
table acemblyClass
"Class for Acembly genes"
(
string name; "Transcript ID for Acembly gene"
string class; "Class of gene"
)
'_EOF_'
cd $HOME/kent/src/hg/lib/
autoSql acemblyClass.as acemblyClass
mv acemblyClass.h $HOME/kent/src/hg/inc
# do make to check it works and commit the .as, .sql, .c and .h files to CVS
cd /cluster/data/hg17/bed/acembly/acembly_gene_lists
echo "drop table acemblyClass" | hgsql hg17
hgsql hg17 < ~/kent/src/hg/lib/acemblyClass.sql
# reload table with transcript IDs
echo "load data local infile 'acemblyClass.tab' into table acemblyClass" \
| hgsql hg17
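    # Quick check of the reload (sketch; per-class counts should roughly
    # track the filtered list sizes above, scaled up by transcripts per
    # gene):
    #   hgsql hg17 -e 'select class, count(*) from acemblyClass group by class'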
# There were also 609 genes in the gff files that are not in the
# main, putative or cloud gene lists. Jean and Danielle Mieg say that
# these were filtered out from their data set but not from the gff files.
# Remove these from the gff files. (gffNotMP.out) (2005-01-24)
cd /cluster/data/hg17/bed/acembly/acembly.ncbi_35.genes.gff
cat chr*.gff > all.gff
cat << '_EOF_' > removeGenes.pl
#!/usr/bin/perl -w
use strict;
my $genes = $ARGV[0];
my $gff = $ARGV[1];
open(GENES, $genes) || die "Can not open $genes:$!\n";
open(GFF, $gff) || die "Can not open $gff:$!\n";
open(OUT, ">removed.out") || die "Can not open removed.out:$!\n";
my %genes;
while (<GENES>) {
chomp;
my $g = $_;
$genes{$g} = 1;
}
close GENES;
while (<GFF>) {
my $l = $_;
my $id;
my @line = split(/\s+/);
for (my $i = 0; $i <= $#line; $i++) {
if ($line[$i] eq "gene_id") {
$id = $line[$i+1];
}
}
$id =~ s/;//;
print "id is now $id\n";
if (!exists($genes{$id})) {
print $l;
}
else {
print OUT $l;
}
}
'_EOF_'
perl removeGenes.pl ../acembly_gene_lists/gffNotMP.out all.gff \
> allFiltered.gff
# checked that gene IDs in the removed.out file are the same
# same as those in gffNotMP.out
# reload into the acembly table
ssh hgwdev
cd /cluster/data/hg17/bed/acembly
echo 'drop table acembly;' | hgsql hg17
# Reloaded with filtered set 2005-01-23, reload again 2005-01-28 with
# the genePredExt option to get gene ID in name 2 field
ldHgGene -gtf -genePredExt hg17 acembly \
acembly.ncbi_35.genes.gff/allFiltered.gff
# Read 260446 transcripts in 3656676 lines in 1 files
# 260446 groups 41 seqs 1 sources 5 feature types
# 260446 gene predictions
# remove cdsStartStat, cdsEndStat and exonFrames fields
echo 'alter table acembly drop column cdsStartStat;' | hgsql hg17
echo 'alter table acembly drop column cdsEndStat;' | hgsql hg17
echo 'alter table acembly drop column exonFrames;' | hgsql hg17
# fix problem data found by checkTableCoords
# here the intron overlaps exon 6 so take 35546519 to be txEnd
echo 'update acembly set txEnd = 35546519 where name = "RPL10A.sNov04";'\
| hgsql hg17
# for record where end > chromSize
    echo 'select * from acembly as a, chromInfo as c where c.chrom = a.chrom and c.size < a.txEnd;' | hgsql hg17
# KIR2DL5.bNov04 on chr19_random, size is 301858, txEnd is 305266
# delete this record
echo 'delete from acembly where name = "KIR2DL5.bNov04";' | hgsql hg17
# acembly peptide table
# need to just grab same sequences that are in acembly
cd ./acembly.ncbi_35.genes.proteins.fasta
echo 'select name from acembly;' | hgsql -N hg17 > acembly.name
cat *.fasta > allPep.fa
faSomeRecords allPep.fa acembly.name acemblyPep.fa
# PEPTIDE SEQUENCES NOT LOADED
# There are 236,554 peptide names that do not match transcript IDs in
# the acembly table and 110,278 transcript IDs in acembly that do not
    # have a corresponding peptide. Waiting for response about this from
# Jean and Danielle (2005-01-31)
# hgPepPred hg17 generic acemblyPep \
# acembly.ncbi_35.genes.proteins.fasta/*.fasta
# Edit hgTracks.c to get colour coded tracks based on the gene class
# for each gene as read from the acemblyClass table.
# Edits to hui.c, hgTrackUi.c and hgTracks.c to allow filtering of
# genes based on class.
# acembly trackDb entry:
# track acembly
# shortLabel Acembly Genes
# longLabel AceView Gene Models With Alt-Splicing
# group genes
# priority 41
# visibility dense
# color 155,0,125
# type genePred acemblyPep acemblyMrna
# url http://www.ncbi.nih.gov/IEB/Research/Acembly/av.cgi?db=hg17&l=$$
# itemClassTbl acemblyClass
# geneClasses main putative
# gClass_main 128,0,125
# gClass_putative 200,0,125
# urlLabel Transcript ID:
# search added:
# searchTable acembly
# searchType genePred
# searchMethod prefix
# termRegex [^[:space:]]+
# searchPriority 50
# Received data with gene product relationship from Jean Thierry-Mieg
# (2005-02-17)
ssh eieio
cd /cluster/data/hg17/bed/acembly/acembly.ncbi_35.genes.proteins.fasta
    wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/repository/acedb/human/acembly.ncbi_35.gene2product.txt.gz
gunzip acembly.ncbi_35.gene2product.txt.gz
# these are gene ID and product mappings, need transcript ID to product
# mappings. E-mailed Jean Thierry-Mieg to ask for this information
# BUILD WGRNA TRACK (DONE, 2004-12-13, Fan)
# Grab data from original miRNA track and convert them into wgRna .tab format.
hgsql hg17 --skip-column-names -e 'select * from miRNA' >miRNA.out
cat miRNA.out | awk {'print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t""miRna"'} >wgRna.tab
# Break the original custom track data file, hsa-snoRNA_track.txt, into two files j1 and j2,
# then remove header and blank lines.
cat j1 | awk {'print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"0"\t"0"\t""CDBox"'} >>wgRna.tab
cat j2 | awk {'print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"0"\t"0"\t""HAcaBox"'} >>wgRna.tab
# load into wgRna table
hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# create and edit wgRna.html under src/hg/makeDb/trackDb/human/hg17.
# RELOADED wgRna DATA USING wgRNA_corrected.txt SENT BY MICHEL WEBER
# Manually removed the first header line, the first column (the bin
# field), and the last empty line.
cut -f 2- wgRNA_corrected.txt >wgRna.tab
vi wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# UPDATED WGRNA DATA PER EMAIL FROM WEBER (2004-12-14, Fan).
# Added the following 3 lines to j1
chr3 161715396 161715726 U90 480 -
chr11 93104316 93104387 Z40 480 -
chr11 93106041 93106114 mgh28S-2410 480 -
# Regenerated wgRna table
cat miRNA.out | awk {'print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t""miRna"'} >wgRna.tab
cat j1 | awk {'print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"0"\t"0"\t""CDBox"'} >>wgRna.tab
cat j2 | awk {'print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"0"\t"0"\t""HAcaBox"'} >>wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# Changed the following records to RNA type scaRna.
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U85"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U87"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U88"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U89"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U90"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U91"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U92"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U93"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="U100"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA26"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA35"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA45"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA47"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="ACA57"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="HBII-382"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="mgU2-19/30"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="mgU2-25/61"'
hgsql hg17 -e 'update wgRna set type = "scaRna" where name ="mgU2-22/U4-8"'
# Updated .../trackDb/human/hg17/wgRna.html.
# MAKE VSDANRER2 DOWNLOADABLES (DONE, 2004-12-14, hartera)
# REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat
# (DONE, 2004-12-22, hartera)
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.danRer2/axtChrom
set gp = /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p $gp/vsDanRer2/axtChrom
cp -p *.axt $gp/vsDanRer2/axtChrom
cd $gp/vsDanRer2/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# copy chains and nets to downloads area
# re-make chains and net downloadables (2004-12-22, hartera)
rm $gp/vsDanRer2/zebrafish*.gz $gp/vsDanRer2/md5sum.txt
cd /cluster/data/hg17/bed/blastz.danRer2/axtChain
gzip -c all.chain.antirepeat > \
/cluster/data/hg17/zip/zebrafishDanRer2.chain.gz
gzip -c zfishdanRer2.net > /cluster/data/hg17/zip/zebrafishDanRer2.net.gz
cd $gp/vsDanRer2
mv /cluster/data/hg17/zip/zebrafish*.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# CLEANUP DANRER2 BLASTZ (DONE, 2004-12-14, hartera)
# RE-DONE (DONE, 2004-12-22, hartera)
# REMOVED RAW AND LAV DIRS (DONE, 2005-02-24, hartera)
ssh eieio
cd /cluster/data/hg17/bed/blastz.danRer2
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/noClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net &
nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain &
nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet &
nice rm -rf raw &
nice rm -rf lav &
# EXTRACT AXT'S AND MAF'S FROM TETRAODON (tetNig1) NET
# (DONE, 2004-12-15, hartera)
# Redo to remove overlaps (2006-04-07 kate)
ssh eieio
# create axts
cd /cluster/data/hg17/bed/blastz.tetNig1/axtChain
netSplit tetNig1.net tetraodonNet
mkdir -p ../axtNet
cat > axtNet.csh << 'EOF'
foreach f (tetraodonNet/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt tetraodonNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib /cluster/data/tetNig1/nib ../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
'EOF'
chmod +x axtNet.csh
csh axtNet.csh >&! axtNet.log &
tail -100f axtNet.log
# sort axts before making mafs - must be sorted for multiz
cd /cluster/data/hg17/bed/blastz.tetNig1
mv axtNet axtNet.unsorted
mkdir axtNet
foreach f (axtNet.unsorted/*.axt)
set c = $f:t:r
echo "Sorting $c"
axtSort $f axtNet/$c.axt
end
# create maf
ssh eieio
cd /cluster/data/hg17/bed/blastz.tetNig1
cd axtNet
mkdir ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (chr*.axt)
set maf = $f:t:r.tetNig1.maf
echo translating $f to $maf
axtToMaf $f \
/cluster/data/hg17/chrom.sizes /cluster/data/tetNig1/chrom.sizes \
../mafNet/$maf -tPrefix=hg17. -qPrefix=tetNig1.
end
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/*.net &
# redo axt's and maf's to remove overlaps (2006-04-07 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.tetNig1
mv axtNet axtNet.old
mv mafNet mafNet.old
mkdir -p axtNet mafNet
cd axtChain
cat > fix.csh << 'EOF'
date
foreach f (tetraodonNet/chr*.net)
set c = $f:t:r
echo $c
netToAxt tetraodonNet/$c.net chain/$c.chain \
/cluster/data/hg17/nib /cluster/data/tetNig1/nib stdout | \
axtSort stdin ../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/tetNig1/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=tetNig1.
end
date
'EOF'
csh fix.csh >&! fix.log &
cd /san/sanvol1/scratch/hg17/mafNet
rm -fr tetNig1
cp -rp /cluster/data/hg17/bed/blastz.tetNig1/mafNet tetNig1
# 10-WAY MULTIZ -- 8-WAY PLUS FROG AND TETRA (DONE 2004-12-22 kate)
# Use older multiz (not v10) till bugs fixed
ssh eieio
cd /cluster/data/hg17/bed
rm multiz10way
mkdir multiz.2004-12-22
ln -s multiz.2004-12-22 multiz10way
cd multiz10way
cat > tree.nh << 'EOF'
((((((hg17,panTro1),(rn3,mm5)),canFam1),galGal2),xenTro1),((fr1,tetNig1),danRer1))
'EOF'
mkdir /cluster/bluearc/hg17/multiz.2004-12-22
cd /cluster/bluearc/hg17
mkdir 2004-12-22
rm multiz10way
ln -s multiz.2004-12-17 multiz10way.v10
ln -s multiz.2004-12-22 multiz10way
# reuse pairwise MAF's on bluearc
mv multiz10way.v10/{canFam1,danRer1,fr1,galGal2,mm5,panTro1,rn3,tetNig1,xenTro1} multiz10way
# NOTE: pairwise mafs were moved to /cluster/bluearc/hg17/mafNet
# make output dir and run dir
ssh kk9
cd /cluster/data/hg17/bed
cd multiz10way
mkdir -p maf
mkdir -p run
cd run
# create scripts to run multiz on cluster
cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set multi = /scratch/$user/multiz10way.$c
set pairs = /cluster/bluearc/hg17/multiz10way
# special mode --
# with 1 arg, cleanup
if ($#argv == 1) then
rm -fr $multi
exit
endif
set s1 = $2
set s2 = $3
# locate input files -- in pairwise dir, or multiple dir
set d1 = $multi
set d2 = $multi
if (-d $pairs/$s1) then
set d1 = $pairs
endif
if (-d $pairs/$s2) then
set d2 = $pairs
endif
set f1 = $d1/$s1/$c.maf
set f2 = $d2/$s2/$c.maf
# write to output dir
set out = $multi/${s2}${s1}
mkdir -p $out
# check for empty input file
if (-s $f1 && -s $f2) then
echo "Aligning $f1 $f2"
/cluster/bin/penn/multiz $f1 $f2 - > $out/$c.maf
else if (-s $f1) then
cp $f1 $out
else if (-s $f2) then
cp $f2 $out
endif
'EOF'
# << for emacs
chmod +x oneMultiz.csh
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
oneMultiz.csh $c mm5 panTro1
oneMultiz.csh $c rn3 panTro1mm5
oneMultiz.csh $c canFam1 panTro1mm5rn3
oneMultiz.csh $c galGal2 panTro1mm5rn3canFam1
oneMultiz.csh $c xenTro1 panTro1mm5rn3canFam1galGal2
oneMultiz.csh $c fr1 panTro1mm5rn3canFam1galGal2xenTro1
oneMultiz.csh $c tetNig1 panTro1mm5rn3canFam1galGal2xenTro1fr1
oneMultiz.csh $c danRer1 panTro1mm5rn3canFam1galGal2xenTro1fr1tetNig1
# get final alignment file
cp /scratch/$user/multiz10way.$c/panTro1mm5rn3canFam1galGal2xenTro1fr1tetNig1danRer1/$c.maf /cluster/data/hg17/bed/multiz10way/maf/$c.maf
#cleanup
oneMultiz.csh $c
'EOF'
# << for emacs
chmod +x allMultiz.csh
cat > gsub << 'EOF'
#LOOP
allMultiz.csh $(root1) {check out line+ /cluster/data/hg17/bed/multiz10way/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << for emacs
cut -f 1 /cluster/data/hg17/chrom.sizes > chrom.lst
gensub2 chrom.lst single gsub jobList
para create jobList
para try; para check
para push
# post-process multiz maf with maf_project to "glue" short
# alignment blocks together
ssh eieio
cd /cluster/data/hg17/bed/multiz10way.v8
mkdir -p mafGlued
cd maf
foreach f (*.maf)
set c = $f:r
echo "gluing $f"
/cluster/bin/penn/maf_project $f hg17.$c > ../mafGlued/$c.maf
end
# filter out alignment blocks with no alignments in non-reference species,
# and low-scoring alignments based on Webb Miller's latest
# recommendations (score < -5 * ncol^2 * nrow)
# NOTE: Webb hasn't approved the filtered alignments yet,
# so leaving them in for now.
#mkdir -p mafFiltered
#cd ../mafGlued
#foreach f (*.maf)
#set c = $f:r
#echo "filtering $f"
#~kate/bin/i386/mafFilter -factor $f > ../mafFiltered/$c.maf
#end
#cd ..
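    # Worked instance of that cutoff: a block of 10 rows (species) and 50
    # columns would be dropped when score < -5 * 50^2 * 10 = -125000, so
    # larger blocks tolerate proportionally lower scores.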
grep score mafGlued/chr1.maf | wc -l
grep score mafFiltered/chr1.maf | wc -l
grep score mafGlued/bad | wc -l
# 43692
grep score=0.0 bad | wc -l
# 10206
# load alignments into tables
ssh hgwdev
cd /cluster/data/hg17/bed/multiz10way.v8
set mafDir = /gbdb/hg17/mafNet
mkdir -p $mafDir
# multiple alignment
set mafDir = /gbdb/hg17/multiz10way/maf
mkdir -p $mafDir/multiz10way
cd /cluster/data/hg17/bed/multiz10way.v8/mafGlued
ln -s `pwd`/*.maf $mafDir/multiz10way
hgLoadMaf hg17 -warn multiz10way -pathPrefix=$mafDir/multiz10way
# load summary table to replace pairwise
cd /cluster/data/hg17/bed/multiz10way.v8/mafGlued/
time cat chr*.maf | hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 hg17 multiz10waySummary stdin
# Processed 27314693 components in 9081437 mafs from stdin
# 30 minutes
# CONSERVATION SCORING WITH PHASTCONS (DONE 2005-01-14 kate)
# 1. Partition multiple alignment into windows, using "msa_split"
# 2. Create starting tree model, with branch lengths
# use "phyloFit" on alignments
# 3. Estimate GC avg. over all species, use "msa_view" on maf
# 4. Estimate other model params, using phastCons (via doEstimate script)
# NOTE: no alignment filtering done -- the scores don't look
# particularly meaningful w/ this version of multiz.
# Next time, run on "glued" (maf_projected)
ssh eieio
cd /cluster/data/hg17/bed/multiz10way.v8
set mafDir = /cluster/bluearc/hg17/multiz10way.v8/maf
mkdir -p $mafDir
cp -r maf/*.maf $mafDir
ssh kk9
cd /cluster/data/hg17/bed/multiz10way.v8
mkdir cons
cd cons
# break up the genome-wide MAFs into pieces
# NOTE: chrom fasta files are already on the bluearc
# from previous run
mkdir /cluster/bluearc/hg17/chrom
cd /cluster/data/hg17
foreach f (`cat chrom.lst`)
echo $f
cp -r $f/*.fa /cluster/bluearc/hg17/chrom
end
cd /cluster/data/hg17/bed/multiz10way.v8/cons
mkdir run.split
cd run.split
set WINDOWS = /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.sh
#!/bin/sh
PHAST=/cluster/bin/phast
FA_SRC=/cluster/bluearc/hg17/chrom
WINDOWS=/cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS
maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg17,panTro1,mm5,rn3,canFam1,galGal2,xenTro1,fr1,tetNig1,danRer1 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -B 5000
[ $? -eq 0 ] || exit 1
echo "Copying..."
cd /scratch/msa_split
for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
[ $? -eq 0 ] || exit 1
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
echo "Done" >> ${WINDOWS}/$c.done
'EOF'
# << for emacs
set WINDOWS = /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS
chmod +x doSplit.sh
rm -f jobList
foreach file (/cluster/bluearc/hg17/multiz10way.v8/maf/*.maf)
set c = $file:t:r
echo "doSplit.sh $file {check out line+ $WINDOWS/$c.done}" >> jobList
end
para create jobList
# 46 jobs
para try
para check
# TODO: cleanup
# rm -fr $mafDir
# now generate conservation scores and predicted elements
set path = ($path /cluster/bin/phast); rehash
cd /cluster/data/hg17/bed/multiz10way.v8/cons
mkdir run.elements
cd run.elements
# create a starting tree model from a chr1 ss files in WINDOWS dir.
ssh kolossus
cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
gunzip -c /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/chr1.14996059-15998256.ss.gz \
> /tmp/phastCons.$$
phyloFit -i SS /tmp/phastCons.$$ --out-root starting-tree --tree \
"((((((hg17,panTro1),(mm5,rn3)),canFam1),galGal2),xenTro1),((fr1,tetNig1),danRer1))"
rm /tmp/phastCons.$$
cat starting-tree.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -2635749.517410
#BACKGROUND: 0.247225 0.248374 0.250827 0.253574
#RATE_MAT:
#-0.997890 0.201447 0.648573 0.147870
#0.200515 -1.020796 0.190184 0.630096
#0.639258 0.188324 -1.025170 0.197587
#0.144168 0.617176 0.195447 -0.956791
#TREE: ((((((hg17:0.006401,panTro1:0.008342):0.099376,(mm5:0.083404,rn3:0.105411):0.242694):0.020883,canFam1:0.221922):0.099131,galGal2:0.275759):0.041997,xenTro1:0.280306):0.064815,((fr1:0.137674,tetNig1:0.091463):0.118573,danRer1:0.250847):0.064815);
# estimate model parameters
# estimate avg. cross-species avg. GC content from chr1 maf's
ssh kolossus
set path = ($path /cluster/bin/phast); rehash
cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
msa_view --aggregate hg17,panTro1,rn3,mm5,canFam1,galGal2,xenTro1,danRer1,tetNig1,fr1 \
-i MAF \
--summary-only /cluster/data/hg17/bed/multiz10way.v8/maf/chr1.maf\
> maf_summary.txt
awk '$1 == "[aggregate]" {printf "%0.3f\n", $3 + $4}' maf_summary.txt
# 0.424
# generate models from random sample of genome (use 90 1Mb windows,
# to conveniently run on rack 9 100-node cluster)
# On first pass, used parameters from 8way alignment:
    # --expected-lengths 12 --target-coverage .17
# NOTE: there may be a cleverer way to select the first length param
# On second pass, used parameters below, based on consEntropy
# and featureBits coverage of elements, below
cat << 'EOF' > doEstimate.sh
#!/bin/sh
zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.424 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 11 --target-coverage 0.20 --quiet --log $2 --estimate-trees $3
'EOF'
chmod u+x doEstimate.sh
rm -fr LOG TREES
mkdir -p LOG TREES
ls /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/*.gz > all.windows
/cluster/bin/phast/chooseLines -k 90 all.windows > subset.windows
rm -f jobs.lst
foreach f (`cat subset.windows`)
set root = $f:t:r:r
echo doEstimate.sh /cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/$f LOG/$root.log TREES/$root >> jobs.lst
end
# run cluster job (about an hour)
ssh kk9
cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
para create jobs.lst
# 90 jobs written to batch
para try; para check
para push
# 2 jobs crashed with out-of-mem; as we are just taking a sample
# this is probably OK, but I've notified Adam
# Average job time: 1055s 17.58m 0.29h 0.01d
# Longest job: 3647s 60.78m 1.01h 0.04d
# NOTE: should have used ave.noncons.mod to improve parameter estimation
    # cp ave.noncons.mod starting-tree.mod
ls TREES/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
grep TREE ave.cons.mod
# TREE: ((((((hg17:0.002313,panTro1:0.002931):0.036375,(mm5:0.029849,rn3:0.039008):0.095334):0.003258,canFam1:0.078205):0.047189,galGal2:0.158045):0.020103,xenTro1:0.169387):0.028857,((fr1:0.071610,tetNig1:0.057766):0.091165,danRer1:0.138905):0.028857);
ls TREES/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
grep TREE ave.noncons.mod
# TREE: ((((((hg17:0.007342,panTro1:0.009340):0.116009,(mm5:0.095037,rn3:0.124288):0.304355):0.010633,canFam1:0.249367):0.151476,galGal2:0.507037):0.064317,xenTro1:0.549121):0.094733,((fr1:0.231246,tetNig1:0.185161):0.296288,danRer1:0.446734):0.094733);
# analyze conservation genome-wide
cat << 'EOF' > doPhastCons.sh
#!/bin/sh
mkdir -p /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS
pref=`basename $1 .ss.gz`
chr=`echo $pref | awk -F\. '{print $1}'`
tmpfile=/scratch/phastCons.$$
zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 11 --target-coverage 0.20 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
chmod u+x doPhastCons.sh
rm -fr /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS
rm -f jobs2.lst
foreach f (/cluster/bluearc/hg17/multiz10way.v8/cons/WINDOWS/*.ss.gz)
echo doPhastCons.sh $f >> jobs2.lst
end
# run cluster job (it's quick -- 10 minutes or so)
ssh kk
cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
para create jobs2.lst
# 2932 jobs written to batch
para try; para check
para push
# Average job time: 80s 1.33m 0.02h 0.00d
# Longest job: 157s 2.62m 0.04h 0.00d
# Submission to last job: 583s 9.72m 0.16h 0.01d
# combine predictions and transform scores to be in 0-1000 interval
# do in a way that avoids limits on numbers of args
rm -f splitfiles* all.raw.bed
find /cluster/bluearc/hg17/multiz10way.v8/phastCons/ELEMENTS -name "*.bed" > files
split files splitfiles
foreach s (splitfiles*)
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' `cat $s` >> all.raw.bed
end
/cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
rm files splitfiles*
ssh hgwdev
cd /cluster/data/hg17/bed/multiz10way.v8/cons/run.elements
hgLoadBed hg17 phastConsElements10way all.bed
sort -rn -k 5 all.bed | sed -n '1,100000p' > top100K.bed
hgLoadBed hg17 phastConsElements10wayTop100K top100K.bed
# check coverage -- re-ran the estimation and conservation steps with new
# parameters until coverage was close to 5% and the expected-length parameter
# was close to the consEntropy-recommended length (one iteration of the
# check is sketched below)
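# (Sketch of one tuning iteration, not a verbatim record of the run: pick
# trial --target-coverage/--expected-lengths values, re-run the estimation
# and phastCons steps with them, then check both numbers; the tgt/len
# values here are illustrative)
set tgt = 0.20
set len = 11
featureBits hg17 phastConsElements10way         # want coverage near 5%
/cluster/bin/phast/consEntropy $tgt $len \
    ave.cons.mod ave.noncons.mod --NH 9.78      # want omega near $len
# repeat with adjusted values until both criteria are met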
featureBits hg17 phastConsElements10way
# first pass
# .17 12
# 132657993 bases of 2866216770 (4.628%) in intersection
# second pass -- used this
# .20 11
# 143386170 bases of 2866216770 (5.003%) in intersection
featureBits hg17 phastConsElements
# 137850739 bases of 2866216770 (4.810%) in intersection
# check expected-length parameter
# first pass
/cluster/bin/phast/consEntropy .17 12 \
ave.cons.mod ave.noncons.mod --NH 9.78
# recommended length 10.4
# second pass -- good enough according to Adam
/cluster/bin/phast/consEntropy .20 11 \
ave.cons.mod ave.noncons.mod --NH 9.78
#( Solving for new omega: 11.000000 12.243251 12.155776 12.155369 )
#Transition parameters: gamma=0.200000, omega=11.000000, mu=0.090909, nu=0.022727
#Relative entropy: H=1.263205 bits/site
#Required length: N=7.548911 sites
#Total entropy: NH=9.535821 bits
#Recommended expected length: omega=12.155369 sites (for NH=9.780000)
# create wiggle data files
ssh eieio
cd /cluster/data/hg17/bed/multiz10way.v8/cons
# sort post-prob files by chrom position using filename, then
# use wigEncode to create binary files for wiggle
find /cluster/bluearc/hg17/multiz10way.v8/phastCons/POSTPROBS \
-name "*.pp.gz" | sort -t\. -k2,2n | xargs zcat | \
wigEncode stdin phastCons10way.wig phastCons10way.wib
hgWiggle -doHistogram -hBinSize=0.001 \
    -hBinCount=1000 -hMinVal=0.0 -db=hg17 phastCons >histo.8way.data
hgWiggle -doHistogram -hBinSize=0.001 \
    -hBinCount=1000 -hMinVal=0.0 -db=hg17 phastCons10way >histo.10way.data
hgWiggle -db=hg17 -doStats \
phastCons > stats.8way.data
hgWiggle -db=hg17 -doStats \
phastCons10way > stats.10way.data
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg17/bed/multiz10way.v8/cons
set wibDir = /gbdb/hg17/multiz10way/wib/phastCons10way
mkdir -p $wibDir
ln -s `pwd`/phastCons10way.wib $wibDir
hgLoadWiggle hg17 phastCons10way phastCons10way.wig \
-pathPrefix=$wibDir
# create tree image:
# edit tree.nh to create species.nh with common names
/cluster/bin/phast/draw_tree -b -s species.nh > species10.ps
# photoshop to enhance, then save as gif/jpg
cp /cluster/data/hg17/bed/multiz10way.v8/species10.jpg \
/usr/local/apache/htdocs/images/phylo/10way.jpg
# get stats on the track
ssh hgwdev
featureBits hg17 -enrichment refGene:cds phastConsElements10way
# refGene:cds 1.020%, phastConsElements10way 5.003%, both 0.711%, cover 69.73%, enrich 13.94x
# compare to previous elements (generated from 8way)
featureBits hg17 -enrichment refGene:cds phastConsElements
# refGene:cds 1.020%, phastConsElements 4.810%, both 0.747%, cover 73.22%, enrich 15.22x
# see how gluing reduces number of alignments
ssh eieio
cd /cluster/data/hg17/bed/multiz10way.v8
mkdir stats
grep score maf/chr22.maf | grep -v 0.0 | wc -l
#179576
grep score mafGlued/chr22.maf | grep -v 0.0 | wc -l
#110550
# look at distribution of alignment sizes after gluing
ssh hgwdev
cd /cluster/data/hg17/bed/multiz10way.v8
mkdir mafTemp
ln -s `pwd`/maf/chr1.maf mafTemp
# load temp table
hgLoadMaf hg17 -pathPrefix=mafTemp multiz10wayChr1
#Loaded 1246727 mafs
# again, compare to glued:
echo "SELECT COUNT(*) FROM multiz10way"
# 738030
# again, ~40% fewer
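# (illustrative arithmetic, not from the original run: the ~40% figure
# follows directly from the two counts above)
echo 1246727 738030 | awk '{printf "%.1f%% fewer\n", 100*($1-$2)/$1}'
# 40.8% fewer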
cd stats
echo "SELECT chromEnd - chromStart FROM multiz10way WHERE chrom='chr1'" | \
hgsql -N hg17 | sort -n > chr1.maf.glued.sizes
echo "SELECT chromEnd - chromStart FROM multiz10wayChr1"| \
hgsql -N hg17 | sort -n > chr1.maf.sizes
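# (Example sketch, not part of the original run: the two size distributions
# can be eyeballed with textHistogram, used elsewhere in this doc; the
# binSize here is illustrative)
textHistogram -binSize=100 chr1.maf.glued.sizes
textHistogram -binSize=100 chr1.maf.sizes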
# cleanup
hgsql hg17 -e "DROP TABLE multiz10wayChr1"
rm -fr ../mafTemp
# coverage of multiple alignment, and pairs
ssh kolossus
cd /cluster/data/hg17/bed/multiz10way.v8
cd stats
nice mafRanges -notAllOGap ../mafGlued/chr1.maf hg17 \
hg17.chr1.mafRanges.bed
nice mafRanges -notAllOGap /cluster/data/hg17/bed/multiz8way/maf/chr1.maf \
hg17 hg17.8way.chr1.mafRanges.bed
foreach db (panTro1 canFam1 mm5 rn3 galGal2 xenTro1 fr1 tetNig1 danRer1)
echo $db
nice mafRanges /cluster/data/hg17/bed/blastz.$db/mafNet/chr1.*maf \
-notAllOGap hg17 $db.chr1.mafRanges.bed
ls /cluster/data/hg17/bed/blastz.$db/mafNet/chr1.*maf
end
ssh hgwdev
cd /cluster/data/hg17/bed/multiz10way.v8/stats
nice featureBits -chrom=chr1 hg17 refGene:cds hg17.chr1.mafRanges.bed -enrichment
# refGene:cds 1.308%, hg17.chr1.mafRanges.bed 95.725%, both 1.307%, cover 99.94%, enrich 1.04x
nice featureBits -chrom=chr1 hg17 refGene:cds hg17.8way.chr1.mafRanges.bed -enrichment
# refGene:cds 1.308%, hg17.8way.chr1.mafRanges.bed 95.742%, both 1.307%, cover 99.97%, enrich 1.04x
foreach db (panTro1 canFam1 mm5 rn3 galGal2 xenTro1 fr1 tetNig1 danRer1)
nice featureBits -chrom=chr1 -enrichment hg17 refGene:cds $db.chr1.mafRanges.bed
end
#refGene:cds 1.308%, panTro1.chr1.mafRanges.bed 93.472%, both 1.264%, cover 96.65%, enrich 1.03x
#refGene:cds 1.308%, canFam1.chr1.mafRanges.bed 55.377%, both 1.277%, cover 97.64%, enrich 1.76x
#refGene:cds 1.308%, mm5.chr1.mafRanges.bed 37.342%, both 1.280%, cover 97.92%, enrich 2.62x
#refGene:cds 1.308%, rn3.chr1.mafRanges.bed 35.429%, both 1.257%, cover 96.14%, enrich 2.71x
#refGene:cds 1.308%, galGal2.chr1.mafRanges.bed 3.840%, both 0.936%, cover 71.61%, enrich 18.65x
#refGene:cds 1.308%, xenTro1.chr1.mafRanges.bed 3.059%, both 0.881%, cover 67.36%, enrich 22.02x
#refGene:cds 1.308%, fr1.chr1.mafRanges.bed 1.892%, both 0.854%, cover 65.29%, enrich 34.50x
#refGene:cds 1.308%, tetNig1.chr1.mafRanges.bed 1.384%, both 0.805%, cover 61.57%, enrich 44.50x
#refGene:cds 1.308%, danRer1.chr1.mafRanges.bed 2.716%, both 0.847%, cover 64.81%, enrich 23.86x
# MAKE HG17-RN3 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie)
ssh kolossus
set chainDir = /cluster/data/hg17/bed/blastz.rn3/axtChain
netChainSubset $chainDir/rat.net.gz $chainDir/all.chain.gz \
/cluster/data/hg17/bed/bedOver/hg17ToRn3.over.chain
# MAKE HG17-GALGAL2 OVER.CHAIN FOR LIFTOVER (DONE 1/25/05 angie)
ssh kolossus
set chainDir = /cluster/data/hg17/bed/blastz.galGal2/axtChain
netChainSubset $chainDir/human.net $chainDir/all.chain \
/cluster/data/hg17/bed/bedOver/hg17ToGalGal2.over.chain
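# (Usage sketch, not from the original run: these over.chain files are the
# map inputs for liftOver; the bed file names here are hypothetical)
liftOver myHg17.bed /cluster/data/hg17/bed/bedOver/hg17ToRn3.over.chain \
    myRn3.bed myHg17.unMapped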
# DOWNLOADS FOR 10-WAY MULTIZ (2005-01-24 kate)
# Use "glued" mafs
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir -p multiz10way
cd multiz10way
foreach f (/cluster/data/hg17/bed/multiz10way.v8/mafGlued/*.maf)
set c = $f:r:t
echo $c
nice gzip -c $f > $c.maf.gz
end
# copy README and edit
# Create upstream files for download
ssh hgwdev
cd /cluster/data/hg17/bed/multiz10way.v8
echo hg17 panTro1 mm5 rn3 canFam1 galGal2 xenTro1 fr1 tetNig1 danRer1 > org.txt
# mafFrags takes a while
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits hg17 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags hg17 multiz10way up.bed upstream$i.maf -orgs=org.txt
rm up.bed
end
ssh eieio
cd /cluster/data/hg17/bed/multiz10way.v8
nice gzip upstream{1000,2000,5000}.maf
# 6 mins.
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mv /cluster/data/hg17/bed/multiz10way.v8/upstream*.maf.gz multiz10way
cd multiz10way
md5sum *.gz > md5sum.txt
# Create histogram of this phastCons data (Hiram - 2005-02-07)
ssh hgwdev
cd /cluster/data/hg17/bed/multiz.2004-12-22/cons
time hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg17 phastCons > histogram.data 2>&1
# 34 minutes
cat << '_EOF_' > histo.gp
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 xff0000 xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Hg17 Histogram phastCons track"
set xlabel "Hg17 phastCons score"
set ylabel "p-Value"
set y2label "Cumulative Probability Distribution"
set y2range [0:1]
set y2tics
plot "histogram.data" using 2:5 title " pValue" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CPD" with lines
'_EOF_'
gnuplot histo.gp > histo.png
display histo.png &
# BLASTZ BOREOEUTHERIAN (BOREUT1) (DONE 1/29/05 braney)
ssh kk
mkdir /cluster/data/borEut1/bed/zb.hg17
ln -s /cluster/data/borEut1/bed/zb.hg17 /cluster/data/hg17/bed/blastz.borEut1
cd /cluster/data/hg17/bed/blastz.borEut1
# Use default (Human-Mouse) settings for starters.
cat << '_EOF_' > DEF
# human vs. boreoeutherian ancestor reconstruction (borEut1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/scratch/hg/gs.18/build35/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Boreoeutherian (borEut1)
SEQ2_DIR=/iscratch/i/borEut1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.borEut1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
/cluster/data/hg17/jkStuff/BlastZ_run0.sh
cd run.0
para push
# Completed: 2728 of 2728 jobs
# CPU time in finished jobs: 621440s 10357.34m 172.62h 7.19d 0.020 y
# IO & Wait Time: 19079s 317.98m 5.30h 0.22d 0.001 y
# Average job time: 235s 3.91m 0.07h 0.00d
# Longest job: 2340s 39.00m 0.65h 0.03d
# Submission to last job: 2837s 47.28m 0.79h 0.03d
ssh kki
cd /cluster/data/hg17/bed/blastz.borEut1
/cluster/data/hg17/jkStuff/BlastZ_run1.sh
cd run.1
para push
# Completed: 341 of 341 jobs
# CPU time in finished jobs: 95s 1.58m 0.03h 0.00d 0.000 y
# IO & Wait Time: 825s 13.75m 0.23h 0.01d 0.000 y
# Average job time: 3s 0.04m 0.00h 0.00d
# Longest job: 10s 0.17m 0.00h 0.00d
# Submission to last job: 73s 1.22m 0.02h 0.00d
ssh kk
cd /cluster/data/hg17/bed/blastz.borEut1
/cluster/data/hg17/jkStuff/BlastZ_run2.sh
cd run.2
para push
# /cluster/data/hg17/bed/blastz.borEut1/axtChrom/chr18_random.axt is empty
# /cluster/data/hg17/bed/blastz.borEut1/axtChrom/chr19_random.axt is empty
# ..
# Completed: 44 of 46 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 104s 1.73m 0.03h 0.00d 0.000 y
# IO & Wait Time: 482s 8.04m 0.13h 0.01d 0.000 y
# Average job time: 13s 0.22m 0.00h 0.00d
# Longest job: 134s 2.23m 0.04h 0.00d
# Submission to last job: 142s 2.37m 0.04h 0.00d
# END BLASTZ BOREOEUTHERIAN
##########################################################################
# MAKE tfbsConsSites and tfbsConsFactors for TFBS conserved track (DONE braney 1/15/05)
# Questions? weirauch@soe.ucsc.edu or braney@soe.ucsc.edu
# tfbsConsSites table reloaded 2006-11-03 - Hiram - see below:
## reload tfbsCons table - it was based on a newer version of tfbs names that
ssh hgwdev
mkdir /cluster/data/hg17/bed/tfbsCons
cd /cluster/data/hg17/bed/tfbsCons
# Define all parameters in 'PARAMS.txt'
# Define all chromosomes in 'CHROMS.txt'
# Get tfbsConsUtils.tar.gz (Perl scripts) from Matt Weirauch (weirauch@soe.ucsc.edu)
set tarfile=/cluster/data/hg17/bed/tfbsCons/tfbsConsUtils.tar.gz
tar zxf $tarfile
nice ./getRefseqStats.pl &
nice ./getBatchQueries.pl &
ssh kk
mkdir /cluster/bluearc/braney/tfloc
# Copy ./tmp/ctfbs_batch_list.txt to this dir
# Copy ./scripts/doit to this dir
para create ctfbs_batch_list.txt
para try
para push
# When the run is done (within a day or so), the results will be in individual dirs, one for each chromosome.
ssh kksilo   # (or hgwdev, or whatever)
nice ./getBedFile.pl &
hgLoadBed -noSort hg17 tfbsConsSites \
-sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql \
tfbsConsSites.bed -tab
hgLoadBed -noSort hg17 tfbsConsFactors \
-sqlTable=$HOME/kent/src/hg/lib/tfbsConsFactors.sql \
tfbsConsFactors.bed -tab
# Feel free to delete or gzip anything in ./tmp
# (particularly the huge .maf and .bed files)
# after the final two bed files are successfully loaded
##########################################################################
# CHICKEN RECIPROCAL-BEST NET FOR STRINGENT LIFTOVER (DONE 2/3/05 angie)
ssh kolossus
cd /cluster/data/hg17/bed/blastz.galGal2/axtChain
# Run chainNet again, this time keeping both of its outputs:
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin ../S1.len ../S2.len h_g.net g_h.net
# Get the chicken chains from the chicken-referenced (but human-centric)
# net:
chainSwap all.chain g_h.chain
netChainSubset g_h.net g_h.chain stdout \
| chainSort stdin g_h.subset.chain
# Net those (sorted) chicken chains, and keep both outputs, to get
# reciprocal best nets referenced to both species:
chainPreNet g_h.subset.chain ../S2.len ../S1.len stdout \
| chainNet stdin ../S2.len ../S1.len g_h.rbest.net h_g.rbest.net
# Get the chains from the recip-best nets for stringent liftOver:
netChainSubset g_h.rbest.net g_h.chain galGal2ToHg17.rbest.over.chain
netChainSubset h_g.rbest.net all.chain hg17ToGalGal2.rbest.over.chain
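# (Usage sketch, not from the original run: the rbest chains give stringent,
# roughly 1-to-1 lifting; raising -minMatch makes it stricter still. The
# bed file names here are hypothetical.)
liftOver -minMatch=0.95 myHg17.bed hg17ToGalGal2.rbest.over.chain \
    myGalGal2.bed myHg17.unMapped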
####### RE-BUILD RGD HUMAN QTL TRACKS (DONE 2/5/05 Fan) ##############
mkdir -p /cluster/store8/rgd/human050205
rm /cluster/data/hg17/bed/rgdQtl
ln -s /cluster/store8/rgd/human050205 /cluster/data/hg17/bed/rgdQtl
cd /cluster/data/hg17/bed/rgdQtl
# download data files from RGD
wget --timestamp ftp://rgd.mcw.edu/pub/RGD_genome_annotations/human/human_QTL.gff
# remove extra line feed character at the end of lines
# !!! manually corrected the AASTH7_H line because chromStart was greater than chromEnd
rmLf human_QTL.gff > rgdQtl.gff
# create rgdQtl.tab
awk '{print $1"\t"$4"\t"$5"\t"$10}' rgdQtl.gff |sed -e 's/Chr/chr/g'| \
sed -e 's/"//g' |sed -e 's/RGD://g' | sed -e 's/;//g' > rgdQtl.tab
# create rgdQtlLink.tab
awk '{printf "%s\t%s\t", $12, $10; for (i = 14;i <= NF; ++i ) {printf "%s ", $i} printf "\n"} ' rgdQtl.gff | \
sed -e 's/"//g'| sed -e 's/RGD://g' | sed -e 's/;//g'| sed -e 's/Note//g' > rgdQtlLink.tab
# load rgdQtl table
hgLoadBed hg17 rgdQtl rgdQtl.tab
# check rgdQtl table
checkTableCoords hg17 rgdQtl
# load rgdQtlLink table
hgsql hg17 -e "drop table hg17.rgdQtlLink;"
hgsql hg17 <~/kent/src/hg/lib/rgdQtlLink.sql
hgsql hg17 -e 'load data local infile "rgdQtlLink.tab" into table hg17.rgdQtlLink;'
# updated trackDb.ra under /kent/src/hg/makeDb/trackDb/human/hg17 and
# added rgdQtl.html.
# GENOSCOPE TETRAODON (tetNig1) ECORES (DONE, 2005-02-08, hartera)
ssh eieio
mkdir -p /cluster/data/hg17/bed/ecoresTetNig1
cd /cluster/data/hg17/bed/ecoresTetNig1
wget --timestamp \
http://www.genoscope.cns.fr/externe//4ucsc/ExofishHs35Tnig1
# this is in gff format
# remove "Ecotig" from name field
sed -e 's/Ecotig EG/EG/g' ExofishHs35Tnig1 > ExofishHs35Tnig1.gff
# need to have tabs between fields not a space to load file into table
sed -e 's/ /\t/g' ExofishHs35Tnig1.gff > Hs35Tnig1format.gff
# if "ecore" is changed to "CDS" and "ecotig" to "transcript" this loads
# correctly into the table.
sed -e 's/ecore/CDS/' Hs35Tnig1format.gff | sed -e 's/ecotig/transcript/' \
> Hg17vstetNig1.gff
# add "chr" in front of the chromsome name in first field (2005-02-08)
perl -pi.bak -e 's/^([0-9XYM]{1,2})/chr$1/' Hg17vstetNig1.gff
rm *.bak
# need to reload table
ssh hgwdev
cd /cluster/data/hg17/bed/ecoresTetNig1
echo 'drop table ecoresTetNig1;' | hgsql hg17
nice ldHgGene hg17 ecoresTetNig1 Hg17vstetNig1.gff
# Read 40172 transcripts in 186032 lines in 1 files
# 40172 groups 42 seqs 1 sources 2 feature types
# 40172 gene predictions
# added ecoresTetNig1 entry to trackDb.ra in trackDb/human
# and created ecoresTetNig1.html. Genoscope will not be maintaining this
# newest data in their Exofish comparative browser display.
# UPDATE kgSpAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan)
# Add new human protein display IDs to the alias table to support user search
ssh hgwdev
cd /cluster/data/hg17/bed/pb
mkdir newDisplayId
cd newDisplayId
hgsql proteome -e 'select hg17.kgSpAlias.kgID, hg17.kgSpAlias.SpID, spOldNew.newDisplayId from spOldNew, hg17.kgSpAlias where spOldNew.acc=hg17.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >hg17.tab
hgsql hg17 -e 'load data local infile "hg17.tab" into table hg17.kgSpAlias'
# UPDATE kgProtAlias TABLE WITH NEW UNIPROT DISPLAY ID ENTRIES (done 2/11/05 Fan)
# Add new hg17 protein display IDs to the alias table to support user search
ssh hgwdev
cd /cluster/data/hg17/bed/pb/newDisplayId
hgsql proteome -e 'select hg17.kgSpAlias.kgID,spOldNew.oldDisplayId,spOldNew.newDisplayId from spOldNew, hg17.kgSpAlias where spOldNew.acc=hg17.kgSpAlias.spID and oldDisplayId != newDisplayId' |sort -u >hg17.kgProtAlias.tab
# get rid of the header line at the end of the file
vi hg17.kgProtAlias.tab
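# (A non-interactive alternative to the vi edit, as a sketch: the header
# line begins with the column name "kgID")
grep -v '^kgID' hg17.kgProtAlias.tab > tmp.tab && mv tmp.tab hg17.kgProtAlias.tab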
hgsql hg17 -e 'load data local infile "hg17.kgProtAlias.tab" into table hg17.kgProtAlias'
# BLASTZ HUMAN TARGET, COW QUERY (DONE, Nov. 2004 - Jan. 2005, Heather)
ssh kk
# use /cluster/data/bosTau1 because more disk space there
cd /cluster/data/bosTau1/bed
mkdir zb.hg17
# create DEF file
# for now, not doing ABRIDGE_REPEATS
# this means I don't need to create lineage specific repeats
# This is because blastz-run wouldn't take advantage of these
# because my query is in scaffolds
cat << '_EOF_' > DEF
# human vs. cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.18/build35/bothMaskedNibs
#SEQ1_DIR=/iscratch/i/hg17/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow
SEQ2_DIR=/iscratch/i/bosTau1/splitDir
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/zb.hg17
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
bash
cd /cluster/data/bosTau1/bed/zb.hg17
source DEF
mkdir $RAW run.0
# create S2.len so make-joblist doesn't have to
/cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
# check how many lines in j
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, para check, para push, para check....
# convert out to lav
ssh kki
cd /cluster/data/bosTau1/bed/zb.hg17
# run bash shell if not running it already
source DEF
mkdir -p $BASE/run.1
mkdir -p $BASE/lav
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > run.1/jobList
cd run.1
# make sure the job list is OK
wc -l jobList
head jobList
para create jobList
para try
para check
para push
# lavToAxt
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17
mkdir axtTemp
cd lav
foreach i (*)
catDir $i | lavToAxt stdin /cluster/data/hg17/nib \
/cluster/data/bosTau1/bosTau1.2bit ../axtTemp/$i.axt
echo done $i
end
# axtChain
ssh kki
cd /cluster/data/bosTau1/bed/zb.hg17
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chainRaw
ls -1S /cluster/data/bosTau1/bed/zb.hg17/axtTemp/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chainRaw/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/hg17/bothMaskedNibs /iscratch/i/bosTau1/nib/bosTau1.2bit $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try
para check
para push
# Completed: 46 of 46 jobs
# Average job time: 83s 1.39m 0.02h 0.00d
# Longest job: 1240s 20.67m 0.34h 0.01d
# Submission to last job: 1326s 22.10m 0.37h 0.02d
# mergesort
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
chainMergeSort run1/chainRaw/*.chain > all.chain.jan3
# chainAntiRepeat
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain/run1
mkdir chainAntiRepeat
# test with just one
chainAntiRepeat /cluster/store5/gs.18/build35/nib /cluster/data/bosTau1/bosTau1.2bit \
chainRaw/chr18.chain chainAntiRepeat/chr18.chain
# do them all
foreach f (chainRaw/*.chain)
set f1 = $f:t
echo $f1
chainAntiRepeat /cluster/store5/gs.18/build35/nib /cluster/data/bosTau1/bosTau1.2bit \
$f chainAntiRepeat/$f1
end
# mergesort again
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
chainMergeSort run1/chainAntiRepeat/*.chain > all.chain.jan5
gzip all.chain.jan3
# split
mkdir chain
chainSplit chain all.chain.jan5
# look at the distribution
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=5000 /tmp/score.$f:t:r
echo ""
end
# see files histogram.out and histogram.interesting
# run chainFilter
chainFilter -minScore=5000 all.chain.jan5 > all.chain.jan5.filtered
gzip all.chain.jan5
# split
rm chain/*
chainSplit chain all.chain.jan5.filtered
gzip all.chain.jan5.filtered
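# (Quick check, not from the original run: compare chain counts before and
# after the -minScore=5000 filter)
zcat all.chain.jan5.gz | grep -c '^chain'
zcat all.chain.jan5.filtered.gz | grep -c '^chain'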
# load
ssh hgwdev
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg17 ${c}_chainBosTau1 $i
end
# featureBits -chrom=chr1 hg17 chainBosTau1Link
# 103272818 bases of 222827847 (46.346%) in intersection
# featureBits -chrom=chr2 hg17 chainBosTau1Link
# 105920345 bases of 237506229 (44.597%) in intersection
# featureBits -chrom=chr3 hg17 chainBosTau1Link
# 89582887 bases of 194635740 (46.026%) in intersection
# featureBits -chrom=chr4 hg17 chainBosTau1Link
# 77513949 bases of 187161218 (41.416%) in intersection
# featureBits -chrom=chr5 hg17 chainBosTau1Link
# 80428726 bases of 177702766 (45.260%) in intersection
# featureBits -chrom=chr6 hg17 chainBosTau1Link
# 71830264 bases of 167317699 (42.930%) in intersection
# featureBits -chrom=chr7 hg17 chainBosTau1Link
# 64561289 bases of 154759139 (41.717%) in intersection
# featureBits -chrom=chr8 hg17 chainBosTau1Link
# 55896735 bases of 142612826 (39.195%) in intersection
# featureBits -chrom=chr9 hg17 chainBosTau1Link
# 52068957 bases of 117781268 (44.208%) in intersection
# featureBits -chrom=chr10 hg17 chainBosTau1Link
# 57427282 bases of 131613628 (43.633%) in intersection
# featureBits -chrom=chr11 hg17 chainBosTau1Link
# 58412709 bases of 131130853 (44.545%) in intersection
# featureBits -chrom=chr12 hg17 chainBosTau1Link
# 56076163 bases of 130259811 (43.049%) in intersection
# featureBits -chrom=chr13 hg17 chainBosTau1Link
# 37951944 bases of 95559980 (39.715%) in intersection
# featureBits -chrom=chr14 hg17 chainBosTau1Link
# 39896970 bases of 88290585 (45.188%) in intersection
# featureBits -chrom=chr15 hg17 chainBosTau1Link
# 37507979 bases of 81341915 (46.112%) in intersection
# featureBits -chrom=chr16 hg17 chainBosTau1Link
# 33883573 bases of 78884754 (42.953%) in intersection
# featureBits -chrom=chr17 hg17 chainBosTau1Link
# 31871034 bases of 77800220 (40.965%) in intersection
# featureBits -chrom=chr18 hg17 chainBosTau1Link
# 30359555 bases of 74656155 (40.666%) in intersection
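# (The per-chromosome numbers above came from one featureBits run per
# chromosome; a loop form, as a sketch:)
foreach n (`seq 1 18`)
    featureBits -chrom=chr$n hg17 chainBosTau1Link
end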
# NET
# run in stages to avoid memory problems
ssh kolossus
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
# PRE
/cluster/bin/x86_64/chainPreNet all.chain.jan5.filtered ../S1.len ../S2.len chainPreNet.out
# chainNet
/cluster/bin/x86_64/chainNet chainPreNet.out \
-minSpace=1 ../S1.len ../S2.len bosTau1.net.raw /dev/null
# syntenic (using revision 1.6)
/cluster/home/heather/bin/x86_64/netSyntenic bosTau1.net.raw bosTau1.net.syn
# memory usage 2757492736, utime 13404 s/100, stime 616
# backup/compress
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
gzip bosTau1.net.raw
cp bosTau1.net.syn bosTau1.net.syn.backup
# netClass
# takes about 4 hours
ssh hgwdev
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
netClass -noAr bosTau1.net.syn hg17 bosTau1 bosTau1.net
# backups
ssh kksilo
cp bosTau1.net bosTau1.net.backup
rm bosTau1.net.syn.backup
# load
ssh hgwdev
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
netFilter -minGap=10 bosTau1.net | hgLoadNet hg17 netBosTau1 stdin
rm bosTau1.net.backup
# index has NULL cardinality; analyze to fix
hgsql hg17
analyze table netBosTau1;
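# (verification sketch, still inside the mysql session: confirm the index
# cardinality is no longer NULL)
show index from netBosTau1;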
# generate axts
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17
mkdir axtNet
# split first (not required?)
cd axtChain
mkdir net
netSplit bosTau1.net.syn net
cd net
foreach i (*.net)
netToAxt $i ../chain/$i:r.chain /cluster/data/hg17/nib /cluster/data/bosTau1/bosTau1.2bit ../../axtNet/$i:r.axt
end
gzip bosTau1.net.syn
gzip bosTau1.net
# axtSort (takes about 5 minutes)
ssh kksilo
cd /cluster/data/bosTau1/bed/zb.hg17
mkdir axtNetSort
foreach f ( axtNet/*.axt )
set c = $f:t:r
echo "axtSort on $c"
axtSort $f axtNetSort/$c.axt
end
# make maf files
mkdir mafNet
foreach f (axtNetSort/*.axt)
set c = $f:t:r
echo "axtToMaf on $c"
axtToMaf $f /cluster/data/hg17/chrom.sizes /cluster/data/bosTau1/chrom.sizes mafNet/$c.maf -tPrefix=hg17. -qPrefix=bosTau1.
end
# MAKE VSBOSTAU1 DOWNLOADABLES (DONE Feb. 15, 2005 Heather)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir vsBosTau1
cd vsBosTau1
mkdir axtNet
cd /cluster/data/bosTau1/bed/zb.hg17/axtChain
cp -p all.chain.gz /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1/cow.chain.gz
cp -p bosTau1.net.gz /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1/cow.net.gz
cd ../axtNet
cp -p * /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1/axtNet
cd /usr/local/apache/htdocs/goldenPath/hg17/vsBosTau1
# Make a README.txt which explains the files & formats.
md5sum *.gz > md5sum.txt
cd axtNet
md5sum *.gz > md5sum.txt
# YALE PSEUDOGENES (started Robert Baertsch, finished JK 2/21/05)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir pseudoYale
cd pseudoYale
# Place file obtained from Mark Gerstein at yale in pseudoYale.gtf
ldHgGene hg17 pseudoYale pseudoYale.gtf
# Note - I'm guessing how this goes. Robert left no record. -jk
# added xenoRefGene track (markd ~2005-02-20)
# add the following to /cluster/data/genbank/genbank.conf:
hg17.refseq.mrna.xeno.load = yes
hg17.refseq.mrna.xeno.loadDesc = yes
# BUILD ccdsGene and ccdsInfo tables (markd 2005-02-25)
# download files to the genbank data area, as this will eventually
# be done automatically as part of the genbank build process.
cd /cluster/data/genbank
mkdir -p data/ccds/hg17/2005-02-25
cd data/ccds/hg17/2005-02-25
# get the basic text dumps of the data, rather than the database dumps
wget -r ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/
# ends up with:
# About-NcbiHinxton.txt
# NcbiHinxton.txt
# NcbiHinxtonAllAccessions.txt
# this is a preliminary release; it contained 2 PAR genes that had
# bad coordinates and 7 genes that were determined to be pseudogenes
# at the last minute. The accessions for these 9 genes were
# placed in skip.ccds and then removed:
fgrep -v -f skip.ccds /scratch/markd/gene-sets/ncbiDb/set1.5/NcbiHinxtonAllAccessions.txt > /scratch/markd/gene-sets/ncbiDb/set1.5/NcbiHinxtonAllAccessions.cleaned.txt
# create the tab files to load in the database
/cluster/data/genbank/bin/i386/ccdsImport NcbiHinxtonAllAccessions.cleaned.txt ccdsGene.gp ccdsInfo.tab
# load ccdsInfo
hgsql hg17 <../../../../../lib/ccdsInfo.sql
hgsql -e 'load data local infile "ccdsInfo.tab" into table ccdsInfo' hg17
# load ccdsGene.gp and check
ldHgGene -predTab -genePredExt hg17 ccdsGene ccdsGene.gp
checkTableCoords hg17 -verbose=2 ccdsGene
rm *.tab
gzip -9 NcbiHinxton*.txt
# BUILD refSeqKg TABLE TO SUPPORT CCDS GENES (RE-DONE, Fan 2/26/05)
hgsql hg17 -N -e "select * from knownGene" >kg.gp
hgsql hg17 -N -e "select * from refGene" >ref.gp
overlapSelect -inCds -strand -idOutput -fraction=fraction.out -selectCds -overlapSimilarity=0.90 -selectFmt=genePred -inFmt=genePred kg.gp ref.gp refSeqKg.90.tab
cat fraction.out|sort -u >refSeqKg.tab
hgsql hg17 -e 'drop table refSeqKg'
hgsql hg17 < ~/src/hg/lib/refSeqKg.sql
hgsql hg17 -e 'load data local infile "refSeqKg.tab" into table refSeqKg'
rm fraction.out
# BUILD ccdsGene and ccdsInfo tables (markd, redone 2005-03-17)
cd /cluster/store5/genbank/data/ccds/hg17
wget ftp://ftp.ncbi.nlm.nih.gov/pub/hcds/Hs35.1/CDSTrackDB/CCDS.20050303.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/store5/genbank/data/ccds/hg17/CCDS.20050303.tar.gz
# import ccds database tables
hgsql -e 'create database ccds'
hgsql ccds </cluster/data/genbank/etc/createTables.sql
hgsql ccds </cluster/data/genbank/etc/createKeys.sql
/cluster/data/genbank/bin/i386/ccdsImport ccds data/[A-Z]*.txt
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/i386/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
# refSeqKg table
hgsql -N -e "select * from knownGene" hg17 >kg.gp
hgsql -N -e "select * from refGene" hg17 >ref.gp
overlapSelect -statsOutput -strand -inCds -selectCds -overlapSimilarity=0.90 kg.gp ref.gp stdout | tail +2 | sort -u >refSeqKg.tab
hgsql hg17 -e 'drop table refSeqKg'
hgsql hg17 < ~/compbio/kent/src/hg/lib/refSeqKg.sql
hgsql hg17 -e 'load data local infile "refSeqKg.tab" into table refSeqKg'
cd ..
rm -r ccds
# COW BACENDS (Done, Heather, Mar. 21, 2005)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir bacendsCow
cd bacendsCow
# Obtain GFF file from Denis; unzip into BACendhg17.gff
# Convert into BED 6:
makebed.pl < BACendhg17.gff > BACendhg17.bed
hgLoadBed -noBin hg17 bacendsCow BACendhg17.bed
# 53403 warnings
# add to kent/src/hg/makeDb/trackDb/human/hg17/trackDb.ra
# make map between ccds and known genes (markd 2005/03/08)
# this should be run whenever either known genes or ccds is updated
/cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
# UPDATE WGRNA TRACK (DONE, 2004-12-13, Fan)
# Received updated data file, wg_track_april2005.txt, from Michel Weber by email.
cut -f 2-10 wg_track_april2005.txt |tail +2 >wg_track_april2005.tab
# Use editor to remove the last blank line.
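# (Non-interactive alternative to the manual edit, as a sketch: delete any
# blank lines in place, assuming GNU sed)
sed -i '/^$/d' wg_track_april2005.tab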
hgLoadBed -sqlTable=/cluster/home/fanhsu/hg/lib/wgRna.sql hg17 wgRna wg_track_april2005.tab
# Asked Donna to update Reference section according to Michel's email.
## refresh vega tracks with vega build30 (done 5/4/04 Robert)
##download vega mysql tables
cd /cluster/store8/ensembl
mkdir vega30_35c
cd vega30_35c
ln -s /cluster/store8/ensembl/vega30_35c /cluster/data/hg17/bed/vega30
for i in `cat tables` ; do wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/$i.gz ; done
wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/homo_sapiens_vega_30_35c_mysql40_compatible..sql.gz
gunzip *.gz
##create mysql database
mysql
create database vega30;
use vega30
source homo_sapiens_vega_30_35c_mysql40_compatible.sql
source dropMt.sql
source load.sql
exit
hgsql vega30 -N -B < vegaGene.sql > vegaGene.tab
awk -f vegaGene.awk < vegaGene.tab > vegaGene.gp
ldHgGene hg17 vegaGene -predTab vegaGene.gp -gtf -genePredExt
hgsql vega30 -N -B < vegaPseudo.sql > vegaPseudo.tab
awk -f vegaPseudo.awk < vegaPseudo.tab > vegaPseudo.gp
ldHgGene hg17 vegaPseudoGene -predTab vegaPseudo.gp -gtf -genePredExt
#load processed pseudogenes
grep Processed vegaPseudo.tab > vegaProcPseudo.tab
awk -f vegaPseudo.awk < vegaProcPseudo.tab > vegaProcPseudo.gp
ldHgGene hg17 vegaProcessedPseudo -predTab vegaProcPseudo.gp -gtf -genePredExt
#load vegaInfo
hgsql vega30 -N -B < vegaGeneInfo.sql > vegaInfo.tab
hgsql vega30 -N -B < vegaPseudoInfo.sql >> vegaInfo.tab
hgsql hg17 -N -B < /cluster/home/baertsch/kent/src/hg/lib/vegaInfo.sql
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg17 -N -B
#load down to hg16
liftOver vegaGene.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaGeneHg16.gp unMapped.gp -genePred
liftOver vegaPseudo.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaPseudoGeneHg16.gp unMappedPseudo.gp -genePred
ldHgGene hg16 vegaGene -predTab vegaGeneHg16.gp -gtf
ldHgGene hg16 vegaPseudoGene -predTab vegaPseudoGeneHg16.gp -gtf
echo 'truncate table vegaInfo' | hgsql hg16 -N -B
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg16 -N -B
#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)
####################################################################################
# RE-BUILD KNOWN GENES TABLES, 2ND TRIAL WITH VARIANT PROTEINS (Started 5/13/05 Fan)
# First build protein databases, sp050415 and proteins050415
# See makeProteins050415.doc for details.
# Create working subdirectories and temporary databases (kgHg17F)
ssh hgwdev
cd /cluster/store10/kg
mkdir kgHg17F
ln -s /cluster/store10/kg/kgHg17F /cluster/store6/kgDB/bed/kgHg17F
ln -s /cluster/store10/kg/kgHg17F /cluster/data/hg17/bed/kgHg17F
hgsql hg17 -e "create database kgHg17F"
hgsql hg17 -e "create database kgHg17FTemp"
mkdir /cluster/bluearc/kgDB/kgHg17F
mkdir /cluster/bluearc/kgDB/kgHg17F/protBlat
ln -s /cluster/bluearc/kgDB/kgHg17F/protBlat /cluster/store10/kg/kgHg17F/protBlat
cd /cluster/store10/kg/kgHg17F/protBlat
#################################################################
# VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
# The protBlat.psl was built during the first KG II build trial
# The results are still valid, except that kgHg17E was used
# instead of kgHg17F
# Create working subdirectories and temporary databases (kgHg17E)
ssh hgwdev
cd /cluster/store10/kg
mkdir kgHg17E
ln -s /cluster/store10/kg/kgHg17E /cluster/store6/kgDB/bed/kgHg17E
ln -s /cluster/store10/kg/kgHg17E /cluster/data/hg17/bed/kgHg17E
hgsql hg17 -e "create database kgHg17E"
hgsql hg17 -e "create database kgHg17ETemp"
mkdir /cluster/bluearc/kgDB/kgHg17E
mkdir /cluster/bluearc/kgDB/kgHg17E/protBlat
ln -s /cluster/bluearc/kgDB/kgHg17E/protBlat /cluster/store10/kg/kgHg17E/protBlat
cd /cluster/store10/kg/kgHg17E/protBlat
# Get all human protein sequences
hgsql -N sp050415 -e \
'select proteins050415.spXref3.accession,protein.val from proteins050415.spXref3,protein where division="9606" and acc=accession' \
|awk '{print ">" $1;print $2}' >humanProt.fa
# Prepare and perform cluster run for protein/genome alignment
ssh kk
cd /cluster/data/hg17/bed/kgHg17E/protBlat
mkdir prot
faSplit sequence humanProt.fa 1000 prot/prot
ls /cluster/bluearc/kgDB/kgHg17E/protBlat/prot/* > prot.lis
ssh hgwdev
cd /cluster/data/hg17/bed/kgHg17E/protBlat
hgsql hg17 -N -e 'select chrom from chromInfo' > chrom.lis
exit
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -noHead -t=dnax -q=prot /cluster/data/hg17/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgHg17E/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
mkdir result
gensub2 chrom.lis prot.lis gsub jobList
para create jobList
para try
para check
para push
para check ...
# many output .psl files will be empty; the warnings are OK.
# [kk:protBlat> para check
# 45494 jobs in batch
# 0 jobs (including everybody's) in Parasol queue.
# Checking finished jobs
# tracking errors: 1
# crashed: 12643
# ranOk: 32850
# total jobs in batch: 45494
# [kk:protBlat> para time
# 45494 jobs in batch
# 0 jobs (including everybody's) in Parasol queue.
# Checking finished jobs
# Completed: 32850 of 45494 jobs
# Crashed: 12643 jobs
# para.results: file not found. paraHub can't write to this dir?
# CPU time in finished jobs: 36153510s 602558.50m 10042.64h 418.44d 1.146 y
# IO & Wait Time: 1585456s 26424.27m 440.40h 18.35d 0.050 y
# Average job time: 1149s 19.15m 0.32h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 155120s 2585.33m 43.09h 1.80d
# Submission to last job: 276342s 4605.70m 76.76h 3.20d
# This cluster run took about 3 days. The crashed jobs are due to empty BLAT results, which is OK.
# collect BLAT results
ssh hgwdev
cd /cluster/data/hg17/bed/kgHg17E/protBlat
mkdir result2
mkdir result3
cat chrom.lis |sed -e 's/chr/do1 chr/g' >doall
cat << '_EOF_' > do1.1
echo processing $1
cat result/$1_prot*.psl >result2/$1.psl
'_EOF_'
cat << '_EOF_' > do1.2
echo processing $1
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 result2/$1.psl result3/$1.psl /dev/null >>j.out
'_EOF_'
chmod +x do*
cp do1.1 do1
doall
cp do1.2 do1
doall
cat result3/*.psl >protBlat.psl
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# The end of protBlat.psl build, using kgHg17E
################################################################################
############################################################################
# This part process the variant splice proteins.
# First build variant splice protein tables.
# Get all variant isoform human protein sequences
ssh hgwdev
cd /cluster/data/swissprot/050415/build
wget --timestamp \
ftp://us.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz
wget --timestamp \
ftp://us.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl_varsplic.fasta.gz
gzip -d *varsplic.fasta.gz
faToTab -type=protein uniprot_trembl_varsplic.fasta splicTrembl.tab
faToTab -type=protein uniprot_sprot_varsplic.fasta splicSprot.tab
cat splicTrembl.tab splicSprot.tab >varProtein.tab
hgsql sp050415 < ~/src/hg/lib/varProtein.sql
hgsql sp050415 -e 'load data local infile "varProtein.tab" into table varProtein'
cat varProtein.tab |cut -f 1>j1
cut -f 1 j1|sed -e 's/-/\t/g' >j2
paste j1 j2 >splicProt.tab
hgsql kgHg17FTemp -e 'drop table splicProt'
hgsql kgHg17FTemp <~/src/hg/lib/splicProt.sql
hgsql kgHg17FTemp -e 'load data local infile "splicProt.tab" into table splicProt'
hgsql kgHg17FTemp -N -e \
'select varAcc, varProtein.val from sp050415.varProtein,splicProt,proteins050415.spXref3 where accession=parAcc and varProtein.acc=splicProt.varAcc and division="9606"'| \
awk '{print ">" $1;print $2}' >humanVarProt.fa
cd /cluster/data/hg17/bed/kgHg17F
# get all Human splicProtBlat records
hgsql hg17 -N -e \
'select splicProtBlat.* from splicProtBlat,proteins050415.spXref3,kgHg17FTemp.splicProt where qName=splicProt.varAcc and parAcc=accession and division="9606"'\
|cut -f 2-22 \
>humanVarProtBlat.psl
# Combine the regular protein protBlat records with the variant protein psl records.
cd /cluster/store10/kg/kgHg17F
cat ../kgHg17E/protBlat/protBlat.psl humanVarProtBlat.psl >protBlat.psl
hgLoadPsl hg17 protBlat.psl
# Processing protBlat.psl
# load of protBlat did not go as planned: 104064 record(s), 0 row(s) skipped, 1484 warning(s) loading psl.tab
# Looked into the cause of these 1484 warnings. They were due to qBaseInsert
# and tBaseInsert having negative values, probably because this is a protein alignment.
# create all_mrna.psl and tight_mrna.psl
hgsql hg17 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl
pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \
all_mrna.psl tight_mrna.psl /dev/null
# Use overlapSelect to get protein and mRNA alignment overlaps
overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \
-selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.stat
overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
-inFmt=psl tight_mrna.psl protBlat.psl protMrna.out
# Create protein/mRNA pair and protein lists
cut -f 10,31 protMrna.out|sort -u >spMrna.tab
cut -f 10 protMrna.out|sort -u >protein.lis
# Load spMrna.tab into spMrna table in temp DB.
hgsql kgHg17FTemp < ~/src/hg/lib/spMrna.sql
hgsql kgHg17FTemp -e 'load data local infile "spMrna.tab" into table spMrna'
hgsql kgHg17FTemp -e 'create index mrnaID on spMrna(mrnaID)'
# Prepare and perform cluster run of protein/mRNA alignment
# Get mRNA fa file.
cd /cluster/data/hg17/bed/kgHg17F
/cluster/data/genbank/bin/i386/gbGetSeqs -native -db=hg17 \
-gbRoot=/cluster/data/genbank genbank mrna mrna.fa
# Create mrnaSeq table in kgHg17FTemp DB.
faToTab mrna.fa mrnaSeq.tab
hgsql kgHg17FTemp -e 'drop table mrnaSeq'
hgsql kgHg17FTemp <~/src/hg/lib/mrnaSeq.sql
hgsql kgHg17FTemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'
# Prepare files for cluster run
~/src/hg/protein/KG2.sh kgHg17F hg17 050415
# Perform cluster run of protein/mRNA alignment
~/src/hg/protein/KG3.sh kgHg17F hg17 050415
# Collect cluster run results
cd kgBestMrna
ls out | sed -e 's/prot/do1 prot/g' >doall
# create do1 with the following 2 lines:
cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protMrnaRaw.psl
'_EOF_'
chmod +x do*
doall
# Filter out low quality alignments
pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
wc protMrna.lis
# Load BLAT results into temp DB.
hgsql kgHg17FTemp < ~/src/hg/lib/protMrnaBlat.sql
hgsql kgHg17FTemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
hgsql kgHg17FTemp -e 'create index tName on protMrnaBlat(tName)'
# Create CDS files from protein/mRNA alignment results.
hgsql kgHg17FTemp -N -e \
'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
|sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds
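# (the resulting protMrna.cds has one line per pair in mrnaToGene -cdsFile
# form, e.g. with hypothetical accessions: "Q9XYZ1_NM_012345<TAB>101..1303")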
# Create protMrna.psl with proteinID_mrnaID as query ID.
cut -f 22-30 ../protMrna.out > j1.tmp
cut -f 32-42 ../protMrna.out > j2.tmp
cut -f 10,31 ../protMrna.out|sed -e 's/\t/_/g' >j3.tmp
paste j1.tmp j3.tmp j2.tmp >protMrna.psl
rm j1.tmp j2.tmp j3.tmp
# Run mrnaToGene to create protMrna.gp
bash
mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
exit
# Prepare refGene and all_mrna gp files.
cd ..
hgsql hg17 -N -e 'select * from refGene' >ref.gp
hgsql hg17 -N -e \
'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \
|sort -u > all_mrna.cds
bash
mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
exit
# Align proteins to RefSeq.
overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
protBlat.psl ref.gp ref.stat
overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
protBlat.psl ref.gp protRef.gp
overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\
-selectFmt=genePred ref.gp protBlat.psl protRef.out
cut -f 10,22 protRef.out | sort -u >spRef.tab
cut -f 10 protRef.out | sort -u >protRef.lis
hgsql kgHg17FTemp -e 'drop table spRef'
hgsql kgHg17FTemp <~/src/hg/lib/spRef.sql
hgsql kgHg17FTemp -e 'load data local infile "spRef.tab" into table spRef'
# Prepare and perform cluster runs for protein/RefSeq alignments
~/src/hg/protein/KGRef2.sh kgHg17F hg17 050415
~/src/hg/protein/KGRef3.sh kgHg17F hg17 050415
cd kgBestRef
ls out | sed -e 's/prot/do1 prot/g' >doall
cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protRefRaw.psl
'_EOF_'
chmod +x do*
doall
# Filter out low quality alignments.
pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
wc protRef.lis
hgsql kgHg17FTemp -e 'drop table protRefBlat'
hgsql kgHg17FTemp < ~/src/hg/lib/protRefBlat.sql
hgsql kgHg17FTemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
hgsql kgHg17FTemp -e 'create index tName on protRefBlat(tName)'
# Run gene-check to filter out invalid gp entries
cd /cluster/data/hg17/bed/kgHg17F
cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp
gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/data/hg17/nib kgCandidate0.gp kgCandidate0.check
hgsql kgHg17FTemp -e 'drop table kgCandidate0'
hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidate0.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0'
hgsql kgHg17FTemp -e 'drop table geneCheck'
hgsql kgHg17FTemp < ~/src/hg/lib/geneCheck.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'
# Run kgCheck to get all KG candidates that pass the KG gene check criteria
kgCheck kgHg17FTemp hg17 kgCandidate0 geneCheck kgCandidate.tab
hgsql kgHg17FTemp -e 'drop table kgCandidate'
hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidate.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate'
hgsql kgHg17FTemp -e 'create index alignID on kgCandidate(alignID)'
# ####### NEXT TIME AROUND PUT IN AN EXTRA STEP TO BRING IN ITEMS ON A "PUT BACK" LIST
# FOR SPECIAL CASES LIKE SELENOCYSTEINE, NON-AUG INITIATION CODON, RIBOSOMAL SLIPPAGE, ETC.
# #######
# Construct the kgCandidateX table that has alignID in the name field.
cut -f 2-10 kgCandidate.tab >j2.tmp
cut -f 11 kgCandidate.tab >j1.tmp
paste j1.tmp j2.tmp >kgCandidateX.tab
hgsql kgHg17FTemp -e 'drop table kgCandidateX'
hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidateX.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX'
# Score protein/mRna and protein/RefSeq alignments
kgResultBestMrna2 050415 kgHg17FTemp hg17|sort -u >protMrnaBlatScore.tab
kgResultBestRef2 050415 kgHg17FTemp hg17|sort -u >protRefScore.tab
# Combine scoring results and load them into temp DB.
cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
hgsql kgHg17FTemp -e 'drop table protMrnaScore'
hgsql kgHg17FTemp < ~/src/hg/lib/protMrnaScore.sql
hgsql kgHg17FTemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
hgsql kgHg17FTemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'
# Run kgGetCds to get CDS structure of each gene
kgGetCds kgHg17FTemp kgCandidateX jY.tmp
cat jY.tmp |sort -u >kgCandidateY.tab
rm jY.tmp
hgsql kgHg17FTemp -e 'drop table kgCandidateY'
hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidateY.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY'
# Run kgPickPrep to replace long cds structure string with cdsId.
kgPickPrep kgHg17FTemp kgCandidateZ.tab
hgsql kgHg17FTemp -e 'drop table kgCandidateZ'
hgsql kgHg17FTemp < ~/src/hg/lib/kgCandidateZ.sql
hgsql kgHg17FTemp -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
hgsql kgHg17FTemp -e 'create index cdsId on kgCandidateZ(cdsId)'
# Run kgPick to pick a representative mRNA/protein pair for each unique CDS structure.
kgPick kgHg17FTemp hg17 proteins050415 kg3.tmp dupSpMrna.tmp
sort -u dupSpMrna.tmp >dupSpMrna.tab
# Sort KG genes to make the kg3.gp table file.
~/kent/src/hg/protein/sortKg.pl kg3.tmp >kg3.gp
hgsql kgHg17FTemp -e 'drop table knownGene'
hgsql kgHg17FTemp < ~/src/hg/lib/knownGene.sql
hgsql kgHg17FTemp -e 'load data local infile "kg3.gp" into table knownGene'
hgsql hg17 -e 'drop table kg3'
hgsql hg17 < ~/src/hg/lib/kg3.sql
hgsql hg17 -e 'load data local infile "kg3.gp" into table kg3'
# Perform analysis before renaming the kg3 table to knownGene.
# Load data into hg17 knownGene table.
hgsql hg17 -e 'drop table knownGene'
hgsql hg17 < ~/src/hg/lib/knownGene.sql
hgsql hg17 -e 'load data local infile "kg3.gp" into table knownGene'
# Build knownGeneMrna and knownGenePep tables.
kgPepMrna kgHg17FTemp hg17 050415
hgsql hg17 -e 'drop table knownGeneMrna'
hgsql hg17 < ~/src/hg/lib/knownGeneMrna.sql
hgsql hg17 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
hgsql hg17 -e 'drop table knownGenePep'
hgsql hg17 < ~/src/hg/lib/knownGenePep.sql
hgsql hg17 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'
# Build kgXref table
kgXref2 kgHg17FTemp 050415 hg17
hgsql hg17 -e 'drop table kgXref'
hgsql hg17 < ~/src/hg/lib/kgXref.sql
hgsql hg17 -e 'load data local infile "kgXref.tab" into table kgXref'
# Build spMrna table
hgsql hg17 -N -e 'select name, proteinID from knownGene' >kgSpMrna.tab
hgsql hg17 -e 'drop table spMrna'
hgsql hg17 <~/src/hg/lib/spMrna.sql
hgsql hg17 -e 'load data local infile "kgSpMrna.tab" into table spMrna'
# Build kgProtMap table
~/src/hg/protein/kgProtMap2.sh kgHg17F hg17 050415
# Update and clean up kgResultBestMrna2.c and then check it in.
#####################################
# Build alias tables. DONE 5/18/05 Fan.
# kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases
# proteins050415.hugo.withdraws, hg17.kgXref.kgID
# to create kgAliasM.tab and geneAlias.tab
# by picking out those kgID items from kgXref where
# kgXref.geneSymbol == hugo.symbol
kgAliasM hg17 proteins050415
# kgAliasKgXref reads from hg17.knownGene.proteinID,
# hg17.knownGene.name, hg17.kgXref.geneSymbol
# to create kgAliasKgXref.tab
kgAliasKgXref hg17
# kgAliasRefseq reads from hg17.knownGene.name,
# hg17.knownGene.proteinID, hg17.kgXref.refseq
# to create kgAliasRefseq.tab
kgAliasRefseq hg17
hgsql sp050415 -N -e 'select name,gene.val from hg17.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
| sort -u > kgAliasP.tab
hgsql hg17 -N -e 'select name, name from knownGene' >kgAliasDup.tab
hgsql hg17 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
sort |uniq > kgAlias.tab
hgsql -e "drop table kgAlias;" hg17
hgsql hg17 < ~/kent/src/hg/lib/kgAlias.sql
hgsql hg17 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias'
# kgProtAlias reads from hg17.knownGene.name,
# hg17.knownGene.proteinID, hg17.knownGene.alignID,
# proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb
# to create kgProtAlias.tab
#
kgProtAlias hg17 050415
hgsql hg17 -N -e \
'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
| sort -u >kgProtAliasNCBI.tab
# include variant splice protein IDs
hgsql hg17 -N -e \
'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\
|sort -u >kgProtAliasDup.tab
# include duplicate protein IDs from dupSpMrna table
hgsql hg17 -N -e \
'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
|sort -u >>kgProtAliasDup.tab
# catch parent acc from dupProteinID too
hgsql hg17 -N -e\
'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
|sort -u >>kgProtAliasDup.tab
cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab
echo "`date` creating table kgProtAlias"
hgsql hg17 -e "drop table kgProtAlias;"
hgsql hg17 <~/src/hg/lib/kgProtAlias.sql;
hgsql hg17 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'
# Build kgSpAlias table
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab
rm j.tmp
hgsql hg17 -e 'drop table kgSpAlias';
hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'
# MAKE FOLDUTR TABLES (DONE 2005-05-19, Fan)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir rnaStruct.2005-05-18
rm rnaStruct
ln -s rnaStruct.2005-05-18 rnaStruct
cd rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg17 knownGene utr3 utr3/utr.fa
utrFa hg17 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh kk
cd /cluster/data/hg17/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
cp gsub ../utr5
# Do cluster run for 3' UTRs
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 35692 of 35692 jobs
# CPU time in finished jobs: 1272085s 21201.42m 353.36h 14.72d 0.040 y
# IO & Wait Time: 102447s 1707.45m 28.46h 1.19d 0.003 y
# Average job time: 39s 0.64m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 6554s 109.23m 1.82h 0.08d
# Submission to last job: 9100s 151.67m 2.53h 0.11d
# Do cluster run for 5' UTRs
cd ../utr5
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 33693 of 33693 jobs
# CPU time in finished jobs: 393764s 6562.74m 109.38h 4.56d 0.012 y
# IO & Wait Time: 126205s 2103.41m 35.06h 1.46d 0.004 y
# Average job time: 15s 0.26m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 51595s 859.92m 14.33h 0.60d
# Submission to last job: 52057s 867.62m 14.46h 0.60d
# Load database
ssh hgwdev
cd /cluster/data/hg17/bed/rnaStruct/utr5
hgLoadRnaFold hg17 foldUtr5 fold
cd ../utr3
hgLoadRnaFold hg17 foldUtr3 fold
# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak
# Build KEGG pathway tables. DONE 5/19/05. Fan.
ssh hgwdev
cd /cluster/store10/kg/kgHg17F
mkdir kegg
cd kegg
~/src/hg/protein/KGpath.sh kgHg17F hg17 050415
hgsql hg17 -e "drop table keggMapDesc"
hgsql hg17 -e "drop table keggPathway"
hgsql hg17 <~/src/hg/lib/keggMapDesc.sql
hgsql hg17 <~/src/hg/lib/keggPathway.sql
hgsql hg17 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
hgsql hg17 -e 'load data local infile "keggPathway.tab" into table keggPathway'
# Build CGAP pathway tables
# RELOAD cgapAlias TABLE AFTER REMOVING DUPLICATE ROWS (hartera, 2005-07-26)
# RELOADED cgapAlias AGAIN AS TOO MANY ROWS WERE REMOVED BEFORE (hartera, 2005-10-06)
cd ..
~/src/hg/protein/KGcgap.sh kgHg17F hg17 050415
hgsql hg17 -e "drop table cgapAlias"
hgsql hg17 -e "drop table cgapBiocDesc"
hgsql hg17 -e "drop table cgapBiocPathway"
hgsql hg17 <~/src/hg/lib/cgapAlias.sql
hgsql hg17 <~/src/hg/lib/cgapBiocDesc.sql
hgsql hg17 <~/src/hg/lib/cgapBiocPathway.sql
hgsql hg17 -e 'load data local infile "cgapAlias.tab" \
into table cgapAlias'
hgsql hg17 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
hgsql hg17 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'
# RELOAD cgapAlias TABLE. Sort and reload alias tab file to remove
# duplicate rows. (hartera, 2005-07-26)
# DO TABLE RELOAD AGAIN AS sort -nu REMOVES MORE ROWS THAN sort -u
# OR sort -n | uniq.
# USE sort -n THEN uniq TO SORT ON THE IDs AND THEN UNIQ (hartera, 2005-10-06)
cd /cluster/store10/kg/kgHg17F
hgsql hg17 -e "drop table cgapAlias"
# cgapAlias.tab has replicated rows so sort and unique before loading
sort -n cgapAlias.tab | uniq > cgapAliasSorted.tab
hgsql hg17 < ~/kent/src/hg/lib/cgapAlias.sql
hgsql hg17 -e 'load data local infile "cgapAliasSorted.tab" \
into table cgapAlias'
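# (Demonstration of the sort pitfall above, as a sketch: with -nu, sort
# compares only the numeric key, so distinct rows sharing an ID collapse
# into one)
printf '1\taliasA\n1\taliasB\n' | sort -nu          # keeps only one row
printf '1\taliasA\n1\taliasB\n' | sort -n | uniq    # keeps both rows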
# LOAD ENSEMBL GENES (DONE, 5/23/05, Fan)
# Ensembl changed things again! Please note there are two subtle changes to make it work.
mkdir /cluster/data/hg17/bed/ensembl
cd /cluster/data/hg17/bed/ensembl
mkdir new
cd new
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. Choose gzip compression. Hit export.
# Save as ensemblGene.gtf.gz
# This time, there are some extra lines, like ' 1;', that were causing
# problems, so an extra filter was added at the beginning to get rid of them.
# Ensembl handles random chromosomes differently than us, so we
# strip this data. Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
cat ensemblGene.gtf |sed -e 's/\t\t/xxxxx/g' \
|grep -v xxxxx \
| grep -v ^6_DR51 \
| grep -v ^DR51 \
| grep -v ^DR52 \
| grep -v ^DR53 \
| grep -v _NT_ \
| perl -wpe 's/^([0-9]|X|Y|Un|MT)/chr$1/ \
|| die "Line $. doesnt start with human chrom:\n$_"' \
| sed -e 's/chrMT/chrM/g' \
| sed -e 's/\..\"/\"/g' \
>ensGene.gtf
ssh hgwdev
cd /cluster/data/hg17/bed/ensembl/new
/cluster/bin/i386/ldHgGene hg17 ensGene ensGene.gtf
# Read 33581 transcripts in 699580 lines in 1 files
# 33581 groups 25 seqs 1 sources 4 feature types
# 33581 gene predictions
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg17 -e 'drop table ensGtp'
hgsql hg17 < ~/kent/src/hg/lib/ensGtp.sql
hgsql hg17 -e 'load data local infile "ensGtp.txt" into table ensGtp ignore 1 lines'
# ensMart has some problem with the resulting ensemblPep.fa.gz, so use a
# different processing step instead:
    wget --timestamping \
ftp://ftp.ensembl.org/pub/current_human/data/fasta/pep/Homo_sapiens.NCBI35.may.pep.fa.gz
zcat Homo_sapiens.NCBI35.may.pep.fa.gz | sed -e "s/transcript:/\n>/g" | grep -v 'gene:' >ensPep.fa
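    # (assumption about the Ensembl header layout, ">ENSP... gene:ENSG...
    #  transcript:ENST...": the sed starts a new fasta record at each
    #  "transcript:" tag so the ENST id becomes the record name, and the
    #  grep drops the leftover gene: header fragments)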
faToTab -type=protein ensPep.fa ensPep.tab
hgsql hg17 -e 'drop table ensPep'
hgsql hg17 < ~/kent/src/hg/lib/ensPep.sql
hgsql hg17 -e 'load data local infile "ensPep.tab" into table ensPep'
# kept the following, in case Ensembl fixes the problem in the future
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
# gunzip ensemblPep.fa.gz
# hgPepPred hg17 ensembl ensemblPep.fa
# UPDATE GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED - 2005-05-21, DONE 2005-05-23 - Fan)
# This should be done after knownGene tables are complete from the known
# gene build process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneSorter.2005-05-21
# remove old symbolic link
rm /cluster/data/hg17/bed/geneSorter
ln -s /cluster/data/hg17/bed/geneSorter.2005-05-21 \
/cluster/data/hg17/bed/geneSorter
cd /cluster/data/hg17/bed/geneSorter
hgClusterGenes hg17 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg17/bed/geneSorter/blastp
cd /cluster/data/hg17/bed/geneSorter/blastp
pepPredToFa hg17 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
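    # (if so, a sketch assuming the usual kent makefile conventions:
    #    cd ~/kent/src/hg/near/pepPredToFa && make
    #  which installs the binary under ~/bin/$MACHTYPE)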
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg17/blastp
mkdir -p /cluster/bluearc/hg17/blastp
cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* \
/cluster/bluearc/hg17/blastp
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/self
cd /cluster/data/hg17/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
    # 'ls ../../split/*.fa' would make the command line too long, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
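    # (wordLine simply emits one word per line, e.g.
    #    echo a b c | wordLine stdin    # -> a, b, c on separate lines
    #  so the shell, not ls, expands the glob)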
gensub2 split.lst single gsub jobList
para create jobList
para try
    # Wait a couple of minutes and do a para check; if all is good,
    # then do a
para push
# This should finish in ~15 minutes if the cluster is free.
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 150459s 2507.64m 41.79h 1.74d 0.005 y
# IO & Wait Time: 22325s 372.09m 6.20h 0.26d 0.001 y
# Average job time: 22s 0.37m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 198s 3.30m 0.06h 0.00d
# Submission to last job: 2019s 33.65m 0.56h 0.02d
# Load into database. This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7739 files
# Loading database with 9836439 rows
# 232.300u 42.580s 23:13.41 19.7% 0+0k 0+0io 205pf+0w
cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# hgsql -e "select count(*) from knownToRefSeq;" hg17
# row count changed 34667
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 \
> refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# hgsql -e "select count(*) from knownToLocusLink;" hg17
# row count changed to 34667
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
# hgsql -e "select count(*) from knownToPfam;" hg17
# row count changed to 36010
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
# row count changed to 32381
# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2 &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 32381 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count changed to 32381000
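    # (hgExpDistance stores the 1000 closest expression neighbors per
    #  gene, hence 32381 genes x 1000 neighbors = 32381000 rows)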
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
# hgsql -e "select count(*) from knownToU133;" hg17
# row count changed to 32886
# Create expression distance table. This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
-weights=affyUcla.weight -lookup=knownToU133 &
# Have 43039 elements in affyUclaNorm
# 211 genes, 42 weights, 26.500000 total wieght
# Got 32886 unique elements in affyUclaNorm
# Create table that maps between known genes and
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
# row count changed to 17501
    # the hgFixed.gnfHumanU95Exps argument is unused and does not need to exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 &
# Have 11545 elements in hgFixed.gnfHumanU95MedianRatio
# Got 16450 unique elements in hgFixed.gnfHumanU95MedianRatio
# row count changed to 16450000
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.)
hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnf1h &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 8814 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
# row count changed to 35055
#### UPDATE GO DATABASE (DONE 5/21/05 Fan)
# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20050521
cd /cluster/store1/geneOntology/20050521
wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200504-assocdb-data.gz
hgsql mysql <<end
create database go050521;
end
zcat go_*data.gz | sed -e 's/ENGINE=MyISAM DEFAULT CHARSET=latin1/TYPE=MyISAM/g' >j.tmp
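    # (the sed rewrites MySQL 4.1-style "ENGINE=... CHARSET=..." table
    #  options to the older "TYPE=MyISAM" syntax, presumably because the
    #  local MySQL server predates 4.1)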
hgsql go050521 <j.tmp
rm j.tmp
wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_uniprot.gz
zcat gene_association.goa_uniprot.gz | hgGoAssociation go050521 goaPart stdin
# Passed 5589891 of 6584507 of 6584507, 84.89%
# Ask sys-admin to switch the database pointer go to point to go050521.
cd /cluster/data/hg17/bed/geneSorter
# Rebuilt Ensembl Gene tables. See documentation (5/23/05 Fan) above.
# Create knownToEnsembl column
hgMapToGene hg17 ensGene knownGene knownToEnsembl
# table row count went from previous version: 38251 to 35436
# Make knownToCdsSnp table
ssh hgwdev
nice hgMapToGene hg17 snp knownGene knownToCdsSnp -all -cds
# row count 94394
# approx. 5 minutes running time
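    # (assumption per hgMapToGene usage: -all keeps every overlapping SNP
    #  rather than one best hit, and -cds restricts hits to coding regions)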
# C.ELEGANS BLASTP FOR GENE SORTER
# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to iscratch/i
# if it doesn't exist already:
# The following section is done during mm6 build already.
# ssh eieio
# mkdir /cluster/data/ce2/bed/blastp
# cd /cluster/data/ce2/bed/blastp
# # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
# # to find out the latest version. Then use that in place of 142 below.
# wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142
# formatdb -i wormPep142.faa -t wormPep142 -n wormPep142
# ssh kkr1u00
# if (-e /iscratch/i/ce2/blastp) then
# rm -r /iscratch/i/ce2/blastp
# endif
# mkdir -p /iscratch/i/ce2/blastp
# cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp
# iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/hg17/bed/blastp/ce2/run/out
cd /cluster/data/hg17/bed/blastp/ce2/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep142 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/hg17/bed/geneSorter/blastp/split >split.lst
#EDIT split.lst to add "../../../geneSorter/blastp/split/" in front of "kg"
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 60973s 1016.22m 16.94h 0.71d 0.002 y
# IO & Wait Time: 21292s 354.86m 5.91h 0.25d 0.001 y
# Average job time: 11s 0.18m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 50s 0.83m 0.01h 0.00d
# Submission to last job: 570s 9.50m 0.16h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/blastp/ce2/run/out
hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab
# Scanning through 7739 files
# Loading database with 25706 rows
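    # (-maxPer=1 keeps only the best-scoring hit per query protein, so
    #  each known gene gets at most one putative worm ortholog)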
# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeMm6.doc for procedure
# the directory: /cluster/bluearc/scratch/mus/mm6/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm6
cd /cluster/data/hg17/bed/geneSorter/blastp/mm6
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm6/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 65337s 1088.95m 18.15h 0.76d 0.002 y
# IO & Wait Time: 20794s 346.56m 5.78h 0.24d 0.001 y
# Average job time: 11s 0.19m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 80s 1.33m 0.02h 0.00d
# Submission to last job: 598s 9.97m 0.17h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm6/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
# Scanning through 7739 files
# row count changed to 32880
# Make rat ortholog column using blastp on rat known genes.
# First make rat protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeRn3.doc for procedure.
# Files were put in this directory: /cluster/bluearc/rn3/blastp/
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/rn3/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 28325s 472.08m 7.87h 0.33d 0.001 y
# IO & Wait Time: 20416s 340.27m 5.67h 0.24d 0.001 y
# Average job time: 6s 0.10m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 24s 0.40m 0.01h 0.00d
# Submission to last job: 617s 10.28m 0.17h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out
hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab
# Scanning through 7739 files
# Loading database with 24140 rows
# ZEBRAFISH BLASTP FOR GENE SORTER
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to iscratch/i
# if it doesn't exist already:
ssh kkstore
mkdir /cluster/data/danRer2/bed/blastp
cd /cluster/data/danRer2/bed/blastp
wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.may.pep.fa.gz
zcat Dan*.pep.fa.gz > ensembl.faa
/scratch/blast/formatdb -i ensembl.faa -t ensembl -n ensembl
ssh kkr1u00
if (-e /iscratch/i/danRer2/blastp) then
rm -r /iscratch/i/danRer2/blastp
endif
mkdir -p /iscratch/i/danRer2/blastp
cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/hg17/bed/blastp/danRer2/run/out
cd /cluster/data/hg17/bed/blastp/danRer2/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer2/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/hg17/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' > split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 113595s 1893.26m 31.55h 1.31d 0.004 y
# IO & Wait Time: 26231s 437.18m 7.29h 0.30d 0.001 y
# Average job time: 18s 0.30m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 99s 1.65m 0.03h 0.00d
# Submission to last job: 445s 7.42m 0.12h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/blastp/danRer2/run/out
hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
# Scanning through 7739 files
# Loading database with 30731 rows
# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 18630s 310.50m 5.17h 0.22d 0.001 y
# IO & Wait Time: 20776s 346.27m 5.77h 0.24d 0.001 y
# Average job time: 5s 0.08m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 15s 0.25m 0.00h 0.00d
# Submission to last job: 295s 4.92m 0.08h 0.00d
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# Loading database with 16540 rows
# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# The following section was already done.
# cd /cluster/data/dm1/bed
# mkdir blastp
# cd blastp
#wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_translation_dmel_RELEASE3-1.FASTA.gz
# zcat whole_ge*.gz | faFlyBaseToUcsc stdin flyBase.faa
# formatdb -i flyBase.faa -t flyBase -n flyBase
# if (-e /cluster/bluearc/dm1/blastp) then
# rm -r /cluster/bluearc/dm1/blastp
# endif
# mkdir -p /cluster/bluearc/dm1/blastp
# cp /cluster/data/dm1/bed/blastp/flyBase.p?? /cluster/bluearc/dm1/blastp
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7739 of 7739 jobs
# CPU time in finished jobs: 73518s 1225.30m 20.42h 0.85d 0.002 y
# IO & Wait Time: 45038s 750.63m 12.51h 0.52d 0.001 y
# Average job time: 15s 0.26m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 69s 1.15m 0.02h 0.00d
# Submission to last job: 762s 12.70m 0.21h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
# Loading database with 27212 rows
# update knownToHInv table
# Verified that there is no new release of HInv data.
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv
# count changed to 28851
# The new KG process no longer needs entries in knownGeneLink (used to
# store info for DNA-based RefSeqs), so clean out the old data in
# knownGeneLink.
hgsql hg17 -e "delete from knownGeneLink"
#### RE-BUILD SUPERFAMILY RELATED TABLES (DONE - 2005-05-27 - Fan)
# Download latest Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk
mkdir /cluster/store10/superfamily/050524
ln -s /cluster/store10/superfamily/050524 /cluster/data/superfamily/050524
cd /cluster/data/superfamily/050524
# ftp over the following two files:
ass_22-May-2005.tab.gz
supfam_22-May-2005.sql.gz
gzip -d *.gz
# Load the Superfamily database
hgsql hg17 -e "create database superfam050524"
nice hgsql superfam050524 < supfam_22-May-2005.sql &
# This may take about an hour.
# Make sure to add an index on id of the des table of superfam050524.
hgsql superfam050524 -e "create index id on des(id);"
hgsql superfam050524 < ~/src/hg/lib/sfAssign.sql
hgsql superfam050524 -e 'load data local infile "ass_22-May-2005.tab" into table
superfam050524.sfAssign;'
# Build or rebuild Superfamily track and create sf tables needed for PB
hgsql hg17 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/050524
hgsql hg17 -e 'load data local infile "ass_22-May-2005.tab" into table hg17.sfAssign;'
# If hg17.sfDes already exists, drop it.
hgsql superfam050524 -N -e "select * from des" >sfDes.tab
hgsql hg17 < ~/src/hg/lib/sfDes.sql
hgsql hg17 -e 'load data local infile "sfDes.tab" into table sfDes'
# If hg17.superfamily already exists, drop it.
cd /cluster/data/hg17/bed
mkdir /cluster/data/hg17/sf.2004-1128
ln -s sf.2004-1128 sf
hgSuperfam hg17 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If hg17.sfDescription exists, drop it.
hgsql hg17 < ~/src/hg/lib/sfDescription.sql
hgsql hg17 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg17.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg17 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/050524/ass_22-May-2005.tab \
| hgKnownToSuper hg17 hs stdin
# created 25287 rows in knownToSuper
# Build tables needed by pbGlobal in proteins050415
cd /cluster/data/superfamily/050524
hgsql proteins050415 -e 'load data local infile "ass_22-May-2005.tab" into table sfAssign'
hgsql proteins050415 -e 'load data local infile "sfDes.tab" into table sfDes'
cd /cluster/store10/kg/kgHg17F
hgsql proteins050415 -e 'load data local infile "ensemblXref.tab" into table ensemblXref'
# These sf tables and ensemblXref3 are needed for non-HMR KG proteins.
# Should add content of ensemblXref3 of mm6 after it is done.
# And similarly for rn4 and possibly for other non-HMR species.
# CCDS <-> knownGene mapping needs to be updated  (markd 2005-05-29)
# this should be part of the known gene build
/cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
# AUGUSTUS GENES (DONE 6/1/2005 Andy)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir augustus
cd augustus/
wget http://augustus.gobics.de/predictions/hg17/hg17.allchr.augustus.gtf.gz
cp /cluster/data/dm2/bed/augustus/cleanAugustus.awk .
zcat hg17.allchr.augustus.gtf.gz | awk -f cleanAugustus.awk | gzip > hg17.allchr.augustus.clean.gtf.gz
ldHgGene -gtf hg17 augustus hg17.allchr.augustus.clean.gtf.gz
rm hg17.allchr.augustus.gtf.gz
# MAKE Mouse Proteins track (DONE for chr13 braney ~5/25/05)
ssh kkstore01
mkdir -p /cluster/data/hg17/blastDb
cd /cluster/data/hg17/blastDb
awk "{print \$2}" ../*/chr*/*.lft > subChr.lst
for i in `cat subChr.lst`
do
ln -s ../*/chr*/$i.fa
echo formatdb -i $i.fa -p F
formatdb -i $i.fa -p F
done
rm *.log *.fa list
cd ..
for i in `cat chrom.lst`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft
ssh kkr1u00
rm -rf /iscratch/i/hg17/blastDb
mkdir -p /iscratch/i/hg17/blastDb
cd /cluster/data/hg17/blastDb
for i in nhr nin nsq; do cp *.$i /iscratch/i/hg17/blastDb ; echo $i; done
cd
iSync > sync.out
mkdir -p /cluster/data/hg17/bed/tblastn.mm6KG
cd /cluster/data/hg17/bed/tblastn.mm6KG
echo /panasas/store/hg17/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
# back to kkstore01
exit
cd /cluster/data/hg17/bed/tblastn.mm6KG
rm -rf /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa
mkdir -p /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa
split -l 560 /cluster/data/mm6/bed/blat.mm6KG/mm6KG.psl /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa/kg
ln -s /cluster/bluearc/hg17/bed/tblastn.mm6KG/kgfa kgfa
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
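    # (each kg chunk holds 560 pslx rows; pslxToFa recovers the query
    #  protein sequences carried in the extra pslx sequence columns)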
cd ..
ls -1S kgfa/*.fa > kg.lst
rm -rf /cluster/bluearc/hg17/bed/tblastn.mm6KG/blastOut
mkdir -p /cluster/bluearc/hg17/bed/tblastn.mm6KG/blastOut
ln -s /cluster/bluearc/hg17/bed/tblastn.mm6KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
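# try blastall at successively stricter e-values; a loose cutoff can
# fail (e.g. on oversized output), so break out of the loop after the
# first run that succeeds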
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subLiftAll.lft carry $f.2
liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/mm6/bed/blat.mm6KG/protein.lft warn $f.4
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
ssh kk
cd /cluster/data/hg17/bed/tblastn.mm6KG
para create blastSpec
para push
# Completed: 214524 of 214524 jobs
# CPU time in finished jobs: 44907411s 748456.85m 12474.28h 519.76d 1.424 y
# IO & Wait Time: 712709s 11878.48m 197.97h 8.25d 0.023 y
# Average job time: 213s 3.54m 0.06h 0.00d
# Longest finished job: 1363s 22.72m 0.38h 0.02d
# Submission to last job: 75910s 1265.17m 21.09h 0.88d
# just for chr13
# Completed: 55290 of 55290 jobs
# CPU time in finished jobs:    1487547s   24792.46m   413.21h   17.22d  0.047 y
# IO & Wait Time:                148854s    2480.89m    41.35h    1.72d  0.005 y
# Average job time:                  30s       0.49m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              98s       1.63m     0.03h    0.00d
# Submission to last job:          3904s      65.07m     1.08h    0.05d
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
ssh kki
cd /cluster/data/hg17/bed/tblastn.mm6KG
tcsh
cat << '_EOF_' > chainOne
(cd $1; cat q."$2"* | simpleChain -prot -outPsl -maxGap=200000 stdin ../c.`basename $1`.$2.psl)
'_EOF_'
chmod +x chainOne
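    # (simpleChain stitches the per-chromosome tblastn hits in each
    #  blastOut/kg?? batch into gene-sized chains, allowing gaps -- i.e.
    #  introns -- of up to 200 kb)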
for j in blastOut/kg??; do for i in `cat ../../chrom.lst`; do echo chainOne $j chr"$i"; done ; done > chainSpec
para create chainSpec
para push
# CPU time in finished jobs: 90s 1.50m 0.03h 0.00d 0.000 y
# IO & Wait Time: 19151s 319.18m 5.32h 0.22d 0.001 y
# Average job time: 3s 0.04m 0.00h 0.00d
# Longest finished job: 5s 0.08m 0.00h 0.00d
# Submission to last job: 1642s 27.37m 0.46h 0.02d
# Completed: 7695 of 7695 jobs
# CPU time in finished jobs: 48s 0.80m 0.01h 0.00d 0.000 y
# IO & Wait Time: 18931s 315.51m 5.26h 0.22d 0.001 y
# Average job time: 2s 0.04m 0.00h 0.00d
# Longest finished job: 6s 0.10m 0.00h 0.00d
# Submission to last job: 1618s 26.97m 0.45h 0.02d
exit
# back to kkstore01
cd /cluster/data/hg17/bed/tblastn.mm6KG/blastOut
for i in kg??
do
cat c.$i.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.90 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
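    # (psl fields: $1=matches, $11=qSize, $12=qStart, $13=qEnd, so c60
    #  keeps alignments spanning >60% of the query protein, pslUniq keeps
    #  the best-scoring unique ones, and m60 those matching >90% of the
    #  query residues)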
    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n | uniq > /cluster/data/hg17/bed/tblastn.mm6KG/blastMm6KG.psl
cd ..
ssh hgwdev
cd /cluster/data/hg17/bed/tblastn.mm6KG
    hgLoadPsl hg17 blastMm6KG.psl
# 1425966 bases of 64944656 (2.196%)
# back to kkstore01
rm -rf blastOut
# End tblastn of mouse proteins
####################################################################################
# RE-BUILD KNOWN GENES TABLES, 3RD TRIAL WITH CORRECTED kgCheck and kgGetCds (DONE 6/5/05 Fan)
# Start from the step where gene-check is run and kgCandidate0.gp is produced.
cd
cd /cluster/store10/kg/kgHg17F
mkdir try3
cd try3
hgsql kgHg17FTempTry3 -e 'drop table kgCandidate0'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidate0.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "../kgCandidate0.gp" into table kgCandidate0'
hgsql kgHg17FTempTry3 -e 'drop table geneCheck'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/geneCheck.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "../kgCandidate0.check" into table geneCheck ignore 2 lines'
# Run kgCheck to get all KG candidates that pass the KG gene check criteria
kgCheck kgHg17FTempTry3 hg17 kgCandidate0 geneCheck kgCandidate.tab
hgsql kgHg17FTempTry3 -e 'drop table kgCandidate'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidate.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgCandidate.tab" into table kgCandidate'
hgsql kgHg17FTempTry3 -e 'create index alignID on kgCandidate(alignID)'
# Construct the kgCandidateX table that has alignID in the name field.
cut -f 2-10 kgCandidate.tab >j2.tmp
cut -f 11 kgCandidate.tab >j1.tmp
paste j1.tmp j2.tmp >kgCandidateX.tab
hgsql kgHg17FTempTry3 -e 'drop table kgCandidateX'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidateX.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX'
# Score protein/mRna and protein/RefSeq alignments
# kgResultBestMrna2 050415 kgHg17FTempTry3 hg17|sort -u >protMrnaBlatScore.tab
# kgResultBestRef2 050415 kgHg17FTempTry3 hg17|sort -u >protRefScore.tab
# Combine scoring results and load them into temp DB.
cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
hgsql kgHg17FTempTry3 -e 'drop table protMrnaScore'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/protMrnaScore.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "../protMrnaScore.tab" into table protMrnaScore'
hgsql kgHg17FTempTry3 -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'
# Run kgGetCds to get CDS structure of each gene
kgGetCds kgHg17FTempTry3 kgCandidateX jY.tmp1
cat jY.tmp1 |sort -u >kgCandidateY.tab
rm jY.tmp1
hgsql kgHg17FTempTry3 -e 'drop table kgCandidateY'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidateY.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY'
# Run kgPickPrep to replace long cds structure string with cdsId.
kgPickPrep kgHg17FTempTry3 kgCandidateZ.tab
hgsql kgHg17FTempTry3 -e 'drop table kgCandidateZ'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgCandidateZ.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
hgsql kgHg17FTempTry3 -e 'create index cdsId on kgCandidateZ(cdsId)'
# Run kgPick to pick the representative mRNA/protein pair for each unique CDS structure.
kgPick kgHg17FTempTry3 hg17 proteins050415 kg3Try3.tmp dupSpMrna.tmp
cat kg3Try3.tmp | grep NM_ > jNM
cat kg3Try3.tmp | grep -v NM_ >jnoNM
cut -f 1 jnoNM | sed -e "s/_/_\n/" |grep -v _ >jnoNM1
cut -f 2-12 jnoNM >jnoNM2
paste jnoNM1 jnoNM2 > kg3Try3B.tmp
cat jNM >> kg3Try3B.tmp
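# (the cut/sed/paste above strips everything up to the first "_" from
#  non-RefSeq names, e.g. O75438_BC009691 -> BC009691, while NM_
#  accessions pass through unchanged)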
sort -u dupSpMrna.tmp >dupSpMrna.tab
hgsql hg17 -e 'drop table dupSpMrna'
hgsql hg17 < ~/src/hg/lib/dupSpMrna.sql
hgsql hg17 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna'
# Add entries in the put back list
# Obtain from Mark the put back list, kgPutBack.lis, for human RefSeq.
hgsql kgHg17FTempTry3 -e 'drop table kgPutBack'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/kgPutBack.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kgPutBack.lis" into table kgPutBack'
kgPutBack kgHg17FTempTry3 hg17 proteins050415 kgPutBack kgPutBack.gp
# Sort KG genes to make the kg3Try3.gp table file.
cat kg3Try3B.tmp kgPutBack.gp >kg3Try3C.tmp
~/kent/src/hg/protein/sortKg.pl kg3Try3C.tmp >kg3Try3.gp
# Manually edit to correct a one-line problem with O75438_BC009691
hgsql kgHg17FTempTry3 -e 'drop table knownGene'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/knownGene.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "kg3Try3.gp" into table knownGene'
hgsql hg17 -e 'drop table kg3Try3'
hgsql hg17 < ~/src/hg/lib/kg3Try3.sql
hgsql hg17 -e 'load data local infile "kg3Try3.gp" into table kg3Try3'
# Perform analysis before renaming the kg3Try3 table to knownGene.
# Load data into hg17 knownGene table.
hgsql hg17 -e 'drop table knownGene'
hgsql hg17 < ~/src/hg/lib/knownGene.sql
hgsql hg17 -e 'load data local infile "kg3Try3.gp" into table knownGene'
# Build knownGeneMrna and knownGenePep tables.
hgsql kgHg17FTempTry3 -e 'drop table mrnaSeq'
hgsql kgHg17FTempTry3 < ~/src/hg/lib/mrnaSeq.sql
hgsql kgHg17FTempTry3 -e 'load data local infile "../mrnaSeq.tab" into table mrnaSeq'
kgPepMrna kgHg17FTempTry3 hg17 050415
hgsql hg17 -e 'drop table knownGeneMrna'
hgsql hg17 < ~/src/hg/lib/knownGeneMrna.sql
hgsql hg17 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
hgsql hg17 -e 'drop table knownGenePep'
hgsql hg17 < ~/src/hg/lib/knownGenePep.sql
hgsql hg17 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'
# Build spMrna table
hgsql hg17 -N -e 'select proteinID, name from knownGene' |sort -u >kgSpMrna.tab
hgsql hg17 -e 'drop table spMrna'
hgsql hg17 <~/src/hg/lib/spMrna.sql
hgsql hg17 -e 'load data local infile "kgSpMrna.tab" into table spMrna'
# Build kgXref table
kgXref2 kgHg17FTempTry3 050415 hg17
hgsql hg17 -e 'drop table kgXref'
hgsql hg17 < ~/src/hg/lib/kgXref.sql
hgsql hg17 -e 'load data local infile "kgXref.tab" into table kgXref'
# MAKE FOLDUTR TABLES
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir rnaStruct.2005-06-05
rm rnaStruct
ln -s rnaStruct.2005-06-05 rnaStruct
cd rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg17 knownGene utr3 utr3/utr.fa
utrFa hg17 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh kk
cd /cluster/data/hg17/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
cp gsub ../utr5
# Do cluster run for 3' UTRs
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 35774 of 35774 jobs
# CPU time in finished jobs: 1174534s 19575.57m 326.26h 13.59d 0.037 y
# IO & Wait Time: 98071s 1634.51m 27.24h 1.14d 0.003 y
# Average job time: 36s 0.59m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 5409s 90.15m 1.50h 0.06d
# Submission to last job: 6712s 111.87m 1.86h 0.08d
# Do cluster run for 5' UTRs
cd ../utr5
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 33765 of 33765 jobs
# CPU time in finished jobs: 341000s 5683.33m 94.72h 3.95d 0.011 y
# IO & Wait Time: 106605s 1776.75m 29.61h 1.23d 0.003 y
# Average job time: 13s 0.22m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 30479s 507.98m 8.47h 0.35d
# Submission to last job: 30622s 510.37m 8.51h 0.35d
# Load database
ssh hgwdev
cd /cluster/data/hg17/bed/rnaStruct/utr5
hgLoadRnaFold hg17 foldUtr5 fold
cd ../utr3
hgLoadRnaFold hg17 foldUtr3 fold
# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak
# Build kgProtMap table
# move all files under hg17Kg to old and copy try3/kgXref.tab up.
# Note: it is important that tight_mrna.psl is here!
    cp -p old/tight_mrna.psl .
~/src/hg/protein/kgProtMap2.sh kgHg17F hg17 050415
# Update and clean up kgResultBestMrna2.c and then check it in.
# Build alias tables
# kgAliasM reads from proteins050415.hugo.symbol, proteins050415.hugo.aliases
# proteins050415.hugo.withdraws, hg17.kgXref.kgID
# to create kgAliasM.tab and geneAlias.tab
# by picking out those kgID items from kgXref where
# kgXref.geneSymbol == hugo.symbol
kgAliasM hg17 proteins050415
# kgAliasKgXref reads from hg17.knownGene.proteinID,
# hg17.knownGene.name, hg17.kgXref.geneSymbol
# to create kgAliasKgXref.tab
kgAliasKgXref hg17
# kgAliasRefseq reads from hg17.knownGene.name,
# hg17.knownGene.proteinID, hg17.kgXref.refseq
# to create kgAliasRefseq.tab
kgAliasRefseq hg17
hgsql sp050415 -N -e 'select name,gene.val from hg17.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
| sort -u > kgAliasP.tab
hgsql hg17 -N -e 'select name, name from knownGene' >kgAliasDup.tab
hgsql hg17 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
sort |uniq > kgAlias.tab
hgsql -e "drop table kgAlias;" hg17
hgsql hg17 < ~/kent/src/hg/lib/kgAlias.sql
hgsql hg17 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias'
# kgProtAlias reads from hg17.knownGene.name,
# hg17.knownGene.proteinID, hg17.knownGene.alignID,
# proteins050415.spXref3.accession, proteins050415.spSecondaryID, proteins050415.pdbSP.pdb
# to create kgProtAlias.tab
#
kgProtAlias hg17 050415
hgsql hg17 -N -e \
'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
| sort -u >kgProtAliasNCBI.tab
# include variant splice protein IDs
hgsql hg17 -N -e \
'select name, proteinID, parAcc from knownGene,sp050415.varAcc where varAcc=proteinID'\
|sort -u >kgProtAliasDup.tab
# include duplicate protein IDs from dupSpMrna table
hgsql hg17 -N -e \
'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
|sort -u >>kgProtAliasDup.tab
# catch parent acc from dupProteinID too
hgsql hg17 -N -e\
'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp050415.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
|sort -u >>kgProtAliasDup.tab
cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab
echo "`date` creating table kgProtAlias"
hgsql hg17 -e "drop table kgProtAlias;"
hgsql hg17 <~/src/hg/lib/kgProtAlias.sql;
hgsql hg17 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'
# Build kgSpAlias table
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql hg17 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >hg17.kgSpAlias.tab
rm j.tmp
hgsql hg17 -e 'drop table kgSpAlias';
hgsql hg17 < ~/src/hg/lib/kgSpAlias.sql
hgsql hg17 -e 'load data local infile "hg17.kgSpAlias.tab" into table kgSpAlias'
# Build KEGG pathway tables
ssh hgwdev
cd /cluster/store10/kg/kgHg17F
mkdir kegg
cd kegg
~/src/hg/protein/KGpath.sh kgHg17F hg17 050415
hgsql hg17 -e "drop table keggMapDesc"
hgsql hg17 -e "drop table keggPathway"
hgsql hg17 <~/src/hg/lib/keggMapDesc.sql
hgsql hg17 <~/src/hg/lib/keggPathway.sql
hgsql hg17 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
hgsql hg17 -e 'load data local infile "keggPathway.tab" into table keggPathway'
# Build CGAP pathway tables
cd ..
~/src/hg/protein/KGcgap.sh kgHg17F hg17 050415
hgsql hg17 -e "drop table cgapAlias"
hgsql hg17 -e "drop table cgapBiocDesc"
hgsql hg17 -e "drop table cgapBiocPathway"
hgsql hg17 <~/src/hg/lib/cgapAlias.sql
hgsql hg17 <~/src/hg/lib/cgapBiocDesc.sql
hgsql hg17 <~/src/hg/lib/cgapBiocPathway.sql
hgsql hg17 -e 'load data local infile "cgapAlias.tab" into table cgapAlias'
hgsql hg17 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
hgsql hg17 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'
# Build BioCyc pathway tables
# Download BioCyc DB, create and load bioCyc DB
# See makeBioCycDB.doc for details.
hgsql hg17 -e "drop table bioCycMapDesc"
hgsql hg17 <~/src/hg/lib/bioCycMapDesc.sql
hgsql hg17 -e 'load data local infile "bioCycMapDesc.tab" into table bioCycMapDesc'
kgBioCyc |sort -u > bioCycPathway.tab
hgsql hg17 -e "drop table bioCycPathway"
hgsql hg17 <~/src/hg/lib/bioCycPathway.sql
hgsql hg17 -e 'load data local infile "bioCycPathway.tab" into table bioCycPathway'
# CCDS <-> knownGene mapping needs to be updated  (Fan redone 2005-06-05)
# this should be part of the known gene build
/cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
### HG17 PROTEOME BROWSER TABLES RE-BUILD #### (DONE - 2005-06-05 - Fan)
# These are instructions for rebuilding tables
# needed for the Proteome Browser.
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.
# This update is based on proteins DBs dated 050415.
# Create the working directory
ssh hgwdev
mkdir /cluster/store10/kg/kgHg17F/pb-2005-06-05
cd /cluster/data/hg17/bed
rm pb
ln -s /cluster/store10/kg/kgHg17F/pb-2005-06-05 pb
cd pb
# Move the existing PB tables by:
hgsql hg17
create database hg17Sav4;
alter table hg17.pepCCntDist rename as hg17Sav4.pepCCntDist;
alter table hg17.pepExonCntDist rename as hg17Sav4.pepExonCntDist;
alter table hg17.pepHydroDist rename as hg17Sav4.pepHydroDist;
alter table hg17.pepIPCntDist rename as hg17Sav4.pepIPCntDist;
alter table hg17.pepMolWtDist rename as hg17Sav4.pepMolWtDist;
alter table hg17.pepMwAa rename as hg17Sav4.pepMwAa;
alter table hg17.pepPi rename as hg17Sav4.pepPi;
alter table hg17.pepPiDist rename as hg17Sav4.pepPiDist;
alter table hg17.pepResDist rename as hg17Sav4.pepResDist;
alter table hg17.pbAnomLimit rename as hg17Sav4.pbAnomLimit;
alter table hg17.pbResAvgStd rename as hg17Sav4.pbResAvgStd;
alter table hg17.pbStamp rename as hg17Sav4.pbStamp;
quit
# Define pep* tables in hg17 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql hg17 < pepAll.sql
# Build the pepMwAa table
hgsql proteins050415 -N -e \
"select info.acc, molWeight, aaSize from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql hg17 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'
# Build the pepPi table
hgsql proteins050415 -e \
"select info.acc from sp050415.info, sp050415.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > protAcc.lis
hgsql hg17 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis
pbCalPi protAcc.lis sp050415 pepPi.tab
hgsql hg17 -e 'delete from pepPi'
hgsql hg17 -e 'load data local infile "pepPi.tab" into table hg17.pepPi'
# Calculate and load pep distributions
pbCalDist sp050415 proteins050415 9606 hg17 >pbCalDist.out
wc pbCalDist.out
hgsql hg17
load data local infile "pepExonCntDist.tab" into table hg17.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table hg17.pepCCntDist;
load data local infile "pepHydroDist.tab" into table hg17.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table hg17.pepMolWtDist;
load data local infile "pepResDist.tab" into table hg17.pepResDist;
load data local infile "pepIPCntDist.tab" into table hg17.pepIPCntDist;
load data local infile "pepPiDist.tab" into table hg17.pepPiDist;
quit
# Calculate frequency distributions
pbCalResStd sp050415 9606 hg17
# Create pbAnomLimit and pbResAvgStd tables
hgsql hg17 -e "drop table pbAnomLimit"
hgsql hg17 -e "drop table pbResAvgStd"
hgsql hg17 < ~/src/hg/lib/pbAnomLimit.sql
hgsql hg17 < ~/src/hg/lib/pbResAvgStd.sql
hgsql hg17 -e 'load data local infile "pbResAvgStd.tab" into table hg17.pbResAvgStd;'
hgsql hg17 -e 'load data local infile "pbAnomLimit.tab" into table hg17.pbAnomLimit;'
# Create pbStamp table for PB
hgsql hg17 -e "drop table pbStamp"
hgsql hg17 < ~/src/hg/lib/pbStamp.sql
hgsql hg17Sav4 -N -e 'select * from pbStamp' > pbStamp.tab
hgsql hg17 -e 'load data local infile "pbStamp.tab" into table hg17.pbStamp'
# Adjust drawing parameters for Proteome Browser stamps
# Now invoke Proteome Browser and adjust various drawing parameters
# (mostly the ymax of each stamp) if necessary, by updating the
# pbStamp.tab file and then deleting and reloading the pbStamp table.
# Perform preliminary review of Proteome Browser for hg17, then
# notify QA for formal review.
# RE-BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (DONE - 2005-06-04 - Fan)
# This should be done after KG tables are complete from known genes build
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg17/bed/geneSorter.2005-06-04
# remove old symbolic link
rm /cluster/data/hg17/bed/geneSorter
ln -s /cluster/data/hg17/bed/geneSorter.2005-06-04 /cluster/data/hg17/bed/geneSorter
cd /cluster/data/hg17/bed/geneSorter
hgClusterGenes hg17 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg17/bed/geneSorter/blastp
cd /cluster/data/hg17/bed/geneSorter/blastp
pepPredToFa hg17 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg17/blastp
mkdir -p /cluster/bluearc/hg17/blastp
cp -p /cluster/data/hg17/bed/geneSorter/blastp/known.* /cluster/bluearc/hg17/blastp
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/self
cd /cluster/data/hg17/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/hg17/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
    # 'ls ../../split/*.fa' would make the command line too long, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para push
para check
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 142764s 2379.39m 39.66h 1.65d 0.005 y
# IO & Wait Time: 67623s 1127.06m 18.78h 0.78d 0.002 y
# Average job time: 27s 0.45m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 144s 2.40m 0.04h 0.00d
# Submission to last job: 392s 6.53m 0.11h 0.00d
# Load into database. This takes about 30 minutes
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/self/run/out
time hgLoadBlastTab hg17 knownBlastTab *.tab
# Scanning through 7735 files
# Loading database with 9757382 rows
# 255.200u 50.520s 25:19.66 20.1% 0+0k 0+0io 247pf+0w
cd /cluster/data/hg17/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg17 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# hgsql -e "select count(*) from knownToRefSeq;" hg17
# row count changed 34667
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg17 > refToLl.txt
hgMapToGene hg17 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# hgsql -e "select count(*) from knownToLocusLink;" hg17
# row count changed to 34773
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
# hgsql -e "select count(*) from knownToPfam;" hg17
# row count changed to 29171
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" hg17
# row count changed to 32458
# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2 &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 32458 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count changed to 32381000
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg17 affyUclaNorm knownGene knownToU133
# hgsql -e "select count(*) from knownToU133;" hg17
# row count changed to 32965
# Create expression distance table. This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg17 affyUclaNorm affyUclaExp knownExpDistance \
-weights=affyUcla.weight -lookup=knownToU133 &
# Have 43039 elements in affyUclaNorm
# 211 genes, 42 weights, 26.500000 total wieght
# Got 32965 unique elements in affyUclaNorm
# Create table that maps between known genes and
# the GNF data.
cd /tmp
hgMapToGene hg17 affyU95 knownGene knownToU95
# row count changed to 17555
    # the hgFixed.gnfHumanU95Exps argument is unused and does not need to exist
hgExpDistance hg17 hgFixed.gnfHumanU95MedianRatio \
hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 &
# Have 11545 elements in hgFixed.gnfHumanU95MedianRatio
# Got 16501 unique elements in hgFixed.gnfHumanU95MedianRatio
# row count changed to 16450000
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.)
hgMapToGene hg17 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnf1h &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 8827 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
cd /cluster/data/hg17/bed/geneSorter
hgMapToGene hg17 affyU133Plus2 knownGene knownToU133Plus2
# row count changed to 35139
#### UPDATE GO DATABASE (THIS PART WAS DONE 5/21/05 Fan)
# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20050521
cd /cluster/store1/geneOntology/20050521
wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200504-assocdb-data.gz
hgsql mysql <<end
create database go050521;
end
zcat go_*data.gz | sed -e 's/ENGINE=MyISAM DEFAULT CHARSET=latin1/TYPE=MyISAM/g' >j.tmp
hgsql go050521 <j.tmp
rm j.tmp
wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_uniprot.gz
zcat gene_association.goa_uniprot.gz | hgGoAssociation go050521 goaPart stdin
# Passed 5589891 of 6584507 of 6584507, 84.89%
# Ask sys-admin to switch the database pointer go to point to go050521.
cd /cluster/data/hg17/bed/geneSorter
# Rebuilt Ensembl Gene tables. See documentation (5/23/05 Fan) above.
# Create knownToEnsembl column
hgMapToGene hg17 ensGene knownGene knownToEnsembl
# hgsql hg17 -e "select count(*) from knownToEnsembl"
# table row count 35521
# Make knownToCdsSnp table
ssh hgwdev
nice hgMapToGene hg17 snp knownGene knownToCdsSnp -all -cds
# hgsql hg17 -e "select count(*) from knownToCdsSnp"
# row count 94633
# approx. 5 minutes running time
# C.ELEGANS BLASTP FOR GENE SORTER
# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to iscratch/i
# if it doesn't exist already:
# The following section is done during mm6 build already.
# ssh eieio
# mkdir /cluster/data/ce2/bed/blastp
# cd /cluster/data/ce2/bed/blastp
# # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
# # to find out the latest version. Then use that in place of 142 below.
# wget -O wormPep142.faa ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep142/wormpep142
# formatdb -i wormPep142.faa -t wormPep142 -n wormPep142
# ssh kkr1u00
# if (-e /iscratch/i/ce2/blastp) then
# rm -r /iscratch/i/ce2/blastp
# endif
# mkdir -p /iscratch/i/ce2/blastp
# cp /cluster/data/ce2/bed/blastp/wormPep142.p?? /iscratch/i/ce2/blastp
# iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/hg17/bed/blastp/ce2/run/out
cd /cluster/data/hg17/bed/blastp/ce2/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep142 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/hg17/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' >split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
    # Initial run had 13 crashed jobs.
# crashed: 13
# ranOk: 7722
# para problems show the following typical message:
# total jobs in batch: 7735
# job: blastSome ../../../geneSorter/blastp/split/kg5911.fa out/kg5911.tab
# id: 209522384
# failure type: crash
# host: kkr2u28.kilokluster.ucsc.edu
# start time: Sat Jun 4 11:45:51 2005
# return: 0
# stderr:
# [blastall] FATAL ERROR: blast: Unable to open input file ../../../geneSorter/blastp/split/kg5911.fa
# para push again and these 13 ran fine.
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 60319s 1005.32m 16.76h 0.70d 0.002 y
# IO & Wait Time: 31239s 520.65m 8.68h 0.36d 0.001 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 72s 1.20m 0.02h 0.00d
# Submission to last job: 199s 3.32m 0.06h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/blastp/ce2/run/out
hgLoadBlastTab hg17 ceBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 25574 rows
# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to /cluster/panasas
# if it doesn't exist already
# This already exists. See makeMm6.doc for procedure
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/mm6
cd /cluster/data/hg17/bed/geneSorter/blastp/mm6
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm6/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 85769s 1429.49m 23.82h 0.99d 0.003 y
# IO & Wait Time: 20587s 343.11m 5.72h 0.24d 0.001 y
# Average job time: 14s 0.23m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 78s 1.30m 0.02h 0.00d
# Submission to last job: 206s 3.43m 0.06h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm6/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 32951 rows
# Make rat ortholog column using blastp on rat known genes.
# First make rat protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeRn3.doc for procedure.
# Files were put in this directory: /cluster/bluearc/rn3/blastp/
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/rn3
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/rn3/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you can not do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 27804s 463.40m 7.72h 0.32d 0.001 y
# IO & Wait Time: 30334s 505.56m 8.43h 0.35d 0.001 y
# Average job time: 8s 0.13m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 26s 0.43m 0.01h 0.00d
# Submission to last job: 119s 1.98m 0.03h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/rn3/run/out
hgLoadBlastTab hg17 rnBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 24030 rows
# ZEBRAFISH BLASTP FOR GENE SORTER
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to iscratch/i
# if it doesn't exist already:
ssh kkstore
mkdir /cluster/data/danRer2/bed/blastp
cd /cluster/data/danRer2/bed/blastp
wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH4.may.pep.fa.gz
zcat Dan*.pep.fa.gz > ensembl.faa
/scratch/blast/formatdb -i ensembl.faa -t ensembl -n ensembl
ssh kkr1u00
if (-e /iscratch/i/danRer2/blastp) then
rm -r /iscratch/i/danRer2/blastp
endif
mkdir -p /iscratch/i/danRer2/blastp
cp /cluster/data/danRer2/bed/blastp/ensembl.p?? /iscratch/i/danRer2/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/hg17/bed/blastp/danRer2/run/out
cd /cluster/data/hg17/bed/blastp/danRer2/run
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer2/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
ls -1S /cluster/data/hg17/bed/geneSorter/blastp/split \
|sed -e 's=kg=../../../geneSorter/blastp/split/kg=g' > split.lst
gensub2 split.lst single gsub spec
para create spec
para try, check, push, check, ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 111467s 1857.78m 30.96h 1.29d 0.004 y
# IO & Wait Time: 21159s 352.65m 5.88h 0.24d 0.001 y
# Average job time: 17s 0.29m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 95s 1.58m 0.03h 0.00d
# Submission to last job: 223s 3.72m 0.06h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/blastp/danRer2/run/out
hgLoadBlastTab hg17 drBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 30651 rows
# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/sc1
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 18194s 303.23m 5.05h 0.21d 0.001 y
# IO & Wait Time: 24452s 407.53m 6.79h 0.28d 0.001 y
# Average job time: 6s 0.09m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 16s 0.27m 0.00h 0.00d
# Submission to last job: 120s 2.00m 0.03h 0.00d
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/sc1/run/out
hgLoadBlastTab hg17 scBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 16395 rows
# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# The following section was already done.
# cd /cluster/data/dm1/bed
# mkdir blastp
# cd blastp
#wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_translation_dmel_RELEASE3-1.FASTA.gz
# zcat whole_ge*.gz | faFlyBaseToUcsc stdin flyBase.faa
# formatdb -i flyBase.faa -t flyBase -n flyBase
# if (-e /cluster/bluearc/dm1/blastp) then
# rm -r /cluster/bluearc/dm1/blastp
# endif
# mkdir -p /cluster/bluearc/dm1/blastp
# cp /cluster/data/dm1/bed/blastp/flyBase.p?? /cluster/bluearc/dm1/blastp
# Make parasol run directory
ssh kk
mkdir /cluster/data/hg17/bed/geneSorter/blastp/dm1
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 7735 of 7735 jobs
# CPU time in finished jobs: 72141s 1202.35m 20.04h 0.83d 0.002 y
# IO & Wait Time: 41717s 695.28m 11.59h 0.48d 0.001 y
# Average job time: 15s 0.25m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 57s 0.95m 0.02h 0.00d
# Submission to last job: 204s 3.40m 0.06h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/dm1/run/out
hgLoadBlastTab hg17 dmBlastTab -maxPer=1 *.tab
# Scanning through 7735 files
# Loading database with 27109 rows
# update knownToHInv table
# Verified that there is no new release of HInv data.
hgMapToGene hg17 HInvGeneMrna knownGene knownToHInv
# count changed to 28851
# The new KG process no longer needs entries in knownGeneLink (used to store
# info for DNA-based RefSeqs), so clean out the old data in knownGeneLink.
hgsql hg17 -e "delete from knownGeneLink"
#### RE-BUILD SUPERFAMILY RELATED TABLES (DONE - 2005-05-27 - Fan)
# Download latest Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk
mkdir /cluster/store10/superfamily/050524
ln -s /cluster/store10/superfamily/050524 /cluster/data/superfamily/050524
cd /cluster/data/superfamily/050524
# ftp over the following two files:
ass_22-May-2005.tab.gz
supfam_22-May-2005.sql.gz
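    # (for example with wget; the FTP directory path on
    # supfam.mrc-lmb.cam.ac.uk is elided here, not verified:)
    # wget --timestamping 'ftp://supfam.mrc-lmb.cam.ac.uk/.../ass_22-May-2005.tab.gz'
    # wget --timestamping 'ftp://supfam.mrc-lmb.cam.ac.uk/.../supfam_22-May-2005.sql.gz'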
gzip -d *.gz
# Load the Superfamily database
hgsql hg17 -e "create database superfam050524"
nice hgsql superfam050524 < supfam_22-May-2005.sql &
# This may take about an hour.
# Make sure to add an index on id of the des table of superfam050524.
hgsql superfam050524 -e "create index id on des(id);"
hgsql superfam050524 < ~/src/hg/lib/sfAssign.sql
hgsql superfam050524 -e 'load data local infile "ass_22-May-2005.tab" into table
superfam050524.sfAssign;'
# Build or rebuild Superfamily track and create sf tables needed for PB
hgsql hg17 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/050524
hgsql hg17 -e 'load data local infile "ass_22-May-2005.tab" into table hg17.sfAssign;'
# If hg17.sfDes already exists, drop it.
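    # (for example:)
    # hgsql hg17 -e 'drop table if exists sfDes'
    # The same pattern applies to the superfamily and sfDescription
    # tables mentioned below.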
hgsql superfam050524 -N -e "select * from des" >sfDes.tab
hgsql hg17 < ~/src/hg/lib/sfDes.sql
hgsql hg17 -e 'load data local infile "sfDes.tab" into table sfDes'
# If hg17.superfamily already exists, drop it.
cd /cluster/data/hg17/bed
mkdir /cluster/data/hg17/sf.2004-1128
ln -s sf.2004-1128 sf
hgSuperfam hg17 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If hg17.sfDescription exists, drop it.
hgsql hg17 < ~/src/hg/lib/sfDescription.sql
hgsql hg17 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg17.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg17 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/050524/ass_22-May-2005.tab \
| hgKnownToSuper hg17 hs stdin
# created 32906 rows in knownToSuper
# Build tables needed by pbGlobal in proteins050415
cd /cluster/data/superfamily/050524
hgsql proteins050415 -e 'load data local infile "ass_22-May-2005.tab" into table sfAssign'
hgsql proteins050415 -e 'load data local infile "sfDes.tab" into table sfDes'
cd /cluster/store10/kg/kgHg17F
hgsql proteins050415 -e 'load data local infile "ensemblXref.tab" into table ensemblXref'
# These sf tables and ensemblXref3 are needed for non-HMR KG proteins.
# Should add content of ensemblXref3 of mm6 after it is done.
# And similarly for rn4 and possibly for other non-HMR species.
# CCDS <-> knownGene mapping need to be updated (markd 2005-05-29)
# this should be part of the known gene build
/cluster/data/genbank/bin/i386/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
# Build targetScanS track - (DONE - 2005-06-22 Fan)
# requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
ssh hgwdev
mkdir -p /cluster/data/hg17/bed/targetScanS
cd /cluster/data/hg17/bed/targetScanS
wget --timestamp http://genes.mit.edu/targetscan/tracks/targetscan.bed
# Remove the first description line of targetscan.bed
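    # (one way to do that, assuming GNU sed:)
    # sed -i '1d' targetscan.bed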
hgLoadBed -tab hg17 targetScanS targetscan.bed
# Create/edit/check in targetScans.html and trackDb.ra under
# kent/src/hg/makeDb/trackDb/human/hg17
# Update mrnaRefseq table (DONE - Fan 6/22/05)
# The old table contains non-human mrna/RefSeqs.
# The new table contains only human mrna/RefSeq and RefSeq/RefSeq.
# First build entrez DB tables, see makeMm6.doc for details.
hgsql entrez -N -e \
'select mrna, refseq from entrezRefseq, entrezMrna, hg17.all_mrna where entrezRefseq.geneID=entrezMrna.geneID and mrna=all_mrna.qName' \
>mrnaRefseq1.tab
# Include RefSeq as valid mRNA too.
hgsql hg17 -N -e 'select name, name from refGene' >mrnaRefseq2.tab
cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab
hgsql hg17 -e 'drop table mrnaRefseq'
hgsql hg17 < ~/src/hg/lib/mrnaRefseq.sql
hgsql hg17 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'
# BUILD KNOWN GENE LIST FOR GOOGLE. DONE 6/27/05 Fan.
# make knownGeneLists.html hg17GeneList.html mm5GeneList.html rn3GeneList.html
cd /cluster/data/hg17/bed
rm -rf knownGeneList/hg17
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/hg17
hgKnownGeneList hg17
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/hg17
mkdir -p /usr/local/apache/htdocs/knownGeneList/hg17
cp -Rfp knownGeneList/hg17/* /usr/local/apache/htdocs/knownGeneList/hg17
#### Blat knownGene proteins to determine exons (DONE braney 06-30-05)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir blat.hg17KG.2005-06-17
rm blat.hg17KG
ln -s blat.hg17KG.2005-06-17 blat.hg17KG
cd blat.hg17KG
pepPredToFa hg17 knownGenePep known.fa
hgPepPred hg17 generic blastKGPep02 known.fa
grep ">" known.fa | sed "s/>//" > kgName.lst
kgName hg17 kgName.lst blastKGRef02
hgsql hg17 < ~/kent/src/hg/lib/blastRef.sql
echo "rename table blastRef to blastKGRef02" | hgsql hg17
echo "load data local infile 'blastKGRef02' into table blastKGRef02" | hgsql hg17
ssh kk
cd /cluster/data/hg17/bed/blat.hg17KG
cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
# << keep emacs happy
chmod +x blatSome
ls -1S /scratch/hg/gs.18/build35/bothMaskedNibs/*.nib > human.lst
mkdir kgfa
cd kgfa
faSplit sequence ../known.fa 3020 kg
cd ..
ls -1S kgfa/*.fa > kg.lst
cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
gensub2 human.lst kg.lst blatGsub blatSpec
mkdir psl
cd psl
foreach i (`cat ../human.lst`)
mkdir `basename $i .nib`
end
cd ..
para create blatSpec
para push
# Completed: 134320 of 134320 jobs
# CPU time in finished jobs: 22196680s 369944.67m 6165.74h 256.91d 0.704 y
# IO & Wait Time: 1712586s 28543.10m 475.72h 19.82d 0.054 y
# Average job time: 178s 2.97m 0.05h 0.00d
# Longest finished job: 7691s 128.18m 2.14h 0.09d
# Submission to last job: 608750s 10145.83m 169.10h 7.05d
# Completed: 133676 of 133676 jobs
# CPU time in finished jobs: 29661130s 494352.16m 8239.20h 343.30d 0.941 y
# IO & Wait Time: 2181179s 36352.99m 605.88h 25.25d 0.069 y
# Average job time: 238s 3.97m 0.07h 0.02d
# Longest job: 105972s 1766.20m 29.44h 1.23d
ssh eieio
cd /cluster/data/hg17/bed/blat.hg17KG
pslSort dirs raw.psl /tmp psl/*
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
pslUniq cooked.psl hg17KG.psl
pslxToFa hg17KG.psl hg17KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
ssh hgwdev
cd /cluster/data/hg17/bed/blat.hg17KG
kgName hg17 hg17KG.psl blastKGRef02
cut -f 10 hg17KG.psl > kgName.lst
faSomeRecords known.fa kgName.lst hg17KG.fa
hgPepPred hg17 generic blastKGPep02 hg17KG.fa
#end blat proteins
# MAKE Drosophila Proteins track (DONE 07-05-05 braney)
ssh kk
mkdir -p /cluster/data/hg17/bed/tblastn.dm2FB
cd /cluster/data/hg17/bed/tblastn.dm2FB
echo /panasas/store/hg17/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > target.lst
mkdir fbfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl|awk "{print \\\$1}"`/\(264630/`wc target.lst| awk "{print \\\$1}"`\)
# 18929/(350000/5959) = 322.279746
split -l 322 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb
cd fbfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S fbfa/*.fa > fb.lst
mkdir -p /cluster/bluearc/hg17/bed/tblastn.dm2FB/blastOut
ln -s /cluster/bluearc/hg17/bed/tblastn.dm2FB/blastOut
for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
# try successively stricter e-values until a blastall run succeeds
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
# lift target coords from subcontig to contig, then contig to chrom
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/hg17/jkStuff/subLiftAll.lft warn $f.2
liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/hg17/jkStuff/liftAll.lft warn $f.3
# lift query (protein) coords via the protein lift file
liftUp -isPtoG -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.4
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3 $f.8
exit 1
'_EOF_'
chmod +x blastSome
gensub2 target.lst fb.lst blastGsub blastSpec
ssh kk
cd /cluster/data/hg17/bed/tblastn.dm2FB
para create blastSpec
para push
# Completed: 351581 of 351581 jobs
# CPU time in finished jobs: 30733031s 512217.19m 8536.95h 355.71d 0.975 y
# IO & Wait Time: 1035790s 17263.16m 287.72h 11.99d 0.033 y
# Average job time: 90s 1.51m 0.03h 0.00d
# Longest finished job: 816s 13.60m 0.23h 0.01d
# Submission to last job: 135367s 2256.12m 37.60h 1.57d
ssh kki
cd /cluster/data/hg17/bed/tblastn.dm2FB
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1) $(path2)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat $2.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/fb?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
para create chainSpec
para push
# Completed: 2714 of 2714 jobs
# CPU time in finished jobs: 222508s 3708.46m 61.81h 2.58d 0.007 y
# IO & Wait Time: 10577s 176.29m 2.94h 0.12d 0.000 y
# Average job time: 86s 1.43m 0.02h 0.00d
# Longest finished job: 9787s 163.12m 2.72h 0.11d
cd /cluster/data/hg17/bed/tblastn.dm2FB/blastOut
for i in fb??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.*.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -u -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* > /cluster/data/hg17/bed/tblastn.dm2FB/blastDm2FB.psl
cd ..
ssh hgwdev
cd /cluster/data/hg17/bed/tblastn.dm2FB
hgLoadPsl hg17 blastDm2FB.psl
exit
# back to kksilo
rm -rf blastOut
# End tblastn
# Build kgReactome table for KG to Reactome xref. Done 6/28/05 Fan.
ssh hgwdev
mkdir -p /cluster/store10/reactome/reactome050613
rm /cluster/data/reactome
ln -s /cluster/store10/reactome/reactome050613 /cluster/data/reactome
cd /cluster/data/reactome
wget --timestamp http://www.reactome.org/download/current/sql.gz
hgsql hg17 -e 'drop database reactome'
hgsql hg17 -e 'create database reactome'
zcat sql.gz| hgsql reactome
hgsql reactome -N -e 'select kgId, spID, DB_ID from ReferenceEntity, hg17.kgXref where identifier=spID' >kgReactome.tab;
hgsql hg17 -e 'drop table kgReactome'
hgsql hg17 < ~/src/hg/lib/kgReactome.sql
hgsql hg17 -e 'load data local infile "kgReactome.tab" into table kgReactome'
# UPDATE WGRNA TRACK (DONE, 2005-07-05, Fan)
ssh hgwdev
cd /cluster/data/hg17/bed
mv wgRna wgRna-2005-06-16
mkdir wgRna-2005-07-05
cd wgRna-2005-07-05
# Received the data file, wgtrack_july2005.txt, from Michel Weber's email (Michel.Weber@ibcg.biotoul.fr)
# and placed it under /cluster/data/hg17/bed/wgRna-2005-07-05.
cat wgtrack_july2005.txt|sed -e 's/ /\t/g' >wgRna.tab
# edit wgRna.tab to take out the first 5 lines of data field labels.
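    # (non-interactively, e.g. assuming GNU sed:)
    # sed -i '1,5d' wgRna.tab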
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# REBUILT knownToPfam TABLE TO ALLOW KG REPRESENTED BY VARIANT SPLICE PROTEINS MAPPED TO PFAM (DONE 7/14/05, Fan)
# hgMapViaSwissProt.c was updated to support this.
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg17 knownGene name proteinID Pfam knownToPfam
## EVOFOLD (DONE, 2005-07-15, Jakob (jsp) )
# EvoFold is a new comparative method for predicting functional RNA
# secondary structures based on multiple sequence alignments. The
# predictions generated for the EvoFold track are based on the most
# conserved elements of the 8-way alignment (multiz8way). The current
# data is the result of a pilot study (ongoing research of mine), so the
# procedure used to generate the data will be simplified when
# forthcoming EvoFold tracks for other organisms are made. This
# documentation therefore skips the actual data generation and
# instead starts with a data file I provide.
ssh -C hgwdev
mkdir -p /cluster/data/hg17/bed/evofold
cd /cluster/data/hg17/bed/evofold
cp /cluster/home/jsp/data/rnass/genome-scan/vertebrate/folds_hg17.bed foldsHg17.bed
# The folds_hg17.bed is a 9-column bed file: columns 1-6 provide
# standard bed information, column 7 is element length, column 8 is
# the RNA secondary structure in parentheses format, and column 9
# is a comma-separated list of position-specific confidence scores
# (floats).
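# A quick sanity check of that format (a sketch; confirms column 7
# matches chromEnd - chromStart):
# awk -F'\t' '$7 != $3 - $2 {print "size mismatch: " $4}' foldsHg17.bed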
hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql hg17 evofold foldsHg17.bed
##########################################################################
# TRANSFRAG PHASE 2 TABLES - lifted from hg15 (Jakob Skou Pedersen)
# Done: July 21, 2005
#
# These tables were lifted for use in my own research, but may be used
# for the 'Affymetrix Transcriptome Project Phase 2' tracks.
ssh -C hgwdev
mkdir -p /cluster/data/hg17/bed/transfrag
cd /cluster/data/hg17/bed/transfrag
# lifting transfrag tables from hg15 via hg16 to hg17
for name in A375CytosolicPolyAPlusTnFg FHs738LuCytosolicPolyAPlusTnFg HepG2CytosolicPolyAMinusTnFg HepG2CytosolicPolyAPlusTnFg HepG2NuclearPolyAMinusTnFg HepG2NuclearPolyAPlusTnFg JurkatCytosolicPolyAPlusTnFg NCCITCytosolicPolyAPlusTnFg PC3CytosolicPolyAPlusTnFg SKNASCytosolicPolyAPlusTnFg U87CytosolicPolyAPlusTnFg; do
echo "select chrom, chromStart, chromEnd, name from ${name};" | hgsql hg15 | sed -e 1d > ${name}Hg15.bed
liftOver ${name}Hg15.bed /cluster/data/hg15/bed/liftOver/hg15ToHg16.over.chain ${name}Hg16.bed unmappedHg16.bed
liftOver ${name}Hg16.bed /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain ${name}Hg17.bed unmappedHg17.bed
echo "hg16 unmapped count for ${name}: " `grep "#" unmappedHg16.bed | wc -l | awk '{print $1}'`
echo "hg17 unmapped count for ${name}: " `grep "#" unmappedHg17.bed | wc -l | awk '{print $1}'`
hgLoadBed hg17 ${name} ${name}Hg17.bed
# clean up
rm ${name}Hg15.bed ${name}Hg16.bed unmappedHg16.bed unmappedHg17.bed
done
# GLADSTONE ARRAY TRACK (DONE 7/19/2005 Andy)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir gladHumES
cd gladHumES/
cp /cluster/data/hg16/bed/geneAtlas2/geneAtlas2.bed .
cut -f1-12 geneAtlas2.bed > bed.hg16
liftOver bed.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain bed.hg17 /dev/null
hgMapMicroarray bed.hg17.data hgFixed.gladHumESRatio \
-bedIn bed.hg17
#Loaded 11087 rows of expression data from hgFixed.gladHumESRatio
#Mapped 10925, multiply-mapped 382, missed 23266, unmapped 162
hgLoadBed hg17 gladHumES bed.hg17.data
# PHASTODDS GENESORTER COLUMN (DONE 7/28/2005 Andy)
ssh kolossus
cd /panasas/store/andy
mkdir phastOdds
cd phastOdds/
export PATH=${PATH}:/cluster/bin/phast/x86_64
mkdir sso beds gtfs kg
cat > runChrom.sh << "_EOF_"
#!/bin/bash
c=$1
numDir=`echo ${c#chr} | sed 's/_random//'`
ALNDIR=/cluster/data/hg17/bed/multiz10way
echo msa_view $c
/cluster/bin/phast/x86_64/msa_view --in-format MAF ${ALNDIR}/maf/${c}.maf --refseq /cluster/data/hg17/${numDir}/${c}.fa > /tmp/${c}.sso
echo phastOdds $c
/cluster/bin/phast/x86_64/phastOdds -f ${ALNDIR}/cons/run.elements/ave.cons.mod -b ${ALNDIR}/cons/run.elements/ave.noncons.mod -g kg/${c}.bed /tmp/${c}.sso > /tmp/${c}.phastOdds.gtf
cp /tmp/${c}.sso sso/
rm /tmp/${c}.sso
cp /tmp/${c}.phastOdds.gtf gtfs/
rm /tmp/${c}.phastOdds.gtf
echo $c done
_EOF_
ssh hgwdev
cd /panasas/store/andy/phastOdds
genePredToGtf hg17 knownGene kg.gtf
exit
for c in `cut -f1 kg.gtf | sort | uniq`; do
grep "\b${c}\b" kg.gtf > kg/${c}.gtf;
done
for f in kg/*.bed; do
c=`basename $f .bed`;
echo $c;
./runChrom.sh $c;
addPhastOddsExons $f gtfs/$c.phastOdds.gtf beds/$c.bed
done
cat beds/* | sort -k4,4 -k1,1 -k2,2n -k3,3n > phastOdds.kg.bed
cat > phastOdds.sql << "EOF"
CREATE TABLE phastOdds (
bin smallint not null, # Speedup.
chrom varchar(255) not null, # Human chromosome or FPC contig
chromStart int unsigned not null, # Start position in chromosome
chromEnd int unsigned not null, # End position in chromosome
name varchar(255) not null, # Name of item
    score float not null, # phastOdds score.
    #Indices
index(chrom(8),bin),
index(name(10))
);
EOF
# <<
hgLoadBed -sqlTable=phastOdds.sql hg17 phastOdds phastOdds.kg.bed
# Actually I probably don't need that hg17 table.
echo create table phastOdds select name, score from hg17.phastOdds | hgsql hgFixed
echo create index nameIndex on phastOdds (name(10)) | hgsql hgFixed
##########################################################################
# Illumina SNPs (Heather, July 2005)
# Source: Jeff Ohmen, PhD, johmen@illumina.com, 858/232-2702
# using /cluster/store11 because /cluster/data/hg17 is on store5,
# which is currently being restored
cd /cluster/store11/heather/illumina
fix.pl < LinkageIVbSNP.txt > illumina.bed
hgLoadBed hg17 snpIllumina -tab -strict -sqlTable=snpIllumina.sql illumina.bed
# Reading illumina.bed
# Loaded 6008 elements of size 4
# Sorted
# Saving bed.tab
# Loading hg17
# note: 28 rows where chrom = "chrXY"
# reload rankProp and psiBlast gene sorter tables to link with new
# known genes (markd 2005-07-15)
(spLoadRankProp -noMapFile=max1k.nomap hg17 rankProp -- /cluster/bluearc/markd/rankprop/results/hs.sw+tr/max1k.rankp.gz) >&max1k.hg17.out
(spLoadPsiBlast hg17 spPsiBlast /cluster/bluearc/markd/rankprop/results/hs.sw+tr.eval.gz) >&pslBlast.hg17.out
# BLASTZ/CHAIN/NET CANFAM2 (DONE 8/2/05 angie - REDONE 12/12/05 angie - REDONE 2/6/06 angie)
# Unfortunately, this was done with a corrupted
# /san/sanvol1/scratch/hg17/nib/chr5.nib the first time around;
# also, a linSpecRep bug in blastz-run-ucsc has been fixed since then.
# Doh, then Kate pointed out that linSpecReps were not snipped properly --
# I had omitted the BLASTZ_ABRIDGE_REPEATS line from the DEF!!!
# Added an error message to doBlastzChainNet.pl to catch that next time.
# Therefore I'm moving aside the previous run:
mv /usr/local/apache/htdocs/goldenPath/hg17/vsCanFam2{,.bak}
# And rerunning...
ssh kkstore02
mkdir /cluster/data/hg17/bed/blastz.canFam2.2006-02-06
cd /cluster/data/hg17/bed/blastz.canFam2.2006-02-06
cat << '_EOF_' > DEF
# human vs. dog
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_SMSK=/san/sanvol1/scratch/hg17/linSpecRep.notInDog
SEQ1_LEN=/cluster/data/hg17/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog
SEQ2_DIR=/san/sanvol1/scratch/canFam2/nib
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman
SEQ2_LEN=/cluster/data/canFam2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.canFam2.2006-02-06
'_EOF_'
# << for emacs
doBlastzChainNet.pl DEF -bigClusterHub pk -smallClusterHub pk \
-workhorse pk \
-blastzOutRoot /san/sanvol1/scratch/blastzHg17CanFam2Out >& do.log &
tail -f do.log
rm -f /cluster/data/hg17/bed/blastz.canFam2
ln -s blastz.canFam2.2006-02-06 /cluster/data/hg17/bed/blastz.canFam2
# RE-RUN NETTOAXT, AXTTOMAF FOR CANFAM2 (DONE 10/26/05 angie)
# Kate fixed netToAxt to avoid duplicated blocks, which is important
# for input to multiz. Regenerate maf using commands from sub-script
# netChains.csh generated by doBlastzChainNet.pl above.
# Obsoleted by re-run of hg17-canFam2 above 12/12/05 angie...
ssh kolossus
cd /cluster/data/hg17/bed/blastz.canFam2.2005-08-01/axtChain
netSplit hg17.canFam2.net.gz net
chainSplit chain hg17.canFam2.all.chain.gz
cd ..
mv axtNet axtNet.orig
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/panasas/store/hg17/nib /iscratch/i/canFam2/nib stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.hg17.canFam2.net.axt.gz
end
rm -r mafNet
mkdir mafNet
foreach f (axtNet/*.hg17.canFam2.net.axt.gz)
axtToMaf -tPrefix=hg17. -qPrefix=canFam2. $f \
/cluster/data/hg17/chrom.sizes /cluster/data/canFam2/chrom.sizes \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
rm -r axtChain/{chain,net}/ axtNet.orig
############
# Sangamo/EIO DNaseI Hypersensitive Sites (2005-08-15 kate)
# (Sangamo Biosciences and European Inst. Oncology)
# Contact: Fyodor Umov (fumov@sangamo.com)
cd /cluster/data/hg17/bed
mkdir sangamo
cd sangamo
grep chr 3314_hs_sites_browser.bed | grep -v browser | \
hgLoadBed -noBin hg17 sangamoDnaseHs stdin
# Loaded 3314 elements of size 6
checkTableCoords -table=sangamoDnaseHs hg17
# use "antiword" to create plain text from .doc description file
# UPDATE WGRNA TRACK (DONE, 2005-08-24, Fan)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir wgRna-2005-08-24
cd wgRna-2005-08-24
# Received the data file, wgtrack_aug2005.txt, from Michel Weber's email
# (Michel.Weber@ibcg.biotoul.fr)
# and placed it under /cluster/data/hg17/bed/wgRna-2005-08-24.
cut -f 2-10 wgtrack_aug2005.txt >wgRna.tab
vi wgRna.tab
# edit wgRna.tab to take out the first line of data field labels.
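    # (equivalently, without an editor, assuming GNU sed:)
    # sed -i '1d' wgRna.tab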
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# Compared to 7/5 data, one record updated, one record dropped, one record added, out of 741 records.
# Generate snpMask files (Done Heather Sept. 1, 2005)
# Takes about 10-15 minutes
# Consumes about 1 gig of disk
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg17
mkdir snpMask
cd snpMask
foreach chrom ( `cat /cluster/data/hg17/chrom.lst` )
snpMaskChrom hg17 ${chrom} /gbdb/hg17/nib/${chrom}.nib ${chrom}.ambigs.fa
gzip ${chrom}.ambigs.fa
end
#############################################################################
# BLASTZ Mm7 (WORKING - 2005-09-06 - Hiram)
# Experiment, try the alignments without the linage specific
# repeats
ssh pk
mkdir /cluster/data/hg17/bed/blastzMm7.2005-09-06
cd /cluster/data/hg17/bed
ln -s blastzMm7.2005-09-06 blastz.mm7
cd blastzMm7.2005-09-06
cat << '_EOF_' > DEF
# human vs mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=50
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human Hg17
SEQ1_DIR=/cluster/bluearc/hg17/bothMaskedNibs
SEQ1_LEN=/cluster/bluearc/hg17/chrom.sizes
SEQ1_CTGDIR=/cluster/bluearc/hg17/bothMaskedNibs
SEQ1_CTGLEN=/cluster/bluearc/hg17/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=500000
SEQ1_LAP=50
# QUERY: Mouse Mm7
SEQ2_DIR=/cluster/bluearc/mm7/mm7.2bit
SEQ2_LEN=/cluster/bluearc/mm7/chrom.sizes
SEQ2_CTGDIR=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit
SEQ2_CTGLEN=/cluster/bluearc/mm7/mm7Chroms_RandomContigs.sizes
SEQ2_LIFT=/cluster/bluearc/mm7/Chroms_RandomContigs.lft
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=3000000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastzMm7.2005-09-06
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
cp -p /cluster/data/hg17/chrom.sizes ./S1.len
twoBitInfo /cluster/bluearc/mm7/mm7Chroms_RandomContigs.2bit S2.len
# establish a screen to control this job
screen
time ./doBlastzChainNet.pl -stop chainMerge \
-bigClusterHub=pk \
`pwd`/DEF > toChainMerge.run.out 2>&1 &
# STARTED - 2005-09-06 - 11:00
# detach from screen session: Ctrl-a Ctrl-d
# to reattach to this screen session:
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk \
-continue=cat -stop=cat \
`pwd`/DEF > catStep.out 2>&1 &
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk \
-continue=chainRun \
`pwd`/DEF > continueChainRun.out 2>&1 &
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk \
-continue=chainMerge -stop=chainMerge \
`pwd`/DEF > chainMerge.out 2>&1 &
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk \
-continue=net -stop=net \
`pwd`/DEF > net.out 2>&1 &
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk \
-continue=load -stop=load \
`pwd`/DEF > load.out 2>&1 &
time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk \
-swap -stop=load \
`pwd`/DEF > swap.out 2>&1 &
# Create plain pslChrom files to load a simple blastz track
ssh kkstore02
cd /cluster/data/hg17/bed/blastzMm7.2005-09-06
mkdir -p pslChrom
(cd pslParts; ls | awk -F"." '{print $1}' | sort -u) | while read C
do
echo -n "working ${C} ... "
zcat pslParts/${C}.nib*.gz | gzip -c > pslChrom/${C}.psl.gz
echo "done"
done
# Load those alignments
ssh hgwdev
cd /cluster/data/hg17/bed/blastzMm7.2005-09-06
ls pslChrom | sed -e "s/.psl.gz//" | while read T
do
echo "hgLoadPsl -fastLoad -noTNameIx hg17 -table=${T}_blastzMm7 pslChrom/${T}.psl.gz"
hgLoadPsl -fastLoad -noTNameIx hg17 -table=${T}_blastzMm7 pslChrom/${T}.psl.gz
done
# After this same alignment was done with Hg17 query and Mm7
# target, came back to these swapped results in mm7 and manually loaded
# the swapped tables as: chainMm7LSR, chainMm7LSRLink and
# netMm7LSR
# 41,223,632 total rows in the chainMm7Link split tables
# 58,458,613 total rows in the chainMm7LSRLink table
time featureBits hg17 chainMm7LSRLink
# 959444893 bases of 2866216770 (33.474%) in intersection
# real 36m30.822s
# user 14m19.620s
# sys 5m13.910s
time featureBits hg17 chainMm7Link
# 955168137 bases of 2866216770 (33.325%) in intersection
# real 16m13.902s
# user 10m20.780s
# sys 3m42.810s
# And, their intersection:
ssh kolossus
time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 \
chainMm7LSRLink chainMm7Link
# 952667399 bases of 2866216770 (33.238%) in intersection
# real 38m53.448s
# user 8m38.853s
# sys 2m23.362s
# LOAD ACEMBLY TRACK (DONE, 2005-09-12, Fan)
mv /cluster/data/hg17/bed/acembly /cluster/data/hg17/bed/acembly_050217
mkdir -p /cluster/data/hg17/bed/acembly
cd /cluster/data/hg17/bed/acembly
# Data is obtained from Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.genes.proteins.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.genes.gff.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.mrnas.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_35g.human.genes/acembly.ncbi_35g.genes.pfamhits.tar.gz
tar xvzf acembly.ncbi_35g.genes.gff.tar.gz
tar xvzf acembly.ncbi_35g.genes.proteins.fasta.tar.gz
cd acembly.ncbi_35.genes.gff
# the acembly dataset for hg16 had problems with reverse blocks so
# check for these
cat << '_EOF_' > checkReversedBlocks
for i in x1*.gff
do
echo -n "$i working ..."
awk -F"\t" '
{
if ($4 > $5) {
printf "reverse blocks problem for $1"
printf "\n"
}
}
' $i > $i.fixed
echo " done"
done
'_EOF_'
# << this line makes emacs coloring happy
chmod +x checkReversedBlocks
./checkReversedBlocks
ls -l *.fixed
# all *.fixed files are empty so remove - there is no reversing of blocks
rm *.fixed
foreach f (x1.acemblygenes.*.gff)
set c=$f:r:e
egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
if (-e ../../../$c/lift/random.lft) then
liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
ctg-chr${c}_random.gff
endif
grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
grep -v "^chr//" > chr$c.gff
echo "done $c"
end
#- Load into database - use extended genePred
ssh hgwdev
cd /cluster/data/hg17/bed/acembly
# Reloaded without -genePredExt 1/6/05:
ldHgGene -gtf hg17 acembly acembly.ncbi_35.genes.gff/chr*.gff
# for entry with 28212470 from chr6.gff, change to chr6
# and for 29124352 in chr6.gff, change to chr6 (1/13/05)
echo 'update acembly set chrom = "chr6" where chrom = "chr28212470";' \
| hgsql hg17
echo 'update acembly set chrom = "chr6" where chrom = "chr29124352";' \
| hgsql hg17
# checkTableCoords and runGeneCheck to check data
checkTableCoords hg17 acembly
hgPepPred hg17 generic acemblyPep \
acembly.ncbi_35.genes.proteins.fasta/*.fasta
# create table of Acembly gene classifications
cd /cluster/data/hg17/bed/acembly/acembly.ncbi_35.genes.gff
rm acemblyClass.tab
foreach f (x1.acemblygenes.*.gff)
cut -f 9 $f |sed -e 's/;/\t/g' |sed -e 's/transcript_id //' >j.tmp
cut -f 2 j.tmp >j2.tmp
cut -f 3 j.tmp >j3.tmp
paste j3.tmp j2.tmp|sed -e 's/Main_gene/main/g' |sed -e 's/Putative_gene/putative/g' |sed -e 's/ //g' >>acemblyClass.tab
end
rm *.tmp
hgsql hg17 -e 'drop table acemblyClass'
hgsql hg17 < ~/src/hg/lib/acemblyClass.sql
hgsql hg17 -e 'load data local infile "acemblyClass.tab" into table acemblyClass'
hgsql hg17 -e 'delete from acemblyClass where class!="main" and class!="putative"'
# build acemblyPep table
hgPepPred hg17 generic acemblyPep \
acembly.ncbi_35.genes.proteins.fasta/*.fasta
# Please note, per email from Jean Thierry-Mieg on 9/9/05,
# there are AceView genes (~10,000) without corresponding
# protein sequences. They will fix it next time.
###########################################################################
# LOADING AFFYTXNPHASE2 TRACK (sugnet)
# cd to where data is downloaded.
cd /cluster/store10/sugnet/affyTranscription/graphs/transcriptome.affymetrix.com/download/publication/polyA_minus/graphs
# lift data from hg16 to hg17. This takes a long time.
./liftWigFilesHg16ToHg17.sh
# make the .wib and .wig files. This takes a long time.
./makeWibWigHg17.sh
# Copy .wib files to /cluster/data/hg17/bed/affyTxnPhase2/wigData/
mkdir /cluster/data/hg17/bed/affyTxnPhase2/wigData/
cp `find ./ -name "*.hg17.wib"` /cluster/data/hg17/bed/affyTxnPhase2/wigData/
chmod 775 /cluster/data/hg17/bed/affyTxnPhase2/wigData/
chmod 664 /cluster/data/hg17/bed/affyTxnPhase2/wigData/*
# Make gbdb entry
mkdir /gbdb/hg17/wib/affyTxnPhase2
chmod 775 /gbdb/hg17/wib/affyTxnPhase2
cd /gbdb/hg17/wib/affyTxnPhase2
ln -s /cluster/data/hg17/bed/affyTxnPhase2/wigData/* .
cd -
# Load the database tables (using bash) this takes a while
for file in `find ./ -name "*hg17.wig"`; do
base=`basename $file .hg17.wig`
echo "Doing ${base}Txn"
hgLoadWiggle -pathPrefix=/gbdb/hg17/wib/affyTxnPhase2 hg17 ${base}Txn $file
done
# Do the transfrags
cd ../transfrags
./liftHg16ToHg17.sh
./loadHg17Tnfg.sh
# End of affyTxnPhase2
###########################################################################
# Creating download files for the affyTxnPhase2 data
# (DONE - 2006-11-20 - Hiram)
# Copy all of the data above to a temporary /san/sanvol1/scratch/
# location, and run the following script:
#!/bin/sh
mkdir -p rawData/hg17
TOP=`pwd`
export TOP
for dir in `find ./ -type d | grep '_' | grep -v A375_cytosolic_polyAPlus | grep -v FHs738Lu_cytosolic_polyAPlus | grep -v HepG2_CytosolVsNucleusDifferenceGraphs | grep -v HepG2_cytosolic_polyAPlus | grep -v HepG2_cytosolic_polyAMinus | sed -e "s#^./##"`; do
base=`echo $dir | sed -e 's/\.\///; s/\//_/g' | sed -e 's/polyA-/polyAMinus/g' | sed -e 's/-/_/g' | sed -e 's/\+/Plus/g' | $TOP/changeName.pl`
RAW=$TOP/rawData/hg17/$base.data
echo $RAW
cd $dir;
zcat `ls -1 *hg17.bed.gz` | bedSort stdin stdout | cut -f 1,2,3,4 | grep chr | $TOP/avgerizeBed.pl > $RAW
cd $TOP;
done
# Then copy the rawData/hg17/ results directory back to:
/cluster/data/hg17/bed/affyTxnPhase2/rawResults/
# And deliver to hgdownloads via symlinks on hgwdev:
cd /usr/local/apache/htdocs/goldenPath/hg17/affyTxnPhase2/
ln -s /cluster/data/hg17/bed/affyTxnPhase2/rawData/*.data.gz .
# Remove the san scratch data
###########################################################################
# ALTGRAPHX TRACK (sugnet)
cd /cluster/store1/sugnet/altSplice
mkdir hg17-2005.03.28
# First get the RNA clusters
cd hg17-2005.03.28
# Don't use RAGE libraries for clone bounds.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg17 rage.libs
# Make spec file to run.
foreach c (`echo 'select chrom from chromInfo' | hgsql hg17 | grep -v chrom`)
set out = chrom/$c.bed
echo "clusterRna -mrnaExclude=hg17.rage.libs hg17 /dev/null $out -chrom=$c" >> clusterRna.spec
end
# Tried running it on the minicluster, but can't connect to the
# cluster accounts so run it from here on hgwdev.
chmod 755 clusterRna.spec
./clusterRna.spec >& clusterRna.log
cd ..
# Make script to setup parasol job file for raw altGraphX files on human
cat << '_EOF_' > makeRun.sh
#!/bin/sh
for chrom in `echo "select chrom from chromInfo" | hgsql hg17 | grep -v chrom`; do
echo 'echo "Doing $chrom"'
echo "/cluster/home/sugnet/bin/i386/altSplice -db=hg17 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/hg17.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/hg17/nib/$chrom.nib"
done
'_EOF_'
# << this line makes emacs coloring happy
mkdir agxs
chmod 755 makeRun.sh
# Minicluster down, have to run on hgwdev.
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
./toRun.sh >& toRun.log &
cat agxs/*.agx > hg17.agx
# make raw altGraphX files for mouse
mkdir ../mm5-2005.03.28/
cd ../mm5-2005.03.28/
# make the rnaClusters
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Don't use RAGE libraries for clone bounds.
~/jk/hg/geneBounds/clusterRna/generateRageAccList.csh mm5 rage.libs
# Doing select on mm5 into mm5.rage.libs
# Done.
# Make spec file to run.
foreach c (`echo 'select chrom from chromInfo' | hgsql mm5 | grep -v chrom`)
set out = chrom/$c.bed
echo "clusterRna -mrnaExclude=mm5.rage.libs mm5 /dev/null $out -chrom=$c" >> clusterRna.spec
end
# Tried running it on the minicluster, but can't connect to the
# cluster accounts so run it from here on hgwdev.
chmod 755 clusterRna.spec
./clusterRna.spec >& clusterRna.log &
# Make the gene bounds in rnaCluster.
mkdir agxs
# This script generates the jobs, one per chromosome.
cat << '_EOF_' > makeRun.sh
#!/bin/sh
for chrom in `echo "select chrom from chromInfo" | hgsql mm5 | grep -v chrom`; do
echo 'echo "Doing $chrom"'
echo "/cluster/home/sugnet/bin/i386/altSplice -db=mm5 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/mm5.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/mm5/nib/$chrom.nib"
done
'_EOF_'
chmod 755 makeRun.sh
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
./toRun.sh >& toRun.log & # Takes an hour or so...
# Consolidate all of the records into a single file.
cat agxs/*.agx > mm5.agx
# Make the orthologous splicing graphs.
mkdir orthoSpliceExoniphy
cd orthoSpliceExoniphy/
# Get the exoniphy exons...
echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" | hgsql hg17 | grep -v txStart > hg17.exoniphy.bed
# Set up the commands for the orthosplice run.
echo 'select chrom, size from chromInfo' | hgsql hg17 | grep -v chrom > chromSizes.tab
ln -s /cluster/data/hg17/bed/blastz.mm5/axtChain/mouseNet/ nets
ln -s /cluster/data/hg17/bed/blastz.mm5/axtChain/chain/ chains
mkdir agx report logs
cat << '_EOF_' > makeRun.sh
#!/usr/bin/perl -w
open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n";
while(<IN>) {
chomp;
@w = split;
print "/cluster/home/sugnet/bin/i386/orthoSplice -chromSize=$w[1] -exonFile=hg17.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=../agxs/hg17.$w[0].agx -orthoAgxFile=../../mm5-2005.03.28/mm5.agx -db=hg17 -orthoDb=mm5 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg17.mm5.cons.t3.agx -reportFile=report/$w[0].hg17.report -edgeFile=report/$w[0].hg17.edge.report >& logs/$w[0].test.log\n";
}
'_EOF_'
# << emacs
./makeRun.sh > orthoSplice.para.spec
ssh kki
cd /cluster/store1/sugnet/altSplice/hg17-2005.03.28/orthoSpliceExoniphy
para create orthoSplice.para.spec
para push
cat agx/*.agx > hg17.mm5.t3.exoniphy.agx
# Make bed file
agxToBed hg17.mm5.t3.exoniphy.agx hg17.mm5.t3.exoniphy.bed
# Load up files
hgLoadBed hg17 agxBed hg17.mm5.t3.exoniphy.bed
hgLoadBed -notItemRgb -sqlTable=/cluster/home/sugnet/kent/src/hg/lib/altGraphX.sql hg17 altGraphX hg17.mm5.t3.exoniphy.agx
# end altGraphX track
# EXONWALK TRACK (sugnet)
# make altGraphX track (see above)
cd /cluster/store1/sugnet/altSplice/hg17-2005.03.28/orthoSpliceExoniphy
mkdir exonWalk
cd exonWalk
mkdir beds
# Make parasol script.
foreach file (`ls ../agx/*.agx`)
set base=`basename $file .agx`
echo "/cluster/home/sugnet/bin/i386/exonWalk db=hg17 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec
end
para create exonWalk.para.spec
para push
cat beds/*.bed > hg17.mm5.cons.t3.exoniphy.bed
# Predict orfs
mkdir orfs
cd orfs
mkdir bedOrf beds fa borf
cp ~/store1/altSplice/hg17-2005.01.09/orthoSpliceExonify/exonWalk/orfs.mrna2/*.sh ./
splitFile ../../hg17.mm5.cons.t3.exoniphy.bed 500 exonWalk.
cat << '_EOF_' > makeFa.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
echo "Doing $file"
echo "sequenceForBed -db=hg17 -bedIn=$file -fastaOut=fa/$base.fa "
sequenceForBed -db=hg17 -bedIn=$file -fastaOut=fa/$base.fa
done
'_EOF_'
chmod 755 makeFa.sh
./makeFa.sh beds/*
# Run borf lots of times...
./makeSpec.sh beds/* > para.spec
para create para.spec
para push
mkdir genePred
cat << '_EOF_' > makeGenePred.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
/cluster/home/sugnet/bin/i386/borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp
done
'_EOF_'
# << this line makes emacs coloring happy
chmod 755 makeGenePred.sh
./makeGenePred.sh beds/*
cat beds/* > hg17.mm5.exonWalk.bed
cat genePred/*.gp > hg17.mm5.exonWalk.gp
wc *.bed *.gp
# 155470 1865640 29956585 hg17.mm5.exonWalk.bed
# 98433 984330 32495119 hg17.mm5.exonWalk.gp
# Load it into the database.
ldHgGene -predTab hg17 exonWalk hg17.mm5.exonWalk.gp
# end exonWalk
####################################################################
### hapmapRecombRate (Daryl; September 19, 2005)
# Lifted from hg16; see makeHg16.doc for details
# Update (Jen; October 25, 2005)
# Data points that lifted to chroms other than 1-22 + X were removed
# before release to RR (confirmed with Daryl).
#   chr4_random: 11 data points
#   chr6_hla_hap1: 25 data points
### hapmapRecombHotspot (Daryl; September 19, 2005)
# Lifted from hg16; see makeHg16.doc for details
### HapMap SNPs (Daryl; February 4, 2006)
# most of this work was done in October and November 2005 for the ENCODE workshop
cd /cluster/store4/gs.17/build34/bed/hapmap/frequencies/2005-10/non-redundant/hapmapSnps
ln -sf ../hg17.daf.all/daf.txt.gz .
ln -sf ../hg17.panTro1.rheMac1.txt.gz .
zcat hg17.panTro1.rheMac1.txt | grep -v chrom | sort >! hg17.panTro1.rheMac1.sort.txt
zcat daf.txt | grep -v chrom | sort >! daf.sort.txt
# check that order matches; should be empty
paste hg17.panTro1.rheMac1.sort.txt daf.sort.txt | awk '$1!=$17||$2!=$18||$3!=$19||$4!=$20||$5!=$21||$6!=$22||$7!=$23||$8!=$24||$11!=$25||$12!=$27||$15!=$26||$16!=$28{print $0;}'
paste hg17.panTro1.rheMac1.sort.txt daf.sort.txt | awk '{printf "%s\t%d\t%d\t%s\t0\t%c\t%c\t%c\t%c\t%c\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n",$1,$2,$3,$4,$6,$7,$8,$11,$15,$12,$16,$29,$30,$31,$32,$33}' >! hapmapSnps.bed
hgLoadBed hg17 hapmapSnps -sqlTable=hapmapSnps.sql hapmapSnps.bed
############################################################################################
# HapMap SNPs rel21a (Feb. 2007, Heather)
# June 2007 [partial fix of hapmapAllelesSummary released 6/25/07:
# using hg17 instead of hg18 liftOver files... for most but not all
# chroms! :( not documented below; error found by user]
# 1/11/08, 1/24/08 (angie): regenerated hapmapAllelesSummary with corrected
# hapmapAllelesChimp.
# get files for each chrom, for each population
# these contain data for all individuals
# not using the JPT+CHB files
ssh kkstore05
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant/zips
wget http://www.hapmap.org/downloads/genotypes/2007-01/rs_strand/non-redundant/*
# get population data (needed to define children in CEU and YRI trios)
cd /cluster/store12/snp/hapmap
wget http://www.hapmap.org/downloads/samples_individuals/*gz
gunzip pedinfo2sample_CEU.txt.gz
filterPedigree.pl < pedinfo2sample_CEU.txt > CEU.filtered
cp CEU.filtered rel21a/genotypes/2007-01/rs_strand/non-redundant/CEU.list
gunzip pedinfo2sample_YRI.txt.gz
filterPedigree.pl < pedinfo2sample_YRI.txt > YRI.filtered
cp YRI.filtered rel21a/genotypes/2007-01/rs_strand/non-redundant/YRI.list
# Below is filterPedigree.pl
#!/usr/bin/env perl
while (<STDIN>) {
my @fields = split;
if ($fields[2] == 0 && $fields[3] == 0) {
@subfields = split /:/, $fields[6];
print $subfields[4];
print "\n";
}
}
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant
zcat zips/*chr22_CEU* | head -1 > header.CEU
zcat zips/*chr22_YRI* | head -1 > header.YRI
# add line breaks to header.CEU and header.YRI so each sample id
# is on its own line
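# (one way, a sketch assuming the header ids are space-separated:)
# tr ' ' '\n' < header.CEU > header.CEU.tmp && mv header.CEU.tmp header.CEU
# tr ' ' '\n' < header.YRI > header.YRI.tmp && mv header.YRI.tmp header.YRI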
grep -n -f CEU.list header.CEU > offsets.CEU
grep -n -f YRI.list header.YRI > offsets.YRI
# delete ids in offsets.CEU and offsets.YRI so just column numbers remain
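# (grep -n output is of the form "lineNum:id", so for example with
# GNU sed:)
# sed -i 's/:.*//' offsets.CEU offsets.YRI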
# for each population, combine all chroms, and combine all individuals
# for CEU and YRI, filter out children from trios
# This creates CEU.merge, CHB.merge, JPT.merge, YRI.merge
./merge.csh
# Below is merge.csh
#!/bin/tcsh
rm -f CEU.merge
rm -f CHB.merge
rm -f JPT.merge
rm -f YRI.merge
foreach chrom (`cat chrom.list`)
echo $chrom
# CEU
echo "CEU"
set fileName=`echo $chrom | awk '{printf "zips/genotypes_%s_CEU_r21a_nr.txt.gz", $1}'`
zcat $fileName | filterCEU.pl >> CEU.merge
# CHB
echo "CHB"
set fileName=`echo $chrom | awk '{printf "zips/genotypes_%s_CHB_r21a_nr.txt.gz", $1}'`
zcat $fileName | filterCHB.pl >> CHB.merge
# JPT
echo "JPT"
set fileName=`echo $chrom | awk '{printf "zips/genotypes_%s_JPT_r21a_nr.txt.gz", $1}'`
zcat $fileName | filterJPT.pl >> JPT.merge
# YRI
echo "YRI"
set fileName=`echo $chrom | awk '{printf "zips/genotypes_%s_YRI_r21a_nr.txt.gz", $1}'`
zcat $fileName | filterYRI.pl >> YRI.merge
end
# Below is filterCEU.pl
# The others are very similar: YRI uses "offsets.YRI"
# CHB and JPT just read the input directly
#!/usr/bin/env perl
# read in a list of the columns that we are keeping
sub initList {
open LIST, "offsets.CEU";
chomp(@list = <LIST>);
close LIST;
$listSize = @list;
}
&initList;
while (<STDIN>) {
my @fields = split;
# skip header
if ($fields[0] eq "rs#") { next; }
# chrom
print $fields[2];
print " ";
# position: add zero-based start coord
print $fields[3] - 1;
print " ";
print $fields[3];
print " ";
# rsId
print $fields[0];
print " ";
# score
print "0 ";
# strand
print $fields[4];
print " ";
# observed
print $fields[1];
print " ";
@alleles = ();
for ( my $loop = 0; $loop < $listSize; $loop++ ) {
push (@alleles, $fields[$list[$loop]-1]);
}
# N is used for missing data
$nCount = 0;
# counts
$aCountHomo = 0;
$cCountHomo = 0;
$gCountHomo = 0;
$tCountHomo = 0;
$aCountHetero = 0;
$cCountHetero = 0;
$gCountHetero = 0;
$tCountHetero = 0;
foreach $allele (@alleles) {
$parent1 = substr($allele, 0, 1);
$parent2 = substr($allele, 1, 1);
# Ns must be together
if ($parent1 eq "N" && $parent2 ne "N") { die "Unexpected input"; }
if ($parent2 eq "N" && $parent1 ne "N") { die "Unexpected input"; }
if ($parent1 eq "N" && $parent2 eq "N") { $nCount++; next; }
if ($parent1 eq "A" && $parent2 eq "A") {
$aCountHomo = $aCountHomo + 2;
next;
}
if ($parent1 eq "C" && $parent2 eq "C") {
$cCountHomo = $cCountHomo + 2;
next;
}
if ($parent1 eq "G" && $parent2 eq "G") {
$gCountHomo = $gCountHomo + 2;
next;
}
if ($parent1 eq "T" && $parent2 eq "T") {
$tCountHomo = $tCountHomo + 2;
next;
}
if ($parent1 eq "A") { $aCountHetero++; }
if ($parent1 eq "C") { $cCountHetero++; }
if ($parent1 eq "G") { $gCountHetero++; }
if ($parent1 eq "T") { $tCountHetero++; }
if ($parent2 eq "A") { $aCountHetero++; }
if ($parent2 eq "C") { $cCountHetero++; }
if ($parent2 eq "G") { $gCountHetero++; }
if ($parent2 eq "T") { $tCountHetero++; }
}
print "A ";
print $aCountHomo;
print " ";
print $aCountHetero;
print " ";
print "C ";
print $cCountHomo;
print " ";
print $cCountHetero;
print " ";
print "G ";
print $gCountHomo;
print " ";
print $gCountHetero;
print " ";
print "T ";
print $tCountHomo;
print " ";
print $tCountHetero;
print " ";
print "\n";
}
# << emacs
# Switch to C programs from kent/src/hg/snp/snpLoad.
# Determine allele1 and allele2 (set allele2 to "none" if monomorphic)
# Alleles are in alphabetical order
# Calculate score (minor allele frequency)
# Log and skip if wrong number of elements in row
# Log and skip if triallelic or quadallelic
# Log and skip degenerate case (no alleles)
# No errors this run
# Still running on kkstore05
# Could rename "hapmap1" to "hapmapCondense"
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 CEU.merge CEU.condense
wc -l hapmap1.log
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 CHB.merge CHB.condense
wc -l hapmap1.log
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 JPT.merge JPT.condense
wc -l hapmap1.log
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap1 YRI.merge YRI.condense
wc -l hapmap1.log
# save some space
gzip *merge
# load
ssh hgwdev
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant
cp /cluster/home/heather/kent/src/hg/lib/hapmapSnps.sql .
# modify hapmapSnps for 4 populations
hgLoadBed hg17 hapmapSnpsCEU -sqlTable=hapmapSnpsCEU.sql CEU.condense
hgLoadBed hg17 hapmapSnpsCHB -sqlTable=hapmapSnpsCHB.sql CHB.condense
hgLoadBed hg17 hapmapSnpsJPT -sqlTable=hapmapSnpsJPT.sql JPT.condense
hgLoadBed hg17 hapmapSnpsYRI -sqlTable=hapmapSnpsYRI.sql YRI.condense
# save some more space
ssh kkstore05
cd /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant
gzip *condense
# sanity check
mysql> select count(*) from hapmapSnpsCEU where homoCount1 + homoCount2 + heteroCount = 0;
+----------+
| count(*) |
+----------+
| 0 |
+----------+
mysql> select count(*) from hapmapSnpsCHB where homoCount1 + homoCount2 + heteroCount = 0;
+----------+
| count(*) |
+----------+
| 0 |
+----------+
mysql> select count(*) from hapmapSnpsJPT where homoCount1 + homoCount2 + heteroCount = 0;
+----------+
| count(*) |
+----------+
| 0 |
+----------+
mysql> select count(*) from hapmapSnpsYRI where homoCount1 + homoCount2 + heteroCount = 0;
+----------+
| count(*) |
+----------+
| 0 |
+----------+
mysql> select max(score) from hapmapSnpsCEU;
+------------+
| max(score) |
+------------+
| 500 |
+------------+
# create indexes
mysql> alter table hapmapSnpsCEU add index name (name);
mysql> alter table hapmapSnpsCEU add index chrom (chrom, bin);
mysql> alter table hapmapSnpsCHB add index name (name);
mysql> alter table hapmapSnpsCHB add index chrom (chrom, bin);
mysql> alter table hapmapSnpsJPT add index name (name);
mysql> alter table hapmapSnpsJPT add index chrom (chrom, bin);
mysql> alter table hapmapSnpsYRI add index name (name);
mysql> alter table hapmapSnpsYRI add index chrom (chrom, bin);
# 2nd step in processing: create hapmapSnpsCombined
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmap2 hg17
hgLoadBed hg17 hapmapSnpsCombined -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapSnpsCombined.sql hapmapSnpsCombined.tab
# create indexes (not used by browser)
mysql> alter table hapmapSnpsCombined add index name (name);
mysql> alter table hapmapSnpsCombined add index chrom (chrom, bin);
# errors
# nothing that isn't biallelic
# nothing with mixed positions
# over 500K that were not available in all 4 populations
# YRI: 187,485
# CEU: 129,359
# CHB and JPT: 97,095
# Also, 2 strand corrections done
grep -v missing hapmap2.errors
# different strands for rs1621378
# different strands for rs5768
# cleanup to save space
rm hapmapSnpsCombined.tab
# monomorphism
# YRI       867,835
# CEU     1,252,743
# CHB     1,496,438
# JPT     1,539,094
# combined  607,393
# observed strings
# why is A/T different from other transversions?
# A/G 1,344,043
# C/T 1,344,542
# A/C 352,875
# A/T 275,670
# C/G 354,299
# G/T 354,149
# triallelic 1,370
# quadallelic 403
# other 1,226
# some details on the others:
# 125 -/A/T
# 124 -/A/G
# 107 -/C/T
# 85 -/A/C
# 79 -/G/T
# 25 -/C/G
# 18 -/A/C/T
# 13 -/A/G/T
# 12 -/A/C/G
# 11 -/C/G/T
# 7 (LARGEINSERTION)
# 5 (LARGEDELETION)
# 6 microsat
# 2 het
# check for collisions (more than one SNP at the same location)
# none found
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckCluster2 hg17 hapmapSnpsCombined > snpCheckCluster2.out
# check against hg17.snp125
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapLookup hg17 hapmapSnpsCombined snp125 snp125Exceptions
# 1817 total that are complex type from dbSNP (hapmapLookup.log)
# This is not based on observed string, only on size, class and locType
# 1176 class = mixed
# 616 class = single but locType != exact
# 11 class = named
# 6 class = insertion
# 4 class = deletion
# 2 class = microsat
# 2 class = het
# Generally if class = single the observed string is bi-allelic as expected
# Exceptions to that:
# rs700519 quad-allelic, locType = rangeDeletion
# rs1572672 tri-allelic, locType = between
# rs2357412 tri-allelic, locType = range
# rs2364671 tri-allelic, locType = rangeSubstitution
# rs3959788 quad-allelic, locType = between
# 74 items in hapmapLookup.error
# 59 reverse complement (that's okay)
# 7 multiple alignment (6 from chrX:154,219,000-154,220,500 which is close to PAR)
# Also rs6645103 which is PAR
mysql> select chrom, chromStart, chromEnd, strand, observed, class, locType, weight from snp125 where name = "rs6645103";
+-------------+------------+----------+--------+----------+--------+---------+--------+
| chrom | chromStart | chromEnd | strand | observed | class | locType | weight |
+-------------+------------+----------+--------+----------+--------+---------+--------+
| chrX_random | 273788 | 273789 | - | C/T | single | exact | 3 |
| chrX | 421141 | 421142 | + | C/T | single | exact | 3 |
| chrY | 421141 | 421142 | + | C/T | single | exact | 3 |
+-------------+------------+----------+--------+----------+--------+---------+--------+
# 4 observed with dbSNP complex, hapmap biallelic
# all positive strand, locType = between
# all cluster errors in dbSNP
# rs10485830
# rs7625205 (intronic)
# rs713582
# rs11403115 (class = insertion)
# 3 observed mismatch
# all dbSNP clustering error
# rs2230624 (tri-allelic)
# rs3963317 (monomorphic in hapmap, rangeSubstitution in dbSNP)
# rs5017503 (monomorphic in hapmap)
# a strange one
# rs731449
# dbSNP strand = -, hapmap strand = +
# dbSNP observed = G/T, hapmap observed = C/T
# dbSNP clustering error rs2321451, which is C/T
# hapmap monomorphic for T
# ortho A
# no repeats, no genes, no mRNAs, no conservation
# Counts of rows where 3 populations have one major allele, the 4th has the other
hapmapMixed hg17
# countCEU = 162931
# countCHB = 46543
# countJPT = 48791
# countYRI = 309105
# Generate summary table (used by filters)
# Summary table includes ortho allele and ortho qual score
# Summary table score is heterozygosity
# Individual zygosity is *not* preserved
ssh hgwdev
# 6/25/07: regenerated with mostly-corrected hapmapAllelesChimp
# 1/11/08 angie: regenerated with finally-corrected (I hope) hapmapAllelesChimp
# 1/24/08 angie: regenerated with finally-corrected (I hope!) hapmapAllelesChimp
cd /cluster/data/hg17/bed/hapmap/rel21a
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapSummary hg17 hapmapSnpsCombined hapmapAllelesChimp hapmapAllelesMacaque
hgLoadBed hg17 hapmapAllelesSummary -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapAllelesSummary.sql hapmapSummary.tab -tab
# sanity check
mysql> select count(*) from hapmapAllelesSummary where majorAlleleCountCEU > totalAlleleCountCEU;
+----------+
| count(*) |
+----------+
| 0 |
+----------+
mysql> select count(*) from hapmapAllelesSummary where majorAlleleCountCHB > totalAlleleCountCHB;
+----------+
| count(*) |
+----------+
| 0 |
+----------+
mysql> select max(score) from hapmapAllelesSummary;
+------------+
| max(score) |
+------------+
| 500 |
+------------+
mysql> select count(*), popCount from hapmapAllelesSummary group by popCount;
+----------+----------+
| count(*) | popCount |
+----------+----------+
| 52479 | 1 |
| 72977 | 2 |
| 207643 | 3 |
| 3700478 | 4 |
+----------+----------+
mysql> select count(*), isMixed from hapmapAllelesSummary group by isMixed;
+----------+---------+
| count(*) | isMixed |
+----------+---------+
| 3192896 | NO |
| 840681 | YES |
+----------+---------+
# histogram of heterozygosity:
0 ************************************************************ 883400
25 ************** 204000
50 ************* 188703
75 *********** 157404
100 ********** 143119
125 ********* 131575
150 ********* 126916
175 ********* 128585
200 ******** 123440
225 ******** 119815
250 ******** 120646
275 ******** 120239
300 ******** 122654
325 ********* 128233
350 ********* 130069
375 ********** 144699
400 ********** 152829
425 ************ 172513
450 *************** 225645
475 ********************************** 503166
500 5927
############################################################################################
### HapMap LD (Daryl; February 11, 2006)
## start from the genotypes files, run Haploview, reformat, and load
mkdir -p /san/sanvol1/hg17/bed/hapmap/genotypes/2006-01/non-redundant/para
cd /san/sanvol1/hg17/bed/hapmap/genotypes/2006-01/non-redundant
# wget all genotype data:
# ftp://www.hapmap.org/genotypes/2006-01/non-redundant/genotypes_chr*_*.b35.txt.gz
# Haploview had to be recompiled because there was a missing JPT sample in the ped file
##runHaploview.csh
#!/bin/csh
if ( $#argv < 2 ) then
echo "usage: $0 <absolutePath> <genotypeFileName.gz> [<javaMaxMem>]"
echo " $0 /cluster/bin/foo bar.gz 2G"
exit 1
endif
set path = $1
set file = $2
set root = $file:r
set memFlag = ""
if ( $#argv >= 3 ) then
set memFlag = "-Xmx$3"
endif
cd /scratch
/bin/cp -f $path/$file .
/bin/gunzip -f $file
/usr/java/jre1.5.0_06/bin/java -d64 $memFlag -jar /cluster/home/daryl/haploview/haploview/Haploview.jar -c -d -n -maxDistance 250 -a $root >&! $root.log
/bin/gzip -f $root.LD $root.CHECK >>& $root.log
/bin/mv -f $root.LD.gz $root.CHECK.gz $root.log $path/
/bin/rm -f $root*
###
cd para
set hv = /cluster/home/daryl/scripts/runHaploview.csh
set ldDir = /cluster/store5/gs.18/build35/bed/hapmap/genotypes/2006-01/non-redundant
foreach pop (YRI CEU CHB JPT JPT+CHB)
foreach chrom (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
echo $hv $ldDir genotypes_chr${chrom}_{$pop}.b35.txt.gz 4G >> jobList
end
end
ssh pk
# para create, para try, para push -maxNode=25 ...
#Completed: 120 of 120 jobs
#CPU time in finished jobs: 1564915s 26081.91m 434.70h 18.11d 0.050 y
#IO & Wait Time: 21862s 364.37m 6.07h 0.25d 0.001 y
#Average job time: 13223s 220.39m 3.67h 0.15d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 40742s 679.03m 11.32h 0.47d
#Submission to last job: 104809s 1746.82m 29.11h 1.21d
#### makeDcc.pl
#!/usr/bin/perl -W
$pop = shift || die "usage: makeDcc.pl <pop> <chr>\n";
$chrom = shift || die "usage: makeDcc.pl <pop> <chr>\n";
$geno = "geno/genotypes_${chrom}_${pop}.b35.txt.gz";
$ld = "ld/genotypes_${chrom}_${pop}.b35.txt.LD.gz";
$txt = "dcc/ld_${chrom}_${pop}.b35.txt";
open(GENO,"zcat $geno | " ) || die "can't open $geno";
open(LD, "zcat $ld | " ) || die "can't open $ld";
open(TXT, " > $txt " ) || die "can't open $txt";
<GENO>;#ignore header
while (<GENO>) { @fields = split / /; $pos{$fields[0]} = $fields[3]; }
close(GENO);
<LD>;#ignore header;
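# Haploview .LD columns are assumed to be: marker1 marker2 D' LOD r^2 ...;
# note the reordering below: output is D' ($fields[2]), r^2 ($fields[4]), LOD ($fields[3])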
while (<LD>) { @fields = split /\t/; $chromStart = $pos{$fields[0]}; $chromEnd = $pos{$fields[1]};
print TXT "$chromStart $chromEnd $pop $fields[0] $fields[1] $fields[2] $fields[4] $fields[3]\n"; }
close(LD);
close(TXT);
system("gzip $txt");
####
#### makeDcc.csh
#!/bin/csh
#set path = "/cluster/home/daryl/scripts";
set path = ".";
foreach pop (CEU CHB JPT YRI JPT+CHB)
foreach chr (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo $path/makeDcc.pl $pop chr$chr
end
end
####
#### makeLdBed.pl
#!/usr/bin/perl -W
sub min ($$)
{
my $a = shift @_;
my $b = shift @_;
if ($a<$b) {return $a;}
return $b;
}
sub encodeDprime($)
{
my $val = shift @_;
if ( ($val > 1) || ($val < -1) ) { die "Dprime value ($val) is out of range [-1,1]";}
elsif ($val>=0) { $ret = ord('a') + $val*9;}
else { $ret = ord('A') - $val*9;}
return chr($ret);
}
sub encodeRsquared($)
{
my $val = shift @_;
if ( ($val > 1) || ($val < 0) ) { die "R^2 value ($val) is out of range [0,1]";}
return encodeDprime($val);
}
sub encodeLod($$)
{
my $lod = shift @_;
my $dPrime = shift @_;
$ret = ord('a');
if ($lod>=2) # high LOD
{
if (abs($dPrime)<0.5) { $ret = ord('y'); } # high LOD, low D' -> pink
else { $ret += min((int($lod-abs($dPrime)-1.5)), 9) ;}
}
elsif (abs($dPrime)>0.99) { $ret = ord('z'); } # high D', low LOD -> blue
return chr($ret);
}
$inDir = shift||"data";
$outDir = shift||"bed";
$foo = "";
$bar = "";
@rest = ();
@pops = ("CEU", "CHB", "JPT", "YRI", "JPT+CHB");
printf("> Starting \t" . `date` . "\n");
foreach $pop (@pops)
{
opendir(DIR, $inDir) || die "can't open $inDir";
if ($pop eq "JPT+CHB") { @hmFiles = grep {/^ld_/ && /_JPT/ && /CHB.b35.txt.gz$/} readdir(DIR); }
else { @hmFiles = grep {/^ld_/ && /_${pop}.b35.txt.gz$/} readdir(DIR); }
closedir(DIR);
printf "POP:\t$pop\t$#hmFiles\n";
foreach $hmFile (sort @hmFiles)
{
($foo, $chrom, $bar) = split /_/, $hmFile;
$chrom =~ s/chrx/chrX/;
$chrom =~ s/chry/chrY/;
$outfile = "$outDir/${pop}_${chrom}.hg17.bed";
if ((-e $outfile)||(-e "$outfile.gz")) { next; }
$tmpFile = "/tmp/${pop}_${chrom}.hg17.bed";
printf("$inDir/$hmFile => $outfile.gz\t" . `date`);
open(OUT, "> $tmpFile" ) || die "can't open $tmpFile";
open(IN, "zcat $inDir/$hmFile | " ) || die "can't open $inDir/$hmFile";
$line = <IN>;
if (!defined $line){next;}
chomp($line);
($chromStart, $chromEnd, $pop, $name, $marker2, $dprime, $rsquared, $lod, @rest) = split / /, $line;
$ldCount = 1;
while (<IN>)
{
chomp();
($chromStartNew, $chromEndNew, $pop, $nameNew, $marker2, $dprime, $rsquared, $lod, @rest) = split / /;
if ($chromStart ne $chromStartNew)
{
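# positions in the DCC files are 1-based; BED chromStart is 0-based, hence the decrement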
$chromStart--;
printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
$chromStart = $chromStartNew;
$chromEnd = $chromEndNew;
$name = $nameNew;
$ldCount = 1;
$dprimeList = encodeDprime($dprime);
$rsquaredList = encodeRsquared($rsquared);
$lodList = encodeLod($lod, $dprime);
}
elsif ($chromEndNew-$chromStartNew<250000)
{
$chromEnd = $chromEndNew;
$ldCount++;
$dprimeList .= encodeDprime($dprime);
$rsquaredList .= encodeRsquared($rsquared);
$lodList .= encodeLod($lod, $dprime);
}
}
close(IN);
$chromStart--;
printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
close(OUT);
system("gzip $tmpFile");
system("mv $tmpFile.gz $outDir");
}
}
printf("> Finished \t" . `date` . "\n");
####
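# For reference, makeLdBed.pl packs one character per SNP pair: D' in [0,1]
# maps to 'a'..'j', negative D' to 'A'..'J', and r^2 uses the same positive
# mapping.  A minimal sketch of that mapping (illustrative values only):
perl -e 'sub enc {chr(ord("a") + 9*$_[0])} printf "%s %s %s\n", enc(0), enc(0.5), enc(1);'
# a e j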
#### getMax.csh -- check for consistency by chrom and population
#!/bin/csh
set out = maxDist.txt
rm -f $out
touch $out
echo this takes about 4 hours to run completely >> $out
foreach f (dcc/ld_*.b35.txt.gz)
echo -n "$f " >> $out
zcat $f | awk '{if ($2-$1>max) max=$2-$1} END {print max}' >> $out
end
#### getSizes.csh -- should all be 249999
#!/bin/csh
set out = wcList.txt
rm -f $out
touch $out
echo "this takes about 2 hours to run completely"
foreach f (dcc/*.txt.gz)
echo -n $f:r:r " " | sed 's/ld_//;s/chr//;s/_/\t/' >> $out
zcat $f | cut -f1 -d " " | uniq | wc -l >> $out
end
#### load.csh
#!/bin/csh
set db = hg17
sed 's/hapmapLd/hapmapLdCeu/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
sed 's/hapmapLd/hapmapLdChb/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
sed 's/hapmapLd/hapmapLdJpt/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
sed 's/hapmapLd/hapmapLdYri/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
sed 's/hapmapLd/hapmapLdChbJpt/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql ${db}
# about half an hour to an hour per population
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
hgLoadBed -noSort -oldTable -strict ${db} hapmapLdCeu CEU_chr${c}.${db}.bed.gz
hgLoadBed -noSort -oldTable -strict ${db} hapmapLdChb CHB_chr${c}.${db}.bed.gz
hgLoadBed -noSort -oldTable -strict ${db} hapmapLdJpt JPT_chr${c}.${db}.bed.gz
hgLoadBed -noSort -oldTable -strict ${db} hapmapLdYri YRI_chr${c}.${db}.bed.gz
hgLoadBed -noSort -oldTable -strict ${db} hapmapLdChbJpt JPT+CHB_chr${c}.${db}.bed.gz
end
rm -f bed.tab
###
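# (optional) spot-check row counts in the five LD tables loaded above:
foreach t (hapmapLdCeu hapmapLdChb hapmapLdJpt hapmapLdYri hapmapLdChbJpt)
    hgsql hg17 -e "select count(*) from $t"
end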
# AFFYHUEX1 TRACK (sugnet Wed Oct 5 12:16:42 PDT 2005)
mkdir hg17
cd hg17
pwd
# /cluster/store1/sugnet/affymetrixHumanAllExon/hg17
mkdir gff beds annot
cd gff
# download gff design files
# parse gff script...
#!/usr/bin/perl -w
if(scalar(@ARGV) == 0) {
print STDERR "parseGff.pl - Parse out affymetrixes gff annotation
probesets for human all exon design.
usage:
parseGff.pl file1.design.gff file2.design.gff ... fileN.design.gff
";
exit(1);
}
sub splitField($) {
my $l = shift(@_);
my @w = split / /, $l;
return $w[1];
}
while($file = shift(@ARGV)) {
if(!($file =~ /(.+)\.gff/)) {
die "$file doesn't have .gff suffix\n";
}
$prefix = $1;
print STDERR "Doing file $file.\n";
open(IN, $file) or die "Can't open $file to read.";
open(BED, ">../beds/$prefix.pset.bed") or die "Can't open ../beds/$prefix.pset.bed to write.";
open(ANNOT, ">../annot/$prefix.tab") or die "Can't open ../annot/$prefix.tab to write.";
while($line = <IN>) {
# Only want the probeset records.
if($line =~ /\tprobeset\t/) {
$score = 0;
$cds = 0;
$bounded = 0;
chomp($line);
# strip any Microsoft line endings.
$line =~ s/\r$//;
@words = split /\t/, $line;
# This makes the evidence comma-separated.
$words[8] =~ s/\" \"/,/g;
# This gets rid of pesky quotes.
$words[8] =~ s/\"//g;
# Set the score based on the annotation type
if($words[8] =~ /full/) {
$score = 200;
}
elsif($words[8] =~ /extended/) {
$score = 500;
}
elsif($words[8] =~ /core/) {
$score = 900;
}
if($words[8] =~ /bounded/) {
$score -= 200;
}
if($words[8] =~ /cds/) {
$score += 100;
}
if($score <= 0) {
$score = 100;
}
# Print out the annotation fields.
@fields = split /; /,$words[8];
$id = splitField($fields[1]);
$f = shift(@fields);
$f = splitField($f);
print ANNOT "$f";
while($f = shift(@fields)) {
if($f =~ /^bounded/) {
$bounded = 1;
}
if($f =~ /^cds/) {
$cds = 1;
}
if(!($f =~ /^bounded/ || $f =~ /^cds/)) {
$f = splitField($f);
print ANNOT "\t$f";
}
}
print ANNOT "\t$bounded\t$cds";
print ANNOT "\n";
print BED "$words[0]\t$words[3]\t$words[4]\t$id\t$score\t$words[6]\n";
}
}
close(IN);
close(BED);
close(ANNOT);
}
./parseGff.pl *.gff
cat beds/*.bed > affyHuEx1.bed
hgLoadBed hg17 affyHuEx1 affyHuEx1.bed -strict
cat annot/*.tab > affyHuEx1.annot.tab
# Contents of affyHuEx1Annot.sql file
CREATE TABLE affyHuEx1Annot (
numIndependentProbes smallint not null,
probesetId int(11) not null,
exonClustId int(11) not null,
numNonOverlapProbes smallint not null,
probeCount smallint not null,
transcriptClustId int(11) not null,
probesetType smallint not null,
numXHybeProbe smallint not null,
psrId int(11) not null,
level varchar(10) not null,
evidence varchar(255) not null,
bounded smallint not null,
cds smallint not null,
PRIMARY KEY (probesetId)
);
hg17S -A < affyHuEx1Annot.sql
echo "load data local infile 'affyHuEx1.annot.tab' into table affyHuEx1Annot;" | hg17S -A
# end AFFYHUEX1 track
##########################################################################
# AFFY HUEX1 OFF-BY-ONE FIX (Andy 2006-12-14)
ssh hgwdev
cd /cluster/data/hg17/bed/affyHumanExon
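# (the original load kept the GFF's 1-based starts; BED chromStart is 0-based,
# hence the $3-1 in the awk below)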
echo "select * from affyHuEx1" | hgsql hg17 | \
tail +2 | awk 'BEGIN{OFS="\t"}{print $2,$3-1,$4,$5,$6,$7}' \
> affyHuEx1.fixed.bed
hgLoadBed hg17 affyHuEx1 affyHuEx1.fixed.bed
##########################################################################
# NSCAN composite track - (2005-09-29 markd) loaded proteins 2005-10-13
cd /cluster/data/hg17/bed/nscan/
# obtained NSCAN and NSCAN-EST predictions from michael brent's group
# at WUSTL
wget http://genome.cse.wustl.edu/predictions/human/hg17_nscan_mm5_9_14_2005/hg17_nscan_mm5_9_14_2005.tar.gz
tar -zxf hg17_nscan_mm5_9_14_2005.tar.gz
wget http://genome.cse.wustl.edu/predictions/human/NCBI35_NSCAN_EST_4-16-2005.tar
gzip -9 NCBI35_NSCAN_EST_4-16-2005.tar
# change protein fasta file to have transcript id in header
foreach f (chr_ptx/*.ptx)
awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
end
foreach f (NCBI35_NSCAN_EST_4-16-2005/chr_ptx/*.ptx)
awk '/^>/{$0=$1".a"}{print $0}' $f >$f.fix
end
# load tracks. Note that these have *utr features, rather than
# exon features.  Currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -gtf -genePredExt hg17 nscanGene chr_gtf/chr*.gtf
hgPepPred hg17 generic nscanPep chr_ptx/chr*.fix
rm -rf chr_* *.tab
ldHgGene -gtf -genePredExt hg17 nscanEstGene NCBI35_NSCAN_EST_4-16-2005/chr_gtf/chr*.gtf
hgPepPred hg17 generic nscanEstPep NCBI35_NSCAN_EST_4-16-2005/chr_ptx/chr*.fix
rm -rf NCBI35_NSCAN_EST_4-16-2005 *.tab
# update trackDb; need a hg17-specific page to describe informants
human/hg17/nscan.html
human/hg17/trackDb.ra
##########################################################################
# NHGRI DNASE I HYPERSENSITIVE SITES (2005-10-05 kate)
# Submitted by Greg Crawford via web site,
# http://research.nhgri.nih.gov/DNaseHS/May2005/
# In addition, a file containing the 'randoms' was FTP'ed by Greg
# Submitted for hg16 -- lifted to hg17.
# Details of hg16 data prep are in makeHg16.doc
mkdir /cluster/data/hg17/bed/nhgri
cd /cluster/data/hg17/bed/nhgri
cp /cluster/data/hg16/bed/nhgri/hs.bed hs.hg16.bed
liftOver hs.hg16.bed /gbdb/hg16/liftOver/hg16ToHg17.over.chain \
hs.hg17.bed hs.unmapped
grep '^chr' hs.unmapped | wc -l
# 8 unmapped
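# (liftOver's .unmapped output pairs a '#reason' comment line with each failed
# record, so the grep '^chr' above counts only the records themselves)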
hgLoadBed hg17 nhgriDnaseHs hs.hg17.bed
# Loaded 14216 elements of size 5
checkTableCoords hg17 nhgriDnaseHs
##########################################################################
# UPDATE WGRNA TRACK (DONE, 2005-10-20, Fan)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir wgRna-2005-10-20
cd wgRna-2005-10-20
# Received the data file, wgtrack_no_bin_oct2005.txt, from Michel Weber's email
# (Michel.Weber@ibcg.biotoul.fr)
# and place it under cd /cluster/data/hg17/bed/wgRna-2005-10-20.
cp wgtrack_no_bin_oct2005.txt wgRna.tab
vi wgRna.tab
# edit wgRna.tab to take out the first line of data field labels.
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# Compared to 8/24/05 data, a few records were changed.
##########################################################################
# REBUILD hg17.gnfAtlas2Distance TABLE. SOMEHOW IT HAD MUCH FEWER RECORDS. (DONE 10/27/05, Fan)
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg17 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# Create expression distance table - takes about an hour
hgExpDistance hg17 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2 &
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg17
# row count changed to 32458000
##########################################################################
# BUILD ALLEN BRAIN TRACK (DONE 10/29/05 JK)
# Make the working directory
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir allenBrain
cd allenBrain
# Remap the probe alignments from mm7 to hg17
zcat /cluster/data/mm7/bed/bedOver/mm7.hg17.over.chain.gz \
| pslMap -chainMapFile -swapMap \
/cluster/data/mm7/bed/allenBrain/allenBrainAli.psl stdin stdout \
| sort -k 14,14 -k 16,16n > unscored.psl
pslRecalcMatch unscored.psl /cluster/data/hg17/nib \
/cluster/data/mm7/bed/allenBrain/allProbes.fa allenBrainAli.psl
# Load the database
hgsql hg17 < ~/kent/src/hg/lib/allenBrainUrl.sql
hgsql hg17 -e 'load data local infile "/cluster/data/mm7/bed/allenBrain/allenBrainUrl.tab" into table allenBrainUrl;'
hgLoadPsl hg17 allenBrainAli.psl
mkdir /gbdb/hg17/allenBrain
ln -s /cluster/data/mm7/bed/allenBrain/allProbes.fa /gbdb/hg17/allenBrain/allProbes.fa
hgLoadSeq hg17 /gbdb/hg17/allenBrain/allProbes.fa
# Make mapping between known genes and allenBrain
hgMapToGene hg17 allenBrainAli -type=psl knownGene knownToAllenBrain
##########################################################################
# BUILD NIBB IMAGE PROBES (DONE 11/07/05 JK)
# Make directory on san for cluster job and copy in sequence
ssh pk
mkdir /san/sanvol1/scratch/hg17/nibbPics
cd /san/sanvol1/scratch/hg17/nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
# Make parasol job dir and sequence list files
mkdir run
cd run
mkdir psl
ls -1 /san/sanvol1/scratch/hg17/nib/*.nib > genome.lst
echo ../nibbImageProbes.fa > rna.lst
# Create parasol gensub template file
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
#ENDLOOP
'_EOF_'
# Create parasol batch
gensub2 genome.lst rna.lst gsub spec
para create spec
# Do para try/push/time etc.
#Completed: 46 of 46 jobs
#CPU time in finished jobs: 11818s 196.97m 3.28h 0.14d 0.000 y
#IO & Wait Time: 145s 2.41m 0.04h 0.00d 0.000 y
#Average job time: 260s 4.33m 0.07h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1022s 17.03m 0.28h 0.01d
#Submission to last job: 1060s 17.67m 0.29h 0.01d
# Make sort and filter
catDir psl | sort -k 10 \
| pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
| sort -k 14,14 -k 16,16n \
| sed 's/..\/..\/nib\/chr/chr/' \
| sed 's/.nib//' > ../nibbImageProbes.psl
# Make bed file and copy in stuff
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir nibbPics
cd nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
cp /san/sanvol1/scratch/hg17/nibbPics/nibbImageProbes.psl .
# Load into database
ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa /gbdb/hg17/nibbImageProbes.fa
hgLoadSeq hg17 /gbdb/hg17/nibbImageProbes.fa
hgLoadPsl hg17 nibbImageProbes.psl
###########################################################################
# EXONIPHY WITH DOG (acs, 11/22/05) -- MM7, RN3, CANFAM2, HG17
# first build 4-way multiz alignment from syntenic nets (helps reduce
# false positive predictions due to paralogous alignments)
# (prepare mafNet files from syntenic nets and copy to
# /cluster/bluearc/hg17/mafNetSyn; do this for mm7, rn3, canFam2,
# and galGal2)
# make output dir and run dir
ssh pk
cd /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2
mkdir -p mafSyn runSyn
cd runSyn
# create scripts to run multiz on cluster
cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set multi = /scratch/$user/multiz.hg17Mm7Rn3CanFam2.$c
set pairs = /cluster/bluearc/hg17/mafNetSyn
# special mode --
# with 1 arg, cleanup
if ($#argv == 1) then
rm -fr $multi
exit
endif
# special mode --
# with 3 args, saves an alignment file
if ($#argv == 3) then
cp $multi/$2/$c.maf $3
exit
endif
set s1 = $2
set s2 = $3
set flag = $4
# locate input files -- in pairwise dir, or multiple dir
set d1 = $multi
set d2 = $multi
if (-d $pairs/$s1) then
set d1 = $pairs
endif
if (-d $pairs/$s2) then
set d2 = $pairs
endif
set f1 = $d1/$s1/$c.maf
set f2 = $d2/$s2/$c.maf
# write to output dir
set out = $multi/${s1}${s2}
mkdir -p $out
# check for empty input file
if (-s $f1 && -s $f2) then
echo "Aligning $f1 $f2 $flag"
/cluster/bin/penn/multiz.v10.5 $f1 $f2 $flag > $out/$c.tmp.maf
echo "Ordering $c.maf"
/cluster/bin/penn/maf_project $out/$c.tmp.maf hg17.$c > $out/$c.maf
else if (-s $f1) then
cp $f1 $out
else if (-s $f2) then
cp $f2 $out
endif
'EOF'
# << for emacs
chmod +x oneMultiz.csh
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
oneMultiz.csh $c mm7 rn3 0
oneMultiz.csh $c mm7rn3 canFam2 1
# get final alignment file
oneMultiz.csh $c mm7rn3canFam2 /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2/mafSyn/$c.maf
#cleanup
oneMultiz.csh $c
'EOF'
# << for emacs
chmod +x allMultiz.csh
cat > gsub << 'EOF'
#LOOP
allMultiz.csh $(root1) {check out line+ /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2/mafSyn/$(root1).maf}
#ENDLOOP
'EOF'
# << for emacs
cut -f 1 /cluster/data/hg17/chrom.sizes > chrom.lst
set path = (/parasol/bin $path);rehash
gensub2 chrom.lst single gsub jobList
para create jobList
# 46 jobs
para try; para check
para push
# build chromosome-by-chromosome SS files
cd /cluster/data/hg17/bed/multiz.hg17Mm7Rn3CanFam2
mkdir run-ss-syn
cd run-ss-syn
mkdir -p /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn
cat > makeSS.csh << 'EOF'
#!/bin/csh -fe
set c = $1
/cluster/bin/phast/msa_view -i MAF -o SS /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/mafSyn/$c.maf --refseq /cluster/bluearc/hg17/chrom/$c.fa | gzip -c > /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn/$c.ss.gz
'EOF'
# << for emacs
chmod +x makeSS.csh
rm -f jobList
foreach chr (`cut -f 1 /cluster/data/hg17/chrom.sizes`)
echo "makeSS.csh $chr" >> jobList
end
para create jobList
# 46 jobs
para try; para check
para push
# now train hmm, with indel model
# note: commands below require bash
# first get a clean set of genes for training (with --indel-strict)
mkdir -p /cluster/data/hg17/bed/exoniphy/train
cd /cluster/data/hg17/bed/exoniphy/train
mkdir -p stats genes
CHROMS="chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22"
for chr in ${CHROMS} ; do
echo $chr
zcat /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn/$chr.ss.gz | clean_genes genes/refGene.$chr.gff - --stats stats/$chr.stats --conserved --indel-strict --groupby exon_id --offset3 4 --offset5 4 > genes/refGene.$chr.clean.gff
done
# get conserved noncoding seqs and add to GFFs
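# (note the $2+1 in the awk below: BED starts are 0-based, GFF starts are 1-based)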
mkdir -p cns
for chr in ${CHROMS} ; do
echo $chr
featureBits -bed=cns/$chr.bed -chrom=$chr hg17 phastConsElementsPaper \!knownGene:exon:100 \!refGene:exon:100 \!mrna \!ensGene \!intronEst \!twinscan
cp genes/refGene.$chr.clean.gff genes/refGene.$chr.withCNS.gff
awk '{printf "%s\tphastCons\tCNS\t%d\t%d\t.\t.\t.\texon_id \"CNS.%s\"\n", $1, $2+1, $3, $4}' cns/$chr.bed >> genes/refGene.$chr.withCNS.gff
done
# now train HMM
# note: actually have to unzip SS files before this step
rm -f alns gffs
for chr in ${CHROMS} ; do
echo /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSyn/$chr.ss >> alns
echo genes/refGene.$chr.withCNS.gff >> gffs
done
hmm_train -m '*alns' -c ~/phast/data/exoniphy/default.cm -g '*gffs' -R exon_id -i SS -I CDS,background,CNS,5\'splice,3\'splice,prestart -t "((hg17,(mm7,rn3)),canFam2)" > indels.hmm
# training complete; now run exoniphy genome-wide
# first need to split up alignments
mkdir -p /cluster/data/hg17/bed/exoniphy/test/run-split
cd /cluster/data/hg17/bed/exoniphy/test/run-split
cat > doSplit.csh << 'EOF'
#!/bin/csh -fe
set c = $1
mkdir -p /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/$c
/cluster/bin/phast/msa_split /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/mafSyn/$c.maf --refseq /cluster/bluearc/hg17/chrom/$c.fa -i MAF --windows 100000,0 --between-blocks 5000 --min-informative 1000 --out-format SS --out-root /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/$c/$c --tuple-size 3
gzip /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/$c/$c*.ss
'EOF'
# << for emacs
chmod +x doSplit.csh
rm -f jobList
for file in /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/mafSyn/chr*.maf ; do echo doSplit.csh `basename $file .maf` >> jobList ; done
para create jobList
# 43 jobs
para try; para check
para push
# now set up exoniphy run
mkdir -p /cluster/data/hg17/bed/exoniphy/test/run-exoniphy
cd /cluster/data/hg17/bed/exoniphy/test/run-exoniphy
cp -p ../../train/indels.hmm /cluster/bluearc/hg17/exoniphy/training
mkdir -p /cluster/bluearc/hg17/exoniphy/GFF
cat > doExoniphy.sh << 'EOF'
#!/usr/local/bin/bash
root=`basename $1 .ss.gz`
chrom=`echo $root | awk -F\. '{print $1}'`
no=`echo $root | awk 'BEGIN{FS="[-.]"} {printf "%d\n", ($2+10000)/100000}'`
if [ ! -d /cluster/bluearc/hg17/exoniphy/GFF/$chrom ] ; then
mkdir -p /cluster/bluearc/hg17/exoniphy/GFF/$chrom
fi
zcat $1 | /cluster/bin/phast/exoniphy - --hmm /cluster/bluearc/hg17/exoniphy/training/indels.hmm --reflect-strand --extrapolate default --score --indels --alias "hg17=human; mm7=mouse; rn3=rat; canFam2=dog" --seqname $chrom --idpref $chrom.$no > /cluster/bluearc/hg17/exoniphy/GFF/$chrom/$root.gff
'EOF'
# << for emacs
chmod +x doExoniphy.sh
rm -f jobList
for dir in /cluster/bluearc/hg17/multiz.hg17Mm7Rn3CanFam2/ssSynFrags/* ; do find $dir -name '*.ss.gz' | awk '{printf "doExoniphy.sh %s\n", $1}' >> jobList ; done
para create jobList
# 27070 jobs
para try; para check
para push
#Completed: 27059 of 27070 jobs
#Crashed: 11 jobs
#CPU time in finished jobs: 8573545s 142892.41m 2381.54h 99.23d 0.272 y
#IO & Wait Time: 73412s 1223.54m 20.39h 0.85d 0.002 y
#Average job time: 320s 5.33m 0.09h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 593s 9.88m 0.16h 0.01d
#Submission to last job: 22823s 380.38m 6.34h 0.26d
# crashed jobs all on random chroms, chrM, etc., and appear to be
# due to all species not being present; okay to ignore
# collect predictions and create track
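# (the files.* batching below keeps each cat invocation safely under the
# shell's argument-list limit; some chroms have thousands of GFF fragments)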
rm -f exoniphy.gff
for dir in /cluster/bluearc/hg17/exoniphy/GFF/chr* ; do \
rm -f files.* tmp.gff ;\
find $dir -name "chr*.gff" > files ;\
split -l 1000 files files. ;\
for l in files.* ; do cat `cat $l` >> tmp.gff ; done ;\
refeature --sort tmp.gff >> exoniphy.gff ;\
done
ldHgGene -genePredExt -gtf hg17 exoniphyDog exoniphy.gff
##########################################################################
# COW SYNTENY (Done, Heather, Dec. 2005)
# Data from Harris A. Lewin <h-lewin@uiuc.edu>
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir syntenyCow
cd syntenyCow
hgLoadBed -noBin hg17 syntenyCow syntenyCow.bed
# add to kent/src/hg/makeDb/trackDb/human/hg17/trackDb.ra
###########################################################################
# New Conservation track (WORKING 2005-12-15 kate)
# Pairwise alignments needed for: monDom2, danRer3, bosTau2
# Use existing alignments for:
# macaque_rheMac1
# rat_rn3
# mouse_mm7
# dog_canFam2
# chicken_galGal2
# xenopus_xenTro1
# fugu_fr1
# rabbit_oryCun1
# armadillo_dasNov1
# elephant_loxAfr1
# tenrec_echTel1
# tetraodon_tetNig1
#########################################################################
# BLASTZ danRer3 (DONE - 2005-12-20 kate)
# Includes both randoms
ssh pk
mkdir /cluster/data/hg17/bed/blastz.danRer3.2005-12-20
cd /cluster/data/hg17/bed
ln -s blastz.danRer3.2005-12-20 blastz.danRer3
cd blastz.danRer3
cat << 'EOF' > DEF
# human target, zebrafish query
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# use parameters suggested for human-fish evolutionary distance
# recommended in doBlastzChainNet.pl help
# (previously used for hg16-fr1, danrer1-mm5)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
# TARGET: Human hg17
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_SMSK=/cluster/bluearc/hg17/linSpecRep.notInZebrafish
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/san/sanvol1/scratch/hg17/chrom.sizes
# QUERY: zebrafish danRer3
# Use all chroms, including both randoms (chrUn and chrNA)
SEQ2_DIR=/san/sanvol1/scratch/danRer3/nib
SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
SEQ2_LEN=/cluster/bluearc/danRer3/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=1000
BASE=/cluster/data/hg17/bed/blastz.danRer3.2005-12-20
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=net \
`pwd`/DEF >& blastz.out &
# mistakenly started this in blastz.danRer3.2005-12-18 dir --
# need to move DEF file and blastz.out to 2005-12-20 dir.
# bogus stop at net step -- thinks it can't find chains
# I'm just restarting there
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net \
`pwd`/DEF >& blastz.2.out &
# stopped because vsDanRer3 downloads already there from
# previous run.
ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/hg17/vsDanRer3"
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download \
`pwd`/DEF >& blastz.3.out &
# measurements
ssh hgwdev "featureBits hg17 chainDanRer2Link" >& danRer2.fb; cat danRer2.fb
# 70696998 bases of 2866216770 (2.467%) in intersection
ssh hgwdev "featureBits hg17 chainDanRer3Link" >& danRer3.fb; cat danRer3.fb
# 55625762 bases of 2866216770 (1.941%) in intersection
# not sure why there's lower coverage from the newer assembly.
# It's possibly due to different parameters used in the other
# alignment. Rachel is experimenting with hg18/danRer3, and
# if warranted, we might replace this later
#########################################################################
# BLASTZ bosTau2 (DONE - 2005-12-19 kate)
ssh pk
mkdir /cluster/data/hg17/bed/blastz.bosTau2.2005-12-19
cd /cluster/data/hg17/bed
rm blastz.bosTau2
ln -s blastz.bosTau2.2005-12-19 blastz.bosTau2
cd blastz.bosTau2
cat << 'EOF' > DEF
# human vs. cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.x86_64
# using the parameter recommended when not using lineage-specific repeat
# abridging.  This parameter restricts the number of matches used by
# dynamic masking. (We can't currently use LSR repeat abridging
# when either assembly sequence is in .2bit).
BLASTZ_M=50
# TARGET: Human (hg17)
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_LEN=/san/sanvol1/scratch/hg17/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow (bosTau2)
# chunk it as we can't do whole-genome on 2bits
SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit
SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=10000
BASE=/cluster/data/hg17/bed/blastz.bosTau2.2005-12-19
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
# use chain parameters for "close" species
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF >& blastz.out &
ssh hgwdev "featureBits hg17 chainBosTau1Link" >& bosTau1.fb; cat bosTau1.fb
ssh hgwdev "featureBits hg17 chainBosTau2Link" >& bosTau2.fb; cat bosTau2.fb
# swapping to get the lift over file in the other direction (Hiram)
ssh pk
mkdir /cluster/data/bosTau2/bed/blastz.hg17.swap
cd /cluster/data/bosTau2/bed
ln -s blastz.hg17.swap blastz.hg17
cd blastz.hg17.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
/cluster/data/hg17/bed/blastz.bosTau2.2005-12-19/DEF > swap.out 2>&1 &
# this failed during the load of the tables, but that is OK, we
# just wanted the liftOver files from this
# manually cleaned this up since the run failed during the MySQL
# load due to out-of-space problems.  These tables do not need to
# be loaded anyway.
ssh kkstore02
cd /cluster/data/bosTau2/bed/blastz.hg17.swap
rm -fr psl/
rm -fr axtChain/run/chain/
rm -f axtChain/noClass.net
rm -fr axtChain/net/
rm -fr axtChain/chain/
#########################################################################
# BLASTZ rheMac2 (2006-02-08 kate)
ssh pk
mkdir /cluster/data/hg17/bed/blastz.rheMac2.2006-02-08
cd /cluster/data/hg17/bed
ln -s blastz.rheMac2.2006-02-08 blastz.rheMac2
cd blastz.rheMac2
cat << 'EOF' > DEF
# macaca mulatta vs. hg17
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
# TARGET - hg17
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/san/sanvol1/scratch/hg17/chrom.sizes
# QUERY - macaca mulatta
SEQ2_DIR=/san/sanvol1/scratch/rheMac2/rheMac2.2bit
SEQ2_CHUNK=5000000
SEQ2_LAP=0
SEQ2_LEN=/san/sanvol1/scratch/rheMac2/rheMac2.sizes
BASE=/san/sanvol1/scratch/hg17/blastz.rheMac2/
RAW=$BASE/raw
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
# use chain parameters for "close" species
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF >& blastz.out &
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=chainRun \
`pwd`/DEF >& continueChainRun.out &
# NOTE: must set -fileServer (e.g. to pk) if using base dir on SAN
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -fileServer=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-continue=chainMerge \
`pwd`/DEF >& continueChainMerge.out &
# netClass was crashing as it expected a bin in the
# unsplit gap table. Robert added the bin field.
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -fileServer=pk \
-continue=download \
`pwd`/DEF >& continueDownload.out &
ssh hgwdev "featureBits hg17 chainRheMac1Link" >& rheMac1.fb; cat rheMac1.fb
ssh hgwdev "featureBits hg17 chainRheMac2Link" >& rheMac2.fb; cat rheMac2.fb
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.rheMac2
cp -rp mafNet /san/sanvol1/scratch/hg17/mafNet/rheMac2
# SWAP CHAIN AND NET ALIGNMENTS OVER TO RHESUS (rheMac2)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS
# (DONE, 2006-03-22, hartera)
# Do the swap of hg17/rheMac2 alignments over to rheMac2 to produce
# rheMac2/hg17 alignments.
ssh pk
cd /cluster/data/hg17/bed/blastz.rheMac2
# use chain parameters for "close" species
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF >& swap.log &
# Took about 3 hours 40 minutes to run.
#############################################################################
# 17-WAY MULTIZ ALIGNMENTS (DONE - 2005-12-20 kate)
# # redo fix overlaps from xenTro1 and tetNig1 (2006-04-08 kate)
# copy net mafs to cluster-friendly storage for multiz run (2006-01-25 kate)
ssh kkstore01
cd /cluster/data/hg17/bed/blastz.monDom2
cp -rp mafNet /san/sanvol1/scratch/hg17/mafNet/monDom2
ssh kkstore02
cd /cluster/data/hg17/bed
mkdir -p multiz17way.2005-12-20
ln -s multiz17way.2005-12-20 multiz17way
cd multiz17way
# copy MAF's to cluster-friendly server
# These MAF's already on bluearc:
# canFam2, fr1, galGal2, panTro1, rn3
mkdir -p /san/sanvol1/scratch/hg17/mafNet
cd /san/sanvol1/scratch/hg17/mafNet
ln -s /cluster/bluearc/hg17/mafNet/{*} .
# copy others
foreach s (rheMac1 oryCun1 dasNov1 \
loxAfr1 bosTau2 monDom1 xenTro1 tetNig1 danRer3)
echo $s
cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
end
# a few more
set s = echTel1
cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
set s = mm7
cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
set s = canFam2
cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
set s = rheMac2
cp -Rp /cluster/data/hg17/bed/blastz.$s/mafNet $s
# thanks for the tree, Hiram! Taken from mm7 17way...
# Hiram says this is derived from the latest ENCODE
# tree, with some species removed and branch lengths
# adjusted. The ENCODE tree from the Sept. freeze is:
# ftp://kronos.nhgri.nih.gov/pub/outgoing/elliott/msa/SEP-2005/phylo/tree_4d.tba.v2.nh
cd /cluster/data/hg17/bed/multiz17way
cat << '_EOF_' > 17way.nh
(((((((((
(human_hg17:0.006690,chimp_panTro1:0.007571):0.024272,
macaque_rheMac2:0.0592):0.023960,
((rat_rn3:0.081728,mouse_mm7:0.077017):0.229273,
rabbit_oryCun1:0.206767):0.1065):0.023026,
(cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505,
armadillo_dasNov1:0.149862):0.015994,
(elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400,
monodelphis_monDom2:0.371073):0.189124,
chicken_galGal2:0.454691):0.123297,
xenopus_xenTro1:0.782453):0.156067,
((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961,
zebrafish_danRer3:0.782561):0.156067);
'_EOF_'
/cluster/bin/phast/draw_tree 17way.nh > 17way.ps
/cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt
grep hg17 17way.distances.txt | sort -k3,3n | \
awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
# edit distances.txt to include featureBits, and chain parameters
# from blastz run.
cat distances.txt
# 0.0143 chimp_panTro1
# 0.0902 macaque_rheMac2
# 0.2563 armadillo_dasNov1
# 0.2651 dog_canFam2
# 0.2677 elephant_loxAfr1
# 0.2766 cow_bosTau2
# 0.3682 rabbit_oryCun1
# 0.4226 tenrec_echTel1
# 0.4677 mouse_mm7
# 0.4724 rat_rn3
# use loose chain params and score from here, down (5000)
# 0.7119 monodelphis_monDom2
# 0.9847 chicken_galGal2
# 1.4357 xenopus_xenTro1
# 1.6577 tetraodon_tetNig1
# 1.6983 fugu_fr1
# 1.7480 zebrafish_danRer3
# the order in the browser display will be by tree topology,
# not by distance, so it will be:
# >> # 0.0143 chimp_panTro1
# >> # 0.0902 macaque_rheMac2
# >> # 0.4677 mouse_mm7
# >> # 0.4724 rat_rn3
# >> # 0.3682 rabbit_oryCun1
# >> # 0.2651 dog_canFam2
# >> # 0.2766 cow_bosTau2
# >> # 0.2563 armadillo_dasNov1
# >> # 0.2677 elephant_loxAfr1
# >> # 0.4226 tenrec_echTel1
# >> # 0.7119 monodelphis_monDom2
# >> # 0.9847 chicken_galGal2
# >> # 1.4357 xenopus_xenTro1
# >> # 1.6577 tetraodon_tetNig1
# >> # 1.6983 fugu_fr1
# >> # 1.7480 zebrafish_danRer3
# make output dir and run dir
ssh pk
cd /cluster/data/hg17/bed/multiz17way.2005-12-20
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 17way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
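# quick check -- species.lst should be a single space-separated line:
cat species.lst
# hg17 panTro1 rheMac2 rn3 mm7 oryCun1 bosTau2 canFam2 dasNov1 loxAfr1 echTel1 monDom2 galGal2 xenTro1 tetNig1 fr1 danRer3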
mkdir -p maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = hg17
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/mafNet
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == hg17) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << happy emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg17/bed/multiz17way.2005-12-20/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << happy emacs
awk '{print $1}' /cluster/data/hg17/chrom.sizes > chrom.lst
# REDO FOR OVERLAPS (2006-04-07 kate)
mv ../maf ../maf.old
# edit spec file to fix maf dir path
gensub2 chrom.lst single spec jobList
para create jobList
# 46 files
para try
para check
para push
para time > run.time
# 36 hrs (not typical -- previous runs were ~16 hrs)
# PHASTCONS CONSERVATION (2006-01-05 kate)
# Redone when multiz redone to fix overlaps (2006-04-12)
# This process is distilled from Hiram and Adam's experiments
# on mouse (mm7) 17way track. Many parameters are now fixed, without
# being experimentally derived, either because the experiments
# were lengthy and produced similar results, or because they
# weren't runnable given the alignment size.
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliot Margulies' model
# from the (37-way) ENCODE alignments.
#
# NOTE: Redone 3/20/06, adding rheMac2 to non-informative options,
# by recommendation of Adam Siepel, to correct unwanted
# high conservation in regions with primate-only alignments
# NOTE: reusing cluster-friendly chrom fasta files created earlier
#cd /cluster/data/hg17
#foreach f (`cat chrom.lst`)
#echo $f
#cp $f/*.fa /cluster/bluearc/hg17/chrom
#end
# Split chromosome MAF's into windows and use to generate
# "sufficient statistics" (ss) files for phastCons input
# NOTE: as the SAN fs has lotsa space, we're leaving these
# big (temp) files unzipped, to save time during phastCons run.
# Note also the larger chunk sizes from previous runs -- this
# reduces run-time on the split, slows down the actual phastCons
# enough so jobs don't crash (jobs are very quick, just a minute
# or so), and according to Adam, will produce better results.
# The previous small chunks were probably required by
# the phyloFit step, which we are no longer using for the
# human alignments.
ssh pk
mkdir /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
cp /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod .
# edit, changing rheMac1 -> rheMac2
mkdir run.split
cd run.split
set WINDOWS = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/ss
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/hg17/bed/multiz17way.2005-12-20/maf
set WINDOWS = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M /cluster/bluearc/hg17/chrom/$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
echo "Done" >> $c.done
'EOF'
# << happy emacs
chmod +x doSplit.csh
rm -f jobList
foreach f (../../maf/*.maf)
set c = $f:t:r
echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
end
para create jobList
# 46 jobs
para try
para check
para push
# CPU time in finished jobs: 9511s 158.52m 2.64h 0.11d 0.000 y
# IO & Wait Time: 5391s 89.85m 1.50h 0.06d 0.000 y
# Average job time: 324s 5.40m 0.09h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2354s 39.23m 0.65h 0.03d
# Submission to last job: 2358s 39.30m 0.66h 0.03d
# check tree model on 5MB chunk, using params recommended by Adam,
# (to verify branch lengths on 2X species)
# he ok'ed the results -- not necessary for next human run
ssh kolossus
cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
--tree "`cat ../tree-commas.nh`" \
/san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/ss/chr7/chr7.115000658-120000000.ss \
-o phyloFit.tree
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
cd ..
mkdir run.cons
cd run.cons
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
cp -p $san/ss/$c/$f.ss ../elliotsEncode.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncode.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative panTro1,rheMac2 \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$c $san/bed/$c
sleep 1
mv $tmp/$f.pp $san/pp/$c
mv $tmp/$f.bed $san/bed/$c
rm -fr $tmp
'EOF'
# emacs happy
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 14 .008 .28
#ENDLOOP
'EOF'
# happy emacs
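# (the template args map to the doPhast.csh params above:
# expected-length=14, target-coverage=.008, rho=.28)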
# Create parasol batch and run it
pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg17/bed/multiz17way/cons/run.cons/in.list
popd
gensub2 in.list single template jobList
para create jobList
# 333 jobs
para try
para check
para push
# NOTE: these jobs go fast -- some crashed apparently having
# difficulty accessing input files. Just restart them and
# they work
#CPU time in finished jobs: 15520s 258.67m 4.31h 0.18d 0.000 y
#IO & Wait Time: 15796s 263.27m 4.39h 0.18d 0.001 y
#Average job time: 94s 1.57m 0.03h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 180s 3.00m 0.05h 0.00d
#Submission to last job: 48266s 804.43m 13.41h 0.56d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir)
find ./bed -name "chr*.bed" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg17/bed/multiz17way/cons
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/cons
hgLoadBed -strict hg17 phastConsElements17way mostConserved.bed
# Loaded 2212445 elements of size 5
# compare with previous tracks
hgsql hg17 -e "select count(*) from phastConsElements10way"
# 2011952
hgsql hg17 -e "select count(*) from phastConsElements"
# 1601903
# Try for 5% overall cov, and 70% CDS cov (used elen=14, tcov=.008, rho=.28)
featureBits hg17 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.065%, phastConsElements17way 5.116%, both 0.759%, cover 71.27%, enrich 13.93x
# compare with previous tracks
featureBits hg17 -enrichment refGene:cds phastConsElements10way
# refGene:cds 1.062%, phastConsElements10way 5.003%, both 0.734%, cover 69.18%, enrich 13.83x
featureBits hg17 -enrichment refGene:cds phastConsElements
# refGene:cds 1.062%, phastConsElements 4.810%, both 0.771%, cover 72.65%, enrich 15.11x
# experiments
# previous tracks
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements
# refGene:cds 0.873%, phastConsElements 4.497%, both 0.630%, cover 72.10%, enrich 16.04x
hgsql hg17 -e "select count(*) from phastConsElements where chrom='chr7'"
# 81785
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements10way
# refGene:cds 0.873%, phastConsElements10way 4.700%, both 0.602%, cover 68.94%, enrich 14.67x
hgsql hg17 -e "select count(*) from phastConsElements10way where chrom='chr7'"
# 102959
# len=13, cov=.007, rho=.27
# looks best -- similar chr7 measurements to previous tracks
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_13_007_27
# refGene:cds 0.874%, phastConsElements17way_13_007_27 4.854%, both 0.607%, cover 69.43%, enrich 14.31x
hgsql hg17 -e "select count(*) from phastConsElements17way_13_007_27 where chrom='chr7'"
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_13_005_28
# refGene:cds 0.873%, phastConsElements17way_13_005_28 4.802%, both 0.612%, cover 70.12%, enrich 14.60x
hgsql hg17 -e "select count(*) from phastConsElements17way_13_005_28 where chrom='chr7'"
# 95203
# experiments with other parameters, below
# len=15, cov=.10
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_10
# refGene:cds 0.873%, phastConsElements17way 7.989%, both 0.627%, cover 71.77%, enrich 8.98x
hgsql hg17 -e "select count(*) from phastConsElements17way_15_10 where chrom='chr7'"
# 217767
# => too much overall covg, and too many elements
# len=15, cov=.05
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_05
# refGene:cds 0.873%, phastConsElements17way_15_05 6.880%, both 0.627%, cover 71.77%, enrich 10.43x
hgsql hg17 -e "select count(*) from phastConsElements17way_15_05 where chrom='chr7'"
# 166868
# len=15, cov=.01
# These values were used by Elliott for ENCODE
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_01
# refGene:cds 0.873%, phastConsElements17way_15_01 5.721%, both 0.628%, cover 71.89%, enrich 12.57x
hgsql hg17 -e "select count(*) from phastConsElements17way_15_01 where chrom='chr7'"
# 106034
# len=20, cov=.01
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_20_01
# refGene:cds 0.873%, phastConsElements17way_20_01 7.751%, both 0.634%, cover 72.56%, enrich 9.36x
hgsql hg17 -e "select count(*) from phastConsElements17way_20_01 where chrom='chr7'"
# 106005
# -> wrong direction on coverage
# len=10, cov=.01
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_10_01
# refGene:cds 0.873%, phastConsElements17way_10_01 4.653%, both 0.616%, cover 70.48%, enrich 15.15x
hgsql hg17 -e "select count(*) from phastConsElements17way_10_01 where chrom='chr7'"
# 108279
# => looks good on coverage and element count, check smoothness in browser
# => undersmoothed
# len=10, cov=.05
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_10_05
# refGene:cds 0.873%, phastConsElements17way_10_05 5.365%, both 0.615%, cover 70.44%, enrich 13.13x
hgsql hg17 -e "select count(*) from phastConsElements17way_10_05 where chrom='chr7'"
# 178372
# => fragmented elements
# len=15, cov=.005
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_15_005
# refGene:cds 0.873%, phastConsElements17way_15_005 5.444%, both 0.628%, cover 71.93%, enrich 13.21x
hgsql hg17 -e "select count(*) from phastConsElements17way_15_005 where chrom='chr7'"
# 90855
# len=20, cov=.005
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_20_005
# refGene:cds 0.873%, phastConsElements17way_20_005 7.373%, both 0.634%, cover 72.61%, enrich 9.85x
hgsql hg17 -e "select count(*) from phastConsElements17way_20_005 where chrom='chr7'"
# 91858
# len=17, cov=.005 rho=.3
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_17_005
# refGene:cds 0.873%, phastConsElements17way_17_005 6.126%, both 0.631%, cover 72.24%, enrich 11.79x
hgsql hg17 -e "select count(*) from phastConsElements17way_17_005 where chrom='chr7'"
# 91243
# len=12, cov=.01, rho=.28 -panTro1
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_12_01_28_p
# refGene:cds 0.873%, phastConsElements17way_12_01_28_p 4.829%, both 0.612%, cover 70.02%, enrich 14.50x
hgsql hg17 -e "select count(*) from phastConsElements17way_12_01_28_p where chrom='chr7'"
# 123638
# len=13, cov=.01, rho=.25 -panTro1
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_13_01_25_p
# refGene:cds 0.873%, phastConsElements17way_13_01_25_p 4.793%, both 0.594%, cover 67.99%, enrich 14.19x
hgsql hg17 -e "select count(*) from phastConsElements17way_13_01_25_p where chrom='chr7'"
# 131895
# len=14, cov=.008, rho=.28
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way_14_008_28
# refGene:cds 0.874%, phastConsElements17way_14_008_28 5.227%, both 0.615%, cover 70.37%, enrich 13.46x
hgsql hg17 -e "select count(*) from phastConsElements17way_14_008_28 where chrom='chr7'"
# 106071
# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
#next time try Angie's simpler sort, below
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastCons17way.wig phastCons17way.wib
# about 23 minutes for above
# GOT HERE ON REDO
# NOTE: remember to flip /gbdb link from cons.old to cons
#foreach chr (`awk '{print $1}' /cluster/data/hg17/chrom.sizes`)
#echo $chr
set chr = chr22
cat `ls -1 pp/$chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice wigEncode stdin phastCons17wayNewChr22.wig phastCons17wayNewChr22.wib
#end
date
cp -p phastCons17way.wi? /cluster/data/hg17/bed/multiz17way/cons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/cons
ln -s /cluster/data/hg17/bed/multiz17way/cons/phastCons17way.wib \
/gbdb/hg17/multiz17way/phastCons17way.wib
hgLoadWiggle -pathPrefix=/gbdb/hg17/multiz17way hg17 \
phastCons17way phastCons17way.wig
############################################################################
## Run phastCons on Placental mammals
ssh pk
cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
mkdir placental
mkdir run.cons.alt
cd run.cons.alt
# create pruned trees
set tree_doctor = /cluster/bin/phast/tree_doctor
sed 's/ /,/g' ../../species.lst
# hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1,monDom2,galGal2,xenTro1,tetNig1,fr1,danRer3
mkdir placental
$tree_doctor ../elliotsEncode.mod \
--prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1 \
> placental/placental.mod
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $6
set tmp = /scratch/tmp/hg17/$grp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
cp -p $san/ss/$c/$f.ss $grp/$grp.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative panTro1,rheMac2 \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 1
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'EOF'
# << emacs happy
chmod a+x doPhast.csh
# Create gsub file
cat > template << 'EOF'
#LOOP
# template for 5% cov
doPhast.csh $(root1) $(file1) 14 .2 .28 placental
#ENDLOOP
'EOF'
cat > template << 'EOF'
#LOOP
# template same as vertebrate
doPhast.csh $(root1) $(file1) 14 .008 .28 placental
#ENDLOOP
'EOF'
# happy emacs
# Create parasol batch and run it
pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg17/bed/multiz17way/cons/run.cons.alt/in.list
popd
gensub2 in.list single template jobList
para create jobList
# 333 jobs
para try
para check
para push
#.2
#CPU time in finished jobs: 15164s 252.74m 4.21h 0.18d 0.000 y
#IO & Wait Time: 14852s 247.53m 4.13h 0.17d 0.000 y
#Average job time: 90s 1.50m 0.03h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 170s 2.83m 0.05h 0.00d
#Submission to last job: 86364s 1439.40m 23.99h 1.00d
#.008
#CPU time in finished jobs: 13712s 228.53m 3.81h 0.16d 0.000 y
#IO & Wait Time: 14407s 240.12m 4.00h 0.17d 0.000 y
#Average job time: 84s 1.41m 0.02h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 159s 2.65m 0.04h 0.00d
#Submission to last job: 5291s 88.18m 1.47h 0.06d
ssh pk
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/placental
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir)
find ./bed -name "chr*.bed" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg17/bed/multiz17way/cons/placental
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/cons/placental
hgLoadBed -strict hg17 phastConsElementsPlacental mostConserved.bed
# .2
# Loaded 3775983 elements of size 5
# .008
# Loaded 1290060 elements of size 5
# compare with vertebrate cons
hgsql hg17 -e "select count(*) from phastConsElements17way"
# 2212445
featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental
featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental_14_2_28
featureBits hg17 -enrichment refGene:cds phastConsElements17way
featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental
# refGene:cds 1.070%, phastConsElementsPlacental 3.844%, both 0.667%, cover 62.32%, enrich 16.21x
# refGene:cds 1.069%, phastConsElementsPlacental_14_008_28 3.844%, both 0.667%, cover 62.37%, enrich 16.22x
featureBits hg17 -enrichment refGene:cds phastConsElementsPlacental_14_2_28
#refGene:cds 1.070%, phastConsElementsPlacental_14_2_28 5.223%, both 0.691%, cover 64.62%, enrich 12.37x
featureBits hg17 -enrichment refGene:cds phastConsElements17way
#refGene:cds 1.070%, phastConsElements17way 5.116%, both 0.763%, cover 71.27%, enrich 13.93x
# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/placental
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
#next time try Angie's simpler sort, below
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastConsPlacental.wig phastConsPlacental.wib
# about 23 minutes for above
# GOT HERE ON REDO
# NOTE: remember to flip /gbdb link from cons.old to cons
cp -p phastConsPlacental.wi? \
/cluster/data/hg17/bed/multiz17way/cons/placental
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/cons/placental
ln -s \
/cluster/data/hg17/bed/multiz17way/cons/placental/phastConsPlacental.wib \
/gbdb/hg17/multiz17way/phastConsPlacental.wib
hgLoadWiggle -pathPrefix=/gbdb/hg17/multiz17way hg17 \
phastConsPlacental phastConsPlacental.wig
############################################################################
## Run phastCons on subgroups (mammals, placentals, and w/o low-cov)
ssh pk
cd /cluster/data/hg17/bed/multiz17way.2005-12-20/cons
mkdir run.cons.groups
cd run.cons.groups
# create pruned trees
set tree_doctor = /cluster/bin/phast/tree_doctor
sed 's/ /,/g' ../../species.lst
# hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1,monDom2,galGal2,xenTro1,tetNig1,fr1,danRer3
$tree_doctor ../elliotsEncode.mod \
--prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,bosTau2,canFam2,monDom2,galGal2,xenTro1,tetNig1,fr1,danRer3 \
> vertebrate-high.mod
$tree_doctor ../elliotsEncode.mod \
--prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,bosTau2,canFam2,monDom2 \
> mammal-high.mod
$tree_doctor ../elliotsEncode.mod \
--prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,bosTau2,canFam2 \
> placental-high.mod
$tree_doctor ../elliotsEncode.mod \
--prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1,monDom2 \
> mammal.mod
$tree_doctor ../elliotsEncode.mod \
--prune-all-but=hg17,panTro1,rheMac2,rn3,mm7,oryCun1,bosTau2,canFam2,dasNov1,loxAfr1,echTel1 \
> placental.mod
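# (optional sanity check: eyeball the TREE: line in each pruned .mod,
# e.g. "grep TREE placental.mod", to confirm only the requested
# species remain)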
foreach f (*.mod)
mkdir $f:r
mv $f $f:r
end
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $6
set tmp = /scratch/tmp/hg17/$grp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
cp -p $san/ss/$c/$f.ss $grp/$grp.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative panTro1,rheMac2 \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 1
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'EOF'
# emacs happy
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
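# so a job line expands to, e.g. (hypothetical chunk):
#   doPhast.csh chr7 chr7.10 14 .2 .28 placental
# i.e. chrom, ss chunk, expected length, target coverage, rho, group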
# Create gsub file
cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 14 .21 .28 placental-high
doPhast.csh $(root1) $(file1) 14 .2 .28 placental
doPhast.csh $(root1) $(file1) 14 .11 .28 mammal
doPhast.csh $(root1) $(file1) 14 .1 .28 mammal-high
doPhast.csh $(root1) $(file1) 14 .0028 .28 vertebrate-high
#ENDLOOP
'EOF'
# happy emacs
# Create parasol batch for just chr7 (for test purposes) and run it
pushd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
ls -1 ss/chr7/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg17/bed/multiz17way/cons/run.cons.groups/in.list
popd
gensub2 in.list single template jobList
para create jobList
# 80 jobs
para try
para check
para push
# 24 minutes
## create Alt Most Conserved track
ssh hgwdev
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons
cat > loadAltElements.csh << 'EOF'
set b = /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups
foreach d (mammal* placental* vertebrate*)
echo $d
cd $d
find ./bed -name "chr*.bed" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin \
> $b/$d/mostConserved.bed
set table = `echo $d | perl -wpe 's/(.*)/phastConsElements\u$1/;s/-(.*)/\u$1/'`
hgLoadBed -strict hg17 $table $b/$d/mostConserved.bed
featureBits hg17 -enrichment refGene:cds -chrom=chr7 $table
cd ..
end
'EOF'
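# (the perl expression camel-cases the directory name into a table
# name, e.g.:
#   echo mammal-high | perl -wpe 's/(.*)/phastConsElements\u$1/;s/-(.*)/\u$1/'
# prints phastConsElementsMammalHigh)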
csh loadAltElements.csh >&! loadAltElements.log &
grep refGene loadAltElements.log | sort -n -k4
# refGene:cds 0.884%, phastConsElementsPlacentalHigh 4.828%, both 0.606%, cover 68.51%, enrich 14.19x
# refGene:cds 0.884%, phastConsElementsMammal 4.869%, both 0.580%, cover 65.62%, enrich 13.48x
# refGene:cds 0.884%, phastConsElementsMammalHigh 4.887%, both 0.624%, cover 70.60%, enrich 14.45x
# refGene:cds 0.884%, phastConsElementsPlacental 4.904%, both 0.558%, cover 63.14%, enrich 12.88x
# refGene:cds 0.884%, phastConsElementsVertebrateHigh 4.965%, both 0.652%, cover 73.74%, enrich 14.85x
featureBits hg17 -enrichment refGene:cds -chrom=chr7 phastConsElements17way
# refGene:cds 0.884%, phastConsElements17way 4.851%, both 0.623%, cover 70.48%, enrich 14.53x
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups
cat > makeAltWiggle.csh << 'EOF'
set b = `pwd`
set san = /san/sanvol1/scratch/hg17/multiz17way/cons
pushd $san
foreach d (mammal* placental* vertebrate*)
echo $d
cd $d
set table = `echo $d | perl -wpe 's/(.*)/phastCons\u$1/;s/-(.*)/\u$1/'`
echo $table
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin $table.wig $table.wib
mv $table.wig $table.wib $b/$d
cd ..
end
popd
'EOF'
csh makeAltWiggle.csh >&! makeAltWiggle.log &
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/cons/run.cons.groups
cat > loadAltWiggle.csh << 'EOF'
set b = `pwd`
foreach d (mammal* placental* vertebrate*)
echo $d
cd $d
set table = `echo $d | perl -wpe 's/(.*)/phastCons\u$1/;s/-(.*)/\u$1/'`
echo $table
ln -s `pwd`/$table.wib /gbdb/hg17/multiz17way
hgLoadWiggle -pathPrefix=/gbdb/hg17/multiz17way hg17 $table $table.wig
cd ..
end
'EOF'
csh loadAltWiggle.csh >&! loadAltWiggle.log &
# Downloads (2006-02-22 kate)
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way
mkdir mafDownloads
cd mafDownloads
# upstream mafs
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits hg17 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags hg17 multiz17way up.bed upstream$i.maf \
-orgs=../species.lst
nice gzip upstream$i.maf
rm up.bed
end
date
'EOF'
time csh mafFrags.csh >&! mafFrags.log &
# ~1 hour
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../maf/chr*.maf)
set c = $f:t:r
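# (:t strips the directory, :r the .maf extension,
#  e.g. ../maf/chr1.maf -> chr1)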
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
time csh downloads.csh >&! downloads.log
# ~2 hours
# GOT HERE
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/hg17/multiz17way
mkdir $dir
ln -s /cluster/data/hg17/bed/multiz17way/mafDownloads/{*.gz,md5sum.txt} $dir
cp /usr/local/apache/htdocs/goldenPath/mm7/multiz17way/README.txt $dir
# edit README
# PHASTCONS SCORES DOWNLOADABLES FOR 17WAY (2006-03-20 kate)
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way
mkdir phastConsDownloads
cd phastConsDownloads
cat > downloads.csh << 'EOF'
date
cd /san/sanvol1/scratch/hg17/multiz17way.2005-12-20/cons/pp
foreach chr (`awk '{print $1}' /cluster/data/hg17/chrom.sizes`)
echo $chr
cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> /cluster/data/hg17/bed/multiz17way/phastConsDownloads/$chr.gz
end
date
'EOF'
csh downloads.csh >&! downloads.log &
# ~20 minutes
# << happy emacs
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/phastConsDownloads
md5sum *.gz > md5sum.txt
set dir = /usr/local/apache/htdocs/goldenPath/hg17/phastCons17way
mkdir $dir
ln -s /cluster/data/hg17/bed/multiz17way/phastConsDownloads/{*.gz,md5sum.txt} $dir
cp /usr/local/apache/htdocs/goldenPath/hg17/phastCons/README.txt $dir
# edit
# UPDATE MONKEY DOWNLOADS (2006-01-12 kate)
# EXTRACT AXT'S AND MAF'S FROM THE RheMac1 NET
# The chr1 was hugely oversized -- the others were OK, but
# the axt's were numbered oddly.
ssh kkstore2
cd /cluster/data/hg17/bed/blastz.rheMac1/axtChain
gunzip -c hg17.rheMac1.net.gz | netSplit stdin humanNet
gunzip -c hg17.rheMac1.all.chain.gz | chainSplit chain stdin
mkdir ../axtNet.new ../mafNet.new
cat > makeMaf.csh << 'EOF'
foreach f (humanNet/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt humanNet/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/rheMac1/rheMac1.2bit stdout | axtSort stdin ../axtNet.new/$c.axt
axtToMaf ../axtNet.new/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/rheMac1/chrom.sizes \
../mafNet.new/$c.maf -tPrefix=hg17. -qPrefix=rheMac1.
end
cp -rp ../mafNet.new /san/sanvol1/scratch/hg17/mafNet/rheMac1.new
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
pushd /san/sanvol1/scratch/hg17/mafNet
rm -fr rheMac1
mv rheMac1.new rheMac1
popd
rm -fr axtNet
mv axtNet.new axtNet
cd axtNet
nice gzip *.axt
md5sum *.gz > md5sum.txt
# cleanup
cd ..
rm -fr chain humanNet
ssh hgwdev
ln -s /cluster/data/hg17/bed/blastz.rheMac1/axtNet \
/usr/local/apache/htdocs/goldenPath/rheMac1/axtNet
# Request push to downloads server
# UPDATE OPOSSUM DOWNLOADS (2006-01-17 kate)
# Fix overlaps
ssh kkstore2
cd /cluster/data/hg17/bed/blastz.monDom1
mv axtNet axtNet.old
mv mafNet mafNet.old
mkdir axtNet mafNet
cd axtChain/chain
nice gunzip *.gz
cd ..
nice gunzip -c human.net.gz | netSplit stdin humanNet
cat > makeMaf.csh << 'EOF'
foreach f (humanNet/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt humanNet/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/monDom1/monDom1.2bit stdout | axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/monDom1/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=monDom1.
end
cp -rp ../mafNet /san/sanvol1/scratch/hg17/mafNet/monDom1.new
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
pushd /san/sanvol1/scratch/hg17/mafNet
rm -fr monDom1
mv monDom1.new monDom1
popd
# (axtNet was rebuilt in place this time, with the old files kept
# in axtNet.old, so no axtNet.new rename is needed here)
cd axtNet
nice gzip *.axt
md5sum *.gz > md5sum.txt
# cleanup
cd ..
rm -fr chain humanNet
ssh hgwdev
ln -s /cluster/data/hg17/bed/blastz.monDom1/axtNet \
/usr/local/apache/htdocs/goldenPath/monDom1/axtNet
# Request push to downloads server
# UPDATE COW DOWNLOADS (2006-01-17 kate)
# Fix overlaps
ssh kkstore2
cd /cluster/data/bosTau1/bed/zb.hg17
mv axtNet axtNet.old
mv mafNet mafNet.old
mkdir axtNet mafNet
cat > makeMaf.csh << 'EOF'
foreach f (net/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt net/$c.net chain/$c.chain /cluster/data/hg17/nib /cluster/data/bosTau1/bosTau1.2bit stdout | axtSort stdin ../axtNet/$c.axt
axtToMaf ../axtNet/$c.axt \
/cluster/data/hg17/chrom.sizes /cluster/data/bosTau1/chrom.sizes \
../mafNet/$c.maf -tPrefix=hg17. -qPrefix=bosTau1.
end
'EOF'
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
cd axtNet
nice gzip *.axt
md5sum *.gz > md5sum.txt
ssh hgwdev
ln -s /cluster/data/hg17/bed/blastz.bosTau1/axtNet \
/usr/local/apache/htdocs/goldenPath/bosTau1/axtNet
# Request push to downloads server
##### UPDATE hg17 knownToVisiGene (2006-01-21 galt)
# Create table that maps between known genes and visiGene database
# mapping to other species such as mouse, zebrafish, frog
# requires visiGene probe track vgImageProbes be created first
knownToVisiGene hg17 -fromProbePsl=vgImageProbes
##### UPDATE hg17 mmBlastTab (2006-01-22 galt)
# Make the protein seqs from mm7.knownGenePep
cd /cluster/data/hg17/bed/geneSorter/blastp
mkdir mm7
cd mm7
pepPredToFa mm7 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/cluster/bluearc/blast229/formatdb -i known.faa -t known -n known
mkdir -p /cluster/panasas/home/store/mm7/blastp/
cp known.* /cluster/panasas/home/store/mm7/blastp/
# Make parasol run directory
ssh kk
cd /cluster/data/hg17/bed/geneSorter/blastp/mm7
mkdir run
cd run
mkdir out
# Make blast script
# NOTE!! left off " b 1" from the end of the script because
# we wanted to be able to get the near-best, not just the best one.
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm7/blastp/known \
-i $1 -o $2 -e 0.001 -m 8
'_EOF_'
# << keep emacs happy
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# the echo trick is used because otherwise the command line is
# too long and you cannot do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
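# (wordLine puts each whitespace-separated word on its own line, e.g.
#   echo a.fa b.fa | wordLine stdin
# prints a.fa and b.fa on separate lines)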
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
Completed: 7735 of 7735 jobs
CPU time in finished jobs: 97096s 1618.26m 26.97h 1.12d 0.003 y
IO & Wait Time: 564656s 9410.94m 156.85h 6.54d 0.018 y
Average job time: 86s 1.43m 0.02h 0.00d
Longest running job: 0s 0.00m 0.00h 0.00d
Longest finished job: 240s 4.00m 0.07h 0.00d
Submission to last job: 1272s 21.20m 0.35h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg17/bed/geneSorter/blastp/mm7/run/out
hgLoadBlastTab hg17 mmBlastTab -maxPer=1 *.tab
Scanning through 7735 files
Loading database with 33306 rows
# changed mm6 to mm7 in src/hg/hgGene/hgGeneData/Human/hg17/otherOrgs.ra
# and checked it in.
# hgLoadBlastTab hg17 mmBlastTabTopN -maxPer=250 *.tab
# (not done, this was only used for research)
# hgLoadBlastTab hg17 mmBlastNearBest -topPercent=5 *.tab > hgMmNearBest.stats
# (this will be the new way to go)
Reading seq lengths from hg17.knownGenePep
Finding max gene combined-coverage scores in 7735 files
Scanning through 7735 files
Loading database with 51520 rows
##########################################################################
# MYTOUCH FIX - jen - 2006-01-24
sudo mytouch hg17 gencodeGeneClassJun05 0508301200.00
# note - gencodeGeneClassJun05 table on dev only
sudo mytouch hg17 knownGeneLink 0506050000.00
sudo mytouch hg17 ensGtp 0505241200.00
sudo mytouch hg17 ccdsInfo 0505241200.00
##########################################################################
# BLASTZ OPOSSUM monDom2 (WORKING - 2006-01-23 - Hiram)
ssh kk
# running out of disk space on store5:
[hiram@kk /cluster/data/hg17/bed] df -h .
#Filesystem Size Used Avail Use% Mounted on
# 1.5T 1.3T 79G 95% /cluster/store5
# So, keep this elsewhere, and symlink it:
cd /cluster/data/hg17/bed
ln -s /cluster/store9/hg17/bed/blastzMonDom2.2006-01-23 \
./blastzMonDom2.2006-01-23
ln -s blastzMonDom2.2006-01-23 blastz.monDom2
cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
# distant-organism settings (originally tuned for chicken, per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg17)
SEQ1_DIR=/scratch/hg/hg17/bothMaskedNibs
SEQ1_LEN=/cluster/data/hg17/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom2
SEQ2_DIR=/scratch/hg/monDom2/monDom2.2bit
SEQ2_LEN=/scratch/hg/monDom2/chrom.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastzMonDom2.2006-01-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
# real 1122m44.191s
# failed during the load of chr19
# hgLoadChain hg17 chr19_chainMonDom2 chr19.chain
# Out of memory needMem - request size 56 bytes
# So, go to kolossus:
ssh kolossus
# There isn't any hg17 db here yet, get it established with a
# chromInfo and a 2bit sequence:
hgsql -e "create database hg17;" mysql
cd /cluster/data/hg17
twoBitInfo hg17.2bit stdout |
awk '{printf "%s\t%s\t/gbdb/hg17/hg17.2bit\n", $1,$2}' \
> chromInfo.kolossus.tab
hgsql hg17 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql hg17 \
-e 'load data local infile "chromInfo.kolossus.tab" into table chromInfo;'
# it appears /gbdb/hg17 already exists
ln -s /cluster/data/hg17/hg17.2bit /gbdb/hg17/hg17.2bit
# now, loading only chr19:
cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23/axtChain
hgLoadChain hg17 chr19_chainMonDom2 chain/chr19.chain
# real 33m31.689s
# while that is running, back on hgwdev, get the other chains loaded
ssh hgwdev
cd /cluster/data/hg17/bed/blastzMonDom2.2006-01-23/axtChain
cp loadUp.csh loadUp.noChr19.csh
# change the foreach line to eliminate the chr19.chain:
diff loadUp.csh loadUp.noChr19.csh
< foreach f (*.chain)
---
> foreach f (`ls *.chain | grep -v chr19.chain`)
# And then run that script
time ./loadUp.noChr19.csh > load.noChr19.out 2>&1
# real 76m8.757s
# When the kolossus load finishes, email to push-request and ask
# for the two tables to be pushed from kolossus to hgwdev:
# chr19_chainMonDom2
# chr19_chainMonDom2Link
# then, continuing:
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-continue=download -bigClusterHub=pk -chainMinScore=5000 \
-chainLinearGap=loose `pwd`/DEF > download.out 2>&1 &
# real 2m42.505s
# now, back on kolossus to run a featureBits
time featureBits hg17 chainMonDom2Link >fb.hg17.chainMonDom2Link 2>&1
# 355119482 bases of 2866216770 (12.390%) in intersection
featureBits hg17 chainMonDom1Link
# 456069062 bases of 2866216770 (15.912%) in intersection
# Then, to swap the results:
ssh kk
cd /cluster/data/hg17/bed/blastz.monDom2
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=5000 \
-chainLinearGap=loose `pwd`/DEF > swap.out 2>&1 &
# running 2006-01-30 11:25
# real 47m27.082s
# failed during the load - as with the Hg18 experiment, something
# is really huge about these results.
#########################################################################
# BUILD MAF ANNOTATION FOR MULTIZ17WAY (kate 2006-02-16)
# Redo to fix overlaps (2006-04-09 kate)
# rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
ssh kkstore01
cd /cluster/data/rheMac2
twoBitInfo -nBed rheMac2.2bit rheMac2.N.bed
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way
mkdir anno
cd anno
mkdir maf run
cd run
rm sizes nBeds
foreach i (`cat /cluster/data/hg17/bed/multiz17way/species.lst`)
ln -s /cluster/data/$i/chrom.sizes $i.len
ln -s /cluster/data/$i/$i.N.bed $i.bed
echo $i.bed >> nBeds
echo $i.len >> sizes
end
rm jobs.csh
echo date > jobs.csh
foreach i (../../maf/*.maf)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/hg17/hg17.2bit ../maf/`basename $i` >> jobs.csh
echo "echo $i" >> jobs.csh
end
echo date >> jobs.csh
# do smaller jobs first
tac jobs.csh > jobsRev.csh
mv jobsRev.csh jobs.csh
csh jobs.csh >&! jobs.log &
# 1.5 hrs.
# 9 hours for redo -- something wrong ?
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/anno/maf
mkdir -p /gbdb/hg17/multiz17way/anno/maf
ln -s /cluster/data/hg17/bed/multiz17way/anno/maf/*.maf \
/gbdb/hg17/multiz17way/anno/maf
cat > loadMaf.csh << 'EOF'
date
hgLoadMaf -pathPrefix=/gbdb/hg17/multiz17way/anno/maf \
hg17 multiz17way
date
'EOF'
csh loadMaf.csh >&! loadMaf.log &
# load summary table on kolossus, as it crashes on hgwdev
ssh kolossus
cd /cluster/data/hg17/bed/multiz17way/anno/maf
cat *.maf | \
nice hgLoadMafSummary hg17 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz17waySummary stdin
# Created 3212623 summary blocks from 114139253 components and 17522217 mafs from stdin
# request push to hgwdev
# Dropped unused indexes (2006-05-09 kate)
# NOTE: this is not required in the future, as the loader
# has been fixed to not generate these indexes
hgsql hg17 -e "alter table multiz17waySummary drop index chrom_2"
hgsql hg17 -e "alter table multiz17waySummary drop index chrom_3"
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way
set sanDir = /san/sanvol1/scratch/hg17/multiz17way/frames
mkdir -p $sanDir/maf
cp -rp maf/* $sanDir/maf
mkdir frames
cd frames
cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .
#edit Makefile to correct species names and set and sanDir
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/frames
make getGenes >&! getGenes.log &
# ~1 minute
make getFrames >&! getFrames.log &
# ~2 hours
# NOTE: if jobs get hung up (e.g. running for hours, when
# they should run for minutes, do 'para stop' so that
# the 'para make' can restart the job
make loadDb >&! loadDb.log &
###
# rebuild frames to get bug fix, using 1-pass maf methodology
# (2006-06-09 markd)
ssh kkstore02
cd /cluster/data/hg17/bed/multiz17way/frames
mv mafFrames/ mafFrames.old
nice tcsh # easy way to get process niced
(cat ../maf/*.maf | genePredToMafFrames hg17 stdin stdout \
    bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz \
    danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz \
    galGal2 genes/galGal2.gp.gz hg17 genes/hg17.gp.gz \
    mm7 genes/mm7.gp.gz oryCun1 genes/oryCun1.gp.gz \
    panTro1 genes/panTro1.gp.gz rheMac2 genes/rheMac2.gp.gz \
    rn3 genes/rn3.gp.gz xenTro1 genes/xenTro1.gp.gz \
    | gzip > multiz17way.mafFrames.gz) >& log &
ssh hgwdev
cd /cluster/data/hg17/bed/multiz17way/frames
hgLoadMafFrames hg17 multiz17wayFrames multiz17way.mafFrames.gz >&log&
# EXTRACT LINEAGE-SPECIFIC REPEATS FOR RAT (DONE 2/8/06 angie)
ssh kolossus
mkdir /cluster/data/hg17/rmsk
cd /cluster/data/hg17/rmsk
ln -s ../*/chr*.fa.out .
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Even though we already have the human-mouse linSpecReps,
# extractLinSpecReps requires two columns of DateRepsinRMoutput.pl
# additions. So add mouse, then ignore it.
# Rat in extra column 1, Mouse in extra column 2
foreach outfl ( *.out )
echo "$outfl"
/cluster/bluearc/RepeatMasker/DateRepeats \
${outfl} -query human -comp rat -comp mouse
end
# Now extract rat (extra column 1), ignore mouse.
cd ..
mkdir linSpecRep.notInRat
foreach f (rmsk/*.out_rat*_mus-musculus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInRat/$base.out.spec
end
# Distribute and clean up.
rsync -av linSpecRep.notInRat /san/sanvol1/scratch/hg17/
rm -r rmsk
# BLASTZ/CHAIN/NET RN4 (DONE 2/10/06 angie)
ssh kkstore01
mkdir /cluster/data/hg17/bed/blastz.rn4.2006-02-08
cd /cluster/data/hg17/bed/blastz.rn4.2006-02-08
cat << '_EOF_' > DEF
# human vs. rat
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/san/sanvol1/scratch/hg17/nib
SEQ1_SMSK=/san/sanvol1/scratch/hg17/linSpecRep.notInRat
SEQ1_LEN=/cluster/data/hg17/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat
SEQ2_DIR=/san/sanvol1/scratch/rn4/nib
SEQ2_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastz.rn4.2006-02-08
'_EOF_'
# << for emacs
doBlastzChainNet.pl DEF -chainLinearGap medium \
-bigClusterHub pk -smallClusterHub pk -workhorse pk \
-blastzOutRoot /san/sanvol1/scratch/blastzHg17Rn4Out >& do.log &
tail -f do.log
rm -f /cluster/data/hg17/bed/blastz.rn4
ln -s blastz.rn4.2006-02-08 /cluster/data/hg17/bed/blastz.rn4
# UPDATE WGRNA TRACK (DONE, 2006-02-15, Fan)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir wgRna-2006-02-15
cd wgRna-2006-02-15
# Received the data file, wg_track_hg17_feb2006_completed.txt, from Michel Weber's email
# (Michel.Weber@ibcg.biotoul.fr)
# and place it under cd /cluster/data/hg17/bed/wgRna-2006-02-15.
cp -p wg_track_hg17_feb2006_completed.txt wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg17 wgRna wgRna.tab
# Compared to previous data, 2 records deleted, 27 records added.
########################################################################
# BLASTZ Opossum monDom4 (DONE - 2006-02-21 - 2006-02-26 - Hiram)
ssh pk
mkdir /cluster/data/hg17/bed/blastzMonDom4.2006-02-21
cd /cluster/data/hg17/bed
ln -s blastzMonDom4.2006-02-21 blastz.monDom4
cd blastzMonDom4.2006-02-21
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human (hg17)
SEQ1_DIR=/scratch/hg/hg17/bothMaskedNibs
SEQ1_LEN=/cluster/data/hg17/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg17/bed/blastzMonDom4.2006-02-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits hg17 chainMonDom4Link \
> fb.hg17.chainMonDom4Link 2>&1
time nice -n +19 featureBits monDom4 chainHg17Link \
> fb.monDom4.chainHg17Link 2>&1
########################################################################
## Measuring MonDom4 chain pile ups (DONE - 2006-02-26 - Hiram)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.monDom4/axtChain
# extract coordinates on the target genome of the chains
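# (chain header lines are: chain score tName tSize tStrand tStart tEnd
#  qName qSize qStrand qStart qEnd id -- so $3,$6,$7 below are target
#  chrom,start,end and $2 is the chain score)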
zcat hg17.monDom4.all.chain.gz | grep "^chain " \
| awk '{printf "%s\t%s\t%s\t%s\t%s\n", $3, $6, $7, $5, $2}' \
| gzip -c > target.chain.bed.gz
# turn that into a wiggle graph with bedItemOverlapCount
# use HGDB_CONF for read-only access to the hg17 DB in bedItemOverlapCount
# it wants to read chromInfo ...
export HGDB_CONF=~/.hg.conf.read-only
# ignore chains longer than 1,000,000
zcat target.chain.bed.gz | awk '$3-$2<1000000 {print}' \
| sort -k1,1 -k2,2n \
| bedItemOverlapCount hg17 stdin \
| wigEncode stdin monDom4PileUps.wig monDom4PileUps.wib
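# (the resulting wiggle is simple coverage: for each base, the number
#  of sub-megabase chains overlapping it)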
# Do the same for the query coordinates to find out where these
# chains are coming from
zcat hg17.monDom4.all.chain.gz | grep "^chain " \
| awk '{printf "%s\t%s\t%s\t%s\t%s\n", $8, $11, $12, $10, $2}' \
| gzip -c > query.chain.bed.gz
zcat query.chain.bed.gz | awk '$3-$2<1000000 {print}' \
| sort -k1,1 -k2,2n \
| bedItemOverlapCount monDom4 stdin \
| wigEncode stdin hg17PileUps.wig hg17PileUps.wib
# load those wiggles
ssh hgwdev
cd /cluster/data/hg17/bed/blastz.monDom4/axtChain
ln -s `pwd`/monDom4PileUps.wib /gbdb/hg17/wib
ln -s `pwd`/hg17PileUps.wib /gbdb/monDom4/wib
hgLoadWiggle -verbose=2 hg17 monDom4PileUp monDom4PileUps.wig
hgLoadWiggle -verbose=2 monDom4 hg17PileUps hg17PileUps.wig
# add wiggle track type entries to the respective trackDb.ra files
# UPDATE hg17 knownToVisiGene (2006-03-07 galt)
# Create table that maps between known genes and visiGene database
# mapping to other species such as mouse, zebrafish, frog
# requires visiGene probe track vgImageProbes be created first
knownToVisiGene hg17 -fromProbePsl=vgImageProbes
############################################################################
# Add Landmark track (2006-03-08 giardine)
# Note: This track is for regulatory regions and other landmarks that are not
#included in other tracks. It is being gathered from the locus experts
#that are contributing data to the Human Mutation track. This should
#be helpful in understanding the data in the mutation track.
#table definitions for autoSql
autoSql landmark.as landmark -dbLink
#change index on bin to normal index not primary key
#move bin in struct so works as bed 4+
#copy autoSql files to hg/lib and hg/inc (add .o file to makefile)
#cat together landmark files from sources in landmark.bed then sort
grep "^chr" landmark.bed | sort -k1,1 -k2,2n > sortedLandmark.bed
#loading
hgsql hg17 < landmark.sql
hgLoadBed hg17 landmark sortedLandmark.bed -noSort -oldTable -tab
#add to trackDb.ra file (human hg17 level)
#changed landmark track to provide links and attributes in prep for ORegAnno
#data. Got set of test data by grabbing their .gff file used for custom
#tracks and converting to bed, then to landmarks format.
cd humPhen/landmarkData/June06/
#convert data to new formats then
cat newLandmark.txt landmarkORA.txt > allLandmarks.txt
grep "^chr" allLandmarks.txt | sort -k1,1 -k2,2n > sortedAllLandmark.txt
#start new tables
cd humPhen/kent/src/hg/lib/
autoSql landmark.as landmark -dbLink
#move bin in .h file to end of structure, to make load work
mv landmark.h ../inc/landmark.h
#change primary key to indexes where not unique, add index on landmarkId
#limit name, landmarkType, raKey size to 64
hgsql -e "drop table landmark;" hg17
hgsql hg17 < landmark.sql
cd ~giardine/humPhen/landmarkData/June06/
hgLoadBed hg17 landmark sortedAllLandmark.txt -noSort -oldTable -tab
hgsql hg17
load data local infile "landmarkAttrORA.txt" into table landmarkAttr;
load data local infile "landmarkAttrLinkORA.txt" into table landmarkAttrLink;
load data local infile "landmarkAttrCat.txt" into table landmarkAttrCat;
cd ../../kent/src/
make clean
make libs
cd hg
make cgi
cd makeDb/trackDb
make DBS=hg17 update
#test in hgwdev-giardine
#redo landmarks, moving categories out of database
convertORAformat < ORegAnnoBed
#start new tables
cd humPhen/kent/src/hg/lib/
autoSql landmark.as landmark -dbLink
#move bin in .h file to end of structure, to make load work
mv landmark.h ../inc/landmark.h
#change primary key to indexes, add primary key on landmarkId
#limit name, landmarkType, raKey size to 64
#only need to reload attributes rest of data & tables same
hgsql -e "drop table landmarkAttr;" hg17
hgsql -e "drop table landmarkAttrCat;" hg17
cd ../../../../landmarkData/June06/
hgsql hg17
#cut and paste in create table landmarkAttr
load data local infile "landmarkAttrORA.txt" into table landmarkAttr;
#Records: 2028 Deleted: 0 Skipped: 0 Warnings: 8 ???
cd ../../kent/src/
make clean
make libs
cd hg
make cgi
cd makeDb/trackDb
make DBS=hg17 update
#test in hgwdev-giardine
############################################################################
# hg15 -> hg17 LIFTOVER CHAINS (STARTED 3/9/06, DONE 3/10/06 Fan)
# I used a size of 10kb instead of 3kb for the split (blat query) sizes in
# hg17. This had a huge effect on the number of hits in the blat, which
# then had a huge effect on the number of chains. I should also mention
# that hg17 chromosomes chr1 and chr2 were split further
# into more than a single query file. This helped a LOT in avoiding
# cluster hippos classically associated with those chroms.
######## LIFTOVER PREPARATION
# Split up hg17
ssh pk
cd /san/sanVol1/scratch/hg17
mkdir -p liftSplits/{split,lift}
bash
for fa in /cluster/data/hg17/?{,?,*hap*}/*.fa; do
c=`basename $fa .fa`
echo $c
faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
done
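# (each .lft file records where the 10kb chunks sit in the full
# chromosome, so blat results on the chunks can later be lifted back
# to chrom coordinates, e.g. with liftUp)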
mkdir -p biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chr2.fa 5 chr2_
rm chr{1,2}.fa
# Make some dirs
cd /san/sanVol1/scratch
mkdir -p hg15
# Copy 11.ooc files to hg15 subdirectory.
cp -p /cluster/store5/gs.16/build33/11.ooc hg15
## First, copy over scripts. (Already done before)
# mkdir -p /san/sanVol1/scratch/fan
# cp -p /san/sanVol1/scratch/fan/*.sh /san/sanVol1/scratch/fan
# cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan
######## LIFTOVER BLATTING
# HG15
ssh pk
cd /cluster/data/hg15
# makeLoChain-align hg15 /scratch/hg/hg15/bothMaskedNibs hg17 \
makeLoChain-align hg15 /scratch/hg/hg15/chromTrfMixedNib hg17 \
/san/sanVol1/scratch/hg17/biggerSplits/split
cd bed
mv blat.hg17.2006-03-09 /san/sanVol1/scratch/hg15
cd /san/sanVol1/scratch/hg15/blat.hg17.2006-03-09/run/
sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg15ToHg17"}' > newspec
para create newspec
para try
para push
# Saw some failures, keep pushing again, they finally all finished.
# The problems were all from one node.
# Used "para remove machine ..." to remove that node from the cluster.
# Completed: 2376 of 2376 jobs
# CPU time in finished jobs: 626355s 10439.25m 173.99h 7.25d 0.020 y
# IO & Wait Time: 49512s 825.20m 13.75h 0.57d 0.002 y
# Average job time: 284s 4.74m 0.08h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3693s 61.55m 1.03h 0.04d
# Submission to last job: 4165s 69.42m 1.16h 0.05d
######## LIFTOVER CHAINING
# LIFTING
ssh pk
cd /san/sanVol1/scratch/fan
cp mm7SplitLift.sh hg17SplitLift.sh
# change andy to fan, mm7 to hg17, and chrX to chr2, and remove chrUn_random
vi hg17SplitLift.sh
cat << 'EOF' > hg17ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/fan/hg17Lifts
pushd /scratch/fan/hg17Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
'EOF'
chmod +x hg17ChainMergeSplit.sh
# HG15
cd /san/sanVol1/scratch/hg15/blat.hg17.2006-03-09/raw
/san/sanVol1/scratch/fan/hg17SplitLift.sh
cd ../
mkdir chainRun chainRaw
cd chainRun
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg15/chromTrfMixedNib /san/sanVol1/scratch/hg17/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'EOF'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single gsub spec
para create spec
para push
para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 3546s 59.10m 0.98h 0.04d 0.000 y
# IO & Wait Time: 895s 14.92m 0.25h 0.01d 0.000 y
# Average job time: 97s 1.61m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 270s 4.50m 0.07h 0.00d
# Submission to last job: 270s 4.50m 0.07h 0.00d
######### CHAINMERGE/NET/NETSUBSET
ssh kolossus
mkdir -p /scratch/fan/hg17Lifts
cd /scratch/fan/hg17Lifts
cp -r /san/sanvol1/scratch/hg15/blat.hg17.2006-03-09/chainRaw/ .
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
# about 30 minutes.
cp -rp chain /san/sanvol1/scratch/hg15/blat.hg17.2006-03-09/
rm -rf chain
rm -rf chainRaw
ssh pk
cd /san/sanvol1/scratch/fan
cat << 'EOF' > netOver.sh
#!/bin/bash
chain=$1
chrom=`basename $chain .chain`
sizesHGOld=$2
sizesHG17=/cluster/data/hg17/chrom.sizes
chainDir=`dirname $chain`
blatDir=`dirname $chainDir`
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over
mkdir -p ${blatDir}/{over,net}
/cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG17 $net /dev/null
/cluster/bin/x86_64/netChainSubset $net $chain $over
'EOF'
# << emacs
chmod +x netOver.sh
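# usage: netOver.sh <chain file> <old assembly chrom.sizes>, e.g.
# (hypothetical): netOver.sh .../chain/chr1.chain /cluster/data/hg15/chrom.sizes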
mkdir netRun
cd netRun/
find /san/sanVol1/scratch/hg15/blat.hg17.2006-03-09/chain -name "*.chain" \
| awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg15/chrom.sizes"}' >> spec
para create spec
para push
para time
# Completed: 44 of 44 jobs
# CPU time in finished jobs: 427s 7.12m 0.12h 0.00d 0.000 y
# IO & Wait Time: 248s 4.13m 0.07h 0.00d 0.000 y
# Average job time: 15s 0.26m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 29s 0.48m 0.01h 0.00d
# Submission to last job: 46s 0.77m 0.01h 0.00d
# seems much faster than mm7.
########## FINISHING
ssh hgwdev
# HG15
cd /san/sanvol1/scratch/hg15/blat.hg17.2006-03-09/over
cat * >> ../hg15ToHg17.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -rp blat.hg17.2006-03-09/ /cluster/data/hg15/bed
cd /cluster/data/hg15/bed
ln -s blat.hg17.2006-03-09 blat.hg17
ln -s `pwd`/blat.hg17/hg15ToHg17.over.chain liftOver/hg15ToHg17.over.chain
ln -s `pwd`/liftOver/hg15ToHg17.over.chain /gbdb/hg15/liftOver/hg15ToHg17.over.chain
mkdir -p /usr/local/apache/htdocs/goldenPath/hg15/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg15/liftOver
cp /gbdb/hg15/liftOver/hg15ToHg17.over.chain .
gzip hg15ToHg17.over.chain
hgAddLiftOverChain hg15 hg17 /gbdb/hg15/liftOver/hg15ToHg17.over.chain
# UPDATED hg17.knownToVisiGene (2006-03-14 galt)
# after making sure hg17.vgAllProbes was up to date (see makeVisiGene.doc)
ssh hgwdev
knownToVisiGene hg17 -fromProbePsl=vgAllProbes
########################################################################
### microRNA targets tracks (DONE - 2006-03-17 - 2006-04-27 - Hiram)
### from: http://pictar.bio.nyu.edu/ Rajewsky Lab
### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu
### Yi-Lu Wang ylw205@nyu.edu
### dg@thp.Uni-Koeln.DE
ssh hgwdev
mkdir /cluster/data/hg17/bed/picTar
cd /cluster/data/hg17/bed/picTar
wget --timestamping \
'http://pictar.bio.nyu.edu/ucsc/new_mammals_bed' -O newMammals.bed
wget --timestamping \
'http://pictar.bio.nyu.edu/ucsc/new_mammals_chicken_bed' \
-O newMammalsChicken.bed
grep -v "^track" newMammals.bed \
| hgLoadBed -strict hg17 picTarMiRNA4Way stdin
# Loaded 205263 elements of size 9
grep -v "^track" newMammalsChicken.bed \
| hgLoadBed -strict hg17 picTarMiRNA5Way stdin
# Loaded 43081 elements of size 9
nice -n +19 featureBits hg17 picTarMiRNA4Way
# 608549 bases of 2866216770 (0.021%) in intersection
nice -n +19 featureBits hg17 picTarMiRNA5Way
# 109059 bases of 2866216770 (0.004%) in intersection
############################################################################
# dbSNP BUILD 125 (Heather, March 2006)
# Set up directory structure
ssh kkstore02
cd /cluster/data/dbSnp
mkdir 125
cd 125
mkdir shared
mkdir shared/data
mkdir shared/schema
mkdir organisms
mkdir organisms/human_9606
mkdir organisms/human_9606/rs_fasta
mkdir organisms/human_9606/database
mkdir organisms/human_9606/database/organism_data
mkdir organisms/human_9606/database/organism_data/hg17
mkdir organisms/human_9606/database/schema
# Get data from NCBI
# Shared data includes data dictionary,
# Shared data includes defined types such as validity, class, function, locType
# Actually this is independent of hg17 build and should go in separate makeDoc
cd shared/data
ftp ftp.ncbi.nih.gov
cd snp/database/organism_shared_data
mget *.gz
cd ../schema
ftp ftp.ncbi.nih.gov
cd snp/database/schema/shared_schema
mget *.gz
# using headers of fasta files for molType, class and observed
cd ../organisms/human_9606/rs_fasta
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/rs_fasta
mget *.gz
cd ../database/organism_data/hg17
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/database/organism_data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
get b125_SNPContigLoc_35_1.bcp.gz
# ContigLocusId has function
get b125_SNPContigLocusId_35_1.bcp.gz
get b125_ContigInfo_35_1.bcp.gz
# MapInfo has alignment weights
get b125_SNPMapInfo_35_1.bcp.gz
# SNP has validation status and heterozygosity
get SNP.bcp.gz
# done with FTP
# rename
mv b125_SNPContigLoc_35_1.bcp.gz ContigLoc.gz
mv b125_SNPContigLocusId_35_1.bcp.gz ContigLocusId.gz
mv b125_ContigInfo_35_1.bcp.gz ContigInfo.gz
mv b125_SNPMapInfo_35_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz
# edit table descriptions
cd /cluster/data/dbSnp/125/organisms/human_9606/database/schema
# get CREATE statements from human_9606_table.sql for our 5 tables
# store in table.tmp
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp
# get header lines from rs_fasta
cd /cluster/data/dbSnp/125/organisms/human_9606/rs_fasta
/bin/csh gnl.csh
# load on kkr5u00
ssh kkr5u00
hgsql -e 'create database dbSnpHumanBuild125' mysql
cd /cluster/data/dbSnp/125/organisms/human_9606/database/schema
hgsql dbSnpHumanBuild125 < table.sql
cd ../organism_data/hg17
/bin/csh load.csh
# note rowcount
# ContigLoc 24135144
# SNP 10430754
# MapInfo 10271016
# ContigLocusId 9539145
# create working /scratch dir
cd /scratch/snp
mkdir 125
cd 125
mkdir human
cd human
# get hg17 ctgPos, load into dbSnpHumanBuild125, compare contig list between ctgPos and ContigInfo
# get gnl files
cp /cluster/data/dbSnp/125/organisms/human_9606/rs_fasta/*.gnl .
# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_haplotype"
# filter ContigLoc into ContigLocFilter
# this gets rid of alternate assemblies and poor quality alignments
# uses ContigInfo and MapInfo (weight == 10 || weight == 3)
# assumes all contigs are positively oriented
# will abort if not true
mysql> desc ContigLocFilter;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromName | varchar(32) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | phys_pos_from | int(11) | NO | | | |
# | phys_pos | varchar(32) | YES | | NULL | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter dbSnpHumanBuild125 ref_haplotype
# note rowcount
# ContigLocFilter 10113426
# how many are positive strand? hopefully 90%
mysql> select count(*) from ContigLocFilter where orientation = 0;
# 9161012
# filter ContigLocusId into ContigLocusIdFilter
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter dbSnpHumanBuild125 ref_haplotype
# note rowcount
# ContigLocusIdFilter 5352542
# condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions)
# assumes SNPs are in numerical order
# will errAbort if not true
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense dbSnpHumanBuild125
# note rowcount
# expect about 50% for human
# ContigLocusIdCondense 4129899
# could delete ContigLocusIdFilter table here
# create chrN_snpFasta tables from *.gnl files
# snpLoadFasta.error will report all SNPs with "lengthTooLong"
# here we have 4428 SNPs with lengthTooLong
# these are noted as ObservedNotAvailable
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta dbSnpHumanBuild125
# split ContigLocFilter by chrom (could start using pipeline.csh here)
# pipeline.csh takes about 35 minutes to run
# create the first chrN_snpTmp
# we will reuse this table name, adding/changing columns as we go
# at this point chrN_snpTmp will have the same description as ContigLocFilter
# this opens a file handle for every chrom, so will not scale to scaffold-based assemblies
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom dbSnpHumanBuild125 ref_haplotype
# generate true coords using loc_type
# possible errors logged to snpLocType.error:
# "Missing quotes in phys_pos for range"
# "Chrom end <= chrom start for range"
# "Wrong size for exact"
# "Unknown locType"
# "Unable to get chromEnd"
# We got none of these
# possible exceptions logged to snpLocType.exceptions:
# RefAlleleWrongSize
# this run got just 40
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype dbSnpHumanBuild125 ref_haplotype
# expand allele as necessary
# report syntax errors to snpExpandAllele.errors
# this run had 63 of these
# possible exceptions logged to snpExpandAllele.exceptions:
# RefAlleleWrongSize
# this run has 512
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele dbSnpHumanBuild125 ref_haplotype
# the next few steps prepare for working in UCSC space
# sort by position
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort dbSnpHumanBuild125 ref_haplotype
# get hg17 nib files
# get hg17 chromInfo, load into dbSnpHumanBuild125 with edited path
hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" dbSnpHumanBuild125
# lookup reference allele in nibs
# keep reverse complement to use in error checking (snpCheckAlleles)
# check here for SNPs larger than 1024
# errAbort if detected
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC dbSnpHumanBuild125
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +--------------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# +--------------------+-------------+------+-----+---------+-------+
# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAllele.exceptions
# if SNP is positive strand, expect allele == refUCSC
# log RefAlleleMismatch if not
# if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp
# If allele == refUCSCRevComp, log RefAlleleNotRevComp
# If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
# 0 RefAlleleMismatch
# 49763 RefAlleleNotRevComp
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles dbSnpHumanBuild125
# add class, observed and molType from chrN_snpFasta tables
# log errors to snpReadFasta.errors
# errors detected: no data available, duplicate data
# This run we got:
# 49 no data available
# 226048 duplicate
# chrN_snpFasta has class = 'in-del'
# we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpReadFasta dbSnpHumanBuild125
# morph chrN_snpTmp
# +--------------------+---------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+---------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | class | varchar(255) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | molType | varchar(255) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# | observed | blob | YES | | NULL | |
# +--------------------+---------------+------+-----+---------+-------+
# generate exceptions for class and observed
# SingleClassBetweenLocType
# SingleClassRangeLocType
# NamedClassWrongLocType
# ObservedNotAvailable
# ObservedWrongFormat
# ObservedWrongSize
# ObservedMismatch
# RangeSubstitutionLocTypeExactMatch
# SingleClassTriAllelic
# SingleClassQuadAllelic
# This will also detect IUPAC symbols in allele
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved dbSnpHumanBuild125
# add function
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction dbSnpHumanBuild125
# add validation status and heterozygosity
# log error if validation status > 31 or missing
# this run we got 8 missing
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP dbSnpHumanBuild125
# generate chrN_snp125 and snp125Exceptions tables
cp snpCheckAlleles.exceptions snpCheckAlleles.tab
cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
cp snpExpandAllele.exceptions snpExpandAllele.tab
cp snpLocType.exceptions snpLocType.tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable dbSnpHumanBuild125
# PAR SNPs
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpPAR dbSnpHumanBuild125
hgsql -e 'load data local infile "snpPARexceptions.tab" into table snp125Exceptions' dbSnpHumanBuild125
# concat into snp125.tab
# cat chr*_snp125.tab >> snp125.tab
/bin/sh concat.sh
# load
hgsql dbSnpHumanBuild125 < /cluster/home/heather/kent/src/hg/lib/snp125.sql
hgsql -e 'load data local infile "snp125.tab" into table snp125' dbSnpHumanBuild125
# check for multiple alignments
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple dbSnpHumanBuild125
mysql> load data local infile 'snpMultiple.tab' into table snp125Exceptions;
# run and review snpCompareLoctype (currently tuned for 124/125 differences)
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCompareLoctype dbSnpHumanBuild125 snp124subset snp125
# cat snpCompareLoctypeCounts.out
# exactToExact = 8310192
# exactToBetween = 107956
# exactToRange = 16200
# betweenToBetween = 206224
# betweenToExact = 4012
# betweenToRange = 715
# rangeToRange = 98648
# rangeToBetween = 3151
# rangeToExact = 6198
# oldToNew = 10224
# 12043 coord changes in exact (.1%)
# 1370 moved to different chroms
# 3664 coord changes in between (1.7%)
# 2260 off-by-one
# 13 moved to different chroms
# 22198 coord changes in range (22.5%)
# 19548 look like fixes: observedLengthOld != coordSpanOld && observedLengthNew == coordSpanNew
# 1296 look like errors: observedLengthOld == coordSpanOld && observedLengthNew != coordSpanNew
# load on hgwdev
cp snp125.tab /cluster/home/heather/transfer/snp
hgsql dbSnpHumanBuild125 -e 'select * from snp125Exceptions' > /cluster/home/heather/transfer/snp/snp125Exceptions.tab
ssh hgwdev
mysql> load data local infile 'snp125.tab' into table snp125;
# create indexes
mysql> alter table snp125 add index name (name);
mysql> alter table snp125 add index chrom (chrom, bin);
mysql> load data local infile 'snp125Exceptions.tab' into table snp125Exceptions;
mysql> alter table snp125Exceptions add index name(name);
# create snp125ExceptionDesc table
cd /cluster/data/dbSnp
# add counts to exception.template
hgsql hg17 < snp125ExceptionDesc.sql
mysql> load data local file 'exception.template' into table snp125ExceptionDesc;
#######
# Add new case for ObservedWrongSize (Heather June 9, 2006)
# revisions 1.25 and 1.26 kent/src/hg/snp/snpLoad/snpCheckClassAndObserved.c
ssh kkr5u00
cd /scratch/snp/125/human
/bin/csh pipeline.csh
# wait 35 minutes
grep ObservedWrongSize snpCheckClassAndObserved.exceptions > ObservedWrongSize
grep ObservedWrongSize snpPARexceptions.tab >> ObservedWrongSize
cp ObservedWrongSize /cluster/home/heather/transfer/snp
ssh hgwdev
hgsql -e 'alter table snp125Exceptions drop index name' hg17
hgsql -e 'load data local infile "/cluster/home/heather/transfer/snp/ObservedWrongSize" into table snp125Exceptions' hg17
hgsql -e 'alter table snp125Exceptions add index name (name)' hg17
# fix counts
hgsql -e 'select count(*), exception from snp125Exceptions group by exception' hg17
+----------+------------------------------------+
| count(*) | exception |
+----------+------------------------------------+
| 785903 | MultipleAlignments |
| 623 | NamedClassWrongLocType |
| 7686 | ObservedMismatch |
| 4333 | ObservedNotAvailable |
| 97 | ObservedWrongFormat |
| 73558 | ObservedWrongSize |
| 466 | RangeSubstitutionLocTypeExactMatch |
| 62 | RefAlleleMismatch |
| 99849 | RefAlleleNotRevComp |
| 1278 | RefAlleleWrongSize |
| 20749 | SingleClassBetweenLocType |
| 2306 | SingleClassQuadAllelic |
| 15639 | SingleClassRangeLocType |
| 19330 | SingleClassTriAllelic |
+----------+------------------------------------+
# edit /cluster/data/dbSNP/exception.template (need to automate this)
hgsql -e 'delete from snp125ExceptionDesc' hg17
hgsql -e 'load data local infile "/cluster/data/dbSNP/exception.template" into table snp125ExceptionDesc' hg17
###########################
# add rs_fasta to seq/extFile (Heather Nov 2006)
# use 126 rs_fasta files because I didn't save 125 version
ssh hgwdev
mkdir /gbdb/hg17/snp
ln -s /cluster/store12/snp/126/human/rs_fasta/snp.fa /gbdb/hg17/snp/snp.fa
cd /cluster/store12/snp/126/human/rs_fasta
hgLoadSeq hg17 /gbdb/hg17/snp/snp.fa
# clean up after hgLoadSeq
rm seq.tab
# look up id in extFile
# move into separate table
hgsql hg17 < snpSeq.sql
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 33852294' hg17
hgsql -e 'delete from seq where extFile = 33852294' hg17
hgsql -e 'alter table snpSeq add index acc (acc)' hg17
#############################################################
# Get panTro2 and rheMac2 allele for all SNPs (Heather, Dec 2006, Feb 2007 and
# June 2007 [partial fix released 6/25/07: using hg17 instead of hg18 liftOver
# files... for most but not all chroms! :( not documented below; error found
# by user]
# 1/11/08 (angie): re-running panTro2Qual and subsequent chimp & summary
# steps, so hg17 liftOver files will have been used for all outputs.
# Deletions will probably lift okay
# The insertions have start == end so none of them will lift
# 1/24/08 (angie): constant quality score of 98 for chimp chr{21,M,Y,Y_random}
# was previously put in score field -- corrected to orthoScore.
ssh hgwdev
cd /san/sanvol1/snp/liftOver/hg17
mkdir panTro2All
mkdir rheMac2All
mkdir input
cd input
hgsql -N -e 'select chrom, chromStart, chromEnd, name, score, strand from snp125' hg17 > snp125.bed
lineFileSplit snp125.bed lines 100000 snp-
ln -s /san/sanvol1/snp/liftOver/hg17/input /san/sanvol1/snp/liftOver/hg17/panTro2All/input
ln -s /san/sanvol1/snp/liftOver/hg17/input /san/sanvol1/snp/liftOver/hg17/rheMac2All/input
cd ../panTro2All
./makeJobList.csh
mkdir output
mkdir unmapped
cd ../rheMac2All
./makeJobList.csh
mkdir output
mkdir unmapped
# cluster run
ssh pk
cd /san/sanvol1/snp/liftOver/hg17/panTro2All
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs: 67758s 1129.29m 18.82h 0.78d 0.002 y
# IO & Wait Time: 961s 16.02m 0.27h 0.01d 0.000 y
# Average job time: 636s 10.60m 0.18h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1543s 25.72m 0.43h 0.02d
# Submission to last job: 61513s 1025.22m 17.09h 0.71d
cd /san/sanvol1/snp/liftOver/hg17/rheMac2All
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs: 1833s 30.56m 0.51h 0.02d 0.000 y
# IO & Wait Time: 1744s 29.06m 0.48h 0.02d 0.000 y
# Average job time: 33s 0.55m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 82s 1.37m 0.02h 0.00d
# Submission to last job: 59987s 999.78m 16.66h 0.69d
# add sequence
# next time do this at the same time as lift
cd /san/sanvol1/snp/liftOver/hg17
mkdir panTro2Seq
mkdir panTro2Seq/input
mkdir panTro2Seq/output
cp panTro2All/output/snp*out panTro2Seq/input
cd panTro2Seq
cat << 'EOF' > makeJobList.csh
#!/bin/tcsh
rm -f jobList
foreach fileName (`ls input/*`)
set baseName = $fileName:t
echo $baseName
echo "/cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq $fileName /scratch/hg/panTro2/panTro2.2bit output/$baseName" >> jobList
end
'EOF'
chmod +x makeJobList.csh
./makeJobList.csh
cd /san/sanvol1/snp/liftOver/hg17
mkdir rheMac2Seq
mkdir rheMac2Seq/input
mkdir rheMac2Seq/output
cp rheMac2All/output/snp*out rheMac2Seq/input
cd rheMac2Seq
cat << 'EOF' > makeJobList.csh
#!/bin/tcsh
rm -f jobList
foreach fileName (`ls input/*`)
set baseName = $fileName:t
echo $baseName
echo "/cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq $fileName /scratch/hg/rheMac2/rheMac2.2bit output/$baseName" >> jobList
end
'EOF'
chmod +x makeJobList.csh
./makeJobList.csh
# cluster run for sequence
ssh pk
cd /san/sanvol1/snp/liftOver/hg17/panTro2Seq
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs: 30509s 508.48m 8.47h 0.35d 0.001 y
# IO & Wait Time: 325s 5.42m 0.09h 0.00d 0.000 y
# Average job time: 286s 4.76m 0.08h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 551s 9.18m 0.15h 0.01d
# Submission to last job: 1195s 19.92m 0.33h 0.01d
cd /san/sanvol1/snp/liftOver/hg17/rheMac2Seq
para create jobList; para try; para check; para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs: 28517s 475.28m 7.92h 0.33d 0.001 y
# IO & Wait Time: 576s 9.61m 0.16h 0.01d 0.000 y
# Average job time: 269s 4.49m 0.07h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 509s 8.48m 0.14h 0.01d
# Submission to last job: 1166s 19.43m 0.32h 0.01d
# quality scores
# This takes about 24 hours for each species!! Ugh.
# Solution is to use -bedFile argument to hgWiggle
ssh hgwdev
cd /san/sanvol1/snp/liftOver/hg17
cd panTro2Seq/output
cat << 'EOF' > concat.csh
#!/bin/tcsh
rm -f all.out
foreach fileName (`ls snp*.out`)
cat $fileName >> all.out
end
'EOF'
chmod +x concat.csh
./concat.csh
sort all.out > all.sort
rm all.out
cd /san/sanvol1/snp/liftOver/hg17
mkdir panTro2Qual
cp panTro2Seq/output/all.sort panTro2Qual
cd panTro2Qual
mkdir input
splitFileByColumn all.sort input
mkdir output
# If we do this again, we should write a c program to read qac files into
# memory -- much faster than one hgWiggle process per line.
cat << 'EOF' > addQual.pl
#!/usr/bin/perl -W
$db=shift;
$chromName=shift;
while (<STDIN>)
{
my @fields = split;
my $chrom = $fields[0];
my $chromStart = $fields[1];
my $chromEnd = $fields[2];
my $name = $fields[3];
my $strand = $fields[5];
my $allele = $fields[6];
$cmd="hgWiggle -db=$db -chrom=$chromName -position=$chrom:$chromStart-$chromStart -rawDataOut quality";
open(RESULT, "$cmd |") or die "can't start '$cmd'\n";
while ($line = <RESULT>)
{
$score = int($line);
print "$chrom\t$chromStart\t$chromEnd\t$name\t$score\t$strand\t$allele\n";
}
}
'EOF'
cat << 'EOF' > getQual.csh
#!/bin/tcsh
foreach fileName (`ls input/*`)
set chromName = $fileName:t:r
echo $chromName
./addQual.pl panTro2 $chromName < $fileName > output/$chromName
end
'EOF'
# << emacs
chmod +x addQual.pl getQual.csh
./getQual.csh
cd /san/sanvol1/snp/liftOver/hg17/rheMac2Seq/output
cp ../../panTro2Seq/output/concat.csh .
./concat.csh
sort all.out > all.sort
rm all.out
cd /san/sanvol1/snp/liftOver/hg17
mkdir rheMac2Qual
cp rheMac2Seq/output/all.sort rheMac2Qual
cd rheMac2Qual
mkdir input
splitFileByColumn all.sort input
mkdir output
# reuse the panTro2 scripts, switching the database argument to rheMac2
sed -e 's/panTro2/rheMac2/g' ../panTro2Qual/getQual.csh > getQual.csh
cp ../panTro2Qual/addQual.pl .
chmod +x getQual.csh addQual.pl
./getQual.csh
# concatenate, merge and load
# chimp has no qual scores for chr21, chrY and chrM, just use seq files
cd /san/sanvol1/snp/liftOver/hg17/panTro2Qual/output
grep chr21 ../../panTro2Seq/output/all.sort > chr21
grep chrY ../../panTro2Seq/output/all.sort | grep -v random > chrY
grep chrY ../../panTro2Seq/output/all.sort | grep random > chrY_random
grep chrM ../../panTro2Seq/output/all.sort > chrM
#-----------------------------------------------------------------------------
# 1/11/08: replace outputs for chroms that apparently were skipped in the June
# run, and re-run subsequent steps for chimp.
cd /san/sanvol1/snp/liftOver/hg17/panTro2Qual
mv output output-jun25
foreach f (output-jun25/chr*)
if ( "X"`cmp $f output-feb26/$f:t` == "X" ) then
echo $f:t
endif
end
#chr21
#chrM
#chrY
#chrY_random
# <<-- those are the ones that may not have actually been regenerated.
# It appears that the Feb. outputs, instead of the June Seq files, were copied
# to the June output for those chroms. oops!
# As a minor improvement, skip duplicate rows instead of just copying.
foreach chr (chr21 chrM chrY chrY_random)
echo $chr
uniq input/$chr.sort > output/$chr
end
# << emacs
mkdir output-jun25-incorrect
mv output-jun25/chr{21,M,Y,Y_random} output-jun25-incorrect
cat output-jun25/chr* output/chr* > output/qual.tab
# end 1/11/08 fix-specific; proceeding to post-concat.csh chimp steps.
#-----------------------------------------------------------------------------
./concat.csh
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrtho hg17 snp125 qual.tab
hgLoadBed hg17 snp125OrthoPanTro2 snpOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/snp125OrthoPanTro2.sql
#Loaded 9591230 elements of size 17
# previously 9590961
# add index
hgsql hg17
alter table snp125OrthoPanTro2 add index name (name);
alter table snp125OrthoPanTro2 add index chrom (chrom, bin);
# 1/24/08: these used to set score; should have set orthoScore all along.
# tweak to match panTro2 assembly
update snp125OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chr21";
#Query OK, 129170 rows affected (25.37 sec)
#Rows matched: 129170 Changed: 129170 Warnings: 0
update snp125OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY";
#Query OK, 22081 rows affected (25.16 sec)
#Rows matched: 22081 Changed: 22081 Warnings: 0
update snp125OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY_random";
#Query OK, 155 rows affected (25.41 sec)
#Rows matched: 155 Changed: 155 Warnings: 0
# macaque
cd /san/sanvol1/snp/liftOver/hg17/rheMac2Qual/output
./concat.csh
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrtho hg17 snp125 qual.tab
hgLoadBed hg17 snp125OrthoRheMac2 snpOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/snp125OrthoRheMac2.sql
# add index
alter table snp125OrthoRheMac2 add index name (name);
alter table snp125OrthoRheMac2 add index chrom (chrom, bin);
# get hapmap subset for chimp
# skip if lift wasn't size 1
# this run 124822 skipped
cd /cluster/data/hg17/bed/hapmap/rel21a
time /cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg17 \
hapmapSnpsCombined snp125OrthoPanTro2
#108.505u 16.869s 2:26.22 85.7% 0+0k 0+0io 4pf+0w
hgLoadBed hg17 hapmapAllelesChimp hapmapOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapAllelesChimp.sql
#Loaded 3930564 elements of size 13
hgsql hg17 -e 'alter table hapmapAllelesChimp add index name(name); \
alter table hapmapAllelesChimp add index chrom (chrom, bin);'
# get hapmap subset for macaque
# this run 106607 skipped
/cluster/home/heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg17 hapmapSnpsCombined snp125OrthoRheMac2
hgLoadBed hg17 hapmapAllelesMacaque hapmapOrtho.tab -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/hapmapAllelesMacaque.sql
rm hapmapOrtho.tab
rm hapmapOrtho.err
rm bed.tab
hgsql hg17 -e 'alter table hapmapAllelesMacaque add index name(name); \
alter table hapmapAllelesMacaque add index chrom (chrom, bin);'
##############################################################################
# HapMap Recombination Rate Phase 2 (Heather Feb. 2006)
# Contacts:
# Gil McVean [mcvean@stats.ox.ac.uk]
# Colin Freeman [cfreeman@stats.ox.ac.uk]
# Simon Myers [smyers@broad.mit.edu]
# Data is missing chromEnd. I am setting chromEnd = chromStart + 1 as a
# kludge for now.
# Solution is to interpolate range but remove gaps.
## ****************************************
# This is a bad assumption about the data format -- here is a description.
## ****************************************
# The recombination rates are for the regions _between_ snps, so these
# files need to be processed slightly differently. For each line i in
# the file (except the header and the last line), the recombination
# rate is for the position on the current line minus 1 [pos(${i})-1] to
# the position on the subsequent line [pos({$i+1})]. The precision is
# a bit obnoxious and can be truncated to 3 or 4 significant figures.
# (Note that the recombination rate on the last line is 0, as this is a
# placeholder.) Here is an example:
#
# > head genetic_map_chr1.txt
# position COMBINED_rate(cM/Mb) Genetic_Map(cM)
# 45413 2.98182170902573 0
# 72434 2.08241435350679 0.0805718043995841
# 78032 2.08135840137317 0.0922291599505152
# 244859 2.88844902005393 0.439455937976397
# 604461 2.88749757426825 1.47814798248583
# 604484 2.88586385769306 1.47821439493004
# 605296 2.88389196108775 1.48055771638249
#
### BED format (like a bedGraph)
# chr1 45412 72434 2.982
# chr1 72433 78032 2.082
# chr1 78031 244859 2.081
# chr1 244858 604461 2.888
# chr1 604460 604484 2.887
# chr1 604483 605296 2.886
# chr1 605295 ..... 2.884
#
# See /cluster/data/hg16/bed/hapmap/recombination/Perlegen/makeBed.pl for an example. /cluster/data/hg16/bed/hapmap/recombination/Perlegen/cmds.csh is also useful.
## ****************************************
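# A minimal awk sketch of that interval scheme (illustrative only; the
# makeBed.pl used below applies the chromStart+1 kludge instead):
#   awk -v chrom=chr1 'NR > 1 { if (prev != "") \
#       printf "%s\t%d\t%d\t%.3f\n", chrom, prev - 1, $1, int(rate * 1000) / 1000; \
#     prev = $1; rate = $2 }' genetic_map_chr1.txt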
ssh hgwdev
cd /cluster/data/hg17/bed/hapmap
mkdir recombination
cd recombination
mkdir phase2
cd phase2
wget --no-check-certificate -N https://mathgen.stats.ox.ac.uk/HapMap_Phase2_rates_hotspots/HapMap_Phase2_rates_hotspots.tgz
# data also available at
# http://www.hapmap.org/downloads/recombination/2006-10_rel21_phaseII
gunzip *.tgz
tar xvf *.tar
cat << 'EOF' > makeBed.csh
#!/bin/tcsh
rm -f recomb.bed
foreach chrom (`cat chrom.list`)
echo $chrom
set fileName=`echo $chrom | awk '{printf "genetic_map_%s.txt", $1}'`
makeBed.pl $chrom < $fileName >> recomb.bed
end
makeBed.pl chrX < genetic_map_chrX_par1.txt >> recomb.bed
makeBed.pl chrX < genetic_map_chrX_non-par.txt >> recomb.bed
makeBed.pl chrX < genetic_map_chrX_par2.txt >> recomb.bed
'EOF'
cat << 'EOF' > makeBed.pl
#!/usr/bin/env perl
$chromName = shift;
while (<STDIN>) {
my @fields = split;
# skip header
if ($fields[0] eq "position") { next; }
print $chromName;
print "\t";
print $fields[0];
print "\t";
print $fields[0] + 1;
print "\t";
my $val1000 = $fields[1] * 1000;
my $valRound = int($val1000);
my $newVal = $valRound / 1000.0;
print $newVal;
print "\n";
}
'EOF'
./makeBed.csh
hgLoadBed hg17 snpRecombRateHapmapPhase2 recomb.bed -tab -bedGraph=4
hgsql -e 'alter table snpRecombRateHapmapPhase2 add index chrom (chrom, bin)' hg17
############
# UPDATE hg17 knownToVisiGene (2006-04-05 galt)
# Create table that maps between known genes and visiGene database
# mapping to other species such as mouse, zebrafish, frog
# requires visiGene probe track vgImageProbes be created first
knownToVisiGene hg17 -fromProbePsl=vgImageProbes
#############################################################
# ADD A NEW TRACK GROUP (DONE, 6/3/06, Fan)
# Create a new track group, "phenDis".
echo 'INSERT INTO grp (name, label, priority) VALUES ("phenDis", "Phenotype and Disease Associations", 2.5)' \
| hgsql hg17
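# optional sanity check that the new group is in place:
#   hgsql hg17 -e 'select name,label,priority from grp order by priority'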
#############################################################
# hgMut - Human Mutation track - Belinda Giardine
# for the list of tables, run: show tables like 'hgMut%'
# summary of current load June 7, 2006
#table definitions for autoSql
autoSql hgMut.as hgMut -dbLink
#move bin in struct so works as bed 4+
#hgMut.sql: change INDEXes as needed, put in enums
#shrink mutId to 64 chars, plus acc to 48
#data files and details under ~giardine/humPhen/
cd humPhen/hgMutData/April2006/
cat hgMutHbVar.txt hgMutPah.txt hgMutBgmut.txt hgMutCftr.txt hgMutARdb.txt > hgMutUnsorted.txt
grep "^chr" hgMutUnsorted.txt | sort -k1,1 -k2,2n > hgMut.bed
#create tables
hgsql hg17 < ../../hgMut.sql
#loading
hgLoadBed hg17 hgMut hgMut.bed -noSort -oldTable -tab
#load small vocab control tables
hgsql hg17 < hgMutLink.sql
hgsql hg17 < hgMutAttrClass.sql
hgsql hg17 < hgMutAttrName.sql
hgsql hg17 < hgMutSrc.sql
#the following load statements are run from within hgsql hg17:
load data local infile "hgMutExtLinkHbVar.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkARdb.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkBgmut.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkCFTR.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkPah.txt" into table hgMutExtLink;
load data local infile "hgMutExtLinkSP.txt" into table hgMutExtLink;
load data local infile "hgMutAttrHbVar2.txt" into table hgMutAttr;
load data local infile "hgMutAttrHbvarProt2.txt" into table hgMutAttr;
load data local infile "hgMutAttrARdb.txt" into table hgMutAttr;
load data local infile "hgMutAttrARdbProt.txt" into table hgMutAttr;
load data local infile "hgMutAliasHbVar.txt" into table hgMutAlias;
load data local infile "hgMutAliasARdb.txt" into table hgMutAlias;
load data local infile "hgMutAliasBgmut.txt" into table hgMutAlias;
load data local infile "hgMutAliasPah.txt" into table hgMutAlias;
load data local infile "hgMutExtLinkHbVarOmim.txt" into table hgMutExtLink;
load data local infile "hgMutAttrLink.txt" into table hgMutAttrLink;
load data local infile "hgMutAttrSP.txt" into table hgMutAttr;
#############################################################
# gv* Belinda Giardine
# These tables are to replace the hgMut tables
# Most data is converted by me (on PSU machines) to loadable format and copied.
# The Swiss-Prot/UniProt data is generated from the UniProt database at UCSC,
# using perl scripts and table dumps.
# scripts in kent/src/hg/utils/gvParsers/swissProt/
# everything redone to not depend on the dv track in July 2006
#make list of variants from Swiss-Prot (make sure featureClass 23 is variant)
hgsql -N uniProt > spVars.txt <<end
select feature.acc, start, end-start, featureType.val, featureId.val from
feature, featureType, accToTaxon, featureId where featureClass=23 and
featureType=featureType.id and accToTaxon.acc=feature.acc and taxon=9606 and
feature.featureId=featureId.id;
end
#need list mapping 3 letter amino acids to 1 letter. (aminoInfoDump from PSU)
#known gene protein map (kgProtMap) has psl data from (blastp)
# with qName being the spId
hgsql -N hg17 > kgProtMapDump.txt <<end
select kgProtMap.* from kgProtMap, uniProt.feature where kgProtMap.qName =
uniProt.feature.acc;
end
#table join duplicates; perl script to throw out extra before use
uniqueRows < kgProtMapDump.txt > kgProtMapUniq.txt
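# uniqueRows is a PSU-side script; an equivalent generic dedup that keeps
# the first occurrence of each row would be:
#   awk '!seen[$0]++' kgProtMapDump.txt > kgProtMapUniq.txt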
#check variables for output and input file names
computeSpVars > errors.txt
#errors.txt will list variants that couldn't be mapped
#July 18, 2006
#37 gaps, 564 proteins (2228 variants) not in kgProtMap (test one did align)
#found 22389
#Swiss-Prot attributes:
hgsql hg17 < listSPconnections.sql > listSPconnections.txt
hgsql proteome < listSpXref2.sql > listSpXref2.txt
convertOmimTitle > gvLinkSPomim.txt
hgsql hg17 < listGeneVals.sql > listGeneVals.txt
convertDisPoly > gvLinkSPuni.txt
cat gvLinkSPuni.txt gvLinkSPomim.txt > gvLinkSp.txt
cp gvLinkSp.txt ../../../gv/gvData/
#creating gv* tables and loading
#June 27, 2006
autoSql gv.as gv -dbLink
#edit indexes and string lengths in .sql file:
#  id=48, srcId=48, raKey=48, attrType=48
#  primary key=index on bin and attr and link ids (id, attrType for attrs)
#do enums
#add unique index, to prevent doubles:
#  UNIQUE KEY (chrom(12), chromStart, chromEnd, name)
#added id field to gvPos struct so can keep ID when change name
# char *id; /* Added field to hold ID if change name */
#set to null in gv.c file
#reload data July 2006 with more data and corrected Swiss-Prot data
#also moved gv*(except gvPos) and omimTitle to hgFixed
#prep data: concatenate all the gvPos data, sort
cat gvPosSP.txt gvPosHbVar.txt gvPosARdb.txt gvPosBgmut.txt gvPosCftr.txt \
gvPosPah.txt gvPosSrd5a2.txt gvPosBrca.txt > gvPosAll.txt
grep "^chr" gvPosAll.txt | sort -k1,1 -k2,2n > gvPosSortedHg17.bed
#load tables
hgLoadBed hg17 gvPos gvPosSortedHg17.bed -noSort -oldTable -tab
hgsql hg17 < gvSrc.sql
hgsql hg17
load data local infile "gvBrca.txt" into table gv;
load data local infile "gvAttrBrca.txt" into table gvAttr;
load data local infile "gvLinkBrca.txt" into table gvLink;
load data local infile "gvLinkSP.txt" into table gvLink;
load data local infile "gvLinkSPgene.txt" into table gvLink;
load data local infile "gvSP.txt" into table gv;
load data local infile "gvAttrSP.txt" into table gvAttr;
load data local infile "gvAttrLongSP.txt" into table gvAttrLong;
load data local infile "gvLinkHbVar.txt" into table gvLink;
load data local infile "gvHbVar.txt" into table gv;
load data local infile "gvAttrHbVar.txt" into table gvAttr;
load data local infile "gvARdb.txt" into table gv;
load data local infile "gvAttrARdb.txt" into table gvAttr;
load data local infile "gvBgmut.txt" into table gv;
load data local infile "gvAttrBgmut.txt" into table gvAttr;
load data local infile "gvAttrLongBgmut.txt" into table gvAttrLong;
load data local infile "gvLinkBgmut.txt" into table gvLink;
load data local infile "gvCftr.txt" into table gv;
load data local infile "gvAttrCftr.txt" into table gvAttr;
load data local infile "gvPah.txt" into table gv;
load data local infile "gvAttrPah.txt" into table gvAttr;
load data local infile "gvLinkPah.txt" into table gvLink;
load data local infile "gvSrd5a2.txt" into table gv;
load data local infile "gvAttrSrd5a2.txt" into table gvAttr;
load data local infile "gvAttrConservedDisease.txt" into table gvAttr;
#get disease association predictions for conserved variants
#get list a variants that are already done
hgsql -N hg17 > gvWithDiseaseStatus.txt <<end
select id from gvAttr where attrType = 'disease';
end
#use table browser to get variants that intersect most conserved track
#set conserved variants that are null to likely
computeDiseaseAssocCons > gvAttrConservedDisease.txt
#Belinda Giardine Sept 2006
#reload tables, removed ones with sequence mismatches, added label and strand
#added new lsbd BTKbase
#Sequence mismatches were determined by using the position in the reference
#sequence to fetch the sequence affected by the variant. Then for substitutions
#and deletions with the nts deleted listed, the sequence was compared.
#Insertions and large deletions could not be checked.
#Belinda Giardine Dec 2006
#reload tables, additions to previous sources and more IDbases
#details in hg18 doc
#Belinda Giardine Jan 2007
#reload tables, additions and corrections, details in hg18 doc
#############################################################
# Illumina Hap300 (Heather, July 2006)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir illumina
cd illumina
trim.pl < Illumina_HumanHap300_SNPlist_01.13.2006.txt > trim.out
hgsql hg17 < illuminaTmp.sql
hgsql -e "load data local infile 'trim.out' into table illuminaTmp" hg17
# illuminaLookup generates bin
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg17 illuminaTmp snp125 snp125Exceptions illuminaLookup.out illuminaLookup.err
# errors:
# unexpected chrom chr1 for snp rs1291584
# unexpected chrom chr17 for snp rs3826555
# unexpected locType between for snp rs2036773
# unexpected locType between for snp rs2249255
# unexpected locType between for snp rs8051412
# unexpected locType between for snp rs1017238
# unexpected locType between for snp rs5019493
# 16 with locType = range*
# 402 not found!
# None that have multiple alignments.
hgsql hg17 < snpArrayIllumina300.sql
hgsql -e "load data local infile 'illuminaLookup.out' into table snpArrayIllumina300" hg17
hgsql -e "alter table snpArrayIllumina300 add index name (name)" hg17
hgsql -e "alter table snpArrayIllumina300 add index chrom (chrom, bin)" hg17
#############################################################
# Illumina Hap550 and Hap650 (Heather, April 2007)
# Transfer from hg18 for Bert Gold at NCI
ssh hgwdev
cd /cluster/data/hg17/bed/illumina
hgsql hg18 < getHg18-550.sql > 550.hg18
hgsql hg18 < getHg18-650.sql > 650.hg18
# get name, chrom, chromStart, chromEnd, strand observed from snp125
# where class = "single" and locType = "exact" and chromEnd = chromStart + 1
# Including tri/quad allelic and multiple-aligning for now
hgsql hg17 < getHg17.sql > snp125single.hg17
# sort and join
sort 550.hg18 > 550.hg18.sort
sort 650.hg18 > 650.hg18.sort
sort snp125single.hg17 > snp125single.hg17.sort
join 550.hg18.sort snp125single.hg17.sort > 550.join
join 650.hg18.sort snp125single.hg17.sort > 650.join
join -v 1 550.hg18.sort snp125single.hg17.sort > 550.missing
join -v 1 650.hg18.sort snp125single.hg17.sort > 650.missing
# join matches on the first (sorted) field, the rs ID; -v 1 reports hg18
# array SNPs with no qualifying hg17 snp125 match
# 560704 lines in 550.join
# 660137 lines in 650.join
# 687 lines in 550.missing
# 706 lines in 650.missing
# fix column order
awk '{print $2, $3, $4, $1, 0, $5, $6}' 550.join > 550.bed
awk '{print $2, $3, $4, $1, 0, $5, $6}' 650.join > 650.bed
# load
hgLoadBed hg17 snpArrayIllumina550 550.bed -sqlTable=snpArrayIllumina550.sql
hgLoadBed hg17 snpArrayIllumina650 650.bed -sqlTable=snpArrayIllumina650.sql
# indices
mysql> alter table snpArrayIllumina550 add index name (name);
mysql> alter table snpArrayIllumina550 add index chrom (chrom, bin);
mysql> alter table snpArrayIllumina650 add index name (name);
mysql> alter table snpArrayIllumina650 add index chrom (chrom, bin);
#############################################################
# Affy 500K (Heather, September 2006)
# look up rsId using position
ssh hgwdev
cd /cluster/data/hg17/bed/snp/affyData/500K
# awk to create bed format from tsv files
/bin/csh cmds.csh
hgsql hg17 < affy250Nsp.sql
hgsql hg17 < affy250Sty.sql
hgsql -e "load data local infile 'Mapping250K_Nsp.bed' into table affy250Nsp" hg17
hgsql -e "load data local infile 'Mapping250K_Sty.bed' into table affy250Sty" hg17
# look up dbSNP rsIDs using position
# affy250Nsp
# 4311 missing, 7276 multiple
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy250Nsp snp125
mv affyLookup.out affy250Nsp.bed
mv affyLookup.err affy250Nsp.err
hgsql hg17 < snpArrayAffy250Nsp.sql
hgLoadBed hg17 snpArrayAffy250Nsp affy250Nsp.bed -sqlTable=snpArrayAffy250Nsp.sql -tab
hgsql -e "alter table snpArrayAffy250Nsp add index name (name)" hg17
hgsql -e "alter table snpArrayAffy250Nsp add index chrom (chrom, bin)" hg17
# affy250Sty
# 3540 missing, 6901 multiple
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy250Sty snp125
mv affyLookup.out affy250Sty.bed
mv affyLookup.err affy250Sty.err
hgsql hg17 < snpArrayAffy250Sty.sql
hgLoadBed hg17 snpArrayAffy250Sty affy250Sty.bed -sqlTable=snpArrayAffy250Sty.sql -tab
hgsql -e "alter table snpArrayAffy250Sty add index name (name)" hg17
hgsql -e "alter table snpArrayAffy250Sty add index chrom (chrom, bin)" hg17
#############################################################
# Affy 10K (Sept. 2006, Heather)
# look up rsId using position
ssh hgwdev
cd /cluster/data/hg17/bed/snp/affyData/10K100Kagain
# affy10
# 14 missing, 807 multiple
cp affy10K.txt affy10Temp.bed
hgLoadBed hg17 affy10Temp affy10Temp.bed -sqlTable=affy10Temp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy10Temp snp125
mv affyLookup.out affy10.bed
mv affyLookup.err affy10.err
hgLoadBed hg17 snpArrayAffy10 affy10.bed -sqlTable=snpArrayAffy10.sql -tab
# affy10v2
# 12 missing, 716 multiple
hgLoadBed hg17 affy10v2Temp affy10v2Temp.bed -sqlTable=affy10v2Temp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy10v2Temp snp125
mv affyLookup.out affy10v2.bed
mv affyLookup.err affy10v2.err
hgLoadBed hg17 snpArrayAffy10v2 affy10v2.bed -sqlTable=snpArrayAffy10v2.sql -tab
# affy50HindIII
# 156 missing, 1396 multiple
hgLoadBed hg17 affy50HindIIITemp affy50HindIII.bed -sqlTable=affy50HindIIITemp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy50HindIIITemp snp125
mv affyLookup.out affy50HindIII.bed
mv affyLookup.err affy50HindIII.err
hgLoadBed hg17 snpArrayAffy50HindIII affy50HindIII.bed -sqlTable=snpArrayAffy50HindIII.sql -tab
hgsql -e "alter table snpArrayAffy50HindIII add index name (name)" hg17
hgsql -e "alter table snpArrayAffy50HindIII add index chrom (chrom, bin)" hg17
# affy50XbaI
# 115 missing, 1745 multiple
hgLoadBed hg17 affy50XbaITemp affy50XbaI.bed -sqlTable=affy50XbaITemp.sql -tab -noBin
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg17 affy50XbaITemp snp125
mv affyLookup.out affy50XbaI.bed
mv affyLookup.err affy50XbaI.err
hgLoadBed hg17 snpArrayAffy50XbaI affy50XbaI.bed -sqlTable=snpArrayAffy50XbaI.sql -tab
hgsql -e "alter table snpArrayAffy50XbaI add index name (name)" hg17
hgsql -e "alter table snpArrayAffy50XbaI add index chrom (chrom, bin)" hg17
#########################################################################
# REGULATORY POTENTIAL (DONE - 2006-06-14 - Hiram)
# download data from "James Taylor" <james@bx.psu.edu>
ssh kkstore02
mkdir /cluster/store11/hg17/bed/regPotential7X
cd /cluster/data/hg17/bed
ln -s /cluster/store11/hg17/bed/regPotential7X ./regPotential7X
cd regPotential7X
# This is a lot of data
time for C in 1 2 3 4 5 6 7 8 9 X Y 10 11 12 13 14 15 16 17 18 19 20 21 22
do
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg17/chr${C}.scores.truncated.bz2"
done
# real 115m1.855s
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg17/trackDb.html" -O description.html
time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
bzcat chr${C}.scores.truncated.bz2
done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib
# Converted stdin, upper limit 1.00, lower limit -0.00
# real 33m48.487s
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential7X
ln -s /cluster/data/hg17/bed/regPotential7X/regPotential7X.wib \
/gbdb/hg17/wib/regPotential7X.wib
# using the tmpDir is faster since it is on local disk and it will
# clean up any temporary .tab file it creates there
time hgLoadWiggle -tmpDir=/scratch/tmp \
hg17 regPotential7X regPotential7X.wig
# How about a histogram of the data.
ssh kolossus
cd /cluster/data/hg17/bed/regPotential7X
time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \
-hMinVal=0.0 -db=hg17 regPotential7X > histogram.data 2>&1
# real 2m48.810s
# 73 % of the data values are zero
# create download gzip files from the bz2 files:
for F in chr*.scores.truncated.bz2
do
C=`echo $F | awk -F'.' '{print $1}'`
echo -n "${C}.regPotential7X.hg17.gz working ... "
bzcat ${F} | gzip > ${C}.regPotential7X.hg17.gz
echo
done
#########################################################################
####### RE-BUILD RGD HUMAN QTL TRACKS (DONE 06/21/06 Fan) ##############
# DELETED RECORD FROM rgdQtlLink SO CONSISTENT WITH REMOVAL FROM rgdQtl
# (DONE, 2006-06-30, hartera)
ssh hgwdev
mkdir -p /cluster/store8/rgd/human12062005
rm /cluster/data/hg17/bed/rgdQtl
ln -s /cluster/store8/rgd/human12062005 /cluster/data/hg17/bed/rgdQtl
cd /cluster/data/hg17/bed/rgdQtl
# download data files from RGD
wget --timestamp ftp://rgd.mcw.edu/pub/RGD_genome_annotations/human/rgd_human_qtl_12062005.gff
# remove extra line feed character at the end of lines
rmLf rgd_human_qtl_12062005.gff > rgdQtl.gff
# create rgdQtl.tab
awk '{print $1"\t"$4"\t"$5"\t"$10}' rgdQtl.gff |sed -e 's/Chr/chr/g'| \
sed -e 's/"//g' |sed -e 's/RGD://g' | sed -e 's/;//g' |sort -u > rgdQtl.tab
# create rgdQtlLink.tab
cat rgdQtl.gff |cut -f 9 |sed -e 's/; Note /\t/g'|\
sed -e 's/Alignment //' |sed -e 's/;Note /\t/' |\
sed -e 's/"//g' |sed -e 's/RGD://' >j.tmp
cut -f 2 j.tmp >j.1
cut -f 1,3 j.tmp >j.2
paste j.1 j.2 |sort -u >rgdQtlLink.tab
rm j.1 j.2 j.tmp
# load rgdQtl table
hgLoadBed hg17 rgdQtl rgdQtl.tab
# check rgdQtl table
checkTableCoords hg17 rgdQtl
# Got the following error messages:
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 3 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 3 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 3 records with end > chromSize.
#hg17.rgdQtl has 2 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 1 records with end > chromSize.
#hg17.rgdQtl has 1 records with end < start.
hgsql hg17 -N -e 'select "do1", name, c.size from rgdQtl r, chromInfo c where chromEnd > c.size and r.chrom=c.chrom' >doall
cat << '_EOF_' > do1
hgsql hg17 -e "update rgdQtl set chromEnd = '${2}' where name='${1}'"
'_EOF_'
chmod +x do*
./doall
checkTableCoords hg17 rgdQtl
#hg17.rgdQtl has 1 records with end < start.
hgsql hg17 -e 'select * from rgdQtl where chromEnd < chromStart'
# bin chrom chromStart chromEnd name
# 9 chr10 7135612 371019 BW63_H
# 0 chr20 77628133 5242324 AASTH39_H
# Don't know why checkTableCoords only catches one of the two errors.
hgsql hg17 -e "update rgdQtl set chromStart = 271019 where name='BW63_H'"
hgsql hg17 -e "update rgdQtl set chromEnd = 7135612 where name='BW63_H'"
# Delete the following record. The RGD QTL is very questionable.
hgsql hg17 -e "delete from rgdQtl where name='AASTH39_H'"
# load rgdQtlLink table
hgsql hg17 -e "drop table hg17.rgdQtlLink;"
hgsql hg17 <~/kent/src/hg/lib/rgdQtlLink.sql
hgsql hg17 -e 'load data local infile "rgdQtlLink.tab" into table hg17.rgdQtlLink;'
# Delete the record from rgdQtlLink table that was removed from the rgdQtl
# table above. (hartera, 2006-06-30)
hgsql hg17 -e "delete from rgdQtlLink where name='AASTH39_H'"
########################################################################
#########################################################################
#Reload omimTitle table Belinda Giardine June 28, 2006
#fetched omim.txt.Z from OMIM downloads.
#parse out title lines (*FIELD* TI)
zcat omim.txt.Z > omim.txt
convertTitle < omim.txt > omimTitle.txt
#load into omimTitle table (from within hgsql):
truncate table omimTitle;
load data local infile "omimTitle.txt" into table omimTitle;
#############################################################
# Lift SV track from hg16 (Heather, July 2006)
# hg16 SV track is comprised of 7 subtracks:
# cnpFosmid, cnpSebat, cnpIafrate, cnpSharp, delConrad, delMccarroll, delHinds
# Use the same table formats as hg16; pre-create
# (No bin for del tables)
cd /cluster/data/hg17/bed
mkdir svMixed
cd svMixed
# I got hg17 coords from Andy Sharp for cnpFosmid and delHinds
trimFosmid.pl < cnpFosmid.txt > cnpFosmid.bed
hgLoadBed -tab hg17 cnpFosmid cnpFosmid.bed
hinds.pl < hinds.txt > delHinds.bed
hgLoadBed -tab -noBin hg17 delHinds delHinds.bed
# (7-27-2006 Brooke Rhead -- edited the cnpFosmid table)
# According to Andy Sharp, the name='Gap' items should be removed from
# cnpFosmid. I dumped the table, removed the 'Gap' lines, then dumped the
# table again.
cd /cluster/data/hg17/bed/svMixed
hgsql hg17 -e "select * from cnpFosmid" > cnpFosmid_withGaps.bed
hgsql hg17
delete from cnpFosmid where name='Gap';
hgsql hg17 -e "select * from cnpFosmid" > cnpFosmid_withoutGaps.bed
# Simple lifts for delMccarroll
cat << '_EOF_' > liftMccarroll.csh
#!/bin/csh
hgsql -N -e 'select * from delMccarroll' hg16 > delMccarroll.hg16
liftOver -minMatch=0.7 delMccarroll.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz delMccarroll.bed delMccarroll.err
hgLoadBed -sqlTable=delMccarroll.sql -tab -noBin hg17 delMccarroll delMccarroll.bed
'_EOF_'
csh liftMccarroll.csh
# Lift both chromStart/chromEnd and thickStart/thickEnd for delConrad and join
cat << '_EOF_' > liftConrad.csh
#!/bin/csh
hgsql -N -e 'select chrom, chromStart, chromEnd, name, score, strand from delConrad' hg16 > delConrad.hg16.1
hgsql -N -e 'select chrom, thickStart, thickEnd, name, score, strand from delConrad' hg16 > delConrad.hg16.2
liftOver -minMatch=0.7 delConrad.hg16.1 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz delConrad.tmp.1 delConrad.err.1
liftOver -minMatch=0.7 delConrad.hg16.2 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz delConrad.tmp.2 delConrad.err.2
trimConrad.pl < delConrad.tmp.1 > delConrad.trim.1
trimConrad.pl < delConrad.tmp.2 > delConrad.trim.2
sort delConrad.trim.1 > delConrad.sort.1
sort delConrad.trim.2 > delConrad.sort.2
join delConrad.sort.1 delConrad.sort.2 > delConrad.join
awk '{print $2, $3, $4, $1, 1000, $5, $7, $8}' delConrad.join > delConrad.bed
hgLoadBed -sqlTable=delConrad.sql -noBin hg17 delConrad delConrad.bed
'_EOF_'
csh liftConrad.csh
# Andy Sharp says the Sebat data has already been lifted, so be conservative here
# Create hg16.cnpSebatLiftCandidate that excludes 5 rows that had wild proliferations
cat << '_EOF_' > liftSebat.csh
#!/bin/csh
hgsql -N -e 'select chrom, chromStart, chromEnd, name, probes, individuals from cnpSebatLiftCandidate' hg16 > cnpSebat.hg16
liftOver -minMatch=0.7 -bedPlus=4 cnpSebat.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz cnpSebat.bed cnpSebat.err
hgLoadBed -sqlTable=cnpSebat.sql -tab hg17 cnpSebat cnpSebat.bed
'_EOF_'
csh liftSebat.csh
# For Andy's data, use bacEndPairs first, then lift the remainder
cat << '_EOF_' > liftSharp.csh
#!/bin/csh
# assumes a copy of hg16.cnpSharp in hg17.cnpSharpHg16Copy
/cluster/home/heather/kent/src/hg/snp/snpLoad/cnpLookup hg17 bacEndPairs cnpSharpHg16Copy cnpSharpLookup.out cnpSharpLookup.lift cnpSharpLookup.log
sed -e 's/Gain and Loss/GainAndLoss/' cnpSharpLookup.lift > cnpSharpLookup.lift.fix
mv cnpSharpLookup.lift.fix cnpSharpLookup.lift
liftOver -minMatch=0.7 -bedPlus=4 cnpSharpLookup.lift /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz cnpSharp.bed cnpSharp.err
sed -e 's/GainAndLoss/Gain And Loss/' cnpSharp.bed > cnpSharp.bed.fix
mv cnpSharp.bed.fix cnpSharp.bed
hgLoadBed -tab -sqlTable=cnpSharp.sql hg17 cnpSharp cnpSharpLookup.out
hgLoadBed -tab -oldTable hg17 cnpSharp cnpSharp.bed
'_EOF_'
csh liftSharp.csh
# For the Iafrate data, the BAC End lookup wasn't good, so just lift
# Create hg16.cnpIafrateLiftCandidate that excludes 2 rows that had wild proliferations
cat << '_EOF_' > liftIafrate.csh
#!/bin/csh
hgsql -N -e 'select chrom, chromStart, chromEnd, name, variationType, score from cnpIafrateLiftCandidate' hg16 > cnpIafrate.hg16
sed -e 's/Gain and Loss/GainAndLoss/' cnpIafrate.hg16 > cnpIafrate.hg16.fix
mv cnpIafrate.hg16.fix cnpIafrate.hg16
liftOver -minMatch=0.7 -bedPlus=4 cnpIafrate.hg16 /gbdb/hg16/liftOver/hg16ToHg17.over.chain.gz cnpIafrate.bed cnpIafrate.err
sed -e 's/GainAndLoss/Gain And Loss/' cnpIafrate.bed > cnpIafrate.bed.fix
mv cnpIafrate.bed.fix cnpIafrate.bed
hgLoadBed -sqlTable=cnpIafrate.sql -tab hg17 cnpIafrate cnpIafrate.bed
'_EOF_'
csh liftIafrate.csh
##############################################################################
# Add HapMap CNVRs from Matt Hurles (Heather Dec 2006)
ssh hgwdev
cd /cluster/data/hg17/bed/svRedon
# File from Matthew Hurles (meh@sanger.ac.uk) was essentially bed 4
# I decided to use bed 6 with score always 0 and strand always +
awk '{printf "%st%d\t%d\tcnp%s\t0\t%s\n", $1, $4, $5, $3, $7}' input.gff > input.bed
hgLoadBed hg17 cnpRedon input.bed
##############################################################################
# dbRIP POLYALUL1SVA track added (2006-07-14 - DONE - Hiram)
# dbRIP polyAluL1SVA
# Data provider: Dr. Liang at the Liang lab:
# http://falcon.roswellpark.org/index.html
# Ping.Liang@roswellpark.org
# Adding this track is a new data type into our browser.
# data definitions for dbRIP and polyGenotype were added to
# the hg/lib/ directory:
# -rw-rw-r-- 1 351 Jul 13 12:20 polyGenotype.as
# -rw-rw-r-- 1 694 Jul 13 12:22 polyGenotype.sql
# -rw-rw-r-- 1 6398 Jul 13 12:22 polyGenotype.c
# -rw-rw-r-- 1 980 Jul 10 17:59 dbRIP.as
# -rw-rw-r-- 1 11408 Jul 13 11:16 dbRIP.c
# -rw-rw-r-- 1 1578 Jul 13 12:06 dbRIP.sql
# With associated .h files in hg/inc/
# -rw-rw-r-- 1 4600 Jul 10 18:00 dbRIP.h
# -rw-rw-r-- 1 4375 Jul 13 16:16 polyGenotype.h
# Changes in hgTracks and hgc to make this track appear as it does
# at their browser:
# http://falcon.roswellpark.org:9090/cgi-bin/hgTables
# For this first instance of the track, the data was obtained
# directly from their Genome browser via the tables browser,
# dumping the tables:
# hg17.polyAluL1 and hg17.polyGenotype
# saving these data dumps to:
# (after a couple of versions were used ...)
ssh hgwdev
mkdir /cluster/data/hg17/bed/dbRIP
cd /cluster/data/hg17/bed/dbRIP
# -rw-rw-r-- 1 994485 Aug 1 16:03 dbRIP.2006-08-01.txt.gz
# -rw-rw-r-- 1 18532 Aug 1 16:05 polyGenotype.2006-08-01.txt.gz
# Rearrange their data columns to more closely match the
# standard BED definitions, and split into three different
# data sets:
zcat dbRIP.2006-08-01.txt.gz | headRest 1 stdin | awk -F'\t' '
{
chromStart=$6
chromStart -= 1
chromEnd=$7
if (match($1,"^RIP_SVA_.*")) {
printf "%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $5, chromStart, chromEnd, $1, $12, $2, $3, $4, $8, $9, $10, $11, $15, $13, $14, $16, $17, $18, $19, $20, $21
}
}' | sort -k1,1 -k2,2n > dbRIP.SVA.txt
zcat dbRIP.2006-08-01.txt.gz | headRest 1 stdin | awk -F'\t' '
{
chromStart=$6
chromStart -= 1
chromEnd=$7
if (match($1,"^RIP_L1_.*")) {
printf "%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $5, chromStart, chromEnd, $1, $12, $2, $3, $4, $8, $9, $10, $11, $15, $13, $14, $16, $17, $18, $19, $20, $21
}
}' | sort -k1,1 -k2,2n > dbRIP.L1.txt
zcat dbRIP.2006-08-01.txt.gz | headRest 1 stdin | awk -F'\t' '
{
chromStart=$6
chromStart -= 1
chromEnd=$7
if (match($1,"^RIP_Alu_.*")) {
printf "%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $5, chromStart, chromEnd, $1, $12, $2, $3, $4, $8, $9, $10, $11, $15, $13, $14, $16, $17, $18, $19, $20, $21
}
}' | sort -k1,1 -k2,2n > dbRIP.Alu.txt
# Create three specific sql table create definitions:
sed -e "s/dbRIP/dbRIP_SVA/" $HOME/kent/src/hg/lib/dbRIP.sql > dbRIP_SVA.sql
sed -e "s/dbRIP/dbRIP_L1/" $HOME/kent/src/hg/lib/dbRIP.sql > dbRIP_L1.sql
sed -e "s/dbRIP/dbRIP_Alu/" $HOME/kent/src/hg/lib/dbRIP.sql > dbRIP_Alu.sql
# And loading those three data tables:
hgLoadBed -verbose=3 -maxChromNameLength=6 -strict -tab -notItemRgb \
-sqlTable=dbRIP_SVA.sql hg17 dbRIP_SVA dbRIP.SVA.txt
hgLoadBed -verbose=3 -maxChromNameLength=6 -strict -tab -notItemRgb \
-sqlTable=dbRIP_L1.sql hg17 dbRIP_L1 dbRIP.L1.txt
hgLoadBed -verbose=3 -maxChromNameLength=6 -strict -tab -notItemRgb \
-sqlTable=dbRIP_Alu.sql hg17 dbRIP_Alu dbRIP.Alu.txt
# And an associated table of genotype frequencies
# Add three extra rows to the original data to provide a better handle
# on MySQL lookups for allele Frequency
hgsql hg17 -e "drop table polyGenotype;"
hgsql hg17 < $HOME/kent/src/hg/lib/polyGenotype.sql
zcat polyGenotype.2006-08-01.txt.gz | headRest 1 stdin | \
awk -F'\t' '
{
sampleSize = $3 + $4 + $5
plus = ($3 * 2) + $4
minus = ($5 * 2) + $4
if ((plus + minus) < 1) { alleleFreq=0 } else
{ alleleFreq = plus / (plus + minus) }
if (sampleSize > 0) {
heteroZyg = (2 * alleleFreq * (1.0 - alleleFreq)) * ((sampleSize * 2)/((sampleSize * 2) - 1))
} else {
heteroZyg = 2 * alleleFreq * (1.0 - alleleFreq)
}
printf "%s\t%s\t%d\t%d\t%d\t%.3f\t%.3f\t%.3f\n", $1, $2, $3, $4, $5, sampleSize, alleleFreq, heteroZyg
}
' > polyGenotype.txt
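# Worked example of the genotype math above (hypothetical counts):
# 40 (+/+), 10 (+/-), 50 (-/-) gives sampleSize = 100, plus = 90,
# minus = 110, alleleFreq = 90/200 = 0.450, and
# heteroZyg = 2 * 0.45 * 0.55 * (200/199) = 0.497; the (2n/(2n-1)) factor
# is the unbiased-estimator correction for sample size.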
hgsql hg17 -e \
'load data local infile "polyGenotype.txt" into table polyGenotype;'
# A composite track was added to human/hg17/trackDb.ra to contain
# these three tracks, and search methods to get the name column
# participating in the search. Need to figure out how to get some
# of the other text-rich columns participating in the search.
##############################################################################
# hg17 -> hg15 LIFTOVER CHAINS (DONE 7/27/06 Fan)
# I used a size of 10kb instead of 3kb for the split (blat query) sizes in
# hg15. This had a huge effect on the number of hits in the blat, which
# then had a huge effect on the number of chains. I should also mention
# that hg15 chromosomes chr1 and chr2 were split further
# into more than a single query file. This helped a LOT in avoiding
# cluster hippos classically associated with those chroms.
######## LIFTOVER PREPARATION
# Split up hg15
ssh pk
cd /san/sanVol1/scratch/hg15
mkdir -p liftSplits/{split,lift}
bash
for fa in /cluster/data/hg15/?{,?}/*.fa; do
c=`basename $fa .fa`
echo $c
faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
done
mkdir -p biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chr2.fa 5 chr2_
rm chr{1,2}.fa
# Make some dirs
# cd /san/sanVol1/scratch
# mkdir -p hg17
# Copy 11.ooc files to hg17 subdirectory.
# cp -p /cluster/store5/gs.16/build33/11.ooc hg17
## First, copy over scripts. (Already done before)
# mkdir -p /san/sanVol1/scratch/fan
# cp -p /san/sanVol1/scratch/andy/*.sh /san/sanVol1/scratch/fan
# cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan
######## LIFTOVER BLATING
# HG17
ssh kk
cd /cluster/data/hg17
#makeLoChain-align hg17 /scratch/hg/hg17/nib hg15 /san/sanVol1/scratch/hg15/biggerSplits/split
makeLoChain-align hg17 /scratch/hg/hg17/bothMaskedNibs hg15 /san/sanVol1/scratch/hg15/liftOver/biggerSplits/split
# Completed: 2392 of 2392 jobs
# CPU time in finished jobs: 25651277s 427521.28m 7125.35h 296.89d 0.813 y
# IO & Wait Time: 74118s 1235.30m 20.59h 0.86d 0.002 y
# Average job time: 10755s 179.25m 2.99h 0.12d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 82545s 1375.75m 22.93h 0.96d
# Submission to last job: 82579s 1376.32m 22.94h 0.96d
ssh kkstore02
cd /cluster/data/hg17
cd bed
mv blat.hg15.2006-07-25 /san/sanVol1/scratch/hg17
ssh pk
cd /san/sanVol1/scratch/hg17/blat.hg15.2006-07-25/run/
sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg17ToHg15"}' > newspec
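# e.g. an original spec line of the form
#   blat $(nib) $(fa) {check out line+ psl/$(out).psl} <blat flags>
# becomes
#   /san/sanVol1/scratch/fan/blat.sh $(nib) $(fa) {check out line+ psl/$(out).psl} hg17ToHg15
# (illustrative; blat.sh is the pre-existing wrapper script copied above)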
para create newspec
para try
para push
# Completed: 2392 of 2392 jobs
# CPU time in finished jobs: 612316s 10205.26m 170.09h 7.09d 0.019 y
# IO & Wait Time: 12421s 207.02m 3.45h 0.14d 0.000 y
# Average job time: 261s 4.35m 0.07h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3524s 58.73m 0.98h 0.04d
# Submission to last job: 3588s 59.80m 1.00h 0.04d
######## LIFTOVER CHAINING
# LIFTING
ssh pk
cd /san/sanVol1/scratch/fan
cp mm7SplitLift.sh hg15SplitLift.sh
# change andy to fan, mm7 to hg15, and chrX to chr2, and remove chrUn_random
vi hg15SplitLift.sh
cat << 'EOF' > hg15ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/fan/hg15Lifts
pushd /scratch/fan/hg15Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
'EOF'
chmod +x hg15ChainMergeSplit.sh
# HG17
cd /san/sanVol1/scratch/hg17/blat.hg15.2006-07-25/raw
/san/sanVol1/scratch/fan/hg15SplitLift.sh
# There was an extra file, nib22.fa, under /cluster/data/hg15/nib, which should not be there.
# -rw-rw-r-- 1 2429 protein 50466533 May 20 2003 nib22.fa
# This caused hg15SplitLift.sh to end abnormally.
cd ../
mkdir chainRun chainRaw
cd chainRun
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg17/bothMaskedNibs /san/sanVol1/scratch/hg15/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'EOF'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single gsub spec
para create spec
para try
para push
para time
# Completed: 44 of 44 jobs
# CPU time in finished jobs: 3596s 59.94m 1.00h 0.04d 0.000 y
# IO & Wait Time: 919s 15.31m 0.26h 0.01d 0.000 y
# Average job time: 103s 1.71m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 274s 4.57m 0.08h 0.00d
# Submission to last job: 284s 4.73m 0.08h 0.00d
######### CHAINMERGE/NET/NETSUBSET
ssh kolossus
mkdir -p /scratch/fan/hg15Lifts
cd /scratch/fan/hg15Lifts
cp -r /san/sanvol1/scratch/hg17/blat.hg15.2006-07-25/chainRaw/ .
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
# about 30 minutes.
cp -rp chain /san/sanvol1/scratch/hg17/blat.hg15.2006-07-25/
rm -rf chain
rm -rf chainRaw
ssh pk
cd /san/sanvol1/scratch/fan
cat << 'EOF' > netOver.sh
#!/bin/bash
chain=$1
chrom=`basename $chain .chain`
sizesHGOld=$2
sizesHG15=/cluster/data/hg15/chrom.sizes
chainDir=`dirname $chain`
blatDir=`dirname $chainDir`
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over
mkdir -p ${blatDir}/{over,net}
/cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG15 $net /dev/null
/cluster/bin/x86_64/netChainSubset $net $chain $over
'EOF'
chmod +x netOver.sh
mkdir netRun
cd netRun/
find /san/sanVol1/scratch/hg17/blat.hg15.2006-07-25/chain -name "*.chain" \
| awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg17/chrom.sizes"}' > spec
para create spec
para push
para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 438s 7.30m 0.12h 0.01d 0.000 y
# IO & Wait Time: 118s 1.97m 0.03h 0.00d 0.000 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 28s 0.47m 0.01h 0.00d
# Submission to last job: 67s 1.12m 0.02h 0.00d
########## FINISHING
ssh hgwdev
# HG17
cd /san/sanvol1/scratch/hg17/blat.hg15.2006-07-25/over
cat * >> ../hg17ToHg15.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -rp blat.hg15.2006-07-25/ /cluster/data/hg17/bed
cd /cluster/data/hg17/bed
ln -s blat.hg15.2006-07-25 blat.hg15
ln -s `pwd`/blat.hg15/hg17ToHg15.over.chain liftOver/hg17ToHg15.over.chain
ln -s `pwd`/liftOver/hg17ToHg15.over.chain /gbdb/hg17/liftOver/hg17ToHg15.over.chain
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/liftOver
gzip /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain
ln -s /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain.gz /gbdb/hg17/liftOver/
cp -p /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain.gz /cluster/data/hg17/bed/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
ln -s /cluster/store5/gs.18/build35/bed/blat.hg15/hg17ToHg15.over.chain.gz hg17ToHg15.over.chain.gz
hgAddLiftOverChain hg17 hg15
############################################################################
############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2006-08-15 markd)
cd /cluster/data/genbank/data/ccds/hg17
ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
ftp> get CCDS.20060815.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/data/genbank/data/ccds/hg17/CCDS.20060815.tar.gz
# import ccds database tables
hgsql -e 'create database ccds'
hgsql ccds </cluster/data/genbank/etc/createTables.sql
hgsql ccds </cluster/data/genbank/etc/createKeys.sql
/cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
checkTableCoords hg17 -verbose=2 ccdsGene
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
rm -rf /scratch/tmp/ccds
# 2006-08-23 - found bug with some source genes missing from ccdsInfo, fixed ccdsMkTables
# and reload
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
checkTableCoords hg17 -verbose=2 ccdsGene
joinerCheck -database=hg17 -identifier=ccdsGeneId ~/compbio/kent/src/hg/makeDb/schema/all.joiner
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
# << emacs
##########################################################################
# hars 1 to 202 Sol 09/10/2006
set bedDir = /gbdb/hg17/haseq/bed
mkdir -p $bedDir/hars
pushd /projects/hg/wet/Sol/hars1to49
cp -p hars_1to202.hg17.bed $bedDir/hars/hars_1to202.bed
hgLoadBed hg17 hars $bedDir/hars/hars_1to202.bed
rm -f $bedDir/hars/hars_1to202.bed
popd
# BUILD HPRD DATA FOR KNOWN GENE DETAILS PAGE LINKS (DONE 9/11/06)
# First, build hprdToCdna.tab and hprdToUniProt.tab.
# See hg18.txt for details.
cd ~/data/hprd
mkdir hg17
cd hg17
hgsql hg17 -e 'drop table hprdToCdna'
hgsql hg17 <~/src/hg/lib/hprdToCdna.sql
hgsql hg17 -e 'load data local infile "../hprdToCdna.tab" into table hprdToCdna'
hgsql hg17 -e 'drop table hprdToUniProt'
hgsql hg17 <~/src/hg/lib/hprdToUniProt.sql
hgsql hg17 -e 'load data local infile "../hprdToUniProt.tab" into table hprdToUniProt'
# build knownToHprd table
hgsql hg17 -N -e 'select kgId,hprdId from hprdToCdna, kgXref where cdnaId=kgId' >j.kg1
hgsql hg17 -N -e 'select kgId,hprdId from hprdToUniProt, kgXref where uniProtId=spId' >j.kg2
cat j.kg1 j.kg2 |sort -u >knownToHprd.tab
wc knownToHprd.tab
hgsql hg17 -e 'drop table knownToHprd'
hgsql hg17 <~/src/hg/lib/knownToHprd.sql
hgsql hg17 -e 'load data local infile "knownToHprd.tab" into table knownToHprd'
hgsql hg17 -e 'select count(*) from knownToHprd'
# 19,345 records created.
# remove temporary files.
rm j*
############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2006-09-20 markd)
# Reloaded due to bug that results in multiple versions of the same accession
# in the ccdsInfo table.
cd /cluster/data/genbank/data/ccds/hg17
ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
get CCDS.20060920.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/data/genbank/data/ccds/hg17/CCDS.20060920.tar.gz
# import ccds database tables
hgsql -e 'drop database ccds; create database ccds'
hgsql ccds </cluster/data/genbank/etc/createTables.sql
hgsql ccds </cluster/data/genbank/etc/createKeys.sql
/cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords hg17 -verbose=2 ccdsGene
joinerCheck -database=hg17 -identifier=ccdsGeneId ~/compbio/kent/src/hg/makeDb/schema/all.joiner
rm -rf /scratch/tmp/ccds
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
# << emacs
#########################################################
# BUILD GAD TRACK (Done, 9/26/06, Fan)
mkdir /cluster/store/gad060926
ln -s /cluster/store/gad060926 /cluster/data/gad
# Receive "allxlsAStxt.txt" from GAD/NIA
# contact person: Shenoy, Narmada, shenoyn@grc.nia.nih.gov
hgsql hg17 -e 'drop table gadAll'
hgsql hg17 <~/src/hg/lib/gadAll.sql
hgsql hg17 -e 'load data local infile "allxlsAStxt.txt" into table gadAll ignore 2 lines'
hgsql hg17 -e 'create index geneSymbol on gadAll(geneSymbol(10))'
# create gad table
hgsql hg17 -N -e \
'select "chr",chromosome, chromStart, chromEnd, geneSymbol from gadAll where chromStart <>0'|\
sed -e 's/chr\t/chr/' |grep -v "chr\." |grep -v " "|sort -u >gad.bed
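# (the -N select emits a literal "chr" column followed by a tab; the sed
# glues that prefix onto the chromosome number, and the greps drop
# malformed chromosome values)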
hgLoadBed hg17 gad gad.bed
#####################################################################
# YALE TRANSCRIPTIONALLY ACTIVE REGIONS (TARs/TransFrags) TRACK IDENTIFIED
# USING A WHOLE GENOME TILING ARRAY (DONE, 2006-10-20, hartera)
# Data is from the paper: Bertone et al. Science 24 December 2004:
# Vol. 306. no. 5705, pp. 2242 - 2246. From Mark Gerstein's lab at Yale.
# Contact at Yale: Joel S. Rozowsky, joel.rozowsky@yale.edu
# The data consist of Transcriptionally Active Regions (TARs or TransFrags)
# found using Affymetrix genome tiling arrays. The data is from the lab
# of Mark Gerstein at Yale.
ssh kkstore02
mkdir /cluster/data/hg17/bed/yaleBertoneTars/
cd /cluster/data/hg17/bed/yaleBertoneTars/
# download Bertone et al. data from this URL:
#http://dart.gersteinlab.org/cgi-bin/ar/download.cgi?ID=TAR_data_NCBI31.txt
# and put it in this directory.
# The sequences used to design the microarrays were from
# UCSC hg13/NCBI Build 31 so the sequences
# should be aligned again using Blat since this is probably better
# than using liftOver across so many assemblies.
# Get sequences from TARs file and put in FASTA format:
# Remove Windows line-ending characters:
dos2unix TAR_data_NCBI31.txt
# The TARs are in order of IDs in the file so the first TAR has ID 1, the
# second is 2 up to the last which is 17517. These IDs are used to link
# to the DART database of TARs at Yale so use these IDs in the FASTA
# header lines. Need to add "TAR" as prefix to ID so that it is unique
# in the seq table.
awk 'BEGIN {FS="\t";n=0;}{if ($1 ~ /^chr/) print ">TAR"n"\n"$14"\n";n++;}' \
TAR_data_NCBI31.txt > yaleBertoneTARSeqs.fa
ssh pk
mkdir -p /san/sanvol1/scratch/hg17/TARs/
cp /cluster/data/hg17/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \
/san/sanvol1/scratch/hg17/TARs/
# Set up to Blat the TAR sequences against hg17
cd /cluster/data/hg17/bed/yaleBertoneTars
ls -1 /san/sanvol1/scratch/hg17/TARs/yaleBertoneTARSeqs.fa > tars.lst
ls -1 /san/sanvol1/scratch/hg17/nib/*.nib > genome.lst
# output dir
mkdir psl
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/hg17/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << for emacs
gensub2 genome.lst tars.lst template.sub para.spec
para create para.spec
para try, para check, para push ...
para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 429s 7.16m 0.12h 0.00d 0.000 y
# IO & Wait Time: 153s 2.54m 0.04h 0.00d 0.000 y
# Average job time: 13s 0.21m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 38s 0.63m 0.01h 0.00d
# Submission to last job: 107s 1.78m 0.03h 0.00d
# sort and then filter
pslSort dirs raw.psl tmp psl
# use these parameters as for Genbank alignments of native mRNAs
# for finished assemblies.
pslCDnaFilter -minId=0.96 -minCover=0.25 -localNearBest=0.001 \
-minQSize=20 -minNonRepSize=16 -ignoreNs -bestOverlap \
raw.psl yaleBertoneTars.psl
# seqs aligns
# total: 17512 37530
# drop minNonRepSize: 121 254
# drop minIdent: 3827 14532
# drop minCover: 571 897
# weird over: 232 837
# kept weird: 197 201
# drop localBest: 2359 3896
# kept: 17498 17951
# 99.9% were kept.
# check how many aligned
grep '>' yaleBertoneTARSeqs.fa | wc -l
# 17517
# 99.89% of the original set of sequences are in this filtered PSL file.
pslCheck yaleBertoneTars.psl
# psl is ok
# load into database
ssh hgwdev
cd /cluster/data/hg17/bed/yaleBertoneTars
hgLoadPsl hg17 yaleBertoneTars.psl
# Add sequences to /gbdb/hg17 and to seq and extFile tables.
mkdir -p /gbdb/hg17/yaleTARs/
ln -s /cluster/data/hg17/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \
/gbdb/hg17/yaleTARs/
hgLoadSeq hg17 /gbdb/hg17/yaleTARs/yaleBertoneTARSeqs.fa
# trackDb.ra entry is in trackDb/human/trackDb.ra and
# a description exist already as this track is also on hg18.
######################################################################
## reload tfbsCons table - it was based on a newer version of tfbs names that
# are not yet public domain (2006-11-03 - Hiram)
mkdir /cluster/data/hg17/bed/tfbsCons
cd /cluster/data/hg17/bed/tfbsCons
cp -p /cluster/store6/weirauch/TFLOC/hg17/tfbsConsSites.bed .
hgLoadBed -strict hg17 tfbsConsSites \
-sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql \
tfbsConsSites.bed -tab
# this leads to a bunch of extra names in Factors
hgsql -N -e "select name from tfbsConsSites;" hg17 | sort -u > names.new
hgsql -N -e "select name from tfbsConsFactors;" hg17 \
| sort -u > names.factors
comm -13 names.new names.factors > names.extra.factors
for N in `cat names.extra.factors`
do
echo "delete from tfbsConsFactors where name=\"${N}\";" hg17
hgsql -e "delete from tfbsConsFactors where name=\"${N}\";" hg17
done
#########################################################
# BUILD GAD TRACK (Re-Re-Done, 12/12/06, Fan)
mkdir /cluster/store12/gad061211
rm /cluster/data/gad
ln -s /cluster/store12/gad061211 /cluster/data/gad
# Receive "GAD-Hg17DATA.txt" from GAD/NIA
# contact person: Shenoy, Narmada, shenoyn@grc.nia.nih.gov
hgsql hg17 -e 'drop table gadAll'
hgsql hg17 <~/src/hg/lib/gadAll.sql
hgsql hg17 -e 'load data local infile "GAD-Hg17DATA.txt" into table gadAll ignore 1 lines'
hgsql hg17 -e 'create index geneSymbol on gadAll(geneSymbol(10))'
# create gad table
hgsql hg17 -N -e \
'select "chr",chromosome, chromStart, chromEnd, geneSymbol from gadAll
where chromStart <>0 and chromosome<>""'|\
sed -e 's/chr\t/chr/' |grep -v "chr\." |grep -v " "|sort -u >gad.bed
hgLoadBed hg17 gad gad.bed
##########################################################################
# xxBlastTab - Help filter out unwanted paralogs (Galt 2007-01-11)
#
# Background: The xxBlastTab tables are made with a simple blastall
# (blastp with -b 1) which chooses the best match. Unfortunately this
# means that if there is no proper match it will still pick something
# even though it's probably not orthologous. This is especially a problem
# in organisms like rat knownGene which has only 30% gene coverage.
# The strategy here is to filter our xxBlastTab using synteny mappings from
# the chains. This is done by simply taking $db.kg and using /gbdb/$db chains
# and pslMap to lift the genes to the target xx assembly. Then hgMapToGene
# will find which of those mapped ids have good overlap with xx.knownGene.
# The final mapping is then created by doing an inner join between
# the traditional xxBlastTab and the mapping table produced above.
# Then simply drop the old table and rename the new table.
#
# We are starting with xxBlastTab tables already built in the usual way with
# blastall/blastp, probably with doHgNearBlastp.pl script.
#
# I created a new utility script called synBlastp.csh since I have to do this
# several times.
#
# we want to update hg17 for rat and mouse,
# so check ./hgGeneData/Human/hg17/otherOrgs.ra for current settings
ssh hgwdev
synBlastp.csh hg17 rn3
#hg17.rnBlastTab results:
#new number of unique query values:
#10728
#new number of unique target values
#5177
#old number of unique query values:
#24030
#old number of unique target values
#5535
synBlastp.csh hg17 mm7
#new number of unique query values:
#25506
#new number of unique target values
#13462
#old number of unique query values:
#32951
#old number of unique target values
#14803
#####################################################################
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna hg17
############
# UPDATE hg17 knownToVisiGene (DONE galt 2007-02-15)
# Create table that maps between known genes and visiGene database
# mapping to other species such as mouse, zebrafish, frog
# requires visiGene probe track vgImageProbes be created first
knownToVisiGene hg17 -fromProbePsl=vgImageProbes
#########################################################
# Chimp Paralogy data from Eichlers lab (DONE Heather Feb. 2007)
cd /cluster/data/hg17/bed/eichler
hgLoadBed hg17 chimpParalogy chimpParalogy.bed -tab -sqlTable=chimpParalogy.sql
############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-03-02 markd)
cd /cluster/data/genbank/data/ccds/
ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
get CCDS.20070228.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/data/genbank/data/ccds/CCDS.20070228.tar.gz
# import ccds database tables
/cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg17 ccdsInfo ccdsGene
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg17 -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords hg17 -verbose=2 ccdsGene
joinerCheck -database=hg17 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=hg17 ccdsGene mgcGenes ccdsMgcMap
# load trackDb
cd kent/src/hg/makeDb/trackDb
make alpha
# check in browser
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
ccdsMgcMap
# << emacs
#####################################################
# Vista Enhancers (galt 2007-02-23 done)
#
# Vista from Lawrence-Berkeley has assayed
# 301 human conserved non-coding intra- and inter-
# genic elements for their ability to promote
# lacZ expression in mouse embryos. A positive looks like
# a mouse with a purple spine.
#
# They provided a custom track with two tracks for pos and neg.
# http://enhancer-test.lbl.gov/cgi-bin/customTrack.cgi
# I am combining the tracks into one with high score for pos.
#
cd /cluster/data/hg17/bed
mkdir vistaEnhancers
cd vistaEnhancers
wget -O custTrk "http://enhancer-test.lbl.gov/cgi-bin/customTrack.cgi"
cat custTrk | head -116 | tail +2 > pos
cat custTrk | tail +118 > neg
cat pos | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t900"}' > bed5
cat neg | gawk '{print $1"\t"$2"\t"$3"\t"$4"\t200"}' >> bed5
wc -l bed5
#301 bed5
hgLoadBed hg17 vistaEnhancers bed5
#Loaded 301 elements of size 5
# add to human/trackDb.ra
track vistaEnhancers
shortLabel Vista Enhancers
longLabel Vista HMR-Conserved Non-coding Human Enhancers from LBNL
group regulation
priority 93
visibility hide
color 50,70,120
type bed 5 .
useScore 1
url http://enhancer-test.lbl.gov/cgi-bin/imagedb.pl?form=presentation&show=1&experiment_id=$$
###
# UPDATES (2007-10-18, conodera)
# see also /projects/compbiousr/wet/browser/vista_enhancer/17Oct2007/Makefile
cd /projects/compbiousr/wet/browser/vista_enhancer/
# download data file from the vista browser (coordinates are for hg17)
# http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1
# save as enhancerbrowser.datadownload.txt
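# a scripted fetch along those lines (a sketch; the file can also be
# saved by hand from the browser page):
wget -O enhancerbrowser.datadownload.txt \
 'http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1'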
# give elements with positive label a score of 900,
# give elements with negative label a score of 200.
# print to 5-field bed file
vista_enhancer.hg17.txt: enhancerbrowser.datadownload.txt
grep ">" $< \
| sed -e 's/>//' \
| tr :- ' ' \
| sed -e 's/positive/900/'\
| sed -e 's/negative/200/' \
| awk '{print $$1"\t"$$2"\t"$$3"\telement_"$$6"\t"$$8}' \
> $@; \
hgLoadBed hg17 vistaEnhancers vista_enhancer.hg17.txt;
# loaded 446 elements of length 5
#########################################################################
# EPONINE-TSS (TRANSCRIPTION START SITE) PREDICTION
# (DONE, 2007-03-08, hartera)
# The Eponine software is version 2 and has not changed in several years
# (contact: Thomas Down at Sanger, td2@sanger.ac.uk). The version downloaded
# for hg16 should be the same as the current version but download again just
# to check. The application includes the TSS model file: eponine-tss2.xml
ssh kkstore02
# Eponine runs fine on a 2.5Mb contig, but barfs on much larger contigs;
# chop up sequence at gaps into ~2.5Mb chunks for cluster run.
mkdir /san/sanvol1/scratch/hg17/chunks
cd /cluster/data/hg17
foreach f (?{,?}/NT_*/NT_??????.fa)
set ctg = $f:t:r
/cluster/bin/x86_64/faSplit -minGapSize=10 \
-lift=/san/sanvol1/scratch/hg17/chunks/${ctg}.lft \
gap $f 2500000 /san/sanvol1/scratch/hg17/chunks/${ctg}.chunk
end
# seems to ignore the chunk part of the file name
mkdir /cluster/data/hg17/bed/eponine
cd /cluster/data/hg17/bed/eponine
wget --timestamping \
http://www.sanger.ac.uk/Software/analysis/eponine/eponine-scan.jar
# file has the same date and same size as the one downloaded for hg16
# the script requires all of the path settings found in my .tcshrc file.
# Using only set path = (/usr/java/jre1.5.0_06/bin $path)
# as in the doEpo file for hg16 does not work.
cat << '_EOF_' > doEpo
#!/bin/csh -ef
set path = (/usr/java/jre1.5.0_06/bin /bin /usr/bin /usr/X11R6/bin \
/usr/local/bin . /cluster/home/hartera/bin/x86_64 \
/cluster/bin/x86_64 /projects/compbio/bin/x86_64 \
/projects/compbio/bin /projects/compbio/bin/x86_64-linux \
/cluster/bin/scripts)
java -jar ./eponine-scan.jar -threshold 0.999 -seq $1 > $2
'_EOF_'
# << emacs
chmod a+x doEpo
cp /dev/null jobList
foreach f (/san/sanvol1/scratch/hg17/chunks/NT*.fa)
echo "./doEpo {check in line+ $f} {check out exists out/$f:t:r.gff}" \
>> jobList
end
mkdir out
ssh pk
cd /cluster/data/hg17/bed/eponine
/parasol/bin/para create jobList
/parasol/bin/para try, check, push, check etc.....
/parasol/bin/para time
# Completed: 1415 of 1415 jobs
# CPU time in finished jobs: 104501s 1741.68m 29.03h 1.21d 0.003 y
# IO & Wait Time: 6594s 109.91m 1.83h 0.08d 0.000 y
# Average job time: 79s 1.31m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 127s 2.12m 0.04h 0.00d
# Submission to last job: 488s 8.13m 0.14h 0.01d
# lift chunks -> contigs
mkdir contigs/
foreach l (/san/sanvol1/scratch/hg17/chunks/*.lft)
set ctg = $l:t:r
liftUp contigs/$ctg.gff $l warn out/${ctg}*.gff
end
# lift contigs -> chrom
liftUp eponine.gff /cluster/data/hg17/jkStuff/liftAll.lft \
warn contigs/NT_*.gff
# Translate to bed 4 + float-score -- it would be a shame to lose
# those scores in genePred or bed 5 (int score)
awk 'BEGIN {i=0;} \
{printf "%s\t%d\t%d\t%s.%d\t%s\t%s\n", $1, $4-1, $5, $1, i, $6, $7; \
i = i + 1;}' \
eponine.gff > eponine.bed
# load up
ssh hgwdev
cd /cluster/data/hg17/bed/eponine
sed -e 's/bed6FloatScore/eponine/g' \
$HOME/kent/src/hg/lib/bed6FloatScore.sql > eponine.sql
hgLoadBed hg17 eponine eponine.bed -tab -sqlTable=eponine.sql
# Loaded 61013 elements of size 6
# trackDb.ra entry and eponine.html already exist in trackDb directory.
###########################################################################
# ACEScan Track (DONE 2007-03-15 Andy)
ssh hgwdev
cd /cluster/data/hg17/bed
mkdir acescan
cd acescan/
cp ~/acescan.gff .
tail +2 acescan.gff > acescan.nh.gff
ldHgGene -out=gp hg17 acescan acescan.nh.gff
rm *.gff
ldHgGene -predTab hg17 acescan acescan.hg17.gp
###########################################################################
# augustusHints track (DONE 2007-4-5 Mario)
mkdir -p /cluster/data/hg17/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.Xp.final
cd /cluster/data/hg17/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.Xp.final
wget http://augustus.gobics.de/predictions/hg17/usingEvidence/augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.gff
wget http://augustus.gobics.de/predictions/hg17/usingEvidence/augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.pep.aa
ldHgGene -bin hg17 augustusHints augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.gff
hgPepPred hg17 generic augustusHintsPep augustus.hg17.Trefseq.hmRNA.hsEST.R.Xp.pep.aa
###########################################################################
# augustus de novo track (DONE 2007-4-5 Mario)
mkdir -p /cluster/data/hg17/bed/augustus/usingHints/predictions/Xp.RA.it
cd /cluster/data/hg17/bed/augustus/usingHints/predictions/Xp.RA.it
wget http://augustus.gobics.de/predictions/hg17/deNovo/augustus.hg17.Xp.RA.it.gff
wget http://augustus.gobics.de/predictions/hg17/deNovo/augustus.hg17.Xp.RA.it.pep.aa
ldHgGene -bin hg17 augustusXRA augustus.hg17.Xp.RA.it.gff
hgPepPred hg17 generic augustusXRAPep augustus.hg17.Xp.RA.it.pep.aa
###########################################################################
# SwitchDB TSS Track (DONE 2007-04-12 Andy)
ssh hgwdev
mkdir /cluster/data/hg17/bed/switchDbTss
cd /cluster/data/hg17/bed/switchDbTss
# (obtained from Nathan Trinklein <nathant@switchgeargenomics.com>)
cp ~/all_tss_switchdb_psgene.gz .
gunzip all_tss_switchdb_psgene.gz
cat << "EOF" > reformat.awk
BEGIN{FS="\t"}
{
if (NR > 1)
{
if ($9 !~ "^PSEUDO.*")
{
pseudo = "none";
}
else
{
pseudo = $9;
}
printf("%s\t%d\t%d\t%s\t1000\t%s\t%s\t%s\t%s\t%s\t%s\n", $2, $8, $8+1, $6, $5, $7, $1, $3, $4, pseudo);
}
}
EOF
awk -f reformat.awk all_tss_switchdb_psgene > switchDbTss.bed
ln -s ~/kent/src/hg/lib/switchDbTss.sql
hgLoadBed -sqlTable=switchDbTss.sql hg17 switchDbTss switchDbTss.bed
############################################################################
# enable ORFeome track build. (markd 2007-05-02)
cd ~/kent/src/hg/makeDb/genbank
cvs update -d etc
# edit etc/genbank.conf to add
hg17.orfeomeTables.hgwdev = yes
hg17.orfeomeTables.hgwbeta = yes
# will need to enable for rr later. In the future, this can just be enabled
# as part of the normal genbank build. Change above to:
hg17.orfeomeTables.default = yes
###########################################################################
# Transcriptome Phase 3 tracks (Andy 2007-06-10)
ssh hgwdev
bash
cd /san/sanVol1/scratch/andy
mkdir transcriptome
cd transcriptome/
cp /var/ftp/encode/Affy_transcriptome_phase3.tar .
tar xfv Affy_transcriptome_phase3.tar
find . -name '*.bz2' -exec bunzip2 '{}' \;
cat > processWig.sh << "EOF"
#!/bin/bash
theDir=`dirname $1`;
theFile=`basename $1`;
table=affyTxnPhase3${theFile%.sig.wig};
tmp=/scratch/tmp/trans3rdPhase.$$
mkdir $tmp
cp $1 $tmp
pushd $tmp
head -n1 $theFile > $table.sig.track.txt
tail +2 $theFile > tmp; mv tmp $theFile
wigEncode $theFile $table.wig $table.wib
popd
cp $tmp/${table}.* $theDir
rm -rf $tmp
EOF
chmod +x processWig.sh
cat > gsub << "EOF"
#LOOP
./processWig.sh {check in line+ $(path1)} {check out exists $(dir1)/$(root1).track.txt}
#ENDLOOP
EOF
find . -name '*.sig.wig' > wig.lst
gensub2 wig.lst single gsub spec
ssh pk
cd /san/sanVol1/scratch/andy/transcriptome
para create spec
para push
exit
cd /cluster/data/hg17/bed
mkdir transcriptome3rdPhase/{wig,wib,bed}
cd transcriptome3rdPhase/wib/
cp /san/sanVol1/scratch/andy/transcriptome/graphs/human_{long,short}_rna/affyTxnPhase3*.wib .
pushd /gbdb/hg17/wib
ln -s /cluster/data/hg17/bed/transcriptome3rdPhase/wib/* .
popd
cd ../wig/
cp /san/sanVol1/scratch/andy/transcriptome/graphs/human_{long,short}_rna/affyTxnPhase3*.wig .
for f in *; do
hgLoadWiggle hg17 ${f%.wig} $f
done
cd ../bed
for f in /san/sanVol1/scratch/andy/transcriptome/transfrags/human_{long,short}_rna/*; do
newName=`basename $f`;
newName=${newName%.bz2};
bzcat $f | tail +2 > $newName;
tab=affyTxnPhase3Frags${newName%.bed};
hgLoadBed hg17 $tab $newName;
done
#######################################################################
# CLEANUP OF DANRER1 BLASTZ SWAP (DONE, 2007-06-25, hartera)
ssh kkstore02
cd /cluster/data/hg17/bed/blastz.danRer1.swap/axtChain
rm -r run1
cd ../mafNet.new
gzip *.maf
# we don't tend to keep the Blastz PSLs anymore and this is an old
# zebrafish assembly so remove these.
cd ../
rm -r pslChrom
# this removed 1.2 G of data.
#######################################################################
# CLEANUP OF ACEMBLY_050217 DATA (DONE, 2007-06-25, hartera)
ssh kkstore02
cd /cluster/store5/gs.18/build35/bed/acembly_050217
rm GeneCheck.out GeneCheck2 acembly acembly.chk acembly.details \
chrom.out genePred.tab hg16.name hg16Pep.name
cd acembly_gene_lists
rm test transcripts.names *.bak main_gene.list.IDsort mp.ids mp.sort ptest \
maintest gid.tab gid.tab.sort genesGffs.ids genesGffs.ids.uniq
cd ../
# remove fasta files as included in gzipped tar file
rm -r acembly.ncbi_35.genes.proteins.fasta
cd acembly.ncbi_35.genes.gff
gzip *.gff
#######################################################################
# CLEANUP OF DANRER2 BLASTZ SWAP (DONE, 2007-06-25, hartera)
ssh kkstore02
cd /cluster/store5/gs.18/build35/bed/blastz.danRer2.2004-12-08
# remove old axtChrom directory
rm -r axtChrom.orig
cd axtChain
# chain directories can be recreated from all.chain files so remove
rm -r chain chainAR
# gzip net files
gzip net/*.net
# gzip .over files
gzip over/*.over
# removed ~1.3 G data
#############################################################################
# Duke DNaseI HS (2007-06-26 kate)
#
# Submitted by Terry Furey <terry.furey@duke.edu>
# in collaboration with Greg Crawford
# Resubmitted 9/26/07 from FTP site
# Resubmitted 10/25/07 from FTP site
ssh kkstore02
cd /cluster/data/hg17/bed
# download 19GB archive from Duke site, password protected,
# user=ucsc, password=dnase
mkdir -p dukeDnase/2007-10-25/lab
cd dukeDnase/2007-10-25/lab
sftp ucsc@sftp.igsp.duke.edu
mget *
# dukeDnaseHsCd4.bed
# dukeDnaseHsCd4Wiggle.tgz
# unpack and load wiggle (signal) data
nice tar xvfz dukeDnaseHsCd4Wiggle.tgz
# packaged as chr*_dukeDnaseHsCd4Wiggle.out
# fixedStep 1 files
# create wiggle and load into database
cd ..
cat lab/chr*.out | nice wigEncode stdin \
dukeDnaseCd4Signal.wig dukeDnaseCd4Signal.wib >&! wigencode.log &
# upper limit 25.74, lower limit -0.66
ssh hgwdev
cd /cluster/data/hg17/bed/dukeDnase/2007-10-25
rm -f /gbdb/hg17/wib/dukeDnaseCd4Signal.wib
ln -s /cluster/data/hg17/bed/dukeDnase/2007-10-25/dukeDnaseCd4Signal.wib \
/gbdb/hg17/wib
nice hgLoadWiggle hg17 dukeDnaseCd4Signal -pathPrefix=/gbdb/hg17/wib \
dukeDnaseCd4Signal.wig
# load bed file (sites)
ssh hgwdev
cd /cluster/data/hg17/bed/dukeDnase/2007-10-25/
set table = dukeDnaseCd4Sites
sed "s/bed5FloatScore/$table/" ~/kent/src/hg/lib/bed5FloatScore.sql > \
$table.sql
hgsql hg17 -e "DROP TABLE IF EXISTS $table"
hgsql hg17 < $table.sql
hgLoadBed -sqlTable=$table.sql hg17 $table lab/dukeDnaseHsCd4.bed
# Loaded 95723 elements of size 6
# min value: 0.000103164
# max value: 25.7442
#textHistogram -col=5 lab/dukeDnaseHsCd4.bed -binSize=50
# 300 ******************************** 11789
# 350 ************************************************************ 22253
# 400 ********************************************* 16854
# 450 ********************************* 12333
# 500 ************************* 9179
# 550 ********************* 7870
# 600 ************* 4987
# 650 ********* 3271
# 700 ******** 2789
# 750 ****** 2153
# 800 **** 1303
# 850 ** 567
# 900 * 219
# 950 85
# 1000 71
###########################################################################
# Stanford ChIP-seq (Apr - July 2007, Heather)
# Submitted 2007-03-14 by David Johnson <seasquirtdoctor@gmail.com>
# 25bp tags (Solexa sequencing of IP fragments)
# genome-wide, but funded by ENCODE, hence the location of the data
ssh hgwdev
cd /cluster/data/encode/stanford
mkdir -p 2007-03-14/lab
cd 2007-03-14/lab
sort NRSF_chipseq_hg17.bed > data.bed
sort NRSF_chipseq_control_hg17.bed > control.bed
fix.pl < data.bed > fix.bed
fix.pl < control.bed > control_fix.bed
hgLoadBed hg17 stanfordNRSFEnriched fix.bed -tab
hgLoadBed hg17 stanfordNRSFControl control_fix.bed -tab
############################################################################
# Stanford ChIP/chip
# Submitted 2007-07-11 by David Johnson (seasquirtdoctor@gmail.com)
# Replaces submission from 2007-03-23
# 12 subtracks
# genome-wide, but funded by ENCODE, hence the location of the data
ssh hgwdev
cd /cluster/data/encode/stanford/2007-07-11/lab
# Dave gave us bed 5, we need bed 4
./shrink.sh
./load.sh
#########################################################################
# REGULATORY POTENTIAL 7X UPDATED (DONE - 2007-08-01 - Hiram)
# download data from "James Taylor" <james@bx.psu.edu>
ssh kkstore02
mkdir /cluster/data/hg17/bed/regPotential7X.update
cd /cluster/data/hg17/bed/regPotential7X.update
## In theory, only chr4, chr8, chr9 and chrY have been updated; fetch them
## all and verify with ../regPotential7X
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg17/chr${C}.scores.truncated.bz2"
echo "DONE - chr${C}.scores.bz2"
done
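# a possible check that only those chroms actually changed, assuming the
# previous download still sits in ../regPotential7X (a sketch; the actual
# comparison used was not recorded):
for F in chr*.scores.truncated.bz2
do
    cmp -s ${F} ../regPotential7X/${F} && echo "${F} unchanged" \
        || echo "${F} updated"
done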
# create download gzip files from the bz2 files:
time for F in chr*.scores.truncated.bz2
do
C=`echo $F | awk -F'.' '{print $1}'`
echo -n "${C}.regPotential7X.hg17.gz working ... "
bzcat ${F} | gzip > ${C}.regPotential7X.hg17.gz
echo "done"
done
time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
zcat chr${C}.regPotential7X.hg17.gz
done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib
# Converted stdin, upper limit 1.00, lower limit -0.00
# real 16m51.215s
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg17/bed/regPotential7X.update
mkdir /gbdb/hg17/wib/061116
ln -s /cluster/data/hg17/bed/regPotential7X.update/regPotential7X.wib \
/gbdb/hg17/wib/061116/regPotential7X.wib
# using the tmpDir is faster since it is on local disk and it will
# clean up any temporary .tab file it creates there
time nice -n +19 hgLoadWiggle -tmpDir=/scratch/tmp \
-pathPrefix=/gbdb/hg17/wib/061116 hg17 regPotential7X regPotential7X.wig
# real 0m40.523s
# How about a histogram of the data.
ssh kolossus
cd /cluster/data/hg17/bed/regPotential7X.update
time nice -n +19 hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 \
-hBinCount=100 -hMinVal=0.0 -db=hg17 regPotential7X > histogram.data 2>&1
# real 3m3.829s
# 73 % of the data values are zero
# renaming file directory -- kuhn 08-17-2007
cd /gbdb/hg17/wib
mv 061116 regPot061116
hgsql -e " update regPotential7X SET file = \
/gbdb/hg17/wib/regPot061116/regPotential7X.wib" hg17
Query OK, 2366123 rows affected (31.46 sec)
Rows matched: 2366123 Changed: 2366123 Warnings: 0
###########################################################################
## Create gc5Base download raw data file (DONE - 2007-08-29 - Hiram)
ssh kkstore02
cd /cluster/data/hg17/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 \
hg17 /cluster/data/hg17/hg17.2bit 2> /dev/null \
| gzip > hg17.gc5Base.txt.gz
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg17/gc5Base
cd /usr/local/apache/htdocs/goldenPath/hg17/gc5Base
ln -s /cluster/data/hg17/bed/gc5Base/hg17.gc5Base.txt.gz .
############################################################################
# INDEL-BASED CONSERVATION TRACK (DONE, 2007-10-02 - 2007-10-03, hartera)
# Data from Gerton Lunter (gerton.lunter@anat.ox.ac.uk), MRC
# Functional Genetics Unit, University of Oxford, United Kingdom.
# Data is from the paper:
# Lunter G, Ponting CP and Hein J. Genome-wide identification of human
# functional DNA using a neutral indel model. PLoS Comput Biol. 2006
# Jan;2(1):e5.
ssh kkstore02
mkdir -p /cluster/data/hg17/bed/consIndels/data
cd /cluster/data/hg17/bed/consIndels/
# Add a README.indels with the e-mail from Gerton Lunter, copied over
# from the hg18 consIndels directory
cp /cluster/data/hg18/bed/consIndels/README.indels .
# get the data
cd data
wget --timestamping \
http://wwwfgu.anat.ox.ac.uk/~gerton/IPS/IPSs.zip
# A 15 Mb zip file of GFF files. This contains data for hg17
# comparing it to mm5 (NCBI Build 33) and
# canFam1 (Broad Institute, July 2004). The chr*.mm5.GFF data is old
# data that can be removed.
unzip IPSs.zip
cd /cluster/data/hg17/bed/consIndels
rm ./data/*mm5.GFF
foreach f (./data/*.GFF)
set r = $f:r
echo $r
grep -v "track" $f > ${r}NoHeader.gff
end
# strip off the end of the name e.g. IGS0001:p=.26
# so that the name displayed is short, e.g. IGS0001.1. The score field
# is used to determine colouring, and this is calculated from the FDR.
ssh kkstore02
cd /cluster/data/hg17/bed/consIndels
perl -pi.bak -e \
's/(IGS[0-9a-z]+\.?[0-9XY]*):p=?<?\.[0-9]+/$1/' \
./data/chr*NoHeader.gff
# check this looks ok then clean up
rm *.bak
# makes sense to store this as a BED5 table in order to use the score
# for display.
foreach f (./data/*NoHeader.gff)
awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1,$4,$5,$9,$6}' $f \
>> consIndelsHg17Mm5CanFam1.bed
end
# load data
ssh hgwdev
cd /cluster/data/hg17/bed/consIndels
hgLoadBed hg17 consIndelsHg17Mm5CanFam1 consIndelsHg17Mm5CanFam1.bed
# Loaded 593298 elements of size 5
# Get the IDs, posterior probabilities (p) for the segment being neutral,
# and the FDR from the original GFFs for a separate table. Some items
# have p<.001. Cannot do Table Browser queries restricting
# p to <, =, or > a specified value unless all values are floats.
# Contacted the data contributor, Gerton Lunter, and he said it would be
# ok to change all p<.001 to p=0.0005
ssh kkstore02
cd /cluster/data/hg17/bed/consIndels/
awk '{if ($1 !~ /random/) print $1;}' /cluster/data/hg17/chrom.sizes \
| sed -e 's/chr//' | sort -n > chrom.lst
grep -v 'hap' chrom.lst > tmp2
tail +4 tmp2 > tmp3
echo "X" >> tmp3
echo "Y" >> tmp3
mv tmp3 chrom.lst
rm tmp2
# chrom.lst has a list of chroms 1-22, then X and Y
foreach c (`cat chrom.lst`)
echo $c
foreach f (./data/chr${c}.GFF)
echo $f
awk 'BEGIN {FS="\t"} {OFS="\t"}{if ($9 ~ /IGS/) print $9,$6;}' $f \
| sed -e 's/:/\t/' \
| sed -e 's/p=\./0\./' | sed -e 's/p<\.001/0\.0005/' \
>> consIndelsConf.txt
end
end
# Add the FDR.
# For this set, there is no false discovery rate (FDR) field but it
# can be related to the score. If score is 999 then FDR is 1% (0.01) and
# if score is 500 then FDR is 10% (0.10). Score is in column 6.
# there are no GFF files for the haplotype chroms
awk 'BEGIN {FS="\t"} {OFS="\t"} \
    {if ($3 ~ /500/) print $1, $2, "0.10"; \
     else if ($3 ~ /999/) print $1, $2, "0.01";}' \
    consIndelsConf.txt > consIndelsHg17Mm5CanFam1Conf.txt
# Create a table definition for the table of identifier, posterior
# probability and false discovery rate (FDR). Already created for hg18
# track (see hg18.txt). It is $HOME/kent/src/hg/lib/itemConf.as.
ssh hgwdev
cd /cluster/data/hg17/bed/consIndels
hgLoadSqlTab hg17 consIndelsHg17Mm5CanFam1Conf \
$HOME/kent/src/hg/lib/itemConf.sql \
consIndelsHg17Mm5CanFam1Conf.txt
# check that all items are in this table.
hgsql -N -e 'select distinct(name) from consIndelsHg17Mm5CanFam1;' hg17 \
| sort > consIndels.names.sort
hgsql -N -e 'select distinct(id) from consIndelsHg17Mm5CanFam1Conf;' hg17 \
| sort > consIndels.idsfromConf.sort
wc -l *.sort
# 593298 consIndels.idsfromConf.sort
# 593298 consIndels.names.sort
comm -12 consIndels.names.sort consIndels.idsfromConf.sort | wc -l
# 593298
# so all element IDs are in both tables.
# cleanup
rm ./data/*.bak *.sort
# add trackDb/human/hg17/trackDb.ra entry and add description that
# was written by the data contributor. Add code to hgc.c to display
# the posterior probability and the FDR on the details page for
# track elements. Gerton Lunter provided a description for the data
# on 2007-09-12.
cd ~/kent/src/hg/makeDb/trackDb/human/hg17
cp ../hg18/consIndelsHg18Mm8CanFam2.html consIndelsHg17Mm5CanFam1.html
# check this is correct and add trackDb.ra track entry and search.
##############################################################
# NIMH Bipolar Genome Graphs built-in (DONE 2007-10-04 Galt)
#
# See hg18.txt for details.
#############################################################
#############################################################
# CCC Genome Graphs (DONE 2007-Sept Andy)
#
# See hg18 make doc.
###############################################################
# Affy Transcriptome Phase 3 chrY fix (DONE 2007-12-10, Andy)
ssh kkstore05
cd /cluster/store12/hg17/bed/affyTxnPhase3/raw
zcat sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz | grep -n chrY
#256994657:variableStep chrom=chrY span=1
zcat sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz | head -n256994656 | gzip -c >tmp.wig.gz
mv tmp.wig.gz sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz
zcat sRNA.affyTxnPhase3HeLaTopStrand.wig.gz | grep -n chrY
#256994657:variableStep chrom=chrY span=1
zcat sRNA.affyTxnPhase3HeLaTopStrand.wig.gz | head -n256994656 | gzip -c > tmp.wig.gz
mv tmp.wig.gz sRNA.affyTxnPhase3HeLaTopStrand.wig.gz
ssh kolossus
cd /cluster/store12/hg17/bed/affyTxnPhase3/raw
wigEncode sRNA.affyTxnPhase3HeLaBottomStrand.wig.gz affyTxnPhase3HeLaBottomStrand.{wig,wib}
wigEncode sRNA.affyTxnPhase3HeLaTopStrand.wig.gz affyTxnPhase3HeLaTopStrand.{wig,wib}
mv *.wig /cluster/data/hg17/bed/affyTxnPhase3/wig/
mv *.wib /cluster/data/hg17/bed/affyTxnPhase3/wib/
ssh hgwdev
cd /cluster/data/hg17/bed/affyTxnPhase3/wig
hgLoadWiggle hg17 affyTxnPhase3HeLaTopStrand{,.wig}
hgLoadWiggle hg17 affyTxnPhase3HeLaBottomStrand{,.wig}
###########################################################################
# Reload CCDS (2007-12-12 markd)
# import ccds database as described in ccds.txt
set db=hg17
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
ccdsMgcMap
# << emacs
############################################################################
# ADD LINKS TO GENETESTS ON hgGene DETAILS PAGE (DONE 12/12/07 Fan)
# See hg18.txt for details.
############################################################################
# Reload CCDS (2008-02-01 markd)
# import ccds database as described in ccds.txt
set db=hg17
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
# << emacs
############################################################################
# CREATE huge TABLE FOR HuGE LINK (DONE 3/6/08, Fan)
# See the HuGE section in hg18.txt for details.
############################################################################
############################################################################
-# DGV V6 (DATABASE OF GENOMIC VARIANTS) (DONE 2/23/09 angie)
+# DGV V7 (DATABASE OF GENOMIC VARIANTS) (DONE 3/11/09 angie)
+# DGV V6 thin regions dropped 2/23/09
# DGV V6 with useless thin regions done 11/12/08
# DGV V5 done 8/11/08
# DGV V4 done 5/9/08
ssh hgwdev
- mkdir /cluster/data/hg17/bed/dgv.v6
- cd /cluster/data/hg17/bed/dgv.v6
+ mkdir /hive/data/genomes/hg17/bed/dgv.v7
+ cd /hive/data/genomes/hg17/bed/dgv.v7
wget --timestamping \
- http://projects.tcag.ca/variation/downloads/variation.hg17.v6.nov.2008.txt
+ http://projects.tcag.ca/variation/downloads/variation.hg17.v7.mar.2009.txt
wget --timestamping \
- http://projects.tcag.ca/variation/downloads/indel.hg17.v6.nov.2008.txt
+ http://projects.tcag.ca/variation/downloads/indel.hg17.v7.mar.2009.txt
# shuffle fields into bed8+ (input has one start coord==0, but min
# nonzero size of 99 not 100 implies most coords are 1-based):
- foreach f (*.v6.*.txt)
+ foreach f (*.v7.*.txt)
tail -n +2 $f \
| perl -wpe 'chomp; \
($id, $landmark, $chr, $start, $end, $varType, \
undef, undef, undef, $ref, $pmid, $method, \
undef, undef, undef, undef, $sample) = split("\t"); \
$id =~ s/^Variation_//; \
$start-- unless ($start == 0); \
$landmark = "" if ($landmark =~ /^chr.*\d\.\.\d/); \
$rgb = "255,128,0"; \
$rgb = "200,0,0" if ($varType =~ /^Inv/); \
$rgb = "0,100,0" if ($varType eq "InDel"); \
$_ = join("\t", $chr, $start, $end, $id, 0, "+", \
$start, $end, $rgb, $landmark, $varType, \
$ref, $pmid, $method, $sample) . "\n";' \
> $f:r.bed
end
- hgsql hg17 -e 'rename table dgv to dgvV5'
+ hgsql hg17 -e 'rename table dgv to dgvV6'
hgLoadBed hg17 dgv *.bed \
-onServer -sqlTable=$HOME/kent/src/hg/lib/dgv.sql -tab
-#Loaded 17479 elements of size 15
+#Loaded 17473 elements of size 15
############################################################################
# KIDD/EICHLER DISCORDANT CLONE ENDS (DONE 6/10/08 angie)
# 8/11/08: Added kiddEichlerToNcbi (ID xref table).
ssh kkstore02
mkdir /cluster/data/hg17/bed/kiddEichlerDiscordant
cd /cluster/data/hg17/bed/kiddEichlerDiscordant
wget --user=uuuu --password=ppppppp \
http://eichlerlab.gs.washington.edu/kiddj/downloads/fosmids.hg17.tgz
tar xvzf fosmids.hg17.tgz
cd bd35
# 8 clone-end linkedFeaturesSeries tracks and one bed custom track.
# bed has illegal coords (maybe for unplaced ends?).
# Load the tracks (translate bacEndPairs format to bed12):
ssh hgwdev
cd /cluster/data/hg17/bed/kiddEichlerDiscordant/bd35
foreach f (abc*.txt)
set track = `echo $f:r \
| perl -wpe 's/^(G|abc)(\d+)discordant/kiddEichlerDisc\u$1$2/ || die;'`
if ($status != 0) break
perl -wpe 'next if s/^#.*\n$//; \
($c, $s, $e, $n, $sc, $st, undef, $bs, $bSt, $bSz)=split; \
@bSts = split(",", $bSt); @bSzs = split(",", $bSz); \
$s--; \
if ($n =~ /transchr/) { \
$bs = 1; \
$#bSts = 0; $#bSzs = 0; \
$bSts[0]--; $e--; \
$bSts[0] -= $s; \
} elsif ($n =~ /OEA/) { \
$bSts[0]--; \
die "bSts[0] $bSts[0] != s $s\n" if ($bSts[0] != $s); \
$bE = $bSts[0] + $bSzs[0]; \
die "bE $bE != e $e\n" if ($bE != $e); \
$bSts[0] -= $s; \
} elsif ($bs == 2) { \
$bSts[0]--; $bSts[1]--; \
if ($bSts[0] > $bSts[1]) { \
# warn "Swapping $n ($bSts[0] > $bSts[1])\n"; \
$tmp = $bSts[0]; $bSts[0] = $bSts[1]; $bSts[1] = $tmp; \
$tmp = $bSzs[0]; $bSzs[0] = $bSzs[1]; $bSzs[1] = $tmp; \
} \
if ($bSts[0] != $s) { \
# warn "Tweaking $n start from $s to $bSts[0]\n"; \
$s = $bSts[0]; \
} \
$bE0 = $bSts[0] + $bSzs[0]; \
$bE1 = $bSts[1] + $bSzs[1]; \
$bE = $bE0 > $bE1 ? $bE0 : $bE1; \
if ($bE != $e) { \
# warn "Tweaking $n end from $e to $bE\n"; \
$e = $bE; \
} \
$bSts[0] -= $s; $bSts[1] -= $s; \
} else { die "#blks is $bs for $n\n"; } \
$bSt = join(",", @bSts) . ","; $bSz = join(",", @bSzs) . ","; \
$rgb = ($n =~ /deletion/) ? "224,0,0" : \
($n =~ /insertion/) ? "0,0,224" : \
($n =~ /inversion/) ? "0,224,0" : \
($n =~ /OEA/) ? "240,160,64" : "0,0,0"; \
$_ = join("\t", $c, $s, $e, $n, $sc, $st, $s, $e, $rgb, \
$bs, $bSz, $bSt) . "\n";' $f \
| hgLoadBed -tab hg17 $track stdin
end
perl -pe 'next if s/^track .*\n$//; \
($c, $s, $e, $n, $sc, $st, $tS, $tE, $r, $bs, $bSz, $bSt) = split; \
@bSts = split(",", $bSt); @bSzs = split(",", $bSz); \
if ($n =~ /transchr/) { \
$bs = 1; \
$#bSts = 0; $#bSzs = 0; \
} elsif ($n =~ /OEA/) { \
$s--; # weird that this is required only for OEA here \
die "$n: bSts[0] $bSts[0] != 0\n" if ($bSts[0] != 0); \
$bE = $s + $bSts[0] + $bSzs[0]; \
die "$n: bE $bE != e $e\n" if ($bE != $e); \
} elsif ($bs == 2) { \
$bSts[0] += $s; $bSts[1] += $s; \
if ($bSts[0] > $bSts[1]) { \
# warn "Swapping $n ($bSts[0] > $bSts[1])\n"; \
$tmp = $bSts[0]; $bSts[0] = $bSts[1]; $bSts[1] = $tmp; \
$tmp = $bSzs[0]; $bSzs[0] = $bSzs[1]; $bSzs[1] = $tmp; \
} \
if ($bSts[0] != $s) { \
# warn "Tweaking $n start from $s to $bSts[0]\n"; \
$s = $bSts[0]; \
} \
$bE0 = $bSts[0] + $bSzs[0]; \
$bE1 = $bSts[1] + $bSzs[1]; \
$bE = $bE0 > $bE1 ? $bE0 : $bE1; \
if ($bE != $e) { \
# warn "Tweaking $n end from $e to $bE\n"; \
$e = $bE; \
} \
$bSts[0] -= $s; $bSts[1] -= $s; \
} else { die "#blks is $bs\n"; } \
$bSt = join(",", @bSts) . ","; $bSz = join(",", @bSzs) . ","; \
$tS = $s; $tE = $e; \
$rgb = ($n =~ /deletion/) ? "224,0,0" : \
($n =~ /insertion/) ? "0,0,224" : \
($n =~ /inversion/) ? "0,224,0" : \
($n =~ /OEA/) ? "240,160,64" : "0,0,0"; \
$_ = join("\t", $c, $s, $e, $n, $sc, $st, $tS, $tE, $rgb, \
$bs, $bSz, $bSt) . "\n";' G248discordant.txt \
| hgLoadBed -tab hg17 kiddEichlerDiscG248 \
stdin
# 8/11/08: get clone ID -> NCBI acc mapping.
ssh kkstore02
mkdir /cluster/data/hg17/bed/kiddEichlerDiscordant/cloneIds
cd /cluster/data/hg17/bed/kiddEichlerDiscordant/cloneIds
# Saved off emailed file from Jeff Kidd to clones_used_3nov.txt.accessions;
# get trace archive trace names for end reads:
foreach n (7 9 10 11 12 13 14)
wget http://hgsv.washington.edu/general/download/clone_mapping/ABC$n/ABC$n.conversion.gz
end
# ABC8 has _a and _b files:
wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_a.conversion.gz
wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_b.conversion.gz
# That file is not available for G248.
gunzip *.gz
# Combine the relevant data from the .conversion files; keep only those
# IDs that are used in the tracks.
cut -f 4 ../bd35/*discordant.txt \
| egrep -v '^(#chrom|track|name)' \
| sed -e 's/,.*//' \
| sort -u > discIds.txt
perl -wpe 's/^OurClone.*\n// || s/^\d+_(HUMAN|\d+_).*\n$// || \
s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\.(F|FORWARD|R|REVERSE)\.\d+\t(\w+)$/$2$3_$6\t$7\t$8/ || \
warn "Parse line $.:\n$_";' \
*.conversion \
| sort > allEnds.tab
grep -wFf discIds.txt allEnds.tab > discEnds.txt
wc -l discIds.txt allEnds.tab discEnds.txt
# 223051 discIds.txt
# 17498527 allEnds.tab
# 573974 discEnds.txt
# discEnds.txt has 2 lines (forward & reverse) for most of its ids...
# ideally we would see 2*(223051) lines in discEnds.txt.
# Get a list of which discordant clone IDs don't have ends in *.conv*:
cut -f 1 allEnds.tab | uniq > all.tmp
comm -23 discIds.txt all.tmp > discNotInConv.txt
wc -l discNotInConv.txt
#16318 discNotInConv.txt
cat > combine.pl <<'_EOF_'
#!/usr/bin/perl -w
use strict;
my ($cloneFile, $endsFile) = @ARGV;
open(CLONES, $cloneFile) || die "Can't open $cloneFile: $!\n";
my %idInfo;
while(<CLONES>) {
(s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\t(\w+)$/$2$3_$6\t$7/ && m/^(\w+)\t(\w+)/) || \
m/^(G248\w+)\t(\w+)$/ || die "Parse line $.:$_";
my ($id, $acc) = ($1, $2);
$idInfo{$id}->[0] = $acc;
}
close(CLONES);
open(ENDS, $endsFile) || die "Can't open $endsFile: $!\n";
while (<ENDS>) {
chomp; my ($id, $dir, $traceName) = split("\t");
if ($dir =~ /^F/) {
$idInfo{$id}->[1] = $traceName;
} elsif ($dir =~ /^R/) {
$idInfo{$id}->[2] = $traceName;
} else { die "What is this \$dir: $dir ?\n"; }
}
close(ENDS);
foreach my $id (sort keys %idInfo) {
my $infoRef = $idInfo{$id};
$infoRef->[0] = '' if (! defined $infoRef->[0]);
$infoRef->[1] = 0 if (! defined $infoRef->[1]);
$infoRef->[2] = 0 if (! defined $infoRef->[2]);
print join("\t", $id, @{$infoRef}) . "\n";
}
'_EOF_'
# << emacs
chmod a+x combine.pl
combine.pl clones_used_3nov.txt.accessions discEnds.txt \
| sort > kiddEichlerToNcbi.txt
# Load table:
ssh hgwdev
cd /cluster/data/hg17/bed/kiddEichlerDiscordant/cloneIds
hgLoadSqlTab hg17 kiddEichlerToNcbi \
$HOME/kent/src/hg/lib/kiddEichlerToNcbi.sql kiddEichlerToNcbi.txt
# Add to makeDb/schema/all.joiner, then check:
runJoiner.csh hg17 kiddEichlerToNcbi $HOME/kent/src/hg/makeDb/schema
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
# Vertebrate-wide transMap alignments were built; tracks are created and
# loaded by a single Makefile, available from:
# svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
# See doc/builds.txt for specific details.
############################################################################
############################################################################
# KIDD/EICHLER VALIDATED SITES (DONE 6/11/08 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/kiddEichlerValid
cd /cluster/data/hg17/bed/kiddEichlerValid
wget http://hgsv.washington.edu/general/download/validated_sites/Kidd_2008_sample_level_valided_sites.xls
# Open in Excel, save as Kidd_2008_sample_level_valided_sites.txt,
# move first 9 lines to Kidd_2008_sample_level_valided_sites.header.
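# The header split can be scripted like this (a sketch; this step was
# actually done by hand in a text editor):
head -9 Kidd_2008_sample_level_valided_sites.txt \
    > Kidd_2008_sample_level_valided_sites.header
tail +10 Kidd_2008_sample_level_valided_sites.txt > tmp.txt
mv tmp.txt Kidd_2008_sample_level_valided_sites.txt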
# Split into one file per individual:
foreach id (Abc7 Abc8 Abc9 Abc10 Abc11 Abc12 Abc13 Abc14 G248)
set ID = `echo $id | tr 'a-z' 'A-Z'`
grep ${ID}_ Kidd_2008_sample_level_valided_sites.txt \
| perl -wpe 'chomp; s/\r//; ($c, $s, $e, $n, $t) = split; \
$rgb = ($n =~ /deletion/) ? "224,0,0" : \
($n =~ /insertion/) ? "0,0,224" : \
($n =~ /inversion/) ? "0,224,0" : "0,0,0"; \
$t =~ s/:/,/g; \
$n =~ s/^'$ID'_//; $n = "$n,$t"; \
$_ = join("\t", $c, $s, $e, $n, "0", "+", $s, $e, $rgb) . \
"\n";' \
| hgLoadBed -tab hg17 kiddEichlerValid$id stdin
end
################################################
# SPLIT EXPRESSION & REGULATION GROUPS
# (2008-09-09 kate)
echo "insert into grp (name, label, priority, defaultIsClosed) values ('expression', 'Expression', 4.5, 1)" | hgsql hg17
echo "update grp set label='Regulation' where name='regulation'" | hgsql hg17
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
# update genbank.conf:
hg17.upstreamGeneTbl = refGene
hg17.upstreamMaf = multiz17way /hive/data/genomes/hg17/bed/multiz17way/species.lst
#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08 angie)
ssh hgwdev
mkdir /cluster/data/hg17/bed/mrnaPcr
cd /cluster/data/hg17/bed/mrnaPcr
# First, get consistent FA and PSL for UCSC Genes.
genePredToBed /cluster/data/hg17/bed/kgHg17F/try3/kg3Try3.gp > ucscGenes.bed
hgsql hg17 -NBe 'select kgId,geneSymbol from kgXref' \
| perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
> idSub.txt
subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
sequenceForBed -keepName -db=hg17 -bedIn=ucscGenesIdSubbed.bed \
-fastaOut=stdout \
| faToTwoBit -ignoreDups stdin kgTargetSeq.2bit
cut -f 1-10 /cluster/data/hg17/bed/kgHg17F/try3/kg3Try3.gp \
| genePredToFakePsl hg17 stdin kgTargetAli.psl /dev/null
# Load up the UCSC Genes target PSL table and put 2bit in /gbdb::
cd /cluster/data/hg17/bed/mrnaPcr
hgLoadPsl hg17 kgTargetAli.psl
mkdir /gbdb/hg17/targetDb
ln -s /cluster/data/hg17/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg17/targetDb/
# Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
# /gbdb/hg17/targetDb/kgTargetSeq.2bit .
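# The request amounts to something like this on the blat server (a
# sketch of what cluster-admin runs; the exact invocation may differ):
# gfServer start blat13 17797 -stepSize=5 \
#     /gbdb/hg17/targetDb/kgTargetSeq.2bit &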
ssh hgwdev
# Add records to hgcentraltest blatServers and targetDb:
hgsql hgcentraltest -e \
'INSERT into blatServers values ("hg17Kg", "blat13", 17797, 0, 1);'
hgsql hgcentraltest -e \
'INSERT into targetDb values("hg17Kg", "UCSC Genes", \
"hg17", "kgTargetAli", "", "", \
"/gbdb/hg17/targetDb/kgTargetSeq.2bit", 1, now(), "");'
#############################################################################
# fox2ClipSeq from Gene Yeo (DONE - 2009-01-08 - Hiram)
mkdir /hive/data/genomes/hg17/bed/fox2ClipSeq
cd /hive/data/genomes/hg17/bed/fox2ClipSeq
# fetch data
wget --timestamping \
'http://www.snl.salk.edu/~geneyeo/stuff/FOX2.rmsk.BED.gz' \
-O FOX2.rmsk.BED.gz
# remove track line and sort
zcat FOX2.rmsk.BED.gz | grep -v "^track" | sort -k1,1 -k2,2n \
| gzip > sorted.bed.gz
# separate strand data, and turn the positive into blue
zcat sorted.bed.gz | awk '$6 == "+"' | sed -e "s/255,0,0/0,0,255/" \
| gzip > forwardStrand.bed.gz
zcat sorted.bed.gz | awk '$6 == "-"' | gzip > reverseStrand.bed.gz
# turn into wiggle density plot
zcat forwardStrand.bed.gz | bedItemOverlapCount hg17 stdin \
| wigEncode stdin fox2ClipSeqDensityForwardStrand.wig \
fox2ClipSeqDensityForwardStrand.wib
# Converted stdin, upper limit 2401.00, lower limit 1.00
zcat reverseStrand.bed.gz | bedItemOverlapCount hg17 stdin \
| wigEncode stdin fox2ClipSeqDensityReverseStrand.wig \
fox2ClipSeqDensityReverseStrand.wib
# Converted stdin, upper limit 1406.00, lower limit 1.00
# and load tables
zcat forwardStrand.bed.gz reverseStrand.bed.gz \
| hgLoadBed hg17 fox2ClipSeq stdin
# Loaded 4418298 elements of size 9
ln -s `pwd`/*.wib /gbdb/hg17/wib
hgLoadWiggle hg17 fox2ClipSeqDensityForwardStrand \
fox2ClipSeqDensityForwardStrand.wig
hgLoadWiggle hg17 fox2ClipSeqDensityReverseStrand \
fox2ClipSeqDensityReverseStrand.wig
# add composite track definitions to makeDb/trackDb/human/trackDb.ra
#############################################################################