src/hg/makeDb/doc/hg18.txt 1.378
1.378 2009/08/23 04:14:05 hartera
Documented adding new code to handle Vega Genes track and loaded a vegaGtp table.
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.377
retrieving revision 1.378
diff -b -B -U 1000000 -r1.377 -r1.378
--- src/hg/makeDb/doc/hg18.txt 18 Aug 2009 22:47:49 -0000 1.377
+++ src/hg/makeDb/doc/hg18.txt 23 Aug 2009 04:14:05 -0000 1.378
@@ -1,28663 +1,28703 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on
# NCBI build 36 (October 2005 freeze)
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +---------------+-------------------------------------+
# | tableName | type |
# +---------------+-------------------------------------+
# | knownGene | genePred knownGenePep knownGeneMrna |
# | refGene | genePred refPep refMrna |
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | mgcGenes | genePred |
# | ensGene | genePred ensPep |
# | nscanGene | genePred nscanPep |
# | sgpGene | genePred sgpPep |
# | geneid | genePred geneidPep |
# | genscan | genePred genscanPep |
# | exonWalk | genePred |
# | ecoresTetNig1 | genePred |
# +---------------+-------------------------------------+
# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES
# ---------------------------------------
# 10/06/2005
# Make gs.19 directory, gs.19/build36 directory, and gs.19/ffa directory.
ssh kkstore02
mkdir /cluster/store11/gs.19
mkdir /cluster/store11/gs.19/build36
mkdir /cluster/store11/gs.19/agp
mkdir /cluster/store11/gs.19/ffa
# Make a symbolic link from /cluster/store1 to this location
# (I assume there is some use for this later ?)
cd /cluster/store1
ln -s /cluster/store11/gs.19 ./gs.19
ln -s /cluster/store11/gs.19/build36 /cluster/data/hg18
# Make a symbolic link from your home directory to the build dir:
# (Investigate what this is used for, may no longer be necessary)
cd
ln -s /cluster/store11/gs.19/build36 ~/oo
# NCBI download site, fetch everything into this one directory:
# with the machine and password in your $HOME/.netrc file, this
# wget command will require no login. Your $HOME/.netrc file
# should be protected with 'chmod 600 .netrc' so that no one else
# can read the login credentials. (There were some early files
# that later moved into an OLD subdirectory. They were broken.)
# 11/16/2005
# Received answer from Greg to go ahead with the new build.
ssh kkstore02
mkdir /cluster/store11/gs.19/ncbi
cd /cluster/store11/gs.19/ncbi
bash
wget --timestamp ftp://ftp-private.ncbi.nih.gov/build_36/*
# New to this build is the sequence NC_001807, which is the
# mitochondrial sequence. The NC_ prefix is new to the process
# and will have to be accounted for below. The other two special
# prefixes are similar to what was seen before:
# from DR52.agp NG_002392
# Homo sapiens major histocompatibility complex, class II,
# DR52 haplotype (DR52) on chromosome 6
# and from DR53.agp NG_002433
# Homo sapiens major histocompatibility complex, class II,
# DR53 haplotype (DR53) on chromosome 6
# Fixup seq_contig.md
#
# It has a bunch of stuff belonging to the Celera
# genome assembly. Filter those out. I don't know what the
# NT_07959[0-7] items are, but there are no definitions for them
# in the agp files and no sequence in any fa.gz file.
# Fixup the names for the NG_ items, and change chrom MT to be M
# get the seq_contig.md file Craig just made for us on 11/28/05.
cd /cluster/store11/gs.19/ncbi
wget --timestamp ftp://ftp-private.ncbi.nih.gov/build_36/seq_contig.md
# remove Celera and Toronto entries
# and replace chrom number for those haplotypes
ssh hgwdev
cd /cluster/store11/gs.19/build36
egrep -v "Celera|NT_07959[0-7]" ../ncbi/seq_contig.md |grep -v CRA_TCA >seq_contig0.tab
hgsql hg18 -e 'drop table seq_contig0'
hgsql hg18 <~/src/hg/lib/seq_contig0.sql
hgsql hg18 -e 'load data local infile "seq_contig0.tab" into table seq_contig0'
# fix seq_contig and
# get the randoms sorted in proper order. The createNcbiLifts script
# does not work correctly if the randoms are not grouped together
# by chromosome.
fixMd0 hg18 |sed -e "s/6_qbl_hap1/6_qbl_hap2/"| sed -e "s/MT/M/" | grep -v "|" >seq_contig1.tab
hgsql hg18 -e 'drop table seq_contig1'
hgsql hg18 <~/src/hg/lib/seq_contig1.sql
hgsql hg18 -e 'load data local infile "seq_contig1.tab" into table seq_contig1'
fixMd hg18 seq_contig1 >seq_contig.md
# This pulls out all the randoms and groups them within the
# same chrom while leaving them in the same order as they originally
# appeared (warning: this is BASH code ...)
bash
grep "|" seq_contig0.tab | awk -F"|" '{print $1}' | \
awk '{print $2}' | sort -n -u | while read CHR
do
grep "[^0-9]${CHR}|" seq_contig0.tab
done >> seq_contig.md
exit
hgsql hg18 -e 'drop table seq_contig'
hgsql hg18 <~/src/hg/lib/seq_contig.sql
hgsql hg18 -e 'load data local infile "seq_contig.md" into table seq_contig'
# FYI: agp file format documented at:
# http://www.ncbi.nlm.nih.gov/Genbank/WGS.agpformat.html
# fixup a couple of names for our own purposes here
cd /cluster/store11/gs.19/agp
ln -s ../ncbi/chr*.agp ../ncbi/chr*.fa.gz .
sed -e "s#MT/NC_001807#NC_001807#" ../ncbi/chrMT.agp > chrM.agp
cat ../ncbi/c22_H2.agp > chr22_h2_hap1.agp
cat ../ncbi/c5_H2.agp > chr5_h2_hap1.agp
cat ../ncbi/c6_COX.agp > chr6_cox_hap1.agp
cat ../ncbi/c6_QBL.agp > chr6_qbl_hap2.agp
cp -p ../ncbi/c22_H2.fa.gz chr22_h2_hap1.fa.gz
cp -p ../ncbi/c5_H2.fa.gz chr5_h2_hap1.fa.gz
cp -p ../ncbi/c6_COX.fa.gz chr6_cox_hap1.fa.gz
cp -p ../ncbi/c6_QBL.fa.gz chr6_qbl_hap2.fa.gz
mkdir sav
cp -p *hap*.agp sav
# fix hap type agp files that have multiple contigs.
fixAgp hg18 sav/chr5_h2_hap1.agp chr5_h2_hap1.agp
fixAgp hg18 sav/chr6_qbl_hap2.agp chr6_qbl_hap2.agp
# PLEASE NOTE THAT THESE TWO CORRECTED .agp FILES ABOVE ARE USED LATER,
# NOT BY THE NEXT STEP IMMEDIATELY.
# Put all the agp files together into one.
# The chrM sequence now has its own agp, remove it from
# ref_placed.agp
# sed -e "/^NC_001807/d" ../ncbi/ref_placed.agp > ref_placed.agp
# PLEASE NOTE THAT THE ORIGINAL NCBI .agp FILES FOR THOSE
# SPECIAL HAP TYPE SEQUENCES ARE USED, NOT THE CORRECTED ONES.
cd /cluster/store11/gs.19/build36
cat ../ncbi/ref_placed.agp ../agp/chrM.agp ../ncbi/ref_unplaced.agp \
../ncbi/c22_H2.agp \
../ncbi/c5_H2.agp \
../ncbi/c6_COX.agp \
../ncbi/c6_QBL.agp \
../ncbi/PAR.agp > ncbi_build36.agp
# cat ../ncbi/ref_placed.agp ../agp/chrM.agp ../ncbi/ref_unplaced.agp \
# ../agp/chr22_h2_hap1.agp ../agp/chr5_h2_hap1.agp \
# ../agp/chr6_cox_hap1.agp ../agp/chr6_qbl_hap2.agp \
# ../ncbi/PAR.agp > ncbi_build36.agp
zcat ../ncbi/chrMT.fa.gz | \
sed -e "s/gi|17981852|ref|NC_001807.4/ref|NC_001807/" | \
gzip > chrM.fa.gz
# and into ffa
cd /cluster/store11/gs.19/ffa
# NO LONGER TRUE FOR GS19!
# There is a single bogus line at the end of ref_placed.fa.gz
# declaring the NC_001807 MT sequence; this was later replaced by
# chrMT.fa.gz, so remove that one line:
zcat ../ncbi/ref_placed.fa.gz | sed -e "/^>ref|NC_001807/d" | \
gzip > ref_placed.fa.gz
# (That's a 40 minute job)
# sequence.inf is usually here, symlink it
#ln -s ../ncbi/sequence.inf
ln -s ../ncbi/chromosome_extents.inf
# put all the fa.gz files together in one big fa.gz
# time zcat ref_placed.fa.gz ../agp/chrM.fa.gz ../ncbi/ref_unplaced.fa.gz \
time zcat ../ncbi/ref_placed.fa.gz ../ncbi/ref_unplaced.fa.gz \
../agp/*hap?.fa.gz ../ncbi/PAR.fa.gz | gzip \
> ncbi_build36.fa.gz
# Make a listing of all the fasta record headers, just FYI:
cd /cluster/store11/gs.19
zcat ffa/ncbi_build36.fa.gz | grep "^>" > ncbi.fa.headers
# Sanity check, checkYbr was updated to handle the NC_ identifier
cd /cluster/store11/gs.19/build36
zcat ../ffa/ncbi_build36.fa.gz | $HOME/bin/i386/checkYbr ncbi_build36.agp stdin seq_contig.md >check.seq_contig
# result should be clean:
cat check.seq_contig
# Read 378 contigs from ncbi_build36.agp
# Verifying sequence sizes in stdin
# 0 problems detected
# Convert fa files into UCSC style fa files and place in "contigs"
# directory inside the gs.19/build36 directory
# (a check that can be done here is make a list of the contigs
# in this ./contigs directory before and compare it with the
# list of distributed contigs created after they have been
# disbursed.)
# faNcbiToUcsc was fixed to handle the NC_ identifier
cd /cluster/store11/gs.19/build36
# We've been through this often
# mv contigs contigs.0
zcat ../ffa/ncbi_build36.fa.gz | $HOME/bin/i386/faNcbiToUcsc \
-split -ntLast stdin contigs
# If you want to compare anything to previous work, check now, then:
# rm -fr contigs.0
# Determine the chromosome sizes from agps
# Watch carefully how chrY gets constructed. I'm not sure
# this chrom_sizes represents the whole length of chrY with
# the PAR added. We will see about that.
# Script updated to handle new chrom names:
# my @chroms = (1 .. 22, 'X', 'Y', 'M', '6_hla_hap1', '6_hla_hap2');
cd /cluster/store11/gs.19/build36
/cluster/bin/scripts/getChromSizes ../agp
# Create chrom.lst list for use in foreach() loops
awk '{print $1}' chrom_sizes | sed -e "s/chr//" > chrom.lst
# Create lift files (this will create chromosome directory structure) and
# inserts file
/cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md .
# Create contig agp files (will create contig directory structure)
/cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build36.agp .
# Create chromosome random agp files.
/cluster/bin/scripts/createNcbiChrAgp -randomonly .
# Copy the original chrN.agp files from the gs.19/agp directory
# into each of the chromosome directories since they contain better
# gap information. Delete the comments at top from these.
cd /cluster/store11/gs.19/build36
foreach c ( `cat chrom.lst` )
sed -e "/^#.*/d" ../agp/chr${c}.agp > ./${c}/chr${c}.agp
end
# chrM needs a name fixup
sed -e "s#NC_001807#chrM#" ../agp/chrM.agp > M/chrM.agp
# Distribute contig .fa to appropriate directory (assumes all files
# are in "contigs" directory).
# Create inserts file from agp and lift files (new - added by Terry, 2004-07-12)
/cluster/bin/scripts/createInserts /cluster/data/hg18 > /cluster/data/hg18/inserts
# create global data link for everyone. No more home directory
# links required.
ln -s /cluster/store11/gs.19/build36 /cluster/data/hg18
cd /cluster/data/hg18
/cluster/bin/scripts/distNcbiCtgFa contigs .
# Verify that everything was moved properly, the contigs directory
# should be empty:
ls contigs
# Nothing there, then remove it
rmdir contigs
# Make a list of the contigs for use later
rm contig.lst
touch contig.lst
foreach chrom ( `cat chrom.lst` )
foreach c ( $chrom/N{C,G,T}_?????? )
set contig = $c:t
echo "${chrom}/${contig}/${contig}.fa" >> contig.lst
end
end
# For later comparisons, this is how many contigs we have:
wc -l contig.lst
# 378 contig.lst
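# (Optional sanity check, not part of the original procedure: verify
# that every path listed in contig.lst actually exists on disk.)
foreach f ( `cat contig.lst` )
if (! -e $f) echo "MISSING: $f"
end
# no output expected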
# Note 2004-06-30 - there are some clone numbers left in some of
# the NCBI files that are incorrect. Due to version number
# changes, more than one version is listed. Namely for accession
# numbers: AC004491 AC004921 AC004983 AC005088 AC006014 AC099654
# The AGP files are correct, the sequence.inf file lists these
# twice: AC004491.1 AC004491.2
# AC004921.1 AC004921.2 AC004983.2 AC004983.3
# AC005088.2 AC005088.3 AC006014.2 AC006014.3
# AC099654.4 AC099654.5
# for hg18, NCBI did not provide the seq.inf file.
# FILES ARE NOW READY FOR REPEAT MASKING - start that process as
# other steps here can proceed in parallel.
# Previous practice used to copy everything over for jkStuff from a
# previous build. Rather than do that, pick up whatever is needed
# at the time it is needed and verify that it is going to do what
# you expect.
cd /cluster/data/hg18
mkdir jkStuff
# Create the contig.gl files - XXX - NCBI doesn't deliver
# contig_overlaps.agp - 2004-06-18 - this is beginning to come
# together and there is now a contig_overlaps.agp file
# This is properly done below with a combination of psLayout
# alignments to create the contig_overlaps.agp file
# /cluster/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md
# Create chromosome gl files
# jkStuff/liftGl.csh contig.gl
# CREATING DATABASE (DONE - 2005-11-30 - Fan)
ssh hgwdev
# Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
# Filesystem Size Used Avail Use% Mounted on
# /dev/sdc1 1.8T 1.3T 356G 79% /var/lib/mysql
# Create the database.
hgsql -e 'create database hg18' mysql
# Copy over grp table (for track grouping) from another database:
hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp" hg18
# The DB updates to grp below are not needed since we copied from hg17.
# ENCODE groups
# Added 2005-08-16 kate
echo 'UPDATE grp SET priority=7 WHERE name="varRep"'| hgsql hg18
echo 'UPDATE grp SET priority=8 WHERE name="encode"'| hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeGenes", "ENCODE Regions and Genes", 8.1)' | hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeTxLevels", "ENCODE Transcript Levels", 8.2)' | hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChip", "ENCODE Chromatin Immunoprecipitation", 8.3)' | hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChrom", "ENCODE Chromosome, Chromatin and DNA Structure", 8.4)' | hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeCompGeno", "ENCODE Comparative Genomics", 8.5)' | hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeVariation", "ENCODE Variation", 8.6)' | hgsql hg18
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeAnalysis", "ENCODE Analysis", 8.9)' | hgsql hg18
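# (Optional, not part of the original steps: eyeball the resulting
# track group ordering.)
hgsql hg18 -e 'select name, label, priority from grp order by priority'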
# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS
# (DONE - 2005-12-02 - Fan)
# Make nib/, unmasked until RepeatMasker and TRF steps are done.
# Do this now so that the chromInfo table will exist and thus the
# trackDb tables can be built in the next step.
# These unmasked nibs will be replaced by the masked nibs after
# repeat mask and trf are done.
ssh kkstore02
cd /cluster/data/hg18
cp /cluster/data/hg17/jkStuff/chrFa.csh jkStuff -p
# Make chr*.fa from contig .fa
# Copied chrFa.sh from hg17/jkStuff, renamed it to chrFa.csh
bash
time ./jkStuff/chrFa.csh
# real 2m34.406s
# user 1m17.405s
# sys 0m16.730s
exit
mkdir nib
foreach c (`cat chrom.lst`)
foreach f ($c/chr${c}{,_random}.fa)
if (-e $f) then
echo "nibbing $f"
/cluster/bin/i386/faToNib $f nib/$f:t:r.nib
endif
end
end
# Make symbolic links from /gbdb/hg18/nib to the real nibs.
ssh hgwdev
mkdir -p /gbdb/hg18/nib
ln -s /cluster/data/hg18/nib/chr*.nib /gbdb/hg18/nib
# Load /gbdb/hg18/nib paths into database and save size info.
cd /cluster/data/hg18
hgsql hg18 < $HOME/kent/src/hg/lib/chromInfo.sql
hgNibSeq -preMadeNib hg18 /gbdb/hg18/nib */chr*.fa
hgsql -N -e "select chrom,size from chromInfo order by chrom" hg18 \
> chrom.sizes
# You can compare this chrom.sizes with the previously created
# chrom_sizes. Should be no difference
sort chrom_sizes > s0
sort chrom.sizes | grep -v random > s1
diff s0 s1
rm s0 s1
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE - 2005-12-06 - Fan)
# dbDb orderKey updated 2005-12-06 - Fan
ssh hgwdev
# reset dbDb orderKey - these have never been ordered properly
# before; this will get them into a sensible order.
hgsql -e 'update dbDb set orderKey=11 where name = "hg17";' \
-h genome-testdb hgcentraltest
hgsql -e 'update dbDb set orderKey=12 where name = "hg16";' \
-h genome-testdb hgcentraltest
hgsql -e 'update dbDb set orderKey=13 where name = "hg15";' \
-h genome-testdb hgcentraltest
hgsql -e 'update dbDb set orderKey=14 where name = "hg13";' \
-h genome-testdb hgcentraltest
# Enter hg18 into hgcentraltest.dbDb so test browser knows about it:
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
VALUES("hg18", "Feb. 2006", "/gbdb/hg18/nib", "Human", \
"chr7:127,664,479-127,689,005", 1, 10, "Human", "Homo sapiens", \
"/gbdb/hg18/html/description.html", 0, 0, "NCBI Build 36.1");' \
-h genome-testdb hgcentraltest
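# (Optional sanity check, not in the original log: confirm the human
# assemblies now sort in the intended order.)
hgsql -h genome-testdb hgcentraltest \
-e 'select name, orderKey, description from dbDb where genome = "Human" order by orderKey'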
# Make trackDb table so browser knows what tracks to expect:
cd ~/kent/src/hg/makeDb/trackDb
cvs up -d -P .
# Edit the makefile to add hg18 in all the right places and do
make update
make alpha
cvs commit makefile
# MAKE LIFTALL.LFT, NCBI.LFT (DONE - 2005-12-07 Fan)
cd /cluster/data/hg18
mkdir -p jkStuff
cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft
# Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly.
# Note: this ncbi.lift will not lift floating contigs to chr_random coords,
# but it will show the strand orientation of the floating contigs
# (grep for '|').
# mdToNcbiLift seq_contig.md jkStuff/ncbi.lft
# XXXX - appears to be unused, not done - Hiram
# REPEAT MASKING (DONE - 2005-12-09 - Fan)
# Record the RM version here:
# as this changes over time and there is no record in the results
ls -l /cluster/bluearc/RepeatMasker
# lrwxrwxrwx 1 angie protein 18 Nov 3 10:40
# /cluster/bluearc/RepeatMasker -> RepeatMasker051101
# beware that you can not actually include the precise single line output
# by this command since it is a CVS ident line and it will get
# changed as this file is checked into CVS. Remove the Id and
# dollar sign business to allow it to stay as it is here.
/cluster/bluearc/RepeatMasker/RepeatMasker | head -1
# RepeatMasker version development-:
# RepeatMasker,v 1.10 2005/11/03 18:39:27 angie Exp
cat /cluster/bluearc/RepeatMasker051101/Libraries/version
# RepBase Update 9.11, RM database version 20050112
# Split contigs, run RepeatMasker, lift results
# This split takes a few minutes
ssh kkstore02
cd /cluster/data/hg18
foreach chrom ( `cat chrom.lst` )
foreach c ( $chrom/N{C,G,T}_?????? )
set contig = $c:t
echo "splitting ${chrom}/${contig}/${contig}.fa"
faSplit size ${chrom}/${contig}/$contig.fa 500000 \
${chrom}/${contig}/${contig}_ \
-lift=${chrom}/${contig}/$contig.lft -maxN=500000
end
end
#- Make the run directory and job list:
cd /cluster/data/hg18
mkdir -p jkStuff
# According to RepeatMasker help file, no arguments are required to
# specify species because its default is set for primate (human)
# This run script saves the .tbl file to be sent to Arian. He uses
# those for his analysis. Sometimes he needs the .cat and .align files for
# checking problems. Krish needs the .align files; they are large.
cat << '_EOF_' > jkStuff/RMHuman
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/hg18/$2
/bin/cp $2 /tmp/hg18/$2/
cd /tmp/hg18/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/hg18/$2/$2.out ./
if (-e /tmp/hg18/$2/$2.align) /bin/cp /tmp/hg18/$2/$2.align ./
if (-e /tmp/hg18/$2/$2.tbl) /bin/cp /tmp/hg18/$2/$2.tbl ./
# if (-e /tmp/hg18/$2/$2.cat) /bin/cp /tmp/hg18/$2/$2.cat ./
/bin/rm -fr /tmp/hg18/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg18/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg18
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMHuman
ssh kkstore02
cd /cluster/data/hg18
mkdir RMRun
rm -f RMRun/RMJobs
touch RMRun/RMJobs
foreach d ( `cat chrom.lst` )
foreach c ( ${d}/N{C,G,T}_*/N{C,G,T}_*_*.fa )
set f = $c:t
set cc = $c:h
set contig = $cc:t
echo /cluster/store11/gs.19/build36/jkStuff/RMHuman \
/cluster/store11/gs.19/build36/${d}/${contig} $f \
'{'check out line+ /cluster/store11/gs.19/build36/${d}/${contig}/$f.out'}' \
>> RMRun/RMJobs
end
end
# We have 5990 jobs in RMJobs:
wc RMRun/RMJobs
# 5990 41930 1127992 RMRun/RMJobs
#- Do the run
ssh pk
cd /cluster/data/hg18/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
#- While that is running, you can run TRF (simpleRepeat) on the small
# cluster. See SIMPLE REPEAT section below
# Completed: 5990 of 5990 jobs
# CPU time in finished jobs: 30661460s 511024.34m 8517.07h 354.88d 0.972 y
# IO & Wait Time: 38038s 633.96m 10.57h 0.44d 0.001 y
# Average job time: 5125s 85.42m 1.42h 0.06d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 6693s 111.55m 1.86h 0.08d
# Submission to last job: 86532s 1442.20m 24.04h 1.00d
# Lift up the split-contig .out's to contig-level .out's
#
# If a mistake is made in the following it would be possible to
# destroy all the RM output. So, just to be paranoid, save all
# the RM output in bluearc for the time being:
ssh kkstore02
cd /cluster/data/hg18
mkdir /cluster/bluearc/hg18/RMOutput
foreach c ( `cat chrom.lst` )
foreach d ( ${c}/N{C,G,T}_* )
set T = /cluster/bluearc/hg18/RMOutput/${d}
mkdir -p ${T}
cd ${d}
set contig = $d:t
cp -p ${contig}_?{,?,??}.fa.out ${T}
cd ../..
echo "${d} done"
end
end
# Make sure we got them all:
# (this count won't work later since there are more *.fa.out files
# after the lifting; to find just the split-contig outputs explicitly:
# find . -name "N?_*_*.fa.out" -print | wc -l )
find . -name "*.fa.out" -print | wc -l
# 5990
find /cluster/bluearc/hg18/RMOutput -type f | wc -l
# 5990
# same count
# OK, now you can try this operation, do it in a script like this
# and save the output of the script for a record of what happened.
cat << '_EOF_' > jkStuff/liftRM.csh
#!/bin/csh -fe
foreach c ( `cat chrom.lst` )
foreach d ( ${c}/N{C,G,T}_* )
cd $d
set contig = $d:t
liftUp $contig.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out
cd ../..
end
end
'_EOF_'
chmod +x jkStuff/liftRM.csh
mkdir scriptsOutput
script lift.log
bash
time jkStuff/liftRM.csh > scriptsOutput/liftRM.1 2>&1
exit
exit
# Check that they all were done:
grep "fa.out" scriptsOutput/liftRM.1 | wc -l
# 5990
# same count as above
#- Lift up RepeatMasker .out files to chromosome coordinates:
# picked up jkStuff/liftOut2.sh from the hg17 build, renamed it to
# liftOut2.csh, and changed the line that does the chrom listing
bash
time ./jkStuff/liftOut2.csh > scriptsOutput/liftOut2 2>&1
# real 0m30.488s
# user 0m24.670s
# sys 0m2.797s
# seems much faster than hg17 ???
# hg17 numbers:
# real 9m46.780s
# user 1m18.900s
# sys 7m33.990s
#- By this point, the database should have been created (above):
ssh hgwdev
cd /cluster/data/hg18
bash
time hgLoadOut hg18 ?/*.fa.out ??/*.fa.out *hap*/*.fa.out > \
scriptsOutput/hgLoadOut 2>&1
# real 9m9.045s
# user 2m19.500s
# sys 0m24.440s
# errors during this load: (there are always a couple of these)
# Strange perc. field -1.2 line 153851 of 2/chr2.fa.out
# Strange perc. field -10423.3 line 174747 of 3/chr3.fa.out
# Strange perc. field -5635.9 line 174747 of 3/chr3.fa.out
# Strange perc. field -259.3 line 174747 of 3/chr3.fa.out
# Strange perc. field -1.4 line 205545 of 4/chr4.fa.out
# Strange perc. field -0.1 line 167690 of 7/chr7.fa.out
# Strange perc. field -1331.2 line 198656 of 7/chr7.fa.out
# Strange perc. field -1460.4 line 198656 of 7/chr7.fa.out
# Strange perc. field -4.2 line 223183 of 7/chr7.fa.out
# Strange perc. field -3192.0 line 60424 of 8/chr8.fa.out
# Strange perc. field -423.4 line 60424 of 8/chr8.fa.out
# Strange perc. field -784.0 line 60424 of 8/chr8.fa.out
# Strange perc. field -0.1 line 52020 of X/chrX.fa.out
# Strange perc. field -4526.7 line 190254 of X/chrX.fa.out
# Strange perc. field -3757.2 line 190254 of X/chrX.fa.out
# Strange perc. field -597.2 line 190254 of X/chrX.fa.out
# Strange perc. field -13030.4 line 137624 of 16/chr16.fa.out
# Strange perc. field -1359.8 line 137624 of 16/chr16.fa.out
# Strange perc. field -2223.5 line 137624 of 16/chr16.fa.out
# Strange perc. field -1.3 line 11573 of 22/chr22.fa.out
# Strange perc. field -12.7 line 69873 of 22/chr22.fa.out
# Verify we have similar results to previous assembly:
# featureBits hg18 rmsk
# 1406290513 bases of 3107677273 (45.252%) in intersection
# featureBits -countGaps hg17 rmsk
# 1390952984 bases of 3095016460 (44.942%) in intersection
# featureBits hg17 rmsk
# 1391378842 bases of 2867328468 (48.525%) in intersection
# featureBits hg16 rmsk
# 1388770568 bases of 2865248791 (48.469%) in intersection
# Now proceed to MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF
# following the SIMPLE REPEAT sections below
# let Rachel know that RepeatMask is done.
# SIMPLE REPEAT [TRF] TRACK (DONE - 2005-12-07 - Fan)
# Copy the contigs, first to the bluearc, then to /iscratch/i
ssh kkstore02
mkdir /cluster/bluearc/hg18
mkdir /cluster/bluearc/hg18/contigs
cd /cluster/data/hg18
foreach ctg ( `cat contig.lst` )
set c = $ctg:t
echo "$ctg > /cluster/bluearc/hg18/contigs/$c"
cp -p $ctg /cluster/bluearc/hg18/contigs/$c
end
# Check how much is there:
# du -hsc /cluster/bluearc/hg18/contigs
# 2.8G /cluster/bluearc/hg18/contigs
exit
# Distribute contigs to /iscratch/i
ssh pk
mkdir -p /san/sanvol1/scratch/hg18/unmaskedContigs
cd /san/sanvol1/scratch/hg18/unmaskedContigs
cp -p /cluster/bluearc/hg18/contigs/* .
ls .
# Verify same amount made it there:
# du -hsc /san/sanvol1/scratch/hg18/unmaskedContigs
# 2.9G /san/sanvol1/scratch/hg18/unmaskedContigs
# Then send them to the other 7 Iservers
# /cluster/bin/iSync
# Go to the small cluster for this business:
ssh pk
mkdir -p /cluster/data/hg18/bed/simpleRepeat
cd /cluster/data/hg18/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << this line makes emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /san/sanvol1/scratch/hg18/unmaskedContigs/*.fa > genome.lst
gensub2 genome.lst single gsub jobList
para create jobList
para try
para check
para push
para check
# Completed: 378 of 378 jobs
# CPU time in finished jobs: 18956s 315.93m 5.27h 0.22d 0.001 y
# IO & Wait Time: 2519s 41.98m 0.70h 0.03d 0.000 y
# Average job time: 57s 0.95m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2345s 39.08m 0.65h 0.03d
# Submission to last job: 2427s 40.45m 0.67h 0.03d
bash
liftUp simpleRepeat.bed /cluster/data/hg18/jkStuff/liftAll.lft \
warn trf/*.bed > lu.out 2>&1
# Load into the database:
ssh hgwdev
cd /cluster/data/hg18/bed/simpleRepeat
/cluster/bin/i386/hgLoadBed hg18 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 629076 elements of size 16
# Compare with previous assembly
featureBits hg18 simpleRepeat
# 56164158 bases of 3107677273 (1.807%) in intersection
# featureBits hg17 simpleRepeat
# 54952425 bases of 2866216770 (1.917%) in intersection
# featureBits hg16 simpleRepeat
# 54320136 bases of 2865248791 (1.896%) in intersection
# GAPS weren't in hg18 yet at this point, after gaps added:
# featureBits hg18 simpleRepeat
# 54964044 bases of 2867328468 (1.917%) in intersection
# featureBits -countGaps hg18 simpleRepeat
# 54964044 bases of 3096628158 (1.775%) in intersection
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir microsat
cd microsat
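# The awk filter below keeps perfect di- and tri-nucleotide repeats:
# in the 16-column simpleRepeat bed (fields as in simpleRepeat.sql,
# which has no bin column in the bed file), $5 = period size,
# $6 = copy count, $8 = perMatch, $9 = perIndel and $16 = repeat motif,
# so this selects period 2 or 3, >= 15 copies, 100% match, no indels,
# and names each item e.g. "21xCA".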
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
/cluster/bin/i386/hgLoadBed hg18 microsat microsat.bed
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2005-12-09 - Fan)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kkstore02
mkdir -p /cluster/data/hg18/bed/simpleRepeat
cd /cluster/data/hg18/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
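# (Quick check, a sketch: there should be one filtered file per input
# bed -- 378 files in each directory.)
ls trf | wc -l
ls trfMask | wc -l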
# The 4 lines below were left over from makeHg17.doc.
# EXPERIMENT, at a filter of <= 12, we have coverage:
# 20904399 bases of 2867328468 (0.729%) in intersection
# at a filter of <= 9, we have coverage:
# 19271270 bases of 2867328468 (0.672%) in intersection
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/hg18
mkdir bed/simpleRepeat/trfMaskChrom
foreach c ( `cat chrom.lst` )
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE - 2005-12-09, Fan)
# This used to be done right after RepeatMasking. Now, we mask with
# TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above,
# and after Repeat Masker is complete.
ssh kkstore02
cd /cluster/data/hg18
# Make chr*.fa from contig .fa
# chrFa.csh was already copied from hg17/jkStuff
bash
time ./jkStuff/chrFa.csh > scriptsOutput/chrFa.out 2>&1 &
# real 2m35.734s
# user 1m18.351s
# sys 0m16.596s
# much faster than hg17 numbers as shown below. ???
# old hg17 numbers:
# real 13m18.512s
# user 9m1.670s
# sys 1m7.290s
#- Soft-mask (lower-case) the contig and chr .fa's
time ./jkStuff/makeFaMasked.csh > scriptsOutput/maFaMasked.out 2>&1
# real 8m47.289s
# user 3m45.698s
# sys 1m44.416s
# old hg17 numbers:
# real 29m31.623s
# user 13m49.700s
# sys 5m58.750s
#- Make hard-masked .fa.masked files as well:
time ./jkStuff/makeHardMasked.csh > scriptsOutput/maHardMasked.out 2>&1
# real 5m48.833s
# user 1m41.926s
# sys 0m52.084s
#- Create the bothMasksNib/ directory
time ./jkStuff/makeNib.csh > scriptsOutput/maNib.out 2>&1
# real 2m23.280s
# user 1m6.462s
# sys 0m19.795s
# old hg17 numbers:
# real 14m41.694s
# user 6m28.000s
# sys 1m42.500s
# Make symbolic links from /gbdb/hg18/nib to the real nibs.
ssh hgwdev
cd /cluster/store11/gs.19/build36
mv nib nib.raw
mv bothMasksNib nib
rm /gbdb/hg18/nib/*.nib
ln -s `pwd`/nib/* /gbdb/hg18/nib
# Load /gbdb/hg18/nib paths into database and save size info.
cd /cluster/data/hg18
hgNibSeq -preMadeNib hg18 /gbdb/hg18/nib */chr*.fa
# 3107677273 total bases
# Should be the same size as before
hgsql -N -e "select chrom,size from chromInfo order by chrom" hg18 \
> chrom.sizes.masked
diff chrom.sizes chrom.sizes.masked
# should be no output at all, thus:
rm chrom.sizes.masked
# Copy the masked contig fa to /scratch and /iscratch
# And everything else we will need for blastz runs, etc ...
# Best to do this sequence first to /cluster/bluearc/scratch,
# which is going to be the source for the /scratch copy.
# And then from there to the /iscratch
# Make sure you are on the fileserver for the original source:
ssh kkstore02
mkdir -p /cluster/bluearc/scratch/hg/gs.19/build36
cd /cluster/bluearc/scratch/hg/gs.19/build36
# these copies take less than 2 minutes each
mkdir bothMaskedNibs
cp -p /cluster/data/hg18/nib/*.nib ./bothMaskedNibs
mkdir maskedContigs
foreach chrom ( `cat /cluster/data/hg18/chrom.lst` )
cp -p /cluster/data/hg18/${chrom}/N{C,G,T}_*/N{C,G,T}_??????.fa \
./maskedContigs
echo "done ${chrom}"
end
# make sure you have them all:
ls maskedContigs | wc -l
# 378
wc -l /cluster/data/hg18/contig.lst
# 378
mkdir rmsk
foreach chrom ( `cat /cluster/data/hg18/chrom.lst` )
cp -p /cluster/data/hg18/${chrom}/*.out ./rmsk
echo "done ${chrom}"
end
# Now, go to the destination for /iscratch and copy from the
# bluearc
ssh kkr1u00
mkdir -p /iscratch/i/gs.19/build36
cd /iscratch/i/gs.19/build36
# This takes about 5 minutes
rsync -arlv /cluster/bluearc/scratch/hg/gs.19/build36/ .
bash
time /cluster/bin/iSync
# real 7m27.649s
# request rsync of /cluster/bluearc/scratch to the KiloKluster /scratch
# Ask sysadmin to bring up BLAT server.
# update central dbDb table to add the new blat server entry
echo 'INSERT INTO blatServers (db, host, port, isTrans) \
VALUES ("hg18", "blat19", "17778", "1"); \
INSERT INTO blatServers (db, host, port, isTrans) \
VALUES ("hg18", "blat19", "17779", "0");' \
| hgsql -h genome-testdb hgcentraltest
# LOAD ctgPos table - Contig position track
# After fixing up hgCtgPos to accept the -chromLst argument, simply:
cd /cluster/data/hg18
hgCtgPos -chromLst=chrom.lst hg18 .
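# (Optional check, not in the original log: ctgPos is expected to have
# one row per contig, matching the 378 contigs counted earlier.)
hgsql -N -e 'select count(*) from ctgPos' hg18
wc -l contig.lst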
# GOLD AND GAP TRACKS (DONE - 2005-12-10 - Fan)
# (RE-DONE - 2006-04-06 - Fan)
ssh hgwdev
cd /cluster/data/hg18
# manually edit the 4 haplotype .agp files to change the first col from
# contig IDs into chrom name.
hgGoldGapGl -noGl -chromLst=chrom.lst hg18 /cluster/data/hg18 .
# Disappointing to see this create so many tables ...
# _gap and _gold for each chrom
# contig.gl ... section skipped for the time being. (Fan 2005-12-13).
#############################################################################
# GC5BASE (DONE - 2005-12-13 - Fan)
ssh kkstore02
mkdir -p /cluster/data/hg18/bed/gc5Base
cd /cluster/data/hg18/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 hg18 \
/cluster/data/hg18/nib | wigEncode stdin gc5Base.wig gc5Base.wib
# runs for about 17 minutes
# load database
ssh hgwdev
cd /cluster/data/hg18/bed/gc5Base
mkdir /gbdb/hg18/wib
ln -s `pwd`/gc5Base.wib /gbdb/hg18/wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 gc5Base gc5Base.wig
# verify index is correct:
hgsql hg18 -e "show index from gc5Base;"
# should see good numbers in Cardinality column
#########################################################################
# GENBANK auto update (DONE 2005-12-13 Fan)
# align with revised genbank process. drop xeno ESTs.
cd ~/kent/src/hg/makeDb/genbank
cvs update -d etc
# edit etc/genbank.conf to add hg18
# hg18
hg18.serverGenome = /cluster/data/hg18/nib/chr*.nib
hg18.clusterGenome = /scratch/hg/gs.18/build36/bothMaskedNibs/chr*.nib
hg18.ooc = /scratch/hg/h/11.ooc
hg18.lift = /cluster/store11/gs.19/build36/jkStuff/liftAll.lft
hg18.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter}
hg18.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter}
hg18.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
hg18.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter}
#hg18.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter}
#hg18.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter}
#hg18.genbank.est.xeno.load = yes
hg18.refseq.mrna.xeno.load = yes
hg18.refseq.mrna.xeno.loadDesc = yes
hg18.mgcTables.default = full
hg18.mgcTables.mgc = all
hg18.downloadDir = hg18
### NOTE: in the future, enable orfeome tracks as part of this (markd)
# update /cluster/data/genbank/
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial hg18 &
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad hg18&
# CPGISLANDS (DONE - 2005-12-14 - Fan)
ssh hgwdev
mkdir -p /cluster/data/hg18/bed/cpgIsland
cd /cluster/data/hg18/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal at watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
# gcc readseq.c cpg_lh.c -o cpglh.exe
mv cpglh.exe /cluster/data/hg18/bed/cpgIsland/
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
ssh kkstore02
cd /cluster/data/hg18/bed/cpgIsland
foreach f (../../*/chr*.fa.masked)
set fout=$f:t:r:r.cpg
echo running cpglh on $f to $fout
./cpglh.exe $f > $fout
end
# the warnings:
# Bad char 0x52 = 'R' at line 2046, base 102229, sequence chr16_random
# Bad char 0x4d = 'M' at line 1216113, base 60805573, sequence chr3
# Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3
# Bad char 0x52 = 'R' at line 1216118, base 60805801, sequence chr3
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
# Input columns:
# chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected
# chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94
# Output columns:
# chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp
# chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
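# (Quick sanity check, a sketch: the awk reformatting should neither
# drop nor add lines.)
cat chr*.cpg | wc -l
wc -l cpgIsland.bed
# the two counts should agree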
ssh hgwdev
cd /cluster/data/hg18/bed/cpgIsland
hgLoadBed hg18 cpgIslandExt -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 28226 elements of size 10
# Sorted
# Saving bed.tab
# Loading hg18
########################################################################
# PRODUCING GENSCAN PREDICTIONS (DONE - 2005-12-16 - Fan)
# RELOADED PEPTIDE TABLE, GENSCANPEP (DONE, 2006-07-11, hartera)
ssh hgwdev
mkdir /cluster/data/hg18/bed/genscan
cd /cluster/data/hg18/bed/genscan
cvs co hg3rdParty/genscanlinux
ssh kkstore02
cd /cluster/data/hg18/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the contigs
# *that are not entirely Ns* (pure-N contigs, due to heterochromatin or
# unsequenceable stuff, would make genscan run forever).
rm -f genome.list
bash
for f in `cat /cluster/data/hg18/contig.lst`
do
egrep '[ACGT]' /cluster/data/hg18/$f.masked > /dev/null
if [ $? = 0 ]; then
echo /cluster/data/hg18/$f.masked >> genome.list
fi
done
# exit your bash shell if you are [t]csh ...
# This egrep matched all the contigs in hg18; evidently none of
# them are entirely Ns at this point.
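# (Optional check, a sketch: since no contig was all Ns, genome.list
# should cover all 378 contigs.)
wc -l genome.list /cluster/data/hg18/contig.lst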
# Log into kki (not kk!). kki is the driver node for the small
# cluster (kkr2u00-kkr8u00). Genscan has problems running on the
# big cluster due to limited memory and swap space on each
# processing node.
ssh kki
cd /cluster/data/hg18/bed/genscan
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 377 of 378 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 78976s 1316.27m 21.94h 0.91d 0.003 y
# IO & Wait Time: 4961s 82.68m 1.38h 0.06d 0.000 y
# Average job time: 223s 3.71m 0.06h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3491s 58.18m 0.97h 0.04d
# Submission to last job: 7541s 125.68m 2.09h 0.09d
# Running the single failed job by hand (on kkr7u00) with a smaller window:
ssh kkr7u00.kilokluster.ucsc.edu
/cluster/bin/x86_64/gsBig /cluster/data/hg18/5/NT_006576/NT_006576.fa.masked \
gtf/NT_006576.fa.gtf -trans=pep/NT_006576.fa.pep \
-subopt=subopt/NT_006576.fa.bed -exe=hg3rdParty/genscanlinux/genscan \
-par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2000000
# If there were out-of-memory problems (run "para problems"), then
# re-run those jobs by hand but change the -window arg from 2400000
# to something lower. In build33, this was 22/NT_011519
# In build34 there were NO failures !
# Convert these to chromosome level files as so:
ssh kkstore02
cd /cluster/data/hg18/bed/genscan
$HOME/bin/i386/liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/N*.gtf
$HOME/bin/i386/liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft \
warn subopt/N*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/hg18/bed/genscan
ldHgGene hg18 genscan genscan.gtf
# Reading genscan.gtf
# Read 43122 transcripts in 329799 lines in 1 files
# 43122 groups 49 seqs 1 sources 1 feature types
# 43122 gene predictions
hgPepPred hg18 generic genscanPep genscan.pep
# Processing genscan.pep
hgLoadBed hg18 genscanSubopt genscanSubopt.bed
# Reading genscanSubopt.bed
# Loaded 514065 elements of size 6
# Sorted
# Creating table definition for
# Saving bed.tab
# Loading hg18
# featureBits hg18 genscan
# 56039161 bases of 2881515245 (1.945%) in intersection
# featureBits hg17 genscan
# 55323340 bases of 2866216770 (1.930%) in intersection
# featureBits hg16 genscan
# 55333689 bases of 2865248791 (1.931%) in intersection
# featureBits hg18 genscanSubopt
# 55685959 bases of 2881515245 (1.933%) in intersection
# featureBits hg17 genscanSubopt
# 55986178 bases of 2866216770 (1.953%) in intersection
# featureBits hg16 genscanSubopt
# 56082952 bases of 2865248791 (1.957%) in intersection
# Should be zero intersection with rmsk
# featureBits -chrom=chr1 hg18 genscan rmsk
# Reload genscanPep table - requested by a user. It has been dropped
# from hgwdev.
# (hartera, 2006-07-11)
ssh hgwdev
cd /cluster/data/hg18/bed/genscan
hgPepPred hg18 generic genscanPep genscan.pep
############################################################################
# CREATE 2 BIT FILE (DONE 12/20/05, Fan)
ssh kkstore02
cd /cluster/data/hg18
faToTwoBit */chr*.fa hg18.2bit
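# (Optional verification, not in the original log: the 2bit sequence
# sizes should match chrom.sizes exactly.)
twoBitInfo hg18.2bit hg18.2bit.sizes
sort hg18.2bit.sizes > s0
sort chrom.sizes > s1
diff s0 s1
# no output expected
rm s0 s1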
# BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR
# ZEBRAFISH (danRer3) (DONE, 2005-12-23, hartera)
ssh pk
# Blastz uses lineage-specific repeats. There are none for mouse
# and fish so use all repeats for each species as lineage-specific.
mkdir -p /san/sanvol1/scratch/hg18/linSpecRep.notInOthers
foreach f (/cluster/bluearc/hg18/linSpecRep/notInOthers/chr*.out.spec)
cp -p $f /san/sanvol1/scratch/hg18/linSpecRep.notInOthers/
end
# get only lineage specific repeats for chr1-25 and chrM
mkdir -p /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
foreach f (/cluster/data/danRer3/*/chr[0-9M]*.fa.out)
cp -p $f \
/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers/$f:t:r:r.out.spec
end
# make a nib dir without random chroms
mkdir -p /san/sanvol1/scratch/hg18/chromNib
cp -p /cluster/data/hg18/nib/chr*.nib \
/san/sanvol1/scratch/hg18/chromNib
rm /san/sanvol1/scratch/hg18/chromNib/chr*_random.nib
# make a nib dir that is also just chr1-25 and chrM
mkdir -p /san/sanvol1/scratch/danRer3/chromNib
cp /cluster/data/danRer3/nib/chr[0-9M]*.nib \
/san/sanvol1/scratch/danRer3/chromNib
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.danRer3.2005-12-17
cd /cluster/data/hg18/bed
ln -s blastz.danRer3.2005-12-17 blastz.danRer3
# Three separate runs done to create chains. Runs 1 and 3 could be
# combined into one.
# RUN 1: hg18 chroms (no randoms) vs danRer3 chr1-25 and chrM using
# lineage-specific repeats.
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.danRer3
# make run dir
mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun
# make out dir
mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut
cd chromsRun
# use parameters as for hg17 vs danRer2 - see makeHg17.doc
cat << '_EOF_' > DEF
# human (hg18) vs zebrafish (danRer3)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human (hg18)
SEQ1_DIR=/san/sanvol1/scratch/hg18/chromNib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/san/sanvol1/scratch/hg18/linSpecRep.notInOthers
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer3)
# just chroms 1-25 and chrM
SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
SEQ2_LIMIT=30
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1chroms.len
SEQ2_LEN=$BASE/S2chroms.len
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
grep -v random /cluster/data/hg18/chrom.sizes > S1chroms.len
grep -v chrUn /cluster/data/danRer3/chrom.sizes \
| grep -v chrNA > S2chroms.len
# do blastz and create chains for danRer3 chr1-25 and chrM using
# all repeats as lineage-specific repeats.
# chickenHumanTuned.gap scoring matrix is now used by axtChain if the
# linearGap parameter is set to "loose".
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut \
-chainMinScore=5000 \
-chainLinearGap loose \
-stop chainRun `pwd`/DEF >& doChains.log &
# Took 2 hours 45 minutes to run.
# Then run the human hg18 chroms and randoms vs danRer3 chrUn and chrNA
ssh hgwdev
# get a file of contig sequences for the hg18 randoms. Use the Table Browser
# to select sequence from the whole genome for the ctgPos table of contigs,
# restricting to chrom like "%_random" in the Free-form query box of
# the filter; save the output as hg18RandomContigs.fa
cd /cluster/data/hg18/bed/blastz.danRer3
# get the position and contig name from the ctgPos table
hgsql -N -e 'select chrom, chromStart, chromEnd, contig from ctgPos \
where chrom like "%_random";' hg18 > contigPosAndNames.txt
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.danRer3
# change header to just the position
perl -pi.bak -e 's/>.+range=(chr[0-9XY]+_random:[0-9]+\-[0-9]+).+/>$1/' \
hg18RandomContigs.fa
awk '{print "perl -pi.bak -e s/"$1":"$2+1"-"$3"/"$4"/ hg18RandomContigs.fa"}' \
contigPosAndNames.txt > addContigNames.csh
chmod +x addContigNames.csh
# run script
addContigNames.csh
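# (Quick check, a sketch: every fasta header should now be a contig
# name -- both counts below should be 88 and the grep for leftover
# chr*_random headers should find nothing.)
grep -c "^>" hg18RandomContigs.fa
wc -l contigPosAndNames.txt
grep "^>chr" hg18RandomContigs.fa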
ssh hgwdev
# make a 2 bit file of the chroms and random scaffolds
cd /cluster/data/hg18
set dir=/san/sanvol1/scratch/hg18
faToTwoBit [1-9]/chr[1-9].fa [12][0-9]/chr[12][0-9].fa M/chrM.fa \
X/chrX.fa Y/chrY.fa *hap[12]/chr*.fa \
/cluster/data/hg18/bed/blastz.danRer3/hg18RandomContigs.fa \
$dir/chromsAndRandoms.2bit
twoBitInfo $dir/chromsAndRandoms.2bit $dir/chromsAndRandoms.len
# make a 2 bit file for just the random scaffolds
faToTwoBit /cluster/data/hg18/bed/blastz.danRer3/hg18RandomContigs.fa \
$dir/randoms.2bit
twoBitInfo $dir/randoms.2bit $dir/randoms.len
# make sure all the random chroms contigs are included - should be 88.
# make a 2 bit file for all the chroms and random chroms, make sure to
# get the haplotype chrom sequences.
faToTwoBit [1-9MXY]/chr*.fa [12][0-9]/chr*.fa *hap[12]/chr*.fa \
$dir/hg18.2bit
twoBitInfo $dir/hg18.2bit $dir/hg18Chroms.len
twoBitInfo /san/sanvol1/scratch/danRer3/danRer3.2bit \
/san/sanvol1/scratch/danRer3/danRer3Chroms.len
# make file of scaffolds lengths for NA and Un scaffolds
twoBitInfo \
/san/sanvol1/scratch/danRer3/scaffoldsNAandUn/danRer3NAandUnScaf.2bit \
/san/sanvol1/scratch/danRer3/scaffoldsNAandUn/NAandUnScafs.len
cd /cluster/data/hg18/bed/blastz.danRer3
# make a lift file for the hg18 randoms contigs
cat /cluster/data/hg18/*/lift/random.lft >> $dir/randomContigs.lft
# RUN 2: hg18 chroms and random chroms contigs vs danRer3 chrNA and
# chrUn scaffolds with no lineage-specific repeats as there are too
# many scaffolds in chrNA and chrUn. Use the dynamic masking function
# of Blastz instead.
# make run dir
mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsRun
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsRun
# make out dir
mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsOut
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsOut
cd chromsAndRandomsRun
# use parameters similar to hg17 vs danRer2 - see makeHg17.doc
# As lineage-specific repeats can not be used with chrUn and chrNA
# scaffolds, then use dynamic masking, M=50.
cat << '_EOF_' > DEF
# human (hg18) vs zebrafish (danRer3)
# human chroms and random chrom contigs vs zebrafish chrNA and chrUn scaffolds
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
# Reuse some parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg18)
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/hg18/chromsAndRandoms.2bit
SEQ1_LIFT=/san/sanvol1/scratch/hg18/randomContigs.lft
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
# 500 kb target with 5 kb overlap
SEQ1_CHUNK=500000
SEQ1_LAP=5000
# QUERY: Zebrafish (danRer3)
# just scaffolds for chrUn and chrNA
SEQ2_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
SEQ2_CTGDIR=/san/sanvol1/scratch/danRer3/scaffoldsNAandUn/danRer3NAandUnScaf.2bit
SEQ2_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1000000000
SEQ2_LAP=0
BASE=/san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsRun
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=/san/sanvol1/scratch/hg18/hg18Chroms.len
SEQ1_CTGLEN=/san/sanvol1/scratch/hg18/chromsAndRandoms.len
SEQ2_LEN=/san/sanvol1/scratch/danRer3/danRer3Chroms.len
SEQ2_CTGLEN=/san/sanvol1/scratch/danRer3/scaffoldsNAandUn/NAandUnScafs.len
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
# do blastz and create chains for human chroms and random chroms in contigs
# vs zebrafish danRer3 chrNA and chrUn in scaffolds without
# lineage-specific repeats but using blastz's dynamic masking.
# chickenHumanTuned.gap scoring matrix is now used by axtChain if the
# linearGap parameter is set to "loose".
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/chromsAndRandomsOut \
-chainMinScore=5000 \
-chainLinearGap loose \
-stop chainRun `pwd`/DEF >& doChains.log &
# Took about 15 hours to finish.
ssh hgwdev
# Try running hg18 random chroms in contigs vs danRer3 chroms 1-25 and chrM
# with lineage-specific repeats.
# make directory of human contigs repeats to serve as lineage-specific
# repeats for the random chroms contigs.
mkdir -p /san/sanvol1/scratch/hg18/linSpecRepRandoms.notInOthers
cd /cluster/data/hg18/bed/blastz.danRer3
awk '{print $4}' contigPosAndNames.txt > contigNames.txt
foreach c (`cat contigNames.txt`)
foreach f (/cluster/data/hg18/*/${c}/${c}.fa.out)
cp -p $f \
/san/sanvol1/scratch/hg18/linSpecRepRandoms.notInOthers/$f:t:r:r.out.spec
end
end
# RUN 3: hg18 random chroms contigs vs danRer3 chr1-25 and chrM using
# lineage-specific repeats.
# make run dir
mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/randomsRun
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/randomsRun
# make out dir
mkdir -p /san/sanvol1/scratch/hg18/blastzDanRer3/randomsOut
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/randomsOut
set dir=/san/sanvol1/scratch
cp $dir/hg18/blastzDanRer3/chromsRun/S2chroms.len \
$dir/danRer3/chr1to25andM.len
# make nib dir for random contigs for hg18
mkdir -p $dir/hg18/randomContigsNib
foreach c (`cat contigNames.txt`)
foreach f (/cluster/data/hg18/*/${c}/${c}.fa)
faToNib -softMask $f $dir/hg18/randomContigsNib/$f:t:r.nib
end
end
cd randomsRun
cat << '_EOF_' > DEF
# human (hg18) vs zebrafish (danRer3)
# human random chrom contigs vs zebrafish chr1-25 and chrM
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
# Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human (hg18)
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/hg18/randomContigsNib
SEQ1_LIFT=/san/sanvol1/scratch/hg18/randomContigs.lft
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/san/sanvol1/scratch/hg18/linSpecRepRandoms.notInOthers
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer3)
# just chr1-25 and chrM
SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_LIMIT=30
SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/san/sanvol1/scratch/hg18/blastzDanRer3/randomsRun
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=/san/sanvol1/scratch/hg18/hg18Chroms.len
SEQ1_CTGLEN=/san/sanvol1/scratch/hg18/randoms.len
SEQ2_LEN=/san/sanvol1/scratch/danRer3/chr1to25andM.len
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
# do blastz and create chains for human random chroms in contigs
# vs zebrafish danRer3 chroms 1 to 25 and chrM using all repeats
# as lineage-specific repeats.
# chickenHumanTuned.gap scoring matrix is now used by axtChain if the
# linearGap parameter is set to "loose".
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/randomsOut \
-chainMinScore=5000 \
-chainLinearGap loose \
-stop chainRun `pwd`/DEF >& doChains.log &
# Took 15 minutes.
# chains are sorted by score so move into one directory and use
# chainMergeSort
ssh kolossus
set blastzDir=/cluster/data/hg18/bed/blastz.danRer3
cd $blastzDir/chromsRun/axtChain
mkdir -p chainsNotMerged
foreach r (chromsRun chromsAndRandomsRun randomsRun)
nice cp -p ${blastzDir}/${r}/axtChain/run/chain/*.chain \
${blastzDir}/chromsRun/axtChain/chainsNotMerged/
end
nice chainMergeSort ./chainsNotMerged/*.chain | nice gzip -c \
> hg18.danRer3.all.chain.gz
# split into chains by chrom
nice zcat hg18.danRer3.all.chain.gz | chainSplit chain stdin
# check chains: there are 48 but there should be 49. Chains for chr11_random
# are missing. These sequences have a lot of repeats in the regions that
# hit danRer3 with BLAT.
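# (A sketch of how the missing chrom can be identified -- not from the
# original log, using only standard tools.)
awk '{print $1}' /cluster/data/hg18/chrom.sizes | sort > allChroms.lst
ls chain | sed -e 's/\.chain$//' | sort > chainChroms.lst
comm -23 allChroms.lst chainChroms.lst
# expect chr11_random only
rm allChroms.lst chainChroms.lst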
# carry on with doBlastzChainNet.pl starting from net step
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.danRer3/chromsRun
mv DEF DEF.chroms
# edit DEF to give hg18.2bit as the SEQ1_DIR and danRer3.2bit as SEQ2_DIR
# and remove lineage-specific repeats.
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-blastzOutRoot /san/sanvol1/scratch/hg18/blastzDanRer3/chromsOut \
-chainMinScore=5000 \
-chainLinearGap loose \
-continue net `pwd`/DEF >& doNetAndDownloads.log &
# Took about 25 minutes.
# crashed on the ssh -X to sanhead1 during the cleanup step, so re-run the script:
cleanUp.csh
# copy chainDanRer3.html and netDanRer3.html to
# kent/src/hg/makeDb/trackDb/human/hg18/ and edit to describe method used.
# Add tracks to trackDb.ra there. Edit README.txt in the downloads
# directory to describe method used for alignments.
# featureBits -chrom=chr1 hg18 refGene:cds chainDanRer3Link -enrichment
# refGene:cds 1.378%, chainDanRer3Link 2.601%, both 0.927%, cover 67.26%,
# enrich 25.86x
# featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 1.386%, chainDanRer2Link 2.742%, both 0.909%, cover 65.58%,
# enrich 23.91x
# So similar coverage and enrichment to hg17 vs danRer2 chains.
#########################################################################
# BLASTZ MOUSE Mm7 second time (DONE - 2005-12-24 - 2005-12-25 Fan)
# After fixing a bug in the lineage specific repeat snip business
# in blastz-run-ucsc script
ssh pk
mkdir /cluster/data/hg18/bed/blastzMm7.2005-12-24
cd /cluster/data/hg18/bed
rm blastz.mm7
ln -s blastzMm7.2005-12-24 blastz.mm7
cd blastzMm7.2005-12-24
cat << '_EOF_' > DEF
# human vs mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/scratch/hg/hg18/linSpecRep/notInMouse
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
# QUERY: Mouse Mm7 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/mm7/nib
SEQ2_SMSK=/scratch/hg/mm7/linSpecRep/notInHumanDogCow
SEQ2_LEN=/cluster/bluearc/mm7/chrom.sizes
SEQ2_CHUNK=3000000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzMm7.2005-12-24
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
`pwd`/DEF > to-load.out 2>&1 &
# Started 2005-12-24 06:15
mv to-load.out to-load.out.1
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=chainMerge -stop=load \
`pwd`/DEF > to-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# PLEASE NOTE THAT SOME .OUT FILES MIGHT HAVE BEEN OVERWRITTEN
# DUE TO RETRIES AND/OR NEXT STEP COMMAND NOT FULLY EDITED CORRECTLY.
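# (A sketch only, not done here: time-stamping each log name would avoid the
#  overwriting noted above when a step is retried.)
LOG=doBlastz.`date +%Y%m%d.%H%M%S`.out
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
    -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
    -continue=download \
    `pwd`/DEF > $LOG 2>&1 &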
# Measurements:
ssh hgwdev
featureBits mm7 chainHg18Link
# 990285408 bases of 2583394090 (38.333%) in intersection
featureBits hg18 chainMm7Link
# 991769039 bases of 2881515245 (34.418%) in intersection
# each of the above took about half an hour.
#########################################################################
# BLASTZ CHICKEN GalGal2 second time (DONE - 2005-12-28 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzGalGal2.2005-12-28
cd /cluster/data/hg18/bed
rm blastz.galGal2
ln -s blastzGalGal2.2005-12-28 blastz.galGal2
cd blastzGalGal2.2005-12-28
cat << '_EOF_' > DEF
# human vs chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken GalGal2 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/cluster/bluearc/galGal2/chrom.sizes
SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzGalGal2.2005-12-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started 2005-12-28 10:35
# Two jobs stuck in the same node. Did manual para stop and para push.
# Both finished within a few minutes.
# Done! On Wed Dec 28 15:32:45 PST 2005.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# Had an error at the net step
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=net -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
# the gzip job on kolossus seems not moving at all.
# killed it manually. Try again.
# Seemed not moving, kill it again. Now use pk instead of kolossus.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Wed Dec 28 20:39:44 PST 2005
# Measurements:
ssh hgwdev
nice featureBits galGal2 chainHg18Link
# 91564024 bases of 1054197620 (8.686%) in intersection
nice featureBits hg18 chainGalGal2Link
# 102417858 bases of 2881515245 (3.554%) in intersection
nice featureBits galGal2 chainHg17Link
# 93277286 bases of 1054197620 (8.848%) in intersection
nice featureBits hg17 chainGalGal2Link
# 103882699 bases of 2866216770 (3.624%) in intersection
#########################################################################
# BLASTZ DOG CanFam2 second time (DONE - 2005-12-28 - 2005-12-29 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzCanFam2.2005-12-28
cd /cluster/data/hg18/bed
rm blastz.canFam2
ln -s blastzCanFam2.2005-12-28 blastz.canFam2
cd blastzCanFam2.2005-12-28
cat << '_EOF_' > DEF
# human vs dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for dog (per Webb email to Brian Raney)
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog CanFam2 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_LEN=/cluster/bluearc/canFam2/chrom.sizes
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzCanFam2.2005-12-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started 2005-12-28 21:33
# Two jobs stuck in the same node. Did manual para stop and para push.
# Both finished within a few minutes.
# Done! On Thu Dec 29 05:27:31 PST 2005.
# system seems to hang on kolossus (3 processes of [tcsh -c nice chainMergeSort], not moving)
# manually killed the jobs.
# now use pk as the workhorse.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-continue=chainMerge \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
# Done! Thu Dec 29 09:10:02 PST 2005.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# Had an error at the load step,
# mySQL error 2013: Lost connection to MySQL server during query,
# probably due to sys admin working on network connections,
# continue at the load step
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap -continue=load -stop=load \
`pwd`/DEF > swap-load2.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Dec 29 13:21
# Measurements:
ssh hgwdev
nice featureBits canFam2 chainHg18Link
# 1477551526 bases of 2384996543 (61.952%) in intersection
nice featureBits hg18 chainCanFam2Link
# 1524764349 bases of 2881515245 (52.915%) in intersection
nice featureBits canFam2 chainHg17Link
# 1487483112 bases of 2384996543 (62.368%) in intersection
nice featureBits hg17 chainCanFam2Link
# 1530197469 bases of 2866216770 (53.387%) in intersection
# ENABLE GENBANK UPDATE (1/3/06 Fan)
# add hg18 to the following two files and check them in.
src/hg/makeDb/genbank/etc/align.dbs
src/hg/makeDb/genbank/etc/hgwdev.dbs
# then go to /cluster/data/genbank/etc and do cvs update on these two files.
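# (A hedged sketch of those two steps; the exact cvs invocations may have
#  differed, and the kent tree path and commit message are just illustrative.)
cd $HOME/kent/src/hg/makeDb/genbank/etc
#   add an "hg18" entry to align.dbs and hgwdev.dbs, then:
cvs commit -m "add hg18" align.dbs hgwdev.dbs
ssh hgwdev
cd /cluster/data/genbank/etc
cvs update align.dbs hgwdev.dbs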
#########################################################################
# BLASTZ RAT Rn3 (STARTED - 2005-12-22, DONE 2006-01-05 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzRn3.2005-12-22
cd /cluster/data/hg18/bed
rm blastz.rn3
ln -s blastzRn3.2005-12-22 blastz.rn3
cd blastzRn3.2005-12-22
cat << '_EOF_' > DEF
# human vs rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/scratch/hg/hg18/linSpecRep/notInRat
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole pieces
SEQ2_DIR=/scratch/rat/rn3/softNib
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
SEQ2_LEN=/cluster/bluearc/rat/rn3/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzRn3.2005-12-22
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
`pwd`/DEF > to-load.out 2>&1 &
# start processing again on 12/31/05.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap \
-stop=load \
`pwd`/DEF > swap.out 2>&1 &
# Either UCSC RR and hgwdev systems or network went down around 11 AM 12/31/05.
# After holidays, start again on 1/3/06 and again on 1/5/06.
ssh pk
cd /cluster/data/hg18/bed
cd blastzRn3.2005-12-22
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap \
-continue=net \
-stop=load \
`pwd`/DEF > swap6.out 2>&1 &
# DONE! Jan 5 13:39
# Measurements:
nice featureBits rn3 chainHg18Link
# 962630574 bases of 2571104688 (37.440%) in intersection
nice featureBits hg18 chainRn3Link
# 964251210 bases of 2881515245 (33.463%) in intersection
#########################################################################
# BLASTZ FUGU fr1 (STARTED - 2005-12-20, DONE 2006-01-05 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzFr1.2005-12-20
cd /cluster/data/hg18/bed
ln -s blastzFr1.2005-12-20 blastz.fr1
cd blastzFr1.2005-12-20
cat << '_EOF_' > DEF
# human vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Reuse parameters from human-chicken, except L=6000 (more relaxed)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18 - testing 100,000,000 sized chunk on pk kluster
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
# QUERY: Fugu Fr1 - chunk big enough to run the whole chrom at once
SEQ2_DIR=/san/sanvol1/scratch/fr1/nib
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CHUNK=400000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzFr1.2005-12-20
'_EOF_'
# << happy emacs
# establish a screen to control this job
ssh pk
cd /cluster/data/hg18/bed/blastzFr1.2005-12-20
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -stop=load \
`pwd`/DEF > thruLoad.out 2>&1 &
ssh pk
cd /cluster/data/hg18/bed/blastzFr1.2005-12-20
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -continue=chainMerge -stop=load \
`pwd`/DEF > thruLoad.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -continue=download \
`pwd`/DEF > download.clean.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -swap \
`pwd`/DEF > swap.out 2>&1 &
# Finish the remaining step, 1/4/06.
ssh pk
cd /cluster/data/hg18/bed/blastzFr1.2005-12-20
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 \
-swap -continue=download \
`pwd`/DEF > DownloadSwap.out 2>&1 &
# First try found the DEF had somehow been altered for rn3.
# Re-generated DEF and try again.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 \
-swap -continue=download \
`pwd`/DEF > DownloadSwap2.out 2>&1 &
# Done. Jan 4 09:48.
# measurements
nice featureBits hg18 chainFr1Link
# 51795958 bases of 2881515245 (1.798%) in intersection
nice featureBits hg17 chainFr1Link
#50831650 bases of 2866216770 (1.773%) in intersection
nice featureBits hg18 netFr1
# 691148929 bases of 2881515245 (23.986%) in intersection
nice featureBits hg17 netFr1
# 714234935 bases of 2866216770 (24.919%) in intersection
nice featureBits fr1 chainHg18Link
# 43267869 bases of 315518167 (13.713%) in intersection
nice featureBits fr1 chainHg17Link
# 0 bases of 315518167 (0.000%) in intersection
nice featureBits fr1 netHg18
# 140843080 bases of 315518167 (44.639%) in intersection
nice featureBits fr1 netHg17
# 0 bases of 315518167 (0.000%) in intersection
#########################################################################
# BLASTZ TETRAODON TetNig1 second time (DONE - 2006-01-07 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzTetNig1.2006-01-07
cd /cluster/data/hg18/bed
rm blastz.tetNig1
ln -s blastzTetNig1.2006-01-07 blastz.tetNig1
cd blastzTetNig1.2006-01-07
cat << '_EOF_' > DEF
# human vs tetraodon
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CHUNK=410000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzTetNig1.2006-01-07
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Sat Jan 7 05:40:51 PST 2006
# Encountered an error:
# startStep: 0, at step 5 net to stopStep 6
# netChains: looks like previous stage was not successful (can't find [hg18.tetNig1.]all.chain[.gz]).
# Try it with pk as the workhorse.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-continue=net \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
# Load done. Sat Jan 7 07:34:56 PST 2006
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Sat Jan 7 08:02:14 PST 2006
# The download and swap-download took less than 10 seconds each. ???
# Measurements:
ssh hgwdev
nice featureBits tetNig1 chainHg18Link
# 50026847 bases of 342403326 (14.611%) in intersection
nice featureBits hg18 chainTetNig1Link
# 57654754 bases of 2881515245 (2.001%) in intersection
nice featureBits tetNig1 chainHg17Link
# 34379509 bases of 342403326 (10.041%) in intersection
nice featureBits hg17 chainTetNig1Link
# 35910128 bases of 2866216770 (1.253%) in intersection
#########################################################################
# BLASTZ FROG XenTro1 second time (STARTED 2006-01-06, DONE 2006-01-07 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzXenTro1.2006-01-06
cd /cluster/data/hg18/bed
rm blastz.xenTro1
ln -s blastzXenTro1.2006-01-06 blastz.xenTro1
cd blastzXenTro1.2006-01-06
cat << '_EOF_' > DEF
# human vs frog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Frog XenTro1 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit
SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes
SEQ2_LIMIT=400
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzXenTro1.2006-01-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Fri Jan 6 20:19:30 PST 2006
# Blastz run done. Jan 7 02:07 load.out
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# got the following error:
# startStep: 4, at step 5 net to stopStep 6
# netChains: looks like previous stage was not successful (can't find [xenTro1.hg18.]all.chain[.gz]).
# Try it with pk instead of kolossus:
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load2.out 2>&1 &
# It worked, swap-load done. Jan 7 06:05
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Jan 7 06:18
# Measurements:
ssh hgwdev
nice featureBits xenTro1 chainHg18Link
# 61197900 bases of 1381238994 (4.431%) in intersection
nice featureBits hg18 chainXenTro1Link
# 67810866 bases of 2881515245 (2.353%) in intersection
nice featureBits xenTro1 chainHg17Link
# 81777842 bases of 1381238994 (5.921%) in intersection
nice featureBits hg17 chainXenTro1Link
# 85701475 bases of 2866216770 (2.990%) in intersection
############################################################################
# BLASTZ COW BosTau2 second time (STARTED - 2006-01-07, DONE 2006-01-08 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzBosTau2.2006-01-07
cd /cluster/data/hg18/bed
rm blastz.bosTau2
ln -s blastzBosTau2.2006-01-07 blastz.bosTau2
cd blastzBosTau2.2006-01-07
cat << '_EOF_' > DEF
# human vs cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow BosTau2 - single chunk big enough to run entire genome
SEQ2_DIR=/san/sanvol1/scratch/bosTau2/bosTau2.2bit
SEQ2_LEN=/san/sanvol1/scratch/bosTau2/chrom.sizes
SEQ2_CHUNK=3200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzBosTau2.2006-01-07
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
-workhorse=pk \
`pwd`/DEF > load.out 2>&1 &
# Started Sat Jan 7 07:57:22 PST 2006
# blastz run (and load) done Jan 8 00:13
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# took a long time to finish.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Jan 8 21:10
# Measurements:
ssh hgwdev
nice featureBits bosTau2 chainHg18Link
# 1357027317 bases of 2812203870 (48.255%) in intersection
nice featureBits hg18 chainBosTau2Link
# 1357291762 bases of 2881515245 (47.103%) in intersection
nice featureBits bosTau2 chainHg17Link
# 0 bases of 2812203870 (0.000%) in intersection
nice featureBits hg17 chainBosTau2Link
# 1350076765 bases of 2866216770 (47.103%) in intersection
#######################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2006-01-11 - Fan)
ssh kkstore02
cd /cluster/data/hg18
blat hg18.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
# Wrote 30378 overused 11-mers to 11.ooc
# Copy over to the bluearc
cp -p 11.ooc /cluster/bluearc/hg18
#######################################################################
# PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE
# (DONE - 2006-01-12 - 2006-04-04 - Hiram)
# (RE-DONE 2006-10-31 - Hiram - see section:)
# REWORK PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE
ssh kkstore02
mkdir /cluster/data/hg18/bed/coverage
cd /cluster/data/hg18/bed/coverage
# find all the clones that were used in the assembly
sed -e "/^#.*/d" ../../ncbi_build36.agp | \
awk '{if (!match($5,"N")) {print $6}}' | \
sort -u > placed_in_assembly.list
wc -l placed_in_assembly.list
# 27093 placed_in_assembly.list
# And all possible clones considered for assembly.
# The AADB clones are the Celera assembly, don't want them.
sed -e "/^#.*/d" /cluster/store11/gs.19/ncbi/sequence.inf | \
grep for_assembly | grep -v AADB | awk '{print $1}' | sort -u \
> allButOneClonesConsidered.list
(grep AADB01066164.1 \
/cluster/store11/gs.19/ncbi/sequence.inf | awk '{print $1}'; \
cat allButOneClonesConsidered.list) | sort -u \
> allClonesConsidered.list
# The grep for AADB eliminates a single clone: AADB01066164.1
# Which actually should be in the list since it is in the
# ncbi_build36.agp file. Back in Hg17, this was the only AADB
# clone in the sequence.inf file, now there are 400,673 of them in
# this Hg18 sequence.inf file marked "for_assembly"
# Later after a lot of this was done, it was discovered that some
# of the clones on this allConsidered list are actually obsolete
# and have newer versions in use. They were identified by the
# following perl script:
cat << '_EOF_' > ckMultipleVersions.pl
#!/usr/bin/env perl
use warnings;
use strict;
sub usage() {
print "usage: ./ckMultipleVersions.pl allClonesConsidered.list\n";
exit 255;
}
my $argc = scalar(@ARGV);
if ($argc != 1) { usage; }
my $fileName = shift;
open (FH,"<$fileName") or die "Can not open $fileName";
my %cloneAcc; # key is clone accession major number, value is version
while (my $clone = <FH>) {
chomp $clone;
my ($major, $version) = split('\.', $clone);
if (exists($cloneAcc{$major})) {
my $previousVersion = $cloneAcc{$major};
if ($previousVersion >= $version) {
printf STDERR "$major.$version - obsolete\n";
} else {
printf STDERR "$major.$previousVersion - obsolete\n";
$cloneAcc{$major} = $version;
}
} else {
$cloneAcc{$major} = $version;
}
}
close (FH);
foreach my $major (sort keys %cloneAcc) {
printf "$major.$cloneAcc{$major}\n";
}
'_EOF_'
# happy emacs
chmod +x ckMultipleVersions.pl
./ckMultipleVersions.pl allClonesConsidered.list \
2> obsoleteClone.list > allClones.notObsolete.list
# After this obsolete list was made, those clone results were
# removed from the kluster run hierarchies of results.
# And when we finally got to loading up the coverage track
# 2006-04-04, a few additional ones had crept into the mix.
# These were added to this list at that loading time.
comm -12 allClonesConsidered.list \
/cluster/data/hg17/bed/contig_overlaps/sequence.list \
> allClones.InHg17AndHg18.list
comm -23 allClonesConsidered.list \
/cluster/data/hg17/bed/contig_overlaps/sequence.list \
> allClones.InHg18NotHg17.list
comm -13 allClonesConsidered.list \
/cluster/data/hg17/bed/contig_overlaps/sequence.list \
> allClones.InHg17NotHg18.list
# how many are the same as previous build:
comm -12 /cluster/data/hg17/bed/contig_overlaps/placed_in_assembly.list \
placed_in_assembly.list > sameAsHg17.list
wc sameAsHg17.list
# 26775 26775 300641 sameAsHg17.list
# There is one clone: AADB01066164.1
# Which is listed in allClones.InHg17NotHg18.list
# But it is on the Hg18 placed_in_assembly.list
# And it is on the Hg17 placed_in_assembly.list but it isn't
# actually found in Hg17? Perhaps it didn't align well enough.
comm -23 /cluster/data/hg17/bed/contig_overlaps/placed_in_assembly.list \
placed_in_assembly.list > uniqueToHg17.list
wc uniqueToHg17.list
# 97 97 1080 uniqueToHg17.list
# and unique to hg18, not in hg17:
comm -13 /cluster/data/hg17/bed/contig_overlaps/placed_in_assembly.list \
placed_in_assembly.list > newToHg18.list
wc newToHg18.list
# 318 318 3547 newToHg18.list
# make a list of these new contigs:
# using the previous perl scripts:
cp -p /cluster/data/hg17/bed/contig_overlaps/*.pl .
# Now, we need to distribute the clone sequence files in a
# directory hierarchy by chrom name. Using the contigAcc.pl file
# from the previous release:
cp /cluster/data/hg17/bed/contig_overlaps/contigAcc.pl .
# This newer version is generalized a bit better to take command
# line arguments for the two files it is to read instead of having
# them explicitly in the code, then:
./contigAcc.pl /cluster/data/hg18/ncbi_build36.agp \
/cluster/data/hg18/seq_contig.md > cloneToChrom.list 2>&1
# And now, since most of the clone sequence already exists in the
# Hg17 work directory, we only need to make symlinks to the
# existing ones, and move only the new ones. The following script
# will find an existing copy and symlink it correctly.
cat << '_EOF_' > createPlacedHierarchy.sh
#!/bin/sh
mkdir -p placedClones
sed -e "/^#.*/d" cloneToChrom.list | while read L
do
CHROM=`echo "${L}" | awk '{print $1}'`
CLONE=`echo "${L}" | awk '{print $2}'`
if [ ! -d "placedClones/${CHROM}" ]; then
mkdir placedClones/${CHROM}
fi
HG17_version="/cluster/data/hg17/bed/contig_overlaps/${CHROM}/${CLONE}"
HG18_version_0="/cluster/data/hg18/bed/coverage/newToHg18/${CLONE}"
HG18_version_1="/cluster/data/hg18/bed/coverage/allClones.newToHg18/${CLONE}"
if [ -f "${HG17_version}" ]; then
if [ -f "${HG18_version_0}" -o -f "${HG18_version_1}" ]; then
echo "ERROR: Why is there both an Hg17 and Hg18 version for ${CLONE}"
exit 255
fi
ln -s "/cluster/data/hg17/bed/contig_overlaps/${CHROM}/${CLONE}" \
"./placedClones/${CHROM}/${CLONE}"
else
if [ -f "${HG18_version_0}" -a -f "${HG18_version_1}" ]; then
echo "ERROR: Why are there two Hg18 copies for ${CLONE}"
exit 255
fi
if [ -f "${HG18_version_0}" ]; then
ln -s "${HG18_version_0}" "./placedClones/${CHROM}/${CLONE}"
else
if [ -f "${HG18_version_1}" ]; then
ln -s "${HG18_version_1}" "./placedClones/${CHROM}/${CLONE}"
else
# must be on a different chrom in hg17
HG17_chrom=`grep -v "^#" \
/cluster/data/hg17/bed/contig_overlaps/disburseEm.list \
| grep "^${L}$" | awk '{print $1}'`
HG17_version="/cluster/data/hg17/bed/contig_overlaps/${HG17_chrom}/${CLONE}"
if [ ! -f "${HG17_version}" ]; then
echo "ERROR: Why is there no version for ${CLONE}"
exit 255
fi
ln -s "${HG17_version}" "./placedClones/${CHROM}/${CLONE}"
fi
fi
fi
done
'_EOF_'
# happy emacs
chmod +x createPlacedHierarchy.sh
./createPlacedHierarchy.sh
# There should be no errors
# We need masked contigs for the psLayout alignments
ssh hgwdev
mkdir /cluster/data/hg18/bed/coverage/maskedContigs
cd /cluster/data/hg18/bed/coverage/maskedContigs
hgsql -N \
-e "select chrom,chromStart,chromEnd,contig,size from ctgPos;" hg18 \
> ctgPos.txt
ssh kkstore02
cd /cluster/data/hg18/bed/coverage/maskedContigs
# verify each contig only listed once:
awk '{print $4}' ctgPos.txt | sort | uniq -c | sort -n | less
# should all have a count of one
# verify all chrom sizes match the contig sizes:
awk '{print $3-$2}' ctgPos.txt > chrSize.list
awk '{print $5}' ctgPos.txt > ctgSize.list
diff ctgSize.list chrSize.list
# should be no difference
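# (Sketch, not part of the original checks: the same two verifications done
#  non-interactively; any output at all indicates a problem.)
awk '{print $4}' ctgPos.txt | sort | uniq -c | awk '$1 > 1 {print "dup contig:", $2}'
awk '{if ($3-$2 != $5) print "size mismatch:", $4}' ctgPos.txt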
# OK, now fetch the contigs from the twoBit file:
cat << '_EOF_' > 2bitToFa.pl
#!/usr/bin/env perl
use warnings;
use strict;
while (my $line=<>) {
chomp $line;
my ($chrom, $start, $end, $contig, $size) = split('\s',$line);
$chrom =~ s/chr//;
printf "echo -n 'working $contig ...'; mkdir -p $chrom; twoBitToFa /cluster/data/hg18/hg18.2bit:chr$chrom:$start-$end stdout | sed -e 's/^>.*/>$contig/' > $chrom/$contig.fa; gzip $chrom/$contig.fa; echo 'done'\n";
}
'_EOF_'
# happy emacs
chmod +x 2bitToFa.pl
cat ctgPos.txt | ./2bitToFa.pl > 2bitToFa.sh
chmod +x 2bitToFa.sh
time ./2bitToFa.sh
# and create a lift file for these contigs
cat << '_EOF_' > mkCtgLift.pl
#!/usr/bin/env perl
use warnings;
use strict;
while (my $line=<>)
{
chomp $line;
my ($start, $chrCtg, $size, $chrom, $chrLen) = split('\s',$line);
$chrCtg =~ s#.*/##;
printf "%s\t%s\t%s\t%s\t%s\n", $start, $chrCtg, $size, $chrom, $chrLen;
}
'_EOF_'
# happy emacs
chmod +x mkCtgLift.pl
cat /cluster/data/hg18/jkStuff/liftAll.lft \
| ./mkCtgLift.pl > liftContigs.lft
# Create individual ooc files for each contig
mkdir ooc
for C in `ls */*.fa.gz | sed -e "s/.fa.gz//"`
do
CONTIG=`basename ${C}`
CHR=`dirname ${C}`
mkdir -p ooc/${CHR}
zcat ${C}.fa.gz | blat -repMatch=256 \
-makeOoc=ooc/${CHR}/${CONTIG}.10.ooc -tileSize=10 \
stdin /dev/null /dev/null
echo "done: ${CONTIG}"
done
# Copy everything to san filesystem for kluster run:
ssh pk
mkdir /san/sanvol1/scratch/hg18/coverage
cd /san/sanvol1/scratch/hg18/coverage
rsync -a --progress --copy-links \
/cluster/data/hg18/bed/coverage/placedClones/ ./placedClones/
rsync -a --progress --copy-links \
/cluster/data/hg18/bed/coverage/maskedContigs/ ./maskedContigs/
mkdir /san/sanvol1/scratch/hg18/coverage/runPlaced
cd /san/sanvol1/scratch/hg18/coverage/runPlaced
cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
# runPsLayout.sh <chrom> <clone> <contig>
# where <chrom> is the chrom this contig is on
# <clone> is one of the .fa.gz files in
# /san/sanvol1/scratch/hg18/coverage/placedClones/<chrom>/<clone>.fa.gz
# <contig> is one of the contigs found in:
# /san/sanvol1/scratch/hg18/coverage/maskedContigs/<chrom>/<contig>.fa.gz
#
HERE=`pwd`
CHROM=$1
CLONE=$2
CONTIG=$3
TARGET=/san/sanvol1/scratch/hg18/coverage/maskedContigs/$CHROM/$CONTIG.fa.gz
CLONESRC=/san/sanvol1/scratch/hg18/coverage/placedClones/$CHROM/$CLONE.fa.gz
OOC=/san/sanvol1/scratch/hg18/coverage/maskedContigs/ooc/$CHROM/$CONTIG.10.ooc
RESULT="${HERE}/psl/${CHROM}/${CONTIG}/${CLONE}.psl"
mkdir -p psl/${CHROM}/${CONTIG}
if [ ! -s ${CLONESRC} ]; then
echo "Can not find: ${CLONESRC}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${OOC} ]; then
echo "Can not find: ${OOC}" 1>/dev/stderr
exit 255
fi
WRKDIR="/scratch/tmp/hg18_${CHROM}/${CONTIG}/${CLONE}"
mkdir -p "${WRKDIR}"
cd ${WRKDIR}
zcat ${CLONESRC} > ${CLONE}.fa
zcat ${TARGET} > ${CONTIG}.fa
cp -p ${OOC} ./10.ooc
/cluster/bin/x86_64/psLayout ${CONTIG}.fa ${CLONE}.fa genomic 10.ooc ${RESULT}
RET=$?
cd ${HERE}
rm -fr ${WRKDIR}
rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}/${CONTIG}"
rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}"
exit ${RET}
'_EOF_'
# happy emacs
chmod +x runPsLayout.sh
# create jobList from cloneToChrom.list:
grep -v "^#" /cluster/data/hg18/bed/coverage/cloneToChrom.list \
| sed -e "s/.fa.gz//" \
| awk '{
printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s/%s.psl}\n",
$1, $2, $3, $1, $3, $2
}' > masterJobList
# To do a quick test, run just chrM:
grep " M " masterJobList > jobList
para create jobList
para try ... check ... etc ...
# Then, the whole run:
rm -fr psl err
para create masterJobList
para try ... check ... push ... etc ...
# running 2006-01-17 16:41
# We need the phase information from the sequence.inf file:
ssh hgwdev
cd /cluster/data/hg18/bed/coverage
cp /cluster/data/hg17/phase.pl .
# this script was fixed up for hg18 to take an argument to the
# sequence.inf file:
./phase.pl /cluster/data/hg18/ncbi/sequence.inf > phase.txt
# what kind of phases do we have:
awk '{print $2}' phase.txt | sort | uniq -c
# 1134 D
# 562513 F
# 17270 P
# Compared to hg17 we had:
awk '{print $2}' /cluster/data/hg17/phase.txt | sort | uniq -c
# 1088 D
# 146900 F
# 17300 P
# Back in the kluster runPlaced directory, we put together the
# kluster run results with:
ssh pk
mkdir /san/sanvol1/scratch/hg18/coverage/runPlaced/filteredLifted
cd /san/sanvol1/scratch/hg18/coverage/runPlaced/filteredLifted
cat << '_EOF_' > filterLift.sh
#!/bin/sh
for C in 22
do
echo -n "chr${C} working ... "
mkdir -p ${C}
OUT="${C}/filterLift.out"
pslSort dirs ${C}/raw.psl tmp ../psl/${C}/N* > ${OUT} 2>&1
pslReps -singleHit -nearTop=0.001 ${C}/raw.psl ${C}/repsSingle.psl \
/dev/null >> ${OUT} 2>&1
liftUp ${C}/chr${C}.psl ../../maskedContigs/liftContigs.lft warn \
${C}/repsSingle.psl >> ${OUT} 2>&1
clusterClone -agp -minCover=80 -maxGap=60000 ${C}/repsSingle.psl \
> ${C}/single.agp 2>> ${OUT}
liftUp ${C}/rawLifted.psl ../../maskedContigs/liftContigs.lft warn \
${C}/raw.psl >> ${OUT} 2>&1
clusterClone -agp -minCover=80 -maxGap=60000 ${C}/chr${C}.psl \
> ${C}/chr${C}.bed 2>> ${OUT}
echo "done"
done
'_EOF_'
# happy emacs
chmod +x filterLift.sh
time ./filterLift.sh
cp /cluster/data/hg17/fixPhase.pl .
# fixed up the script to take an argument pointing to the phase.txt file
ssh kkstore02
cd /cluster/data/hg18
grep "for_assembly" ncbi/sequence.inf \
| sed -e "s/\tW\t/\t3\t/;" > sequence.inf
cd /cluster/store11/gs.19/ffa
ln -s ../build36/sequence.inf .
ssh hgwdev
cd /cluster/data/hg18
# currently working only on chr22
echo "22" > clonePos.list
# need to reload gold gap *and* gl at this time. gl wasn't loaded
# before this. It is required for the clonePos track.
hgGoldGapGl -chrom=chr22 hg18 /cluster/store11/gs.19 build36
hgClonePos -maxErr=3 -maxWarn=2000 -chromLst=clonePos.list \
hg18 /cluster/data/hg18 ./sequence.inf /cluster/store11/gs.19 \
2> clone.pos.errors
# OK, now for the hard part. The unplaced clones.
# First we will make an attempt to determine which clones they
# belong to by using information from the previous build, the
# sequence.inf file, the seq_contig.md file, and the
# ncbi_build36.agp file.
ssh kkstore02
cd /cluster/data/hg18/bed/coverage
comm -13 placed_in_assembly.list allClonesConsidered.list \
> unplaced.clone.list
comm -12 unplaced.clone.list allClones.InHg17AndHg18.list \
> common.to.hg17.unplaced.list
comm -23 unplaced.clone.list allClones.InHg17AndHg18.list \
> unique.to.hg18.unplaced.list
awk '{print $1,$6}' /cluster/data/hg17/contig_overlaps.agp \
| sed -e "s/_[0-9]*$//" | sort -u > hg17.contig.clone.list
awk '{print $1,$6}' ../../sequence.inf | sed -e "s/(//; s/)//" \
> cloneToChrom.from.seq.inf.txt
# using the contig to clone information from Hg17, attempt to
# locate the common.to.hg17.unplaced.list in terms of chrom and
# contig. Along with the ncbi_build36.agp, seq_contig.md and
# cloneToChrom.from.seq.inf.txt information, we can attempt to
# place clones that have perhaps moved, or don't have entries in
# one file or another. The relationships obtained from the
# various files:
# ncbi_build36.agp - gives clone to contig name and clone to chr name
# but for placed clones only, not useful here
# unless they moved from hg17 (try this with the
# placed list)
# seq_contig.md - gives contig to chrom relationship
./chrCloneContig.pl /cluster/data/hg18/ncbi_build36.agp \
hg17.contig.clone.list /cluster/data/hg18/seq_contig.md \
common.to.hg17.unplaced.list cloneToChrom.from.seq.inf.txt \
> chrCloneContigCommonToHg17.list \
2> common.to.hg17.unplaced.stderr
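# (chrCloneContig.pl itself is not reproduced in this doc. A hypothetical
#  illustration of the core join it performs - contig->clone from
#  hg17.contig.clone.list with contig->chrom from seq_contig.md - follows;
#  seq_contig.md column positions and the output file names are assumptions.)
sort -k1,1 hg17.contig.clone.list > contigClone.sorted
grep -v "^#" /cluster/data/hg18/seq_contig.md \
    | awk '{print $6, $2}' | sort -k1,1 -u > contigChrom.sorted
join contigClone.sorted contigChrom.sorted \
    | awk '{print $3, $2, $1}' > chrCloneContig.sketch.list  # chrom clone contig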
# With this chrCloneContigCommonToHg17.list list in hand, can now
# create a hierarchy of ./unPlacedClones/
./createUnplacedHierarchy.sh
# Then, copy them to the san for kluster run
ssh pk
cd /san/sanvol1/scratch/hg18/coverage
rsync -a --progress --copy-links \
/cluster/data/hg18/bed/coverage/unPlacedClones/ ./unPlacedClones/
mkdir runUnPlaced
cd runUnPlaced
# create jobList from the chrCloneContigCommonToHg17.list
egrep -v "^#|XX_000" \
/cluster/data/hg18/bed/coverage/chrCloneContigCommonToHg17.list \
| sed -e "s/.fa.gz//" \
| awk '{
printf "./runPsLayout.sh %s %s %s {check out line+ psl/%s/%s/%s.psl}\n",
$1, $2, $3, $1, $3, $2
}' > masterJobList
# Test a subset:
grep " Y " masterJobList > jobListY
para create jobListY
para try ... check ... etc ...
# ... some time later ... 2006-04-04
# All the clones were eventually run through the placement kluster
# runs. Ending up with five different directory results:
# [hiram@hgwdev64 /san/sanvol1/scratch/hg18/coverage]
# -rw-rw-r-- 1 3144245541 Mar 15 09:24 runFishClones/raw.psl
# -rw-rw-r-- 1 91182723 Mar 15 10:44 runUnPlaced/raw.psl
# -rw-rw-r-- 1 102642706 Mar 15 10:49 runPlaced/raw.psl
# -rw-rw-r-- 1 15839733941 Mar 15 14:56 runLastRecover/raw.psl
# -rw-rw-r-- 1 14338192704 Mar 15 18:25 runLastOnes/raw.psl
# Combining those results together required a large memory
# machine and a couple of days processing time:
ssh hgwdev64
cd /san/sanvol1/scratch/hg18/coverage
pslSort dirs raw.psl tmp runPlaced runUnPlaced runFishClones \
runLastRecover runLastOnes > raw.psl.out 2>&1
# resulting in a 33 Gb result file:
# -rw-rw-r-- 1 33515995907 Apr 2 10:54 raw.psl
# trimming that down with pslReps:
time pslReps -nohead -nearTop=0.001 -singleHit \
raw.psl repsSingle.psl /dev/null
# real 14m58.371s
# -rw-rw-r-- 1 42333543 Apr 4 10:22 repsSingle.psl
# wc -l repsSingle.psl
# 48005 repsSingle.psl
# Now, clustering those alignments together:
clusterClone -allowDuplicates -agp -minCover=80 -maxGap=60000 \
repsSingle.psl > single.agp 2> single.out
wc -l single.agp
# 45714 single.agp
# Sort them, and set their phase correctly:
sort -k1,1 -k2,2n single.agp \
| ./fixPhase.pl /cluster/data/hg18/bed/coverage/phase.txt \
> contig_overlaps.agp
# some of them are not in the phase.txt file, these are
# set to draft status:
# WARN: can not find contig AC024654.2 in phase.txt
# WARN: can not find contig AL133291.12 in phase.txt
# WARN: can not find contig AC055712.12 in phase.txt
# WARN: can not find contig AC024480.2 in phase.txt
# WARN: can not find contig AC068738.2 in phase.txt
# WARN: can not find contig AL354703.14 in phase.txt
# WARN: can not find contig AL354756.17 in phase.txt
# WARN: can not find contig AL157825.11 in phase.txt
# WARN: can not find contig AC073306.1 in phase.txt
# WARN: can not find contig AL138892.13 in phase.txt
# WARN: can not find contig AL590104.7 in phase.txt
# WARN: can not find contig AC079146.4 in phase.txt
# WARN: can not find contig AC024497.3 in phase.txt
# WARN: can not find contig AC021295.3 in phase.txt
# WARN: can not find contig AC040906.3 in phase.txt
# WARN: can not find contig AC008372.5 in phase.txt
# WARN: can not find contig AC026054.3 in phase.txt
# WARN: can not find contig AC053504.4 in phase.txt
# create the gl files from that overlaps.agp file:
ssh hgwdev
cd /cluster/data/hg18
cp -p /san/sanvol1/scratch/hg18/coverage/contig_overlaps.agp .
# after going through this sequence and loading everything,
# a few clones were discovered to have crept into the list that
# were obsolete. So, add them to the list used by the
# removeObsoleteClones.sh script:
awk '{print $6}' contig_overlaps.agp > clone.coverage.list
bed/coverage/ckMultipleVersions.pl clone.coverage.list \
> /dev/null 2> /tmp/clone.transitions
awk '{if (! match($1,$3)){ print }}' /tmp/clone.transitions \
>> bed/coverage/obsoleteClone.list
time ./removeObsoleteClones.sh
wc -l /san/sanvol1/scratch/hg18/coverage/contig_overlaps.agp \
./contig_overlaps.agp
# 45714 /san/sanvol1/scratch/hg18/coverage/contig_overlaps.agp
# 45597 ./contig_overlaps.agp
# after adding ten new ones the second time around:
# 45587 ./contig_overlaps.agp
time agpToGl contig_overlaps.agp . -md=seq_contig.md
# this liftGl.csh finds all the contig.gl files under each
# contig directory and creates chromosome coordinate chr*.gl
# files in each chrom directory
jkStuff/liftGl.csh contig.gl
# Then hgGoldGapGl uses those chrom level chr*.gl files to add
# the gl tables (as well as gold and gap)
hgGoldGapGl -chromLst=chrom.lst hg18 /cluster/store11/gs.19 build36
# strip some business from the sequence.inf file that is not needed
# The sed here has to be done in a shell script, those tabs are
# actual tabs and not the explicit ^I
mkdir -p /scratch/tmp
grep -v AADB /cluster/store11/gs.19/ncbi/sequence.inf \
> /scratch/tmp/seq0.inf
(cat /scratch/tmp/seq0.inf; \
grep AADB01066164.1 /cluster/store11/gs.19/ncbi/sequence.inf) \
| grep "for_assembly" \
| sed -e "s/^IW^I/^I3^I/" > cleanedSequence.inf
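# (Alternative sketch, untested here: under bash, ANSI-C quoting lets the
#  shell expand \t itself, so no literal tab characters are needed.)
(cat /scratch/tmp/seq0.inf; \
    grep AADB01066164.1 /cluster/store11/gs.19/ncbi/sequence.inf) \
    | grep "for_assembly" \
    | sed -e $'s/\tW\t/\t3\t/' > cleanedSequence.inf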
# Then hgClonePos uses those tables to create the Coverage track
hgClonePos -maxErr=600 -maxWarn=50000 -chromLst=clonePos.list \
hg18 /cluster/data/hg18 ./cleanedSequence.inf /cluster/store11/gs.19 \
> clone.pos.errors 2>&1
###########################################################################
# RECOMBINATION RATES (DONE 2006-02-15 Fan)
# The STS Markers track must be completed prior to creating this track
ssh kkstore02
cd /cluster/data/hg18/bed
mkdir -p recombRate
cd recombRate
# Copy other necessary files here (in future, can take from previous version)
# NOTE: these are stable, and could be saved in a permanent spot
cp -p /projects/hg2/booch/psl/info/decode_all .
cp -p /projects/hg2/booch/psl/info/marshfield_all .
cp -p /projects/hg2/booch/psl/info/genethon_all .
# Compared these 3 files with the 3 files of hg17, they are identical.
# Determine maximum concordant set of markers for each of the maps
/cluster/bin/scripts/assignGPsts -full -maxcon \
/cluster/data/ncbi/sts.10/stsAlias.bed \
/cluster/data/hg18/bed/sts/stsMarkers_pos.rdb \
decode_all > decode.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
/cluster/data/ncbi/sts.10/stsAlias.bed \
/cluster/data/hg18/bed/sts/stsMarkers_pos.rdb \
marshfield_all > marshfield.marker.rdb
/cluster/bin/scripts/assignGPsts -full -maxcon \
/cluster/data/ncbi/sts.10/stsAlias.bed \
/cluster/data/hg18/bed/sts/stsMarkers_pos.rdb \
genethon_all > genethon.marker.rdb
# Determine the rates for each of the maps
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl decode.marker.rdb \
/cluster/data/hg18/chrom.sizes 1000000 1000000 \
> decode_1mb_slide_1mb
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl genethon.marker.rdb \
/cluster/data/hg18/chrom.sizes 1000000 1000000 \
> genethon_1mb_slide_1mb
# got 338 "... DISCARDING" messages.
/cluster/bin/scripts/markers_to_recomb_rate.terry.pl marshfield.marker.rdb \
/cluster/data/hg18/chrom.sizes 1000000 1000000 \
> marshfield_1mb_slide_1mb
# Got 424 "... DISCARDING" messages.
# Convert files to proper format
/cluster/bin/scripts/convertRecombRate decode_1mb_slide_1mb \
/cluster/data/hg18/inserts \
/cluster/data/hg18 1000 > decode_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate marshfield_1mb_slide_1mb \
/cluster/data/hg18/inserts \
/cluster/data/hg18 1000 > marshfield_1mb_slide_1mb_conv
/cluster/bin/scripts/convertRecombRate genethon_1mb_slide_1mb \
/cluster/data/hg18/inserts \
/cluster/data/hg18 1000 > genethon_1mb_slide_1mb_conv
# Create bed file and load
/cluster/bin/scripts/createRRbed decode_1mb_slide_1mb_conv \
marshfield_1mb_slide_1mb_conv genethon_1mb_slide_1mb_conv \
> recombRate.bed
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed/recombRate
hgLoadBed -noBin -tab \
-sqlTable=/cluster/home/kent/src/hg/lib/recombRate.sql \
hg18 recombRate recombRate.bed
###########################################################################
# FISH CLONES (DONE - 2006-01-13 - 2006-02-07 - Hiram)
# **** RE-LOAD fishClones after bacEnds update - see below 2007-09-04 ****
# The STS Marker, Coverage, and BAC End Pairs tracks must be completed prior to
# creating this track (and why is this ?)
ssh kkstore01
mkdir /cluster/data/ncbi/fishClones/fishClones.2006-01/
cd /cluster/data/ncbi/fishClones/fishClones.2006-01/
# Download information from NCBI
# point browser at:
# http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Sequence tag:" to "placed on contig"
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as
# /cluster/data/ncbi/fishClones/fishClones.2006-01/hbrc.txt
chmod 664 /cluster/data/ncbi/fishClones/fishClones.2006-01/hbrc.txt
# Unfortunately the format of this hbrc file has changed since
# last time. The columns have been rearranged, and one important
# column is missing, the contig information. So, let's see if we
# can recover the original format by putting this together with
# some other things we have here.
$HOME/kent/src/hg/fishClones/fixup.hbrc.pl hbrc.txt \
/cluster/data/hg18/bed/fishClones/seq_clone.pmd > fixed.hbrc.txt \
2> dbg
# the seq_clone.pmd file was obtained via email from Wonhee Jang
# jang at ncbi.nlm.nih.gov - I have asked for clarification where
# such a file can be fetched without resorting to email.
# Get current clone/accession information
wget --timestamping http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# Create initial Fish Clones bed file
ssh kkstore02
mkdir /cluster/data/hg18/bed/fishClones
cd /cluster/data/hg18/bed/fishClones
# Copy previous sts info from fhcrc (take from previous build in future)
cp -p /cluster/data/ncbi/fishClones/fishClones.2004-07/fhcrc.sts .
# This fhcrc.sts listing doesn't change. It is merely a listing
# of aliases that remain in effect.
# Create cl_acc_gi_len file from cloneend information:
grep -v "^#" /cluster/data/hg18/bed/cloneend/all.txt \
| awk '{gsub("\.[0-9]*$", "", $2);
printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,$8}' > cl_acc_gi_len
ssh hgwdev
# have to be on hgwdev for this since it is going to read from the
# database. Had to work on this program to get it past what is
# evidently a bad entry in hbrc.fixed where columns of information
# are missing for one clone in particular
time fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg18 \
/cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt \
/cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out \
./cl_acc_gi_len \
/cluster/data/hg18/bed/bacends/bacEnds.lifted.psl \
fishClones
# real 2m4.708s
# Reading Fish Clones file /cluster/data/ncbi/fishClones/fishClones.2006-01/hbrc.fixed
# reading fishInfo file /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
# Reading Clone/Acc (clac.out) file /cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out
# Reading BAC Ends file ./cl_acc_gi_len
# Reading BAC Ends psl file /cluster/data/hg18/bed/bacends/bacEnds.lifted.psl
# Reading additional STS Marker links fhcrc.sts
# Determining good positions
# findClonePos: determining positions of fish clones
# Writing output file
# ERROR: at line # 170, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# ERROR: at line # 171, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# Load the track
ssh hgwdev
cd /cluster/data/hg18/bed/fishClones
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \
hg18 fishClones fishClones.bed
# Loaded 9461 elements of size 16
###########################################################################
# CHROMOSOME BANDS TRACK (DONE - 2006-01-20 - 2006-02-07 - Hiram)
# This must wait until the Fish Clones track is done
# This was loaded in place of the previously loaded ideoband data
# created from NCBI information, see below for "ideogram"
ssh hgwdev
mkdir /cluster/data/hg18/bed/cytoband
cd /cluster/data/hg18/bed/cytoband
# Copy in some necessary files (usually from previous version)
cp -p /cluster/data/hg17/bed/cytoband/pctSetBands.txt .
cp -p /cluster/data/hg17/bed/cytoband/ISCN800.txt .
# Create some preliminary information files
/cluster/bin/scripts/createSetBands pctSetBands.txt \
/cluster/data/hg18/inserts /cluster/data/hg18 100 > setBands.txt
/cluster/bin/scripts/makeBands ISCN800.txt \
/cluster/data/hg18 > cytobands.pct.bed
/cluster/bin/scripts/makeBandRanges cytobands.pct.bed \
> cytobands.pct.ranges
# Reformat fishClones file
/cluster/bin/scripts/createBanderMarkers \
/cluster/data/hg18/bed/fishClones/fishClones.bed > fishClones.txt
/cluster/bin/scripts/runBander fishClones.txt \
ISCN800.txt setBands.txt /cluster/data/hg18
# Should be 862 bands
wc -l cytobands.bed
# 862 cytobands.bed
hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBand.sql \
hg18 cytoBand cytobands.bed
hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBandIdeo.sql \
hg18 cytoBandIdeo cytobands.bed
###########################################################################
# BLASTZ SELF (DONE - 2006-01-17 - 2006-01-20 - Hiram)
ssh pk
mkdir /cluster/data/hg18/bed/blastzSelf.2006-01-17
cd /cluster/data/hg18/bed/blastzSelf.2006-01-17
cat << '_EOF_' > DEF
# human vs human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_M=400
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/selfNib
SEQ1_LEN=/san/sanvol1/scratch/hg18/self.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Human Hg18
SEQ2_DIR=/san/sanvol1/scratch/hg18/selfNib
SEQ2_LEN=/san/sanvol1/scratch/hg18/self.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/cluster/data/hg18/bed/blastzSelf.2006-01-17
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
cd /cluster/data/hg18/bed/blastzSelf.2006-01-17
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
`pwd`/DEF > blastz.out 2>&1 &
# real 640m37.637s
ssh kolossus
cd /cluster/data/hg18/bed/blastzSelf.2006-01-17
time HGDB_CONF=~/.hg.conf.read-only featureBits \
-noRandom -noHap hg18 chainSelfLink > fb.chainSelfLink 2>&1 &
# real 21m52.697s
# 324067552 bases of 2858034764 (11.339%) in intersection
# compared to Hg17:
cd /cluster/data/hg17/bed/blastzSelf.2004-07-01
time HGDB_CONF=~/.hg.conf.read-only featureBits \
-noRandom -noHap hg17 chainSelfLink > fb.chainSelfLink 2>&1 &
# real 56m34.802s
# 240976607 bases of 2851352871 (8.451%) in intersection
# reloaded these chains to add normalized score column
ssh hgwdev
cd /cluster/data/hg18/bed/blastzSelf.2006-01-17/axtChain
chainSplit chain hg18.hg18.all.chain.gz
cd /cluster/data/hg18/bed/blastzSelf.2006-01-17/axtChain/chain
foreach f (*.chain)
set c = $f:r
hgLoadChain -normScore hg18 ${c}_chainSelf $f
end
cd ..
rm -fr chain
##############################################################################
# CLONE ENDS - BACEND TRACK (DONE - 2006-01-11 - Fan)
ssh kkstore02
cd /cluster/data/hg18
# check disk space: 73Gb free
df -h .
# Filesystem Size Used Avail Use% Mounted on
# /export/cluster/store11
# 1.8T 1.4T 323G 82% /cluster/store11
mkdir -p bed/cloneend/ncbi
cd bed/cloneend/ncbi
wget --timestamping ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/homo_sapiens/*
# Somehow the wget did not work. Did it by hand.
cd /cluster/data/hg18/bed/cloneend
# seems like the *.mfa files were split just for convenience
# concatenate
bash
for F in ncbi/*.mfa.gz
do
zcat ${F}
done | gzip > all.mfa.gz
exit
# Convert the title line of the all.mfa file
cat << '_EOF_' > convert.pl
#!/usr/bin/env perl
use strict;
use warnings;
while (my $line = <>) {
if ($line !~ m/^>/) {
print $line
} else {
my @fields = split('\|', $line);
my $fieldCount = scalar(@fields);
my $printed = 0;
for (my $i = 0; $i < $fieldCount; $i++) {
if ($fields[$i] eq "gb" || $fields[$i] eq "dbj" || $fields[$i] eq "emb") {
(my $name, my $vers) = split(/\./,$fields[$i+1]);
print ">$name\n";
$i= $fieldCount;
$printed = 1;
}
}
if (!$printed) {
die("Failed for $line\n");
}
}
}
'_EOF_'
# < happy emacs
chmod +x convert.pl
zcat all.mfa.gz | ./convert.pl | gzip > cloneEnds.fa.gz
# make sure nothing got broken:
faSize all.mfa.gz
# 400704107 bases (5941742 N's 394762365 real 255711893 upper 139050472 lower) in 832860 sequences in 1 files
faSize cloneEnds.fa.gz
# 400704107 bases (5941742 N's 394762365 real 255711893 upper 139050472 lower) in 832860 sequences in 1 files
# identical numbers
# concatenate the text files, too
bash
for F in ncbi/*.txt.gz
do
zcat ${F}
done | gzip > all.txt.gz
# generate cloneEndPairs.txt and cloneEndSingles.txt
cp -p /cluster/data/mm6/bed/cloneend/ncbi/convertTxt.pl .
zcat all.txt.gz >all.txt
./convertTxt.pl all.txt
# Reading in end info
# Writing out pair info
# Writing out singleton info
# 249619 pairs and 318500 singles
# faSplit does not function correctly if given a .gz source file
# AND, we need the unzipped file for sequence loading below
gunzip cloneEnds.fa.gz
# split
mkdir splitdir
cd splitdir
faSplit sequence ../cloneEnds.fa 100 cloneEnds
# Check to ensure no breakage:
cat *.fa | faSize stdin
# 400704107 bases (5941742 N's 394762365 real 255711893 upper 139050472 lower) in 832860 sequences in 1 files
# same numbers as before
# Copy to san for cluster runs
ssh pk
cd /cluster/data/hg18/bed/cloneend/splitdir
mkdir /san/sanvol1/scratch/hg18/cloneEnds
cp -p *.fa /san/sanvol1/scratch/hg18/cloneEnds
rm *
cd ..
rmdir splitdir
# load sequences
ssh hgwdev
mkdir /gbdb/hg18/cloneend
cd /gbdb/hg18/cloneend
ln -s /cluster/data/hg18/bed/cloneend/cloneEnds.fa .
cd /tmp
hgLoadSeq hg18 /gbdb/hg18/cloneend/cloneEnds.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/hg18/cloneend/cloneEnds.fa
# 832860 sequences
# Updating seq table
# Advisory lock has been released
# All done
############################################################################
# BACEND SEQUENCE ALIGNMENTS (STARTED - 2006-01-11, DONE 2006-01-18 - Fan)
# REDONE 2006-02-02 - Hiram
ssh pk
# The ooc file was created earlier into /cluster/bluearc/hg18/11.ooc
cp -p /cluster/bluearc/hg18/11.ooc /san/sanvol1/scratch/hg18/11.ooc
mkdir /san/sanvol1/scratch/hg18/bacends
cd /san/sanvol1/scratch/hg18/bacends
ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst
ls -1S /san/sanvol1/scratch/hg18/cloneEnds/cloneEnds???.fa > bacends.lst
# 378 contigs vs 98 bacends files -> 37,044 jobs
mkdir out
cat > template << '_EOF_'
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/hg18/11.ooc {check out line+ out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 contigs.lst bacends.lst template jobList
foreach f (`cat bacends.lst`)
set d = $f:r:t
echo $d
mkdir out/$d
end
para create jobList
# 37044 jobs in batch
para try, check, push, etc ...
# lift alignments
ssh pk
cd /san/sanvol1/scratch/hg18/bacends
pslSort dirs raw.psl temp out/cloneEnds*
# 37044 files in 98 dirs
# Got 37044 files 192 files per mid file
# real 32m24.804s
# -rw-rw-r-- 1 6487445210 Feb 2 21:08 raw.psl
time pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
# real 6m33.218s
# Processed 51898639 alignments
mkdir lifted
time liftUp lifted/bacEnds.lifted.psl ./liftContigs.lft warn bacEnds.psl
# real 0m30.067s
pslSort dirs bacEnds.sorted.psl temp lifted
# cleanup
rmdir temp
rm -fr out /cluster/store7/kate/hg17/bacends
wc -l *.sorted.psl
# 2490892 bacEnds.sorted.psl
time pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.sorted.psl \
/cluster/data/hg18/bed/cloneend/cloneEndPairs.txt \
all_bacends bacEnds
# Reading pair file
# Reading psl file
# Creating Pairs
# Writing to files
# real 0m11.221s
# this creates the files:
# -rw-rw-r-- 1 16224182 Feb 2 21:36 bacEnds.pairs
# -rw-rw-r-- 1 4655633 Feb 2 21:36 bacEnds.orphan
# -rw-rw-r-- 1 399525 Feb 2 21:36 bacEnds.slop
# -rw-rw-r-- 1 106252 Feb 2 21:36 bacEnds.mismatch
# -rw-rw-r-- 1 634909 Feb 2 21:36 bacEnds.short
# -rw-rw-r-- 1 4023 Feb 2 21:36 bacEnds.long
# create header required by "rdb" tools
# TODO: replace w/ awk & sort
echo -e \
'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
echo -e '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairs.bed
cat header bacEnds.slop bacEnds.short bacEnds.long \
bacEnds.mismatch bacEnds.orphan \
| row score ge 300 | sorttbl chr start | headchg -del \
> bacEndPairsBad.bed
extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \
bacEndPairsBad.bed | \
sorttbl tname tstart | headchg -del > bacEnds.load.psl
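# (A sketch of the awk/sort replacement suggested by the TODO above; per the
#  header written above, column 5 of the tab-separated pair files is the
#  score. Untested here; illustrative output names to avoid clobbering.)
awk -F'\t' '$5 >= 300' bacEnds.pairs \
    | sort -k1,1 -k2,2n > bacEndPairs.sketch.bed
cat bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch bacEnds.orphan \
    | awk -F'\t' '$5 >= 300' | sort -k1,1 -k2,2n > bacEndPairsBad.sketch.bed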
# Move the previous build out of the way and copy these
# results over to the primary hg18 bed location:
mv /cluster/data/hg18/bed/bacends /cluster/data/hg18/bed/bacends.2006-01-18
mkdir /cluster/data/hg18/bed/bacends
cp -p bacEnd* /cluster/data/hg18/bed/bacends
cp -p lifted/bacEnds.lifted.psl /cluster/data/hg18/bed/bacends
# load them into the database
ssh hgwdev
cd /cluster/data/hg18/bed/bacends
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $5}' bacEndPairs.bed | sort | uniq -c
# result should be the scores, no extraneous strings:
# 156984 1000
# 195 300
# 316 375
# 297 500
# 1476 750
# edit the file and fix it if it has a bad name.
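# (Sketch: if the check above shows anything besides numeric scores, this
#  lists the offending rows so they can be fixed.)
awk -F'\t' '$5 !~ /^[0-9]+$/' bacEndPairs.bed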
hgLoadBed -notItemRgb hg18 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 159268
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed -notItemRgb hg18 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 69788
#hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg18 -table=all_bacends bacEnds.load.psl
# no complaints! Usually there are; this loaded:
hgsql -N -e "select count(*) from all_bacends;" hg18
# 1249956
nice featureBits hg18 all_bacends
# 191078854 bases of 2881515245 (6.631%) in intersection
nice featureBits hg17 all_bacends
# 225763317 bases of 2866216770 (7.877%) in intersection
nice featureBits hg18 bacEndPairs
# 2842800422 bases of 2881515245 (98.656%) in intersection
nice featureBits hg17 bacEndPairs
# 2846568377 bases of 2866216770 (99.314%) in intersection
nice featureBits hg18 bacEndPairsBad
# 729313572 bases of 2881515245 (25.310%) in intersection
nice featureBits hg17 bacEndPairsBad
# 797412909 bases of 2866216770 (27.821%) in intersection
############################################################################
# BACEND PAIRS TRACK (OBSOLETE - DONE ABOVE) (DONE - 2006-01-18 - Fan)
ssh kolossus
cd /cluster/data/hg18/bacends
bash
time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.psl \
../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds
# create header required by "rdb" tools
echo -e \
"chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header
echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header
cat header bacEnds.pairs | \
/cluster/bin/scripts/row score ge 300 | \
/cluster/bin/scripts/sorttbl chr start | \
/cluster/bin/scripts/headchg -del > bacEndPairs.bed
cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \
/cluster/bin/scripts/sorttbl chr start | \
/cluster/bin/scripts/headchg -del > bacEndPairsBad.bed
/cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
bacEndPairsBad.bed >j1.out
cat j1.out| /cluster/bin/scripts/sorttbl tname tstart >j2.out
cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl
rm j1.out j2.out
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $5}' bacEndPairs.bed | sort | uniq -c
# result should be the scores, no extraneous strings:
# 156984 1000
# 195 300
# 316 375
# 297 500
# 1476 750
# edit the file and fix it if it has a bad name.
# load into database
ssh hgwdev
cd /cluster/data/hg18/bacends
hgLoadBed -strict -notItemRgb hg18 bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 146284 elements of size 11
# note - this track isn't pushed to RR, just used for assembly QA
hgLoadBed -strict -notItemRgb hg18 bacEndPairsBad bacEndPairsBad.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 75995 elements of size 11
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg18 -table=all_bacends bacEnds.load.psl
nice featureBits hg18 all_bacends
# 162081172 bases of 2881515245 (5.625%) in intersection
nice featureBits hg17 all_bacends
# 225763317 bases of 2866216770 (7.877%) in intersection
nice featureBits hg18 bacEndPairs
# 2835522069 bases of 2881515245 (98.404%) in intersection
nice featureBits hg17 bacEndPairs
# 2846568377 bases of 2866216770 (99.314%) in intersection
nice featureBits hg18 bacEndPairsBad
# 781697678 bases of 2881515245 (27.128%) in intersection
nice featureBits hg17 bacEndPairsBad
# 797412909 bases of 2866216770 (27.821%) in intersection
##########################################################################
# BLASTZ OPOSSUM monDom4 second time (DONE - 2006-02-13 - Hiram)
# (the working directory kept the older blastzMonDom2 name, but this run
#  aligns hg18 against monDom4)
ssh kk
mkdir /cluster/data/hg18/bed/blastzMonDom2.2006-02-13
cd /cluster/data/hg18/bed
ln -s blastzMonDom2.2006-02-13 blastz.monDom4
cd blastzMonDom2.2006-02-13
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg18)
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
SEQ2_DIR=/iscratch/i/monDom4/monDom4RMExtra.2bit
SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzMonDom4.2006-02-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
cd /cluster/data/hg18/bed/blastzMonDom2.2006-02-13
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
ssh kolossus
cd /cluster/data/hg18/bed/blastzMonDom2.2006-02-13
time nice -n +19 featureBits hg18 chainMonDom4Link \
> fb.hg18.chainMonDom4Link 2>&1 &
cat fb.hg18.chainMonDom4Link
# 356865888 bases of 2881515245 (12.385%) in intersection
# for the swap, see makeMonDom4.doc 2006-04-28
# Creating download directory (DONE - 2006-07-18 - Hiram)
ssh hgwdev
cd /cluster/data/hg18/bed/blastzMonDom2.2006-02-13
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download -stop=download `pwd`/DEF > download.out 2>&1
##########################################################################
# BLASTZ OPOSSUM monDom2 first time (EXPERIMENT - 2006-01-23 - Hiram)
ssh pk
mkdir /cluster/data/hg18/bed/blastzMonDom2.2006-01-23
cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg18)
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom2
SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit
SEQ2_LEN=/san/sanvol1/scratch/monDom2/chrom.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzMonDom2.2006-01-23
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
# real 912m22.818s
# This failed during the load of the chains due to the size of
# chr19.chain. So, go to kolossus:
ssh kolossus
# There isn't any hg18 db here yet, get it established with a
# chromInfo and a 2bit sequence:
hgsql -e "create database hg18;" mysql
cd /cluster/data/hg18
twoBitInfo hg18.2bit stdout |
awk '{printf "%s\t%s\t/gbdb/hg18/hg18.2bit\n", $1,$2}' \
> chromInfo.kolossus.tab
hgsql hg18 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql hg18 \
-e 'load data local infile "chromInfo.kolossus.tab" into table chromInfo;'
mkdir /gbdb/hg18
ln -s /cluster/data/hg18/hg18.2bit /gbdb/hg18/hg18.2bit
# now, loading only chr19:
cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23/axtChain
hgLoadChain hg18 chr19_chainMonDom2 chain/chr19.chain
# while that is running, back on hgwdev, get the other chains loaded
ssh hgwdev
cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23/axtChain
cp loadUp.csh loadUp.noChr19.csh
# change the foreach line to eliminate the chr19.chain:
diff loadUp.csh loadUp.noChr19.csh
< foreach f (*.chain)
---
> foreach f (`ls *.chain | grep -v chr19.chain`)
# And then run that script
time ./loadUp.noChr19.csh > load.noChr19.out 2>&1
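# For next time, that hand edit could be scripted instead (untested sketch):
#   sed 's@foreach f (\*.chain)@foreach f (`ls *.chain | grep -v chr19.chain`)@' \
#       loadUp.csh > loadUp.noChr19.csh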
# When the kolossus load finishes, email to push-request and ask
# for the two tables to be pushed from kolossus to hgwdev:
# chr19_chainMonDom2
# chr19_chainMonDom2Link
# then, continuing:
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-continue=download -bigClusterHub=pk -chainMinScore=5000 \
-chainLinearGap=loose `pwd`/DEF > download.out 2>&1 &
# real 2m42.505s
ssh kolossus
cd /cluster/data/hg18/bed/blastz.monDom2
time HGDB_CONF=~/.hg.conf.read-only featureBits \
hg18 chainMonDom2Link > fb.hg18.chainMonDom2Link 2>&1
# real 124m34.435s
cat fb.hg18.chainMonDom2Link
# 357258631 bases of 2881515245 (12.398%) in intersection
# then, to swap
ssh pk
cd /cluster/data/hg18/bed/blastzMonDom2.2006-01-23
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > swap.out 2>&1 &
# running 2006-01-25 17:28
# real 51m27.447s
# this swap failed at:
# startStep: 4, at step 5 net to stopStep 9
# netChains: looks like previous stage was not successful
# (can't find [monDom2.hg18.]all.chain[.gz]).
# This failure does not make any sense. The end of swapChains
# does an nfsNoodge on this file to verify it exists.
# I don't understand why it wouldn't be in existence
# as netChains starts up.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net `pwd`/DEF > net-swap.out 2>&1 &
# running 2006-01-26 09:28
# real 27m57.077s
# This swap failed at the load chain:
# startStep: 5, at step 6 load to stopStep 9
# # chmod a+x
# # /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain/loadUp.csh
# # ssh -x hgwdev nice
# # /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain/loadUp.csh
# cd /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain
# hgLoadChain -tIndex monDom2 chainHg18 monDom2.hg18.all.chain.gz
# Out of memory needMem - request size 56 bytes
# So, over to kolossus to give it a try:
# There isn't any monDom2 db here yet, get it established with a
# chromInfo and a 2bit sequence:
hgsql -e "create database monDom2;" mysql
cd /cluster/data/monDom2
hgsql monDom2 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql monDom2 \
-e 'load data local infile "chromInfo.tab" into table chromInfo;'
mkdir /gbdb/monDom2
ln -s /cluster/data/monDom2/monDom2.2bit /gbdb/monDom2/monDom2.2bit
# now, loading into monDom2
cd /cluster/data/monDom2/bed/blastz.hg18.swap/axtChain
time hgLoadChain -tIndex monDom2 chainHg18 monDom2.hg18.all.chain.gz \
> kolossus.load
# running - 2006-01-26
##########################################################################
# test BLASTZ Opossum MonDom1 (DONE - 2006-01-30 - Hiram)
# to see what happened with the blow up of data in monDom2
#
ssh kk
mkdir /cluster/data/hg18/bed/blastzMonDom1.2006-01-30
cd /cluster/data/hg18/bed/blastzMonDom1.2006-01-30
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg18)
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom1
SEQ2_DIR=/iscratch/i/monDom1/chunks
SEQ2_LEN=/iscratch/i/monDom1/chrom.sizes
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzMonDom1.2006-01-30
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
# started 2006-01-30 - 15:40
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=cat -stop=load `pwd`/DEF > cat_load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=net `pwd`/DEF > blastz.out 2>&1 &
############################################################################
############################################################################
# STS MARKERS (STARTED 2006-01-27 Fan - DONE 2006-02-06 - Hiram)
# FOR NEXT TIME - a lot of the perl scripts used in this process
# need to be placed into the source tree and cleaned up to modern
# perl warnings and strict standards. In particular, one script
# was placed into the source tree this time: src/utils/findAccession.pl
# update from NCBI
ssh kkstore02
# use store11 for space
mkdir -p /cluster/store11/sts.2006-01
ln -s /cluster/store11/sts.2006-01 /cluster/data/ncbi
ln -s /cluster/data/ncbi/sts.2006-01 sts.10
cd /cluster/data/ncbi/sts.2006-01
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
# old
# wget --timestamping ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
# wget --timestamping ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
wget --timestamping ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
gunzip sts.gz
mv sts dbSTS.fa
# these items are copied in from the previous builds
cp -p /cluster/data/ncbi/sts.9/all.STS.fa ./all.STS.fa.prev
cp -p /cluster/data/ncbi/sts.9/stsInfo2.bed ./stsInfo2.bed.prev
# Convert dbSTS.fa file to easier reading format, and get accessions
/cluster/bin/scripts/convertGbFaFile dbSTS.fa > UniSTS.convert.fa
grep ">" UniSTS.convert.fa | cut -f 2 -d ">" > UniSTS.acc
# NOTE: updateStsInfo creates new stsInfo2.bed, all.primers,
# all.STS.fa, stsAlias.bed files
#### XXX - FOR NEXT TIME: need to fix something here for the
#### XXX - broken symbol AFM067XA9 which has over 6,000 aliases.
#### XXX - This isn't right
#### hand-editted the record for AFM067XA9. KUHN/ARCHANA 10-08-2007
#### preserving the list of otherNames that showed up stsInfo2.otherNames for
#### trueName=AFM067XA9
#### cp hg18.AFM067XA9.otherNames /cluster/data/hg18/bed/sts
#### preserving the list of stsMarkers that showed up in stsAlias.alias
#### in excess of those in the above file (10 k total)
#### cp hg18.AFM067XA9.dropped.aliases /cluster/data/hg18/bed/sts
updateStsInfo -verbose=1 -gb=UniSTS.acc stsInfo2.bed.prev all.STS.fa.prev \
UniSTS.sts UniSTS.aliases UniSTS.convert.fa new
# 5610 MFD330 1000006 (0) not in dbSTS anymore
# 5667 D3S4560 1000008 (0) not in dbSTS anymore
# 5686 ATA92F01 1000007 (0) not in dbSTS anymore
# 5945 MFD206 1000009 (0) not in dbSTS anymore
# 6591 MFD311 1000011 (0) not in dbSTS anymore
# 6841 MFD306 1000013 (0) not in dbSTS anymore
# 6842 MFD310 1000012 (0) not in dbSTS anymore
# 6844 MFD349 1000026 (0) not in dbSTS anymore
# 7024 D12S2343 1000015 (0) not in dbSTS anymore
# 7042 ATA73C05 1000014 (0) not in dbSTS anymore
# 7226 MFD341 1000016 (0) not in dbSTS anymore
# 7500 D17S2200 1000018 (0) not in dbSTS anymore
# 7628 ATA92E03 1000020 (0) not in dbSTS anymore
# 7642 GATA178F11 1000019 (0) not in dbSTS anymore
# 7910 MFD338 1000022 (0) not in dbSTS anymore
# 97723 GATA172D05 1000023 (0) not in dbSTS anymore
# 205088 CPLA3610 1000000 (0) not in dbSTS anymore
# 205089 COX_1935 1000001 (0) not in dbSTS anymore
# 205090 24534CA2 1000002 (0) not in dbSTS anymore
# 205091 D5S811 1000003 (0) not in dbSTS anymore
# 205092 AC016604-5 1000004 (0) not in dbSTS anymore
# 205093 CA-JAP-180 1000005 (0) not in dbSTS anymore
# 205094 D10S1120 1000025 (0) not in dbSTS anymore
# 205095 D21S2039 1000024 (0) not in dbSTS anymore
# 205102 D12S1013 1000028 (0) not in dbSTS anymore
mv new.info stsInfo2.bed
mv new.primers all.primers
mv new.alias stsAlias.bed
mv new.fa all.STS.fa
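# Given the AFM067XA9 alias blow-up noted above, a cheap sanity check on the
# new alias file may be worth running next time (sketch; assumes the marker
# name is the last column of stsAlias.bed -- verify against stsAlias.sql):
#   awk -F'\t' '{print $NF}' stsAlias.bed | sort | uniq -c | sort -rn | head
# Anything with thousands of aliases deserves a closer look before loading.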
# get list of all STS id's in the fasta file
sed -n 's/^>\([0-9][0-9]*\) .*/\1/p' all.STS.fa | sort -n > all.STS.id
wc -l all.STS.id
# 93698 total sequences
/cluster/bin/scripts/convertPrimerToFA all.primers > all.primers.fa
# Copy stsInfo2.bed and stsAlias.bed to data directory because
# these will be loaded into the database later
mkdir -p /cluster/data/hg18/bed/sts
cp -p stsInfo2.bed /cluster/data/hg18/bed/sts/
cp -p stsAlias.bed /cluster/data/hg18/bed/sts/
# Create sts sequence alignments
mkdir /san/sanvol1/scratch/hg18/sts
mkdir /san/sanvol1/scratch/hg18/sts/split
faSplit sequence all.STS.fa 200 /san/sanvol1/scratch/hg18/sts/split/sts
cp -p all.STS.fa /san/sanvol1/scratch/hg18/sts
ssh pk
cd /cluster/data/hg18/bed/sts
mkdir run
cd run
ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst
ls -1S /san/sanvol1/scratch/hg18/sts/split/sts*.fa > sts.lst
mkdir /san/sanvol1/scratch/hg18/sts/out
foreach f (`cat sts.lst`)
set d = $f:t:r
mkdir /san/sanvol1/scratch/hg18/sts/out/$d
end
# create alignments
cat > template << '_EOF_'
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/hg18/11.ooc -stepSize=5 {check out line+ /san/sanvol1/scratch/hg18/sts/out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
# happy emacs
gensub2 contigs.lst sts.lst template jobList
para create jobList
# 70686 jobs
para try ... check ... push ... etc
# Completed: 70686 of 70686 jobs
# CPU time in finished jobs: 117490s 1958.16m 32.64h 1.36d 0.004 y
# IO & Wait Time: 195274s 3254.57m 54.24h 2.26d 0.006 y
# Average job time: 4s 0.07m 0.00h 0.00d
# Longest finished job: 97s 1.62m 0.03h 0.00d
# Submission to last job: 8085s 134.75m 2.25h 0.09d
# Compile sts sequence results
ssh pk
cd /san/sanvol1/scratch/hg18/sts
time pslSort dirs raw.psl temp out/sts*
# real 8m50.714s
# -rw-rw-r-- 1 810548945 Feb 3 14:19 raw.psl
# 70686 files in 187 dirs
# Got 70686 files 266 files per mid file
rm -rf temp
time pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons raw.psl \
stsMarkers.psl /dev/null
# Processed 7252745 alignments
# real 0m28.102s
# -rw-rw-r-- 1 10981952 Feb 3 14:26 stsMarkers.psl
cp -p stsMarkers.psl /cluster/data/hg18/bed/sts/run
# Lift them and get them ready to combine with primer alignments
liftUp -nohead stsMarkers.lifted.psl \
/cluster/data/hg18/jkStuff/liftContigs.lft \
warn stsMarkers.psl
/cluster/bin/scripts/extractPslInfo stsMarkers.lifted.psl
# creates stsMarkers.lifted.psl.initial
wc stsMarkers.lifted.psl.initial
# 93236 559416 4111801 stsMarkers.lifted.psl.initial
$HOME/kent/src/utils/findAccession.pl -agp stsMarkers.lifted.psl.initial \
/cluster/data/hg18
wc stsMarkers.lifted.psl.initial.acc
# 93236 652652 4947261 stsMarkers.lifted.psl.initial.acc
sort -k4,4n stsMarkers.lifted.psl.initial.acc > stsMarkers.final
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.found
wc -l stsMarkers.found
# 90676 stsMarkers.found
# out of 93698 total sequences
# (from wc /cluster/data/ncbi/sts.2006-01/all.STS.id)
# extract sequences for markers not yet found, and
# blat w/o ooc to try to place more
comm -1 -3 stsMarkers.found /cluster/data/ncbi/sts.2006-01/all.STS.id \
> stsMarkers.notFound
wc -l stsMarkers.notFound
# 3022 stsMarkers.notFound
faSomeRecords /san/sanvol1/scratch/hg18/sts/all.STS.fa stsMarkers.notFound \
notFound.STS.fa
mkdir /san/sanvol1/scratch/hg18/sts/splitNotFound
faSplit sequence notFound.STS.fa 20 \
/san/sanvol1/scratch/hg18/sts/splitNotFound/sts
# blat with 11.ooc misses alignments, so reblat w/o the
# sequences that aren't found
# NOTE: this filtering yields only 101 additional markers placed (out of
# 3022 attempted); not enough to justify this step next time
ssh pk
mkdir /cluster/data/hg18/bed/sts/run.noOoc
cd /cluster/data/hg18/bed/sts/run.noOoc
ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst
ls -1S /san/sanvol1/scratch/hg18/sts/splitNotFound/sts*.fa > sts.lst
mkdir /san/sanvol1/scratch/hg18/sts/out.noOoc
foreach f (`cat sts.lst`)
set d = $f:t:r
mkdir /san/sanvol1/scratch/hg18/sts/out.noOoc/$d
end
cat > template << '_EOF_'
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -stepSize=5 {check out line+ /san/sanvol1/scratch/hg18/sts/out.noOoc/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
# happy emacs
gensub2 contigs.lst sts.lst template jobList
para create jobList
# 7182 jobs written to batch
para try
para check
# process this set of alignments
cd /san/sanvol1/scratch/hg18/sts
pslSort dirs raw.noOoc.psl temp out.noOoc/*
# -rw-rw-r-- 1 459858612 Feb 3 15:56 raw.noOoc.psl
# Wow, that is almost half the size of the original raw.psl with
# everything in it.
rm -rf temp
pslReps -nearTop=0.0001 -minCover=0.6 -minAli=0.8 -noIntrons \
raw.noOoc.psl stsMarkers.noOoc.psl /dev/null
# Processed 4027664 alignments
# Lift them and get them ready to combine with primer alignments
liftUp -nohead stsMarkers.noOoc.lifted.psl \
/cluster/data/hg18/jkStuff/liftContigs.lft \
warn stsMarkers.noOoc.psl
/cluster/bin/scripts/extractPslInfo stsMarkers.noOoc.lifted.psl
# creates <file>.initial
$HOME/kent/src/utils/findAccession.pl -agp \
stsMarkers.noOoc.lifted.psl.initial /cluster/data/hg18
#rm stsMarkers.lifted.psl.initial
mv stsMarkers.final stsMarkers.ooc.final
sort -k4,4n stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.extra
sort -k4,4n stsMarkers.lifted.psl.initial.acc \
stsMarkers.noOoc.lifted.psl.initial.acc > stsMarkers.final
# determine found markers (4th field in file)
cut -f 4 stsMarkers.final | sort -n -u > stsMarkers.more.found
wc -l stsMarkers.more.found
# 90777 stsMarkers.more.found
cut -f 4 stsMarkers.extra | sort -n -u > stsMarkers.extra.found
wc -l stsMarkers.extra.found
# 101 out of 3022 attempted
# out of 93698 total sequences
cp -p stsMarkers.final stsMarkers.lifted.psl \
stsMarkers.*lifted.psl.initial* stsMarkers.found \
/cluster/data/hg18/bed/sts
# Alignments from noOoc set were not added to all_sts_seq but info for the
# markers is in stsMap and stsInfo2. Some of the alignments are bad so
# filter by removing all alignments from noOoc psl file where
# tBaseInsert >=1000. Add the remaining alignments to the set of final
# alignments for stsMarkers. The information for the removed markers
# from the filtered set was also removed from stsMap and stsInfo2.
ssh pk
mkdir /cluster/data/hg18/bed/sts/fix
cd /cluster/data/hg18/bed/sts/fix
cp /san/sanvol1/scratch/hg18/sts/stsMarkers.noOoc.lifted.psl .
awk '{if ($8 < 1000) print}' stsMarkers.noOoc.lifted.psl \
> stsMarkers.noOoc.lifted.filt1000.psl
wc -l *.filt*.psl
# 23 483 4206 stsMarkers.noOoc.lifted.filt1000.psl
sort -k4,4n \
/san/sanvol1/scratch/hg18/sts/stsMarkers.noOoc.lifted.psl.initial.acc \
> stsMarkers.extra
awk '{print $4}' stsMarkers.extra | sort -n | uniq > extra.ids
# in psl file, the ids are the 10th field
awk '{print $10}' stsMarkers.noOoc.lifted.psl | sort -n | uniq \
> noOoc.ids
diff extra.ids noOoc.ids
# there is no difference as expected
# get list of IDs from filtered file, filter < 1000
awk '{print $10}' stsMarkers.noOoc.lifted.filt1000.psl \
| sort -n | uniq > filt1000.ids
for i in `cat filt1000.ids`
do
awk 'BEGIN {OFS="\t"} \
{if ($4 == "'$i'") print $1, $2, $3, $4, $5, $6, $7}' \
stsMarkers.extra >> stsMarkers.extra.filt1000
done
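# The loop above rescans stsMarkers.extra once per id; an equivalent
# single-pass version for next time might be (untested sketch):
#   awk -v OFS='\t' 'NR==FNR {ids[$1]; next} ($4 in ids) {print $1,$2,$3,$4,$5,$6,$7}' \
#       filt1000.ids stsMarkers.extra > stsMarkers.extra.filt1000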
cp -p ../stsMarkers.final stsMarkers.final
# need to filter stsMarkers.final, not just cat this onto the end
# get list of alignments with tBaseInsert >= 1000 and remove these
cd /cluster/data/hg18/bed/sts/fix
awk '{if ($8 >= 1000) print;}' stsMarkers.noOoc.lifted.psl \
> stsMarkers.noOoc.lifted.filtToRemove.psl
wc -l *.filt*.psl
# 23 stsMarkers.noOoc.lifted.filt1000.psl
# 175 stsMarkers.noOoc.lifted.filtToRemove.psl
# get list of IDs that need to be removed
awk '{print $10;}' stsMarkers.noOoc.lifted.filtToRemove.psl | sort -n \
| uniq > noOoc.IdsToRemove.txt
# get chrom and co-ordinates for IDs to be removed
awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
stsMarkers.noOoc.lifted.filtToRemove.psl | sort | uniq \
> sts.noOoc.filtToRemove.coords
# checked that the stsMarkers.final contain the noOoc alignments
# use this perl script to remove lines with these IDs from stsMarkers.final
cat << '_EOF_' > removeIds.pl
#!/usr/bin/env perl
use warnings;
use strict;
my $ids = $ARGV[0];
my $file = $ARGV[1];
# list of IDs with chrom and coords to remove
open(IDS, $ids) || die "Can not open $ids: $!\n";
# file for removal of IDs
open(FILE, $file) || die "Can not open $file: $!\n";
open(OUT, ">removed.txt") || die "Can not create removed.txt: $!\n";
my %idsHash;
while (<IDS>) {
chomp;
my @a = split(/\t/);
my $chr = $a[0];
my $st = $a[1];
my $end = $a[2];
my $id = $a[3];
my $key = $id."_".$chr . "_" . $st . "_" . $end;
$idsHash{$key}->{chrom} = $chr;
$idsHash{$key}->{start} = $st;
$idsHash{$key}->{end} = $end;
}
close IDS;
while (<FILE>) {
chomp;
my $l = $_;
my $found = "FALSE";
my @f = split(/\t/, $l);
foreach my $k (keys(%idsHash)) {
# if the id is contained in the key
if ($k =~ /^$f[3]/) {
my $c = $idsHash{$k}->{chrom};
my $s = $idsHash{$k}->{start};
my $e = $idsHash{$k}->{end};
if ($f[0] eq $c && $f[1] == $s && $f[2] == $e) {
print OUT "$c\t$s\t$e\t$f[3]\n";
$found = "TRUE";
}
}
}
if ($found eq "FALSE") {
print "$l\n";
}
}
'_EOF_'
chmod +x removeIds.pl
./removeIds.pl sts.noOoc.filtToRemove.coords stsMarkers.final \
> stsMarkers.final.new
wc -l stsMarkers.final*
wc stsMarkers.final*
# 93434 654038 4957784 stsMarkers.final
# 93259 652813 4948484 stsMarkers.final.new
# There are 175 ids and sets of co-ordinates in list of Ids to remove
# 175 stsMarkers.noOoc.lifted.filtToRemove.psl
# check that stsMarkers.final.new contains all the alignments that
# are in filtered set: stsMarkers.noOoc.lifted.filt1000.psl
awk 'BEGIN {OFS = "\t"} {print $14,$16,$17,$10}' \
stsMarkers.noOoc.lifted.filt1000.psl | sort | uniq \
> sts.noOoc.filt1000.coords
awk 'BEGIN {OFS = "\t"} {print $1,$2,$3,$4}' \
stsMarkers.final.new | sort | uniq \
> sts.finalnew.coords
diff sts.finalnew.coords sts.noOoc.filt1000.coords > finalnewvsfilt1000
grep '>' finalnewvsfilt1000
# there is nothing in sts.noOoc.filt1000.coords not found in the
# sts.finalnew.coords file therefore this contains all the alignments
# from the filtered noOoc file.
cp ../primers/primers.final .
awk '{print $4}' stsMarkers.final.new | sort | uniq > stsfinal.new.ids
# primers
ssh eieio
cd /cluster/data/ncbi/sts.10
# strip out N's and wobbles (KS) from primers, as isPcr
# can't currently handle them
# strip out primers of length 10 or less, as isPcr can't handle them
awk '$0 !~ /[^ACGT0-9\-\t]/ && (length($2) > 10) && (length($3) > 10) {printf "dbSTS_%s\t%s\t%s\n", $1,$2,$3}' \
all.primers > all.primers.ispcr
mkdir -p /san/sanvol1/scratch/hg18/sts.10/primers
cd /san/sanvol1/scratch/hg18/sts.10/primers
split -l 4000 /cluster/data/ncbi/sts.10/all.primers.ispcr primers_
ssh pk
mkdir /cluster/data/hg18/bed/sts/primers
cd /cluster/data/hg18/bed/sts/primers
mkdir run
cd run
ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contigs.lst
ls -1S /san/sanvol1/scratch/hg18/sts.10/primers/primers_* > primers.lst
mkdir /san/sanvol1/scratch/hg18/sts.10/primers/out
cat > template << '_EOF_'
#LOOP
/cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 -ooc=/san/sanvol1/scratch/hg18/10ooc/$(root1).10.ooc -stepSize=5 $(path1) $(path2) {check out line /san/sanvol1/scratch/hg18/sts.10/primers/out/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# happy emacs
gensub2 contigs.lst primers.lst template jobList
para create jobList
# 29106 jobs
para try ... check ... push ... etc ...
# Completed: 29106 of 29106 jobs
# CPU time in finished jobs: 658245s 10970.76m 182.85h 7.62d 0.021 y
# IO & Wait Time: 82764s 1379.39m 22.99h 0.96d 0.003 y
# Average job time: 25s 0.42m 0.01h 0.00d
# Longest finished job: 534s 8.90m 0.15h 0.01d
# Submission to last job: 2282s 38.03m 0.63h 0.03d
# Filter output file quickly based on simple parameters
ssh pk
cd /san/sanvol1/scratch/hg18/sts.10/primers
mkdir filter
pslQuickFilter -minMatch=26 -maxMismatch=5 \
-maxTinsert=5000 -verbose out/ filter/
# Note: there will be many messages saying files are empty - this is OK
time pslSort dirs ../primers.psl.unlifted temp filter
# Got 29106 files 171 files per mid file
# real 3m31.401s
# filter primer alignments and create not found primer file for ePCR run
cd /san/sanvol1/scratch/hg18/sts.10
pslFilterPrimers primers.psl.unlifted \
/cluster/data/ncbi/sts.10/all.primers primers.filter.unlifted.psl
# creates primers.filter.unlifted.psl.notfound.primers
wc -l primers.filter.unlifted.psl.notfound.primers
# 22943 primers.filter.unlifted.psl.notfound.primers
# use Greg Schuler's ePCR to attempt alignment of primers missed
# by isPcr
ssh pk
mkdir /san/sanvol1/scratch/hg18/sts.10/epcr
mkdir /san/sanvol1/scratch/hg18/sts.10/epcr/out
cd /san/sanvol1/scratch/hg18/sts.10/epcr
split -l 3000 ../primers.filter.unlifted.psl.notfound.primers primers_
mkdir /cluster/data/hg18/bed/sts/primers/run.epcr
cd /cluster/data/hg18/bed/sts/primers/run.epcr
ls -1S /san/sanvol1/scratch/hg18/sts.10/epcr/primers_* > primers.lst
# These jobs are going to go quickly, make sure all I/O comes and
# goes from something that can handle it.
ls -1S /san/sanvol1/scratch/hg18/maskedContigs/*.fa > contig.lst
# This runEpcr64 script was made from the existing runEpcr script
# and from the looks of it, I doubt the original script works in
# the way this was set up here. It appears to be reading the
# second argument $(path2) line by line and sending that as
# arguments to e-PCR. That wouldn't be right here.
cat > template << '_EOF_'
#LOOP
/cluster/bin/scripts/runEpcr64 $(path1) $(path2) {check out line /san/sanvol1/scratch/hg18/sts.10/epcr/out/$(root1).$(root2).epcr}
#ENDLOOP
'_EOF_'
# << emacs
gensub2 primers.lst contig.lst template jobList
para create jobList
# 3420 jobs
para try ... check ... push ... etc ...
# Completed: 3024 of 3024 jobs
# CPU time in finished jobs: 31802s 530.04m 8.83h 0.37d 0.001 y
# IO & Wait Time: 12804s 213.40m 3.56h 0.15d 0.000 y
# Average job time: 15s 0.25m 0.00h 0.00d
# Longest finished job: 193s 3.22m 0.05h 0.00d
# Submission to last job: 372s 6.20m 0.10h 0.00d
# merge output
ssh pk
# cd /cluster/bluearc/hg17/sts/primers/epcr  # (leftover path from the hg17 doc)
cd /san/sanvol1/scratch/hg18/sts.10/epcr
cat out/*.epcr > all.epcr
wc -l all.epcr
# 3792
# should be on the fileserver (kkstore02) for the following heavy
# I/O operations. Didn't do that here, was on pk instead.
# use all.epcr file to re-filter alignemnts and determine which
# ePCR records to keep
cp all.epcr /cluster/data/hg18/bed/sts/primers
cd /cluster/data/hg18/bed/sts/primers
pslFilterPrimers -epcr=all.epcr -verbose=1 \
/san/sanvol1/scratch/hg18/sts.10/primers.psl.unlifted \
/cluster/data/ncbi/sts.10/all.primers primers.unlifted.epcr.psl
# creates three files:
# -rw-rw-r- 1 148528 Feb 6 10:39 epcr.not.found
# -rw-rw-r- 1 51632003 Feb 6 10:39 primers.unlifted.epcr.psl
# -rw-rw-r- 1 1189756 Feb 6 10:39 primers.unlifted.epcr.psl.notfound.primers
# convert to PSL and combine with other psl file
time /cluster/bin/scripts/epcrToHgPsl epcr.not.found \
/cluster/data/ncbi/sts.10/all.primers /cluster/data/hg18
# real 81m24.041s (on pk, may have been better on kkstore02
# where all of the data is)
cat primers.unlifted.epcr.psl epcr.not.found.psl \
| sort -k 10n > primers.final.unlifted.psl
wc -l primers.final.unlifted.psl
# 454869 primers.final.unlifted.psl
# should have been on kkstore02 already
ssh kkstore02
cd /cluster/data/hg18/bed/sts/primers
# Fix the query gap lengths so that they match the all.primers.fa
# file lengths
time /cluster/bin/scripts/fixPrimersQueryGaps \
/cluster/data/ncbi/sts.10/all.primers primers.final.unlifted.psl \
> primers.final.unlifted.fix.psl
# real 0m19.814s
wc -l primers.final.unlifted.fix.psl
# 454869 primers.final.unlifted.fix.psl
# lift results from contigs to chrom coordinates, and create final file
time liftUp -nohead primers.psl \
/cluster/data/hg18/jkStuff/liftContigs.lft warn \
primers.final.unlifted.fix.psl
# real 0m2.897s
wc -l primers.psl
# 454869 primers.psl
# Extract relevant info, make alignments unique, and create final file to
# be merged with full sequence alignments
time /cluster/bin/scripts/extractPslInfo primers.psl
# real 0m15.303s
wc -l primers.psl.initial
# 451023 primers.psl.initial
$HOME/kent/src/utils/findAccession.pl -agp primers.psl.initial \
/cluster/data/hg18
wc -l primers.psl.initial.acc
# 451023 primers.psl.initial.acc
/cluster/bin/scripts/getStsId /cluster/data/hg18/bed/sts/stsInfo2.bed \
primers.psl.initial.acc \
| sort -k 4n > primers.final
#rm primers.psl.initial.acc
wc -l primers.final
# 451023 primers.final
# There doesn't appear to be any use for this primers.ids list
# except for curiosity. Check the head and tail of this list to
# verify no garbage is in here. There should just be numbers.
awk '{print $4}' primers.final | sort -n | uniq > primers.ids
wc -l primers.ids
# 287465 primers.ids
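# The head/tail inspection mentioned above, plus a catch-all for any
# non-numeric ids (sketch):
#   head primers.ids ; tail primers.ids
#   awk '$1 !~ /^[0-9]+$/' primers.ids | head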
# Merge primer and sequence files to create final bed file
# Merge (combineSeqPrimerPos) takes about an hour to run
ssh kkstore02
cd /cluster/data/hg18/bed/sts
time /cluster/bin/scripts/combineSeqPrimerPos stsMarkers.final \
primers/primers.final
# real 55m33.254s
wc -l stsMarkers_pos.rdb
# 307082 stsMarkers_pos.rdb
time /cluster/bin/scripts/createSTSbed \
/cluster/data/ncbi/sts.10/stsInfo2.bed stsMarkers_pos.rdb > stsMap.bed
# real 0m13.351s
wc -l stsMap.bed
# 300492 stsMap.bed
# Set up sequence files
ssh hgwdev
mkdir /gbdb/hg18/sts.10/
ln -s /cluster/data/ncbi/sts.10/all.STS.fa /gbdb/hg18/sts.10/all.STS.fa
ln -s /cluster/data/ncbi/sts.10/all.primers.fa \
/gbdb/hg18/sts.10/all.primers.fa
# Load all files
cd /cluster/data/hg18/bed/sts
hgLoadSeq hg18 /gbdb/hg18/sts.10/all.STS.fa /gbdb/hg18/sts.10/all.primers.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/hg18/sts.10/all.STS.fa
# 93698 sequences
# Adding /gbdb/hg18/sts.10/all.primers.fa
# 306885 sequences
# Updating seq table
# Advisory lock has been released
# All done
# real 1m25.459s
hgsql hg18 < $HOME/kent/src/hg/lib/stsInfo2.sql
hgsql hg18 < $HOME/kent/src/hg/lib/stsAlias.sql
# these two files are already here from previous operations above
# cp /cluster/data/ncbi/sts.10/{stsInfo2.bed,stsAlias.bed} .
hgsql hg18 -e 'load data local infile "stsInfo2.bed" into table stsInfo2'
hgsql hg18 -e 'load data local infile "stsAlias.bed" into table stsAlias'
# a couple minutes for each load above
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/stsMap.sql \
hg18 stsMap stsMap.bed
hgLoadPsl -nobin -table=all_sts_primer hg18 primers/primers.psl
# load of all_sts_primer did not go as planned: 454869 record(s),
# 0 row(s) skipped, 10 warning(s) loading primers/primers.psl
hgLoadPsl -nobin -table=all_sts_seq hg18 stsMarkers.lifted.psl
# PRUNE stsMap RECORDS (DONE 3/3/06)
hgsql hg18 -e 'delete from stsMap where chromEnd-chromStart > 5000'
###########################################################################
# CREATE HAPLOTYPEPOS TRACK (DONE 1/31/06, Fan)
ssh kkstore02
cd /cluster/data/hg18/bed
mkdir haplotypePos
cd haplotypePos
cp /cluster/data/hg18/*hap*/*.fa . -p
ls *.fa|sed -e 's/chr/split1 chr/' |sed -e 's/.fa//' >splitAll
cat << '_EOF_' > split1
echo processing $1
faSplit2 -lift=$1.lft -overlap=500 size $1.fa 3500 split/$1
'_EOF_'
chmod +x split*
mkdir split
mkdir result
./splitAll
ls ./split/*.fa > split.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/store11/gs.19/build36/bed/haplotypePos/hblat1 $(file1) {check out line+ /cluster/store11/gs.19/build36/bed/haplotypePos/result/$(root1).psl}
#ENDLOOP
'_EOF_'
gensub2 split.lst single gsub jobList
ssh pk
cd /cluster/data/hg18/bed/haplotypePos
mkdir result
para create jobList
para try, push, check ...
# Completed: 3091 of 3092 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 33164s 552.73m 9.21h 0.38d 0.001 y
# IO & Wait Time: 172783s 2879.72m 48.00h 2.00d 0.005 y
# Average job time: 67s 1.11m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 300s 5.00m 0.08h 0.00d
# Submission to last job: 743s 12.38m 0.21h 0.01d
# The single job that crashed was due to chr5_h2_hap1368.fa, which
# does not have a decent alignment on chr5.
# collect BLAT results
cat result/*.psl >all.psl
# keep the main alignments
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 all.psl all_filtered.psl all.psr
cat chr*.lft > hap.lft
liftUp lifted.psl hap.lft warn all_filtered.psl -pslQ
mkdir tNibs qNibs
cp -p /cluster/data/hg18/nib/*hap*.nib qNibs
cp -p /cluster/data/hg18/nib/chr5.nib tNibs
cp -p /cluster/data/hg18/nib/chr6.nib tNibs
cp -p /cluster/data/hg18/nib/chr22.nib tNibs
axtChain -psl -linearGap=medium lifted.psl tNibs qNibs out.chain
chainAntiRepeat tNibs qNibs out.chain final.chain
cat << '_EOF_' > hap.chrom.lis
/cluster/data/hg18/nib/chr5.nib
/cluster/data/hg18/nib/chr6.nib
/cluster/data/hg18/nib/chr22.nib
'_EOF_'
ls *.fa >q.lis
chainToPsl final.chain /cluster/data/hg18/chrom.sizes \
/cluster/data/hg18/chrom.sizes hap.chrom.lis q.lis haplotypePos.psl
# took about 20 minutes
hgLoadPsl hg18 haplotypePos.psl
# add haplotypePos entry in trackDb.ra
###########################################################################
# LOAD AFFYRATIO (DONE - 2006-02-01 - Fan)
# Copied from Hg17 doc
# NOTE: Jim recommends that, in the future, all AFFY blat alignments should drop
# -mask=lower for blat and drop -minIdentity=95 to -minIdentity=90 as the
# higher minIdentity is causing alignments to be dropped that should not be.
# e.g.
# /cluster/bin/i386/blat -fine -minIdentity=90 -ooc=/cluster/bluearc/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
# pslReps can be used to handle filtering at a later step. Blat's minIdentity
# seems to be more severe than that for pslReps as it takes insertions and
# deletions into account.
#
# NOTE FROM QA (brooke, 8/28/07): In the future, run hgLoadBed without the
# -sqlTable=$HOME/src/hg/lib/affyRatio.sql option, so that tableDescriptions
# will be built properly. affyRatio.sql was needed before Jim added bed15
# capability to hgLoadBed (in Oct. 2003), but now bed15 tables can use the
# default bedExp.as and bedExp.sql files.
#
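# If that advice is followed on the next build, the load would presumably be
# just the plain invocation, relying on hgLoadBed's bed15 support (sketch,
# not run here; check current hgLoadBed options before using):
#   hgLoadBed hg18 affyRatio affyRatio.bed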
# Set up cluster job to align consensus/exemplars to hg18
ssh kkstore02
mkdir /cluster/bluearc/hg18/affyGnf
cp -p /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa \
/cluster/bluearc/hg18/affyGnf
ssh kkr1u00
mkdir -p /iscratch/i/affyGnf
cp -p /cluster/bluearc/hg18/affyGnf/* /iscratch/i/affyGnf
/cluster/bin/iSync
ssh kki
mkdir /cluster/data/hg18/bed/affyGnf.2004-06-09
cd /cluster/data/hg18/bed/affyGnf.2004-06-09
ls -1 /iscratch/i/affyGnf/* > affy.lst
ls -1 /iscratch/i/gs.19/build36/maskedContigs/* > allctg.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 allctg.lst affy.lst template.sub jobList
mkdir psl
para create jobList
para try, push, check
# Completed: 378 of 378 jobs
# CPU time in finished jobs: 3055s 50.91m 0.85h 0.04d 0.000 y
# IO & Wait Time: 1267s 21.12m 0.35h 0.01d 0.000 y
# Average job time: 11s 0.19m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 78s 1.30m 0.02h 0.00d
# Submission to last job: 367s 6.12m 0.10h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU95.psl
ssh kkstore02
cd /cluster/data/hg18/bed/affyGnf.2004-06-09
pslSort dirs raw.psl tmp psl
# Change filter parameters for these sequences: only use alignments that
# cover 30% of the sequence and have at least 95% identity in the aligned
# region. (minAli=0.97 was too high; minCover is kept low because these
# sequences contain a lot of N's.)
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl
# Eliminate the long names
sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl
# Merge with spot data and load into database. added -chip flag to
# affyPslAndAtlasToBed to allow correct parsing
ssh hgwdev
cd /cluster/data/hg18/bed/affyGnf.2004-06-09
bash
/cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 \
affyU95shortQname.psl \
/projects/compbio/data/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt \
affyRatio.bed affyRatio.exr > affyPslAndAtlasToBed.log 2>&1
hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg18 \
affyRatio affyRatio.bed
# Loaded 13043 elements of size 15
mkdir affyU95
hgLoadPsl hg18 -table=affyU95 affyU95shortQname.psl
# sequences loaded 2006-02-01
hgLoadSeq -abbr=U95Av2: hg18 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
# Advisory lock created
# Creating .tab file
# Adding /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
# 12386 sequences
# Updating seq table
# Advisory lock has been released
# All done
# Load AFFYUCLANORM, extended version of affyUcla track. Hopefully
# final freeze of data set. (DONE - 2006-02-01 - Fan)
ssh hgwdev
mkdir /cluster/data/hg18/bed/affyUclaNorm
cd /cluster/data/hg18/bed/affyUclaNorm
cp -p /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa .
ssh pk
cd /cluster/data/hg18/bed/affyUclaNorm
ls -1 /scratch/hg/gs.19/build36/maskedContigs/* > contig.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
mkdir psl
ls HG-U133AB_all.fa > affy.lst
gensub2 contig.lst affy.lst gsub jobList
para create jobList
para try
para check
para push ... etc
# Completed: 378 of 378 jobs
# CPU time in finished jobs: 6766s 112.77m 1.88h 0.08d 0.000 y
# IO & Wait Time: 1541s 25.68m 0.43h 0.02d 0.000 y
# Average job time: 22s 0.37m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 202s 3.37m 0.06h 0.00d
# Submission to last job: 302s 5.03m 0.08h 0.00d
ssh kkstore02
cd /cluster/data/hg18/bed/affyUclaNorm
pslSort dirs hg18.affyU133AB_all.psl tmp psl
wc hg18.affyU133AB_all.psl
# 62043 1302842 13163424 hg18.affyU133AB_all.psl
liftUp hg18.affyU133AB_all.lifted.psl \
/cluster/data/hg18/jkStuff/liftAll.lft warn hg18.affyU133AB_all.psl
pslReps -minCover=0.5 -sizeMatters -minAli=0.97 \
-nearTop=0.005 hg18.affyU133AB_all.lifted.psl \
hg18.affyU133AB_all.lifted.pslReps.psl out.psr
# Processed 62038 alignments
~/kent/src/hg/affyGnf/affyUclaMergePslData \
-pslFile=hg18.affyU133AB_all.lifted.pslReps.psl \
-affyFile=/projects/compbio/data/microarray/affyUcla/data/feature_biomaterial_chip_logratios_formatForTrack.txt \
-bedOut=hg18.affyUcla.bed \
-expRecordOut=hg18.affyUcla.expRecords \
-expFile=/projects/compbio/data/microarray/affyUcla/data/expNames.sorted.txt
~/kent/src/hg/affyGnf/addUclaAnnotations.pl hg18.affyUcla.expRecords \
/projects/compbio/data/microarray/affyUcla/data/normal_tissue_database_annotations2.txt > hg18.affyUcla.annotations.expRecords
# Load the databases
ssh hgwdev
cd /cluster/data/hg18/bed/affyUclaNorm
sed -e 's/affyRatio/affyUclaNorm/' ~/kent/src/hg/lib/affyRatio.sql > affyUclaNorm.sql
hgLoadBed hg18 affyUclaNorm hg18.affyUcla.bed -sqlTable=affyUclaNorm.sql
############################################################################
# MAKE AFFY U133 - made after above affyUclaNorm (DONE - 2006-02-01 - Fan)
# Someday the names can be fixed.
ssh hgwdev
mkdir /cluster/data/hg18/bed/affyU133
cd /cluster/data/hg18/bed/affyU133
ln -s ../affyUclaNorm/hg18.affyU133AB_all.lifted.pslReps.psl affyU133.psl
hgLoadPsl hg18 affyU133.psl
hgsql -e "select count(*) from affyU133;" hg18
# row count in hg17: 44620, in hg18: 45559
hgLoadSeq hg18 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa
# 44792 sequences
# GNF ATLAS 2 (DONE - 2006-02-01 - Fan)
# Align probes from GNF1H chip.
ssh pk
cd /cluster/data/hg18/bed
mkdir -p geneAtlas2/run/psl
cd geneAtlas2/run
# This bluearc/geneAtlas2 directory already exists
# mkdir -p /cluster/bluearc/geneAtlas2
# cp /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /cluster/bluearc/geneAtlas2
ls -1 /scratch/hg/gs.19/build36/maskedContigs > genome.lst
ls -1 /cluster/bluearc/geneAtlas2/gnf1h.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -fine -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.19/build36/maskedContigs/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst mrna.lst gsub jobList
para create jobList
para try
para check
para push
para time
# Completed: 378 of 378 jobs
# CPU time in finished jobs: 4038s 67.29m 1.12h 0.05d 0.000 y
# IO & Wait Time: 2182s 36.37m 0.61h 0.03d 0.000 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 250s 4.17m 0.07h 0.00d
# Submission to last job: 322s 5.37m 0.09h 0.00d
# Estimated complete: 0s 0.00m 0.00h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create gnf1h.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
# Processed 79733 alignments
liftUp ../affyGnf1h.psl ../../../jkStuff/liftAll.lft warn contig.psl
rm -r contig.psl raw.psl psl
# Load probes and alignments from GNF1H into database.
ssh hgwdev
cd /cluster/data/hg18/bed/geneAtlas2
# Already symlinked
# ln -s /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa \
# /gbdb/hgFixed/affyProbes
hgLoadPsl hg18 affyGnf1h.psl
hgLoadSeq hg18 /gbdb/hgFixed/affyProbes/gnf1h.fa
grep -v U133B ../affyUclaNorm/hg18.affyU133AB_all.lifted.pslReps.psl \
| sed -e "s/exemplar://; s/consensus://; s/U133A://" \
| sed -e "s/;//" > affyU133A.psl
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
affyU133A.psl /cluster/data/hg18/bed/geneAtlas2/affyGnf1h.psl
# Loaded 44696 rows of expression data from hgFixed.gnfHumanAtlas2MedianRatio
# Mapped 32926, multiply-mapped 2000, missed 48, unmapped 11770
hgLoadBed hg18 gnfAtlas2 gnfAtlas2.bed
# Loaded 34926 elements of size 15
########################################################################
# Creating the ideoband data track (DONE - 2006-02-02 - Hiram)
# This was reloaded upon completion of the cytoband sequence
# mentioned above.
# Received the following files in email from Wonhee Jang from NCBI:
# -rw-rw-r-- 1 1917 Feb 2 14:01 setBands.txt
# -rw-rw-r-- 1 39058 Feb 2 14:01 human_ideogram.dat
# -rw-rw-r-- 1 673148 Feb 2 14:01 fish.markers.bed
# placed them into /cluster/data/hg18/bed/ideogram
ssh hgwdev
mkdir /cluster/data/hg18/bed/ideogram
cd /cluster/data/hg18/bed/ideogram
cat << '_EOF_' > mkBands.sh
#!/bin/sh
T=/cluster/data/hg18/bed/ideogram
HI=${T}/human_ideogram.dat
FM=${T}/fish.markers.bed
SB=${T}/setBands.txt
bander chr1 ${HI} ${FM} ${SB} 1 247199719 100 2.0 2
bander chr2 ${HI} ${FM} ${SB} 2 242751149 100 2.0 2
bander chr3 ${HI} ${FM} ${SB} 3 199446827 100 2.0 2
bander chr4 ${HI} ${FM} ${SB} 4 191263063 100 2.0 2
bander chr5 ${HI} ${FM} ${SB} 5 180837866 100 2.0 2
bander chr6 ${HI} ${FM} ${SB} 6 170896992 100 2.0 2
bander chr7 ${HI} ${FM} ${SB} 7 158821424 100 2.0 2
bander chr8 ${HI} ${FM} ${SB} 8 146274826 100 2.0 2
bander chr9 ${HI} ${FM} ${SB} 9 140273252 100 2.0 2
bander chr10 ${HI} ${FM} ${SB} 10 135374737 100 2.0 2
bander chr11 ${HI} ${FM} ${SB} 11 134452384 100 2.0 2
bander chr12 ${HI} ${FM} ${SB} 12 132289534 100 2.0 2
bander chr13 ${HI} ${FM} ${SB} 13 114127980 100 2.0 2
bander chr14 ${HI} ${FM} ${SB} 14 106360585 100 2.0 2
bander chr15 ${HI} ${FM} ${SB} 15 100338915 100 2.0 2
bander chr16 ${HI} ${FM} ${SB} 16 88822254 100 2.0 2
bander chr17 ${HI} ${FM} ${SB} 17 78654742 100 2.0 2
bander chr18 ${HI} ${FM} ${SB} 18 76117153 100 2.0 2
bander chr19 ${HI} ${FM} ${SB} 19 63806651 100 2.0 2
bander chr20 ${HI} ${FM} ${SB} 20 62435964 100 2.0 2
bander chr21 ${HI} ${FM} ${SB} 21 46944323 100 2.0 2
bander chr22 ${HI} ${FM} ${SB} 22 49591432 100 2.0 2
bander chrX ${HI} ${FM} ${SB} X 154913754 100 2.0 2
bander chrY ${HI} ${FM} ${SB} Y 57443437 100 2.0 2
for I in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
cat chr${I}.bed
done > cytobands.bed
'_EOF_'
# happy emacs
chmod +x mkBands.sh
./mkBands.sh
# should be 862
wc cytobands.bed
# 862 4310 29911 cytobands.bed
hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBand.sql \
hg18 cytoBand cytobands.bed
hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBandIdeo.sql \
hg18 cytoBandIdeo cytobands.bed
############################################################################
# H-INVITATIONAL GENE ANNOTATION DATABASE (DONE 2006-02-02, Fan)
# http://www.jbirc.aist.go.jp/hinv/top.html
# Create knownGene table to reference HINV gene ID's
# for link on knownGenes details page
# Also, create an HINV gene track
# download CDNA file release 2.2 (Jan 20, 2006) -- got release # from downloads page.
ssh kkstore03
cd /cluster/data/hinv
mkdir 2005-02-02
cd 2005-02-02
wget --timestamp http://www.jbirc.aist.go.jp/hinv/download/alldata/flatfile/FCDNA.gz
gunzip FCDNA.gz
mv FCDNA FCDNA.2.2
# set up assembly work area
ssh kkstore02
cd /cluster/data/hg18
mkdir -p bed/hinv
cd bed/hinv
# extract H-INV ID's and Genbank accessions of mRNAs
awk '/CDNA_ACCESSION-NO:/ {print $2}' < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > accessions.txt
awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}' < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > ids.txt
paste accessions.txt ids.txt > queries.txt
wc -l ids.txt
# 56419 ids.txt
# create PSL file from alignments for these mRNA's, extracted from the
# table of all aligned mRNA's
ssh hgwdev
cd /cluster/data/hg18/bed/hinv
hgsql hg18 -s -e "SELECT * FROM all_mrna" | cut -f 2- > all_mrna.tab
ssh kkstore02
cd /cluster/data/hg18/bed/hinv
pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl
# using pslReps to generate the PSL file header
pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl
# NEXT TIME, LOAD HInvGeneMrna TABLE AFTER HInv TABLE IS LOADED TO AVOID
# joinerCheck TO COMPLAIN.
# load track of mrna alignments
ssh hgwdev
cd /cluster/data/hg18/bed/hinv
hgLoadPsl hg18 -table=HInvGeneMrna hinv_mrna.psl
hgsql hg18 -s -e \
"select distinct(qName) from HInvGeneMrna order by qName" > hg18.mrna
hgsql hg17 -s -e \
"select distinct(qName) from HInvGeneMrna order by qName" > hg17.mrna
wc -l hg*.mrna
# 41023 hg17.mrna
# 54974 hg18.mrna
comm -1 -3 *.mrna > hg18.aligned
wc -l hg18.aligned
# 14758 (transcripts newly aligned in hg18)
comm -2 -3 *.mrna > hg17.aligned
wc -l hg17.aligned
# 807 (transcripts no longer aligned in hg18)
comm -2 -3 ids.txt hg18.mrna > hg18.notaligned
wc -l hg18.notaligned
# 1445 (transcripts not aligned in hg18 -- checking on why...)
# also make a table with various useful items for each transcript
ssh hgwdev
hgsql hg18 < ~/kent/src/hg/lib/HInv.sql
cd /cluster/data/hg18/bed/hinv
/cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/2005-02-02/FCDNA.2.2 > HInv.tab
echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg18
hgsql hg17 -s -e "select count(*) from HInv"
# 41118
hgsql hg18 -s -e "select count(*) from HInv"
# 56419
# !!! DO THIS AFTER KG IS BUILT !!!
# DONE (4/13/06 Fan).
# create table for knownGenes detail page
ssh hgwdev
cd /cluster/data/hg18/bed/hinv
hgMapToGene hg18 HInvGeneMrna knownGene knownToHInv
# QA NOTE (3-6-2006): did a mytouch to update the time for the HInvGeneMrna table
# (because joinerCheck was complaining during -times check):
# sudo mytouch hg18 HInvGeneMrna 200602031600.00
# touch -t 200602031600.00 /var/lib/mysql/hg18/HInvGeneMrna.MYD
# PRODUCE FUGU BLAT ALIGNMENT (DONE - 2006-02-02 - Fan)
ssh kk
mkdir /cluster/data/hg18/bed/blatFr1
cd /cluster/data/hg18/bed/blatFr1
mkdir psl
# next time, use N?_?????? (to pick up NG_ contigs)
foreach f ( `cat /cluster/data/hg18/contig.lst` )
set c=$f:t:r
echo $c
mkdir psl/$c
end
# create cluster job
mkdir run
cd run
ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
ls -1S /scratch/hg/gs.19/build36/maskedContigs/*.fa > human.lst
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/hg18/bed/blatFr1/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << keep emacs happy
gensub2 human.lst fugu.lst gsub jobList
para create jobList
# 218484 jobs written to batch
para try
para check
para push -maxQueue=300000 -maxPush=220000
para check
# Completed: 218484 of 218484 jobs
# CPU time in finished jobs: 5073329s 84555.48m 1409.26h 58.72d 0.161 y
# IO & Wait Time: 692572s 11542.87m 192.38h 8.02d 0.022 y
# Average job time: 26s 0.44m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 910s 15.17m 0.25h 0.01d
# Submission to last job: 14753s 245.88m 4.10h 0.17d
# cd ../psl
# count files with aligments
# find . -not -size 427c | wc -l
# 44458
# count files with no aligments
# find . -size 427c | wc -l
# 174405
# When cluster run is done, sort alignments
# into chrom directory
ssh kkstore02
cd /cluster/data/hg18/bed/blatFr1
pslCat -dir psl/N?_?????? | \
liftUp -type=.psl stdout \
/cluster/data/hg18/jkStuff/liftAll.lft warn stdin | \
pslSortAcc nohead chrom temp stdin
# Processed 218887 lines into 1 temp files
# Rename to correspond with tables and load into database:
ssh hgwdev
cd /cluster/data/hg18/bed/blatFr1/chrom
foreach i (chr*.psl)
set r = $i:r
echo mv $i ${r}_blatFr1.psl
mv $i ${r}_blatFr1.psl
end
# lift fugu scaffolds to Fugu browser chrUn,
# so you can link to other browser. And don't need to load sequence
cd /cluster/data/hg18/bed/blatFr1
liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl
hgLoadPsl -table=blatFr1 hg18 all.psl
nice featureBits hg18 blatFr1 refGene:CDS
# 14636876 bases of 2881515245 (0.508%) in intersection
nice featureBits hg17 blatFr1 refGene:CDS
# 14488047 bases of 2866216770 (0.505%) in intersection
#######################################################################
# OPOSSUM BLASTZ - (DONE - 2006-02-10 - Hiram)
ssh kk
# this was done again after this, see 2006-02-13
mkdir /cluster/data/hg18/bed/blastzMonDom4.2006-02-10
cd /cluster/data/hg18/bed/blastzMonDom4.2006-02-10
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg18)
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
SEQ2_DIR=/iscratch/i/monDom4/monDom4.2bit
SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzMonDom4.2006-02-10
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-stop=net `pwd`/DEF > blastz.out 2>&1 &
# running 2006-02-10
# Completed: 43469 of 43470 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 25745592s 429093.20m 7151.55h 297.98d 0.816 y
# IO & Wait Time: 8466642s 141110.70m 2351.85h 97.99d 0.268 y
# Average job time: 787s 13.12m 0.22h 0.01d
# Longest finished job: 51561s 859.35m 14.32h 0.60d
# Submission to last job: 103470s 1724.50m 28.74h 1.20d
# There wasn't actually an outstanding job; it had already completed.
# Completed: 345 of 345 jobs
# CPU time in finished jobs: 620s 10.33m 0.17h 0.01d 0.000 y
# IO & Wait Time: 1631s 27.19m 0.45h 0.02d 0.000 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest finished job: 69s 1.15m 0.02h 0.00d
# Submission to last job: 255s 4.25m 0.07h 0.00d
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 224697s 3744.94m 62.42h 2.60d 0.007 y
# IO & Wait Time: 4790s 79.84m 1.33h 0.06d 0.000 y
# Average job time: 4683s 78.06m 1.30h 0.05d
# Longest finished job: 115041s 1917.35m 31.96h 1.33d
# Submission to last job: 115147s 1919.12m 31.99h 1.33d
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=cat -stop=net `pwd`/DEF > cat-net.out 2>&1 &
# running 2006-02-11
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=load -stop=load `pwd`/DEF > load.out 2>&1 &
ssh kolossus
cd /cluster/data/hg18/bed/blastz.monDom4
time nice -n +19 featureBits hg18 chainMonDom4Link \
> fb.hg18.chainMonDom4Link 2>&1 &
cat fb.hg18.chainMonDom4Link
# 356865888 bases of 2881515245 (12.385%) in intersection
####################################################################################
# BUILD KNOWN GENES TABLES (STARTED 2/1/06, DONE 2/13/06 Fan)
# First build protein databases, sp060115 and proteins060115
# See makeProteins060115.doc for details.
# Create working subdirectories and temporary databases (kgHg18A)
ssh hgwdev
cd /cluster/store11/kg
mkdir kgHg18A
ln -s /cluster/store11/kg/kgHg18A /cluster/store6/kgDB/bed/kgHg18A
ln -s /cluster/store11/kg/kgHg18A /cluster/data/hg18/bed/kgHg18A
hgsql hg18 -e "create database kgHg18A"
hgsql hg18 -e "create database kgHg18ATemp"
mkdir /cluster/bluearc/kgDB/kgHg18A
mkdir /cluster/bluearc/kgDB/kgHg18A/protBlat
ln -s /cluster/bluearc/kgDB/kgHg18A/protBlat /cluster/store11/kg/kgHg18A/protBlat
cd /cluster/store11/kg/kgHg18A/protBlat
# Get all human protein sequences
hgsql -N sp060115 -e \
'select p.acc, p.val from protein p, accToTaxon x where x.taxon=9606 and p.acc=x.acc'\
|awk '{print ">" $1;print $2}' >humanProt.fa
hgsql -N sp060115 -e \
'select v.varAcc, p.val from varAcc v, protein p, accToTaxon x where v.parAcc = p.acc and x.taxon=9606 and v.parAcc=x.acc'\
|awk '{print ">" $1;print $2}' \
>humanVarProt.fa
# append var proteins to humanProt.fa
cat humanVarProt.fa >>humanProt.fa
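# (Optional sanity check, not part of the original build: count the protein
# records in the combined fasta before splitting it for the cluster run.)
grep -c '^>' humanProt.fa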
# Prepare and perform cluster run for protein/genome alignment
ssh pk
cd /cluster/data/hg18/bed/kgHg18A/protBlat
mkdir prot
faSplit sequence humanProt.fa 2000 prot/prot
ls /cluster/bluearc/kgDB/kgHg18A/protBlat/prot/* > prot.lis
ssh hgwdev
cd /cluster/data/hg18/bed/kgHg18A/protBlat
hgsql hg18 -N -e 'select chrom from chromInfo' > chrom.lis
exit
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat -t=dnax -q=prot /cluster/data/hg18/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgHg18A/protBlat/result/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
mkdir result
gensub2 chrom.lis prot.lis gsub jobList
para create jobList
para try
para check
para push
para check ...
# Completed: 97020 of 97020 jobs
# CPU time in finished jobs: 16070335s 267838.92m 4463.98h 186.00d 0.510 y
# IO & Wait Time: 279789s 4663.15m 77.72h 3.24d 0.009 y
# Average job time: 169s 2.81m 0.05h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 152051s 2534.18m 42.24h 1.76d
# Submission to last job: 152235s 2537.25m 42.29h 1.76d
# This cluster run took a little less than 2 days.
# collect BLAT results
pslSort -nohead dirs raw.psl temp result
pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 raw.psl protBlat.psl /dev/null
ssh hgwdev
cd /cluster/bluearc/kgDB/kgHg18A/protBlat
hgLoadPsl hg18 protBlat.psl
# create all_mrna.psl and tight_mrna.psl
hgsql hg18 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl
pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \
all_mrna.psl tight_mrna.psl /dev/null
# Save a copy of the following hg18 tables:
all_mrna
gbCdnaInfo
gbExtFile
gbLoaded
gbSeq
gbStatus
genbank.lis
refFlat
refGene
refLink
refSeqAli
refSeqStatus
refSeqSummary
xenoMrna
xenoRefFlat
xenoRefGene
xenoRefSeqAli
# Use overlapSelect to get protein and mRNA alignment overlaps
overlapSelect -statsOutput -dropped=protOut.psl -overlapThreshold=0.90 \
-selectFmt=psl -inFmt=psl tight_mrna.psl protBlat.psl protMrna.stat
overlapSelect -mergeOutput -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
-inFmt=psl tight_mrna.psl protBlat.psl protMrna.out
# Create protein/mRNA pair and protein lists
cut -f 10,31 protMrna.out|sort -u >spMrna.tab
cut -f 10 protMrna.out|sort -u >protein.lis
mv protein.lis ..
# Load spMrna.tab into spMrna table in temp DB.
hgsql kgHg18ATemp < ~/src/hg/lib/spMrna.sql
hgsql kgHg18ATemp -e 'load data local infile "spMrna.tab" into table spMrna'
hgsql kgHg18ATemp -e 'create index mrnaID on spMrna(mrnaID)'
# Prepare and perform cluster run of protein/mRNA alignment
# Get mRNA fa file.
cd /cluster/data/hg18/bed/kgHg18A
/cluster/data/genbank/bin/i386/gbGetSeqs -native -db=hg18 \
-gbRoot=/cluster/data/genbank genbank mrna mrna.fa
# Create mrnaSeq table in kgHg18ATemp DB.
faToTab mrna.fa mrnaSeq.tab
hgsql kgHg18ATemp -e 'drop table mrnaSeq'
hgsql kgHg18ATemp <~/src/hg/lib/mrnaSeq.sql
hgsql kgHg18ATemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'
# Prepare files for cluster run
cd /cluster/bluearc/kgDB/kgHg18A
~/src/hg/protein/KG2B.sh kgHg18A hg18 060115
# Perform cluster run of protein/mRNA alignment
~/src/hg/protein/KG3.sh kgHg18A hg18 060115
# Collect cluster run results
cd kgBestMrna
ls out | sed -e 's/prot/do1 prot/g' >doall
# create do1 with the following 2 lines:
cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protMrnaRaw.psl
'_EOF_'
chmod +x do*
doall
# Filter out low quality alignments
pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
wc protMrna.lis
# Load BLAT results into temp DB.
ssh hgwdev
cd /cluster/store11/kg/kgHg18A/kgBestMrna
hgsql kgHg18ATemp < ~/src/hg/lib/protMrnaBlat.sql
hgsql kgHg18ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
hgsql kgHg18ATemp -e 'create index tName on protMrnaBlat(tName)'
# Create CDS files from protein/mRNA alignment results.
hgsql kgHg18ATemp -N -e \
'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
|sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds
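# (Illustrative, not part of the original build: after the sed edits, each
# line of protMrna.cds should look roughly like
#   P12345_NM_012345<tab>101..456
# i.e. a protein_mRNA composite ID plus a GenBank-style CDS span, which is
# the form mrnaToGene -cdsFile expects below. The accessions shown are made up.)
head -3 protMrna.cds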
# Create protMrna.psl with proteinID_mrnaID as query ID.
cut -f 22-30 ../protBlat/protMrna.out > j1.tmp
cut -f 32-42 ../protBlat/protMrna.out > j2.tmp
cut -f 10,31 ../protBlat/protMrna.out|sed -e 's/\t/_/g' >j3.tmp
paste j1.tmp j3.tmp j2.tmp >protMrna.psl
rm j1.tmp j2.tmp j3.tmp
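# (Optional check, not part of the original build: a PSL row has 21 columns
# (9 + 1 + 11 from the three pasted pieces), so verify the paste produced
# well-formed rows before running mrnaToGene below.)
awk -F'\t' 'NF != 21 {bad++} END {print (bad + 0) " malformed rows"}' protMrna.psl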
# Run mrnaToGene to create protMrna.gp
bash
mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
exit
# move kgBestMrna to /san/sanvol1 to save space on store11
mv /cluster/store11/kg/kgHg18A/kgBestMrna/clusterRun /san/sanvol1/scratch/fan/hg18/kgHg18A/kgBestMrna
ln -s /san/sanvol1/scratch/fan/hg18/kgHg18A/kgBestMrna/clusterRun \
/cluster/store11/kg/kgHg18A/kgBestMrna/clusterRun
# Prepare refGene and all_mrna gp files.
cd ..
cp -p base/refGene.tab ref.gp
# hgsql hg18 -N -e 'select * from refGene' >ref.gp
hgsql hg18 -N -e \
'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \
|sort -u > all_mrna.cds
cat base/all_mrna.tab |cut -f 2-22 >all_mrna.psl
bash
mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
exit
# Align proteins to RefSeq.
overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
protBlat/protBlat.psl ref.gp ref.stat
overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
protBlat/protBlat.psl ref.gp protRef.gp
overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\
-selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out
cut -f 10,22 protRef.out | sort -u >spRef.tab
cut -f 10 protRef.out | sort -u >protRef.lis
hgsql kgHg18ATemp -e 'drop table spRef'
hgsql kgHg18ATemp <~/src/hg/lib/spRef.sql
hgsql kgHg18ATemp -e 'load data local infile "spRef.tab" into table spRef'
# Prepare and perform cluster runs for protein/RefSeq alignments
~/src/hg/protein/KGRef2.sh kgHg18A hg18 060115
# Took 7 hours. This step should be investigated and improved.
~/src/hg/protein/KGRef3.sh kgHg18A hg18 060115
cd kgBestRef
ls out | sed -e 's/prot/do1 prot/g' >doall
cat << '_EOF_' > do1
echo processing $1
cat out/$1/*.out >>protRefRaw.psl
'_EOF_'
chmod +x do*
doall
# Filter out low quality alignments.
pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
wc protRef.lis
hgsql kgHg18ATemp -e 'drop table protRefBlat'
hgsql kgHg18ATemp < ~/src/hg/lib/protRefBlat.sql
hgsql kgHg18ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
hgsql kgHg18ATemp -e 'create index tName on protRefBlat(tName)'
# Run gene-check to filter out invalid gp entries
cd /cluster/data/hg18/bed/kgHg18A
cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp
gene-check -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/data/hg18/nib kgCandidate0.gp kgCandidate0.check
hgsql kgHg18ATemp -e 'drop table kgCandidate0'
hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidate0.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidate0.gp" into table kgCandidate0'
hgsql kgHg18ATemp -e 'drop table geneCheck'
hgsql kgHg18ATemp < ~/src/hg/lib/geneCheck.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'
# Run kgCheck to get all KG candidates that pass the KG gene check criteria
kgCheck kgHg18ATemp hg18 kgCandidate0 geneCheck kgCandidate.tab
hgsql kgHg18ATemp -e 'drop table kgCandidate'
hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidate.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidate.tab" into table kgCandidate'
hgsql kgHg18ATemp -e 'create index alignID on kgCandidate(alignID)'
# ####### NEXT TIME AROUND PUT IN AN EXTRA STEP TO BRING IN ITEMS ON A "PUT BACK" LIST
# FOR SPECIAL CASES LIKE SELENOCYSTEINE, NON-AUG INITIATION CODON, RIBOSOMAL SLIPPAGE, ETC.
# #######
# Construct the kgCandidateX table that has alignID in the name field.
cut -f 2-10 kgCandidate.tab >j2.tmp
cut -f 11 kgCandidate.tab >j1.tmp
paste j1.tmp j2.tmp >kgCandidateX.tab
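# (Illustrative, not part of the original build: the cut/paste above simply
# moves column 11 of kgCandidate.tab (alignID) in front of columns 2-10, so
# the genePred name field of kgCandidateX carries the alignID. Spot-check:)
cut -f 1 kgCandidateX.tab | head -3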
hgsql kgHg18ATemp -e 'drop table kgCandidateX'
hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidateX.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidateX.tab" into table kgCandidateX'
# Score protein/mRna and protein/RefSeq alignments
ln -s protBlat/protein.lis protein.lis
kgResultBestMrna2 060115 kgHg18ATemp hg18 protMrnaBlat|sort -u >protMrnaBlatScore.tab
kgResultBestRef2 060115 kgHg18ATemp hg18 protRefBlat|sort -u >protRefScore.tab
# Combine scoring results and load them into temp DB.
cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
hgsql kgHg18ATemp -e 'drop table protMrnaScore'
hgsql kgHg18ATemp < ~/src/hg/lib/protMrnaScore.sql
hgsql kgHg18ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
hgsql kgHg18ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'
# Run kgGetCds to get CDS structure of each gene
kgGetCds kgHg18ATemp 060115 kgCandidateX jY.tmp
cat jY.tmp |sort -u >kgCandidateY.tab
rm jY.tmp
hgsql kgHg18ATemp -e 'drop table kgCandidateY'
hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidateY.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidateY.tab" into table kgCandidateY'
# Run kgPickPrep to replace long cds structure string with cdsId.
kgPickPrep kgHg18ATemp kgCandidateZ.tab
hgsql kgHg18ATemp -e 'drop table kgCandidateZ'
hgsql kgHg18ATemp < ~/src/hg/lib/kgCandidateZ.sql
hgsql kgHg18ATemp -e 'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
hgsql kgHg18ATemp -e 'create index cdsId on kgCandidateZ(cdsId)'
# Run kgPick to pick the representative a mrna/protein pair for each unique CDS structure.
kgPick kgHg18ATemp hg18 sp060115 kg3.tmp dupSpMrna.tmp
sort -u dupSpMrna.tmp >dupSpMrna.tab
# Create put back list
# gbGetSeqs2, a modified version of gbGetSeqs, outputs the RefSeq IDs at the beginning of each output line.
gbGetSeqs2 -gbRoot=/cluster/data/genbank -db=hg18 -get=ra RefSeq mrna ref.ra
cat ref.ra | sed -e 's/ /\t/' | sort -u >refRa.tab
hgsql hg18 -e 'drop table refRa'
hgsql hg18 < ~/src/hg/lib/refRa.sql
hgsql hg18 -e 'load data local infile "refRa.tab" into table refRa ignore 1 lines'
hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="selenocysteine" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>kgPutBack2.tab
hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%ribosomal frameshift%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab
hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%non-AUG%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab
hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="translExcept" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab
hgsql hg18 -N -e \
'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="exception" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Homo sapiens"' \
>>kgPutBack2.tab
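# (Optional, not part of the original build: count the distinct RefSeq
# accessions on the put-back list before loading it; column 1 is r.acc.)
cut -f 1 kgPutBack2.tab | sort -u | wc -l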
hgsql kgHg18ATemp -e 'drop table kgPutBack2'
hgsql kgHg18ATemp < ~/src/hg/lib/kgPutBack2.sql
hgsql kgHg18ATemp -e 'load data local infile "kgPutBack2.tab" into table kgPutBack2'
kgPutBack kgHg18ATemp hg18 sp060115 kgPutBack2 kgPutBack2.gp
# No matching protein found for NM_201397.
# No matching protein found for NM_203341.
# No matching protein found for NM_213593.
# No matching protein found for NM_052987.
# No matching protein found for NM_201397.
# No matching protein found for NM_203341.
# No matching protein found for NM_213593.
# Sort KG genes to make the kg4.gp table file.
cat kgPutBack2.gp kg3.tmp > kg4.tmp
~/kent/src/hg/protein/sortKg.pl kg4.tmp >knownGene.tab
hgsql kgHg18ATemp -e 'drop table knownGene'
hgsql kgHg18ATemp < ~/src/hg/lib/knownGene.sql
hgsql kgHg18ATemp -e 'load data local infile "knownGene.tab" into table knownGene'
# Load data into hg18 knownGene table.
hgsql hg18 -e 'drop table knownGene'
hgsql hg18 < ~/src/hg/lib/knownGene.sql
hgsql hg18 -e 'load data local infile "knownGene.tab" into table knownGene'
# Load dupSpMrna table after knownGene table is loaded so that joinerCheck does not complain.
hgsql hg18 -e 'drop table dupSpMrna'
hgsql hg18 < ~/src/hg/lib/dupSpMrna.sql
hgsql hg18 -e 'load data local infile "dupSpMrna.tab" into table dupSpMrna'
# Perform analysis on KG
# Build knownGeneMrna and knownGenePep tables.
kgPepMrna kgHg18ATemp hg18 060115
hgsql hg18 -e 'drop table knownGeneMrna'
hgsql hg18 < ~/src/hg/lib/knownGeneMrna.sql
hgsql hg18 -e 'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
hgsql hg18 -e 'drop table knownGenePep'
hgsql hg18 < ~/src/hg/lib/knownGenePep.sql
hgsql hg18 -e 'load data local infile "knownGenePep.tab" into table knownGenePep'
# Build kgXref table
kgXref2 kgHg18ATemp 060115 hg18
hgsql hg18 -e 'drop table kgXref'
hgsql hg18 < ~/src/hg/lib/kgXref.sql
hgsql hg18 -e 'load data local infile "kgXref.tab" into table kgXref'
# Build spMrna table
hgsql hg18 -N -e 'select proteinID, name from knownGene' >kgSpMrna.tab
hgsql hg18 -e 'drop table spMrna'
hgsql hg18 <~/src/hg/lib/spMrna.sql
hgsql hg18 -e 'load data local infile "kgSpMrna.tab" into table spMrna'
# Build kgProtMap table
~/src/hg/protein/kgProtMap2.sh kgHg18A hg18 060115
# Found that the kgProtMap table had fewer than 20,000 rows,
# indicating that many entries were missing. The problem was
# that tight_mrna.psl was now located in ~/hg18Kg/protBlat.
# Manually ran the following to correct the problem:
cd ~/hg18Kg/kgProtMap/psl.tmp
cat ~/hg18Kg/protBlat/tight_mrna.psl refSeqAli.psl > both.psl
pslMap kgProtMrna.psl both.psl stdout | sort -u| \
sort -k 14,14 -k 16,16n -k 17,17n > kgProtMap.psl
hgsql hg18 -e "drop table kgProtMap;"
hgLoadPsl -tNameIx hg18 kgProtMap.psl
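# (Optional, not part of the original build: confirm the reloaded kgProtMap
# table is now well above the ~20,000 rows seen before the fix.)
hgsql hg18 -N -e 'select count(*) from kgProtMap'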
#####################################
# Build alias tables.
kgAliasM hg18 proteins060115
# kgAliasKgXref reads from hg18.knownGene.proteinID,
# hg18.knownGene.name, hg18.kgXref.geneSymbol
# to create kgAliasKgXref.tab
kgAliasKgXref hg18
# kgAliasRefseq reads from hg18.knownGene.name,
# hg18.knownGene.proteinID, hg18.kgXref.refseq
# to create kgAliasRefseq.tab
kgAliasRefseq hg18
hgsql sp060115 -N -e 'select name,gene.val from hg18.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
| sort -u > kgAliasP.tab
hgsql hg18 -N -e 'select name, name from knownGene' >kgAliasDup.tab
hgsql hg18 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
sort |uniq > kgAlias.tab
hgsql -e "drop table kgAlias;" hg18
hgsql hg18 < ~/kent/src/hg/lib/kgAlias.sql
hgsql hg18 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias'
# kgProtAlias reads from hg18.knownGene.name,
# hg18.knownGene.proteinID, hg18.knownGene.alignID,
# proteins060115.spXref3.accession, proteins060115.spSecondaryID, proteins060115.pdbSP.pdb
# to create kgProtAlias.tab
kgProtAlias hg18 060115
hgsql hg18 -N -e \
'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
| sort -u >kgProtAliasNCBI.tab
# include variant splice protein IDs
hgsql hg18 -N -e \
'select name, proteinID, parAcc from knownGene,sp060115.varAcc where varAcc=proteinID'\
|sort -u >kgProtAliasDup.tab
# include duplicate protein IDs from dupSpMrna table
hgsql hg18 -N -e \
'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
|sort -u >>kgProtAliasDup.tab
# catch parent acc from dupProteinID too
hgsql hg18 -N -e\
'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp060115.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
|sort -u >>kgProtAliasDup.tab
cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab
echo "`date` creating table kgProtAlias"
hgsql hg18 -e "drop table kgProtAlias;"
hgsql hg18 <~/src/hg/lib/kgProtAlias.sql;
hgsql hg18 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'
# Build kgSpAlias table
hgsql hg18 -e \
'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
hgsql hg18 -e \
'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
>>j.tmp
cat j.tmp|sort -u |grep -v 'kgID' >hg18.kgSpAlias.tab
rm j.tmp
hgsql hg18 -e 'drop table kgSpAlias';
hgsql hg18 < ~/src/hg/lib/kgSpAlias.sql
hgsql hg18 -e 'load data local infile "hg18.kgSpAlias.tab" into table kgSpAlias'
# QA NOTE (3-6-2006): did a mytouch to update the time for the knownGene table
# (because joinerCheck was complaining during -times check):
# [hgwdev:~/joiner> sudo mytouch hg18 knownGene 200602061707
# touch -t 200602061707 /var/lib/mysql/hg18/knownGene.MYD
# MAKE FOLDUTR TABLES (DONE 2006-02-09, Fan)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir rnaStruct.2006-02-09
rm rnaStruct
ln -s rnaStruct.2006-02-09 rnaStruct
cd rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg18 knownGene utr3 utr3/utr.fa
utrFa hg18 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh pk
cd /cluster/data/hg18/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
cp gsub ../utr5
# Do cluster run for 3' UTRs
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 36097 of 36097 jobs
# CPU time in finished jobs: 335580s 5593.00m 93.22h 3.88d 0.011 y
# IO & Wait Time: 653230s 10887.16m 181.45h 7.56d 0.021 y
# Average job time: 27s 0.46m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1730s 28.83m 0.48h 0.02d
# Submission to last job: 6007s 100.12m 1.67h 0.07d
# Do cluster run for 5' UTRs
cd ../utr5
gensub2 in.lst single gsub spec
para create spec
para try
para push
# Completed: 34011 of 34011 jobs
# CPU time in finished jobs: 78543s 1309.05m 21.82h 0.91d 0.002 y
# IO & Wait Time: 938250s 15637.50m 260.62h 10.86d 0.030 y
# Average job time: 30s 0.50m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 5873s 97.88m 1.63h 0.07d
# Submission to last job: 6139s 102.32m 1.71h 0.07d
# Load database
ssh hgwdev
cd /cluster/data/hg18/bed/rnaStruct/utr5
hgLoadRnaFold hg18 foldUtr5 fold
cd ../utr3
hgLoadRnaFold hg18 foldUtr3 fold
# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak
# Build KEGG pathway tables. DONE 5/19/05. Fan.
ssh hgwdev
cd /cluster/store11/kg/kgHg18A
mkdir kegg
cd kegg
~/src/hg/protein/KGpath.sh kgHg18A hg18 060115
hgsql hg18 -e "drop table keggMapDesc"
hgsql hg18 -e "drop table keggPathway"
hgsql hg18 <~/src/hg/lib/keggMapDesc.sql
hgsql hg18 <~/src/hg/lib/keggPathway.sql
hgsql hg18 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
hgsql hg18 -e 'load data local infile "keggPathway.tab" into table keggPathway'
# Build CGAP pathway tables
# RELOAD cgapAlias TABLE AFTER REMOVING DUPLICATE ROWS (hartera, 2005-07-26)
# RELOADED cgapAlias AGAIN AS TOO MANY ROWS REMOVED BEFORE (hartera, 2005-10-06)
cd ..
mkdir cgap
cd cgap
~/src/hg/protein/KGcgap.sh kgHg18A hg18 060115
cat cgapBIOCARTAdesc.tab |sort -u > cgapBIOCARTAdescSorted.tab
hgsql hg18 -e "drop table cgapAlias"
hgsql hg18 -e "drop table cgapBiocDesc"
hgsql hg18 -e "drop table cgapBiocPathway"
hgsql hg18 <~/src/hg/lib/cgapAlias.sql
hgsql hg18 <~/src/hg/lib/cgapBiocDesc.sql
hgsql hg18 <~/src/hg/lib/cgapBiocPathway.sql
hgsql hg18 -e 'load data local infile "cgapAlias.tab" \
into table cgapAlias'
hgsql hg18 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
hgsql hg18 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'
# Build hg18 PROTEOME BROWSER TABLES
# These are instructions for building tables
# needed for the Proteome Browser.
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
# ARE REBUILT.
# This build is based on proteins DBs dated 060115.
# Create the working directory
ssh hgwdev
mkdir /cluster/store11/kg/kgHg18A/pb-2006-02-10
cd /cluster/data/hg18/bed
rm pb
ln -s /cluster/store11/kg/kgHg18A/pb-2006-02-10 pb
cd pb
# Define pep* tables in hg18 DB
cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
# First edit out pepPred table definition, then
hgsql hg18 < pepAll.sql
# Build the pepMwAa table
hgsql proteins060115 -N -e \
"select info.acc, molWeight, aaSize from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > pepMwAa.tab
hgsql hg18 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'
# Build the pepPi table
hgsql proteins060115 -e \
"select info.acc from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=9606 and accToTaxon.acc = info.acc" > protAcc.lis
hgsql hg18 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis
pbCalPi protAcc.lis sp060115 pepPi.tab
hgsql hg18 -e 'delete from pepPi'
hgsql hg18 -e 'load data local infile "pepPi.tab" into table hg18.pepPi'
# Calculate and load pep distributions
pbCalDist sp060115 proteins060115 9606 hg18 >pbCalDist.out
wc pbCalDist.out
hgsql hg18
load data local infile "pepExonCntDist.tab" into table hg18.pepExonCntDist;
load data local infile "pepCCntDist.tab" into table hg18.pepCCntDist;
load data local infile "pepHydroDist.tab" into table hg18.pepHydroDist;
load data local infile "pepMolWtDist.tab" into table hg18.pepMolWtDist;
load data local infile "pepResDist.tab" into table hg18.pepResDist;
load data local infile "pepIPCntDist.tab" into table hg18.pepIPCntDist;
load data local infile "pepPiDist.tab" into table hg18.pepPiDist;
quit
# Calculate frequency distributions
pbCalResStd sp060115 9606 hg18
# Create pbAnomLimit and pbResAvgStd tables
hgsql hg18 -e "drop table pbAnomLimit"
hgsql hg18 -e "drop table pbResAvgStd"
hgsql hg18 < ~/src/hg/lib/pbAnomLimit.sql
hgsql hg18 < ~/src/hg/lib/pbResAvgStd.sql
hgsql hg18 -e 'load data local infile "pbResAvgStd.tab" into table hg18.pbResAvgStd;'
hgsql hg18 -e 'load data local infile "pbAnomLimit.tab" into table hg18.pbAnomLimit;'
# Create pbStamp table for PB
hgsql hg18 -e "drop table pbStamp"
hgsql hg18 < ~/src/hg/lib/pbStamp.sql
hgsql hg17 -N -e 'select * from pbStamp' > pbStamp.tab
hgsql hg18 -e 'load data local infile "pbStamp.tab" into table hg18.pbStamp'
# Turn on Proteome Browser for hg18.
hgsql -e 'delete from dbDb where name="hg18"' \
-h genome-testdb hgcentraltest
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
VALUES("hg18", "Feb. 2006", "/gbdb/hg18/nib", "Human", \
"chr7:127,664,479-127,689,005", 1, 10, "Human", "Homo sapiens", \
"/gbdb/hg18/html/description.html", 0, 1, "NCBI Build 36.1");' \
-h genome-testdb hgcentraltest
# Adjust drawing parameters for Proteome Browser stamps
# Now invoke the Proteome Browser and adjust various drawing parameters
# (mostly the ymax of each stamp) if necessary, by updating the
# pbStamp.tab file and then deleting and reloading the pbStamp table.
hgsql hg18 -e "drop table pbStamp"
hgsql hg18 < ~/src/hg/lib/pbStamp.sql
hgsql hg18 -e 'load data local infile "pbStamp.tab" into table hg18.pbStamp'
# Perform preliminary review of Proteome Browser for hg18, then
# notify QA for formal review.
# First build entrez DB tables.
cd /cluster/store10/entrez
mkdir 060208
ln -s /cluster/store10/entrez/060208 /cluster/data/entrez/060208
cd /cluster/data/entrez/060208
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
gzip -d *.gz
cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g' > entrezMrna.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g' > entrezRefProt.tab
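# (Note, not part of the original build: in gene2accession/gene2refseq,
# column 2 is the Entrez GeneID, column 4 the RNA accession.version and
# column 6 the protein accession.version; the sed splits the accession from
# its version number at the dot. Spot-check the resulting tab files:)
head -2 entrezMrna.tab entrezRefseq.tab entrezRefProt.tab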
hgsql entrez -e 'drop table entrezRefseq'
hgsql entrez -e 'drop table entrezMrna'
hgsql entrez -e 'drop table entrezRefProt'
hgsql entrez < ~/src/hg/lib/entrezRefseq.sql
hgsql entrez < ~/src/hg/lib/entrezMrna.sql
hgsql entrez < ~/src/hg/lib/entrezRefProt.sql
hgsql entrez -e 'load data local infile "entrezRefseq.tab" into table entrezRefseq'
hgsql entrez -e 'load data local infile "entrezMrna.tab" into table entrezMrna'
hgsql entrez -e 'load data local infile "entrezRefProt.tab" into table entrezRefProt'
cd /cluster/store11/kg/kgHg18A
hgsql entrez -N -e \
'select mrna, refseq from entrezRefseq, entrezMrna, hg18.all_mrna where qName=mrna and entrezRefseq.geneID=entrezMrna.geneID' \
>mrnaRefseq1.tab
# Include RefSeq as valid mRNA too.
hgsql hg18 -N -e 'select name, name from refGene' >mrnaRefseq2.tab
cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab
hgsql hg18 -e 'drop table mrnaRefseq'
hgsql hg18 < ~/src/hg/lib/mrnaRefseq.sql
hgsql hg18 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'
# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 2/16/06 Fan)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProtAlias tables. The hgKgGetText step takes
# about 5 minutes when the database is not too busy. The rest
# is real quick.
ssh hgwdev
cd /cluster/store11/kg/kgHg18A
mkdir index
cd index
hgKgGetText hg18 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
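# (Optional, not part of the original build: confirm the text dump and the
# index files are non-trivial in size before linking them into /gbdb.)
wc -l knownGene.text
ls -lh knownGene.ix knownGene.ixx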
ln -s /cluster/store11/kg/kgHg18A/index/knownGene.ix /gbdb/hg18/knownGene.ix
ln -s /cluster/store11/kg/kgHg18A/index/knownGene.ixx /gbdb/hg18/knownGene.ixx
# BUILD KNOWN GENE LIST FOR GOOGLE. (REDONE 8/12/08 JK)
# make knownGeneLists.html hg18GeneList.html mm5GeneList.html rm3GeneList.html
cd /cluster/data/hg18/bed
rm -rf knownGeneList/hg18
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/hg18
hgKnownGeneList hg18
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/hg18
mkdir -p /usr/local/apache/htdocs/knownGeneList/hg18
cp -Rfp knownGeneList/hg18/* /usr/local/apache/htdocs/knownGeneList/hg18
##################################################################################
# Create description.html for hg18
mkdir -p ~/kent/src/hg/makeDb/trackDb/human/hg18
cd ~/kent/src/hg/makeDb/trackDb/human/hg18
cp ../hg17/description.html .
vi description.html
# Change release date and build number and change hg17 to hg18
# Check it into CVS
mkdir -p /cluster/data/hg18/html
cp -p description.html /cluster/data/hg18/html
ln -s /cluster/data/hg18/html/description.html /gbdb/hg18/html/description.html
# BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED 2006-02-11, DONE 2006-02-14 - Fan)
# This should be done after KG tables are complete from known genes build
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg18/bed/geneSorter.2006-02-11
# remove old symbolic link
rm /cluster/data/hg18/bed/geneSorter
ln -s /cluster/data/hg18/bed/geneSorter.2006-02-11 /cluster/data/hg18/bed/geneSorter
cd /cluster/data/hg18/bed/geneSorter
hgClusterGenes hg18 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg18/bed/geneSorter/blastp
cd /cluster/data/hg18/bed/geneSorter/blastp
pepPredToFa hg18 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg18/blastp
mkdir -p /cluster/bluearc/hg18/blastp
cp -p /cluster/data/hg18/bed/geneSorter/blastp/known.* /cluster/bluearc/hg18/blastp
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg18/bed/geneSorter/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# Make parasol run directory
ssh pk
mkdir /cluster/data/hg18/bed/geneSorter/blastp/self
cd /cluster/data/hg18/bed/geneSorter/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/bluearc/hg18/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# 'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para push
para check
# Completed: 7733 of 7733 jobs
# CPU time in finished jobs: 56608s 943.47m 15.72h 0.66d 0.002 y
# IO & Wait Time: 467120s 7785.33m 129.76h 5.41d 0.015 y
# Average job time: 68s 1.13m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 119s 1.98m 0.03h 0.00d
# Submission to last job: 1433s 23.88m 0.40h 0.02d
# Load into database. This takes about 20 minutes
ssh hgwdev
cd /cluster/data/hg18/bed/geneSorter/blastp/self/run/out
bash
time hgLoadBlastTab hg18 knownBlastTab *.tab
# Scanning through 7733 files
# Loading database with 9647176 rows
# real 21m51.039s
cd /cluster/data/hg18/bed/geneSorter
# Create table that maps between known genes and RefSeq
hgMapToGene hg18 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# hgsql -e "select count(*) from knownToRefSeq;" hg18
# row count changed to 34267
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg18 > refToLl.txt
hgMapToGene hg18 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# hgsql -e "select count(*) from knownToLocusLink;" hg18
# row count changed to 34267
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg18 knownGene name proteinID Pfam knownToPfam
# hgsql -e "select count(*) from knownToPfam;" hg18
# row count changed to 34177
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg18 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# hgsql -e "select count(*) from knownToGnfAtlas2;" hg18
# row count changed to 32015
# Create expression distance table - takes about an hour
hgExpDistance hg18 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2 &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 32015 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# hgsql -e "select count(*) from gnfAtlas2Distance;" hg18
# row count changed to 32015000
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg18 affyUclaNorm knownGene knownToU133
# hgsql -e "select count(*) from knownToU133;" hg18
# row count changed to 32632
# Create expression distance table. This will take about 2.5 hours
cd /tmp
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg18 affyUclaNorm affyUclaExp knownExpDistance \
-weights=affyUcla.weight -lookup=knownToU133 &
# Have 43039 elements in affyUclaNorm
# 211 genes, 42 weights, 26.500000 total wieght
# Got 32965 unique elements in affyUclaNorm
# Create table that maps between known genes and
# the GNF data.
cd /tmp
hgMapToGene hg18 affyU95 knownGene knownToU95
# row count changed to 17401
# the hgFixed.gnfHumanU95Exps argument is unused, so that table does not need to exist
hgExpDistance hg18 hgFixed.gnfHumanU95MedianRatio \
hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 &
# Have 11545 elements in hgFixed.gnfHumanU95MedianRatio
# Got 16378 unique elements in hgFixed.gnfHumanU95MedianRatio
# row count changed to 16378000
# Create known gene mapping table and expression distance tables
# for GNF Atlas 2. (The hgExpDistance takes only 10 minutes.)
hgMapToGene hg18 affyGnf1h knownGene knownToGnf1h
hgExpDistance hg18 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnf1h &
# Have 44696 elements in hgFixed.gnfHumanAtlas2MedianRatio
# Got 8739 unique elements in hgFixed.gnfHumanAtlas2MedianRatio
# AFFYMETRIX HG-U133 PLUS TRACK (DONE, 2006-02-11, Fan)
# Loaded the HG-U133 Plus 2 sequences for hg18 (DONE, 2006-03-29, hartera)
# The below was already done.
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
ssh hgwdev
mkdir -p /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
# Go to http://www.affymetrix.com/support/technical/byproduct.affx?product=hg-u133-plus
# and download the consensus and exemplar sequences to this directory
cd /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
unzip HG-U133_Plus_2_consensus.zip
unzip HG-U133_Plus_2_exemplar.zip
cat HG-U133_Plus_2_consensus HG-U133_Plus_2_exemplar >> U133Plus2_all.fa
perl -pi.bak -e "s/(consensus|exemplar):HG-U133_Plus_2:/U133+2:/" \
U133Plus2_all.fa
# remove ";" from probe set names
perl -pi.bak -e "s/;//" U133Plus2_all.fa
# clean up
rm *.zip *.bak
# Set up cluster job to align consensus/exemplars to hg18
ssh kkr1u00
mkdir -p /iscratch/i/affy
mv /cluster/data/hg18/bed/affyU133Plus2.2006-02-11/U133Plus2_all.fa \
/iscratch/i/affy
iSync
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# The above is already done by Rachel during hg17 build.
ssh hgwdev
cd /projects/compbio/data/microarray/affyHuman/HG-U133Plus2
mkdir -p /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
cp -p U133Plus2_all.fa /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
ssh kk
cd /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
ls -1 /iscratch/i/affy/U133Plus2_all.fa > affy.lst
ls -1 /iscratch/i/gs.19/build36/maskedContigs/* > allctg.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << for emacs
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
para try, para check, para push ...
# Completed: 378 of 378 jobs
# CPU time in finished jobs: 24764s 412.74m 6.88h 0.29d 0.001 y
# IO & Wait Time: 13823s 230.38m 3.84h 0.16d 0.000 y
# Average job time: 102s 1.70m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 782s 13.03m 0.22h 0.01d
# Submission to last job: 827s 13.78m 0.23h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU133Plus2.psl
pslSort dirs raw.psl tmp psl
# Use filter parameters suited to these sequences: only keep alignments that
# cover 30% of the sequence and have at least 95% identity in the aligned region.
# minAli=0.97 proved too high; minCover is kept low because these sequences contain many N's.
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU133Plus2.psl ../../jkStuff/liftAll.lft warn contig.psl
perl -pi.bak -e "s/U133\+2://" affyU133Plus2.psl
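# (Optional, not part of the original build: rough count of distinct probe
# set names that survived best-in-genome filtering; PSL column 10 is the
# query name. A few header lines from pslReps, if present, inflate the
# count slightly.)
cut -f 10 affyU133Plus2.psl | sort -u | wc -l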
# load into the database
ssh hgwdev
cd /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
hgLoadPsl hg18 affyU133Plus2.psl
# The below was already done.
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
# Add sequence data to database
# Copy probe sequence to /gbdb if it isn't already
mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyHuman/HG-U133Plus2/U133Plus2_all.fa .
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# The above is already done by Rachel during hg17 build.
cd /cluster/data/hg18/bed/affyU133Plus2.2006-02-11
# the sequences need to be loaded for the hg18 database
# (2006-03-29, hartera)
hgLoadSeq -abbr=U133+2: hg18 /gbdb/hgFixed/affyProbes/U133Plus2_all.fa
# clean up
rm -r psl tmp err contig.psl raw.psl *.bak psl.tab seq.tab
# Added knownToU133Plus2 track
cd /cluster/data/hg18/bed/geneSorter
hgMapToGene hg18 affyU133Plus2 knownGene knownToU133Plus2
# row count changed to 34745
# Make knownToCdsSnp table (DONE Sept 12, 2007, jk)
ssh hgwdev
hgMapToGene hg18 snp126 knownGene knownToCdsSnp -all -cds
# approx. 5 minutes running time
# UPDATE GO DATABASE
# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20060211
cd /cluster/store1/geneOntology/20060211
wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200601-assocdb-data.gz
hgsql mysql <<end
create database go060211;
end
zcat go_*data.gz | sed -e 's/ENGINE=MyISAM DEFAULT CHARSET=latin1/TYPE=MyISAM/g' >j.tmp
hgsql go060211 <j.tmp
rm j.tmp
wget --timestamping ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_uniprot.gz
# The format of gene_association.goa_uniprot.gz changed; there are now 6 comment lines at the head.
# Updated hgGoAssociation.c to skip the first 6 lines.
zcat gene_association.goa_uniprot.gz | /cluster/home/fanhsu/bin/i386/hgGoAssociation go060211 goaPart stdin
# Passed 6832447 of 7933823 of 7933823, 86.12%
# Ask sys-admin to switch the database pointer go to point to go060211.
# HGNEAR PROTEIN BLAST TABLES (DONE 2/12/06 Fan)
ssh hgwdev
mkdir /cluster/data/hg18/bed/hgNearBlastp
cd /cluster/data/hg18/bed/hgNearBlastp
cat << _EOF_ > config.ra
# Latest human vs. other Gene Sorter orgs:
# mouse, rat, zebrafish, worm, yeast, fly
targetGenesetPrefix human
targetDb hg18
queryDbs mm7 rn3 danRer3 ce2 sacCer1 dm2
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa
rn3Fa /cluster/data/rn3/bed/blastp/known.faa
danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
dm2Fa /cluster/data/dm2/bed/flybase4.1/flybasePep.fa
buildDir /cluster/data/hg18/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/hg18HgNearBlastp
_EOF_
# doHgNearBlastp.pl config.ra >& do.log &
doHgNearBlastp.pl config.ra >do3.log
# tail -f do.log
# 0657.tab dm2_0658.tab dm2_0659.tab dm2_0660.tab dm2_0661.tab dm2_0662.tab dm2_0663.tab dm2_0664.tab dm2_0665.tab dm2_0666.tab dm2_0667.tab dm2_0668.tab dm2_0669.tab dm2_0670.tab
# Scanning through 671 files
# Loading database with 14488 rows
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/hg18.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/hg18.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/mm7.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/mm7.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/rn3.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/rn3.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/danRer3.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/danRer3.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/ce2.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/ce2.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/sacCer1.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/sacCer1.formatdb
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/dm2.split
# ssh -x pk rm -rf /san/sanvol1/scratch/hg18HgNearBlastp/dm2.formatdb
# ssh -x pk rmdir /san/sanvol1/scratch/hg18HgNearBlastp
# *** All done!
# *** Check these tables in hg18:
# *** humanBlastTab mmBlastTab rnBlastTab drBlastTab ceBlastTab scBlastTab dmBlastTab
# *** and hgBlastTab in these databases:
# *** mm7 rn3 danRer3 ce2 sacCer1 dm2
# MAKE ORGANISM-SPECIFIC HGNEARDATA FILES
cd ~/kent/src/hg/near/hgNear/hgNearData
# any updates necessary?
# ENABLE HGNEAR FOR HG18 IN HGCENTRALTEST
echo "update dbDb set hgNearOk = 1 where name = 'hg18';" \
| hgsql -h genome-testdb hgcentraltest
# END OF HGNEAR STUFF
#############################################################################
# UPDATE BIOCYCTABLES NEEDED BY hgGene (DONE 2/16/06 Fan)
# First register with BioCyc to download their HumanCyc database
# The site will email you the URL for download
wget --timestamping http://bioinformatics.ai.sri.com/ecocyc/dist/pdff-XXXXXX/humancyc-flatfiles.zip
unzip humancyc-flatfiles.zip
cp genes.col genes.tab
cp pathways.col pathways.tab
# delete the first 20 or so header lines from these two files.
vi genes.tab
vi pathways.tab
hgsql hg18 -e 'create database bioCyc060216'
hgsql bioCyc060216 < ~/src/hg/lib/bioCycGenes.sql
hgsql bioCyc060216 -e 'load data local infile "genes.tab" into table genes'
hgsql bioCyc060216 < ~/src/hg/lib/bioCycPathways.sql
hgsql bioCyc060216 -e 'load data local infile "pathways.tab" into table pathways'
# Create bioCycMapDesc.tab
hgsql bioCyc060216 -N -e 'select UNIQUE_ID, NAME from pathways' |sort -u > bioCycMapDesc.tab
# Create bioCycPathway.tab
kgBioCyc0 bioCyc060216 hg18 hg17
hgsql hg18 -e 'delete from bioCycPathway'
hgsql hg18 -e 'delete from bioCycMapDesc'
hgsql hg18 < ~/src/hg/lib/bioCycPathway.sql
hgsql hg18 < ~/src/hg/lib/bioCycMapDesc.sql
# Load results into hg18.
hgsql hg18 -e 'LOAD DATA local INFILE "bioCycMapDesc.tab" into table bioCycMapDesc'
hgsql hg18 -e 'LOAD DATA local INFILE "bioCycPathway.tab" into table bioCycPathway'
#############################################################################
# BLASTZ/CHAIN/NET RN4 (DONE 2/17/06 Fan)
ssh kkstore02
cd /cluster/store11/gs.19/build36
cp -Rp linSpecRep /san/sanvol1/scratch/hg18
cp -Rp nib /san/sanvol1/scratch/hg18
mkdir /cluster/data/hg18/bed/blastz.rn4.2006-02-17
cd /cluster/data/hg18/bed/blastz.rn4.2006-02-17
cat << '_EOF_' > DEF
# human vs. rat
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/san/sanvol1/scratch/hg18/nib
SEQ1_SMSK=/san/sanvol1/scratch/hg18/linSpecRep/notInRat
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat
SEQ2_DIR=/san/sanvol1/scratch/rn4/nib
SEQ2_SMSK=/san/sanvol1/scratch/rn4/linSpecRep.notInHuman
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.rn4.2006-02-17
'_EOF_'
# << for emacs
doBlastzChainNet.pl DEF -chainLinearGap medium \
-bigClusterHub pk -smallClusterHub pk -workhorse pk \
-blastzOutRoot /san/sanvol1/scratch/blastzHg17Rn4Out >& do.log &
tail -f do.log
rm -f /cluster/data/hg18/bed/blastz.rn4
ln -s blastz.rn4.2006-02-17 /cluster/data/hg18/bed/blastz.rn4
#############################################################################
# BUILD WGRNA TRACK (DONE, 2006-02-22, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2006-05-15, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-02-09, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2006-02-22
cd wgRna-2006-02-22
# Received the data file, wg_hg18_track.txt, from Michel Weber's email
# (Michel.Weber at ibcg.biotoul.fr)
# and placed it under /cluster/data/hg18/bed/wgRna-2006-02-22.
cp -p wg_hg18_track.txt wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#############################################################################
# RE-BUILD WGRNA TRACK (DONE, 2006-05-15, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-02-09, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2008-05-29, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2008-05-28
cd wgRna-2008-05-28
# Received the data file, wgtrack_may2008.doc, from Michel Weber's
# email (Michel.Weber at ibcg.biotoul.fr).
# Saved it as a .txt file, changed all blanks into tabs,
# and placed it under /cluster/data/hg18/bed/wgRna-2008-05-28.
cp -p wgtrack_may2008.txt wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#############################################################################
# 17-WAY MULTIZ ALIGNMENTS (DONE - 2006-02-22 Fan)
# copy net mafs to cluster-friendly storage for multiz run
ssh kkstore02
ln -s /cluster/data/hg18/bed/blastzMonDom4.2006-02-13 /cluster/data/hg18/bed/blastz.monDom4
cd /cluster/data/hg18/bed/blastz.monDom4
cd /cluster/data/hg18/bed
mkdir -p multiz17way.2006-02-18
ln -s multiz17way.2006-02-18 multiz17way
cd multiz17way
# copy MAF's to cluster-friendly server
# These MAF's already on bluearc:
# canFam2, fr1, galGal2, panTro1, rn4
mkdir -p /san/sanvol1/scratch/hg18/mafNet
cd /san/sanvol1/scratch/hg18/mafNet
ln -s /cluster/bluearc/hg18/mafNet/{*} .
# copy others
foreach s (bosTau2 canFam2 danRer3 dasNov1 echTel1 fr1 galGal2 loxAfr1 \
mm8 monDom4 oryCun1 panTro1 rn4 tetNig1 xenTro1 rheMac2)
echo $s
cp -Rp /cluster/data/hg18/bed/blastz.$s/mafNet $s
end
# danRer3 directory structure is different. It is under /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun
ln -s /san/sanvol1/scratch/hg18/blastzDanRer3/chromsRun/mafNet /san/sanvol1/scratch/hg18/mafNet/danRer3
# thanks for the tree, Hiram! Taken from mm7 17way...
cd /cluster/data/hg18/bed/multiz17way
cat << '_EOF_' > 17way.nh
(((((((((
(human_hg18:0.006690,chimp_panTro1:0.007571):0.024272,
macaque_rheMac2:0.0592):0.023960,
((rat_rn4:0.081728,mouse_mm8:0.077017):0.229273,
rabbit_oryCun1:0.206767):0.1065):0.023026,
(cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505,
armadillo_dasNov1:0.149862):0.015994,
(elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400,
monodelphis_monDom4:0.371073):0.189124,
chicken_galGal2:0.454691):0.123297,
xenopus_xenTro1:0.782453):0.156067,
((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961,
zebrafish_danRer3:0.782561):0.156067);
'_EOF_'
/cluster/bin/draw_tree 17way.nh > 17way.ps
/cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt
grep hg18 17way.distances.txt | sort -k3,3n | \
awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
# edit distances.txt to include featureBits, and chain parameters
# from blastz run.
cat distances.txt
# 0.0143 chimp_panTro1
# 0.0902 macaque_rheMac2
# 0.2563 armadillo_dasNov1
# 0.2651 dog_canFam2
# 0.2677 elephant_loxAfr1
# 0.2766 cow_bosTau2
# 0.3682 rabbit_oryCun1
# 0.4226 tenrec_echTel1
# 0.4677 mouse_mm8
# 0.4724 rat_rn4
# use loose chain params and score from here, down (5000)
# 0.7119 monodelphis_monDom4
# 0.9847 chicken_galGal2
# 1.4357 xenopus_xenTro1
# 1.6577 tetraodon_tetNig1
# 1.6983 fugu_fr1
# 1.7480 zebrafish_danRer3
# the order in the browser display will be by tree topology,
# not by distance, so it will be:
# >> # 0.0143 chimp_panTro1
# >> # 0.0902 macaque_rheMac2
# >> # 0.4677 mouse_mm8
# >> # 0.4724 rat_rn4
# >> # 0.3682 rabbit_oryCun1
# >> # 0.2651 dog_canFam2
# >> # 0.2766 cow_bosTau2
# >> # 0.2563 armadillo_dasNov1
# >> # 0.2677 elephant_loxAfr1
# >> # 0.4226 tenrec_echTel1
# >> # 0.7119 monodelphis_monDom4
# >> # 0.9847 chicken_galGal2
# >> # 1.4357 xenopus_xenTro1
# >> # 1.6577 tetraodon_tetNig1
# >> # 1.6983 fugu_fr1
# >> # 1.7480 zebrafish_danRer3
# make output dir and run dir
ssh pk
cd /cluster/data/hg18/bed/multiz17way.2006-02-18
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 17way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
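# (Illustrative, not part of the original run: species.lst should now be a
# single line of bare database names in tree order, roughly
#   hg18 panTro1 rheMac2 rn4 mm8 oryCun1 bosTau2 canFam2 dasNov1 loxAfr1
#   echTel1 monDom4 galGal2 xenTro1 tetNig1 fr1 danRer3
# and tree.nh the same names with parentheses kept but distances and commas
# stripped.)
cat species.lst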
mkdir -p maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = hg18
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/mafNet
rm -fr $tmp
mkdir -p $tmp
cp ../tree/tree.nh ../species.lst $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == hg18) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << happy emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg18/bed/multiz17way.2006-02-18/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << happy emacs
awk '{print $1}' /cluster/data/hg18/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para create jobList
# 49 files
para try
para check
para push
# NOTE: much faster than V10 (40 hrs for hg17 V10, 14.53 hrs for hg17 V11)
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 341776s 5696.26m 94.94h 3.96d 0.011 y
# IO & Wait Time: 122801s 2046.69m 34.11h 1.42d 0.004 y
# Average job time: 9481s 158.02m 2.63h 0.11d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 81334s 1355.57m 22.59h 0.94d
# Submission to last job: 81334s 1355.57m 22.59h 0.94d
# Load into database
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/maf
mkdir -p /gbdb/hg18/multiz17way/maf
ln -s /cluster/data/hg18/bed/multiz17way/maf/*.maf \
/gbdb/hg18/multiz17way/maf
cat > loadMaf.csh << 'EOF'
time hgLoadMaf -pathPrefix=/gbdb/hg18/multiz17way/maf hg18 multiz17way
cat *.maf | \
nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 -maxSize=200000 multiz17waySummary stdin
'EOF'
# 3213116
#<< happy emacs
# expect lengthy load time for this -- a few hours ?
# csh loadMaf.csh >&! loadMaf.log &
script loadMaf.log
csh loadMaf.csh
exit
###############################################################
# PHASTCONS CONSERVATION (DONE, 2ND TIME, 2006-03-28 Fan)
# This process is distilled from Hiram and Adam's experiments
# on mouse (mm7) 17way track. Many parameters are now fixed, without
# being experimentally derived, either because the experiments
# were lengthy and produced similar results, or because they
# weren't runnable given the alignment size.
# These parameters are:
# --rho
# --expected-length
# --target-coverage
# Also, instead of generating cons and noncons tree models,
# we use a single, pre-existing tree model -- Elliot Margulies' model
# from the (37-way) ENCODE alignments.
# NOTE: reusing cluster-friendly chrom fasta files created earlier
ssh kkstore02
mkdir /cluster/bluearc/hg18/chrom
cd /cluster/data/hg18
foreach f (`cat chrom.lst`)
echo $f
cp $f/*.fa /cluster/bluearc/hg18/chrom
end
# Split chromosome MAF's into windows and use to generate
# "sufficient statistics" (ss) files for phastCons input
# NOTE: as the SAN fs has lotsa space, we're leaving these
# big (temp) files unzipped, to save time during phastCons run.
# Note also the larger chunk sizes compared to previous runs -- this
# reduces run-time on the split and slows down the actual phastCons jobs
# enough that they don't crash (they are still very quick, just a minute
# or so), and according to Adam, will produce better results.
# The previous small chunks were probably required by
# the phyloFit step, which we are no longer using for the
# human alignments.
ssh pk
mkdir /cluster/data/hg18/bed/multiz17way.2006-02-18/cons
cd /cluster/data/hg18/bed/multiz17way.2006-02-18/cons
cp /cluster/store5/gs.18/build35/bed/multiz17way.2005-12-20/cons/elliotsEncode.mod .
# edit, change to hg18, monDom4, mm8, and rn4.
mkdir run.split
cd run.split
set WINDOWS = /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/ss
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
# unfortunately this exhausts 2G mem limit currently on pk
# next time, run on mini-cluster
set MAFS = /cluster/data/hg18/bed/multiz17way.2006-02-18/maf
set WINDOWS = /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M /cluster/bluearc/hg18/chrom/$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
echo "Done" >> $c.done
'EOF'
# << happy emacs
chmod +x doSplit.csh
rm -f jobList
foreach f (../../maf/*.maf)
set c = $f:t:r
echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
end
para create jobList
# 49 jobs
para try
para check
para push
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 9254s 154.24m 2.57h 0.11d 0.000 y
# IO & Wait Time: 15027s 250.44m 4.17h 0.17d 0.000 y
# Average job time: 496s 8.26m 0.14h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1916s 31.93m 0.53h 0.02d
# Submission to last job: 1921s 32.02m 0.53h 0.02d
# check tree model on 5MB chunk, using params recommended by Adam,
# (to verify branch lengths on 2X species)
# he ok'ed the results -- not necessary for next human run
ssh kolossus
cd /cluster/data/hg18/bed/multiz17way.2006-02-18/cons
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS -E -p MED -s HKY85 \
--tree "`cat ../tree-commas.nh`" \
/san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/ss/chr7/chr7.110000001-120000000.ss \
-o phyloFit.tree
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
# cd ..
mkdir run.cons
cd run.cons
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons
cp -p $san/ss/$c/$f.ss ../elliotsEncode.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss elliotsEncode.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative panTro1,rheMac2 \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$c $san/bed/$c
sleep 1
mv $tmp/$f.pp $san/pp/$c
mv $tmp/$f.bed $san/bed/$c
rm -fr $tmp
'EOF'
# emacs happy
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 14 .008 .28
#ENDLOOP
'EOF'
# happy emacs
# Create parasol batch and run it
pushd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons
# mkdir /cluster/data/hg18/bed/multiz17way/cons/run.cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg18/bed/multiz17way/cons/run.cons/in.list
ssh pk
cd /cluster/store11/gs.19/build36/bed/multiz17way.2006-02-18/cons/run.cons
gensub2 in.list single template jobList
para create jobList
# 337 jobs
para try
para check
para push
# NOTE: some jobs crashed because they could not stat some /san/... files, but succeeded when pushed again
# Completed: 337 of 337 jobs
# CPU time in finished jobs: 16000s 266.66m 4.44h 0.19d 0.001 y
# IO & Wait Time: 13307s 221.79m 3.70h 0.15d 0.000 y
# Average job time: 87s 1.45m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 173s 2.88m 0.05h 0.00d
# Submission to last job: 225s 3.75m 0.06h 0.00d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir
find ./bed -name "chr*.bed" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg18/bed/multiz17way/cons
# load into database
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/cons
hgLoadBed -strict hg18 phastConsElements17way mostConserved.bed
# Loaded 2037557 elements
# compare with previous tracks
hgsql hg18 -e "select count(*) from phastConsElements17way"
# 2260575
# hgsql hg18 -e "select count(*) from phastConsElements"
# hg18 does not have phastConsElements table
# 1601903
# Try for 5% overall cov, and 70% CDS cov (used elen=13, tcov=.007, rho=.27)
featureBits hg18 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.072%, phastConsElements17way 5.510%, both 0.759%, cover 70.83%, enrich 12.86x
featureBits hg17 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.064%, phastConsElements17way 5.104%, both 0.748%, cover 70.29%, enrich 13.77x
# compare with previous tracks
featureBits hg18 -enrichment refGene:cds phastConsElements10way
# refGene:cds 1.062%, phastConsElements10way 5.003%, both 0.734%, cover 69.18%, enrich 13.83x
featureBits hg18 -enrichment refGene:cds phastConsElements
# refGene:cds 1.062%, phastConsElements 4.810%, both 0.771%, cover 72.65%, enrich 15.11x
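# How to read the -enrichment lines: using the first hg18 line above, the
# numbers work out as
#   cover  = both / refGene:cds             = 0.759 / 1.072 ~ 70.8%
#   enrich = cover / phastConsElements17way = 70.83 / 5.510 ~ 12.86x
# i.e. ~71% of CDS bases are in conserved elements, and a CDS base is ~13x
# more likely than an average genomic base to fall in an element.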
# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastCons17way.wig phastCons17way.wib
# about 23 minutes for above
cp -p phastCons17way.wi? /cluster/data/hg18/bed/multiz17way/cons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/cons
ln -s `pwd`/phastCons17way.wib /gbdb/hg18/multiz17way/phastCons17way.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz17way hg18 \
phastCons17way phastCons17way.wig
# ~ 3 minute load
# Downloads (2006-02-22 Fan)
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way
mkdir mafDownloads
cd mafDownloads
# upstream mafs (mafFrags takes a while)
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags hg18 multiz17way up.bed upstream$i.maf \
-orgs=../species.lst
rm up.bed
end
date
'EOF'
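# Note on the awk in mafFrags.csh above (a sketch of what it does): it trims
# the long feature name in column 4 down to its first nine characters (the
# RefSeq accession, e.g. NM_004332), inserts a 0 score, and keeps column 5
# (presumably the strand) as the last field, so mafFrags gets a clean
# six-column bed.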
time csh mafFrags.csh > mafFrags.log
nice gzip up*.maf
ssh kkstore02
cd /cluster/data/hg18/bed/multiz17way/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../maf/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
time csh downloads.csh > downloads.log
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/hg18/multiz17way
mkdir $dir
ln -s /cluster/data/hg18/bed/multiz17way/mafDownloads/{*.gz,md5sum.txt} $dir
##############################################################################
# SET DEFAULT POSITION TO chrX:151,073,054-151,383,976, TO SHOW GENE GABRA3
hgsql -e 'delete from dbDb where name="hg18"' \
-h genome-testdb hgcentraltest
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
VALUES("hg18", "Feb. 2006", "/gbdb/hg18/nib", "Human", \
"chrX:151,073,054-151,383,976", 1, 9, "Human", "Homo sapiens", \
"/gbdb/hg18/html/description.html", 1, 1, "NCBI Build 36.1");' \
-h genome-testdb hgcentraltest
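# (Optional, not part of the original steps) sanity-check the inserted row:
hgsql -h genome-testdb hgcentraltest \
    -e 'select name, description, defaultPos, sourceName from dbDb where name="hg18"'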
############################################################################
# HG16/HG17 -> HG18 LIFTOVER CHAINS (DONE 2/24/06 Fan)
# These chains hopefully don't suck.
# Sorry I only used the makeLoChain-align script from the set of scripts
# already created for this task. I wanted more control. I should mention
# I used a size of 10kb instead of 3kb for the split (blat query) sizes in
# hg18. This had a huge effect on the number of hits in the blat, which
# then had a huge effect on the number of chains. I should also mention
# that hg18 chromosomes chr1 and chr2 were split further
# into more than a single query file. This helped a LOT in avoiding
# cluster hippos classically associated with those chroms.
######## LIFTOVER PREPARATION
# Split up hg18
ssh pk
cd /san/sanVol1/scratch/hg18
mkdir -p liftSplits/{split,lift}
bash
for fa in /cluster/data/hg18/?{,?,*hap*}/*.fa; do
c=`basename $fa .fa`
echo $c
faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
done
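# The .lft files written by faSplit -lift record where each 10kb piece lies
# on its chromosome; they are presumably what the *SplitLift.sh step below
# uses to lift the split-coordinate psl back to chromosome coordinates
# before chaining.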
mkdir -p biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chr2.fa 5 chr2_
rm chr{1,2}.fa
# Make some dirs
cd /san/sanVol1/scratch
mkdir -p hg{15,16,17}
# Copy 11.ooc files to each of hg15, hg16, hg17 dirs.
cp -p /cluster/store5/gs.16/build33/11.ooc hg15
cp -p /cluster/store4/gs.17/build34/11.ooc hg16
cp -p /cluster/store5/gs.18/build35/11.ooc hg17
## First, copy over Andy's scripts.
mkdir -p /san/sanVol1/scratch/fan
cp -p /san/sanVol1/scratch/andy/*.sh /san/sanVol1/scratch/fan
cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan
######## LIFTOVER BLATING
# HG16
ssh pk
cd /cluster/data/hg16
makeLoChain-align hg16 /scratch/hg/hg16/bothMaskedNibs hg18 \
/san/sanVol1/scratch/hg18/biggerSplits/split
cd bed/
mv blat.hg18.2006-02-24 /san/sanVol1/scratch/hg16
cd /san/sanVol1/scratch/hg16/blat.hg18.2006-02-24/run/
sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg16ToHg18"}' > newspec
para create newspec
para try
para push
# Completed: 2394 of 2394 jobs
# CPU time in finished jobs: 623927s 10398.79m 173.31h 7.22d 0.020 y
# IO & Wait Time: 13255s 220.91m 3.68h 0.15d 0.000 y
# Average job time: 266s 4.44m 0.07h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3613s 60.22m 1.00h 0.04d
# Submission to last job: 4112s 68.53m 1.14h 0.05d
# HG17
ssh pk
cd /cluster/data/hg17
makeLoChain-align hg17 /scratch/hg/hg17/bothMaskedNibs hg18 /san/sanVol1/scratch/hg18/biggerSplits/split
cd bed/
mv blat.hg18.2006-02-24/ /san/sanVol1/scratch/hg17
cd /san/sanVol1/scratch/hg17/blat.hg18.2006-02-24/run/
sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg17ToHg18"}' > newspec
para create newspec
para try
para push
# Completed: 2622 of 2622 jobs
# CPU time in finished jobs: 618557s 10309.28m 171.82h 7.16d 0.020 y
# IO & Wait Time: 13735s 228.92m 3.82h 0.16d 0.000 y
# Average job time: 241s 4.02m 0.07h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3655s 60.92m 1.02h 0.04d
# Submission to last job: 4228s 70.47m 1.17h 0.05d
######## LIFTOVER CHAINING
# LIFTING
ssh pk
cd /san/sanVol1/scratch/fan
cp mm7SplitLift.sh hg18SplitLift.sh
# change andy to fan, mm7 to hg18, and chrX to chr2, and remove chrUn_random
vi hg18SplitLift.sh
cat << 'EOF' > hg18ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/fan/hg18Lifts
pushd /scratch/fan/hg18Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
'EOF'
chmod +x hg18ChainMergeSplit.sh
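# Note on the script above: pushd leaves the starting directory on the
# directory stack, so `dirs +1` refers back to it -- the merged and split
# chain/ directory is copied back next to the original chainRaw/ before the
# scratch copies are removed.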
# HG16
cd /san/sanVol1/scratch/hg16/blat.hg18.2006-02-24/raw
/san/sanVol1/scratch/fan/hg18SplitLift.sh
cd ../
mkdir chainRun chainRaw
cd chainRun
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg16/bothMaskedNibs /san/sanVol1/scratch/hg18/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'EOF'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single gsub spec
para create spec
para push
para time
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 3599s 59.98m 1.00h 0.04d 0.000 y
# IO & Wait Time: 1040s 17.34m 0.29h 0.01d 0.000 y
# Average job time: 95s 1.58m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 303s 5.05m 0.08h 0.00d
# Submission to last job: 303s 5.05m 0.08h 0.00d
# HG17
cd /san/sanVol1/scratch/hg17/blat.hg18.2006-02-24/raw
/san/sanVol1/scratch/fan/hg18SplitLift.sh
cd ../
mkdir chainRun chainRaw
cd chainRun
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg17/bothMaskedNibs /san/sanVol1/scratch/hg18/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'EOF'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single gsub spec
para create spec
para try
para push
para time
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 3671s 61.19m 1.02h 0.04d 0.000 y
# IO & Wait Time: 1186s 19.76m 0.33h 0.01d 0.000 y
# Average job time: 99s 1.65m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 282s 4.70m 0.08h 0.00d
# Submission to last job: 282s 4.70m 0.08h 0.00d
######### CHAINMERGE/NET/NETSUBSET
ssh kolossus
mkdir -p /scratch/fan/hg18Lifts
cd /scratch/fan/hg18Lifts
cp -rp /san/sanvol1/scratch/hg17/blat.hg18.2006-02-24/chainRaw/ .
mkdir chain
time /cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
cp -rp chain /san/sanvol1/scratch/hg17/blat.hg18.2006-02-24/
mv chain chain.17
# remove it later
rm -rf chain.17
cp -r /san/sanvol1/scratch/hg16/blat.hg18.2006-02-24/chainRaw/ .
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
# about 30 minutes.
cp -rp chain /san/sanvol1/scratch/hg16/blat.hg18.2006-02-24/
rm -rf chain*
ssh pk
cd /san/sanvol1/scratch/fan
cat << 'EOF' > netOver.sh
#!/bin/bash
chain=$1
chrom=`basename $chain .chain`
sizesHGOld=$2
sizesHG18=/cluster/data/hg18/chrom.sizes
chainDir=`dirname $chain`
blatDir=`dirname $chainDir`
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over
mkdir -p ${blatDir}/{over,net}
/cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG18 $net /dev/null
/cluster/bin/x86_64/netChainSubset $net $chain $over
'EOF'
chmod +x netOver.sh
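# Notes on netOver.sh above: chainNet takes the chain file, then target sizes
# (the old assembly) and query sizes (hg18), and writes the target-side net;
# the query-side net is discarded to /dev/null.  netChainSubset then extracts
# only the chains actually used by that net, which is the over.chain input
# liftOver expects.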
mkdir netRun
cd netRun/
find /san/sanVol1/scratch/hg16/blat.hg18.2006-02-24/chain -name "*.chain" \
| awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg16/chrom.sizes"}' >> spec
find /san/sanVol1/scratch/hg17/blat.hg18.2006-02-24/chain -name "*.chain" \
| awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg17/chrom.sizes"}' >> spec
para create spec
para push
para time
# Completed: 88 of 88 jobs
# CPU time in finished jobs: 881s 14.68m 0.24h 0.01d 0.000 y
# IO & Wait Time: 284s 4.74m 0.08h 0.00d 0.000 y
# Average job time: 13s 0.22m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 33s 0.55m 0.01h 0.00d
# Submission to last job: 73s 1.22m 0.02h 0.00d
# seems much faster than mm7.
########## FINISHING
ssh hgwdev
# HG16
cd /san/sanvol1/scratch/hg16/blat.hg18.2006-02-24/over
cat * >> ../hg16ToHg18.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -rp blat.hg18.2006-02-24/ /cluster/data/hg16/bed
cd /cluster/data/hg16/bed
ln -s blat.hg18.2006-02-24 blat.hg18
ln -s `pwd`/blat.hg18/hg16ToHg18.over.chain liftOver/hg16ToHg18.over.chain
ln -s `pwd`/liftOver/hg16ToHg18.over.chain /gbdb/hg16/liftOver/hg16ToHg18.over.chain
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg16/liftOver
cp /gbdb/hg16/liftOver/hg16ToHg18.over.chain .
gzip hg16ToHg18.over.chain
hgAddLiftOverChain hg16 hg18 /gbdb/hg16/liftOver/hg16ToHg18.over.chain
# HG17
cd /san/sanvol1/scratch/hg17/blat.hg18.2006-02-24/over
cat * >> ../hg17ToHg18.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -r blat.hg18.2006-02-24/ /cluster/data/hg17/bed
cd /cluster/data/hg17/bed
ln -s blat.hg18.2006-02-24 blat.hg18
ln -s `pwd`/blat.hg18/hg17ToHg18.over.chain liftOver/hg17ToHg18.over.chain
ln -s `pwd`/liftOver/hg17ToHg18.over.chain /gbdb/hg17/liftOver/hg17ToHg18.over.chain
mkdir -p /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg17/liftOver
cp /gbdb/hg17/liftOver/hg17ToHg18.over.chain .
gzip hg17ToHg18.over.chain
hgAddLiftOverChain hg17 hg18 /gbdb/hg17/liftOver/hg17ToHg18.over.chain
############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram)
ssh pk
cd /cluster/data/mm8/bed/blastzHg18.2006-02-16
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits hg18 chainMm8Link
# 994530182 bases of 2881515245 (34.514%) in intersection
#########################################################################
# GENOSCOPE TETRAODON (tetNig1) ECORES (DONE, 2006-03-03, Fan)
# GENOSCOPE TETRAODON (tetNig1) ECORES (REBUILT, 2006-04-04, Fan)
ssh kkstore02
mkdir -p /cluster/data/hg18/bed/ecoresTetNig1
cd /cluster/data/hg18/bed/ecoresTetNig1
wget --timestamp \
http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecoresHumBuild36/EXOFISH_HS_WITH_TN.gff
wget --timestamp \
http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecoresHumBuild36/EXOFISH_TN_WITH_HS.gff
# this is in gff format
# remove "Ecotig" from name field
sed -e 's/Ecotig EG/EG/g' EXOFISH_HS_WITH_TN.gff |sed -e 's/CHR//' > ExofishHs36Tnig1.gff
# sed -e 's/Ecotig EG/EG/g' ExofishHs36Tnig1 > ExofishHs36Tnig1.gff
# need to have tabs between fields not a space to load file into table
sed -e 's/ /\t/g' ExofishHs36Tnig1.gff > Hs36Tnig1format.gff
# if "ecore" is changed to "CDS" and "ecotig" to "transcript" this loads
# correctly into the table.
sed -e 's/ecore/CDS/' Hs36Tnig1format.gff | sed -e 's/ecotig/transcript/' \
| cut -f 1-8,11 > Hg18vstetNig1.gff
# add "chr" in front of the chromsome name in first field (2005-02-08)
perl -pi.bak -e 's/^([0-9XYM]{1,2})/chr$1/' Hg18vstetNig1.gff
rm *.bak
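# For example, a line whose first field was "12" (or "X", "Y", "M") now
# begins with "chr12" (or "chrX", ...); only the chromosome field is touched.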
# need to reload table
ssh hgwdev
cd /cluster/data/hg18/bed/ecoresTetNig1
echo 'drop table ecoresTetNig1;' | hgsql hg18
nice ldHgGene hg18 ecoresTetNig1 Hg18vstetNig1.gff
#########################################################################
# BUILD MAF ANNOTATION FOR MULTIZ17WAY (DONE 2006-03-07, Fan)
ssh kkstore01
cd /cluster/data/monDom4
twoBitInfo -nBed monDom4.2bit monDom4.N.bed
cd /cluster/data/rn4
twoBitInfo -nBed rn4.2bit rn4.N.bed
cd /cluster/data/mm8
twoBitInfo -nBed mm8.2bit mm8.N.bed
ssh kolossus
cd /cluster/data/hg18/bed/multiz17way
mkdir anno
cd anno
mkdir maf run
cd run
rm sizes nBeds
foreach i (`cat /cluster/data/hg18/bed/multiz17way/species.lst`)
ln -s /cluster/data/$i/chrom.sizes $i.len
ln -s /cluster/data/$i/$i.N.bed $i.bed
echo $i.bed >> nBeds
echo $i.len >> sizes
end
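# (The per-species N beds and chrom.sizes collected above give mafAddIRows
#  what it needs to write the "i" annotation rows, so the browser can
#  distinguish truly missing sequence from merely unaligned sequence in
#  each species.)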
echo date > jobs.csh
foreach i (../../maf/*.maf)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/hg18/hg18.2bit ../maf/`basename $i` >> jobs.csh
echo "echo $i" >> jobs.csh
end
echo date >> jobs.csh
# do smaller jobs first
tac jobs.csh > jobsRev.csh
mv jobsRev.csh jobs.csh
csh jobs.csh > jobs.log
# This took 10 hours. Hg17 took 1.5 hrs.
ssh kolossus
# loading here because summary table load crashed on hgwdev
cd /cluster/data/hg18/bed/multiz17way/anno/maf
mkdir -p /gbdb/hg18/multiz17way/anno/maf
ln -s /cluster/data/hg18/bed/multiz17way/anno/maf/*.maf \
/gbdb/hg18/multiz17way/anno/maf
cat > loadMaf.csh << 'EOF'
date
hgLoadMaf -pathPrefix=/gbdb/hg18/multiz17way/anno/maf \
hg18 multiz17way
date
cat *.maf | \
nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz17waySummary stdin
date
'EOF'
csh loadMaf.csh > loadMaf.log
ssh kkstore02
cd /cluster/data/hg18/bed/multiz17way
mkdir frames
cd frames
cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .
#edit Makefile to correct species names
cat > copy.csh << 'EOF'
set dir = /cluster/bluearc/hg18/multiz17way/frames/maf
mkdir -p $dir
foreach i (../maf/*.maf)
echo $i
cp -p $i $dir
end
'EOF'
csh copy.csh > copy.log
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/frames
time make getGenes > getGenes.log
# 26.100u 4.360s 1:02.78 48.5% 0+0k 0+0io 29643pf+0w
time make getFrames > getFrames.log
# Batch failed after 4 tries on ../mkMafFrames bosTau2 hg18 /san/sanvol1/scratch/hg18/multiz17way/frames/genes/bosTau2.gp.gz /cluster/data/hg18/bed/multiz17way/maf/chr1.maf /san/sanvol1/scratch/hg18/multiz17way/frames/mafFrames/bosTau2/chr1.mafFrames
#make[1]: *** [mafFrames/bosTau2.cluster.done] Error 255
# copy Makefile to Makefile.try2 and remove bosTau2
time make -f Makefile.try2 getFrames > getFrames.try2.log
# copy Makefile to Makefile.try3 with only bosTau2 remaining
time make -f Makefile.try3 getGenes > getGenes.try3.log
time make -f Makefile.try3 getFrames > getFrames.try3.log
time make -f Makefile.try3 getFrames > getFrames.try5.log
time make -f Makefile.try3 getFrames > getFrames.try6.log
# Finally after Mark fixed the bug and recompiled, it worked.
time make -f Makefile.try3 getFrames > getFrames.try7.log
time make loadDb > loadDb.log
#########################################################################
# Build maf annotation for multiz17way (STARTED 2006-02-28, DONE 2006-03-09, Fan)
# rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
ssh kkstore01
cd /cluster/data/monDom4
twoBitInfo -nBed monDom4.2bit monDom4.N.bed
cd /cluster/data/rn4
twoBitInfo -nBed rn4.2bit rn4.N.bed
cd /cluster/data/mm8
twoBitInfo -nBed mm8.2bit mm8.N.bed
ssh kolossus
cd /cluster/data/hg18/bed/multiz17way
mkdir anno
cd anno
mkdir maf run
cd run
rm sizes nBeds
foreach i (`cat /cluster/data/hg18/bed/multiz17way/species.lst`)
ln -s /cluster/data/$i/chrom.sizes $i.len
ln -s /cluster/data/$i/$i.N.bed $i.bed
echo $i.bed >> nBeds
echo $i.len >> sizes
end
echo date > jobs.csh
foreach i (../../maf/*.maf)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/hg18/hg18.2bit ../maf/`basename $i` >> jobs.csh
echo "echo $i" >> jobs.csh
end
echo date >> jobs.csh
# do smaller jobs first
tac jobs.csh > jobsRev.csh
mv jobsRev.csh jobs.csh
csh jobs.csh > jobs.log
# This took 10 hours. Hg17 took 1.5 hrs.
ssh hgwdev
# loading here because summary table load crashed on hgwdev
cd /cluster/data/hg18/bed/multiz17way/anno/maf
mkdir -p /gbdb/hg18/multiz17way/anno/maf
ln -s /cluster/data/hg18/bed/multiz17way/anno/maf/*.maf \
/gbdb/hg18/multiz17way/anno/maf
cat > loadMaf.csh << 'EOF'
date
hgLoadMaf -pathPrefix=/gbdb/hg18/multiz17way/anno/maf \
hg18 multiz17way
date
cat *.maf | \
nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz17waySummary stdin
date
'EOF'
csh loadMaf.csh > loadMaf.log
# Dropped unused indexes (2006-05-09 kate)
# NOTE: this is not required in the future, as the loader
# has been fixed to not generate these indexes
hgsql hg18 -e "alter table multiz17waySummary drop index chrom_2"
hgsql hg18 -e "alter table multiz17waySummary drop index chrom_3"
ssh kkstore02
cd /cluster/data/hg18/bed/multiz17way
mkdir frames
cd frames
cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .
# !!! NEXT TIME, COPY ALL maf FILES OVER TO san TO AVOID kkstore02 OVERLOAD.
# edit Makefile to correct species names
cat > copy.csh << 'EOF'
set dir = /cluster/bluearc/hg18/multiz17way/frames/maf
mkdir -p $dir
foreach i (../maf/*.maf)
echo $i
cp -p $i $dir
end
'EOF'
csh copy.csh > copy.log
#for i in ../../maf/*.maf; do echo $i; cp $i /san/sanvol1/scratch/hg18/multiz17wayFrames/maf/$i; done
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/frames
time make getGenes > getGenes.log
# 26.100u 4.360s 1:02.78 48.5% 0+0k 0+0io 29643pf+0w
time make getFrames > getFrames.log
# ~2 hours
time make loadDb > loadDb.log
###
# rebuild frames to get bug fix, using 1-pass maf methodology
# (2006-06-09 markd)
ssh kkstore02
cd /cluster/data/hg18/bed/multiz17way/frames
mv mafFrames/ mafFrames.old2
nice tcsh # easy way to get process niced
(cat ../maf/*.maf | time genePredToMafFrames hg18 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz oryCun1 genes/oryCun1.gp.gz panTro1 genes/panTro1.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz xenTro1 genes/xenTro1.gp.gz | gzip >multiz17way.mafFrames.gz)>&frames.log&
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/frames
hgLoadMafFrames hg18 multiz17wayFrames multiz17way.mafFrames.gz >&log&
##########################################################################
# BUILD ALLEN BRAIN TRACK (DONE 03/11/06 Fan)
# Make the working directory
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir allenBrain
cd allenBrain
# Remap the probe alignments from mm7 to hg18
zcat /gbdb/mm7/liftOver/mm7ToHg18.over.chain.gz \
| pslMap -chainMapFile -swapMap \
/cluster/data/mm7/bed/allenBrain/allenBrainAli.psl stdin stdout \
| sort -k 14,14 -k 16,16n > unscored.psl
pslRecalcMatch unscored.psl /cluster/data/hg18/nib \
/cluster/data/mm7/bed/allenBrain/allProbes.fa allenBrainAli.psl
# Load the database
hgsql hg18 < ~/kent/src/hg/lib/allenBrainUrl.sql
hgsql hg18 -e 'load data local infile "/cluster/data/mm7/bed/allenBrain/allenBrainUrl.tab" into table allenBrainUrl;'
hgLoadPsl hg18 allenBrainAli.psl
mkdir /gbdb/hg18/allenBrain
ln -s /cluster/data/mm7/bed/allenBrain/allProbes.fa /gbdb/hg18/allenBrain/allProbes.fa
hgLoadSeq hg18 /gbdb/hg18/allenBrain/allProbes.fa
# Make mapping between known genes and allenBrain
hgMapToGene hg18 allenBrainAli -type=psl knownGene knownToAllenBrain
##########################################################################
#### Blat knownGene proteins to determine exons
# (DONE - 2006-03-15 - 2006-03-24 - hiramc)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir blat.hg18KG.2006-03-15
rm blat.hg18KG
ln -s blat.hg18KG.2006-03-15 blat.hg18KG
cd blat.hg18KG
pepPredToFa hg18 knownGenePep known.fa
# The kluster run
ssh pk
cd /cluster/data/hg18/bed/blat.hg18KG
cat << '_EOF_' > blatSome
#!/bin/csh -fe
blat -t=dnax -q=prot -out=pslx /scratch/hg/gs.19/build36/bothMaskedNibs/$1.nib \
kgfa/$2.fa $3
'_EOF_'
# << keep emacs happy
chmod +x blatSome
ls -1S /scratch/hg/gs.19/build36/bothMaskedNibs > human.lst
mkdir kgfa
cd kgfa
# This split should be done on the file server, not over NFS
faSplit sequence ../known.fa 3000 kg
ls -1S *.fa > ../kg.lst
cd ..
cat << '_EOF_' > template
#LOOP
blatSome $(root1) $(root2) {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
gensub2 human.lst kg.lst template jobList
mkdir psl
cd psl
sed -e "s/.nib//" ../human.lst | xargs mkdir
cd ..
para create jobList
para try ... check ... push ... etc
# Completed: 142100 of 142100 jobs
# CPU time in finished jobs: 7520598s 125343.30m 2089.06h 87.04d 0.238 y
# IO & Wait Time: 415523s 6925.38m 115.42h 4.81d 0.013 y
# Average job time: 56s 0.93m 0.02h 0.00d
# Longest finished job: 5737s 95.62m 1.59h 0.07d
# Submission to last job: 72538s 1208.97m 20.15h 0.84d
ssh kkstore02
cd /cluster/data/hg18/bed/blat.hg18KG.2006-03-15
pslSort dirs raw.psl /tmp psl/*
# -rw-rw-r-- 1 568238823 Mar 20 13:30 raw.psl
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
# -rw-rw-r-- 1 43446007 Mar 24 11:13 cooked.psl
pslUniq cooked.psl hg18KG.psl
# -rw-rw-r-- 1 41321225 Mar 24 11:14 hg18KG.psl
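# Filter notes: pslReps -minCover=0.9 -minAli=0.9 keeps alignments that cover
# at least 90% of the protein query with at least a 90% alignment ratio;
# pslUniq then keeps a single alignment per query, which is why hg18KG.psl
# and kgName.lst both end up with 36727 entries (see the counts below).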
cut -f 10 hg18KG.psl > kgName.lst
faSomeRecords known.fa kgName.lst hg18KG.fa
faSize hg18KG.fa
# 16419953 bases (12961273 N's 3458680 real 3458680 upper 0 lower)
# in 36727 sequences in 1 files
faSize known.fa
# 16430067 bases (12969298 N's 3460769 real 3460769 upper 0 lower)
# in 36798 sequences in 1 files
# You may need to build this pslxToFa - it is not in the standard build
pslxToFa hg18KG.psl hg18KG_ex.fa -liftTarget=genome.lft \
-liftQuery=protein.lft
# -rw-rw-r-- 1 11294262 Mar 24 11:31 protein.lft
# -rw-rw-r-- 1 21428637 Mar 24 11:31 hg18KG_ex.fa
# -rw-rw-r-- 1 14324928 Mar 24 11:31 genome.lft
wc -l *.psl *.lft *.fa kgName.lst
# 39908 cooked.psl
# 36727 hg18KG.psl
# 1521400 raw.psl
# 303516 genome.lft
# 303516 protein.lft
# 383037 hg18KG.fa
# 607032 hg18KG_ex.fa
# 383348 known.fa
# 36727 kgName.lst
# 3615211 total
# back on hgwdev
ssh hgwdev
cd /cluster/data/hg18/bed/blat.hg18KG
kgName hg18 hg18KG.psl blastKGRef04
# After about an hour, it exited with this message:
# sqlFreeConnection called on cache (hg18) that doesn't contain
# the given connection
# This may be a lurking error in this program, but the
# resulting file seems to have the correct number of lines:
hgsql hg18 < ~/kent/src/hg/lib/blastRef.sql
echo "rename table blastRef to blastKGRef04" | hgsql hg18
echo "load data local infile 'blastKGRef04' into table blastKGRef04" | hgsql hg18
wc -l kgName.lst blastKGRef04 hg18KG.psl
# 36727 kgName.lst
# 36727 blastKGRef04
# 36727 hg18KG.psl
# 110181 total
hgPepPred hg18 generic blastKGPep04 hg18KG.fa
# end blat proteins
##########################################################################
# BUILD NIBB IMAGE PROBES (DONE 2006-03-14 galt following Jim's hg17 example)
# Make directory on san for cluster job and copy in sequence
ssh pk
mkdir /san/sanvol1/scratch/hg18/nibbPics
cd /san/sanvol1/scratch/hg18/nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
# Make parasol job dir and sequence list files
mkdir run
cd run
mkdir psl
ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst
echo ../nibbImageProbes.fa > mrna.lst
# Create parasol gensub file
cat << '_EOF_' > gsub
#LOOP
blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
#ENDLOOP
'_EOF_'
# Create parasol batch
gensub2 genome.lst mrna.lst gsub spec
para create spec
# Do para try/push/time etc.
#Completed: 49 of 49 jobs
#CPU time in finished jobs: 12585s 209.74m 3.50h 0.15d 0.000 y
#IO & Wait Time: 411s 6.86m 0.11h 0.00d 0.000 y
#Average job time: 265s 4.42m 0.07h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1145s 19.08m 0.32h 0.01d
#Submission to last job: 1195s 19.92m 0.33h 0.01d
# Make sort and filter
catDir psl | sort -k 10 \
| pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
| sort -k 14,14 -k 16,16n \
| sed 's#/san/sanvol1/scratch/hg18/nib/chr#chr#' \
| sed 's/.nib//' > ../nibbImageProbes.psl
# Make bed file and copy in stuff
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir nibbPics
cd nibbPics
cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
cp /san/sanvol1/scratch/hg18/nibbPics/nibbImageProbes.psl .
# Load into database
ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa /gbdb/hg18/nibbImageProbes.fa
hgLoadSeq hg18 /gbdb/hg18/nibbImageProbes.fa
hgLoadPsl hg18 nibbImageProbes.psl
##########################################################################
# UPDATED hg18.knownToVisiGene (2006-03-15 galt)
# after making sure hg18.vgAllProbes was up to date (see makeVisiGene.doc)
ssh hgwdev
knownToVisiGene hg18 -fromProbePsl=vgAllProbes
##########################################################################
# GENERATE SUMMARY STATISTICS (DONE, Fan 3/18/06)
ssh hgwdev
cd /cluster/data/hg18
mkdir stat
cd stat
stats.pl ~/hg18 >hg18.pl.out
hgCalStat hg18.pl.out hg18 hg18.out
cp hg18.out hg18.out.sorted
# Edit hg18.out.sorted to order by chromosomes and
# replace the "?" in the Y chrom line with 6265435 and align its position.
vi hg18.out.sorted
# Add the hg18 stats to goldenPath/stats.html
cd ~/browser/goldenPath
# insert hg18.out.sorted into stats.html and add necessary
# surrounding HTML lines for the hg18 section.
vi stats.html
cvs update stats.html
cvs commit stats.html
# Change description of hg18, per suggestion by Kim at NCBI (3/20/06, Fan).
ssh hgwdev
echo "update dbDb set description='Mar. 2006' where name = 'hg18';" \
| hgsql -h genome-testdb hgcentraltest
############################################################################
# hg18 -> hg17 LIFTOVER CHAINS (DONE 3/20/06 Fan)
# I used a size of 10kb instead of 3kb for the split (blat query) sizes in
# hg17. This had a huge effect on the number of hits in the blat, which
# then had a huge effect on the number of chains. I should also mention
# that hg17 chromosomes chr1 and chr2 were split further
# into more than a single query file. This helped a LOT in avoiding
# cluster hippos classically associated with those chroms.
######## LIFTOVER PREPARATION
# The following paragraph was already done during hg15 to hg17 liftover built
# Split up hg17
ssh pk
cd /san/sanVol1/scratch/hg17
mkdir -p liftSplits/{split,lift}
bash
for fa in /cluster/data/hg17/?{,?,*hap*}/*.fa; do
c=`basename $fa .fa`
echo $c
faSplit -lift=liftSplits/lift/${c}.lft size $fa -oneFile 10000 liftSplits/split/$c
done
mkdir -p biggerSplits/split
cd biggerSplits/
ln -s ../liftSplits/lift
cd split/
ln -s ../../liftSplits/split/* .
faSplit sequence chr1.fa 5 chr1_
faSplit sequence chr2.fa 5 chr2_
rm chr{1,2}.fa
# Make some dirs
cd /san/sanVol1/scratch
mkdir -p hg18
# Copy 11.ooc files to hg18 subdirectory.
# cp -p /cluster/store5/gs.16/build33/11.ooc hg18
## First, copy over scripts. (Already done before)
# mkdir -p /san/sanVol1/scratch/fan
# cp -p /san/sanVol1/scratch/fan/*.sh /san/sanVol1/scratch/fan
# cp /san/sanVol1/scratch/andy/psl.header /san/sanVol1/scratch/fan
######## LIFTOVER BLATING
# HG18
ssh pk
cd /cluster/data/hg18
makeLoChain-align hg18 /scratch/hg/hg18/nib hg17 /san/sanVol1/scratch/hg17/biggerSplits/split
cd bed
mv blat.hg17.2006-03-20 /san/sanVol1/scratch/hg18
cd /san/sanVol1/scratch/hg18/blat.hg17.2006-03-20/run/
sed 's/^blat/blat.sh/; s/\}.*$/}/' spec | awk '{print "/san/sanVol1/scratch/fan/" $0 " hg18ToHg17"}' > newspec
para create newspec
para try
para push
# Completed: 2646 of 2646 jobs
# CPU time in finished jobs: 633021s 10550.35m 175.84h 7.33d 0.020 y
# IO & Wait Time: 14063s 234.39m 3.91h 0.16d 0.000 y
# Average job time: 245s 4.08m 0.07h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3645s 60.75m 1.01h 0.04d
# Submission to last job: 6153s 102.55m 1.71h 0.07d
######## LIFTOVER CHAINING
# LIFTING
ssh pk
cd /san/sanVol1/scratch/fan
cp mm7SplitLift.sh hg17SplitLift.sh
# change andy to fan, mm7 to hg17, and chrX to chr2, and remove chrUn_random
vi hg17SplitLift.sh
cat << 'EOF' > hg17ChainMergeSplit.sh
#!/bin/bash
cp -r chainRaw/ /scratch/fan/hg17Lifts
pushd /scratch/fan/hg17Lifts
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/*.chain | /cluster/bin/x86_64/chainSplit chain stdin
cp -r chain `dirs +1`
rm -rf chain chainRaw
'EOF'
chmod +x hg17ChainMergeSplit.sh
# HG18
cd /san/sanVol1/scratch/hg18/blat.hg17.2006-03-20/raw
/san/sanVol1/scratch/fan/hg17SplitLift.sh
cd ../
mkdir chainRun chainRaw
cd chainRun
cat > gsub << 'EOF'
#LOOP
/cluster/bin/x86_64/axtChain -verbose=0 -linearGap=medium -psl $(path1) /scratch/hg/hg18/nib /san/sanVol1/scratch/hg17/nib {check out line+ ../chainRaw/$(root1).chain}
#ENDLOOP
'EOF'
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single gsub spec
para create spec
para try
para push
para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 3713s 61.88m 1.03h 0.04d 0.000 y
# IO & Wait Time: 1284s 21.41m 0.36h 0.01d 0.000 y
# Average job time: 109s 1.81m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 310s 5.17m 0.09h 0.00d
# Submission to last job: 310s 5.17m 0.09h 0.00d
######### CHAINMERGE/NET/NETSUBSET
ssh kolossus
mkdir -p /scratch/fan/hg17Lifts
cd /scratch/fan/hg17Lifts
cp -r /san/sanvol1/scratch/hg18/blat.hg17.2006-03-20/chainRaw/ .
mkdir chain
/cluster/bin/x86_64/chainMergeSort chainRaw/* | /cluster/bin/x86_64/chainSplit chain stdin
# about 30 minutes.
cp -rp chain /san/sanvol1/scratch/hg18/blat.hg17.2006-03-20/
rm -rf chain
rm -rf chainRaw
ssh pk
cd /san/sanvol1/scratch/fan
cat << 'EOF' > netOver.sh
#!/bin/bash
chain=$1
chrom=`basename $chain .chain`
sizesHGOld=$2
sizesHG17=/cluster/data/hg17/chrom.sizes
chainDir=`dirname $chain`
blatDir=`dirname $chainDir`
net=${blatDir}/net/${chrom}.net
over=${blatDir}/over/${chrom}.over
mkdir -p ${blatDir}/{over,net}
/cluster/bin/x86_64/chainNet $chain $sizesHGOld $sizesHG17 $net /dev/null
/cluster/bin/x86_64/netChainSubset $net $chain $over
'EOF'
chmod +x netOver.sh
mkdir netRun
cd netRun/
find /san/sanVol1/scratch/hg18/blat.hg17.2006-03-20/chain -name "*.chain" \
| awk '{print "/san/sanVol1/scratch/fan/netOver.sh " $1 " /cluster/data/hg18/chrom.sizes"}' > spec
para create spec
para push
para time
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 431s 7.18m 0.12h 0.00d 0.000 y
# IO & Wait Time: 151s 2.52m 0.04h 0.00d 0.000 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 30s 0.50m 0.01h 0.00d
# Submission to last job: 43s 0.72m 0.01h 0.00d
########## FINISHING
ssh hgwdev
# HG18
cd /san/sanvol1/scratch/hg18/blat.hg17.2006-03-20/over
cat * >> ../hg18ToHg17.over.chain
cd ../
rm -rf psl/ net/ chain/ chainRaw/ over/
cd ../
cp -rp blat.hg17.2006-03-20/ /cluster/data/hg18/bed
cd /cluster/data/hg18/bed
ln -s blat.hg17.2006-03-20 blat.hg17
ln -s `pwd`/blat.hg17/hg18ToHg17.over.chain liftOver/hg18ToHg17.over.chain
ln -s `pwd`/liftOver/hg18ToHg17.over.chain /gbdb/hg18/liftOver/hg18ToHg17.over.chain
mkdir -p /usr/local/apache/htdocs/goldenPath/hg18/liftOver
cd /usr/local/apache/htdocs/goldenPath/hg18/liftOver
cp /gbdb/hg18/liftOver/hg18ToHg17.over.chain .
gzip hg18ToHg17.over.chain
hgAddLiftOverChain hg18 hg17 /gbdb/hg18/liftOver/hg18ToHg17.over.chain
##########################################################################
# NSCAN track - ( markd)
# hg17 had both NSCAN and NSCAN-EST tracks, in a composite track.
# currently have only NSCAN for hg18
cd /cluster/data/hg18/bed/nscan/
# obtained NSCAN predictions from Michael Brent's group
# at WUSTL
wget -nv http://genes.cse.wustl.edu/jeltje/hg18/hg18.nscan.gtf
wget -r -np -nv http://genes.cse.wustl.edu/jeltje/hg18/chr_ptx/
mv genes.cse.wustl.edu/jeltje/hg18/chr_ptx .
rm -rf genes.cse.wustl.edu chr_ptx/index.html*
gzip -9 hg18.nscan.gtf chr_ptx/*.fa
chmod a-w hg18.nscan.gtf.gz chr_ptx/*.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt hg18 nscanGene hg18.nscan.gtf.gz
# add .a suffix to match transcript id
hgPepPred -suffix=.a hg18 generic nscanPep chr_ptx/*.fa.gz
rm -f *.tab
# update trackDb; need a hg18-specific page to describe informants
human/hg18/nscanGene.html
human/hg18/trackDb.ra
# QA NOTE [ASZ 9-11-2006]: mytouch nscanPep 200603271900.00
##########################################################################
# UPDATED hg18.knownToVisiGene (2006-04-05 galt)
# after making sure hg18.vgAllProbes was up to date (see makeVisiGene.doc)
ssh hgwdev
knownToVisiGene hg18 -fromProbePsl=vgAllProbes
##############################################################################
# BLASTZ CHIMP PanTro1 second time (STARTED - 2006-01-05, DONE 2006-01-13 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzPanTro1.2006-01-05
cd /cluster/data/hg18/bed
rm blastz.panTro1
ln -s blastzPanTro1.2006-01-05 blastz.panTro1
cd blastzPanTro1.2006-01-05
cat << '_EOF_' > DEF
# human vs chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/hg18.2bit
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
# QUERY: Chimp PanTro1 - single chunk big enough to run entire genome
SEQ2_DIR=/san/sanvol1/scratch/panTro1/panTro1.2bit
SEQ2_LEN=/scratch/hg/panTro1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzPanTro1.2006-01-05
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Thu Jan 5 11:26:45 PST 2006
# Encountered an error at the net step:
# startStep: 0, at step 5 net to stopStep 6
# chmod a+x /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain/netChains.csh
# ssh -x pk nice /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain/netChains.csh
# cd /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain
# chainPreNet hg18.panTro1.all.chain.gz /scratch/hg/hg18/chrom.sizes /scratch/hg/panTro1/chrom.sizes stdout
# chainNet stdin -minSpace=1 /scratch/hg/hg18/chrom.sizes /scratch/hg/panTro1/chrom.sizes stdout /dev/null
# netSyntenic stdin noClass.net
# Got 49 chroms in /scratch/hg/hg18/chrom.sizes, 52 in /scratch/hg/panTro1/chrom.sizes
# Finishing nets
# writing stdout
# writing /dev/null
# memory usage 363347968, utime 1042 s/100, stime 56
# netChainSubset -verbose=0 noClass.net hg18.panTro1.all.chain.gz stdout
# chainSort stdin stdout
# gzip -c
# Out of memory needMem - request size 6 bytes
# gzip: stdout: Broken pipe
# Command failed:
# ssh -x pk nice /cluster/data/hg18/bed/blastzPanTro1.2006-01-05/axtChain/netChains.csh
# 1/9/06, Retry again
ssh pk
cd /cluster/data/hg18/bed
cd blastzPanTro1.2006-01-05
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-continue=net \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
# Same error.
# Try with kolossus
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=net \
-stop=load \
`pwd`/DEF > load3.out 2>&1 &
# Still have problems, which seem to be related to the
# wrong $MACHTYPE and $PATH on kolossus. Updated my .cshrc
# Did the following manually on kolossus:
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
/cluster/bin/x86_64/netChainSubset -verbose=0 noClass.net hg18.panTro1.all.chain.gz stdout | chainSort stdin stdout | gzip -c > hg18.panTro1.over.chain.gz
mkdir -p /cluster/data/hg18/bed/liftOver
cp -p hg18.panTro1.over.chain.gz /cluster/data/hg18/bed/liftOver/hg18ToPanTro1.over.chain.gz
# Make axtNet for download: one .axt per hg18 seq.
netSplit noClass.net net
cd ..
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/scratch/hg/hg18/hg18.2bit /san/sanvol1/scratch/panTro1/panTro1.2bit stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.hg18.panTro1.net.axt.gz
end
# Make mafNet for multiz: one .maf per hg18 seq.
mkdir mafNet
foreach f (axtNet/*.hg18.panTro1.net.axt.gz)
axtToMaf -tPrefix=hg18. -qPrefix=panTro1. $f \
/scratch/hg/hg18/chrom.sizes /scratch/hg/panTro1/chrom.sizes \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
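# (csh modifiers used above: :t is the basename, each :r strips one extension;
#  e.g. chr1.hg18.panTro1.net.axt.gz with :t:r:r:r:r:r reduces to chr1.)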
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ above by hand.
ssh pk
cd /cluster/data/hg18/bed
cd blastzPanTro1.2006-01-05
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=load \
-stop=load \
`pwd`/DEF > load4.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Measurements:
# Go to kolossus to run featureBits to avoid out of memory problem.
ssh kolossus
bash
time HGDB_CONF=~/.hg.conf.read-only featureBits panTro1 chainHg18Link
# 2641472125 bases of 2733948177 (96.617%) in intersection
time HGDB_CONF=~/.hg.conf.read-only featureBits hg18 chainPanTro1Link
# 2681146909 bases of 2881515245 (93.046%) in intersection
time HGDB_CONF=~/.hg.conf.read-only featureBits panTro1 chainHg17Link
# 0 bases of 2733948177 (0.000%) in intersection
time HGDB_CONF=~/.hg.conf.read-only featureBits hg17 chainPanTro1Link
# 2633869032 bases of 2866216770 (91.894%) in intersection
#########################################################################
# BLASTZ RAT Rn3 (STARTED - 2005-12-22, DONE 2006-01-05 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzRn3.2005-12-22
cd /cluster/data/hg18/bed
rm blastz.rn3
ln -s blastzRn3.2005-12-22 blastz.rn3
cd blastzRn3.2005-12-22
cat << '_EOF_' > DEF
# human vs rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/scratch/hg/hg18/linSpecRep/notInRat
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rat Rn3 - chunk big enough to do all chroms in single whole pieces
SEQ2_DIR=/scratch/rat/rn3/softNib
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
SEQ2_LEN=/cluster/bluearc/rat/rn3/chrom.sizes
SEQ2_CHUNK=300000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzRn3.2005-12-22
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
`pwd`/DEF > to-load.out 2>&1 &
# start processing again on 12/31/05.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap \
-stop=load \
`pwd`/DEF > swap.out 2>&1 &
# Either UCSC RR and hgwdev systems or network went down around 11 AM 12/31/05.
# After holidays, start again on 1/3/06 and again on 1/5/06.
ssh pk
cd /cluster/data/hg18/bed
cd blastzRn3.2005-12-22
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap \
-continue=net \
-stop=load \
`pwd`/DEF > swap6.out 2>&1 &
# DONE! Jan 5 13:39
# Measurements:
nice featureBits rn3 chainHg18Link
# 962630574 bases of 2571104688 (37.440%) in intersection
nice featureBits hg18 chainRn3Link
# 964251210 bases of 2881515245 (33.463%) in intersection
#########################################################################
# BLASTZ ARMADILLO DasNov1 (STARTED - 2006-01-06 - 2006-01-09 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzDasNov1.2006-01-06
cd /cluster/data/hg18/bed
rm blastz.dasNov1
ln -s blastzDasNov1.2006-01-06 blastz.dasNov1
cd blastzDasNov1.2006-01-06
cat << '_EOF_' > DEF
# human vs armadillo
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for armadillo (per Webb email to Brian Raney)
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=30000000
SEQ1_LAP=10000
# QUERY: Armadillo DasNov1
SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit
SEQ2_LEN=/scratch/hg/dasNov1/chrom.sizes
SEQ2_LIMIT=400
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzDasNov1.2006-01-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Fri Jan 6 06:20:12 PST 2006
# 1:20 PM, 1/7/06
# The blastz cluster run seemed to finish OK, but make jobList somehow
# did not end, even after creating the run.time file manually. Killed it manually.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-continue=cat \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# Done, Jan 8 21:40.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Jan 9 06:11
# Reciprocal best net mafs for multiz (kate)
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.dasNov1
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 dasNov1 >&! rbest.log &
# Load nets (2007-03-12 kate)
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.dasNov1/axtChain
netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netDasNov1 stdin
netFilter -minGap=10 hg18.dasNov1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestDasNov1 stdin
#########################################################################
# BLASTZ DOG CanFam2 second time (DONE - 2005-12-28 - 2005-12-29 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzCanFam2.2005-12-28
cd /cluster/data/hg18/bed
rm blastz.canFam2
ln -s blastzCanFam2.2005-12-28 blastz.canFam2
cd blastzCanFam2.2005-12-28
cat << '_EOF_' > DEF
# human vs dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for dog (per Webb email to Brian Raney)
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog CanFam2 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_LEN=/cluster/bluearc/canFam2/chrom.sizes
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzCanFam2.2005-12-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started 2005-12-28 21:33
# Two jobs stuck in the same node. Did manual para stop and para push.
# Both finished within a few minutes.
# Done! On Thu Dec 29 05:27:31 PST 2005.
# system seems to hang on kolossus (3 processes of [tcsh -c nice chainMergeSort], not moving)
# manually killed the jobs.
# now use pk as the workhorse.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-continue=chainMerge \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
# Done! Thu Dec 29 09:10:02 PST 2005.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# Had an error at the load step,
# mySQL error 2013: Lost connection to MySQL server during query,
# probably due to sys admin working on network connections,
# continue at the load step
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-swap -continue=load -stop=load \
`pwd`/DEF > swap-load2.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Dec 29 13:21
# Measurements:
ssh hgwdev
nice featureBits canFam2 chainHg18Link
# 1477551526 bases of 2384996543 (61.952%) in intersection
nice featureBits hg18 chainCanFam2Link
# 1524764349 bases of 2881515245 (52.915%) in intersection
nice featureBits canFam2 chainHg17Link
# 1487483112 bases of 2384996543 (62.368%) in intersection
nice featureBits hg17 chainCanFam2Link
# 1530197469 bases of 2866216770 (53.387%) in intersection
#########################################################################
# BLASTZ ELEPHANT LoxAfr1 second time (STARTED - 2006-01-03 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03
cd /cluster/data/hg18/bed
rm blastz.loxAfr1
ln -s blastzLoxAfr1.2006-01-03 blastz.loxAfr1
cd blastzLoxAfr1.2006-01-03
cat << '_EOF_' > DEF
# human vs elephant
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Elephant LoxAfr1 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit
SEQ2_LEN=/scratch/hg/loxAfr1/chrom.sizes
SEQ2_LIMIT=300
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# failed at step 2 because the kki cluster was not started.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=pk \
-smallClusterHub=pk \
-continue=cat \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-smallClusterHub=pk \
-continue=net \
-stop=load \
`pwd`/DEF > load3.out 2>&1 &
# Same broken pipe error.
# netChainSubset -verbose=0 noClass.net hg18.loxAfr1.all.chain.gz stdout
# chainSort stdin stdout
# gzip -c
# Out of memory needMem - request size 28 bytes
# gzip: stdout: Broken pipe
# Command failed:
# ssh -x kolossus nice /cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03/axtChain/netChains.csh
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=net \
-stop=load \
`pwd`/DEF > load4.out 2>&1 &
# Finally, a success!
tail load4.out
#...
# cd /cluster/data/hg18/bed/blastzLoxAfr1.2006-01-03/axtChain
#netClass -verbose=0 -noAr noClass.net hg18 loxAfr1 hg18.loxAfr1.net
#netFilter -minGap=10 hg18.loxAfr1.net
#hgLoadNet -verbose=0 hg18 netLoxAfr1 stdin
#startStep: 5, at step 7 download to stopStep 6
# *** All done!
# *** Add {chain,net}LoxAfr1 tracks to trackDb.ra if necessary.
# The swap-load was not successful, after several tries.
# The last one seems to have been due to an out-of-memory problem.
# Per Hiram, we no longer do swap for 2X genomes, unless specifically requested.
# Mark made an inquiry, but said he can get by with hg18->loxAfr1 nets.
# reciprocal best net mafs for multiz (2007-03-09 kate)
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.loxAfr1
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 loxAfr1 >&! rbest.log &
# load net and reciprocal best net for comparison
# not sure why these tables and cleanup aren't done -- ask Fan
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.loxAfr1/axtChain
netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netLoxAfr1 stdin
netFilter -minGap=10 hg18.loxAfr1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestLoxAfr1 stdin
#########################################################################
# BLASTZ COW BosTau2 second time (STARTED - 2006-01-07 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzBosTau2.2006-01-07
cd /cluster/data/hg18/bed
rm blastz.bosTau2
ln -s blastzBosTau2.2006-01-07 blastz.bosTau2
cd blastzBosTau2.2006-01-07
cat << '_EOF_' > DEF
# human vs cow
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow BosTau2 - single chunk big enough to run entire genome
SEQ2_DIR=/san/sanvol1/scratch/bosTau2/bosTau2.2bit
SEQ2_LEN=/san/sanvol1/scratch/bosTau2/chrom.sizes
SEQ2_CHUNK=3200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzBosTau2.2006-01-07
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-stop=load \
-workhorse=pk \
`pwd`/DEF > load.out 2>&1 &
# Started Sat Jan 7 07:57:22 PST 2006
# blastz run (and load) done Jan 8 00:13
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# took a long time to finish.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Jan 8 21:10
# Measurements:
ssh hgwdev
nice featureBits bosTau2 chainHg18Link
# 1357027317 bases of 2812203870 (48.255%) in intersection
nice featureBits hg18 chainBosTau2Link
# 1357291762 bases of 2881515245 (47.103%) in intersection
nice featureBits bosTau2 chainHg17Link
# 0 bases of 2812203870 (0.000%) in intersection
# nice featureBits hg17 chainBosTau2Link
# 1350076765 bases of 2866216770 (47.103%) in intersection
#########################################################################
# BLASTZ TENREC EchTel1 second time (STARTED - 2006-01-09 DONE 2006-01-12 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzEchTel1.2006-01-09
cd /cluster/data/hg18/bed
rm blastz.echTel1
ln -s blastzEchTel1.2006-01-09 blastz.echTel1
cd blastzEchTel1.2006-01-09
cat << '_EOF_' > DEF
# human vs tenrec
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Tenrec EchTel1
SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit
SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes
SEQ2_LIMIT=400
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzEchTel1.2006-01-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Mon Jan 9 08:09:03 PST 2006
# Found over a thousand jobs failed, all with the following 7 hosts.
fgrep host j1.err | sort -u
# host: kkr10u06.kilokluster.ucsc.edu
# host: kkr10u58.kilokluster.ucsc.edu
# host: kkr10u62.kilokluster.ucsc.edu
# host: kkr11u34.kilokluster.ucsc.edu
# host: kkr11u39.kilokluster.ucsc.edu
# host: kkr12u18.kilokluster.ucsc.edu
# host: kkr12u29.kilokluster.ucsc.edu
# manually created /scratch/tmp on above machines (except one).
# 2 jobs still running for more than 5 hours each.
para stop
para recover jobList newJobList
# newJobList contains only 2 jobs. Checked the .psl files under psl, confirming only two files were missing.
para create newJobList
para push
# These 2 jobs finished within a couple of minutes!
para time >run.time
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=cat \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net \
-swap \
-stop=load \
`pwd`/DEF > swap-load3.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! On Jan 12 09:18
# reciprocal best net mafs for multiz (2007-03-09 kate)
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.echTel1
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 echTel1 >&! rbest.log &
# reloading chains which disappeared (2007-04-17 kate)
cd /cluster/data/hg18/bed/blastz.echTel1/axtChain
# edit loadUp.csh --> create loadUp2.csh and loadUp3.csh
# run loadUp2.csh (does chainSplit) on kkstore02
# run loadUp3.csh (does hgLoadChain) on hgwdev
#########################################################################
# BLASTZ CHICKEN GalGal2 second time (DONE - 2005-12-28 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzGalGal2.2005-12-28
cd /cluster/data/hg18/bed
rm blastz.galGal2
ln -s blastzGalGal2.2005-12-28 blastz.galGal2
cd blastzGalGal2.2005-12-28
cat << '_EOF_' > DEF
# human vs chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken GalGal2 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/cluster/bluearc/galGal2/chrom.sizes
SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzGalGal2.2005-12-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started 2005-12-28 10:35
# Two jobs were stuck on the same node. Did a manual para stop and para push.
# Both finished within a few minutes.
# Done! On Wed Dec 28 15:32:45 PST 2005.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# Had an error at the net step
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=net -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
# The gzip job on kolossus did not seem to be making any progress.
# Killed it manually and tried again.
# Still no progress, so killed it again. Now use pk instead of kolossus.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Wed Dec 28 20:39:44 PST 2005
# Measurements:
ssh hgwdev
nice featureBits galGal2 chainHg18Link
# 91564024 bases of 1054197620 (8.686%) in intersection
nice featureBits hg18 chainGalGal2Link
# 102417858 bases of 2881515245 (3.554%) in intersection
nice featureBits galGal2 chainHg17Link
# 93277286 bases of 1054197620 (8.848%) in intersection
nice featureBits hg17 chainGalGal2Link
# 103882699 bases of 2866216770 (3.624%) in intersection
# BLASTZ FROG XenTro1 second time (DONE - 2006-01-07 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzXenTro1.2006-01-06
cd /cluster/data/hg18/bed
rm blastz.xenTro1
ln -s blastzXenTro1.2006-01-06 blastz.xenTro1
cd blastzXenTro1.2006-01-06
cat << '_EOF_' > DEF
# human vs frog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Frog XenTro1 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit
SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes
SEQ2_LIMIT=400
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzXenTro1.2006-01-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Fri Jan 6 20:19:30 PST 2006
# Blastz run done. Jan 7 02:07 load.out
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
# got the following error:
startStep: 4, at step 5 net to stopStep 6
netChains: looks like previous stage was not successful (can't find [xenTro1.hg18.]all.chain[.gz]).
# Try it with pk instead of kolossus:
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load2.out 2>&1 &
# It worked, swap-load done. Jan 7 06:05
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-workhorse=pk \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Jan 7 06:18
# Measurements:
ssh hgwdev
nice featureBits xenTro1 chainHg18Link
# 61197900 bases of 1381238994 (4.431%) in intersection
nice featureBits hg18 chainXenTro1Link
# 67810866 bases of 2881515245 (2.353%) in intersection
nice featureBits xenTro1 chainHg17Link
# 81777842 bases of 1381238994 (5.921%) in intersection
nice featureBits hg17 chainXenTro1Link
# 85701475 bases of 2866216770 (2.990%) in intersection
# BLASTZ TETRAODON TetNig1 second time (DONE - 2006-01-07 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzTetNig1.2006-01-07
cd /cluster/data/hg18/bed
rm blastz.tetNig1
ln -s blastzTetNig1.2006-01-07 blastz.tetNig1
cd blastzTetNig1.2006-01-07
cat << '_EOF_' > DEF
# human vs tetraodon
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Tetraodon TetNig1 - single chunk big enough to run entire genome
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CHUNK=410000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzTetNig1.2006-01-07
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=load \
`pwd`/DEF > load.out 2>&1 &
# Started Sat Jan 7 05:40:51 PST 2006
# Encountered an error:
startStep: 0, at step 5 net to stopStep 6
netChains: looks like previous stage was not successful (can't find [hg18.tetNig1.]all.chain[.gz]).
# Try it with pk as the workhorse.
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-continue=net \
-stop=load \
`pwd`/DEF > load2.out 2>&1 &
# Load done. Sat Jan 7 07:34:56 PST 2006
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-swap -stop=load \
`pwd`/DEF > swap-load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-continue=download \
`pwd`/DEF > download.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=pk \
-swap -continue=download \
`pwd`/DEF > swap-download.out 2>&1 &
# Done! Sat Jan 7 08:02:14 PST 2006
# The download and swap-download took less than 10 seconds each. ???
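# (Optional sanity check, run on hgwdev: confirm the download step actually
# populated the goldenPath directory. The vsTetNig1 directory name assumes
# the usual download-directory convention; adjust if this run used another.)
# ls -l /usr/local/apache/htdocs/goldenPath/hg18/vsTetNig1/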
# Measurements:
ssh hgwdev
nice featureBits tetNig1 chainHg18Link
# 50026847 bases of 342403326 (14.611%) in intersection
nice featureBits hg18 chainTetNig1Link
# 57654754 bases of 2881515245 (2.001%) in intersection
nice featureBits tetNig1 chainHg17Link
# 34379509 bases of 342403326 (10.041%) in intersection
nice featureBits hg17 chainTetNig1Link
# 35910128 bases of 2866216770 (1.253%) in intersection
#########################################################################
# BLASTZ FUGU fr1 (STARTED - 2005-12-20, DONE 2006-01-04 Fan)
ssh pk
mkdir /cluster/data/hg18/bed/blastzFr1.2005-12-20
cd /cluster/data/hg18/bed
ln -s blastzFr1.2005-12-20 blastz.fr1
cd blastzFr1.2005-12-20
cat << '_EOF_' > DEF
# human vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Reuse parameters from human-chicken, except L=6000 (more relaxed)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18 - testing 100,000,000 sized chunk on pk kluster
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
# QUERY: Fugu Fr1 - chunk big enough to run the whole chrom at once
SEQ2_DIR=/san/sanvol1/scratch/fr1/nib
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CHUNK=400000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzFr1.2005-12-20
'_EOF_'
# << happy emacs
# establish a screen to control this job
ssh pk
cd /cluster/data/hg18/bed/blastzFr1.2005-12-20
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -stop=load \
`pwd`/DEF > thruLoad.out 2>&1 &
ssh pk
cd /cluster/data/hg18/bed/blastzFr1.2005-12-20
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -continue=chainMerge -stop=load \
`pwd`/DEF > thruLoad.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -continue=download \
`pwd`/DEF > download.clean.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -swap \
`pwd`/DEF > swap.out 2>&1 &
# Finish the remaining step, 1/4/06.
ssh pk
cd /cluster/data/hg18/bed/blastzFr1.2005-12-20
screen
bash
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 \
-swap -continue=download \
`pwd`/DEF > DownloadSwap.out 2>&1 &
# The first try found that the DEF had somehow been altered for rn3.
# Re-generated DEF and tried again.
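# (Quick hedged check that the regenerated DEF points at this fr1 run before
# relaunching.)
egrep '^(BASE|SEQ2_DIR)=' DEF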
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 \
-swap -continue=download \
`pwd`/DEF > DownloadSwap2.out 2>&1 &
# Done. Jan 4 09:48.
# measurements
nice featureBits hg18 chainFr1Link
# 51795958 bases of 2881515245 (1.798%) in intersection
nice featureBits hg17 chainFr1Link
#50831650 bases of 2866216770 (1.773%) in intersection
nice featureBits hg18 netFr1
# 691148929 bases of 2881515245 (23.986%) in intersection
nice featureBits hg17 netFr1
# 714234935 bases of 2866216770 (24.919%) in intersection
nice featureBits fr1 chainHg18Link
# 43267869 bases of 315518167 (13.713%) in intersection
nice featureBits fr1 chainHg17Link
# 0 bases of 315518167 (0.000%) in intersection
nice featureBits fr1 netHg18
# 140843080 bases of 315518167 (44.639%) in intersection
nice featureBits fr1 netHg17
# 0 bases of 315518167 (0.000%) in intersection
##################################################
# Blastz runs between hg18 and other organisms are documented in
# makeMm8.doc, makeRn4.doc, makeRheMac2.doc, and makeDanRer3.doc.
# PHASTCONS SCORES DOWNLOADABLES FOR 17WAY (2006-04-06 Fan)
ssh kkstore02
cd /cluster/data/hg18/bed/multiz17way
mkdir phastConsDownloads
cd phastConsDownloads
cat > downloads.csh << 'EOF'
date
cd /san/sanvol1/scratch/hg18/multiz17way.2006-02-18/cons/pp
foreach chr (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`)
echo $chr
cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> /cluster/data/hg18/bed/multiz17way/phastConsDownloads/$chr.gz
end
date
'EOF'
csh downloads.csh >&! downloads.log &
# ~20 minutes
# << happy emacs
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way/phastConsDownloads
md5sum *.gz > md5sum.txt
set dir = /usr/local/apache/htdocs/goldenPath/hg18/phastCons17way
mkdir $dir
ln -s /cluster/data/hg18/bed/multiz17way/phastConsDownloads/{*.gz,md5sum.txt} $dir
cp /usr/local/apache/htdocs/goldenPath/hg17/phastCons17way/README.txt $dir
# edit this file to reflect the latest releases used.
vi $dir/README.txt
##########################################################################
# RE-BUILT GO DATABASE (DONE 4/12/06, Fan)
# GO changed the content of gene_association.goa_uniprot.gz.
# The original file we used no longer has human, mouse, etc. in it;
# those are now placed in separate files.
# Per GO's suggestion, we now get the file from the submission sub-directory.
# This seems to cover more than concatenating the individual goa... files.
# Download the terms and make the database.
ssh hgwdev
mkdir /cluster/store1/geneOntology/20060330
cd /cluster/store1/geneOntology/20060330
wget --timestamping http://www.godatabase.org/dev/database/archive/latest/go_200603-assocdb-data.gz
hgsql mysql <<end
create database go060330;
end
zcat go_*data.gz | sed -e 's/ENGINE=MyISAM DEFAULT CHARSET=latin1/TYPE=MyISAM/g' >j.tmp
hgsql go060330 <j.tmp
rm j.tmp
wget --timestamping \
"ftp://ftp.geneontology.org/pub/go/gene-associations/submission/gene_association.goa_uniprot.gz"
# Updated hgGoAssociation.c so that it does not skip any lines at the beginning.
zcat gene_association.goa_uniprot.gz|\
/cluster/home/fanhsu/bin/i386/hgGoAssociation go060330 goaPart stdin
# Ask sys-admin to switch the database pointer go to point to go060330.
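# (Optional hedged sanity check before the pointer is switched: make sure the
# new database actually has rows in the table loaded above.)
echo 'select count(*) from goaPart' | hgsql go060330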
##########################################################################
# GENEID GENE PREDICTIONS (DONE - 2006-04-21 FIXED: 2006-05-09 - Hiram)
# RELOADED PEPTIDE TABLE, GENEIDPEP (DONE, 2006-07-11, hartera)
ssh hgwdev
mkdir /cluster/data/hg18/bed/geneid
cd /cluster/data/hg18/bed/geneid
for C in `awk '{print $1;}' ../../chrom.sizes`
do
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/geneid_v1.2/$C.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/geneid_v1.2/$C.prot
done
# Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
for F in chr*.prot
do
perl -wpe 's/^(>chr\S+)/$1.1/' $F
done >> geneid.fa
# one of the files in this delivery, chr1.prot, did *not* have a
# terminal <CR> character and it caused the next protein in the
# next file processed, chr10.prot, to be a continuation of the
# last protein in chr1.prot. To check for this:
grep ">" geneid.fa | grep -v "^>"
# shows a line:
# AVSET>chr10_1.1
# This turns out to have been the result of a truncated file.
# Fetch that file again:
mv chr1.prot chr1.prot.orig
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/geneid_v1.2/chr1.prot
# That's better:
wc -l chr1.prot chr1.prot.orig
# 24494 chr1.prot
# 4524 chr1.prot.orig
rm chr1.prot.orig
# remove the bad geneid.fa, then run the above loop again to regenerate it:
rm geneid.fa
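# For the record, the regeneration is the same loop as above:
for F in chr*.prot
do
    perl -wpe 's/^(>chr\S+)/$1.1/' $F
done >> geneid.fa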
ldHgGene -gtf -genePredExt hg18 geneid *.gtf
# Read 33410 transcripts in 275347 lines in 49 files
# 33410 groups 49 seqs 1 sources 3 feature types
# 33410 gene predictions
hgPepPred hg18 generic geneidPep geneid.fa
# verify same names in both tables:
awk '{print $1}' geneidPep.tab | sort > pep.names
awk '{print $1}' genePred.tab | sort > id.names
wc -l pep.names id.names
# 33410 pep.names
# 33410 id.names
comm -12 pep.names id.names | wc -l
# 33410
# QA NOTE (ASZ 5-11-2006) I dropped the geneidPep table and the reference
# to it from the trackDb.ra file. This functionality is now done on the
# fly and this table is no longer needed.
# Added back the geneidPep table as requested by a user
# (hartera, 2006-07-11)
ssh hgwdev
cd /cluster/data/hg18/bed/geneid
hgPepPred hg18 generic geneidPep geneid.fa
# The trackDb.ra file in kent/src/makeDb seems to have a reference
# to the geneidPep table already.
##########################################################################
# BLASTZ/CHAIN/NET XENTRO2 (DONE 4/20/06 angie)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.xenTro2.2006-04-20
cd /cluster/data/hg18/bed/blastz.xenTro2.2006-04-20
cat << '_EOF_' > DEF
# human vs. frog
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
# Use same params as used for mammal-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Frog xenTro2 - single chunk big enough to run two of the
# largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/hg18/bed/blastz.xenTro2.2006-04-20
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/hg18XenTro2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
>& do.log & tail -f do.log
ln -s blastz.xenTro2.2006-04-20 /cluster/data/hg18/bed/blastz.xenTro2
###########################################################################
# BLASTZ CHAIN SWAP FOR ZEBRAFISH (danRer4) (DONE, 2006-04-25, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS
# See also makeDanRer4.doc
# alignments are in: /cluster/data/hg18/bed/blastz.danRer4.swap
# Blastz parameters used were:
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
# There are no lineage-specific repeats defined for this species pair so
# all repeats were used as lineage-specific.
ssh pk
cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& doSwap.log &
# Took about 15 minutes.
# check with featureBits and compare to danRer3 chains:
featureBits hg18 chainDanRer4Link
# 57415379 bases of 2881515245 (1.993%) in intersection
featureBits hg18 chainDanRer3Link
# 64801985 bases of 2881515245 (2.249%) in intersection
featureBits -chrom=chr1 hg18 refGene:cds chainDanRer4Link -enrichment
# refGene:cds 1.389%, chainDanRer4Link 2.337%, both 0.937%, cover 67.47%,
# enrich 28.87x
featureBits -chrom=chr1 hg18 refGene:cds chainDanRer3Link -enrichment
# refGene:cds 1.389%, chainDanRer3Link 2.601%, both 0.931%, cover 67.01%,
# enrich 25.76x
featureBits -chrom=chr1 hg17 refGene:cds chainDanRer2Link -enrichment
# refGene:cds 1.395%, chainDanRer2Link 2.742%, both 0.911%, cover 65.31%,
# enrich 23.82x
# similar coverage and enrichment for danRer4 and danRer3 chains
# which is good.
featureBits -chrom=chr1 hg18 refGene:cds netDanRer4 -enrichment
# refGene:cds 1.389%, netDanRer4 31.001%, both 1.096%, cover 78.91%,
# enrich 2.55x
featureBits -chrom=chr1 hg18 refGene:cds netDanRer3 -enrichment
# refGene:cds 1.389%, netDanRer3 29.929%, both 1.080%, cover 77.72%,
# enrich 2.60x
# Similar coverage and enrichment for danRer4 net on hg18 as for danRer3.
# LOAD FIRSTEF TRACK (DONE 2006-04-25 Fan)
ssh hgwdev
mkdir -p /cluster/data/hg18/bed/firstEF
cd /cluster/data/hg18/bed/firstEF
# receive the file firstEFMar05New.bed.gz from email (ramana.davuluri at osumc.edu) into this subdirectory
cat << '_EOF_' > sedScript
s/chr23/chrX/g
s/chr24/chrY/g
/^>/d
/^$/d
/^No/d
'_EOF_'
# << this line keeps emacs coloring happy
bash
zcat firstEFMar05New.bed.gz | sed -f sedScript | awk "{OFS=\"\t\"} {\$3 +=1; print \$0}" > firstEF.bed
exit
hgLoadBed hg18 firstEF firstEF.bed
rm firstEF.bed bed.tab
#done firstEF
###########################################################################
# ALTGRAPHX TRACK (sugnet) Wed Apr 26 13:46:46 PDT 2006
cd /cluster/store1/sugnet/altSplice/
mkdir hg18-2006.04.13
cd hg18-2006.04.13
mkdir rnaCluster
cd rnaCluster
# Don't use RAGE libraries for clone bounds.
~/latestJk/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs
# Make spec file to run.
foreach c (`echo 'select chrom from chromInfo' | hgsql hg18 | grep -v chrom`)
set out = chrom/$c.bed
echo "clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c" >> clusterRna.spec
end
# Tried running it on the minicluster, but the database can't be reached
# from the cluster accounts, so ran it from here on hgwdev.
chmod 755 clusterRna.spec
mkdir chrom
./clusterRna.spec >& clusterRna.log
cd ..
# Make a script to set up the parasol job file for raw altGraphX files on human
cat << '_EOF_' > makeRun.sh
#!/bin/sh
for chrom in `echo "select chrom from chromInfo" | hgsql hg18 | grep -v chrom`; do
echo 'echo "Doing $chrom"'
echo "/cluster/home/sugnet/bin/i386/altSplice -db=hg18 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/hg18.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/hg18/nib/$chrom.nib"
done
'_EOF_'
# << this line makes emacs coloring happy
mkdir agxs
chmod 755 makeRun.sh
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
./toRun.sh >& toRun.log &
cat agxs/*.agx > hg18.agx
mkdir hg18
mv agxs/ makeRun.sh toRun.log toRun.sh hg18.agx hg18
cd ..
mkdir mm7
cd mm7
# make the rnaClusters
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Don't use RAGE libraries for clone bounds.
~/latestJk/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh mm7 rage.libs
foreach c (`echo 'select chrom from chromInfo' | hgsql mm7 | grep -v chrom`)
set out = chrom/$c.bed
echo "clusterRna -mrnaExclude=mm7.rage.libs mm7 /dev/null $out -chrom=$c" >> clusterRna.spec
end
# Tried to run on kki, but the database can no longer be accessed from the minicluster.
chmod 755 clusterRna.spec
./clusterRna.spec >& clusterRna.log &
cd ..
cat << '_EOF_' > makeRun.sh
#!/bin/sh
for chrom in `echo "select chrom from chromInfo" | hgsql mm7 | grep -v chrom`; do
echo 'echo "Doing $chrom"'
echo "/cluster/home/sugnet/bin/i386/altSplice -db=mm7 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/mm7.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/mm7/nib/$chrom.nib"
done
'_EOF_'
# << this line keeps emacs coloring happy
chmod 755 makeRun.sh
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
mkdir agxs
./toRun.sh >& toRun.log &
cat agxs/*.agx > mm7.agx
cd ..
mkdir orthoSpliceExoniphy
cd orthoSpliceExoniphy/
echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" | hgsql hg17 | grep -v txStart > hg17.exoniphy.bed
liftOver hg17.exoniphy.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.exoniphy.bed hg17.exoniphy.unmapped.bed
mkdir orthoSplice
cd orthoSplice
ln -s ../orthoSpliceExoniphy/hg18.exoniphy.bed .
echo 'select chrom, size from chromInfo' | hgsql hg18 | grep -v chrom > chromSizes.tab
cp /cluster/data/hg18/bed/blastz.mm7/axtChain/hg18.mm7.all.chain.gz .
chainSplit chains hg18.mm7.all.chain
cp /cluster/data/hg18/bed/blastz.mm7/axtChain/hg18.mm7.net.gz .
netSplit hg18.mm7.net.gz nets
mkdir agx report logs
cat << '_EOF_' > makeRun.sh
#!/usr/bin/perl -w
open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n";
while(<IN>) {
chomp;
@w = split;
print "/cluster/home/sugnet/bin/i386/orthoSplice -chromSize=$w[1] -exonFile=hg18.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=../hg18/agxs/hg18.$w[0].agx -orthoAgxFile=../mm7/mm7.agx -db=hg18 -orthoDb=mm7 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg18.mm7.cons.t3.agx -reportFile=report/$w[0].hg18.report -edgeFile=report/$w[0].hg18.edge.report >& logs/$w[0].test.log\n";
}
'_EOF_'
# << this line keeps emacs coloring happy
# clean up disk space we're not using
rm hg18.mm7.all.chain hg18.mm7.net.gz nets/* chains/*
chmod 755 makeRun.sh
./makeRun.sh > orthoSplice.para.spec
ssh kki
cd /cluster/store1/sugnet/altSplice/hg18-2006.04.13/orthoSplice
para create orthoSplice.para.spec
para push
cat agx/*.agx > hg18.mm7.t3.exoniphy.agx
cp ~/latestJk/kent/src/hg/lib/altGraphX.sql .
hgLoadBed -notItemRgb -sqlTable=altGraphX.sql hg18 altGraphX hg18.mm7.t3.exoniphy.agx
# end AltGraphX track.
####################################################################
# EXONWALK TRACK (sugnet) Wed Apr 26 13:51:14 PDT 2006
# first make altGraphX track (see above)
cd /cluster/store1/sugnet/altSplice/hg18-2006.04.13/orthoSplice
mkdir exonWalk
mkdir beds
cd exonWalk
mkdir beds
foreach file (`ls ../agx/*.agx`)
set base=`basename $file .agx`
echo "/cluster/home/sugnet/bin/i386/exonWalk db=hg18 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec
end
para create exonWalk.para.spec
para push
cat beds/*.bed > hg18.mm7.cons.t3.exoniphy.bed
mkdir orfs
cd orfs
mkdir bedOrf beds fa borf
cp ~/store1/altSplice/hg17-2005.01.09/orthoSpliceExonify/exonWalk/orfs.mrna2/*.sh ./
splitFile ../../hg18.mm7.cons.t3.exoniphy.bed 500 exonWalk.
cat << '_EOF_' > makeFa.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
echo "Doing $file"
echo "sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa "
sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa
done
'_EOF_'
chmod 755 makeFa.sh
makeFa.sh beds/*
cat << '_EOF_' > makeGenePred.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
/cluster/home/sugnet/bin/i386/borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp
done
'_EOF_'
chmod 755 makeGenePred.sh
makeGenePred.sh beds/*
cat beds/* > hg18.mm7.exonWalk.bed
cat genePred/*.gp > hg18.mm7.exonWalk.gp
ldHgGene -predTab hg18 exonWalk hg18.mm7.exonWalk.gp
cat << '_EOF_' > makeNoNmdGenePred.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
/cluster/home/sugnet/bin/i386/borfMatcher beds/$base borf/$base.borf bedOrfNoNmd/$base.bed genePredNoNmd/$base.gp
done
'_EOF_'
mkdir bedOrfNoNmd genePredNoNmd
chmod 755 ./makeNoNmdGenePred.sh
wc beds/*
# 275987 3311844 57319256 total
wc genePredNoNmd/*.gp
# 169203 1692030 59907679 total
wc genePred/*.gp
# 225252 2252520 83619240 total
cat genePred/*.gp > hg18.mm7.exonWalk.nmd.gp
cat genePredNoNmd/*.gp > hg18.mm7.exonWalk.noNmd.gp
cat beds/* > hg18.mm7.exonWalk.all.bed
# Plain "exonWalk" track is the only one used on regular genome browser.
ldHgGene -predTab hg18 exonWalk hg18.mm7.exonWalk.noNmd.gp
hgLoadBed hg18 exonWalkAll hg18.mm7.exonWalk.all.bed
ldHgGene -predTab hg18 exonWalkWithNmd hg18.mm7.exonWalk.nmd.gp
cat hg18.mm7.exonWalk.noNmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt
ave counts.txt
# Q1 1.000000
# median 3.000000
# Q3 7.000000
# average 10.670556
# min 1.000000
# max 3844.000000
# count 15857
# total 169203.000000
# standard deviation 63.330761
cat hg18.mm7.exonWalk.nmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt
ave counts.txt
# Q1 1.000000
# median 3.000000
# Q3 8.000000
# average 14.037891
# min 1.000000
# max 7278.000000
# count 16046
# total 225252.000000
# standard deviation 99.406890
trackGenome hg18 all refGene:cds trackGenome.spec
# Track Specification track overlap track cov track new cum
#                     size  size    geno  track cov   cov cov
# -----------------------------------------------------------------------------
# exonWalk:cds 31207765 27951670 1.00% 89.57% 90.24% 90.24% 90.24%
# end ExonWalk track.
###########################################################################
# ALTGRAPHX2 TRACK (kent) in progress Fri Jan 19 11:27:45 PST 2007
# The exoniphy and human/mouse blastz/chain/nets need to be done before
# this.
ssh hgwdev
cd /cluster/store1/sugnet/altSplice/
mkdir hg18-2007.01.19
cd hg18-2007.01.19
mkdir rnaCluster
cd rnaCluster
# Don't use RAGE libraries for clone bounds.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs
# Make spec file to run.
echo "#!/bin/tcsh -ef@ > clusterRna.spec
foreach c (`echo 'select chrom from chromInfo' | hgsql hg18 | grep -v chrom`)
set out = chrom/$c.bed
echo "clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c" >> clusterRna.spec
end
# Run the file. Needs to be done on machine with database access.
# Takes an hour or so.
chmod 755 clusterRna.spec
mkdir chrom
./clusterRna.spec >& clusterRna.log
cd ..
# Make a script to set up the job file for raw altGraphX files on human
# If we had a cluster with database access this could be run there.
# As it is, run it on hgwdev. This took 45 minutes.
cat << '_EOF_' > makeRun.sh
#!/bin/sh
echo "#!/bin/tcsh -ef"
for chrom in `echo "select chrom from chromInfo" | hgsql hg18 | grep -v chrom`; do
echo "echo 'Doing $chrom'"
echo "altSplice -db=hg18 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/hg18.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/hg18/nib/$chrom.nib"
done
'_EOF_'
# << this line makes emacs coloring happy
mkdir agxs
chmod 755 makeRun.sh
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
./toRun.sh >& toRun.log &
cat agxs/*.agx > hg18.agx
mkdir hg18
mv agxs/ makeRun.sh toRun.log toRun.sh hg18.agx hg18
cd ..
mkdir mm8
cd mm8
# make the rnaClusters
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Don't use RAGE libraries for clone bounds.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh mm8 rage.libs
echo "#!/bin/tcsh -ef" > clusterRna.spec
foreach c (`echo 'select chrom from chromInfo' | hgsql mm8 | grep -v chrom`)
set out = chrom/$c.bed
echo "clusterRna -mrnaExclude=mm8.rage.libs mm8 /dev/null $out -chrom=$c" >> clusterRna.spec
end
# Could make this a cluster run if we had a cluster with database access.
# As is, it took about 15 minutes on hgwdev. (Faster than human since there are fewer ESTs.)
chmod 755 clusterRna.spec
./clusterRna.spec >& clusterRna.log &
cd ..
# Make a batch file to run the altSplice program.
cat << '_EOF_' > makeRun.sh
#!/bin/sh
echo "#!/bin/tcsh -ef"
for chrom in `echo "select chrom from chromInfo" | hgsql mm8 | grep -v chrom`; do
echo "echo 'Doing $chrom'"
echo "/cluster/home/sugnet/bin/i386/altSplice -db=mm8 -beds=rnaCluster/chrom/$chrom.bed -agxOut=agxs/mm8.$chrom.agx -consensus -weightMrna -localMem -minAli=.95 -minCover=.5 -chromNib=/cluster/data/mm8/nib/$chrom.nib"
done
'_EOF_'
# << this line keeps emacs coloring happy
chmod 755 makeRun.sh
./makeRun.sh > toRun.sh
chmod 755 toRun.sh
# Run altSplice. This takes about 12 minutes.
mkdir agxs
./toRun.sh >& toRun.log &
cat agxs/*.agx > mm8.agx
cd ..
mkdir orthoSpliceExoniphy
cd orthoSpliceExoniphy/
echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" | hgsql hg18 | grep -v txStart > hg18.exoniphy.bed
mkdir orthoSplice
cd orthoSplice
echo 'select chrom, size from chromInfo' | hgsql hg18 | grep -v chrom > chromSizes.tab
zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.all.chain.gz | chainSplit chains stdin
zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.net.gz | netSplit stdin nets
mkdir agx report logs
cat << '_EOF_' > makeRun.sh
#!/usr/bin/perl -w
open(IN, 'chromSizes.tab') or die "Can't open chromSizes.tab\n";
while(<IN>) {
chomp;
@w = split;
print "orthoSplice -chromSize=$w[1] -exonFile=../hg18.exoniphy.bed -trumpNum=3 -chrom=$w[0] -altInFile=../../hg18/agxs/hg18.$w[0].agx -orthoAgxFile=../../mm8/mm8.agx -db=hg18 -orthoDb=mm8 -netFile=nets/$w[0].net -chainFile=chains/$w[0].chain -commonFile=agx/$w[0].hg18.mm8.cons.t3.agx -reportFile=report/$w[0].hg18.report -edgeFile=report/$w[0].hg18.edge.report >& logs/$w[0].test.log\n";
}
'_EOF_'
# << this line keeps emacs coloring happy
chmod 755 makeRun.sh
./makeRun.sh > orthoSplice.para.spec
# do a little cluster run
ssh kki
cd /cluster/store1/sugnet/altSplice/hg18-2007.01.19/orthoSpliceExoniphy/orthoSplice
para create orthoSplice.para.spec
para push
# Do para check, etc until done. Here's the para time results.
#
# 49 jobs in batch
# 147 jobs (including everybody's) in Parasol queue.
# Checking finished jobs
# Completed: 47 of 49 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 7002s 116.70m 1.94h 0.08d 0.000 y
# IO & Wait Time: 196s 3.27m 0.05h 0.00d 0.000 y
# Average job time: 153s 2.55m 0.04h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1283s 21.38m 0.36h 0.01d
# Submission to last job: 1283s 21.38m 0.36h 0.01d
#
# The two jobs that crashed are OK; they simply had no input on some of the
# small random chroms. It'd be good to take such jobs out earlier somehow.
# Probably Angie could figure out a way to add a file existence test in a
# line of the perl script above; the altInFile is what is missing in these
# cases.
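# (A hedged sketch of one way to drop those no-input jobs up front: keep only
# the spec lines whose -altInFile= target exists and is non-empty. The field
# name comes from the makeRun.sh output above.)
while read line
do
    agx=`echo "$line" | sed -e 's/.*-altInFile=//' -e 's/ .*//'`
    [ -s "$agx" ] && echo "$line"
done < orthoSplice.para.spec > orthoSplice.para.spec.filtered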
# Concatenate cluster output and load it into the database.
ssh hgwdev
cd /cluster/store1/sugnet/altSplice/hg18-2007.01.19/orthoSpliceExoniphy/orthoSplice
cat agx/*.agx > hg18.mm8.t3.exoniphy.agx
cp ~/kent/src/hg/lib/altGraphX.sql .
hgLoadBed -notItemRgb -sqlTable=altGraphX.sql hg18 altGraphX2 hg18.mm8.t3.exoniphy.agx
# clean up disk space we're not using (only the split nets/chains exist
# locally here; the merged chain and net files were read via zcat above)
rm nets/* chains/*
# end AltGraphX2 track.
####################################################################
# EXONWALK2 TRACK (kent) Tue Jan 24 2007
# first make altGraphX2 track (see above)
ssh hgwdev
cd /cluster/store1/sugnet/altSplice/hg18-2007.01.19/orthoSpliceExoniphy
mkdir exonWalk
mkdir beds
cd exonWalk
mkdir beds
foreach file (`ls ../orthoSplice/agx/*.agx`)
set base=`basename $file .agx`
echo "exonWalk db=hg18 minPercent=0 trumpSize=100000 $file beds/$base.bed" >> exonWalk.para.spec
end
# Execute the para spec as a batch file since it needs database access.
# takes about 2.5 hours
#para create exonWalk.para.spec
#para push
#cat beds/*.bed > hg18.mm7.cons.t3.exoniphy.bed
time tcsh -efx exonWalk.para.spec
#8256.940u 21.747s 2:18:07.32 99.8% 0+0k 0+0io 0pf+0w
mkdir orfs
cd orfs
mkdir bedOrf beds fa borf genePred
cd beds
# cp /cluster/store1/sugnet/store1/altSplice/hg17-2005.01.09/orthoSpliceExonify/exonWalk/orfs.mrna2/*.sh ./
cat ../../beds/*.bed | splitFile stdin 500 exonWalk.
cd ..
cat << '_EOF_' > makeFa.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
echo "Doing $file"
echo "sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa "
sequenceForBed -db=hg18 -bedIn=$file -fastaOut=fa/$base.fa
done
'_EOF_'
chmod 755 makeFa.sh
makeFa.sh beds/*
cat << '_EOF_' > makeBorf.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
echo "Doing $file"
echo "borfBig $file borf/$base.borf "
borfBig $file borf/$base.borf
done
'_EOF_'
chmod 755 makeBorf.sh
makeBorf.sh fa/*.fa
# Alternatively do this on the cluster. It takes a little doing to
# get a version of bestorf set up to be cluster accessible. I
# just copied it in from /projects/compbio/bin/borf, including
# copying in some binary files that the script referenced.
# As a parasol job on kk, here's what para time said:
# CPU time in finished jobs: 51577s 859.61m 14.33h 0.60d 0.002 y
# IO & Wait Time: 25442s 424.04m 7.07h 0.29d 0.001 y
# Average job time: 132s 2.19m 0.04h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 179s 2.98m 0.05h 0.00d
# Submission to last job: 307s 5.12m 0.09h 0.00d
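# (A minimal sketch of setting those jobs up as a parasol batch instead,
# assuming borfBig and the bestorf binaries it calls were copied somewhere
# cluster-visible as described above.)
# for f in fa/*.fa
# do
#     base=`basename $f`
#     echo "borfBig $f borf/$base.borf"
# done > borf.para.spec
# para create borf.para.spec ; para push ; para time > run.time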
cat << '_EOF_' > makeGenePred.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
borfMatcher -keepNmd beds/$base borf/$base.borf bedOrf/$base.bed genePred/$base.gp
done
'_EOF_'
chmod 755 makeGenePred.sh
makeGenePred.sh beds/*
cat beds/* > hg18.mm7.exonWalk.bed
cat genePred/*.gp | ldHgGene -predTab hg18 exonWalk2 stdin
cat << '_EOF_' > makeNoNmdGenePred.sh
#!/bin/sh
for file in "$@"
do
base=`basename $file`
/cluster/home/sugnet/bin/i386/borfMatcher beds/$base borf/$base.borf bedOrfNoNmd/$base.bed genePredNoNmd/$base.gp
done
'_EOF_'
mkdir bedOrfNoNmd genePredNoNmd
chmod 755 ./makeNoNmdGenePred.sh
wc beds/*
# 275987 3311844 57319256 total
wc genePredNoNmd/*.gp
# 169203 1692030 59907679 total
wc genePred/*.gp
# 225252 2252520 83619240 total
cat genePred/*.gp > hg18.mm7.exonWalk.nmd.gp
cat genePredNoNmd/*.gp > hg18.mm7.exonWalk.noNmd.gp
cat beds/* > hg18.mm7.exonWalk.all.bed
# Plain "exonWalk" track is the only one used on regular genome browser.
ldHgGene -predTab hg18 exonWalk hg18.mm7.exonWalk.noNmd.gp
hgLoadBed hg18 exonWalkAll hg18.mm7.exonWalk.all.bed
ldHgGene -predTab hg18 exonWalkWithNmd hg18.mm7.exonWalk.nmd.gp
cat hg18.mm7.exonWalk.noNmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt
ave counts.txt
# Q1 1.000000
# median 3.000000
# Q3 7.000000
# average 10.670556
# min 1.000000
# max 3844.000000
# count 15857
# total 169203.000000
# standard deviation 63.330761
cat hg18.mm7.exonWalk.nmd.gp | cut -f 1,2 -d '.' | sort | uniq -c | sort -rnk 1 > counts.txt
ave counts.txt
# Q1 1.000000
# median 3.000000
# Q3 8.000000
# average 14.037891
# min 1.000000
# max 7278.000000
# count 16046
# total 225252.000000
# standard deviation 99.406890
trackGenome hg18 all refGene:cds trackGenome.spec
# Track Specification track overlap track cov track new cum
#                     size  size    geno  track cov   cov cov
# -----------------------------------------------------------------------------
# exonWalk:cds 31207765 27951670 1.00% 89.57% 90.24% 90.24% 90.24%
# end ExonWalk track.
####################################################################
# LOAD ENSEMBL GENES (DONE, 2006-05-02, Fan)
# ADDED STABLE URL TO TRACKDB (DONE, 2006-05-29, hartera)
# ADDED RELEASE ALPHA AND RELEASE BETA VERSIONS OF TRACK ENTRY IN
# trackDb.ra SO THAT CORRECT ENSEMBL BUILD VERSION DISPLAYED AND LINKED TO
# AS DIFFERENT ENSEMBL BUILDS ON RR AND HGWDEV (DONE, 2007-09-25, hartera)
mkdir /cluster/data/hg18/bed/ensembl
cd /cluster/data/hg18/bed/ensembl
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. Choose gzip compression. Hit export.
# Save as ensemblGene.gtf.gz
# Ensembl handles random chromosomes differently than we do, so we
# strip this data. Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
gunzip -c ensemblGene.gtf.gz \
|sed -e 's/c22_H2/22_h2_hap1/'\
|sed -e 's/c5_H2/5_h2_hap1/'\
|sed -e 's/c6_COX/6_cox_hap1/'\
|sed -e 's/c6_QBL/6_qbl_hap2/'\
| perl -wpe 's/^([0-9]|X|Y|Un|MT|5_h2_hap1|22_h2_hap1|6_cox_hap1|6_qbl_hap2)/chr$1/ || die "Line $. doesnt start with human chrom:\n$_"' \
| sed -e 's/\..\"/\"/g' \
| sed -e 's/chrMT/chrM/' \
> ensGene.gtf
ssh hgwdev
cd /cluster/data/hg18/bed/ensembl
# Remove hap chrom entries because Ensembl uses different genomic coordinates for them.
fgrep -v hap ensGene.gtf > ensGeneNew.gtf
/cluster/bin/i386/ldHgGene hg18 ensGene ensGeneNew.gtf
# Read 58424 transcripts in 1014240 lines in 1 files
# 58424 groups 25 seqs 1 sources 4 feature types
# 58424 gene predictions
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg18 < ~/kent/src/hg/lib/ensGtp.sql
# remove header line from ensGtp.txt
echo "load data local infile 'ensGtp.txt' into table ensGtp ignore 1 lines" | hgsql -N hg18
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 2) Choose protein_coding for gene type
# Page 3) Choose the "Sequences" box.
# Page 4) Check Ensembl Gene ID, Transcript ID, and Peptide ID; uncheck chrom, Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
gunzip ensemblPep.fa.gz
hgPepPred hg18 ensembl ensemblPep.fa
# Added stable archive URL for Ensembl v38 to human/hg18/trackDb.ra
# (2006-05-29, hartera)
# Changed url line for ensGene entry to:
# url http://apr2006.archive.ensembl.org/perl/transview?transcript=$$
# (2007-09-25, hartera)
# Created a release beta version of this track in human/hg18/trackDb.ra
# with the ensArchive setting set to apr2006 to create the correct URL
# as above and add the correct version (version 38) in the label:
track ensGene
release beta
shortLabel Ensembl Genes
longLabel Ensembl (Build 38) Gene Predictions
group genes
priority 40
visibility hide
color 150,0,0
type genePred ensPep
ensArchive apr2006
# A separate trackDb entry (release alpha) was made for the updated
# track on hgwdev which is Build 46 (aug2007). This means that the
# correct version will be displayed and the correct links made on both
# the RR and hgwdev.
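# For reference, the release alpha entry presumably mirrors the beta stanza
# above with the newer build and archive date, roughly:
track ensGene
release alpha
shortLabel Ensembl Genes
longLabel Ensembl (Build 46) Gene Predictions
group genes
priority 40
visibility hide
color 150,0,0
type genePred ensPep
ensArchive aug2007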
# Create knownToEnsembl column (updated 2007-11-15 - Jim Kent)
hgMapToGene hg18 ensGene knownGene knownToEnsembl
# QA NOTE [ASZ: 9-11-2006]: mytouch on ensGtp and ensPep. This is because
# ensGene was updated later than they were. Ensembl treats hap chroms
# differently than we do. So the ensGene table was reloaded.
# sudo mytouch hg18 ensGtp 200605241000.00
# sudo mytouch hg18 ensPep 200605241000.00
# SGP GENES (DONE 5/3/06 Fan)
# See below for: SGP GENES Update (DONE - 2007-10-02 - Hiram)
ssh hgwdev
mkdir /cluster/data/hg18/bed/sgp
cd /cluster/data/hg18/bed/sgp
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/SGP/$chr.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200603/SGP/$chr.prot
end
ldHgGene -gtf -genePredExt hg18 sgpGene chr*.gtf
# VEGA LIFT FROM HG17 (DONE 5/22/06 acs)
# This can be replaced when the new version comes out (Tim Hubbard says soon)
ssh hgwdev
cd /cluster/store8/ensembl/vega33_35f
# there's a bad record at the top of both of these files
awk 'NF == 15 ' vegaGene.gp > tmp.gp
awk 'NF == 15 ' vegaPseudo.gp > tmp2.gp
zcat /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz | liftOver tmp.gp stdin vegaGeneHg18.gp unMapped.gp -genePred
# only 6 dropped
zcat /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz | liftOver tmp2.gp stdin vegaPseudoGeneHg18.gp unMappedPseudo.gp -genePred
# only 11 dropped
ldHgGene hg18 vegaGene -predTab vegaGeneHg18.gp -genePredExt
ldHgGene hg18 vegaPseudoGene -predTab vegaPseudoGeneHg18.gp -genePredExt
hgsql hg18 -N -B < /cluster/home/acs/kent/src/hg/lib/vegaInfo.sql
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg18 -N -B
# SYNTENIC NETS FOR PANTRO2, RHEMAC2, MM8, RN4, AND CANFAM2 AS COMPOSITE TRACK (DONE 5/22/06 acs)
# (for use in defining orthologs for macaque paper)
ssh hgwdev
# load syntenic nets created previously by Robert
hgLoadNet hg18 netSyntenyPanTro2 /cluster/data/hg18/bed/blastz.panTro2/axtChain/hg18.panTro2.syn.net
zcat /cluster/data/hg18/bed/blastz.rheMac2/axtChain/hg18.rheMac2.syn.net.gz | hgLoadNet hg18 netSyntenyRheMac2 stdin
zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.syn.net.gz | hgLoadNet hg18 netSyntenyMm8 stdin
zcat /cluster/data/hg18/bed/blastz.rn4/axtChain/hg18.rn4.syn.net.gz | hgLoadNet hg18 netSyntenyRn4 stdin
zcat /cluster/data/hg18/bed/blastz.canFam2/axtChain/hg18.canFam2.syn.net.gz | hgLoadNet hg18 netSyntenyCanFam2 stdin
# add more distant vertebrates to track so we can evaluate
# syntenic netting for multiple alignment (2007-03-10 kate)
cd /cluster/data/hg18/bed
netFilter -syn blastz.danRer4/axtChain/hg18.danRer4.net.gz | \
hgLoadNet hg18 netSyntenyDanRer4 stdin
netFilter -syn blastz.galGal3/axtChain/hg18.galGal3.net.gz | \
hgLoadNet hg18 netSyntenyGalGal3 stdin
netFilter -syn blastz.monDom4/axtChain/hg18.monDom4.net.gz | \
hgLoadNet -warn hg18 netSyntenyMonDom4 stdin
netFilter -syn blastz.ornAna1/axtChain/hg18.ornAna1.net.gz | \
hgLoadNet hg18 netSyntenyOrnAna1 stdin
netFilter -syn blastz.anoCar1/axtChain/hg18.anoCar1.net.gz | \
hgLoadNet hg18 netSyntenyAnoCar1 stdin
netFilter -syn blastz.xenTro2/axtChain/hg18.xenTro2.net.gz | \
hgLoadNet hg18 netSyntenyXenTro2 stdin
netFilter -syn blastz.fr2/axtChain/hg18.fr2.net.gz | \
hgLoadNet hg18 netSyntenyFr2 stdin
netFilter -syn blastz.equCab1/axtChain/hg18.equCab1.net.gz | \
hgLoadNet hg18 netSyntenyEquCab1 stdin
netFilter -syn blastz.bosTau3/axtChain/hg18.bosTau3.net.gz | \
hgLoadNet -warn hg18 netSyntenyBosTau3 stdin
netFilter -syn blastz.oryLat1/axtChain/hg18.oryLat1.net.gz | \
hgLoadNet hg18 netSyntenyOryLat1 stdin
cat > netCov.csh << 'EOF'
#!/bin/csh -ef
foreach db (PanTro2 RheMac2 Mm8 Rn4 CanFam2 EquCab1 BosTau3 MonDom4 OrnAna1 GalGal3 AnoCar1 XenTro2 DanRer4 Fr2 OryLat1)
echo -n " "
featureBits -countGaps -chrom=chr1 hg18 refGene:cds net$db -enrichment
featureBits -countGaps -chrom=chr1 hg18 refGene:cds netSynteny$db -enrichment
echo ""
end
'EOF'
csh netCov.csh >&! netCov.log &
cat netCov.log
#refGene:cds 1.282%, netPanTro2 99.979%, both 1.282%, cover 100.00%, enrich 1.00x
#refGene:cds 1.282%, netSyntenyPanTro2 99.978%, both 1.282%, cover 100.00%, enrich 1.00x
#refGene:cds 1.282%, netRheMac2 99.970%, both 1.282%, cover 100.00%, enrich 1.00x
#refGene:cds 1.282%, netSyntenyRheMac2 99.961%, both 1.282%, cover 99.97%, enrich 1.00x
#refGene:cds 1.282%, netMm8 98.650%, both 1.278%, cover 99.69%, enrich 1.01x
#refGene:cds 1.282%, netSyntenyMm8 98.352%, both 1.255%, cover 97.89%, enrich 1.00x
#refGene:cds 1.282%, netRn4 98.404%, both 1.281%, cover 99.89%, enrich 1.02x
#refGene:cds 1.282%, netSyntenyRn4 98.074%, both 1.258%, cover 98.10%, enrich 1.00x
#refGene:cds 1.282%, netCanFam2 99.527%, both 1.281%, cover 99.91%, enrich 1.00x
#refGene:cds 1.282%, netSyntenyCanFam2 99.274%, both 1.272%, cover 99.16%, enrich 1.00x
#refGene:cds 1.282%, netEquCab1 99.457%, both 1.281%, cover 99.87%, enrich 1.00x
#refGene:cds 1.282%, netSyntenyEquCab1 99.020%, both 1.270%, cover 99.06%, enrich 1.00x
#refGene:cds 1.282%, netBosTau3 99.641%, both 1.282%, cover 100.00%, enrich 1.00x
#refGene:cds 1.282%, netSyntenyBosTau3 99.493%, both 1.280%, cover 99.81%, enrich 1.00x
#refGene:cds 1.282%, netMonDom4 98.718%, both 1.279%, cover 99.72%, enrich 1.01x
#refGene:cds 1.282%, netSyntenyMonDom4 98.029%, both 1.260%, cover 98.26%, enrich 1.00x
#refGene:cds 1.282%, netOrnAna1 68.119%, both 1.168%, cover 91.06%, enrich 1.34x
#refGene:cds 1.282%, netSyntenyOrnAna1 56.729%, both 0.714%, cover 55.67%, enrich 0.98x
#refGene:cds 1.282%, netGalGal3 82.246%, both 1.189%, cover 92.68%, enrich 1.13x
#refGene:cds 1.282%, netSyntenyGalGal3 80.379%, both 1.101%, cover 85.86%, enrich 1.07x
#refGene:cds 1.282%, netAnoCar1 63.263%, both 1.128%, cover 87.97%, enrich 1.39x
#refGene:cds 1.282%, netSyntenyAnoCar1 54.068%, both 0.816%, cover 63.65%, enrich 1.18x
#refGene:cds 1.282%, netXenTro2 45.072%, both 1.057%, cover 82.44%, enrich 1.83x
#refGene:cds 1.282%, netSyntenyXenTro2 31.985%, both 0.596%, cover 46.44%, enrich 1.45x
#refGene:cds 1.282%, netDanRer4 28.211%, both 1.012%, cover 78.87%, enrich 2.80x
#refGene:cds 1.282%, netSyntenyDanRer4 7.631%, both 0.177%, cover 13.83%, enrich 1.81x
#refGene:cds 1.282%, netFr2 26.938%, both 0.975%, cover 76.03%, enrich 2.82x
#refGene:cds 1.282%, netSyntenyFr2 7.991%, both 0.200%, cover 15.62%, enrich 1.95x
# Conclusion: CDS coverage loss is small in all placentals and opossum, so
# use syntenic net mafs for these in multiz.
# Ask about chicken -- it's marginal
# Robert prepped synMafNet's for some species, but the files lack
# soft-masked sequence, so redo if time.
# (set up trackDb.ra entry for composite track)
# SYNTENIC NET MAFS FOR MULTIZ (2007-03-09 kate)
# Compare with Robert's
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.rheMac2
mv mafSynNet mafSynNet.robert
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.panTro2
# need DEF file for syntenic net, but this was
# a swapped run, so we will simulate
cp /cluster/data/panTro2/bed/blastz.hg18/DEF .
# edit to reverse target and query, and change BASE dir
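# (A hedged sketch of that edit: swap the SEQ1_*/SEQ2_* stanzas and repoint
# BASE at this directory. Comment lines such as "# TARGET:" and the
# chunk/overlap settings may still need adjusting by hand.)
sed -e 's/^SEQ1_/SWAP_/' -e 's/^SEQ2_/SEQ1_/' -e 's/^SWAP_/SEQ2_/' \
    -e 's#^BASE=.*#BASE=/cluster/data/hg18/bed/blastz.panTro2#' \
    /cluster/data/panTro2/bed/blastz.hg18/DEF > DEF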
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
rm DEF
# edit DEF file to reference kolossus-accessible sequence and chrom.sizes
cd /cluster/data/hg18/bed/blastz.monDom4
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
cd /cluster/data/hg18/bed/blastz.equCab1
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
cd /cluster/data/hg18/bed/blastz.bosTau3
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
cd /cluster/data/hg18/bed/blastz.mm8
cp /cluster/data/mm8/bed/blastz.hg18/DEF .
# edit to reverse target & query, change BASE
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log
rm -f DEF
cd /cluster/data/hg18/bed/blastz.rn4
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log
cd /cluster/data/hg18/bed/blastz.canFam2
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
# use syntenic net on opossum too
cd /cluster/data/hg18/bed/blastz.monDom4
~/kent/src/hg/utils/automation/doBlastzChainNet.pl DEF \
-syntenicNet -continue syntenicNet >&! synnet.log &
# NET AND RECIPROCAL BEST TABLES FOR 2X MAMMALS
# load net and reciprocal best net for comparison
# rabbit
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.oryCun1/axtChain
netFilter -minGap=10 hg18.oryCun1.net | hgLoadNet -warn hg18 netOryCun1 stdin
netFilter -minGap=10 hg18.oryCun1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestOryCun1 stdin
# tenrec
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.echTel1/axtChain
netFilter -minGap=10 hg18.echTel1.net.gz | hgLoadNet -warn hg18 netEchTel1 stdin
netFilter -minGap=10 hg18.echTel1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestEchTel1 stdin
# net coverage
ssh hgwdev
cd /cluster/data/hg18/bed
cat > netRBestCov.csh << 'EOF'
#!/bin/csh -ef
foreach db (OtoGar1 OryCun1 CavPor2 LoxAfr1 EchTel1 DasNov1)
echo -n " "
featureBits -countGaps -chrom=chr1 hg18 refGene:cds net$db -enrichment
featureBits -countGaps -chrom=chr1 hg18 refGene:cds netRBest$db -enrichment
echo ""
end
'EOF'
# << emacs
csh netRBestCov.csh >&! netRBestCov.log &
##########################################################################
# EVOFOLD (Done, 05/12/06) Jakob Skou Pedersen
# RNA secondary structure predictions lifted from hg17 and filtered
ssh -C hgwdev
mkdir -p /cluster/data/hg18/bed/evofold
cd /cluster/data/hg18/bed/evofold
echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" | hgsql hg17 | sed -e 1d > foldsHg17.bed
liftOver -minMatch=1.0 foldsHg17.bed /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz tmp.bed unmapped.bed
# remove elements which are wrong size after lifting
awk '$3-$2 == $7' tmp.bed | sort -k4,4 > rawFoldsHg18.bed
# structure filters
# first, remove pairs that can't form in human
cut -f 1-6 rawFoldsHg18.bed > tmp.bed
# sequenceForBed can be found and compiled from here: $HOME/kent/src/hg/altSplice/altSplice/
nice /cluster/home/sugnet/bin/i386/sequenceForBed -db=hg18 -bedIn=tmp.bed -fastaOut=tmp.fa
cat tmp.fa | sed -e 's/\.[+-]\.chr.*$//' \
| sed -e '/^>/s/$/\t/' | tr -d '\n' | sed -e 's/>/\n/g' | sed -e '1d' -e '$s/$/\n/' | sort -k1,1 > foldsHg18Seq.tab
join -1 4 -2 1 -o "1.4 1.8 2.2" rawFoldsHg18.bed foldsHg18Seq.tab | sed -e 's/ */\t/g' | sort -k1,1 \
| /cluster/home/jsp/scripts/tabFoldFilter.py > cleanFolds.tab
join -1 4 -2 1 -o "1.1 1.2 1.3 1.4 1.5 1.6 1.7 2.2 1.9" rawFoldsHg18.bed cleanFolds.tab | sed -e 's/ */\t/g' > tmp1.bed
# second, remove poor predictions
# scripts can be found in cvs tree at: cvsroot/jsp/scripts/. They use a few modules which can be found at: cvsroot/jsp/py_modules
cat tmp1.bed | /cluster/home/jsp/scripts/bedRnassFilter.py --dangling --minAvrStemSize=3 | /cluster/home/jsp/scripts/bedRnassFilter.sh 1 3 \
| /cluster/home/jsp/scripts/roundListFloats.py -c9 > foldsHg18.bed
# clean up
rm tmp.bed tmp1.bed foldsHg17.bed foldsHg18Seq.tab rawFoldsHg18.bed tmp.fa cleanFolds.tab
# upload
hgLoadBed -notItemRgb -sqlTable=$HOME/kent/src/hg/lib/evofold.sql hg18 evofold foldsHg18.bed
#########################################################################
# BLASTZ CHICKEN galGal3 (DONE 5/23/06 angie)
ssh pk
mkdir /cluster/data/hg18/bed/blastz.galGal3.2006-05-22
cd /cluster/data/hg18/bed/blastz.galGal3.2006-05-22
cat << '_EOF_' > DEF
# human vs chicken
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_SMSK=/cluster/bluearc/hg18/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/san/sanvol1/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.galGal3.2006-05-22
'_EOF_'
# << emacs
~/kent/src/utils/doBlastzChainNet.pl DEF \
-bigClusterHub=pk -smallClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose \
>& do.log & tail -f do.log
ln -s blastz.galGal3.2006-05-22 /cluster/data/hg18/bed/blastz.galGal3
# running syntenicNet 2008-10-30
# had to update the DEF file to correspond to new hive layout
cd /cluster/data/hg18/bed/blastz.galGal3.2006-05-22
mv DEF DEF.0
cat << '_EOF_' > DEF
# human vs chicken
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_SMSK=/scratch/data/hg18/linSpecRep/notInMouseRat
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/data/galGal3/nib
SEQ2_LEN=/scratch/data/galGal3/chrom.sizes
SEQ2_SMSK=/scratch/data/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastz.galGal3.2006-05-22
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-bigClusterHub=swarm -smallClusterHub=memk \
-continue=syntenicNet -syntenicNet \
-chainMinScore=5000 -chainLinearGap=loose > synNet.log 2>&1
# worked OK in about 3 minutes
#########################################################################
# REGULATORY POTENTIAL (DONE - 2006-06-09 - Hiram)
# download data from "James Taylor" <james at bx.psu.edu>
ssh kkstore02
mkdir /cluster/data/hg18/bed/regPotential7X
cd /cluster/data/hg18/bed/regPotential7X
# This is a lot of data
for C in 1 2 3 4 5 6 7 8 9 X Y 10 11 12 13 14 15 16 17 18 19 20 21 22
do
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg18/chr${C}.scores.truncated.bz2"
done
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg18/trackDb.html" -O description.html
time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
bzcat chr${C}.scores.truncated.bz2
done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 23m27.454s
# user 22m41.058s
# sys 0m41.850s
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg18/bed/regPotential7X
ln -s /cluster/data/hg18/bed/regPotential7X/regPotential7X.wib \
/gbdb/hg18/wib/regPotential7X.wib
# using the tmpDir is faster since it is on local disk and it will
# clean up any temporary .tab file it creates there
time hgLoadWiggle -tmpDir=/scratch/tmp \
hg18 regPotential7X regPotential7X.wig
# How about a histogram of the data.
# find min and max for everything to verify it is 0 to 1
ssh kkstore02
cd /cluster/data/hg18/bed/regPotential7X
time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
echo " ============ ${C} ======================="
bzcat chr${C}.scores.truncated.bz2 | ave -col=2 stdin
done > stats.all 2>&1
grep "^min" stats.all | sort -u
# min 0.000000
grep "^max" stats.all | sort -u
# max 1.000000
ssh kolossus
cd /cluster/data/hg18/bed/regPotential7X
time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \
-hMinVal=0.0 -db=hg18 regPotential7X > histogram.data 2>&1
# real 2m42.311s
# 73 % of the data values are zero
# create download gzip files from the bz2 files:
ssh kkstore02
cd /cluster/data/hg18/bed/regPotential7X
for F in chr*.scores.truncated.bz2
do
C=`echo $F | awk -F'.' '{print $1}'`
echo -n "${C}.regPotential7X.hg18.gz working ... "
bzcat ${F} | gzip > ${C}.regPotential7X.hg18.gz
echo
done
#########################################################################
# create md5sum.txt under bigZips (DONE, 6/7/06, Fan)
cd /cluster/store11/gs.19/build36/downloads/bigZips
md5sum *.zip *.2bit README.txt > md5sum.txt
#########################################################################
# UPDATE BACENDS track (DONE - 2006-06-16 - Hiram)
# An attempt to recover some of the missing clones from the
# bacEnds track. It turns out the perl processing script wasn't
# properly categorizing all the clone ends, thus a lot of them
# were being left out of the final track
ssh hgwdev
mkdir /cluster/data/hg18/bed/updateCloneEnds
cd /cluster/data/hg18/bed/updateCloneEnds
ln -s ../cloneend/all.txt.gz .
# Checked this script into the source tree and fixed it up to
# recognize more of the categories of clone ends
zcat all.txt.gz | $HOME/kent/src/hg/utils/cloneEndParse.pl /dev/stdin
# Reading in end info
# Writing out pair info
# Writing out singleton info
# 301377 pairs and 204698 singles
# Note that there are none marked as "unclassified" - this script
# will print out that message to stderr if it doesn't recognize
# any marker classifications. This produces the files:
# -rw-rw-r-- 1 9645568 Jun 16 14:09 cloneEndPairs.txt
# -rw-rw-r-- 1 4906468 Jun 16 14:09 cloneEndSingles.txt
wc -l clone*.txt
# 301377 cloneEndPairs.txt
# 204698 cloneEndSingles.txt
# This is a lot better than previous:
wc -l ../cloneend/cloneEnd*.txt
# 249619 ../cloneend/cloneEndPairs.txt
# 318500 ../cloneend/cloneEndSingles.txt
mkdir /san/sanvol1/scratch/hg18/updateBacEnds
cd /san/sanvol1/scratch/hg18/updateBacEnds
ln -s ../bacends/bacEnds.sorted.psl .
ln -s ../bacends/lifted .
pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 \
-slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose bacEnds.sorted.psl \
/cluster/data/hg18/bed/updateCloneEnds/cloneEndPairs.txt \
all_bacends bacEnds
echo -e \
'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' > header
echo -e '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> header
cat header bacEnds.pairs | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairs.bed
cat header bacEnds.slop bacEnds.short bacEnds.long \
bacEnds.mismatch bacEnds.orphan \
| row score ge 300 | sorttbl chr start | headchg -del \
> bacEndPairsBad.bed
extractPslLoad -noBin bacEnds.sorted.psl bacEndPairs.bed \
bacEndPairsBad.bed | \
sorttbl tname tstart | headchg -del > bacEnds.load.psl
# looks like we are getting a lot more now in every category:
wc -l bacEnds.* bacEndPairs* | sort -n
49 bacEnds.long
1399 bacEnds.mismatch
4516 bacEnds.slop
7202 bacEnds.short
66861 bacEnds.orphan
78900 bacEndPairsBad.bed
205443 bacEndPairs.bed
207997 bacEnds.pairs
1727387 bacEnds.load.psl
# Previously:
wc -l ../bacends/bacEnds.* ../bacends/bacEndPairs* | sort -n
40 ../bacends/bacEnds.long
1061 ../bacends/bacEnds.mismatch
3954 ../bacends/bacEnds.slop
6279 ../bacends/bacEnds.short
59245 ../bacends/bacEnds.orphan
69788 ../bacends/bacEndPairsBad.bed
159268 ../bacends/bacEndPairs.bed
161251 ../bacends/bacEnds.pairs
1249956 ../bacends/bacEnds.load.psl
# Move the previous build out of the way and copy these
# results over to the primary hg18 bed location:
mv /cluster/data/hg18/bed/bacends /cluster/data/hg18/bed/bacends.2006-02-02
mkdir /cluster/data/hg18/bed/bacends
cp -p bacEnd* /cluster/data/hg18/bed/bacends
cp -p lifted/bacEnds.lifted.psl /cluster/data/hg18/bed/bacends
# load them into the database
ssh hgwdev
cd /cluster/data/hg18/bed/bacends
# CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
awk '{print $5}' bacEndPairs.bed | sort | uniq -c
# result should be the scores, no extraneous strings:
# 202488 1000
# 255 300
# 416 375
# 384 500
# 1900 750
# edit the file and fix it if it has a bad name.
sed -e "s/bacEndPairs /bacEndPairsUpdate /" \
$HOME/kent/src/hg/lib/bacEndPairs.sql > bacEndPairsUpdate.sql
hgLoadBed -notItemRgb hg18 bacEndPairsUpdate bacEndPairs.bed \
-sqlTable=bacEndPairsUpdate.sql
# Loaded 205443 elements of size 11
# Previously was:
# Loaded 159268
# note - this track isn't pushed to RR, just used for assembly QA
sed -e "s/bacEndPairsBad /bacEndPairsBadUpdate /" \
$HOME/kent/src/hg/lib/bacEndPairsBad.sql > bacEndPairsBadUpdate.sql
hgLoadBed -notItemRgb hg18 bacEndPairsBadUpdate bacEndPairsBad.bed \
-sqlTable=bacEndPairsBadUpdate.sql
# Loaded 78900 elements of size 11
# Previously was:
# Loaded 69788
#hgLoadPsl hg18 -nobin -table=all_bacends bacEnds.load.psl
# NOTE: truncates file to 0 if -nobin is used
hgLoadPsl hg18 -table=all_bacendsUpdate bacEnds.load.psl
# no complaints ! Usually there are, this loaded:
hgsql -N -e "select count(*) from all_bacendsUpdate;" hg18
# 1727387
# Previously this was:
# 1249956
nice featureBits hg18 all_bacendsUpdate
# 227770876 bases of 2881515245 (7.905%) in intersection
nice featureBits hg18 all_bacends
# 191078854 bases of 2881515245 (6.631%) in intersection
nice featureBits hg17 all_bacends
# 225763317 bases of 2866216770 (7.877%) in intersection
nice featureBits hg18 bacEndPairsUpdate
# 162690030 bases of 2881515245 (5.646%) in intersection
nice featureBits hg18 bacEndPairs
# 130270940 bases of 2881515245 (4.521%) in intersection
nice featureBits hg17 bacEndPairs
# 162099487 bases of 2866216770 (5.656%) in intersection
nice featureBits hg18 bacEndPairsBadUpdate
# 37326990 bases of 2881515245 (1.295%) in intersection
nice featureBits hg18 bacEndPairsBad
# 33650226 bases of 2881515245 (1.168%) in intersection
nice featureBits hg17 bacEndPairsBad
# 37437558 bases of 2866216770 (1.306%) in intersection
# Renamed the new BAC End Pairs tables (7-27-2006 Brooke)
mysql> alter table all_bacends rename all_bacendsOld;
Query OK, 0 rows affected (0.01 sec)
mysql> alter table bacEndPairs rename bacEndPairsOld;
Query OK, 0 rows affected (0.00 sec)
mysql> alter table all_bacendsUpdate rename all_bacends;
Query OK, 0 rows affected (0.00 sec)
mysql> alter table bacEndPairsUpdate rename bacEndPairs;
Query OK, 0 rows affected (0.00 sec)
#########################################################################
# dbSNP BUILD 126 (Heather, June 2006)
# Set up directory structure
ssh kkstore02
cd /cluster/data/dbSNP
mkdir 126
cd 126
mkdir human
cd human
mkdir data
mkdir schema
mkdir rs_fasta
# Get data from NCBI (anonymous FTP)
cd /cluster/data/dbSNP/126/human/data
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/database/organism_data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
get b126_SNPContigLoc_36_1.bcp.gz
# ContigLocusId has function
get b126_SNPContigLocusId_36_1.bcp.gz
get b126_ContigInfo_36_1.bcp.gz
# MapInfo has alignment weights
get b126_SNPMapInfo_36_1.bcp.gz
# SNP has univar_id, validation status and heterozygosity
get SNP.bcp.gz
# Get schema from NCBI
cd /cluster/data/dbSNP/126/human/schema
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/database/organism_schema
get human_9606_table.sql.gz
# Get fasta files from NCBI
# using headers of fasta files for molType
cd /cluster/data/dbSNP/126/human/rs_fasta
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/rs_fasta
mget *.gz
# Simplify names of data files
cd /cluster/data/dbSNP/126/human/data
mv b126_SNPContigLoc_36_1.bcp.gz ContigLoc.gz
mv b126_SNPContigLocusId_36_1.bcp.gz ContigLocusId.gz
mv b126_ContigInfo_36_1.bcp.gz ContigInfo.gz
mv b126_SNPMapInfo_36_1.bcp.gz MapInfo.gz
mv SNP.bcp.gz SNP.gz
ls -1 *.gz > filelist
# edit table descriptions
cd /cluster/data/dbSNP/126/human/schema
# get CREATE statements from human_9606_table.sql for our 5 tables
# store in table.tmp
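# (the extraction itself was not recorded; one hedged way to do it, assuming the
#  MSSQL-style dump keeps one CREATE TABLE statement per table, each ended by "GO":)
# zcat human_9606_table.sql.gz \
#   | sed -n '/CREATE TABLE.*SNPContigLoc_36_1/,/^GO/p' > table.tmp
# ... repeat (appending with >>) for SNPContigLocusId_36_1, ContigInfo_36_1,
#     SNPMapInfo_36_1 and SNP ...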
# convert and rename tables
sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
rm table.tmp
sed -f 'tableRename.sed' table2.tmp > table.sql
rm table2.tmp
# Get updated UniVariation table
cd /cluster/data/dbSNP/126/shared
ftp ftp.ncbi.nih.gov
cd snp/organisms/human_9606/database/shared_data
get UniVariation.bcp.gz
cd ../shared_schema
get dbSNP_main_table.sql.gz
# get UniVariation CREATE statement from dbSNP_main_table.sql
# use mssqlToMysql.sed to convert
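# (hedged sketch of that step, under the same "GO"-delimited dump assumption:)
# zcat dbSNP_main_table.sql.gz | sed -n '/CREATE TABLE.*UniVariation/,/^GO/p' \
#   | sed -f ../human/schema/mssqlToMysql.sed > UniVariation.sql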
# get header lines from rs_fasta
cd /cluster/data/dbSNP/126/human/rs_fasta
/bin/csh gnl.csh
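# (gnl.csh itself is not included here; presumably it just pulls the
#  ">gnl|dbSNP|..." header lines out of each per-chrom fasta into rs_chN.gnl, e.g.:)
# foreach file (rs_ch*.fas.gz)
#   set base = `basename $file .fas.gz`
#   zcat $file | grep "^>gnl" > $base.gnl
# end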
# add rs_fasta to seq/extFile
# 2 edits first: strip header to just rsId, and remove duplicates
# work on /cluster/store12 (kkstore05) which has more disk space
# also for human, don't include chrUn
cp rs_ch*.fas.gz /cluster/store12/snp/126/human/rs_fasta
ssh kkstore05
cd /cluster/store12/snp/126/human/rs_fasta
mkdir unarchive
mv rs_chUn.fas.gz unarchive
# concat into rsAll.fas
cat << '_EOF_' > concat.csh
#!/bin/csh -ef
rm -f rsAll.fas
foreach file (rs_ch*.fas.gz)
echo $file
zcat $file >> rsAll.fas
end
'_EOF_'
# << emacs
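# run it to build rsAll.fas (the invocation was not captured above; presumably just:)
# /bin/csh concat.csh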
# snpCleanSeq strips the header and skips duplicates
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa
rm rsAll.fas
# load on hgwdev
ssh hgwdev
mkdir /gbdb/hg18/snp
ln -s /cluster/store12/snp/126/human/rs_fasta/snp.fa /gbdb/hg18/snp/snp.fa
cd /cluster/store12/snp/126/human/rs_fasta
hgLoadSeq hg18 /gbdb/hg18/snp/snp.fa
# look up id in extFile
# move into separate table
hgsql hg18 < snpSeq.sql
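# (snpSeq.sql is not reproduced here; from the insert statement below it creates
#  a small two-column table, roughly -- column types are an assumption:)
# CREATE TABLE snpSeq (
#   acc varchar(255) not null,
#   file_offset bigint not null
# );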
hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 15200238' hg18
hgsql -e 'delete from seq where extFile = 15200238' hg18
hgsql -e 'alter table snpSeq add index acc (acc)' hg18
# clean up after hgLoadSeq
rm seq.tab
# load on kkr5u00
ssh kkr5u00
    hgsql -e 'create database hg18snp126' mysql
cd /cluster/data/dbSNP/126/human/schema
hgsql hg18snp126 < table.sql
cd ../data
/bin/csh load.csh
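# (load.csh contents not recorded; presumably one "load data" per file, along
#  the lines of the UniVariation load below:)
# foreach t (ContigLoc ContigLocusId ContigInfo MapInfo SNP)
#   zcat $t.gz | hgsql -e "load data local infile '/dev/stdin' into table $t" hg18snp126
# end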
# note rowcount
# ContigLoc 27007176
# SNP 11961761
# MapInfo 11712346
# ContigLocusId 11854143
cd /cluster/data/dbSNP/126/shared
hgsql hg18snp126 < UniVariation.sql
zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' hg18snp126
# create working /scratch dir
cd /scratch/snp
mkdir 126
cd 126
mkdir human
cd human
# get hg18 ctgPos, load into dbSnpHumanBuild126, compare contig list between ctgPos and ContigInfo
# Note: missing chrY PAR regions
# get gnl files
cp /cluster/data/dbSNP/126/human/rs_fasta/*.gnl .
# examine ContigInfo for group_term and edit pipeline.csh
# use "ref_assembly"
cd /scratch/snp/126/human
# filter ContigLoc into ContigLocFilter
# this lifts from contig coords to chrom coords
# phys_pos_from is used to check coords for non-random chroms
# errors reported to stdout
# this gets rid of alternate assemblies (using ContigInfo)
# this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo)
# assumes all contigs are positively oriented; will abort if not true
mysql> desc ContigLocFilter;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromName | varchar(32) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | start | int(11) | NO | | | |
# | end | int(11) | YES | | NULL | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter hg18snp126 ref_assembly reference
# note rowcount
# ContigLocFilter 12368145
# how many are positive strand? hopefully 90%
mysql> select count(*) from ContigLocFilter where orientation = 0;
# 10622168
# note count by loc_type
mysql> select count(*), loc_type from ContigLocFilter group by loc_type;
# +----------+----------+
# | count(*) | loc_type |
# +----------+----------+
# | 205359 | 1 |
# | 10678378 | 2 |
# | 1464642 | 3 |
# | 9025 | 4 |
# | 1117 | 5 |
# | 9624 | 6 |
# +----------+----------+
# filter ContigLocusId into ContigLocusIdFilter
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter hg18snp126 ref_assembly
# note rowcount
# ContigLocusIdFilter 5812538
# condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions)
# assumes SNPs are in numerical order; will errAbort if not true
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense hg18snp126
# note rowcount; expect about 50% for human
# ContigLocusIdCondense 3975405 (note this is smaller than hg17/snp125)
# could delete ContigLocusIdFilter table here
# create chrN_snpFasta tables from *.gnl files
# we are just using molType, but also storing class and observed
# 266,366 duplicates detected in snpMoltype.errors
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta hg18snp126
# (could start using pipeline.csh here)
# (pipeline.csh takes about 35 minutes to run)
# split ContigLocFilter by chrom
# create the first chrN_snpTmp
# we will reuse this table name, adding/changing columns as we go
# at this point chrN_snpTmp will have the same description as ContigLocFilter
# this opens a file handle for every chrom, so will not scale to scaffold-based assemblies
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom hg18snp126 ref_assembly
# adjust coords using loc_type
# possible errors logged to snpLocType.error:
# Unknown locType
# Between with end != start + 1
# Between with allele != '-'
# Exact with end != start
# Range with end < start
# possible exceptions logged to snpLocType.exceptions:
# RefAlleleWrongSize
# This run no errors, no exceptions
# I do note that out of 25K rows where loc_type == 6, 12259 have asn_from == asn_to
# All of loc_type == 1, 4, 5 have zero rows where asn_from == asn_to
# This was also true in build125
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +---------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +---------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# +---------------+-------------+------+-----+---------+-------+
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype hg18snp126 ref_assembly
# expand allele as necessary
# report syntax errors to snpExpandAllele.errors
# possible exceptions logged to snpExpandAllele.exceptions:
# RefAlleleWrongSize
# This run no errors, no exceptions
# 8092 alleles expanded
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele hg18snp126 ref_assembly
# the next few steps prepare for working in UCSC space
# sort by position
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort hg18snp126 ref_assembly
# rename MT --> M (pipeline.csh takes care of this)
hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" hg18snp126
# get hg18 nib files
    # get hg18 chromInfo, load into hg18snp126 with edited path
# lookup reference allele in nibs
# keep reverse complement to use in error checking (snpCheckAlleles)
# check here for SNPs larger than 1024
# errAbort if detected
# check for coords that are too large, log to snpRefUCSC.error and skip
# This run we got 30678 lines in snpRefUCSC.error
# 12178 from chr14 (reported to dbSNP)
# also 18423 from chr1_random and 77 from chr6_random
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC hg18snp126
# morph chrN_snpTmp
mysql> desc chr1_snpTmp;
# +--------------------+-------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+-------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | ctg_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# +--------------------+-------------+------+-----+---------+-------+
# compare allele from dbSNP to refUCSC
# locType between is excluded from this check
# log exceptions to snpCheckAllele.exceptions
# if SNP is positive strand, expect allele == refUCSC
# log RefAlleleMismatch if not
# if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp
# If allele == refUCSCRevComp, log RefAlleleNotRevComp
# If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch
# This run we got:
# 0 RefAlleleMismatch
# 119366 RefAlleleNotRevComp
# Note this is double from build125
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles hg18snp126
# add class and observed using univar_id from SNP table
# to get class (subsnp_class) and observed (var_str) from UniVariation
# log errors to snpClassAndObserved.errors
# errors detected:
# class = 0 in UniVariation
# class > 8 in UniVariation
# univar_id = 0 in SNP
# no row in SNP for snp_id in chrN_snpTmp
# This run we got:
# 3 class = 0 in UniVariation
# 0 class > 8 in UniVariation
# 39059 univar_id = 0 in SNP
# 879 no row in SNP for snp_id in chrN_snpTmp (all chr6)
# dbSNP has class = 'in-del'
# we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved hg18snp126
# morph chrN_snpTmp
# +--------------------+---------------+------+-----+---------+-------+
# | Field | Type | Null | Key | Default | Extra |
# +--------------------+---------------+------+-----+---------+-------+
# | snp_id | int(11) | NO | | | |
# | chromStart | int(11) | NO | | | |
# | chromEnd | int(11) | NO | | | |
# | loc_type | tinyint(4) | NO | | | |
# | class | varchar(255) | NO | | | |
# | orientation | tinyint(4) | NO | | | |
# | allele | blob | YES | | NULL | |
# | refUCSC | blob | YES | | NULL | |
# | refUCSCReverseComp | blob | YES | | NULL | |
# | observed | blob | YES | | NULL | |
# +--------------------+---------------+------+-----+---------+-------+
# generate exceptions for class and observed
# SingleClassBetweenLocType
# SingleClassRangeLocType
# NamedClassWrongLocType
# ObservedWrongFormat
# ObservedWrongSize (twice as many as hg17/snp125)
# ObservedMismatch (nearly 3x as many as hg17/snp125)
# RangeSubstitutionLocTypeExactMatch
# SingleClassTriAllelic
# SingleClassQuadAllelic
# This will also detect IUPAC symbols in allele
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved hg18snp126
# add function
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction hg18snp126
# add validation status and heterozygosity
# log error if validation status > 31 or missing
# this run we got 8 missing
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP hg18snp126
# add molType
# errors detected: missing or duplicate molType
# no errors this run
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype hg18snp126
# generate chrN_snp126 and snp126Exceptions tables
cp snpCheckAlleles.exceptions snpCheckAlleles.tab
cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
cp snpExpandAllele.exceptions snpExpandAllele.tab
cp snpLocType.exceptions snpLocType.tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable hg18snp126 126
# handle chrY PAR SNPs (still missing from dbSNP)
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpPAR hg18snp126
hgsql -e 'load data local infile "snpPARexceptions.tab" into table snp126Exceptions' hg18snp126
# concat into snp126.tab
# cat chr*_snp126.tab >> snp126.tab
# note chr18_random_snp126.tab is empty (just 2 rows in hg17/snp125)
/bin/sh concat.sh
# check for multiple alignments
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple hg18snp126
mysql> load data local infile 'snpMultiple.tab' into table snp126Exceptions;
# run and review snpCompareLoctype
# load snp125subset
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCompareLoctype hg18snp126 snp125subset snp126
# cat snpCompareLoctypeCounts.out
# note: rangeToExact is 2x 124/125 conversion rate
# exactToExact = 8747888
# exactToBetween = 1071
# exactToRange = 6673
# betweenToBetween = 321371
# betweenToExact 1323
# betweenToRange 514
# rangeToRange = 95562
# rangeToBetween = 1794
# rangeToExact = 15148
# oldToNew = 10649
# run and review snpCompareWeight
# load into database snp125snp126
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpCompareWeight snp125snp126 weight125 weight126
# cat snpCompareWeightCounts.out
# oneToOne = 9161896
# oneToTwo = 0 <-- good
# oneToThree = 531 <--- interesting but minor
# twoToTwo = 38 <-- okay
# twoToOne = 1896 <--- improvement
# twoToThree = 0 <-- good
# threeToThree = 494 <-- okay
# threeToOne = 37571 <-- improvement
# threeToTwo = 12 <-- improvement
# load on hgwdev
cp snp126.tab /cluster/home/heather/transfer/snp
hgsql hg18snp126 -e 'select * from snp126Exceptions' > /cluster/home/heather/transfer/snp/snp126Exceptions.tab
ssh hgwdev
mysql> load data local infile 'snp126.tab' into table snp126;
mysql> load data local infile 'snp126Exceptions.tab' into table snp126Exceptions;
# create indexes
mysql> alter table snp126 add index name (name);
mysql> alter table snp126 add index chrom (chrom, bin);
mysql> alter table snp126Exceptions add index name(name);
# create snp126ExceptionDesc table
cd /cluster/data/dbSNP
hgsql hg18 < snp126ExceptionDesc.sql
# add counts to exception.human.126, can start with exception.template
hgsql -e 'select count(*), exception from snp126Exceptions group by exception' hg18
mysql> load data local infile 'exception.human.126' into table snp126ExceptionDesc;
################################################################
# SNP126 edit: condense UTR/intron func into just intron at Jim's request
ssh kkr5u00
cd /scratch/snp/126/human
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense hg18snp126
/bin/csh pipeline.csh
ssh hgwdev
cd /cluster/home/heather/transfer/snp
hgsql hg18 -e 'drop table snp126'
hgsql hg18 < /cluster/home/heather/kent/src/hg/lib/snp126.sql
hgsql hg18 -e 'load data local infile "snp126.tab" into table snp126'
hgsql hg18 -e 'alter table snp126 add index name (name)'
hgsql hg18 -e 'alter table snp126 add index chrom (chrom, bin)'
################################################################
# SNP126 edit: detect clustering errors (Heather, Sept. 2006)
# for locType = 'between' (class = 'insertion')
# 1,393,040 candidates
# exceptions:
# DuplicatedObserved (3020 of these)
# MixedObserved (1312 of these)
# create and populate a simple table snp126insertions
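# (the table creation itself was not recorded; from the insert below it needs
#  these seven columns -- the types here are an assumption:)
# CREATE TABLE snp126insertions (
#   chrom      varchar(31)       not null,
#   chromStart int unsigned      not null,
#   chromEnd   int unsigned      not null,
#   name       varchar(15)       not null,
#   score      smallint unsigned not null,
#   strand     char(1)           not null,
#   observed   blob
# );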
mysql> insert into snp126insertions
select chrom, chromStart, chromEnd, name, score, strand, observed from snp126
where locType = 'between' and class = 'insertion';
# generate and load data
cd /cluster/home/heather/kent/src/hg/snp/snpLoad
./snpCheckCluster hg18 snp126insertions
mysql> load data local infile 'snpCheckCluster.tab' into table snp126Exceptions;
# update snp126ExceptionDesc
################################################################
# generate snpMasked sequence for snp126 (Heather, Sept. 2006)
# snpMaskChrom was run too, not documented here.
# OBSOLETED by snp128Mask, see below.
# 3 steps: simple filtering, advanced filtering, generate sequence
# simple filtering: create and populate tables
# insertions: 1,393,040
# deletions: 783,454
ssh hgwdev
mysql> insert into snp126insertions select * from snp126
where locType = 'between' and class = 'insertion';
mysql> insert into snp126deletions select * from snp126
where class = 'deletion';
# advanced filtering -- insertions
cd /cluster/home/heather/kent/src/hg/snp/snpLoad
# this removes SNPs with weight != 1
# this removes SNPs that align to more than one position
# this removes SNPs that cluster together with conflicting observations
# (these should be class = 'mixed')
# this removes SNPs with invalid observed string
# this asserts end == start
# final count 1,352,380
# written to insertions.tab
./snpGetInsertions hg18 snp126insertions snp126Exceptions
# advanced filtering -- deletions
cd /cluster/home/heather/kent/src/hg/snp/snpLoad
# this removes SNPs with weight != 1
# this removes SNPs that align to more than one position
# this removes SNPs with invalid observed string
# this removes SNPs with exception ObservedWrongSize
# this asserts end > start
# final count 621,024
# written to deletions.tab
./snpGetDeletions hg18 snp126deletions snp126Exceptions
# Note: the advanced filtering pretty much removes all SNPs from chrN_random
# generate sequence -- insertions
# use kent/src/hg/snp/snpMask/seqWithInsertions.c
# this asserts that position doesn't exceed chromSize
# this will reverse complement observed if strand is negative
# if no SNPs found, output sequence == input sequence
# write to chrN.fat
ssh kkr5u00
mysql> load data local infile
"/cluster/home/heather/kent/src/hg/snp/snpLoad/insertions.tab" into table
snp126insertionsClean;
cd /scratch/snp126/human/fat
/bin/sh fat.sh
cp *.fat /cluster/data/hg18/snpMask/insertions
ssh kkstore02
cd /cluster/data/hg18/snpMask/insertions
nice gzip *.fat
# generate sequence -- deletions
# use kent/src/hg/snp/snpMask/seqWithoutDeletions.c
# this asserts that position doesn't exceed chromSize
# if no SNPs found, output sequence == input sequence
# write to chrN.skinny
ssh kkr5u00
mysql> load data local infile
"/cluster/home/heather/kent/src/hg/snp/snpLoad/deletions.tab" into table
snp126deletionsClean;
cd /scratch/snp126/human/skinny
/bin/sh skinny.sh
cp *.skinny /cluster/data/hg18/snpMask/deletions
ssh kkstore02
cd /cluster/data/hg18/snpMask/deletions
nice gzip *.skinny
# create links on hgwdev
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg18/snpMask/insertions
/bin/sh link.sh
cd /usr/local/apache/htdocs/goldenPath/hg18/snpMask/deletions
/bin/sh link.sh
############################################################################
# Lift simple bi-allelic SNPs to rheMac2 and panTro2 (Heather, August 2006)
# OBSOLETED by snp128Ortho, see below.
ssh hgwdev
cd /cluster/data/dbSNP/ortho/hg18/snpDump
# dump raw data -- this creates snpGetSimple.chr*
# exceptions table is used to skip SNPs that align in multiple places
# We also skip SNPs on chrN_random
# We also skip triallelic and quadallelic
# We don't filter on weight
# This yields 9,092,533 SNPs
# This data is also stored into hg18.snp126simple for later use
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpGetSimple hg18 snp126 snp126Exceptions
# split up into just under 200 files to make for an efficient pk run
# using file size of 60K lines
# this creates /cluster/data/dbSNP/ortho/hg18/split/chr1-01, chr1-02, chr1-03, etc.
# 165 files created
# 140 files have 60k lines
/bin/csh split.csh
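# (split.csh not shown; a sketch of the idea with the standard split utility,
#  60000 lines per piece -- the exact output naming is assumed:)
# foreach f (snpGetSimple.chr*)
#   set c = `echo $f | sed 's/snpGetSimple\.//'`
#   split -d -l 60000 $f /cluster/data/dbSNP/ortho/hg18/split/$c-
# end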
# prepare cluster runs
# I didn't use -bedPlus=6, didn't seem to need it
cp /cluster/data/dbSNP/ortho/hg18/split/* /san/sanvol1/snp/liftOver/hg18/rheMac2/input
cp /cluster/data/dbSNP/ortho/hg18/split/* /san/sanvol1/snp/liftOver/hg18/panTro2/input
cd /san/sanvol1/snp/liftOver/hg18/rheMac2
/bin/csh makeJobList.csh
rm -f jobList
foreach fileName (`ls input/chr*`)
set baseName = $fileName:t
echo liftOver $fileName /cluster/data/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz output/$baseName.out unmapped/$baseName.unmapped >> jobList
end
cd /san/sanvol1/snp/liftOver/hg18/panTro2
/bin/csh makeJobList.csh
rm -f jobList
foreach fileName (`ls input/chr*`)
set baseName = $fileName:t
echo liftOver $fileName /cluster/data/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz output/$baseName.out unmapped/$baseName.unmapped >> jobList
end
# do cluster runs
# this only took a few minutes
# got 7321537 lifts for rheMac2
# got 8517465 lifts for panTro2
ssh pk
cd /san/sanvol1/snp/liftOver/hg18/rheMac2
para create jobList
para try; para check; para push; para check; etc.
cd /san/sanvol1/snp/liftOver/hg18/panTro2
para create jobList
para try; para check; para push; para check; etc.
# concatenate output files into all.out
cd /san/sanvol1/snp/liftOver/hg18/rheMac2/output
/bin/csh concat.csh
cd /san/sanvol1/snp/liftOver/hg18/panTro2/output
/bin/csh concat.csh
# load into panTro2 and rheMac2
# Doing the load and split so I can easily load sequence for a full chrom
ssh hgwdev
cp /san/sanvol1/snp/liftOver/hg18/rheMac2/output/all.out /cluster/data/dbSNP/ortho/hg18/rheMac2Lift
cd /cluster/data/dbSNP/ortho/hg18/rheMac2Lift
hgsql rheMac2 < snp126hg18ortho.sql
hgsql -e 'load data local infile "all.out" into table snp126hg18ortho' rheMac2
cp /san/sanvol1/snp/liftOver/hg18/panTro2/output/all.out /cluster/data/dbSNP/ortho/hg18/panTro2Lift
cd /cluster/data/dbSNP/ortho/hg18/panTro2Lift
hgsql panTro2 < snp126hg18ortho.sql
hgsql -e 'load data local infile "all.out" into table snp126hg18ortho' panTro2
# split by chrom
# this creates tables chrN_snp126hg18ortho and can be run from anywhere
# it will create chrN_snp126hg18ortho.tab files which can be deleted
cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom2 rheMac2 snp126hg18ortho
rm chr*.tab
# rm snp126ortho.tab
cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom2 panTro2 snp126hg18ortho
rm chr*.tab
# rm snp126ortho.tab
# get sequence
# this creates chrN_snp126hg18orthoPrelim.tab files
# random chroms are okay here
# note we are including Ns
# This will log to fetchSeq.errors any examples where chromEnd != chromStart + 1
# It will also check for coordinates past the end of the chrom.
# No errors for rheMac2 or panTro2.
cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq
/cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq rheMac2 /cluster/data/rheMac2/rheMac2.2bit
# ssh kkstore02
# cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq
/bin/csh concat.csh
# cleanup; remove split tables from rheMac2, keep snp126hg18orthoPrelim
hgsql rheMac2 < drop.sql
rm chr*.tab
cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq
/cluster/home/heather/kent/src/hg/snp/snpLoad/fetchSeq panTro2 /cluster/data/panTro2/panTro2.2bit
# ssh kkstore02
# cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq
/bin/sh concat.sh
# cleanup; remove split tables from panTro2, keep snp126hg18orthoPrelim
hgsql panTro2 < drop.sql
rm chr*.tab
# do a preliminary load -- combine chimp and macaque
cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq
hgsql hg18 < snp126orthoPrelim.sql
hgsql -e 'load data local infile "snp126orthoPrelim.tab" into table snp126orthoPrelim' hg18
cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq
hgsql -e 'load data local infile "snp126orthoPrelim.tab" into table snp126orthoPrelim' hg18
# add human chrom, chromStart, chromEnd, allele, variant
# liftOver loses the chrom, chromStart and chromEnd
# liftOver does retain the allele and variant
cd /cluster/data/dbSNP/ortho/hg18/integrate
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrthoLookup hg18 snp126simple snp126orthoPrelim
# load final table with separate rows for chimp and macaque
# drop snp126orthoPrelim because it has non-human coords
# rm tab file because it is huge
hgsql hg18 < snp126ortho.sql
load data local infile "snpOrthoLookup.tab" into table snp126ortho
drop table snp126orthoPrelim
rm snpOrthoLookup.tab
# create indices
mysql> alter table snp126ortho add index name (name);
mysql> alter table snp126ortho add index chrom (chrom, bin);
# manually validate a few examples on various chroms, various strands
# I used rheMac2:
# rs533274, hg18 chr1 +, rheMac2 chr18 -
# rs1690550, hg18 chr1 -, rheMac2 chr19 +
# rs3121568, hg18 chr1 -, rheMac2 chr19 -
# rs28709562, hg18 chr1 +, rheMac2 chr19 +
# rs34675838, also hg18 chr1 +, rheMac2 chr19 +
# create alternate format with both alleles in same row
/cluster/home/heather/kent/src/hg/snp/snpLoad/snpOrthoJoin hg18 snp126simple snp126ortho
# 8517465 rows in hash for panTro2
# 7321537 rows in hash for rheMac2
# humanCount = 9092533
# chimpOnlyCount = 1418324
# macaqueOnlyCount = 222396
# missingCount = 352672
# bothCount = 7098141
# confirm that chimpOnly + macaqueOnly + missing + both = human
hgsql hg18 < snp126orthoPanTro2RheMac2.sql
hgsql -e "load data local infile 'snpOrthoJoin.tab' into table snp126orthoPanTro2RheMac2" hg18
mysql> alter table snp126orthoPanTro2RheMac2 add index name (name);
mysql> alter table snp126orthoPanTro2RheMac2 add index chrom (chrom, bin);
################################################################
### CREATE chimpHiQualDiff -- panTro2 (Daryl; May 1, 2006)
# Make file/table of high quality single base pair differences
# between hg18 and panTro2
set bedDir = /cluster/data/hg18/bed/chimpHiQualDiffs
mkdir -p $bedDir
cd $bedDir
sed 's/simpleNucDiff/chimpHiQualDiffs/' ~/kent/src/hg/lib/simpleNucDiff.sql >! chimpHiQualDiffs.sql
set axtDir = /cluster/data/hg18/bed/blastz.panTro2/axtRBestNet
mkdir -p chroms; cd chroms
ls -1 $axtDir | grep chr | grep axt | sed 's/.hg18.panTro2.net.axt.gz//' | grep -v random | grep -v "_" | xargs mkdir
set workDir = /scratch/chqd
mkdir -p $workDir
touch $workDir/chqd.log
    # time nice /cluster/home/daryl/bin/i386/chimpHiQualDiffs $workDir/$f /cluster/data/panTro2/bed/quality/qac/panTro2.qac $f.chimpHiQualDiffs.bed >>& $workDir/chqd.log
foreach f (chr*)
echo -n $f " "
mkdir -p $workDir/$f/
cp $axtDir/$f.*.axt.gz $workDir/$f/
gunzip $workDir/$f/$f.*.axt.gz
	time nice /cluster/home/daryl/bin/i386/chimpHiQualDiffs $workDir/$f /cluster/data/panTro2/bed/quality/qac/panTro2.qac $f.chimpHiQualDiffs.bed
rm -f $workDir/$f/$f.*axt
rmdir $workDir/$f/
end
mv $workDir/chqd.log .
cat chr*bed >! ../chimpHiQualDiffs.bed
## The load (sort) ran out of memory on hgwdev, so sort the
## file first on kolossus and then load it on hgwdev
ssh kolossus
time hgLoadBed -strict -sqlTable=chimpHiQualDiffs.sql -noLoad hg18 chimpHiQualDiffs chimpHiQualDiffs.bed
# 110.214u 10.836s 2:24.42 83.8% 0+0k 0+0io 1pf+0w
exit
## hgwdev
time hgLoadBed -hasBin -noSort -sqlTable=chimpHiQualDiffs.sql hg18 chimpHiQualDiffs bed.tab
# 328.890u 113.230s 42:26.00 17.3% 0+0k 0+0io 197676pf+0w
## TODO: need to filter out polymorphic sites (SNPs)
#################################################################
###### BUILD SUPERFAMILY RELATED TABLES (DONE - 2006-06-20 - Fan)
# Build Superfamily track and create sf tables needed for PB
ssh hgwdev
hgsql hg18 < ~/src/hg/lib/sfAssign.sql
cd /cluster/data/superfamily/060619
hgsql hg18 -e 'load data local infile "ass_18-Jun-2006.tab" into table hg18.sfAssign;'
# If hg18.sfDes already exists, drop it.
mkdir /cluster/data/hg18/bed/sf
cd /cluster/data/hg18/bed/sf
hgsql superfam060619 -N -e "select * from des" >sfDes.tab
hgsql hg18 < ~/src/hg/lib/sfDes.sql
hgsql hg18 -e 'load data local infile "sfDes.tab" into table sfDes'
# Build ensemblXref3
# Get the ensembl gene/protein cross-reference data from Ensembl BioMart
# http://www.ensembl.org/Multi/martview
# Follow this sequence through the pages:
# Page 1) Select Ensembl39 and Homo sapiens. Hit next.
# Page 2) Do not select anything. Hit next.
# Page 3) Choose the "Feature" box, select Ensembl gene ID, transcript ID, peptide ID,
#         UniProt/TrEMBL ID, UniProt/SWISSPROT ID, and UniProt/SWISSPROT Accession
# Page 4) Choose "Text, tab separated". choose gzip compression. hit export.
# Save as ensemblXref3.gz
ssh hgwdev
cd /cluster/data/hg18/bed/ensembl
    gzip -d ensemblXref3.gz
hgsql hg18 < ~/src/hg/lib/ensemblXref3Temp.sql
hgsql hg18 -e \
'load data local infile "ensemblXref3" into table ensemblXref3Temp ignore 1 lines'
hgsql hg18 -N -e \
'select gene, "0", transcript, "0", protein, "0", tremblAcc, swissDisplayId, swissAcc from ensemblXref3Temp' \
> ensemblXref3.tab
hgsql hg18 -e 'drop table ensemblXref3'
hgsql hg18 <~/src/hg/lib/ensemblXref3.sql
hgsql hg18 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3'
# If hg18.superfamily already exists, drop it.
cd /cluster/data/hg18/bed/sf
hgSuperfam hg18 superfam060619 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If hg18.sfDescription exists, drop it.
hgsql hg18 < ~/src/hg/lib/sfDescription.sql
hgsql hg18 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg18.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg18 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
# Note hs is changed into ht for this Superfamily release.
cat /cluster/data/superfamily/060619/ass_18-Jun-2006.tab \
| hgKnownToSuper hg18 hs stdin
# created 27,511 rows in knownToSuper
############################################################################
# SEGMENTAL DUPLICATIONS (DONE 7/14/06 angie)
# File emailed from Xinwei She <xws at u.washington.edu>
mkdir /cluster/data/hg18/bed/genomicSuperDups
cd /cluster/data/hg18/bed/genomicSuperDups
# The sed command is necessary to fix "_" used as strand.
# The awk command was necessary for some recent other species
# genomicSuperDups that had some too-short regions. It does not seem
# to be necessary here, but doesn't hurt and may be useful in
# future builds.
sed -e 's/\t_\t/\t-\t/' hg18genomicSuperDup.tab \
| awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
| hgLoadBed hg18 genomicSuperDups stdin \
-sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
############################################################################
# GENE BOUNDS (RNACLUSTER) (DONE 08-09-2006 Fan)
# Create rnaCluster table (depends on {est,mrna}OrientInfo)
cd /cluster/data/hg18/bed
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Create a list of accessions that come from RAGE libraries and need to be excluded.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs
foreach f (/cluster/data/hg18/nib/chr*.nib)
set c = $f:t:r
set out = chrom/$c.bed
    # Exclude accessions in the RAGE file
echo clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c
clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c
end
hgLoadBed hg18 rnaCluster chrom/*.bed
############################################################################
############################################################################
# POLYA_DB TRACK (DONE 08-28-2006 Andy)
mkdir /cluster/data/hg18/bed/polyaDB
cd /cluster/data/hg18/bed/polyaDB
wget http://polya.umdnj.edu/download/polyAsite.gz
gunzip polyAsite.gz
find /cluster/data/hg16/ -name 'ordered.lft' | xargs cat > hg16.lft
sed 's/\(\s\).*\//\1/; s/chr/hg16.chr/' hg16.lft > tmp
mv tmp hg16.lft
cut -f2 hg16.lft > hg16.lft.names
grep -F -f hg16.lft.names polyAsite > hg16.polyAsite
awk '{printf("%s\t%d\t%d\t%s\n", $3, ($5-1), $5, $1);}' hg16.polyAsite > hg16.polyAsite.bed
liftUp lifted.bed hg16.lft warn hg16.polyAsite.bed
sed 's/hg16\.//' lifted.bed > final.bed
liftOver final.bed /gbdb/hg16/liftOver/hg16ToHg18.over.chain.gz hg18.bed unmapped
hgLoadBed hg18 polyaDB hg18.bed
# trackDb entry/html in human/hg18
############################################################################
# Translate SNP Array data from hg17 (Heather August 2006)
# Affy500
cd /cluster/data/hg18/bed/snp/affy
# get rsId/affy name pairs from hg17 where rsId != 'unknown'
# 257954 candidates from Nsp (4311 with unknown rsId)
# 234765 candidates from Sty (3540 with unknown rsId)
hgsql hg17 < getHg17-Nsp.sql > nsp.hg17
hgsql hg17 < getHg17-Sty.sql > sty.hg17
# get name, chrom, chromStart, chromEnd, strand, observed from snp126simple
# snp126simple contains only class = "simple", locType = "exact",
# chromEnd = chromStart + 1, biallelic, singly-aligning
hgsql hg18 < getHg18.sql > snp126simple.hg18
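    # (getHg18.sql is not shown; from the description above it is essentially:)
    # SELECT name, chrom, chromStart, chromEnd, strand, observed FROM snp126simple;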
# sort and join
    # 257213 in nsp.join
# 233941 in sty.join
# 741 in nsp.missing
# 824 in sty.missing
sort nsp.hg17 > nsp.hg17.sort
sort sty.hg17 > sty.hg17.sort
sort snp126simple.hg18 > snp126simple.hg18.sort
join nsp.hg17.sort snp126simple.hg18.sort > nsp.join
join sty.hg17.sort snp126simple.hg18.sort > sty.join
join -v 1 nsp.hg17.sort snp126simple.hg18.sort > nsp.missing
join -v 1 sty.hg17.sort snp126simple.hg18.sort > sty.missing
# fix column order
awk '{print $3, $4, $5, $2, 0, $6, $7, $1}' nsp.join > nsp.bed
awk '{print $3, $4, $5, $2, 0, $6, $7, $1}' sty.join > sty.bed
# load
hgLoadBed hg18 snpArrayAffy250Nsp nsp.bed -sqlTable=snpArrayAffy250Nsp.sql
hgLoadBed hg18 snpArrayAffy250Sty sty.bed -sqlTable=snpArrayAffy250Sty.sql
# cleanup
rm nsp.hg17 nsp.hg17.sort nsp.join
rm sty.hg17 sty.hg17.sort sty.join
rm snp126simple.hg18 bed.tab
mv snp126simple.hg18.sort ../illumina
gzip nsp.bed sty.bed
# Illumina300
cd /cluster/data/hg18/bed/snp/illumina
# 317,100 candidates from hg17
hgsql -e 'select name from snpArrayIllumina300' hg17 > hg17.data
# sort and join
# 314,093 in join.out
# 3,007 in join.missing
sort hg17.data > hg17.data.sort
join hg17.data.sort hg18.data.sort > join.out
join -v 1 hg17.data.sort hg18.data.sort > join.missing
# fix column order
awk '{print $2, $3, $4, $1}' join.out > illumina.bed
# load
hgsql hg18 < snpArrayIllumina300.sql
hgLoadBed hg18 snpArrayIllumina300 illumina.bed -sqlTable=snpArrayIllumina300.sql
# cleanup
rm hg17.data hg17.data.sort hg18.data.sort bed.tab join.out
gzip illumina.bed
##########################################################################
# New SNP Array data (Heather April 2007)
# Affymetrix introduced a new genotyping array in February
# I got the data from Venu in April
# It is based on dbSNP build 126
# Venu reviewed the load
ssh hgwdev
cd /cluster/data/hg18/bed/snp/affy
# There were 60 lines with no chrom, chromEnd or strand
grep -v NULL GenomeWideSNP_5_ucsc.tsv > genomewide.in
# little Perl script to add chromEnd & score for bed format
    genomewide.pl < genomewide.in > genomewide.bed
# preliminary load
hgLoadBed hg18 snpArrayAffyGenomeWidePrelim genomewide.bed -tab -sqlTable=snpArrayAffyGenomeWidePrelim.sql
# based on position, lookup rsId
# 2 runs
# first run: don't include dbSNP if class != single or locType != exact or
# chromEnd != chromStart + 1
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg18 snpArrayAffyGenomeWidePrelim snp126
# missing count = 5279
# multiple count = 44
# second run: use all of snp126
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg18 snpArrayAffyGenomeWidePrelim snp126
# missing count = 5210
# multiple count = 724
# Use the first run (better to avoid nearly 700 multiples at the cost of
# 69 more unknown)
hgLoadBed hg18 snpArrayAffy5 affyLookup.out -tab -sqlTable=snpArrayAffy5.sql
##########################################################################
# More new SNP Array data from Affymetrix (Heather May 2007)
# Source: Venu_Valmeekam at affymetrix.com
# This is the 6.0 array, announced mid-May
# It contains 2 components: single-base substitutions and copy-number probes
# Single-base substitutions are based on snp127
ssh hgwdev
cd /cluster/data/hg18/bed/snp/affy/6.0/single
unzip GenomeWideSNP_6_ucsc_1.tsv.zip
unzip GenomeWideSNP_6_ucsc_2.tsv.zip
format.pl < GenomeWideSNP_6_ucsc_1.tsv > 1.bed
format.pl < GenomeWideSNP_6_ucsc_2.tsv > 2.bed
cp 1.bed all.bed
cat 2.bed >> all.bed
    hgLoadBed hg18 snpArrayAffy6Prelim all.bed -tab -sqlTable=snpArrayAffy6Prelim.sql
mysql> update snpArrayAffy6Prelim set chrom = "chrM" where chrom = "chrMT";
/cluster/home/heather/kent/src/hg/snp/snpLoad/affyLookup hg18 snpArrayAffy6Prelim snp127
# missing count = 1149
# multiple count = 2396
# used the strict version of affyLookup (class="single", locType="exact", size=1)
hgLoadBed hg18 snpArrayAffy6 affyLookup.out -tab -sqlTable=snpArrayAffy6.sql
mysql> alter table snpArrayAffy6 add index name(name);
mysql> alter table snpArrayAffy6 add index chrom(chrom, bin);
cd /cluster/data/hg18/bed/snp/affy/6.0/sv
unzip GenomeWideSNP_6_CN_ucsc_1.tsv.zip
unzip GenomeWideSNP_6_CN_ucsc_2.tsv.zip
format.pl < GenomeWideSNP_6_CN_ucsc_1.tsv > 1.bed
format.pl < GenomeWideSNP_6_CN_ucsc_2.tsv > 2.bed
cp 1.bed all.bed
cat 2.bed >> all.bed
hgLoadBed hg18 snpArrayAffy6SV all.bed -tab
mysql> delete from snpArrayAffy6SV where chrom = "chr0";
mysql> update snpArrayAffy6SV set chromStart = chromStart - 1;
##########################################################################
# Venu from Affy requested to remove about 25,000 items from
# snpArrayAffy6 track.
#
# Imported the list into the table, snpArrayAffy6Remove, in hg18.
#
# Issued a simple MySQL command to delete records in snpArrayAffy6
# that have ids in snpArrayAffy6Remove (sorry, the exact statement was not
# written down).
#
# This was done 10/8/07. Fan.
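#
# (A minimal sketch of what that delete probably looked like, assuming the
#  removal list was loaded with the affy ids in a column named "name":)
# hgsql hg18 -e 'delete snpArrayAffy6 from snpArrayAffy6, snpArrayAffy6Remove where snpArrayAffy6.name = snpArrayAffy6Remove.name'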
##########################################################################
# New Illumina Array data (Heather April 2007)
# HumanHap300v3, HumanHap550v3, HumanHap650v3
# Data from Luana Galver (lgalver at illumina.com)
# Based on dbSNP build 126
ssh hgwdev
cd /cluster/data/hg18/bed/snp/illumina
# split off chrM from zips
bed.pl < 300.in > 300.bed
bed.pl < 550.in > 550.bed
bed.pl < 650.in > 650.bed
chrM.pl < 550.in.M > 550.bed.M
chrM.pl < 650.in.M > 650.bed.M
hgLoadBed hg18 snpArrayIllumina300 300.bed -sqlTable=snpArrayIllumina300.sql -tab
hgLoadBed hg18 snpArrayIllumina550 550.bed -sqlTable=snpArrayIllumina550.sql -tab
hgLoadBed hg18 snpArrayIllumina650 650.bed -sqlTable=snpArrayIllumina650.sql -tab
hgLoadBed hg18 snpArrayIllumina550 550.bed.M -tab -oldTable
hgLoadBed hg18 snpArrayIllumina650 650.bed.M -tab -oldTable
# add indices
mysql> alter table snpArrayIllumina300 add index name (name);
mysql> alter table snpArrayIllumina300 add index chrom (chrom, bin);
mysql> alter table snpArrayIllumina550 add index name (name);
mysql> alter table snpArrayIllumina550 add index chrom (chrom, bin);
mysql> alter table snpArrayIllumina650 add index name (name);
mysql> alter table snpArrayIllumina650 add index chrom (chrom, bin);
# fix strand convention
mysql> update snpArrayIllumina300 set strand = "+" where strand = "F";
mysql> update snpArrayIllumina300 set strand = "-" where strand = "R";
mysql> update snpArrayIllumina550 set strand = "+" where strand = "F";
mysql> update snpArrayIllumina550 set strand = "-" where strand = "R";
mysql> update snpArrayIllumina650 set strand = "+" where strand = "F";
mysql> update snpArrayIllumina650 set strand = "-" where strand = "R";
# Note no A/T or C/G!!
mysql> select distinct(observed) from snpArrayIllumina300;
# +----------+
# | observed |
# +----------+
# | [A/G] |
# | [T/C] |
# | [A/C] |
# | [T/G] |
# +----------+
# fix observed
mysql> update snpArrayIllumina300 set observed = "A/C" where observed = "[A/C]";
mysql> update snpArrayIllumina550 set observed = "A/C" where observed = "[A/C]";
mysql> update snpArrayIllumina650 set observed = "A/C" where observed = "[A/C]";
mysql> update snpArrayIllumina300 set observed = "A/G" where observed = "[A/G]";
mysql> update snpArrayIllumina550 set observed = "A/G" where observed = "[A/G]";
mysql> update snpArrayIllumina650 set observed = "A/G" where observed = "[A/G]";
mysql> update snpArrayIllumina300 set observed = "C/T" where observed = "[T/C]";
mysql> update snpArrayIllumina550 set observed = "C/T" where observed = "[T/C]";
mysql> update snpArrayIllumina650 set observed = "C/T" where observed = "[T/C]";
mysql> update snpArrayIllumina300 set observed = "G/T" where observed = "[T/G]";
mysql> update snpArrayIllumina550 set observed = "G/T" where observed = "[T/G]";
mysql> update snpArrayIllumina650 set observed = "G/T" where observed = "[T/G]";
# Note 2 rows in 300 and 15 rows in 550 and 650 where chrom = "chrXY"
# validation
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg18 snpArrayIllumina300 snp126 snp126Exceptions illuminaLookup.hg18.300
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg18 snpArrayIllumina550 snp126 snp126Exceptions illuminaLookup.hg18.550
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup hg18 snpArrayIllumina650 snp126 snp126Exceptions illuminaLookup.hg18.650
# Not found: 2 in 300, 15 in 550 and 650
# These are in snp127
# Mixed: 55 in 300, 74 in 550, 81 in 650
# Found 2 strange things here:
# First of all, for snps that are illumina forward strand, dbSNP reverse strand:
# in all cases, the observed polymorphism is identical.
# Counts:
# 36k on the HumanHap300v3
# 52k on the HumanHap550v3
# 59k on the HumanHap650v3
# This surprises me, because the dbSNP observation is intended to be reverse-complemented.
# Examples from HumanHap300v3 include rs1000007, rs1000031, rs1000041, rs1000071, rs1000078.
# Secondly, for snps that are illumina reverse strand:
# in all cases the observed polymorphism is the reverse complement of the dbSNP polymorphism.
# this could only make sense for the dbSNP forward strand OR the dbSNP reverse strand, although I don't think it matters which one.
# examples:
# rs3934834: illumina A/G (-), dbSNP C/T (+)
# rs6687776: illumina A/G (-), dbSNP C/T (+)
# rs2298217: illumina A/G (-), dbSNP C/T (+)
# rs9442380: illumina A/G (-), dbSNP C/T (+)
# rs3737728: illumina A/G (-), dbSNP C/T (-)
# rs3813199: illumina A/G (-), dbSNP C/T (-)
# rs880051: illumina A/G (-), dbSNP C/T (-)
# rs12562034: illumina C/T (-), dbSNP A/G (+)
# rs9442372: illumina C/T (-), dbSNP A/G (+)
# rs11260588: illumina C/T (-), dbSNP A/G (+)
# rs12726255: illumina C/T (-), dbSNP A/G (+)
# rs2887286: illumina C/T (-), dbSNP A/G (-)
# rs2649588: illumina C/T (-), dbSNP A/G (-)
# rs2296716: illumina C/T (-), dbSNP A/G (-)
# rs2474460: illumina C/T (-), dbSNP A/G (-)
# redo this, just using name/chrom/pos from illumina
bed2.pl < 300.in > 300.bed.2
hgLoadBed hg18 snpArrayIllumina300Prelim 300.bed.2 -tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup2 hg18 snpArrayIllumina300Prelim snp126 snp126Exceptions
mv illuminaLookup.out lookup.300
mv illuminaLookup.err lookup.300.err
hgLoadBed hg18 snpArrayIllumina300 lookup.300 -tab -sqlTable=snpArrayIllumina300.sql
hgsql -N -e 'drop table snpArrayIllumina300Prelim' hg18
bed2.pl < 550.in > 550.bed.2
hgLoadBed hg18 snpArrayIllumina550Prelim 550.bed.2 -tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup2 hg18 snpArrayIllumina550Prelim snp126 snp126Exceptions
mv illuminaLookup.err lookup.550.err
mv illuminaLookup.out lookup.550
hgLoadBed hg18 snpArrayIllumina550 lookup.550 -tab -sqlTable=snpArrayIllumina550.sql
hgsql -N -e 'drop table snpArrayIllumina550Prelim' hg18
bed2.pl < 650.in > 650.bed.2
hgLoadBed hg18 snpArrayIllumina650Prelim 650.bed.2 -tab
/cluster/home/heather/kent/src/hg/snp/snpLoad/illuminaLookup2 hg18 snpArrayIllumina650Prelim snp126 snp126Exceptions
mv illuminaLookup.out lookup.650
mv illuminaLookup.err lookup.650.err
hgLoadBed hg18 snpArrayIllumina650 lookup.650 -tab -sqlTable=snpArrayIllumina650.sql
hgsql -N -e 'drop table snpArrayIllumina650Prelim' hg18
# add indices
mysql> alter table snpArrayIllumina300 add index name (name);
mysql> alter table snpArrayIllumina300 add index chrom (chrom, bin);
mysql> alter table snpArrayIllumina550 add index name (name);
mysql> alter table snpArrayIllumina550 add index chrom (chrom, bin);
mysql> alter table snpArrayIllumina650 add index name (name);
mysql> alter table snpArrayIllumina650 add index chrom (chrom, bin);
##########################################################################
# Added gvPos table for Locus Variants (Belinda Giardine Sept 2006)
# This uses the gv* tables in hgFixed for the related data. The track has
# been on hg17, just added to hg18. Most variants were mapped directly to
# hg18; only the LSDB BGMUT was lifted using liftOver.
# Update, reloaded table Dec 2006 Belinda Giardine
# new entries for previous sources and more IDbases
# Update, reloaded table January 2007 Belinda Giardine
# new source (first set of LOVD) and some fixes to IDbases and HbVar
# Update most LSDBs, add more genes for LMDp(LOVD) Jan 11, 2008
# loaded and tested first at PSU
#update old dbs and add dbPEX March 22-23, 2007
#need to truncate and reload all tables (new entries in old)
#prepare positions for loading
cd gvNov2006
cat gvPosARdb.hg17.txt gvPosSrd5a2.hg17.txt gvPosPah.hg17.txt > ../gvMar2007/gvPosNov2006.hg17.txt
cd ../gvMar2007
cat ../gvJan2007/gvPosLOVD.hg17.txt *.hg17.txt > gvPos.Hg17.txt
grep "^chr" gvPos.Hg17.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg17.bed
cd gvNov2006
cat gvPosARdb.hg18.txt gvPosSrd5a2.hg18.txt gvPosPah.hg18.txt > ../gvMar2007/gvPosNov2006.hg18.txt
cd ../gvMar2007
cat ../gvJan2007/gvPosLOVD.hg18.txt *.hg18.txt > gvPos.Hg18.txt
grep "^chr" gvPos.Hg18.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg18.bed
#run checks
~giardine/gv/checkLinksRaFile.pl /cluster/store6/giardine/gvMar2007/
~giardine/gv/checkSeq.pl hg18 < gvPos.Hg18.txt > errors.txt
~giardine/gv/checkSeq.pl hg17 < gvPos.Hg17.txt > errors17.txt
#start reload
hgsql hgFixed < emptyTables.sql
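#(emptyTables.sql is not included here; presumably just along these lines:)
# TRUNCATE TABLE gv; TRUNCATE TABLE gvAttr; TRUNCATE TABLE gvLink;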
#copy and paste from reloadHgFixed.txt
#load new dbs
hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvRettBASE.txt
hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrRettBASE.txt
hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkRettBASE.txt
hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvdbPEX.txt
hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrdbPEX.txt
hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkdbPEX.txt
#load position tables
hgLoadBed hg18 gvPos gvPosSorted.Hg18.bed -noSort -oldTable -tab
hgLoadBed hg17 gvPos gvPosSorted.Hg17.bed -noSort -oldTable -tab
#run remaining checks
select distinct attrType from gvAttr;
select distinct attrType from gvLink;
#and compare against gvAttrTypeKey in hg/lib/gvUi.c
~/gv/joinerChecks.pl table1 IDfield1 table2 IDfield2
#for gv, gvPos, gvSrc, gvAttr, and gvLink
#script to check for non unique rows in database
~/gv/uniqueCheck.pl gvAttr > gvAttrNonunique.txt
~/gv/uniqueCheck.pl gvLink > gvLinkNonunique.txt
#add IPNMDB and reload LOVD with more genes April 12, 2007
cat *.hg17.txt > gvPos.Hg17.txt
grep "^chr" gvPos.Hg17.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg17.bed
cat *.hg18.txt > gvPos.Hg18.txt
grep "^chr" gvPos.Hg18.txt | sort -k1,1 -k2,2n > gvPosSorted.Hg18.bed
#run checks
~giardine/gv/checkLinksRaFile.pl /cluster/store6/giardine/gvMar2007/
~giardine/gv/checkSeq.pl hg18 < gvPos.Hg18.txt > errors.txt
~giardine/gv/checkSeq.pl hg17 < gvPos.Hg17.txt > errors17.txt
#remove old LOVD entries
hgsql hgFixed
delete from gvLink where id like 'FKRP%';
delete from gvAttr where id like 'FKRP%';
delete from gv where id like 'FKRP%';
insert into gvSrc values ('IPNMDB', 'LSDB', 'Mutation Database of Inherited Peripheral Neuropathies');
#load new dbs
hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvLOVD.txt
hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrLOVD.txt
hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkLOVD.txt
hgLoadSqlTab -oldTable hgFixed gv ~/humPhen/kent/src/hg/lib/gv.sql gvIPNMDB.txt
hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrIPNMDB.txt
hgLoadSqlTab -oldTable hgFixed gvLink ~/humPhen/kent/src/hg/lib/gv.sql gvLinkIPNMDB.txt
hgsql hg18
truncate table gvPos;
hgsql hg17
truncate table gvPos;
#load position tables
hgLoadBed hg18 gvPos gvPosSorted.Hg18.bed -noSort -oldTable -tab
hgLoadBed hg17 gvPos gvPosSorted.Hg17.bed -noSort -oldTable -tab
#run remaining checks
select distinct attrType from gvAttr;
select distinct attrType from gvLink;
#and compare against gvAttrTypeKey in hg/lib/gvUi.c
~/gv/joinerChecks.pl table1 IDfield1 table2 IDfield2
#for gv, gvPos, gvSrc, gvAttr, and gvLink
#script to check for non unique rows in database
~/gv/uniqueCheck.pl gvAttr
~/gv/uniqueCheck.pl gvLink
#found missing common names
hgLoadSqlTab -oldTable hgFixed gvAttr ~/humPhen/kent/src/hg/lib/gv.sql gvAttrIPNMDBcommonName.txt
##########################################################################
# hars 1 to 202 Sol 09/10/2006
set bedDir = /gbdb/hg18/haseq/bed
mkdir -p $bedDir/hars
pushd /projects/hg/wet/Sol/hars1to49
cp -p hars_1to202.hg18.bed $bedDir/hars/hars_1to202.bed
hgLoadBed hg18 hars $bedDir/hars/hars_1to202.bed
rm -f $bedDir/hars/hars_1to202.bed
popd
# BUILD HPRD DATA FOR KNOWN GENE DETAILS PAGE LINKS (DONE 9/11/06)
# Download HPRD_XML_060106.tar.gz from www.hprd.org
gzip -d HPRD_XML_060106.tar.gz
tar -xvf HPRD_XML_060106.tar
# This will create 18838 xxxx.xml files under HPRD_XML_060106
# Create hprdToCdna table
echo 'grep -H entry_cdna HPRD_XML_060106/$1.xml' >do1Cdna
ls HPRD_XML_060106 >j
cat j |sed -e 's/.xml/\tdo1Cdna/g' >jj
cut -f 1 jj >j.2
cut -f 2 jj >j.1
paste j.1 j.2 >doAllCdna
chmod +x do*
./doAllCdna >j.cdna
cat j.cdna| sed -e 's/\//\t/' | sed -e 's/.xml/\t/' |\
sed -e 's/<entry_cdna>/\t/' | sed -e 's/<\//\t/'| sed -e 's/\./\t/'| cut -f 2,4|\
grep -v None >hprdToCdna.tab
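# (hedged sketch, not what was run) the same mapping could be built in a single
# pass; only the <entry_cdna> tag and the column layout (hprd id, then cDNA
# accession with its version suffix dropped) come from above, the loop and the
# output name hprdToCdna.alt.tab are illustrative:
for f in HPRD_XML_060106/*.xml
do
    id=`basename $f .xml`
    acc=`grep entry_cdna $f | head -1 | sed -e 's/.*<entry_cdna>//; s/<.*//; s/\..*//'`
    if [ -n "$acc" -a "$acc" != "None" ]; then
        printf "%s\t%s\n" "$id" "$acc"
    fi
done > hprdToCdna.alt.tab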
hgsql hg18 -e 'drop table hprdToCdna'
hgsql hg18 <~/src/hg/lib/hprdToCdna.sql
hgsql hg18 -e 'load data local infile "hprdToCdna.tab" into table hprdToCdna'
# Create hprdToUniProt table
echo 'fgrep -H Swiss HPRD_XML_060106/$1.xml' >do1
ls HPRD_XML_060106 >j
cat j |sed -e 's/.xml/\tdo1/g' >jj
cut -f 1 jj >j.2
cut -f 2 jj >j.1
paste j.1 j.2 >doall
chmod +x do*
./doall >j.out
cat j.out|grep SwissProt | sed -e 's/\//\t/' | sed -e 's/.xml/\t/' | \
sed -e 's/Prot>/\t/' | sed -e 's/<\//\t/'| cut -f 2,4|grep -v None >hprdToUniProt.tab
hgsql hg18 -e 'drop table hprdToUniProt'
hgsql hg18 <~/src/hg/lib/hprdToUniProt.sql
hgsql hg18 -e 'load data local infile "hprdToUniProt.tab" into table hprdToUniProt'
# build knownToHprd table
hgsql hg18 -N -e 'select kgId,hprdId from hprdToCdna, kgXref where cdnaId=kgId' >j.kg1
hgsql hg18 -N -e 'select kgId,hprdId from hprdToUniProt, kgXref where uniProtId=spId' >j.kg2
cat j.kg1 j.kg2 |sort -u >knownToHprd.tab
wc knownToHprd.tab
hgsql hg18 -e 'drop table knownToHprd'
hgsql hg18 <~/src/hg/lib/knownToHprd.sql
hgsql hg18 -e 'load data local infile "knownToHprd.tab" into table knownToHprd'
hgsql hg18 -e 'select count(*) from knownToHprd'
# 19,646 records created.
# remove temporary files.
rm j*
# Do the same for hg17. See hg17.txt for details.
##########################################################################
# ORegAnno: oreganno, oregannoAttr, oregannoLink
# Belinda Giardine August 3, 2007
# updated Oct 26, 2007
# updated July 7, 2008
# This has regulatory annotations from ORegAnno.
# Get updated file from ORegAnno wiki page
# http://www.bcgsc.ca/wiki/display/oreganno/DataFiles
# Parse flat file into 3 tables, truncate tables, load.
# Has other species, but only Human, Fly, and sacCer1 have enough entries for now.
cd /cluster/store6/giardine/oreganno/20071026/
~giardine/oreganno/parseOra hg18 < oreganno_UCSC_25Oct07.txt
hgsql hg18
truncate table oreganno;
truncate table oregannoAttr;
truncate table oregannoLink;
quit;
grep "^chr" oreganno.hg18.txt | sort -k1,1 -k2,2n > oreganno.bed
hgLoadBed hg18 oreganno oreganno.bed -noSort -oldTable -tab
hgLoadSqlTab -oldTable hg18 oregannoAttr \
    ~/humPhen/kent/src/hg/lib/oreganno.sql oregannoAttr.hg18.txt
hgLoadSqlTab -oldTable hg18 oregannoLink \
    ~/humPhen/kent/src/hg/lib/oreganno.sql oregannoLink.hg18.txt
##########################################################################
# LIFT ACEMBLY FROM HG17 TO HG18 (DONE, Fan, 9/28/06)
# OBSOLETED BY LOAD OF NEW DATA, SEE BELOW 8/28/07 angie
# get acembly data from hg17
hgsql hg17 -N -e 'select * from acembly' >hg17Acembly.gp
# lift to hg18
zcat /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz | \
liftOver hg17Acembly.gp stdin acembly.gp unMapped.gp -genePred
# load the genePred table
ldHgGene hg18 acembly -predTab acembly.gp
# get acemblyPep and acemblyClass table from hg17 and load them into hg18.
hgsql hg17 -N -e 'select * from acemblyPep' >acemblyPep.tab
hgsql hg18 -e 'drop table acemblyPep'
hgsql hg18 < ~/src/hg/lib/acemblyPep.sql
hgsql hg18 -e 'load data local infile "acemblyPep.tab" into table acemblyPep'
hgsql hg17 -N -e 'select * from acemblyClass' >acemblyClass.tab
hgsql hg18 -e 'drop table acemblyClass'
hgsql hg18 < ~/src/hg/lib/acemblyClass.sql
hgsql hg18 -e 'load data local infile "acemblyClass.tab" into table acemblyClass'
##########################################################################
# LIFT RNAGENE FROM HG17 TO HG18 (DONE, Robert, 10/3/06)
mkdir /cluster/data/hg18/bed/rnaGene
cd /cluster/data/hg18/bed/rnaGene
hgsql hg18 < rnaGene.sql
liftOver ~/hg17/rnaGene/rnaGenes.tab /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz rnaGenes.bed unmapped -bedPlus=10 -tab
hgLoadBed hg18 rnaGene rnaGenes.bed -oldTable -tab -noBin
##########################################################################
# SWAP/CHAIN/NET GASACU1 (DONE 10/17/06 angie)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.gasAcu1.swap
cd /cluster/data/hg18/bed/blastz.gasAcu1.swap
doBlastzChainNet.pl -swap /cluster/data/gasAcu1/bed/blastz.hg18/DEF \
-chainMinScore=2000 -chainLinearGap=loose >& do.log & tail -f do.log
ln -s blastz.gasAcu1.swap /cluster/data/hg18/bed/blastz.gasAcu1
nice featureBits hg18 chainGasAcu1Link
#55424609 bases of 2881515245 (1.923%) in intersection
##########################################################################
# YALE TRANSCRIPTIONALLY ACTIVE REGIONS (TARs/TransFrags) TRACK IDENTIFIED #
# USING A WHOLE GENOME TILING ARRAY (DONE, 2006-10-12 - 2006-10-13, hartera)
# Data is from the paper: Bertone et al. Science 24 December 2004:
# Vol. 306. no. 5705, pp. 2242 - 2246. From Mark Gerstein's lab at Yale.
# Contact at Yale: Joel S. Rozowsky, joel.rozowsky at yale.edu
# The data consist of Transcriptionally Active Regions (TARs or TransFrags)
# found using Affymetrix genome tiling arrays.
ssh kkstore02
mkdir /cluster/data/hg18/bed/yaleBertoneTars/
cd /cluster/data/hg18/bed/yaleBertoneTars/
# download Bertone et al. data from this URL:
#http://dart.gersteinlab.org/cgi-bin/ar/download.cgi?ID=TAR_data_NCBI31.txt
# and put it in this directory.
# The sequences used to design the microarrays were from
# UCSC hg13/NCBI Build 31 so the sequences
# should be aligned again using Blat since this is probably better
# than using liftOver across so many assemblies.
# Get sequences from TARs file and put in FASTA format:
# Remove characters from Windows:
dos2unix TAR_data_NCBI31.txt
# The TARs are in order of IDs in the file so the first TAR has ID 1, the
# second is 2 up to the last which is 17517. These IDs are used to link
# to the DART database of TARs at Yale so use these IDs in the FASTA
# header lines. Need to add "TAR" as prefix to ID so that it is unique
# in the seq table.
awk 'BEGIN {FS="\t";n=0;}{if ($1 ~ /^chr/) print ">TAR"n"\n"$14"\n";n++;}' \
TAR_data_NCBI31.txt > yaleBertoneTARSeqs.fa
ssh pk
mkdir -p /san/sanvol1/scratch/hg18/TARs/
cp /cluster/data/hg18/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \
/san/sanvol1/scratch/hg18/TARs/
# Set up to Blat the TAR sequences against hg18
cd /cluster/data/hg18/bed/yaleBertoneTars
ls -1 /san/sanvol1/scratch/hg18/TARs/yaleBertoneTARSeqs.fa > tars.lst
ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst
# output dir
mkdir psl
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << for emacs
gensub2 genome.lst tars.lst template.sub para.spec
para create para.spec
para try, para check, para push ...
para time
# Completed: 49 of 49 jobs
#CPU time in finished jobs: 396s 6.61m 0.11h 0.00d 0.000y
#IO & Wait Time: 198s 3.29m 0.05h 0.00d 0.000 y
#Average job time: 12s 0.20m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 39s 0.65m 0.01h 0.00d
#Submission to last job: 253s 4.22m 0.07h 0.00d
# sort and then filter
pslSort dirs raw.psl tmp psl
# use these parameters as for Genbank alignments of native mRNAs
# for finished assemblies.
pslCDnaFilter -minId=0.96 -minCover=0.25 -localNearBest=0.001 \
-minQSize=20 -minNonRepSize=16 -ignoreNs -bestOverlap \
raw.psl yaleBertoneTars.psl
# seqs aligns
# total: 17512 38243
# drop minNonRepSize: 159 403
# drop minIdent: 3822 14798
# drop minCover: 563 895
# weird over: 242 832
# kept weird: 204 210
# drop localBest: 2410 4018
# kept: 17469 18129
# 99.75% were kept.
# check how many aligned
grep '>' yaleBertoneTARSeqs.fa | wc -l
# 17517
# 99.7% of the original set of sequences are in this filtered PSL file.
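# (hedged convenience) the kept fraction can be recomputed straight from the
# filtered PSL by counting distinct query names (field 10 of a 21-field psl line):
awk 'NF==21 {print $10}' yaleBertoneTars.psl | sort -u | wc -l
# then divide by the 17517 input sequences counted above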
pslCheck yaleBertoneTars.psl
# psl is ok
# load into database
ssh hgwdev
cd /cluster/data/hg18/bed/yaleBertoneTars
hgLoadPsl hg18 yaleBertoneTars.psl
# Add sequences to /gbdb/hg18 and to seq and extFile tables.
mkdir -p /gbdb/hg18/yaleTARs/
ln -s /cluster/data/hg18/bed/yaleBertoneTars/yaleBertoneTARSeqs.fa \
/gbdb/hg18/yaleTARs/
hgLoadSeq hg18 /gbdb/hg18/yaleTARs/yaleBertoneTARSeqs.fa
# Add trackDb.ra entry to trackDb/human/trackDb.ra and create
# a description page.
##############################################################################
# Update upstream maf files, fixing a problem of RefSeq ID being truncated. (2006-10-20 Fan)
ssh hgwdev
cd /cluster/data/hg18/bed/multiz17way
cd mafDownloads
# upstream mafs (mafFrags takes a while)
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad
cat up.bad|sed -e "s/_up_${i}_/\t/" >up.bad2
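# the sed above splits the "_up_${i}_" suffix out of the item name so that the
# full RefSeq accession (previously truncated) survives as the name in up.bed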
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, $4, 0, $6)}' up.bad2 > up.bed
rm up.bad up.bad2
nice mafFrags hg18 multiz17way up.bed upstream$i.maf \
-orgs=/cluster/store11/gs.19/build36/bed/multiz17way.2006-02-18/species.lst
rm up.bed
end
date
'EOF'
# << happy emacs
time csh mafFrags.csh > mafFrags.log
nice gzip up*.maf
md5sum up*.gz >> md5sum.txt
#########################################################################
# BLASTZ/CHAIN/NET FELCAT3 (Done Nov 09 2006 heather)
# working in /cluster/data/felCat3 because /cluster/data/hg18 is 96% full
# make this a link in /cluster/data/hg18
mkdir /cluster/data/felCat3/bed/blastz.hg18.2006-11-09
ln -s /cluster/data/felCat3/bed/blastz.hg18.2006-11-09 /cluster/data/hg18/bed/blastz.felCat3
cd /cluster/data/felCat3/bed/blastz.hg18.2006-11-09
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Human Hg18
# Can we use 2bit here?
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cat felCat3
SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit
SEQ2_LEN=/san/sanvol1/scratch/felCat3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/felCat3/bed/blastz.hg18.2006-11-09
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/felCat3/blastz.hg18 >& do.log &
tail -f do.log
nice featureBits -chrom=chr1 hg18 chainFelCat3Link
# 86932463 bases of 224999719 (38.637%) in intersection
# reciprocal best net mafs for multiz
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 felCat3 >&! rbest.log &
#########################################################################
# BLASTZ/CHAIN/NET BOSTAU3 (Done Feb 2007 heather)
mkdir /cluster/data/hg18/bed/blastz.bosTau3.2007-02-23
ln -s /cluster/data/hg18/bed/blastz.bosTau3.2007-02-23 /cluster/data/hg18/bed/blastz.bosTau3
cd /cluster/data/hg18/bed/blastz.bosTau3
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau3
SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit
SEQ2_LEN=/san/sanvol1/scratch/bosTau3/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=50000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.bosTau3.2007-02-23
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/bosTau3/blastz.hg18 >& do.log &
tail -f do.log
nice featureBits -chrom=chr1 hg18 chainBosTau3Link
# 114562908 bases of 224999719 (50.917%) in intersection
##############################################################################
# MAKE tfbsConsSites and tfbsConsFactors for TFBS conserved track (DONE weirauch 11/19/06)
# Questions? weirauch at soe.ucsc.edu or braney at soe.ucsc.edu
ssh hgwdev
mkdir /cluster/data/hg18/bed/tfbsCons
cd /cluster/data/hg18/bed/tfbsCons
# Define all parameters in 'PARAMS.txt'
# Define all chromosomes in 'CHROMS.txt'
# Get tfbsConsUtils.tar.gz (Perl scripts) from Matt Weirauch, weirauch at soe.ucsc.edu
set tarfile=/cluster/data/hg18/bed/tfbsCons/tfbsConsUtils.tar.gz
tar zxf $tarfile
nice ./getRefseqStats.pl &
nice ./getBatchQueries.pl &
ssh kk
mkdir /cluster/bluearc/braney/tfloc
# Copy ./tmp/ctfbs_batch_list.txt to this dir
# Copy ./scripts/doit to this dir
para create ctfbs_batch_list.txt
para try
para push
# When the run is done (within a day or so), the results will be in individual dirs, one for each chromosome.
ssh kksilo   # (or hgwdev, or whatever)
nice ./getBedFile.pl &
hgLoadBed -noSort hg18 tfbsConsSites -sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed -tab
hgLoadBed -noSort hg18 tfbsConsFactors -sqlTable=$HOME/kent/src/hg/lib/tfbsConsFactors.sql tfbsConsFactors.bed -tab
# Feel free to delete or gzip anything in ./tmp (particularly the huge .maf and .bed files) after the final two bed files are successfully loaded
# fixed up the tfbsConsSites.bed file to remove extra indexes, then:
hgsql -e "drop index chrom_2 on tfbsConsSites;" hg18
hgsql -e "drop index chrom_3 on tfbsConsSites;" hg18
# the tfbsConsFactors table had extra names, they were removed:
for N in `cat extra.tfbsConsFactors.name`
do
echo "delete from tfbsConsFactors where name=\"${N}\";" hg18
hgsql -e "delete from tfbsConsFactors where name=\"${N}\";" hg18
done
# the extra names were:
# B$CRP_C F$DDE1_B F$STRE_01 P$GBP_Q6 V$ACAAT_B V$APOLYA_B V$ATATA_B
# V$BARBIE_01 V$BEL1_B V$CAAT_01 V$CAAT_C V$CAP_01 V$DTYPEPA_B V$E2F_Q2
# V$ETF_Q6 V$ETS_Q6 V$GC_01 V$GEN_INI2_B V$GEN_INI3_B V$GEN_INI_B V$HFH8_01
# V$HOGNESS_B V$LBP1_Q6 V$LDSPOLYA_B V$LEF1_Q2 V$LPOLYA_B V$MEF3_B V$MINI19_B
# V$MINI20_B V$MTATA_B V$MUSCLE_INI_B V$PADS_C V$PEA3_Q6 V$POLY_C V$SRY_01
# V$STAT4_01 V$STAT5A_03 V$STAT5A_04 V$STAT6_02 V$TAACC_B V$TANTIGEN_B
# V$TEF1_Q6 V$USF2_Q6
# And re-load once again since the above data was based on transfac data that
# is too new (2006-11-03 - Hiram)
cd /cluster/data/hg18/bed/tfbsCons
hgLoadBed -tab -strict hg18 tfbsConsSites \
-sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed
# And this leads once again to a bunch of extra names in Factors
hgsql -N -e "select name from tfbsConsSites;" hg18 | sort -u > names.new
hgsql -N -e "select name from tfbsConsFactors;" hg18 \
| sort -u > names.factors
comm -13 names.new names.factors > names.extra.factors
for N in `cat names.extra.factors`
do
echo "delete from tfbsConsFactors where name=\"${N}\";" hg18
hgsql -e "delete from tfbsConsFactors where name=\"${N}\";" hg18
done
# Reload tfbsCons to correct errors (2007-07-17 - Hiram)
cd /cluster/data/hg18/bed/tfbsCons
hgLoadBed -tab hg18 tfbsConsSites \
-sqlTable=$HOME/kent/src/hg/lib/tfbsConsSites.sql tfbsConsSites.bed
hgsql -N -e "select name from tfbsConsSites;" hg18 | sort -u \
> names.new.2007-07-17
# showing zero difference still, nothing more to be done
comm -13 names.new.2007-07-17 names.factors
##############################################################################
# REWORK PLACE ASSEMBLY CLONES ON CONTIGS AND SEQUENCE
# (WORKING - 2006-10-23 - Hiram)
# five different cluster runs are described here for different classes
# of clones
# runPlacedNotSplit - all placed clones split or not split with blat
# runFish - 392 fish clones against all 378 contigs, with blat
# runUnPlaced - 14,569 clones on known contigs - with psLayout
# runUnPlacedChr - 297 clones on known chroms - with psLayout
# runLastOnes - 1,877 clones against 378 contigs - with blat
# The original run of this forgot to split up the BAC clones that were just
# a fasta file full of unordered pieces. They need to be split up
# to work properly.
ssh pk
mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23
# Going to copy over the BAC clones from the previous runs and split
# them up if they have too many N's (>100) (indicating pieces)
# This may actually split up a couple of BACs that are not actually
# pieces, but in the cases I could find, and they were rare, the big
# BACs appear to break into only two pieces.
# The first set to do are the clones that were used in the assembly
# Since they were placed, we know where they all belong. Only 50 of
# them end up being split, and then usually only in 2 pieces.
# We could tediously go through each of these 50 and determine if they
# are actually unordered pieces. Although this raises the question,
# how could unordered pieces be used in the assembly ? Doesn't make any
# sense.
cat << '_EOF_' > placedClones.sh
#!/bin/sh
D0=placedNotSplit
D1=placedSplit
export D0 D1
find ../coverage/placedClones -type f | grep -v faCount.all.txt | while read F
do
BN=`basename "${F}"`
DN=`dirname "${F}"`
CHROM=`basename "${DN}"`
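# pull the N count out of faSize's summary line, which looks like:
# "<size> bases (<nCount> N's <real> real ...)"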
Ncount=`faSize "${F}" | sed -e "s/N's.*//; s/.* bases (//;"`
if [ "${Ncount}" -gt 99 ]; then
out="${D1}/${CHROM}/${BN}"
mkdir -p ${D1}/${CHROM}
echo "gapSplit -minGap=100 ${F} ${out}"
gapSplit -minGap=100 ${F} stdout | gzip > ${out}
faSize "${F}"
faSize "${out}"
else
out="${D0}/${CHROM}/${BN}"
mkdir -p ${D0}/${CHROM}
echo "cp -p ${F} ${out}"
cp -p ${F} ${out}
fi
done
'_EOF_'
# << happy emacs
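# (assumed run step, not recorded above) the split script would have been run
# along these lines before the cluster run below:
# chmod +x placedClones.sh
# ./placedClones.sh > placedClones.log 2>&1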
# Going to use blat this time instead of psLayout
# It is faster and appears to do just about the same exact job
mkdir runPlacedNotSplit
cd runPlacedNotSplit
# Re-use the previous jobList
sed -e "s/runPsLayout.sh/runBlat.csh/" \
../../coverage/runPlaced/masterJobList > jobList
cat << '_EOF_' > runBlat.csh
#!/bin/csh -fe
set chrom = $1
set clone = $2
set contig = $3
set result = $4
set target = /san/sanvol1/scratch/hg18/coverage/maskedContigs/$chrom/$contig.fa.gz
set query = /san/sanvol1/scratch/hg18/coverage.2006-10-23/placedNotSplit/$chrom/$clone.fa.gz
if ( ! -f $query ) then
set query = /san/sanvol1/scratch/hg18/coverage.2006-10-23/placedSplit/$chrom/$clone.fa.gz
endif
set scrTmp = "/scratch/tmp/$contig/$clone"
set ooc = /san/sanvol1/scratch/hg18/coverage/contigOoc/$contig.10.ooc
mkdir -p "$scrTmp"
zcat $target > $scrTmp/$contig.fa
zcat $query > $scrTmp/$clone.fa
cp -p $ooc $scrTmp/10.ooc
pushd $scrTmp
pwd
ls -l
blat -minIdentity=98 -fastMap -tileSize=10 -t=dna -q=dna -ooc=10.ooc $contig.fa $clone.fa $clone.psl
popd
mkdir -p psl/$chrom/$contig
cp -p $scrTmp/$clone.psl $result
rm $scrTmp/*
rmdir $scrTmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$contig
'_EOF_'
# << happy emacs
para create jobList
para try; para check; etc ...
para time
# Completed: 27093 of 27093 jobs
# CPU time in finished jobs: 435042s 7250.69m 120.84h 5.04d 0.014 y
# IO & Wait Time: 74031s 1233.86m 20.56h 0.86d 0.002 y
# Average job time: 19s 0.31m 0.01h 0.00d
# Longest finished job: 463s 7.72m 0.13h 0.01d
# Submission to last job: 3079s 51.32m 0.86h 0.04d
# combine the results into one large raw.psl file
time pslSort dirs raw.psl tmp psl/*/*
ls -og raw.psl
# -rw-rw-r-- 1 52067774 Oct 31 12:06 raw.psl
# This raw.psl file will be included in the overall results, but as a
# check, it is possible to turn just these results into a .bed file for
# uploading as a custom track to take a look at them.
time pslReps -nohead -nearTop=0.001 -singleHit \
raw.psl repsSingle.psl /dev/null
clusterClone -allowDuplicates -agp -minCover=80 \
-maxGap=60000 repsSingle.psl > single.agp 2> single.out
sort -k1,1 -k2,2n single.agp | ../../coverage/fixPhase.pl \
/cluster/data/hg18/bed/coverage/phase.txt > contig_overlaps.agp \
2> singleToOverlaps.out
awk -F'\t' '{printf "%s\t%s\t%s\t%s\t0\t%s\n", $1,$2,$3,$6,$9}' \
contig_overlaps.agp > cOverlaps.bed
liftUp chrOverlaps.bed /san/sanvol1/scratch/hg18/bacends/liftContigs.lft \
warn cOverlaps.bed
# Load up that chrOverlaps.bed as a custom track to see these results
##################################################################
# The next big group are the FISH clones
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23
# Split or not split depending on gap count >= 100
cat << '_EOF_' > splitFishClones.sh
#!/bin/sh
D0=fishSplit
export D0
find ../coverage/fishClones/sequence -type f | while read F
do
BN=`basename "${F}"`
Ncount=`faSize "${F}" | sed -e "s/N's.*//; s/.* bases (//;"`
if [ "${Ncount}" -gt 99 ]; then
out="${D0}/fishPieces/${BN}"
echo "gapSplit -minGap=100 ${F} ${out}"
gapSplit -minGap=100 ${F} stdout | gzip > ${out}
faSize "${F}"
faSize "${out}"
else
out="${D0}/noPieces/${BN}"
echo "cp -p ${F} ${out}"
cp -p "${F}" "${out}"
fi
done
'_EOF_'
# << happy emacs
mkdir fishSplit
chmod +x splitFishClones.sh
time ./splitFishClones.sh
# combine them all into large fasta files to lower the file count
cd fishSplit
for F in fishPieces/* noPieces/*
do
zcat "${F}"
done | gzip > all.fa.gz
faSplit about all.fa.gz 500000 split/f_
mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/runFish
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/runFish
ls ../fishSplit/split | sed -e "s/.fa.gz//" > fish.list
ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/?/* | \
sed -e \
"s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \
> contig.list
ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/??/* | \
sed -e \
"s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \
>> contig.list
ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/?_*/* | \
sed -e \
"s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \
>> contig.list
ls /san/sanvol1/scratch/hg18/coverage/maskedContigs/??_*/* | \
sed -e \
"s#/san/sanvol1/scratch/hg18/coverage/maskedContigs/##; s#.fa.gz##" \
>> contig.list
cat << '_EOF_' > template
#LOOP
./runBlat.csh $(path1) $(path2) {check out line+ psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
cat << '_EOF_' > runBlat.csh
#!/bin/csh -fe
set target = /san/sanvol1/scratch/hg18/coverage/maskedContigs/$1.fa.gz
set query = /san/sanvol1/scratch/hg18/coverage.2006-10-23/fishSplit/split/$2.fa.gz
set contig = $target:t:r:r
set fishPiece = $query:t:r:r
set result = psl/$contig/$fishPiece.psl
set scrTmp = "/scratch/tmp/$contig/$fishPiece"
set ooc = /san/sanvol1/scratch/hg18/coverage/contigOoc/$contig.10.ooc
mkdir -p "$scrTmp"
zcat $target > $scrTmp/$contig.fa
zcat $query > $scrTmp/$fishPiece.fa
cp -p $ooc $scrTmp/10.ooc
pushd $scrTmp
pwd
ls -l
blat -fastMap -tileSize=10 -t=dna -q=dna -ooc=10.ooc $contig.fa $fishPiece.fa $fishPiece.psl
popd
mkdir -p psl/$contig
cp -p $scrTmp/$fishPiece.psl $result
rm $scrTmp/*
rmdir $scrTmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$contig
'_EOF_'
# << happy emacs
chmod +x runBlat.csh
gensub2 contig.list fish.list template jobList
para create jobList
para try; para check; etc ...
para time
# Completed: 148176 of 148176 jobs
# CPU time in finished jobs: 2884533s 48075.56m 801.26h 33.39d 0.091 y
# IO & Wait Time: 385142s 6419.03m 106.98h 4.46d 0.012 y
# Average job time: 22s 0.37m 0.01h 0.00d
# Longest finished job: 270s 4.50m 0.07h 0.00d
# Submission to last job: 9510s 158.50m 2.64h 0.11d
# put all the results together into a single file
pslSort dirs raw.psl tmp psl/*
# this is a big result
ls -og raw.psl
# -rw-rw-r-- 1 6972351482 Oct 25 16:25 raw.psl
# can do the same thing as above to look at these results individually
# not listed here
##################################################################
# The next big group are the unplaced clones. In the original run, the
# contig location of these items were inferred from Hg17 results, and
# thus many of them can be aligned against their respective contig. For
# some cases, the contig isn't known, but the chrom is, thus they can be
# aligned to all the contigs for a chrom. And finally, those completely
# unknown have to be aligned to all contigs.
# There are two sections here, those for which contig details are
# unknown, and those for which contigs are known. First, those for
# which details are unknown:
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23
cat << '_EOF_' > splitUnplacedClones.sh
#!/bin/sh
find ../coverage/unPlacedClones -type f | while read F
do
BN=`basename "${F}"`
DN=`dirname "${F}"`
CONTIG=`basename "${DN}"`
DN=`dirname "${DN}"`
CHROM=`basename "${DN}"`
out="unPlacedSplit/${CHROM}/${CONTIG}/${BN}"
# echo "${CHROM}/${CONTIG}/${BN}"
mkdir -p unPlacedSplit/${CHROM}/${CONTIG}
Ncount=`faSize "${F}" | sed -e "s/N's.*//; s/.* bases (//;"`
if [ "${Ncount}" -gt 99 ]; then
echo "gapSplit -minGap=100 ${F} ${out}"
gapSplit -minGap=100 ${F} stdout | gzip > ${out}
faSize "${F}"
faSize "${out}"
fi
done
'_EOF_'
# << happy emacs
chmod +x splitUnplacedClones.sh
mkdir unPlacedSplit
time ./splitUnplacedClones.sh > unPlaced.out 2>&1
mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlacedChr
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlacedChr
ls ../unPlacedSplit/*/XX*/*.fa.gz > bac.list
cat << '_EOF_' > mkJobList.sh
#!/bin/sh
cat bac.list | while read F
do
CHR=`echo "${F}" | sed -e "s#.*unPlacedSplit/##; s#/.*##"`
CLONE=`basename ${F} | sed -e "s/.fa.gz//"`
case $CHR in
U|Un)
for C in /san/sanvol1/scratch/hg18/coverage/maskedContigs/? \
/san/sanvol1/scratch/hg18/coverage/maskedContigs/?? \
/san/sanvol1/scratch/hg18/coverage/maskedContigs/?_* \
/san/sanvol1/scratch/hg18/coverage/maskedContigs/??_*
do
CH=`basename ${C}`
for CT in /san/sanvol1/scratch/hg18/coverage/maskedContigs/${CH}/*
do
CONTIG=`basename ${CT} | sed -e "s/.fa.gz//"`
echo "./runPsLayout.sh $CH $CLONE $CONTIG {check out line+ psl/$
CH/$CONTIG/$CLONE.psl}"
done
done
;;
*)
for CT in /san/sanvol1/scratch/hg18/coverage/maskedContigs/${CHR}/*
do
CONTIG=`basename ${CT} | sed -e "s/.fa.gz//"`
echo "./runPsLayout.sh $CHR $CLONE $CONTIG {check out line+ psl/$CHR
/$CONTIG/$CLONE.psl}"
done
;;
esac
done
'_EOF_'
# << happy emacs
chmod +x mkJobList.sh
./mkJobList.sh > jobList
cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
# runPsLayout.sh <chrom> <clone> <contig>
# where <chrom> is the chrom this contig is on
# <clone> is one of the .fa.gz files in
# /san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/<chrom>/<clone>.fa.gz
# <contig> is one of the contigs found in:
# /san/sanvol1/scratch/hg18/coverage/maskedContigs/<chrom>/<contig>.fa.gz
#
HERE=`pwd`
CHROM=$1
CLONE=$2
CONTIG=$3
TARGET=/san/sanvol1/scratch/hg18/coverage/maskedContigs/$CHROM/$CONTIG.fa.gz
CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/$CHROM/XX_000000/$CLONE.fa.gz
OOC=/san/sanvol1/scratch/hg18/coverage/maskedContigs/ooc/$CHROM/$CONTIG.10.ooc
RESULT="${HERE}/psl/${CHROM}/${CONTIG}/${CLONE}.psl"
export CHROM CLONE CONTIG TARGET CLONESRC RESULT
mkdir -p psl/${CHROM}/${CONTIG}
if [ ! -s ${CLONESRC} ]; then
CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/U/XX_000000/$CLONE.fa.gz
if [ ! -s ${CLONESRC} ]; then
CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/Un/XX_000000/$CLONE.fa.gz
if [ ! -s ${CLONESRC} ]; then
echo "Can not find: ${CLONESRC}" 1>/dev/stderr
exit 255
fi
fi
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${OOC} ]; then
echo "Can not find: ${OOC}" 1>/dev/stderr
exit 255
fi
WRKDIR="/scratch/tmp/hg18_${CHROM}/${CONTIG}/${CLONE}"
mkdir -p "${WRKDIR}"
cd ${WRKDIR}
zcat ${CLONESRC} > ${CLONE}.fa
zcat ${TARGET} > ${CONTIG}.fa
cp -p ${OOC} ./10.ooc
/cluster/bin/x86_64/psLayout ${CONTIG}.fa ${CLONE}.fa genomic 10.ooc ${RESULT}
RET=$?
cd ${HERE}
rm -fr ${WRKDIR}
rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}/${CONTIG}"
rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}"
exit ${RET}
'_EOF_'
# << happy emacs
chmod +x ./runPsLayout.sh
mkdir psl
para create jobList
para try; para check; ... etc ...
para time
# Completed: 40509 of 40509 jobs
# CPU time in finished jobs: 5354801s 89246.69m 1487.44h 61.98d 0.170 y
# IO & Wait Time: 115279s 1921.31m 32.02h 1.33d 0.004 y
# Average job time: 135s 2.25m 0.04h 0.00d
# Longest finished job: 164276s 2737.93m 45.63h 1.90d
# Submission to last job: 187712s 3128.53m 52.14h 2.17d
# combine into one result file
pslSort dirs raw.psl tmp psl/*/*
##################################################################
# Now, for those unplaced clones for which contig details are known
ssh pk
mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlaced
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/runUnPlaced
cat << '_EOF_' > mkJobList.sh
#!/bin/sh
find ../unPlacedSplit -type f | grep -v XX_ | while read F
do
BN=`basename ${F} | sed -e "s/.fa.gz//"`
DN=`dirname ${F}`
CONTIG=`basename ${DN}`
DN=`dirname ${DN}`
CHROM=`basename ${DN}`
echo "./runPsLayout.sh ${CHROM} ${BN} ${CONTIG} {check out line+ psl/${CHROM
}/${CONTIG}/${BN}.psl}"
done
'_EOF_'
# << happy emacs
chmod +x mkJobList.sh
./mkJobList.sh > jobList
cat << '_EOF_' > runPsLayout.sh
#!/bin/sh
# runPsLayout.sh <chrom> <clone> <contig>
# where <chrom> is the chrom this contig is on
# <clone> is one of the .fa.gz files in
# /san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/<chrom>/<clone>.fa.gz
# <contig> is one of the contigs found in:
# /san/sanvol1/scratch/hg18/coverage/maskedContigs/<chrom>/<contig>.fa.gz
#
HERE=`pwd`
CHROM=$1
CLONE=$2
CONTIG=$3
TARGET=/san/sanvol1/scratch/hg18/coverage/maskedContigs/$CHROM/$CONTIG.fa.gz
CLONESRC=/san/sanvol1/scratch/hg18/coverage.2006-10-23/unPlacedSplit/$CHROM/$CONTIG/$CLONE.fa.gz
OOC=/san/sanvol1/scratch/hg18/coverage/maskedContigs/ooc/$CHROM/$CONTIG.10.ooc
RESULT="${HERE}/psl/${CHROM}/${CONTIG}/${CLONE}.psl"
mkdir -p psl/${CHROM}/${CONTIG}
if [ ! -s ${CLONESRC} ]; then
echo "Can not find: ${CLONESRC}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${TARGET} ]; then
echo "Can not find: ${TARGET}" 1>/dev/stderr
exit 255
fi
if [ ! -s ${OOC} ]; then
echo "Can not find: ${OOC}" 1>/dev/stderr
exit 255
fi
WRKDIR="/scratch/tmp/hg18_${CHROM}/${CONTIG}/${CLONE}"
mkdir -p "${WRKDIR}"
cd ${WRKDIR}
zcat ${CLONESRC} > ${CLONE}.fa
zcat ${TARGET} > ${CONTIG}.fa
cp -p ${OOC} ./10.ooc
/cluster/bin/x86_64/psLayout ${CONTIG}.fa ${CLONE}.fa genomic 10.ooc ${RESULT}
RET=$?
cd ${HERE}
rm -fr ${WRKDIR}
rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}/${CONTIG}"
rmdir --ignore-fail-on-non-empty "/scratch/tmp/hg18_${CHROM}"
exit ${RET}
'_EOF_'
# << happy emacs
chmod +x runPsLayout.sh
para create jobList
para try; para check; ... etc ...
para time
# Completed: 14569 of 14569 jobs
# CPU time in finished jobs: 4863551s 81059.19m 1350.99h 56.29d 0.154 y
# IO & Wait Time: 64196s 1069.93m 17.83h 0.74d 0.002 y
# Average job time: 338s 5.64m 0.09h 0.00d
# Longest finished job: 36681s 611.35m 10.19h 0.42d
# Submission to last job: 68213s 1136.88m 18.95h 0.79d
# combine into a single result
pslSort dirs raw.psl tmp psl/*/*
# combine into a single result
time pslSort dirs raw.psl tmp psl/*
# real 550m57.744s
# user 324m56.251s
# sys 10m15.358s
ls -og raw.psl
# -rw-rw-r-- 1 39273644954 Nov 2 20:23 raw.psl
# Wow ...
time pslReps -nohead -nearTop=0.001 -singleHit \
raw.psl repsSingle.psl /dev/null
# real 15m14.462s
# user 13m6.580s
# sys 1m50.304s
ls -og repsSingle.psl
# -rw-rw-r-- 1 73403317 Nov 3 09:44 repsSingle.psl
###########################################################
# And now, combining all results together
mkdir /san/sanvol1/scratch/hg18/coverage.2006-10-23/finalPsl
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23/finalPsl
ln -s ../runLastOnes/repsSingle.psl lastOnes.psl
ln -s ../runFish/raw.psl fish.psl
ln -s ../runUnPlaced/raw.psl unPlaced.psl
ln -s ../runUnPlacedChr/raw.psl unPlacedChr.psl
ln -s ../runPlacedNotSplit/raw.psl placed.psl
cd /san/sanvol1/scratch/hg18/coverage.2006-10-23
time pslSort dirs raw.psl tmp finalPsl
# real 18m53.770s
# user 12m19.002s
# sys 1m17.504s
ls -og raw.psl
# -rw-rw-r-- 1 7742802124 Nov 3 10:10 raw.psl
time pslReps -nohead -nearTop=0.001 -singleHit \
raw.psl repsSingle.psl /dev/null
clusterClone -allowDuplicates -agp -minCover=80 \
-maxGap=60000 repsSingle.psl > single.agp 2> single.out
sort -k1,1 -k2,2n single.agp | ../coverage/fixPhase.pl \
/cluster/data/hg18/bed/coverage/phase.txt > contig_overlaps.agp \
2> singleToOverlaps.out
awk -F'\t' '{printf "%s\t%s\t%s\t%s\t0\t%s\n", $1,$2,$3,$6,$9}' \
contig_overlaps.agp > cOverlaps.bed
liftUp chrOverlaps.bed /san/sanvol1/scratch/hg18/bacends/liftContigs.lft \
warn cOverlaps.bed
# Load up that chrOverlaps.bed as a custom track to see these results
# And back to the original business of eliminating obsolete clones
awk '{print $6}' contig_overlaps.agp | sort -u > clone.coverage.list
time $HOME/kent/src/hg/makeDb/hgClonePos/ckMultipleVersions.pl \
clone.coverage.list > /dev/null 2> obsolete.clones
time $HOME/kent/src/hg/makeDb/hgClonePos/removeObsoleteClones.sh \
contig_overlaps.agp obsolete.clones > clean_overlaps.agp
# looks like it removes 295 lines
wc -l contig_overlaps.agp clean_overlaps.agp
# 613577 contig_overlaps.agp
# 613507 clean_overlaps.agp
mv contig_overlaps.agp contig_overlapsWithObsoletes.agp
mv clean_overlaps.agp contig_overlaps.agp
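# (hedged sketch of the idea, not the actual ckMultipleVersions.pl logic)
# obsolete clones are accessions present with more than one version; those
# accessions can be listed with something like:
sed -e 's/\.[0-9]*$//' clone.coverage.list | sort | uniq -d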
cd /cluster/data/hg18
# save all existing .gl files before we overwrite them all
tar cvzf ./save.glFiles.tgz ./?/*.gl ./??/*.gl ./?_*/*.gl \
./??_*/*.gl ./?/*/*.gl ./??/*/*.gl ./?_*/*/*.gl ./??_*/*/*.gl
time agpToGl contig_overlaps.agp . -md=seq_contig.md
# real 1m4.253s
time ./jkStuff/liftGl.csh contig.gl
# saw some errors such as: NT_113974/contig.gl doesn't exist, skipping
# I'm guessing they were contigs with no alignment results
# capture these new .gl files for future reference
tar cvzf ./new.glFiles.tgz ./?/*.gl ./??/*.gl ./?_*/*.gl \
./??_*/*.gl ./?/*/*.gl ./??/*/*.gl ./?_*/*/*.gl ./??_*/*/*.gl
# now reload all the _gold, _gap and _gl tables
# Tested this load on a dummy database and found that the contents of
# the gold and gap tables do not change
hgGoldGapGl -chromLst=chrom.lst hg18 /cluster/store11/gs.19 build36
# Then hgClonePos uses those tables to create the Coverage track
# table: clonePos
hgClonePos -maxErr=600 -maxWarn=50000 -chromLst=chrom.lst \
hg18 /cluster/data/hg18 ./cleanedSequence.inf /cluster/store11/gs.19 \
> updated.clone.pos.errors 2>&1
# Now let's check for clones that are excessively wrong
cd /tmp
hgsql -N -e \
"select chrom,chromStart,chromEnd,name,chromEnd-chromStart,seqSize from clonePos;" \
hg18 > clonePos.hg18.lengths
awk '{if ($6 > 0) { printf "%.2f\t%s\n", 100.0*$5/$6,$0}}' \
clonePos.hg18.lengths | sort -n > clonePos.hg18.deviations
# Looking at that list of deviations, there are still a number of them
# that are extreme deviants, but there are far fewer than there were
# before. Previously:
ave clonePos.hg18.deviations
# Q1 100.000000
# median 100.000000
# Q3 109.172500
# average 350.043843
# min 80.000000
# max 23574.310000
# count 44978
# total 15744271.980000
# standard deviation 851.762186
# Over 3,500 of them were more than 10 times too large:
awk '{if ($1 > 1000) {print}}' clonePos.hg18.deviations | wc
# 3881 27167 223039
# This new lot:
ave clonePos.hg18.deviations
# Q1 100.000000
# median 100.000000
# Q3 100.360000
# average 140.353820
# min 0.250000
# max 40838.840000
# count 43734
# total 6138233.960000
# standard deviation 381.871589
# Only 277 are more than 10 times too large:
awk '{if ($1 > 1000) {print}}' clonePos.hg18.deviations | wc
# 277 1939 15747
# QA NOTE: ran mytouch on the *gold and *gap tables because the values were
# unchanged, but they got a new date/time in the above process (ASZ
# 11-14-2006):
# sudo mytouch hg18 'chr*_gold' 200604060800.00
# sudo mytouch hg18 'chr*_gap' 200604060800.00
##############################################################################
# LongSAGE (2006-10-20 markd)
# Load LongSAGE composite track with genome mappings of tag clusters
# obtained from "Martin Hirst" <mhirst at bcgsc.ca>
ftp ftp2.bcgsc.ca
user: ucsc
<password from martin >
download SHE*_u.map
chmod a-w *.map
~/compbio/kent/src/hg/makeDb/outside/bcgscSage/bcgscSageLoad hg18 *_u.map
####################################################################
# MAKE UNIGENE/SAGE TRACK (DONE - 2006-11-20 Fan)
# Create the uniGene alignments
# /cluster/data/hg18/uniGene/hg18.uniGene.lifted.pslReps.psl
# Download of the latest UniGene version is now automated by a
# cron job -- see /cluster/home/angie/crontab ,
# /cluster/home/angie/unigeneVers/unigene.csh .
# If hgwdev gets rebooted, that needs to be restarted... maybe there's
# a more stable place to set up that cron job.
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed
mkdir uniGene
cd uniGene
set Version = 196
zcat /cluster/store7/uniGene/uniGene.$Version/Hs.seq.uniq.gz|\
sed -e "s#>.*/ug=#>#; s# /len.*##;" > Hs.seq.uniq.simpleHeader.fa
ssh pk
set Version = 196
mkdir -p /san/sanvol1/scratch/hg18/uniGene/
cd /san/sanvol1/scratch/hg18/uniGene/
cp -p /cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa .
ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst
ls -1S \
/cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa \
> uniGene.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=95 -ooc=/san/sanvol1/scratch/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 genome.lst uniGene.lst template.sub para.spec
para create para.spec
mkdir psl
para try
para check
para push
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 46855s 780.92m 13.02h 0.54d 0.001 y
# IO & Wait Time: 240s 3.99m 0.07h 0.00d 0.000 y
# Average job time: 961s 16.02m 0.27h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3629s 60.48m 1.01h 0.04d
# Submission to last job: 4337s 72.28m 1.20h 0.05d
pslSort dirs raw.psl tmp psl >& pslSort.log
cat raw.psl|\
pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
stdin hg18.uniGene.pslReps.psl /dev/null
# Processed 553470 alignments
gzip raw.psl
gzip Hs.seq.uniq.simpleHeader.fa
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed/uniGene
cp -p /san/sanvol1/scratch/hg18/uniGene/hg18.uniGene.pslReps.psl .
hgLoadPsl -table=uniGene_3 hg18 hg18.uniGene.pslReps.psl
####################################################################
# EXONIPHY (2006-12-05 acs)
# predictions provided by Brona Brejova in Siepel Lab (bb248 at cornell.edu).
# stored in /cluster/data/hg18/bed/exoniphy/exoniphy.gff
ldHgGene -genePredExt -gtf hg18 exoniphy exoniphy.gff
####################################################################
# HapMap CNVRs (copy number variable regions) from Matt Hurles (Heather Dec. 2006)
# Change bed3 to bed6 to match hg17
cd /cluster/data/hg18/bed/sv
redon.pl < cnpRedon.hg18 > redon.bed
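# (hedged sketch of what redon.pl does; the name/score/strand values here are
# only illustrative) a bed3 -> bed6 conversion is just adding three columns:
# awk 'BEGIN{OFS="\t"}{print $1,$2,$3,"cnp"NR,1000,"+"}' cnpRedon.hg18 > redon.bed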
hgLoadBed hg18 cnpRedon redon.bed
#########################################################
# Structural Variation from Lars Feuk (Heather Jan - April 2007)
# These tables are all tiny so I'm not using indices
# I kept the bin column in all but Sebat, but I could have done without that too
ssh hgwdev
cd /cluster/data/hg18/bed/sv
# 8 *txt files from Lars
# Sharp (format different from hg17)
cp Sharp*txt sharp.in
# use editor to remove header from sharp.in
# grab the data we need
sharp.pl < sharp.in > sharp.prelim
# adjust
sharp2.pl < sharp.prelim > sharp.bed
hgLoadBed hg18 cnpSharp2 sharp.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/cnpSharp2.sql
# Iafrate (format different from hg17)
cp Iafrate*txt iafrate.in
# use editor to change TABTAB to TAB0TAB and get rid of header
iafrate.pl < iafrate.in > iafrate.bed
hgLoadBed hg18 cnpIafrate2 iafrate.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/cnpIafrate2.sql
# Sebat (format different from hg17)
cp Sebat*txt sebat.in
# use editor to get rid of header
sebat.pl < sebat.in > sebat.bed
hgLoadBed hg18 cnpSebat2 sebat.bed -noBin -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/cnpSebat2.sql
# Tuzun (I called this cnpFosmid in hg17)
# simple bed 4 .
cp Tuzun*txt tuzun.in
# use editor to get rid of header
tuzun.pl < tuzun.in > tuzun.bed
hgLoadBed hg18 cnpTuzun tuzun.bed -tab
# McCarroll (same format as hg17, simple bed 4 .)
# need to sort and assign ids
cp McCarroll*txt mccarroll.in
# use editor to get rid of header
mccarroll.pl < mccarroll.in > mccarroll.prelim
sort -g mccarroll.prelim > mccarroll.sort
# sort isn't perfect, use editor to finish
mccarroll2.pl < mccarroll.sort > mccarroll.bed
hgLoadBed hg18 delMccarroll mccarroll.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/delMccarroll.sql
# Conrad (different format from hg17)
cp Conrad*txt conrad.prelim
# use editor to shorten "Study" column
conrad.pl < conrad.prelim > conrad.prelim2
cp conrad.prelim2 conrad.prelim3
# use editor to sort conrad.prelim3 (lame)
# assign Ids
conradId.pl < conrad.prelim3 > conrad.bed
hgLoadBed hg18 delConrad2 conrad.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/delConrad2.sql
# Hinds (different format from hg17)
cp Hinds*txt hinds.in
# use editor to remove header
hinds.pl < hinds.in > hinds.prelim
sort -g hinds.prelim > hinds.sort
# sort isn't perfect, use editor to finish
hinds2.pl < hinds.sort > hinds.bed
hgLoadBed hg18 delHinds2 hinds.bed -tab -sqlTable=/cluster/home/heather/kent/src/hg/lib/delHinds2.sql
# Locke (new data)
cp Locke*txt locke.in
locke.pl < locke.in > locke.prelim
sort -g locke.prelim > locke.??
locke2.pl
#########################################################
# BUILD GAD TRACK (Done, 12/12/06, Fan)
mkdir /cluster/store12/gad061211
rm /cluster/data/gad
ln -s /cluster/store12/gad061211 /cluster/data/gad
# Receive "GAD-Hg18DATA.txt" from GAD/NIA
# contact person: Shenoy, Narmada, shenoyn at grc.nia.nih.gov
hgsql hg18 -e 'drop table gadAll'
hgsql hg18 <~/src/hg/lib/gadAll.sql
hgsql hg18 -e 'load data local infile "GAD-Hg18DATA.txt" into table gadAll ignore 1 lines'
hgsql hg18 -e 'create index geneSymbol on gadAll(geneSymbol(10))'
# create gad table
hgsql hg18 -N -e \
'select "chr",chromosome, chromStart, chromEnd, geneSymbol from gadAll where chromStart <>0 and chromEnd <>0 and chromosome<>""'|\
sed -e 's/chr\t/chr/' |grep -v "chr\." |grep -v " "|sort -u >gadHg18.bed
hgLoadBed hg18 gad gadHg18.bed
#########################################################################
# BLASTZ/CHAIN/NET oryLat1 (ABANDONED - 2006-12-14 - Hiram)
# third time with randoms and chrUn in scaffolds on both sequences
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.oryLat1.2006-12-14
cd /cluster/data/hg18/bed/blastz.oryLat1.2006-12-14
cat << '_EOF_' > DEF
# Human vs. Medaka
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human hg18, randoms in contigs, lifted to their chr*_random
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CTGDIR=/san/sanvol1/scratch/hg18/hg18.randomContigs.sdTrf.2bit
SEQ1_CTGLEN=/san/sanvol1/scratch/hg18/hg18.randomContigs.sdTrf.sizes
SEQ1_LIFT=/san/sanvol1/scratch/hg18/hg18.randomContigs.lift
SEQ1_CHUNK=10000000
SEQ1_LIMIT=1
SEQ1_LAP=10000
# QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
# chrUn in Scaffolds for this alignment run
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=40000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.oryLat1.2006-12-14
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 \
-blastzOutRoot /cluster/bluearc/hg18OryLat1 > do.log 2>&1 &
### this did not work, abandoned
#########################################################################
# BLASTZ/CHAIN/NET oryLat1 (DONE - 2007-02-24 - Hiram)
# fourth time with randoms and chrUn in scaffolds for only Medaka
# All chroms and randoms as they are complete on Human
ssh kkstore04
mkdir /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24
cd /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24
cat << '_EOF_' > DEF
# Human vs. Medaka
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human hg18, randoms complete, as they are, no contig confusion
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
# chrUn in Scaffolds for this alignment run
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=40000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.oryLat1.2007-02-24
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 \
-blastzOutRoot /cluster/bluearc/hg18OryLat1 > do.log 2>&1 &
# real 318m45.339s
# typical failure:
# HgStepManager: executing step 'net'.
# netChains: looks like previous stage was not successful
# (can't find [hg18.oryLat1.]all.chain[.gz]).
# continuing net:
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=net -bigClusterHub=pk -verbose=2 \
-blastzOutRoot /cluster/bluearc/hg18OryLat1 > net.log 2>&1 &
# real 39m25.853s
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.oryLat1.2007-02-24
nice -n +19 featureBits hg18 chainOryLat1Link \
> fb.hg18.chainOryLat1Link.txt 2>&1 &
# 57393910 bases of 2881515245 (1.992%) in intersection
ssh kkstore04
mkdir /cluster/data/oryLat1/bed/blastz.hg18.swap
cd /cluster/data/oryLat1/bed/blastz.hg18.swap
time doBlastzChainNet.pl -chainMinScore=2000 -chainLinearGap=loose \
/cluster/data/hg18/bed/blastz.oryLat1.2007-02-24/DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 -swap > swap.log 2>&1 &
ssh hgwdev
cd /cluster/data/oryLat1/bed/blastz.hg18.swap
nice -n +19 featureBits oryLat1 chainHg18Link \
> fb.oryLat1.chainHg18Link.txt 2>&1 &
# 48002423 bases of 700386597 (6.854%) in intersection
##########################################################################
# AFFY HUEX1 OFF-BY-ONE FIX (Andy 2006-12-14)
ssh hgwdev
cd /cluster/data/hg18/bed/affyHumanExon
liftOver /cluster/data/hg17/bed/affyHumanExon/affyHuEx1.bed \
/gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz affyHuEx1.bed affyHuEx1.unmapped
awk 'BEGIN{OFS="\t"}{print $4,$3-$2}' affyHuEx1.bed | sort -k2,2nr | head
#2440970 81664
#3016074 9552
#3641787 8061
#2321649 8054
# It seems the liftOver problem still happens for that segmental dupe.
# So the start is correct and the end is correct. Just make two entries, both
# with size == 305.
grep -v "\b2440970\b" affyHuEx1.bed > tmp.bed
grep "\b2440970\b" affyHuEx1.bed > bad.bed
awk 'BEGIN{OFS="\t"}{print $1,$2,$2+305,$4,$5,$6}' bad.bed > good.bed
awk 'BEGIN{OFS="\t"}{print $1,$3-305,$3,$4,$5,$6}' bad.bed >> good.bed
cat tmp.bed good.bed > affyHuEx1.bed
bedSort affyHuEx1.bed tmp.bed
mv tmp.bed affyHuEx1.bed
rm good.bed bad.bed
hgLoadBed hg18 affyHuEx1 affyHuEx1.bed
##########################################################################
# CGAP SAGE (In progress Andy 2007-01-09)
# This is the BED part.
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir /san/sanVol1/scratch/andy/cgapSage
ln -s /san/sanVol1/scratch/andy/cgapSage cgapSage
wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_hs_long_forward_v36.1.tar.gz
wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_hs_long_reverse_v36.1.tar.gz
tar xfz SAGE_hs_long_forward_v36.1.tar.gz
tar xfz SAGE_hs_long_reverse_v36.1.tar.gz
cd hs_forward/
cat * | awk 'BEGIN{OFS="\t"}{print $1, $3, $4, $2, 1000, "+"}' > ../unlifted.bed
cd ../hs_reverse/
cat * | awk 'BEGIN{OFS="\t"}{print $1, $4, $3, $2, 1000, "-"}' >> ../unlifted.bed
cd ../
rm -rf hs*
liftUp lifted.bed /cluster/data/hg18/jkStuff/liftAll.lft warn unlifted.bed
#Got 378 lifts in /cluster/data/hg18/jkStuff/liftAll.lft
#Lifting unlifted.bed
#Expecting number field 3 line 13868252 of unlifted.bed, got CCATCGGATGCCCACCT
# Looks like there was a funny line in unlifted.bed:
grep CCATCGGATGCCCACCT unlifted.bed
#NT_011362 24364534NT_004321 CCATCGGATGCCCACCT AATAAGCCAGAGTCTAT 1000 -
#NT_004321 7900 7884 CCATCGGATGCCCACCT 1000 -
# Ok so there's one record for CCATCGGATGCCCACCT in addition... and for
# AATAAGCCAGAGTCTAT?
grep AATAAGCCAGAGTCTAT unlifted.bed
#NT_011362 24364534NT_004321 CCATCGGATGCCCACCT AATAAGCCAGAGTCTAT 1000 -
#NT_011362 24364534 24364518 AATAAGCCAGAGTCTAT 1000 -
# Looks like that one's got a record too. So just get rid of the stupid
# line:
grep -v 24364534NT_004321 unlifted.bed > tmp
mv tmp unlifted.bed
liftUp lifted.bed /cluster/data/hg18/jkStuff/liftAll.lft warn unlifted.bed
rm unlifted.bed
head lifted.bed
#chr1 649 665 TGTCTGCGCCTGCGCCG 1000 -
#chr1 670 686 CTAGCGCGTCGGGGTGG 1000 +
nibFrag /cluster/data/hg18/nib/chr1.nib 669 686 "+" /dev/stdout
#>/cluster/data/hg18/nib/chr1.nib:669-686
#ctagcgcgtcggggtgg
nibFrag /cluster/data/hg18/nib/chr1.nib 649 665 m /dev/stdout
#>/cluster/data/hg18/nib/chr1.nib:649-665
#tgtctgcgcctgcgcc
# It looks like there are off-by-one errors, so fix em:
awk 'BEGIN{OFS="\t"}{start=$2; end=$3;if ($6 == "-") { end = end+1; } else { start = start-1 } print $1, start, end, $4, $5, $6}' \
< lifted.bed > mapping.bed6
rm lifted.bed
# Add thickStart/thickEnd fields
awk 'BEGIN{OFS="\t"}{thickStart=$2; thickEnd=$3; if ($6=="-") {thickStart = thickStart+13; } else { thickEnd = thickEnd-13; } print $0, thickStart, thickEnd}' \
< mapping.bed6 > mapping.bed
##########################################################################
# xxBlastTab - Help filter out unwanted paralogs (Galt 2007-01-10)
#
# Background: The xxBlastTab tables are made with a simple blastall
# (blastp with -b 1) which chooses the best match. Unfortunately this
# means that if there is no proper match it will still pick something
# even though it's probably not orthologous. This is especially a problem
# in organisms like rat knownGene which has only 30% gene coverage.
# The strategy here is to filter our xxBlastTab using synteny mappings from
# the chains. This is done by simply taking hg18.kg and using /gbdb/$db chains
# and pslMap to lift the genes to the target xx assembly. Then hgMapToGene
# will find which of those mapped ids have good overlap with xx.knownGene.
# The final mapping is then created by doing an inner join between
# the traditional xxBlastTab and the mapping table produced above.
# Then simply drop the old table and rename the new table.
#
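# (hedged sketch, not the actual synBlastp.csh contents) the join/swap step
# amounts to something like the following, where knownToRn4Syn stands in for
# the hgMapToGene output table and blastTab's query column is assumed to hold
# the hg18 kg id:
#  hgsql hg18 -e 'create table rnBlastTabSyn select b.* from rnBlastTab b, knownToRn4Syn m where b.query = m.name'
#  hgsql hg18 -e 'drop table rnBlastTab'
#  hgsql hg18 -e 'rename table rnBlastTabSyn to rnBlastTab'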
#
# We are starting with xxBlastTab tables already built in the usual way with
# blastall/blastp, probably with doHgNearBlastp.pl script.
#
# I created a new utility script called synBlastp.csh since I have to do this
# several times.
#
# we want to update hg18 for rat and mouse,
# so check ./hgGeneData/Human/hg18/otherOrgs.ra for current settings
ssh hgwdev
synBlastp.csh hg18 rn4
#hg18.rnBlastTab results:
#new number of unique query values:
# 13120
#new number of unique target values
# 6431
#old number of unique query values:
# 26982
#old number of unique target values
# 6732
synBlastp.csh hg18 mm8
#hg18.mmBlastTab results:
#new number of unique query values:
# 28733
#new number of unique target values
# 15366
#old number of unique query values:
# 33016
#old number of unique target values
# 15918
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna hg18
#################################################
# BUILD ncRna TRACK (DONE, 1/12/07, Fan)
# Download the terms and make the database.
ssh hgwdev
cd /cluster/store11/gs.19/build36
cd bed
mkdir ncRna
# copy Perl file at:
# http://cvs.sanger.ac.uk/cgi-bin/viewcvs.cgi/biomart-perl/scripts/webExample.pl?view=markup
# into getBiomart.pl
# create the following query xml file, ncRna.xml:
cat << '_EOF_' >ncRna.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE Query>
<Query virtualSchemaName="default" Header="1" count="" softwareVersion="0.5">
<Dataset name="hsapiens_gene_ensembl" interface="default" Formatter="GTF" >
<Attribute name="biotype" />
<Attribute name="str_chrom_name" />
<Attribute name="gene_stable_id" />
<Attribute name="exon_chrom_start" />
<Attribute name="exon_chrom_end" />
<Attribute name="transcript_chrom_strand" />
<Attribute name="external_gene_id" />
</Dataset>
</Query>
'_EOF_'
# get Ensembl gene data from BioMart and filter out protein-coding genes
perl getBiomart.pl ncRna.xml | grep -v protein_coding >ncRna0.tab
# cut and paste different cols to form ncRna.tab
cat ncRna0.tab | sed -e 's/ENSG/chr\tENSG/'>j1
cut -f 2 j1 >j.chr0
cut -f 1 j1 >j.chr
cat j.chr0|sed -e 's/chr/0/' >j.0
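# (note) j.chr0 is a column of the literal "chr" tokens spliced in above, so
# j.0 becomes a column of zeroes used as filler/score columns in the paste below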
cut -f 6 j1 >j.strand
cut -f 4,5 j1 >j.startEnd
cut -f 3 j1 >j.name
cut -f 7 j1 >j.type
cut -f 8 j1 >j.extGeneId
paste j.chr0 j.chr j.startEnd j.name j.0 j.strand j.0 j.0 j.type j.extGeneId >j.all
cat j.all|grep -v c6_COX|grep -v c6_QBL|grep -v c5_H2\
|sed -e 's/chr\t/chr/'\
|grep -v NT_\
|sed -e 's/\t-1\t/\t-\t/' |sed -e 's/\t1\t/\t+\t/' \
|sed -e 's/chrMT/chrM/'\
|sort -k1,1 -k2,2n -k3,3n >ncRna.tab
hgLoadBed -strict -tab -sqlTable=/cluster/home/fanhsu/src/hg/lib/ncRna.sql hg18 ncRna ncRna.tab
rm j.*
rm j1
###########################################################
# MAKE Drosophila Proteins track (DONE 2007-02-06 braney)
ssh kkstore02
sandir=/san/sanvol1/scratch/hg18
mkdir $sandir
cd /cluster/data/hg18
cat noUn/chr*fa > temp.fa
faSplit gap temp.fa 1000000 $sandir/blastDb/x -lift=$sandir/blastDb.lft
cat randomContigs/*.fa > temp.fa
faSplit sequence temp.fa 150 $sandir/blastDb/y
rm temp.fa
cd $sandir/blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
mkdir -p /cluster/data/hg18/bed/tblastn.dm2FB
cd /cluster/data/hg18/bed/tblastn.dm2FB
echo /san/sanvol1/scratch/hg18/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 3066 query.lst
# we want around 150000 jobs
calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl | awk "{print \\\$1}"`/\(150000/`wc query.lst | awk "{print \\\$1}"`\)
# 18929/(150000/3066) = 386.908760
mkdir -p /cluster/bluearc/hg18/bed/tblastn.dm2FB/fbfa
split -l 387 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl /cluster/bluearc/hg18/bed/tblastn.dm2FB/fbfa/kg
ln -s /cluster/bluearc/hg18/bed/tblastn.dm2FB/fbfa
cd fbfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S fbfa/*.fa > fb.lst
mkdir -p /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut
ln -s /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut
for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/hg18/bed/tblastn.dm2FB
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
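# retry with progressively stricter e-values: if blastall fails on a chunk
# (presumably because the output gets too large), try the next cutoff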
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /san/sanvol1/scratch/hg18/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
exit
chmod +x blastSome
gensub2 query.lst fb.lst blastGsub blastSpec
ssh pk
cd /cluster/data/hg18/bed/tblastn.dm2FB
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 150234 of 150234 jobs
# CPU time in finished jobs: 8313632s 138560.53m 2309.34h 96.22d 0.264 y
# IO & Wait Time: 882301s 14705.02m 245.08h 10.21d 0.028 y
# Average job time: 61s 1.02m 0.02h 0.00d
# Longest finished job: 545s 9.08m 0.15h 0.01d
# Submission to last job: 40693s 678.22m 11.30h 0.47d
ssh kkstore02
cd /cluster/data/hg18/bed/tblastn.dm2FB
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut/c.`basename $1`.psl)
'_EOF_'
exit
chmod +x chainOne
ls -1dS /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh kk
cd /cluster/data/hg18/bed/tblastn.dm2FB/chainRun
para create chainSpec
para maxNode 30
# para try, check, push, check etc.
# Completed: 48 of 49 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 209872s 3497.86m 58.30h 2.43d 0.007 y
# IO & Wait Time: 48501s 808.35m 13.47h 0.56d 0.002 y
# Average job time: 5383s 89.71m 1.50h 0.06d
# Longest finished job: 19336s 322.27m 5.37h 0.22d
# Submission to last job: 19336s 322.27m 5.37h 0.22d
ssh kkstore02
cd /cluster/data/hg18/bed/tblastn.dm2FB/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/hg18/bed/tblastn.dm2FB/unliftBlastDm2FB.psl
cd ..
pslCheck unliftBlastDm2FB.psl
sed "s/[0-9XY]*\///" unliftBlastDm2FB.psl | liftUp -type=.psl -nohead stdout ../../randomContigs/hg18.randomContigs.lift carry stdin | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > blastDm2FB.psl
# load table
ssh hgwdev
cd /cluster/data/hg18/bed/tblastn.dm2FB
hgLoadPsl hg18 blastDm2FB.psl
# check coverage
featureBits hg18 blastDm2FB
# 5976178 bases of 2881515245 (0.207%) in intersection
featureBits hg18 knownGene:cds blastDm2FB -enrichment
# knownGene:cds 1.111%, blastDm2FB 0.207%, both 0.130%, cover 11.71%, enrich 56.45x
ssh kkstore04
rm -rf /cluster/data/hg18/bed/tblastn.dm2FB/blastOut
rm -rf /cluster/bluearc/hg18/bed/tblastn.dm2FB/blastOut
#end tblastn
##########################################################################
#########################################################################
# BLASTZ/CHAIN/NET FR2 (DONE - 2007-01-26 - Hiram)
## Align to fr2 scaffolds,
## results lifted to fr2 chrUn coordinates
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.fr2.2007-01-24
cd /cluster/data/hg18/bed/blastz.fr2.2007-01-24
cat << '_EOF_' > DEF
# Human vs. Fugu
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LIMIT=1
SEQ1_LAP=10000
# QUERY: Fugu fr2
# Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.fr2.2007-01-24
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=2 -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/hg18Fr2 > do.log 2>&1 &
# real 414m47.505s
## Swap back to fr2 (duplicated in fr2.txt also)
mkdir /cluster/data/fr2/bed/blastz.hg18.swap
cd /cluster/data/fr2/bed/blastz.hg18.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/hg18/bed/blastz.fr2.2007-01-24/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -swap > swap.log 2>&1 &
# real 47m14.554s
ssh hgwdev
cd /cluster/data/fr2/bed/blastz.hg18.swap
time nice -n +19 featureBits fr2 chainHg18Link \
> fb.fr2.chainHg18Link.txt 2>&1 &
# 42875664 bases of 393312790 (10.901%) in intersection
############################################################################
## BLASTZ mm8 test with WindowMasker sequence (DONE - 2007-01-30 - Hiram)
ssh kkstore04
mkdir /cluster/data/hg18/bed/blastz.mm8.2007-01-30
cd /cluster/data/hg18/bed/blastz.mm8.2007-01-30
cat << '_EOF_' > DEF
# human vs mouse
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/hg18/hg18.noUn.sdTrf.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Mouse Mm8 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/san/sanvol1/scratch/mm8/sdTrf/mm8.noUn.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/mm8/sdTrf/noUn.sdTrf.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.mm8.2007-01-30
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/hg18Mm8 \
-chainMinScore=3000 -chainLinearGap=medium > do.out 2>&1 &
time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/hg18Mm8 \
-continue=cat -stop=net \
-chainMinScore=3000 -chainLinearGap=medium > cat.out 2>&1 &
# real 635m55.126s
nice -n +19 featureBits -noRandom hg18 chainMm8Link \
> fb.noRandom.hg18.chainMm8Link.txt 2>&1
# 991429484 bases of 2868834265 (34.559%) in intersection
nice -n +19 featureBits -noRandom hg18 chainMm8WMLink \
> fb.noRandom.hg18.chainMm8WMLink.txt 2>&1
# 1071083201 bases of 2868834265 (37.335%) in intersection
## swap to mm8
mkdir /cluster/data/mm8/bed/blastz.hg18.swap.2007-02-01
cd /cluster/data/mm8/bed/blastz.hg18.swap.2007-02-01
time doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
/cluster/data/hg18/bed/blastz.mm8.2007-01-30/DEF \
-swap -stop=net -chainMinScore=3000 \
-chainLinearGap=medium > swap.out 2>&1 &
# this created the directory /cluster/data/mm8/bed/blastz.hg18.swap
# after it was done, move to here blastz.hg18.swap.2007-02-01 since
# it is on a filesystem with some free space
nice -n +19 featureBits -noRandom mm8 chainHg18Link \
> fb.noRandom.mm8.chainHg18Link.txt 2>&1
# 983004750 bases of 2550172871 (38.547%) in intersection
nice -n +19 featureBits -noRandom mm8 chainHg18WMLink \
> fb.noRandom.mm8.chainHg18WMLink.txt 2>&1
# 976774811 bases of 2550172871 (38.302%) in intersection
###########################################################
# MAKE C. elegans proteins track
ssh kkstore02
sandir=/san/sanvol1/scratch/hg18
mkdir -p /cluster/data/hg18/bed/tblastn.ce3WB
cd /cluster/data/hg18/bed/tblastn.ce3WB
echo /san/sanvol1/scratch/hg18/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 3066 query.lst
# we want around 200000 jobs
calc `wc /cluster/data/ce3/bed/blat.ce3WB/ce3WB.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)
# 22395/(200000/3066) = 343.315350
mkdir -p /cluster/bluearc/hg18/bed/tblastn.ce3WB/wbfa
split -l 343 /cluster/data/ce3/bed/blat.ce3WB/ce3WB.psl /cluster/bluearc/hg18/bed/tblastn.ce3WB/wbfa/wb
ln -s /cluster/bluearc/hg18/bed/tblastn.ce3WB/wbfa
cd wbfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S wbfa/*.fa > wb.lst
mkdir -p /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut
ln -s /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut
for i in `cat wb.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/hg18/bed/tblastn.ce3WB
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /san/sanvol1/scratch/hg18/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/ce3/bed/blat.ce3WB/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
exit
chmod +x blastSome
gensub2 query.lst wb.lst blastGsub blastSpec
ssh pk
cd /cluster/data/hg18/bed/tblastn.ce3WB
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 195603 of 195603 jobs
# CPU time in finished jobs: 12047221s 200787.01m 3346.45h 139.44d 0.382 y
# IO & Wait Time: 9089287s 151488.12m 2524.80h 105.20d 0.288 y
# Average job time: 108s 1.80m 0.03h 0.00d
# Longest finished job: 1002s 16.70m 0.28h 0.01d
# Submission to last job: 192221s 3203.68m 53.39h 2.22d
ssh kkstore02
cd /cluster/data/hg18/bed/tblastn.ce3WB
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut/c.`basename $1`.psl)
'_EOF_'
exit
chmod +x chainOne
ls -1dS /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut/wb?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh kk
cd /cluster/data/hg18/bed/tblastn.ce3WB/chainRun
para create chainSpec
para maxNode 30
# para try, check, push, check etc.
# Completed: 66 of 66 jobs
# CPU time in finished jobs: 161714s 2695.23m 44.92h 1.87d 0.005 y
# IO & Wait Time: 40315s 671.92m 11.20h 0.47d 0.001 y
# Average job time: 3061s 51.02m 0.85h 0.04d
# Longest finished job: 9372s 156.20m 2.60h 0.11d
# Submission to last job: 11599s 193.32m 3.22h 0.13d
ssh kkstore02
cd /cluster/data/hg18/bed/tblastn.ce3WB/blastOut
for i in wb??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/hg18/bed/tblastn.ce3WB/unliftBlastCe3WB.psl
cd ..
pslCheck unliftBlastCe3WB.psl
sed "s/[0-9XY]*\///" unliftBlastCe3WB.psl | liftUp -type=.psl -nohead stdout ../../randomContigs/hg18.randomContigs.lift carry stdin | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > blastCe3WB.psl
# load table
ssh hgwdev
cd /cluster/data/hg18/bed/tblastn.ce3WB
hgLoadPsl hg18 blastCe3WB.psl
# check coverage
featureBits hg18 blastCe3WB
# 4326489 bases of 2881515245 (0.150%) in intersection
featureBits hg18 knownGene:cds blastCe3WB -enrichment
# knownGene:cds 1.111%, blastCe3WB 0.150%, both 0.086%, cover 7.76%, enrich 51.67x
ssh kkstore04
rm -rf /cluster/data/hg18/bed/tblastn.ce3WB/blastOut
rm -rf /cluster/bluearc/hg18/bed/tblastn.ce3WB/blastOut
#end tblastn
##########################################################################
#############################################################################
# RE-BUILD WGRNA TRACK (DONE, 2007-02-09, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2007-02-07
cd wgRna-2007-02-07
# Received the data file, wg_feb2007.txt (saved from wg_feb2007.doc)
# from Michel Weber's email
# (Michel.Weber at ibcg.biotoul.fr)
# and placed it under /cluster/data/hg18/bed/wgRna-2007-02-07.
# The record of hsa-mir-770 was missing the strand info, so "+" was
# added manually to wg_feb2007.txt for that record.
cat wg_feb2007.txt|sed -e 's/ /\t/g' > wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#############################################################################
# RE-BUILD WGRNA TRACK AGAIN (DONE, 2007-02-12, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2007-02-12
cd wgRna-2007-02-12
# Received the data file, wg_feb2007_corrected.txt (saved from wg_feb2007_corrected.doc)
# from Michel Weber's email
# (Michel.Weber at ibcg.biotoul.fr)
# and placed it under /cluster/data/hg18/bed/wgRna-2007-02-12.
# The record of hsa-mir-770 was missing the strand info, so "+" was
# added manually to wg_feb2007_corrected.txt for that record.
cat wg_feb2007_corrected.txt|sed -e 's/ /\t/g' > wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#########################################################################
## BLASTZ ANOCAR1 - Lizard - (DONE - 2007-02-17 - 2007-02-18 - Hiram)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.anoCar1.2007-02-17
cd /cluster/data/hg18/bed/blastz.anoCar1.2007-02-17
cat << '_EOF_' > DEF
# human vs lizard
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.anoCar1.2007-02-17
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=2 -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/hg18AnoCar1 > do.log 2>&1 &
# real 684m40.568s
# there was a pause in there as the pk kluster was corrected during the
# first kluster run to get it to finish.
# appears to have successfully finished
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.anoCar1.2007-02-17
time nice -n +19 featureBits hg18 chainAnoCar1Link \
> fb.hg18.chainAnoCar1Link.txt 2>&1
# real 2m28.318s
# 137554843 bases of 2881515245 (4.774%) in intersection
# running the swap to anoCar1 - instructions in anoCar1.txt
cd /cluster/data/anoCar1/bed/blastz.hg18.swap
time nice -n +19 featureBits anoCar1 chainHg18Link \
> fb.anoCar1.chainHg18Link.txt 2>&1
# real 3m16.810s
# 112434396 bases of 1741478929 (6.456%) in intersection
# reciprocal best net mafs for multiz 2008-10-30 - Hiram
time nice -n +19 doRecipBest.pl hg18 anoCar1 > rbest.log 2>&1 &
# this failed immediately:
# cd /cluster/data/hg18/bed/blastz.anoCar1/axtChain
# chainStitchId hg18.anoCar1.over.chain.gz stdout
# chainSwap stdin stdout
# chainSort stdin anoCar1.hg18.tBest.chain
# t end mismatch -526389042 vs 10481870 line 1920305 of stdin
# Command failed:
# ssh -x kkr14u04 nice /cluster/data/hg18/bed/blastz.anoCar1/axtChain/doRecipBest.csh
# but, then, when run locally on hgwdev, it proceeded just fine:
time nice -n +19 ./doRecipBest.csh > doRecipBest.log 2>&1 &
# real 175m54.202s
doRecipBest.pl -continue=download hg18 anoCar1
##########################################################################
# UPDATED hg18.knownToVisiGene (DONE galt 2007-02-15)
# after making sure hg18.vgAllProbes was up to date (see makeVisiGene.doc)
ssh hgwdev
knownToVisiGene hg18 -fromProbePsl=vgAllProbes
#########################################################################
## BLASTZ OTOGAR1 - Bushbaby - (2007-02-26 kate)
#
# NOTE: using masked sequence (unlike Brian Raney's alignments)
cd /cluster/data/otoGar1
ln -s otoGar1.rmsk.2bit otoGar1.2bit
mkdir -p /san/sanvol1/scratch/otoGar1
cp -p otoGar1.2bit chrom.sizes /san/sanvol1/scratch/otoGar1
ssh pk
mkdir /cluster/data/hg18/bed/blastz.otoGar1.2007-02-26
cd /cluster/data/hg18/bed/blastz.otoGar1.2007-02-26
cat << '_EOF_' > DEF
# human vs bushbaby
# params from Hiram & Brian
BLASTZ=blastz.v7.x86_64
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Bushbaby otoGar1 - single chunk big enough to run largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/otoGar1/otoGar1.2bit
SEQ2_LEN=/cluster/data/otoGar1/chrom.sizes
SEQ2_LIMIT=400
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.otoGar1.2007-02-26
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
/cluster/bin/scripts/doBlastzChainNet.pl DEF \
-bigClusterHub=pk -smallClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
>& do.log & tail -f do.log
# problems on cluster -- stale NFS mounts and a routing problem
# so batch failed with 4 retries. I restarted cluster run
# with retries=8, and all finished except 38. These failed due
# to output files existing; as the results look OK, I'm proceeding.
para time > run.time
/cluster/bin/scripts/doBlastzChainNet.pl DEF \
-continue=cat -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
>&! do2.log &
tail -f do2.log
# failed due to pre-existing liftOver chain from Brian's run
/cluster/bin/scripts/doBlastzChainNet.pl DEF \
-continue=net -bigClusterHub=pk \
>&! do3.log &
tail -f do3.log
# reciprocal best net mafs for multiz
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 otoGar1 >&! rbest.log &
# Load net (2007-03-12 kate)
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.otoGar1/axtChain
netFilter -minGap=10 hg18.otoGar1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestOtoGar1 stdin
#########################################################################
# BLASTZ/CHAIN/NET CAVPOR2 (IN PROGRESS 2007-03-06 kate)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06
cd /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06
cat << '_EOF_' > DEF
# human vs. guinea pig
# dynamic masking param
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Guinea pig cavPor2
# using cat-like params, as this has similar #scaffolds
SEQ2_DIR=/san/sanvol1/scratch/cavPor2/cavPor2.2bit
SEQ2_LEN=/san/sanvol1/scratch/cavPor2/chrom.sizes
# Maximum number of scaffolds that can be lumped together
# this makes ~200K jobs
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.cavPor2.2007-03-06
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium >& do.log &
tail -f do.log
# cluster brought down by site work
# restart on 3/7
ssh pk
cd /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06/run.blastz
para recover jobList jobList2
para make jobList2 >&! do2.log &
para time > run.time
# entire run took probably 36 hours cluster time
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.cavPor2.2007-03-06
/cluster/bin/scripts/doBlastzChainNet.pl DEF \
-bigClusterHub pk -continue=cat -stop cleanup \
-chainMinScore=3000 -chainLinearGap=medium >& do3.log &
# reciprocal best net mafs for multiz
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 cavPor2 >&! rbest.log &
# load nets manually -- automated loading fails as classification info
# not available (no database)
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.cavPor2/axtChain
netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netCavPor2 stdin
netFilter -minGap=10 hg18.cavPor2.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestCavPor2 stdin
#########################################################################
# BLASTZ/CHAIN/NET ERIEUR1 (IN PROGRESS 2007-03-08 kate)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.eriEur1.2007-03-08
cd /cluster/data/hg18/bed/blastz.eriEur1.2007-03-08
cat << '_EOF_' > DEF
# human vs. hedgehog
# dynamic masking param
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: hedgehog eriEur1
# using cat-like params, as this has similar #scaffolds
SEQ2_DIR=/san/sanvol1/scratch/eriEur1/eriEur1.2bit
SEQ2_LEN=/san/sanvol1/scratch/eriEur1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
# this makes ~200K jobs
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.eriEur1.2007-03-08
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium >& do.log &
tail -f do.log
# Reciprocal best net mafs for multiz (kate)
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.eriEur1
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 eriEur1 >&! rbest.log &
#GOT HERE
# Load nets (2007-03-12 kate)
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.eriEur1/axtChain
netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netEriEur1 stdin
netFilter -minGap=10 hg18.eriEur1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestEriEur1 stdin
#########################################################################
# BLASTZ/CHAIN/NET SORARA1 (IN PROGRESS 2007-03-08 kate)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.sorAra1.2007-03-08
cd /cluster/data/hg18/bed/blastz.sorAra1.2007-03-08
cat << '_EOF_' > DEF
# human vs. shrew
# dynamic masking param
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: shrew sorAra1
# using cat-like params, as this has similar #scaffolds
SEQ2_DIR=/san/sanvol1/scratch/sorAra1/sorAra1.2bit
SEQ2_LEN=/san/sanvol1/scratch/sorAra1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
# this makes ~200K jobs
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.sorAra1.2007-03-08
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium >& do.log &
tail -f do.log
# stopped during load step due to missing database for classifying net
# Reciprocal best net mafs for multiz (2007-03-12 kate)
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.sorAra1
~/kent/src/hg/utils/automation/doRecipBest.pl hg18 sorAra1 >&! rbest.log &
# GOT HERE
# Load nets (2007-03-12 kate)
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.sorAra1/axtChain
netFilter -minGap=10 noClass.net | hgLoadNet -warn hg18 netSorAra1 stdin
netFilter -minGap=10 hg18.sorAra1.rbest.net.gz | \
hgLoadNet -warn hg18 netRBestSorAra1 stdin
#########################################################################
# BLASTZ ORNANA1 (PLATYPUS) - (DONE 2007-02-23, REDONE 2007-04-04 angie)
# The first time around, the copy of ornAna1.2bit still had the pre-release
# sequence -- doh! Differences are minuscule (a couple of contigs changed
# orientation), but redone just to get it 100% right.
# In the re-run, I changed SEQ2_LIMIT which made the cluster run more
# efficient but had side-effects on the results because blastz's dynamic
# masking was applied differently (different groupings of sequences) --
# in retrospect, would have been better to use the suboptimal SEQ2_LIMIT
# and have fewer differences to slog through.
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
cd /cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
cat << '_EOF_' > DEF
# human vs. platypus
# Use same params as used for hg18-danRer4
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: hg18
SEQ1_DIR=/scratch/hg/hg18/hg18.2bit
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: ornAna1
SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
SEQ2_LEN=/iscratch/i/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF \
-blastzOutRoot /cluster/bluearc/hg18.ornAna1 \
>& do.log & tail -f do.log
cd /cluster/data/hg18/bed/blastz.ornAna1.2007-04-02
time nice -n +19 doRecipBest.pl hg18 ornAna1 > rbest.log 2>&1 &
# real 238m22.247s
# worked OK
########################################################################
# 28-WAY VERTEBRATE MULTIZ (2007-03-20 kate)
ssh kkstore02
cd /cluster/data/hg18/bed
mkdir multiz28way.2007-03-20
ln -s multiz28way.2007-03-20 multiz28way
cd multiz28way
# start with 17way tree; update assemblies and add new species
mkdir tree
cd tree
cp /cluster/data/hg18/bed/multiz17way/tree.nh tree.asm.nh
# edit and create tree.28.nh, with Webb's assistance
echo `sed 's/[a-zA-Z0-9]*_//g' tree.asm.nh` > tree.28.nh
#
# create version for download that includes common names and assemblies
cp tree.asm.nh ../28way.nh
# edit
# create version for phyloGif program (replace spaces with commas)
cp 28way.gif /usr/local/apache/htdocs/images/phylo/hg18_28way.gif
# create species list
cd ..
sed -e 's/[()]//g' -e 's/ /\n/g' tree/tree.28.nh | \
sed -e '/^$/d'| sort > species.28.lst
wc -l species.28.lst
ln -s species.28.lst species.lst
# Organisms:
(N)ew, (U)pdated, (S)ame species since 17way:
U chimp (panTro2)
S rhesus (rheMac2)
-N bushbaby (otoGar1) "Otolemur garnettii" (galago) 2X
N tree_shrew (tupBel1) "Tupaia belangeri"
S rat (rn4)
S mouse (mm8)
-N guinea_pig (cavPor2) "Cavia porcellus" 2X
S rabbit (oryCun1) 2X
-N shrew (sorAra1) "Sorex araneus" 2X
-N hedgehog (eriEur1) "Erinaceus europaeus" 2X
S dog (canFam2)
N cat (felCat3) "Felis catus" 2X
-N horse (equCab1) "Equus caballus"
U cow (bosTau3)
S armadillo (dasNov1) "Dasypus novemcinctus" 2X
S elephant (loxAfr1) 2X
S tenrec (echTel1) 2X
S opossum (monDom4)
N platypus (ornAna1) "Ornithorhynchus anatinus"
U chicken (galGal3)
N lizard (anoCar1) "Anolis carolinensis" (Green Anole), Iguana family
U frog (xenTro2)
U fugu (fr2)
S tetraodon (tetNig1)
N stickleback (gasAcu1) "Gasterosteus aculeatus"
N medaka (oryLat1) "Oryzias latipes"
U zebrafish (danRer4)
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way
# verify all blastz's exists
cat > listMafs.csh << 'EOF'
foreach db (`cat species.lst`)
set bdir = /cluster/data/hg18/bed/blastz.$db
if (-e $bdir/mafRBestNet/chr1.maf.gz) then
echo "$db mafRBestNet"
else if (-e $bdir/mafSynNet/chr1.maf.gz) then
echo "$db mafSynNet"
else if (-e $bdir/mafNet/chr1.maf.gz) then
echo "$db mafNet"
else
echo "$db mafs not found"
endif
end
'EOF'
# gather chain stats
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
cat > getChainStats.csh << 'EOF'
set species = $1
foreach db (`cat $species`)
echo -n "${db} "
set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
set fb = /cluster/data/hg18/bed/blastz.$db/fb.hg18.chain${Db}Link.txt
if (! -e $fb || -z $fb ) then
nice featureBits hg18 chain${Db}Link >& $fb
endif
sed 's/.*(\(.*\)).*/\1/' $fb
end
'EOF'
# << happy emacs
csh getChainStats.csh species.lst >&! species.chainStats
# Maf types:
# 2X mammals -> reciprocal best net
# high cov placental mammals and opossum -> syntenic net
# other -> standard net
csh listMafs.csh > listMafs.log &
cat listMafs.log
# add links of the form blastz.<db> to blastz.<db>.<date> dirs:
cd /cluster/data/hg18/bed
ln -s blastz.fr2.2007-01-24 blastz.fr2
ln -s blastz.ornAna1.2007-02-21 blastz.ornAna1
ln -s blastz.oryLat1.swap blastz.oryLat1
# copy net mafs to cluster-friendly storage, splitting chroms
# into 50MB chunks to improve run-time
# NOTE: splitting will be different for scaffold-based reference assemblies
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
mkdir run.split
cd run.split
mafSplitPos hg18 50 mafSplit.bed
ssh kki
cd /cluster/data/hg18/bed/multiz28way
cd run.split
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set db = $1
set sdir = /san/sanvol1/scratch/hg18/splitStrictMafNet
mkdir -p $sdir
if (-e $sdir/$db) then
echo "directory $sdir/$db already exists -- remove and retry"
exit 1
endif
set bdir = /cluster/data/hg18/bed/blastz.$db
if (! -e $bdir) then
echo "directory $bdir not found"
exit 1
endif
mkdir -p $sdir/$db
if (-e $bdir/mafRBestNet) then
set mdir = $bdir/mafRBestNet
else if (-e $bdir/mafSynNet) then
set mdir = $bdir/mafSynNet
else if (-e $bdir/mafNet) then
set mdir = $bdir/mafNet
else
echo "$bdir maf dir not found"
exit 1
endif
echo $mdir
foreach f ($mdir/*)
set c = $f:t:r:r
echo " $c"
nice mafSplit mafSplit.bed $sdir/$db/ $f
end
echo "gzipping $sdir/$db mafs"
nice gzip $sdir/$db/*
echo $mdir > $db.done
'EOF'
# << happy emacs
chmod +x doSplit.csh
grep -v hg18 ../species.28.lst > split.lst
cat > spec << 'EOF'
#LOOP
doSplit.csh $(path1) {check out line+ $(path1).done}
#ENDLOOP
'EOF'
gensub2 split.lst single spec jobList
para create jobList
# 24 jobs
para try
para check
para push
# till complete
para time >&! run.time
# 30 minutes
# run multiz
ssh pk
cd /cluster/data/hg18/bed/multiz28way
mkdir -p maf run
cd run
mkdir penn
# use latest penn utilities
set PENN_BIN = /cluster/bin/penn/multiz.v11.2007-03-19
cp -p $PENN_BIN/{autoMZ,multiz,maf_project} penn
# list chrom chunks, any db dir will do; better would be for the
# splitter to generate this file
# We temporarily use __ instead of . to delimit the chunk number in the
# filename, so gensub2's $(root) keeps the whole chunk name
# (e.g. chr1__0 stays chr1__0, while chr1.0 would be truncated to chr1)
set mdir = /san/sanvol1/scratch/hg18/splitStrictMafNet
ls $mdir/fr2 | sed -e 's/.maf.gz//' -e 's/\./__/' > chromChunks.lst
wc -l chromChunks.lst
# 93
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = hg18
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet
rm -fr $tmp
mkdir -p $tmp
cp ../tree/tree.28.nh ../species.28.lst $tmp
pushd $tmp
foreach s (`cat species.28.lst`)
set c2 = `echo $c | sed 's/__/./'`
set in = $pairs/$s/$c2.maf
set out = $db.$s.sing.maf
if ($s == hg18) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.28.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << happy emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg18/bed/multiz28way/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << emacs
gensub2 chromChunks.lst single spec jobList
para create jobList
# 93 jobs
para try
para check
para push
para time > run.time
# 4 hours! (~9 min/species)
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/hg18/multiz28way/maf
ln -s /cluster/data/hg18/bed/multiz28way/maf/*.maf \
/gbdb/hg18/multiz28way/maf
cd /cluster/data/hg18/bed/multiz28way
cat > loadMaf.csh << 'EOF'
date
hgLoadMaf -pathPrefix=/gbdb/hg18/multiz28way/maf hg18 multiz28way
# load summary table
cat maf/*.maf | nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz28waySummary stdin
'EOF'
csh loadMaf.csh >&! loadMaf.log &
# look at coverage
ssh kkstore02
cd /cluster/data/hg18/bed/multiz25wayStrict
mkdir mafCov
cd mafCov
cat ../maf/chr7__*.maf | nice mafRanges stdin hg18 \
-otherDb=canFam2 stdout | bedSort stdin chr7.canFam2.bed
echo canFam2 > species.lst
cat ../maf/chr7__*.maf | mafSpeciesSubset stdin species.lst stdout | \
mafToAxt stdin hg18 canFam2 stdout | \
axtToPsl stdin /cluster/data/hg18/chrom.sizes \
/cluster/data/canFam2/chrom.sizes chr7.canFam2.psl
cat ../maf/chr7__*.maf | nice mafRanges stdin hg18 -otherDb=oryCun1 chr7.oryCun1.bed
cat ../maf/chr7__*.maf | nice mafRanges stdin hg18 -otherDb=tetNig1 chr7.tetNig1.bed
ssh hgwdev
cd /cluster/data/hg18/bed/multiz25wayStrict/mafCov
# canFam2 syntenic net vs standard net
nice featureBits hg18 -chrom=chr7 chr7.canFam2.bed
# 82967535 bases of 154952424 (53.544%) in intersection
nice featureBits hg18 -chrom=chr7 ../../multiz25way/mafCov/chr7.canFam2.bed
# 86391682 bases of 154952424 (55.754%) in intersection
nice featureBits hg18 -chrom=chr7 ../../multiz17way/mafCov/chr7.canFam2.bed
# 86248995 bases of 154952424 (55.662%) in intersection
# compare using another method
cat ../maf/chr7__*.maf | mafSpeciesSubset stdin species.lst chr7.canFam2.maf
mafToAxt chr7.canFam2.maf hg18 canFam2 chr7.canFam2.axt
axtToPsl chr7.canFam2.axt /cluster/data/hg18/chrom.sizes \
/cluster/data/canFam2/chrom.sizes chr7.canFam2.psl
nice featureBits hg18 -chrom=chr7 chr7.canFam2.psl
# 75497734 bases of 154952424 (48.723%) in intersection
# oryCun1 reciprocal best net vs standard net
nice featureBits hg18 -chrom=chr7 chr7.oryCun1.bed
# 53157578 bases of 154952424 (34.306%) in intersection
nice featureBits hg18 -chrom=chr7 ../../multiz25way/mafCov/chr7.oryCun1.bed
# 56858022 bases of 154952424 (36.694%) in intersection
# tetNig1 both used standard net
nice featureBits hg18 -chrom=chr7 chr7.tetNig1.bed
# 2905058 bases of 154952424 (1.875%) in intersection
nice featureBits hg18 -chrom=chr7 ../../multiz25way/mafCov/chr7.tetNig1.bed
# 2901708 bases of 154952424 (1.873%) in intersection
# NOTE: Next time concatenate split mafs before proceeding further
# Gap Annotation
# prepare bed files with gap info
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way
mkdir anno
cd anno
mkdir maf run
cd run
cat > doNBed.csh << 'EOF'
foreach db (`cat ../../species.lst`)
echo -n "$db "
set cdir = /cluster/data/$db
if (! -e $cdir/$db.N.bed) then
echo "creating N.bed"
twoBitInfo -nBed $cdir/$db.2bit $cdir/$db.N.bed
else
echo ""
endif
end
'EOF'
csh doNBed.csh >&! doNBed.log &
rm -f nBeds sizes
foreach db (`grep -v hg18 ../../species.lst`)
echo "$db "
ln -s /cluster/data/$db/$db.N.bed $db.bed
echo $db.bed >> nBeds
ln -s /cluster/data/$db/chrom.sizes $db.len
echo $db.len >> sizes
end
ssh kki
cd /cluster/data/hg18/bed/multiz28way/anno/run
cat > doAnno.csh << 'EOF'
#!/bin/csh -ef
set dir = /cluster/data/hg18/bed/multiz28way
set c = $1
cat $dir/maf/${c}__*.maf | \
nice mafAddIRows -nBeds=nBeds -sizes=sizes stdin \
/cluster/data/hg18/hg18.2bit $2
'EOF'
#<< happy emacs
chmod +x doAnno.csh
cat > spec << 'EOF'
#LOOP
./doAnno.csh $(root1) {check out line+ /cluster/data/hg18/bed/multiz28way/anno/maf/$(root1).maf}
#ENDLOOP
'EOF'
#<< happy emacs
awk '{print $1}' /cluster/data/hg18/chrom.sizes > chroms.lst
gensub2 chroms.lst single spec jobList
para create jobList
para try
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/anno
mkdir -p /gbdb/hg18/multiz28way/anno/maf
ln -s /cluster/data/hg18/bed/multiz28way/anno/maf/*.maf \
/gbdb/hg18/multiz28way/anno/maf
cat > loadMaf.csh << 'EOF'
date
nice hgLoadMaf -pathPrefix=/gbdb/hg18/multiz28way/anno/maf \
hg18 multiz28wayAnno
date
cat maf/*.maf | \
nice hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz28wayAnnoSummary stdin
date
'EOF'
csh loadMaf.csh >& loadMaf.log &
# NOTE: rebuilt hgLoadMafSummary to exclude chroms<1MB (2007-06-21 kate)
########################################################################
# ANNOTATE 28-WAY ALIGNMENT WITH QUALITY DATA (2007-06-11 rico at bx.psu.edu)
#
# The basic idea here is to create a qac file which has quality data for each
# (chromosome/scaffold/etc) and then index the qac file. Once this is done,
# mafAddQRows can be used to add the quality data to a given maf. The agp
# files are used so that gaps can be represented in the qac files as a special
# value.
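# In outline the same two-step pattern is used for every species below
# (a sketch; "someDb" and the input file names are illustrative only):
#   # if a .qac already exists: mark the agp gaps and index it
#   qacAddGapIdx in.agp in.qac someDb.qac someDb.qdx
#   # if only .qual files exist: build the indexed qac from agp + qual directly
#   qaAgpToQacIdx assembly.agp assembly.qual.gz someDb.qac someDb.qdx
# and then, once species.lst lists each db with the directory holding its
# .qac/.qdx (see below), per-base qualities are folded into the mafs with:
#   mafAddQRows species.lst in.maf out.maf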
## create .qac and .qdx files for each species in the 28-way alignment
o human (hg18)
Unable to find quality data.
o chimp (panTro2)
/cluster/data/panTro2/bed/quality/qac/*.qac
/cluster/data/panTro2/wustl/*.agp
qacAddGapIdx in.agp in.qac panTro2.qac panTro2.qdx
o rhesus (rheMac2)
/cluster/data/rheMac2/qual/foo.qv
/cluster/data/rheMac2/downloads/foo.agp
qacAddGapIdx in.agp in.qac rheMac2.qac rheMac2.qdx
o bushbaby (otoGar1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/bushbaby/otoGar1
assembly.agp
Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz otoGar1.qac otoGar1.qdx
o treeshrew (tupBel1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/treeShrew/tupBel1
assembly.agp
Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz tupBel1.qac tupBel1.qdx
o rat (rn4)
/cluster/data/rn4/downloads/foo.qual
/cluster/data/rn4/CHROM/foo.agp
qacAddGapIdx in.agp in.qac rn4.qac rn4.qdx
o mouse (mm8)
Unable to find quality data.
o guinea pig (cavPor2)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/guineaPig/cavPor2
assembly.agp
Draft_v2.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v2.agp.chromosome.qual.gz cavPor2.qac cavPor2.qdx
o rabbit (oryCun1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/rabbit/oryCun1
assembly.agp
Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz oryCun1.qac oryCun1.qdx
o shrew (sorAra1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/commonShrew/sorAra1
assembly.agp
Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz sorAra1.qac sorAra1.qdx
o hedgehog (eriEur1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/hedgehog/eriEur1
assembly.agp
Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz eriEur1.qac eriEur1.qdx
o dog (canFam2)
/cluster/data/canFam2/bed/quality/chrom.qac
/cluster/data/canFam2/broad/foo.agp
qacAddGapIdx in.agp in.qac canFam2.qac canFam2.qdx
o cat (felCat3)
/cluster/data/felCat3/downloads/assembly.agp
/cluster/data/felCat3/downloads/Draft_v3.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v3.agp.chromosome.qual.gz felCat3.qac felCat3.qdx
o horse (equCab1)
/cluster/data/equCab1/downloads/assembly.agp
/cluster/data/equCab1/downloads/Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz equCab1.qac equCab1.qdx
o cow (bosTau3)
/cluster/data/bosTau3/baylor/chroms/foo.qual
/cluster/data/bosTau3/baylor/foo.agp
qacAddGapIdx in.agp in.qac bosTau3.qac bosTau3.qdx
o armadillo (dasNov1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/armadillo/dasNov1
assembly.agp
assembly.quals.gz
combineQuals assembly.agp assembly.quals.gz combined.quals
qaAgpToQacIdx assembly.agp combined.quals.gz dasNov1.qac dasNov1.qdx
o elephant (loxAfr1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/elephant/loxAfr1
assembly.agp
assembly.quals.gz
combineQuals assembly.agp assembly.quals.gz combined.quals
qaAgpToQacIdx assembly.agp combined.quals.gz loxAfr1.qac loxAfr1.qdx
o tenrec (echTel1)
http://www.broad.mit.edu/ftp/pub/assemblies/mammals/tenrec/echTel1
assembly.agp
Draft_v1.agp.chromosome.qual.gz
qaAgpToQacIdx assembly.agp Draft_v1.agp.chromosome.qual.gz echTel1.qac echTel1.qdx
o opossum (monDom4)
/cluster/data/monDom4/broad.mit.edu/foo.qac
/cluster/data/monDom4/broad.mit.edu/foo.agp
qacAddGapIdx in.agp in.qac monDom4.qac monDom4.qdx
o platypus (ornAna1)
/cluster/data/ornAna1
agp files are present, but there are no quality files
o chicken (galGal3)
Unable to find quality data.
o lizard (anoCar1)
/cluster/data/anoCar1/downloads/assembly.agp
/cluster/data/anoCar1/downloads/scaffold.lifted.qac
qacAddGapIdx in.agp in.qac anoCar1.qac anoCar1.qdx
o frog (xenTro2)
Unable to find quality data.
o tetraodon (tetNig1)
Unable to find quality data.
o fugu (fr2)
Unable to find quality data.
o stickleback (gasAcu1)
/cluster/data/gasAcu1/downloads/foo.agp
/cluster/data/gasAcu1/downloads/foo.qual
qacAddGapIdx in.agp in.qac gasAcu1.qac gasAcu1.qdx
o medaka (oryLat1)
/cluster/data/oryLat1/bed/qual/foo.qual
/cluster/data/oryLat1/downloads/foo.agp
qacAddGapIdx in.agp in.qac oryLat1.qac oryLat1.qdx
o zebrafish (danRer4)
Unable to find quality data.
## NOTE
quality data for chrM needed: dog, guineapig, horse, hedgehog, stickleback, medaka, rat
quality data for chrUn needed: medaka
## copy all .qac and .qdx files to the san
cp *.qac *.qdx /san/sanvol1/rico/quality
## create species list (species.lst) containing the following
anoCar1 /san/sanvol1/rico/quality
bosTau3 /san/sanvol1/rico/quality
canFam2 /san/sanvol1/rico/quality
cavPor2 /san/sanvol1/rico/quality
dasNov1 /san/sanvol1/rico/quality
echTel1 /san/sanvol1/rico/quality
equCab1 /san/sanvol1/rico/quality
eriEur1 /san/sanvol1/rico/quality
felCat3 /san/sanvol1/rico/quality
gasAcu1 /san/sanvol1/rico/quality
loxAfr1 /san/sanvol1/rico/quality
monDom4 /san/sanvol1/rico/quality
oryCun1 /san/sanvol1/rico/quality
oryLat1 /san/sanvol1/rico/quality
otoGar1 /san/sanvol1/rico/quality
panTro2 /san/sanvol1/rico/quality
rheMac2 /san/sanvol1/rico/quality
rn4 /san/sanvol1/rico/quality
sorAra1 /san/sanvol1/rico/quality
tupBel1 /san/sanvol1/rico/quality
## the following script will add quality data to each of the mafs
cat > addQData << 'EOF'
#!/bin/sh
INPUT_DIR=/cluster/data/hg18/bed/multiz28way/anno/maf
OUTPUT_DIR=/cluster/store12/rico/hg18/bed/multiz28way/qual/maf
for maf in `ls -1Sr ${INPUT_DIR}/*.maf`
do
file=`basename $maf`
mafAddQRows species.lst $maf ${OUTPUT_DIR}/$file
done
'EOF'
# Gene frames
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
mkdir frames
cd frames
cat > showGenes.csh << 'EOF'
foreach db (`grep -v hg18 ../species.lst`)
echo " $db"
echo -n "Tables: "
set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
foreach table ($tables)
if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
$table == "knownGene") then
echo -n "${table}: "
hgsql $db -N -e "select count(*) from $table"
endif
end
echo -n "Mrnas: "
set orgName = `hgsql hgcentraltest -N -e \
"select scientificName from dbDb where name='$db'"`
set orgId = `hgsql hg18 -N -e \
"select id from organism where name='$orgName'"`
if ($orgId == "") then
echo "0"
else
hgsql hg18 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"
endif
end
'EOF'
# based on output, pick gene tables, according to the following criteria:
# KG if present, else refGene if >10000 entries, else ensGene (unless dog),
# else mgcGenes, else mrnas if > 10000 else none. In all cases
# except none, add in refGene.
hg18: knownGene
bosTau3: mrna
canFam2: mrna
cavPor2: mrna
danRer4: refGene (13K) or ensGene (36K ?)
equCab1: mrna
fr2: ensGene
galGal3: mrna
gasAcu1: ensGene
mm8: knownGene
monDom4: ensGene
oryCun1: mrna
panTro2: refGene
rheMac2: ensGene
rn4: knownGene ? (8K) or refGene (10K) or ensGene(34K) ?
tetNig1: mrna
xenTro2: mrna
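# (For clarity only, a rough sh restatement of the rule above -- this script
# was not part of the build; the picks above were made by eye from the
# showGenes.csh output. The db name and the canFam2 "unless dog" exception
# are just illustrating the criteria.)
cat > pickGeneTable.sh << 'EOF'
#!/bin/sh
# illustrative sketch: print the gene table the criteria above would pick for $1
db=$1
cnt() { hgsql $db -N -e "select count(*) from $1" 2>/dev/null; }
kg=`cnt knownGene`; ref=`cnt refGene`; ens=`cnt ensGene`; mgc=`cnt mgcGenes`
if [ -n "$kg" ]; then echo "$db: knownGene"
elif [ "${ref:-0}" -gt 10000 ]; then echo "$db: refGene"
elif [ -n "$ens" ] && [ "$db" != canFam2 ]; then echo "$db: ensGene"
elif [ -n "$mgc" ]; then echo "$db: mgcGenes"
else echo "$db: mrna (if >10000 mRNAs) else none"
fi
'EOF'
# usage (illustrative): sh pickGeneTable.sh bosTau3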
# get the genes for all genomes
# mRNAs with CDS. single select to get cds+psl, then split that up and
# create genePred
# using mrna table as genes: bosTau3, canFam2, cavPor2, equCab1, galGal3, oryCun1, tetNig1, xenTro2
cat > getGenes.csh << 'EOF'
rm -fr genes
mkdir -p genes
#set mrnaDbs = "bosTau3 canFam2 cavPor2 equCab1 galGal3 oryCun1 tetNig1 xenTro2"
# use only those with databases for now
set mrnaDbs = "bosTau3 canFam2 equCab1 galGal3 oryCun1 tetNig1 xenTro2"
foreach queryDb ($mrnaDbs)
set tmpExt = `mktemp temp.XXXXXX`
set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
set tmpMrna = ${queryDb}.mrna.${tmpExt}
set tmpCds = ${queryDb}.cds.${tmpExt}
echo $queryDb
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$queryDb > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$queryDb.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
# using knownGene for rn4 mm8 hg18
# using refGene for panTro2
# using ensGene for danRer4, fr2, gasAcu1, monDom4, rheMac2
# genePreds; (must keep only the first 10 columns for knownGene)
set geneDbs = "hg18 mm8 rn4 danRer4 panTro2 fr2 gasAcu1 monDom4 rheMac2"
foreach queryDb ($geneDbs)
if ($queryDb == "danRer4" || $queryDb == "fr2" || $queryDb == "gasAcu1" || \
$queryDb == "monDom4" || $queryDb == "rheMac2") then
set geneTbl = ensGene
else if ($queryDb == "panTro2") then
set geneTbl = refGene
else if ($queryDb == "hg18" || $queryDb == "mm8" || $queryDb == "rn4") then
set geneTbl = knownGene
endif
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from $geneTbl" ${queryDb} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
end
'EOF'
csh getGenes.csh >&! getGenes.log &
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way/frames
# leaving out cavPor2 (no db) and tetNig1 (too few gene preds)
(cat ../maf/*.maf | nice genePredToMafFrames hg18 stdin stdout bosTau3 genes/bosTau3.gp.gz canFam2 genes/canFam2.gp.gz danRer4 genes/danRer4.gp.gz fr2 genes/fr2.gp.gz galGal3 genes/galGal3.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz oryCun1 genes/oryCun1.gp.gz panTro2 genes/panTro2.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz xenTro2 genes/xenTro2.gp.gz gasAcu1 genes/gasAcu1.gp.gz monDom4 genes/monDom4.gp.gz equCab1 genes/equCab1.gp.gz | gzip > multiz28way.mafFrames.gz) >& frames.log &
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/frames
nice hgLoadMafFrames hg18 multiz28wayFrames multiz28way.mafFrames.gz >& loadFrames.log &
# from 17way:
hg18 = knownGene
rn4 = knownGene
mm8 = knownGene
panTro1 = ensGene
rheMac2 = mrna
oryCun1 = mrna
#dasNov1 =
canFam2 = mrna
#loxAfr1 =
bosTau2 = mrna
#echTel1 =
#monDom4 =
galGal2 = refGene
xenTro1 = mgcGenes
#tetNig1 =
fr1 = ensGene
danRer3 = mrna
############################################################################
# PHASTCONS FOR 28WAY (2007-04-04 kate)
# generate tree model with branch lengths using phyloFit from Adam
# Siepel's phastCons package. Input is 28way alignments of
# 4-fold degenerate sites (4d sites) determined from a
# nonredundant (non-overlapping) gene set. Elliott Margulies
# has a perl script (extract_coding_alignments.pl) that he used
# with the ENCODE alignments.
# Adam uses his msa_view tool with the --4d option.
# For first try, use Gencode Oct '05 reference set filtered
# to longest transcript, then lifted to hg18
# Compare results with hgClusterGenes and /cluster/bin/phast/refeature,
# and genePredSingleCover
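# The 4d/phyloFit pipeline used below, in outline (a sketch only; file names
# are illustrative -- the real commands further down also strip the db suffix
# from maf sequence names and restrict to ENCODE regions first):
#   # extract 4d sites using a single-coverage gene set, as 1-column alignments
#   msa_view --4d --features genes.gp -i MAF region.maf -o SS | \
#       msa_view -i SS --tuple-size 1 - > region.4d.mfa
#   # aggregate the regions and fit a REV model on the fixed species tree
#   msa_view --aggregate <species,list> mfa4d/*.4d.mfa > all-4d_align.mfa
#   phyloFit --EM --precision MED --msa-format FASTA --subst-mod REV \
#       --tree tree.commas.nh all-4d_align.mfa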
hgsql hg17 -N -e 'select * from encodeGencodeGeneKnownOct05' > gencodeKnown.gp
wc -l gencodeKnown.gp
# 2608 gencodeKnown.gp
hgsql hg17 -N -e "select count(*) from encodeGencodeGeneKnownOct05 where cdsStart <> 0 and cdsEnd <> 0"
# 1097
hgsql hg17 -N -e "select count(*) from encodeGencodeGeneKnownOct05 where cdsStartStat='cmpl' and cdsEndStat='cmpl'"
# 752
# Jim's gene uniquifier
hgClusterGenes -noProt hg17 encodeGencodeGeneKnownOct05 \
encodeGencodeGeneKnownOct05Clusters encodeGencodeGeneKnownOct05Canonical
# Got 457 clusters, from 2608 genes in 46 chromosomes
hgsql hg17 -N -e "select transcript from encodeGencodeGeneKnownOct05Canonical order by transcript" > genes.jim
# Adam's feature uniquifier
# requires cdsStart and cdsEnd in gene pred
hgsql hg17 -N -e 'select * from encodeGencodeGeneKnownOct05 where cdsStart<>0 and cdsEnd <> 0' > gencodeKnownCds.gp
wc -l gencodeKnownCds.gp
# 1097 gencodeKnownCds.gp
/cluster/bin/phast/refeature --unique gencodeKnownCds.gp > \
gencodeKnownCdsNR.gff
awk '{print $10}' gencodeKnownCdsNR.gff | sort | uniq | wc -l
# 333
/cluster/bin/phast/refeature -o genepred --unique \
gencodeKnownCds.gp | sort > gencodeKnownCdsNR.gp
wc -l gencodeKnownCdsNR.gp
# 333
awk '{print $1}' gencodeKnownCdsNR.gp | sort > genes.adam
# get intersection
comm -1 -2 genes.jim genes.adam > genes.both
wc -l genes.both
# 235
# genePredSingleCover filters but leaves extended gene pred
genePredSingleCover gencodeKnownCds.gp stdout | sort > gencodeKnownCdsNR2.gp
wc -l gencodeKnownCdsNR2.gp
# 423
awk '{print $1}' gencodeKnownCdsNR2.gp | sort > genes.scov
comm -1 -2 genes.scov genes.both > genes.all
wc -l genes.all
# 224 -- all 3 methods picked these
liftOver -genePred gencodeKnownCdsNR2.gp \
/cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz \
gencodeKnown.hg18.gp unmapped.gp
genePredCheck gencodeKnown.hg18.gp
# checked: 423 failed: 0
# all genes mapped
# consider using only intersection of above 3 methods
grep chr22 gencodeKnown.hg18.gp > gencodeKnown.hg18.chr22.gp
/cluster/bin/phast/msa_view --4d --features gencodeKnown.hg18.chr22.gp \
-i MAF ../maf/chr22__0.maf > chr22.mfa
# extract ENCODE regions from MAF's
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/4d
hgsql hg18 -N -e \
"select chrom, chromStart, chromEnd, name from encodeRegions" \
> encodeRegions.bed
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way/4d
cat > encodeMafs.csh << 'EOF'
mkdir -p encodeMafs
set chroms = `awk '{print $1}' encodeRegions.bed | sort | uniq`
foreach c ($chroms)
echo $c
# needed till mafsInRegion is fixed to handle split maf files
cat ../maf/${c}__?.maf > $c.maf
awk -v CHR=$c '$1 == CHR {print}' encodeRegions.bed > regions.bed
mafsInRegion regions.bed -outDir encodeMafs/ $c.maf
end
'EOF'
csh encodeMafs.csh >&! encodeMafs.log &
# try it out on a few regions
set r = "ENm001"
set r = "ENr231"
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' encodeMafs/$r.maf > $r.clean.maf
# generate ss file
/cluster/bin/phast/msa_view --4d --features gencodeKnown.hg18.gp \
-i MAF $r.clean.maf -o SS > $r.4d.3.ss
/cluster/bin/phast/msa_view -i SS -o FASTA $r.4d.3.ss > $r.4d.3.mfa
/cluster/bin/phast/msa_view -i SS --tuple-size 1 $r.4d.3.ss -o SS > $r.4d.1.ss
/cluster/bin/phast/msa_view -i SS -o FASTA $r.4d.1.ss > $r.4d.1.mfa
# now on all regions
cat > encode4d.csh << 'EOF'
mkdir mfa4d
foreach f (encodeMafs/*.maf)
set r = $f:t:r
echo $r
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $f > clean.maf
/cluster/bin/phast/msa_view --4d --features gencodeKnown.hg18.gp \
-i MAF clean.maf -o SS | \
/cluster/bin/phast/msa_view -i SS --tuple-size 1 - > mfa4d/$r.4d.mfa
# remove empties to satisfy msa_view --aggregate
if (-z mfa4d/$r.4d.mfa) then
rm mfa4d/$r.4d.mfa
endif
end
'EOF'
csh encode4d.csh >&! encode4d.log &
set species1 = `sed 's/$/,/g' ../species.lst`
set species = `echo $species1 | sed -e 's/ //g' -e 's/,$//'`
# From Elliott's script:
#/cluster/bin/phast/msa_view --aggregate $species EN*.mfa | \
#sed s/"> "/">"/ > some-4d_align.mfa
/cluster/bin/phast/msa_view --aggregate $species mfa4d/EN*.4d.mfa | \
sed s/"> "/">"/ > all-4d_align.mfa
# tweak input tree -- remove common names, include commas
sed 's/[a-z][a-z]*_//g' ../tree/tree.web.commas.nh > tree.commas.nh
# From Elliott's script with Adam's mods (use --EM, MED)
/cluster/bin/phast/phyloFit --EM --precision MED \
--msa-format FASTA --subst-mod REV \
--tree tree.commas.nh all-4d_align.mfa
grep TREE phyloFit.mod | sed 's/TREE\:\ //' > tree_4d.28way.nh
/cluster/bin/phast/tree_doctor --dissect tree_4d.28way.nh | \
awk '$1 == "dparent" {x += $3} END {print x}'
# 9.0516
# extract species distances
/cluster/bin/phast/all_dists tree_4d.28way.nh > 28way.distances.txt
grep hg18 28way.distances.txt | sort -k3,3n | \
awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
# get chain stats ordered by distance
awk '{print $2}' distances.txt > species.byDistance
csh ../getChainStats.csh species.byDistance >&! species.chainStats
# spruce up names for tree drawing
/cluster/bin/phast/tree_doctor \
--rename="hg18 -> human ; panTro2 -> chimp ; rheMac2 -> macaque ; otoGar1 -> bushbaby ; tupBel1 -> tree_shrew ; rn4 -> rat ; mm8 -> mouse ; cavPor2 -> guinea_pig ; oryCun1 -> rabbit ; sorAra1 -> shrew ; eriEur1 -> hedgehog ; canFam2 -> dog ; felCat3 -> cat ; equCab1 -> horse ; bosTau3 -> cow ; dasNov1 -> armadillo ; loxAfr1 -> elephant ; echTel1 -> tenrec ; monDom4 -> opossum ; ornAna1 -> platypus ; galGal3 -> chicken ; anoCar1 -> lizard ; xenTro2 -> frog ; tetNig1 -> tetraodon ; fr2 -> fugu ; gasAcu1 -> stickleback ; oryLat1 -> medaka ; danRer4 -> zebrafish" \
tree_4d.28way.nh > tree_4d.28way.common.nh
# compare to Elliott's latest ENCODE tree, pruned to match
/cluster/bin/phast/tree_doctor \
--prune-all-but=human,chimp,macaque,galago,rat,mouse,guinea_pig,rabbit,cow,cat,dog,hedgehog,shrew,armadillo,elephant,tenrec,monodelphis,platypus,chicken,xenopus \
--rename="xenopus -> frog ; galago -> bushbaby; monodelphis -> opossum"\
encode2007.nh > encode2007.pruned.nh
# my 4d tree with only species in the pruned ENCODE tree
/cluster/bin/phast/tree_doctor \
--prune-all-but=human,chimp,macaque,bushbaby,rat,mouse,guinea_pig,rabbit,cow,cat,dog,hedgehog,shrew,armadillo,elephant,tenrec,opossum,platypus,chicken,frog \
tree_4d.28way.common.nh > tree_4d.20way.common.nh
# Create chrom mafs from split mafs (do this earlier next time)
ssh kki
cd /cluster/data/hg18/bed/multiz28way
mkdir chromMaf
mkdir run.merge
cd run.merge
cat > doMerge.csh << 'EOF'
#!/bin/csh -ef
set c = $1
set cmaf = ../chromMaf/${c}.maf
# NOTE: need to change mafFilter to retain (and uniquify) comments
# begin with ##maf header
head -1 ../maf/${c}__0.maf > $cmaf
grep -h '# ' ../maf/${c}__?.maf | sed 's/\/scratch\/tmp.* //' | sort | uniq \
>> $cmaf
# don't filter out blocks with alignment this time -- might be needed
# for symmetry with irows version, or for analysis. Check on this.
mafFilter -minRow=1 ../maf/${c}__?.maf >> $cmaf
'EOF'
# << happy emacs
chmod a+x doMerge.csh
cat > spec << 'EOF'
#LOOP
./doMerge.csh $(root1) {check out line+ ../chromMaf/$(root1).maf}
#ENDLOOP
'EOF'
# << happy emacs
awk '{print $1}' /cluster/data/hg18/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para create jobList
# 49 jobs
para try
para check
para push
# Split chromosome MAF's into windows and use to generate
# "sufficient statistics" (ss) files for phastCons input
# large mem jobs so use mini-cluster
ssh kki
cd /cluster/data/hg18/bed/multiz28way
mkdir cons
cd cons
# Create tree model for phastCons
# Adjust model file base composition background and rate matrix to be
# representative of whole-genome (.41 -- as was done for ENCODE)
# using utility, 'modFreqs' from Adam (5/07)
# NOTE: updated all phast source and rebuilt to phast.2007-05-04
set gc = `grep BACKGROUND /cluster/data/hg18/bed/multiz17way/cons/elliotsEncode.mod | \
awk '{printf "%0.3f\n", $3 + $4;}'`
echo $gc
# .41
/cluster/bin/phast.2007-05-04/modFreqs ../4d/phyloFit.mod $gc > 28way.mod
# split 28way mafs into 10M chunks and generate sufficient statistics
# files for phastCons
mkdir run.split
cd run.split
set WINDOWS = /san/sanvol1/scratch/hg18/multiz28way/cons/ss
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/hg18/bed/multiz28way/chromMaf
set WINDOWS = /san/sanvol1/scratch/hg18/multiz28way/cons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
# need to truncate odd-ball scaffold/chrom names that include dots
# as phastCons utils can't handle them
set TMP = /scratch/tmp/$c.clean.maf.$$
#perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $MAFS/$c.maf > $TMP
perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $TMP
/cluster/bin/phast/$MACHTYPE/msa_split $TMP -i MAF \
-M /cluster/bluearc/hg18/chrom/$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
rm -f $TMP
echo "Done" >> $c.done
'EOF'
# << happy emacs
chmod +x doSplit.csh
rm -f jobList
foreach f (../../chromMaf/*.maf)
set c = $f:t:r
echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
end
para create jobList
# 49 jobs
para try
para check
para push
# completed shorter jobs in a few hours, but others failed on memory.
# redo on kolossus -- 14 hours!
# NOTE: next time try harder working with split mafs!
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh pk
cd /cluster/data/hg18/bed/multiz28way/cons
mkdir run.cons
cd run.cons
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.2007-05-04
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg18/multiz28way/cons
cp -p $grp/$grp.mod $grp/$grp.non-inf .
cp -p $san/ss/$c/$f.ss ../../$grp/$grp.mod ../../$grp/$grp.non-inf $tmp
pushd $tmp > /dev/null
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 1
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'EOF'
# << happy emacs
chmod a+x doPhast.csh
# Create parasol batch and run it
pushd /san/sanvol1/scratch/hg18/multiz28way/cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg18/bed/multiz28way/cons/run.cons/in.list
popd
# run for all species
cd ..
mkdir -p all run.cons/all
cd all
cp ../28way.mod all.mod
# non-informative option for closest relatives (exclude regions with only these aligning),
# and till Adam fixes the problem, also exclude all species removed from tree (below)
echo "panTro2,rheMac2" > all.non-inf
cd ../run.cons
# Create template file
# root1 == chrom name, file1 == ss file name without .ss suffix;
# the three numbers are expected-length, target-coverage and rho for phastCons
cat > template << 'EOF'
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31
#ENDLOOP
'EOF'
# << happy emacs
cd all
gensub2 ../in.list single ../template jobList
para create jobList
# 337 jobs
para try
para check
para push
# NOTE: These jobs regularly crash (too quick ?), and have to be repushed.
# Also, a few hang, and need to be stopped and restarted.
# The whole batch runs so fast, this isn't a problem
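# Recovery for such a batch is just a repush (hung jobs need a stop first);
# a minimal sketch:
para stop
para push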
# CPU time in finished jobs: 34253s 570.89m 9.51h 0.40d 0.001 y
# IO & Wait Time: 61148s 1019.13m 16.99h 0.71d 0.002 y
# Average job time: 283s 4.72m 0.08h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 496s 8.27m 0.14h 0.01d
# Submission to last job: 995s 16.58m 0.28h 0.01d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/hg18/multiz28way/cons/all
cat bed/*/chr*.bed | ~/bin/${MACHTYPE}/bedSort stdin stdout | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg18/bed/multiz28way/cons/all
# load into database
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/cons/all
hgLoadBed hg18 phastConsElements28way mostConserved.bed
# Loaded 2183600 elements
# compare with previous tracks
hgsql hg18 -s -N -e "select count(*) from phastConsElements17way"
# 2229902
hgsql hg18 -s -N -e "select count(*) from phastConsElements17way where chrom='chr7'"
# 114703
# Try for 5% overall cov, and 70% CDS cov
featureBits hg18 -enrichment refGene:cds phastConsElements28way >& fb.out &
# Compare to chr7 for 17way -- chr7 is .7% lower than whole genome,
# so aim for 4.3% on chr7
featureBits hg18 -chrom=chr7 -enrichment refGene:cds phastConsElements28way
# USED FOR 17WAY
# too little coverage
# 14 .008 .28
# refGene:cds 0.911%, phastConsElements28way 3.551%, both 0.653%, cover 71.74%, enrich 20.20x
# 14 .1 .28
# refGene:cds 0.911%, phastConsElements28way 3.954%, both 0.648%, cover 71.12%, enrich 17.98x
# 12 .1 .28
# refGene:cds 0.911%, phastConsElements28way 3.914%, both 0.644%, cover 70.74%, enrich 18.08x
# 14 .2 .3
# 234653 elements
# refGene:cds 0.911%, phastConsElements28way 4.423%, both 0.659%, cover 72.34%, enrich 16.36x
# 13 .2 .28
# refGene:cds 0.911%, phastConsElements28way 4.266%, both 0.644%, cover 70.73%, enrich 16.58x
# USE THIS ONE
# minimum change to params to achieve coverage
# 14 .2 .28
# 249585 elements
# refGene:cds 0.911%, phastConsElements28way 4.269%, both 0.646%, cover 70.92%, enrich 16.61x
# 15 .2 .28
# refGene:cds 0.911%, phastConsElements28way 4.271%, both 0.647%, cover 71.08%, enrich 16.64x
# 14 .3 .28
# refGene:cds 0.911%, phastConsElements28way 4.644%, both 0.645%, cover 70.89%, enrich 15.27x
# 14 .35 .28
# refGene:cds 0.911%, phastConsElements28way 4.879%, both 0.646%, cover 70.90%, enrich 14.53x
# 14 .15 .3
# 207188 elements
# refGene:cds 0.912%, phastConsElements28way 4.260%, both 0.660%, cover 72.34%, enrich 16.98x
# 16 .15 .3
# 193531 elements
# refGene:cds 0.912%, phastConsElements28way 4.289%, both 0.663%, cover 72.66%, enrich 16.94x
# 20 .15 .3
# 173668 elements
# refGene:cds 0.912%, phastConsElements28way 4.321%, both 0.667%, cover 73.11%, enrich 16.92x
# 24 .15 .3
# 159646 elements
# refGene:cds 0.912%, phastConsElements28way 4.338%, both 0.670%, cover 73.40%, enrich 16.92x
# 30 .15 .3
# 144399 elements
# refGene:cds 0.912%, phastConsElements28way 4.349%, both 0.673%, cover 73.72%, enrich 16.95x
# 40 .15 .3
# 128087 elements
# refGene:cds 0.912%, phastConsElements28way 4.353%, both 0.676%, cover 74.09%, enrich 17.02x
# 50 .15 .3
# 117338 elements
# refGene:cds 0.912%, phastConsElements28way 4.352%, both 0.678%, cover 74.32%, enrich 17.08x
# 50 .1 .3
# 116930 elements
# refGene:cds 0.912%, phastConsElements28way 4.347%, both 0.678%, cover 74.32%, enrich 17.10x
# 50 .05 .3
# 93391 elements
# refGene:cds 0.912%, phastConsElements28way 4.193%, both 0.680%, cover 74.57%, enrich 17.78x
# 50 .07 .3
# 99358
# refGene:cds 0.912%, phastConsElements28way 4.231%, both 0.680%, cover 74.51%, enrich 17.61x
# 45 .07 .3
# 102864 elements
# refGene:cds 0.912%, phastConsElements28way 4.227%, both 0.679%, cover 74.41%, enrich 17.60x
# USE THIS ONE
# matches element count for 17way
# 45 .1 .3
# 110836 elements
# refGene:cds 0.912%, phastConsElements28way 4.277%, both 0.678%, cover 74.33%, enrich 17.38x
# 75 .1 .3
# Try for really long elements
# 93524 elements
# refGene:cds 0.912%, phastConsElements28way 4.279%, both 0.682%, cover 74.73%, enrich 17.47x
# 100 .1 .3
# 85757 elements
# refGene:cds 0.912%, phastConsElements28way 4.270%, both 0.683%, cover 74.90%, enrich 17.54
# 71218 elements
# 200 .1 .3
# refGene:cds 0.912%, phastConsElements28way 4.225%, both 0.686%, cover 75.16%, enrich 17.79x
# 200 .12 .3
# refGene:cds 0.912%, phastConsElements28way 4.241%, both 0.686%, cover 75.13%, enrich 17.72x
# USE THIS ONE
# for really long elements
# 200 .15 .3
# 75659
# refGene:cds 0.912%, phastConsElements28way 4.261%, both 0.685%, cover 75.11%, enrich 17.63x
featureBits hg18 -chrom=chr7 -enrichment refGene:cds phastConsElements17way
# refGene:cds 0.911%, phastConsElements17way 4.838%, both 0.639%, cover 70.22%, enrich 14.51x
featureBits hg18 -enrichment refGene:cds phastConsElements17way
# refGene:cds 1.072%, phastConsElements17way 5.510%, both 0.759%, cover 70.83%, enrich 12.86x
# compare element sizes to other runs:
# e.g. select min(chromEnd-chromStart) from encodeTbaPhastConsEl
# hg17 ENCODE TBA phastCons: min=1, max=1961
# hg17 ENCODE TBA gerp: min=3, max=1426
# hg18 17way: min=1, max=12590 #el on chr7: 114703
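# The same size comparison can be made for the new table; a minimal sketch
# (hgsql usage as above; output columns are min, max, and element count):
hgsql hg18 -s -N \
    -e "select min(chromEnd-chromStart), max(chromEnd-chromStart), count(*) from phastConsElements28way where chrom='chr7'"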
# 45 .3 .31
# featureBits hg18 -enrichment refGene:cds phastConsElements28way
# refGene:cds 1.095%, phastConsElements28way 4.920%, both 0.827%, cover 75.48%, enrich 15.34x
# 2906254 elements
# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg18/multiz28way/cons/all
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cat > listPp.csh << 'EOF'
foreach d (pp/chr*/)
ls $d/*.pp | sort -n -t\. -k2
end
'EOF'
csh listPp.csh | xargs cat | \
nice wigEncode stdin phastCons28way.wig phastCons28way.wib
# about 23 minutes for above
cp -p phastCons28way.wi? /cluster/data/hg18/bed/multiz28way/cons/all
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/cons/all
ln -s /cluster/data/hg18/bed/multiz28way/cons/all/phastCons28way.wib \
/gbdb/hg18/multiz28way
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \
phastCons28way phastCons28way.wig
# ~ 3 minute load
## Run phastCons on subgroup (placentals)
ssh pk
cd /cluster/data/hg18/bed/multiz28way/cons
# create pruned tree
set species = `cat ../species.lst`
echo $species | sed 's/ /,/g'
#anoCar1,bosTau3,canFam2,cavPor2,danRer4,dasNov1,echTel1,equCab1,eriEur1,felCat3,fr2,galGal3,gasAcu1,hg18,loxAfr1,mm8,monDom4,ornAna1,oryCun1,oryLat1,otoGar1,panTro2,rheMac2,rn4,sorAra1,tetNig1,tupBel1,xenTro2
# setup placental-only run
mkdir placental run.cons/placental
cd placental
# placental-only: exclude from phastCons: 10 non-placentals
# (platypus, opossum, 5 fish, chicken, lizard, frog)
/cluster/bin/phast.new/tree_doctor ../28way.mod \
--prune-all-but=bosTau3,canFam2,cavPor2,dasNov1,echTel1,equCab1,eriEur1,felCat3,hg18,loxAfr1,mm8,oryCun1,otoGar1,panTro2,rheMac2,rn4,sorAra1,tupBel1 \
> placental.mod
echo "panTro2,rheMac2,anoCar1,danRer4,fr2,galGal3,gasAcu1,monDom4,ornAna1,oryLat1,tetNig1,xenTro2" \
> placental.non-inf
cd ../run.cons/placental
gensub2 ../in.list single ../template jobList
para create jobList
para try
para check
para push
# ~30 minutes on pk
# NOTE: sometimes jobs crash or hang due to access problems on SAN
# para stop then push to recover
cd ../../
mkdir hqAll run.cons/hqAll
cd hqAll
# high-qual only: exclude 10 low-qual mammals
/cluster/bin/phast.new/tree_doctor 28way.mod \
--prune-all-but=anoCar1,bosTau3,canFam2,danRer4,equCab1,fr2,galGal3,gasAcu1,hg18,mm8,monDom4,ornAna1,oryLat1,panTro2,rheMac2,rn4,tetNig1,xenTro2 \
> hqAll.mod
echo "panTro2,rheMac2,cavPor2,dasNov1,echTel1,loxAfr1,eriEur1,felCat3,oryCun1,otoGar1,sorAra1,tupBel1" \
> hqAll.non-inf
cd ../run.cons/hqAll
gensub2 ../in.list single ../template jobList
para create jobList
para try
para check
para push
cd ../../
mkdir hqPlacental run.cons/hqPlacental
cd hqPlacental
# high-qual placental only: exclude 10 non-placentals and 10 low-qual mammals,
/cluster/bin/phast.new/tree_doctor ../28way.mod \
--prune-all-but=bosTau3,canFam2,equCab1,hg18,mm8,panTro2,rheMac2,rn4 \
> hqPlacental.mod
echo "panTro2,rheMac2,cavPor2,dasNov1,echTel1,loxAfr1,eriEur1,felCat3,oryCun1,otoGar1,sorAra1,tupBel1,anoCar1,danRer4,fr2,galGal3,gasAcu1,monDom4,ornAna1,oryLat1,tetNig1,xenTro2" \
> hqPlacental.non-inf
cd ../run.cons/hqPlacental
gensub2 ../in.list single ../template jobList
para create jobList
para try
para check
para push
# add placental elements to Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/hg18/multiz28way/cons/placental
cat bed/*/chr*.bed | ~/bin/${MACHTYPE}/bedSort stdin stdout | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg18/bed/multiz28way/cons/placental
# load into database
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/cons/placental
hgLoadBed hg18 phastConsElements28wayPlacMammal mostConserved.bed
featureBits hg18 -enrichment refGene:cds phastConsElements28wayPlacMammal >&! ../run.cons/placental/fb.out
# experiments
# USING THIS ONE: min change from 17way to achieve coverage
# 14 .2 .28
# 169516 elements
# 169518
# refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Short 3.437%, both 0.615%, cover 67.40%, enrich 19.61x
# refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Short 3.437%, both 0.615%, cover 67.40%, enrich 19.61x
# USING THIS ONE: vertebrate elements have similar count to 17way ("medium")
# 45 .1 .3
# 76715 elements
# 76718 elements
# refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Med 3.312%, both 0.642%, cover 70.33%, enrich 21.24x
#refGene:cds 0.912%, phastConsElements28wayPlacMammalChr7Med 3.312%, both 0.642%, cover 70.33%, enrich 21.24x
# Create merged posterior probability file and wiggle track data files
# pk is currently closer to the san than any other machine
ssh pk
cd /san/sanvol1/scratch/hg18/multiz28way/cons/placental
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cat > listPp.csh << 'EOF'
foreach d (pp/chr*/)
ls $d/*.pp | sort -n -t\. -k2
end
'EOF'
csh listPp.csh | xargs cat | \
nice wigEncode stdin \
phastCons28wayPlacMammal.wig phastCons28wayPlacMammal.wib
# about 23 minutes for above
cp -p phastCons28wayPlacMammal.wi? /cluster/data/hg18/bed/multiz28way/cons/placental
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/cons/placental
ln -s \
/cluster/data/hg18/bed/multiz28way/cons/placental/phastCons28wayPlacMammal.wib \
/gbdb/hg18/multiz28way
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \
phastCons28wayPlacMammal phastCons28wayPlacMammal.wig
# WARNING: Exceeded chr4_random size 842649 > 842648. dropping 2 data point(s)
# NOTE: weird msa_split on this chrom -- sent inquiry to Adam about this
# ~ 3 minute load
########################################################################
# phyloP conservation
# split SS files into 100K chunks (5 min./job)
ssh kki
cd /cluster/data/hg18/bed/multiz28way/cons/
mkdir run.phyloP.split
cd run.phyloP.split
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set c = $1
set san = /san/sanvol1/scratch/hg18/multiz28way
set in = $san/cons/ss
set out = $san/phyloP/ss
set PHASTBIN = /cluster/bin/phast.2007-05-04
@ i=0
foreach f (`ls $in/$c/*.ss | sort -n -t\. -k2`)
@ i++
mkdir -p $out/$c/$i
$PHASTBIN/msa_split $f -i SS -o SS \
-r $out/$c/$i/$c.$i -w 100000,0 -I 1000 -B 5000
end
echo "Done" >> $out/$c.done
'EOF'
# << happy emacs
chmod +x doSplit.csh
set san = /san/sanvol1/scratch/hg18/multiz28way
set JOBS = /cluster/data/hg18/bed/multiz28way/cons/run.phyloP.split/jobList
rm -f $JOBS
foreach c (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`)
echo "doSplit.csh $c {check out line+ $san/phyloP/ss/$c.done}" >> $JOBS
end
para create jobList
# 49 jobs
para try
para check
para push
para time
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 8827s 147.12m 2.45h 0.10d 0.000 y
# IO & Wait Time: 6837s 113.95m 1.90h 0.08d 0.000 y
# Average job time: 320s 5.33m 0.09h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1343s 22.38m 0.37h 0.02d
# Submission to last job: 1528s 25.47m 0.42h 0.02d
########################################################################
# phyloP scoring method experiments on chr7 (2008-11-11 kate)
ssh pk
cd /cluster/data/hg18/bed/multiz28way/cons
mkdir -p run.phyloPMethod
cd run.phyloPMethod
cat > doPhyloP.csh << 'EOF'
set method = $1
set f = $2
set out = $3
set c = $f:r:r
set n = $f:r:e
set tmp = /scratch/tmp/$f
mkdir -p $tmp
cp -p /san/sanvol1/scratch/hg18/multiz28way/phyloP/ss/$c/$n/$f.ss ../tree.mod $tmp
pushd $tmp > /dev/null
# Built phast from CornellCVS on 11/11/08 in /cluster/bin/phast.build.
# Symlinked the bin to /cluster/bin/phast.2008
set PHASTBIN = /cluster/bin/phast.2008-11-13
# PHAST version is 0.9.9.8b
$PHASTBIN/phyloP --method $method --mode CONACC --wig-scores --chrom $c \
-i SS tree.mod $f.ss > $f.wig
popd > /dev/null
mkdir -p $out:h
mv $tmp/$f.wig $out
rm -fr $tmp
'EOF'
# Create list of chunks (just chr7 for now)
pushd /san/sanvol1/scratch/hg18/multiz28way/phyloP/ss
ls chr7/*/chr7.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \
/cluster/data/hg18/bed/multiz28way/cons/run.phyloPMethod/in.list
popd
# setup run
mkdir -p all
cd all
cp ../../28way.mod tree.mod
mkdir -p SCORE
cd SCORE
# Create template file
# file1 == $chr/$chunk/file name without .ss suffix
cat > template << 'EOF'
#LOOP
csh ../../doPhyloP.csh SCORE $(file1) {check out line+ /san/sanvol1/scratch/hg18/multiz28way/phyloPMethod/all/SCORE/$(path1).wig}
#ENDLOOP
'EOF'
# << happy emacs
gensub2 ../../in.list single template jobList
para create jobList
para try
para check
para push
# Completed: 1552 of 1552 jobs
# CPU time in finished jobs: 15411s 256.84m 4.28h 0.18d 0.000 y
# IO & Wait Time: 7678s 127.97m 2.13h 0.09d 0.000 y
# Average job time: 15s 0.25m 0.00h 0.00d
# Longest finished job: 29s 0.48m 0.01h 0.00d
# Submission to last job: 236s 3.93m 0.07h 0.00d
# Estimated complete: 0s 0.00m 0.00h 0.00d
cd ..
mkdir -p LRT
cd LRT
# Create template file
# file1 == $chr/$chunk/file name without .ss suffix
cat > template << 'EOF'
#LOOP
csh ../../doPhyloP.csh LRT $(file1) {check out line+ /san/sanvol1/scratch/hg18/multiz28way/phyloPMethod/all/LRT/$(path1).wig}
#ENDLOOP
'EOF'
# << happy emacs
gensub2 ../../in.list single template jobList
para create jobList
para try
para check
para push
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/hg18/multiz28way/phyloP/all
cat > listPp.csh << 'EOF'
foreach c (`ls -d chr*`)
foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`)
ls -1 $d/*.wig | sort -n -t\. -k3
end
end
'EOF'
csh listPp.csh | xargs cat | \
nice wigEncode stdin phyloP28way.wig phyloP28way.wib
mkdir /cluster/data/hg18/bed/multiz28way/cons/phyloP/all
cp -p phyloP28way.wi? /cluster/data/hg18/bed/multiz28way/cons/phyloP/all
# setup placental run
mkdir -p placental
cd placental
cp ../../placental.mod tree.mod
# Create template file
# file1 == $chr/$chunk/file name without .ss suffix
cat > template << 'EOF'
#LOOP
csh ../doPhyloP.csh $(file1) {check out line+ /san/sanvol1/scratch/hg18/multiz28way/phyloP/placental/$(path1).wig}
#ENDLOOP
'EOF'
# << happy emacs
gensub2 ../in.list single template jobList
para create jobList
para try
para check
para push
#CPU time in finished jobs: 1934553s 32242.55m 537.38h 22.39d 0.061 y
#IO & Wait Time: 82007s 1366.78m 22.78h 0.95d 0.003 y
#Average job time: 70s 1.16m 0.02h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 147s 2.45m 0.04h 0.00d
#Submission to last job: 37642s 627.37m 10.46h 0.44d
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
ssh pk
cd /san/sanvol1/scratch/hg18/multiz28way/phyloP/placental
# check for clean dir here -- chr* will match garbage if it's there
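# Quick sanity check before assembling the file list (sketch): anything listed
# here that is not a chr* directory would get swept into the wigEncode input.
ls | grep -v '^chr'
# (should print nothing)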
cat > listPp.csh << 'EOF'
foreach c (`ls -d chr*`)
foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`)
ls -1 $d/*.wig | sort -n -t\. -k3
end
end
'EOF'
csh listPp.csh | xargs cat | \
nice wigEncode stdin phyloP28wayPlacMammal.wig phyloP28wayPlacMammal.wib
mkdir /cluster/data/hg18/bed/multiz28way/cons/phyloP/placental
cp -p phyloP28wayPlacMammal.wi? /cluster/data/hg18/bed/multiz28way/cons/phyloP/placental
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/cons/phyloP/all
ln -s \
/cluster/data/hg18/bed/multiz28way/cons/phyloP/all/phyloP28way.wib \
/gbdb/hg18/multiz28way/phyloP28way.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \
phyloP28way phyloP28way.wig
# WARNING: Exceeded chr4_random size 842649 > 842648. dropping 2 data point(s)
cd ../placental
ln -s \
/cluster/data/hg18/bed/multiz28way/cons/phyloP/placental/phyloP28wayPlacMammal.wib \
/gbdb/hg18/multiz28way/phyloP28wayPlacMammal.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz28way hg18 \
phyloP28wayPlacMammal phyloP28wayPlacMammal.wig
hgWiggle phyloP28wayChr7 | textHistogram -col=2 -real -skip=7 -binSize=.2 stdin
# 0.000000 **************** 26649187
# 0.200000 ************************************************************ 101774235
# 0.400000 ********** 16325655
# 0.600000 *** 4331032
# 0.800000 * 1029490
# 1.000000 0
# 1.200000 456666
# 1.400000 0
# 1.600000 240876
# 1.800000 0
# 2.000000 246969
# 2.200000 0
# 2.400000 0
# 2.600000 0
# 2.800000 134764
cd ../placental
hgWiggle phyloP28wayPlacMammalChr7 | textHistogram -col=2 -real -skip=7 -binSize=.2 stdin
cd ../../all
hgWiggle phastCons28wayChr7Short | textHistogram -col=2 -real -skip=7 -binSize=.1 stdin
# 0.000000 ************************************************************ 128445730
# 0.100000 **** 7648620
# 0.200000 ** 3473415
# 0.300000 * 1986801
# 0.400000 * 1399849
# 0.500000 * 1096292
# 0.600000 912539
# 0.700000 893991
# 0.800000 1008630
# 0.900000 * 2940535
# 1.000000 * 1383115
############################################################################
# PhyloP experiments with new scoring methods: LRT and SCORE, implemented in 2008
# Using new PHAST package (rebuilt from cornellCVS)
# chr7-only
# 2008-11-11 kate
############################################################################
# DOWNLOADS FOR 28WAY (2007-05-30 kate)
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way
cat > downloads.csh << 'EOF'
date
set dir = /cluster/data/hg18/bed/multiz28way
mkdir -p mafDownloads
cd $dir/mafDownloads
foreach f (../maf/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
cd $dir
mkdir -p phastConsDownloads/vertebrate phastConsDownloads/placental
cd /san/sanvol1/scratch/hg18/multiz28way/cons
foreach chr (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`)
echo $chr
cat `ls -1 all/pp/$chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> $dir/phastConsDownloads/vertebrate/$chr.pp.gz
cat `ls -1 placental/pp/$chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> $dir/phastConsDownloads/placental/$chr.pp.gz
end
cd /cluster/data/hg18/bed/multiz28way/phastConsDownloads/vertebrate
md5sum *.gz > md5sum.txt
cd ../placental
md5sum *.gz > md5sum.txt
date
'EOF'
csh downloads.csh >&! downloads.log &
# << happy emacs
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/hg18/phastCons28way
mkdir -p $dir/vertebrate $dir/placental
ln -s /cluster/data/hg18/bed/multiz28way/phastConsDownloads/vertebrate/{*.gz,md5sum.txt} $dir/vertebrate
ln -s /cluster/data/hg18/bed/multiz28way/phastConsDownloads/placental/{*.gz,md5sum.txt} $dir/placental
cp /usr/local/apache/htdocs/goldenPath/hg18/phastCons17way/README.txt $dir
# edit this file to reflect the latest releases used.
vi $dir/README.txt
set dir = /usr/local/apache/htdocs/goldenPath/hg18/multiz28way/maf
mkdir $dir
ln -s /cluster/data/hg18/bed/multiz28way/mafDownloads/{*.gz,md5sum.txt} $dir
# upstream mafs (mafFrags takes a while)
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way/mafDownloads
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits hg18 refGene:upstream:$i -fa=/dev/null -bed=up.bad
cat up.bad|sed -e "s/_up_${i}_/\t/" >up.bad2
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, $4, 0, $6)}' up.bad2 > up.bed
rm up.bad up.bad2
nice mafFrags hg18 multiz28way up.bed upstream$i.maf \
-orgs=/cluster/data/hg18/bed/multiz28way/species.lst
rm up.bed
end
date
'EOF'
# << happy emacs
ssh kkstore02
cd /cluster/data/hg18/bed/multiz28way/mafDownloads
csh mafFrags.csh > mafFrags.log &
nice gzip up*.maf
md5sum up*.gz >> md5sum.txt
ssh hgwdev
cd /cluster/data/hg18/bed/multiz28way
# link filtered nets and chains to downloads area (doRecipBest.pl could
# be changed for this)
# Species where syntenic net was used
foreach db (panTro2 rheMac2 equCab1 canFam2 bosTau3 mm8 rn4 monDom4)
echo $db
set cd = /cluster/data/hg18/bed/blastz.$db/axtChain
cd $cd
set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
set f = hg18.$db.syn.net.gz
if (! -e $f) then
netFilter -syn hg18.$db.net.gz > $f
endif
set d = /usr/local/apache/htdocs/goldenPath/hg18/vs$Db
ln -s $cd/$f $d
nice md5sum $f >> $d/md5sum.txt
end
# Create downloads dir for new species without genome databases
#foreach db (tupBel1 cavPor2 eriEur1 sorAra1)
# NOTE: Keeping these only on genome-test for now.
foreach db (tupBel1 cavPor2 eriEur1 sorAra1)
echo $db
set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
set d = /usr/local/apache/htdocs/goldenPath/hg18
mkdir -p $d/vs$Db
cp $d/vsOryCun1/README.txt $d/vs$Db
set bd = /cluster/data/hg18/bed/blastz.$db
cd $bd/axtChain
set f = hg18.$db.net.gz
if (! -e $f) then
cat net/*.net | gzip -c > $f
endif
nice md5sum hg18.$db.{all.chain,net}.gz > md5sum.txt
cd ..
nice md5sum axtNet/*.gz >> axtChain/md5sum.txt
ln -s $bd/axtChain/hg18.$db.{all.chain,net}.gz $d/vs$Db
ln -s $bd/axtChain/md5sum.txt $d/vs$Db
ln -s $bd/axtNet $d/vs$Db
end
# EDIT README's for the species
# Post reciprocal best nets
# NOTE: Keeping these only on genome-test for now.
cat > downloads4.csh << 'EOF'
foreach db (felCat3 otoGar1 loxAfr1 oryCun1 echTel1 dasNov1 \
tupBel1 cavPor2 eriEur1 sorAra1)
echo $db
set Db = `echo $db | perl -wpe 's/(.*)/\u$1/'`
set d = /usr/local/apache/htdocs/goldenPath/hg18/vs$Db
set cd = /cluster/data/hg18/bed/blastz.$db/axtChain
ln -s $cd/hg18.$db.rbest.{chain,net}.gz $d
cd $d
md5sum hg18.$db.rbest.{chain,net}.gz >> md5sum.txt
end
'EOF'
# EDIT README's to include reciprocal best chains & nets
############################################################################
# 28-way PhyloP downloads
# 2008-10-21 kate
ssh kolossus
cd /san/sanvol1/scratch/hg18/multiz28way/phyloP
cat > merge.csh << 'EOF'
set out = $1
rm -f *.lst
foreach c (`ls -d chr*`)
echo $c
touch $c.lst
foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`)
ls -1 $d/*.wig | sort -n -t\. -k3 >> $c.lst
end
# concatenate the chunks once the whole per-chrom file list is assembled
xargs < $c.lst cat > $out/$c.wigFix
end
'EOF'
# all species
cd all
csh ../merge.csh /cluster/data/hg18/bed/multiz28way/cons/phyloP/all > merge.log
cd ../placental
csh ../merge.csh /cluster/data/hg18/bed/multiz28way/cons/phyloP/placental > merge.log
cd /cluster/data/hg18/bed/multiz28way/cons/phyloP
# post to downloads
cd /usr/local/apache/htdocs/goldenPath/hg18
mkdir phyloP28way
cd phyloP28way
ln -s /cluster/data/hg18/bed/multiz28way/cons/phyloP/{all,placental} .
cd all
nice gzip *.wigFix
cd ../placental
nice gzip *.wigFix
############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-03-02 markd)
# see hg17.txt for build temporary ccds database for CCDS.20070228
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg18 ccdsInfo ccdsGene
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg18 -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords hg18 -verbose=2 ccdsGene
# update all.joiner to include hg18 in ccdsDb
joinerCheck -database=hg18 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=hg18 ccdsGene mgcGenes ccdsMgcMap
# load trackDb
cd kent/src/hg/makeDb/trackDb
make alpha
# check in browser
# request push of
# ccdsGene
# ccdsInfo
# ccdsKgMap
# ccdsMgcMap
# << emacs
#########################################################################
# RECIPROCAL BEST CHIMP PANTRO2 (2007-03-02 kate)
# Requested by Daryl
cd /cluster/data/hg18/bed/blastz.panTro2
doRecipBest.pl hg18 panTro2 >&! rbest.log &
#########################################################################
# EPONINE-TSS (TRANSCRIPTION START SITE) PREDICTION
# (DONE, 2007-03-08, hartera)
# The Eponine software is version 2 and has not changed in several years
# (contact: Thomas Down at Sanger, td2 at sanger.ac.uk). The version downloaded
# for hg16 should be the same as the current version but download again just
# to check. The application includes the TSS model file: eponine-tss2.xml
ssh kkstore02
# Eponine runs fine on 2.5Mb contig, but barfs on much larger contig;
# chop up sequence at gaps into ~2.5Mb chunks for cluster run.
mkdir /san/sanvol1/scratch/hg18/chunks
cd /cluster/data/hg18
foreach f (?{,?}/NT_*/NT_??????.fa)
set ctg = $f:t:r
/cluster/bin/x86_64/faSplit -minGapSize=10 \
-lift=/san/sanvol1/scratch/hg18/chunks/${ctg}.lft \
gap $f 2500000 /san/sanvol1/scratch/hg18/chunks/${ctg}.chunk
end
# seems to ignore the chunk part of the file name
mkdir /cluster/data/hg18/bed/eponine
cd /cluster/data/hg18/bed/eponine
wget --timestamping \
http://www.sanger.ac.uk/Software/analysis/eponine/eponine-scan.jar
# file has the same date and same size as the one downloaded for hg16
# the script requires all of the path settings found in my .tcshrc file.
# Using only set path = (/usr/java/jre1.5.0_06/bin $path)
# as in the doEpo file for hg16 does not work.
cat << '_EOF_' > doEpo
#!/bin/csh -ef
set path = (/usr/java/jre1.5.0_06/bin /bin /usr/bin /usr/X11R6/bin \
/usr/local/bin . /cluster/home/hartera/bin/x86_64 \
/cluster/bin/x86_64 /projects/compbio/bin/x86_64 \
/projects/compbio/bin /projects/compbio/bin/x86_64-linux \
/cluster/bin/scripts)
java -jar ./eponine-scan.jar -threshold 0.999 -seq $1 > $2
'_EOF_'
chmod a+x doEpo
cp /dev/null jobList
foreach f (/san/sanvol1/scratch/hg18/chunks/NT*.fa)
echo "./doEpo {check in line+ $f} {check out exists out/$f:t:r.gff}" \
>> jobList
end
mkdir out
ssh pk
cd /cluster/data/hg18/bed/eponine
/parasol/bin/para create jobList
/parasol/bin/para try, check, push, check etc.....
/parasol/bin/para time
# Completed: 1408 of 1408 jobs
# CPU time in finished jobs: 105248s 1754.13m 29.24h 1.22d 0.003 y
# IO & Wait Time: 4369s 72.82m 1.21h 0.05d 0.000 y
# Average job time: 78s 1.30m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 104s 1.73m 0.03h 0.00d
# Submission to last job: 1295s 21.58m 0.36h 0.01d
# lift chunks -> contigs
mkdir contigs/
foreach l (/san/sanvol1/scratch/hg18/chunks/*.lft)
set ctg = $l:t:r
liftUp contigs/$ctg.gff $l warn out/${ctg}*.gff
end
# lift contigs -> chrom
liftUp eponine.gff /cluster/data/hg18/jkStuff/liftAll.lft \
warn contigs/NT_*.gff
# Translate to bed 4 + float-score -- it would be a shame to lose
# those scores in genePred or bed 5 (int score)
awk 'BEGIN {i=0;} \
{printf "%s\t%d\t%d\t%s.%d\t%s\t%s\n", $1, $4-1, $5, $1, i, $6, $7; \
i = i + 1;}' \
eponine.gff > eponine.bed
# load up
ssh hgwdev
cd /cluster/data/hg18/bed/eponine
sed -e 's/bed6FloatScore/eponine/g' \
$HOME/kent/src/hg/lib/bed6FloatScore.sql > eponine.sql
hgLoadBed hg18 eponine eponine.bed -tab -sqlTable=eponine.sql
# Loaded 61359 elements of size 6
# trackDb.ra entry and eponine.html already exist in trackDb directory.
###########################################################################
# ACEScan Track (DONE 2007-03-15 Andy)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir acescan
cd acescan/
cp /cluster/data/hg17/bed/acescan/acescan.hg17.gp .
liftOver -genePred acescan.hg17.gp /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \
acescan.hg18.gp unmapped
ldHgGene -predTab hg18 acescan acescan.hg18.gp
##############################################################################
# Update central DB gdbPdb table in preparation for KG III (DONE 3/22/07, Fan)
mysql -u hgcat -p$HGPSWD -h genome-testdb -A hgcentraltest
update gdbPdb set proteomeDb = "proteins070202" where genomeDb = "hg18";
quit
##############################################################################
# UPDATE CGAP TABLES (DONE, 3/26/07, Fan)
cd /cluster/data/hg18/bed/ucsc.10
mkdir cgap
cd cgap
wget --timestamping -O Hs_GeneData.dat "ftp://ftp1.nci.nih.gov/pub/CGAP/Hs_GeneData.dat"
hgCGAP Hs_GeneData.dat
cat cgapSEQUENCE.tab cgapSYMBOL.tab cgapALIAS.tab|sort -u > cgapAlias.tab
hgLoadSqlTab hg18 cgapAlias ~/kent/src/hg/lib/cgapAlias.sql ./cgapAlias.tab
hgLoadSqlTab hg18 cgapBiocPathway ~/kent/src/hg/lib/cgapBiocPathway.sql ./cgapBIOCARTA.tab
cat cgapBIOCARTAdesc.tab|sort -u > cgapBIOCARTAdescSorted.tab
hgLoadSqlTab hg18 cgapBiocDesc ~/kent/src/hg/lib/cgapBiocDesc.sql cgapBIOCARTAdescSorted.tab
##############################################################################
# UPDATE CGAP TABLES (DONE, 8/05/08, JK)
cd /cluster/data/hg18/bed/ucsc.11
mkdir cgap
cd cgap
wget --timestamping -O Hs_GeneData.dat "ftp://ftp1.nci.nih.gov/pub/CGAP/Hs_GeneData.dat"
hgCGAP Hs_GeneData.dat
cat cgapSEQUENCE.tab cgapSYMBOL.tab cgapALIAS.tab|sort -u > cgapAlias.tab
hgLoadSqlTab hg18 cgapAlias ~/kent/src/hg/lib/cgapAlias.sql ./cgapAlias.tab
hgLoadSqlTab hg18 cgapBiocPathway ~/kent/src/hg/lib/cgapBiocPathway.sql ./cgapBIOCARTA.tab
cat cgapBIOCARTAdesc.tab|sort -u > cgapBIOCARTAdescSorted.tab
hgLoadSqlTab hg18 cgapBiocDesc ~/kent/src/hg/lib/cgapBiocDesc.sql cgapBIOCARTAdescSorted.tab
##############################################################################
## BLASTZ HUMAN HG18 (DONE - 2007-03-26 - Hiram)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26
cd /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26
cat << '_EOF_' > DEF
# human vs lancelet
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Lancelet braFlo1 - largest chunk big enough for largest scaffold
# Largest scaffold 7,200,735 - 3032 scaffolds + chrM
SEQ2_DIR=/san/sanvol1/scratch/braFlo1/braFlo1.2bit
SEQ2_LEN=/san/sanvol1/scratch/braFlo1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/braFlo1/braFlo1.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.braFlo1.2007-03-26
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 \
-blastzOutRoot /cluster/bluearc/hg18BraFlo1 > do.log 2>&1 &
# real 458m43.961s
cat fb.hg18.chainBraFlo1Link.txt
# 26455595 bases of 2881515245 (0.918%) in intersection
# test reciprocal best chains/nets for 5-way maf alignments
# (this did not work correctly when run on braFlo1 itself)
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 braFlo1 \
> rbest.log 2>&1 &
# real 105m14.176s
# and now the swap, also documented in braFlo1.txt
mkdir /cluster/data/braFlo1/bed/blastz.hg18.swap
cd /cluster/data/braFlo1/bed/blastz.hg18.swap
time doBlastzChainNet.pl -chainMinScore=2000 -chainLinearGap=loose \
/cluster/data/hg18/bed/blastz.braFlo1.2007-03-26/DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 \
-swap > swap.log 2>&1 &
# real 83m46.258s
cat fb.braFlo1.chainHg18Link.txt
# 30912893 bases of 923355587 (3.348%) in intersection
##############################################################################
# RE-BUILD knownGeneList, (DONE, 3/29/07, Fan)
cd /cluster/data/hg18/bed
rm -rf knownGeneList/hg18
# Run hgKnownGeneList to generate the tree of HTML pages
# under ./knownGeneList/hg18
hgKnownGeneList hg18
# copy over to /usr/local/apache/htdocs
rm -rf /usr/local/apache/htdocs/knownGeneList/hg18
mkdir -p /usr/local/apache/htdocs/knownGeneList/hg18
cp -Rfp knownGeneList/hg18/* /usr/local/apache/htdocs/knownGeneList/hg18
##############################################################################
# Update entrez DB tables.
cd /cluster/store10/entrez
mkdir 070329
ln -s /cluster/store10/entrez/070329 /cluster/data/entrez/070329
cd /cluster/data/entrez/070329
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
wget --timestamping ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
gzip -d *.gz
cut -f 2,4 gene2accession | sort -u | grep -v "-" | grep -v "NM_" | sed -e 's/\./\t/g'|cut -f 1-2 > entrezMrna.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g' > entrezRefseq.tab
cut -f 2,4 gene2refseq | grep "NM_"| sort -u | grep -v "-" | sed -e 's/\./\t/g'|cut -f 1-2 > entrezRefseq.tab
cut -f 2,4,6 gene2accession | grep "NM_"| grep "NP_"|sort -u | sed -e 's/\./\t/g'|cut -f 1,2,4 > entrezRefProt.tab
hgLoadSqlTab entrez entrezRefseq ~/src/hg/lib/entrezRefseq.sql ./entrezRefseq.tab
hgLoadSqlTab entrez entrezMrna ~/src/hg/lib/entrezMrna.sql ./entrezMrna.tab
hgLoadSqlTab entrez entrezRefProt ~/src/hg/lib/entrezRefProt.sql ./entrezRefProt.tab
cd /cluster/data/hg18/bed/ucsc.10
hgsql entrez -N -e \
'select mrna, refseq from entrezRefseq, entrezMrna, hg18.all_mrna where qName=mrna and entrezRefseq.geneID=entrezMrna.geneID' \
>mrnaRefseq1.tab
# Include RefSeq as valid mRNA too.
hgsql hg18 -N -e 'select name, name from refGene' >mrnaRefseq2.tab
cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab
hgLoadSqlTab hg18 mrnaRefseq ~/src/hg/lib/mrnaRefseq.sql ./mrnaRefseq.tab
##############################################################################
# RE-BUILD KEGG RELATED TABLES FOR KG III. (DONE, 3/29/07, Fan)
wget --timestamping -O hsa.html \
"http://www.genome.ad.jp/dbget-bin/www_bfind_sub?dbkey=pathway&keywords=hsa&mode=bfind&max_hit=1000&.cgifields=max_hit"
grep href hsa.html | perl -wpe "s/<[^>]+>//g" > hsa.lis
# edit hsa.lis to remove the first (blank) line and the last line, which is unrelated.
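# A non-interactive equivalent of that edit (sketch, assuming GNU sed; the
# manual edit above is what was actually done):
sed -i -e '1{/^$/d;}' -e '$d' hsa.lis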
~/kent/src/hg/protein/getKeggList2.pl hsa > keggList.tab
hgLoadSqlTab hg18 keggList ~/src/hg/lib/keggList.sql ./keggList.tab
# Before running hgKegg3, make sure entrez DB is updated.
hgKegg3 hg18 hg18
# Load resulting data
hgLoadSqlTab hg18 keggPathway ~/src/hg/lib/keggPathway.sql ./keggPathway.tab
hgLoadSqlTab hg18 keggMapDesc ~/src/hg/lib/keggMapDesc.sql ./keggMapDesc.tab
##############################################################################
# REATTACH KEGG TO KNOWN GENES. (DONE, 8/12/08, JK)
mkdir -p /cluster/data/hg18/bed/ucsc.11/kegg
cd /cluster/data/hg18/bed/ucsc.11/kegg
kgAttachKegg hg18 ../../ucsc.10/kegg/keggList.tab keggPathway.tab
hgLoadSqlTab hg18 keggPathway ~/src/hg/lib/keggPathway.sql ./keggPathway.tab
##############################################################################
# REATTACH SPMRNA TABLE TO KNOWN GENES. (DONE, 8/12/08, JK)
hgsql hg18 -N -e "select spDisplayID,kgID from kgXref where spDisplayID != ''" > spMrna.tab;
hgLoadSqlTab hg18 spMrna ~/kent/src/hg/lib/spMrna.sql spMrna.tab
##############################################################################
# UPDATE BIOCYCTABLES NEEDED BY hgGene (DONE 3/27/07 Fan)
# First register with BioCyc to download their HumanCyc database
# The site will email you the URL for download
wget --timestamping \
http://bioinformatics.ai.sri.com/ecocyc/dist/flatfiles-52983746/humancyc-flatfiles.zip
unzip humancyc-flatfiles.zip
cp genes.col genes.tab
cp pathways.col pathways.tab
# delete the first 20 or so header lines from these two files.
vi genes.tab
vi pathways.tab
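# A non-interactive sketch of the same cleanup, assuming the header lines are
# the '#'-prefixed lines at the top of the .col files (as in the 8/05/08
# update below):
grep -v '^#' genes.col > genes.tab
grep -v '^#' pathways.col > pathways.tab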
hgsql hg18 -e 'create database bioCyc070327'
hgLoadSqlTab bioCyc070327 genes ~/src/hg/lib/bioCycGenes.sql ./genes.tab
hgLoadSqlTab bioCyc070327 pathways ~/src/hg/lib/bioCycPathways.sql ./pathways.tab
# Create bioCycMapDesc.tab
hgsql bioCyc070327 -N -e 'select UNIQUE_ID, NAME from pathways' |sort -u > bioCycMapDesc.tab
# Create bioCycPathway.tab
kgBioCyc0 bioCyc070327 hg18 hg18
hgLoadSqlTab hg18 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
hgLoadSqlTab hg18 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
##########################################################################
# PARTIAL UPDATE OF BIOCYCTABLES NEEDED BY hgGene (DONE 8/05/08 JK)
# Note: ideally we would get new data from BioCyc, but they never sent the
# download URL even though I filled out their web form a week ago, so we are
# reusing the 3/27/07 pathways.col and genes.col files. I did write a new
# kgBioCyc1 to do the actual load, and it is used for the new UCSC Genes.
# It looks to be a slight improvement: about 10% more genes in pathways.
mkdir /cluster/data/hg18/bed/ucsc.11/bioCyc
cd /cluster/data/hg18/bed/ucsc.11/bioCyc
grep -v '^#' /cluster/data/hg18/bed/ucsc.10/bioCyc/pathways.col > pathways.tab
grep -v '^#' /cluster/data/hg18/bed/ucsc.10/bioCyc/genes.col > genes.tab
kgBioCyc1 genes.tab pathways.tab hg18 bioCycPathway.tab bioCycMapDesc.tab
hgLoadSqlTab hg18 bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab
hgLoadSqlTab hg18 bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab
###########################################################################
# SwitchDB TSS Track (DONE 2007-04-12 Andy)
ssh hgwdev
mkdir /cluster/data/hg18/bed/switchDbTss
cd /cluster/data/hg18/bed/switchDbTss
ln -s /cluster/data/hg17/bed/switchDbTss/switchDbTss.bed hg17.bed
liftOver -bedPlus=5 hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.bed unMapped
wc -l unMapped
#12 unMapped (12 are "deleted in new")
ln -s ~/kent/src/hg/lib/switchDbTss.sql
hgLoadBed -sqlTable=switchDbTss.sql hg18 switchDbTss hg18.bed
###########################################################################
# ADD KG TO TREEFAM LINKS (DONE, 2007-04-13 Fan)
# Generate ucscToEnsembl.txt and send it to TreeFam
# zhongzhongchen [chenzhzh at genomics.org.cn]
hgsql hg18 -N -e 'select * from knownToEnsembl' >ucscToEnsembl.txt
ssh hgwdev
cd /cluster/store12
mkdir treeFam070413
ln -s /cluster/store12/treeFam070413 /cluster/data/treeFam
cd /cluster/data/treeFam
# Receive the following files from TreeFam
ucscToEnsemblToTreefamToRefToUniprot.txt
ucscToEnsemblToTreefamToRef.txt
ucscToEnsemblTotreefam.txt
# Use ucscToEnsemblTotreefam.txt to construct knownToTreefam table.
cut -f 1,3 ucscToEnsemblTotreefam.txt >knownToTreefam.tab
hgLoadSqlTab hg18 knownToTreefam \
~/src/hg/lib/knownToTreefam.sql ./knownToTreefam.tab
# Add the following section into kent/src/hg/hgGene/hgGeneData/links.ra
name treeFam
shortLabel Treefam
tables knownToTreefam
idSql select value from knownToTreefam where name = '%s';
url http://www.treefam.org/cgi-bin/TFinfo.pl?ac=%s
priority 10
###########################################################################
# BLASTZ/CHAIN/NET HORSE (equCab1) (STARTED 2/16/07, DONE 2/21/07, Fan)
ssh kkstore05
mkdir /cluster/data/equCab1/bed/blastz.hg18.2007-02-15
cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-15
# NOTE: THE TARGET WAS ORIGINALLY INTENDED TO BE HORSE, BUT I DID NOT
# DISCOVER THIS UNTIL THE TASK WAS DONE.
cat << '_EOF_' > DEF
# Horse vs. Human
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse equCab1
SEQ2_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ2_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/equCab1/bed/blastz.hg18.2007-02-15
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab1/blastz.hg18 >& do.log &
tail -f do.log
ln -s /cluster/data/equCab1/bed/blastz.hg18.2007-02-15 /cluster/data/hg18/bed/blastz.equCab1
nice featureBits hg18 -chrom=chr1 chainEquCab1Link
# 132947074 bases of 224999719 (59.088%) in intersection
ssh hgwdev
cd /cluster/data/equCab1/bed/blastz.hg18.2007-02-15
bash
time nice -n 19 featureBits hg18 chainEquCab1Link \
> fb.hg18.chainEquCab1Link.txt 2>&1 &
# 1643928877 bases of 2881515245 (57.051%) in intersection
#########################################################################
# enable ORFeome track build. (markd 2007-05-02)
cd ~/kent/src/hg/makeDb/genbank
cvs update -d etc
# edit etc/genbank.conf to add
hg18.orfeomeTables.hgwdev = yes
hg18.orfeomeTables.hgwbeta = yes
# will need to enable for rr later. In the future, this can just be enabled
# as part of the normal genbank build. Change above to:
hg18.orfeomeTables.default = yes
#########################################################################
# exaptedRepeats track (4/30/07, Craig)
# for full methods and analysis see: Lowe, Bejerano, Haussler.
# Thousands of human mobile element fragments undergo
# strong purifying selection near developmental genes.
# PNAS. (in press). Epub 2007 Apr 26.
#
# Code to re-make this track is in:
# build36/bed/exapted/create.csh
#
# To re-make the track all you have to do is run that c-shell
# while you are in its directory.
# It is easiest if you are on hgwdev since it uses featureBits a few times
# and gets some info from the sql database. I would say it takes
# about two hours to run.
#
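# A minimal sketch of that re-run (the absolute path for build36/bed/exapted
# is assumed here):
ssh hgwdev
cd /cluster/data/hg18/bed/exapted
csh create.csh >& create.log &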
#######################################################################
# UCSC GENES (DONE 2007-03-xx kent)
see file: ucscGenes10.txt
#######################################################################
# CGAP SAGE (DONE 2007-04-17 Andy)
ssh hgwdev
bash
mkdir /san/sanVol1/scratch/andy/cgapSage
cd /san/sanVol1/scratch/andy/cgapSage
echo "select * from cgapSageLib" | hgsql hg18 | tail +2 > libs.txt
echo "select * from snp127 where class='single' and locType='exact'" | hgsql hg18 | tail +2 | cut -f2- > allSnpss.txt
echo "select name from snp127Exceptions where exception='ObservedWrongSize' or exception='SingleClassBetweenLocType' or exception='SingleClassRangeLocType' or exception='MultipleAlignment'" | hgsql hg18 | tail +2 > exceptions
tabGrep -v exceptions 4 allSnps.txt > snps.txt
rm allSnps.txt exceptions
echo select chrom,chromStart,chromEnd,name from simpleRepeat | hgsql hg18 | tail +2 > trf.bed
cut -f1-4 snps.txt > snps.bed
overlapSelect -selectFmt=bed -inFmt=bed -nonOverlapping trf.bed snps.bed snps.noTrf.bed
cut -f4 snps.noTrf.bed > snps.noTrf
tabGrep snps.noTrf 4 snps.txt > snps.noTrf.txt
mv snps.noTrf.txt snps.txt
grep -v random /cluster/data/hg18/chrom.sizes | grep -v hap > chrom.sizes
mkdir chromSnps
for c in `cat chrom.sizes | cut -f1`; do
awk "{if (\$1==\"$c\") print;}" snps.txt > chromSnps/$c.snps.txt;
echo $c;
done
rm snps.txt
wget ftp://ftp1.nci.nih.gov/pub/SAGE/HUMAN/Hs.libraries.gz
gunzip Hs.libraries.gz
cat << "EOF" > cleanLibs.awk
BEGIN{FS="\t"}
{
for (i = 1; i <= 12; i++)
{
printf("%s\t", $i);
}
sex = "";
if ($13=="male")
{
sex = "male,";
}
else if ($13=="female")
{
sex = "female,";
}
else if ($13=="male and female")
{
sex = "male,female,";
}
else if ($13=="unknown")
{
sex = "";
}
printf("%s\t", sex);
for (i = 14; i <= 20; i++)
{
printf("%s\t", $i);
}
printf("%s\n", $21);
}
EOF
tail +2 Hs.libraries | awk -f cleanLibs.awk > libs.txt
ln -s ~/kent/src/hg/lib/cgapSage/cgapSageLib.sql
hgLoadSqlTab hg18 cgapSageLib cgapSageLib.sql libs.txt
partitionSequence.pl -lstDir small 5000000 30 hg18.2bit chrom.sizes 0 > sequence.lst
grep -v small sequence.lst > seq.lst
cat small/* >> seq.lst
mv seq.lst sequence.lst
rm -rf small/
wget ftp://ftp1.nci.nih.gov/pub/SAGE/HUMAN/Hs_long.frequencies.gz
gunzip Hs_long.frequencies.gz
cat << "EOF" > doJobList.sh
#!/bin/bash
# basic vars
part=$1;
range=${part#*2bit:};
chrom=${range%:*};
nums=${range#*:}
firstnum=${nums:0:1}
outDir=output/${chrom}/${firstnum}
mkdir -p $outDir
echo ./doFind.sh $1 {check out exists `pwd`/${outDir}/${range}.bed}
EOF
chmod +x doJobList.sh
for part in `cat sequence.lst`; do ./doJobList.sh $part >> jobList; done
cat << "EOF" > doFind.sh
#!/bin/bash
# basic vars
part=$1;
range=${part#*2bit:};
chrom=${range%:*};
nums=${range#*:}
firstnum=${nums:0:1}
# dirs/files
startDir=`pwd`
scratch=/scratch/tmp/$part
output=$2
# begin
mkdir -p $scratch
pushd $scratch
twoBitToFa -noMask $startDir/"$part" part.fa
cgapSageFind part.fa $startDir/Hs_long.frequencies $startDir/libs.txt \
$startDir/chromSnps/${chrom}.snps.txt output.bed
cp output.bed $output
popd
rm -rf $scratch
EOF
chmod +x doFind.sh
ssh pk
cd /san/sanVol1/scratch/andy/cgapSage
para create jobList
para try
para push
# takes like 5-10 min
exit
# back to hgwdev
find output/ -name '*.bed' -exec cat '{}' >> output.bed \;
cgapSageDupeRemove output.bed tmp.bed
cgapSageDupeRemove -unique tmp.bed final.bed
ln -s ~/kent/src/hg/lib/cgapSage/cgapSage.sql
hgLoadBed -sqlTable=cgapSage.sql -tab hg18 cgapSage final.bed
#########################################################################
# HapMap SNPs (DONE 2007-05-23 Andy)
# rel22
# OBSOLETED by Phase II+III SNPs 3/09 angie (see HAPMAP REL27 GENOTYPES)
# Tables renamed to [originalName]PhaseII 3/9/09
ssh hgwdev
bash
cd /cluster/data/hg18/bed
mkdir -p hapmap/zips
cd hapmap/zips
# archived to http://www.hapmap.org/genotypes/2007-03
wget -nd -r -N -A html http://www.hapmap.org/genotypes/latest_ncbi_build36/rs_strand/non-redundant/
grep gz index.html | sed 's/^.*href=\"\(geno.*\.txt\.gz\)\".*$/\1/' > files.txt
wget -N -i files.txt --base=http://www.hapmap.org/genotypes/latest_ncbi_build36/rs_strand/non-redundant/
rm index.html robots.txt files.txt
cd ../
mkdir samples
cd samples/
wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_CEU.txt.gz
wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_CHB.txt.gz
wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_JPT.txt.gz
wget http://www.hapmap.org/downloads/samples_individuals/pedinfo2sample_YRI.txt.gz
cp /cluster/store12/snp/hapmap/rel21a/genotypes/2007-01/rs_strand/non-redundant/*.pl .
ln -s ../zips
./filterPedigree.pl < pedinfo2sample_CEU.txt > filtered.CEU
./filterPedigree.pl < pedinfo2sample_YRI.txt > filtered.YRI
zcat zips/*chr22_CEU* | head -1 | tr ' ' '\n' > header.CEU
zcat zips/*chr22_YRI* | head -1 | tr ' ' '\n' > header.YRI
grep -n -f filtered.CEU header.CEU | cut -f1 -d':' > offsets.CEU
grep -n -f filtered.YRI header.YRI | cut -f1 -d':' > offsets.YRI
for pop in CEU YRI CHB JPT; do
for f in zips/genotypes_chr*_${pop}_r22_nr.b36.txt.gz; do
zcat $f | ./filter${pop}.pl >> ../${pop}.merge
echo Done with $f
done
done
cd ../
for pop in CEU YRI CHB JPT; do
~/kent/src/hg/snp/snpLoad/hapmap1 ${pop}.merge ${pop}.condense
mv hapmap1.log ${pop}.hapmap1.log
done
wc -l *.log
#0 CEU.hapmap1.log
#0 CHB.hapmap1.log
#0 JPT.hapmap1.log
#0 YRI.hapmap1.log
#0 total
rm *.log
cp ~/kent/src/hg/lib/hapmapSnps.sql .
for pop in CEU CHB JPT YRI; do
sed "s/hapmapSnps/hapmapSnps$pop/" hapmapSnps.sql > hapmapSnps${pop}.sql
hgLoadBed -sqlTable=hapmapSnps${pop}.sql hg18 hapmapSnps$pop ${pop}.condense
done
# Don't worry if you see:
#load of hapmapSnpsCEU did not go as planned... etc.
# unless it says rows skipped.
~/kent/src/hg/snp/snpLoad/hapmap2 hg18
#building CEU hash...
#Can't start query:
#select * from hapmapAllelesCEU
#
#mySQL error 1146: Table 'hg18.hapmapAllelesCEU' doesn't exist
# But this works:
~heather/kent/src/hg/snp/snpLoad/hapmap2 hg18
# (gotta bug Heather about that one)
ln -s ~/kent/src/hg/lib/hapmapSnpsCombined.sql
hgLoadBed -sqlTable=hapmapSnpsCombined.sql hg18 hapmapSnpsCombined hapmapSnpsCombined.tab
# Checks:
~heather/kent/src/hg/snp/snpLoad/snpCheckCluster2 hg18 hapmapSnpsCombined
#match count = 0
### clean up
rm *.sql hapmapSnpsCombined.tab bed.tab
tar cfvz merge.tar.gz *.merge
tar cfvz condense.tar.gz *.condense
rm *.condense *.merge
mkdir logs
mv *.errors *.log *.out logs
mkdir orthos
cd orthos/
# hgWiggle output has the chromosome in a comment, followed by the values
# This script prints that chromosome on every line
cat << "EOF" > joinify.awk
{
if ($1 == "variableStep")
{
sub("chrom=", "", $2);
chrom = $2;
}
else if ($1 != "#")
{
printf("%s,%s\t%s\n", chrom, $1, $2);
}
}
EOF
cat << "EOF" > join.sh
#!/bin/bash
sed 's/\(^chr\w\+\)\t/\1,/' $1 > bed
sort -k1,1 bed > tmp; mv tmp bed
awk -f joinify.awk $2 > scores
sort -k1,1 scores > tmp; mv tmp scores
join -1 1 -2 1 bed scores | tr ',' ' ' |
awk '{printf("%s\t%s\t%s\t%s\t%d\t%s\t%s\n", $1, $2, $3, $4, $8, $6, $7);}' > qual.tab
rm scores bed
EOF
chmod +x join.sh
# chimp alleles
cd /cluster/data/dbSNP/ortho/hg18/panTro2Seq
awk '{printf("%s\t%s\t%s\t%s\t0\t%s\t%s\n", $2, $3, $4, $5, $7, $8);}' snp126orthoPrelim.tab > snp126orthoPrelim.bed
cp snp126orthoPrelim.bed /cluster/data/hg18/bed/hapmap/orthos/panTro2.bed.new
cd /cluster/data/hg18/bed/hapmap/orthos
hgWiggle -db=panTro2 -bedFile=panTro2.bed quality > panTro2.scores
# create qual.tab; combine panTro2 sequence with panTro2 quality score
./join.sh panTro2.bed.new panTro2.scores
grep chr21 panTro2.bed.new >> qual.tab
grep chrY panTro2.bed.new >> qual.tab
# create snpOrtho.tab; a table in human coords that has associated ortho alleles
~heather/kent/src/hg/snp/snpLoad/snpOrtho hg18 snp126 qual.tab
sed 's/snpOrtho/snp126OrthoPanTro2/' ~/kent/src/hg/lib/snpOrtho.sql > snpOrthoPanTro2.sql
hgLoadBed -tab -sqlTable=snpOrthoPanTro2.sql hg18 snp126OrthoPanTro2 snpOrtho.tab
mysql> update snp126OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chr21";
mysql> update snp126OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY";
mysql> update snp126OrthoPanTro2 set orthoScore = 98 where orthoChrom = "chrY_random";
# get the HapMap subset
sed 's/hapmapAllelesOrtho/hapmapAllelesChimp/' ~/kent/src/hg/lib/hapmapAllelesOrtho.sql > hapmapAllelesChimp.sql
~heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg18 hapmapSnpsCombined snp126OrthoPanTro2
hgLoadBed -tab -sqlTable=hapmapAllelesChimp.sql hg18 hapmapAllelesChimp hapmapOrtho.tab
# sanity check
mysql> select count(*) from hapmapAllelesChimp where chrom = orthoChrom;
# 3,492,708
mysql> select count(*) from hapmapAllelesChimp where chrom != orthoChrom;
# 374,010
# macaque alleles
cd /cluster/data/dbSNP/ortho/hg18/rheMac2Seq
awk '{printf("%s\t%s\t%s\t%s\t0\t%s\t%s\n", $2, $3, $4, $5, $7, $8);}' snp126orthoPrelim.tab > snp126orthoPrelim.bed
cp snp126orthoPrelim.bed /cluster/data/hg18/bed/hapmap/orthos/rheMac2.bed.new
cd /cluster/data/hg18/bed/hapmap/orthos
hgWiggle -db=rheMac2 -bedFile=rheMac2.bed quality > rheMac2.scores
# create qual.tab: combine rheMac2 sequence with rheMac2 quality score
./join.sh rheMac2.bed.new rheMac2.scores
# create snpOrtho.tab; a table in human coords that has associated ortho alleles
~heather/kent/src/hg/snp/snpLoad/snpOrtho hg18 snp126 qual.tab
sed 's/snpOrtho/snp126OrthoRheMac2/' ~/kent/src/hg/lib/snpOrtho.sql > snpOrthoRheMac2.sql
hgLoadBed -tab -sqlTable=snpOrthoRheMac2.sql hg18 snp126OrthoRheMac2 snpOrtho.tab
# get the HapMap subset
sed 's/hapmapAllelesOrtho/hapmapAllelesMacaque/' ~/kent/src/hg/lib/hapmapAllelesOrtho.sql > hapmapAllelesMacaque.sql
~heather/kent/src/hg/snp/snpLoad/hapmapOrtho hg18 hapmapSnpsCombined snp126OrthoRheMac2
hgLoadBed -tab -sqlTable=hapmapAllelesMacaque.sql hg18 hapmapAllelesMacaque hapmapOrtho.tab
# create summary table
~heather/kent/src/hg/snp/snpLoad/hapmapSummary hg18 hapmapSnpsCombined hapmapAllelesChimp hapmapAllelesMacaque
ln -s ~/kent/src/hg/lib/hapmapAllelesSummary.sql
hgLoadBed -tab -sqlTable=hapmapAllelesSummary.sql hg18 hapmapAllelesSummary hapmapSummary.tab
#############################################################################
# RE-BUILD WGRNA TRACK (DONE, 2007-05-31, Fan)
# rebuilt below: RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2007-05-31
cd wgRna-2007-05-31
# Received the data file, wg_may2007.txt (saved from wg_may2007.doc)
# from Michel Weber's email
# (Michel.Weber at ibcg.biotoul.fr)
# and place it under cd /cluster/data/hg18/bed/wgRna-2007-05-31.
cat wg_may2007.txt|sed -e 's/ /\t/g' > wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#############################################################################
# N-SCAN GENES track (2007-06-21 markd)
# create a composite track with existing ab-initio and new PASA N-SCAN predictions
# download pasa predictions
cd /cluster/data/hg18/bed/nscan/pasa
wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.pasa.gtf
wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.prot.fa
bzip2 hg18.*
chmod a-w hg18.*
ldHgGene -gtf -genePredExt hg18 nscanPasaGene hg18.pasa.gtf.bz2
hgPepPred hg18 generic nscanPasaPep hg18.prot.fa.bz2
rm *.tab
# update trackDb; need a hg18-specific page to describe informants and PASA
human/hg18/nscan.html
human/hg18/trackDb.ra
# remove old human/hg18/nscanGene.html
###########################################################################
# AUGUSTUS track (DONE 2007-7-3 Mario)
#
# augustusHints subtrack
mkdir -p /cluster/data/hg18/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.X.final
cd /cluster/data/hg18/bed/augustus/usingHints/predictions/Trefseq.hmRNA.hsEST.R.X.final
wget http://augustus.gobics.de/predictions/hg18/usingEvidence/augustus.hg18.Trefseq.hmRNA.hsEST.R.X.pep.gff
wget http://augustus.gobics.de/predictions/hg18/usingEvidence/augustus.hg18.Trefseq.hmRNA.hsEST.R.X.pep.aa
ldHgGene -bin hg18 augustusHints augustus.hg18.Trefseq.hmRNA.hsEST.R.X.gff
hgPepPred hg18 generic augustusHintsPep augustus.hg18.Trefseq.hmRNA.hsEST.R.X.pep.aa
# augustus de novo subtrack
mkdir -p /cluster/data/hg18/bed/augustus/usingHints/predictions/Xp.RA.it
cd /cluster/data/hg18/bed/augustus/usingHints/predictions/Xp.RA.it
wget http://augustus.gobics.de/predictions/hg18/deNovo/augustus.hg18.Xp.RA.it.pep.gff
wget http://augustus.gobics.de/predictions/hg18/deNovo/augustus.hg18.Xp.RA.it.pep.aa
ldHgGene -bin hg18 augustusXRA augustus.hg18.Xp.RA.it.gff
hgPepPred hg18 generic augustusXRAPep augustus.hg18.Xp.RA.it.pep.aa
# augustus ab initio subtrack
mkdir -p /cluster/data/hg18/bed/augustus/abinitio
cd /cluster/data/hg18/bed/augustus/abinitio
wget http://augustus.gobics.de/predictions/hg18/abinitio/augustus.pep.gff
wget http://augustus.gobics.de/predictions/hg18/abinitio/augustus.pep.aa
ldHgGene -bin hg18 augustusAbinitio augustus.gff
hgPepPred hg18 generic augustusAbinitioPep augustus.pep.aa
#############################################################################
# Stanford NRSF ChIP-seq (DONE, Heather, July 2007)
# Add color-by-strand and overlap table (2008-05-27 kate)
# BED file of sites provided May 2008 by Tim Reddy (treddy@gmail.com)
ssh kkstore03
cd /cluster/data/encode/stanford/2007-03-14
# lift to hg18
liftOver fix.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.bed core.unmapped
liftOver control_fix.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz hg18.control.bed control.unmapped
# add color by strand (red for +, blue for minus)
awk 'OFS="\t" {$6=="+" ? c="255,0,0" : c="0,0,255"; print $1, $2, $3, "", $5, $6, $2, $3, c}' hg18.bed > hg18.fixc.bed
awk 'OFS="\t" {$6=="+" ? c="255,0,0" : c="0,0,255"; print $1, $2, $3, "", $5, $6, $2, $3, c}' hg18.control.bed > hg18.control_fixc.bed
# load into database
ssh hgwdev
cd /cluster/data/encode/stanford/2007-03-14
hgLoadBed hg18 stanfordNRSFEnriched hg18.fixc.bed -tab
hgLoadBed hg18 stanfordNRSFControl hg18.control_fixc.bed -tab
# overlap tables
set prefix = /gbdb/hg18/wib
set table = stanfordNRSFEnrichedOverlaps
sort -k1,1 -k2,2n hg18.bed | bedItemOverlapCount hg18 stdin | \
wigEncode stdin ${table}.wig ${table}.wib
ln -s /cluster/data/encode/stanford/2007-03-14/${table}.wib $prefix
hgLoadWiggle -pathPrefix=$prefix hg18 $table ${table}.wig
set table = stanfordNRSFControlOverlaps
sort -k1,1 -k2,2n hg18.control.bed | bedItemOverlapCount hg18 stdin | \
wigEncode stdin ${table}.wig ${table}.wib
ln -s /cluster/data/encode/stanford/2007-03-14/${table}.wib $prefix
hgLoadWiggle -pathPrefix=$prefix hg18 $table ${table}.wig
# peaks (provided May 2008)
sort -k1,1 -k2,2n lab/NRSF_Peak_Calls.bed | \
awk '{print $1, $2, $3}' > peaks.bed
wc -l peaks.bed
# 2116
hgLoadBed -noBin hg18 stanfordNRSFSites peaks.bed
#########################################################################
# REGULATORY POTENTIAL UPDATE (DONE - 2007-08-01 - Hiram)
# download data from "James Taylor" <james at bx.psu.edu>
ssh kkstore02
mkdir /cluster/data/hg18/bed/regPotential7X.update
cd /cluster/data/hg18/bed/regPotential7X.update
# This is a lot of data
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
wget --timestamping \
"http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_hg18/chr${C}.scores.truncated.bz2"
echo "DONE - chr${C}.scores.truncated.bz2"
done
# create download gzip files from the bz2 files:
time for F in chr*.scores.truncated.bz2
do
C=`echo $F | awk -F'.' '{print $1}'`
echo -n "${C}.regPotential7X.hg18.gz working ... "
bzcat ${F} | gzip > ${C}.regPotential7X.hg18.gz
touch -r "${F}" "${C}.regPotential7X.hg18.gz"
echo "done"
done
time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
do
zcat chr${C}.regPotential7X.hg18.gz
done | wigEncode -noOverlap stdin regPotential7X.wig regPotential7X.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 16m40.347s
# Loading the table on hgwdev
ssh hgwdev
cd /cluster/data/hg18/bed/regPotential7X.update
mkdir /gbdb/hg18/wib/070118
ln -s /cluster/data/hg18/bed/regPotential7X.update/regPotential7X.wib \
/gbdb/hg18/wib/070118/regPotential7X.wib
# using the tmpDir is faster since it is on local disk and it will
# clean up any temporary .tab file it creates there
time nice -n +19 hgLoadWiggle -tmpDir=/scratch/tmp \
-pathPrefix=/gbdb/hg18/wib/070118 hg18 regPotential7X regPotential7X.wig
# real 0m38.247s
# How about a histogram of the data.
ssh kolossus
cd /cluster/data/hg18/bed/regPotential7X.update
time nice -n +19 hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 \
-hBinCount=100 -hMinVal=0.0 -db=hg18 regPotential7X > histogram.data 2>&1
# real 3m15.934s
# 73 % of the data values are zero
# create download gzip files from the bz2 files:
ssh kkstore02
cd /cluster/data/hg18/bed/regPotential7X
for F in chr*.scores.truncated.bz2
do
C=`echo $F | awk -F'.' '{print $1}'`
echo -n "${C}.regPotential7X.hg18.gz working ... "
bzcat ${F} | gzip > ${C}.regPotential7X.hg18.gz
echo
done
# renaming file directory -- kuhn 08-17-2007
cd /gbdb/hg18/wib
mv 070118 regPot070118
hgsql hg18 -e \
    'update regPotential7X set file="/gbdb/hg18/wib/regPot070118/regPotential7X.wib"'
# Query OK, 2341572 rows affected (31.59 sec)
# Rows matched: 2341572  Changed: 2341572  Warnings: 0
#############################################################################
# SIB Transcriptome (DONE Aug 29, 2007 - JK)
# Create working directory and download data from where Christian Iseli
# (Christian.Iseli at licr.org) put it, and unpack. The download takes about
# ten minutes (161M file).
cd /cluster/data/hg18/bed
mkdir sibTranscriptome
cd sibTranscriptome
wget ftp://ftp.licr.org/pub/databases/trome/human/txg.tar.gz
wget ftp://ftp.licr.org/pub/databases/trome/human/HTR.gtf.gz
tar -zxvf txg.tar.gz
# Load up sibGene table
zcat HTR.gtf.gz | ldHgGene hg18 sibGene stdin
# Do a little data cleanup and transformation and load splice graphs into database.
sed 's/altGraphX/sibTxGraph/' ~/src/hg/lib/altGraphX.sql > sibTxGraph.sql
sed 's/chrMt/chrM/' txg/chromMt.txg > txg/chromM.txg
rm txg/chromMt.txg
cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql hg18 sibTxGraph stdin
# Create sibAltEvents track for analysed alt-splices.
cat txg/*.txg | txgAnalyze stdin /cluster/data/hg18/hg18.2bit sibAltEvents.bed
awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed
hgLoadBed hg18 sibAltEvents foo.bed
#########################################################################
# BLASTZ MOUSE Mm9 (DONE - 2007-08-20 - Hiram)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastzMm9.2007-08-09
cd /cluster/data/hg18/bed/blastzMm9.2007-08-09
# Started this before the rsync to /scratch/data/mm9/ had completed,
# hence the /cluster/bluearc/scratch/data/mm9/ location is used
# here. (hg18 was also in transition to a new location)
cat << '_EOF_' > DEF
# human vs mouse
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human Hg18
SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib
SEQ1_SMSK=/cluster/bluearc/scratch/data/hg18/linSpecRep/notInMouseRat
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0
# QUERY: Mouse Mm9
SEQ2_DIR=/cluster/bluearc/scratch/data/mm9/nib
SEQ2_SMSK=/cluster/bluearc/scratch/data/mm9/notInOthers
SEQ2_LEN=/cluster/data/mm9/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/data/hg18/bed/blastzMm9.2007-08-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# establish a screen to control this job
screen
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
# real 1480m54.483s
# failed due to pk node difficulties, finish the run.blastz
# manually
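    # (A sketch of finishing the batch by hand; run.blastz is the standard
    #  doBlastzChainNet.pl batch directory, and para push/para time are the
    #  usual parasol commands -- the exact steps were not recorded here.)
    # ssh pk
    # cd /cluster/data/hg18/bed/blastzMm9.2007-08-09/run.blastz
    # para push
    # para time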
# Completed: 102120 of 102120 jobs
# CPU time in finished jobs: 6908585s 115143.08m 1919.05h 79.96d 0.219 y
# IO & Wait Time: 50958894s 849314.90m 14155.25h 589.80d 1.616 y
# Average job time: 567s 9.44m 0.16h 0.01d
# Longest finished job: 3000s 50.00m 0.83h 0.03d
# Submission to last job: 446177s 7436.28m 123.94h 5.16d
# continuing
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 -bigClusterHub=pk -chainMinScore=3000 \
-chainLinearGap=medium -continue=cat `pwd`/DEF > cat.out 2>&1 &
# real 111m59.041s
cat /cluster/data/hg18/bed/blastzMm9.2007-08-09/fb.hg18.chainMm9Link.txt
# 1014323175 bases of 2881515245 (35.201%) in intersection
cat /cluster/data/hg18/bed/blastz.mm8/fb.hg18.chainMm8Link.txt
# 994530182 bases of 2881515245 (34.514%) in intersection
cd /cluster/data/hg18/bed
ln -s blastzMm9.2007-08-09 blastz.mm9
# Then to swap over to Mm9 (also in mm9.txt)
mkdir /cluster/data/mm9/bed/blastz.hg18.swap
cd /cluster/data/mm9/bed/blastz.hg18.swap
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 -swap -bigClusterHub=pk -chainMinScore=3000 \
-chainLinearGap=medium \
/cluster/data/hg18/bed/blastz.mm9/DEF > swap.out 2>&1 &
# real 67m21.146s
cat /cluster/data/mm9/bed/blastz.hg18.swap/fb.mm9.chainHg18Link.txt
# 1008812599 bases of 2620346127 (38.499%) in intersection
cat /cluster/data/mm8/bed/blastz.hg18/fb.mm8.chainHg18Link
# 984380268 bases of 2567283971 (38.343%) in intersection
cd /cluster/data/mm9/bed
ln -s blastz.hg18.swap blastz.hg18
## make syntenic net (DONE - 2007-08-20 - Hiram)
cd /cluster/data/hg18/bed/blastzMm9.2007-08-09
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 -bigClusterHub=pk -chainMinScore=3000 \
-syntenicNet -chainLinearGap=medium -continue=syntenicNet \
`pwd`/DEF > syntenic.out 2>&1 &
## real 25m47.767s
#########################################################################
# LOAD ACEMBLY (DONE 8/28/07 angie)
ssh kkstore02
cd /cluster/data/hg18/bed/acembly
# Move aside liftOver run results
mkdir liftOver
mv a* g* h* j* u* liftOver
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_36.human.genes/AceView.ncbi_36.genes_gff.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_36.human.genes/AceView.ncbi_36.good_proteins_fasta.tar.gz
tar xvzf AceView.ncbi_36.genes_gff.tar.gz
tar xvzf AceView.ncbi_36.good_proteins_fasta.tar.gz
cd AceView.ncbi_36.genes_gff
# If the result of this command is > 0, then some lines have end < start
# and need to be fixed:
awk '$5 < $4 {print;}' *.gff | wc -l
#0
# Filter out empty lines, lines where the product_id has a stray
# newline before it, and $chr|Hs# IDs that don't appear liftable.
egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' *.gff \
| sed -e 's/^/chr/;' \
> acembly.gff
# Extract annotation classes from original gff:
egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' *.gff \
| perl -wpe 's/^.*Gene_type (\w+); transcript_id (\S+);.*/$2\t$1/; \
s/Main$/main/ || s/Putative$/putative/ || \
die "Unrecognized class/Gene_type:\n$_\n";' \
| sort -u \
> acemblyClass.tab
# Some gff transcript_id's end in -unspliced (no intron), but the
# corresponding protein fasta IDs do not have that suffix.  We need
# them to match, so add where necessary.
# Use perl to make a perl script to add -unspliced to protein IDs
# where necessary:
grep unspliced acemblyClass.tab | wc -l
#70156
egrep -h -v '^(| ?product_id.*|..?\|Hs.*)$' *.gff \
| perl -wpe 's@^.*transcript_id (\S+)-unspliced;.*$@\$unsp{"$1"} = 1;@ \
|| s/^.*\n$//;' \
| sort -u \
> ../addUnspliced.pl
wc -l ../addUnspliced.pl
#70156 ../addUnspliced.pl
cat >> ../addUnspliced.pl <<'_EOF_'
while (<>) {
if (/^>(\S+)$/) {
if ($unsp{$1}) {
s/^>(\S+)/>$1-unspliced/;
}
}
print;
}
'_EOF_'
# << emacs
# Add -unspliced suffix to protein IDs where necessary, and pare down
# proteins to just the ones that we have transcripts for:
cd /cluster/data/hg18/bed/acembly/AceView.ncbi_36.good_proteins_fasta
awk '{print $1;}' ../AceView.ncbi_36.genes_gff/acemblyClass.tab \
> transcriptNames.txt
perl ../addUnspliced.pl *.fasta \
| faSomeRecords stdin transcriptNames.txt acemblyPep.fa
grep unspliced acemblyPep.fa | wc -l
#55931
# Danielle Thierry-Mieg explained that noncoding genes are included so
# the number of proteins can be smaller than the number of transcripts.
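    # A quick way to see that difference is to compare record counts
    # (sketch only):
    # grep -c '^>' acemblyPep.fa
    # wc -l transcriptNames.txt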
# Load tables
ssh hgwdev
cd /cluster/data/hg18/bed/acembly/AceView.ncbi_36.genes_gff
ldHgGene -gtf hg18 acembly acembly.gff
#Read 258618 transcripts in 3451107 lines in 1 files
# 258618 groups 24 seqs 1 sources 5 feature types
#258618 gene predictions
hgLoadSqlTab hg18 acemblyClass ~/kent/src/hg/lib/acemblyClass.sql \
acemblyClass.tab
cd /cluster/data/hg18/bed/acembly/AceView.ncbi_36.good_proteins_fasta
hgPepPred hg18 generic acemblyPep acemblyPep.fa
rm acemblyPep.tab
runJoiner.csh hg18 acembly
# hg18.acemblyPep.name - hits 210003 of 210003 ok
# hg18.acemblyClass.name - hits 258618 of 258618 ok
###########################################################################
## Create gc5Base download raw data file (DONE - 2007-08-29 - Hiram)
ssh kkstore02
cd /cluster/data/hg18/bed/gc5Base
hgGcPercent -wigOut -doGaps -file=stdout -win=5 \
hg18 /cluster/data/hg18/hg18.2bit 2> /dev/null \
| gzip > hg18.gc5Base.txt.gz
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg18/gc5Base
cd /usr/local/apache/htdocs/goldenPath/hg18/gc5Base
ln -s /cluster/data/hg18/bed/gc5Base/hg18.gc5Base.txt.gz .
###########################################################################
# GENE BOUNDS (RNACLUSTER) (REBUILT 08-30-2007 Fan)
# Create rnaCluster table (depends on {est,mrna}OrientInfo)
cd /cluster/data/hg18/bed
mv rnaCluster rnaCluster.old
mkdir rnaCluster
cd rnaCluster/
mkdir chrom
# Create a list of accessions that come from RAGE libraries and need to be excluded.
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg18 rage.libs
foreach f (/cluster/data/hg18/nib/chr*.nib)
set c = $f:t:r
set out = chrom/$c.bed
	# Exclude accessions in the RAGE file
echo clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c
clusterRna -mrnaExclude=hg18.rage.libs hg18 /dev/null $out -chrom=$c
end
hgLoadBed hg18 rnaCluster chrom/*.bed
###########################################################################
# RE-LOAD FISH CLONES after bacEnds update (DONE - 2007-09-04 - Hiram)
# The bacEnds processing results are used here
ssh hgwdev
mkdir /cluster/data/hg18/bed/fishClones.2007-08-29
cd /cluster/data/hg18/bed/fishClones.2007-08-29
ln -s ../fishClones/cl_acc_gi_len .
ln -s ../fishClones/fhcrc.sts .
# have to be on hgwdev for this since it is going to read from the db
time nice -n +19 fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg18 \
/cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt \
/cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out \
./cl_acc_gi_len \
/cluster/data/hg18/bed/bacends/bacEnds.lifted.psl \
fishClones > fishClones.out 2>&1
# real 0m53.783s
# Reading Fish Clones file /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
# reading fishInfo file /cluster/data/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
# Reading Clone/Acc (clac.out) file /cluster/data/ncbi/fishClones/fishClones.2006-01/clac.out
# Reading BAC Ends file ./cl_acc_gi_len
# Reading BAC Ends psl file /cluster/data/hg18/bed/bacends/bacEnds.lifted.psl
# Reading additional STS Marker links fhcrc.sts
# Determining good positions
# findClonePos: determining positions of fish clones
# Writing output file
# ERROR: at line # 177, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# ERROR: at line # 178, no cytoband info for chrX:104048913-104206974
# RP11-79L11
# Load the track
hgLoadBed -notItemRgb -noBin -tab \
-sqlTable=$HOME/kent/src/hg/lib/fishClones.sql \
hg18 fishClones fishClones.bed
# Loaded 9788 elements of size 16
############################################################################
# INDEL-BASED CONSERVATION TRACK (DONE, 2007-09-03 - 2007-09-17, hartera)
# Data from the Gerton Lunter (gerton.lunter at anat.ox.ac.uk), MRC
# Functional Genetics Unit, University of Oxford, United Kingdom.
# Data is from the paper:
    # Lunter G, Ponting CP and Hein J. Genome-wide identification of human
# functional DNA using a neutral indel model. PLoS Comput Biol. 2006
# Jan;2(1):e5.
ssh kkstore02
mkdir -p /cluster/data/hg18/bed/consIndels/data
cd /cluster/data/hg18/bed/consIndels
# Add a README.indels with the e-mail from Gerton Lunter
# get the data
wget --timestamping \
http://wwwfgu.anat.ox.ac.uk/~gerton/igs-hg18mm8cf2.zip
# 38 Mb zip file in GFF format. This contains data for hg18
# comparing it to mm8 and cf2 (canFam2).
unzip igs-hg18mm8cf2.zip
mv *.gff ./data/
foreach f (./data/*.gff)
set r = $f:r
echo $r
grep -v "track" $f > ${r}NoHeader.gff
end
# strip off the end of the name e.g. IGS0001.1:p=.74; FDR 0.27
# so that the name displayed is short - IGS0001.1. The score field
# is used to determine colouring and this is calculated from FDR
ssh kkstore02
cd /cluster/data/hg18/bed/consIndels
perl -pi.bak -e \
's/(IGS[0-9a-z]+\.[0-9XY]+):p=?<?\.[0-9]+;\sFDR\s[0-9]+\.[0-9]+/$1/' \
./data/igs*NoHeader.gff
# check this looks ok then clean up
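    # e.g. spot-check one chromosome's edited names (sketch):
    # cut -f9 ./data/igs.chr21NoHeader.gff | head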
rm *.bak
# makes sense to store this as a BED5 table in order to use the score
# for display.
foreach f (./data/*NoHeader.gff)
awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1,$4,$5,$9,$6}' $f \
>> consIndelsHg18Mm8CanFam2.bed
end
# load data
ssh hgwdev
cd /cluster/data/hg18/bed/consIndels
hgsql -e 'drop table consIndelsHg18Mm8CanFam2;' hg18
hgLoadBed hg18 consIndelsHg18Mm8CanFam2 consIndelsHg18Mm8CanFam2.bed
# Loaded 2603017 elements of size 5
# Get the IDs, posterior probabilities (p) for the segment being neutral,
# and the FDR from the original GFFs for a separate table. Some items
    # have p<.001. We cannot do Table Browser queries restricting
# p to <, =, or > a specified value unless all values are floats.
# Contacted the data contributor, Gerton Lunter, and he said it would be
# ok to change all p<.001 to p=0.0005
ssh kkstore02
cd /cluster/data/hg18/bed/consIndels/
foreach c (`cat /cluster/data/hg18/chrom.lst`)
echo $c
foreach f (./data/igs.chr${c}.gff)
echo $f
awk 'BEGIN {FS="\t"} {if ($9 ~ /IGS/) print $9;}' $f \
| sed -e 's/:/\t/' \
| sed -e 's/p=\./0\./' | sed -e 's/p<\.001/0\.0005/' \
| sed -e 's/;\sFDR/\t/' >> consIndelsHg18Mm8CanFam2Conf.txt
end
end
# there are no GFF files for the haplotype chroms
# Create a table definition for the table of identifier, posterior
# probability and false discovery rate (FDR).
cat << 'EOF' > $HOME/kent/src/hg/lib/itemConf.as
table itemConf
"Probability and false discovery rate (FDR) for an element in a track."
(
string id; "Identifier of element"
float probability; "Probability associated with element"
float fdr; "False Discovery Rate (FDR) associated with element"
)
'EOF'
# << emacs
cd $HOME/kent/src/hg/lib
autoSql itemConf.as itemConf
mv itemConf.h ../inc/
# commit ../inc/itemConf.h, itemConf.c, itemConf.as and
# itemConf.sql to CVS. Add itemConf.o to src/hg/lib/makefile
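    # (Sketch of the CVS steps; the commit message is illustrative.)
    # cd $HOME/kent/src/hg/lib
    # cvs add itemConf.c itemConf.as itemConf.sql ../inc/itemConf.h
    # cvs commit -m "Add itemConf type for consIndels confidence table" \
    #     itemConf.c itemConf.as itemConf.sql ../inc/itemConf.h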
ssh hgwdev
cd /cluster/data/hg18/bed/consIndels
hgLoadSqlTab hg18 consIndelsHg18Mm8CanFam2Conf \
$HOME/kent/src/hg/lib/itemConf.sql \
consIndelsHg18Mm8CanFam2Conf.txt
    # check that all items are in this table.
hgsql -N -e 'select distinct(name) from consIndelsHg18Mm8CanFam2;' hg18 \
| sort > consIndels.names.sort
hgsql -N -e 'select distinct(id) from consIndelsHg18Mm8CanFam2Conf;' hg18 \
| sort > consIndels.idsfromConf.sort
wc -l *.sort
# 2603017 consIndels.idsfromConf.sort
# 2603017 consIndels.names.sort
comm -12 consIndels.names.sort consIndels.idsfromConf.sort | wc -l
# 2603017
# so all element IDs are in both tables.
# cleanup
rm ./data/*.bak *.sort
# add trackDb/human/hg18/trackDb.ra entry and add description that
# was written by the data contributor. Add code to hgc.c to display
# the posterior probability and the FDR on the details page for
# track elements. Gerton Lunter provided a description for the data
# on 2007-09-12.
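    # A sketch of the sort of trackDb.ra stanza added (labels are
    # illustrative; the real entry is in
    # kent/src/hg/makeDb/trackDb/human/hg18/trackDb.ra):
    #   track consIndelsHg18Mm8CanFam2
    #   shortLabel Indel Cons
    #   longLabel Indel-based Conservation (human/mouse/dog)
    #   group compGeno
    #   type bed 5 .
    #   useScore 1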
############################################################################
# Promote UCSD genome-wide ENCODE Chip tracks:
# UCSD TAF1 IMR90 Chip/chip to Regulation group
# (2007-09-14 kate)
hgsql hg18 -e "alter table encodeUcsdNgChipSignal rename to wgEncodeUcsdNgTaf1Signal"
hgsql hg18 -e "update wgEncodeUcsdNgTaf1Signal set file='/gbdb/hg18/encode/wib/wgEncodeUcsdNgTaf1Signal.wib'"
hgsql hg18 -e "alter table encodeUcsdNgChipKnownSites rename to wgEncodeUcsdNgTaf1KnownSites"
hgsql hg18 -e "alter table encodeUcsdNgChipNovelSites rename to wgEncodeUcsdNgTaf1NovelSites"
hgsql hg18 -e "alter table encodeUcsdNgValChipH3K4me rename to wgEncodeUcsdNgTaf1ValidH3K4me"
hgsql hg18 -e "alter table encodeUcsdNgValChipH3ac rename to wgEncodeUcsdNgTaf1ValidH3ac"
hgsql hg18 -e "alter table encodeUcsdNgValChipRnap rename to wgEncodeUcsdNgTaf1ValidRnap"
hgsql hg18 -e "alter table encodeUcsdNgValChipTaf rename to wgEncodeUcsdNgTaf1ValidTaf"
############################################################################
# NESTED REPEATS (DONE 9/20/07 angie)
# This track is now generated by doRepeatMasker.pl; added to this older
# assembly for interest.
ssh kkstore02
# First, re-liftUp the .out -- liftUp has been enhanced to uniquify the
# RepeatMasker IDs.
cd /cluster/data/hg18
foreach c ( `cat chrom.lst` )
echo lifting chr$c chunks to contigs
foreach d ( ${c}/N{C,G,T}_* )
cd $d
set contig = $d:t
liftUp $contig.IDs.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out \
> /dev/null
cd ../..
end
echo lifting contigs to chr$c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.IDs.fa.out lift/ordered.lft warn \
`sed -e 's/.fa.out$/.IDs.fa.out/' lift/oOut.lst` \
> /dev/null
endif
if (-e lift/random.lft && ! -z lift/random.lft) then
liftUp chr${c}_random.IDs.fa.out lift/random.lft warn \
`sed -e 's/.fa.out$/.IDs.fa.out/' lift/rOut.lst` \
> /dev/null
endif
cd ..
end
# Now join fragments using shared IDs:
ssh kolossus
mkdir /cluster/data/hg18/bed/nestedRepeats
cd /cluster/data/hg18/bed/nestedRepeats
extractNestedRepeats.pl ../../?{,?}/chr*.IDs.fa.out \
> hg18.nestedRepeats.bed
# Load table:
ssh hgwdev
cd /cluster/data/hg18/bed/nestedRepeats
hgLoadBed hg18 nestedRepeats hg18.nestedRepeats.bed \
-sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql
############################################################################
# Promote GIS genome-wide ENCODE tracks:
# GIS PET RNA and GIS ChIP-PET to Regulation group
# (2007-09-20 kate)
hgsql hg18 -e "alter table encodeGisChipPet rename to wgEncodeGisChipPet"
hgsql hg18 -e "alter table encodeGisChipPetHes3H3K27me3 rename to wgEncodeGisChipPetHes3H3K27me3"
hgsql hg18 -e "alter table encodeGisChipPetHes3H3K4me3 rename to wgEncodeGisChipPetHes3H3K4me3"
hgsql hg18 -e "alter table encodeGisChipPetMycP493 rename to wgEncodeGisChipPetMycP493"
hgsql hg18 -e "alter table encodeGisChipPetStat1Gif rename to wgEncodeGisChipPetStat1Gif"
hgsql hg18 -e "alter table encodeGisChipPetStat1NoGif rename to wgEncodeGisChipPetStat1NoGif"
hgsql hg18 -e "alter table encodeGisRnaPetHCT116 rename to wgEncodeGisRnaPetHCT116"
hgsql hg18 -e "alter table encodeGisRnaPetHes3 rename to wgEncodeGisRnaPetHes3"
hgsql hg18 -e "alter table encodeGisRnaPetMCF7 rename to wgEncodeGisRnaPetMCF7"
hgsql hg18 -e "alter table encodeGisRnaPetMCF7Estr rename to wgEncodeGisRnaPetMCF7Estr"
##########################################################
# Case Control Consortium (DONE 2007-09-20, Andy)
ssh hgwdev
bash
mkdir /cluster/data/hg17/bed/caseControl
cd /cluster/data/hg17/bed/caseControl
wget ftp://ftp.sanger.ac.uk/pub/WTCCC/summary_stats/summary_stats_auto_all.zip
unzip summary_stats_auto_all.zip
mkdir chromGraphs
cd basic/
for disease in BD CAD CD HT RA T1D T2D; do
echo $disease
jkDisease=${disease:0:1}`echo ${disease:1} | tr [[:upper:]] [[:lower:]]`
for f in *${disease}*.txt; do
tail +2 $f | awk '{if ($21 == "1") print;}' | \
cut -f1,15 >> ../chromGraphs/cccTrendPval${jkDisease}.cg
done
done
cd ../chromGraphs/
mkdir hg17 hg18
for f in *.cg; do
table=${f%.cg};
echo $table
hgLoadChromGraph -idTable=affy500k -minusLog10 -pathPrefix=/gbdb/hg17/chromGraph hg17 $table $f 2> ${table}.hg17.errors
mv ${table}.cgb hg17/
hgLoadChromGraph -idTable=affy500k -minusLog10 -pathPrefix=/gbdb/hg18/chromGraph hg18 $table $f 2> ${table}.hg18.errors
mv ${table}.cgb hg18/
done
pushd /gbdb/hg18/chromGraph
ln -s /cluster/data/hg17/bed/caseControl/chromGraphs/hg18/*.cgb .
popd
pushd /gbdb/hg17/chromGraph
ln -s /cluster/data/hg17/bed/caseControl/chromGraphs/hg17/*.cgb .
popd
# Add the hack row into metaChromGraph for the composite tracks.
hgsql hg17 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile) values ("caseControl", 0, 0, "composite")'
hgsql hg18 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile) values ("caseControl", 0, 0, "composite")'
#############################################################################
# RGD HUMAN QTL (DONE 9/24/07 angie)
ssh hgwdev
mkdir /cluster/data/hg18/bed/rgdQtl
cd /cluster/data/hg18/bed/rgdQtl
wget ftp://rgd.mcw.edu/pub/data_release/QTLS
# Pick out the human QTLs and liftOver hg17 --> hg18.
# Make bed4 and rgdQtlLink:
perl -we 'open(BED, ">rgdQtl.bed") || die; \
open(LINK, ">rgdQtlLink.txt") || die; \
while (<>) { \
chomp; my @w = split("\t"); \
next unless ($w[1] eq "human" && $w[15]); \
$w[5] =~ s/^/chr/; \
$w[15] =~ s/^([-\d]+).*$/$1/ || die "parse start pos"; \
$w[16] =~ s/^(\d+).*$/$1/ || die "parse end pos"; \
if ($w[15] > $w[16]) { \
$tmp = $w[15]; $w[15] = $w[16]; $w[16] = $tmp; \
} \
$w[15]--; \
$w[15] = 0 if ($w[15] < 0); \
print BED "$w[5]\t$w[15]\t$w[16]\t$w[2]\n"; \
print LINK "$w[0]\t$w[2]\t$w[3]\n"; \
} \
close(BED); close(LINK);' \
QTLS
mv rgdQtl.bed hg17.rgdQtl.bed
# Using a fairly loose minMatch -- the regions covered are huge.
liftOver -minMatch=0.5 hg17.rgdQtl.bed \
/cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz \
hg18.rgdQtl.{bed,unmapped}
wc -l hg18*
# 254 hg18.rgdQtl.bed
# 2 hg18.rgdQtl.unmapped
ssh hgwdev
cd /cluster/data/hg18/bed/rgdQtl
hgLoadBed hg18 rgdQtl hg18.rgdQtl.bed
hgLoadSqlTab hg18 rgdQtlLink ~/kent/src/hg/lib/rgdQtlLink.sql rgdQtlLink.txt
# Make sure there aren't any illegal coords:
checkTableCoords -verbose=2 hg18 rgdQtl
#############################################################################
# RGD RAT QTL MAPPED TO HUMAN (DONE 9/26/07 angie)
#====== Begin work that was discarded because its output was too voluminous
# to be very useful IMHO. Keeping it in the doc as a lesson learned.
# See below for what I ended up loading.
ssh hgwdev
cd /cluster/data/hg18/bed/rgdQtl
genePredToPsl -bedFormat rn4 /cluster/data/rn4/bed/rgdQtl/rgdQtl.bed \
rn4.rgdQtl.psl
time ssh -x kolossus pslMap `pwd`/rn4.rgdQtl.psl \
-chainMapFile /cluster/data/hg18/bed/liftOver/hg18ToRn4.over.chain.gz \
`pwd`/hg18.rgdRatQtl.psl
#0.011u 0.006s 10:58.56 0.0% 0+0k 0+0io 0pf+0w
# That created an 11G monstrosity of a file that dwarfs the original
# input. Linecount increased 3 orders of magnitude, filesize increased
# 5 orders of magnitude.
wc -l rn4.rgdQtl.psl
#1067 rn4.rgdQtl.psl
ssh -x kkstore02 wc -l `pwd`/hg18.rgdRatQtl.psl
#1228306 /cluster/store11/gs.19/build36/bed/rgdQtl/hg18.rgdRatQtl.psl
# Let's see what liftOver does...
time ssh -x kolossus \
liftOver -minMatch=0.5 -multiple \
/cluster/data/rn4/bed/rgdQtl/rgdQtl.bed \
/cluster/data/rn4/bed/liftOver/rn4ToHg18.over.chain.gz \
`pwd`/hg18.rgdRatQtl.lo.{bed,unmapped}
#0.014u 0.004s 0:59.27 0.0% 0+0k 0+0io 0pf+0w
wc -l hg18.rgdRatQtl.lo.{bed,unmapped}
# 1214366 hg18.rgdRatQtl.lo.bed
# 14 hg18.rgdRatQtl.lo.unmapped
# Still got 1M lines... ugh. Mapped all over the place, of course.
#====== end discarded work.
# Use a stringently filtered version of over.chain to do the mapping,
# so we only pick up large chunks (targeting >10,000bases) of these
# enormous regions (up to 235M in rn4).
ssh kolossus
cd /cluster/data/hg18/bed/rgdQtl
# rn4ToHg18 was built before doBlastz included chainStitchId in the
# pipe to create over.chain. Run it here, to repair any chain breaks:
chainStitchId /cluster/data/rn4/bed/liftOver/rn4ToHg18.over.chain.gz \
rn4ToHg18Stitch.over.chain
# I looked at the summed scores from chainStitchId vs. the length
# spanned by the stitched chains, and arbitrarily picked what I
# think is a sweet spot for mapping very large ranges: at scores
# near 500000, chains seem to span 40-60k bases. Pretty much all
# of the rat and human chromosomes (except human randoms) have at
# least some chains with scores >= 500000. So I'll filter the
# stitched chains to keep those with score >= 500000.
# NOTE FOR NEXT TIME: consider filtering by length (see jaxQtl below).
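    # e.g. something along these lines (sketch; the output name here is
    # hypothetical -- see the jaxQtl chainFilter call below for the size
    # flags actually used there):
    # chainFilter rn4ToHg18Stitch.over.chain \
    #     -tMinSize=20000 -qMinSize=20000 -minScore=10000 \
    #     > rn4ToHg18CoarseBySize.over.chain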
chainFilter rn4ToHg18Stitch.over.chain -minScore=500000 \
> rn4ToHg18Coarse.over.chain
# I tried liftOver with -minMatch=0.5, 0.33, 0.25 and 0.2. These are the
# wc -l stats for each run -- not surprisingly, many more matches with
# lower minMatch:
#0.5:
# 1256 hg18.rgdRatQtl.coarse.lo.bed
# 998 hg18.rgdRatQtl.coarse.lo.unmapped
#0.33:
# 6748 hg18.rgdRatQtl.coarse.lo.bed
# 92 hg18.rgdRatQtl.coarse.lo.unmapped
#0.25:
# 9609 hg18.rgdRatQtl.coarse.lo.bed
# 36 hg18.rgdRatQtl.coarse.lo.unmapped
#0.2:
# 10529 hg18.rgdRatQtl.coarse.lo.bed
# 30 hg18.rgdRatQtl.coarse.lo.unmapped
# I spot-checked by viewing a rat QTL and hg18 chains in rn4, and
# eyeballing whether the net track looked like there were solid
# matches for large regions. With minMatch=0.25, most mappings
# and unmapped looked pretty reasonable, but I still saw a few
# (like Alc4) where a nice long chain was not being used, so I
# kicked it down to 0.2 and checked again -- looks good.
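    # For the record, the sweep itself was just the command below repeated
    # for each -minMatch value (sketch; each run overwrites the previous
    # .bed/.unmapped before the wc -l):
    # foreach minMatch (0.5 0.33 0.25 0.2)
    #   liftOver -minMatch=$minMatch -multiple \
    #     /cluster/data/rn4/bed/rgdQtl/rgdQtl.bed rn4ToHg18Coarse.over.chain \
    #     hg18.rgdRatQtl.coarse.lo.{bed,unmapped}
    #   wc -l hg18.rgdRatQtl.coarse.lo.{bed,unmapped}
    # end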
time liftOver -minMatch=0.2 -multiple \
/cluster/data/rn4/bed/rgdQtl/rgdQtl.bed rn4ToHg18Coarse.over.chain \
hg18.rgdRatQtl.coarse.lo.{bed,unmapped}
#100.476u 10.925s 1:52.31 99.1% 0+0k 0+0io 0pf+0w
wc -l hg18.rgdRatQtl.coarse.lo.{bed,unmapped}
# see above.
# Many of the records are completely contained within other records
# for the same QTL (inversions I suppose) -- they don't really tell
# us anything new about the murky QTL region, so merge them in.
# NOTE FOR NEXT TIME: instead of the perl+sort, use something like this:
# liftOverMerge -mergeGap=10000 hg18.rgdRatQtl.coarse.lo.bed stdout \
# | mergeOverlapBed4.pl - > hg18.rgdRatQtl.coarse.lo.pruned.bed
# liftOverMerge joins items separated by small (a relative term) gaps.
perl -we \
'while (<>) { \
chomp; ($chrom, $start, $end, $name) = split; \
push @{$item2coords{"$chrom.$name"}}, [$start, $end]; \
} \
foreach $item (keys %item2coords) { \
@sortedCoords = sort { $a->[0] <=> $b->[0] } @{$item2coords{$item}}; \
($chrom, $name) = split(/\./, $item); \
($mergeStart, $mergeEnd) = @{shift @sortedCoords}; \
foreach $rangeRef (@sortedCoords) { \
($rangeStart, $rangeEnd) = @{$rangeRef}; \
next if ($rangeEnd <= $mergeEnd); \
if ($rangeStart > $mergeEnd) { \
print "$chrom\t$mergeStart\t$mergeEnd\t$name\n"; \
($mergeStart, $mergeEnd) = ($rangeStart, $rangeEnd); \
} else { \
$mergeEnd = $rangeEnd; \
} \
} \
print "$chrom\t$mergeStart\t$mergeEnd\t$name\n" if ($mergeEnd); \
} \
' hg18.rgdRatQtl.coarse.lo.bed \
| sort -k1,1 -k2n,2n -k4,4r \
> hg18.rgdRatQtl.coarse.lo.pruned.bed
ssh hgwdev
cd /cluster/data/hg18/bed/rgdQtl
hgLoadBed hg18 rgdRatQtl hg18.rgdRatQtl.coarse.lo.pruned.bed
# Just use rn4's non-positional associated info:
sed -e 's/rgdQtlLink/rgdRatQtlLink/' ~/kent/src/hg/lib/rgdQtlLink.sql \
> rgdRatQtlLink.sql
hgLoadSqlTab hg18 rgdRatQtlLink rgdRatQtlLink.sql \
/cluster/data/rn4/bed/rgdQtl/rgdQtlLink.txt
# Make sure there aren't any illegal coords:
checkTableCoords -verbose=2 hg18 rgdRatQtl
runJoiner.csh hg18 rgdRatQtl
#====== more discarded work 10/2/07:
ssh kolossus
cd /cluster/data/hg18/bed/rgdQtl
# Try pslMap with the same filtered chains:
time pslMap -swapMap rn4.rgdQtl.psl \
-chainMapFile rn4ToHg18Coarse.over.chain \
hg18.rgdRatQtl.coarse.pm.psl
#444.915u 29.914s 11:20.08 69.8% 0+0k 0+0io 0pf+0w
wc -l hg18.rgdRatQtl.coarse.pm.psl
#10755 hg18.rgdRatQtl.coarse.pm.psl
# Again, linecount is comparable to liftOver, but the block-by-block
# detail from pslMap creates an enormous file (10GB) even with the
# filtered chains.
# Recover 21G of disk space:
rm hg18.rgdRatQtl.psl hg18.rgdRatQtl.coarse.pm.psl
#====== end discarded work.
#############################################################################
# N-SCAN GENES partial reload (2007-09-26 markd)
# reload nscanPasaGene to get fixed names and to fix search criteria
# download pasa predictions
cd /cluster/data/hg18/bed/nscan/pasa2
wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.pasa.gtf
wget http://mblab.wustl.edu/predictions/human/hg18_PASA/hg18.prot.fa
bzip2 hg18.*
chmod a-w hg18.*
ldHgGene -gtf -genePredExt hg18 nscanPasaGene hg18.pasa.gtf.bz2
hgPepPred hg18 generic nscanPasaPep hg18.prot.fa.bz2
rm *.tab
    # update trackDb to add correct termRegex entries in human/hg18/trackDb.ra
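    # (Sketch of the searchSpec settings involved; the regex shown is
    #  illustrative only -- the real one matches the fixed gene names.)
    #   searchTable nscanPasaGene
    #   searchType genePred
    #   termRegex chr[0-9a-zA-Z_]+\.[0-9]+\.[0-9a-z]+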
# push nscanPasaGene nscanPasaPep and trackDb
#############################################################################
# Blastz hg18 to J. Craig Venter chrom attempt (DONE - 2007-09-27 - Hiram)
ssh kkstore06
screen # use a screen to control this job
mkdir /cluster/data/hg18/bed/blastzVenter1.2007-09-27
cd /cluster/data/hg18/bed/blastzVenter1.2007-09-27
cat << '_EOF_' > DEF
# human reference vs J. Craig Venter
# using -chainMinScore=10000 and -chainLinearGap=medium
# during doBlastzChainNet.pl run
# parameters on advice from Webb for K and Q
# M as in hg18 self, O and E from Q
# Y and T as in hg18-panTro2 and mm9-rn4
BLASTZ_K=10000
BLASTZ_M=400
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_Y=15000
BLASTZ_T=2
BLASTZ_Q=/cluster/data/blastz/human_chimp.v2.q
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/selfNib
SEQ1_LEN=/san/sanvol1/scratch/hg18/self.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Venter1
SEQ2_DIR=/iscratch/i/venter1/venter1.unmasked.2bit
SEQ2_LEN=/cluster/data/venter1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzVenter1.2007-09-27
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 -chainMinScore=10000 -chainLinearGap=medium \
-bigClusterHub=kk -noDbNameCheck DEF > do.log 2>&1 &
# real 163m10.634s
# this doesn't work, it failed due to mistakenly thinking it was a self
# alignment. Plus, we need to do the raw scaffolds, not these fake
# chroms.
#############################################################################
# CONTRAST GENES (2007-10-02 markd)
# received predictions from Sam Gross <ssgross at stanford.edu>
cd /cluster/data/hg18/bed/contrastGene/
wget http://www.stanford.edu/~ssgross/contrast.hg18.bed
# this is a custom track, not a pure BED
tail +2 contrast.hg18.bed | hgLoadBed -tab hg18 contrastGene stdin
# verify
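    # e.g. (sketch of a quick check):
    # featureBits hg18 contrastGene
    # hgsql hg18 -N -e 'select count(*) from contrastGene'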
    # load track db (ra and contrastGene.html are global)
# request push of contrastGene
###########################################################################
# SGP GENES Update (DONE - 2007-10-02 - Hiram)
ssh kkstore02
mkdir /cluster/data/hg18/bed/sgp.2007-10-02
cd /cluster/data/hg18/bed/sgp.2007-10-02
SITE="genome.imim.es/genepredictions/H.sapiens/golden_path_200603_x_mm9"
for C in `cut -f1 ../../chrom.sizes`
do
wget --timestamping "http://${SITE}/SGP/${C}.gtf" -O ${C}.gtf
wget --timestamping "http://${SITE}/SGP/${C}.prot" -O ${C}.prot
done
# before reloading the table, measure the previous set:
nice -n +19 featureBits -enrichment hg18 refGene:CDS sgpGene
# refGene:CDS 1.123%, sgpGene 1.272%, both 0.964%, cover 85.83%, enrich 67.47x
nice -n +19 featureBits -enrichment hg18 knownGene:CDS sgpGene
# knownGene:CDS 1.185%, sgpGene 1.272%, both 0.989%, cover 83.43%, enrich 65.58x
# now reload the table
ldHgGene -gtf -genePredExt hg18 sgpGene chr*.gtf
# Read 34023 transcripts in 288520 lines in 49 files
# 34023 groups 46 seqs 1 sources 3 feature types
# 34023 gene predictions
# and now measure this new set
nice -n +19 featureBits -enrichment hg18 refGene:CDS sgpGene
# refGene:CDS 1.123%, sgpGene 1.270%, both 0.964%, cover 85.84%, enrich 67.59x
nice -n +19 featureBits -enrichment hg18 knownGene:CDS sgpGene
# knownGene:CDS 1.185%, sgpGene 1.270%, both 0.988%, cover 83.41%, enrich 65.68x
###########################################################################
# Blastz Orangutan ponAbe2 (DONE - 2007-10-02 - 2007-10-05 - Hiram)
ssh kkstore02
screen # use screen to control this job
mkdir /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
cd /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
cat << '_EOF_' > DEF
# Human vs orangutan
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Orangutan ponAbe2
SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=pk > blastz.log 2>&1 &
# real 388m20.443s
# Completed: 126960 of 126960 jobs
# CPU time in finished jobs: 7068824s 117813.73m 1963.56h 81.82d 0.224 y
# IO & Wait Time: 517624s 8627.07m 143.78h 5.99d 0.016 y
# Average job time: 60s 1.00m 0.02h 0.00d
# Longest finished job: 4940s 82.33m 1.37h 0.06d
# Submission to last job: 62056s 1034.27m 17.24h 0.72d
# some jobs failed (because they were done but parasol didn't realize that)
# after recovery, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-continue=cat -bigClusterHub=pk > cat.log 2>&1 &
# real 390m56.934s
cat fb.hg18.chainPonAbe2Link.txt
# 2676696124 bases of 2881515245 (92.892%) in intersection
# And the swap
mkdir /cluster/data/ponAbe2/bed/blastz.hg18.swap
cd /cluster/data/ponAbe2/bed/blastz.hg18.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=pk > swap.log 2>&1 &
# real 123m9.197s
cat fb.ponAbe2.chainHg18Link.txt
# 2824501297 bases of 3093572278 (91.302%) in intersection
##############################################################
# NIMH Bipolar Genome Graphs built-in (DONE 2007-10-04 Galt)
ssh hgwdev
    mkdir /cluster/data/hg17/bed/nimhBipolar
    cd /cluster/data/hg17/bed/nimhBipolar
    # I registered and downloaded:
wget http://mapgenetics.nimh.nih.gov/BP_POOLING/german_data_share.csv.zip \
--user=galt --password=mypassword
wget http://mapgenetics.nimh.nih.gov/BP_POOLING/nimh_data_share.csv.zip \
--user=galt --password=mypassword
unzip german_data_share.csv.zip
unzip nimh_data_share.csv.zip
mkdir chromGraphs
tail +2 nimh_data_share.csv | tr -d '"' | gawk -F ',' '{print $1 "\t" $9}' \
> chromGraphs/nimhBipolarUs.cgt
tail +2 german_data_share.csv | tr -d '"' | gawk -F ',' '{print $1 "\t" $9}' \
> chromGraphs/nimhBipolarDe.cgt
cd chromGraphs/
mkdir hg17 hg18
    hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 \
        -pathPrefix=/gbdb/hg17/chromGraph hg17 nimhBipolarUs nimhBipolarUs.cgt \
        >& nimhBipolarUs.hg17.errors
    mv nimhBipolarUs.cgb hg17/
    hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 \
        -pathPrefix=/gbdb/hg17/chromGraph hg17 nimhBipolarDe nimhBipolarDe.cgt \
        >& nimhBipolarDe.hg17.errors
    mv nimhBipolarDe.cgb hg17/
    hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 \
        -pathPrefix=/gbdb/hg18/chromGraph hg18 nimhBipolarUs nimhBipolarUs.cgt \
        >& nimhBipolarUs.hg18.errors
    mv nimhBipolarUs.cgb hg18/
    hgLoadChromGraph -idTable=snpArrayIllumina550 -minusLog10 \
        -pathPrefix=/gbdb/hg18/chromGraph hg18 nimhBipolarDe nimhBipolarDe.cgt \
        >& nimhBipolarDe.hg18.errors
    mv nimhBipolarDe.cgb hg18/
mv nimhBipolarDe.cgb hg18/
pushd /gbdb/hg17/chromGraph
ln -s /cluster/data/hg17/bed/nimhBipolar/chromGraphs/hg17/*.cgb .
popd
pushd /gbdb/hg18/chromGraph
ln -s /cluster/data/hg17/bed/nimhBipolar/chromGraphs/hg18/*.cgb .
popd
# Add the hack row into metaChromGraph for the composite tracks.
hgsql hg17 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile)
values ("bipolar", 0, 0, "composite")'
hgsql hg18 -e 'insert into metaChromGraph (name, minVal, maxVal, binaryFile)
values ("bipolar", 0, 0, "composite")'
#Add composite track info to src/hg/makeDb/trackDb/human/trackDb.ra:
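# (Sketch of the kind of composite stanza added; the labels and the
#  chromGraph type line are illustrative, not copied from the real entry.)
#   track bipolar
#   compositeTrack on
#   shortLabel NIMH Bipolar
#   longLabel NIMH Bipolar Genome Graphs
#   group phenDis
#   type chromGraph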
############################################################################
# MGI MOUSE QTL MAPPED TO HUMAN (DONE 10/10/07 angie)
# Use a stringently filtered version of over.chain to do the mapping,
# so we only pick up large chunks (targeting >10,000bases) of the
# large fuzzy QTL regions.
# Of the MGI QTLs, some are large as expected, but most are tiny --
# they have only the peak STS marker coords, no indication of the
# range. Jim suggested padding those out to 100k. So I will process
# these in two batches, and make subtracks -- one for original, one
# for our modified set.
### NOTE FOR NEXT TIME ###
### Use jaxQtl instead of jaxQTL throughout.
ssh kolossus
mkdir /cluster/data/hg18/bed/jaxQTL
cd /cluster/data/hg18/bed/jaxQTL
# mm8ToHg18 was built before doBlastz included chainStitchId in the
# pipe to create over.chain. Run it here, to repair any chain breaks:
chainStitchId /cluster/data/mm8/bed/liftOver/mm8ToHg18.over.chain.gz \
/scratch/tmp/mm8ToHg18Stitch.over.chain
# For rn4->hg18 (rgdRatQtl above), I eyeballed scores vs. spans of
# stitched chains, to try to find a score threshold over which almost
# all spans were at least 10 or 20k, most >50k. For mm8->hg18, the
# correspondence is not quite so smooth, and in order to keep all spans
# >= 100k, the score threshold would have to be 170k (compared to
# 500k for rn4-hg18) and would pick up a lot of short chains.
# So this time I'll try filtering directly by span instead of score
# (but add a reasonable minScore to kick out some outliers).
chainFilter /scratch/tmp/mm8ToHg18Stitch.over.chain \
-tMinSize=20000 -qMinSize=20000 -minScore=10000\
> mm8ToHg18Coarse.over.chain
# Separate the mm8 jaxQtl's by size and reduce to bed4:
awk 'BEGIN{OFS="\t";} \
($3-$2) < 1000 {s = $2 > 50000 ? $2-50000 : 0; \
print $1, s, $3+50000, $4;}' \
/cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed \
> mm8.jaxQtl.padded.bed
cp /dev/null tmp.bed
foreach chr (`awk '{print $1;}' /cluster/data/mm8/chrom.sizes`)
set size = `awk '$1 == "'$chr'" {print $2;}' /cluster/data/mm8/chrom.sizes`
awk 'BEGIN{OFS="\t";} \
$1 == "'$chr'" && $3 > '$size' {$3 = '$size';} \
$1 == "'$chr'" && $3 > $2 {print;}' \
mm8.jaxQtl.padded.bed >> tmp.bed
end
mv tmp.bed mm8.jaxQtl.padded.bed
awk 'BEGIN{OFS="\t";} ($3-$2) > 100000 {print $1, $2, $3, $4;}' \
/cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed \
> mm8.jaxQtl.asIs.bed
# Make sure we didn't miss any between those two size ranges (except for
# the 4 markers whose coords are completely off the end of mm8 chroms):
wc -l mm8.*.bed
# 73 mm8.jaxQtl.asIs.bed
# 1468 mm8.jaxQtl.padded.bed
# 1541 total
wc -l /cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed
#1545 /cluster/data/mm8/bed/jax/2007_09/jaxQtl.bed
# Try liftOver with various -minMatch settings. Compare the number
# mapped and unmapped; eyeball some of the unmapped in mm8, see if
# the hg18 Nets are truly weak there.
foreach minMatch (0.1 0.2 0.25 0.33)
time liftOver -minMatch=$minMatch -multiple \
mm8.jaxQtl.asIs.bed mm8ToHg18Coarse.over.chain \
hg18.jaxQTL.asIs.$minMatch.{bed,unmapped}
time liftOver -minMatch=$minMatch -multiple \
mm8.jaxQtl.padded.bed mm8ToHg18Coarse.over.chain \
hg18.jaxQTL.padded.$minMatch.{bed,unmapped}
wc -l hg18.jaxQTL.*.$minMatch.{bed,unmapped}
echo ""
end
#typical time: 23s for asIs, 45s for padded
# 757 hg18.jaxQTL.asIs.0.1.bed
# 1471 hg18.jaxQTL.padded.0.1.bed
# 0 hg18.jaxQTL.asIs.0.1.unmapped
# 54 hg18.jaxQTL.padded.0.1.unmapped
# 634 hg18.jaxQTL.asIs.0.2.bed
# 1429 hg18.jaxQTL.padded.0.2.bed
# 0 hg18.jaxQTL.asIs.0.2.unmapped
# 128 hg18.jaxQTL.padded.0.2.unmapped
# 532 hg18.jaxQTL.asIs.0.25.bed
# 1345 hg18.jaxQTL.padded.0.25.bed
# 2 hg18.jaxQTL.asIs.0.25.unmapped
# 282 hg18.jaxQTL.padded.0.25.unmapped
# 362 hg18.jaxQTL.asIs.0.33.bed
# 1146 hg18.jaxQTL.padded.0.33.bed
# 8 hg18.jaxQTL.asIs.0.33.unmapped
# 670 hg18.jaxQTL.padded.0.33.unmapped
# I eyeballed the 0.1 .bed and .unmapped files, and they look
# pretty good, esp. for mapped... we could probably get away with
# 0.2 for the asIs but 0.1 looks OK.
# Many of the records are completely contained within other records
# for the same QTL (inversions I suppose) -- they don't really tell
# us anything new about the murky QTL region, so merge them in.
# NOTE FOR NEXT TIME: try this:
# liftOverMerge -mergeGap=10000 hg18.jaxQTL.asIs.0.1.bed stdout \
# | mergeOverlapBed4.pl - > hg18.jaxQTL.asIs.0.1.pruned.bed
# liftOverMerge joins items separated by small (a relative term) gaps.
mergeOverlapBed4.pl hg18.jaxQTL.asIs.0.1.bed \
> hg18.jaxQTL.asIs.0.1.pruned.bed
mergeOverlapBed4.pl hg18.jaxQTL.padded.0.1.bed \
> hg18.jaxQTL.padded.0.1.pruned.bed
wc -l hg18.jaxQTL.*.pruned.bed
# 398 hg18.jaxQTL.asIs.0.1.pruned.bed
# 1463 hg18.jaxQTL.padded.0.1.pruned.bed
ssh hgwdev
cd /cluster/data/hg18/bed/jaxQTL
### NOTE FOR NEXT TIME ###
### Call the tables jaxQtl* instead of jaxQTL* -- QA doesn't like jaxQTL.
hgLoadBed hg18 jaxQTLAsIs hg18.jaxQTL.asIs.0.1.pruned.bed
hgLoadBed hg18 jaxQTLPadded hg18.jaxQTL.padded.0.1.pruned.bed
# Make sure there aren't any illegal coords:
checkTableCoords -verbose=2 hg18 jaxQTLAsIs
checkTableCoords -verbose=2 hg18 jaxQTLPadded
runJoiner.csh hg18 jaxQTLAsIs
runJoiner.csh hg18 jaxQTLPadded
# Tables renamed kuhn 10-12-2007
# jaxQTLAsIs to jaxQtlAsIs
# jaxQTLPadded to jaxQtlPadded
###########################################################################
# Build targetScanS track - (DONE - 2007-10-05 - 2007-10-31 - Hiram)
# requested by: George Bell gbell at wi.mit.edu
ssh hgwdev
mkdir -p /cluster/data/hg18/bed/targetScanS
cd /cluster/data/hg18/bed/targetScanS
wget --timestamping \
http://jura.wi.mit.edu/targetscan/vert_40/ucsc/hg18/hg18ConsChrALL.bed
hgLoadBed hg18 targetScanS -tmpDir=/scratch/tmp hg18ConsChrALL.bed
# Loaded 50764 elements of size 6
featureBits hg18 targetScanS
# 313293 bases of 2881515245 (0.011%) in intersection
################################
# previous attempts listed below
    # they don't supply them all, but we don't know which ones they
# don't. So, ask for them all, and remove the files that are empty.
for C in `cut -f1 ../../chrom.sizes | sed -e "s/chr//"`
do
wget --timestamping \
"http://jura.wi.mit.edu/targetscan/vert_40/ucsc/NR/hg18ConsChr${C}.bed" \
-O hg18ConsChr${C}.bed
if [ ! -s "hg18ConsChr${C}.bed" ]; then
rm -f "hg18ConsChr${C}.bed"
fi
done
# Remove the browser/track lines from these custom track files
# and load into the hg18.targetScanS table
egrep -h -v "^browser|^track" hg*.bed | \
hgLoadBed hg18 targetScanS -tmpDir=/scratch/tmp stdin
# Loaded 50802 elements of size 6
featureBits hg18 targetScanS
# 312951 bases of 2881515245 (0.011%) in intersection
# Create/edit/check in targetScans.html and trackDb.ra under
# kent/src/hg/makeDb/trackDb/human/hg18
###########################################################################
# RE-BUILD WGRNA TRACK (DONE, 2007-10-05, Fan)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir wgRna-2007-10-05
cd wgRna-2007-10-05
# Received the data file, wgtrack_oct2007.txt (saved from wgtrack_oct2007.doc)
# from Michel Weber's email
# (Michel.Weber at ibcg.biotoul.fr)
# and place it under cd /cluster/data/hg18/bed/wgRna-2007-10-05.
cat wg_track_oct2007.txt|sed -e 's/ /\t/g' > wgRna.tab
hgLoadBed -sqlTable=/cluster/home/fanhsu/src/hg/lib/wgRna.sql hg18 wgRna wgRna.tab
#############################################################################
# BLASTZ calJac1 - Marmoset (2007-10-09 kate)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastz.calJac1.2007-10-07
cd /cluster/data/hg18/bed/blastz.calJac1.2007-10-07
cat << '_EOF_' > DEF
# human vs. marmoset
# dynamic masking param
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Marmoset (calJac1)
SEQ2_DIR=/san/sanvol1/scratch/calJac1/calJac1.2bit
SEQ2_LEN=/san/sanvol1/scratch/calJac1/chrom.sizes
SEQ2_LIMIT=500
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.calJac1.2007-10-07
'_EOF_'
# << happy emacs
doBlastzChainNet.pl DEF \
-bigClusterHub pk \
-chainMinScore=3000 -chainLinearGap=medium >& do.log &
tail -f do.log
    # failed at download step due to a pre-existing file of Brian's
doBlastzChainNet.pl DEF \
-bigClusterHub pk -continue=download \
-chainMinScore=3000 -chainLinearGap=medium >& do2.log &
tail -f do2.log
#########################################################
# RE-BUILD GAD TRACK (Done, 10/17/07, Fan)
mkdir /cluster/store12/gad071011
rm /cluster/data/gad
ln -s /cluster/store12/gad071011 /cluster/data/gad
cd /cluster/data/gad
# Receive "all.txt" from GAD
# contact person: Garner, John (NIH/NIA/IRP) [F] [garnerjr@mail.nih.gov]
hgsql hg18 -e 'drop table gadAll'
hgsql hg18 <~/src/hg/lib/gadAll.sql
hgsql hg18 -e 'load data local infile "all.txt" into table gadAll ignore 3 lines'
# create gad table
gadPos hg18 j18.tmp
cat j18.tmp |sort -u >hg18.gad.tab
hgLoadBed hg18 gad hg18.gad.tab
rm j18.tmp
#########################################################################
# HAPMAP LD (DONE 10/26/07 angie -- phased REDONE 1/30/08)
# Based on Daryl's hg17 work. Data version here is release #22,
# March 2007 (2007-03).
# 1/30/08: HapMap re-released the phased genotypes 1/22/08 -- re-run,
# but without the removal of question marks that we had to do the
# first time around.
# hapmap.org offers ld_data downloads that look like the output of
# makeDcc -- but only for older versions. To get LD for the latest
# release (and for hg18 coords), compute LD from genotype as Daryl did.
############################# unphased ##############################
#*** NOTE FOR NEXT TIME: don't bother with individual CHB and JPT subsets,
#*** {CEU, CHB+JPT, YRI} is what we display.
#*** Actually, if there is a next time, we'll probably just start with
#*** phased and ignore unphased.
ssh kolossus
mkdir -p /san/sanvol1/scratch/hg18/bed/hapmapLd/genotypes_2007-03/run.Haploview
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/genotypes_2007-03
# wget all genotype data:
wget ftp://ftp.hapmap.org/pub/hapmap/public/00README.releasenotes_rel22
wget ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2007-03/fwd_strand/non-redundant/genotypes_chr\*.txt.gz
# Use latest Haploview to compute LD scores:
wget http://www.broad.mit.edu/mpg/haploview/downloads/Haploview.jar
# Haploview cluster run on whole-chrom genotype files was a bust.
# Even on kki nodes, with java memory maxed out, 47 of 120 jobs crashed
# and one was still running after 5.5 days so I killed it.
# Meanwhile, Daryl suggested using the phased data instead. It is
# not yet available for all chrom/pops, but start with what's there
# to iron out the flow.
# New approach to unphased -> LD -- split, run Haploview, merge.
ssh pk
# Note: although the genotypes_ files are *mostly* sorted by position,
# they're not completely sorted! That can cause splitGenotype.pl to
# screw up (as well as other downstream stuff), so sort them on the way
# into splitGenotype.
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.splitUnphased
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.splitUnphased
cat > runSplit.csh <<'_EOF_'
#!/bin/csh -ef
set f = $1
set base = $f:t:r:r
set scriptBin = ~/kent/src/hg/snp/hapmapLd
set tmpDir = `mktemp -d -p /scratch/tmp runSplit.XXXXXX`
zcat $f \
| sort -k4n,4n \
| $scriptBin/splitGenotype.pl -suffix .txt.gz \
10000000 250000 $tmpDir/$base
mv $tmpDir/$base.* ../splitUnphased/$base/
rmdir $tmpDir
'_EOF_'
# << emacs
chmod a+x runSplit.csh
cp /dev/null jobList
foreach f (../genotypes_2007-03/genotypes_chr*.txt.gz)
mkdir -p ../splitUnphased/$f:t:r:r
echo ./runSplit.csh $f >> jobList
end
para make jobList
para time
#Completed: 120 of 120 jobs
#CPU time in finished jobs: 826s 13.77m 0.23h 0.01d 0.000 y
#IO & Wait Time: 457s 7.61m 0.13h 0.01d 0.000 y
#Average job time: 11s 0.18m 0.00h 0.00d
#Longest finished job: 22s 0.37m 0.01h 0.00d
#Submission to last job: 29s 0.48m 0.01h 0.00d
# Run Haploview on split files.
ssh pk
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewSplitUnphased
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewSplitUnphased
set scriptBin = ~/kent/src/hg/snp/hapmapLd
set hvPath = /san/sanvol1/scratch/hg18/bed/hapmapLd/Haploview.jar
# Latest installed java on the cluster nodes (not on the para hub machine):
set javaPath = /usr/java/jre1.5.0_12/bin/java
set javaMemSize = 1500M
find /san/sanvol1/scratch/hg18/bed/hapmapLd/splitUnphased \
-name \*.txt.gz -ls \
| awk '{print $7, $11;}' | sort -nr > filesBySize
cp /dev/null jobList
foreach f (`awk '{print $2;}' filesBySize`)
echo $scriptBin/runHaploview.csh $f $javaPath $hvPath $javaMemSize \
>> jobList
end
para make jobList
para time
#Completed: 1493 of 1493 jobs
#CPU time in finished jobs: 582015s 9700.25m 161.67h 6.74d 0.018 y
#IO & Wait Time: 6558s 109.30m 1.82h 0.08d 0.000 y
#Average job time: 394s 6.57m 0.11h 0.00d
#Longest finished job: 1711s 28.52m 0.48h 0.02d
#Submission to last job: 1740s 29.00m 0.48h 0.02d
# Merge Haploview results.
ssh pk
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.mergeSplitHapLD
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.mergeSplitHapLD
cat > runMerge.csh <<'_EOF_'
#!/bin/csh -ef
set mapFile = $1
set outFile = $2
set scriptBin = ~/kent/src/hg/snp/hapmapLd
set tmpOut = `mktemp -p /scratch/tmp runMerge.XXXXXX`
$scriptBin/mergeHaploviewLD.pl $mapFile $tmpOut
mv $tmpOut $outFile
'_EOF_'
# << emacs
chmod a+x runMerge.csh
mkdir ../mergedUnphasedLD
cp /dev/null jobList
foreach f (`ls -1S ../splitUnphased/genotypes_chr*/genotypes_chr*.map`)
set base = $f:t:r
echo ./runMerge.csh $f ../mergedUnphasedLD/$base.txt.LD.gz >> jobList
end
para make jobList
para time
#Completed: 120 of 120 jobs
#CPU time in finished jobs: 16035s 267.25m 4.45h 0.19d 0.001 y
#IO & Wait Time: 17282s 288.03m 4.80h 0.20d 0.001 y
#Average job time: 278s 4.63m 0.08h 0.00d
#Longest finished job: 737s 12.28m 0.20h 0.01d
#Submission to last job: 738s 12.30m 0.20h 0.01d
# Compare results of unsplit run with split/merge:
ssh kolossus
cd /san/sanvol1/scratch/hg18/bed/hapmapLd
# Compare SNP pairs:
zcat genotypes_2007-03/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz \
| awk '{print $1, $2;}' > /tmp/1
zcat mergedUnphasedLD/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz \
| awk '{print $1, $2;}' > /tmp/2
wc -l /tmp/1 /tmp/2
# 32514982 /tmp/1
# 32514982 /tmp/2
cmp /tmp/1 /tmp/2
# Compare entire files:
zcat genotypes_2007-03/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz > /tmp/1
zcat mergedUnphasedLD/genotypes_chr9_JPT_r22_nr.b36_fwd.txt.LD.gz > /tmp/2
head /tmp/1 /tmp/2
cmp /tmp/1 /tmp/2
# Woohoo!
############################# phased ##############################
# For this build, Daryl suggested using the phased data (output of
# Jonathan Marchini's PHASE program) instead of raw genotype data
ssh kolossus
mkdir -p /san/sanvol1/scratch/hg18/bed/hapmapLd/phased_2007-08_rel22
cd /san/sanvol1/scratch/hg18/bed/hapmapLd
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/phased_2007-08_rel22
# 1/30/08: re-run from this point on, to pick up re-release (same URL)
wget --timestamping \
ftp://ftp.hapmap.org/pub/hapmap/public/phasing/2007-08_rel22/phased/\*.gz
# Downstream stuff depends on the inputs being sorted by position -- check:
cd /san/sanvol1/scratch/hg18/bed/hapmapLd
foreach f (phased_2007-08_rel22/*_legend.txt.gz)
echo $f
zcat $f | tail +2 | awk '{print $2;}' > /tmp/1
sort -n /tmp/1 > /tmp/2
cmp /tmp/1 /tmp/2
end
rm -f /tmp/1 /tmp/2
# kki cluster run -- need lots of memory! more than pk's 2G hard limit.
# (would use memk but it doesn't have java and kki is sufficient)
ssh kki
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewPhased
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.HaploviewPhased
set scriptBin = $HOME/kent/src/hg/snp/hapmapLd
set hv = $scriptBin/runHaploviewPhased.csh
set phaseDir = /san/sanvol1/scratch/hg18/bed/hapmapLd/phased_2007-08_rel22
set hvPath = /san/sanvol1/scratch/hg18/bed/hapmapLd/Haploview.jar
# Latest installed java on the cluster nodes (not on the para hub machine):
set javaPath = /usr/java/jre1.5.0_12/bin/java
set javaMemSize = 4G
# Sort by size (descending) to kick off the biggest jobs first:
cp /dev/null jobList
foreach f (`ls -1S $phaseDir/genotypes_chr*.phase.gz`)
echo $hv $f:r:r $javaPath $hvPath $javaMemSize >> jobList
end
para make jobList
para time
#Completed: 66 of 66 jobs
#CPU time in finished jobs: 406845s 6780.76m 113.01h 4.71d 0.013 y
#IO & Wait Time: 1517s 25.28m 0.42h 0.02d 0.000 y
#Average job time: 6187s 103.12m 1.72h 0.07d
#Longest finished job: 15667s 261.12m 4.35h 0.18d
#Submission to last job: 29868s 497.80m 8.30h 0.35d
# Our software assumes that LD scores are given for consecutive SNPs
# without gaps in between, so scores in the encoded lists can be
# associated with other SNPs just by their position in the list.
# Make sure that's the case! I suspect this also depends on the
# inputs to Haploview being sorted by position -- checked those above.
ssh kolossus
cd /san/sanvol1/scratch/hg18/bed/hapmapLd
    set scriptBin = ~/kent/src/hg/snp/hapmapLd
    cp /dev/null checkLD.log
foreach f ( mergedUnphasedLD/*.LD.gz phased_2007-08_rel22/*.LD.gz )
echo $f >> checkLD.log
$scriptBin/checkLDSnpOrder.pl $f >>& checkLD.log
echo "" >> checkLD.log
date
end
    # Takes a long time (~4 minutes each for 184 files -> 11-12 hours) --
# left to run overnight.
# Cluster run to translate Haploview .LD output into the DCC's
# ld_data downloads format, and in turn into our bed4+ format.
ssh pk
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/{dcc,bed}{Phased,Unphased}
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatUnphased
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatUnphased
cat > runFormatsUnphased.csh <<'_EOF_'
#!/bin/csh -ef
set base = $1
set db = hg18
set scriptBin = ~/kent/src/hg/snp/hapmapLd
set hapDir = /san/sanvol1/scratch/$db/bed/hapmapLd
set unphDir = $hapDir/genotypes_2007-03
set unphLDDir = $hapDir/mergedUnphasedLD
set dccOut = `echo $base | sed -e 's/^genotypes_/ld_/; s/$/.txt.gz/;'`
set chr = `echo $base | perl -wpe 's/^.*_(chr[0-9MXY]+)_.*/$1/'`
set pop = `echo $base | perl -wpe 's/^.*_chr[0-9MXY]+_([A-Z+]+)_.*/$1/'`
set bedOut = $db.${pop}_$chr.bed.gz
$scriptBin/makeDccAndLdBed.pl \
$unphDir/$base.txt.gz $unphLDDir/$base.txt.LD.gz \
$hapDir/dccUnphased/$dccOut $hapDir/bedUnphased/$bedOut
'_EOF_'
# << emacs
chmod a+x runFormatsUnphased.csh
cp /dev/null jobList
foreach f (`ls -1S ../mergedUnphasedLD/genotypes_chr*.txt.LD.gz`)
echo ./runFormatsUnphased.csh $f:t:r:r:r >> jobList
end
para make jobList
para time
#Completed: 120 of 120 jobs
#CPU time in finished jobs: 101968s 1699.46m 28.32h 1.18d 0.003 y
#IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y
#Average job time: 847s 14.11m 0.24h 0.01d
#Longest finished job: 2276s 37.93m 0.63h 0.03d
#Submission to last job: 2276s 37.93m 0.63h 0.03d
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatPhased
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.formatPhased
cat > runFormatsPhased.csh <<'_EOF_'
#!/bin/csh -ef
set basePath = $1
set base = $basePath:t
set db = hg18
set scriptBin = ~/kent/src/hg/snp/hapmapLd
set hapDir = /san/sanvol1/scratch/$db/bed/hapmapLd
set dccOut = `echo $base | sed -e 's/^genotypes_/ld_/; s/$/.txt.gz/;'`
set chr = `echo $base | perl -wpe 's/^.*_(chr[0-9MXY]+)_.*/$1/'`
set pop = `echo $base | perl -wpe 's/^.*_chr[0-9MXY]+_([A-Z+]+)_.*/$1/'`
set bedOut = $db.${pop}_$chr.bed.gz
$scriptBin/makeDccAndLdBed.pl ${basePath}_legend.txt.gz $basePath.LD.gz \
$hapDir/dccPhased/$dccOut $hapDir/bedPhased/$bedOut
'_EOF_'
# << emacs
chmod a+x runFormatsPhased.csh
cp /dev/null jobList
foreach f (`ls -1S ../phased_2007-08_rel22/genotypes_chr*.LD.gz`)
echo ./runFormatsPhased.csh $f:r:r >> jobList
end
para make jobList
para time
#Completed: 66 of 66 jobs
#CPU time in finished jobs: 66155s 1102.58m 18.38h 0.77d 0.002 y
#IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y
#Average job time: 972s 16.20m 0.27h 0.01d
#Longest finished job: 2292s 38.20m 0.64h 0.03d
#Submission to last job: 2292s 38.20m 0.64h 0.03d
# Create empty tables, then load one pop_chr at a time in order
# to avoid thrashing.
# hg17 took about half an hour to an hour per population on hgwdev.
# Load on kolossus, then ask cluster-admin to rsync to hgwdev.
ssh kolossus
cd /san/sanvol1/scratch/hg18/bed/hapmapLd
cat > loadOne.csh <<'_EOF_'
#!/bin/csh -ef
set tableBase = $1
set Pop = $2
set bedDir = $3
set table = $tableBase$Pop
hgsql hg18 -e "drop table if exists $table;"
sed "s/ld2/$table/" $HOME/kent/src/hg/lib/ld2.sql \
| hgsql hg18
set pop = `echo $Pop | perl -wpe 's/ChbJpt/JPT+CHB/; tr/a-z/A-Z/;'`
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
set bed = $bedDir/hg18.${pop}_chr$c.bed.gz
if (-e $bed) then
echo $bed
hgLoadBed -noSort -oldTable hg18 $table $bed
else
echo "\n$bed does not exist\n"
endif
echo ""
end
echo -n "\nDone with $table. "; date
'_EOF_'
# << emacs
chmod a+x loadOne.csh
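# For example, "./loadOne.csh hapmapLdPh Ceu bedPhased" creates table
# hapmapLdPhCeu and loads bedPhased/hg18.CEU_chr*.bed.gz into it
# (the Pop name ChbJpt maps to the JPT+CHB file names).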
# phased:
cp /dev/null loadPhased.log
foreach Pop (Ceu ChbJpt Yri)
./loadOne.csh hapmapLdPh $Pop bedPhased >>& loadPhased.log
end
# ~16 minutes for all phased on kolossus
# 1/30/08: ~11 minutes for all phased on hgwdev! bg load ~1.25
# unphased:
cp /dev/null loadUnphased.log
foreach Pop (Ceu Chb ChbJpt Jpt Yri)
./loadOne.csh hapmapLd $Pop bedUnphased >>& loadUnphased.log
end
# ~21 minutes -- got segfaults for empty gzipped chrY files, debug later.
rm -f bed.tab
# Repeat hg17 sanity checks on the unphased results.
ssh pk
mkdir /san/sanvol1/scratch/hg18/bed/hapmapLd/run.maxDist
cd /san/sanvol1/scratch/hg18/bed/hapmapLd/run.maxDist
# Find the largest distance between any paired SNPs in DCC ld_* files.
# Should be 249999 or less. Also count the number of unique starting
# coords. We can compare those to the SNP counts in checkLD.log.
cat > runMaxDist.csh <<'_EOF_'
#!/bin/csh -ef
set dccIn = $1
set out = $dccIn:r:r.check
echo -n "$dccIn:t " > $out
zcat $dccIn \
| awk '{if ($2-$1>max) max=$2-$1} \
{if (prevStart && $1 != prevStart) count++; prevStart = $1;} \
END {print max "\t" count; \
if (max > 249999) print "ERROR: maxDistance too large!";}' \
>> $out
'_EOF_'
# << emacs
chmod a+x runMaxDist.csh
cp /dev/null jobList
foreach f (../dccUnphased/ld_*.txt.gz)
echo ./runMaxDist.csh $f >> jobList
end
para make jobList
para time
#Completed: 120 of 120 jobs
#CPU time in finished jobs: 12274s 204.56m 3.41h 0.14d 0.000 y
#IO & Wait Time: 4137s 68.96m 1.15h 0.05d 0.000 y
#Average job time: 137s 2.28m 0.04h 0.00d
#Longest finished job: 365s 6.08m 0.10h 0.00d
#Submission to last job: 365s 6.08m 0.10h 0.00d
cd ..
cat dccUnphased/*.check > maxDist.txt
grep -B1 ERROR maxDist.txt
# Other cleanup:
rm -r splitUnphased
#########################################################################
# University of Uppsala, Sweden Chip-chip (2007-10-18 kate)
# 3 datasets (Usf1, Usf2, H3ac) -- wiggle and bed for each, in hg16 coords
# Submitted by Adam Ameur
ssh kkstore02
cd /cluster/data/hg18/bed
mkdir uppsalaChip
cd uppsalaChip
foreach f (H3ac Usf1 Usf2)
#wget -nd http://www.lcb.uu.se/~mada/UUtracks_hg16/${f}_hg16.wig.gz
wget -nd http://www.lcb.uu.se/~mada/UUtracks_hg16/${f}_hg16.bed
end
wget -nd http://www.lcb.uu.se/~mada/UUtracks_hg16/UCSCdescription.html
# lift to hg18
foreach f (lab/*hg16.bed)
set b = `echo $f:t | sed 's/_.*//'`
echo $b
tail +2 $f | \
liftOver stdin \
/cluster/data/hg16/bed/liftOver/hg16ToHg18.over.chain.gz \
$b.bed $b.bed.unmapped
end
ssh kolossus
cd /cluster/data/hg18/bed
cd uppsalaChip
# remove duplicate regions resulting from liftOver
cat > trimDups.awk << 'EOF'
BEGIN {chr=""; start="";}
{
if (!(($1 == chr) && ($2 == start)))
print;
chr = $1;
start = $2;
}
'EOF'
# process in 2 unix pipelines, so as not to overload machine
cat > load.csh << 'EOF'
foreach f (lab/*hg16.wig.gz)
set b = `echo $f:t | sed 's/_.*//'`
echo $b
date
nice zcat $f | tail +2 | \
nice varStepToBedGraph.pl stdin | \
nice liftOver stdin \
/cluster/data/hg16/bed/liftOver/hg16ToHg18.over.chain.gz \
$b.wigBed $b.wigBed.unmapped
nice bedSort $b.wigBed stdout | \
nice awk -f trimDups.awk | \
nice wigEncode stdin $b.wig $b.wib
date
end
'EOF'
csh load.csh >&! load.log &
# approx. 50 minutes to process the 3 datasets
# load bed and wiggles into database
ssh hgwdev
cd /cluster/data/hg18/bed/uppsalaChip
cat > load2.csh << 'EOF'
foreach f (*.wig)
set b = $f:r
echo $b
date
set table = uppsalaChip${b}Sites
hgLoadBed hg18 $table $b.bed
set table = uppsalaChip${b}Signal
ln -s /cluster/data/hg18/bed/uppsalaChip/$b.wib /gbdb/hg18/wib/uppsalaChip${b}Signal.wib
hgLoadWiggle hg18 $table $f
date
end
'EOF'
csh load2.csh >&! load2.log &
# just a few minutes runtime
# somehow 2 beds were left out above (lifted files were missing)
cat > loadBed.csh << 'EOF'
foreach f (*.bed)
set b = $f:r
echo $b
hgLoadBed hg18 uppsalaChip${b}Sites $f
end
'EOF'
# << emacs
csh loadBed.csh >& loadBed.log &
# data distribution
textHistogram H3ac.wigBed -minVal=-2 -real -col=4 -binSize=.5
-2.000000 611
-1.500000 5711
-1.000000 * 391229
-0.500000 ************************************************************ 21240336
0.000000 ******************************************************* 19325712
0.500000 ** 689267
1.000000 99083
1.500000 24453
2.000000 4635
2.500000 635
3.000000 49
3.500000 3
<minVal or >=4.000000 562
#########################################################################
# BLASTZ Zebrafish danRer5 (DONE - 2007-10-18 - Hiram)
ssh kkstore02
mkdir /cluster/data/hg18/bed/blastzDanRer5.2007-10-17
cd /cluster/data/hg18/bed/blastzDanRer5.2007-10-17
cat << '_EOF_' > DEF
# Human (hg18) vs zebrafish (danRer5)
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0
# QUERY - zebrafish (danRer5)
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
SEQ2_LEN=/cluster/data/danRer5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzDanRer5.2007-10-17
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 &
# real 369m20.490s
cat fb.hg18.chainDanRer5Link.txt
# 73923439 bases of 2881515245 (2.565%) in intersection
mkdir /cluster/data/danRer5/bed/blastz.hg18.swap
cd /cluster/data/danRer5/bed/blastz.hg18.swap
time nice -n +19 doBlastzChainNet.pl \
-chainMinScore=5000 \
/cluster/data/hg18/bed/blastzDanRer5.2007-10-17/DEF \
-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> swap.log 2>&1 &
# real 11m35.536s
cat fb.danRer5.chainHg18Link.txt
# 74166352 bases of 1435609608 (5.166%) in intersection
#########################################################################
# Vista Enhancers (2007-10-18, conodera)
# see also /projects/compbiousr/wet/browser/vista_enhancer/17Oct2007/Makefile
#
# download data file from the vista browser (coordinates are for hg17)
# http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1
# save as enhancerbrowser.datadownload.txt
cd /projects/compbiousr/wet/browser/vista_enhancer/
# liftOver hg17 file
liftOver vista_enhancer.hg17.bed /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz vista_enhancer.hg18.bed vista_enhancer.hg17ToHg18.unMapped
hgLoadBed hg18 vistaEnhancers vista_enhancer.hg18.bed
############################################################################
# Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-10-30 markd)
cd /cluster/data/genbank/data/ccds/
ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
get CCDS.20071030.tar.gz
mkdir /scratch/tmp/ccds
cd /scratch/tmp/ccds
tar -zxf /cluster/data/genbank/data/ccds/CCDS.20071030.tar.gz
# import ccds database tables
/cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds hg18 ccdsInfo ccdsGene
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=hg18 -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords hg18 -verbose=2 ccdsGene
# update all.jointer to include hg18 in ccdsDb
joinerCheck -database=hg18 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=hg18 ccdsGene mgcGenes ccdsMgcMap
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
ccdsMgcMap
# << emacs
#########################################################################
# Load ENSEMBL ver 45 (2007-09-5 markd)
mkdir /cluster/data/hg18/bed/ensembl45
cd /cluster/data/hg18/bed/ensembl45
##
# need to find bounds of haplotype chromosomes
##
# get unmasked haplotype pseudochroms from Ensembl (dna, NOT dna_rm)
wget ftp://ftp.ensembl.org/pub/current_homo_sapiens/data/fasta/dna/
Homo_sapiens.NCBI36.46.dna.chromosome.c22_H2.fa.gz
Homo_sapiens.NCBI36.46.dna.chromosome.c5_H2.fa.gz
Homo_sapiens.NCBI36.46.dna.chromosome.c6_COX.fa.gz
Homo_sapiens.NCBI36.46.dna.chromosome.c6_QBL.fa.gz
# get gap locations and create hap.lift
foreach f ( *.fa.gz )
faGapLocs $f $f:r:r.lift
end
# build lift file for randoms and haps
(mkRandomNTLift hg18 && cat hap.lift) > randHap.lift
# load ensembl genes
hgLoadEnsembl -l randHap.lift -p homo_sapiens core_45_36g hg18>&log
# got 1 gene with CDS exons with no frame:
ENST00000374459
# add this to problem ids and rerun
hgLoadEnsembl -l randHap.lift -f problem.ids homo_sapiens core_45_36g hg18>&log
# load pseudogenes
hgLoadEnsembl -l randHap.lift -p homo_sapiens core_45_36g hg18>&log
# got 3 pseudogenes with CDS bounds outside of exons
ENST00000342841
ENST00000361218
ENST00000388856
# add these to problem ids and rerun
hgLoadEnsembl -l randHap.lift -f problem.ids -p homo_sapiens core_45_36g hg18>&log
# Vega code is not working in Robert's scripts.
# Done to support CCDS; push not requested, awaiting resolution of the
# Vega issue.
#########################################################################
# AFFY TRANSCRIPTOME PHASE 3 (2007-11-06, Andy)
ssh hgwdev
bash
cd /san/sanVol1/scratch/andy/transcriptome
mkdir splits
cd originalWigs/
for f in *.wigVar; do
table=${f%.wigVar};
mkdir ../splits/$table
grep -v "^track" $f | splitWig stdin 1000000 ../splits/${table}/split
echo Done with $table
done
# Done with cluster run
ssh kolossus
cd /san/sanVol1/scratch/andy/transcriptome/lift/bed
for tab in *; do
for split in ${tab}/*; do
cat $split >> ${tab}.bed
done
echo done catting $tab
done
# Split into chrom beds (with a cluster run)
for f in `ls -1 hg18.bed`; do
tab=${f%.bed};
for c in `cut -f1 chrom.sizes`; do
cfile=hg18.bed.chromSplit/${tab}.${c}.bed;
outFile=hg18.wigVar/${tab}.wigVar;
if [ -e $cfile ]; then
echo variableStep chrom=${c} span=1 >> $outFile;
bedSort $cfile stdout | awk 'BEGIN{FS="\t"}{print $2+1, $4;}' | awk -f noDupe.awk >> $outFile;
echo Added $cfile to $outFile >> the.log;
fi;
done;
echo DONE with $tab >> the.log;
wigEncode $outFile hg18.wigVar/${tab}.wig hg18.wigVar/${tab}.wib >> the.log;
gzip $outFile
done
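# (The awk above shifts the 0-based bedGraph-style starts to the 1-based
# positions that variableStep expects; e.g. an illustrative input line
# "chr1 999 1000 0.5" becomes "1000 0.5".)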
cd hg18.wigVar/
mkdir -p /cluster/data/hg18/bed/affyTxnPhase3/wib
for f in *.wib; do
echo copying $f...;
cp $f /cluster/data/hg18/bed/affyTxnPhase3/wib/;
done
pushd /gbdb/hg18/wib
ln -s /cluster/data/hg18/bed/affyTxnPhase3/wib/* .
popd
mkdir -p /cluster/data/hg18/bed/affyTxnPhase3/downloads
cp *.wigVar.gz /cluster/data/hg18/bed/affyTxnPhase3/downloads
mkdir -p /usr/local/apache/htdocs/goldenPath/hg18/affyTxnPhase3
pushd /usr/local/apache/htdocs/goldenPath/hg18/affyTxnPhase3
ln -s /cluster/data/hg18/bed/affyTxnPhase3/downloads/* .
for f in *Strand*; do mv $f sRNA.$f; done
for f in affyTxnPhase3*; do mv $f lRNA.$f; done
#########################################################################
# Blastz Marmoset calJac1 (DONE - 2007-11-09 - Hiram)
## this is not necessary - already done by Kate in October
ssh kkstore06
screen # use screen to control this job
mkdir /cluster/data/hg18/bed/blastzCalJac1.2007-11-09
cd /cluster/data/hg18/bed/blastzCalJac1.2007-11-09
cat << '_EOF_' > DEF
# Human vs marmoset
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Marmoset calJac1
SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzCalJac1.2007-11-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=pk > blastz.log 2>&1 &
# real 542m2.359s
# Completed: 230805 of 230805 jobs
# CPU time in finished jobs: 7279638s 121327.30m 2022.12h 84.26d 0.231 y
# IO & Wait Time: 831303s 13855.05m 230.92h 9.62d 0.026 y
# Average job time: 35s 0.59m 0.01h 0.00d
# Longest finished job: 972s 16.20m 0.27h 0.01d
# Submission to last job: 20572s 342.87m 5.71h 0.24d
cat fb.hg18.chainCalJac1Link.txt
# 2236493373 bases of 2881515245 (77.615%) in intersection
###########################################################################
# LIFT RM ALIGN FILES, MAKE PER-CHROM DOWNLOADS (DONE 12/7/07 angie)
# Lifting of .align files is now automated by doRepeatMasker.pl, but we
# got a user request for .align files from this pre-automation db.
ssh kkstore02
cd /cluster/data/hg18
mkdir downloads/RMalign
foreach c (?{,?} ?{,?}_*hap?)
echo linking/lifting to contigs of $c:t
foreach ctgdir ($c/N[TC]_??????)
set nt = $ctgdir:t
if (! -f $ctgdir/$nt.fa.align) then
pushd $ctgdir
liftRMAlign.pl $nt.lft > $nt.fa.align
popd
endif
ln -s $nt/$nt.fa.align $c/
end
set chr = chr$c:t
if (-e $c/lift/ordered.lft && ! -z $c/lift/ordered.lft) then
echo lifting contigs to chr$c
liftRMAlign.pl $c/lift/ordered.lft \
| gzip -c > downloads/RMalign/$chr.fa.align.gz
endif
if (-e $c/lift/random.lft && ! -z $c/lift/random.lft) then
echo lifting contigs to chr${c}_random
liftRMAlign.pl $c/lift/random.lft \
| gzip -c > downloads/RMalign/${chr}_random.fa.align.gz
endif
end
md5sum downloads/RMalign/*.gz > downloads/RMalign/md5sum.txt
ssh hgwdev ln -s /cluster/data/hg18/downloads/RMalign \
/usr/local/apache/htdocs/goldenPath/hg18/
#########################################################################
# ADD LINKS TO GENETESTS ON hgGene DETAILS PAGE (DONE 12/12/07 Fan)
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed
mkdir geneTests
cd geneTests
# paste the 3 cols gene list from GeneTest web site into file geneTests.lis
cut -f 1 geneTests.lis >j1
cut -f 2 geneTests.lis >j2
cut -f 3 geneTests.lis >j3
cat j1 j2 j3 |sort -u >geneTests.tab
rm j1 j2 j3
hgsql hg18 -e 'drop table geneTests'
hgsql hg18 < ~/src/hg/lib/geneTests.sql
hgsql hg18 -e 'load data local infile "geneTests.tab" into table geneTests
ignore 1 lines'
# the list is independent of hg18, so load it into hg17 too.
hgsql hg17 -e 'drop table geneTests'
hgsql hg17 < ~/src/hg/lib/geneTests.sql
hgsql hg17 -e 'load data local infile "geneTests.tab" into table geneTests
ignore 1 lines'
###########################################################################
# ADD SeattleSNPs PGA GENES ON hgGene DETAILS PAGE. (DONE, Fan, 12/13/07).
cd /cluster/store12/snp
mkdir pga
cd pga
# download data from SeattleSNPs
wget --timestamping http://pga.gs.washington.edu/data.tar.gz
gzip -d *.gz
tar -xvf *.tar
# create SeattleSNPs PGA gene list
cut -f 1 FinishedGenes.txt >j1
cut -f 2 FinishedGenes.txt >j2
cat j1 j2 |sort -u >pga.tab
rm j1 j2
# load the data into the pga table.
hgsql hg18 -e 'drop table pga'
hgsql hg18 < ~/src/hg/lib/pga.sql
hgsql hg18 -e 'load data local infile "pga.tab" into table pga'
###########################################################################
# Reload CCDS (2007-12-12 markd)
# import ccds database as described in ccds.txt
set db=hg18
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
# build initial version of ccdsMgcMap table, updated by nightly genbank update
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.jointer to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
ccdsMgcMap
############################################################################
# dbSNP BUILD 128 (DONE 1/22/08 angie)
# updated snp128ExceptionDesc (tweaked wording) 3/7/08
# 8/7/08: Regenerated snp128.sql with only those enum/set values that are
# actually used (except always keep unknown, the default) and reloaded snp128.
# No data change -- just the sql field definitions for enums and sets.
# QA NOTE: used sudo mytouch on the snp128 table to reset the timestamp to
# .2008-01-22 00:00:00 (was .2008-08-07 16:08:27 after Angie's re-load) in
# order to keep joinerCheck happy and avoid confusion. (8/8/08 brooke)
# Set up build directory
ssh kkstore06
mkdir -p /cluster/store3/dbSNP128/{human,shared}
ln -s /cluster/store3/dbSNP128 /cluster/data/dbSNP/128
# Get field encodings -- if there are changes or additions to the
# encoding of the corresponding fields, you might need to update
# snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also
# hg/lib/snp125Ui.c).
cd /cluster/data/dbSNP/128/shared
set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
alias wg wget --timestamping
wg $ftpSnpDb/shared_data/LocTypeCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpClassCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpFunctionCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpValidationCode.bcp.gz
# Here is another source -- it is not as up-to-date as the above, but
# our encodings (enums and sets in snp128.sql) are named more similar
# to those in the 2005 ASN:
# ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn
########################## DOWNLOAD #############################
cd /cluster/data/dbSNP/128/human
mkdir data schema rs_fasta
# Get data from NCBI (anonymous FTP)
wget ftp://ftp.ncbi.nih.gov/snp/00readme.txt
cd /cluster/data/dbSNP/128/human/data
alias wg wget --timestamping
set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
wg $ftpSnpDb/organism_data/b128_SNPContigLoc_36_2.bcp.gz
wg $ftpSnpDb/organism_data/b128_SNPContigLocusId_36_2.bcp.gz
wg $ftpSnpDb/organism_data/b128_ContigInfo_36_2.bcp.gz
# MapInfo has alignment weights
wg $ftpSnpDb/organism_data/b128_SNPMapInfo_36_2.bcp.gz
# SNP has univar_id, validation status and heterozygosity
wg $ftpSnpDb/organism_data/SNP.bcp.gz
# Get schema
cd /cluster/data/dbSNP/128/human/schema
wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz
# Get fasta files
# using headers of fasta files for molType, class, observed
cd /cluster/data/dbSNP/128/human/rs_fasta
wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz
########################## LOAD NCBI TABLES #############################
# Simplify names of data files -- strip version & extras to get
# local canonical table names.
cd /cluster/data/dbSNP/128/human/data
foreach f (*.bcp.gz)
set new = `echo $f \
| sed -e 's/^b128_SNP//; s/^b128_//; s/_36_2//; s/.bcp//;'`
mv $f $new
echo $new
end
# Extract just the tables that we need from the NCBI msSQL table
# creation file, and get CREATE statements from
# human_9606_table.sql for our 5 tables
cd /cluster/data/dbSNP/128/human/schema
zcat human_9606_table.sql.gz \
| perl -we '$/ = "\nGO\n\n\n\n"; \
while (<>) { \
next unless /^CREATE TABLE \[(b128_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_36_2)?\]/; \
s/b128_(SNP)?//; s/_36_2//; \
s/[\[\]]//g; s/GO\n\n\n/;/; s/smalldatetime/datetime/g; \
s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \
s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \
s/(image|varchar\s+\(\d+\))/BLOB/g; \
print; \
}' \
> table.sql
# load on kolossus or a small cluster machine (mysql5 is OK for this).
ssh kolossus
hgsql '' -e 'create database hg18snp128'
cd /cluster/data/dbSNP/128/human/schema
hgsql hg18snp128 < table.sql
cd ../data
foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
zcat $t.gz \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp128 $t placeholder stdin
end
# There were some warnings (many cleared up by the perl substitution)
# but no rows were dropped. I eyeballed a few examples, seemed OK.
foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
echo -n "${t}:\t"
hgsql -N -B hg18snp128 -e 'select count(*) from '$t
end
#ContigInfo: 7067
#ContigLoc: 24685256
#ContigLocusId: 13129868
#MapInfo: 24132236
#SNP: 11833664
# these counts (except for MapInfo which has ~doubled) are
# slightly down from 126. MapInfo has a lot of alternate assembly
# mappings, esp. the celera assembly; maybe that's new?
# load hg18.ctgPos into dbSnpHumanBuild128, compare contig list between
# ctgPos and ContigInfo
# NOTE FOR NEXT TIME: instead of going through mysql, just make a
# tab-sep dump file of ctgPos.
ssh hgwdev hgsql hg18 -N -B -e '"select * from ctgPos;"' \
| hgLoadSqlTab hg18snp128 ctgPos ~/kent/src/hg/lib/ctgPos.sql stdin
hgsql hg18snp128 -N -B -e 'select contig from ctgPos;' | sort > /tmp/1
# Note: we used to look for group_term = "ref_assembly", but that leaves
# behind some contigs that we include. So use a list of group_label:
hgsql hg18snp128 -NBe 'select distinct(group_label) from ContigInfo'
# --> ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2")
hgsql hg18snp128 -N -B -e 'select contig_acc from ContigInfo \
where group_label in \
("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' | sort > /tmp/2
diff /tmp/1 /tmp/2
# No diff.
#################### EXTRACT INFO FROM NCBI TABLES ####################
mkdir -p /scratch/snp/128/human
cd /scratch/snp/128/human
# Fields of the SNP table and their NCBI source table/file:
# chrom ContigLoc / contigInfo / liftUp
# chromStart ContigLoc / liftUp; check vs phys_pos_from
# chromEnd ContigLoc / liftUp
# name rs + numeric ID that joins all the other sources
# score 0
# strand ContigLoc.orientation
# refNCBI ContigLoc.allele
# refUCSC ContigLoc.allele if insertion, othw. from genomic
# observed fasta headers
# molType fasta headers
# class fasta headers
# valid SNP
# avHet SNP
# avHetSE SNP
# func ContigLocusId
# locType ContigLoc
# weight MapInfo
time hgsql hg18snp128 -e \
'alter table ContigLoc add index (ctg_id); \
alter table ContigInfo add index (ctg_id);'
#kolossus load was already 1.0.
#0.001u 0.002s 4:04.73 0.0% 0+0k 0+0io 0pf+0w
time hgsql hg18snp128 -e \
'alter table ContigInfo add index (group_label(9));'
#0.001u 0.001s 0:00.07 0.0% 0+0k 0+0io 0pf+0w
# Make sure there are no orient != 0 contigs among those selected.
hgsql hg18snp128 -NBe \
'select count(*) from ContigInfo where orient != 0 and \
group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");'
#0
# For joining files by shared column, we need a unique identifier in
# that shared column. snp_id is not unique -- the same rsID can appear
# in both the reference assembly and on one of the others e.g. c6_COX.
# So concatenate the assembly identifier and snp_id to get hopefully
# unique label.
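# The resulting keys look like "<group_label>.<snp_id>",
# e.g. "c5_H2.10035195" (seen below).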
time hgsql hg18snp128 -NBe \
'select concat(ContigInfo.group_label, ".", snp_id), \
ContigInfo.contig_acc, asn_from, asn_to, \
loc_type, orientation, allele, phys_pos_from \
from ContigLoc, ContigInfo \
where ContigLoc.ctg_id = ContigInfo.ctg_id and ContigInfo.group_label \
in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \
| sort \
> ucscContigLoc.txt
# no time output because of the pipe... took 4 minutes (load was 3 or 4).
# Make sure these IDs are unique.
wc -l ucscContigLoc.txt
#12275300 ucscContigLoc.txt
awk '{print $1;}' ucscContigLoc.txt | uniq | wc -l
#11863799
# Doh! Find non-unique IDs:
awk 'prev == $1 {print;} {prev = $1;}' ucscContigLoc.txt | head
grep ^c5_H2.10035195 ucscContigLoc.txt
#c5_H2.10035195 NT_113801 639954 639954 2 0 G 69605321
#c5_H2.10035195 NT_113801 660407 660407 2 0 G 69625774
#c5_H2.10035195 NT_113801 911780 911780 2 1 C 69877147
# OK, they can be duplicated within the same contig. See if we can
# get by with anchoring everything to ucscContigLoc.txt. But everybody
# else better have unique IDs!
# SNP -> valid, avHet, avHetSE
# SNP has only snp_id as identifier, nothing relating to assembly.
hgsql hg18snp128 -NBe \
'select snp_id, validation_status, avg_heterozygosity, het_se \
from SNP;' \
| sort \
> ucscSNP.txt
# Check ID uniqueness:
wc -l ucscSNP.txt
#11833664 ucscSNP.txt
awk '{print $1;}' ucscSNP.txt | uniq | wc -l
#11833664
# ContigLocusId -> func
# ContigLocusId has only snp_id as an identifier (it gives one
# example contig if the SNP is on multiple contigs).
# The sort options and awk are to convert multiple entries with different
# function classes for the same SNP into one entry per SNP with a list
# of function classes.
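# For example (hypothetical snp_id and codes), input rows "123 3" and
# "123 8" collapse into the single output line "123<TAB>3,8,".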
hgsql hg18snp128 -NBe \
'select snp_id, fxn_class from ContigLocusId;' \
| sort -u -k1,1 -k2,2n \
| awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \
else { if (prevId) {print prevId "\t" prevFunc;} \
prevFunc = $2 ","; }} \
{prevId = $1;} \
END {print prevId "\t" prevFunc;}' \
> ucscFunc.txt
# Check ID uniqueness:
wc -l ucscFunc.txt
#4676589 ucscFunc.txt
awk '{print $1;}' ucscFunc.txt | sort -u | wc -l
#4676589
# MapInfo -> weight
# MapInfo needs assembly+snp_ids in order to have unique IDs.
time hgsql hg18snp128 -e \
'alter table MapInfo add index (assembly(9));'
#0.000u 0.004s 2:22.64 0.0% 0+0k 0+0io 0pf+0w
hgsql hg18snp128 -NBe \
'select concat(assembly, ".", snp_id), weight \
from MapInfo where assembly \
in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \
| sort \
> weight.txt
# ~1 minute
# Check ID uniqueness:
wc -l weight.txt
#11863799 weight.txt
awk '{print $1;}' weight.txt | uniq | wc -l
#11863799
awk '{print $2;}' weight.txt | sort -n | uniq -c
# 47454 0
#11621954 1
# 91766 2
# 100142 3
# 2483 10
# SNPs w/weight 0 and 10 will be discarded later.
# fasta headers -> observed, molType, class
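# Header lines look roughly like this (illustrative, fields abridged):
# >gnl|dbSNP|rs12345 ... |mol="genomic"|class=1|alleles="A/C"|build...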
zcat /cluster/data/dbSNP/128/human/rs_fasta/rs_ch*.fas.gz \
| grep '^>gnl' \
| perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
| sort \
> ucscGnl.txt
# ~4 minutes
wc -l ucscGnl.txt
#11833664 ucscGnl.txt
awk '{print $1;}' ucscGnl.txt | uniq | wc -l
#11833664
############### JOIN NCBI COLUMNS TO GET UCSC SNP COLUMNS ################
# Join files by ID. Start with ContigLoc and MapInfo because they
# share the concatenated assembly+snp_id IDs.
time join -a 1 -e MISSING -t ' ' ucscContigLoc.txt weight.txt \
> ucscCL+w.txt
#25.408u 3.551s 0:29.26 98.9% 0+0k 0+0io 0pf+0w
wc -l ucscCL+w.txt
#12275300 ucscCL+w.txt
# Same as ucscContigLoc.txt above, good.
# Any missing weights?
grep MISSING ucscCL+w.txt | head
# No output, good.
# Join the files with SNP-only IDs.
time join -e MISSING -t ' ' ucscGnl.txt ucscSNP.txt \
> ucscG+S.txt
#16.805u 1.996s 0:19.04 98.6% 0+0k 0+0io 0pf+0w
wc -l ucscG+S.txt
#11833664 ucscG+S.txt
# Same as ucscSNP.txt and ucscGnl.txt above.
grep MISSING ucscG+S.txt | wc -l
#0
time join -a 1 -e MISSING -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 \
-t ' ' ucscG+S.txt ucscFunc.txt \
> ucscG+S+F.txt
#17.656u 2.318s 0:20.10 99.3% 0+0k 0+0io 0pf+0w
wc -l ucscG+S+F.txt
#11833664 ucscG+S+F.txt
grep MISSING ucscG+S+F.txt | wc -l
#7157075
# Not surprising -- ucscFunc.txt has only 4676589 lines.
expr 11833664 - 4676589
#7157075
# Convert assembly+snp_id's to just snp_id (sorted) for final join.
perl -wpe 's/^\S+\.(\d+)/$1/;' ucscCL+w.txt \
| sort > ucscCL+w.snp_id.txt
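# (e.g. a line that began "c5_H2.10035195 NT_113801 ..." now begins
# "10035195 NT_113801 ...")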
awk '{print $1;}' ucscCL+w.snp_id.txt | uniq | wc -l
#11727742
# Interesting... which snp_ids are missing from ContigLoc?
awk '{print $1;}' ucscCL+w.snp_id.txt | uniq > /tmp/1
awk '{print $1;}' ucscGnl.txt | uniq > /tmp/2
comm -13 /tmp/1 /tmp/2 > notInContigLoc.txt
comm -23 /tmp/1 /tmp/2 > notInSNP.txt
wc -l notIn*.txt
#105994 notInContigLoc.txt
# 72 notInSNP.txt
expr 11833664 + 72 - 105994
#11727742
# Final join -- treat ContigLoc as authoritative (since it has coords).
# Arrange columns in same order as in the SNP table, with extras for
# checking at the end (phys_pos_from).
# chr chrS chrE name strand refN obs molT cls val aH aHSE fxn locT wt ...
time join -a 1 -e MISSING -t ' ' \
-o '1.2 1.3 1.4 1.1 1.6 1.7 2.2 2.3 2.4 2.5 2.6 2.7 2.8 1.5 1.9 1.8' \
ucscCL+w.snp_id.txt ucscG+S+F.txt \
> ucscNcbiSnp.ctg.txt
#38.497u 5.536s 2:08.18 34.3% 0+0k 0+0io 0pf+0w
wc -l ucscNcbiSnp.ctg.txt
#12275300 ucscNcbiSnp.ctg.txt
grep MISSING ucscNcbiSnp.ctg.txt | awk '{print $4;}' | uniq | wc -l
#7058898
# a bit less than the 7157075 missing FUNC's above -- some overlap with
# notInContigLoc would explain.
# Lift the map contig coordinates to chrom coordinates (~2m);
time liftUp ucscNcbiSnp.bed \
/cluster/data/hg18/jkStuff/liftContigs.lft warn \
ucscNcbiSnp.ctg.txt
#98.038u 5.974s 1:45.65 98.4% 0+0k 0+0io 5pf+0w
wc -l ucscNcbiSnp.bed
#12275300 ucscNcbiSnp.bed
# At this point, move back from /scratch to /cluster/data.
nice gzip ucscNcbiSnp.bed
cp -p ucscNcbiSnp.bed.gz /cluster/data/dbSNP/128/human/
# Drum roll please... translate NCBI's encoding into UCSC's, and
# perform a bunch of checks. This is where developer involvement
# is most likely as NCBI extends the encodings used in dbSNP.
cd /cluster/data/dbSNP/128/human/
gunzip ucscNcbiSnp.bed.gz
# Re-ran this command 8/7/08 to get new snp128.sql that includes
# only those enum/set values that are actually used. No other output
# files changed.
time snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/hg18/hg18.2bit \
snp128
#spaces stripped from observed:
#chr12 5963395 5963395 rs41402545
#count of snps with weight 0 = 59123
#count of snps with weight 1 = 11654498
#count of snps with weight 2 = 191647
#count of snps with weight 3 = 335214
#count of snps with weight 10 = 34818
#Skipped 167 snp mappings due to errors -- see snp128Errors.bed
#176.712u 17.466s 3:34.82 90.3% 0+0k 0+0io 0pf+0w
# The 167 errors are all for SNPs for which we don't have fasta,
# so we also don't have observed, class, or molType. I spot-checked
# a few, and they have been deleted from dbSNP. Nothing to show,
# so we skip those 167 -- nothing catastrophic. Watch out for new
# types of errors reported, though:
awk -F"\t" '{print $5;}' snp128Errors.bed | sort -u | wc -l
#1
wc -l snp*
# 12181192 snp128.bed
# 22 snp128.sql
# 167 snp128Errors.bed
# 18 snp128ExceptionDesc.tab
# 1013020 snp128Exceptions.bed
# Make one big fasta file. (note: snp126 skipped chrUn... but it's small
# compared to chr1, chr2 etc.)
# It's a monster: 14G! Can we split by hashing rsId?
zcat rs_fasta/rs_ch*.fas.gz \
| perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
> snp128.fa
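# (Sketch only, not run for this build: if splitting ever becomes
# necessary, a modulo hash on the rsId number could route records into,
# say, 16 smaller files; the output file names here are hypothetical.)
#  zcat rs_fasta/rs_ch*.fas.gz \
#  | perl -wne 'if (/^>gnl\|dbSNP\|rs(\d+)/) { \
#        open(OUT, ">>snp128split." . ($1 % 16) . ".fa") || die; } \
#      print OUT $_;'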
# Check for duplicates.
grep ^\>rs snp128.fa | sort > /scratch/tmp/seqHeaders
wc -l /scratch/tmp/seqHeaders
#11833664 /scratch/tmp/seqHeaders
uniq /scratch/tmp/seqHeaders | wc -l
#11833664
# Use hgLoadSeq to generate .tab output for sequence file offsets,
# and keep only the columns that we need: acc and file_offset.
# Index it and translate to snpSeq table format.
time hgLoadSeq -test placeholder snp128.fa
#107.137u 37.140s 2:39.16 90.6% 0+0k 0+0io 0pf+0w
cut -f 2,6 seq.tab > snp128Seq.tab
rm seq.tab
ssh hgwdev
# Load up main track tables.
cd /cluster/data/dbSNP/128/human
# Re-ran this command 8/7/08 to get new snp128.sql that includes
# only those enum/set values that are actually used. No data values
# changed. Removed -noSort because Brooke had spotted some entries
# sorted by chromEnd instead of chromStart.
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \
hg18 snp128 -sqlTable=snp128.sql snp128.bed
#78.060u 13.298s 7:32.71 20.1% 0+0k 0+0io 0pf+0w
sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125Exceptions.sql \
> snp128Exceptions.sql
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \
hg18 snp128Exceptions -sqlTable=snp128Exceptions.sql \
snp128Exceptions.bed
#5.915u 0.492s 0:28.69 22.3% 0+0k 0+0io 0pf+0w
sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
> snp128ExceptionDesc.sql
# 3/7/08: reloaded snp128ExceptionDesc (tweaked wording)
hgLoadSqlTab hg18 snp128ExceptionDesc snp128ExceptionDesc.sql \
snp128ExceptionDesc.tab
# Load up sequences.
sed -e 's/snpSeq/snp128Seq/' ~/kent/src/hg/lib/snpSeq.sql \
> snp128Seq.sql
mkdir -p /gbdb/hg18/snp
ln -s /cluster/data/dbSNP/128/human/snp128.fa /gbdb/hg18/snp/snp128.fa
time nice hgLoadSqlTab hg18 snp128Seq snp128Seq.sql snp128Seq.tab
#0.001u 0.000s 2:31.19 0.0% 0+0k 0+0io 0pf+0w
# Put in a link where one would expect to find the track build dir...
ln -s /cluster/data/dbSNP/128/human /cluster/data/hg18/bed/snp128
#######################################################################
# SNPMASKED SEQUENCE FOR SNP128 (DONE 2/1/08 angie)
ssh kolossus
mkdir /cluster/data/hg18/snp128Mask
cd /cluster/data/hg18/snp128Mask
# Identify rsIds with various problems -- we will exclude those.
# MultipleAlignments is kinda broad because anything that maps on
# both chrN and chrN_foo_hap1 will be excluded... similarly, extra
# matches on chrN_random might disqualify good matches on chrN.
# Well, erring on the side of caution is good.
awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \
/cluster/data/dbSNP/128/human/snp128Exceptions.bed \
| sort -u \
> snp128ExcludeRsIds.txt
time grep -vFwf snp128ExcludeRsIds.txt \
/cluster/data/dbSNP/128/human/snp128.bed \
> snp128Cleaned.bed
#100.027u 11.779s 2:09.61 86.2% 0+0k 0+0io 0pf+0w
# Substitutions:
mkdir substitutions
snpMaskSingle snp128Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \
| faSplit byname stdin substitutions/
#-- 79 warnings about differing observed at same base positions
#-- (66 distinct positions) -- send to NCBI. snp-admin@ncbi.nlm.nih.gov
# Also this warning about total size -- just means that some chroms
# didn't have any SNPS that survived the stringent filtering.
#Masked 9146694 snps in 9146642 out of 3091528550 genomic bases
#/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3091528550 (difference is 16148723)
# Make sure that sizes are identical, first diffs are normal -> IUPAC,
# and first diffs' case is preserved:
foreach f (substitutions/chr*.fa)
faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ"
end
#(output OK)
foreach f (substitutions/chr*.fa)
echo $f:t:r
mv $f $f:r.subst.fa
gzip $f:r.subst.fa
end
# Insertions:
mkdir insertions
snpMaskAddInsertions snp128Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \
| faSplit byname stdin insertions/
#Added 1332737 snps totaling 2372942 bases to 3085151178 genomic bases
#/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085151178 (difference is 22526095)
# Again, that just means that some chroms didn't have filtered SNPs.
# Make sure that all sizes have increased relative to original:
foreach f (insertions/chr*.fa)
faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \
|& perl -we '$_=<>; \
if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \
if ($1 > $2) {print "OK: ins size $1 > $2\n";} \
else {die "ERROR: ins size $1 <= $2\n";} \
} else {die $_;}'
end
#(output OK)
foreach f (insertions/chr*.fa)
mv $f $f:r.ins.fa
gzip $f:r.ins.fa
end
# Deletions:
mkdir deletions
snpMaskCutDeletions snp128Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \
| faSplit byname stdin deletions/
#Cut 661637 snps totaling 1248873 bases from 3085167749 genomic bases
#/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524)
# Again, that just means that some chroms didn't have filtered SNPs.
# Make sure that all sizes have decreased relative to original:
foreach f (deletions/chr*.fa)
faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \
|& perl -we '$_=<>; \
if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \
if ($1 < $2) {print "OK: del size $1 < $2\n";} \
else {die "ERROR: del size $1 >= $2\n";} \
} else {die $_;}'
end
#(output OK)
foreach f (deletions/chr*.fa)
mv $f $f:r.del.fa
gzip $f:r.del.fa
end
# Clean up and prepare for download:
gzip snp128Cleaned.bed
foreach d (substitutions insertions deletions)
pushd $d
md5sum *.gz > md5sum.txt
popd
end
# Make a README.txt in each subdir.
# Create download links on hgwdev.
# NOTE: I am going to start by offering only the substitutions.
# If we get any user requests, then maybe we can put the insertions
# and deletions out there.
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask
ln -s /cluster/data/hg18/snp128Mask/substitutions/* \
/usr/local/apache/htdocs/goldenPath/hg18/snp128Mask/
## If there is user demand for ins & del, then start over with an empty
## goldenPath/snp128Mask and do this:
## foreach type (substitutions insertions deletions)
## mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask/$type
## ln -s /cluster/data/hg18/snp128Mask/$type/* \
## /usr/local/apache/htdocs/goldenPath/hg18/snp128Mask/$type/
## end
#######################################################################
# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP128 (DONE 2/8/08 angie)
# REDONE 2/29/08 (upcase ortho alleles)
ssh kolossus
mkdir /cluster/data/hg18/bed/snp128Ortho
cd /cluster/data/hg18/bed/snp128Ortho
# Following Heather's lead in snp126orthos, filter SNPs to keep
# only those with class=single, length=1, chrom!~random;
# Exclude those with exceptions MultipleAlignments,
# SingleClassTriAllelic or SingleClassQuadAllelic.
# Unlike snp masking, we do not filter for weight -- don't know why.
awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
/cluster/data/dbSNP/128/human/snp128Exceptions.bed \
| sort -u \
> snp128ExcludeIds.txt
awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
/cluster/data/dbSNP/128/human/snp128.bed \
| grep -vFwf snp128ExcludeIds.txt \
> snp128Simple.bed
# took ~3 minutes
wc -l snp128Simple.bed
#9133704 snp128Simple.bed
# This is the analog of db table snp126simple.
# Glom all human info that we need for the final table onto the
# name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
awk 'BEGIN{OFS="\t";} \
{print $1, $2, $3, \
$4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
0, $6;}' \
snp128Simple.bed > snp128ForLiftOver.bed
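# The resulting name field looks like this (illustrative values):
# rs12345|chr1|10000|10001|A/C|A|+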
# 2/29/08 -- re-ran from this point on to regenerate cleaned up
# cluster run results (oops) and then force ortho alleles to upper
# case, for consistency with dbSNP formatting.
# Map coords to chimp using liftOver.
# I don't know why chimp took so much longer than macaque... the
# chimp .over has fewer chains and fewer bytes than the macaque .over.
mkdir run.liftOChimp
cd run.liftOChimp
mkdir split out
splitFile ../snp128ForLiftOver.bed 25000 split/chunk
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/cluster/data/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \
\{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
ssh pk
cd /cluster/data/hg18/bed/snp128Ortho/run.liftOChimp
para make jobList
#Completed: 366 of 366 jobs
#CPU time in finished jobs: 71660s 1194.33m 19.91h 0.83d 0.002 y
#IO & Wait Time: 5377s 89.62m 1.49h 0.06d 0.000 y
#Average job time: 210s 3.51m 0.06h 0.00d
#Longest finished job: 518s 8.63m 0.14h 0.01d
#Submission to last job: 518s 8.63m 0.14h 0.01d
# Map coords to macaque using liftOver.
mkdir ../run.liftOMac
cd ../run.liftOMac
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/cluster/data/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \
\{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 366 of 366 jobs
#CPU time in finished jobs: 5663s 94.38m 1.57h 0.07d 0.000 y
#IO & Wait Time: 12066s 201.10m 3.35h 0.14d 0.000 y
#Average job time: 48s 0.81m 0.01h 0.00d
#Longest finished job: 102s 1.70m 0.03h 0.00d
#Submission to last job: 102s 1.70m 0.03h 0.00d
# Average job time was 54s with 50000 chunks, but those made chimp
# jobs run too long.
ssh kolossus
cd /cluster/data/hg18/bed/snp128Ortho
# Here is a script that looks up the base value in the ortho species
# and swizzles columns to prepare for the joining and re-swizzling
# of both ortho species' columns into the final product. If it is
# used more than once, should be checked in, perhaps in hg/snp/snpLoad.
cat > getOrthoSeq.pl <<'_EOF_'
#!/usr/bin/env perl
# Dig up orthologous alleles and swizzle columns so the glommed name that
# includes human position info etc. is first. It will be used as a key for
# joining up multiple other-species' ortho data. Also swizzle columns so
# that the remaining columns are in order of appearance in the final result,
# snp128OrthoPanTro2RheMac2. Upcase ortho alleles for consistency w/dbSNP.
use warnings;
use strict;
my $twoBitFName = shift @ARGV
|| die "usage: getOrthoSeq.pl orthoDb.2bit [file(s)]\n";
sub getOChrSeq($$) {
# Slurp in fasta sequence using twoBitToFa.
my ($twoBitFName, $oChr) = @_;
open(P, "twoBitToFa -noMask $twoBitFName -seq=$oChr stdout |")
|| die "Can't open pipe from twoBitToFa $twoBitFName -seq=$oChr: $!\n";
<P> =~ /^>\w+/
|| die "Doesn't look like we got fasta -- first line is this:\n$_";
# From man perlfaq5: trick to slurp entire contents:
my $c = 0;
my $seq = do { local $/; my $data = <P>; $c = ($data =~ s/\n//g); $data; };
close(P);
return $seq;
}
my %rc = ( "a" => "t", "c" => "g", "g" => "c", "t" => "a",
"A" => "T", "C" => "G", "G" => "C", "T" => "A", );
sub revComp($) {
# Reverse-complement fasta input. (Pass through non-agtc chars.)
my ($seq) = @_;
my $rcSeq = reverse $seq;
for (my $i = 0; $i < length($rcSeq); $i++) {
my $base = substr($rcSeq, $i, 1);
my $cBase = $rc{$base} || $base;
substr($rcSeq, $i, 1, $cBase);
}
return $rcSeq;
}
my $prevOChr;
my ($oChrSeq, $oChrSize);
while (<>) {
chomp;
my ($oChr, $oStart, $oEnd, $nameGlom, undef, $oStrand) = split;
if (! defined $prevOChr || $oChr ne $prevOChr) {
$oChrSeq = &getOChrSeq($twoBitFName, $oChr);
$oChrSize = length($oChrSeq);
}
die "Coords out of range, input line $.: $oEnd > $oChr size $oChrSize\n\t"
if ($oEnd > $oChrSize);
my $oAllele = substr($oChrSeq, $oStart, $oEnd - $oStart);
$oAllele = &revComp($oAllele) if ($oStrand eq "-");
print join("\t", $nameGlom, $oChr, $oStart, $oEnd, $oAllele, $oStrand) .
"\n";
$prevOChr = $oChr;
}
'_EOF_'
# << emacs
chmod a+x getOrthoSeq.pl
# Concatenate the chimp results, sorting by chimp pos in order to
# efficiently access 2bit sequence in ./getOrthoSeq. The output of
# that is then sorted by the glommed human info field, so that we
# can use join to combine chimp and macaque results in the next step.
sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
| ./getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
| sort > panTro2.orthoGlom.txt
# ditto for macaque:
sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
| ./getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
| sort > rheMac2.orthoGlom.txt
# The whole pipeline takes ~4-6 minutes each.
wc -l panTro2.orthoGlom.txt rheMac2.orthoGlom.txt
# 8549323 panTro2.orthoGlom.txt
# 7324851 rheMac2.orthoGlom.txt
# Use the glommed name field as a key to join up chimp and macaque
# allele data. Include glommed name from both files because if only
# file 2 has a line for the key in 2.1, then 1.1 is empty. Then plop
# in the orthoGlom files from each file, which are in the same order
# as the chimp and macaque columns of snp128OrthoPanTro2RheMac2.
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e 0 \
panTro2.orthoGlom.txt rheMac2.orthoGlom.txt \
| perl -wpe 'chomp; \
($glom1, $glom2, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand) = split; \
$glomKey = ($glom1 ne "0") ? $glom1 : $glom2; \
($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
split(/\|/, $glomKey); \
$o1Chr =~ s/^0$/?/; $o2Chr =~ s/^0$/?/; \
$o1Al =~ s/^0$/?/; $o2Al =~ s/^0$/?/; \
$o1Strand =~ s/^0$/?/; $o2Strand =~ s/^0$/?/; \
print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
$o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand) . "\n"; \
s/^.*$//;' \
| sort -k1,1 -k2n,2n > snp128OrthoPanTro2RheMac2.bed
# took ~5 minutes.
wc -l snp128OrthoPanTro2RheMac2.bed
#8770301 snp128OrthoPanTro2RheMac2.bed
ssh hgwdev
cd /cluster/data/hg18/bed/snp128Ortho
sed -e 's/snpOrthoPanTroRheMac/snp128OrthoPanTro2RheMac2/' \
~/kent/src/hg/lib/snpOrthoPanTroRheMac.sql \
> snp128OrthoPanTro2RheMac2.sql
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \
hg18 snp128OrthoPanTro2RheMac2 -sqlTable=snp128OrthoPanTro2RheMac2.sql \
snp128OrthoPanTro2RheMac2.bed
#Loaded 8770301 elements of size 17
#52.659u 8.528s 5:18.68 19.1% 0+0k 0+0io 0pf+0w
# Cleanup on fileserver:
cd /cluster/data/hg18/bed/snp128Ortho
nice gzip snp128Simple.bed snp128ExcludeIds.txt snp128ForLiftOver.bed
rm -r run*/split *.orthoGlom.txt
#######################################################################
# COMPARE SNP128 TO SNP126 (DONE 2/7/08 angie)
# First, do a featureBits venn, on some machine other than hgwdev.
# I can't find the file from which snp126 was loaded... but kkr5u00
# has an hg18snp126 database with a snp126 that is a few hours newer,
# but apparently the same as, hgwdev's hg18.snp126... so use that
# (had to add gap tables too):
ssh kkr5u00
time featureBits hg18snp126 snp126
#12451939 bases of 2881515245 (0.432%) in intersection
#57.274u 15.283s 1:20.56 90.0% 0+0k 0+0io 0pf+0w
# Now make sure we have a file copy of snp126 in case we need it in
# the future:
hgsql hg18snp126 -NBe 'select * from snp126' \
| cut -f 2-18 \
> /cluster/data/dbSNP/126/human/snp126.bed
rsync /cluster/data/dbSNP/128/human/snp128.bed /scratch/tmp/
time featureBits hg18 /scratch/tmp/snp128.bed
#12387071 bases of 2881515245 (0.430%) in intersection
#636.834u 47.039s 11:24.02 99.9% 0+0k 0+0io 0pf+0w
# OK, db is a lot faster!
# I am not worried about the drop -- spot-checking, I have seen some
# dropped rsIds and some that used to have multiple mappings but now
# have only one mapping -- an improvement.
pushd /cluster/data/dbSNP/128/human
hgLoadBed -tab -noSort -onServer -tmpDir=/scratch/tmp \
hg18snp126 snp128 -sqlTable=snp128.sql snp128.bed
popd
# How many covered bases in common?
time featureBits hg18snp126 snp126 snp128
#11576806 bases of 2881515245 (0.402%) in intersection
#114.365u 26.671s 3:15.55 72.1% 0+0k 0+0io 0pf+0w
# Base coverage Venn counts:
# snp126 snp128 !snp126 !snp128
# snp126 12451939 11576806 0 875133
# snp128 11576806 12387071 810265 0
# Do the same for SNPs (rs* records as opposed to bases):
hgsql hg18snp126 -NBe 'select name from snp126' \
| sort -u > /scratch/tmp/1
hgsql hg18snp126 -NBe 'select name from snp128' \
| sort -u > /scratch/tmp/2
wc -l /scratch/tmp/[12]
# 11647909 /scratch/tmp/1
# 11677826 /scratch/tmp/2
comm -12 /scratch/tmp/[12] | wc -l
#11531282
cd /cluster/data/dbSNP/128/human
comm -23 /scratch/tmp/[12] \
> /cluster/data/dbSNP/128/human/ids.inSnp126Not128.txt
comm -13 /scratch/tmp/[12] \
> /cluster/data/dbSNP/128/human/ids.inSnp128Not126.txt
# rsId Venn counts:
# snp126 snp128 !snp126 !snp128
# snp126 11647909 11531282 0 116627
# snp128 11531282 11677826 146544 0
# Interesting that snp128 has more new rsIds but fewer new bases.
# It has been 2 versions since 126... also, when spot-checking
# exceptions I noticed that a lot of deletion SNPs used to be
# mapped to the appropriate span in 126, but in 128 were mapped to
# a single base and had some kind of range*tion locType... not an
# improvement. But that kind of observation best falls out of an
# examination of exception cases... and that is what will be
# useful for us to report to NCBI.
############################################################################
# BLASTZ SELF chain minScore=2000 (DONE - 2007-12-19 - Hiram)
ssh kkstore02
screen # use screen to manage this job
mkdir /cluster/data/hg18/bed/blastzSelf.2007-12-17
cd /cluster/data/hg18/bed/blastzSelf.2007-12-17
cat << '_EOF_' > DEF
# human vs human
BLASTZ_M=400
# TARGET: Human Hg18
SEQ1_DIR=/san/sanvol1/scratch/hg18/selfNib
SEQ1_LEN=/san/sanvol1/scratch/hg18/self.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_IN_CONTIGS=0
# QUERY: Human Hg18
SEQ2_DIR=/san/sanvol1/scratch/hg18/selfNib
SEQ2_LEN=/san/sanvol1/scratch/hg18/self.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_IN_CONTIGS=0
BASE=/cluster/data/hg18/bed/blastzSelf.2007-12-17
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
cd /cluster/data/hg18/bed/blastzSelf.2007-12-17
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
`pwd`/DEF -verbose=2 -chainMinScore=2000 -chainLinearGap=medium \
-stop=net -smallClusterHub=memk -bigClusterHub=pk > do.log 2>&1 &
# real 640m37.637s
## crafted a special loadUp.csh to avoid haplotypes and randoms,
# and load with normScore
ssh hgwdev
cd /cluster/data/hg18/bed/blastzSelf.2007-12-17/axtChain
time nice -n +19 ./loadUp.csh >loadUp.out 2>&1
# real 24m51.669s
cd /cluster/data/hg18/bed/blastzSelf.2007-12-17
time nice -n +19 featureBits hg18 chainSelf2KLink \
-noRandom -noHap > fb.hg18.chainSelf2KLink.txt 2>&1 &
# real 11m30.010s
cat fb.hg18.chainSelf2KLink.txt
# 346885376 bases of 2858034764 (12.137%) in intersection
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
`pwd`/DEF -verbose=2 -chainMinScore=2000 -chainLinearGap=medium \
-continue=download \
-stop=download -smallClusterHub=memk -bigClusterHub=pk \
> download.log 2>&1 &
############################################################################
# RE-BUILD GAD TRACK (Done, 1/16/08, Fan)
# During previous build, all.txt was corrupted during receiving file from
# email.
mkdir /cluster/store12/gad080116
rm /cluster/data/gad
ln -s /cluster/store12/gad080116 /cluster/data/gad
cd /cluster/data/gad
# Receive "all.txt" from GAD
# contact person: Garner, John (NIH/NIA/IRP) [F] [garnerjr@mail.nih.gov]
hgsql hg18 -e 'drop table gadAll'
hgsql hg18 <~/src/hg/lib/gadAll.sql
hgsql hg18 -e 'load data local infile "all.txt" into table gadAll ignore 3 lines'
# create gad table
gadPos hg18 j18.tmp
cat j18.tmp |sort -u >hg18.gad.tab
# removed 1 record from hg18.gad.tab that has multiple words in geneSymbol
# field.
# use -nobin option to ensure display order is according to genomic position
hgLoadBed -nobin hg18 gad hg18.gad.tab
rm j18.tmp
#######################################################################
# BLASTZ/CHAIN/NET Lamprey petMar1 (DONE - 2008-01-29 - Hiram)
# with contigs for Lamprey
ssh kkstore02
screen # use screen to control this job
mkdir /cluster/data/hg18/bed/blastzPetMar1.2008-01-29
cd /cluster/data/hg18/bed/blastzPetMar1.2008-01-29
cat << '_EOF_' > DEF
# Human vs. Lamprey
# using the "close" genome alignment parameters
# see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human - WindowMasker sequence
SEQ1_DIR=/san/sanvol1/scratch/hg18/hg18.sdTrf.2bit
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Lamprey petMar1
SEQ2_DIR=/cluster/bluearc/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzPetMar1.2008-01-29
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk > do.log 2>&1 &
# real 414m33.533s
cat fb.hg18.chainPetMar1Link.txt
# 36042598 bases of 2881515245 (1.251%) in intersection
# That is OK, now for the swap:
mkdir /cluster/data/petMar1/bed/blastz.hg18.swap
cd /cluster/data/petMar1/bed/blastz.hg18.swap
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/hg18/bed/blastzPetMar1.2008-01-29/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk > swap.log 2>&1 &
# real 60m1.928s
cat fb.petMar1.chainHg18Link.txt
# 26751073 bases of 831696438 (3.216%) in intersection
#######################################################################
###################
# Build recip-best alignments with calJac1 (DONE 2008-01-25 braney)
cd /cluster/data/hg18/bed
ln -s blastz.calJac1.2007-10-07 blastz.calJac1
cd blastz.calJac1
screen
/cluster/bin/scripts/doRecipBest.pl hg18 calJac1
###################
# Build syntenic net for orang (DONE 2008-01-25 braney)
cd /cluster/data/hg18/bed/blastz.ponAbe2
screen
/cluster/bin/scripts/doBlastzChainNet.pl -syntenicNet -continue syntenicNet -stop syntenicNet `pwd`/DEF 2>&1 | tee syntenic.out
#########################################################################
## Primate Multiz (Working)
##
ssh hgwdev
mkdir /cluster/data/hg18/bed/multizPrimate
cd /cluster/data/hg18/bed/multizPrimate
# take the 30-way tree from mm9 and eliminate genomes not in
# this alignment
# rearrange to get hg18 on the top of the graph
# paste this tree into the on-line phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to create the image for the tree diagram
/cluster/bin/phast/tree_doctor --prune-all-but Human_hg18,Mouse_mm9,Chimp_panTro2,Orangutan_ponAbe2,Rhesus_rheMac2,Marmoset_calJac1,Bushbaby_otoGar1,TreeShrew_tupBel1,Rat_rn4,Dog_canFam2 /cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh > primate.fullNames.nh
# looks something like this:
(((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.249544,((((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,Orangutan_ponAbe2:0.020000):0.013037,Rhesus_rheMac2:0.031973):0.036500,Marmoset_calJac1:0.070000):0.036500,Bushbaby_otoGar1:0.151185):0.015682,TreeShrew_tupBel1:0.162844):0.006272):0.019763,Dog_canFam2:0.187963);
# rearrange to get human at the top:
# this leaves us with:
cat << _EOF_ > hg18.primate.nh
((((((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,Orangutan_ponAbe2:0.020000):0.013037,Rhesus_rheMac2:0.031973):0.036500,Marmoset_calJac1:0.070000):0.036500,Bushbaby_otoGar1:0.151185):0.015682,TreeShrew_tupBel1:0.162844):0.006272,(Mouse_mm9:0.076274,Rat_rn4:0.084383):0.249544):0.019763,Dog_canFam2:0.187963);
_EOF_
# << happy emacs
# create a species list from that file:
sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' hg18.primate.nh \
| sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
| sed -e "s/.*_//; s/:.*//" | sort > species.list
# create a stripped down nh file for use in autoMZ run
echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' hg18.primate.nh \
| sed -e "s/ / /g"` > tree.primate.nh
# that looks like, as a single line:
# ((((((((hg18 panTro2) ponAbe2) rheMac2) calJac1) otoGar1) tupBel1) (mm9 rn4)) canFam2)
# verify all blastz's exists
cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/hg18/bed/multizPrimate
foreach db (`cat species.list`)
set bdir = /cluster/data/hg18/bed/blastz.$db
if (-e $bdir/mafRBestNet/chr1.maf.gz) then
echo "$db mafRBestNet"
else if (-e $bdir/mafSynNet/chr1.maf.gz) then
echo "$db mafSynNet"
else if (-e $bdir/mafNet/chr1.maf.gz) then
echo "$db mafNet"
else
echo "$db mafs not found"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./listMafs.csh
# see what it says, the "mafs not found" should only show up on hg18
./listMafs.csh
# calJac1 mafRBestNet
# canFam2 mafSynNet
# hg18 mafNet
# mm9 mafSynNet
# otoGar1 mafRBestNet
# panTro2 mafSynNet
# ponAbe2 mafSynNet
# rheMac2 mafSynNet
# rn4 mafSynNet
# tupBel1 mafRBestNet
/cluster/bin/phast/all_dists hg18.primate.nh > Primate.distances.txt
grep -i hg18 Primate.distances.txt | sort -k3,3n
# Human_hg18 Chimp_panTro2 0.013541
# Human_hg18 Orangutan_ponAbe2 0.038910
# Human_hg18 Rhesus_rheMac2 0.063920
# Human_hg18 Marmoset_calJac1 0.138447
# Human_hg18 Bushbaby_otoGar1 0.256132
# Human_hg18 TreeShrew_tupBel1 0.283473
# Human_hg18 Dog_canFam2 0.334627
# Human_hg18 Mouse_mm9 0.452719
# Human_hg18 Rat_rn4 0.460828
# copy net mafs to cluster-friendly storage, splitting chroms
# into 50MB chunks to improve run-time
# NOTE: splitting will be different for scaffold-based reference assemblies
ssh hgwdev
mkdir /cluster/data/hg18/bed/multizPrimate/run.split
cd /cluster/data/hg18/bed/multizPrimate/run.split
# this works by examining the rmsk table for likely repeat areas
# that won't be used in blastz
mafSplitPos hg18 50 mafSplit.bed
ssh kki
cd /cluster/data/hg18/bed/multizPrimate/run.split
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set targDb = "hg18"
set db = $1
set sdir = /san/sanvol1/scratch/$targDb/BRsplitStrictMafNet
mkdir -p $sdir
if (-e $sdir/$db) then
echo "directory $sdir/$db already exists -- remove and retry"
exit 1
endif
set bdir = /cluster/data/$targDb/bed/blastz.$db
if (! -e $bdir) then
echo "directory $bdir not found"
exit 1
endif
mkdir -p $sdir/$db
if (-e $bdir/mafRBestNet) then
set mdir = $bdir/mafRBestNet
else if (-e $bdir/mafSynNet) then
set mdir = $bdir/mafSynNet
else if (-e $bdir/mafNet) then
set mdir = $bdir/mafNet
else
echo "$bdir maf dir not found"
exit 1
endif
echo $mdir
foreach f ($mdir/*)
set c = $f:t:r:r
echo " $c"
nice mafSplit mafSplit.bed $sdir/$db/ $f
end
echo "gzipping $sdir/$db mafs"
nice gzip $sdir/$db/*
echo $mdir > $db.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
grep -v hg18 ../species.list > split.list
cat << '_EOF_' > template
#LOOP
doSplit.csh $(path1) {check out line+ $(path1).done}
#ENDLOOP
'_EOF_'
gensub2 split.list single template jobList
para create jobList
# start these gently, this is a good load on the san filesystem
para -maxPush=3 push
# wait a while, verify these are running OK
para push
# let that run to a couple completions, a few minutes, then again:
para try
# etc ...
# Completed: 9 of 9 jobs
# CPU time in finished jobs: 9090s 151.50m 2.52h 0.11d 0.000 y
# IO & Wait Time: 3093s 51.55m 0.86h 0.04d 0.000 y
# Average job time: 1354s 22.56m 0.38h 0.02d
# Longest finished job: 2134s 35.57m 0.59h 0.02d
# Submission to last job: 2153s 35.88m 0.60h 0.02d
# ready for the multiz run
ssh pk
cd /cluster/data/hg18/bed/multizPrimate
# actually, the result directory here should be maf.split instead of maf
mkdir -p maf run
cd run
mkdir penn
# use latest penn utilities
P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $P/{autoMZ,multiz,maf_project} penn
# list chrom chunks, any db dir will do; better would be for the
# splitter to generate this file
# We temporarily use __ instead of . to delimit chunk in filename
# so we can use $(root) to get basename
find /san/sanvol1/scratch/hg18/BRsplitStrictMafNet -type f \
| while read F; do basename $F; done \
| sed -e 's/.maf.gz//' -e 's/\./__/' | sort -u > chromChunks.list
wc -l chromChunks.list
# 93 chromChunks.list
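# note (illustrative, not from the original log): a split file such as
# chr1.00.maf.gz shows up in chromChunks.list as chr1__00; the sed in
# autoMultiz.csh below turns the __ back into a . to locate each pairwise maf
head -3 chromChunks.list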
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = hg18
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/BRsplitStrictMafNet
rm -fr $tmp
mkdir -p $tmp
cp ../tree.primate.nh ../species.list $tmp
pushd $tmp
foreach s (`cat species.list`)
set c2 = `echo $c | sed 's/__/./'`
set in = $pairs/$s/$c2.maf
set out = $db.$s.sing.maf
if ($s == hg18) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.primate.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/hg18/bed/multizPrimate/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << emacs
gensub2 chromChunks.list single template jobList
para create jobList
# Completed: 93 of 93 jobs
# CPU time in finished jobs: 302126s 5035.43m 83.92h 3.50d 0.010 y
# IO & Wait Time: 3499s 58.32m 0.97h 0.04d 0.000 y
# Average job time: 3286s 54.77m 0.91h 0.04d
# Longest finished job: 6972s 116.20m 1.94h 0.08d
# Submission to last job: 7052s 117.53m 1.96h 0.08d
# put the split maf results back together into single chroms
ssh kkstore02
cd /cluster/data/hg18/bed/multizPrimate
# here is where the result directory maf should have already been maf.split
mv maf maf.split
mkdir maf
# going to sort out the redundant header garbage to leave a cleaner maf
for C in `ls maf.split | sed -e "s#__.*##" | sort -u`
do
echo ${C}
head -q -n 1 maf.split/${C}__*.maf | sort -u > maf/${C}.maf
grep -h "^#" maf.split/${C}__*.maf | egrep -v "maf version=1|eof maf" | \
sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u >> maf/${C}.maf
grep -h -v "^#" maf.split/${C}__*.maf >> maf/${C}.maf
tail -q -n 1 maf.split/${C}__*.maf | sort -u >> maf/${C}.maf
done
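# optional verification (not in the original log): each rebuilt chrom maf
# should now carry exactly one "##maf version" header line
for C in `ls maf.split | sed -e "s#__.*##" | sort -u`
do
  echo -n "${C} "
  grep -c "^##maf version" maf/${C}.maf
done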
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/hg18/multizPrimate/maf
ln -s /cluster/data/hg18/bed/multizPrimate/maf/*.maf \
/gbdb/hg18/multizPrimate/maf
# this generates a large 1 Gb multizPrimate.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /scratch/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/hg18/multizPrimate/maf hg18 multizPrimate
# Loaded 12531777 mafs in 49 files from /gbdb/hg18/multizPrimate/maf
# real 8m44.516s
# load summary table
time nice -n +19 cat /gbdb/hg18/multizPrimate/maf/*.maf \
| hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multizPrimateSummary stdin
# Created 1417364 summary blocks from 29928557 components
# and 6981421 mafs from stdin
# real 21m35.057s
# Gap Annotation
# prepare bed files with gap info
ssh kkstore02
mkdir /cluster/data/hg18/bed/multizPrimate/anno
cd /cluster/data/hg18/bed/multizPrimate/anno
mkdir maf run
# these actually already all exist from previous multiple alignments
for DB in `cat ../species.list`
do
CDIR="/cluster/data/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `grep -v hg18 ../../species.list`
do
echo "${DB} "
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
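# sanity check (illustrative, not from the original log): nBeds and sizes
# should each list one entry per non-hg18 species, i.e. 9 lines apiece here
wc -l nBeds sizes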
ssh kki
cd /cluster/data/hg18/bed/multizPrimate/anno/run
cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set dir = /cluster/data/hg18/bed/multizPrimate
set c = $1
cat $dir/maf/${c}.maf | \
nice mafAddIRows -nBeds=nBeds stdin /cluster/data/hg18/hg18.2bit $2
'_EOF_'
# << happy emacs
chmod +x doAnno.csh
cat << '_EOF_' > template
#LOOP
./doAnno.csh $(root1) {check out line+ /cluster/data/hg18/bed/multizPrimate/anno/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 /cluster/data/hg18/chrom.sizes > chrom.list
gensub2 chrom.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 10782s 179.71m 3.00h 0.12d 0.000 y
# IO & Wait Time: 3380s 56.33m 0.94h 0.04d 0.000 y
# Average job time: 289s 4.82m 0.08h 0.00d
# Longest finished job: 751s 12.52m 0.21h 0.01d
# Submission to last job: 1479s 24.65m 0.41h 0.02d
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate/anno
mkdir -p /gbdb/hg18/multizPrimate/anno/maf
ln -s /cluster/data/hg18/bed/multizPrimate/anno/maf/*.maf \
/gbdb/hg18/multizPrimate/anno/maf
# by loading this into the table multizPrimate, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /scratch/tmp
time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/hg18/multizPrimate/anno/maf \
hg18 multizPrimate
# Loaded 7331265 mafs in 55 files from /gbdb/hg18/multizPrimate/anno/maf
# real 8m31.092s
cat /cluster/data/hg18/chrom.sizes | \
awk '{if ($2 > 1000000) { print $1 }}' |
while read C
do
echo /gbdb/hg18/multizPrimate/anno/maf/$C.maf
done | xargs cat | \
hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multizPrimateSummary stdin
# Created 1621960 summary blocks from 75794119 components and 12601786
# mafs from stdin
# remove the multizPrimate*.tab files in this /scratch/tmp directory
rm multizPrimate*
#######
################################################################################
# RE-SEQUENCING TRACE DOWNLOAD (DONE 2008-01-25, Andy)
ssh kolossus
bash
cd /san/sanVol1/scratch/andy
mkdir traces
cd traces/
cat << "EOF" > getOldTraces.sh
#!/bin/bash
echo Retrieving sequences before Jan 2008
echo Starting at `date`
# Query the database and figure out the total number of pages needed
count=`./query_tracedb "query count species_code='HOMO SAPIENS' and strategy='Re-Sequencing' and load_date<'1/1/2008'"`
pages=$(( (count/40000) + ((count % 40000) > 0) ))
echo
echo Total of $count sequences and $pages pages to retrieve
echo
for ((page=0; page < pages; page++)); do
pagenum=`printf "%03d" $((page+1))`
./query_tracedb "query page_size 40000 page_number $page binary species_code='HOMO SAPIENS' and strategy='Re-Sequencing' and load_date<'1/1/2008'" > page.bin
echo -n "Retrieving page $((page+1)) of $pages compressed fasta... "
(echo -n "retrieve_gz fasta 0b"; cat page.bin ) | ./query_tracedb > page-${pagenum}.fa.gz
echo "done at `date +%T`"
echo -n "Retrieving page $((page+1)) of $pages compressed quality file... "
(echo -n "retrieve_gz quality 0b"; cat page.bin ) | ./query_tracedb > page-${pagenum}.qa.gz
echo "done at `date +%T`"
echo -n "Retrieving page $((page+1)) of $pages xml file... "
(echo -n "retrieve xml_info 0b"; cat page.bin ) | ./query_tracedb > page-${pagenum}.xml
gzip page-${pagenum}.xml
echo "done at `date +%T`"
rm page.bin
done
echo
echo All done at `date`!
EOF
chmod +x getOldTraces.sh
screen
./getOldTraces.sh > download.log
# detach screen
# tail -f download.log
#Retrieving sequences before Jan 2008
#Starting at Wed Jan 23 11:47:04 PST 2008
#
#Total of 13978657 sequences and 350 pages to retrieve
#
#Retrieving page 1 of 350 compressed fasta... done at 11:48:40
#Retrieving page 1 of 350 compressed quality file... done at 11:49:10
#Retrieving page 1 of 350 xml file... done at 11:51:05
#Retrieving page 2 of 350 compressed fasta... done at 11:52:40
#Retrieving page 2 of 350 compressed quality file... done at 11:53:10
# ...
#Retrieving page 350 of 350 compressed quality file... done at 07:07:08
#Retrieving page 350 of 350 xml file... done at 07:08:16
#
#All done at Fri Jan 25 07:08:16 PST 2008!
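# (worked example, not in the original log) the page count above is a ceiling
# division over the 40000-sequence page size:
echo $(( (13978657/40000) + ((13978657 % 40000) > 0) ))
# 350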
################################################################################
# RE-SEQUENCING TRACE ALIGNMENT TO HG18 (DONE 2008-01-31, Andy)
ssh kkr12u22
cd /san/sanVol1/scratch/andy/traces
mkdir run
cd run/
ls -1 /scratch/hg/hg18/nib/* | grep -v hap > nib.lst
ls -1 /san/sanVol1/scratch/andy/traces/page-*.fa.gz > traces.lst
cat << "EOF" > gsub
#LOOP
./doBlat.sh {check in exists $(path1)} $(path2) {check out line+ $(root2)/$(root1).$(root2).maf}
#ENDLOOP
EOF
cat << "EOF" > doBlat.sh
#!/bin/bash
thisDir=`pwd -P`
fa=`basename $1`
nib=$2
f=${fa%.fa.gz}
n=`basename $2`
n=${n%.nib}
name=${f}.${n}
out=${name}.maf
mkdir -p /scratch/tmp/andy/$name
mkdir -p $n
pushd /scratch/tmp/andy/$name
cp $1 .
blat -minMatch=12 -ooc=/scratch/hg/hg18/11.ooc -out=maf $nib $fa $out
cp $out ${thisDir}/$n
popd
rm -rf /scratch/tmp/andy/$name
EOF
chmod +x doBlat.sh
ssh pk
cd /san/sanVol1/scratch/andy/traces/run
gensub2 traces.lst nib.lst gsub spec
sed 's/\.fa\.c/.c/' spec > tmp; mv tmp spec
para create spec
para try, push, check
para time
#15750 jobs in batch
#100 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 15750 of 15750 jobs
#CPU time in finished jobs: 385991s 6433.19m 107.22h 4.47d 0.012 y
#IO & Wait Time: 47866s 797.76m 13.30h 0.55d 0.002 y
#Average job time: 28s 0.46m 0.01h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 186s 3.10m 0.05h 0.00d
#Submission to last job: 1551s 25.85m 0.43h 0.02d
# Cat all the alignments
ssh hgwdev
cd /san/sanVol1/scratch/andy/traces/run
head -n1 chrY/page-112.chrY.maf > maf.header
mkdir -p cat   # assumed: output dir for the concatenated per-page mafs
for ((i=0; i < 350; i++)); do
echo page $((i+1))
pagenum=`printf "%03d" $((i+1))`
prefix=page-$pagenum
newfile=cat/${prefix}.maf
cp maf.header $newfile
for f in `find . -name "${prefix}*"`; do
tail +2 $f | sed 's/gnl|ti|//' >> $newfile
done
done
############################################################################
# Reload CCDS (2008-02-01 markd)
# import ccds database as described in ccds.txt
set db=hg18
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
# << emacs
#############################################################################
# phastCons multizPrimate
## (DONE - 2008-02-11 braney )
# split mafs into 10M chunks and generate sufficient statistics
# files for phastCons
ssh kki
mkdir /cluster/data/hg18/bed/multizPrimate/msa.split
mkdir -p /san/sanvol1/scratch/hg18/multizPrimate/cons/ss
cd /cluster/data/hg18/bed/multizPrimate
# just use primates
cat << '_EOF_' > primates.list
hg18
panTro2
ponAbe2
rheMac2
calJac1
otoGar1
'_EOF_'
cd /cluster/data/hg18/bed/multizPrimate/msa.split
zcat /san/sanvol1/braney/multizPrimate/chr1.maf.gz | \
perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' | \
mafOrder stdin /cluster/data/hg18/bed/multizPrimate/primates.list chr1.maf
twoBitToFa -seq=chr1 /scratch/data/hg18/hg18.2bit chr1.fa
/cluster/bin/phast/$MACHTYPE/msa_split chr1.maf -i MAF -M chr1.fa \
-o SS -r chr1 -w 300000000,0 -I 1000 -B 5000
time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \
chr1.1-247249719.ss --tree \
"(((((hg18,panTro2),ponAbe2),rheMac2),calJac1),otoGar1)" \
--out-root starting-tree
rm chr1.maf chr1.fa chr1.1-247249719.ss
mkdir -p /san/sanvol1/scratch/hg18/multizPrimate/cons/estimate
cp msa.split/starting-tree.mod /san/sanvol1/scratch/hg18/multizPrimate/cons/estimate
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /san/sanvol1/braney/multizPrimate
set WINDOWS = /san/sanvol1/scratch/hg18/multizPrimate/cons/ss
pushd $WINDOWS
set c = $1
rm -fr $c
mkdir $c
twoBitToFa -seq=$c /scratch/data/hg18/hg18.2bit /scratch/tmp/hg18.$c.fa
set TMP = /scratch/BR.$c.maf
zcat $MAFS/$c.maf.gz | perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' | \
mafOrder stdin /cluster/data/hg18/bed/multizPrimate/primates.list $TMP
/cluster/bin/phast/$MACHTYPE/msa_split $TMP \
-i MAF \
-M /scratch/tmp/hg18.$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
rm -f $TMP /scratch/tmp/hg18.$c.fa
popd
date >> $c.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
# << happy emacs
# do the easy ones first to see some immediate results
ls -1S -r ../anno/maf | sed -e "s/.maf//" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... etc
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 3520s 58.66m 0.98h 0.04d 0.000 y
# IO & Wait Time: 1200s 20.00m 0.33h 0.01d 0.000 y
# Average job time: 96s 1.61m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 464s 7.73m 0.13h 0.01d
# Submission to last job: 723s 12.05m 0.20h 0.01d
# XXXX Estimates were attempted but turned out not to be very useful;
# instead, as seen below, simply take the cons and noncons trees from the
# mouse 30-way
# Estimate phastCons parameters
# see also:
# http://compgen.bscb.cornell.edu/~acs/phastCons-HOWTO.html
# Create a list of .ss files over 3,000,000 bytes in size (the $5 file-size
# column of ls -l below); this is almost everything
cd /san/sanvol1/scratch/hg18/multizPrimate/cons/ss
ls -1l chr*/chr*.ss | egrep -v "_hap|chrUn|random" | \
awk '$5 > 3000000 {print $9;}' > ../tuningRun.list
# Set up parasol directory to calculate trees on these 50 regions
ssh pk
mkdir /cluster/data/hg18/bed/multizPrimate/treeRun2
cd /cluster/data/hg18/bed/multizPrimate/treeRun2
mkdir tree log most
# Tuning this loop should come back to here to recalculate
# Create script that calls phastCons with right arguments
cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set SAN="/san/sanvol1/scratch/hg18/multizPrimate/cons"
set SS=$1
set C=$1:h
set F=$1:t
set tmpDir="/scratch/tmp/pA2_$2"
rm -fr $tmpDir
mkdir $tmpDir
mkdir -p log/${C} tree/${C} most/${C}
cp -p $SAN/ss/$1 $tmpDir/$F
cp -p $SAN/estimate/starting-tree.mod $tmpDir
pushd $tmpDir
/cluster/bin/phast/$MACHTYPE/phastCons $F starting-tree.mod \
--gc 0.355 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-length 45 --target-coverage 0.3 --most-conserved $F.most \
--quiet --log $F.log --estimate-trees $F.tree
popd
cp -p $tmpDir/$F.log log/$C
cp -p $tmpDir/$F.most most/$C
cp -p $tmpDir/$F.tree.*cons.mod tree/$C
rm -fr $tmpDir
'_EOF_'
# << happy emacs
chmod a+x makeTree.csh
# Create gensub file
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1) $(num1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Make cluster job and run it
scp -p braney@pk:/san/sanvol1/scratch/hg18/multizPrimate/cons/tuningRun.list .
gensub2 tuningRun.list single template jobList
para create jobList
para try/push/check/etc
# Completed: 310 of 310 jobs
# CPU time in finished jobs: 226767s 3779.45m 62.99h 2.62d 0.007 y
# IO & Wait Time: 1224s 20.40m 0.34h 0.01d 0.000 y
# Average job time: 735s 12.26m 0.20h 0.01d
# Longest finished job: 908s 15.13m 0.25h 0.01d
# Submission to last job: 4948s 82.47m 1.37h 0.06d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ls -1 tree/chr*/*.cons.mod > cons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
--output-average ave.cons.mod > cons_summary.txt
ls -1 tree/chr*/*.noncons.mod > noncons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
--output-average ave.noncons.mod > noncons_summary.txt
sort -k1,1 -k2,2n most/chr*/*.most > mostConserved.bed
wc -l mostConserved.bed
# 1192414 mostConserved.bed
# measuring entropy
# consEntropy <target coverage> <expected length>
# ave.cons.mod ave.noncons.mod --NH 9.78
/cluster/bin/phast/$MACHTYPE/consEntropy .3 45 \
ave.cons.mod ave.noncons.mod
# Transition parameters: gamma=0.300000, omega=45.000000, mu=0.022222,
# nu=0.009524
# Relative entropy: H=0.141789 bits/site
# Expected min. length: L_min=98.721504 sites
# Expected max. length: L_max=62.917932 sites
# Phylogenetic information threshold: PIT=L_min*H=13.997639 bits
ssh hgwdev featureBits -noRandom -noHap hg18 `pwd`/mostConserved.bed
# 372348946 bases of 2858034764 (13.028%) in intersection
ssh hgwdev featureBits -noRandom -noHap -enrichment hg18 genscan:cds \
`pwd`/mostConserved.bed
# genscan:cds 1.927%,
# mostConserved.bed 13.028%,
# both 0.300%, cover 15.57%, enrich 1.20x
# Estimates could be made, but more correctly, take the 30-way
# .mod file, and re-use it here.
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate
# cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .
# add up the C and G:
grep BACKGROUND treeRun2/ave.noncons.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.355
# This 0.355 is used in the --gc argument below
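# for reference (illustrative values, not from the original log): the
# BACKGROUND line of a phast .mod file lists the A C G T equilibrium
# frequencies, e.g.
#   BACKGROUND: 0.322500 0.177500 0.177500 0.322500
# so the awk's $3 + $4 sums the C and G columns to give the GC fraction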
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh pk
mkdir -p /cluster/data/hg18/bed/multizPrimate/cons/run.cons
cd /cluster/data/hg18/bed/multizPrimate/cons/run.cons
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working directory
# $cwd:t which is the "grp" in this script. It is one of:
# all glires placentals
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.2007-05-04
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/hg18/bed/multizPrimate/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/hg18/multizPrimate/cons
cp -p $cons/$grp/*.mod .
cp -p $san/ss/$c/$f.ss $cons/$grp/*.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phastCons $f.ss ave.cons.mod,ave.noncons.mod \
--expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
# $PHASTBIN/phastCons $f.ss $grp.mod \
# --rho $rho --expected-length $len --target-coverage $cov --quiet \
# --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 4
touch $san/$grp/pp/$c $san/$grp/bed/$c
rm -f $san/$grp/pp/$c/$f.pp
rm -f $san/$grp/bed/$c/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/hg18/multizPrimate/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
pushd /san/sanvol1/scratch/hg18/multizPrimate/cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/hg18/bed/multizPrimate/cons/ss.list
popd
# run for all species
cd ..
mkdir -p all run.cons/all
cd all
# /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
# --prune-all-but=hg18,hg18,panTro2,rheMac2,calJac1,mm9,monDom4,ornAna1 \
# > all.mod
cd ../run.cons/all
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create template file for "all" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/hg18/multizPrimate/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# crashed jobs are OK methinks since we're checking output in
# bed file instead of pp file
# Completed: 332 of 337 jobs
# Crashed: 5 jobs
# CPU time in finished jobs: 11572s 192.86m 3.21h 0.13d 0.000 y
# IO & Wait Time: 3189s 53.15m 0.89h 0.04d 0.000 y
# Average job time: 44s 0.74m 0.01h 0.00d
# Longest finished job: 60s 1.00m 0.02h 0.00d
# Submission to last job: 564s 9.40m 0.16h 0.01d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/hg18/multizPrimate/cons/all
time nice -n +19 cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/hg18/bed/multizPrimate/cons/all
# load into database
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate/cons/all
time nice -n +19 hgLoadBed hg18 phastConsElementsPrimate mostConserved.bed
# Loaded 1431934 elements of size 5
# Try for 5% overall cov, and 70% CDS cov
featureBits hg18 phastConsElementsPrimate
# 460640890 bases of 2881515245 (15.986%) in intersection
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/hg18/multizPrimate/cons/all
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastConsPrimateScores
for D in pp/chr*
do
C=${D/pp\/}
out=phastConsPrimateScores/${C}.data.gz
echo "${D} > ${C}.data.gz"
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
gzip > ${out}
done
'_EOF_'
# << happy emacs
chmod +x gzipAscii.sh
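# note on the script above (not in the original log): ${D/pp\/} strips the
# leading "pp/" from each directory name, e.g. pp/chr1 -> chr1, so scores for
# chr1 end up in phastConsPrimateScores/chr1.data.gz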
time nice -n +19 ./gzipAscii.sh
# real 47m46.099s
# copy the phastConsPrimateScores to:
# /cluster/data/hg18/bed/multizPrimate/downloads/phastConsPrimate/phastConsScores
# for hgdownload downloads
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
cd /san/sanvol1/scratch/hg18/multizPrimate/cons/all
time nice -n +19 ls phastConsPrimateScores/*.data.gz | xargs zcat \
| wigEncode -noOverlap stdin phastConsPrimate.wig phastConsPrimate.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 30m18.821s
time nice -n +19 cp -p *.wi? /cluster/data/hg18/bed/multizPrimate/cons/all
# real 1m26.426s
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate/cons/all
ln -s `pwd`/phastConsPrimate.wib /gbdb/hg18/multizPrimate/phastConsPrimate.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multizPrimate hg18 \
phastConsPrimate phastConsPrimate.wig
# real 0m53.686s
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate/cons/all
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastConsPrimate > histogram.data 2>&1
# real 5m10.426s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Histogram phastConsPrimate track"
set xlabel " phastConsPrimate score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
#############################################################################
## Annotate multizPrimate multiple alignment with gene annotations
## (DONE - 2008-02-11 braney )
# Gene frames
## survey all genomes to see what type of gene track to use
ssh hgwdev
mkdir /cluster/data/hg18/bed/multizPrimate/frames
cd /cluster/data/hg18/bed/multizPrimate/frames
# dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them
cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
foreach db (`cat ../species.list`)
echo -n "${db}: "
echo -n "Tables: "
set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
foreach table ($tables)
if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
$table == "knownGene") then
set count = `hgsql $db -N -e "select count(*) from $table"`
echo -n "${table}: ${count}, "
endif
end
set orgName = `hgsql hgcentraltest -N -e \
"select scientificName from dbDb where name='$db'"`
set orgId = `hgsql hg18 -N -e \
"select id from organism where name='$orgName'"`
if ($orgId == "") then
echo "Mrnas: 0"
else
set count = `hgsql hg18 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
echo "Mrnas: ${count}"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./showGenes.csh
# given this output, manually sorted for this display:
# calJac1: Tables: Mrnas: 3558
# canFam2: Tables: ensGene: 25568, refGene: 864, Mrnas: 367629
# hg18: Tables: ensGene: 43569, knownGene: 56722, mgcGenes: 28497, refGene:
# 26066, Mrnas: 8354195
# mm9: Tables: ensGene: 43795, knownGene: 49409, mgcGenes: 22368, refGene:
# 21395, Mrnas: 5093221
# otoGar1: Tables: Mrnas: 0
# panTro2: Tables: ensGene: 32852, mgcGenes: 4, refGene: 26344, Mrnas: 6346
# ponAbe2: Tables: Mrnas: 0
# rheMac2: Tables: ensGene: 38561, refGene: 445, Mrnas: 61770
# rn4: Tables: ensGene: 33745, knownGene: 8202, mgcGenes: 5704, refGene: 14498,
# Mrnas: 872209
# tupBel1: Tables: Mrnas: 2364
# use knownGene for hg18, mm9
# use ensGene for rn4, canFam2, panTro2, rheMac2
# use Mrnas for calJac1, ponAbe2
# no annotations for
# tupBel1, otoGar1
mkdir genes
# knownGene
for DB in hg18 mm9
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# ensGene
for DB in rn4 canFam2 panTro2 rheMac2
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# and finally, using the mrna tables
for DB in calJac1 ponAbe2
do
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
tmpMrna=${DB}.mrna.${tmpExt}
tmpCds=${DB}.cds.${tmpExt}
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$DB > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
rm -f $tmpExt
echo "${DB} done"
done
ssh kkstore06
cd /cluster/data/hg18/bed/multizPrimate/frames
time (cat ../anno/maf/*.maf | nice -n +19 genePredToMafFrames hg18 stdin stdout rn4 genes/rn4.gp.gz mm9 genes/mm9.gp.gz hg18 genes/hg18.gp.gz rheMac2 genes/rheMac2.gp.gz ponAbe2 genes/ponAbe2.gp.gz panTro2 genes/panTro2.gp.gz canFam2 genes/canFam2.gp.gz calJac1 genes/calJac1.gp.gz | gzip > multizPrimate.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multizPrimate.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
# 2732 calJac1
# 190927 hg18
# 195671 panTro2
# 208637 rheMac2
# 230764 mm9
# 231026 rn4
# 248086 canFam2
# load the resulting file
ssh hgwdev
cd /cluster/data/hg18/bed/multizPrimate/frames
time nice -n +19 hgLoadMafFrames hg18 multizPrimateFrames \
multizPrimate.mafFrames.gz
# real 1m1.893s
# enable the trackDb entries:
# frames multizPrimateFrames
# irows on
#############################################################################
## Add CTD data (DONE - 2008-02-22, updated 2008-03-07, Fan )
mkdir /cluster/store11/gs.19/build36/bed/ctd021508
cd /cluster/store11/gs.19/build36/bed/ctd021508
# Download chem_gene_ixns.tsv from CTD site, http://ctd.mdibl.org/downloads/.
hgsql hg18 -e 'create database ctd'
hgsql ctd < ~/kent/src/hg/lib/chem_gene_ixns.sql
hgsql ctd -e 'load data local infile "chem_gene_ixns.tsv" into table chem_gene_ixns'
# create sorted data
hgsql hg18 -N -e \
'select x.geneSymbol, ChemicalId, count(distinct Interaction), ChemicalName from kgXref x, ctd.chem_gene_ixns c where x.geneSymbol=c.GeneSymbol group by x.geneSymbol, ChemicalId'|\
sort -k 1,1 -k 3,3nr -k 4,4 >ctdSorted.tab
hgsql hgFixed < ~/kent/src/hg/lib/ctdSorted.sql
hgsql hgFixed -e 'load data local infile "ctdSorted.tab" into table ctdSorted'
#############################################################################
# CREATE huge TABLE FOR HuGE LINK (DONE 3/6/08, Fan)
# Get HuGEgeneList.txt (list of HuGE genes from HuGE collaborator).
mkdir /cluster/store11/gs.19/build36/bed/HuGE
cd /cluster/store11/gs.19/build36/bed/HuGE
# put the file there.
cp HuGEgeneList.txt huge.tab
# get rid of header lines and blank lines at the end.
vi huge.tab
hgsql hg17 < ~/kent/src/hg/lib/huge.sql
hgsql hg18 < ~/kent/src/hg/lib/huge.sql
hgsql hg17 -e 'load data local infile "huge.tab" into table huge'
hgsql hg18 -e 'load data local infile "huge.tab" into table huge'
#############################################################################
#############################################################################
# ULTRACONSERVED TRACKS (LIFT FROM HG17) (DONE 2008-03-10, Andy)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir ultras
cd ultras/
echo "select chrom,chromStart,chromEnd,name from uc16" \
| hgsql hg17 | tail +2 > uc16Hg17.bed
echo "select chrom,chromStart,chromEnd,name from ux16" \
| hgsql hg17 | tail +2 > ux16Hg17.bed
liftOver uc16Hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \
uc16Hg18.bed uc16Hg18.unmapped
liftOver ux16Hg17.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \
ux16Hg18.bed ux16Hg18.unmapped
hgLoadBed hg18 uc16 uc16Hg18.bed
hgLoadBed hg18 ux16 ux16Hg18.bed
#############################################################################
# TAJIMA'S D (LIFTOVER FROM HG17) (DONE 3/17/08 angie)
ssh hgwdev
mkdir /cluster/data/hg18/bed/tajdLiftOver
cd /cluster/data/hg18/bed/tajdLiftOver
# The submitted hg17 bedGraph custom tracks had 1-based start coords,
# so correct; also, the tajdSnp* tables used a sql command to set
# the rs names, so get the data from SQL not file:
set loChain = /cluster/data/hg17/bed/liftOver/hg17ToHg18.over.chain.gz
foreach pop (Ad Ed Xd)
zcat /cluster/data/hg17/bed/tajdpoly/20050603/hg17.tajd$pop.bedGraph.gz \
| awk '{print $1 "\t" $2-1 "\t" $3 "\t" $4}' \
| liftOver stdin -minMatch=0.5 \
$loChain hg18.tajd$pop.bedGraph hg17.tajd$pop.unmapped
hgsql hg17 -NBe "select chrom,chromStart,chromEnd,name from tajdSnp$pop" \
| liftOver stdin \
$loChain hg18.tajdSnp$pop.bed hg17.tajdSnp$pop.unmapped
end
foreach pop (Ad Ed Xd)
hgLoadBed hg18 tajdSnp$pop hg18.tajdSnp$pop.bed
hgLoadBed -bedGraph=4 hg18 tajd$pop hg18.tajd$pop.bedGraph
end
# The hg17 build had some fancy sql to find items overlapping with gaps,
# awk'd to make sql to delete those items. Use featureBits to find:
foreach pop (Ad Ed Xd)
featureBits hg18 -countGaps tajdSnp$pop gap -bed=tajdSnp$pop.gap.bed
featureBits hg18 -countGaps tajd$pop gap -bed=tajd$pop.gap.bed
end
wc -l *.gap.bed
# 8 tajdAd.gap.bed
# 8 tajdEd.gap.bed
# 0 tajdSnpAd.gap.bed
# 0 tajdSnpEd.gap.bed
# 0 tajdSnpXd.gap.bed
# 8 tajdXd.gap.bed
diff tajdAd.gap.bed tajdEd.gap.bed
diff tajdAd.gap.bed tajdXd.gap.bed
# No output from either diff -- same ranges.
awk '{print $3 - $2;}' tajdAd.gap.bed
#2605
#5000
#5000
#1000
#1199
#1359
#5000
#4100
# Actually, I disagree with removing the items that overlap those.
# As the description page says, each 10kb region is really the center
# of a 100kb window. Those windows will overlap gaps -- and if the
# center 10k of a window happens to overlap a gap, the whole window is
# no worse than a window that overlaps a gap 1/3 of the way in instead
# of 1/2.
#############################################################################
# ADD ALLEN BRAIN CORTEX LINK (DONE, 2/12/08, Fan)
mkdir -p /cluster/store11/gs.19/build36/bed/allenBrain
cd /cluster/store11/gs.19/build36/bed/allenBrain
# save list of genes from Allen Brain into file allenBrainGene.tab
hgsql hg18 < ~/src/hg/lib/allenBrainGene.sql
hgsql hg18 -e \
'load data local infile "allenBrainGene.tab" into table allenBrainGene'
#############################################################################
# BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-10 - larrym)
ssh kkstore04
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.equCab2.2008-04-10
cd /cluster/data/hg18/bed/blastz.equCab2.2008-04-10
cat << '_EOF_' > DEF
# Human vs. Horse
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.equCab2.2008-04-10
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab2/blastz.hg18 >>& do.log &
# failed so had to rerun stuff manually then, continue thus:
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=load \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/equCab2/blastz.hg18 >>& do.log &
# 0.157u 0.084s 1:21:15.25 0.0% 0+0k 0+0io 0pf+0w
ln -s blastz.equCab2.2008-04-10 /cluster/data/hg18/bed/blastz.equCab2
featureBits hg18 -chrom=chr1 chainEquCab2Link
# 133103986 bases of 224999719 (59.157%) in intersection
cd /cluster/data/hg18/bed/blastz.equCab2.2008-04-10
cat fb.hg18.chainEquCab2Link.txt
# 1647122438 bases of 2881515245 (57.162%) in intersection
# re-running with fixed UnScaffolds business with fixed chr27:
mkdir /hive/data/genomes/hg18/bed/blastzEquCab2.2008-12-01
cd /hive/data/genomes/hg18/bed/blastzEquCab2.2008-12-01
cat << '_EOF_' > DEF
# Human vs. Horse
BLASTZ=blastz
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/scratch/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzEquCab2.2008-12-01
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
# broken chain step for chr19, ran manually all day long on swarm, then
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-continue=chainMerge -verbose=2 -workhorse=hgwdev \
-stop=net -smallClusterHub=pk -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1
XXX - running Tue Dec 2 15:42:18 PST 2008
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-continue=syntenicNet -syntenicNet -verbose=2 -workhorse=hgwdev \
-stop=syntenicNet -smallClusterHub=pk -bigClusterHub=pk \
-debug -chainMinScore=3000 -chainLinearGap=medium > syntenicNet.log 2>&1
#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (DONE 4/18/08 angie - UPDATED 11/4/08)
ssh hgwdev
mkdir /cluster/data/hg18/bed/mrnaPcr
cd /cluster/data/hg18/bed/mrnaPcr
# First, get consistent FA and PSL for UCSC Genes.
# Initially I tried to use files from /cluster/data/hg18/bed/ucsc.10/:
# subColumn 10 /cluster/data/hg18/bed/ucsc.10/rnaToGenome.psl
# /cluster/data/hg18/bed/ucsc.10/txToAcc.tab ucscGenes.hg18.psl
# /cluster/data/hg18/bed/ucsc.10/ucscGenes.fa
# But the psl was not from exactly the same seq's as in the fa.
# Jim's suggestion: use sequenceForBed to get genomic-translated
# sequences, and then genePredToFakePsl. sequenceToBed must be
# run on hgwdev.
genePredToBed /cluster/data/hg18/bed/ucsc.11/ucscGenes.gp > ucscGenes.bed
hgsql hg18 -NBe 'select kgId,geneSymbol from kgXref' \
| perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
> idSub.txt
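# idSub.txt maps each kgId to kgId__geneSymbol, one tab-separated pair per
# line, e.g. (hypothetical identifiers):
#   uc001abc.1	uc001abc.1__MYGENE
head -3 idSub.txt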
subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
sequenceForBed -keepName -db=hg18 -bedIn=ucscGenesIdSubbed.bed \
-fastaOut=stdout \
| faToTwoBit stdin kgTargetSeq.2bit
cut -f 1-10 /cluster/data/hg18/bed/ucsc.11/ucscGenes.gp \
| genePredToFakePsl hg18 stdin kgTargetAli.psl /dev/null
# Load up the UCSC Genes target PSL table and put 2bit in /gbdb::
cd /cluster/data/hg18/bed/mrnaPcr
hgLoadPsl hg18 kgTargetAli.psl
mkdir /gbdb/hg18/targetDb
ln -s /cluster/data/hg18/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg18/targetDb/
# Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
# /gbdb/hg18/targetDb/kgTargetSeq.2bit .
ssh hgwdev
# Add records to hgcentraltest blatServers and targetDb:
hgsql hgcentraltest -e \
'INSERT into blatServers values ("hg18KgNov08", "blat13", 17799, 0, 1);'
hgsql hgcentraltest -e \
'INSERT into targetDb values("hg18KgNov08", "UCSC Genes", \
"hg18", "kgTargetAli", "", "", \
"/gbdb/hg18/targetDb/kgTargetSeq.2bit", 1, now(), "");'
#############################################################################
# MAKE PCR TARGET FOR SNAPSHOT OF ALL_MRNA (DONE 4/18/08 angie)
ssh hgwdev
# Load up native mRNA target tables:
hgsql hg18 -NBe 'select qName from all_mrna' \
| sort -u > mrnaAccs.txt
$HOME/kent/src/hg/makeDb/genbank/bin/$MACHTYPE/gbGetSeqs \
-gbRoot=/gbdb/genbank -accFile=mrnaAccs.txt \
-db=hg18 -native genbank mrna mrnaTargetSeq.fa
faToTwoBit mrnaTargetSeq.fa mrnaTargetSeq.2bit
ln -s /cluster/data/hg18/bed/mrnaPcr/mrnaTargetSeq.2bit \
/gbdb/hg18/targetDb/
hgsql hg18 -e ' \
create table mrnaTargetAli select * from all_mrna; \
alter table mrnaTargetAli add index (tName,bin); \
alter table mrnaTargetAli add index (qName);'
rm *.tab
ssh kolossus
# Start up gfServer for mrnaTargetSeq:
cd /cluster/data/hg18/bed/mrnaPcr
faToTwoBit mrnaTargetSeq.fa mrnaTargetSeq.2bit
gfServer -stepSize=5 -canStop start localhost 17991 mrnaTargetSeq.2bit &
ssh hgwdev
# Add records to hgcentraltest blatServers and targetDb:
hgsql hgcentraltest -e \
'INSERT into blatServers values ("hg18MrnaApr08", "kolossus", 17991, 0, 1);'
hgsql hgcentraltest -e \
'INSERT into targetDb values("hg18MrnaApr08", "Human mRNAs", \
"hg18", "mrnaTargetAli", "", "", \
"/gbdb/hg18/targetDb/mrnaTargetSeq.2bit", 2, now(), "");'
#############################################################################
# Reload CCDS from CCDS.20080502 dump (2008-05-03 markd)
# import ccds database as described in ccds.txt
set db=hg18
set ncbiBld=36.3
# create and load ccdsGene and ccdsInfo tables from imported database
/cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ${ncbiBld} ccdsInfo ccdsGene
# ccdsKgMap
/cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
checkTableCoords ${db} -verbose=2 ccdsGene
# update all.joiner to include ${db} in ccdsDb
joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
# request push of
ccdsGene
ccdsInfo
ccdsKgMap
# << emacs
############################################################################
# update vega genes to version 31 (v49 of Ensembl genes)
# (DONE - 2008-05-15 - Hiram)
mkdir /cluster/data/hg18/bed/vega31_49
cd /cluster/data/hg18/bed/vega31_49
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/human/gtf_file.gz"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/human/CHANGELOG.gz"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/human/catalog.txt"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/human/pep/Homo_sapiens.VEGA.apr.pep.tot.fa.gz"
# processing similar to the same processing for Ensembl genes,
# from /cluster/data/hg18/bed/ensGene.49/process/doProcess.csh
zcat gtf_file.gz \
| sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \
| liftUp -type=.gtf stdout \
/cluster/data/hg18/jkStuff/ensGene.haplotype.lift carry stdin \
| gzip > allGenes.gtf.gz
gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \
| gzip > hg18.allGenes.gp.gz
/cluster/home/hiram/kent/src/hg/utils/automation/extractGtf.pl \
infoOut.txt > ensGtp.tab
genePredCheck -db=hg18 hg18.allGenes.gp.gz
# checked: 62418 failed: 0
zcat allGenes.gtf.gz | grep -i pseudo > pseudo.gtf
zcat allGenes.gtf.gz | grep -v -i pseudo > not.pseudo.gtf
gtfToGenePred -genePredExt pseudo.gtf pseudo.gp
gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp
genePredCheck -db=hg18 pseudo.gp
# checked: 5747 failed: 0
genePredCheck -db=hg18 not.pseudo.gp
# checked: 56671 failed: 0
hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp
hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp
############################################################################
# DGV V8 (DATABASE OF GENOMIC VARIANTS) (DONE 8/12/09 angie)
# DGV V7 done 3/11/09
# DGV V6 thin regions dropped 2/23/09
# DGV V6 with useless thin regions done 11/12/08
# DGV V5 done 7/16/08
# DGV V4 done 5/9/08
ssh hgwdev
mkdir /hive/data/genomes/hg18/bed/dgv.v8
cd /hive/data/genomes/hg18/bed/dgv.v8
wget --timestamping \
http://projects.tcag.ca/variation/downloads/variation.hg18.v8.aug.2009.txt
wget --timestamping \
http://projects.tcag.ca/variation/downloads/indel.hg18.v8.aug.2009.txt
# Save previous version for comparison:
hgsql hg18 -e 'rename table dgv to dgvV7'
# shuffle fields into bed8+
foreach f (*.v8.*.txt)
tail -n +2 $f \
| perl -wpe 'chomp; \
($id, $landmark, $chr, $start, $end, $varType, \
undef, undef, undef, $ref, $pmid, $method, \
$gain, $loss, undef, undef, $sample) = split("\t"); \
$id =~ s/^Variation_//; \
$start-- unless ($start == 0); \
$landmark = "" if ($landmark =~ /^chr.*\d\.\.\d/); \
$rgb = ($varType =~ /^Inv/) ? "100,0,100" : "0,200,0"; \
if ($gain ne "" || $loss ne "") { \
$gain =~ s/^(NA)? ?$/0/; $loss =~ s/^(NA)? ?$/0/; \
$rgb = "200,0,0" if ($gain > 0 && $loss == 0); \
$rgb = "0,0,200" if ($loss > 0 && $gain == 0); \
} \
$_ = join("\t", $chr, $start, $end, $id, 0, "+", \
$start, $start, $rgb, $landmark, $varType, \
$ref, $pmid, $method, $sample) . "\n";' \
> $f:r.bed
end
hgLoadBed hg18 dgv *.bed \
-sqlTable=$HOME/kent/src/hg/lib/dgv.sql -tab
#Loaded 49988 elements of size 15
hgsql hg18 -NBe 'select count(distinct(pubMedId)) from dgv;'
#35
############################################################################
# AGILENT CGH PROBES (AND MM8, RN4) (Done 2008-05-13, Andy)
ssh hgwdev
bash
cd /cluster/data/hg18/bed
mkdir agilentProbes
cd agilentProbes/
cp /usr/local/apache/htdocs/donna/Agilent/Agilent_Human_CGH.zip .
# (agilent-provided zips)
# what a pain... this zipfile isn't unzippable using linux unzip.
# Bob's windows machine didn't do it either. Finally got it using the
# mac in Erich and Victoria's office. Extracting creates a directory
# called "Agilent_Human_CGH Folder"
cp Agilent_Human_CGH\ Folder/* .
rmdir Agilent_Human_CGH\ Folder/
tail +3 014693_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilent244a.bed
tail +3 014698_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilent105a.bed
tail +3 014950_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilent44k.bed
for bed in *.bed; do hgLoadBed hg18 ${bed%.bed}{,.bed}; done
cd /cluster/data/mm8/bed
mkdir agilentCgh
cd agilentCgh/
cp /usr/local/apache/htdocs/donna/Agilent/Agilent_Mouse_CGH.zip .
# (same crap as before with the zip file)
cp Agilent_Mouse_CGH\ Folder/* .
rmdir Agilent_Mouse_CGH\ Folder/
tail +3 014695_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh244a.bed
tail +3 014699_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh105a.bed
tail +3 015028_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh44k.bed
for bed in *.bed; do hgLoadBed mm8 ${bed%.bed}{,.bed}; done
cd /cluster/data/rn4/bed
mkdir agilentCgh
cd agilentCgh/
cp /usr/local/apache/htdocs/donna/Agilent/Agilent_Rat_CGH.zip .
# (yep, again)
cp Agilent_Rat_CGH\ Folder/* .
rmdir Agilent_Rat_CGH\ Folder/
tail +3 015223_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh244a.bed
tail +3 015235_D_UCSCTrack_20070820.txt | bedSort stdin stdout > agilentCgh105a.bed
for bed in *.bed; do hgLoadBed rn4 ${bed%.bed}{,.bed}; done
############################################################################
# AGILENT HUMAN SUREPRINT G3 ARRAY PROBESETS (DONE 2008-12-09, Andy)
ssh hgwdev
cd /hive/data/hg18/bed/agilentProbes
wget --timestamping --user=microarray --password=<get-it-from-agilent> \
"ftp://ftp.agilent.com/restricted/UCSC_BED_FILES/*"
zcat 021365_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCnv2x400k stdin
zcat 021529_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh1x1m stdin
zcat 021850_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh2x400k stdin
zcat 021924_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh8x60k stdin
zcat 022060_D_UCSCTrack_20081204.txt.gz | tail +3 | hgLoadBed hg18 agilentCgh4x180k stdin
############################################################################
# TWO MORE AGILENT HUMAN ARRAYS (DONE, 2009-07-28 Andy)
ssh hgwdev
cd /hive/data/hg18/bed/agilentProbes
wget --timestamping --user=microarray --password=<get-it-from-agilent> \
"ftp://ftp.agilent.com/restricted/UCSC_BED_FILES/*"
tail -n +3 022837_D_UCSCTrack_20090331.txt | hgLoadBed hg18 agilentCnv2x105k stdin
tail -n +3 023642_D_BED_20090528.bed | \
awk 'BEGIN{FS="\t";OFS="\t"}{print $0, "1000", "+";}' | \
hgLoadBed hg18 agilentHdd1x1m stdin
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# ILLUMINA WG-6 PROBES (2008-06-13 Andy)
# Download the Platform file from GEO here:
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL6884
# Click on "Download full table"
ssh hgwdev
bash
cd /san/sanVol1/scratch/andy
mkdir illumina
cd illumina/
cp ~/GPL6884-5803.txt .
# Collect GIs for all the RNAs
# First download/install Biopython
wget http://biopython.org/DIST/biopython-1.45.tar.gz
tar xfz biopython-1.45.tar.gz
mkdir biopythonLibs
cd biopython-1.45/
python setup.py install --home=/san/sanVol1/scratch/andy/illumina/biopythonLibs
export PYTHONPATH=/san/sanVol1/scratch/andy/illumina/biopythonLibs
# Now get the RNAs
mkdir getRna grabbed
cd getRna/
tail +31 ../GPL6884-5803.txt | cut -f11 | sort | uniq > gis.txt
wc -l gis.txt
# 43338 gis.txt
split -d -l 100 -a 3 gis.txt gis-
rm gis.txt
cat << "EOF" > getSeqs.py
import Bio
from Bio import EUtils
from Bio.EUtils import HistoryClient
gis = open('gis.txt', 'r').readlines()
for i in range(len(gis)):
gis[i] = gis[i].rstrip('\n')
ids = EUtils.DBIds('nucleotide', gis)
client = HistoryClient.HistoryClient()
result = client.post(ids)
print result.efetch(retmode="text", rettype="fasta").read()
EOF
# << emacs
cat << "EOF" > getSeqs.sh
#!/bin/bash
for gi in gis-*; do
numGot="0";
attempt="1";
while [ $numGot -lt 100 ]; do
echo Getting $gi attempt $attempt;
cp $gi gis.txt;
fa=${gi}.fa
python getSeqs.py > $fa
numGot=`grep '>' $fa | wc -l`;
if [ $numGot = 100 ]; then
echo Got all for $gi
mv $fa ../grabbed/;
rm $gi
else
rm $fa;
sleep 10;
fi
attempt=$((attempt+1));
done
sleep 5;
done
EOF
# << emacs
chmod +x getSeqs.sh
./getSeqs.sh
# the script retries each download repeatedly until a full page arrives, but
# the last page has fewer than 100 GIs, so I ran the python program on that
# one by itself.
cat ../grabbed/* > probeRna.fa
rm -rf ../grabbed/
cd ../
# Now blat RNA to genome
mkdir -p blatRna/{splits,out}
cd blatRna/
faSplit sequence ../getRna/probeRna.fa 400 splits/rna-
ls -1 splits/* > splits.lst
cat << "EOF" > runBlat.sh
#!/bin/bash
cd -P .
fa=`basename $1`
chr=`basename $2 .nib`
split=`basename $1 .fa`
out=${split}.${chr}.psl
nibDir=/scratch/hg/hg18/bothMaskedNibs
tmpDir=/scratch/tmp/$out
mkdir $tmpDir
pushd $tmpDir
oldDir=`dirs +1`
cp ${oldDir}/$1 .
blat -noHead -ooc=/scratch/hg/hg18/11.ooc -out=psl ${nibDir}/$2 $fa $out
mkdir -p ${oldDir}/out/${chr}
cp $out ${oldDir}/out/${chr}/
popd
rm -rf $tmpDir
EOF
# << emacs
chmod +x runBlat.sh
cat << "EOF" > gsub
#LOOP
./runBlat.sh {check in line+ $(path1)} $(path2) {check out exists out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
EOF
# << emacs
ls -1 /cluster/data/hg18/nib > nib.lst
ssh pk
cd /san/sanVol1/scratch/andy/illumina/blatRna
gensub2 splits.lst nib.lst gsub spec
para create spec
para try
para push
para time
#17820 jobs in batch
#34457 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 17820 of 17820 jobs
#CPU time in finished jobs: 84196s 1403.26m 23.39h 0.97d 0.003 y
#IO & Wait Time: 48448s 807.47m 13.46h 0.56d 0.002 y
#Average job time: 7s 0.12m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 270s 4.50m 0.07h 0.00d
#Submission to last job: 1515s 25.25m 0.42h 0.02d
exit; # back to hgwdev
mkdir /tmp/andy
pslSort -nohead dirs allSorted.psl /tmp/andy out/*
rmdir /tmp/andy
pslReps -singleHit allSorted.psl single.ps{l,r}
# Blat probes against the RNAs
cd ../
mkdir -p blatProbes/out
cd blatProbes/
ln -s ../blatRna/splits .
ln -s ../blatRna/splits.lst .
ln -s ../blatRna/single.psl .
tail +31 ../GPL6884-5803.txt | cut -f1,11,18 | \
awk '{printf("%s\tgi|%s\t%s\n", $1, $2, $3);}' > probes.tab
cat << "EOF" > probeBlat.sh
#!/bin/bash
faFile=`basename $1`;
pslFile=${faFile%.fa}.psl
probeFile=$2;
rnaOnGenomePsl=$3;
tmpDir=/scratch/andy/`date +"%T" | tr ':' '_'`.$$
mkdir -p $tmpDir
cp $1 $2 $3 $tmpDir
pushd $tmpDir
for id in `grep '>' $faFile | sed 's/^>//'`; do
# make probe fa
echo $id
awk '{if ($2 == "'"$id"'") printf(">%s\n%s\n", $1, $3);}' $probeFile \
> probe.fa
# extract single RNA fa
faOneRecord $faFile $id > rna.fa
blat -noHead rna.fa probe.fa probeOnRna.psl
awk 'BEGIN{FS="\t";OFS="\t";}{if ($10 == "'"$id"'") print;}' \
$rnaOnGenomePsl > rnaOnGenome.psl
if [ `find . -size '0b' -type f | wc -l` == 0 ]; then
pslMap probeOnRna.psl rnaOnGenome.psl probeOnGenome.psl
cat probeOnGenome.psl >> $pslFile
fi
done
popd
cp $tmpDir/$pslFile $4
rm -rf $tmpDir
EOF
# << emacs
chmod +x probeBlat.sh
cat << "EOF" > gsub
#LOOP
./probeBlat.sh {check in line+ $(path1)} probes.tab single.psl {check out exists out/$(root1).psl}
#ENDLOOP
EOF
# << emacs
ssh pk
cd /san/sanVol1/scratch/andy/illumina/blatProbes
gensub2 splits.lst single gsub spec
para create spec
para try
para push
para time
#396 jobs in batch
#41977 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 396 of 396 jobs
#CPU time in finished jobs: 11101s 185.02m 3.08h 0.13d 0.000 y
#IO & Wait Time: 1361s 22.68m 0.38h 0.02d 0.000 y
#Average job time: 31s 0.52m 0.01h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 121s 2.02m 0.03h 0.00d
#Submission to last job: 271s 4.52m 0.08h 0.00d
exit # back to hgwdev
mkdir /tmp/andy
pslSort -nohead dirs sorted.psl /tmp/andy out
# Load stuff up
pslToBed sorted.psl sorted.bed
cd ../
mkdir tables
cd tables/
cp ../blatProbes/sorted.{psl,bed} .
hgLoadPsl -table=illuminaProbesAlign hg18 sorted.psl
hgLoadBed hg18 illuminaProbes sorted.bed
cat << "EOF" > illuminaProbesSeq.sql
CREATE TABLE illuminaProbesSeq (
id varchar(40) NOT NULL,
seq varchar(55) NOT NULL,
PRIMARY KEY (id)
) TYPE=MyISAM;
EOF
# << emacs
cut -f1,3 ../blatProbes/probes.tab > illuminaProbesSeq.tab
hgLoadSqlTab hg18 illuminaProbesSeq{,.sql,.tab}
############################################################################
# dbSNP BUILD 129 (DONE 6/24/08 angie)
# 8/6/08: Regenerated snp129.sql with only those enum/set values that are
# actually used (except always keep unknown, the default) and reloaded snp129.
# No data change -- just the sql field definitions for enums and sets.
# 8/7/08: Swapped molType values cDNA <--> genomic in snp129 because they
# were swapped in the fasta headers.
# QA NOTE: used sudo mytouch to change timestamps on all downstream snp129
# tables (snp129Exceptions, snp129ExceptionDesc, snp129OrthoPt2Pa2Rm2,
# snp129Seq) to .2008-08-08 00:00:00 to avoid unwarranted joinerCheck
# time discrepancy errors. (8/8/08, brooke)
# Set up build directory
ssh kkstore06
mkdir -p /cluster/store3/dbSNP129/{human,shared}
ln -s /cluster/store3/dbSNP129 /cluster/data/dbSNP/129
# Get field encodings -- if there are changes or additions to the
# encoding of the corresponding fields, you might need to update
# snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also
# hg/lib/snp125Ui.c).
cd /cluster/data/dbSNP/129/shared
alias wg wget --timestamping
set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
wg $ftpSnpDb/shared_data/LocTypeCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpClassCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpFunctionCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpValidationCode.bcp.gz
# Here is another source -- it is not as up-to-date as the above, but
# our encodings (enums and sets in snp129.sql) are named more similar
# to those in the 2005 ASN:
# ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn
########################## DOWNLOAD #############################
cd /cluster/data/dbSNP/129/human
mkdir data schema rs_fasta
# Get data from NCBI (anonymous FTP)
wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt
cd /cluster/data/dbSNP/129/human/data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
wg $ftpSnpDb/organism_data/b129_SNPContigLoc_36_3.bcp.gz
wg $ftpSnpDb/organism_data/b129_SNPContigLocusId_36_3.bcp.gz
wg $ftpSnpDb/organism_data/b129_ContigInfo_36_3.bcp.gz
# MapInfo has alignment weights
wg $ftpSnpDb/organism_data/b129_SNPMapInfo_36_3.bcp.gz
# SNP has univar_id, validation status and heterozygosity
wg $ftpSnpDb/organism_data/SNP.bcp.gz
# Get schema
cd /cluster/data/dbSNP/129/human/schema
wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz
# Get fasta files
# using headers of fasta files for molType, class, observed
cd /cluster/data/dbSNP/129/human/rs_fasta
wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz
########################## LOAD NCBI TABLES #############################
# Simplify names of data files -- strip version & extras to get
# local canonical table names.
cd /cluster/data/dbSNP/129/human/data
foreach f (*.bcp.gz)
set new = `echo $f \
| sed -e 's/^b129_SNP//; s/^b129_//; s/_36_3//; s/.bcp//;'`
mv $f $new
echo $new
end
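# worked example of the renaming above (not from the original log):
#   b129_SNPContigLoc_36_3.bcp.gz  -> ContigLoc.gz
#   b129_ContigInfo_36_3.bcp.gz    -> ContigInfo.gz
#   SNP.bcp.gz                     -> SNP.gz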
# Extract just the tables that we need from the NCBI msSQL table
# creation file, and get CREATE statements from
# human_9606_table.sql for our 5 tables
cd /cluster/data/dbSNP/129/human/schema
zcat human_9606_table.sql.gz \
| perl -we '$/ = "\nGO\n\n\n"; \
while (<>) { \
next unless /^CREATE TABLE \[(b129_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_36_3)?\]/; \
s/b129_(SNP)?//; s/_36_3//; \
s/[\[\]]//g; s/GO\n\n/;/; s/smalldatetime/datetime/g; \
s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \
s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \
s/(image|varchar\s+\(\d+\))/BLOB/g; \
print; \
}' \
> table.sql
# load on kolossus or a small cluster machine (mysql5 is OK for this;
# in fact it's better than 4 because it has 'show warnings').
ssh kkr3u00
hgsql '' -e 'create database hg18snp129'
cd /cluster/data/dbSNP/129/human/schema
hgsql hg18snp129 < table.sql
cd ../data
# Avoid wasting space by excluding mappings to non-reference contigs:
foreach t (ContigInfo MapInfo)
zcat $t.gz \
| egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp129 $t placeholder stdin
end
# Compare contig list between our ctgPos and reference contigs in
# ContigInfo:
ssh hgwdev-10 hgsql hg18 -N -B -e '"select contig from ctgPos;"' \
| sort > /tmp/1
hgsql hg18snp129 -NBe 'select distinct(group_label) from ContigInfo'
# --> reference, c5_H2, c6_COX, c6_QBL, c22_H2, DR53
# (HuRef, Celera, CRA_TCAGchr7v2 grepped out above)
hgsql hg18snp129 -N -B -e 'select contig_acc from ContigInfo \
where group_label in \
("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' | sort > /tmp/2
diff /tmp/1 /tmp/2
# No diff.
# Make sure there are no orient != 0 contigs among those selected.
hgsql hg18snp129 -NBe \
'select count(*) from ContigInfo where orient != 0 and \
group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");'
#0
# ContigLoc is huge, and we want just the reference contig mappings.
# So, based on the reference & haplo ctg_id values in ContigInfo,
# filter to get just the mappings for those contigs:
zcat ContigLoc.gz \
| awk '$3 <= 377 || $3 == 7015' \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp129 ContigLoc placeholder stdin
foreach t (ContigLocusId SNP)
zcat $t.gz \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp129 $t placeholder stdin
end
# There were some warnings (many cleared up by the perl substitution)
# but no rows were dropped. 'show warnings' after a manual 'load data'
# complains about missing values (OK when e.g. position is not known).
foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
echo -n "${t}:\t"
hgsql -N -B hg18snp129 -e 'select count(*) from '$t
end
#ContigInfo: 379
#ContigLoc: 15835019 (before filtering: 46913472)
#ContigLocusId: 25496815
#MapInfo: 14845535 (before filtering: 44627804)
#SNP: 14708770
#################### EXTRACT INFO FROM NCBI TABLES ####################
mkdir -p /scratch/snp/129/human
cd /scratch/snp/129/human
time hgsql hg18snp129 -e \
'alter table ContigLoc add index (ctg_id); \
alter table ContigInfo add index (ctg_id);'
#0.002u 0.002s 2:14.79 0.0% 0+0k 0+0io 1pf+0w
# was ~12m on a run without trimming ContigLoc!
time hgsql hg18snp129 -e \
'alter table ContigInfo add index (group_label(9));'
#0.005u 0.000s 0:00.16 0.0% 0+0k 0+0io 1pf+0w
# For joining files by shared column, we need a unique identifier in
# that shared column. snp_id is not unique -- the same rsID can appear
# in both the reference assembly and on one of the others e.g. c6_COX.
# So concatenate the assembly identifier and snp_id to get hopefully
# unique label.
time hgsql hg18snp129 -NBe \
'select concat(ContigInfo.group_label, ".", snp_id), \
ContigInfo.contig_acc, asn_from, asn_to, \
loc_type, orientation, allele, phys_pos_from \
from ContigLoc, ContigInfo \
where ContigLoc.ctg_id = ContigInfo.ctg_id and ContigInfo.group_label \
in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \
| sort \
> ucscContigLoc.txt
# no time output because of the pipe... took 5 minutes.
# Are these IDs unique?
wc -l ucscContigLoc.txt
#15835019 ucscContigLoc.txt
awk '{print $1;}' ucscContigLoc.txt | uniq | wc -l
#14791529
# Nope. Find non-unique IDs:
awk 'prev == $1 {print;} {prev = $1;}' ucscContigLoc.txt | head
grep ^c5_H2.10035195 ucscContigLoc.txt
#c5_H2.10035195 NT_113801 639954 639954 2 0 G 69605321
#c5_H2.10035195 NT_113801 660407 660407 2 0 G 69625774
#c5_H2.10035195 NT_113801 911780 911780 2 1 C 69877147
#c5_H2.10035195 NT_113801 933061 933061 2 1 C 69898428
# OK, they can be duplicated within the same contig. See if we can
# get by with anchoring everything to ucscContigLoc.txt. But everybody
# else better have unique IDs!
# SNP -> valid, avHet, avHetSE
# SNP has only snp_id as identifier, nothing relating to assembly.
hgsql hg18snp129 -NBe \
'select snp_id, validation_status, avg_heterozygosity, het_se \
from SNP;' \
| sort \
> ucscSNP.txt
# Check ID uniqueness:
wc -l ucscSNP.txt
#14708770 ucscSNP.txt
awk '{print $1;}' ucscSNP.txt | uniq | wc -l
#14708770
# ContigLocusId -> func
# ContigLocusId has only snp_id as an identifier (it gives one
# example contig if the SNP is on multiple contigs).
# The sort options and awk are to convert multiple entries with different
# function classes for the same SNP into one entry per SNP with a list
# of function classes.
hgsql hg18snp129 -NBe \
'select snp_id, fxn_class from ContigLocusId;' \
| sort -u -k1,1 -k2,2n \
| awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \
else { if (prevId) {print prevId "\t" prevFunc;} \
prevFunc = $2 ","; }} \
{prevId = $1;} \
END {print prevId "\t" prevFunc;}' \
> ucscFunc.txt
# Check ID uniqueness:
wc -l ucscFunc.txt
#6136008 ucscFunc.txt
awk '{print $1;}' ucscFunc.txt | sort -u | wc -l
#6136008
# MapInfo -> weight
# MapInfo needs assembly+snp_ids in order to have unique IDs.
time hgsql hg18snp129 -e \
'alter table MapInfo add index (assembly(9));'
#0.003u 0.003s 3:40.29 0.0% 0+0k 0+0io 1pf+0w
hgsql hg18snp129 -NBe \
'select concat(assembly, ".", snp_id), weight \
from MapInfo where assembly \
in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' \
| sort \
> weight.txt
# ~1 minute
# Check ID uniqueness:
wc -l weight.txt
#14791529 weight.txt
awk '{print $1;}' weight.txt | uniq | wc -l
#14791529
awk '{print $2;}' weight.txt | sort -n | uniq -c
# 40910 0
#14326127 1
# 157402 2
# 256608 3
# 10482 10
# SNPs w/weight 0 and 10 will be discarded later.
# fasta headers -> observed, molType, class
zcat /cluster/data/dbSNP/129/human/rs_fasta/rs_ch*.fas.gz \
| grep '^>gnl' \
| perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
| sort \
> ucscGnl.txt
# ~5m
wc -l ucscGnl.txt
#14708630 ucscGnl.txt
awk '{print $1;}' ucscGnl.txt | uniq | wc -l
#14708630
############### JOIN NCBI COLUMNS TO GET UCSC SNP COLUMNS ################
# Join files by ID. Start with ContigLoc and MapInfo because they
# share the concatenated assembly+snp_id IDs.
time join -a 1 -e MISSING -t ' ' ucscContigLoc.txt weight.txt \
> ucscCL+w.txt
#28.334u 4.730s 1:43.47 31.9% 0+0k 0+0io 0pf+0w
wc -l ucscCL+w.txt
#15835019 ucscCL+w.txt
# Same as ucscContigLoc.txt above, good.
# Any missing weights?
grep MISSING ucscCL+w.txt | head
# No output, good.
# Join the files with SNP-only IDs.
time join -e MISSING -t ' ' ucscGnl.txt ucscSNP.txt \
> ucscG+S.txt
#17.375u 2.127s 0:47.40 41.1% 0+0k 0+0io 0pf+0w
wc -l ucscG+S.txt
#14708630 ucscG+S.txt
# Same as ucscGnl.txt -- somewhat less than ucscSNP.txt (14708770)...
grep MISSING ucscG+S.txt | wc -l
#0
time join -a 1 -e MISSING -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 \
-t ' ' ucscG+S.txt ucscFunc.txt \
> ucscG+S+F.txt
#18.612u 2.334s 0:50.30 41.6% 0+0k 0+0io 0pf+0w
wc -l ucscG+S+F.txt
#14708630 ucscG+S+F.txt
grep MISSING ucscG+S+F.txt | wc -l
#8572703
# Not surprising -- ucscFunc.txt has only 6136008 lines.
expr 14708630 - 6136008
#8572622
# Not an exact match like in 128, but not too far off.
# Convert assembly+snp_id's to just snp_id (sorted) for final join.
perl -wpe 's/^\S+\.(\d+)/$1/;' ucscCL+w.txt \
| sort > ucscCL+w.snp_id.txt
awk '{print $1;}' ucscCL+w.snp_id.txt | uniq | wc -l
#14626025
# Interesting... which snp_ids are missing from ContigLoc?
# (note: don't use sort -n | comm, it needs alphabetical sort!)
awk '{print $1;}' ucscCL+w.snp_id.txt | sort -u > /tmp/1
awk '{print $1;}' ucscGnl.txt | sort -u > /tmp/2
comm -13 /tmp/1 /tmp/2 > notInContigLoc.txt
comm -23 /tmp/1 /tmp/2 > notInSNP.txt
wc -l notIn*.txt
# 83043 notInContigLoc.txt
# 438 notInSNP.txt
# notInContigLoc could simply mean that they weren't mapped, which is OK.
# notInSNP is more concerning.
#Not deleted!: 52789237, 55664014, 61749732,
#Invalid (not retired): 63751714, 63751902
# -- sent email to snp-admin at ncbi.
# Final join -- treat ContigLoc as authoritative (since it has coords).
# Arrange columns in same order as in the SNP table, with extras for
# checking at the end (phys_pos_from).
# chr chrS chrE name strand refN obs molT cls val aH aHSE fxn locT wt ...
time join -a 1 -e MISSING -t ' ' \
-o '1.2 1.3 1.4 1.1 1.6 1.7 2.2 2.3 2.4 2.5 2.6 2.7 2.8 1.5 1.9 1.8' \
ucscCL+w.snp_id.txt ucscG+S+F.txt \
> ucscNcbiSnp.ctg.txt
#41.204u 6.274s 1:05.99 71.9% 0+0k 0+0io 0pf+0w
wc -l ucscNcbiSnp.ctg.txt
#15835019 ucscNcbiSnp.ctg.txt
grep MISSING ucscNcbiSnp.ctg.txt | awk '{print $4;}' | uniq | wc -l
#8495168
# Lift the map contig coordinates to chrom coordinates (~2m);
time liftUp ucscNcbiSnp.bed \
/cluster/data/hg18/jkStuff/liftContigs.lft warn \
ucscNcbiSnp.ctg.txt
#123.952u 7.587s 2:22.24 92.4% 0+0k 0+0io 5pf+0w
wc -l ucscNcbiSnp.bed
#15835019 ucscNcbiSnp.bed
# At this point, move back from /scratch to /cluster/data.
nice gzip ucscNcbiSnp.bed
cp -p ucscNcbiSnp.bed.gz /cluster/data/dbSNP/129/human/
cp -p notIn* /cluster/data/dbSNP/129/human/
# Drum roll please... translate NCBI's encoding into UCSC's, and
# perform a bunch of checks. This is where developer involvement
# is most likely as NCBI extends the encodings used in dbSNP.
cd /cluster/data/dbSNP/129/human/
gunzip ucscNcbiSnp.bed.gz
# Re-ran this command 8/6/08 to get new snp129.sql that includes
# only those enum/set values that are actually used. No other output
# files changed.
time snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/hg18/hg18.2bit \
snp129
# 8/7/08: added the awk command to unswap the molType values that
# were swapped in dbSNP 129 fasta headers:
# DO NOT USE THIS COMMAND NEXT TIME UNLESS NECESSARY AGAIN:
awk 'BEGIN{OFS="\t";} \
{if ($8 == "genomic") {$8 = "cDNA";} \
else if ($8 == "cDNA") {$8 = "genomic";} \
print;}' ucscNcbiSnp.bed \
| snpNcbiToUcsc stdin /cluster/data/hg18/hg18.2bit snp129
#spaces stripped from observed:
#chr12 5963395 5963395 rs41402545
#count of snps with weight 0 = 63507
#count of snps with weight 1 = 14375595
#count of snps with weight 2 = 325745
#count of snps with weight 3 = 924499
#count of snps with weight 10 = 145673
#Skipped 493 snp mappings due to errors -- see snp129Errors.bed
#210.328u 10.793s 4:04.99 90.2% 0+0k 0+0io 0pf+0w
# More skipped snps than in 128, but same reason:
cut -f 5 snp129Errors.bed | sort | uniq -c
# 493 Missing observed value (deleted SNP?).
cut -f 4 snp129Errors.bed | sort -u | sed -e 's/^rs//' > errIds.txt
comm -13 notInSNP.txt errIds.txt | wc -l
#0
# So those are a subset of the notInSNP.txt ids, good.
wc -l snp*
# 15625346 snp129.bed
# 22 snp129.sql
# 493 snp129Errors.bed
# 18 snp129ExceptionDesc.tab
# 2673142 snp129Exceptions.bed
# Make one big fasta file.
# It's a monster: 16G! Can we split by hashing rsId?
# NOTE FOR NEXT TIME: do this on the fileserver!
zcat rs_fasta/rs_ch*.fas.gz \
| perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
> snp129.fa
# Check for duplicates.
grep ^\>rs snp129.fa | sort > /scratch/tmp/seqHeaders
wc -l /scratch/tmp/seqHeaders
#14708630 /scratch/tmp/seqHeaders
uniq /scratch/tmp/seqHeaders | wc -l
#14708630
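# Re: the note above about splitting the monster fasta by hashing rsId --
# not done for this build, but a minimal sketch (assumptions: plain awk,
# the same header-stripping perl as above, and an arbitrary 16 buckets)
# might look like this:
## zcat rs_fasta/rs_ch*.fas.gz \
##   | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
##   | awk '/^>rs/ {bucket = substr($1,4) % 16;} \
##          {print > ("snp129." bucket ".fa");}'
# Each record would land in snp129.<rsId mod 16>.fa, so a given rsId could
# be looked up without scanning the whole 16G file.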
# Use hgLoadSeq to generate .tab output for sequence file offsets,
# and keep only the columns that we need: acc and file_offset.
# Index it and translate to snpSeq table format.
time hgLoadSeq -test placeholder snp129.fa
#114.516u 37.585s 3:13.58 78.5% 0+0k 0+0io 6pf+0w
cut -f 2,6 seq.tab > snp129Seq.tab
rm seq.tab
ssh hgwdev
# Load up main track tables.
cd /cluster/data/dbSNP/129/human
# Re-ran this command 8/6/08 to get new snp129.sql that includes
# only those enum/set values that are actually used. No data values
# changed. Removed -noSort because Brooke had spotted some entries
# sorted by chromEnd instead of chromStart.
# Re-ran 8/7/08 to pick up corrected molType column in snp129.bed.
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \
hg18 snp129 -sqlTable=snp129.sql snp129.bed
#100.406u 22.673s 9:44.17 21.0% 0+0k 0+0io 0pf+0w
sed -e 's/snp125/snp129/' ~/kent/src/hg/lib/snp125Exceptions.sql \
> snp129Exceptions.sql
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp \
hg18 snp129Exceptions -sqlTable=snp129Exceptions.sql \
snp129Exceptions.bed
#13.125u 1.383s 1:15.39 19.2% 0+0k 0+0io 0pf+0w
sed -e 's/snp125/snp129/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
> snp129ExceptionDesc.sql
hgLoadSqlTab hg18 snp129ExceptionDesc snp129ExceptionDesc.sql \
snp129ExceptionDesc.tab
# Load up sequences.
sed -e 's/snpSeq/snp129Seq/' ~/kent/src/hg/lib/snpSeq.sql \
> snp129Seq.sql
mkdir -p /gbdb/hg18/snp
ln -s /cluster/data/dbSNP/129/human/snp129.fa /gbdb/hg18/snp/snp129.fa
time nice hgLoadSqlTab hg18 snp129Seq snp129Seq.sql snp129Seq.tab
#0.007u 0.006s 3:06.83 0.0% 0+0k 0+0io 0pf+0w
# Put in a link where one would expect to find the track build dir...
ln -s /cluster/data/dbSNP/129/human /cluster/data/hg18/bed/snp129
# Look at the breakdown of exception categories:
ssh kkr3u00
cd /cluster/data/dbSNP/129/human
cut -f 5 snp129Exceptions.bed | sort | uniq -c | sort -nr
#1580567 MultipleAlignments
# 628933 ObservedMismatch
# 387233 SingleClassLongerSpan
# 31425 SingleClassTriAllelic
# 13247 ObservedTooLong
# 11095 FlankMismatchGenomeShorter
# 10365 SingleClassZeroSpan
# 3345 SingleClassQuadAllelic
# 3310 FlankMismatchGenomeLonger
# 1397 DuplicateObserved
# 1250 MixedObserved
# 547 NamedDeletionZeroSpan
# 296 FlankMismatchGenomeEqual
# 93 ObservedContainsIupac
# 35 NamedInsertionNonzeroSpan
# 3 RefAlleleMismatch
# 1 ObservedWrongFormat
#######################################################################
# SNPMASKED SEQUENCE FOR SNP129 (DONE 7/1/08 angie)
ssh kolossus
mkdir /cluster/data/hg18/snp129Mask
cd /cluster/data/hg18/snp129Mask
# Identify rsIds with various problems -- we will exclude those.
# MultipleAlignments is kinda broad because anything that maps on
# both chrN and chrN_foo_hap1 will be excluded... similarly, extra
# matches on chrN_random might disqualify good matches on chrN.
# Well, erring on the side of caution is good.
awk '$5 ~ /^MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved$/ {print $4;}' \
/cluster/data/dbSNP/129/human/snp129Exceptions.bed \
| sort -u \
> snp129ExcludeRsIds.txt
time grep -vFwf snp129ExcludeRsIds.txt \
/cluster/data/dbSNP/129/human/snp129.bed \
> snp129Cleaned.bed
#154.384u 12.550s 3:09.01 88.3% 0+0k 0+0io 0pf+0w
# Substitutions:
mkdir substitutions
snpMaskSingle snp129Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \
| faSplit byname stdin substitutions/
# Also, the warning below about total size just means that some chroms
# didn't have any SNPs that survived the stringent filtering.
#-- 113 warnings about differing observed at same base positions
#-- (113 distinct positions). saved as diffObserved.txt.
#-- Spot-checking, I see a case (chr1|1476801|1476802) where two SNPs
#-- should have been merged -- their flanking sequences were just from
#-- diff. strands. In another case (chr9|10122961|10122962), one of
#-- the mappings looks like an insertion instead of a substitution but
#-- the SNP's class is single, and one genomic base is mapped.
#-- IMO not serious to bother dbSNP about, they want to get on w/130.
#Masked 10637395 snps in 10637306 out of 3091528550 genomic bases
#/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3091528550 (difference is 16148723)
# Make sure that sizes are identical, first diffs are normal -> IUPAC,
# and first diffs' case is preserved:
foreach f (substitutions/chr*.fa)
faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ"
end
#(output OK)
foreach f (substitutions/chr*.fa)
echo $f:t:r
mv $f $f:r.subst.fa
gzip $f:r.subst.fa
end
# Insertions:
mkdir insertions
snpMaskAddInsertions snp129Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \
| faSplit byname stdin insertions/
#Added 1617522 snps totaling 3251578 bases to 3085167749 genomic bases
#/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524)
# Again, that just means that some chroms didn't have filtered SNPs.
# Make sure that all sizes have increased relative to original:
foreach f (insertions/chr*.fa)
faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \
|& perl -we '$_=<>; \
if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \
if ($1 > $2) {print "OK: ins size $1 > $2\n";} \
else {die "ERROR: ins size $1 <= $2\n";} \
} else {die $_;}'
end
#(output OK)
foreach f (insertions/chr*.fa)
mv $f $f:r.ins.fa
gzip $f:r.ins.fa
end
# Deletions:
mkdir deletions
snpMaskCutDeletions snp129Cleaned.bed /cluster/data/hg18/hg18.2bit stdout \
| faSplit byname stdin deletions/
#Cut 1046324 snps totaling 2173708 bases from 3085167749 genomic bases
#/cluster/data/hg18/hg18.2bit has 3107677273 total bases, but the total number of bases in sequences for which we masked snps is 3085167749 (difference is 22509524)
# Again, that just means that some chroms didn't have filtered SNPs.
# Make sure that all sizes have decreased relative to original:
foreach f (deletions/chr*.fa)
faCmp -softMask $f ../[1-9MXY]*/$f:t |& grep -v "that differ" \
|& perl -we '$_=<>; \
if (/^\w+ in \S+ has (\d+) bases. \w+ in \S+ has (\d+) bases/) { \
if ($1 < $2) {print "OK: del size $1 < $2\n";} \
else {die "ERROR: del size $1 >= $2\n";} \
} else {die $_;}'
end
#(output OK)
foreach f (deletions/chr*.fa)
mv $f $f:r.del.fa
gzip $f:r.del.fa
end
# Clean up and prepare for download:
gzip snp129Cleaned.bed
foreach d (substitutions insertions deletions)
pushd $d
md5sum *.gz > md5sum.txt
popd
end
# Make a README.txt in each subdir.
# Create download links on hgwdev.
# NOTE: Currently we offer only the substitutions.
# If we get any user requests, then maybe we can put the insertions
# and deletions out there.
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask
ln -s /cluster/data/hg18/snp129Mask/substitutions/* \
/usr/local/apache/htdocs/goldenPath/hg18/snp129Mask/
## If there is user demand for ins & del, then start over with an empty
## goldenPath/snp129Mask and do this:
## foreach type (substitutions insertions deletions)
## mkdir /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask/$type
## ln -s /cluster/data/hg18/snp129Mask/$type/* \
## /usr/local/apache/htdocs/goldenPath/hg18/snp129Mask/$type/
## end
#######################################################################
# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP129 (DONE 7/2/08 angie)
ssh kolossus
mkdir /cluster/data/hg18/bed/snp129Ortho
cd /cluster/data/hg18/bed/snp129Ortho
# Following Heather's lead in snp126orthos, filter SNPs to keep
# only those with class=single, length=1, chrom!~random;
# Exclude those with exceptions MultipleAlignments,
# SingleClassTriAllelic or SingleClassQuadAllelic.
# Unlike snp masking, we do not filter for weight -- don't know why.
awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
/cluster/data/dbSNP/129/human/snp129Exceptions.bed \
| sort -u \
> snp129ExcludeIds.txt
awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
/cluster/data/dbSNP/129/human/snp129.bed \
| grep -vFwf snp129ExcludeIds.txt \
> snp129Simple.bed
# took ~3 minutes
wc -l snp129Simple.bed
#10633840 snp129Simple.bed
# Glom all human info that we need for the final table onto the
# name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
awk 'BEGIN{OFS="\t";} \
{print $1, $2, $3, \
$4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
0, $6;}' \
snp129Simple.bed > snp129ForLiftOver.bed
# Map coords to chimp using liftOver.
# I don't know why chimp took so much longer than macaque... the
# chimp .over has fewer chains and fewer bytes than the macaque .over.
mkdir run.liftOChimp
cd run.liftOChimp
mkdir split out
splitFile ../snp129ForLiftOver.bed 25000 split/chunk
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/cluster/data/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \
\{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
ssh pk
cd /cluster/data/hg18/bed/snp129Ortho/run.liftOChimp
para make jobList
#Completed: 426 of 426 jobs
#CPU time in finished jobs: 83616s 1393.60m 23.23h 0.97d 0.003 y
#IO & Wait Time: 1501s 25.02m 0.42h 0.02d 0.000 y
#Average job time: 200s 3.33m 0.06h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 574s 9.57m 0.16h 0.01d
#Submission to last job: 939s 15.65m 0.26h 0.01d
# Map coords to orangutan using liftOver.
mkdir ../run.liftOPon
cd ../run.liftOPon
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/cluster/data/hg18/bed/liftOver/hg18ToPonAbe2.over.chain.gz \
\{check out exists out/ponAbe2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 426 of 426 jobs
#CPU time in finished jobs: 171875s 2864.58m 47.74h 1.99d 0.005 y
#IO & Wait Time: 1767s 29.45m 0.49h 0.02d 0.000 y
#Average job time: 408s 6.79m 0.11h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 1268s 21.13m 0.35h 0.01d
#Submission to last job: 1743s 29.05m 0.48h 0.02d
# Map coords to macaque using liftOver.
mkdir ../run.liftOMac
cd ../run.liftOMac
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/cluster/data/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \
\{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 426 of 426 jobs
#CPU time in finished jobs: 6356s 105.93m 1.77h 0.07d 0.000 y
#IO & Wait Time: 1812s 30.21m 0.50h 0.02d 0.000 y
#Average job time: 19s 0.32m 0.01h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 51s 0.85m 0.01h 0.00d
#Submission to last job: 221s 3.68m 0.06h 0.00d
ssh kolossus
cd /cluster/data/hg18/bed/snp129Ortho
# Note: the formerly inlined script getOrthoSeq.pl has been checked in
# as kent/src/hg/snp/snpLoad/getOrthoSeq.pl.
# Concatenate the chimp results, sorting by chimp pos in order to
# efficiently access 2bit sequence in getOrthoSeq. The output of
# that is then sorted by the glommed human info field, so that we
# can use join to combine chimp and macaque results in the next step.
# Ditto for macaque and orangutan.
sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
| sort > panTro2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
| sort > ponAbe2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
| sort > rheMac2.orthoGlom.txt
# The whole pipeline takes ~5-7 minutes each.
wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
# 9909458 panTro2.orthoGlom.txt
# 9597270 ponAbe2.orthoGlom.txt
# 8467866 rheMac2.orthoGlom.txt
# Use the glommed name field as a key to join up chimp, orangutan and
# macaque allele data. Include glommed name from both files because if only
# file 2 has a line for the key in 2.1, then 1.1 is empty. Then plop
# in the orthoGlom fields from each file, which are in the same order
# as the chimp, orangutan and macaque columns of snp129OrthoPt2Pa2Rm2.
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e '?' \
panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
| awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
else { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
> tmp.txt
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e '?' \
tmp.txt rheMac2.orthoGlom.txt \
| perl -wpe 'chomp; \
($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
$o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
$glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
split(/\|/, $glomKey); \
$o1Start =~ s/^\?$/0/; $o2Start =~ s/^\?$/0/; $o3Start =~ s/^\?$/0/; \
$o1End =~ s/^\?$/0/; $o2End =~ s/^\?$/0/; $o3End =~ s/^\?$/0/; \
print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
$o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
$o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
s/^.*$//;' \
| sort -k1,1 -k2n,2n > snp129OrthoPt2Pa2Rm2.bed
# took ~6 minutes.
wc -l snp129OrthoPt2Pa2Rm2.bed
#10325827 snp129OrthoPt2Pa2Rm2.bed
ssh hgwdev
cd /cluster/data/hg18/bed/snp129Ortho
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
-sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
hg18 snp129OrthoPt2Pa2Rm2 snp129OrthoPt2Pa2Rm2.bed
#Loaded 10325827 elements of size 22
#73.396u 10.864s 10:14.76 13.7% 0+0k 0+0io 0pf+0w
# Cleanup on fileserver:
cd /cluster/data/hg18/bed/snp129Ortho
nice gzip snp129Simple.bed snp129ExcludeIds.txt snp129ForLiftOver.bed
rm -r run*/split tmp.txt *.orthoGlom.txt
############################################################################
# dbSNP BUILD 130 (UPDATED 8/18/09 angie)
# Originally done 5/22/09.
# Set up build directory
mkdir -p /hive/data/outside/dbSNP/130/{human,shared}
# Get field encodings -- if there are changes or additions to the
# encoding of the corresponding fields, you might need to update
# snpNcbiToUcsc, hgTracks, hgc and hgTrackUi (see also
# hg/lib/snp125Ui.c).
cd /hive/data/outside/dbSNP/130/shared
alias wg wget --timestamping
set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database
wg $ftpSnpDb/shared_data/LocTypeCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpClassCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpFunctionCode.bcp.gz
wg $ftpSnpDb/shared_data/SnpValidationCode.bcp.gz
# Here is another source -- it is not as up-to-date as the above, but
# our encodings (enums and sets in snp130.sql) are named more similarly
# to those in the 2005 ASN:
# ftp://ftp.ncbi.nih.gov/snp/specs/docsum_2005.asn
########################## DOWNLOAD #############################
cd /hive/data/outside/dbSNP/130/human
mkdir data schema rs_fasta
# Get data from NCBI (anonymous FTP)
wg ftp://ftp.ncbi.nih.gov/snp/00readme.txt
cd /hive/data/outside/dbSNP/130/human/data
# ContigLoc table has coords, orientation, loc_type, and refNCBI allele
wg $ftpSnpDb/organism_data/b130_SNPContigLoc_36_3.bcp.gz
wg $ftpSnpDb/organism_data/b130_SNPContigLocusId_36_3.bcp.gz
wg $ftpSnpDb/organism_data/b130_ContigInfo_36_3.bcp.gz
# MapInfo has alignment weights
wg $ftpSnpDb/organism_data/b130_SNPMapInfo_36_3.bcp.gz
# SNP has univar_id, validation status and heterozygosity
wg $ftpSnpDb/organism_data/SNP.bcp.gz
# Get schema
cd /hive/data/outside/dbSNP/130/human/schema
wg $ftpSnpDb/organism_schema/human_9606_table.sql.gz
wg $ftpSnpDb/shared_schema/dbSNP_main_table.sql.gz
# Get fasta files
# using headers of fasta files for molType, class, observed
cd /hive/data/outside/dbSNP/130/human/rs_fasta
wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/\*.gz
# Get 1000 Genomes IDs (unfortunately not in validation field as Sol suggested)
cd /hive/data/outside/dbSNP/130/human/data
wg -O 1000Genomes_README ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/1000Genomes/ReadMe.txt
wg ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/1000Genomes/B130_1000G_RsClusterReport.txt.gz
zcat B130_1000G_RsClusterReport.txt.gz | wc -l
#7512342
# Make a uniquified list of only the numeric portion of the assigned rs IDs:
zcat B130_1000G_RsClusterReport.txt.gz \
| cut -d, -f 3 | sed -e 's/^rs//' \
| sort -nu > 1000GenomesRsIds.txt
wc -l 1000GenomesRsIds.txt
#5611085 1000GenomesRsIds.txt
########################## LOAD NCBI TABLES #############################
# Simplify names of data files -- strip version & extras to get
# local canonical table names.
cd /hive/data/outside/dbSNP/130/human/data
foreach f (*.bcp.gz)
set new = `echo $f \
| sed -e 's/^b130_SNP//; s/^b130_//; s/_36_3//; s/.bcp//;'`
mv $f $new
echo $new
end
cd /hive/data/outside/dbSNP/130/human/schema
zcat human_9606_table.sql.gz \
| perl -we '$/ = "\nGO\n\n\n"; \
while (<>) { \
next unless /^CREATE TABLE \[(b130_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_36_3)?\]/; \
s/b130_(SNP)?//; s/_36_3//; \
s/[\[\]]//g; s/GO\n\n/;/; s/smalldatetime/datetime/g; \
s/ON PRIMARY//g; s/COLLATE//g; s/Latin1_General_BIN//g; \
s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
s/nvarchar/varchar/g; s/set quoted/--set quoted/g; \
s/(image|varchar\s+\(\d+\))/BLOB/g; \
print; \
}' \
> table.sql
# load on hgwdev (kolossus disk almost full, no more small cluster mysql5's):
hgsql '' -e 'create database hg18snp130'
cd /hive/data/outside/dbSNP/130/human/schema
hgsql hg18snp130 < table.sql
cd ../data
# Avoid wasting space by excluding mappings to non-reference contigs:
foreach t (ContigInfo MapInfo)
zcat $t.gz \
| egrep -vw '(Celera|HuRef|CRA_TCAGchr7v2)' \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp130 $t placeholder stdin
end
#load of ContigInfo did not go as planned: 379 record(s), 0 row(s) skipped, 88 warning(s) loading /dev/stdin
# Checked ContigInfo visually, looks OK.
# Compare contig list between our ctgPos and reference contigs in
# ContigInfo:
ssh hgwdev-10 hgsql hg18 -N -B -e '"select contig from ctgPos;"' \
| sort > /tmp/1
hgsql hg18snp130 -NBe 'select distinct(group_label) from ContigInfo'
# --> reference, c5_H2, c6_COX, c6_QBL, c22_H2, DR53
# (HuRef, Celera, CRA_TCAGchr7v2 grepped out above)
hgsql hg18snp130 -N -B -e 'select contig_acc from ContigInfo \
where group_label in \
("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");' | sort > /tmp/2
diff /tmp/1 /tmp/2
# No diff.
# Make sure there are no orient != 0 contigs among those selected.
hgsql hg18snp130 -NBe \
'select count(*) from ContigInfo where orient != 0 and \
group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2");'
#0
# ContigLoc is huge, and we want just the reference contig mappings.
# So, based on the reference & haplo ctg_id values in ContigInfo,
# filter to get just the mappings for those contigs:
zcat ContigLoc.gz \
| awk '$3 <= 377 || $3 == 7015' \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp130 ContigLoc placeholder stdin
zcat SNP.gz \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp130 SNP placeholder stdin
zcat ContigLocusId.gz \
| perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
| hgLoadSqlTab -oldTable hg18snp130 ContigLocusId placeholder stdin
# There were some warnings (many cleared up by the perl substitution)
# but no rows were dropped. In mysql5, 'show warnings' after a manual 'load data'
# complains about missing values (OK when e.g. position is not known).
foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
echo -n "${t}:\t"
hgsql -N -B hg18snp130 -e 'select count(*) from '$t
end
#ContigInfo: 379
#ContigLoc: 19189750
#ContigLocusId: 11790054
#MapInfo: 17928700
#SNP: 17804034
#################### EXTRACT INFO FROM NCBI TABLES ####################
# Glom each SNP's function codes together and load up a new hg18Snp130 table.
# Also extract NCBI's annotations of coding SNPs' effects on translation.
# ContigLocusId includes contig_acc and asn_{from,to} but those are not comprehensive!
# If a SNP has been mapped to multiple contigs, one is randomly selected, and if
# it is not a reference contig, we miss out if we restrict by contig. We may end
# up getting a few spurious functional annotations from mappings to other assemblies
# but them's the breaks.
cd /hive/data/outside/dbSNP/130/human
hgsql hg18snp130 -NBe 'select snp_id, mrna_acc, fxn_class, \
reading_frame, allele, residue, codon from ContigLocusId' \
> ncbiFuncAnnotations.txt
cut -f 1,3 ncbiFuncAnnotations.txt \
| sort -u -k1,1 -k2,2n \
| awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \
else { if (prevId) {print prevId "\t" prevFunc;} \
prevFunc = $2 ","; }} \
{prevId = $1;} \
END {print prevId "\t" prevFunc;}' \
> ucscFunc.txt
wc -l ucscFunc.txt
#7344853 ucscFunc.txt
cat > ucscFunc.sql <<EOF
CREATE TABLE ucscFunc (
snp_id int NOT NULL ,
fxn_class varchar(255) NOT NULL ,
INDEX snp_id (snp_id)
);
EOF
hgLoadSqlTab hg18snp130 ucscFunc{,.sql,.txt}
# Extract observed allele, molType and snp class from the fasta headers (">gnl" lines)
zcat /hive/data/outside/dbSNP/130/human/rs_fasta/rs_ch*.fas.gz \
| grep '^>gnl' \
| perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
| sort -n \
> ucscGnl.txt
#407.555u 57.499s 4:32.89 170.4% 0+0k 0+0io 0pf+0w
wc -l ucscGnl.txt
#17804034 ucscGnl.txt
cut -f 1 ucscGnl.txt | uniq | wc -l
#17804034
cat > ucscGnl.sql <<EOF
CREATE TABLE ucscGnl (
snp_id int NOT NULL ,
observed varchar(255) NOT NULL,
molType varchar(255) NOT NULL,
class varchar(255) NULL ,
INDEX snp_id (snp_id)
);
EOF
hgLoadSqlTab hg18snp130 ucscGnl{,.sql,.txt}
# Add indices to tables for a big join (5 or 6 minutes):
hgsql hg18snp130 -e \
'alter table ContigLoc add index (ctg_id); \
alter table ContigInfo add index (ctg_id); \
alter table ContigLoc add index (snp_id); \
alter table SNP add index (snp_id); \
alter table MapInfo add index (snp_id);'
# Big leftie join to bring together all of the columns that we want in snp130,
# using all of the available joining info:
hgsql hg18snp130 -NBe \
'SELECT ci.contig_acc, cl.asn_from, cl.asn_to, cl.snp_id, cl.orientation, cl.allele, \
ug.observed, ug.molType, ug.class, \
s.validation_status, s.avg_heterozygosity, s.het_se, \
uf.fxn_class, cl.loc_type, mi.weight, cl.phys_pos_from \
FROM \
((((ContigLoc as cl JOIN ContigInfo as ci \
ON cl.ctg_id = ci.ctg_id and \
ci.group_label in ("reference", "c5_H2", "c6_COX", "c6_QBL", "c22_H2")) \
LEFT JOIN MapInfo as mi ON mi.snp_id = cl.snp_id and mi.assembly = ci.group_label) \
LEFT JOIN SNP as s ON s.snp_id = cl.snp_id) \
LEFT JOIN ucscGnl as ug ON ug.snp_id = cl.snp_id) \
LEFT JOIN ucscFunc as uf ON uf.snp_id = cl.snp_id;' \
> ucscNcbiSnp.ctg.bed
#on a not-so busy hgwdev: 80.735u 36.958s 8:54.76 22.0% 0+0k 0+0io 0pf+0w
#on a very busy hgwdev: 78.753u 41.304s 30:19.77 6.5% 0+0k 0+0io 0pf+0w
wc -l ucscNcbiSnp.ctg.bed
#19189750 ucscNcbiSnp.ctg.bed
liftUp ucscNcbiSnp.bed \
/cluster/data/hg18/jkStuff/liftContigs.lft warn \
ucscNcbiSnp.ctg.bed
#116.027u 7.078s 2:27.93 83.2% 0+0k 0+0io 0pf+0w
# Drum roll please... translate NCBI's encoding into UCSC's, and
# perform a bunch of checks. This is where developer involvement
# is most likely as NCBI extends the encodings used in dbSNP.
cd /hive/data/outside/dbSNP/130/human/
snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/hg18/hg18.2bit \
-1000GenomesRsIds=data/1000GenomesRsIds.txt snp130
#spaces stripped from observed:
#chr12 5963395 5963395 rs41402545
#Line 8106609 of ucscNcbiSnp.bed: Encountered something that doesn't fit observedMixedFormat: GCAACTTCA
#count of snps with weight 0 = 74828
#count of snps with weight 1 = 17254041
#count of snps with weight 2 = 389501
#count of snps with weight 3 = 1189989
#count of snps with weight 10 = 281391
#Found no errors.
#143.111u 14.313s 3:15.18 80.6% 0+0k 0+0io 0pf+0w
wc -l snp*
# 18833531 snp130.bed
# 22 snp130.sql
# 0 snp130Errors.bed
# 18 snp130ExceptionDesc.tab
# 2631563 snp130Exceptions.bed
# More SNPs but 0 errors and a bit fewer exceptions than snp129, cool!
# Make one big fasta file.
# It's a monster: 18G! Can we split by hashing rsId?
zcat rs_fasta/rs_ch*.fas.gz \
| perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
> snp130.fa
# Check for duplicates.
grep ^\>rs snp130.fa | sort > /scratch/tmp/seqHeaders
wc -l /scratch/tmp/seqHeaders
#17804034 /scratch/tmp/seqHeaders
uniq /scratch/tmp/seqHeaders | wc -l
#17804034
# Use hgLoadSeq to generate .tab output for sequence file offsets,
# and keep only the columns that we need: acc and file_offset.
# Index it and translate to snpSeq table format.
time hgLoadSeq -test placeholder snp130.fa
#107.748u 24.338s 6:58.50 31.5% 0+0k 0+0io 0pf+0w
cut -f 2,6 seq.tab > snp130Seq.tab
rm seq.tab
# Load up main track tables.
cd /hive/data/outside/dbSNP/130/human
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
hg18 snp130 -sqlTable=snp130.sql snp130.bed
#Loaded 18833531 elements of size 17
#107.246u 11.546s 10:49.23 18.2% 0+0k 0+0io 0pf+0w
time nice hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd \
hg18 snp130Exceptions -sqlTable=$HOME/kent/src/hg/lib/snp125Exceptions.sql -renameSqlTable \
snp130Exceptions.bed
#15.255u 1.257s 1:11.11 23.2% 0+0k 0+0io 0pf+0w
sed -e 's/snp125/snp130/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
> snp130ExceptionDesc.sql
hgLoadSqlTab hg18 snp130ExceptionDesc snp130ExceptionDesc.sql \
snp130ExceptionDesc.tab
# Load up sequences.
sed -e 's/snpSeq/snp130Seq/' ~/kent/src/hg/lib/snpSeq.sql \
> snp130Seq.sql
mkdir -p /gbdb/hg18/snp
ln -s /hive/data/outside/dbSNP/130/human/snp130.fa /gbdb/hg18/snp/snp130.fa
time nice hgLoadSqlTab hg18 snp130Seq snp130Seq.sql snp130Seq.tab
#0.001u 0.004s 3:41.13 0.0% 0+0k 0+0io 0pf+0w
# Put in a link where one would expect to find the track build dir...
ln -s /hive/data/outside/dbSNP/130/human /cluster/data/hg18/bed/snp130
# Look at the breakdown of exception categories:
cd /hive/data/outside/dbSNP/130/human
cut -f 5 snp130Exceptions.bed | sort | uniq -c | sort -nr
#1960737 MultipleAlignments
# 519222 ObservedMismatch
# 38444 ObservedTooLong
# 32069 SingleClassTriAllelic
# 26351 FlankMismatchGenomeShorter
# 19089 SingleClassLongerSpan
# 15441 SingleClassZeroSpan
# 6583 FlankMismatchGenomeLonger
# 4108 DuplicateObserved
# 3627 SingleClassQuadAllelic
# 3473 MixedObserved
# 1369 NamedDeletionZeroSpan
# 547 FlankMismatchGenomeEqual
# 355 NamedInsertionNonzeroSpan
# 136 ObservedContainsIupac
# 8 ObservedWrongFormat
# 4 RefAlleleMismatch
#TODO: go through those above and send some bug reports to dbSNP.
# 8/18/09: dbSNP announced a correction to some functional class
# annotations (- strand mRNA -> swapped near-gene-3 and near-gene-5).
cd /hive/data/outside/dbSNP/130/human
# This is a list of affected rs IDs, genes, old funcs and new funcs:
wget ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/database/organism_data/b130_update/b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt
wc -l b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt
#163147 b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt
# The first 19 lines are the header.
# Use the info in that file to make a series of sql update commands:
tail -n +20 b130_SNPContigLocusId_36_3_functionClass_13_15_fix.txt \
| perl -we '$fns[6]="intron"; $fns[13]="near-gene-3"; $fns[15]="near-gene-5"; \
$fns[41]="nonsense"; $fns[42]="missense"; \
$fns[53]="untranslated-3"; $fns[55]="untranslated-5"; \
while (<>) { \
($rs,undef,undef,$old,undef,$new) = split(","); \
$oldF = $fns[$old]; $newF = $fns[$new]; die if (!(defined $oldF && defined $newF)); \
print "UPDATE snp130 set func=(REPLACE(func,\"$oldF\",\"$newF\")) where name=\"rs$rs\";\n"; \
}' \
> snp130.func_13_15_fix.sql
wc -l snp130.func_13_15_fix.sql
#163128 snp130.func_13_15_fix.sql
hgsql hg18 < snp130.func_13_15_fix.sql
# The number of rows changed has to be smaller because some of those replacements
# are for annotations relative to a different assembly; we have func=unknown for
# those. E.g. rs437678.
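# (Hypothetical spot check, for illustration only -- not part of the build:
#  confirm that such a SNP still reports func=unknown after the updates.)
## hgsql hg18 -NBe 'select name, func from snp130 where name = "rs437678";'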
#######################################################################
# ORTHOLOGOUS ALLELES IN CHIMP AND MACAQUE FOR SNP130 (DONE 5/15/09 angie)
mkdir /hive/data/genomes/hg18/bed/snp130Ortho
cd /hive/data/genomes/hg18/bed/snp130Ortho
# Following Heather's lead in snp126orthos, filter SNPs to keep
# only those with class=single, length=1, chrom!~random;
# Exclude those with exceptions MultipleAlignments,
# SingleClassTriAllelic or SingleClassQuadAllelic.
# Unlike snp masking, we do not filter for weight -- don't know why.
awk '$5 ~ /^MultipleAlignments|SingleClassTriAllelic|SingleClassQuadAllelic/ {print $4;}' \
/hive/data/outside/dbSNP/130/human/snp130Exceptions.bed \
| sort -u \
> snp130ExcludeIds.txt
awk '$3-$2 == 1 && $1 !~ /_random/ && $11 == "single" {print;}' \
/hive/data/outside/dbSNP/130/human/snp130.bed \
| grep -vFwf snp130ExcludeIds.txt \
> snp130Simple.bed
#182.396u 12.388s 2:10.30 149.4% 0+0k 0+0io 0pf+0w
wc -l snp130Simple.bed
#12141377 snp130Simple.bed
# Glom all human info that we need for the final table onto the
# name, to sneak it through liftOver: rsId|chr|start|end|obs|ref|strand
awk 'BEGIN{OFS="\t";} \
{print $1, $2, $3, \
$4 "|" $1 "|" $2 "|" $3 "|" $9 "|" $8 "|" $6, \
0, $6;}' \
snp130Simple.bed > snp130ForLiftOver.bed
# Map coords to chimp using liftOver.
# I don't know why chimp took so much longer than macaque... the
# chimp .over has fewer chains and fewer bytes than the macaque .over.
mkdir run.liftOChimp
cd run.liftOChimp
mkdir split out
splitFile ../snp130ForLiftOver.bed 25000 split/chunk
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \
\{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
ssh pk
cd /hive/data/genomes/hg18/bed/snp130Ortho/run.liftOChimp
para make jobList
#Completed: 486 of 486 jobs
#CPU time in finished jobs: 76679s 1277.99m 21.30h 0.89d 0.002 y
#IO & Wait Time: 1828s 30.46m 0.51h 0.02d 0.000 y
#Average job time: 162s 2.69m 0.04h 0.00d
#Longest finished job: 486s 8.10m 0.14h 0.01d
#Submission to last job: 513s 8.55m 0.14h 0.01d
# Map coords to orangutan using liftOver.
mkdir ../run.liftOPon
cd ../run.liftOPon
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg18/bed/liftOver/hg18ToPonAbe2.over.chain.gz \
\{check out exists out/ponAbe2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 486 of 486 jobs
#CPU time in finished jobs: 165378s 2756.31m 45.94h 1.91d 0.005 y
#IO & Wait Time: 2614s 43.56m 0.73h 0.03d 0.000 y
#Average job time: 346s 5.76m 0.10h 0.00d
#Longest finished job: 1017s 16.95m 0.28h 0.01d
#Submission to last job: 1051s 17.52m 0.29h 0.01d
# Map coords to macaque using liftOver.
mkdir ../run.liftOMac
cd ../run.liftOMac
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \
\{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 486 of 486 jobs
#CPU time in finished jobs: 4068s 67.80m 1.13h 0.05d 0.000 y
#IO & Wait Time: 1944s 32.40m 0.54h 0.02d 0.000 y
#Average job time: 12s 0.21m 0.00h 0.00d
#Longest finished job: 38s 0.63m 0.01h 0.00d
#Submission to last job: 126s 2.10m 0.04h 0.00d
cd /hive/data/genomes/hg18/bed/snp130Ortho
# Concatenate the chimp results, sorting by chimp pos in order to
# efficiently access 2bit sequence in getOrthoSeq. The output of
# that is then sorted by the glommed human info field, so that we
# can use join to combine chimp and macaque results in the next step.
# Ditto for macaque and orangutan. Each command pipe takes ~5 minutes:
sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/panTro2/panTro2.2bit \
| sort > panTro2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOPon/out/ponAbe2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/ponAbe2/ponAbe2.2bit \
| sort > ponAbe2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /cluster/data/rheMac2/rheMac2.2bit \
| sort > rheMac2.orthoGlom.txt
wc -l panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt rheMac2.orthoGlom.txt
# 11318466 panTro2.orthoGlom.txt
# 10976821 ponAbe2.orthoGlom.txt
# 9702063 rheMac2.orthoGlom.txt
# Use the glommed name field as a key to join up chimp, orangutan and
# macaque allele data. Include glommed name from both files because if only
# file 2 has a line for the key in 2.1, then 1.1 is empty. Then plop
# in the orthoGlom fields from each file, which are in the same order
# as the chimp, orangutan and macaque columns of snp130OrthoPt2Pa2Rm2.
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e '?' \
panTro2.orthoGlom.txt ponAbe2.orthoGlom.txt \
| awk '{if ($1 != "?") { print $1, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; } \
else { print $2, $3,$4,$5,$6,$7,$8,$9,$10,$11,$12; }}' \
> tmp.txt
join -o '1.1 2.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 2.3 2.4 2.5 2.6' \
-a 1 -a 2 -e '?' \
tmp.txt rheMac2.orthoGlom.txt \
| perl -wpe 'chomp; \
($glom12, $glom3, $o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
$o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) = split; \
$glomKey = ($glom12 ne "?") ? $glom12 : $glom3; \
($rsId, $hChr, $hStart, $hEnd, $hObs, $hAl, $hStrand) = \
split(/\|/, $glomKey); \
$o1Start =~ s/^\?$/0/; $o2Start =~ s/^\?$/0/; $o3Start =~ s/^\?$/0/; \
$o1End =~ s/^\?$/0/; $o2End =~ s/^\?$/0/; $o3End =~ s/^\?$/0/; \
print join("\t", $hChr, $hStart, $hEnd, $rsId, $hObs, $hAl, $hStrand, \
$o1Chr, $o1Start, $o1End, $o1Al, $o1Strand, \
$o2Chr, $o2Start, $o2End, $o2Al, $o2Strand, \
$o3Chr, $o3Start, $o3End, $o3Al, $o3Strand) . "\n"; \
s/^.*$//;' \
| sort -k1,1 -k2n,2n > snp130OrthoPt2Pa2Rm2.bed
#300.357u 31.419s 4:33.00 121.5% 0+0k 0+0io 0pf+0w
wc -l snp130OrthoPt2Pa2Rm2.bed
#11797184 snp130OrthoPt2Pa2Rm2.bed
cd /hive/data/genomes/hg18/bed/snp130Ortho
hgLoadBed -tab -onServer -tmpDir=/data/tmp -renameSqlTable \
-sqlTable=$HOME/kent/src/hg/lib/snpOrthoPanPonRhe.sql \
hg18 snp130OrthoPt2Pa2Rm2 snp130OrthoPt2Pa2Rm2.bed
#Loaded 11797184 elements of size 22
#83.624u 9.627s 10:19.26 15.0% 0+0k 0+0io 0pf+0w
# Cleanup fileserver:
cd /hive/data/genomes/hg18/bed/snp130Ortho
nice gzip snp130Simple.bed snp130ExcludeIds.txt snp130ForLiftOver.bed
rm -r run*/split tmp.txt *.orthoGlom.txt
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
############################################################################
# Nuclear Lamina (2008-06-16 mikep)
# "Domain organization of human chromosomes revealed by mapping of nuclear lamina interactions"
# We received these files from authors of Guelen et al. Nature 2008
# doi:10.1038/nature06947
ssh hgwdev
mkdir /cluster/data/hg18/bed/nuclearLamina
cd /cluster/data/hg18/bed/nuclearLamina/
mv /var/ftp/encode/LADs_080513.bed.bz2 .
mv /var/ftp/encode/LaminB1_080513.wig.bz2 .
mv /var/ftp/encode/LaminB1_LAD.md5sum .
# to check the md5sum we need to unzip it to its original name, done on the NFS host for this directory
df -h .
# Filesystem Size Used Avail Use% Mounted on
# kkstore02-10:/export/cluster/store11
# 1.8T 1.7T 94G 95% /cluster/store11
ssh kkstore02-10
cd /cluster/data/hg18/bed/nuclearLamina/
# check they are not too big to unzip, look ok
ll -h L*bz2
# -rw-r--r-- 1 mikep protein 13K Jun 10 00:58 LADs_080513.bed.bz2
# -rw-r--r-- 1 mikep protein 16M Jun 10 01:02 LaminB1_080513.wig.bz2
bunzip2 -dk L*bz2
md5sum -c LaminB1_LAD.md5sum
# all ok
# LADs_080513.bed: OK
# LaminB1_080513.wig: OK
# Description files were received via email and copied directly to this dir.
# Needed to convert from mac to unix due to ^M chars:
mac2unix L*.html
# Checked files looked OK, needed to remove HTML tags such as: DOCTYPE <HTML> <BODY> </BODY> </HTML>
vi L*.html
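# (For reference, a hypothetical non-interactive equivalent of that manual
#  edit, stripping only the tags listed above -- not what was actually run:)
## sed -i -e '/<!DOCTYPE/d' \
##   -e 's#<HTML>##g; s#</HTML>##g; s#<BODY>##g; s#</BODY>##g' L*.html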
# Now find the min/max/avg range of values from the wiggle file
egrep "^[0-9]" LaminB1_080513.wig |ave -col=2 stdin
# Q1 -0.509000
# median -0.000000
# Q3 0.514000
# average -0.041192
# min -6.602000
# max 5.678000
# count 2909178
# total -119833.701411
# standard deviation 1.037038
# Now load the tracks on hgwdev
ssh hgwdev
cd /cluster/data/hg18/bed/nuclearLamina/
# First two lines are custom track header
tail +3 LADs_080513.bed | hgLoadBed hg18 laminB1Lads stdin
# wigEncode the .wig and .wib files from the supplied wig ascii file, and symlink the .wib file from /gbdb
wigEncode LaminB1_080513.wig laminB1.wig laminB1.wib
ln -s /cluster/data/hg18/bed/nuclearLamina/laminB1.wib /gbdb/hg18/wib/
# Converted LaminB1_080513.wig, upper limit 5.68, lower limit -6.60
hgLoadWiggle hg18 laminB1 laminB1.wig
rm bed.tab wiggle.tab
## Create the track definitions in hg18, copy them over (these are my paths), and do make
## Make entries for: bed = "track laminB1Lads" wiggle = "track laminB1"
ssh hgwdev
# vi /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/trackDb.ra
# cp /cluster/data/hg18/bed/nuclearLamina/laminB1.html /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/
# cp /cluster/data/hg18/bed/nuclearLamina/laminB1Lads.html /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/
# cp /cluster/data/hg18/bed/nuclearLamina/laminB1Super.html /cluster/home/mikep/kent/src/hg/makeDb/trackDb/human/hg18/
# cp /cluster/data/hg18/bed/nuclearLamina/laminB1Super.gif /cluster/home/mikep/browser/images/
# cd /cluster/home/mikep/kent/src/hg/makeDb/trackDb
# make
# Add wig ascii track (+readme) to goldenPath so it can be downloaded
mkdir /data/apache/htdocs/goldenPath/hg18/nuclearLamina
cp /cluster/data/hg18/bed/nuclearLamina/LaminB1_080513.wig.bz2 /data/apache/htdocs/goldenPath/hg18/nuclearLamina/hg18.laminB1.txt.bz2
cp /cluster/data/hg18/bed/nuclearLamina/goldenPath.README.txt /data/apache/htdocs/goldenPath/hg18/nuclearLamina/README.txt
# Add both tracks to all.joiner under section: tablesIgnored $hg
############################################################################
##### Positively Selected Genes (Pos Sel Genes) (braney - DONE - 2008-07-07)
# get SQL data (mammalPsg.sql) from Adam Siepel
# and Tomas Vinar (acs4@cornell.edu)
hgsql hg18 < mammalPsg.sql
echo "alter table mammalPsg add index (chrom(7));" | hgsql hg18
####################################################################
# UPDATE UNIGENE/SAGE TRACK (DONE - 2008-08-09 Fan)
# Create the uniGene alignments
# Download of the latest UniGene version is now automated by a
# cron job -- see /cluster/home/angie/crontab ,
# /cluster/home/angie/unigeneVers/unigene.csh .
# If hgwdev gets rebooted, that needs to be restarted... maybe there's
# a more stable place to set up that cron job.
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed
cd uniGene
mkdir old
mv * old
set Version = 214
zcat /cluster/store7/uniGene/uniGene.$Version/Hs.seq.uniq.gz|\
sed -e "s#>.*/ug=#>#; s# /len.*##;" > Hs.seq.uniq.simpleHeader.fa
ssh pk
set Version = 214
mv /san/sanvol1/scratch/hg18/uniGene /san/sanvol1/scratch/hg18/uniGene.old
mkdir /san/sanvol1/scratch/hg18/uniGene/
cd /san/sanvol1/scratch/hg18/uniGene/
cp -p /cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa .
ls -1 /san/sanvol1/scratch/hg18/nib/*.nib > genome.lst
ls -1S \
/cluster/store11/gs.19/build36/bed/uniGene/Hs.seq.uniq.simpleHeader.fa \
> uniGene.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=95 -ooc=/san/sanvol1/scratch/hg18/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 genome.lst uniGene.lst template.sub para.spec
para create para.spec
mkdir psl
para try
para check
para push
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 59778s 996.30m 16.60h 0.69d 0.002 y
# IO & Wait Time: 208s 3.47m 0.06h 0.00d 0.000 y
# Average job time: 1224s 20.40m 0.34h 0.01d
# Longest finished job: 4549s 75.82m 1.26h 0.05d
# Submission to last job: 4653s 77.55m 1.29h 0.05d
# Estimated complete: 0s 0.00m 0.00h 0.00d
pslSort dirs raw.psl tmp psl >& pslSort.log
cat raw.psl|\
pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
stdin hg18.uniGene.pslReps.psl /dev/null
# Processed 553470 alignments
gzip raw.psl
gzip Hs.seq.uniq.simpleHeader.fa
ssh hgwdev
cd /cluster/store11/gs.19/build36/bed/uniGene
cp -p /san/sanvol1/scratch/hg18/uniGene/hg18.uniGene.pslReps.psl .
hgLoadPsl -table=uniGene_3 hg18 hg18.uniGene.pslReps.psl
# load the sequence with -replace option
hgLoadSeq -replace hg18 /gbdb/hg18/uniGene/Hs.seq.uniq.simpleHeader.fa
#############################################################################
# BLASTZ/CHAIN/NET dipOrd1 (DONE - 2008-10-22 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzDipOrd1.2008-10-21
cd /hive/data/genomes/hg18/bed/blastzDipOrd1.2008-10-21
cat << '_EOF_' > DEF
# Human vs. Kangaroo rat
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Kangaroo rat
SEQ2_DIR=/scratch/data/dipOrd1/dipOrd1.2bit
SEQ2_LEN=/scratch/data/dipOrd1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzDipOrd1.2008-10-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 881m33.829s
cat fb.hg18.chainDipOrd1Link.txt
# 786126212 bases of 2881515245 (27.282%) in intersection
# slight difficulty with the makeMd5sum.csh script, fixed in the source
# and completed the copy of the liftOver file, then continuing,
# with -syntenicNet:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet -continue=cleanup -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1 &
# real 86m15.646s
cd /cluster/data/hg18/bed/blastzDipOrd1.2008-10-21
time nice -n +19 doRecipBest.pl hg18 dipOrd1 > rbest.log 2>&1 &
# real 327m0.719s
#############################################################################
# BLASTZ/CHAIN/NET pteVam1 (DONE - 2008-10-21,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzPteVam1.2008-10-21
cd /hive/data/genomes/hg18/bed/blastzPteVam1.2008-10-21
cat << '_EOF_' > DEF
# Human vs. Megabat
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Megabat
SEQ2_DIR=/scratch/data/pteVam1/pteVam1.2bit
SEQ2_LEN=/scratch/data/pteVam1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzPteVam1.2008-10-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 595m14.168s
# some crashed jobs, finish the batch on pk manually, then, continuing:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
# real 151m54.924s
cat fb.hg18.chainPteVam1Link.txt
# 1311133709 bases of 2881515245 (45.502%) in intersection
cd /cluster/data/hg18/bed/blastzPteVam1.2008-10-21
time nice -n +19 doRecipBest.pl hg18 pteVam1 > rbest.log 2>&1 &
# finish manually due to problems:
# real 286m25.330s
doRecipBest.pl -continue=download hg18 pteVam1 > rbestDownload.log 2>&1
#############################################################################
# BLASTZ/CHAIN/NET turTru1 (DONE - 2008-10-22 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzTurTru1.2008-10-21
cd /hive/data/genomes/hg18/bed/blastzTurTru1.2008-10-21
cat << '_EOF_' > DEF
# Human vs. Dolphin
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Dolphin
SEQ2_DIR=/scratch/data/turTru1/turTru1.2bit
SEQ2_LEN=/scratch/data/turTru1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzTurTru1.2008-10-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 702m54.490s
cat fb.hg18.chainTurTru1Link.txt
# 1398587431 bases of 2881515245 (48.537%) in intersection
# slight difficulty with the makeMd5sum.csh script, fixed in the source
# and completed the copy of the liftOver file, then continuing,
# with -syntenicNet:
cd /cluster/data/hg18/bed/blastzTurTru1.2008-10-21
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-syntenicNet -continue=cleanup -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1 &
# real 74m4.276s
time nice -n +19 doRecipBest.pl hg18 turTru1 > rbest.log 2>&1 &
# real 275m19.714s
#############################################################################
# BLASTZ/CHAIN/NET tarSyr1 (DONE - 2008-10-21,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzTarSyr1.2008-10-21
cd /hive/data/genomes/hg18/bed/blastzTarSyr1.2008-10-21
cat << '_EOF_' > DEF
# Human vs. Tarsier
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Tarsier
SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit
SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzTarSyr1.2008-10-21
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 1518m42.776s
# recovered the batch on pk, then continuing:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -syntenicNet -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
# real 526m45.582s
cat fb.hg18.chainTarSyr1Link.txt
# 1383104827 bases of 2881515245 (47.999%) in intersection
cd /cluster/data/hg18/bed/blastzTarSyr1.2008-10-21
time nice -n +19 doRecipBest.pl hg18 tarSyr1 > rbest.log 2>&1 &
# failed, finishing manually
# real 155m48.855s
doRecipBest.pl -continue=download hg18 tarSyr1 > rbest.log 2>&1
#############################################################################
# BLASTZ/CHAIN/NET proCap1 (DONE - 2008-10-22,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzProCap1.2008-10-22
cd /hive/data/genomes/hg18/bed/blastzProCap1.2008-10-22
cat << '_EOF_' > DEF
# Human vs. Rock Hyrax
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Rock Hyrax
SEQ2_DIR=/scratch/data/proCap1/proCap1.2bit
SEQ2_LEN=/scratch/data/proCap1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzProCap1.2008-10-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 1654m44.904s
# finish lastz batch manually after script difficulties, then continuing:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -syntenicNet -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
# real 227m41.045s
cat fb.hg18.chainProCap1Link.txt
# 891406629 bases of 2881515245 (30.935%) in intersection
cd /cluster/data/hg18/bed/blastzProCap1.2008-10-22
time nice -n +19 doRecipBest.pl hg18 proCap1 > rbest.log 2>&1 &
# real 232m9.789s
# failed
# running the last couple of commands to finish this off
# real 561m51.171s
doRecipBest.pl -continue=download hg18 proCap1 > rbestDownload.log 2>&1
#############################################################################
# BLASTZ/CHAIN/NET choHof1 (DONE - 2008-10-22,28 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzChoHof1.2008-10-22
cd /hive/data/genomes/hg18/bed/blastzChoHof1.2008-10-22
cat << '_EOF_' > DEF
# Human vs. Sloth
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Sloth
SEQ2_DIR=/scratch/data/choHof1/choHof1.2bit
SEQ2_LEN=/scratch/data/choHof1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzChoHof1.2008-10-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 1649m6.606s
# finish lastz batch manually after script difficulties, then continuing:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -syntenicNet -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
# real 276m1.827s
cat fb.hg18.chainChoHof1Link.txt
# 993065598 bases of 2881515245 (34.463%) in intersection
    cd /cluster/data/hg18/bed/blastzChoHof1.2008-10-22
time nice -n +19 doRecipBest.pl hg18 choHof1 > rbest.log 2>&1 &
# real 900m50.222s
#############################################################################
# BLASTZ/CHAIN/NET dasNov2 (DONE - 2008-10-22,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22
cd /hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22
cat << '_EOF_' > DEF
# Human vs. Armadillo
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Armadillo
SEQ2_DIR=/scratch/data/dasNov2/dasNov2.2bit
SEQ2_LEN=/scratch/data/dasNov2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 1664m4.331s
# finish this batch manually after some code troubles, then:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -syntenicNet -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
# real 230m4.513s
    # something broke during chainSplit, try that manually
nice -n +19 chainSplit \
/hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22/axtChain/chain \
/hive/data/genomes/hg18/bed/blastzDasNov2.2008-10-22/axtChain/hg18.dasNov2.all.chain.gz
# no problem with that, continuing:
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=net -syntenicNet -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1 &
# real 206m54.072s
cd /cluster/data/hg18/bed/blastzDasNov2.2008-10-22
time nice -n +19 doRecipBest.pl hg18 dasNov2 > rbest.log 2>&1 &
# failed, finishing manually:
# real 680m1.703s
# the following takes an instant:
doRecipBest.pl -continue=download hg18 dasNov2 \
> rbestDownload.log 2>&1 &
#############################################################################
# BLASTZ/CHAIN/NET loxAfr2 (DONE - 2008-10-22,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzLoxAfr2.2008-10-22
cd /hive/data/genomes/hg18/bed/blastzLoxAfr2.2008-10-22
cat << '_EOF_' > DEF
# Human vs. Elephant
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Elephant
SEQ2_DIR=/scratch/data/loxAfr2/loxAfr2.2bit
SEQ2_LEN=/scratch/data/loxAfr2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzLoxAfr2.2008-10-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# 1580m26.439s
    # problems with the batch due to scripting errors, finished the batch
    # manually
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-continue=cat -syntenicNet -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 &
# real 264m46.272s
cat fb.hg18.chainLoxAfr2Link.txt
# 1014404239 bases of 2881515245 (35.204%) in intersection
    cd /cluster/data/hg18/bed/blastzLoxAfr2.2008-10-22
time nice -n +19 doRecipBest.pl hg18 loxAfr2 > rbest.log 2>&1 &
# real 622m17.655s
#############################################################################
# BLASTZ/CHAIN/NET vicPac1 (DONE - 2008-10-28,29 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzVicPac1.2008-10-28
cd /hive/data/genomes/hg18/bed/blastzVicPac1.2008-10-28
cat << '_EOF_' > DEF
# Human vs. Alpaca
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Alpaca
SEQ2_DIR=/scratch/data/vicPac1/vicPac1.2bit
SEQ2_LEN=/scratch/data/vicPac1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzVicPac1.2008-10-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm -syntenicNet \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 488m36.288s
cat fb.hg18.chainVicPac1Link.txt
# 1139088501 bases of 2881515245 (39.531%) in intersection
cd /cluster/data/hg18/bed/blastzVicPac1.2008-10-28
time nice -n +19 doRecipBest.pl hg18 vicPac1 > rbest.log 2>&1 &
# real 380m17.963s
#############################################################################
# BLASTZ/CHAIN/NET Gorilla gorGor1 (DONE - 2008-11-04,05 - Hiram)
screen # use screen to control this multi-day job
mkdir /hive/data/genomes/hg18/bed/blastzGorGor1.2008-11-04
cd /hive/data/genomes/hg18/bed/blastzGorGor1.2008-11-04
cat << '_EOF_' > DEF
# Human vs. Gorilla
BLASTZ_M=50
BLASTZ=lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
# QUERY: Gorilla
SEQ2_DIR=/scratch/data/gorGor1/gorGor1.2bit
SEQ2_LEN=/scratch/data/gorGor1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=284
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/blastzGorGor1.2008-11-04
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-bigClusterHub=swarm -syntenicNet \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 644m45.816s
cat fb.hg18.chainGorGor1Link.txt
# 1778801556 bases of 2881515245 (61.731%) in intersection
cd /cluster/data/hg18/bed/blastzGorGor1.2008-11-04
time nice -n +19 doRecipBest.pl hg18 gorGor1 > rbest.log 2>&1 &
# real 171m42.585s
# failed, need to finish manually
cd /hive/data/genomes/hg18/bed/blastz.gorGor1/axtChain
# alter the doRecipBest.csh script to finiRecipBest.csh and run:
time ./finiRecipBest.csh > finiRecipBest.log 2>&1
# real 1177m37.534s
# then, continuing:
doRecipBest.pl -continue=download hg18 gorGor1
#############################################################################
# BLASTZ/CHAIN/NET ochPri2 (DONE braney 2008-07-30)
ssh kkstore02
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.ochPri2.2008-07-29
cd /cluster/data/hg18/bed/blastz.ochPri2.2008-07-29
cat << _EOF_ > DEF
# Human vs. Pika
BLASTZ_M=50
BLASTZ=/cluster/home/braney/bin/x86_64/lastz
# TARGET: Human Hg18
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Pika
SEQ2_DIR=/san/sanvol1/scratch/ochPri2/ochPri2.2bit
SEQ2_LEN=/san/sanvol1/scratch/ochPri2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.ochPri2.2008-07-29
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/ochPri2/blastz.hg18 > do.log 2>&1 &
# Completed: 654120 of 654120 jobs
# CPU time in finished jobs: 14082913s 234715.22m 3911.92h 163.00d 0.447 y
# IO & Wait Time: 2257180s 37619.67m 626.99h 26.12d 0.072 y
# Average job time: 25s 0.42m 0.01h 0.00d
# Longest finished job: 292s 4.87m 0.08h 0.00d
# Submission to last job: 59396s 989.93m 16.50h 0.69d
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/ochPri2/blastz.hg18 > do2.log 2>&1 &
# memk cluster couldn't find san for chainRun, ran on pk
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=chainMerge \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/ochPri2/blastz.hg18 > do3.log 2>&1 &
ln -s `pwd`/blastz.ochPri2.2008-07-29 /cluster/data/hg18/bed/blastz.ochPri2
featureBits hg18 chainOchPri2Link
# 806073890 bases of 2881515245 (27.974%) in intersection
cd /cluster/data/hg18/bed/blastz.ochPri2.2008-07-29
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 ochPri2 > rbest.log 2>&1 &
#############################################################################
# BLASTZ/CHAIN/NET myoLuc1 (DONE braney 2008-08-02)
ssh kkstore02
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31
cd /cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31
cat << _EOF_ > DEF
# Human vs. Microbat
BLASTZ_M=50
BLASTZ_T=2
BLASTZ=/cluster/home/braney/bin/x86_64/lastz
# TARGET: Human Hg18 (whole chroms)
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=0
# QUERY: Microbat
SEQ2_DIR=/san/sanvol1/scratch/myoLuc1/myoLuc1.2bit
SEQ2_LEN=/san/sanvol1/scratch/myoLuc1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/myoLuc1/blastz.hg18 > do.log 2>&1 &
# Completed: 98879 of 99144 jobs
# Crashed: 56 jobs
# Other count: 209 jobs
# CPU time in finished jobs: 2327505s 38791.75m 646.53h 26.94d 0.074 y
# IO & Wait Time: 340164s 5669.40m 94.49h 3.94d 0.011 y
# Average job time: 27s 0.45m 0.01h 0.00d
# Longest finished job: 1034s 17.23m 0.29h 0.01d
# Submission to last job: 56968s 949.47m 15.82h 0.66d
# do remaining jobs on kolossus
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/myoLuc1/blastz.hg18 > do2.log 2>&1 &
ln -s `pwd`/blastz.myoLuc1.2008-07-31 /cluster/data/hg18/bed/blastz.myoLuc1
featureBits hg18 chainMyoLuc1Link
# 952177725 bases of 2881515245 (33.044%) in intersection
cd /cluster/data/hg18/bed/blastz.myoLuc1.2008-07-31
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 myoLuc1 > rbest.log 2>&1 &
#############################################################################
# BLASTZ/CHAIN/NET loxAfr2 (not done)
ssh kkstore02
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01
cd /cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01
cat << _EOF_ > DEF
# Human vs. Elephant
BLASTZ_M=50
BLASTZ=/cluster/home/braney/bin/x86_64/lastz
# TARGET: Human Hg18 (whole chroms)
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=0
# QUERY: Elephant
SEQ2_DIR=/san/sanvol1/scratch/loxAfr2/loxAfr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/loxAfr2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/loxAfr2/blastz.hg18 > do.log 2>&1 &
# had to run some jobs on memk
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/loxAfr2/blastz.hg18 > do2.log 2>&1 &
# netChainSubset barfed with memory error (skipped over chains)
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=load \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/loxAfr2/blastz.hg18 > do3.log 2>&1 &
ln -s `pwd`/blastz.loxAfr2.2008-08-01 /cluster/data/hg18/bed/blastz.loxAfr2
featureBits hg18 chainLoxAfr2Link
# 1025499138 bases of 2881515245 (35.589%) in intersection
cd /cluster/data/hg18/bed/blastz.loxAfr2.2008-08-01
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 loxAfr2 > rbest.log 2>&1 &
#############################################################################
# BUILD snpArrayIllumina1M SUB-TRACK (DONE 8/4/08, Fan)
# Received raw data file Illumina_Human1M-duoV3_SNPlist_Strand_Location.csv
# from Illumina, Luana Galver (lgalver at illumina.com).
mkdir -p /cluster/store11/gs.19/build36/bed/snp/illumina/1M
cd /cluster/store11/gs.19/build36/bed/snp/illumina/1M
cat Illumina_Human1M-duoV3_SNPlist_Strand_Location.csv |\
sed -e 's/,/\t/g' >1M.tab
    hgsql hg18 < ~/src/hg/lib/snpArrayIllumina1MRaw.sql
hgsql hg18 -e 'load data local infile "1M.tab" into table snpArrayIllumina1MRaw'
~/src/hg/snp/snpLoad/illuminaLookup1M hg18 snpArrayIllumina1MRaw snp129
# The illuminaLookup1M program generates two files:
#
# illuminaLookup.out contains all Illumina 1M probes found in snp129
# illuminaLookup.err contains all Illumina 1M probes not found in snp129
mv illuminaLookup.out illuminaLookup1Ma.out
cut -f 1 illuminaLookup.err >j.1
cat j.1 |sed -e 's/chrMt/chrM/' |\
sed -e 's/XY/X/' >j.chr
cut -f 2-5 illuminaLookup.err >j.2
cut -f 6 illuminaLookup.err >j.3
cat j.3 |sed -e 's/F/+/' |sed -e 's/R/-/' >j.strand
cut -f 7 illuminaLookup.err |sed -e "s/\[//" |sed -e "s/\]//" >j.observed
paste j.chr j.2 j.strand j.observed >illuminaLookup1Mb.out
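    # Equivalently, the cut/sed/paste steps above amount to a single awk pass
    # (an illustrative sketch, assuming -- as the commands above imply -- that
    # column 1 is the chromosome with chrMt/XY spellings, columns 2-5 pass
    # through unchanged, column 6 is F/R strand, and column 7 holds the
    # bracketed observed alleles):
    #   awk 'BEGIN{FS=OFS="\t"} { chrom=$1; sub("chrMt","chrM",chrom);
    #        sub("XY","X",chrom); strand=($6=="F") ? "+" : "-";
    #        obs=$7; gsub(/[][]/,"",obs);
    #        print chrom, $2, $3, $4, $5, strand, obs }' \
    #       illuminaLookup.err > illuminaLookup1Mb.out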
# combine two parts
cat illuminaLookup1Ma.out illuminaLookup1Mb.out >snpArrayIllumina1M.tab
# load the table
hgLoadBed hg18 snpArrayIllumina1M snpArrayIllumina1M.tab -tab -sqlTable=snpArrayIllumina1M.sql
#############################################################################
# BLASTZ/CHAIN/NET micMur1 (DONE braney 2008-08-04 )
ssh kkstore02
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.micMur1.2008-08-03
cd /cluster/data/hg18/bed/blastz.micMur1.2008-08-03
cat << _EOF_ > DEF
# Human vs. Mouse lemur
BLASTZ_M=50
BLASTZ_T=2
BLASTZ=/cluster/home/braney/bin/x86_64/lastz
# TARGET: Human Hg18 (whole chroms)
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=0
# QUERY: Mouse lemur
SEQ2_DIR=/san/sanvol1/scratch/micMur1/micMur1.2bit
SEQ2_LEN=/san/sanvol1/scratch/micMur1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.micMur1.2008-08-03
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/micMur1/blastz.hg18 > do.log 2>&1 &
# did remaining jobs on memk
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/micMur1/blastz.hg18 > do2.log 2>&1 &
ssh hgwdev
cd /cluster/data/hg18/bed
ln -s `pwd`/blastz.micMur1.2008-08-03 /cluster/data/hg18/bed/blastz.micMur1
featureBits hg18 chainMicMur1Link
# 1338330504 bases of 2881515245 (46.445%) in intersection
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.micMur1.2008-08-03
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 micMur1 > rbest.log 2>&1 &
#############################################################################
# BLASTZ/CHAIN/NET speTri1 (DONE braney 2008-08-05)
ssh kkstore02
screen # use screen to control this multi-day job
mkdir /cluster/data/hg18/bed/blastz.speTri1.2008-08-04
cd /cluster/data/hg18/bed/blastz.speTri1.2008-08-04
cat << _EOF_ > DEF
# Human vs. Squirrel
BLASTZ_M=50
BLASTZ=/cluster/home/braney/bin/x86_64/lastz
# TARGET: Human Hg18 (whole chroms)
SEQ1_DIR=/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=200000000
SEQ1_LAP=0
# QUERY: Squirrel
SEQ2_DIR=/san/sanvol1/scratch/speTri1/speTri1.2bit
SEQ2_LEN=/san/sanvol1/scratch/speTri1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastz.speTri1.2008-08-04
TMPDIR=/scratch/tmp
_EOF_
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/speTri1/blastz.hg18 > do.log 2>&1 &
# did crashed jobs on memk
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=cat \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/speTri1/blastz.hg18 > do2.log 2>&1 &
# had to run netChains.csh by hand due to PATH problem
nice doBlastzChainNet.pl `pwd`/DEF \
-verbose=2 -bigClusterHub=pk -syntenicNet -continue=load \
-chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /cluster/bluearc/speTri1/blastz.hg18 > do3.log 2>&1 &
ssh hgwdev
cd /cluster/data/hg18/bed
ln -s `pwd`/blastz.speTri1.2008-08-04 /cluster/data/hg18/bed/blastz.speTri1
featureBits hg18 chainSpeTri1Link
# 1032377454 bases of 2881515245 (35.828%) in intersection
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.speTri1.2008-08-04
nice -n +19 /cluster/bin/scripts/doRecipBest.pl hg18 speTri1 > rbest.log 2>&1 &
#######################################################
## 44-way multiz (braney working....
mkdir /cluster/data/hg18/bed/multiz44way
cd /cluster/data/hg18/bed/multiz44way
cp /cluster/data/mm9/bed/multiz30way/mm9.guess.30way.nh .
# get mammal tree from Michele Clamp (clamp.nh)
# that I re-rooted
#######################################################
# UW nucleosome occupancy predictions (2008-08-13 markd)
# update due to chr3 being truncated (2009-05-12 markd)
# contact William Stafford Noble <noble@gs.washington.edu>
# obtain data:
mkdir -p /cluster/data/hg18/bed/uwNucOcc
cd /cluster/data/hg18/bed/uwNucOcc
    wget http://USER:PASS@noble.gs.washington.edu/~noble/proj/dennis/results/2008-08-11/ucsc.tgz
mkdir wig
cd wig
tar -zxf ../ucsc.tgz
cd ..
rm ucsc.tgz
# encode and load wiggles
ssh kkstore02
    mkdir /cluster/data/hg18/bed/uwNucOcc/wib
    cd /cluster/data/hg18/bed/uwNucOcc/wib
zcat ../wig/a375/a375.chr*.wig.gz|wigEncode stdin uwNucOccA375.wig uwNucOccA375.wib
# Converted stdin, upper limit 9.88, lower limit -5.19
zcat ../wig/dennis/dennis.chr*.wig.gz|wigEncode stdin uwNucOccDennis.wig uwNucOccDennis.wib
# Converted stdin, upper limit 8.26, lower limit -9.68
zcat ../wig/mec/mec.chr*.wig.gz|wigEncode stdin uwNucOccMec.wig uwNucOccMec.wib
# Converted stdin, upper limit 5.05, lower limit -9.86
# link-n-load
ssh hgwdev
    cd /cluster/data/hg18/bed/uwNucOcc/wib
ln -s /cluster/data/hg18/bed/uwNucOcc/wib/uwNucOccA375.wib /gbdb/hg18/wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccA375 uwNucOccA375.wig
ln -s /cluster/data/hg18/bed/uwNucOcc/wib/uwNucOccDennis.wib /gbdb/hg18/wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccDennis uwNucOccDennis.wig
ln -s /cluster/data/hg18/bed/uwNucOcc/wib/uwNucOccMec.wib /gbdb/hg18/wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccMec uwNucOccMec.wig
rm wiggle.tab
# noble lab supplied update due to chr3 being truncated (2009-05-12 markd)
cd /cluster/data/hg18/bed/uwNucOcc
mkdir bad
mv wig/*/*.chr3.hg18.wig.gz bad/
mv wib bad/
wget http://USER:PASS@noble.gs.washington.edu/~wnoble/proj/dennis/results/2008-08-11/a375/a375.chr3.hg18.wig.gz
wget http://USER:PASS@noble.gs.washington.edu/~wnoble/proj/dennis/results/2008-08-11/mec/mec.chr3.hg18.wig.gz
wget http://USER:PASS@noble.gs.washington.edu/~wnoble/proj/dennis/results/2008-08-11/dennis/dennis.chr3.hg18.wig.gz
mv dennis.chr3.hg18.wig.gz wig/dennis/
mv mec.chr3.hg18.wig.gz wig/mec/
mv a375.chr3.hg18.wig.gz wig/a375/
    # the old wib dir was moved to bad/ above, so recreate it
    mkdir /cluster/data/hg18/bed/uwNucOcc/wib
    cd /cluster/data/hg18/bed/uwNucOcc/wib
    zcat ../wig/a375/a375.chr*.wig.gz|wigEncode stdin uwNucOccA375.wig uwNucOccA375.wib
    zcat ../wig/dennis/dennis.chr*.wig.gz|wigEncode stdin uwNucOccDennis.wig uwNucOccDennis.wib
    zcat ../wig/mec/mec.chr*.wig.gz|wigEncode stdin uwNucOccMec.wig uwNucOccMec.wib
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccA375 uwNucOccA375.wig
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccDennis uwNucOccDennis.wig
    hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uwNucOccMec uwNucOccMec.wig
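    # Illustrative sanity check (not part of the recorded load) that the
    # refreshed chr3 data is now in the table; assumes the standard wiggle
    # table schema produced by hgLoadWiggle, which includes a chrom column:
    #   hgsql hg18 -e 'select count(*) from uwNucOccA375 where chrom="chr3"'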
#########################################################################
# BLASTZ/CHAIN/NET oryLat2 (DONE - 2008-08-19,25 - Hiram)
ssh kkstore02
screen # use a screen to manage this longish running job
mkdir /cluster/data/hg18/bed/blastzOryLat2.2008-08-19
cd /cluster/data/hg18/bed/blastzOryLat2.2008-08-19
cat << '_EOF_' > DEF
# Human vs. Medaka
BLASTZ=/cluster/bin/penn/x86_64/lastz
# typical parameters for a genome that is distant from human
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Human hg18, randoms complete, as they are, no contig confusion
SEQ1_DIR=/scratch/data/hg18/hg18.2bit
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp)
SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzOryLat2.2008-08-19
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl `pwd`/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 > do.log 2>&1 &
cat fb.hg18.chainOryLat2Link.txt
# 52713428 bases of 2881515245 (1.829%) in intersection
cd /cluster/data/hg18/bed
ln -s blastzOryLat2.2008-08-19 blastz.oryLat2
# That is OK, now for the swap:
mkdir /cluster/data/oryLat2/bed/blastz.hg18.swap
cd /cluster/data/oryLat2/bed/blastz.hg18.swap
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/hg18/bed/blastzOryLat2.2008-08-19/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust \
-bigClusterHub=pk > swap.log 2>&1 &
# real 17m9.675s
cat fb.oryLat2.chainHg18Link.txt
# 46961822 bases of 700386597 (6.705%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET TAEGUT1 (DONE braney 2008-09-10)
ssh swarm
screen
mkdir /cluster/data/hg18/bed/blastz.taeGut1.2008-09-09
cd /cluster/data/hg18/bed/blastz.taeGut1.2008-09-09
cat << _EOF_ > DEF
# human vs. zebra finch
BLASTZ_M=50
# Copied settings from human vs galGal3
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Human hg18
SEQ1_DIR=/scratch/data/hg18/hg18.2bit
# SEQ1_SMSK=/hive/data/genomes/hg18/linSpecRep/notInChicken
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
# one chrom at a time
SEQ1_CHUNK=200000000
SEQ1_LAP=0
# QUERY: Zebra finch taeGut1
SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit
SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes
# SEQ2_DIR=/hive/data/genomes/taeGut1/taeGut1.2bit
# SEQ2_LEN=/hive/data/genomes/taeGut1/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/taeGut1/taeGut1.blastz.2bit
SEQ2_CTGLEN=/hive/data/genomes/taeGut1/taeGut1.blastz.sizes
SEQ2_LIFT=/hive/data/genomes/taeGut1/jkStuff/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/hive/data/genomes/hg18/bed/blastz.taeGut1.2008-09-09
_EOF_
# << emacs
doBlastzChainNet.pl -syntenicNet \
-bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=loose \
-smallClusterHub=swarm DEF -workhorse=swarm \
-qRepeats=windowmaskerSdust > do.log 2>&1
# Completed: 14910 of 14910 jobs
# CPU time in finished jobs: 2744737s 45745.62m 762.43h 31.77d 0.087 y
# IO & Wait Time: 1493361s 24889.34m 414.82h 17.28d 0.047 y
# Average job time: 284s 4.74m 0.08h 0.00d
# Longest finished job: 3678s 61.30m 1.02h 0.04d
# Submission to last job: 6687s 111.45m 1.86h 0.08d
cd /cluster/data/hg18/bed
rm -f blastz.taeGut1
ln -s blastz.taeGut1.2008-09-09 /cluster/data/hg18/bed/blastz.taeGut1
################################################################
# HUMAN FETAL BRAIN EXON ARRAYS (YALE) (Andy)
ssh hgwdev
bash
mkdir /hive/data/genomes/hg18/bed/yaleMicroarrays
cd /hive/data/genomes/hg18/bed/yaleMicroarrays
cp /var/ftp/encode/Sestan_fetal_brain_exon_arrays.rar .
rar e Sestan_fetal_brain_exon_arrays.rar
tail +2 18_19_21_23_full_SLR_converted.txt | grep -v "\-\-\-" > sestanBrainAtlas.bed
hgLoadBed hg18 sestanBrainAtlas{,.bed}
# just a little array name organization
head -n1 18_19_21_23_full_SLR_converted.txt | \
sed 's/.*expNames=\"//;s/\"\ name=.*//;s/\.CEL//g' | \
tr ',' '\n' | sed '/^$/d' | grep -n '' | tr ':' '\t' | \
awk 'BEGIN{OFS="\t";}{$1=$1 - 1; print;}' \
> arrays.txt
awk 'BEGIN{OFS="\t";}{print $1, $2, $2, "n/a", "n/a", "n/a", "103", "n/a,n/a,"$2",";}' \
arrays.txt > sestanBrainAtlasExps.tab
ln -s ~/kent/src/hg/lib/expRecord.sql sestanBrainAtlasExps.sql
hgLoadSqlTab hgFixed sestanBrainAtlasExps{,.sql,.tab}
    # Removed some of the arrays manually, as follows.
    # The bed file had stray carriage returns, so normalize line endings first:
tr '\r' '\n' < sestanBrainAtlas.bed | sed '/^$/d; s/$/,/' > ses.bed
cut -f1-14 ses.bed | \
awk 'BEGIN{FS="\t";OFS="\t"}{$2 = $2 - 1; $13 = $13 - 8; print;}' | \
sed 's/95,96,97,98,99,100,101,102//' > ses14.bed
cut -f15 ses.bed | cut -d',' -f1-74,77-92,99- > ses15.bed
paste ses14.bed ses15.bed > newSestan.bed
hgLoadBed hg18 sestanBrainAtlas newSestan.bed
ln -s ~/kent/src/hg/makeDb/hgCgiData/Human/microarrayGroups.ra
grep -A5 sestanBrainAtlasAll microarrayGroups.ra | grep "^names" | sed 's/names //' | tr ',' '\n' | sed '/^$/d' > namesCol.txt
grep -A5 sestanBrainAtlasAll microarrayGroups.ra | grep "^expIds" | sed 's/expIds //' | tr ',' '\n' | sed '/^$/d' >expCol.txt
paste expCol.txt namesCol.txt > arrays.txt
awk 'BEGIN{OFS="\t";}{print $1, $2, $2, "n/a", "n/a", "n/a", "103", "n/a,n/a,"$2",";}' \
arrays.txt > sestanBrainAtlasExps.tab
hgLoadSqlTab hgFixed sestanBrainAtlasExps{,.sql,.tab}
ssh kolossus
################################################################
# HUMAN TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08)
#
# AffyExonTissue Step 1: download exon array coordinate data from Affy
# and extract coordinates Download HuEx-1_0-st-v2 Annotations, Full,
# Hg18/Build 36 gff. The file is available at
# http://www.affymetrix.com/support/technical/byproduct.affx?product=huexon-st
# and the download requires logging in to NetAffx (free, registration
# required) Uncompress the GFF files. Parse out key fields with the
# script below, generating hg18.affy.exon.coords.tab
#
#---------
#!/usr/bin/env perl
=pod
=head1 NAME
parseGffArrayData.pl
=head1 SYNOPSIS
cat *gff |parseGffArrayData.pl > array.coords.tab
=head1 DESCRIPTION
Parses probeset coordinates out of the Affy design data
=cut
{
use strict;
use Getopt::Long;
use GFF;
use GFF::GeneFeature;
use FileHandle;
print "chr\tstart\tend\tID\tscore\tstrand\n";
while (my $line = <>) {
chomp $line;
my @tokens = split /\s/, $line;
if ($tokens[2] eq "probeset") {
my $gffFeature = new GFF::GeneFeature;
my $gffData = $gffFeature->new_from_line($line);
my $probesetId = $gffData->group_value('probeset_id');
my $probesetLevel = $gffData->group_value('level');
my $bounded = $gffData->group_value('bounded');
my $cds = $gffData->group_value('cds');
my $score;
if ($probesetLevel eq "core") {
$score = 900;
} elsif ($probesetLevel eq "extended") {
$score = 500;
} else {
$score = 200;
}
if ($bounded) { $score -= 200; }
if ($cds) { $score += 100; }
if ($score < 100) { $score = 100; }
print($gffData->seqname(), "\t", $gffData->start(), "\t",
$gffData->end(), "\t", $probesetId, "\t$score\t",
$gffData->strand(), "\n");
}
}
}
#-------
#
# AffyExonTissue Step 2: download tissue data from Affy, generate bed15 file
#
# Download Human Exon 1.0 ST APT results from
# http://www.affymetrix.com/support/technical/sample_data/exon_array_data.affx
# (requires free registration and login, as above)
# Uncompress, and get rid of the undesired tissue mixture columns.
cut -f 1-34 \
< apt-probeset-summarize-results-exon/quant-norm.pm-gcbg.plier.summary.txt \
> quant-norm.pm-gcbg.plier.nomix.summary.txt
#
# Generate a bed15 file using the command below, and script below that.
# For the purposes of generating a track, ignore the first line.
#
arrayToBed15.py \
--coordinates hg18.affy.exon.coords.tab \
--plier quant-norm.pm-gcbg.plier.nomix.summary.txt \
--name "humanExon" \
--groups "breast,breast,breast,cerebellum,cerebellum,cerebellum,heart,heart,heart,kidney,kidney,kidney,liver,liver,liver,muscle,muscle,muscle,pancreas,pancreas,pancreas,prostate,prostate,prostate,spleen,spleen,spleen,testes,testes,testes,thyroid,thyroid,thyroid" \
|tail -n +2 > human.exon.headless.bed15
#---
#!/usr/bin/python
from optparse import OptionParser
import math
import re
#
# get the genomic probeset coordinates
#
def parseProbesetCoordinates(coordinatesFilename):
"""Build a dictionary of coordinates from a tab-delmited file"""
coordinateData = {}
coordinatesFileHandle = open(coordinatesFilename)
coordinatesFileHandle.readline() # skip the header line
for line in coordinatesFileHandle:
line = line.rstrip();
tokens = line.split('\t')
id = tokens[3]
coordinateData[id] = tokens;
return(coordinateData)
def median(numbers):
"""Sort the input list and return the middle element."""
nn = len(numbers)
copy = numbers[:] # So that "numbers" keeps its original order
copy.sort()
if nn & 1: # There is an odd number of elements
return copy[nn // 2]
else:
return (copy[nn // 2] + copy[nn // 2 - 1]) / 2
def medianOfMedians(experimentNames, experimentValues):
"""Given replicated values, find the median of the replicate medians."""
# Create a dictionary to sort the values by experiment set
replicates = {}
#
    # Group the experiments into replicate sets by experiment names.
# This assumes that experiments in the same replicate set have the
# same name.
#
    for ii in range(0, len(experimentNames)):
if not replicates.has_key(experimentNames[ii]):
replicates[experimentNames[ii]] = [experimentValues[ii]]
else :
replicates[experimentNames[ii]].append(experimentValues[ii])
# Make a list containing the median value of each replicate set.
medians = list()
for replicateSet in replicates.keys() :
values = replicates[replicateSet]
thisMedian = median(values)
medians.append(thisMedian)
# Now get the median value of the median list
medianValue = median(medians)
return(medianValue)
def printHeaderData(experimentList, trackName):
"""Print a header line for a bed15 file"""
expNames = ",".join(experimentList)
print "track type=\"array\" expScale=3 expStep=0.5 ",
print " name=\"" + str(trackName) + "\"", \
" description=\"Microarray custom track\"",
print " expNames=" "\"" + expNames + "\""
def printPlierResults(resultsLine, experimentGroups, probesetCoordinates):
"""median-center a line of expression results, print in bed15 format"""
background = 10
    plierResultsLine = resultsLine.split('\t')
probesetId = plierResultsLine[0]
del plierResultsLine[0]
if probesetCoordinates.has_key(probesetId):
coordinates = probesetCoordinates[probesetId]
#
# Given coordinate data (chr start end ID score strand)
# and given experimental data (ID exp1 exp2 exp3 ... expN)
# Print as follows:
# 1. Basic bedfile stuff: chromosome, start, end, name, score,
# strand, thickStart (=start), thickEnd (=end), 0, blocks (=1),
# blocklengths (=end-start+1,), blockstarts (=0,)
#
start = int(coordinates[1]) - 1
end = int(coordinates[2])
length = end - start
print str(coordinates[0]) + "\t" + str(start) + "\t" \
+ str(end) + "\t" + str(probesetId) + "\t", \
coordinates[4], "\t", coordinates[5], "\t", start, "\t", \
end, "\t0\t1\t", \
str(length) + ",\t", "0,\t",
#
# Continue with microarray-specific stuff:
# - experiment count
# - comma-separated list of experiment IDs (0 .. experimentCount)
# - comma-separated list of experiment scores (log(result)-log(median))
#
experimentCount = len(plierResultsLine)
experimentValues = list()
for value in plierResultsLine:
experimentValues.append(float(value))
medianValue = medianOfMedians(experimentGroups, experimentValues)
logMedian = math.log(medianValue+background)
valuesStrings = list()
for thisValue in experimentValues:
thisScore = math.log(thisValue+background) - logMedian
valuesStrings.append(str(thisScore))
experimentScoreString = ",".join(valuesStrings)
ids = list()
for ii in range(0, experimentCount):
ids.append(str(ii))
experimentIdString = ",".join(ids)
print experimentCount, "\t", experimentIdString, "\t", \
experimentScoreString
return
parser = OptionParser()
parser.add_option("--coordinates", dest="coordinatesFile")
parser.add_option("--plier", dest="plierResultsFile")
parser.add_option("--name", dest="trackName")
parser.add_option("--groups", dest="experimentGroups")
(parameters, args) = parser.parse_args()
experimentGroups = parameters.experimentGroups.split(",")
probesetCoordinates = parseProbesetCoordinates(parameters.coordinatesFile)
plierResults = open(parameters.plierResultsFile)
for line in plierResults:
line = line.rstrip()
if (re.search("^#", line)) :
continue
elif (re.search("^probeset_id", line)) :
printHeaderData(experimentGroups, parameters.trackName)
else :
printPlierResults(line, experimentGroups, probesetCoordinates)
#---
#
# AffyExonTissue Step 3: set up a browser track from the bed15 file
# created offline: trackDb.affyExonTissues.ra,
# affyExonTissues.html,
# microarrayGroups.affyExonTissues.ra
#
cat $KENT/src/hg/makeDb/trackDb/human/trackDb.ra trackDb.affyExonTissues.ra \
> trackDb.new.ra
cp trackDb.new.ra $KENT/src/hg/makeDb/trackDb/human/trackDb.ra
cp affyExonTissues.html $KENT/src/hg/makeDb/trackDb/human
cat $KENT/src/hg/makeDb/hgCgiData/Human/microarrayGroups.ra \
microarrayGroups.affyExonTissues.ra > microarrayGroups.new.ra
hgLoadBed hg18 affyExonTissues human.exon.headless.bed15
cd $KENT/src/hg/makeDb/trackDb
make update DBS="hg17 hg18"
cd $KENT/src
make -j8 cgi >& ~/make.j8.cgi.errout
#
# AffyExonTissue Step 4: load the appropriate fields into hgFixed
#
grep -A5 affyExonTissuesAll microarrayGroups.ra | grep "^names" \
| sed 's/names //' | tr ',' '\n' | sed '/^$/d' | sed 's/^\s\+//' > n.txt
grep -A5 affyExonTissuesAll microarrayGroups.ra | grep "^expIds" \
| sed 's/expIds //' | tr ',' '\n' | sed '/^$/d' | sed 's/^\s\+//' > e.txt
paste e.txt n.txt > a.txt
awk 'BEGIN{OFS="\t";}
{print $1, $2, $2, "n/a", "n/a", "n/a", "33", "n/a,n/a,"$2",";}' a.txt \
> exps.tab
ln -s ../../../lib/expRecord.sql
hgLoadSqlTab hgFixed affyMouseExonTissuesAllExps expRecord.sql exps.tab
rm a.txt n.txt e.txt exps.tab
############
########################################################################
## AFFY ALL EXON PROBESETS (HG18/MM9/RN4) (DONE 2009-01-29, Andy)
## 1. Log into Affymetrix netaffx site.
## 2. Use Firefox add-on "Export Cookies" to save a file called cookies.txt
ssh hgwdev
grep affymetrix.com cookies.txt > affycookies.txt
wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na27/wtexon/HuEx-1_0-st-v2.na27.hg18.probeset.csv.zip
wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na27/wtexon/MoEx-1_0-st-v1.na27.mm9.probeset.csv.zip
wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na27/wtexon/RaEx-1_0-st-v1.na27.rn4.probeset.csv.zip
rm affycookies.txt
for z in *.zip; do unzip $z; done
rm *.zip
ln -s HuEx-1_0-st-v2.na27.hg18.probeset.csv hg18.csv
ln -s RaEx-1_0-st-v1.na27.rn4.probeset.csv rn4.csv
ln -s MoEx-1_0-st-v1.na27.mm9.probeset.csv mm9.csv
for csv in {hg18,mm9,rn4}.csv; do
bed=${csv%.csv}.bed
sed '1,20d' $csv | tr ',' '\t' | sed 's/\"//g' | cut -f1-5,16 \
| grep -v "\-\-\-" \
| awk 'BEGIN{FS="\t";OFS="\t";}{if ($6 == "core") score = 1000; else if ($6 == "extended") score = 700; else if ($6 == "full") score = 300; else score = 100; name = $1"|"$6; print $2, $4-1, $5, name, score, $3}' \
| bedSort stdin $bed
done
for db in hg18 mm9 rn4; do hgLoadBed $db affyAllExonProbes $db.bed; done
rm hg18.csv mm9.csv rn4.csv
gzip *.bed *.csv
mkdir -p /hive/data/genomes/{hg18,mm9,rn4}/bed/affyAllExonProbes
mv HuEx-1_0-st-v2.na27.* hg18.bed.gz /hive/data/genomes/hg18/bed/affyAllExonProbes/
mv MoEx-1_0-st-v1.na27.* mm9.bed.gz /hive/data/genomes/mm9/bed/affyAllExonProbes/
mv * /hive/data/genomes/rn4/bed/affyAllExonProbes/
## forgot mm8 (see mm8.txt for that one)
################################################
# SPLIT EXPRESSION & REGULATION GROUPS
# (2008-09-09 kate)
echo "insert into grp (name, label, priority, defaultIsClosed) values ('expression', 'Expression', 4.5, 1)" | hgsql hg18
echo "update grp set label='Regulation' where name='regulation'" | hgsql hg18
############################################################################
# KIDD/EICHLER DISCORDANT CLONE ENDS (DONE 9/16/08 angie)
ssh hgwdev
mkdir /cluster/data/hg18/bed/kiddEichlerDiscordant
cd /cluster/data/hg18/bed/kiddEichlerDiscordant
foreach i (ABC7 ABC8 ABC9 ABC10 ABC11 ABC12 ABC13 ABC14 G248)
wget --user=uuuu --password=ppppppp \
http://eichlerlab.gs.washington.edu/kiddj/hg18_fosmidmap/$i.bestdiscordant.sorted.gz
end
# Load the tracks (translate bacEndPairs-inspired format to bed12):
foreach f (*.gz)
set track = `echo $f:r:r:r \
| perl -wpe 's/^([AG])(\w+)$/kiddEichlerDisc$1\L$2/ || die;'`
if ($status != 0) break
echo $track
zcat $f \
| perl -wpe 'if (/^chrom\s+chromStart/) {s/^.*\n$//; next;} \
my ($c, $s, $e, $n, $sc, $st, $bSt, $bSz, undef, $t) = split; \
@bSts = split(",", $bSt); @bSzs = split(",", $bSz); \
if ($t =~ /^transchrm_/) { \
@bSts = (0); @bSzs = ($e - $s); \
} elsif ($t =~ /^OEA_/) { \
die "\nERROR: bSts[0] $bSts[0] != s $s\n" if ($bSts[0] != $s); \
$bSzs[0]--; \
$bE = $bSts[0] + $bSzs[0]; \
die "bE $bE != e $e\n" if ($bE != $e); \
$bSts[0] -= $s; \
} elsif ($#bSts == 1) { \
if ($bSts[0] > $bSts[1]) { \
# warn "Swapping $n ($bSts[0] > $bSts[1])\n"; \
$tmp = $bSts[0]; $bSts[0] = $bSts[1]; $bSts[1] = $tmp; \
$tmp = $bSzs[0]; $bSzs[0] = $bSzs[1]; $bSzs[1] = $tmp; \
} \
if ($bSts[0] != $s) { \
die "\nERROR: n=$n,$t: bSts[0]=$bSts[0] but s=$s\n\t"; \
} \
$bSzs[0]--; $bSzs[1]--; \
$bE0 = $bSts[0] + $bSzs[0]; \
$bE1 = $bSts[1] + $bSzs[1]; \
$bE = $bE0 > $bE1 ? $bE0 : $bE1; \
if ($bE != $e) { \
warn "n=$n,$t: bE0=$bE0, bE1=$bE1, bE=$bE, e=$e\n"; \
if ($bE1 > $e) { \
warn "n=$n,$t: tweaking bSzs[1] (clip to chromEnd)\n"; \
$bSzs[1] = $e - $bSts[1]; \
} \
} \
$bSts[0] -= $s; $bSts[1] -= $s; \
} else { die "t is $t but \$#bSts is $#bSts"; } \
$bSt = join(",", @bSts) . ","; $bSz = join(",", @bSzs) . ","; \
$rgb = ($t =~ /^deletion/) ? "224,0,0" : \
($t =~ /^insertion/) ? "0,0,224" : \
($t =~ /^inversion/) ? "0,224,0" : \
($t =~ /^OEA/) ? "240,160,64" : "0,0,0"; \
$_ = join("\t", $c, $s, $e, "$n,$t", int($sc+0.5), $st, $s, $e, $rgb, \
scalar(@bSzs), $bSz, $bSt) . "\n";' \
| hgLoadBed -tab hg18 $track stdin
checkTableCoords hg18 $track
end
# Tons of overlapping block and blockEnd[n-1]!=end warnings from
# checkTableCoords -- but these are discordant mappings, so we
# expect those. Make sure there aren't any other types of errors:
foreach f (*.gz)
set track = `echo $f:r:r:r \
| perl -wpe 's/^([AG])(\w+)$/kiddEichlerDisc$1\L$2/ || die;'`
checkTableCoords hg18 $track |& egrep -v 'overlapping|!= end'
end
# No output, good.
# Get clone ID -> NCBI acc mapping (same as for hg17; redownloaded to
# make sure).
mkdir /cluster/data/hg18/bed/kiddEichlerDiscordant/cloneIds
cd /cluster/data/hg18/bed/kiddEichlerDiscordant/cloneIds
# Saved off emailed file from Jeff Kidd to clones_used_3nov.txt.accessions;
# get trace archive trace names for end reads:
foreach n (7 9 10 11 12 13 14)
wget http://hgsv.washington.edu/general/download/clone_mapping/ABC$n/ABC$n.conversion.gz
end
# ABC8 has _a and _b files:
wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_a.conversion.gz
wget http://hgsv.washington.edu/general/download/clone_mapping/ABC8/ABC8_b.conversion.gz
# That file is not available for G248.
gunzip *.gz
# Combine the relevant data from the .conversion files; keep only those
# IDs that are used in the tracks.
zcat ../[AG]*.gz \
| cut -f 4 \
| egrep -v '^(#chrom|track|name)' \
| sed -e 's/,.*//' \
| sort -u > discIds.txt
grep -h -v ^163722_163722- *.conversion \
| perl -wpe 's/^OurClone.*\n// || s/^\d+_(HUMAN|\d+_).*\n$// || \
s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\.(F|FORWARD|R|REVERSE)\.\d+\t(\w+)$/$2$3_$6\t$7\t$8/ || \
warn "Parse line $.:\n$_";' \
| sort > allEnds.tab
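    # To see what the name-normalizing substitution above does, here is a
    # made-up (hypothetical) end-read name pushed through the same regexp;
    # the clone name and trace name are invented purely for illustration:
    #   printf 'ABC7_1_2_0001234_A12.F.12345\tTRACE_NAME_1\n' \
    #     | perl -wpe 's/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\.(F|FORWARD|R|REVERSE)\.\d+\t(\w+)$/$2$3_$6\t$7\t$8/;'
    #   # yields:  ABC7_1234_A12 <tab> F <tab> TRACE_NAME_1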
grep -wFf discIds.txt allEnds.tab > discEnds.txt
wc -l discIds.txt allEnds.tab discEnds.txt
# 352330 discIds.txt
# 17490847 allEnds.tab
# 781513 discEnds.txt
# discEnds.txt has 2 lines (forward & reverse) for most of its ids...
# ideally we would see 2*(352330) lines in discEnds.txt.
# Get a list of which discordant clone IDs don't have ends in *.conv*:
cut -f 1 allEnds.tab | uniq > all.tmp
comm -23 discIds.txt all.tmp > discNotInConv.txt
wc -l discNotInConv.txt
#41853 discNotInConv.txt
cat > combine.pl <<'_EOF_'
#!/usr/bin/perl -w
use strict;
my ($cloneFile, $endsFile) = @ARGV;
open(CLONES, $cloneFile) || die "Can't open $cloneFile: $!\n";
my %idInfo;
while(<CLONES>) {
(s/^(\d+_)?(ABC|G)(\d+)(_\d\d?)?(_\d\d?)?_0*(\d+?_[A-Z]\d\d?)\t(\w+)$/$2$3_$6\t$7/ && m/^(\w+)\t(\w+)/) || \
m/^(G248\w+)\t(\w+)$/ || die "Parse line $.:$_";
my ($id, $acc) = ($1, $2);
$idInfo{$id}->[0] = $acc;
}
close(CLONES);
open(ENDS, $endsFile) || die "Can't open $endsFile: $!\n";
while (<ENDS>) {
chomp; my ($id, $dir, $traceName) = split("\t");
if ($dir =~ /^F/) {
$idInfo{$id}->[1] = $traceName;
} elsif ($dir =~ /^R/) {
$idInfo{$id}->[2] = $traceName;
} else { die "What is this \$dir: $dir ?\n"; }
}
close(ENDS);
foreach my $id (sort keys %idInfo) {
my $infoRef = $idInfo{$id};
$infoRef->[0] = '' if (! defined $infoRef->[0]);
$infoRef->[1] = 0 if (! defined $infoRef->[1]);
$infoRef->[2] = 0 if (! defined $infoRef->[2]);
print join("\t", $id, @{$infoRef}) . "\n";
}
'_EOF_'
# << emacs
chmod a+x combine.pl
combine.pl clones_used_3nov.txt.accessions discEnds.txt \
| sort > kiddEichlerToNcbi.txt
# Load table:
hgLoadSqlTab hg18 kiddEichlerToNcbi \
$HOME/kent/src/hg/lib/kiddEichlerToNcbi.sql kiddEichlerToNcbi.txt
# Add to makeDb/schema/all.joiner, then check:
runJoiner.csh hg18 kiddEichlerToNcbi $HOME/kent/src/hg/makeDb/schema
############################################################################
# hgPal downloads 28way refGene, knownGene, knownCanonical
ssh hgwdev
screen
bash
rm -rf /cluster/data/hg18/bed/multiz28way/pal
mkdir /cluster/data/hg18/bed/multiz28way/pal
cd /cluster/data/hg18/bed/multiz28way/pal
cat > order.lst <<EOF
hg18
panTro2
rheMac2
otoGar1
tupBel1
mm8
rn4
cavPor2
oryCun1
sorAra1
eriEur1
canFam2
felCat3
equCab1
bosTau3
dasNov1
loxAfr1
echTel1
monDom4
ornAna1
anoCar1
galGal3
xenTro2
danRer4
tetNig1
fr2
gasAcu1
oryLat1
EOF
mz=multiz28way
gp=refGene
db=hg18
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.jobs
time sh -x $gp.jobs > $gp.jobs.log 2>&1 &
sleep 1
tail -f $gp.jobs.log
# real 232m24.611s
# user 13m59.669s
# sys 5m5.601s
zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
# we're only distributing exons at the moment
mz=multiz28way
gp=refGene
db=hg18
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
mz=multiz28way
gp=knownGene
db=hg18
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.$mz.jobs
time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
sleep 1
tail -f $gp.$mz.job.log
# real 248m39.293s
# user 23m30.788s
# sys 8m2.714s
zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
# now do the canonical set
cd /cluster/data/hg18/bed/multiz28way/pal
mz=multiz28way
gp=knownCanonical
db=hg18
for j in `awk '{print $1}' /cluster/data/hg18/chrom.sizes`
do
echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed
done
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -geneBeds=$j.known.bed $db $mz knownGene order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.$mz.jobs
time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
sleep 1
tail -f $gp.$mz.job.log
# real 216m41.700s
# user 10m22.016s
# sys 4m6.917s
rm *.known.bed
zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
#########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
hg18.upstreamGeneTbl = refGene
hg18.upstreamMaf = multiz28way /hive/data/genomes/hg18/bed/multiz28way/species.lst
#########################################################################
# BarskiChIPseq tracks Begun: 2008-09-19 Finished: 2008-09-22 Tim
# Barski et al. 2007 paper: High-Resolution Mapping of Histone Modifications in the Human Genome
# Solexa high-throughput sequencing: ChIPseq data
# http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/hgtcell.html
ssh hgwdev
mkdir /hive/data/genomes/hg18/bed/Barski2007/lab
cd /hive/data/genomes/hg18/bed/Barski2007/lab
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me2.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me3.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me2.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me3.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me2.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me3.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me3.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me2.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me3.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me2.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me3.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4R3me2.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2BK5me1.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2AZ.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/PolII.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/CTCF.bed
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me2.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K4me3.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me2.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K9me3.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me2.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K27me3.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K36me3.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me2.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3K79me3.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H3R2me2.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4K20me3.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H4R3me2.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2BK5me1.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/H2AZ.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/PolII.vstep.gz
wget http://dir.nhlbi.nih.gov/Papers/lmi/epigenomes/data/CTCF.vstep.gz
gunzip *.gz
mv H3K4me1.vstep H3K4me1.wig
mv H3K4me2.vstep H3K4me2.wig
mv H3K4me3.vstep H3K4me3.wig
mv H3K9me1.vstep H3K9me1.wig
mv H3K9me2.vstep H3K9me2.wig
mv H3K9me3.vstep H3K9me3.wig
mv H3K27me1.vstep H3K27me1.wig
mv H3K27me2.vstep H3K27me2.wig
mv H3K27me3.vstep H3K27me3.wig
mv H3K36me1.vstep H3K36me1.wig
mv H3K36me3.vstep H3K36me3.wig
mv H3K79me1.vstep H3K79me1.wig
mv H3K79me2.vstep H3K79me2.wig
mv H3K79me3.vstep H3K79me3.wig
mv H3R2me1.vstep H3R2me1.wig
mv H3R2me2.vstep H3R2me2.wig
mv H4K20me1.vstep H4K20me1.wig
mv H4K20me3.vstep H4K20me3.wig
mv H4R3me2.vstep H4R3me2.wig
mv H2BK5me1.vstep H2BK5me1.wig
mv H2AZ.vstep H2AZ.wig
mv PolII.vstep PolII.wig
mv CTCF.vstep CTCF.wig
# build the per-mark wigVar files (the raw .vstep files were renamed to .wig above)
head -1 H3K4me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K4me1/g" > barskiChIPseqH3K4me1.wigVar
head -1 H3K4me2.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K4me2/g" > barskiChIPseqH3K4me2.wigVar
head -1 H3K4me3.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K4me3/g" > barskiChIPseqH3K4me3.wigVar
head -1 H3K9me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K9me1/g" > barskiChIPseqH3K9me1.wigVar
head -1 H3K9me2.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K9me2/g" > barskiChIPseqH3K9me2.wigVar
head -1 H3K9me3.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K9me3/g" > barskiChIPseqH3K9me3.wigVar
head -1 H3K27me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K27me1/g" > barskiChIPseqH3K27me1.wigVar
head -1 H3K27me2.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K27me2/g" > barskiChIPseqH3K27me2.wigVar
head -1 H3K27me3.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K27me3/g" > barskiChIPseqH3K27me3.wigVar
head -1 H3K36me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K36me1/g" > barskiChIPseqH3K36me1.wigVar
head -1 H3K36me3.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K36me3/g" > barskiChIPseqH3K36me3.wigVar
head -1 H3K79me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K79me1/g" > barskiChIPseqH3K79me1.wigVar
head -1 H3K79me2.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K79me2/g" > barskiChIPseqH3K79me2.wigVar
head -1 H3K79me3.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3K79me3/g" > barskiChIPseqH3K79me3.wigVar
head -1 H3R2me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3R2me1/g" > barskiChIPseqH3R2me1.wigVar
head -1 H3R2me2.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH3R2me2/g" > barskiChIPseqH3R2me2.wigVar
head -1 H4K20me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH4K20me1/g" > barskiChIPseqH4K20me1.wigVar
head -1 H4K20me3.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH4K20me3/g" > barskiChIPseqH4K20me3.wigVar
head -1 H4R3me2.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH4R3me2/g" > barskiChIPseqH4R3me2.wigVar
head -1 H2BK5me1.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH2BK5me1/g" > barskiChIPseqH2BK5me1.wigVar
head -1 H2AZ.wig | sed -e "s/\"CTCF/\"BarskiChIPseqH2AZ/g" > barskiChIPseqH2AZ.wigVar
head -1 PolII.wig | sed -e "s/\"CTCF/\"BarskiChIPseqPolII/g" > barskiChIPseqPolII.wigVar
head -1 CTCF.wig | sed -e "s/\"CTCF/\"BarskiChIPseqCTCF/g" > barskiChIPseqCTCF.wigVar
tail --lines=+2 H3K4me1.wig >> barskiChIPseqH3K4me1.wigVar
tail --lines=+2 H3K4me2.wig >> barskiChIPseqH3K4me2.wigVar
tail --lines=+2 H3K4me3.wig >> barskiChIPseqH3K4me3.wigVar
tail --lines=+2 H3K9me1.wig >> barskiChIPseqH3K9me1.wigVar
tail --lines=+2 H3K9me2.wig >> barskiChIPseqH3K9me2.wigVar
tail --lines=+2 H3K9me3.wig >> barskiChIPseqH3K9me3.wigVar
tail --lines=+2 H3K27me1.wig >> barskiChIPseqH3K27me1.wigVar
tail --lines=+2 H3K27me2.wig >> barskiChIPseqH3K27me2.wigVar
tail --lines=+2 H3K27me3.wig >> barskiChIPseqH3K27me3.wigVar
tail --lines=+2 H3K36me1.wig >> barskiChIPseqH3K36me1.wigVar
tail --lines=+2 H3K36me3.wig >> barskiChIPseqH3K36me3.wigVar
tail --lines=+2 H3K79me1.wig >> barskiChIPseqH3K79me1.wigVar
tail --lines=+2 H3K79me2.wig >> barskiChIPseqH3K79me2.wigVar
tail --lines=+2 H3K79me3.wig >> barskiChIPseqH3K79me3.wigVar
tail --lines=+2 H3R2me1.wig >> barskiChIPseqH3R2me1.wigVar
tail --lines=+2 H3R2me2.wig >> barskiChIPseqH3R2me2.wigVar
tail --lines=+2 H4K20me1.wig >> barskiChIPseqH4K20me1.wigVar
tail --lines=+2 H4K20me3.wig >> barskiChIPseqH4K20me3.wigVar
tail --lines=+2 H4R3me2.wig >> barskiChIPseqH4R3me2.wigVar
tail --lines=+2 H2BK5me1.wig >> barskiChIPseqH2BK5me1.wigVar
tail --lines=+2 H2AZ.wig >> barskiChIPseqH2AZ.wigVar
tail --lines=+2 PolII.wig >> barskiChIPseqPolII.wigVar
tail --lines=+2 CTCF.wig >> barskiChIPseqCTCF.wigVar
mkdir ../signal
mv *.wigVar ../signal
gzip *
mkdir ../tags
mv H3K4me1.bed ../tags/barskiChIPseqH3K4me1.bed
mv H3K4me2.bed ../tags/barskiChIPseqH3K4me2.bed
mv H3K4me3.bed ../tags/barskiChIPseqH3K4me3.bed
mv H3K9me1.bed ../tags/barskiChIPseqH3K9me1.bed
mv H3K9me2.bed ../tags/barskiChIPseqH3K9me2.bed
mv H3K9me3.bed ../tags/barskiChIPseqH3K9me3.bed
mv H3K27me1.bed ../tags/barskiChIPseqH3K27me1.bed
mv H3K27me2.bed ../tags/barskiChIPseqH3K27me2.bed
mv H3K27me3.bed ../tags/barskiChIPseqH3K27me3.bed
mv H3K36me1.bed ../tags/barskiChIPseqH3K36me1.bed
mv H3K36me3.bed ../tags/barskiChIPseqH3K36me3.bed
mv H3K79me1.bed ../tags/barskiChIPseqH3K79me1.bed
mv H3K79me2.bed ../tags/barskiChIPseqH3K79me2.bed
mv H3K79me3.bed ../tags/barskiChIPseqH3K79me3.bed
mv H3R2me1.bed ../tags/barskiChIPseqH3R2me1.bed
mv H3R2me2.bed ../tags/barskiChIPseqH3R2me2.bed
mv H4K20me1.bed ../tags/barskiChIPseqH4K20me1.bed
mv H4K20me3.bed ../tags/barskiChIPseqH4K20me3.bed
mv H4R3me2.bed ../tags/barskiChIPseqH4R3me2.bed
mv H2BK5me1.bed ../tags/barskiChIPseqH2BK5me1.bed
mv H2AZ.bed ../tags/barskiChIPseqH2AZ.bed
mv PolII.bed ../tags/barskiChIPseqPolII.bed
mv CTCF.bed ../tags/barskiChIPseqCTCF.bed
cd ..
cd ../signal
cat > makeWig.sh << \_EOF_
#!/bin/bash
genDir=/gbdb/hg18/barskiChIPseq
mkdir -p ${genDir}
for file in *.wigVar
do
    base=${file%.wigVar}
    echo "Loading ${file} to ${base}..."
    time nice -n +19 wigEncode ${base}.wigVar ${base}.wig ${base}.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=${genDir} hg18 ${base} ${base}.wig
    ln -sf `pwd`/${base}.wib ${genDir}/${base}.wib
done
_EOF_
chmod 755 makeWig.sh
./makeWig.sh &
# ................ Got to here
# .............. I have not loaded the tags !!!
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K4me1 BarskiChIPseqH3K4me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K4me2 BarskiChIPseqH3K4me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K4me3 BarskiChIPseqH3K4me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K9me1 BarskiChIPseqH3K9me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K9me2 BarskiChIPseqH3K9me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K9me3 BarskiChIPseqH3K9me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K27me1 BarskiChIPseqH3K27me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K27me2 BarskiChIPseqH3K27me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K27me3 BarskiChIPseqH3K27me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K36me1 BarskiChIPseqH3K36me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K36me3 BarskiChIPseqH3K36me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K79me1 BarskiChIPseqH3K79me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K79me2 BarskiChIPseqH3K79me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3K79me3 BarskiChIPseqH3K79me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3R2me1 BarskiChIPseqH3R2me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH3R2me2 BarskiChIPseqH3R2me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH4K20me1 BarskiChIPseqH4K20me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH4K20me3 BarskiChIPseqH4K20me3.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH4R3me2 BarskiChIPseqH4R3me2.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH2BK5me1 BarskiChIPseqH2BK5me1.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqH2AZ BarskiChIPseqH2AZ.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqPolII BarskiChIPseqPolII.bed
# time nice -n +19 hgLoadBed hg18 barskiChIPseqCTCF BarskiChIPseqCTCF.bed
# .............. I have not loaded the tags !!!
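# When the tags do get loaded, one loop covers all 23 marks.  This is
# only a sketch (not yet run): it assumes the BED files still sit in the
# ../tags directory under the lowercase barskiChIPseq*.bed names created
# by the mv commands above, and derives each table name from its file name:
#   cd ../tags
#   for B in barskiChIPseq*.bed
#   do
#       T=${B%.bed}
#       time nice -n +19 hgLoadBed hg18 ${T} ${B}
#   done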
#########################################################################
## 44-Way Multiz (DONE - 2008-11-10,15 - Hiram)
ssh hgwdev
mkdir /hive/data/genomes/hg18/bed/multiz44way
cd /hive/data/genomes/hg18/bed/multiz44way
# starting with the 44way tree that Brian made earlier:
cp -p ../multiz44way.2008-08-06/44way.db.nh ./44way.nh
sed -e "s/oryLat1/hg18/; s/danRer4/danRer5/; s/oryLat1/oryLat2/" \
/cluster/data/oryLat1/bed/multiz44way/44way.nh > 44way.nh
# this file looks like:
cat << '_EOF_' > 44way.nh
(((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2,
(stickleback_gasAcu1:0.2,medaka_hg18:0.2):0.2):0.292961,
zebrafish_danRer5:0.782561);
'_EOF_'
# << happy emacs
# Use this specification in the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to obtain a gif image for htdocs/images/phylo/hg18_44way.gif
/cluster/bin/phast/all_dists 44way.nh > 44way.distances.txt
# Use this output to create the table below, with this perl script:
cat << '_EOF_' > sizeStats.pl
#!/usr/bin/env perl
use strict;
use warnings;
open (FH, "grep -y hg18 44way.distances.txt | sort -k3,3n|") or
die "can not read 44way.distances.txt";
my $count = 0;
while (my $line = <FH>) {
chomp $line;
my ($hg18, $D, $dist) = split('\s+', $line);
my $chain = "chain" . ucfirst($D);
my $B="/hive/data/genomes/hg18/bed/blastz.$D/fb.hg18." .
$chain . "Link.txt";
my $chainLinkMeasure =
`awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
chomp $chainLinkMeasure;
$chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
$chainLinkMeasure =~ s/\%//;
my $orgName=
`hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
chomp $orgName;
if (length($orgName) < 1) {
$orgName="N/A";
}
++$count;
printf "# %02d %.4f - %s %s\t(%% %.3f)\n", $count, $dist, $orgName, $D,
$chainLinkMeasure
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x ./sizeStats.pl
./sizeStats.pl
#
# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure
#
# columns below: index, phylogenetic distance to hg18, organism, db,
# and (%) = featureBits chain<Db>Link coverage of hg18
# 01 0.0092 - Chimp panTro2 (% 94.888)
# 02 0.0267 - Gorilla gorGor1 (% 61.731)
# 03 0.0467 - Orangutan ponAbe2 (% 92.892)
# 04 0.0667 - Marmoset calJac1 (% 78.351)
# 05 0.0783 - Rhesus rheMac2 (% 85.552)
# 06 0.1767 - Tarsier tarSyr1 (% 47.999)
# 07 0.2448 - Mouse lemur micMur1 (% 46.445)
# 08 0.3061 - Bushbaby otoGar1 (% 44.638)
# 09 0.3367 - Rabbit oryCun1 (% 34.015)
# 10 0.3507 - TreeShrew tupBel1 (% 37.348)
# 11 0.3567 - Squirrel speTri1 (% 35.828)
# 12 0.4067 - Guinea Pig cavPor3 (% 43.971)
# 13 0.4067 - Alpaca vicPac1 (% 39.531)
# 14 0.4098 - Megabat pteVam1 (% 45.502)
# 15 0.4099 - Microbat myoLuc1 (% 33.044)
# 16 0.4154 - Cat felCat3 (% 35.888)
# 17 0.4293 - Elephant loxAfr2 (% 35.204)
# 18 0.4314 - Dog canFam2 (% 52.915)
# 19 0.4317 - Mouse mm9 (% 35.201)
# 20 0.4362 - Rat rn4 (% 32.893)
# 21 0.4367 - Pika ochPri2 (% 27.974)
# 22 0.4639 - Horse equCab2 (% 57.162)
# 23 0.4693 - Rock hyrax proCap1 (% 30.935)
# 24 0.4767 - Dolphin turTru1 (% 48.537)
# 25 0.5067 - Kangaroo rat dipOrd1 (% 27.282)
# 26 0.5187 - Armadillo dasNov2 (% 33.663)
# 27 0.5191 - Cow bosTau4 (% 46.689)
# 28 0.5298 - hedgehog eriEur1 (% 19.622)
# 29 0.5399 - Sloth choHof1 (% 34.463)
# 30 0.5605 - Shrew sorAra1 (% 20.056)
# 31 0.5815 - Tenrec echTel1 (% 23.645)
# 32 0.7309 - Opossum monDom4 (% 12.385)
# 33 0.9870 - Platypus ornAna1 (% 7.870)
# 34 1.0313 - Zebra finch taeGut1 (% 3.503)
# 35 1.0436 - Lamprey petMar1 (% 1.251)
# 36 1.1013 - Chicken galGal3 (% 3.589)
# 37 1.2253 - Lizard anoCar1 (% 4.774)
# 38 1.5473 - X. tropicalis xenTro2 (% 2.623)
# 39 1.8337 - Stickleback gasAcu1 (% 1.923)
# 40 1.8482 - Zebrafish danRer5 (% 2.565)
# 41 1.8721 - Tetraodon tetNig1 (% 2.001)
# 42 1.9077 - Fugu fr2 (% 1.766)
# 43 2.0215 - Medaka oryLat2 (% 1.829)
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
44way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.list
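    # To see what those edits produce, here is a tiny worked example on a
    # made-up three-species tree (toy.nh is hypothetical, not the real
    # 44way.nh); the same transforms strip the common-name prefixes and
    # branch lengths, then flatten the result:
    #   echo '((human_hg18:0.1,chimp_panTro2:0.1):0.2,mouse_mm9:0.3);' > toy.nh
    #   sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' toy.nh
    #   # ((hg18,panTro2),mm9)     <- contents of tmp.nh / tree-commas.nh
    #   # ((hg18 panTro2) mm9)     <- contents of tree.nh
    #   # hg18 panTro2 mm9         <- contents of species.list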
cd /hive/data/genomes/hg18/bed/multiz44way
# bash shell syntax here ...
export H=/hive/data/genomes/hg18/bed
mkdir mafLinks
for G in `sed -e "s/hg18 //" species.list`
do
mkdir mafLinks/$G
if [ -s ${H}/blastz.${G}/mafRBestNet/chr1.maf.gz ]; then
echo "$G - recipBest"
ln -s ${H}/blastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G
else
if [ -s ${H}/blastz.${G}/mafSynNet/chr1.maf.gz ]; then
echo "$G - synNet"
ln -s ${H}/blastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G
else
if [ -s ${H}/blastz.${G}/mafNet/chr1.maf.gz ]; then
echo "$G - mafNet"
ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
else
echo "missing directory blastz.${G}/*Net"
fi
fi
fi
done
# need to split these things up into smaller pieces for
# efficient kluster run. Using the new hive architecture.
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way
mkdir mafSplit
# mafSplitPos splits on repeat areas that will not have any chains
mafSplitPos -minGap=50000 hg18 10 mafSplit.bed
for G in `sed -e "s/hg18 //" species.list`
do
echo -n "working ${G} ..."
rm -fr mafSplit/${G}
mkdir mafSplit/${G}
cd mafSplit/${G}
mafSplit ../../mafSplit.bed hg18_ ../../mafLinks/${G}/chr*.maf.gz \
-verbose=2
cd /hive/data/genomes/hg18/bed/multiz44way
echo " done"
done
# create a run-time list of files to operate on, not all file names
# exist for all assemblies
cd mafSplit
for D in *
do
cd "${D}"
find . -type f
cd ..
done | sort -u | sed -e "s#./##" > ../44-way.split.list
wc -l ../44-way.split.list
# 267 ../44-way.split.list
# the autoMultiz cluster run
ssh swarm
cd /hive/data/genomes/hg18/bed/multiz44way/
mkdir splitRun
cd splitRun
mkdir maf run
cd run
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn
# set the db and pairs directories here
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = hg18
set c = $1
set result = $2
set run = `pwd`
set tmp = $run/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/hg18/bed/multiz44way/mafSplit
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp
foreach s (`sed -e "s/ $db//" species.list`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
/bin/zcat $in.gz > $out
else if (-e $in) then
ln -s $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
/bin/rm -f $result
/bin/cp -p $tmp/$c.maf $result
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db
/bin/rmdir --ignore-fail-on-non-empty $run/tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/hg18/bed/multiz44way/splitRun/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../../44-way.split.list single template jobList
para create jobList
# initial run experience suggests some of the big jobs reach 8 Gb
# of memory usage, so tell parasol to limit the number of jobs per
# node to avoid thrashing
para -ram=6g push
# 8 jobs were finished manually on hgwdev, kolossus and memk nodes
# XXXX - running 2008-11-12 - Wed Nov 12 15:29:39 PST 2008
# Completed: 792 of 792 jobs
# CPU time in finished jobs: 5423s 90.38m 1.51h 0.06d 0.000 y
# IO & Wait Time: 138287s 2304.79m 38.41h 1.60d 0.004 y
# Average job time: 181s 3.02m 0.05h 0.00d
# Longest finished job: 404s 6.73m 0.11h 0.00d
# Submission to last job: 436s 7.27m 0.12h 0.01d
# Estimated complete: 0s 0.00m 0.00h 0.00d
# put the split maf results back together into a single maf file
# eliminate duplicate comments
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/splitRun
mkdir ../maf
# the sed edits take out partitioning name information from the comments
# so the multiple parts will condense to smaller number of lines
# this takes almost 2 hours, resulting in a bit over 150 Gb, with
# almost all chrom files over 1 Gb and chr2 approaching 10 Gb.
# HOWEVER, it is not actually necessary to preserve these comments;
# they are lost during the mafAddIRows step anyway
ls maf | sed -e "s/hg18_//; s/\..*//" | sort -u | while read C
do
echo "========== $C =============="
rm -f ../maf/${C}.maf.gz
head -q -n 1 maf/hg18_${C}.*.maf | sort -u > ../maf/${C}.maf
grep -h "^#" maf/hg18_${C}.*.maf | egrep -v "maf version=1|eof maf" | \
sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \
| sort -u >> ../maf/${C}.maf
grep -h -v "^#" `ls maf/hg18_${C}.*.maf | sort -t. -k2,2n` \
>> ../maf/${C}.maf
tail -q -n 1 maf/hg18_${C}.*.maf | sort -u >> ../maf/${C}.maf
done
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/hg18/multiz44way/maf
cd /hive/data/genomes/hg18/bed/multiz44way/maf
ln -s `pwd`/*.maf /gbdb/hg18/multiz44way/maf
# this generates an immense multiz44way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /data/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/hg18/multiz44way/maf hg18 multiz44way
# real 1m10.380s
# Loaded 1366931 mafs in 1 files from /gbdb/hg18/multiz44way/maf
# load summary table
time nice -n +19 cat /gbdb/hg18/multiz44way/maf/*.maf \
| hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz44waySummary stdin
# real 2m39.822
# Created 353577 summary blocks from 2852890 components and 1197504 mafs
# from stdin
# Gap Annotation
# prepare bed files with gap info
mkdir /hive/data/genomes/hg18/bed/multiz44way/anno
cd /hive/data/genomes/hg18/bed/multiz44way/anno
mkdir maf run
# most of these will already exist from previous multiple alignments
# remove the echo from in front of the twoBitInfo command to get them
# to run if this loop appears to be correct
for DB in `cat ../species.list`
do
CDIR="/hive/data/genomes/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `sed -e "s/hg18 //" ../../species.list`
do
echo "${DB} "
ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
# the annotation step requires large memory, run on memk nodes
ssh memk
cd /hive/data/genomes/hg18/bed/multiz44way/anno/run
ls ../../maf | sed -e "s/.maf//" > chr.list
cat << '_EOF_' > template
#LOOP
./anno.csh $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cat << '_EOF_' > anno.csh
#!/bin/csh -fe
set inMaf = ../../maf/$1.maf
set outMaf = ../maf/$1.maf
rm -f $outMaf
mafAddIRows -nBeds=nBeds $inMaf /hive/data/genomes/hg18/hg18.2bit $outMaf
'_EOF_'
# << happy emacs
chmod +x anno.csh
gensub2 chr.list single template jobList
para create jobList
# specify lots of ram to get one job per node
para -ram=30g push
ssh hgwdev
rm -fr /gbdb/hg18/multiz44way/maf
mkdir /gbdb/hg18/multiz44way/maf
cd /hive/data/genomes/hg18/bed/multiz44way/anno/maf
ln -s `pwd`/*.maf /gbdb/hg18/multiz44way/maf/
# by loading this into the table multiz44way, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /data/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/hg18/multiz44way/maf hg18 multiz44way
# with final set of quality annotated files:
# Loaded 33320838 mafs in 49 files from /gbdb/hg18/multiz44way/maf
# real 91m46.889s
# running on Irow annotated mafs Fri Nov 28 00:28:09 PST 2008
# Loaded 33320675 mafs in 49 files from /gbdb/hg18/multiz44way/maf
# real 236m15.279s
# running on bare bones mafs Thu Nov 27 19:29:44 PST 2008
# Loaded 33273351 mafs in 49 files from /gbdb/hg18/multiz44way/maf
# real 198m55.761s - while swarm busy with rebalancing
# from before the fixed multiz:
# Loaded 35154852 mafs in 49 files from /gbdb/hg18/multiz44way/maf
# real 71m5.594s
time nice -n +19 cat /gbdb/hg18/multiz44way/maf/*.maf \
| hgLoadMafSummary hg18 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz44waySummary stdin
# with the quality annotated mafs, and mem interference on hgwdev:
# Created 8514381 summary blocks from 600504256 components \
# and 33320838 mafs from stdin
# real 169m56.936s
# with the Irow annotations after the multiz fix:
# Created 8514380 summary blocks from 600499937
# components and 33298894 mafs from stdin
# real 184m42.893s
# user 70m44.431s
# sys 8m7.970s
# Created 8514078 summary blocks from 604683213 components
# and 35125649 mafs from stdin
# real 130m55.115s
# user 71m37.409s
# sys 8m5.110s
# by loading this into the table multiz44waySummary, it will replace
# the previously loaded table with the unannotated mafs
# remove the multiz44way*.tab files in this /data/tmp directory
# -rw-rw-r-- 1 1949221892 Nov 15 14:04 multiz44way.tab
# -rw-rw-r-- 1 417994189 Nov 15 20:57 multiz44waySummary.tab
wc -l multiz44way*.tab
# 33964377 multiz44way.tab
# 8514078 multiz44waySummary.tab
# 42478455 total
rm multiz44way*.tab
# create some downloads
mkdir -p /hive/data/genomes/hg18/bed/multiz44way/download/maf
cd /hive/data/genomes/hg18/bed/multiz44way/download/maf
time cp -p ../../anno/maf/chr*.maf .
# real 72m46.514s
# user 0m1.293s
# sys 5m15.981s
time gzip --rsyncable *.maf
# real 185m37.884s
# user 179m51.161s
# sys 3m48.016s
time md5sum *.gz > md5sum.txt
# real 3m59.009s
# user 1m19.338s
# sys 0m18.976s
#############################################################################
## Annotate 44-way multiple alignment with gene annotations
## (DONE - 2008-12-08,23 - Hiram)
# Gene frames
## survey all genomes to see what type of gene track to use
ssh hgwdev
mkdir /hive/data/genomes/hg18/bed/multiz44way/frames
cd /hive/data/genomes/hg18/bed/multiz44way/frames
#
cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
foreach db (`cat ../species.list`)
echo -n "${db}: "
set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
foreach table ($tables)
if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
$table == "knownGene") then
set count = `hgsql $db -N -e "select count(*) from $table"`
echo -n "${table}: ${count}, "
endif
end
set orgName = `hgsql hgcentraltest -N -e \
"select scientificName from dbDb where name='$db'"`
set orgId = `hgsql hg18 -N -e \
"select id from organism where name='$orgName'"`
if ($orgId == "") then
echo "Mrnas: 0"
else
set count = `hgsql hg18 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
echo "Mrnas: ${count}"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./showGenes.csh
# rearrange that output to create four sections:
# 1. knownGenes for hg18, mm9
# 2. ensGene for almost everything else
# 3. Mrnas for taeGut1, anoCar1, petMar1, calJac1
# 4. nothing for loxAfr2, dasNov2, choHof1
mkdir genes
# knownGene
for DB in hg18 mm9
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# ensGene
for DB in bosTau4 canFam2 cavPor3 danRer5 dipOrd1 echTel1 equCab2 \
eriEur1 felCat3 fr2 galGal3 gasAcu1 gorGor1 micMur1 monDom4 myoLuc1 \
ochPri2 ornAna1 oryCun1 oryLat2 otoGar1 panTro2 ponAbe2 proCap1 \
pteVam1 rheMac2 rn4 sorAra1 speTri1 tarSyr1 tetNig1 tupBel1 \
turTru1 vicPac1 xenTro2
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# use Mrnas for taeGut1 anoCar1 petMar1 calJac1
for DB in taeGut1 anoCar1 petMar1 calJac1
do
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
tmpMrna=${DB}.mrna.${tmpExt}
tmpCds=${DB}.cds.${tmpExt}
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$DB > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
rm -f $tmpExt
echo "${DB} done"
done
# leaving out loxAfr2, dasNov2, choHof1 since no gene preds there
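# mkCmd.sh below expects two one-database-per-line files, ensGene.list and
# mrna.list, which are not written out explicitly above.  A minimal sketch
# of one way to produce them from the same groupings used in the loops
# above (the knownGene dbs hg18/mm9 and the no-gene dbs are excluded):
echo "taeGut1 anoCar1 petMar1 calJac1" | tr ' ' '\n' > mrna.list
ls genes | sed -e "s/.gp.gz//" \
    | egrep -v "hg18|mm9|taeGut1|anoCar1|petMar1|calJac1" > ensGene.list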
# Create this command with this script:
cat << '_EOF_' > mkCmd.sh
#!/bin/sh
echo "time (cat ../quals/maf/*.maf | nice -n +19 genePredToMafFrames hg18 stdin stdout \\"
if [ ! -s genes/mm9.gp.gz ]; then
echo "missing genes/mm9.gp.gz"
exit 255
fi
echo "mm9 genes/mm9.gp.gz \\"
for D in `sort ensGene.list`
do
if [ ! -s genes/${D}.gp.gz ]; then
echo "missing genes/${D}.gp.gz"
exit 255
fi
echo -n "${D} genes/${D}.gp.gz "
done
echo "\\"
for D in `sort mrna.list`
do
if [ ! -s genes/${D}.gp.gz ]; then
echo "missing genes/${D}.gp.gz"
exit 255
fi
echo -n "${D} genes/${D}.gp.gz "
done
echo "\\"
echo " | gzip > multiz44way.mafFrames.gz) > frames.log 2>&1"
'_EOF_'
# << happy emacs
chmod +x ./mkCmd.sh
time (cat ../quals/maf/*.maf | nice -n +19 genePredToMafFrames hg18 stdin stdout \
mm9 genes/mm9.gp.gz \
bosTau4 genes/bosTau4.gp.gz canFam2 genes/canFam2.gp.gz cavPor3 genes/cavPor3.gp.gz danRer5 genes/danRer5.gp.gz dipOrd1 genes/dipOrd1.gp.gz echTel1 genes/echTel1.gp.gz equCab2 genes/equCab2.gp.gz eriEur1 genes/eriEur1.gp.gz felCat3 genes/felCat3.gp.gz fr2 genes/fr2.gp.gz galGal3 genes/galGal3.gp.gz gasAcu1 genes/gasAcu1.gp.gz micMur1 genes/micMur1.gp.gz monDom4 genes/monDom4.gp.gz myoLuc1 genes/myoLuc1.gp.gz ochPri2 genes/ochPri2.gp.gz ornAna1 genes/ornAna1.gp.gz oryCun1 genes/oryCun1.gp.gz oryLat2 genes/oryLat2.gp.gz otoGar1 genes/otoGar1.gp.gz panTro2 genes/panTro2.gp.gz ponAbe2 genes/ponAbe2.gp.gz proCap1 genes/proCap1.gp.gz pteVam1 genes/pteVam1.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz sorAra1 genes/sorAra1.gp.gz speTri1 genes/speTri1.gp.gz tarSyr1 genes/tarSyr1.gp.gz tetNig1 genes/tetNig1.gp.gz tupBel1 genes/tupBel1.gp.gz turTru1 genes/turTru1.gp.gz vicPac1 genes/vicPac1.gp.gz xenTro2 genes/xenTro2.gp.gz \
anoCar1 genes/anoCar1.gp.gz calJac1 genes/calJac1.gp.gz petMar1 genes/petMar1.gp.gz taeGut1 genes/taeGut1.gp.gz \
| gzip > multiz44way.mafFrames.gz) > frames.log 2>&1
# that doesn't work on any 32 Gb computer, requires much more memory
# turn it into a kluster job
ssh swarm
cd /hive/data/genomes/hg18/bed/multiz44way/frames
cat << '_EOF_' > runOne
#!/bin/csh -fe
set C = $1
set G = $2
cat ../quals/maf/${C}.maf | genePredToMafFrames hg18 stdin stdout \
${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
'_EOF_'
# << happy emacs
chmod +x runOne
ls ../quals/maf | sed -e "s/.maf//" > chr.list
ls genes | sed -e "s/.gp.gz//" | grep -v hg18 > gene.list
cat << '_EOF_' > template
#LOOP
runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
#ENDLOOP
'_EOF_'
# << happy emacs
mkdir parts
gensub2 chr.list gene.list template jobList
para -ram=8g create jobList
para try ... check ... push
# Completed: 1911 of 1911 jobs
# CPU time in finished jobs: 126751s 2112.52m 35.21h 1.47d 0.004 y
# IO & Wait Time: 2573543s 42892.38m 714.87h 29.79d 0.082 y
# Average job time: 1413s 23.55m 0.39h 0.02d
# Longest finished job: 6490s 108.17m 1.80h 0.08d
# Submission to last job: 11310s 188.50m 3.14h 0.13d
# see what it looks like in terms of number of annotations per DB:
find ./parts -type f | while read F
do
zcat ${F}
done | cut -f4 | sort | uniq -c | sort -n
#    165 anoCar1
#   2807 calJac1
#   3306 taeGut1
#   5416 petMar1
# 141256 tarSyr1
# 142346 vicPac1
# 163854 sorAra1
# 164475 galGal3
# 174150 felCat3
# 178531 oryCun1
# 178744 ornAna1
# 179511 turTru1
# 190622 eriEur1
# 191477 tupBel1
# 197338 panTro2
# 198063 speTri1
# 199541 micMur1
# 207391 ponAbe2
# 208629 rheMac2
# 208850 otoGar1
# 212751 myoLuc1
# 212857 dipOrd1
# 213343 proCap1
# 214972 echTel1
# 216367 monDom4
# 220724 ochPri2
# 223159 equCab2
# 227928 bosTau4
# 231351 cavPor3
# 231553 pteVam1
# 233980 mm9
# 234268 rn4
# 249016 canFam2
# 258191 xenTro2
# 315098 danRer5
# 365824 oryLat2
# 387739 fr2
# 423941 gasAcu1
# 549846 tetNig1
# load the resulting file
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/frames
find ./parts -type f | while read F
do
zcat ${F}
done | sort -k1,1 -k2,2n | hgLoadMafFrames hg18 multiz44wayFrames stdin
find ./parts -type f | while read F
do
zcat ${F}
done | sort -k1,1 -k2,2n > multiz44wayFrames.bed
featureBits -countGaps hg18 multiz44wayFrames.bed
# 62315198 bases of 3107677273 (2.005%) in intersection
featureBits -countGaps hg18 multiz28wayFrames
# 48236360 bases of 3107677273 (1.552%) in intersection
# enable the trackDb entries:
# frames multiz44wayFrames
# irows on
# appears to work OK
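# For reference, a minimal sketch of how those two settings sit in the
# track's trackDb.ra stanza; apart from "frames" and "irows", the fields
# shown here are only illustrative of a typical wigMaf conservation entry:
#   track multiz44way
#   shortLabel 44-Way Cons
#   longLabel Vertebrate Multiz Alignment & Conservation (44 Species)
#   type wigMaf 0.0 1.0
#   summary multiz44waySummary
#   frames multiz44wayFrames
#   irows on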
#########################################################################
# Phylogenetic tree from 44-way (2008-12-06 kate)
# Extract 4-fold degenerate sites from the coding regions of
# RefSeq Reviewed genes
ssh pk
cd /hive/data/genomes/hg18/bed/multiz44way
mkdir 4d
cd 4d
hgsql hg18 -Ne \
"select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and refSeqStatus.status='Reviewed' and mol='mRNA'" | cut -f 2-20 > refSeqReviewed.gp
wc -l refSeqReviewed.gp
#12684 refSeqReviewed.gp
genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
wc -l refSeqReviewedNR.gp
#7365 refSeqReviewedNR.gp
mkdir run
cd run
# chopped up mafs version
# run on swarm with -ram=8g
cat > 4d.csh << 'EOF'
set infile = $1
set outfile = $2
set c = `echo $1 | sed 's/^.*hg18_\(chr[^.][^.]*\).*.maf/\1/'`
echo $c
cd /scratch/tmp
# 'clean' maf
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
awk -v C=$c '$2 == C {print}' /cluster/data/hg18/bed/multiz44way/4d/refSeqReviewedNR.gp > $c.gp
set PHASTBIN=/cluster/bin/phast.2008-11-30
$PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss
$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile
#rm -f $c.gp $c.maf $c.ss
'EOF'
# whole chrom mafs version, using a newer, memory-efficient version of
# phast from Melissa Hubisz at Cornell (mjhubisz@gmail.com)
cat > 4d.csh << 'EOF'
set c = $1
set infile = $2
set outfile = $3
echo $c
cd /scratch/tmp
# 'clean' maf
perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
awk -v C=$c '$2 == C {print}' /cluster/data/hg18/bed/multiz44way/4d/refSeqReviewedNR.gp > $c.gp
set PHASTBIN=/cluster/bin/phast.2008-12-18
$PHASTBIN/msa_view --4d --features --do-cats 3 $c.gp -i MAF $c.maf -o SS > $c.ss
$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile
rm -f $c.gp $c.maf $c.ss
'EOF'
ls -1S /hive/data/genomes/hg18/bed/multiz44way/maf/*.maf | \
grep -v random | grep -v chrM | grep -v hap > in.lst
cat << 'EOF' > template
#LOOP
csh 4d.csh $(root1) {check in line+ $(path1)} {check out line+ /cluster/data/hg18/bed/multiz44way/4d/mfa/$(root1).mfa}
#ENDLOOP
'EOF'
# << this line makes emacs coloring happy
cat << 'EOF' > template
#LOOP
csh 4d.csh $(root1) {check in line+ $(path1)} {check out line+ /cluster/data/hg18/bed/multiz44way/4d/mfa2/$(root1).mfa}
#ENDLOOP
'EOF'
# << this line makes emacs coloring happy
gensub2 in.lst single template stdout | tac > jobList
rm -fr /cluster/data/hg18/bed/multiz44way/4d/mfa
mkdir /cluster/data/hg18/bed/multiz44way/4d/mfa
para create jobList
para try
para check
para push
# combine mfa files
cd ..
sed -e "s/ /,/g" ../species.list > species.lst
/cluster/bin/phast/msa_view --aggregate `cat species.lst` mfa/*.mfa | \
sed s/"> "/">"/ > 4d.all.mfa
sed -e 's/,monDom4.*//' species.lst > placentals.lst
/cluster/bin/phast/msa_view --aggregate `cat placentals.lst` mfa/*.mfa | \
sed s/"> "/">"/ > 4d.placentals.mfa
# use phyloFit to create tree model (output is phyloFit.mod)
set PHASTBIN=/cluster/bin/phast.2008-12-18
$PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA --subst-mod REV --tree ../tree-commas.nh 4d.all.mfa
# started at 5:50pm
# ended at 7:27 => ~90 min on swarm
mv phyloFit.mod phyloFit.all.mod
grep TREE phyloFit.all.mod | sed 's/TREE\:\ //' > tree_4d.44way.nh
$PHASTBIN/tree_doctor \
--prune=monDom4,ornAna1,taeGut1,petMar1,galGal3,anoCar1,xenTro2,gasAcu1,danRer5,tetNig1,fr2,oryLat2 \
tree_4d.44way.nh > tree_4d.44way.placental.nh
# chrX-only for placental subset (requested by 2X project)
set PHASTBIN=/cluster/bin/phast.2008-12-18
$PHASTBIN/phyloFit --EM --precision MED --msa-format FASTA --subst-mod REV --tree ../tree-commas.nh --out-root 4d.chrX mfa/chrX.mfa
#############################################################################
# phastCons 44-way (DONE - 2008-12-23 - 2009-01-02 - Hiram)
# split 44way mafs into 10M chunks and generate sufficient statistics
# files for # phastCons
ssh memk
mkdir -p /hive/data/genomes/hg18/bed/multiz44way/cons/msa.split
mkdir /hive/data/genomes/hg18/bed/multiz44way/cons/ss
cd /hive/data/genomes/hg18/bed/multiz44way/cons/msa.split
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/hg18/bed/multiz44way/maf/$c.maf
set WINDOWS = /hive/data/genomes/hg18/bed/multiz44way/cons/ss/$c
rm -fr $WINDOWS
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
twoBitToFa -seq=$c /hive/data/genomes/hg18/hg18.2bit hg18.$c.fa
/cluster/bin/phast/$MACHTYPE/msa_split $MAF -i MAF \
-M hg18.$c.fa -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
rm -f hg18.$c.fa
popd > /dev/null
date >> $c.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
# << happy emacs
# do the easy ones first to see some immediate results
ls -1S -r ../maf | sed -e "s/.maf//" > maf.list
gensub2 maf.list single template jobList
para -ram=32g create jobList
para try ... check ... etc
# this takes a really long time. memk was down to 2 usable
# machines - got it finished manually on a combination of hgwdevnew CPUs
# and other machines
# Estimate phastCons parameters
# experimented with this as a parasol job on hgwdevnew to try a number
# of SS files. With a command of:
/cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \
--tree "(((((((((((((((((hg18,panTro2),gorGor1),ponAbe2),rheMac2),calJac1),tarSyr1),(micMur1,otoGar1)),tupBel1),(((((mm9,rn4),dipOrd1),cavPor3),speTri1),(oryCun1,ochPri2))),(((vicPac1,(turTru1,bosTau4)),((equCab2,(felCat3,canFam2)),(myoLuc1,pteVam1))),(eriEur1,sorAra1))),(((loxAfr2,proCap1),echTel1),(dasNov2,choHof1))),monDom4),ornAna1),((galGal3,taeGut1),anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)),petMar1)" \
--out-root=$OUT/starting_tree
# running over the input files ../ss/*/*.ss results to
#.../genomes/hg18/bed/multiz44way/cons/startingTree/result/*/starting-tree.mod
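# A sketch of the kind of per-chunk wrapper such a parasol experiment
# would use (the script layout, result/ directory and the $SS/$OUT
# conventions are assumptions; the phyloFit call is the one shown above):
#   #!/bin/sh
#   SS=$1                             # one ss chunk, e.g. ../ss/chr1/chr1.1-10000000.ss
#   OUT=result/`basename ${SS} .ss`   # one result directory per chunk
#   TREE="..."                        # the full 44-species tree string from above
#   mkdir -p ${OUT}
#   /cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \
#       --tree "${TREE}" --out-root=${OUT}/starting_tree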
# add up the C and G:
find ./result -type f | xargs ls -rt | while read F
do
D=`dirname $F`
echo -n `basename $D`" - "
grep BACKGROUND ${F} | awk '{printf "%0.3f\n", $3 + $4;}'
done
# counting number of species seen in the maf file:
find ./result -type f | xargs ls -rt | while read F
do
D=`dirname $F`
echo -n `basename $D`" - "
grep TREE $F | sed -e \
"s/TREE: //; s/(//g; s/)//g; s/[0-9].[0-9][0-9][0-9][0-9][0-9][0-9]//g; s/://g" | tr ',' '\n' | wc -l
done
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh swarm
mkdir -p /hive/data/genomes/hg18/bed/multiz44way/cons/run.cons
cd /hive/data/genomes/hg18/bed/multiz44way/cons/run.cons
# there are going to be several different phastCons runs using
# this same script.  They trigger off of the current working directory
# name, $cwd:t, which is the "grp" in this script.  It is one of:
# all primates euarchontoglires placental
# (e.g. when run from the .../cons/placental directory, the script picks up
# placental/placental.mod and placental/placental.non-inf)
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast/x86_64
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set cons = /hive/data/genomes/hg18/bed/multiz44way/cons
set tmp = $cons/tmp/$f
mkdir -p $tmp
set ssSrc = $cons
if (-s $cons/$grp/$grp.non-inf) then
    ln -s $cons/$grp/$grp.mod $tmp
    ln -s $cons/$grp/$grp.non-inf $tmp
    ln -s $ssSrc/ss/$c/$f.ss $tmp
else
    ln -s $ssSrc/ss/$c/$f.ss $tmp
    ln -s $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p pp/$c bed/$c
sleep 4
touch pp/$c bed/$c
rm -f pp/$c/$f.pp
rm -f bed/$c/$f.bed
mv $tmp/$f.pp pp/$c
mv $tmp/$f.bed bed/$c
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
# this template will serve for all runs
# root1 == chrom name, file1 == ss file name without .ss suffix
cat << '_EOF_' > template
#LOOP
../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > ss.list
# run for all species
cd /hive/data/genomes/hg18/bed/multiz44way/cons
mkdir -p all
cd all
# Using Kate's .mod tree
cp -p ../../4d/44way.all.mod ./all.mod
gensub2 ../run.cons/ss.list single ../run.cons/template jobList
para -ram=8g create jobList
para try ... check ... push ... etc.
# XXX - running Tue Jan 13 22:19:21 PST 2009
# Completed: 322 of 322 jobs
# CPU time in finished jobs: 47406s 790.10m 13.17h 0.55d 0.002 y
# IO & Wait Time: 29902s 498.37m 8.31h 0.35d 0.001 y
# Average job time: 240s 4.00m 0.07h 0.00d
# Longest finished job: 354s 5.90m 0.10h 0.00d
# Submission to last job: 536s 8.93m 0.15h 0.01d
# create Most Conserved track
cd /hive/data/genomes/hg18/bed/multiz44way/cons
cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
# load into database
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/all
time nice -n +19 hgLoadBed hg18 phastConsElements44way mostConserved.bed
# Loaded 4878296 elements of size 5
# real 2m3.414s
# Try for 5% overall cov, and 70% CDS cov
# --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg18 -enrichment refGene:cds phastConsElements44way
# refGene:cds 1.144%, mostConserved.bed 4.973%,
# both 0.854%, cover 74.62%, enrich 15.01x
# --rho .31 --expected-length 45 --target-coverage .3
# refGene:cds 1.144%, phastConsElements44way 4.706%,
# both 0.824%, cover 72.07%, enrich 15.31x
# --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg18 -enrichment knownGene:cds phastConsElements44way
# knownGene:cds 1.205%, mostConserved.bed 4.973%,
# both 0.874%, cover 72.55%, enrich 14.59x
# --rho .31 --expected-length 45 --target-coverage .3
# knownGene:cds 1.205%, phastConsElements44way 4.706%,
# both 0.844%, cover 70.05%, enrich 14.88x
featureBits hg18 -enrichment refGene:cds phastConsElements28way
# refGene:cds 1.144%, phastConsElements28way 4.920%,
# both 0.858%, cover 74.96%, enrich 15.24x
featureBits hg18 -enrichment knownGene:cds phastConsElements28way
# knownGene:cds 1.205%, phastConsElements28way 4.920%,
# both 0.878%, cover 72.88%, enrich 14.81x
# Create merged posterior probability file and wiggle track data files
cd /hive/data/genomes/hg18/bed/multiz44way/cons/all
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p downloads
for D in pp/chr*
do
C=${D/pp\/}
out=downloads/${C}.phastCons44way.wigFix.gz
echo "${D} > ${C}.phastCons44way.wigFix.gz"
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
gzip > ${out}
done
'_EOF_'
# << happy emacs
chmod +x gzipAscii.sh
time nice -n +19 ./gzipAscii.sh
# real 30m7.228s
# encode those files into wiggle data
zcat downloads/*.wigFix.gz \
| wigEncode stdin phastCons44way.wig phastCons44way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 22m54.291s
# Load gbdb and database with wiggle.
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/all
ln -s `pwd`/phastCons44way.wib /gbdb/hg18/multiz44way/phastCons44way.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44way phastCons44way.wig
# real 1m13.681s
# Create histogram to get an overview of all the data
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/all
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44way > histogram.data 2>&1
# real 8m6.841s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44way track"
set xlabel " phastCons44way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
########################################################################
### Create a phastCons data set for Primates
# setup primates-only run
ssh swarm
mkdir /hive/data/genomes/hg18/bed/multiz44way/cons/primates
cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
# primates-only: exclude all but these for phastCons tree:
/cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
--prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1 \
> primates.mod
# and place the removed ones in the non-inf file so phastCons will
# truly ignore them:
echo "tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1,monDom4,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat2,danRer5,petMar1" \
> primates.non-inf
gensub2 ../run.cons/ss.list single ../run.cons/template jobList
para -ram=8g create jobList
para try ... check ... push ... etc.
# bed/chr18_random/chr18_random.1-4262.bed is empty
# bed/chr19_random/chr19_random.1-301858.bed is empty
# bed/chr21/chr21.1-10000000.bed is empty
# bed/chrM/chrM.1-16571.bed is empty
# the jobs that fail have messages like this:
# bed/chrM/chrM.1-16571.bed is empty
# WARNING: No match for name "tupBel1" in alignment.
# WARNING: No match for name "sorAra1" in alignment.
# Completed: 318 of 322 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 20253s 337.54m 5.63h 0.23d 0.001 y
# IO & Wait Time: 33093s 551.56m 9.19h 0.38d 0.001 y
# Average job time: 168s 2.80m 0.05h 0.00d
# Longest finished job: 249s 4.15m 0.07h 0.00d
# Submission to last job: 282s 4.70m 0.08h 0.00d
# create Most Conserved track
cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# load into database
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
time nice -n +19 hgLoadBed hg18 phastConsElements44wayPrimates \
mostConserved.bed
# Loaded 808218 elements of size 5
# real 0m16.817s
# verify coverage
featureBits hg18 phastConsElements44wayPrimates
# 113268574 bases of 2881515245 (3.931%) in intersection
# --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg18 -enrichment refGene:cds phastConsElements44wayPrimates
# refGene:cds 1.144%, phastConsElements44wayPrimates 4.222%,
# both 0.756%, cover 66.07%, enrich 15.65x
featureBits hg18 -enrichment knownGene:cds phastConsElements44wayPrimates
# knownGene:cds 1.205%, phastConsElements44wayPrimates 4.222%,
# both 0.769%, cover 63.84%, enrich 15.12x
# Create the downloads .pp files, from which the phastCons wiggle data
# is calculated
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
mkdir downloads
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
for D in pp/chr*
do
C=${D/pp\//}
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
> downloads/${C}.primates.wigFix.gz
echo $D $C done
done
'_EOF_'
# << happy emacs
time nice -n +19 ./gzipAscii.sh
# real 36m13.492s
# Create merged posterior probability file and wiggle track data files
zcat downloads/chr*.wigFix.gz \
| wigEncode stdin phastCons44wayPrimates.wig phastCons44wayPrimates.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 24m15.688s
## load table with wiggle data
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/primates
ln -s `pwd`/phastCons44wayPrimates.wib \
/gbdb/hg18/multiz44way/phastCons44wayPrimates.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44wayPrimates phastCons44wayPrimates.wig
# real 0m48.942s
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44wayPrimates > histogram.data 2>&1
# real 5m50.154s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Hg18 Histogram phastCons44wayPrimates track"
set xlabel " phastCons44wayPrimates score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
########################################################################
### Create a phastCons data set for Euarchontoglires
# setup euarchontoglires-only run
ssh swarm
cd /hive/data/genomes/hg18/bed/multiz44way/cons
mkdir euarchontoglires
cd euarchontoglires
# euarchontoglires-only: exclude all but these for phastCons tree:
/cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
--prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2 \
> euarchontoglires.mod
# and place the removed ones in the non-inf file so phastCons will
# truly ignore them:
echo "vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1,monDom4,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat2,danRer5,petMar1" \
> euarchontoglires.non-inf
gensub2 ../run.cons/ss.list single ../run.cons/template jobList
para -ram=8g create jobList
para try ... check ... push ... etc.
# Two of these jobs fail to produce any output in the bed file:
# I believe this is because there is a missing sequence in these files
# compared to the ones specified in euarchontoglires.mod:
# bed/chr18_random/chr18_random.1-4262.bed is empty
# bed/chr19_random/chr19_random.1-301858.bed is empty
# Completed: 320 of 322 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 25869s 431.14m 7.19h 0.30d 0.001 y
# IO & Wait Time: 34404s 573.41m 9.56h 0.40d 0.001 y
# Average job time: 188s 3.14m 0.05h 0.00d
# Longest finished job: 272s 4.53m 0.08h 0.00d
# Submission to last job: 309s 5.15m 0.09h 0.00d
# create Most Conserved track
cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires
cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# load into database
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires
time nice -n +19 hgLoadBed hg18 phastConsElements44wayEuarch \
mostConserved.bed
# Loaded 1623656 elements of size 5
# real 4m15.125s
# verify coverage
featureBits hg18 phastConsElements44wayEuarch
# 109221588 bases of 2881515245 (3.790%) in intersection
# --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg18 -enrichment refGene:cds phastConsElements44wayEuarch
# refGene:cds 1.144%, mostConserved.bed 3.696%,
# both 0.822%, cover 71.87%, enrich 19.45x
# --rho 0.31 --expected-length 45 --target-coverage 0.3
# refGene:cds 1.144%, phastConsElements44wayEuarch 3.790%,
# both 0.822%, cover 71.79%, enrich 18.94x
# --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg18 -enrichment knownGene:cds phastConsElements44wayEuarch
# knownGene:cds 1.205%, mostConserved.bed 3.696%,
# both 0.839%, cover 69.59%, enrich 18.83x
# --rho 0.31 --expected-length 45 --target-coverage 0.3
# knownGene:cds 1.205%, phastConsElements44wayEuarch 3.790%,
# both 0.838%, cover 69.51%, enrich 18.34x
# Create the downloads .pp files, from which the phastCons wiggle data
# is calculated
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires
mkdir downloads
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
for D in pp/chr*
do
C=${D/pp\//}
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
> downloads/${C}.euarchontoglires.wigFix.gz
echo $D $C done
done
'_EOF_'
# << happy emacs
time nice -n +19 ./gzipAscii.sh
# real 26m54.263s
# Create merged posterior probability file and wiggle track data files
zcat downloads/chr*.wigFix.gz \
| wigEncode stdin phastCons44wayEuarch.wig phastCons44wayEuarch.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 18m15.693s
## load table with wiggle data
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/euarchontoglires
ln -s `pwd`/phastCons44wayEuarch.wib \
/gbdb/hg18/multiz44way/phastCons44wayEuarch.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44wayEuarch phastCons44wayEuarch.wig
# real 0m57.590s
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44wayEuarch > histogram.data 2>&1
# real 6m37.512s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Hg18 Histogram phastCons44wayEuarch track"
set xlabel " phastCons44wayEuarch score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
########################################################################
### Create a phastCons data set for Placentals
# setup placental-only run
ssh swarm
mkdir /hive/data/genomes/hg18/bed/multiz44way/cons/placental
cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental
# placental-only: exclude all but these for phastCons tree:
/cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
--prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1 \
> placental.mod
# and place the removed ones in the non-inf file so phastCons will
# truly ignore them:
echo "monDom4,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat2,danRer5,petMar1" \
> placental.non-inf
gensub2 ../run.cons/ss.list single ../run.cons/template jobList
para -ram=8g create jobList
para try ... check ... push ... etc.
# Two of these jobs fail to produce any output:
# bed/chr18_random/chr18_random.1-4262.bed is empty
# bed/chr19_random/chr19_random.1-301858.bed is empty
# Completed: 320 of 322 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 38258s 637.63m 10.63h 0.44d 0.001 y
# IO & Wait Time: 34704s 578.40m 9.64h 0.40d 0.001 y
# Average job time: 228s 3.80m 0.06h 0.00d
# Longest finished job: 313s 5.22m 0.09h 0.00d
# Submission to last job: 1030s 17.17m 0.29h 0.01d
# create Most Conserved track
cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# load into database
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental
time nice -n +19 hgLoadBed hg18 phastConsElements44wayPlacental \
mostConserved.bed
# Loaded 3962527 elements of size 5
# real 3m28.564s
# verify coverage
featureBits hg18 phastConsElements44wayPlacental
# 119635433 bases of 2881515245 (4.152%) in intersection
# --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg18 -enrichment refGene:cds phastConsElements44wayPlacental
# refGene:cds 1.144%, phastConsElements44wayPlacental 4.329%,
# both 0.840%, cover 73.41%, enrich 16.96x
featureBits hg18 -enrichment knownGene:cds phastConsElements44wayPlacental
# knownGene:cds 1.205%, phastConsElements44wayPlacental 4.329%,
# both 0.858%, cover 71.17%, enrich 16.44x
# Create the downloads .pp files, from which the phastCons wiggle data
# is calculated
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental
mkdir downloads
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
for D in pp/chr*
do
C=${D/pp\//}
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
> downloads/${C}.placental.wigFix.gz
echo $D $C done
done
'_EOF_'
# << happy emacs
time nice -n +19 ./gzipAscii.sh
# real 22m12.762s
# Create merged posterior probability file and wiggle track data files
zcat downloads/chr*.wigFix.gz \
| wigEncode stdin phastCons44wayPlacental.wig \
phastCons44wayPlacental.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 37m20.176s
## load table with wiggle data
ssh hgwdev
cd /hive/data/genomes/hg18/bed/multiz44way/cons/placental
ln -s `pwd`/phastCons44wayPlacental.wib \
/gbdb/hg18/multiz44way/phastCons44wayPlacental.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44wayPlacental phastCons44wayPlacental.wig
# real 1m16.900s
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44wayPlacental > histogram.data 2>&1
# real 8m15.623s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44wayPlacental track"
set xlabel " phastCons44wayPlacental score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
#########################################################################
# Update phastCons44way tables from Adam (DONE - 2009-05-22 - Hiram)
mkdir /hive/data/genomes/hg18/bed/multiz44way/chrX.phastCons
cd /hive/data/genomes/hg18/bed/multiz44way/chrX.phastCons
mkdir primates
cd primates
wget --timestamping \
ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phastCons/primates/*
cd ..
mkdir placental
cd placental
wget --timestamping \
ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phastCons/placental/*
cd ..
mkdir all
cd all
wget --timestamping \
ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phastCons/all/*
cd ..
zcat all/*.wigFix.gz \
| wigEncode stdin phastCons44way_v2.wig phastCons44way_v2.wib
zcat primates/*.wigFix.gz \
| wigEncode stdin phastCons44wayPrimates_v2.wig phastCons44wayPrimates_v2.wib
zcat placental/*.wigFix.gz \
| wigEncode stdin phastCons44wayPlacental_v2.wig phastCons44wayPlacental_v2.wib
ln -s `pwd`/*.wib /gbdb/hg18/multiz44way
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44way_v2 phastCons44way_v2.wig
# real 0m43.022s
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44wayPrimates_v2 phastCons44wayPrimates_v2.wig
# real 0m43.660s
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 \
phastCons44wayPlacental_v2 phastCons44wayPlacental_v2.wig
# real 0m44.607s
time nice -n +19 hgLoadBed hg18 phastConsElements44way_v2 \
all/mostConserved.bed
# Loaded 4779670 elements of size 5
# real 2m10.975s
time nice -n +19 hgLoadBed hg18 phastConsElements44wayPrimates_v2 \
primates/mostConserved.bed
# Loaded 785075 elements of size 5
# real 0m21.619s
time nice -n +19 hgLoadBed hg18 phastConsElements44wayPlacental_v2 \
placental/mostConserved.bed
# Loaded 3862854 elements of size 5
# real 1m41.223s
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44wayPlacental_v2 > placental.histogram.data 2>&1
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44wayPrimates_v2 > primates.histogram.data 2>&1
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=hg18 phastCons44way_v2 > vertebrate.histogram.data 2>&1
cat << '_EOF_' | gnuplot > placental.histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44wayPlacental_v2 track"
set xlabel " phastCons44wayPlacental_v2 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "placental.histogram.data" using 2:5 title " RelFreq" with impulses, \
"placental.histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display placental.histo.png &
cat << '_EOF_' | gnuplot > primates.histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44wayPrimates_v2 track"
set xlabel " phastCons44wayPrimates_v2 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "primates.histogram.data" using 2:5 title " RelFreq" with impulses, \
"primates.histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display primates.histo.png &
cat << '_EOF_' | gnuplot > vertebrate.histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human Hg18 Histogram phastCons44way_v2 track"
set xlabel " phastCons44way_v2 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "vertebrate.histogram.data" using 2:5 title " RelFreq" with impulses, \
"vertebrate.histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display vertebrate.histo.png &
#########################################################################
# phyloP conservation for 44-way (2009-01-05 kate)
#
# Vertebrate, Placental
# Also doing Euarchontoglires, since Hiram did a phastCons run for that subset
#
# Using the newer LRT scoring method (replaces SPH), based on the
# scoring-method experiments above (compared to the SCORE method).
# Using phast from Adam's student Melissa Hubisz, with fixes needed for LRT scoring
# Will replace with version from CVS if/when these fixes are integrated
# PHAST version is 0.9.9.9b
# split SS files into 1M chunks (tried 10M used for phastCons, and these
# took 5hrs/chunk w/ LRT scoring)
ssh swarm
cd /cluster/data/hg18/bed/multiz44way
mkdir consPhyloP
cd consPhyloP
mkdir ss run.split
cd run.split
cat << 'EOF' > doSplit.csh
set c = $1
set d = /cluster/data/hg18/bed/multiz44way
set in = $d/cons/ss
set out = $d/consPhyloP/ss
set PHASTBIN = /cluster/bin/phast.2008-12-18
@ i=0
foreach f (`ls $in/$c/*.ss | sort -n -t\. -k2`)
@ i++
mkdir -p $out/$c/$i
$PHASTBIN/msa_split $f -i SS -o SS \
-r $out/$c/$i/$c.$i -w 1000000,0 -I 1000 -B 5000
end
echo "Done" >> $out/$c.done
'EOF'
# << happy emacs
set d = /cluster/data/hg18/bed/multiz44way/consPhyloP
set JOBS = $d/run.split/jobList
rm -f $JOBS
touch $JOBS
foreach c (`awk '{print $1}' /cluster/data/hg18/chrom.sizes`)
echo "csh doSplit.csh $c {check out line+ $d/ss/$c.done}" >> $JOBS
end
para create jobList
# 49 jobs
para try
para check
para push
para time
# run phyloP with score=LRT
ssh swarm
cd /cluster/data/hg18/bed/multiz44way/consPhyloP
mkdir run.phyloP
cd run.phyloP
# Adjust model file base composition background and rate matrix to be
# representative of whole-genome (.41 -- as was done for ENCODE)
# using utility, 'modFreqs' from PHAST package
set PHASTBIN = /cluster/bin/phast.2008-12-18
set gc = `grep BACKGROUND /cluster/data/hg18/bed/multiz17way/cons/elliotsEncode.mod | awk '{printf "%0.3f\n", $3 + $4}'`
echo $gc
# .410
# NOTE: this corresponds well to Hiram's GC values from his phyloFit runs
# on the 44-way ss files
$PHASTBIN/modFreqs ../../4d/phyloFit.all.mod $gc > ../../4d/44way.all.mod
# repeat for chrX only tree
cd /cluster/data/hg18/bed/multiz44way/4d
$PHASTBIN/modFreqs 4d.chrX.mod $gc > 44way.chrX.mod
ln -s `pwd`/44way.chrX.mod /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way
cat > doPhyloP.csh << 'EOF'
set f = $1
set out = $2
set c = $f:r:r
set n = $f:r:e
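# $f is an ss chunk name like chr1.1.1-1000000 (from msa_split -r);
# $c = chromosome, $n = chunk directory number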
set tmp = /scratch/tmp/$f
rm -fr $tmp
mkdir -p $tmp
cp -p /cluster/data/hg18/bed/multiz44way/consPhyloP/ss/$c/$n/$f.ss $tmp
cp -p tree.mod $tmp
pushd $tmp > /dev/null
set PHASTBIN = /cluster/bin/phast.2008-12-18
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $c \
-i SS tree.mod $f.ss > $f.wig
popd > /dev/null
mkdir -p $out:h
mv $tmp/$f.wig $out
rm -fr $tmp
'EOF'
# Create list of chunks
pushd /cluster/data/hg18/bed/multiz44way/consPhyloP/ss
ls chr*/*/chr*.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \
/cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/in.list
popd > /dev/null
# need to fill in chr8, neglected in main run
pushd /cluster/data/hg18/bed/multiz44way/consPhyloP/ss
ls chr8/*/chr*.*.ss | sed -e 's/.ss$//' -e 's/^\.\///' > \
/cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/in.chr8.list
popd > /dev/null
# Create template file
# file1 == $chr/$chunk/file name without .ss suffix
cat > template << 'EOF'
#LOOP
csh ../doPhyloP.csh $(file1) {check out line+ wig/$(dir1)/$(file1).wig}
#ENDLOOP
'EOF'
# setup run for all species
mkdir all
cd all
cp ../../../4d/44way.all.mod tree.mod
rm -fr wig
mkdir wig
# << happy emacs
gensub2 ../in.list single ../template jobList
# 2823 jobs
para create jobList
para try
para check
para push
para time
#Completed: 2823 of 2823 jobs
#CPU time in finished jobs: 4691641s 78194.02m 1303.23h 54.30d 0.149 y
#IO & Wait Time: 171343s 2855.71m 47.60h 1.98d 0.005 y
#Average job time: 1723s 28.71m 0.48h 0.02d
#Longest finished job: 2451s 40.85m 0.68h 0.03d
#Submission to last job: 6055s 100.92m 1.68h 0.07d
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
# check for clean dir here -- chr* will match garbage if it's there
cat > listWig.csh << 'EOF'
foreach c (`ls -d chr*`)
foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`)
ls -1 $d/*.wig | sort -n -t\. -k3
end
end
'EOF'
cd all/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayAll.wig phyloP44wayAll.wib
# Reloaded to include chr8 (2009-01-15 kate)
#Converted stdin, upper limit 7.13, lower limit -15.41
# Load gbdb and database with wiggle.
ln -s \
/cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/all/wig/phyloP44wayAll.wib \
/gbdb/hg18/multiz44way/phyloP44wayAll.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayAll phyloP44wayAll.wig
# placental-only: exclude all but these:
cd /cluster/data/hg18/bed/multiz44way/4d
set PHASTBIN = /cluster/bin/phast.2008-12-18
$PHASTBIN/tree_doctor 44way.all.mod \
--prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,\
micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2,\
vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,\
sorAra1,loxAfr2,proCap1,echTel1,dasNov2,choHof1 \
> 44way.placental.mod
cd ../consPhyloP/run.phyloP
mkdir placental
cd placental
cp ../../../4d/44way.placental.mod tree.mod
mkdir wig
gensub2 ../in.list single ../template jobList
# 2823 jobs
para create jobList
para try
para check
para push
para time
#Completed: 2823 of 2823 jobs
#CPU time in finished jobs: 3358003s 55966.71m 932.78h 38.87d 0.106 y
#IO & Wait Time: 142664s 2377.74m 39.63h 1.65d 0.005 y
#Average job time: 1240s 20.67m 0.34h 0.01d
#Longest finished job: 1781s 29.68m 0.49h 0.02d
#Submission to last job: 4383s 73.05m 1.22h 0.05d
# load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/placental/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayPlacMammal.wig phyloP44wayPlacMammal.wib
#Converted stdin, upper limit 3.46, lower limit -14.42
# Load gbdb and database with wiggle.
ln -s \
/cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/placental/wig/phyloP44wayPlacMammal.wib \
/gbdb/hg18/multiz44way/phyloP44wayPlacMammal.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPlacMammal phyloP44wayPlacMammal.wig
cd /cluster/data/hg18/bed/multiz44way/4d
set PHASTBIN = /cluster/bin/phast.2008-12-18
$PHASTBIN/tree_doctor 44way.all.mod \
--prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun1,ochPri2 \
> 44way.euarchontoglires.mod
# euarchontoglires only: exclude all but these:
cd ../consPhyloP/run.phyloP
mkdir euarch
cd euarch
cp ../../../4d/44way.euarchontoglires.mod tree.mod
mkdir wig
gensub2 ../in.list single ../template jobList
# 2823 jobs
para create jobList
para try
para check
para push
para time
#Completed: 2823 of 2823 jobs
#CPU time in finished jobs: 1646910s 27448.49m 457.47h 19.06d 0.052 y
#IO & Wait Time: 94310s 1571.84m 26.20h 1.09d 0.003 y
#Average job time: 617s 10.28m 0.17h 0.01d
#Longest finished job: 901s 15.02m 0.25h 0.01d
#Submission to last job: 2127s 35.45m 0.59h 0.02d
# process results and load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/euarch/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayEuarch.wig phyloP44wayEuarch.wib
#Converted stdin, upper limit 2.03, lower limit -9.78
ln -s \
/cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/euarch/wig/phyloP44wayEuarch.wib \
/gbdb/hg18/multiz44way/phyloP44wayEuarch.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayEuarch phyloP44wayEuarch.wig
# primates only: exclude all but these:
cd /cluster/data/hg18/bed/multiz44way/4d
set PHASTBIN = /cluster/bin/phast.2008-12-18
$PHASTBIN/tree_doctor 44way.all.mod \
--prune-all-but=hg18,panTro2,gorGor1,ponAbe2,rheMac2,calJac1,tarSyr1,micMur1,otoGar1 \
> 44way.primate.mod
cd ../consPhyloP/run.phyloP
mkdir primate
cd primate
cp ../../../4d/44way.primate.mod tree.mod
mkdir wig
gensub2 ../in.list single ../template jobList
para create jobList
# 2823 jobs
para try
para check
para push
# quick!
para time
#Completed: 2823 of 2823 jobs
#CPU time in finished jobs: 895998s 14933.30m 248.89h 10.37d 0.028 y
#IO & Wait Time: 66654s 1110.90m 18.52h 0.77d 0.002 y
#Average job time: 341s 5.68m 0.09h 0.00d
#Longest finished job: 503s 8.38m 0.14h 0.01d
#Submission to last job: 1190s 19.83m 0.33h 0.01d
# process results and load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayPrimate.wig phyloP44wayPrimate.wib
#Converted stdin, upper limit 0.99, lower limit -8.17
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate/wig/phyloP44wayPrimate.wib /gbdb/hg18/multiz44way/phyloP44wayPrimate.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimate phyloP44wayPrimate.wig
# get stats
cd run.phyloP/all
hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayAll > stats.out
hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayAll | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out
cd ../placental
hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayPlacMammal > stats.out
hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayPlacMammal | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out
cd ../euarch
hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayEuarch > stats.out
hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayEuarch | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out
cd ../primate
hgWiggle -db=hg18 -verbose=2 -doStats phyloP44wayPrimate > stats.out
hgWiggle -db=hg18 -chr=chr20 -rawDataOut phyloP44wayPrimate | textHistogram -real stdin -minVal=-20 -maxBinCount=30 >&! histo.out
# Downloads
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
cat > listWigsByChrom.csh << 'EOF'
set c = $1
foreach d (`ls -d $c/[1-9]* | sort -t/ -k2 -n`)
ls -1 $d/*.wig | sort -n -t\. -k3
end
'EOF'
cat > downloads.csh << 'EOF'
mkdir ../downloads
foreach c (`ls -d chr*`)
echo $c
csh ../../listWigsByChrom.csh $c > ../downloads/$c.lst
csh ../../listWigsByChrom.csh $c | xargs cat | gzip -c > ../downloads/$c.$1.wigFix.gz
end
cd ../downloads
md5sum *.wigFix.gz > md5sum.txt
'EOF'
cd all/wig
csh ../../downloads.csh phyloP44way >&! downloads.log &
cd ../../placental/wig
csh ../../downloads.csh phyloP44way.placental >&! downloads.log &
cd ../../primate/wig
csh ../../downloads.csh phyloP44way.primate >&! downloads.log &
# create web downloads dir and add symlinks to files
cd ../../
mkdir downloads
cp /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/README.txt downloads
# edit
cd /usr/local/apache/htdocs/goldenPath/hg18/
mkdir phyloP44way
cd phyloP44way
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/downloads/README.txt .
mkdir vertebrate
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/all/downloads/{*.gz,md5sum.txt} vertebrate
mkdir placentalMammals
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/placental/downloads/{*.gz,md5sum.txt} placentalMammals
mkdir primates
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate/downloads/{*.gz,md5sum.txt} primates
# Lineage-specific runs
# uses --subtree option of phyloP
# name ancestor nodes
cd /cluster/data/hg18/bed/multiz44way/4d
set PHASTBIN = /cluster/bin/phast.2008-12-18
$PHASTBIN/tree_doctor 44way.all.mod --name-ancestors >44way.all-ancestors.mod
cd ../consPhyloP/run.phyloP
# built new PHAST package with fix from Adam for --subtree problems
sed -e 's/phyloP/phyloP --subtree=$3/' -e 's/phast.2008-12-18/phast.2009-01-26/' doPhyloP.csh > doPhyloPSubtree.csh
# visually inspect shell script
cat > template.subtree << 'EOF'
#LOOP
csh ../doPhyloPSubtree.csh $(file1) {check out line+ wig/$(dir1)/$(file1).wig} SUBTREE
#ENDLOOP
'EOF'
# primate lineage-specific
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
mkdir primate-ls
cd primate-ls
cp ../../../4d/44way.all-ancestors.mod tree.mod
mkdir wig
sed 's/SUBTREE/hg18-micMur1/' ../template.subtree > template.ls
gensub2 ../in.list single template.ls jobList
para create jobList
# 2823 jobs
para try
para check
para push
para time
#CPU time in finished jobs: 4949300s 82488.33m 1374.81h 57.28d 0.157 y
#IO & Wait Time: 143956s 2399.27m 39.99h 1.67d 0.005 y
#Average job time: 1805s 30.08m 0.50h 0.02d
#Longest finished job: 2780s 46.33m 0.77h 0.03d
#Submission to last job: 6447s 107.45m 1.79h 0.07d
# process results and load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate-ls/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayPrimateLs.wig phyloP44wayPrimateLs.wib
#Converted stdin, upper limit 3.91, lower limit -9.28
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/primate-ls/wig/phyloP44wayPrimateLs.wib /gbdb/hg18/multiz44way/phyloP44wayPrimateLs.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimateLs phyloP44wayPrimateLs.wig
# glire lineage-specific
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP
mkdir glire-ls
cd glire-ls
cp ../../../4d/44way.all-ancestors.mod tree.mod
mkdir wig
sed 's/SUBTREE/mm9-oryCun1/' ../template.subtree > template.ls
gensub2 ../in.list single template.ls jobList
para create jobList
# 2823 jobs
para try
para check
para push
para time
#CPU time in finished jobs: 5173192s 86219.87m 1437.00h 59.87d 0.164 y
#IO & Wait Time: 145615s 2426.91m 40.45h 1.69d 0.005 y
#Average job time: 1884s 31.40m 0.52h 0.02d
#Longest finished job: 2721s 45.35m 0.76h 0.03d
#Submission to last job: 6883s 114.72m 1.91h 0.08d
# process results and load wiggle
ssh hgwdev
cd /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/glire-ls/wig
csh ../../listWig.csh | xargs cat | nice wigEncode stdin phyloP44wayGlireLs.wig phyloP44wayGlireLs.wib
#Converted stdin, upper limit 5.95, lower limit -6.99
ln -s /cluster/data/hg18/bed/multiz44way/consPhyloP/run.phyloP/glire-ls/wig/phyloP44wayGlireLs.wib /gbdb/hg18/multiz44way/phyloP44wayGlireLs.wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayGlireLs phyloP44wayGlireLs.wig
#########################################################################
# Update phyloP44way tables from Adam Siepel, Melissa Hubisz at Cornell
# This version uses a different neutral tree model for chrX
# and will replace the original version as default view on the Conservation track
# ( 2009-06-30 kate)
mkdir /hive/data/genomes/hg18/bed/multiz44way/chrX.phyloP
cd /hive/data/genomes/hg18/bed/multiz44way/chrX.phyloP
mkdir primates
cd primates
wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/primates/\*
cd ..
mkdir placental
cd placental
wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/placental/\*
cd ..
mkdir all
cd all
wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/all/\*
cd ..
zcat all/*.wigFix.gz | wigEncode stdin phyloP44way_v2.wig phyloP44way_v2.wib
zcat primates/*.wigFix.gz | wigEncode stdin phyloP44wayPrimates_v2.wig phyloP44wayPrimates_v2.wib
zcat placental/*.wigFix.gz | wigEncode stdin phyloP44wayPlacental_v2.wig phyloP44wayPlacental_v2.wib
ln -s `pwd`/*.wib /gbdb/hg18/multiz44way
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44way_v2 phyloP44way_v2.wig
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimates_v2 phyloP44wayPrimates_v2.wig
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPlacental_v2 phyloP44wayPlacental_v2.wig
# Lineage specific phyloP
# These updated tables will appear in the Lineage Cons track
mkdir glires-ls
cd glires-ls
wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/glires-ls/\*
cd ..
mkdir primates-ls
cd primates-ls
wget --timestamping ftp://siepellab:siepellab@ftp.biotech.cornell.edu/2x/phyloP/44way/primates-ls/\*
cd ..
zcat glires-ls/*.wigFix.gz | wigEncode stdin phyloP44wayGliresLs_v2.wig phyloP44wayGliresLs_v2.wib
zcat primates-ls/*.wigFix.gz | wigEncode stdin phyloP44wayPrimatesLs_v2.wig phyloP44wayPrimatesLs_v2.wib
ln -s `pwd`/phyloP44wayGliresLs_v2.wib /gbdb/hg18/multiz44way
nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayGliresLs_v2 phyloP44wayGliresLs_v2.wig
ln -s `pwd`/phyloP44wayPrimatesLs_v2.wib /gbdb/hg18/multiz44way
nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg18/multiz44way hg18 phyloP44wayPrimatesLs_v2 phyloP44wayPrimatesLs_v2.wig
######################################################################
# downloads for 44-way (DONE - 2009-01-09 - Hiram)
mkdir -p /hive/data/genomes/hg18/bed/multiz44way/downloads/maf
cd /hive/data/genomes/hg18/bed/multiz44way/downloads/maf
# bash script
#!/bin/sh
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
featureBits hg18 refGene:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| /cluster/bin/$MACHTYPE/mafFrags hg18 multiz44way \
stdin stdout \
-orgs=/hive/data/genomes/hg18/bed/multiz44way/species.list \
| gzip -c > upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
cd /usr/local/apache/htdocs/goldenPath/hg18/multiz44way/maf
ln -s /hive/data/genomes/hg18/bed/multiz44way/downloads/maf/up*.gz .
md5sum up*.gz >> md5sum.txt
mkdir /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way
cd /usr/local/apache/htdocs/goldenPath/hg18/phastCons44way
mkdir placentalMammals primates vertebrate
cd vertebrate
ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/all/downloads/* .
cd ../placentalMammals
ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/placental/downloads/* .
cd ../primates
ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/primates/downloads/* .
cd ..
ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/all/all.mod \
vertebrate.mod
ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/primates/primates.mod .
ln -s /hive/data/genomes/hg18/bed/multiz44way/cons/placental/placental.mod \
./placentalMammals.mod
ln -s \
/hive/data/genomes/hg18/bed/multiz44way/downloads/phastCons44way/README.txt .
# pushQ MySQL tables:
phastCons44way, phastCons44wayPlacental, phastCons44wayPrimates,
multiz44way, multiz44wayFrames, multiz44waySummary,
phastConsElements44way, phastConsElements44wayPlacental,
phastConsElements44wayPrimates, phyloP44wayAll, phyloP44wayPlacMammal,
phyloP44wayPrimate
# pushQ files:
/gbdb/hg18/multiz44way/maf/*
/gbdb/hg18/multiz44way/phastCons44way.wib
/gbdb/hg18/multiz44way/phastCons44wayPlacental.wib
/gbdb/hg18/multiz44way/phastCons44wayPrimates.wib
/gbdb/hg18/multiz44way/phyloP44wayAll.wib
/gbdb/hg18/multiz44way/phyloP44wayPlacMammal.wib
/gbdb/hg18/multiz44way/phyloP44wayPrimate.wib
/usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/vertebrate/*
/usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/primates/*
/usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/placentalMammals/*
/usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/*.mod
/usr/local/apache/htdocs/goldenPath/hg18/phastCons44way/README.txt
/usr/local/apache/htdocs/goldenPath/hg18/multiz44way/maf/*
/usr/local/apache/htdocs/goldenPath/hg18/multiz44way/alignments/
/usr/local/apache/htdocs/goldenPath/hg18/multiz44way/*.nh
/usr/local/apache/htdocs/goldenPath/hg18/multiz44way/README.txt
/usr/local/apache/htdocs/goldenPath/hg18/phyloP44way/vertebrate/*
/usr/local/apache/htdocs/goldenPath/hg18/phyloP44way/placentalMammals/*
/usr/local/apache/htdocs/goldenPath/hg18/phyloP44way/primate/*
# MySQL tables: 5,624,932,756 = 5,364 Mb
# gbdb files: 271,318,361,985 = 258,749 Mb
# apache htdocs: 58,767,852,372 = 56,045 Mb
# Total 335,711,147,113 = 320,159 Mb
# An extra set of error-corrected MAFs from the Siepel lab:
mkdir /hive/data/genomes/hg18/bed/multiz44way/errorCorrectedMafs
cd /hive/data/genomes/hg18/bed/multiz44way/errorCorrectedMafs
wget --timestamping \
"ftp://siepellab:XXXXXX@ftp.biotech.cornell.edu/2x/maf-ec/*"
# not showing the password here on purpose
# verify md5sums:
md5sum *.maf.gz > md5sum.here
diff md5sum.txt md5sum.here
# no difference
rm md5sum.here
mkdir \
/usr/local/apache/htdocs/goldenPath/hg18/multiz44way/SiepelLabCorrectedMafs
cd \
/usr/local/apache/htdocs/goldenPath/hg18/multiz44way/SiepelLabCorrectedMafs
ln -s /hive/data/genomes/hg18/bed/multiz44way/errorCorrectedMafs/* .
#########################################################################
# Create Syntenic and Recip Best net files to load into tracks to view
# on the browser to see what was used during the multiple alignment
cd /hive/data/genomes/hg18/bed/blastz.gorGor1/axtChain
netClass -verbose=0 -noAr hg18.gorGor1.rbest.net.gz hg18 gorGor1 stdout \
| gzip -c > netRBestGorGor1.net.gz
hgLoadNet hg18 netRBestGorGor1 netRBestGorGor1.net.gz
cd /hive/data/genomes/hg18/bed/blastz.ponAbe2/axtChain
hgLoadNet hg18 netSyntenyPonAbe2 hg18.ponAbe2.syn.net.gz
cd /hive/data/genomes/hg18/bed/blastz.calJac1/axtChain
netClass -verbose=0 -noAr hg18.calJac1.rbest.net.gz hg18 calJac1 stdout \
| gzip -c > netRBestCalJac1.net.gz
hgLoadNet hg18 netRBestCalJac1 netRBestCalJac1.net.gz
cd /hive/data/genomes/hg18/bed/blastz.tarSyr1/axtChain
netClass -verbose=0 -noAr hg18.tarSyr1.rbest.net.gz hg18 tarSyr1 stdout \
| gzip -c > netRBestTarSyr1.net.gz
hgLoadNet hg18 netRBestTarSyr1 netRBestTarSyr1.net.gz
#########################################################################
# EIO/JCVI NAS TRACK (2008-11-25 Fan)
# Contact: Gaetano Gargiulo [gaetano.gargiulo@ifom-ieo-campus.it]
cd /hive/data/genomes/hg18/bed
mkdir eioJcviNAS
cd eioJcviNAS
# receive the doc and two bed files and put them there.
fgrep -v description HG18_NAS_CD34_neg.bed| \
cut -f 1-3 |hgLoadBed -noBin hg18 eioJcviNASNeg stdin
checkTableCoords -table=eioJcviNASNeg hg18
fgrep -v description HG18_NAS_CD34_pos.bed| \
cut -f 1-3 |hgLoadBed -noBin hg18 eioJcviNASPos stdin
checkTableCoords -table=eioJcviNASPos hg18
# Create the description file, eioJcviNAS.html, according to
# the latest doc file from Gaetano.
#
# Add the two composite sub-tracks to human/hg18/trackDb.ra.
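# For reference, a rough sketch of the composite with its two subtracks
# (labels and settings below are illustrative assumptions, not the exact
# trackDb.ra entry that was committed):
#   track eioJcviNAS
#   compositeTrack on
#   shortLabel EIO/JCVI NAS
#   longLabel EIO/JCVI NAS in CD34+ and CD34- cells
#   type bed 3
#       track eioJcviNASNeg
#       subTrack eioJcviNAS
#       shortLabel NAS CD34-
#       track eioJcviNASPos
#       subTrack eioJcviNAS
#       shortLabel NAS CD34+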
#########################################################################
# hgPal downloads (DONE braney 2008-12-07)
# FASTA from 44way for refGene, knownGene, knownCanonical
ssh hgwdev
screen
bash
rm -rf /cluster/data/hg18/bed/multiz44way/pal
mkdir /cluster/data/hg18/bed/multiz44way/pal
cd /cluster/data/hg18/bed/multiz44way/pal
echo hg18 | cat - /cluster/data/hg18/bed/multiz44way/ordered.list > order.lst
mz=multiz44way
gp=refGene
db=hg18
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.jobs
time sh -x $gp.jobs > $gp.jobs.log 2>&1 &
sleep 1
tail -f $gp.jobs.log
# real 525m57.376s
# user 25m36.072s
# sys 7m41.565s
ssh kolossus
mz=multiz44way
gp=refGene
db=hg18
zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
# we're only distributing exons at the moment
mz=multiz44way
gp=refGene
db=hg18
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
mkdir -p $pd
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
mz=multiz44way
gp=knownGene
db=hg18
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.$mz.jobs
time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
sleep 1
tail -f $gp.$mz.job.log
# real 442m46.735s
# user 43m3.060s
# sys 10m45.635s
mz=multiz44way
gp=knownGene
db=hg18
zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
mz=multiz44way
gp=knownGene
db=hg18
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
mkdir -p $pd
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
# now do the canonical set
cd /cluster/data/hg18/bed/multiz44way/pal
mz=multiz44way
gp=knownCanonical
db=hg18
for j in `awk '{print $1}' /cluster/data/hg18/chrom.sizes`
do
echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed
done
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -geneBeds=$j.known.bed $db $mz knownGene order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.$mz.jobs
time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
sleep 1
tail -f $gp.$mz.job.log
# real 326m12.849s
# user 17m40.850s
# sys 3m59.648s
rm *.known.bed
mz=multiz44way
gp=knownCanonical
db=hg18
zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
mz=multiz44way
gp=knownCanonical
db=hg18
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments
mkdir -p $pd
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
#########################################################################
# BUILD OMIM RELATED GENES TRACK (complete rebuild, 2/24/09 Fan)
ssh hgwdev
cd /hive/data/genomes/gs.19/build36/bed
mkdir omimGene
cd omimGene
# download the file morbidmap and genemap from OMIM
mkdir omim
cd omim
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/morbidmap
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/OMIM/genemap
cat genemap|sed -e 's/|/\t/g' > genemap.tab
autoSql ~/src/hg/lib/omimGeneMap.as x
cat x.sql |sed -e 's/PRIMARY KEY(numbering)/KEY(omimId)/' >omimGeneMap.sql
hgLoadSqlTab -warn hg18 omimGeneMap omimGeneMap.sql genemap.tab
# got warning on 3 records, just ignore them
# Warning: load of omimGeneMap did not go as planned: 11750 record(s), 0 row(s) skipped, 3 warning(s) loading genemap.tab
rm x.c x.h
cd ..
cat omim/morbidmap|sed -e 's/|/\t/g' > morbidmap.tab
autoSql ~/src/hg/lib/omimMorbidMap.as x
cat x.sql |sed -e 's/PRIMARY KEY(description)/KEY(omimId)/' >omimMorbidMap.sql
hgLoadSqlTab -warn hg18 omimMorbidMap omimMorbidMap.sql morbidmap.tab
# get all UCSC genes (from the knownGene table) that cross-reference to a RefSeq gene
# that has a non-empty OMIM ID according to the refLink table. And use OMIM ID as
# the gene name for this new table. Please note the alignId field still holds the KG ID.
hgsql hg18 -N -e \
'select omimId, kg.* from knownGene kg, knownToRefSeq kr, refLink l where omimId != 0 and mrnaAcc=kr.value and kg.name=kr.name ' \
|cut -f 1,3-13 >o1.tab
# collect more OMIM related genes via the MIM external DB links from UniProt
hgsql hg18 -N -e \
'select extAC, kg.* from knownGene kg, kgXref k, proteome.spXref2 p where spId=p.accession and extDB="MIM" and kg.name=kgId ' \
|cut -f 1,3-13 >o2.tab
# concatenate the above two gene sets and remove duplications.
cat o1.tab o2.tab |sort -u >o3.tab
# load the result into a temp table, fanO3
hgLoadSqlTab hg18 fanO3 ~/src/hg/lib/knownGene.sql o3.tab
# while holding onto the OMIM ID, get the canonical gene (via the knownGene,
# knownIsoforms, and knownCanonical tables) that represents the cluster
# containing the initial OMIM gene in the fanO3 table
hgsql hg18 -N -e \
'select f3.name, kg.* from fanO3 f3, knownGene kg, knownCanonical c, knownIsoforms i where f3.alignId=i.transcript and kg.name=c.transcript and c.clusterId=i.clusterId'\
> o4.tab
# first column is the OMIM ID
cut -f 1 o4.tab >j1.tmp
# col 3-13 is the gene structure of the canonical KG
cut -f 3-13 o4.tab >j2.tmp
# stitch them together and remove duplicates, load the result into fanO4 table
paste j1.tmp j2.tmp |sort -u >fanO4.tab
hgLoadSqlTab hg18 fanO4 ~/src/hg/lib/knownGene.sql fanO4.tab
# finally sort the table and create bed 4 file and load it as the omimGene table
hgsql hg18 -N -e 'select chrom, txStart, txEnd, name from fanO4 order by chrom, txStart, txEnd' |sort -u >omimGene.bed
hgLoadBed hg18 omimGene omimGene.bed
# create and load the omimToKnownCanonical table.
hgsql hg18 -N -e 'select name, alignId from fanO4 order by name'\
> omimToKnownCanonical.tab
hgLoadSqlTab hg18 omimToKnownCanonical \
~/src/hg/lib/omimToKnownCanonical.sql omimToKnownCanonical.tab
# The following clean up could be done.
# hgsql hg18 -e 'drop table fanO3'
# hgsql hg18 -e 'drop table fanO4'
# rm j*.tmp
# rm o1.tab o2.tab o3.tab o4.tab
# update one omimGene record to reflect a correction UniProt is
# going to make on their MIM external link (per 12/15/08 emails from Bob and
# Livia (apache@vital-it.ch) of ExPASy).
hgsql hg18 -e 'update omimGene set name="611016" where name="608636"'
hgsql hg18 -e 'update omimToKnownCanonical set omimId="611016" where omimId="608636"'
#############################################################################
# fox2ClipSeq from Gene Yeo (DONE - 2009-01-08 - Hiram)
mkdir /hive/data/genomes/hg18/bed/fox2ClipSeq
cd /hive/data/genomes/hg18/bed/fox2ClipSeq
# lift the hg17 data to here
liftOver -bedPlus=9 \
/hive/data/genomes/hg17/bed/fox2ClipSeq/forwardStrand.bed.gz \
/usr/local/apache/htdocs/goldenPath/hg17/liftOver/hg17ToHg18.over.chain.gz \
stdout forwardStrand.unMapped | gzip -c > forwardStrand.bed.gz
liftOver -bedPlus=9 \
/hive/data/genomes/hg17/bed/fox2ClipSeq/reverseStrand.bed.gz \
/usr/local/apache/htdocs/goldenPath/hg17/liftOver/hg17ToHg18.over.chain.gz \
stdout reverseStrand.unMapped | gzip -c > reverseStrand.bed.gz
# turn into wiggle density plot
zcat forwardStrand.bed.gz | bedItemOverlapCount hg18 stdin \
| wigEncode stdin fox2ClipSeqDensityForwardStrand.wig \
fox2ClipSeqDensityForwardStrand.wib
# Converted stdin, upper limit 2401.00, lower limit 1.00
zcat reverseStrand.bed.gz | bedItemOverlapCount hg18 stdin \
| wigEncode stdin fox2ClipSeqDensityReverseStrand.wig \
fox2ClipSeqDensityReverseStrand.wib
# Converted stdin, upper limit 1406.00, lower limit 1.00
# and load tables
zcat forwardStrand.bed.gz reverseStrand.bed.gz \
| hgLoadBed hg18 fox2ClipSeq stdin
# Loaded 4418298 elements of size 9
ln -s `pwd`/*.wib /gbdb/hg18/wib
hgLoadWiggle hg18 fox2ClipSeqDensityForwardStrand \
fox2ClipSeqDensityForwardStrand.wig
hgLoadWiggle hg18 fox2ClipSeqDensityReverseStrand \
fox2ClipSeqDensityReverseStrand.wig
# add composite track definitions to makeDb/trackDb/human/trackDb.ra
#############################################################################
# REPEATMASKER - LATEST VERSION, 3.2.7 (DONE 1/30/09 rhubley and angie)
# Robert Hubley ran the new and improved version (3.2.7) of RepeatMasker
# but politely deferred to staff to load the results:
mkdir /hive/data/genomes/hg18/bed/RMRunRMH
cd /hive/data/genomes/hg18/bed/RMRunRMH
doRepeatMasker.pl -stop mask -buildDir `pwd` hg18
# see do.log, cat.log
# Angie loaded with new table name, chr*_rmskRM327. Used -debug to
# make scripts, edited those.
cd /hive/data/genomes/hg18/bed/RMRunRMH
doRepeatMasker.pl -debug \
-continue install -buildDir `pwd` hg18
# Edit doLoad.csh: change table names: rmsk -> rmskRM327,
# nestedRepeats -> nestedRepeatsRM327
./doLoad.csh >& load.log & tail -f load.log
# Edit doSplit.csh: change -ending to .RM327.fa.out
./doSplit.csh >& split.log & tail -f split.log
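# (The two hand edits above could also have been scripted; a sketch of
#  equivalent sed commands, assuming the stock scripts from the -debug run:)
#   sed -i -e 's/\brmsk\b/rmskRM327/g' \
#          -e 's/\bnestedRepeats\b/nestedRepeatsRM327/g' doLoad.csh
#   sed -i -e 's/-ending=[^ ]*/-ending=.RM327.fa.out/' doSplit.csh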
doRepeatMasker.pl -continue cleanup -buildDir `pwd` \
-fileServer hgwdev hg18 >& cleanup.log & tail -f cleanup.log
# Compare coverage to original RepeatMasker run:
featureBits hg18 rmskRM327
#1457032101 bases of 2881515245 (50.565%) in intersection
featureBits hg18 rmsk
#1406290513 bases of 2881515245 (48.804%) in intersection
# Wow, Arian got his 50%! :)
# Compare Alu counts, since that is supposed to be an area of improvement:
grep SINE/Alu hg18.fa.out | wc -l
#1186885
ls /hive/data/genomes/hg18/?{,?}{,_*_hap[12]}/chr[0-9XYM]{,[0-9]}{,_random,*_hap[12]}.fa.out \
| uniq | xargs grep SINE/Alu | wc -l
#1189976
# A decrease... weird. OK, breaking it down chrom-by-chrom, the _random's
# have fewer and the regular chrom's have more Alu's. Sounds OK to me :)
featureBits hg18 rmsk \!rmskRM327
#12318974 bases of 2881515245 (0.428%) in intersection
featureBits hg18 rmskRM327 \!rmsk
#63060562 bases of 2881515245 (2.188%) in intersection
# hgTables: 49,804 rmskRM327 items (4,805,535 bases) have no overlap with rmsk
# Added download file 2/5/09:
cd /hive/data/genomes/hg18
zip -j bigZips/chromOut.RM3.2.7.zip */chr*.RM327.fa.out
ln -s /hive/data/genomes/hg18/bigZips/chromOut.RM3.2.7.zip \
/usr/local/apache/htdocs/goldenPath/hg18/bigZips/
#############################################################################
# GENOME VARIANTS - 1000 GENOMES (DONE 1/7/2009 giardine, adapted from an email to angie)
# December release from 1000 Genomes: SNP calls on four of the 6 high-cov
# individuals: a CEU trio and a YRI daughter.
# see ftp://ftp-trace.ncbi.nih.gov/1000genomes/release/2008_12/README_December2008_release
cd /hive/data/genomes/hg18/bed/pgSnp/
cat > trio2pg.pl <<'EOF'
#!/usr/bin/perl -w
use strict;
#split out individual SNPs from trio file
#format: chr loc ref alleles snp.Q av.max.map.Q depth.cov NA12891 NA12891.Q
#        NA12892 NA12892.Q NA12878 NA12878.Q hwe maf tdt display
my $ac = shift @ARGV; #allele column, zero based
if (!$ac) {
print "Usage: trio2pg.pl alleleColumn# < infile > outfile\n";
exit;
}
while (<>) {
chomp;
my @f = split(/\t/);
if ($f[0] eq 'chr') { next; }
$f[$ac] =~ s/([ATGC])\/\1/$1/;
if ($f[$ac] eq uc($f[2])) { next; } #reference allele only
print "chr$f[0]\t", ($f[1]-1), "\t$f[1]\t$f[$ac]\t";
my $c = ($f[$ac] =~ tr/\//\//) + 1;
my $s = $f[$ac+1];
if ($s !~ /\//) {
for (my $i = 1; $c > $i; $i++) { $s .= ",$f[$ac+1]"; }
}else {
$s =~ s/\//,/g;
if ($c == 1) { $s =~ s/,.*//; }
}
my $n = "0";
for (my $i = 1; $c > $i; $i++) { $n .= ",0"; } #allele count
print "$c\t$n\t$s\n";
}
exit;
'EOF'
# << emacs
chmod a+x trio2pg.pl
#convert to pgSnp
set relDir = /hive/data/outside/1000genomes/ncbi/ftp-trace.ncbi.nih.gov/1000genomes/release/2008_12/
zcat $relDir/CEU.trio.dec.with.x.with.rs.calls.gz | ./trio2pg.pl 7 > NA12891.pgSnp
zcat $relDir/CEU.trio.dec.with.x.with.rs.calls.gz | ./trio2pg.pl 9 > NA12892.pgSnp
zcat $relDir/CEU.trio.dec.with.x.with.rs.calls.gz | ./trio2pg.pl 11 > NA12878.pgSnp
zcat $relDir/YRI.child.dec.intersect.calls.gz | ./trio2pg.pl 7 > NA19240.pgSnp
#gff for indels does not give nts, can't put in pgSnp format
hgLoadBed hg18 pgNA12878 NA12878.pgSnp \
-sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
hgLoadBed hg18 pgNA12891 NA12891.pgSnp \
-sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
hgLoadBed hg18 pgNA12892 NA12892.pgSnp \
-sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
hgLoadBed hg18 pgNA19240 NA19240.pgSnp \
-sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
#############################################################################
# GENOME VARIANTS - (DONE 1/7/09 giardine, adapted by angie from pgSnp/README)
# File pgVenter.bed placed in /hive/data/genomes/hg18/bed/pgSnp/ by
# Belinda.
cd /hive/data/genomes/hg18/bed/pgSnp/
grep "^chr" pgVenter.bed | sort -k1,1 -k2,2n \
| hgLoadBed hg18 pgVenter stdin \
-noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
# 3/11/09: fetching this file because I think it's the original data (angie)
wget ftp://ftp.jcvi.org/pub/data/huref/HuRef.InternalHuRef-NCBI.gff
#############################################################################
# GENOME VARIANTS - YRI NA18507 (DONE 1/9/09 giardine, adapted by angie from pgSnp/README)
# SNP calls made by Aakrosh Ratan at PSU.
# Files pgYri{2,3}.txt placed in /hive/data/genomes/hg18/bed/pgSnp/ by
# Belinda.
# yoruban snp calls (using solid software instead of maq)
# Loaded 11/4/08 according to hg18.history, but table status says created
# 1/7/09:
cd /hive/data/genomes/hg18/bed/pgSnp/
grep "^chr" pgYri2.txt | sort -k1,1 -k2,2n \
| hgLoadBed hg18 pgYoruban2 stdin \
-noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
#Another yoruban SNP set, same individual, Solexa reads, includes indels
# Loaded 11/7/08 according to hg18.history, but table status says created
# 1/7/09:
grep "^chr" pgYri3.txt | sort -k1,1 -k2,2n \
| hgLoadBed hg18 pgYoruban3 stdin \
-noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
#############################################################################
# GENOME VARIANTS - YH (DONE 2/24/09 giardine, adapted by angie from pgSnp/README)
#Asian individual (YH1) from Nature paper
#http://yh.genomics.org.cn/index.jsp
# File pgSnpYh.txt placed in /hive/data/genomes/hg18/bed/pgSnp/ by
# Belinda.
cd /hive/data/genomes/hg18/bed/pgSnp/
grep "^chr" pgSnpYh.txt | sort -k1,1 -k2,2n \
| hgLoadBed hg18 pgYh1 stdin \
-noSort -sqlTable=$HOME/kent/src/hg/lib/pgSnp.sql -renameSqlTable -tab
# 3/11/09: fetching this file because I think it's the original data (angie)
wget -O "yhsnp_add.gff" \
'http://yh.genomics.org.cn/do.downServlet?file=data/snps/yhsnp_add.gff'
#############################################################################
# Initial import of LSSNP data for SNP and hgGene linking (2009-02-02 markd)
#############################################################################
# dump and load LSSNP databases from Johns Hopkins. This will be automated
# soon.
# download dump into tmp directory LSSNP; must load on bugle as the
# database is mysql 5
ssh bugle
hgsql -e 'create database LSSNP'
cat LSSNP/*.sql |hgsql LSSNP
hgsqlimport LSSNP `pwd`/LSSNP/*.txt
ssh hgwdev
hgLsSnpPdbLoad fetch bugle:LSSNP lsSnpPdb.tab
hgLsSnpPdbLoad load hg18 lsSnpPdb lsSnpPdb.tab
#############################################################################
#############################################################################
# HGDP GEOGRAPHIC SNP MAPS (DONE 2/5/09 angie)
# Project data downloaded and parsed in /hive/data/outside/hgdpGeo,
# see makeDb/doc/hgdpGeo.txt.
mkdir /hive/data/genomes/hg18/bed/hgdpGeo
cd /hive/data/genomes/hg18/bed/hgdpGeo
# Make an rsId-sorted snp coords file for joining with the hgdpGeo data.
grep -Fwf /hive/data/outside/hgdpGeo/rsIDs.lst \
../snp129/snp129.bed \
| awk 'BEGIN{OFS="\t";} {print $4, $1, $2, $3;}' \
| sort > snp129Coords.txt
wc -l snp129Coords.txt
#660280 snp129Coords.txt
# How many distinct SNPs in there? (compare to 657000 from HGDP):
cut -f 1 snp129Coords.txt |uniq | wc -l
#656496
# Join files to make a track table:
join -e ERROR -t' ' -o 1.2,1.3,1.4,1.1,2.2,2.3,2.4 \
snp129Coords.txt /hive/data/outside/hgdpGeo/hgdpGeoCoordless.txt \
| sed -re 's/([AGTC])\*/\1/' \
| sort -k1,1 -k2n,2n \
> hgdpGeo.tab
wc -l hgdpGeo.tab
#660280 hgdpGeo.tab
grep ERROR hgdpGeo.tab | wc -l
#0
hgLoadBed hg18 hgdpGeo hgdpGeo.tab \
-sqlTable=$HOME/kent/src/hg/lib/hgdpGeo.sql
#Loaded 660280 elements of size 7
#############################################################################
# HGDP HETEROZYGOSITY (DONE 2/12/09 angie, except for Bantu 3/12/09)
mkdir /hive/data/genomes/hg18/bed/hgdpHzy
cd /hive/data/genomes/hg18/bed/hgdpHzy
foreach continent (african americas easia european mideast oceania sasia)
wget --timestamping http://hgdp.uchicago.edu/data/hzy/$continent.gff.gz
end
wget --timestamping http://hgdp.uchicago.edu/data/hzy/allbantu.hzy.gff.gz
foreach continent (african allbantu americas easia european mideast oceania sasia)
set bedGraph = `echo $continent \
| sed -re 's/can$/ca/; s/pean$/pe/; s/asia/Asia/; s/allbantu/bantu/; \
s/(.*)/hgdpHzy\u\1.bedGraph/'`
echo $bedGraph
zcat $continent.gff.gz \
| awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \
> $bedGraph
end
# 3/12/09: All of the original files' coords were intervals between SNPs,
# but the Bantu file had SNP coordinates, and one more line per chrom than
# the others. So (after getting OK from Joe) I am going to transform the
# Bantu SNP coords to intervals like the others.
perl -we 'while (<>) { \
chomp; ($c, $s, undef, $h) = split; \
if (defined $lastC) { \
if ($lastC eq $c) { \
print "$c\t$lastS\t$s\t$lastH\n"; \
} # Discarding last SNP on each chrom \
} \
($lastC, $lastS, $lastH) = ($c, $s, $h); \
}' \
hgdpHzyBantu.bedGraph > tmp
mv tmp hgdpHzyBantu.bedGraph
# Using bedGraph, not wig, because there are only 640k datapoints and
# some are over the 10Mbase wiggle item size limit.
foreach f (*.bedGraph)
hgLoadBed hg18 $f:r $f -bedGraph=4
end
# All have same size:
#Loaded 640676 elements of size 4
#############################################################################
# HGDP FST (DONE 2/12/09 angie)
mkdir /hive/data/genomes/hg18/bed/hgdpFst
cd /hive/data/genomes/hg18/bed/hgdpFst
wget --timestamping \
http://hgdp.uchicago.edu/data/FST/autosomal_illuminasnps7_pval.gff.gz
zcat autosomal_illuminasnps7_pval.gff.gz \
| awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \
> hgdpFst.bedGraph
hgLoadBed hg18 hgdpFst hgdpFst.bedGraph -bedGraph=4
#Loaded 640676 elements of size 4
#############################################################################
# HGDP IHS (DONE 2/13/09 angie)
mkdir /hive/data/genomes/hg18/bed/hgdpIhs
cd /hive/data/genomes/hg18/bed/hgdpIhs
foreach continent (Bantu Americas E.Asia European MiddleEast Oceania S.Asian)
wget --timestamping \
http://hgdp.uchicago.edu/data/iHS/smoothed$continent.iHS.gff.gz
set bedGraph = `echo $continent \
| sed -re 's/pean$/pe/; s/\.Asian?/Asia/; \
s/MiddleEast/Mideast/; s/(.*)/hgdpIhs\1.bedGraph/'`
echo $bedGraph
zcat smoothed$continent.iHS.gff.gz \
| sed -e 's/^chr23/chrX/' \
| awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \
> $bedGraph
end
foreach f (*.bedGraph)
hgLoadBed hg18 $f:r $f -bedGraph=4
end
#Reading hgdpIhsBantu.bedGraph
#Loaded 540438 elements of size 4
#Reading hgdpIhsAmericas.bedGraph
#Loaded 422167 elements of size 4
#Reading hgdpIhsEAsia.bedGraph
#Loaded 487801 elements of size 4
#Reading hgdpIhsEurope.bedGraph
#Loaded 543875 elements of size 4
#Reading hgdpIhsMideast.bedGraph
#Loaded 552277 elements of size 4
#Reading hgdpIhsOceania.bedGraph
#Loaded 425340 elements of size 4
#Reading hgdpIhsSAsia.bedGraph
#Loaded 550231 elements of size 4
#############################################################################
# HGDP XP-EHH (DONE 2/12/09 angie)
mkdir /hive/data/genomes/hg18/bed/hgdpXpehh
cd /hive/data/genomes/hg18/bed/hgdpXpehh
foreach continent (Bantu Americas E.Asia Europe Mideast Oceania S.Asia)
wget --timestamping \
http://hgdp.uchicago.edu/data/XPEHH/$continent.xpehh.forbrowser.gff.gz
set bedGraph = `echo $continent \
| sed -re 's/\.Asia?/Asia/; s/(.*)/hgdpXpehh\1.bedGraph/'`
echo $bedGraph
zcat $continent.xpehh.forbrowser.gff.gz \
| awk 'BEGIN{OFS="\t";} {print $1, ($4-1), $5, $6;}' \
> $bedGraph
end
foreach f (*.bedGraph)
hgLoadBed hg18 $f:r $f -bedGraph=4
end
#Reading hgdpXpehhBantu.bedGraph
#Loaded 636680 elements of size 4
#Reading hgdpXpehhAmericas.bedGraph
#Loaded 636143 elements of size 4
#Reading hgdpXpehhEAsia.bedGraph
#Loaded 635799 elements of size 4
#Reading hgdpXpehhEurope.bedGraph
#Loaded 636680 elements of size 4
#Reading hgdpXpehhMideast.bedGraph
#Loaded 636849 elements of size 4
#Reading hgdpXpehhOceania.bedGraph
#Loaded 637418 elements of size 4
#Reading hgdpXpehhSAsia.bedGraph
#Loaded 636773 elements of size 4
#############################################################################
# LIFTOVER TO Hg19 (DONE - 2009-03-06 - Hiram )
mkdir /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06
cd /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06
# -debug run to create run dir, preview scripts...
doSameSpeciesLiftOver.pl -debug hg18 hg19
# Real run:
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
-bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
hg18 hg19 > do.log 2>&1
# real 85m8.064s
#############################################################################
# HAPMAP REL22 RECOMBINATION RATES (PHASE II) (DONE 2/24/09 angie)
mkdir -p /hive/data/outside/hapmap/recombination/2008-03_rel22_B36/rates
cd /hive/data/outside/hapmap/recombination/2008-03_rel22_B36/
wget --timestamping \
ftp://ftp.hapmap.org/pub/hapmap/public/recombination/2008-03_rel22_B36/00README.txt
cd rates
wget --timestamping \
ftp://ftp.hapmap.org/pub/hapmap/public/recombination/2008-03_rel22_B36/rates/\*
# Make bedGraph-formatted files.
mkdir -p /hive/data/genomes/hg18/bed/hapmap/recombination/2008-03_rel22_B36
cd /hive/data/genomes/hg18/bed/hapmap/recombination/2008-03_rel22_B36
cp /dev/null hapmapRecombRate.bedGraph
foreach f (/hive/data/outside/hapmap/recombination/2008-03_rel22_B36/rates/*.txt)
set chr = `echo $f:t:r | sed -e 's/^.*chr/chr/; s/_b36.*//;'`
echo $f $chr
perl -wpe 's/^position .*\n// && next; \
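# input columns: position(bp) rate ...; emit an interval ending at each \
# position, faking a 100bp first interval since there is no prior position \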
m/^(\d+) (\d+\.?\d*) .*/ || die $_; $end=$1; $rate=$2; \
$start=$end-100 unless (defined $start); \
$_ = "'$chr'\t$start\t$end\t$rate\n"; $start = $end;' \
$f >> hapmapRecombRate.bedGraph
end
# Some items are over the 10Mbase wiggle item size limit, so use bedGraph.
time hgLoadBed hg18 hapmapRecombRate hapmapRecombRate.bedGraph -bedGraph=4
#Loaded 3281323 elements of size 4
#14.688u 1.796s 0:31.99 51.4% 0+0k 0+0io 0pf+0w
# There are >3M items... try bigWig! :)
wigToBigWig hapmapRecombRate.bedGraph /hive/data/genomes/hg18/chrom.sizes \
hapmapRecombRate.bw
ln -s `pwd`/hapmapRecombRate.bw /gbdb/hg18/bbi/
hgsql hg18 -e 'drop table if exists hapmapRecombRateBW; \
create table hapmapRecombRateBW (fileName varchar(255) not null); \
insert into hapmapRecombRateBW values ("/gbdb/hg18/bbi/hapmapRecombRate.bw");'
#############################################################################
# HAPMAP REL27 GENOTYPES (MERGED PHASE II+III) (DONE 2/25/09 angie)
# First, download release to /hive/data/outside...
mkdir -p /hive/data/outside/hapmap/genotypes/2009-02_phaseII+III/{excluded,forward}
cd /hive/data/outside/hapmap/genotypes/2009-02_phaseII+III
wget --timestamping \
ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-02_phaseII+III/00README.txt
cd excluded
wget --timestamping \
ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-02_phaseII+III/excluded/\*
cd ../forward
wget --timestamping \
ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-02_phaseII+III/forward/\*
# This directory's README refers to the README from the
# phaseIII-only 2009_01, which gives the file format and explains
# the population codes:
wget --timestamping -O 00README_2009-01_phaseIII.txt \
ftp://ftp.hapmap.org/pub/hapmap/public/genotypes/2009-01_phaseIII/00README.txt
# For details page... this is Coriell's NHGRI panel (all HapMap except
# CEPH): http://ccr.coriell.org/Sections/Collections/NHGRI/?SsId=11
# http://www.broad.mit.edu/mpg/hapmap3/
# Broad, BCM and Sanger have a nice phase3 writeup. Here is Broad's
# copy: http://www.broad.mit.edu/mpg/hapmap3/
# Now translate those into hapmapSnps* tables.
# NOTE FOR NEXT TIME: make this a cluster job. It takes ~half hour each pop!
# Could run the script on each downloaded file as a separate job, and then
# concatenate results (or just feed chr*_$pop to hgLoadBed).
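# A sketch of what that could look like (hypothetical wrapper script
# convertHapmapPop.csh around the perl one-liner below, writing one bed
# per input file; not actually run):
#   foreach f ($sourceDir/genotypes_chr*_*_r27_nr.b36_fwd.txt.gz)
#     echo "csh convertHapmapPop.csh $f {check out line+ out/$f:t:r:r.bed}" >> jobList
#   end
#   para create jobList; para try; para push
#   # then per pop: hgLoadBed hg18 hapmapSnps$pop out/genotypes_chr*_${pop}_*.bed ...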
mkdir -p /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
set sourceDir = /hive/data/outside/hapmap/genotypes/2009-02_phaseII+III/forward
foreach pop (ASW CEU CHB CHD GIH JPT LWK MEX MKK TSI YRI)
echo $pop
zcat $sourceDir/genotypes_chr*_${pop}_r27_nr.b36_fwd.txt.gz \
| perl -wpe 'chomp; \
if (/^rs# alleles c\w+ pos s\w+ a\w+# c\w+ protLSID assayLSID panelLSID QCcode NA/) { \
$_ = ""; # skip header lines \
} elsif (s/^(rs\d+) ([ACGT])\/([ACGT]) (chr\w+) (\d+) \+ ncbi_[bB]?36 .* QC\+ //) { \
($rsId, $obs1, $obs2, $chr, $end) = ($1, $2, $3, $4, $5); \
%compl = (A=>"T", C=>"G", G=>"C", T=>"A"); \
%hom = (); %het = (); \
# NOTE: one trouble-maker (other pop files have A/C with AC genotypes): \
if ($rsId eq "rs7059622" && "'$pop'" eq "YRI") { warn "Tweaking YRI rs7059622.\n"; } \
foreach my $al (split()) { \
next if ($al eq "NN"); \
$al =~ /^([ACGT])([ACGT])$/ || die "Unrecognized allele string $al"; \
($a1, $a2) = ($1, $2); \
# NOTE: one trouble-maker (other pop files have A/C with AC genotypes): \
if ($rsId eq "rs7059622" && "'$pop'" eq "YRI") \
{ $a1 = $compl{$a1}; $a2 = $compl{$a2}; } \
# The error that the trouble-maker triggered: \
if (($a1 !~ /^[$obs1$obs2]$/) || ($a2 !~ /^[$obs1$obs2]$/)) \
{ die "$rsId (${chr}_'$pop'): obs $obs1/$obs2 !~ $a1$a2!\n\t"; } \
if ($a1 eq $a2) { $hom{$a1}++; } else { $het{$a1}++; $het{$a2}++; } \
} \
$start = $end - 1; \
$hom1 = $hom{$obs1} || 0; $hom2 = $hom{$obs2} || 0; \
$het = $het{$obs1} || 0; $het2 = $het{$obs2} || 0; \
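# score = minor allele frequency x 1000: count of obs2 alleles over all \
# alleles, folded at 500 so the rarer allele's frequency is reported \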
$score = (1000 * (2*$hom2 + $het) / (2*($hom1 + $hom2 + $het))); \
if ($score >= 500) { $score = 1000 - $score; } \
$score = int($score + 0.5); \
if ($het != $het2) { die "het{$obs1} ($het{$obs1}) != het{$obs2} ($het{$obs2})"; } \
$_ = "$chr\t$start\t$end\t$rsId\t$score\t+\t$obs1/$obs2\t$obs1\t$hom1\t$obs2\t$hom2\t$het\n"; \
} else { \
die "Unrecognized format:\n$_\n\t"; \
}' > hapmapSnps$pop.bed
end
wc -l hapmapSnps*.bed
# 1561453 hapmapSnpsASW.bed
# 4030774 hapmapSnpsCEU.bed
# 4052336 hapmapSnpsCHB.bed
# 1306196 hapmapSnpsCHD.bed
# 1407877 hapmapSnpsGIH.bed
# 4052423 hapmapSnpsJPT.bed
# 1529764 hapmapSnpsLWK.bed
# 1410265 hapmapSnpsMEX.bed
# 1537638 hapmapSnpsMKK.bed
# 1419921 hapmapSnpsTSI.bed
# 3984356 hapmapSnpsYRI.bed
foreach pop (ASW CEU CHB CHD GIH JPT LWK MEX MKK TSI YRI)
hgLoadBed hg18 hapmapSnps$pop hapmapSnps$pop.bed -renameSqlTable \
-sqlTable=$HOME/kent/src/hg/lib/hapmapSnps.sql
end
#Reading hapmapSnpsASW.bed
#Loaded 1561453 elements of size 12
#Reading hapmapSnpsCEU.bed
#Loaded 4030774 elements of size 12
#Reading hapmapSnpsCHB.bed
#Loaded 4052336 elements of size 12
#Reading hapmapSnpsCHD.bed
#Loaded 1306196 elements of size 12
#Reading hapmapSnpsGIH.bed
#Loaded 1407877 elements of size 12
#Reading hapmapSnpsJPT.bed
#Loaded 4052423 elements of size 12
#Reading hapmapSnpsLWK.bed
#Loaded 1529764 elements of size 12
#Reading hapmapSnpsMEX.bed
#Loaded 1410265 elements of size 12
#Reading hapmapSnpsMKK.bed
#Loaded 1537638 elements of size 12
#Reading hapmapSnpsTSI.bed
#Loaded 1419921 elements of size 12
#Reading hapmapSnpsYRI.bed
#Loaded 3984356 elements of size 12
rm bed.tab; nice gzip *.bed
#############################################################################
# HAPMAP REL27 ORTHOLOGOUS ALLELES (DONE 3/4/09 angie)
# Similar procedure to snp129Ortho, but we make one table per species
# because they are independent subtracks of HapMap SNPs.
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
# Glom all human info that we need for the final table onto the
# name, to sneak it through liftOver: rsId|chr|start|end|obs|strand
awk 'BEGIN{OFS="\t";} \
{print $1, $2, $3, \
$4 "|" $1 "|" $2 "|" $3 "|" $7 "|" $6, \
0, $6;}' \
hapmapSnps???.bed \
| sort -u -k1,1 -k2n,2n \
> hapmapSnpsForLiftOver.bed
wc -l hapmapSnpsForLiftOver.bed
#4165831 hapmapSnpsForLiftOver.bed
# Orthologous allele locations:
mkdir run.liftOChimp
cd run.liftOChimp
mkdir split out
splitFile ../hapmapSnpsForLiftOver.bed 25000 split/chunk
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg18/bed/liftOver/hg18ToPanTro2.over.chain.gz \
\{check out exists out/panTro2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
ssh pk
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III/run.liftOChimp
para make jobList
#Completed: 167 of 167 jobs
#CPU time in finished jobs: 31364s 522.74m 8.71h 0.36d 0.001 y
#IO & Wait Time: 800s 13.33m 0.22h 0.01d 0.000 y
#Average job time: 193s 3.21m 0.05h 0.00d
#Longest finished job: 431s 7.18m 0.12h 0.00d
#Submission to last job: 442s 7.37m 0.12h 0.01d
mkdir ../run.liftOMac
cd ../run.liftOMac
mkdir out
ln -s ../run.liftOChimp/split .
cp /dev/null jobList
foreach f (split/chunk*)
echo liftOver $f \
/hive/data/genomes/hg18/bed/liftOver/hg18ToRheMac2.over.chain.gz \
\{check out exists out/rheMac2.$f:t.bed\} out/hg18.$f:t.unmapped \
>> jobList
end
para make jobList
#Completed: 167 of 167 jobs
#CPU time in finished jobs: 2482s 41.36m 0.69h 0.03d 0.000 y
#IO & Wait Time: 1361s 22.69m 0.38h 0.02d 0.000 y
#Average job time: 23s 0.38m 0.01h 0.00d
#Longest finished job: 33s 0.55m 0.01h 0.00d
#Submission to last job: 97s 1.62m 0.03h 0.00d
# Concatenate the liftOver results, sorting by ortho pos in order to
# efficiently access 2bit sequence in getOrthoSeq. The output of
# that is swizzled so that a glom of ortho coords is the first column,
# and then we sort by that for joining with base quality info.
# Ditto for macaque. ~5 minutes per species:
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
sort -k1,1 -k2n,2n run.liftOChimp/out/panTro2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /hive/data/genomes/panTro2/panTro2.2bit \
| awk 'BEGIN{OFS="\t";} {print $2 ":" $3 ":" $4, $5, $6, $1;}' \
| sort > panTro2.orthoGlom.txt
sort -k1,1 -k2n,2n run.liftOMac/out/rheMac2.chunk*.bed \
| ~/kent/src/hg/snp/snpLoad/getOrthoSeq.pl /hive/data/genomes/rheMac2/rheMac2.2bit \
| awk 'BEGIN{OFS="\t";} {print $2 ":" $3 ":" $4, $5, $6, $1;}' \
| sort > rheMac2.orthoGlom.txt
wc -l panTro2.orthoGlom.txt rheMac2.orthoGlom.txt
# 4057739 panTro2.orthoGlom.txt
# 3750076 rheMac2.orthoGlom.txt
# Get base qualities -- ~12-16min per species.
cut -f 1 panTro2.orthoGlom.txt | sed -e 's/:/\t/g' \
| hgWiggle -db=panTro2 -lift=1 -doAscii -bedFile=stdin quality \
| varStepToBedGraph.pl stdin \
| awk 'BEGIN{OFS="\t";} {print $1 ":" $2 ":" $3, int($4+0.5);}' \
| sort > panTro2.baseQuals.txt
#Processed 4003968 lines input, 4003685 data lines, 47 variable step declarations
cut -f 1 rheMac2.orthoGlom.txt | sed -e 's/:/\t/g' \
| hgWiggle -db=rheMac2 -lift=1 -doAscii -bedFile=stdin quality \
| varStepToBedGraph.pl stdin \
| awk 'BEGIN{OFS="\t";} {print $1 ":" $2 ":" $3, int($4+0.5);}' \
| sort > rheMac2.baseQuals.txt
#Processed 3749772 lines input, 3749645 data lines, 21 variable step declarations
# Join the allele-glom with the base qual-glom and swizzle columns into
# the right order for a hapmapAllelesOrtho table.
join -a 1 -e 0 panTro2.orthoGlom.txt panTro2.baseQuals.txt \
| perl -wpe 'chomp; ($oG, $oA, $oStr, $hG, $bQ) = split; \
($oC, $oS, $oE) = split(":", $oG); \
($rs, $hC, $hS, $hE, $hO, $hStr) = split(/\|/, $hG); \
unless (defined $bQ) { \
if ($oC =~ /^chr(21|Y|Y_random)$/) { $bQ = 98; } # per panTro2 quality track desc \
elsif ($oC eq "chrM") { $bQ = 0; } \
else { die "missing qual for $oC: $_\n\t"; } } \
$_ = "$hC\t$hS\t$hE\t$rs\t$bQ\t$hStr\t\t$hO\t$oC\t$oS\t$oE\t$oStr\t$oA\n";' \
| sort -k1,1 -k2n,2n \
> hapmapAllelesChimp.bed
wc -l hapmapAllelesChimp.bed
#4057739 hapmapAllelesChimp.bed
join -a 1 -e 0 rheMac2.orthoGlom.txt rheMac2.baseQuals.txt \
| perl -wpe 'chomp; ($oG, $oA, $oStr, $hG, $bQ) = split; \
($oC, $oS, $oE) = split(":", $oG); \
($rs, $hC, $hS, $hE, $hO, $hStr) = split(/\|/, $hG); \
unless (defined $bQ) { die "missing qual for $oC: $_\n\t"; } \
$_ = "$hC\t$hS\t$hE\t$rs\t$bQ\t$hStr\t\t$hO\t$oC\t$oS\t$oE\t$oStr\t$oA\n";' \
| sort -k1,1 -k2n,2n \
> hapmapAllelesMacaque.bed
wc -l hapmapAllelesMacaque.bed
#3750076 hapmapAllelesMacaque.bed
# Load tables.
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
hgLoadBed hg18 hapmapAllelesChimp hapmapAllelesChimp.bed \
-tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapAllelesOrtho.sql
#Loaded 4057739 elements of size 13
hgLoadBed hg18 hapmapAllelesMacaque hapmapAllelesMacaque.bed \
-tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapAllelesOrtho.sql
#############################################################################
# HAPMAP REL27 SUMMARY FOR HGTRACKS FILTERING (DONE 3/5/09 angie)
cd /hive/data/genomes/hg18/bed/hapmap/genotypes/2009-02_phaseII+III
time hapmapPhaseIIISummary .
#115.244u 5.009s 2:10.08 92.4% 0+0k 0+0io 2pf+0w
time hgLoadBed hg18 hapmapPhaseIIISummary hapmapPhaseIIISummary.bed \
-tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/hapmapPhaseIIISummary.sql
#Loaded 4166007 elements of size 18
#33.401u 3.275s 1:46.95 34.2% 0+0k 0+0io 0pf+0w
#############################################################################
# GERP Conservation scoring and elements for Ensembl 31-way alignments
# From Javier Guerroro
# ENCODE-related data (requested by Margulies, for use by ENCODE analysis group)
# (2009-03-05 kate)
ssh hgwdev
cd /cluster/data/hg18/bed
mkdir -p ensembl31wayGerp/lab
cd ensembl31wayGerp/lab
wget -r ftp://ftp.ebi.ac.uk/pub/databases/ensembl/encode/31way_msa/
cd ..
bzcat lab/31way_gerp_elements.bed.bz2 | \
tail -n +2 | \
sed 's/31way_gerp_elem_365000000/gerp31./' | \
hgLoadBed hg18 ensembl31wayGerpElements stdin \
-sqlTable=$HOME/kent/src/hg/lib/encode/broadPeak.sql -renameSqlTable
# Loaded 1464897 elements of size 9
cat > we.csh << 'EOF'
foreach f (lab/*.wig.bz2)
echo $f
bzcat $f | tail -n +2 | wigEncode stdin temp.wig temp.wib
end
'EOF'
bzcat lab/*.wig.bz2 | tail -n +2 | \
wigEncode stdin ensembl31wayGerpScores.wig ensembl31wayGerpScores.wib
# load database
mkdir /gbdb/hg18/wib
ln -s `pwd`/ensembl31wayGerpScores.wib /gbdb/hg18/wib
hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 ensembl31wayGerpScores ensembl31wayGerpScores.wig
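# Optional spot check (not part of the original record): summary stats for
# one chromosome can be pulled straight from the loaded wiggle, with
# something like:
#   hgWiggle -db=hg18 -chr=chr21 -doStats ensembl31wayGerpScores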
############################################################################
# VEGA GENES UPDATE (BUILD 33) (DONE 2009-03-11 Andy)
mkdir /cluster/data/hg18/bed/vega33
cd /cluster/data/hg18/bed/vega33
wget --timestamping "ftp://ftp.sanger.ac.uk/pub/vega/human/*" \
"ftp://ftp.sanger.ac.uk/pub/vega/human/pep/*.tot.fa.gz"
zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \
| grep "^chr" > nonHaps.gtf
zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \
| grep -v "^chr" > haps.gtf
awk 'BEGIN{OFS="\t";FS="\t";}{ if ($1 == "c6_COX") { if (($4 >= 28688544) && ($5 <= 33420241)) print; } else if ($1 == "c6_QBL") { if (($4 >= 28885510) && ($5 <= 33451440)) print;}}' haps.gtf > keptHaps.gtf
liftUp -type=.gtf lifted.gtf /cluster/data/hg18/jkStuff/ensGene.haplotype.lift carry keptHaps.gtf
cat nonHaps.gtf lifted.gtf > all.gtf
gzip all.gtf
rm *.gtf
gtfToGenePred -infoOut=infoOut.txt -genePredExt all.gtf.gz stdout | gzip > all.gp.gz
/cluster/home/hiram/kent/src/hg/utils/automation/extractGtf.pl infoOut.txt > ensGtp.tab
genePredCheck -db=hg18 all.gp.gz
#checked: 69859 failed: 0
zcat all.gtf.gz | grep -i pseudo > pseudo.gtf
zcat all.gtf.gz | grep -v -i pseudo > not.pseudo.gtf
gtfToGenePred -genePredExt pseudo.gtf pseudo.gp
gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp
genePredCheck -db=hg18 pseudo.gp
#checked: 6901 failed: 0
genePredCheck -db=hg18 not.pseudo.gp
#checked: 62958 failed: 0
hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp
hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp
##############################################################################
# UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram)
mkdir /hive/data/genomes/hg18/bed/ucscToEnsembl
cd /hive/data/genomes/hg18/bed/ucscToEnsembl
awk '{printf "%s\t%s\n", $4, $2}' ../../jkStuff/ensGene.haplotype.lift \
> ucscToEnsembl.tab
cat << '_EOF_' > ucscToEnsembl.sql
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
ucsc varchar(255) not null, # UCSC chromosome name
ensembl varchar(255) not null, # Ensembl chromosome name
#Indices
PRIMARY KEY(ucsc(21))
);
'_EOF_'
hgsql hg18 < ucscToEnsembl.sql
hgsql hg18 \
-e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'
awk '{printf "%s\t%d\n", $2, -$1}' ../../jkStuff/ensGene.haplotype.lift \
> ensemblLift.tab
cat << '_EOF_' > ensemblLift.sql
# UCSC offset to Ensembl coordinates
CREATE TABLE ensemblLift (
chrom varchar(255) not null, # Ensembl chromosome name
offset int unsigned not null, # offset to add to UCSC position
#Indices
PRIMARY KEY(chrom(6))
);
'_EOF_'
hgsql hg18 < ensemblLift.sql
hgsql hg18 \
-e 'LOAD DATA LOCAL INFILE "ensemblLift.tab" INTO TABLE ensemblLift'
##############################################################################
# FOX2 CLUSTERS (DONE 2009-04-08, Andy)
cp cluster.combine.bed /hive/data/genomes/hg18/bed/fox2ClipSeq
## (got the data as an attachment from Gene Yeo)
cd /hive/data/genomes/hg18/bed/fox2ClipSeq
grep chr cluster.combine.bed | cut -f1-4 | \
bedSort stdin fox2ClipClusters.hg17.bed
liftOver fox2ClipClusters.hg17.bed \
/gbdb/hg17/liftOver/hg17ToHg18.over.chain.gz \
fox2ClipClusters.bed unmapped.bed
hgLoadBed hg18 fox2ClipClusters{,.bed}
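# Optional sanity check (not part of the original record): see how many
# clusters survived the hg17 -> hg18 liftOver.
wc -l fox2ClipClusters.hg17.bed fox2ClipClusters.bed unmapped.bed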
##############################################################################
# RE-BUILD sno/miRNA TRACK (DONE, 2009-06-11 - 2009-06-13, hartera)
# The data in this track is out of date so update the track.
mkdir -p /hive/data/genomes/hg18/bed/wgRna-2009-06-11
cd /hive/data/genomes/hg18/bed/wgRna-2009-06-11
# Download GFF file of latest miRNA annotations from miRBase at the
# Wellcome Trust Sanger Institute (WTSI). This is Release 13.0 (March
# 2009)
wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/CURRENT/genomes/hsa.gff
# Re-format, need to add "chr" to the beginning of each line.
sed -e 's/^/chr/' hsa.gff > hsMirBaseFormat.gff
# Remove extra "chr" in comment lines
perl -pi.bak -e 's/chr#/#/' hsMirBaseFormat.gff
# Change chrMT to chrM
perl -pi.bak -e 's/chrMT/chrM/' hsMirBaseFormat.gff
# Remove all but ID name in last field
sed -e 's/\";//g' hsMirBaseFormat.gff | sed -e 's/ID=\"//g' \
| sed -e 's/ACC=\"MI[0-9]*\s//' > hsMirBaseFormatIdOnly.gff
# use score 960 for + strand and 480 for - strand. This will show
# up black on the track for + strand and grey for - strand.
# Starts appear to be 1-based when compared to miRNAs in current track
# and those in Ensembl.
# Confirmed with Sam Griffith-Jones (one of the authors of miRBase,
# sam.griffith-jones@manchester.ac.uk) that these GFF coordinates
# are 1-based.
# Also add thickStart and thickEnd columns and "miRNA" for type.
awk 'BEGIN {FS="\t"} {OFS="\t"} \
{if ($0 !~ /#/ && $7 == "+") \
print $1, $4-1, $5, $9, 960, $7, 0, 0, "miRNA"; \
else if ($0 !~ /#/ && $7 == "-") \
print $1, $4-1, $5, $9, 480, $7, 0, 0, "miRNA";}' \
hsMirBaseFormatIdOnly.gff > hsMirBaseFormatIdOnly.bed
# 2009-06-12
# snoRNAs are from snoRNABase at http://www-snorna.biotoul.fr/
# Download coordinates for hg18 from
# http://www-snorna.biotoul.fr/coordinates.php
# This is version 3 of the database.
# save as tab-separated file: snoRNABaseVersion3Coords.txt and remove
# first and last lines.
perl -pi.bak -e 's/\"//g' snoRNABaseVersion3Coords.txt
# Reformat to BED format with thickStart and thickEnd set to 0.
awk 'BEGIN {FS="\t"} {OFS="\t"} \
{if ($4 == "+") \
print $1, $2-1, $3, $5, 960, $4, 0, 0,$6; \
else if ($4 == "-") \
print $1, $2-1, $3, $5, 480, $4, 0, 0,$6;}' \
snoRNABaseVersion3Coords.txt > snoRNABaseVersion3Coords.bed
# Merge the miRNA and snoRNA files together
cat hsMirBaseFormatIdOnly.bed snoRNABaseVersion3Coords.bed \
> wgRna20090611.bed
# Load into separate table rather than overwriting wgRna
cp -p /cluster/home/hartera/src/hg/lib/wgRna.sql wgRnaJun09.sql
perl -pi.bak -e 's/TABLE wgRna/TABLE wgRnaJun09/' wgRnaJun09.sql
hgLoadBed -sqlTable=wgRnaJun09.sql hg18 wgRnaJun09 wgRna20090611.bed
# Reading wgRna20090611.bed
# Loaded 1120 elements of size 9
# Sorted
# Creating table definition for wgRnaJun09
# Saving bed.tab
# Loading hg18
# Clean up
rm *.bak
hgsql -e 'select count(*) from wgRna;' hg18
# 1059
# for miRNAs: 685 (676 unique names)
# and others: 374 including 21 scaRNA
hgsql -e 'select count(*) from wgRnaJun09;' hg18
# 1120
# for miRNAs: 718 (705 unique)
# and others: 402 including 21 scaRNA
# 2009-06-13
# Renamed the old wgRna track to wgRnaOld and renamed the new wgRnaJun09
# track to wgRna. Will keep the old track around for a while until
# new track checked and QA'd.
hgsql -e 'alter table wgRna rename wgRnaOld;' hg18
hgsql -e 'alter table wgRnaJun09 rename wgRna;' hg18
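# Optional QA comparison (not part of the original record): check how much
# the rebuilt track overlaps the retired one, e.g.:
featureBits hg18 wgRnaOld wgRna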
##################
## Uniqueness Track: Step one (courtesy of John Castle, Rosetta)
## Make oligos of length XX
# Perl one-liner to make a batch file
# I've included the perl files CNV_makereads2.pl (simply uses substr on a
# chromosome) and fastagrep.pl (to remove sequences with Ns). The files
# chr$x.fa are the individual chromosomes.
perl -e 'for ($i = 1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} if ($i == 24) {$x = 'Y';} if ($i == 25) {$x = 'M';} print "~/DTcode/CNV_makereads2.pl 100 /info/genome/Projects/721/ref/chr$x.fa | fastagrep.pl -v n > chr$x.fa\n";}' > batch_chr_get
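# ---- ~/DTcode/CNV_makereads2.pl (contributed script referenced above, reproduced below) ----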
#!/usr/bin/perl -w
#---------------------------------------------------------------------
# C O P Y R I G H T N O T I C E
#---------------------------------------------------------------------
# Copyright (c) 2001 Rosetta Inpharmatics, Inc.
# 12040 115th Avenue NE, Kirkland, WA 98034-6900
# All Rights Reserved. Reproduction, adaptation, or
# translation without prior written permission of
# Rosetta Inpharmatics, Inc. is prohibited.
#---------------------------------------------------------------------
# CNV_makereads.pl
# $Id$
#use lib ('/home/castlej/perl/','/home/castlej/OSDTools/','/home/castlej/DTcode/');
#use strict;
my $oligo_length = $ARGV[0];
my $file = $ARGV[1];
open(IN,$file);
$/ = "\n>";# change input line separator to '>' to suck up FASTA sequences
while ($line= <IN>) {
$line =~ s/^>//m;
# remove '>' from end of $line
$line =~ s/>$//m;
# remove Unigene lines starting with '#'
$line =~ s/\n\#.*$//m;
# get sequence id
$line =~ /^\s*(\S+).*([^\0]*)/;
$id = $1;
$seq = $2;
$seq =~ s/\n//g;
}
if ($id =~ /(chr\S+)\.nib/) {
$chr = $1;
} elsif ($id =~ /(chr\S+)/) {
$chr = $1;
}
for ($i = 0; $i <length($seq)-$oligo_length; $i++) {
$a = substr($seq,$i,$oligo_length);
$j = $i+$oligo_length;
print ">$chr:$i-$j\n$a\n";
}
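# ---- fastagrep.pl (contributed script referenced above, reproduced below) ----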
#!/usr/bin/perl -w
#---------------------------------------------------------------------
# C O P Y R I G H T N O T I C E
#---------------------------------------------------------------------
# Copyright (c) 2000,2001,2002 Rosetta Inpharmatics, Inc.
# 12040 115th Avenue NE, Kirkland, WA 98034-6900
# All Rights Reserved. Reproduction, adaptation, or
# translation without prior written permission of
# Rosetta Inpharmatics, Inc. is prohibited.
#---------------------------------------------------------------------
#
# $Id$
#
# finds selected sequences in FASTA by regex matching in defline or sequence
use strict;
my( $option,
$regex,
@regexes,
%tofind,
$exceptflag,
$key,
$value,
$line,
);
$exceptflag = 0;
unless (scalar(@ARGV)) {
print "\nUsage: $0 [OPTION] PATTERN [FASTAFILE]\n";
print "$0 finds sequences by pattern matching in FASTA format data\n\n";
exit;
}
while ((scalar(@ARGV)) && ($ARGV[0] =~ /^-(\w+)/)) {
$option = $1;
shift(@ARGV);
if ($option =~ /v/) { # user wants sequences NOT matching regex(es)
$exceptflag = 1;
}
if ($option =~ /s/) { # regex on command line
push(@regexes, shift(@ARGV));
}
if ($option =~ /f/) { # user wants list of regexes from file
open(INHANDLE, "<$ARGV[0]") ||
die "$0: error, can't open regex list file $ARGV[0]\n";
while (defined($regex = <INHANDLE>)) {
chomp $regex;
push(@regexes, $regex);
}
shift(@ARGV);
}
}
if (scalar(@regexes) < 1) { push(@regexes, shift(@ARGV)); }
$/ = "\n>"; # change input line separator to suck up FASTA sequences
SEQUENCE:
while (defined($line = <>)) {
# remove '>' from start of first $line
$line =~ s/^>//m;
# stick '>' back on all $lines
$line = '>'.$line;
# remove '>' from end of $line
$line =~ s/>$//m;
# remove Unigene lines starting with '#'
$line =~ s/\n\#.*$//m;
foreach $regex (@regexes) {
if ($line =~ /$regex/) {
unless ($exceptflag) { print $line; }
next SEQUENCE;
}
}
if ($exceptflag) { print $line; }
}
# Submit batch file to cluster (we use LSF), each line is a submission
perl -ne 'chomp; $a = "bsub -q short64 \"$_\"\n"; system($a);' batch_chr_get
####################
# Uniqueness Step two
# I've used an older version of BWA. The newer version from sourceforge
# outputs a binary file which then must be converted to a text file.
# HG18 is the human genome
# I could include banything_2GBNew.pl but it is simply a cluster "chunk and submit" script
# Method 1 perl -e 'for ($i =1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} elsif ($i == 24) {$x = 'Y';} elsif ($i == 25) {$x = 'M';} print "banything_2GbNew2.pl -a /ifs65/dtap/bin/bwa/bwa-0.2.0/bwa -z 1000000 -in chr$x.fa -o chr$x.bwa -stdout chr$x.bwa -pre \"aln -o 0 /info/dtap/projects/1057_CNV/HG18/HG18 \" -suf \" \" \n";}' >! batch_banything
chmod +777 batch_banything
batch_banything
# Method 2 perl -e 'for ($i =1;$i<= 25; $i++) {$x = $i; if ($i == 23) {$x = 'X';} elsif ($i == 24) {$x = 'Y';} elsif ($i == 25) {$x = 'M';} print "/ifs65/dtap/bin/bwa/bwa-0.2.0/bwa aln -o 0 /info/dtap/projects/1057_CNV/HG18/HG18 chr$x.fa > chr$x.bwa\n"}' >! batch_banything
chmod +777 batch_banything
perl -ne 'chomp; $a = "bsub -q long64 \"$_\"\n"; system($a);' batch_anything
#####################
# Uniqueness Step three
# I ran this one-liner from a higher level directory
perl -e '$pwd = `pwd`; chomp($pwd); @a = `ls`; foreach $dir (@a) {chomp ($dir); unless ($dir =~ /(\d+)mer_2nd/) {next;}; @b = `ls $dir/*fa.bwa`; foreach $file (@b) {chomp($file); $f = "$pwd/$file"; $f =~ /^(\S+chr[^\.]+)\.*/; $e = $1; print "~/DTcode/CNV_parseBWA_wiggle.pl 100 1 $f\* > $e.quality.100.wiggle\n";}}' > batch_wiggle
# Submit batch file to cluster (we use LSF), each line is a submission
perl -ne 'chomp; $a = "bsub -q long64 \"$_\"\n"; system($a);' batch_wiggle
#!/usr/bin/perl -w
# John Castle
# May 19, 2009
# $Cap a maximum value to clip data with
# $Use_score whether to output the uniqueness score or the number of hits
# @FilesIn the BWA text output files to scan
# ** NOTE ** The newer BWA algorithm outputs a binary file that is then made into a text file using BWA again.
# However, the text file output has a slightly different format so the parsing will need to change.
($Cap, $Use_score, @FilesIn) = @ARGV;
if ($FilesIn[0] =~ /\.gz/) {
open(IN,"gzip -dc $FilesIn[0] |")
} else {
open(IN,$FilesIn[0]);
}
#### Description
@a = split("\t",<IN>);
$a[6] =~ /(\d+)/;
$len = $1;
close(IN);
### Wiggle header text
if ($Use_score == 0) {
print "track type=wiggle_0 name=\"Alignment scores of $len\mer as\" description=\"Unique $len mer alignments\" color=100,50,150 gridDefault=on yLineOnOff=on visibility=full maxHeightPixels=40:40:12\n";
} else {
print "track type=wiggle_0 name=\"$len\mer alignment scores\" description=\"$len\mer alignment scores from BWA/MAQ, where 37 indicates a unique alignment\" color=100,50,150 gridDefault=on yLineOnOff=on visibility=full maxHeightPixels=40:40:12\n";
}
### Parse through file(s)
foreach $file (@FilesIn) {
if ($file =~ /\.gz/) {
open(IN,"gzip -dc $file |");
} else {
open(IN,$file);
}
@a = split("\t",<IN>);
$a[0] =~ /(chr\S+):(\d+)/;
$Chr = $1;
$start = $2;
$score = $a[5];
$hits = $a[11];
if ($hits > $Cap) {$hits = $Cap;}
if ($Use_score == 1) {$value = $score;}
else {$value = $hits;}
while (<IN>) { # Make wiggle track, with start and end coordinates for same scoring regions
@a = split("\t",$_);
if ($#a <15) {
next;
}
$a[0] =~ /(chr\S+):(\d+)/;
$chr = $1;
$pos = $2;
$score = $a[5];
$hits = $a[11];
if ($hits > $Cap) {$hits = $Cap;}
if ($Use_score == 1) {$x = $score;
} else {$x = $hits;}
if ($x != $value) {
print "$Chr\t$start\t$pos\t$value\n";
$Chr = $chr;
$value = $x;
$start = $pos;
}
}
print "$Chr\t$start\t$pos\t$value\n";
close(IN);
}
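# The record above stops at the per-chromosome *.quality.100.wiggle files
# (4-column, bedGraph-style output from CNV_parseBWA_wiggle.pl).  If these
# were to be loaded as a UCSC wiggle track, the usual pattern would look
# something like the sketch below (not run here; the uniqueness100mer
# track/table name is hypothetical):
#   cat *mer_2nd/chr*.quality.100.wiggle | grep -v '^track' \
#       | wigEncode stdin uniqueness100mer.wig uniqueness100mer.wib
#   ln -s `pwd`/uniqueness100mer.wib /gbdb/hg18/wib
#   hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 uniqueness100mer uniqueness100mer.wig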
############################################################################
# Re-Run equCab2 alignment (DONE - 2009-06-29,07-02 - Hiram)
mkdir /hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29
cd /hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29
cat << '_EOF_' > DEF
# Human vs. Horse
BLASTZ_M=50
# TARGET: Human hg18
SEQ1_DIR=/scratch/data/hg18/bothMaskedNibs
SEQ1_LEN=/scratch/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/scratch/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-workhorse=hgwdev \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
# real 582m47.015s
# failed due to power failure - Mon Jun 29 23:32:54 PDT 2009
time doBlastzChainNet.pl `pwd`/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-continue=chainRun -workhorse=hgwdev \
-chainMinScore=3000 -chainLinearGap=medium > chainRun.log 2>&1 &
# real 430m13.886s
cat fb.hg18.chainEquCab2Link.txt
# 1647122438 bases of 2881515245 (57.162%) in intersection
mkdir /hive/data/genomes/equCab2/bed/blastz.hg18.swap
cd /hive/data/genomes/equCab2/bed/blastz.hg18.swap
time doBlastzChainNet.pl \
/hive/data/genomes/hg18/bed/lastzEquCab2.2009-06-29/DEF \
-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
-swap -workhorse=hgwdev \
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
# real 238m42.004s
cat fb.equCab2.chainHg18Link.txt
# 1622340736 bases of 2428790173 (66.796%) in intersection
############################################################################
# Fantom Cage 4 Track (2009-07-16)
cd /projects/compbiousr/sugnet/projects/cage-20090428
mkdir data
cd data
# Get the Human tags from Riken's download site.
wget -r -l 3 http://fantom.gsc.riken.jp/4/download/Tables/human/CAGE/mapping/
# Apparently time series with hours at:
# 4,5,6,8,10,11,15,21,22,27,28,33,34,35,37,40,42,43,45,47,48,49,51,52,53,57,59,61,62,63,64,65,69,73,74,91,92,93,h95 ctrls, i02, i03
# Go to the data directory
cd /projects/compbiousr/sugnet/projects/cage-20090428/data/fantom.gsc.riken.jp/4/download/Tables/human/CAGE/mapping/
# Unzip data
for bz in `ls *.bz2`; do \
echo "Unzipping $bz"; \
bunzip2 $bz; \
done
# From column headers it looks like the values of interest are:
# 0 = id
# 1 = library_count
# 2 = edit_string
# 3 = chrom
# 4 = strand
# 5 = start
# 6 = end
# Pull the raw scores into a single file
cat h*_mapping.tbl.txt | grep -v '^#' | grep -v 'library_count' | grep 'chr' | perl -ne '$l=$_; @w = split /\t/, $l; print "$w[3]\t$w[5]\t$w[6]\t$w[0]\t$w[1]\t$w[4]\n";' > all.wscores.bed
cat << '_EOF_' > toBed.pl
#!/usr/bin/perl
$prefix = shift(@ARGV);
$prefix =~ s/h/H/g;
while($l = <>) {
if(!($l=~ /^\#/) && !($l=~/^id/)) {
chomp($l);
@w = split /\t/, $l;
$score = 100 * $w[1];
if($score > 1000) {
$score = 1000;
}
$name = $prefix;
$size = $w[6] - $w[5];
print "$w[3]\t$w[5]\t$w[6]\t$prefix\t$score\t$w[4]\t$w[5]\t$w[6]\t0\t1\t$size,\t0,\n";
}
}
'_EOF_'
# << happy emacs
chmod 755 toBed.pl
# Make the top level bed track
for f in `ls *mapping.tbl.txt`; do
root=`basename $f .txt`;
prefix=`basename $f _mapping.tbl.txt`;
bed=$root.bed;
echo "Reading from $f into $bed with prefix $prefix";
toBed.pl $prefix < $f > $bed;
done;
# Call program in stats mode to generate summary statistics about how many reads there are in a sliding window around
# sites with tags
cageSingleTrack -input=all.wscores.bed -forward=all.forward.plaw.scores -reverse=all.reverse.plaw.scores -stats-only
# Grab every 100th record to make a bite (byte?) sized chunk for R
cat all.forward.plaw.scores | perl -e '$c = 0; while($l=<>) { if($c++ % 100 == 0) { print "$l"; } }' > sample.txt
# Some R code (run interactively in R) to fit a power law model and get the
# coefficient via a log/log line fit
d = read.table('sample.txt');
# Grab all the data less than 200 counts (81% of data) as that is where the model really fits
dd = d$V4[d$V4 < 200]
# Use hist command to find counts at each bucket size
h = hist(dd, 200, plot=F)
# Take the logs
y = log10(h$counts)
x = log10(h$breaks[1:198])
# Fit a robust line
library(MASS)
r = rlm(y~x)
# Call:
# rlm(formula = y ~ x)
# Converged in 5 iterations
#
# Coefficients:
#(Intercept) x
# 3.987744 -1.196954
# Visually note that the data fits a power law nicely
plot(log10(h$breaks[1:198]),log10(h$counts), xlab="Log10 Tags In Window", ylab="Log10 Number of Times Occurring", main="Distribution of CAGE Tags in Sliding 35bp Window")
abline(r)
# Using the coefficient learned above predict the posterior probability of seeing this observation
cageSingleTrack -input=all.wscores.bed -forward=all.forward.plaw.bg2 -reverse=all.reverse.plaw.bg2 -alpha=1.196954 -xmax=198
# Load up the bed graph tracks
hgLoadBed -bedGraph=4 hg18 FantomCageForwardPowerLawGraph all.forward.plaw.bg2
hgLoadBed -bedGraph=4 hg18 FantomCageReversePowerLawGraph all.reverse.plaw.bg2
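# Optional sanity check (not part of the original record): row counts for
# the two loaded bedGraph tables.
hgsql hg18 -e 'SELECT COUNT(*) FROM FantomCageForwardPowerLawGraph;'
hgsql hg18 -e 'SELECT COUNT(*) FROM FantomCageReversePowerLawGraph;'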
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
# rnaBinding RNA Binding Proteins (2009-07-28 markd)
# contributor: Jeremy Sanford <sanford@biology.ucsc.edu>
# sfrs1Input BED table:
# need to drop color, as it's in the wrong column
# skip header
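# (tawk here is the local tab-delimited awk wrapper, roughly equivalent to
# awk -F'\t' -v OFS='\t'.)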
tawk 'NR>1{print $1,$2,$3,$4,$5,$6}' Input_sequence_blocks.bed | hgLoadBed hg18 sfrs1Input stdin
# sfrs1Clip BED table:
# skip header
tawk 'NR>1{print $1,$2,$3,$4,$5,$6}' SFRS1_CLIP_sequence_blocks.bed | hgLoadBed hg18 sfrs1Clip stdin
# SFRS1_consensus_sites.wig
tawk 'NR>1' SFRS1_consensus_sites.wig | wigEncode stdin sfrs1ConsensusSites.wig sfrs1ConsensusSites.wib
# Converted stdin, upper limit 11.63, lower limit -28.64
hgLoadWiggle -pathPrefix=/gbdb/hg18/wib hg18 sfrs1ConsensusSites sfrs1ConsensusSites.wig
ln -s $(pwd -P)/sfrs1ConsensusSites.wib /gbdb/hg18/wib/
############################################################################
-# VEGA GENES UPDATE TO BUILD 35 (DONE, 2009-07-04, hartera)
+# VEGA GENES UPDATE TO BUILD 35 (DONE, 2009-08-04, hartera)
# Needs updating as the current version is build 33.
# Download the human VEGA Genes posted on ftp site on 2009-03-31
+# 2009-08-03 (hartera) - Added code to register track handler for
+# vegaGeneComposite.
+# 2009-08-15 - 2009-08-16 (hartera) - Added code to allow use of radio buttons
+# on the configuration page for the track item labels. Modified code so it
+# can be shared with Ensembl to create the links to Vega transcript, gene
+# and protein reports on the details pages.
+# 2009-08-22 - Finished code for adding Vega report URLs to the details pages.
+# Loaded the vegaGtp table.
+
mkdir /hive/data/genomes/hg18/bed/vega35
cd /hive/data/genomes/hg18/bed/vega35
wget --timestamping "ftp://ftp.sanger.ac.uk/pub/vega/human/*" \
"ftp://ftp.sanger.ac.uk/pub/vega/human/pep/*.tot.fa.gz"
zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \
| grep "^chr" > nonHaps.gtf
zcat gtf_file.gz | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/" \
| grep -v "^chr" > haps.gtf
awk 'BEGIN{OFS="\t";FS="\t";}{ if ($1 == "c6_COX") { if (($4 >= 28688544) && ($5 <= 33420241)) print; } else if ($1 == "c6_QBL") { if (($4 >= 28885510) && ($5 <= 33451440)) print;}}' haps.gtf > keptHaps.gtf
liftUp -type=.gtf lifted.gtf /cluster/data/hg18/jkStuff/ensGene.haplotype.lift carry keptHaps.gtf
cat nonHaps.gtf lifted.gtf > all.gtf
# Do this to create the infoOut.txt file and extract the extra information
gtfToGenePred -infoOut=infoOut.txt -genePredExt all.gtf stdout | gzip > tempAll.gp.gz
~/kent/src/hg/utils/automation/extractGtf.pl infoOut.txt > vegaGtp.tab
# Change the gene name to have the gene_id label so that this is in the
# name2 field of the extended genePred table. This can then be displayed
# at the track item label.
perl -pi.bak -e 's/gene_id/other_gene_id/' all.gtf
perl -pi.bak -e 's/gene_name/gene_id/' all.gtf
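# For example, an attribute list like this (IDs and symbol are hypothetical):
#   gene_id "OTTHUMG00000001094"; transcript_id "OTTHUMT00000003162"; gene_name "ACTB";
# becomes
#   other_gene_id "OTTHUMG00000001094"; transcript_id "OTTHUMT00000003162"; gene_id "ACTB";
# so gtfToGenePred -genePredExt puts the gene symbol into the genePred name2
# column, which the track then shows as the item label.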
gzip all.gtf
rm *.gtf tempAll.gp.gz
# create genePred files for loading into database
gtfToGenePred -genePredExt all.gtf.gz stdout | gzip > all.gp.gz
genePredCheck -db=hg18 all.gp.gz
# checked: 81244 failed: 0
zcat all.gtf.gz | grep -i pseudo > pseudo.gtf
zcat all.gtf.gz | grep -v -i pseudo > not.pseudo.gtf
gtfToGenePred -genePredExt pseudo.gtf pseudo.gp
gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp
genePredCheck -db=hg18 pseudo.gp
# checked: 8331 failed: 0
genePredCheck -db=hg18 not.pseudo.gp
# checked: 72913 failed: 0
hgLoadGenePred -genePredExt hg18 vegaGene not.pseudo.gp
hgLoadGenePred -genePredExt hg18 vegaPseudoGene pseudo.gp
# Added code to src/hg/hgTracks/simpleTracks.c to register a track handler
# for vegaGeneComposite, which is now used for this data. The handler uses
# vegaGeneMethods to display the name2 field (gene name) as the item label
# in the track.
+ # 2009-08-16 (hartera)
+ # ensGtp table definition is in ~/kent/src/hg/lib/ensGtp.sql
+ # There is an index on the protein field so it can not be NULL.
+ # If there is no protein, the gene name is given.
+ # Added code to hgTracks.c and hgTrackUi.c to allow the use of
+ # radio buttons on the track configuration page to select the
+ # gene name, accession or both to be displayed in the track.
+ # The gene name is displayed by default.
+ # Added code to hgc.c so that Ensembl and Vega can share code to
+ # create links on the details pages to the Vega reports for transcript,
+ # gene and protein through these IDs. Created new function
+ # printEnsemblOrVegaCustomUrl().
+
+ # 2009-08-22 (hartera)
+ # Create a vegaGtp table using the vegaGtp.tab file above. Use ensGtp.sql
+ # to create the table. vegaGtp associates geneId/transcriptId/proteinId
+ # for the links to Vega reports from the details page. If there is no
+ # protein ID because the transcript is noncoding, the gene name is used
+ # instead. This field can not be NULL in the table as there is an index
+ # on it.
+ cd /hive/data/genomes/hg18/bed/vega35
+ cp ~/kent/src/hg/lib/ensGtp.sql .
+ # One of the gene names (used in place of a protein ID for a noncoding
+ # gene) is too long for the protein field, so change the protein field in
+ # ensGtp.sql to allow 40 characters instead of 20 and re-load the table.
+ hgsql -e 'drop table vegaGtp;' hg18
+ hgLoadSqlTab hg18 vegaGtp ensGtp.sql vegaGtp.tab
+ # Loaded successfully
+ # Added code to hgc.c to use printEnsemblOrVegaCustomUrl() in
+ # doVegaGene() to add the links to Vega reports on the details pages.
+
############################################################################