src/hg/makeDb/doc/hg16.txt 1.10
1.10 2009/04/27 20:11:36 hiram
liftOver to hg19 done
Index: src/hg/makeDb/doc/hg16.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg16.txt,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 1000000 -r1.9 -r1.10
--- src/hg/makeDb/doc/hg16.txt 10 Nov 2008 20:28:17 -0000 1.9
+++ src/hg/makeDb/doc/hg16.txt 27 Apr 2009 20:11:36 -0000 1.10
@@ -1,11932 +1,11946 @@
# for emacs: -*- mode: sh; -*-
# This file describes how we made the browser database on
# NCBI build 34 (July 18, 2003 freeze)
# HOW TO BUILD AN ASSEMBLY FROM NCBI FILES
# ---------------------------------------
# Make gs.17 directory, gs.17/build34 directory, and gs.17/ffa directory.
mkdir /cluster/store4/gs.17
mkdir /cluster/store4/gs.17/build34
mkdir /cluster/store4/gs.17/agp
mkdir /cluster/store4/gs.17/ffa
# Make a symbolic link from /cluster/store1 to this location
cd /cluster/store1
ln -s /cluster/store4/gs.17 ./gs.17
# Make a symbolic link from your home directory to the build dir:
ln -s /cluster/store4/gs.17/build34 ~/oo
# NCBI download site:
ftp ftp.ncbi.nih.gov
# user and password from /cse/guests/kent/buildHg6.doc
cd build_34
# Download all finished agp's and fa's into gs.17/agp
mget chr*.agp
mget chr*.fa.gz
gunzip *.gz
# Download contig agp's into gs.17/build34
get ref_placed.agp # used to be in reference.agp
get ref_unplaced.agp # used to be in reference.agp
get DR51.agp
get PAR.agp # new for this build - PAR regions added to chrY
cat ref_placed.agp ref_unplaced.agp DR51.agp > ncbi_build34.agp
# Download contig fa's into gs.17/ffa
get ref_placed.fa.gz # used to be in reference.fa
get ref_unplaced.fa.gz # used to be in reference.fa
get DR51.fa.gz
get PAR.fa.gz # new for this build - PAR regions added to chrY
get sequence.inf
cat ref_placed.fa ref_unplaced.fa DR51.fa > ncbi_build34.fa
# Download assembly related files into gs.17/build34
get seq_contig.md
get contig_overlaps.agp
# Download questionable join certificates file
get e-certificates.txt
mkdir certificates
mv e-certificates.txt certificates
# Save a copy of the original seq_contig.md file
cp seq_contig.md seq_contig.md.orig
# For build34, edit the seq_contig.md file to remove the alternative chr7
# sequence supplied by the Toronto group: NT_079590, NT_079591, NT_079592,
# NT_079593, NT_079594, NT_079595, NT_079596, NT_079597
# Edit seq_contig.md to make the DR51 alternative haplotype look like a
# chr6_random sequence:
# 9606 6 32491690 32629063 + NG_002432 GI:28212469 CONTIG DR51 1
# to
# 9606 6|NG_002432 1 137374 + NG_002432 GI:28212469 CONTIG DR51 1
# Move this edited DR51 line next to other chr6_random contigs (for creating
# the lift file)
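# The chr7 removal and the DR51 rewrite can be scripted; a sketch only
# (contig IDs and the DR51 field values taken from the comments above,
# tab-separated field layout assumed; moving the DR51 line next to the
# other chr6_random contigs was still done by hand in an editor):
grep -v 'NT_07959[0-7]' seq_contig.md > seq_contig.md.tmp
awk 'BEGIN{FS=OFS="\t"} $6=="NG_002432"{$2="6|NG_002432";$3=1;$4=137374} {print}' \
    seq_contig.md.tmp > seq_contig.md
rm seq_contig.md.tmp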
# Sanity check
/cluster/bin/i386/checkYbr build34/ncbi_build34.agp ffa/ncbi_build34.fa \
build34/seq_contig.md
# Convert fa files into UCSC style fa files and place in "contigs" directory
# inside the gs.17/build34 directory
cd build34
mkdir contigs
/cluster/bin/i386/faNcbiToUcsc -split -ntLast ../ffa/ncbi_build34.fa \
contigs
# Copy over chrM contig from previous version
cd ~/oo
cp -r /cluster/store1/gs.16/build33/M .
# Determine the chromosome sizes from agps
/cluster/bin/scripts/getChromSizes ../agp
# Create lift files (this will create chromosome directory structure) and
# inserts file
/cluster/bin/scripts/createNcbiLifts -s chrom_sizes seq_contig.md .
# Create contig agp files (will create contig directory structure)
/cluster/bin/scripts/createNcbiCtgAgp seq_contig.md ncbi_build34.agp .
# Create chromosome random agp files.
/cluster/bin/scripts/createNcbiChrAgp -randomonly .
# Copy the original chrN.agp files from the gs.17/agp directory
# into each of the chromosome directories since they contain better
# gap information. Delete the comments at top from these.
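# A sketch of one way to do that copy (run from the build34 directory;
# AGP comment lines start with '#'):
foreach c (?{,?})
    grep -v '^#' ../agp/chr$c.agp > $c/chr$c.agp
end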
# Distribute contig .fa to appropriate directory (assumes all files
# are in "contigs" directory).
# create global data link for everyone. No more home directory
# links required.
ln -s /cluster/store4/gs.17/build34 /cluster/data/hg16
cd /cluster/data/hg16
/cluster/bin/scripts/distNcbiCtgFa contigs .
rm -r contigs
# Copy over jkStuff from previous build (??)
mkdir jkStuff
cp /cluster/store1/gs.16/build33/jkStuff/*.sh jkStuff
cp /cluster/store1/gs.16/build33/jkStuff/*.csh jkStuff
cp /cluster/store1/gs.16/build33/jkStuff/*.gsub jkStuff
# Create contig gl files
/cluster/bin/i386/agpToGl contig_overlaps.agp . -md=seq_contig.md
# Create chromosome gl files
jkStuff/liftGl.sh contig.gl
# Files ready for repeat-masking and trf
# CREATE STS/FISH/BACENDS/CYTOBANDS DIRECTORY STRUCTURE AND SETUP
# (DONE 2003-07-23 Terry)
# Create directory structure to hold information for these tracks
cd /projects/hg2/booch/psl/
# Change Makefile parameters for OOVERS, GSVERS, PREVGS, PREVOO
make new
# Update all Makefiles with latest OOVERS and GSVERS, DATABASE, and locations of .fa files
# Makefiles in:
# /gs.17/build34/
# /gs.17/build34/bacends
# /gs.17/build34/cytobands
# /gs.17/build34/cytoPlots
# /gs.17/build34/fish
# /gs.17/build34/fosends
# /gs.17/build34/g2g
# /gs.17/build34/geneticPlots
# /gs.17/build34/primers
# /gs.17/build34/recombrate
# /gs.17/build34/sts
# /gs.17/build34/stsPlots
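# For example, the per-build variables at the top of those Makefiles
# end up along these lines (variable names from the comment above;
# exact spelling in the Makefiles assumed):
#   OOVERS = build34
#   GSVERS = gs.17
#   PREVOO = build33
#   PREVGS = gs.16
#   DATABASE = hg16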
# Create accession_info file *****
make accession_info.rdb
# UPDATE STS INFORMATION (DONE 2003-07-23 Terry)
# Download and unpack updated information from dbSTS:
cd /projects/hg2/booch/psl/update
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.sts
wget ftp://ftp.ncbi.nih.gov/repository/dbSTS/dbSTS.aliases
wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.Z
mv sts.Z dbSTS.FASTA.dailydump.Z
gunzip dbSTS.FASTA.dailydump.Z
# Make new directory for this info and move files there
mkdir /cluster/store1/sts.8
cp all.STS.fa /cluster/store1/sts.8
cp all.primers /cluster/store1/sts.8
cp all.primers.fa /cluster/store1/sts.8
# Copy new files to cluster
ssh kkstore
cd /cluster/store1/sts.8
cp /cluster/store1/sts.8/*.* /scratch/hg/STS
# Ask for propagation from sysadmin
# Load the sequences into the database (after database created)
ssh hgwdev
mkdir /gbdb/hg16/sts.8
cd /gbdb/hg16/sts.8
ln -s /cluster/store1/sts.8/all.STS.fa ./all.STS.fa
ln -s /cluster/store1/sts.8/all.primers.fa ./all.primers.fa
cd /cluster/store2/tmp
hgLoadRna addSeq hg16 /gbdb/hg16/sts.8/all.STS.fa
hgLoadRna addSeq hg16 /gbdb/hg16/sts.8/all.primers.fa
# CREATE STS MARKER ALIGNMENTS (DONE 2003-08-03 Terry)
# Create full sequence alignments
ssh kk
cd /cluster/home/booch/sts
# Update Makefile with latest OOVERS and GSVERS and
# run cluster jobs
make new
make jobList
para create jobList
para push
# wait until alignments done
make stsMarkers.psl
# Copy files to final destination and remove originals
make copy.assembly
make clean
# Create primer alignments
ssh kk
cd /cluster/home/booch/primers
# Update Makefile with latest OOVERS and GSVERS and
# run cluster jobs
make new
make jobList.scratch
para create jobList
para push
# Do an initial quick filter of results (takes a while, still) and create
# the final file - best done on eieio since its disks are local
ssh eieio
make filter
make primers.psl
# Copy files to final destination and remove
make copy.assembly
make clean
# Create ePCR alignments
ssh kk
cd /cluster/home/booch/epcr
# Update Makefile with latest OOVERS and GSVERS
make new
make jobList
para create jobList
para push
make all.epcr
# Copy files to final destination and remove
make copy.assembly
make clean
# CREATE AND LOAD STS MARKERS TRACK (DONE 2003-08-03 Terry)
# Copy in current stsInfo2.bed and stsAlias.bed files
cd /projects/hg2/booch/psl/gs.17/build34
cp ../update/stsInfo2.bed .
cp ../update/stsAlias.bed .
# Create final version of sts sequence placements
ssh kks00
cd /projects/hg2/booch/psl/gs.17/build34/sts
make stsMarkers.final
# Create final version of primers placements
# Make sure PRIMERS variable in Makefile is pointing to current version
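# e.g. something like this, given the sts.8 update above (exact
# variable layout in the Makefile assumed):
#   PRIMERS = /cluster/store1/sts.8/all.primers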
cd /projects/hg2/booch/psl/gs.17/build34/primers
make primers.final
# Create bed file
cd /projects/hg2/booch/psl/gs.17/build34
make stsMap.bed
# Create database tables
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg16 < all_sts_primer.sql
hgsql hg16 < all_sts_seq.sql
hgsql hg16 < stsAlias.sql
hgsql hg16 < stsInfo2.sql
hgsql hg16 < stsMap.sql
# Load the tables
cd /projects/hg2/booch/psl/gs.17/build34/sts/
echo 'load data local infile "stsMarkers.psl.filter.lifted" into table all_sts_seq;' | hgsql hg16
cd /projects/hg2/booch/psl/gs.17/build34/primers/
echo 'load data local infile "primers.psl.filter.lifted" into table all_sts_primer;' | hgsql hg16
cd /projects/hg2/booch/psl/gs.17/build34/
echo 'load data local infile "stsAlias.bed" into table stsAlias;' | hgsql hg16
echo 'load data local infile "stsInfo2.bed" into table stsInfo2;' | hgsql hg16
echo 'load data local infile "stsMap.bed" into table stsMap;' | hgsql hg16
# CREATE AND LOAD RECOMBINATION RATE TRACK (DONE 2003-08-05 Terry)
# (must be done after STS Markers track)
# Create bed file
cd /projects/hg2/booch/psl/gs.17/build34/recombrate
make recombRate.bed
# Create database table
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg16 < recombRate.sql
# Load the table
cd /projects/hg2/booch/psl/gs.17/build34/recombrate/
echo 'load data local infile "recombRate.bed" into table recombRate;' | hgsql hg16
# UPDATE BACEND SEQUENCES (DONE 2003-07-23 Terry)
# **** Sequences were determined to not have changed since bacends.4 *****
# **** No new sequences downloaded - See makeHg15.doc for download instructions *****
# Load the sequences into the database (after database created)
ssh hgwdev
mkdir /gbdb/hg16/bacends.4
cd /gbdb/hg16/bacends.4
ln -s /cluster/store1/bacends.4/BACends.fa ./BACends.fa
cd /cluster/store2/tmp
hgLoadRna addSeq hg16 /gbdb/hg16/bacends.4/BACends.fa
# BACEND SEQUENCE ALIGNMENTS (DONE 2003-08-01 Terry)
# (alignments done without RepeatMasking)
# Create full sequence alignments
ssh kk
cd /cluster/home/booch/bacends
# Update Makefile with latest OOVERS and GSVERS and run cluster jobs
make new
make jobList
para create jobList
para push
# Compile alignments and lift the files (takes a while)
ssh eieio
make bacEnds.psl.lifted
# Copy files to final destination and remove
make copy.assembly
make clean # (may want to wait until sure they're OK)
# BACEND PAIRS TRACK (DONE 2003-08-01 Terry)
# Add /projects/compbiousr/booch/booch/scripts to your path
# Update Makefile with new location of pairs/singles
# files, if necessary (DONE)
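# e.g. in csh:
set path = ( /projects/compbiousr/booch/booch/scripts $path )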
cd /projects/hg2/booch/psl/gs.17/build34/bacends
# Make initial file of alignments
make bacEnds.rdb
# Try to fish out more pairs
make bacEndsMiss.psl
# Re-make bacEnds.rdb with new info
make bacEnds.rdb
# Create bacEndPairs track file
make bacEndPairs.bed
# Create bacEndPairsBad and bacEndPairsLong files
make bacEndPairsBad.bed
# Create psl file to load
make bacEnds.load.psl
# Create database tables
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg16 < all_bacends.sql
hgsql hg16 < bacEndPairs.sql
hgsql hg16 < bacEndPairsBad.sql
hgsql hg16 < bacEndPairsLong.sql
# Load the tables
cd /projects/hg2/booch/psl/gs.17/build34/bacends/
echo 'load data local infile "bacEnds.load.psl" into table all_bacends;' | hgsql hg16
echo 'load data local infile "bacEndPairs.bed" into table bacEndPairs;' | hgsql hg16
echo 'load data local infile "bacEndPairsBad.bed" into table bacEndPairsBad;' | hgsql hg16
echo 'load data local infile "bacEndPairsLong.bed" into table bacEndPairsLong;' | hgsql hg16
# FOSEND SEQUENCE ALIGNMENTS (DONE 2003-08-03 Terry)
# Create full sequence alignments
ssh kk
cd /cluster/home/booch/fosends
# Update Makefile with latest OOVERS and GSVERS and run cluster jobs
make new
make jobList
para create jobList
para push
# Compile alignments and lift the files (takes a while)
ssh eieio
cd /cluster/home/booch/fosends
make fosEnds.psl.lifted
# Copy files to final destination and remove
make copy.assembly
make clean
# FOSEND PAIRS TRACK (DONE 2003-08-01 Terry)
# Update Makefile with location of pairs files, if necessary
ssh kks00
cd /projects/hg2/booch/psl/gs.17/build34/fosends
# Make initial file of alignments
make fosEnds.rdb
# Try to fish out more pairs
make fosEndsMiss.psl
# Re-make fosEnds.rdb with new info
make fosEnds.rdb
# Create fosEndPairs track file
make fosEndPairs.bed
# Create fosEndPairsBad and fosEndPairsLong files
make fosEndPairsBad.bed
# Create psl file to load
make fosEnds.load.psl
# Create database tables
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg16 < all_fosends.sql
hgsql hg16 < fosEndPairs.sql
hgsql hg16 < fosEndPairsBad.sql
hgsql hg16 < fosEndPairsLong.sql
# Load the tables
cd /projects/hg2/booch/psl/gs.17/build34/fosends/
echo 'load data local infile "fosEnds.load.psl" into table all_fosends;' | hgsql hg16
echo 'load data local infile "fosEndPairs.bed" into table fosEndPairs;' | hgsql hg16
echo 'load data local infile "fosEndPairsBad.bed" into table fosEndPairsBad;' | hgsql hg16
echo 'load data local infile "fosEndPairsLong.bed" into table fosEndPairsLong;' | hgsql hg16
# Load the sequences (change fosends.# to match correct location) (done for hg15 early 4/9/2003)
mkdir /gbdb/hg15/fosends.3
cd /gbdb/hg15/fosends.3
ln -s /cluster/store1/fosends.3/fosEnds.fa ./fosEnds.fa
cd /cluster/store2/tmp
hgLoadRna addSeq hg15 /gbdb/hg15/fosends.3/fosEnds.fa
# UPDATE FISH CLONES INFORMATION (DONE 2003-07-23 Terry)
# Download the latest info from NCBI
# point browser at http://www.ncbi.nlm.nih.gov/genome/cyto/cytobac.cgi?CHR=all&VERBOSE=ctg
# change "Show details on sequence-tag" to "yes"
# change "Download or Display" to "Download table for UNIX"
# press Submit - save as /projects/hg2/booch/psl/fish/hbrc/hbrc.20030723.table
# Format file just downloaded.
cd /projects/hg2/booch/psl/fish/
# Edit Makefile to point at file just downloaded (variables HBRC, HBRCFORMAT)
make HBRC
# (Manually added 21 results from FHCRC)
# Copy it to the new freeze location
cp /projects/hg2/booch/psl/fish/all.fish.format /projects/hg2/booch/psl/gs.17/build34/fish/
# Save it as the new "gold" file
cp all.fish.format all.fish.format.gold
# CREATE AND LOAD FISH CLONES TRACK (DONE 2003-08-08 Terry)
# (must be done after Coverage, STS markers track and BAC end pairs track)
# Extract the file with clone positions from database
ssh hgwdev
echo 'select * into outfile "/tmp/booch/clonePos.txt" from clonePos' | hgsql hg16
mv /tmp/booch/clonePos.txt /projects/hg2/booch/psl/gs.17/build34/fish
# Get current clone/accession information
ssh kks00
cd /projects/hg2/booch/psl/gs.17/build34/fish
wget http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# Create initial placement file
cp /projects/hg2/booch/psl/gs.16/build33/fish/extract.pl .
make cyto.markers.bed
# Get sequences for accessions not in genome
# goto http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Nucleotide
# select file "/projects/hg2/booch/psl/gs.17/build34/fish/not.found.acc"
# change output to FASTA format
# download results to "/projects/hg2/booch/psl/gs.17/build34/fish/not.found.fa"
# Place sequences against genome
make blat
# Try to incorporate new placements
make cyto.markers.bed2
# Create bed file
make fishClones.bed
# Create database table
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg16 < fishClones.sql
# Load the table
cd /projects/hg2/booch/psl/gs.17/build34/fish/
echo 'load data local infile "fishClones.bed" into table fishClones;' | hgsql hg16
# CREATE AND LOAD CHROMOSOME BANDS TRACK (DONE 2003-08-08 Terry)
# (must be done after FISH Clones track)
# Create bed file
ssh kks00
cd /projects/hg2/booch/psl/gs.17/build34/cytobands/
make setBands.txt # NOTE: may get errors if inserts file out-of-sync with pctSetBands file
make cytobands.pct.ranges
make predict
# Create database table
ssh hgwdev
cd /projects/hg2/booch/psl/tables
hgsql hg16 < cytoBand.sql
# Load the table
cd /projects/hg2/booch/psl/gs.17/build34/cytobands/
echo 'load data local infile "cytobands.bed" into table cytoBand;' | hgsql hg16
# Make cytoBandIdeo track for ideogram gif on hgTracks page.
# For human cytoBandIdeo is just a replicate of the cytoBand track.
# Make the cytoBand track (above) and then:
echo "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" | hgsql hg16
# CREATING DATABASE (DONE - 2003-07-26 - Hiram)
ssh hgwdev
# Make sure there is at least 5 gig free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
# Filesystem Size Used Avail Use% Mounted on
# /dev/sda1 472G 416G 31G 93% /var/lib/mysql
# Create the database - connect through an existing database (e.g. hg15)
# to issue the statement:
echo 'create database hg16' | hgsql hg15
# make a semi-permanent read-only alias (add this to your .cshrc/.bashrc):
# (I have not seen a use for this in any procedures ? -Hiram)
# alias hg16 mysql -u hguser -phguserstuff -A hg16
# (use 'hgsql hg16' instead)
# Initialize the relational-mrna and external sequence info tables:
hgLoadRna new hg16
# Copy over grp table (for track grouping) from another database:
echo "create table grp (PRIMARY KEY(NAME)) select * from hg15.grp" \
| hgsql hg16
# add ENCODE track. Move Repeats lower in priority
echo 'UPDATE grp SET priority=7 WHERE name="varRep"'| hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encode", "ENCODE Tracks", 8)' | hgsql hg16
# New ENCODE groups
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeGenes", "ENCODE Regions and Genes", 8.1)' | hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeTxLevels", "ENCODE Transcript Levels", 8.2)' | hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChip", "ENCODE Chromatin Immunoprecipitation", 8.3)' | hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeChrom", "ENCODE Chromosome, Chromatin and DNA Structure", 8.4)' | hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeCompGeno", "ENCODE Comparative Genomics", 8.5)' | hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeVariation", "ENCODE Variation", 8.6)' | hgsql hg16
echo 'INSERT INTO grp (name, label, priority) VALUES ("encodeAnalysis", "ENCODE Analysis", 8.9)' | hgsql hg16
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE - 2003-07-26 - Hiram)
ssh hgwdev
# Enter hg16 into hgcentraltest.dbDb so test browser knows about it:
echo 'insert into dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName) \
values("hg16", "July 2003", "/gbdb/hg16/nib", "Human", \
"chr7:26828631-26938371", 1, 10, "Human", "Homo sapiens");' \
| hgsql -h genome-testdb hgcentraltest
# Make trackDb table so browser knows what tracks to expect:
cd ~kent/src/hg/makeDb/trackDb
cvs up -d -P .
# Edit that makefile to add hg16 in all the right places and do
make update
make alpha
cvs commit makefile
# MAKE LIFTALL.LFT, NCBI.LFT (DONE - 2003-07-26 - Hiram)
cd /cluster/data/hg16
mkdir -p jkStuff
cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft
# Create jkStuff/ncbi.lft for lifting stuff built with the NCBI assembly.
# Note: this ncbi.lift will not lift floating contigs to chr_random coords,
# but it will show the strand orientation of the floating contigs
# (grep for '|').
mdToNcbiLift seq_contig.md jkStuff/ncbi.lft
# If a lift file has been edited (e.g. as in 6.2.5 above), edit ncbi.lft
# to match. If no step 6.2.5 then no editing needed
# REPEAT MASKING (DONE - 2003-07-25 - Hiram, REDONE 2003-08-02)
# Split contigs, run RepeatMasker, lift results
# Notes:
# * Using new RepeatMasker in /cluster/bluearc/RepeatMasker030619
# Always check for new RepeatMasker before proceeding
# * Contigs (*/N{T,G}_*/N{T,G}_*.fa) are split into 500kb chunks to make
# RepeatMasker runs manageable on the cluster ==> results need lifting.
# * For the NCBI assembly we repeat mask on the sensitive mode setting
# (RepeatMasker -s)
#- Split contigs into 500kb chunks:
ssh eieio
cd /cluster/data/hg16
foreach chrom ( ?{,?} )
foreach c ( $chrom/N{T,G}_?????? )
set contig = $c:t
echo "splitting ${chrom}/${contig}/${contig}.fa"
faSplit size ${chrom}/${contig}/$contig.fa 500000 \
${chrom}/${contig}/${contig}_ -lift=${chrom}/${contig}/$contig.lft \
-maxN=500000
end
end
#- Make the run directory and job list:
cd /cluster/data/hg16
mkdir -p jkStuff
# According to RepeatMasker help file, no arguments are required to
# specify species because its default is set for primate (human)
# This run script saves the .tbl file to be sent to Arian. He uses
# those for his analysis. Sometimes he needs the .cat and .align files for
# checking problems. Krish needs the .align files, they are large.
cat << '_EOF_' > jkStuff/RMHuman
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/hg16/$2
/bin/cp $2 /tmp/hg16/$2/
cd /tmp/hg16/$2
/cluster/bluearc/RepeatMasker030619/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/hg16/$2/$2.out ./
if (-e /tmp/hg16/$2/$2.align) /bin/cp /tmp/hg16/$2/$2.align ./
if (-e /tmp/hg16/$2/$2.tbl) /bin/cp /tmp/hg16/$2/$2.tbl ./
# if (-e /tmp/hg16/$2/$2.cat) /bin/cp /tmp/hg16/$2/$2.cat ./
/bin/rm -fr /tmp/hg16/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg16/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/hg16
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMHuman
ssh eieio
cd /cluster/data/hg16
mkdir RMRun
rm -f RMRun/RMJobs
touch RMRun/RMJobs
foreach d ( ?{,?} )
foreach c ( $d/N{T,G}_*/N{T,G}_*_*.fa )
set f = $c:t
set cc = $c:h
set contig = $cc:t
echo /cluster/store4/gs.17/build34/jkStuff/RMHuman \
/cluster/store4/gs.17/build34/${d}/${contig} $f \
'{'check out line+ /cluster/store4/gs.17/build34/${d}/${contig}/$f.out'}' \
>> RMRun/RMJobs
end
end
# We have 6015 jobs in RMJobs:
wc RMRun/RMJobs
# 6015 42105 1184896 RMRun/RMJobs
#- Do the run
ssh kk
cd /cluster/data/hg16/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
#- While that is running, you can run TRF (simpleRepeat) on the small
# cluster. See SIMPLE REPEAT section below
# CPU time in finished jobs: 33575296s 559588.26m 9326.47h 388.60d 1.065 y
# IO & Wait Time: 238878s 3981.30m 66.36h 2.76d 0.008 y
# Average job time: 7513s 125.21m 2.09h 0.09d
# Longest job: 18457s 307.62m 5.13h 0.21d
# Submission to last job: 55537s 925.62m 15.43h 0.64d
#- Lift up the split-contig .out's to contig-level .out's
ssh eieio
cd /cluster/data/hg16
foreach d ( ?{,?}/N{T,G}_* )
cd $d
set contig = $d:t
liftUp $contig.fa.out $contig.lft warn ${contig}_?{,?,??}.fa.out
cd ../..
end
#- Lift up the contig-level RepeatMasker .out files to chromosome
# coordinates using jkStuff/liftOut2.sh, picked up from the hg15 build.
# Reset the liftUp command path from ~kent/bin/$MACHTYPE to
# /cluster/bin/i386, took the redirection to /dev/null off of the
# command, and capture the output here to see what errors we have.
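# For reference, liftOut2.sh amounts to a loop like this (a sketch
# only - the actual script was carried over from hg15 and may differ
# in detail):
#   foreach c (?{,?})
#       cd $c
#       if (-e lift/ordered.lft) then
#           /cluster/bin/i386/liftUp chr$c.fa.out lift/ordered.lft warn \
#               `awk '{printf "%s/%s.fa.out ", $1, $1}' lift/ordered.lst`
#       endif
#       if (-e lift/random.lft) then
#           /cluster/bin/i386/liftUp chr${c}_random.fa.out lift/random.lft warn \
#               `awk '{printf "%s/%s.fa.out ", $1, $1}' lift/random.lst`
#       endif
#       cd ..
#   end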
./jkStuff/liftOut2.sh > liftOut2.out 2>&1 &
#- By this point, the database should have been created (above):
ssh hgwdev
cd /cluster/data/hg16
hgLoadOut hg16 ?/*.fa.out ??/*.fa.out
# errors during this load:
# Processing 2/chr2.fa.out
# Strange perc. field -6.1 line 243430 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243430 of 2/chr2.fa.out
# Strange perc. field -6.1 line 243432 of 2/chr2.fa.out
# Strange perc. field -5.6 line 243432 of 2/chr2.fa.out
# Processing 5/chr5.fa.out
# Strange perc. field -0.3 line 4339 of 5/chr5.fa.out
# Processing 19/chr19.fa.out
# Strange perc. field -18.6 line 77032 of 19/chr19.fa.out
# SIMPLE REPEAT [TRF] TRACK (DONE - 2003-07-25 - Hiram)
# Distribute contigs to /iscratch/i
ssh kkr1u00
rm -rf /iscratch/i/gs.17/build34/contigs
mkdir -p /iscratch/i/gs.17/build34/contigs
cd /cluster/data/hg16
cp -p contigs/*.fa /iscratch/i/gs.17/build34/contigs
# Make sure the total size looks like what you'd expect:
du ./contigs /iscratch/i/gs.17/build34/contigs
# 2839768 ./contigs
# 2839768 /iscratch/i/gs.17/build34/contigs
~kent/bin/iSync
# Create cluster parasol job like so:
mkdir -p /cluster/data/hg16/bed/simpleRepeat
cd /cluster/data/hg16/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << this line makes emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /iscratch/i/gs.17/build34/contigs/*.fa > genome.lst
gensub2 genome.lst single gsub spec
para create spec
para try
para check
para push
para check
# Completed: 472 of 472 jobs
# CPU time in finished jobs: 36177s 602.95m 10.05h 0.42d 0.001 y
# IO & Wait Time: 2038s 33.97m 0.57h 0.02d 0.000 y
# Average job time: 81s 1.35m 0.02h 0.00d
# Longest job: 6992s 116.53m 1.94h 0.08d
# Submission to last job: 10703s 178.38m 2.97h 0.12d
# When the cluster run is done, handle a couple of extra files not
# caught in the above sequence
./runTrf /cluster/store4/gs.17/build34/M/NT_999999/NT_999999.fa trf/NT_999999.bed
# That produces an empty .bed file, mark it so:
echo "# trf run produces nothing for this one" >> trf/NT_999999.bed
liftUp simpleRepeat.bed /cluster/data/hg16/jkStuff/liftAll.lft \
warn trf/*.bed > lu.out 2>&1
# Load into the database:
ssh hgwdev
cd /cluster/data/hg16/bed/simpleRepeat
/cluster/bin/i386/hgLoadBed hg16 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# stringTab = 0
# Reading simpleRepeat.bed
# Loaded 627883 elements
# Sorted
# Saving bed.tab
# Loading hg16
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2003-07-27 - Hiram - REDONE 07-30)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh eieio
cd /cluster/data/hg16/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/hg16
mkdir -p bed/simpleRepeat/trfMaskChrom
foreach c (?{,?})
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# MASK SEQUENCE BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE - 2003-07-27)
# -Hiram
# This used to be done right after RepeatMasking. Now, we mask with
# TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above.
ssh eieio
cd /cluster/data/hg16
# Make chr*.fa from contig .fa
# Copied chrFa.sh from hg15/jkStuff - reset path from ~kent to
# /cluster for the ctgToChromFa command
tcsh ./jkStuff/chrFa.sh > chrFa.out 2>&1 &
# copied these three scripts from hg15 - fixup path names to
# reference /cluster/bin instead of ~kent/bin
#- Soft-mask (lower-case) the contig and chr .fa's
tcsh ./jkStuff/makeFaMasked.sh > maFaMasked.out 2>&1
#- Make hard-masked .fa.masked files as well:
tcsh ./jkStuff/makeHardMasked.sh > maHardMasked.out 2>&1
#- Rebuild the nib, mixedNib, maskedNib files:
tcsh ./jkStuff/makeNib.sh > maNib.out 2>&1
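# For reference, those three scripts are essentially wrappers around
# maskOutFa (a sketch; the hg15 scripts may differ in detail):
#   soft-mask:  maskOutFa chrN.fa chrN.fa.out chrN.fa -soft
#               maskOutFa chrN.fa trfMaskChrom/chrN.bed chrN.fa -softAdd
#   hard-mask:  maskOutFa chrN.fa hard chrN.fa.masked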
# Make symbolic links from /gbdb/hg16/nib to the real nibs.
ssh hgwdev
mkdir -p /gbdb/hg16/nib
foreach f (/cluster/store4/gs.17/build34/nib/chr*.nib)
ln -s $f /gbdb/hg16/nib
end
# Load /gbdb/hg16/nib paths into database and save size info.
hgsql hg16 < ~/kent/src/hg/lib/chromInfo.sql
cd /cluster/data/hg16
hgNibSeq -preMadeNib hg16 /gbdb/hg16/nib ?{,?}/chr?{,?}{,_random}.fa
echo "select chrom,size from chromInfo" | hgsql -N hg16 > chrom.sizes
# Copy the masked contig fa to /iscratch and /scratch:
# And everything else we will need for blastz runs, etc ...
ssh kkr1u00
rm -rf /iscratch/i/gs.17/build34/trfFa
mkdir -p /iscratch/i/gs.17/build34/trfFa
cp -p /cluster/data/hg16/?{,?}/N{T,G}_*/N{T,G}_??????.fa /iscratch/i/gs.17/build34/trfFa
rm -rf /iscratch/i/gs.17/build34/bothMaskedNibs
mkdir -p /iscratch/i/gs.17/build34/bothMaskedNibs
cp -p /cluster/data/hg16/nib/*.nib /iscratch/i/gs.17/build34/bothMaskedNibs
rm -rf /iscratch/i/gs.17/build34/rmsk
mkdir -p /iscratch/i/gs.17/build34/rmsk
cp -p /cluster/data/hg16/?{,?}/*.out /iscratch/i/gs.17/build34/rmsk
~kent/bin/iSync
# ssh kkstore
# Since kkstore's /scratch is currently /cluster/bluearc/scratch, it is
# better to do this on eieio and copy to /scratch from there.
rm -rf /scratch/hg/gs.17/build34/trfFa
mkdir -p /scratch/hg/gs.17/build34/trfFa
cp -p /cluster/data/hg16/?{,?}/N{T,G}_*/N{T,G}_??????.fa /scratch/hg/gs.17/build34/trfFa
rm -rf /scratch/hg/gs.17/build34/bothMaskedNibs
mkdir /scratch/hg/gs.17/build34/bothMaskedNibs
cp -p /cluster/data/hg16/nib/*.nib /scratch/hg/gs.17/build34/bothMaskedNibs
rm -rf /scratch/hg/gs.17/build34/rmsk
mkdir -p /scratch/hg/gs.17/build34/rmsk
cp -p /cluster/data/hg16/?{,?}/*.out /scratch/hg/gs.17/build34/rmsk
# request rsync of kkstore /scratch
# O+O: ASSEMBLY [GOLD], GAP, COVERAGE, MAP CONTIGS TRACKS (DONE - 2003-07-27)
# Store o+o info in database.
ssh eieio
cd /cluster/store4/gs.17/build34
if (-f contig_overlaps.agp) then
jkStuff/liftGl.sh contig.gl
else
ssh hgwdev
hgGoldGapGl -noGl hg16 /cluster/store4/gs.17 build34
echo ""
echo "*** Note from makeHg15.doc:"
echo "Come back to this step later when we have contig_overlaps.agp\!"
endif
ssh hgwdev
cd /cluster/store4/gs.17/build34
if (-f contig_overlaps.agp) then
hgGoldGapGl hg16 /cluster/store4/gs.17 build34
cd /cluster/store4/gs.17
/cluster/bin/i386/hgClonePos hg16 build34 ffa/sequence.inf /cluster/store4/gs.17 -maxErr=3
endif
cd /cluster/store4/gs.17
# (2/27/04 angie) re-loaded -- chr{1,4,8,15}_random lift files changed
# 7/30/04.
hgCtgPos hg16 build34
# CREATE NON-STANDARD JOIN CERTIFICATES WEB PAGE AND TABLE
# Filter certificates file to only contain those relevant to current assembly
cd ~/hg16/certificates
/cluster/bin/scripts/extractCertificates.pl e-certificates.txt ~/hg16 \
> e-certificates.filter.txt
# Create initial web page and table for loading into database
hgCert e-certificates.filter.txt > certificates.html
# Donna's edits to html page
# (3/2/04 angie: edit cert.tab to remove some extra tab characters in comments
# so mySql doesn't truncate them, & reload)
# Load cert table into database
ssh hgwdev
cd ~/hg16/certificates
echo "drop table certificate" | hgsql hg16
hgsql hg16 < ~/kent/src/hg/lib/certificate.sql
echo 'load data local infile "cert.tab" into table certificate;' \
| hgsql hg16
# AUTO UPDATE GENBANK MRNA RUN (WORKING - 2003-07-30 - Hiram)
ssh eieio
cd /cluster/store5/genbank
# This is a new organism, edit the etc/genbank.conf file and add:
# hg16
hg16.genome = /scratch/hg/gs.17/build34/bothMaskedNibs/chr*.nib
hg16.lift = /cluster/store4/gs.17/build34/jkStuff/liftAll.lft
hg16.genbank.est.xeno.load = yes
hg16.mgcTables.default = full
hg16.mgcTables.mgc = all
hg16.downloadDir = hg16
ssh eieio
cd /cluster/store5/genbank
nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \
-srcDb=genbank -type=mrna -verbose=1 -initial hg16
# Completed: 49591 of 49591 jobs
# CPU time in finished jobs: 3853288s 64221.47m 1070.36h 44.60d 0.122 y
# IO & Wait Time: 246323s 4105.38m 68.42h 2.85d 0.008 y
# Average job time: 83s 1.38m 0.02h 0.00d
# Longest job: 21265s 354.42m 5.91h 0.25d
# Submission to last job: 22930s 382.17m 6.37h 0.27d
# Load the results from the above
ssh hgwdev
cd /cluster/store5/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad hg16
# To get this next one started, the above results need to be
# moved out of the way. These things can be removed if there are
# no problems to debug
ssh eieio
cd /cluster/bluearc/genbank/work
mv initial.hg16 initial.hg16.genbank.mrna
ssh eieio
cd /cluster/store5/genbank
nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \
-srcDb=refseq -type=mrna -verbose=1 -initial hg16
# Completed: 68740 of 68740 jobs
# CPU time in finished jobs: 1253290s 20888.16m 348.14h 14.51d 0.040 y
# IO & Wait Time: 309126s 5152.10m 85.87h 3.58d 0.010 y
# Average job time: 23s 0.38m 0.01h 0.00d
# Longest job: 13290s 221.50m 3.69h 0.15d
# Submission to last job: 13609s 226.82m 3.78h 0.16d
# The iservers came back on-line, so use them for this run.
# The batch file can be found in:
# /cluster/store5/genbank/work/initial.hg16/align
ssh hgwdev
cd /cluster/store5/genbank
nice bin/gbDbLoadStep -verbose=1 hg16
nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial hg16
# GC PERCENT (DONE 2003-07-31 - Hiram)
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/gcPercent
cd /cluster/data/hg16/bed/gcPercent
hgsql hg16 < ~/kent/src/hg/lib/gcPercent.sql
hgGcPercent hg16 ../../nib
# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE - 2003-07-31 - Hiram)
ssh hgwdev
# Substitute BBB with the correct number for the hostname:
echo 'insert into blatServers values("hg16", "blat6", "17778", "1"); \
insert into blatServers values("hg16", "blat6", "17779", "0");' \
| hgsql -h genome-testdb hgcentraltest
# PRODUCING GENSCAN PREDICTIONS (DONE - 2003-08-01 - Hiram)
ssh eieio
mkdir -p /cluster/data/hg16/bed/genscan
cd /cluster/data/hg16/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir -p gtf pep subopt
# Generate a list file, genome.list, of all the contigs
# *that are not pure Ns* (due to heterochromatin and other unsequenceable
# stuff) which would cause genscan to run forever.
rm -f genome.list
touch genome.list
foreach f ( `ls -1S /cluster/store4/gs.17/build34/?{,?}/N{T,G}_*/N{T,G}_??????.fa.masked` )
egrep '[ACGT]' $f > /dev/null
if ($status == 0) echo $f >> genome.list
end
# Log into kkr1u00 (not kk!). kkr1u00 is the driver node for the small
# cluster (kkr2u00-kkr8u00). Genscan has problems running on the
# big cluster, due to the limited memory and swap space on each
# processing node.
ssh kkr1u00
cd /cluster/data/hg16/bed/genscan
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/home/hiram/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para create jobList
para try
para check
para push
# Completed: 491 of 491 jobs (this was with only 6 CPUs available)
# CPU time in finished jobs: 216220s 3603.67m 60.06h 2.50d 0.007 y
# IO & Wait Time: 85597s 1426.62m 23.78h 0.99d 0.003 y
# Average job time: 615s 10.24m 0.17h 0.01d
# Longest job: 10986s 183.10m 3.05h 0.13d
# Submission to last job: 54395s 906.58m 15.11h 0.63d
# Issue either one of the following two commands to check the
# status of the cluster and your jobs, until they are done.
parasol status
para check
# If there were out-of-memory problems (run "para problems"), then
# re-run those jobs by hand but change the -window arg from 2400000
# to 1200000. In build33, this was 22/NT_011519.
# In build34 there were NO failures !
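# For the record, such a by-hand re-run would look like this (a
# sketch, using build33's failed contig as the example):
# /cluster/home/hiram/bin/i386/gsBig \
#   /cluster/store4/gs.17/build34/22/NT_011519/NT_011519.fa.masked \
#   gtf/NT_011519.gtf -trans=pep/NT_011519.pep \
#   -subopt=subopt/NT_011519.bed \
#   -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan \
#   -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat \
#   -tmp=/tmp -window=1200000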
# Convert these to chromosome level files as so:
ssh eieio
cd /cluster/data/hg16/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/N{T,G}*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/N{T,G}*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/hg16/bed/genscan
ldHgGene hg16 genscan genscan.gtf
# Reading genscan.gtf
# Read 42974 transcripts in 326300 lines in 1 files
# 42974 groups 41 seqs 1 sources 1 feature types
# 42974 gene predictions
hgPepPred hg16 generic genscanPep genscan.pep
# Processing genscan.pep
hgLoadBed hg16 genscanSubopt genscanSubopt.bed
# stringTab = 0
# Reading genscanSubopt.bed
# Loaded 518038 elements
# Sorted
# Creating table definition for
# Saving bed.tab
# Loading hg16
# CPGISLANDS (DONE - 2003-08-01 - Hiram)
ssh eieio
mkdir -p /cluster/data/hg16/bed/cpgIsland
cd /cluster/data/hg16/bed/cpgIsland
# Copy program as built for previous hg build:
mkdir cpg_dist
cp -p ~/hg15/bed/cpgIsland/cpg_dist/cpglh.exe ./cpg_dist
# This step used to read, but I do not immediately see the .tar
# file anywhere: (there is a copy in ~/rn3/bed/cpgIsland)
# Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
# copy the tar file to the current directory
# tar xvf cpg_dist.tar
# cd cpg_dist
# gcc readseq.c cpg_lh.c -o cpglh.exe
# cd ..
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked)
set fout=$f:t:r:r.cpg
echo producing $fout...
./cpg_dist/cpglh.exe $f > $fout
end
cat << '_EOF_' > filter.awk
# Each input line looks like (tab separated):
#   chr1  1325  3865  754  CpG: 183  64.9  0.7
# Transforms to: (tab separated columns above, spaces below)
#   chr1 1325 3865 CpG: 183 754 183 489 64.9 0.7
{
width = $3-$2;
printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\n",
$1,$2,$3,$5,$6,width,$6,width*$7*0.01,100.0*2*$6/($3-$2),$7);}
'_EOF_'
# << this line makes emacs coloring happy
awk -f filter.awk chr*.cpg > cpgIsland.bed
ssh hgwdev
cd /cluster/data/hg16/bed/cpgIsland
hgLoadBed hg16 cpgIsland -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed
# stringTab = 1
# Reading cpgIsland.bed
# Loaded 27596 elements
# Sorted
# Saving bed.tab
# Loading hg16
# VERIFY REPEATMASKER RESULTS (DONE - 2003-08-01 - Hiram)
# Run featureBits on hg16 and on a comparable genome build, and compare:
ssh hgwdev
featureBits hg16 rmsk
# --> 1388770568 bases of 2865697954 (48.462%) in intersection
# --> 1388044886 bases of 2865697954 (48.437%) in intersection
# --> 1388157103 bases of 2863665240 (48.475%) in intersection
featureBits hg15 rmsk
# --> 1386879340 bases of 2866466359 (48.383%) in intersection
featureBits hg13 rmsk
# --> 1383216615 bases of 2860907679 (48.349%) in intersection
# PREPARE CLUSTER FOR BLASTZ RUN (DONE - 2003-08-05 - Hiram)
ssh eieio
# This is where kkstore /scratch is kept:
cd /cluster/bluearc/scratch/hg/gs.17/build34/rmsk
# The following will mark each line for rat and mouse
# Rat, first, will be column 1; Mouse, second, will be column 2
foreach outfl ( *.out )
echo "$outfl"
/cluster/bluearc/RepeatMasker030619/DateRepsinRMoutput.pl \
${outfl} -query human -comp rat -comp mouse
end
# Now extract each one, 1 = Rat, 2 = Mouse
cd /cluster/bluearc/scratch/hg/gs.17/build34
mkdir linSpecRep.notInRat
mkdir linSpecRep.notInMouse
foreach f (rmsk/*.out_rat_mus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInRat/$base.out.spec
/cluster/bin/scripts/extractLinSpecReps 2 $f > \
linSpecRep.notInMouse/$base.out.spec
end
# That produced no difference at all between those two targets.
# Have requested confirmation from Arian
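# A quick way to see that for yourself (a sketch, run from the
# build34 scratch directory above):
foreach f (linSpecRep.notInRat/*.spec)
    cmp -s $f linSpecRep.notInMouse/$f:t || echo $f:t differs
end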
# BLASTZ MOUSE (DONE - 2003-08-07 - Hiram)
ssh eieio
cd /cluster/bluearc/mm3.RM030619
foreach f (rmsk.spec/*.out_rat_hum)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 2 $f > \
linSpecRep.notInHuman/$base.out.spec
end
ssh eieio
mkdir -p /cluster/data/hg16/bed/blastz.mm3
cd /cluster/data/hg16/bed/blastz.mm3
cat << '_EOF_' > DEF
# mouse vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Mouse
SEQ2_DIR=/iscratch/i/mm3.RM030619/mixedNib/
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman/
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/store4/gs.17/build34/bed/blastz.mm3
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place
DS=`date -I`
cp DEF ~angie/hummus/DEF.mm3-hg16.$DS
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3
# source the DEF file to establish environment for following commands
bash
. ./DEF
# follow the next set of directions slavishly
mkdir -p $BASE/run
# give up on avoiding angie's directories
# tcl script
# creates xdir.sh and joblist run/j
~angie/hummus/make-joblist $DEF > $BASE/run/j
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
sh $BASE/xdir.sh
cd $BASE/run
# now edit j to prefix path to executable name
# NOTE: we should have a controlled version of schwartz bin executables
sed -e 's#^#/cluster/bin/penn/#' j > j2
wc -l j*
head j2
# make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
# 39663 jobs
para try
para check
para push
# ... etc ...
# With competition on the cluster:
# Completed: 39663 of 39663 jobs
# CPU time in finished jobs: 14365996s 239433.27m 3990.55h 166.27d 0.456 y
# IO & Wait Time: 681029s 11350.48m 189.17h 7.88d 0.022 y
# Average job time: 379s 6.32m 0.11h 0.00d
# Longest job: 9275s 154.58m 2.58h 0.11d
# Submission to last job: 53023s 883.72m 14.73h 0.61d
# post-process blastz
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3
# source the DEF file again in case you are coming back to this
# (must be bash shell)
. ./DEF
# a new run directory
mkdir -p run.1
mkdir -p $BASE/lav
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
> run.1/jobList
cd run.1
# make sure the job list is OK
wc -l jobList
# 312 jobs
head jobList
# run on cluster
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3/run.1
para create jobList
para try
para check
para push
# etc.
# Completed: 339 of 339 jobs
# CPU time in finished jobs: 11666s 194.44m 3.24h 0.14d 0.000 y
# IO & Wait Time: 69155s 1152.58m 19.21h 0.80d 0.002 y
# Average job time: 238s 3.97m 0.07h 0.00d
# Longest job: 1332s 22.20m 0.37h 0.02d
# Submission to last job: 1497s 24.95m 0.42h 0.02d
# convert lav files to axt
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
# create template file for gensub2
# usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastz.mm3/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastz.mm3/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/mm3.RM030619/mixedNib/
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /cluster/store4/gs.17/build34/bed/blastz.mm3/lav > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
# 42 jobs
head jobList
cd /cluster/data/hg16/bed/blastz.mm3/run.2
para create jobList
para try
para check
para push
# The two crashed jobs are chr19 and chr19_random.
# The chr19_random .fa file is almost all masked sequence,
# so its resulting .axt file is empty. chr19 itself is too big.
#Completed: 40 of 42 jobs
#Crashed: 2 jobs
#CPU time in finished jobs: 1908s 31.80m 0.53h 0.02d 0.000 y
#IO & Wait Time: 22178s 369.64m 6.16h 0.26d 0.001 y
#Average job time: 602s 10.04m 0.17h 0.01d
#Longest job: 1723s 28.72m 0.48h 0.02d
#Submission to last job: 1802s 30.03m 0.50h 0.02d
# To fix up the chr19 axtSort problem:
# sometimes alignments are so huge that they cause axtSort to run out
# of memory. Run them in two passes like this:
ssh kkr1u00
cd /cluster/data/hg16/bed/blastz.mm3
set base=/cluster/data/hg16/bed/blastz.mm3
set seq1_dir=/iscratch/i/gs.17/build34/bothMaskedNibs
set seq2_dir=/iscratch/i/mm3.RM030619/mixedNib/
foreach c (lav/chr19)
pushd $c
set chr=$c:t
set out=axtChrom/$chr.axt
echo "Translating $chr lav to $out"
foreach d (*.lav)
set smallout=$d.axt
lavToAxt $d $seq1_dir $seq2_dir stdout \
| axtDropSelf stdin stdout \
| axtSort stdin $smallout
end
cat `ls -1 *.lav.axt | sort -g` > $base/$out
popd
end
# Remove the empty axtChrom/chr19_random.axt file to avoid future
# processing errors
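# i.e.:
rm -f axtChrom/chr19_random.axt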
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3
mkdir -p pslChrom
set tbl = "blastzMm3"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 20 minutes
# chr19 came along later
ssh kkr1u00
set tbl = "blastzMm3"
foreach f (axtChrom/chr19.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
set tbl = "blastzMm3"
cd /cluster/data/hg16/bed/blastz.mm3/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
# This takes 30 minutes to an hour
# and later chr19
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr19_${tbl}.psl
# create trackDb/human/hg16 and get a trackDb.ra file started with:
# track blastzMm3
# shortLabel Mouse Blastz
# longLabel Blastz All Mouse (Feb. 03) Alignments
# group compGeno
# priority 130
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno mm3
# otherDb mm3
# remake trackDb tables
# redo chr1 (featureBits shows 7% lower alignments than hg15)
# (DONE 2003-09-09 kate)
# blastz run ended prematurely -- .tmp files leftover, not moved to .out's
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3
bash
. ./DEF
cd $BASE
mkdir run.chr1
# create job list for human chr1, with parasol output file validation
~angie/hummus/make-joblist $DEF | \
/cluster/bin/scripts/blastz-clusterjob.pl $BASE | \
grep 'run chr1.nib' | \
sed -e 's#^#/cluster/bin/penn/#' \
> $BASE/run.chr1/spec
grep 'chr1/' $BASE/xdir.sh > $BASE/xdir.chr1.sh
mv raw/chr1 raw/chr1.old
mkdir raw/chr1
sh xdir.chr1.sh
cd run.chr1
para create spec
# 2925 jobs
para try
para check
para push
# ... etc ...
ssh eieio
bash
cd /cluster/data/hg16/bed/blastz.mm3
. DEF
mv lav/chr1 lav/chr1.old
mkdir run.chr1.lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
| grep 'lav chr1 ' > run.chr1.lav/jobList
cd run.chr1.lav
wc -l jobList
# 25 jobs
head jobList
# run on cluster
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3/run.chr1.lav
para create jobList
para try
para check
para push
# etc.
# convert lav files to chrom axt
/cluster/bin/scripts/blastz-chromlav2axt /cluster/data/hg16/bed/blastz.mm3/lav/chr1 /cluster/data/hg16/bed/blastz.mm3/axtChrom/chr1.axt /cluster/data/hg16/nib /cluster/data/mm3.RM030619/mixedNib
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3
mv pslChrom/chr1_blastzMm3.psl pslChrom/chr1_blastzMm3.psl.old
/cluster/bin/i386/axtToPsl axtChrom/chr1.axt S1.len S2.len \
pslChrom/chr1_blastzMm3.psl
# reload database table
ssh hgwdev
hgsql hg16 -e "drop table chr1_blastzMm3"
cd /cluster/data/hg16/bed/blastz.mm3/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr1_blastzMm3.psl
# make chain
cd /cluster/data/hg16/bed/blastz.mm3/axtChain/run1
mv chain/chr1.chain chain/chr1.chain.old
mv out/chr1.out out/chr1.out.old
axtFilter -notQ=chrUn_random /cluster/data/hg16/bed/blastz.mm3/axtChrom/chr1.axt | axtChain stdin \
/cluster/data/hg16/nib \
/cluster/data/mm3/mixedNib chain/chr1.chain > out/chr1.out
# sort chains
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
mv all.chain all.chain.old
chainMergeSort run1/chain/*.chain > all.chain
mv chain chain.old
mkdir chain
chainSplit chain all.chain
# reload chr1 chain into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain/chain
hgLoadChain hg16 chr1_chainMm3 chr1.chain
# Loading 510456 chains into hg16.chr1_chainMm3
# make net
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
cd chain
/cluster/bin/i386/chainPreNet chr1.chain /cluster/data/hg16/chrom.sizes \
/cluster/data/mm3/chrom.sizes ../preNet/chr1.chain
cd ..
cd preNet
mv ../n1/chr1.net ../n1/chr1.net.old
/cluster/bin/i386/chainNet chr1.chain -minSpace=1 \
/cluster/data/hg16/chrom.sizes \
/cluster/data/mm3/chrom.sizes ../n1/chr1.net /dev/null
cd ..
cp hNoClass.net hNoClass.net.old
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
netClass hNoClass.net hg16 mm3 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
# rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
mv mouseSyn.net mouseSyn.net.old
netFilter -syn mouse.net > mouseSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg16 netMm3 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm3 stdin
# make tight subset of net
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
mv ../axtNet/chr1.axt ../axtNet/chr1.old.axt
netToAxt mouseNet/chr1.net chain/chr1.chain /cluster/data/hg16/nib \
/cluster/data/mm3.RM030619/mixedNib ../axtNet/chr1.axt
mv ../axtTight/chr1.axt ../axtTight/chr1.axt.old
cd ../axtNet
subsetAxt chr1.axt ../axtTight/chr1.axt \
/cluster/data/subsetAxt/coding.mat 3400
# translate to psl
cd ../axtTight
axtToPsl chr1.axt ../S1.len ../S2.len ../pslTight/chr1_blastzTightMm3.psl
# Load table into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/pslTight
hgLoadPsl -noTNameIx hg16 chr1_blastzTightMm3.psl
# $ featureBits -chrom=chr1 hg16 chr1_blastzTightMm3.psl
# 14052627 bases of 221562941 (6.342%) in intersection
# hg15: 13990547 bases of 218713898 (6.397%) in intersection
# make axtNet300
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
netSplit mouse.net mouseNet
mv ../axtNet300/chr1.axt ../axtNet300/chr1.axt.old
netToAxt -maxGap=300 mouseNet/chr1.net chain/chr1.chain /cluster/data/hg16/nib /cluster/data/mm3.RM030619/mixedNib ../axtNet300/chr1.axt
# create 2-way maf file for humor alignment
set multizDir = /cluster/data/hg16/bed/humor.2003-09-02
cd /cluster/data/hg16
set mouseDir = bed/blastz.mm3/axtNet300
axtSort $mouseDir/chr1.axt $mouseDir/chr1.axt.sorted
mv $mouseDir/chr1.axt.sorted $mouseDir/chr1.axt
axtToMaf $mouseDir/chr1.axt \
/cluster/data/hg16/chrom.sizes /cluster/data/mm3/chrom.sizes \
$multizDir/maf/chr1.mm3.maf.unfixed -tPrefix=hg16. -qPrefix=mm3.
/cluster/bin/scripts/fixmaf.pl \
< $multizDir/maf/chr1.mm3.maf.unfixed > $multizDir/maf/chr1.mm3.maf
rm $multizDir/maf/chr1.mm3.maf.unfixed
# NET MOUSE BLASTZ (DONE - 2003-08-22 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
/cluster/data/mm3/chrom.sizes ../preNet/$i
end
# This foreach loop will take about 15 min to execute.
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
/cluster/data/mm3/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2490523648, utime 15421 s/100, stime 3665
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
~/bin/i386/netClass hNoClass.net hg16 mm3 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman
# If things look good do
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
netFilter -syn mouse.net > mouseSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg16 netMm3 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm3 stdin
# make net
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
mkdir mouseNet
netSplit mouse.net mouseNet
foreach n (mouseNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt mouseNet/$c.net chain/$c.chain \
/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
/cluster/data/mm3.RM030619/mixedNib \
../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
# MAKE BLASTZ BEST MOUSE MM3 (DONE - 2003-08-26 - Hiram)
# IMPORTANT NOTE - This axtBest process has been replaced by the
# chain to net to axt process. Note: the procedure below continues
# after the chain and nets have been produced.
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChrom
mkdir -p /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom
# copy chrom axt's to bluearc, to avoid hitting fileserver too hard
cp -p *.axt /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom
# chr19 came along later:
cp -p chr19.axt /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3
mkdir -p axtBest pslBest
mkdir run.3
cd run.3
# create script to filter files
cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastz.mm3/S1.len \
/cluster/data/hg16/bed/blastz.mm3/S2.len $4
'_EOF_'
# << this line makes emacs coloring happy
chmod +x doBestAxt
cd ../axtChrom
ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
cd ../run.3
# create template for cluster job
cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastz.mm3/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.mm3/axtBest/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.mm3/pslBest/$(root1)_blastzBestMm3.psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 chrom.list single gsub jobList
wc -l jobList
# 41 jobs
head jobList
ssh kk
cd /cluster/data/hg16/bed/blastz.mm3
cd run.3
para create jobList
para try
para check
para push
# With the chr19 situation, went back and reran this step.
# For some unknown reason the first time it had 9 failed jobs:
# Completed: 32 of 41 jobs
# Crashed: 9 jobs
# CPU time in finished jobs: 827s 13.78m 0.23h 0.01d 0.000 y
# IO & Wait Time: 1299s 21.65m 0.36h 0.02d 0.000 y
# Average job time: 66s 1.11m 0.02h 0.00d
# Longest job: 361s 6.02m 0.10h 0.00d
# Submission to last job: 1195s 19.92m 0.33h 0.01d
# And then rerunning those 9 failed jobs, only chr19 failed:
# Completed: 8 of 9 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 748s 12.47m 0.21h 0.01d 0.000 y
# IO & Wait Time: 2290s 38.16m 0.64h 0.03d 0.000 y
# Average job time: 380s 6.33m 0.11h 0.00d
# Longest job: 1247s 20.78m 0.35h 0.01d
# Submission to last job: 1261s 21.02m 0.35h 0.01d
# Better yet, Jim says to be consistent, do all the chroms in
# this manner:
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
mkdir mouseNet
netSplit mouse.net mouseNet
foreach n (mouseNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt mouseNet/$c.net chain/$c.chain \
/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
/cluster/data/mm3.RM030619/mixedNib \
../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
mkdir -p /cluster/data/hg16/bed/blastz.mm3/axtBest
cd /cluster/data/hg16/bed/blastz.mm3/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area (DONE 2003-09-24 kate)
cd /cluster/data/hg16/bed/blastz.mm3/axtNet
gzip *.axt
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtNet
cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtNet
# add README.txt file to dir, if needed
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestMm3.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestMm3.psl
echo "Done: ${c}_blastzBestMm3.psl"
end
# Load tables
ssh hgwdev
set base="/cluster/data/hg16/bed/blastz.mm3"
set tbl="blastzBestMm3"
cd $base/pslBest
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
# check results
# the original axtBest stuff from the axtBest operation:
# featureBits hg16 blastzBestMm3
# 1027438291 bases of 2865248791 (35.859%) in intersection
# After going through the chain->net->axt operation:
# featureBits hg16 blastzBestMm3
# 991468768 bases of 2865248791 (34.603%) in intersection
# And finally after fixing a blastz execution problem on chr1:
# 1007362800 bases of 2865248791 (35.158%) in intersection
# featureBits hg15 blastzBestMm3
# 1035090465 bases of 2866466359 (36.110%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg16/axtBestMm3
cd /gbdb/hg16/axtBestMm3
foreach f (/cluster/data/hg16/bed/blastz.mm3/axtNet/chr*.axt)
ln -s $f .
end
cd /cluster/data/hg16/bed/blastz.mm3/axtNet
rm -f axtInfoInserts.sql
touch axtInfoInserts.sql
foreach f (/gbdb/hg16/axtBestMm3/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('mm3','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
hgsql hg16 < axtInfoInserts.sql
# MAKING THE AXTTIGHT FROM AXTBEST (DONE - 2003-08-25 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtNet
mkdir -p ../axtTight
tcsh
foreach i (*.axt)
echo $i
subsetAxt $i ../axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir -p ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm3.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/pslTight
hgLoadPsl -noTNameIx hg16 chr*_blastzTightMm3.psl
    # copy axt's to download area (DONE 2003-09-24 kate)
cd /cluster/data/hg16/bed/blastz.mm3/axtTight
gzip *.axt
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtTight
cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsMm3/axtTight
# add README.txt file to dir, if needed
# CHAIN MOUSE BLASTZ (DONE 2003-08-28 - Hiram)
# Run axtChain on little cluster
ssh kkr1u00
mkdir -p /cluster/data/hg16/bed/blastz.mm3/axtChain/run1
cd /cluster/data/hg16/bed/blastz.mm3/axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg16/bed/blastz.mm3/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# axtFilter -notQ=chrUn_random $1 | axtChain stdin
cat << '_EOF_' > doChain
#!/bin/csh
axtFilter -notQ=chrUn_random $1 | axtChain stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/mm3.RM030619/mixedNib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 41 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 41 of 41 jobs
# CPU time in finished jobs: 31379s 522.98m 8.72h 0.36d 0.001 y
# IO & Wait Time: 10761s 179.35m 2.99h 0.12d 0.000 y
# Average job time: 1028s 17.13m 0.29h 0.01d
# Longest job: 10327s 172.12m 2.87h 0.12d
# Submission to last job: 10327s 172.12m 2.87h 0.12d
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# these steps take ~20 minutes
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_chainMm3 $i
echo done $c
end
# NET MOUSE BLASTZ (DONE - 2003-08-22 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
/cluster/data/mm3/chrom.sizes ../preNet/$i
end
# This foreach loop will take about 15 min to execute.
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
/cluster/data/mm3/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2490523648, utime 15421 s/100, stime 3665
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
~/bin/i386/netClass hNoClass.net hg16 mm3 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/mm3.RM030619/linSpecRep.notInHuman
# If things look good do
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
netFilter -syn mouse.net > mouseSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg16 netMm3 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm3 stdin
# Add entries for net and chain to human/hg16 trackDb
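    # For reference, minimal entries along these lines would do (a sketch
    # only -- labels and ordering are illustrative, not the exact entries
    # committed to trackDb.ra; compare the blastzRn3 entry shown below):
    # track chainMm3
    # shortLabel Mouse Chain
    # longLabel Mouse (Feb. 2003/Mm3) Chained Alignments
    # group compGeno
    # visibility hide
    # type chain mm3
    # otherDb mm3
    #
    # track netMm3
    # shortLabel Mouse Net
    # longLabel Mouse (Feb. 2003/Mm3) Alignment Net
    # group compGeno
    # visibility hide
    # type netAlign mm3 chainMm3
    # otherDb mm3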
# MAKE HUMAN-MOUSE MM3 OVER.CHAIN FOR LIFTOVER (2004-07-09 kate)
ssh eieio
    set chainDir = /cluster/data/hg16/bed/blastz.mm3/axtChain
    # the nets used below must be uncompressed; the chains stay gzipped
    cd $chainDir/mouseNet
    gunzip *.gz
ssh kolossus
set chainDir = /cluster/data/hg16/bed/blastz.mm3/axtChain
cd $chainDir
mkdir subset
cat > makeSubset.csh << 'EOF'
set chainDir = /cluster/data/hg16/bed/blastz.mm3/axtChain
foreach f ($chainDir/chain/*.chain.gz)
set c = $f:t:r:r
echo subsetting $c
gunzip -c $f | netChainSubset $chainDir/mouseNet/$c.net stdin \
subset/$c.chain
end
'EOF'
# << for emacs
csh makeSubset.csh >&! makeSubset.log &
tail -100f makeSubset.log
cat subset/*.chain > /cluster/data/hg16/bed/liftOver/hg16Tomm3.chain
# test reciprocal best on chr6 for ENr233
ssh kkstore
cd /cluster/data/hg16/bed/blastz.mm3/axtChain
# renumber chains to assure unique ID's,
# since netting splits some (should redo the liftOver chain with new ID's)
# then sort by score for netter
mkdir uniqueSubset
chainMergeSort subset/chr6.chain > uniqueSubset/chr6.chain
mkdir swappedSubset
chainSwap uniqueSubset/chr6.chain swappedSubset/chr6.chain
mkdir recipBestTest
cd recipBestTest
chainSort ../uniqueSubset/chr6.chain stdout | \
chainNet stdin \
/cluster/data/hg16/chrom.sizes /cluster/data/mm3/chrom.sizes\
human.chr6.net mouse.chr6.net
netChainSubset mouse.chr6.net ../swappedSubset/chr6.chain stdout | \
chainSwap stdin chr6.chain
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm3/axtChain/recipBestTest
hgLoadChain hg16 rBestChainMm3 chr6.chain
    # didn't filter enough -- perhaps didn't rechain in the proper direction
# BLASTZ RAT (DONE - 2003-08-07 - Hiram)
ssh eieio
mkdir -p /cluster/data/hg16/bed/blastz.rn3
cd /cluster/data/hg16/bed/blastz.rn3
cat << '_EOF_' > DEF
# rat vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInRat
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Rat
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/store4/gs.17/build34/bed/blastz.rn3
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place
DS=`date -I`
cp DEF ~angie/hummus/DEF.rn3-hg16.$DS
ssh kk
cd /cluster/data/hg16/bed/blastz.rn3
# source the DEF file to establish environment for following commands
. ./DEF
# follow the next set of directions slavishly
mkdir -p $BASE/run
# give up on avoiding angie's directories
# tcl script
# creates xdir.sh and joblist run/j
~angie/hummus/make-joblist $DEF > $BASE/run/j
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
sh $BASE/xdir.sh
cd $BASE/run
# now edit j to prefix path to executable name
# NOTE: we should have a controlled version of schwartz bin executables
sed -e 's#^#/cluster/bin/penn/#' j > j2
wc -l j*
head j2
# make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
# 39663 jobs
para try
para check
para push
# ... etc ...
# Completed: 41697 of 41697 jobs
# CPU time in finished jobs: 14155946s 235932.43m 3932.21h 163.84d 0.449 y
# IO & Wait Time: 1005629s 16760.49m 279.34h 11.64d 0.032 y
# Average job time: 364s 6.06m 0.10h 0.00d
# Longest job: 4310s 71.83m 1.20h 0.05d
# Submission to last job: 35086s 584.77m 9.75h 0.41d
# post-process blastz
ssh kk
cd /cluster/data/hg16/bed/blastz.rn3
# source the DEF file again in case you are coming back to this
# (must be bash shell)
. ./DEF
# a new run directory
mkdir -p run.1
mkdir -p $BASE/lav
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
> run.1/jobList
cd run.1
# make sure the job list is OK
wc -l jobList
# 339 jobs
head jobList
# run on cluster
ssh kk
cd /cluster/data/hg16/bed/blastz.rn3/run.1
para create jobList
para try
para check
para push
# etc.
# Completed: 339 of 339 jobs
# CPU time in finished jobs: 6562s 109.37m 1.82h 0.08d 0.000 y
# IO & Wait Time: 154475s 2574.58m 42.91h 1.79d 0.005 y
# Average job time: 475s 7.92m 0.13h 0.01d
# Longest job: 924s 15.40m 0.26h 0.01d
# Submission to last job: 933s 15.55m 0.26h 0.01d
# convert lav files to axt
ssh kk
cd /cluster/data/hg16/bed/blastz.rn3
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
# create template file for gensub2
# usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastz.rn3/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastz.rn3/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/rn3/bothMaskedNibs
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /cluster/store4/gs.17/build34/bed/blastz.rn3/lav > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
# 42 jobs
head jobList
para create jobList
para try
para check
para push
# ... etc ...
# The crashed job is again chr19_random
# Completed: 41 of 42 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 1507s 25.12m 0.42h 0.02d 0.000 y
# IO & Wait Time: 17520s 292.00m 4.87h 0.20d 0.001 y
# Average job time: 464s 7.73m 0.13h 0.01d
# Longest job: 1214s 20.23m 0.34h 0.01d
# Submission to last job: 1214s 20.23m 0.34h 0.01d
# Remove the empty axtChrom/chr19_random.axt file to avoid future
# processing errors
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3
mkdir -p pslChrom
set tbl = "blastzRn3"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 20 minutes
# Load database tables
ssh hgwdev
set tbl = "blastzRn3"
cd /cluster/data/hg16/bed/blastz.rn3/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
# This takes 30 minutes to an hour
# New entry in human/hg16/trackDb.ra
# track blastzRn3
# shortLabel Rat Blastz
# longLabel Merged Blastz Rat (June 03) Alignments
# group compGeno
# priority 142
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno rn3
# otherDb rn3
# MAKE BLASTZ BEST RAT RN3 (DONE - 2003-08-08 - Hiram - Redone 08-26)
# IMPORTANT NOTE - this axtBest process has been replaced by
# the chain -> net -> netToAxt process. So, after chains and
# nets have been created, pick up this best process below.
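#     In outline, the replacement pipeline is (a summary of the steps
#     documented in the mouse section above and repeated for rat below):
#       axtChain -> chainMergeSort -> chainSplit -> chainPreNet -> chainNet
#         -> netSyntenic -> netClass -> netSplit -> netToAxt -> axtToPsl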
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtChrom
mkdir -p /cluster/bluearc/hg16/bed/blastz.rn3/axtChrom
# copy chrom axt's to bluearc, to avoid hitting fileserver too hard
cp -p *.axt /cluster/bluearc/hg16/bed/blastz.rn3/axtChrom
ssh kk
cd /cluster/data/hg16/bed/blastz.rn3
mkdir -p axtBest pslBest
mkdir run.3
cd run.3
# create script to filter files
cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastz.rn3/S1.len \
/cluster/data/hg16/bed/blastz.rn3/S2.len $4
'_EOF_'
# << this line makes emacs coloring happy
chmod +x doBestAxt
cd ../axtChrom
ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
cd ../run.3
# create template for cluster job
cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastz.rn3/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.rn3/axtBest/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.rn3/pslBest/$(root1)_blastzBestRn3.psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 chrom.list single gsub jobList
wc -l jobList
# 41 jobs
head jobList
cd /cluster/data/hg16/bed/blastz.rn3
cd run.3
para create jobList
para try
para check
para push
# 106 minutes, almost all I/O time:
# Completed: 41 of 41 jobs
# CPU time in finished jobs: 2225s 37.09m 0.62h 0.03d 0.000 y
# IO & Wait Time: 36349s 605.81m 10.10h 0.42d 0.001 y
# Average job time: 941s 15.68m 0.26h 0.01d
# Longest job: 6415s 106.92m 1.78h 0.07d
# Submission to last job: 6417s 106.95m 1.78h 0.07d
# Better yet, Jim says to be consistent, do all the chroms in
# this manner:
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtChain
mkdir ratNet
netSplit rat.net ratNet
mkdir ../axtNet
foreach n (ratNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt -maxGap=25 ratNet/$c.net chain/$c.chain \
/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
/cluster/bluearc/rat/rn3/softNib \
../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
mkdir -p /cluster/data/hg16/bed/blastz.rn3/axtBest
cd /cluster/data/hg16/bed/blastz.rn3/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area (DONE 2003-09-24 kate)
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtNet
gzip *.axt
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtNet
cd /cluster/data/hg16/bed/blastz.rn3/axtNet
cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtNet
# add README.txt file to dir, if needed
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestRn3.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestRn3.psl
echo "Done: ${c}_blastzBestRn3.psl"
end
# Load tables
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.rn3/pslBest
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzBestRn3.psl
# check results
# Via the netToAxt process:
# featureBits hg16 blastzBestRn3
# 976121391 bases of 2865248791 (34.068%) in intersection
# With the original axtBest process, before the netToAxt process:
# featureBits hg16 blastzBestRn3
# 1002119325 bases of 2865248791 (34.975%) in intersection
# Hg15 results:
# featureBits hg15 blastzBestRn3
# 992724355 bases of 2866466359 (34.632%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg16/axtBestRn3
cd /gbdb/hg16/axtBestRn3
ln -s /cluster/data/hg16/bed/blastz.rn3/axtNet/chr*.axt .
cd /cluster/data/hg16/bed/blastz.rn3/axtNet
rm -f axtInfoInserts.sql
touch axtInfoInserts.sql
foreach f (/gbdb/hg16/axtBestRn3/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('rn3','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
# Already done above. This table needs definition only once
# hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
hgsql hg16 < axtInfoInserts.sql
# MAKING RAT AXTTIGHT FROM AXTBEST (DONE - 2003-08-26 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtNet
mkdir -p ../axtTight
tcsh
foreach i (*.axt)
echo $i
subsetAxt $i ../axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir -p ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRn3.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.rn3/pslTight
hgLoadPsl -noTNameIx hg16 chr*_blastzTightRn3.psl
# copy axt's to download area (DONE 2003-09-24 kate)
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtTight
gzip *.axt
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtTight
cd /cluster/data/hg16/bed/blastz.rn3/axtTight
cp -p *.axt.gz /usr/local/apache/htdocs/goldenPath/hg16/vsRn3/axtTight
# add README.txt file to dir, if needed
# CHAIN RAT BLASTZ (DONE 2003-08-08 - Hiram)
# Run axtChain on little cluster
ssh kkr1u00
cd /cluster/data/hg16/bed/blastz.rn3
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg16/bed/blastz.rn3/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# axtFilter -notQ=chrUn_random $1 | axtChain stdin
cat << '_EOF_' > doChain
#!/bin/sh
axtFilter $1 | axtChain stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/rn3/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 41 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# With only 6 CPUs available:
# Completed: 40 of 40 jobs
# CPU time in finished jobs: 21791s 363.19m 6.05h 0.25d 0.001 y
# IO & Wait Time: 12491s 208.18m 3.47h 0.14d 0.000 y
# Average job time: 857s 14.28m 0.24h 0.01d
# Longest job: 2724s 45.40m 0.76h 0.03d
# Submission to last job: 5875s 97.92m 1.63h 0.07d
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtChain
/cluster/bin/i386/chainMergeSort run1/chain/*.chain > all.chain
/cluster/bin/i386/chainSplit chain all.chain
# these steps take ~20 minutes
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.rn3/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_chainRn3 $i
echo done $c
end
# CREATE bigZips stuff for release (DONE 2003-08-01, 08-06, 08-08 - Hiram)
# make bigZips/mrna.zip (markd 8 aug 2003)
# on hgbeta:
cd /genbank
./bin/i386/gbGetSeqs -get=seq -db=hg16 -native genbank mrna download/hg16/bigZips/mrna.fa
zip download/hg16/bigZips/mrna.zip download/hg16/bigZips/mrna.fa
rm download/hg16/bigZips/mrna.fa
ssh hgwdev
    # These files have to be made in a different way because the refGene
    # table they derive from updates on a daily basis. - (DONE 2003-08-09 - Hiram)
cd /usr/local/apache/htdocs/goldenPath/hg16/bigZips
featureBits hg16 refGene:upstream:1000 -fa=upstream1000.fa
zip upstream1000.zip upstream1000.fa
rm upstream1000.fa
featureBits hg16 refGene:upstream:2000 -fa=upstream2000.fa
zip upstream2000.zip upstream2000.fa
rm upstream2000.fa
featureBits hg16 refGene:upstream:5000 -fa=upstream5000.fa
zip upstream5000.zip upstream5000.fa
rm upstream5000.fa
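    # The three zips above could equally be produced with a loop, in the
    # style used for the humorMm3Rn3 upstream maf files below (a sketch):
    #   foreach i (1000 2000 5000)
    #     featureBits hg16 refGene:upstream:$i -fa=upstream$i.fa
    #     zip upstream$i.zip upstream$i.fa
    #     rm upstream$i.fa
    #   end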
# MAKING MOUSE AND RAT SYNTENY (MOUSE done 2003-09-16)(RAT Done 2003-08-28)
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/syntenyMm3
cd /cluster/data/hg16/bed/syntenyMm3
# Copy all the needed scripts from /cluster/data/hg15/bed/syntenyMouse
cp -p /cluster/data/hg15/bed/syntenyMouse/*.pl .
./syntenicBest.pl -db=hg16 -table=blastzBestMm3
./smooth.pl
./joinsmallgaps.pl
./fillgap.pl -db=hg16 -table=blastzBestMm3
./synteny2bed.pl
hgLoadBed hg16 syntenyMm3 ucsc100k.bed
# And for the Rat, same thing, different directory:
mkdir ../syntenyRn3
cd ../syntenyRn3
../syntenyMm3/syntenicBest.pl -db=hg16 -table=blastzBestRn3
    # smooth.pl overwrites the genomeBest2phase file created by the
    # Mm3 run above. Runs quickly.
../syntenyMm3/smooth.pl
# joinsmallgaps.pl overwrites genomeBest3phase created above. Runs quickly.
../syntenyMm3/joinsmallgaps.pl
# fillgap.pl creates genomeBestFinal
../syntenyMm3/fillgap.pl -db=hg16 -table=blastzBestRn3
# synteny2bed.pl creates ucsc100k.bed
../syntenyMm3/synteny2bed.pl
hgLoadBed hg16 syntenyRn3 ucsc100k.bed
# Loaded 1537 elements
# NET RAT BLASTZ (WORKING - 2003-08-11 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
/cluster/data/rn3/chrom.sizes ../preNet/$i
end
# This foreach loop will take about 15 min to execute.
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
/cluster/data/rn3/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2511495168, utime 15658 s/100, stime 3383
    # The netClass operation requires an "ancientRepeat" table to exist
# in either hg16 or rn3. So, create the table:
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/ancientRepeat
cd /cluster/data/hg16/bed/ancientRepeat
# mysqldump needs write permission to this directory
# and you need to use your read/write enabled user with password
chmod 777 .
hgsqldump --all --tab=. hg15 ancientRepeat
chmod 775 .
hgsql hg16 < ancientRepeat.sql
mysqlimport -u<r/w user> -p<r/w pass> hg16 ancientRepeat.txt
# This is a hand curated table obtained from Arian.
# The mouse.net argument here should have been rat.net
cd /cluster/data/hg16/bed/blastz.rn3/axtChain
/cluster/bin/i386/netClass hNoClass.net hg16 rn3 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInRat \
-qNewR=/cluster/bluearc/rat/rn3/linSpecRep.notInHuman
# If things look good do
ssh eieio
cd /cluster/data/hg16/bed/blastz.rn3/axtChain
rm -r n1 hNoClass.net
# The arguments here should have been rat.net and ratSyn.net
# Make a 'syntenic' subset of these with
netFilter -syn mouse.net > mouseSyn.net
# The mouse.net argument here should have been rat.net from the
# netClass operation above.
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.rn3/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg16 netRn3 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetRn3 stdin
# Add entries for net and chain to human/hg16 trackDb
# MAKE HUMAN-RAT OVER.CHAIN FOR LIFTOVER (2004-07-09 kate)
ssh eieio
set chainDir = /cluster/data/hg16/bed/blastz.rn3/axtChain
cd $chainDir/ratNet
gunzip *.gz
ssh kolossus
cd /cluster/data/hg16/bed/liftOver
mkdir hg16Torn3
cd hg16Torn3
set chainDir = /cluster/data/hg16/bed/blastz.rn3/axtChain
mkdir subset
cat > makeSubset.csh << 'EOF'
set chainDir = /cluster/data/hg16/bed/blastz.rn3/axtChain
foreach f ($chainDir/chain/*.chain)
set c = $f:t:r:r
echo subsetting $c
netChainSubset $chainDir/ratNet/$c.net $f subset/$c.chain
end
'EOF'
# << for emacs
csh makeSubset.csh >&! makeSubset.log &
tail -100f makeSubset.log
cat subset/*.chain > /cluster/data/hg16/bed/liftOver/hg16Torn3.chain
# Make Known Genes Track
# This task has many steps and currently it is described by two documents:
# 1. makeProteins072003.doc
#    describes how the protein databases, biosql072003 and proteins072003,
#    were built
# 2. makeKgHg16.doc
#    describes how the Known Genes related database tables
#    were built for hg16. makeKgHg16.doc could be merged
#    with makeHg16.doc after minor editing of the format style.
# LIFTING REPEATMASKER .ALIGN FILES
# for this work, I had to delete some comments that were in the .align files.
# The edited files were
# NT_008046_01.fa.align (around line 10586)
# NT_008046_75.fa.align (around line 3320)
# The lines I deleted are:
#
# These elements can be clipped out with the options is_clip or is_only.
# The latter does not run the 'normal' RepeatMasker routine and positions in the current
# .out file will not correspond with the -is_only reconstructed sequence.
#
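# A scripted equivalent of those hand edits (a sketch only; the fixed
# strings below are assumptions taken from the deleted text quoted above,
# and the loop is meant to run in the directory holding the .align files):
#   foreach f (NT_008046_01.fa.align NT_008046_75.fa.align)
#     egrep -v 'is_clip or is_only|RepeatMasker routine|reconstructed sequence' \
#       $f > $f.clean
#     mv $f.clean $f
#   end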
foreach d (?{,?}/NT_??????)
set c=$d:t
cd $d
echo $c to $c.fa.align
/cluster/bin/scripts/liftRMAlign.pl $c.lft > $c.fa.align
cd ../..
end
foreach chr (?{,?})
cd $chr
echo making symbolic links for chr$chr NT .fa.align files
foreach ctg (NT_??????)
ln -s $ctg/$ctg.fa.align
end
cd ..
if (-e $chr/lift/ordered.lft) then
echo making $chr/chr$chr.fa.align
/cluster/bin/scripts/liftRMAlign.pl $chr/lift/ordered.lft \
> $chr/chr$chr.fa.align
endif
if (-e $chr/lift/random.lft) then
echo making $chr/chr${chr}_random.fa.align
/cluster/bin/scripts/liftRMAlign.pl $chr/lift/random.lft \
> $chr/chr${chr}_random.fa.align
endif
echo removing symbolic links for chr$chr NT .fa.align files
rm $chr/NT_??????.fa.align
end
# TWINSCAN 1.3 GENE PREDICTIONS (2003-12-12 braney)
cd /cluster/data/hg16/bed
rm -fr twinscan
mkdir twinscan.2003-12-12
ln -s twinscan.2003-12-12 twinscan
cd twinscan
tarFile=Hs-NCBI34-TS13-pseudo-masked.tgz
wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13_pseudo/Hs-NCBI34-TS13-pseudo-masked.tgz
wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13_pseudo/md5sum.txt
# check file transferred correctly
md5sum $tarFile | diff - md5sum.txt
tar xvfz $tarFile
unset tarFile
# pare down protein FASTA header to id and add missing .a:
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo chr$c
perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
end
ldHgGene hg16 twinscan chr_gtf/chr*.gtf -gtf -genePredExt
hgPepPred hg16 generic twinscanPep chr_ptx/chr*-fixed.fa
# RAW TWINSCAN 1.3 GENE PREDICTIONS, WITHOUT FILTERING OF PSEUDOGENES
# (2004-01-11 acs)
cd /cluster/data/hg16/bed
mkdir twinscan_raw.2004-01-11
    ln -s twinscan_raw.2004-01-11 twinscan_raw
cd twinscan_raw
tarFile=NCBI34_Hs_TS13_11_11_03.tgz
wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13/$tarFile
wget http://genes.cs.wustl.edu/predictions/human/NCBI34_TS13/md5sum.txt
# check file transferred correctly
md5sum $tarFile | diff - md5sum.txt
tar xvfz $tarFile
unset tarFile
# pare down protein FASTA header to id and add missing .a:
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo chr$c
perl -wpe 's/^(\>\S+)\s.*$/$1.a/' < chr_ptx/chr$c.ptx > chr_ptx/chr$c-fixed.fa
end
ldHgGene hg16 twinscan_raw chr_gtf/chr*.gtf -gtf
hgPepPred hg16 generic twinscanrawPep chr_ptx/chr*-fixed.fa
# LOAD GENEID GENES (DONE - 2003-09-02 - Hiram RELOADED -gtf 2004-04-02 kate)
mkdir -p /cluster/data/hg16/bed/geneid/download
cd /cluster/data/hg16/bed/geneid/download
# Now download *.gtf and *.prot from
set dir = genome.imim.es/genepredictions/H.sapiens/golden_path_200307/geneid_v1.1/
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y Un)
wget http://$dir/chr$c.gtf
wget http://$dir/chr${c}_random.gtf
wget http://$dir/chr$c.prot
wget http://$dir/chr${c}_random.prot
end
wget http://$dir/readme
# Add missing .1 to protein id's
foreach f (*.prot)
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
echo "done $f"
end
cd ..
ldHgGene hg16 geneid download/*.gtf -gtf
# Read 32255 transcripts in 281180 lines in 40 files
# 32255 groups 40 seqs 1 sources 3 feature types
# 32255 gene predictions
hgPepPred hg16 generic geneidPep download/*-fixed.prot
# QA NOTE: [ASZ 2007-10-02] sudo mytouch hg16 geneidPep 200404021400.00
# HUMAN/MOUSE/RAT ALIGNMENT USING HUMOR (MULTIZ) (IN PROGRESS 2003-08-29 kate)
# Multiple alignment with Mm3, Rn3
ssh eieio
# make mouse axtNet300
cd /cluster/data/hg16/bed/blastz.mm3/axtChain/mouseNet
mkdir -p ../../axtNet300
foreach f (chr*.net)
set c = $f:r
echo "mouse axtNet300 on $c"
netToAxt -maxGap=300 $c.net ../chain/$c.chain /cluster/data/hg16/nib /cluster/data/mm3.RM030619/mixedNib ../../axtNet300/$c.axt
end
# make rat axtNet300
cd /cluster/data/hg16/bed/blastz.rn3/axtChain/ratNet
mkdir -p ../../axtNet300
foreach f (chr*.net)
set c = $f:r
echo "rat axtNet300 on $c"
netToAxt -maxGap=300 $c.net ../chain/$c.chain /cluster/data/hg16/nib /cluster/data/rn3/nib ../../axtNet300/$c.axt
end
# create 2-way maf files
#set multizDir = /cluster/data/hg16/bed/humor.2003-09-02
set multizDir = /cluster/data/hg16/bed/humor.2003-09-08
mkdir -p $multizDir/maf
cd /cluster/data/hg16
set mouseDir = bed/blastz.mm3/axtNet300
set ratDir = bed/blastz.rn3/axtNet300
foreach c (`cut -f 1 chrom.sizes`)
echo "making mouse mafs on $c"
# NOTE: this sort should probably be earlier in the pipeline
axtSort $mouseDir/$c.axt $mouseDir/$c.axt.sorted
mv $mouseDir/$c.axt.sorted $mouseDir/$c.axt
axtToMaf $mouseDir/$c.axt /cluster/data/hg16/chrom.sizes /cluster/data/mm3/chrom.sizes $multizDir/maf/$c.mm3.maf.unfixed -tPrefix=hg16. -qPrefix=mm3.
/cluster/bin/scripts/fixmaf.pl \
< $multizDir/maf/$c.mm3.maf.unfixed > $multizDir/maf/$c.mm3.maf
echo "making rat mafs on $c"
axtSort $ratDir/$c.axt $ratDir/$c.axt.sorted
mv $ratDir/$c.axt.sorted $ratDir/$c.axt
axtToMaf $ratDir/$c.axt /cluster/data/hg16/chrom.sizes /cluster/data/rn3/chrom.sizes $multizDir/maf/$c.rn3.maf.unfixed -tPrefix=hg16. -qPrefix=rn3.
/cluster/bin/scripts/fixmaf.pl \
< $multizDir/maf/$c.rn3.maf.unfixed > $multizDir/maf/$c.rn3.maf
rm $multizDir/maf/*.unfixed
end
# copy maf files to bluearc for cluster run
set clusterDir = /cluster/bluearc/hg16/bed
mkdir $clusterDir/blastz.mm3/mafNet300
cp $multizDir/maf/*.mm3.maf $clusterDir/blastz.mm3/mafNet300
mkdir /cluster/bluearc/hg16/bed/blastz.rn3/mafNet300
cp $multizDir/maf/*.rn3.maf $clusterDir/blastz.rn3/mafNet300
# create scripts to run on cluster
# run "humor"
cd $multizDir
mkdir hmr
mkdir run
cd run
cat << EOF > doHumor.kk
/cluster/bin/penn/humor.v4 $clusterDir/blastz.mm3/mafNet300/\$1.mm3.maf $clusterDir/blastz.rn3/mafNet300/\$1.rn3.maf > $multizDir/hmr/\$1.hmr.maf
EOF
chmod +x doHumor.kk
cat << EOF > gsub
#LOOP
doHumor.kk \$(root1) {check out line+ $multizDir/hmr/\$(root1).hmr.maf}
#ENDLOOP
EOF
cd $clusterDir/blastz.mm3/mafNet300
# NOTE: probably want a better way to make the chrom list
ls *.maf | awk -F. '{print $1}' > $multizDir/run/chrom.list
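    # one simpler possibility (a sketch): derive the list directly from
    # chrom.sizes, which the maf-building loop above already iterates over:
    #   cut -f 1 /cluster/data/hg16/chrom.sizes > $multizDir/run/chrom.list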
cd $multizDir/run
gensub2 chrom.list single gsub jobList
# run jobs
ssh kkr9u01
#set multizDir = /cluster/data/hg16/bed/humor.2003-09-02
set multizDir = /cluster/data/hg16/bed/humor.2003-09-08
cd $multizDir/run
para create jobList
para try
para check
para push
# longest job 27 minutes
# setup external files for database reference
ssh hgwdev
mkdir -p /gbdb/hg16/humorMm3Rn3
cd /gbdb/hg16/humorMm3Rn3
foreach f ($multizDir/hmr/*.maf)
ln -s $f .
end
# load into database
    # cd $multizDir/hmr
/cluster/bin/i386/hgLoadMaf -warn hg16 humorMm3Rn3
# copy files to download area (2003-10-24 kate)
set dir = /usr/local/apache/htdocs/goldenPath/hg16/humorMm3Rn3
mkdir -p $dir
cp -p /gbdb/hg16/humorMm3Rn3/*.maf $dir
cd $dir
gzip *
    # edit downloads page to add link to humorMm3Rn3
# add pairwise mafs to downloads page (2003-11-25 kate)
set dir = /usr/local/apache/htdocs/goldenPath/hg16/humorMm3Rn3
mkdir $dir/{rn3,mm3}
cd /cluster/data/hg16/bed/humor/maf
cp *.mm3.maf $dir/mm3
cp *.rn3.maf $dir/rn3
gzip $dir/mm3/*
gzip $dir/rn3/*
# Create upstream files (kent)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg16/humorMm3Rn3
echo hg16 mm3 rn3 > org.txt
foreach i (1000 2000 5000)
featureBits hg16 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
mafFrags hg16 humorMm3Rn3 up.bed upstream$i.maf -orgs=org.txt
rm up.bed
end
# MAKING BLASTZ SELF (DONE - 2003-08-08 - Hiram)
# The procedure for lineage spec business with self is to simply
# use the actual repeat masker output for this human assembly as
# the lineage specific repeats for itself. Thus, merely make
# symlinks to the repeat masker out files and name them as expected
# for blastz. In this case they are called notInHuman but they
# really mean InHuman. Yes, it is confusing, but that's just the
# nature of the game in this case.
ssh eieio
mkdir -p /cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
cd /cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
foreach f (../rmsk/*.fa.out)
set base = $f:t:r:r
echo $base.out.spec
ln -s $f $base.out.spec
end
ssh eieio
mkdir -p /cluster/data/hg16/bed/blastzSelf
cd /cluster/data/hg16/bed/blastzSelf
cat << '_EOF_' > DEF
# human vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Human
SEQ2_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/store4/gs.17/build34/bed/blastzSelf
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place
DS=`date -I`
cp DEF ~angie/hummus/DEF.hg16-hg16.$DS
ssh kk
cd /cluster/data/hg16/bed/blastzSelf
# source the DEF file to establish environment for following commands
. ./DEF
# follow the next set of directions slavishly
mkdir -p $BASE/run
# give up on avoiding angie's directories
# tcl script
# creates xdir.sh and joblist run/j
~angie/hummus/make-joblist $DEF > $BASE/run/j
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
sh $BASE/xdir.sh
cd $BASE/run
# now edit j to prefix path to executable name
# NOTE: we should have a controlled version of schwartz bin executables
sed -e 's#^#/cluster/bin/penn/#' j > j2
wc -l j*
# 114921 j
head j2
# make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
# 114921 jobs
para try
para check
para push
# ... etc ...
# With some cluster difficulties, bluearc hangups, etc:
# Completed: 114921 of 114921 jobs
# CPU time in finished jobs: 19898031s 331633.85m 5527.23h 230.30d 0.631 y
# IO & Wait Time: 42606494s 710108.24m 11835.14h 493.13d 1.351 y
# Average job time: 544s 9.06m 0.15h 0.01d
# Longest job: 111877s 1864.62m 31.08h 1.29d
# Submission to last job: 344744s 5745.73m 95.76h 3.99d
# post-process blastz
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf
# source the DEF file again in case you are coming back to this
# (must be bash shell)
. ./DEF
# a new run directory
mkdir -p run.1
mkdir -p $BASE/lav
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
> run.1/jobList
cd run.1
# make sure the job list is OK
wc -l jobList
# 339 jobs
head jobList
# run on cluster
ssh kk
cd /cluster/data/hg16/bed/blastzSelf/run.1
para create jobList
para try
para check
para push
# etc.
#Completed: 339 of 339 jobs
#CPU time in finished jobs: 21101s 351.68m 5.86h 0.24d 0.001 y
#IO & Wait Time: 74915s 1248.58m 20.81h 0.87d 0.002 y
#Average job time: 283s 4.72m 0.08h 0.00d
#Longest job: 2028s 33.80m 0.56h 0.02d
#Submission to last job: 2993s 49.88m 0.83h 0.03d
# convert lav files to axt
ssh kk
cd /cluster/data/hg16/bed/blastzSelf
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
# create template file for gensub2
# usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/scripts/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastzSelf/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastzSelf/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/gs.17/build34/bothMaskedNibs
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /cluster/store4/gs.17/build34/bed/blastzSelf/lav > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
# 42 jobs
head jobList
cd /cluster/data/hg16/bed/blastzSelf/run.2
para create jobList
para try
para check
para push
    # We have two crashed jobs here. The chr7 and chr19 data are too
    # large for this processing step. Have to run those separately on
    # the file server eieio.
# Completed: 40 of 42 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 4737s 78.95m 1.32h 0.05d 0.000 y
# IO & Wait Time: 57154s 952.57m 15.88h 0.66d 0.002 y
# Average job time: 1547s 25.79m 0.43h 0.02d
# Longest job: 7969s 132.82m 2.21h 0.09d
# Submission to last job: 8029s 133.82m 2.23h 0.09d
# Fixup chr7 and chr19 by running them in two passes like this:
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf
set base=/cluster/data/hg16/bed/blastzSelf
set seq1_dir=/cluster/data/hg16/nib
set seq2_dir=/cluster/data/hg16/nib
foreach c (lav/chr19 lav/chr7)
pushd $c
set chr=$c:t
set out=axtChrom/$chr.axt
echo "Translating $chr lav to $out"
foreach d (*.lav)
set smallout=$d.axt
lavToAxt $d $seq1_dir $seq2_dir stdout \
| axtDropSelf stdin stdout \
| axtSort stdin $smallout
end
cat `ls -1 *.lav.axt | sort -g` > $base/$out
popd
end
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf
    # Need to drop overlaps to eliminate diagonals.
    # axtDropOverlap seems to drop more than axtDropSelf above
    mkdir -p /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped
    foreach f (axtChrom/chr*.axt)
	set c=$f:t:r
/cluster/bin/i386/axtDropOverlap axtChrom/$c.axt \
/cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/$c.axt
echo "Done: $c"
end
cd /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped
gzip *.axt
    # Needed a delivery of these right away: (REMOVED 2005-01-27)
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
cd /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
cp -p /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/*.axt.gz .
ssh eieio
mkdir -p /cluster/data/hg16/bed/blastzSelf/pslChrom
cd /cluster/data/hg16/bed/blastzSelf
set tbl = "blastzSelf"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
zcat /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/${c}.axt.gz | \
/cluster/bin/i386/axtToPsl stdin S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 20 minutes
XXXX Pick this up tomorrow, 03-09-12 with pslChromDroppedFix
# Load database tables
ssh hgwdev
set tbl = "blastzSelf"
cd /cluster/data/hg16/bed/blastzSelf/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzSelf.psl
# This takes 30 minutes to an hour
    # create trackDb/human/hg16 and get a trackDb.ra file started,
    # then remake the trackDb tables
# PRODUCE FUGU BLAT ALIGNMENT (IN PROGRESS 2003-08-22 kate)
    # Use masked scaffolds from the fr1 assembly (same sequence as the
    # previous BlatFugu, but now repeat- and TRF-masked).
# NOTE: can't access /iscratch/i from fileserver
ssh kk
mkdir /cluster/data/hg16/bed/blatFr1
cd /cluster/data/hg16/bed/blatFr1
mkdir psl
# next time, use N?_?????? (to pick up NG_ contigs)
foreach f (/cluster/data/hg16/?{,?}/NT_??????/NT_??????.fa)
set c=$f:t:r
echo $c
mkdir -p psl/$c
end
# special case for NG_002432
mkdir -p psl/NG_002432
# create cluster job
    mkdir run
    cd run
ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > human.lst
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/hg16/bed/blatFr1/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
# << this line makes emacs coloring happy
gensub2 human.lst fugu.lst gsub spec
para create spec
# 283798 jobs
para try
para check
para push
para check
# cd psl
    # count files with alignments
    # find . -not -size 427c | wc -l
    #   89878
    # count files with no alignments
    # find . -size 427c | wc -l
    #   195265
# When cluster run is done, sort alignments
# into chrom directory
ssh eieio
cd /cluster/data/hg16/bed/blatFr1
pslCat -dir psl/N?_?????? | \
liftUp -type=.psl stdout \
/cluster/data/hg16/jkStuff/liftAll.lft warn stdin | \
pslSortAcc nohead chrom temp stdin
# 15 minutes ?
# Processed 855648 lines into 4 temp files
# Rename to correspond with tables and load into database:
ssh hgwdev
cd /cluster/data/hg16/bed/blatFr1/chrom
rm -f chr*_blatFr1.psl
foreach i (chr?{,?}{,_random}.psl)
set r = $i:r
echo $r
mv $i ${r}_blatFr1.psl
end
# Next assembly, lift fugu scaffolds to Fugu browser chrUn,
# so you can link to other browser. And don't need to load sequence
# liftUp -pslQ all.psl /cluster/data/fr1/fugu_v3.masked.lft warn chrom/*.psl
hgLoadPsl -noTNameIx hg16 *.psl
# $ featureBits hg16 blatFr1 refGene:CDS
# 12787423 bases of 2865248791 (0.446%) in intersection
# $ featureBits hg15 blatFugu refGene:CDS
# 12427544 bases of 2866466359 (0.434%) in intersection
# Edit trackDb.ra to include blatFr1
# NOTE: already in top-level trackDb.ra
# Make fugu /gbdb/ symlink and load Fugu sequence data.
# NOTE: don't need to do this in next assembly
mkdir /gbdb/hg16/fuguSeq
cd /gbdb/hg16/fuguSeq
ln -s /cluster/data/fr1/fugu_v3.masked.fa
# hide .tab file
cd /cluster/store2/tmp
hgLoadSeq hg16 /gbdb/hg16/fuguSeq/fugu_v3.masked.fa
# MAKE BLASTZ BEST SELF (RE-DONE - 2003-08-28 - Hiram)
# Pick up on this process below after chain and nets have been
# done. This run.3 business is obsolete
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChrom
mkdir -p /cluster/bluearc/hg16/bed/blastzSelf/axtChrom
# copy chrom axt's to bluearc, to avoid hitting fileserver too hard
cp -p *.axt /cluster/bluearc/hg16/bed/blastzSelf/axtChrom
ssh kk
cd /cluster/data/hg16/bed/blastzSelf
mkdir -p axtBest pslBest
mkdir run.3
cd run.3
# create script to filter files
cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastzSelf/S1.len \
/cluster/data/hg16/bed/blastzSelf/S2.len $4
'_EOF_'
# << this line makes emacs coloring happy
chmod +x doBestAxt
cd ../axtChrom
ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
cd ../run.3
# create template for cluster job
cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastzSelf/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastzSelf/axtBest/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastzSelf/pslBest/$(root1)_blastzBestMm3.psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
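# NOTE: the pslBest output names above mistakenly say blastzBestMm3
# rather than blastzBestSelf (copied from the mouse gsub); the error
# listing below reflects those names, and this run.3 approach was
# abandoned anyway in favor of the chain->net->netToAxt process.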
gensub2 chrom.list single gsub jobList
wc -l jobList
# 42 jobs
head jobList
ssh kkr1u00
cd /cluster/data/hg16/bed/blastzSelf/run.3
para create jobList
para try
para check
para push
# Completed: 38 of 42 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 1884s 31.41m 0.52h 0.02d 0.000 y
# IO & Wait Time: 8421s 140.34m 2.34h 0.10d 0.000 y
# Average job time: 271s 4.52m 0.08h 0.00d
# Longest job: 2061s 34.35m 0.57h 0.02d
# Submission to last job: 2277s 37.95m 0.63h 0.03d
# Some of these files are getting too big for this operation
# We will have to get back to these via the chains, nets and a
# netToAxt trick
# Problems:
# /cluster/data/hg16/bed/blastzSelf/axtBest/chr19.axt is empty
# /cluster/data/hg16/bed/blastzSelf/pslBest/chr19_blastzBestMm3.psl is empty
# Out of memory - request size 1564 bytes
# /cluster/data/hg16/bed/blastzSelf/axtBest/chr7.axt is empty
# /cluster/data/hg16/bed/blastzSelf/pslBest/chr7_blastzBestMm3.psl is empty
# Out of memory - request size 634045604 bytes
# /cluster/data/hg16/bed/blastzSelf/axtBest/chr1.axt is empty
# /cluster/data/hg16/bed/blastzSelf/pslBest/chr1_blastzBestMm3.psl is empty
# Out of memory - request size 984185908 bytes
# /cluster/data/hg16/bed/blastzSelf/axtBest/chr2.axt is empty
# /cluster/data/hg16/bed/blastzSelf/pslBest/chr2_blastzBestMm3.psl is empty
# Out of memory - request size 973662824 bytes
# Here is the replacement process for the above sequence
# Better yet, Jim says to be consistent, do all the chroms in
# this manner:
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChain
mkdir humanNet
mkdir ../axtNet
netSplit human.net humanNet
foreach n (humanNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt humanNet/$c.net chain/$c.chain \
/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
/cluster/bluearc/scratch/hg/gs.17/build34/bothMaskedNibs \
../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
mkdir -p /cluster/data/hg16/bed/blastzSelf/axtBest
cd /cluster/data/hg16/bed/blastzSelf/axtBest
ln -s ../axtNet/chr*.axt .
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestSelf.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestSelf.psl
echo "Done: ${c}_blastzBestSelf.psl"
end
# Load tables
ssh hgwdev
set base="/cluster/data/hg16/bed/blastzSelf"
set tbl="blastzBestSelf"
cd $base/pslBest
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
# check results
# After going through the chain->net->axt operation:
# featureBits hg16 blastzBestSelf
# 1388295977 bases of 2865248791 (48.453%) in intersection
# Hg15 doesn't have a BestSelf, gave this a try with the following
# result:
# featureBits hg15 blastzSelf
# Out of memory - request size 6 bytes
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg16/axtBestSelf
cd /gbdb/hg16/axtBestSelf
ln -s /cluster/data/hg16/bed/blastzSelf/axtNet/chr*.axt .
cd /cluster/data/hg16/bed/blastzSelf/axtNet
rm -f axtInfoInserts.sql
touch axtInfoInserts.sql
foreach f (/gbdb/hg16/axtBestSelf/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('hg16','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
# This table has already been created above
# hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
hgsql hg16 < axtInfoInserts.sql
# MAKE BLASTZ BEST SELF (NOT NECESSARY - NOT USEFUL - NOT NEEDED - NOT DONE)
# MAKING CHAIN SELF BLASTZ (DONE - 2003-08-27 - Hiram)
# MAKING CHAIN SELF BLASTZ (RE-DONE - 2003-09-04 - Hiram)
# 2003-09-04 - with dropped overlap axtChrom
# Run axtChain on little cluster
ssh kkr1u00
mkdir -p /cluster/data/hg16/bed/blastzSelf/axtChain/run1
cd /cluster/data/hg16/bed/blastzSelf/axtChain/run1
mkdir out chain
ls -1S /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# The -notQ_random (new argument to axtFilter) will omit any
# *_random from the query.
cat << '_EOF_' > doChain
#!/bin/csh
~/bin/i386/axtFilter -notQ_random $1 | axtChain stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/gs.17/build34/bothMaskedNibs $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
# edit jobList and remove the first one that does chr19
# It is a job that would fail anyway after more than an
# hour of run time. It will be done separately below
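    # one way to make that edit non-interactively (a sketch):
    #   grep -v 'chr19\.axt' jobList > jobList.no19
    #   mv jobList.no19 jobList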
para create jobList
# 41 jobs
para try
para push # ... etc ...
# Completed: 41 of 41 jobs
# CPU time in finished jobs: 27107s 451.78m 7.53h 0.31d 0.001 y
# IO & Wait Time: 16236s 270.60m 4.51h 0.19d 0.001 y
# Average job time: 1057s 17.62m 0.29h 0.01d
# Longest job: 4989s 83.15m 1.39h 0.06d
# Submission to last job: 240988s 4016.47m 66.94h 2.79d
# The chr19 recovery process:
ssh kk
mkdir -p /cluster/data/hg16/bed/blastzSelf/axtChain/run1.19
cd /cluster/data/hg16/bed/blastzSelf/axtChain/run1.19
cat << '_EOF_' > gsubQ
#LOOP
doChainQ.sh $(path2) $(path1) {check out line+ chain/$(root1).$(path2).chain} {check out line+ out/$(root1).$(path2).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChainQ.sh
#!/bin/sh
~/bin/i386/axtFilter -notQ_random -q=$1 $2 | axtChain stdin \
/cluster/store4/gs.17/build34/nib \
/cluster/store4/gs.17/build34/nib $3 > $4
'_EOF_'
# << this line makes emacs coloring happy
chmod +x doChainQ.sh
# This is a mistake, this should have been chr19.axt only
ls -1S /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/*.axt > input.lst
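    # what input.lst should have been (chr19 only), per the note above:
    #   ls -1S /cluster/bluearc/hg16/bed/blastzSelf/axtChromDropped/chr19.axt > input.lst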
pushd /cluster/data/hg16
ls -d ?{,?} | sed -e "s/^/chr/" | grep -v chr19 \
> /cluster/data/hg16/bed/blastzSelf/axtChain/run1.19/chrom19.lst
popd
mkdir out chain
gensub2 input.lst chrom19.lst gsubQ spec19
para create spec19
para try
para check
para push
    # ... etc ...
# Completed: 948 of 1050 jobs
# Crashed: 102 jobs
# CPU time in finished jobs: 45918s 765.30m 12.75h 0.53d 0.001 y
# IO & Wait Time: 1700328s 28338.80m 472.31h 19.68d 0.054 y
# Average job time: 1842s 30.70m 0.51h 0.02d
# Longest job: 13247s 220.78m 3.68h 0.15d
# Submission to last job: 13268s 221.13m 3.69h 0.15d
# the "crashed 102" jobs are empty chains.
# This mistakenly did them all, the input.lst should have been
# chr19 only.
# So, copy the chr19 results to the ../run1/chain result location
cp -p chain/chr19*.chain ../run1/chain
# now on the cluster server, sort chains
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# these steps take ~20 minutes
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg16/bed/blastzSelf/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_chainSelf $i
echo done $c
end
# DELIVER these chain files to hgdownload (2005-01-27 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChain/chain
gzip chr*.chain
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
cd /cluster/data/hg16/bed/blastzSelf/axtChain/chain
cp -p *.chain.gz /usr/local/apache/htdocs/goldenPath/hg16/vsSelf
# fixup README file, request push
# NET SELF BLASTZ (RE-DONE 2003-09-09 - DONE - 2003-08-27 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
/cluster/data/hg16/chrom.sizes ../preNet/$i
end
# This foreach loop will take about 15 min to execute.
cd ..
mkdir n1
cd preNet
# Probably OK to make this minSpace=10, used to be 1
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=10 \
/cluster/data/hg16/chrom.sizes \
/cluster/data/hg16/chrom.sizes ../n1/$n /dev/null
end
# The above takes about 5 minutes
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 200167424, utime 2489 s/100, stime 161
ssh hgwdev
cd /cluster/data/hg16/bed/blastzSelf/axtChain
~/bin/i386/netClass hNoClass.net hg16 hg16 human.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman \
-qNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInHuman
# If things look good do
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastzSelf/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg16 netSelf stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg16 syntenyNetSelf stdin
# Add entries for net and chain to human/hg16 trackDb
# MAKING SELF AXTTIGHT FROM AXTCHROM (DONE - 2003-09-09 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastzSelf/axtChrom
mkdir -p /cluster/data/hg16/bed/blastzSelf/axtTight
tcsh
foreach i (*.axt)
echo $i
subsetAxt $i /cluster/data/hg16/bed/blastzSelf/axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/90.mat 5000
end
# translate to psl
cd ../axtTight
mkdir -p ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightSelf.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastzSelf/pslTight
hgLoadPsl -noTNameIx hg16 chr*_blastzTightSelf.psl
# MAKING SELF SYNTENY - Can be done after Best (NEEDS TO BE REDONE 2003-09-09)
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/syntenySelf
cd /cluster/data/hg16/bed/syntenySelf
# Use the scripts that were already copied to ../syntenyMm3
    # The first one takes 3 to 4 hours.
../syntenyMm3/syntenicBest.pl -db=hg16 -table=blastzBestSelf > synBest.out 2>&1
XXXX - Running 2003-08-27 21:32
../syntenyMm3/smooth.pl
../syntenyMm3/joinsmallgaps.pl
../syntenyMm3/fillgap.pl -db=hg16 -table=blastzBestSelf
../syntenyMm3/synteny2bed.pl
# Load results
hgLoadBed hg16 syntenySelf ucsc100k.bed
# SGP GENE PREDICTIONS vs Mm4 (DONE - 2003-12-30 - Hiram)
mkdir -p /cluster/data/hg16/bed/sgp_mm4/download
cd /cluster/data/hg16/bed/sgp_mm4/download
foreach f (/cluster/data/hg16/?{,?}/chr?{,?}{,_random}.fa)
set chr = $f:t:r
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/$chr.gtf
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/$chr.prot
end
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/chrUn.gtf -O chrUn_random.gtf
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/chrUn.prot -O chrUn_random.prot
wget --timestamping \
http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307_x_mm4/SGP/readme
# Add missing .1 to protein id's
foreach f (*.prot)
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
end
cd ..
    # since this is a reload of this table, updating the data
    # from Mm3 to Mm4. First check what is there:
# featureBits hg16 sgpGene
# 39781330 bases of 2865248791 (1.388%) in intersection
# now drop that table, and reload
hgsql -e "drop table sgpGene;" hg16
# This used to be done with -exon=CDS but it will do the same
# thing _AND_ add stop codons when done with -gtf, so do this
# with -gtf
ldHgGene -gtf hg16 sgpGene download/*.gtf
# Read 42880 transcripts in 322086 lines in 39 files
# 42880 groups 39 seqs 1 sources 3 feature types
# 42880 gene predictions
hgsql -e "drop table sgpPep;" hg16
hgPepPred hg16 generic sgpPep download/*-fixed.prot
# featureBits hg16 sgpGene
# 39698249 bases of 2865248791 (1.386%) in intersection
# featureBits hg15 sgpGene
# 40395614 bases of 2866466359 (1.409%) in intersection
# SGP GENE PREDICTIONS - Mm3 (DONE - 2003-09-14 - Hiram - to be verified)
mkdir -p /cluster/data/hg16/bed/sgp/download
cd /cluster/data/hg16/bed/sgp/download
foreach f (/cluster/data/hg16/?{,?}/chr?{,?}{,_random}.fa)
set chr = $f:t:r
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/$chr.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/$chr.prot
end
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/chrUn.gtf -O chrUn_random.gtf
wget http://genome.imim.es/genepredictions/H.sapiens/golden_path_200307/SGP/chrUn.prot -O chrUn_random.prot
# Add missing .1 to protein id's
foreach f (*.prot)
perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
end
cd ..
ldHgGene hg16 sgpGene download/*.gtf -exon=CDS
# Read 43109 transcripts in 323911 lines in 39 files
# 43109 groups 39 seqs 1 sources 3 feature types
# 43109 gene predictions
hgPepPred hg16 generic sgpPep download/*-fixed.prot
# featureBits hg16 sgpGene
# 39781330 bases of 2865248791 (1.388%) in intersection
# featureBits hg15 sgpGene
# 40395614 bases of 2866466359 (1.409%) in intersection
# SGP GENES (UPDATE 1/18/2006)
    # sgpPep table dropped, replaced by hgc generated protein seq in browser
# LOAD NCI60 (DONE: Fan 10/20/2003)
    ssh hgwdev
cd /projects/cc/hg/mapplots/data/NCI60/dross_arrays_nci60/
mkdir hg16
cd hg16
findStanAlignments hg16 ../BC2.txt.ns ../../image/cumulative_plates.011204.list.human hg16.image.psl >& hg16.image.log
cp ../experimentOrder.txt ./
sed -e 's/ / \.\.\//g' < experimentOrder.txt > epo.txt
egrep -v unknown hg16.image.psl > hg16.image.good.psl
stanToBedAndExpRecs hg16.image.good.psl hg16.nci60.exp hg16.nci60.bed `cat epo.txt`
hgsql hg16 < ../../scripts/nci60.sql
echo "load data local infile 'hg16.nci60.bed' into table nci60" | hgsql hg16
mkdir /cluster/store4/gs.17/build34/bed/nci60
mv hg16.nci60.bed /cluster/store4/gs.17/build34/bed/nci60
rm *.psl
# LOAD AFFYRATIO [GNF in progress jk Sept 19, 2003]
# LOAD AFFYRATIO U95Av2 sequences [DONE hartera Feb 2, 2004]
# Used consensus/exemplar sequences instead of target sequences
# LOAD AFFYRATIO [in progress, Feb 4, 2004]
# changed pslReps parameters as minAli = 0.97 was too stringent
    # Set up cluster job to align consensus/exemplars to hg16
ssh kkr1u00
cd /cluster/data/hg16/bed
rm -rf affyGnf.2004-02-04/
mkdir affyGnf.2004-02-04
cd affyGnf.2004-02-04/
mkdir -p /iscratch/i/affy
cp /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa /iscratch/i/affy
iSync
ssh kk
cd /cluster/data/hg16/bed/affyGnf.2004-02-04
ls -1 /iscratch/i/affy/HG-U95Av2_all.fa > affy.lst
ls -1 /scratch/hg/gs.17/build34/trfFa/ > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.17/build34/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
# Actually do the job with usual para try/check/push/time etc.
# para time 2/4/04
#Completed: 491 of 491 jobs
#CPU time in finished jobs: 8344s 139.06m 2.32h 0.10d 0.000 y
#IO & Wait Time: 2281s 38.02m 0.63h 0.03d 0.000 y
#Average job time: 22s 0.36m 0.01h 0.00d
#Longest job: 289s 4.82m 0.08h 0.00d
#Submission to last job: 388s 6.47m 0.11h 0.00d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU95.psl
pslSort dirs raw.psl tmp psl
# change filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least 95% identity in aligned region.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU95.psl ../../jkStuff/liftAll.lft warn contig.psl
# Merge with spot data and load into database. added -chip flag to
# affyPslAndAtlasToBed to allow correct parsing
ssh hgwdev
cd /cluster/data/hg16/bed/affyGnf.2004-02-04
/cluster/home/sugnet/bin/i386/affyPslAndAtlasToBed -chip=U95Av2 affyU95.psl /projects/compbiodata/microarray/affyGnf/human_atlas_U95_gnf.noquotes.txt affyRatio.bed affyRatio.exr >& affyPslAndAtlasToBed.log
hgLoadBed -sqlTable=$HOME/src/hg/lib/affyRatio.sql hg16 affyRatio affyRatio.bed
# This affyU95 load was later changed to eliminate the long names
# hgLoadPsl hg16 affyU95.psl
# by the following:
sed -e "s/U95Av2://" affyU95.psl | sed -e "s/;//" > affyU95shortQname.psl
hgLoadPsl hg16 -table=affyU95 affyU95shortQname.psl
# Clean up
rm -r psl tmp err affyRatio.bed affyRatio.exr bed.tab scores.tab *.debug batch.bak contig.psl raw.psl
# LOAD AffyUclaRatio [in progress jk Sept 19, 2003]
# LOAD AffyUclaRatio and AFFY U133A and U133B sequences [DONE hartera Feb 3, 2004]
# Used consensus/exemplar sequences instead of target sequences
# Set up cluster job to align consensus/exemplars to hg16
ssh kkr1u00
cd /cluster/data/hg16/bed
rm -rf affyUcla.2004-02-04/
mkdir affyUcla.2004-02-04
cd affyUcla.2004-02-04/
mkdir -p /iscratch/i/affy
cp /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa /iscratch/i/affy
iSync
ssh kk
cd /cluster/data/hg16/bed/affyUcla.2004-02-04/
ls -1 /iscratch/i/affy/HG-U133AB_all.fa > affy.lst
ls -1 /scratch/hg/gs.17/build34/trfFa/ > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.17/build34/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
# Actually do the job with usual para try/check/push/time etc.
# on 2/4/04:
#Completed: 491 of 491 jobs
#CPU time in finished jobs: 23137s 385.61m 6.43h 0.27d 0.001 y
#IO & Wait Time: 23057s 384.29m 6.40h 0.27d 0.001 y
#Average job time: 94s 1.57m 0.03h 0.00d
#Longest job: 617s 10.28m 0.17h 0.01d
#Submission to last job: 747s 12.45m 0.21h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU133.psl.
pslSort dirs raw.psl tmp psl
# change filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least 95% identity in aligned region.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyU133.psl ../../jkStuff/liftAll.lft warn contig.psl
# Merge with spot data and load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/affyUcla.2004-02-04/
# added code to hashPsls to process shorter Affy probe set names:
# it assumes that names have 2 colons, but when shortened to fit in the seq
# database, there is only 1.
# e.g. full name: "consensus:HG-U133A:212933_x_at;" short name: "HG-U133A:212933_x_at;"
affyUclaMergePslData -pslFile=affyU133.psl -affyFile=/projects/compbio/data/microarray/affyUcla/data/030602_ucla_normal_human_tissue_snapshot.txt -bedOut=affyUcla.bed -expRecordOut=affyUcla.expRecords -expFile=/projects/compbio/data/microarray/affyUcla/data/expNames -toDiffFile=toDiff.txt
hgLoadBed -sqlTable=$HOME/src/hg/lib/affyUcla.sql hg16 affyUcla affyUcla.bed
hgLoadPsl hg16 affyU133.psl
# Clean up
rm -r psl tmp err affyUcla.bed affyUcla.expRecords bed.tab *.debug batch.bak contig.psl raw.psl
# Add in sequence data for affyU95 and affyU133 tracks.
# Copy probe sequence to /gbdb if it isn't already
mkdir -p /gbdb/hgFixed/affyProbes
cd /gbdb/hgFixed/affyProbes
ln -s /projects/compbio/data/microarray/affyGnf/sequences/HG-U95/HG-U95Av2_all.fa .
ln -s /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa .
# use perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
# in the HG-U95Av2_all.fa sequence file
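# e.g., a sketch (run against the source file the /gbdb symlink points to):
# perl -pi.bak -e 's/;/ /' HG-U95Av2_all.fa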
# reload sequences with "U95Av2" prefix removed so acc matches name used
# in other dependent tables for affyU95Av2 only
hgLoadSeq -abbr=U95Av2: hg16 /gbdb/hgFixed/affyProbes/HG-U95Av2_all.fa
hgLoadSeq hg16 /gbdb/hgFixed/affyProbes/HG-U133AB_all.fa
# QA repush 2006-02-08 seq/extFile to correct mismatched ID for affyU133 alignment data (Jen)
# Load AFFYUCLANORM, extended version of affyUcla track. Hopefully
# final freeze of data set.
mkdir -p ~sugnet/store1/hg16/affyUcla
cd ~sugnet/store1/hg16/affyUcla
ssh kk
cd /cluster/store1/sugnet/hg16/affyUcla
cp /projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all ./
ls -1 /scratch/hg/gs.17/build34/trfFa/* > allctg.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
echo "HG-U133AB_all" > affy.lst
gensub2 allctg.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
# Checking input files
# 491 jobs written to batch
# updated job database on disk
para push
# Wait until jobs run...
exit
pslSort dirs hg16.affyU133AB_all.psl tmp psl
# Lots of messages
# writing hg16.affyU133AB_all.psl
# Cleaning up temp files
wc hg16.affyU133AB_all.psl
# 60962 1280141 13677509 hg16.affyU133AB_all.psl
ls /cluster/data/hg16/jkStuff/liftAll.lft
# /cluster/data/hg16/jkStuff/liftAll.lft
liftUp hg16.affyU133AB_all.lifted.psl /cluster/data/hg16/jkStuff/liftAll.lft warn hg16.affyU133AB_all.psl
# Got 491 lifts in /cluster/data/hg16/jkStuff/liftAll.lft
# Lifting hg16.affyU133AB_all.psl
pslReps -minCover=0.5 -sizeMatters -minAli=0.97 -nearTop=0.005 hg16.affyU133AB_all.lifted.psl hg16.affyU133AB_all.lifted.pslReps.psl out.psr
# Processing hg16.affyU133AB_all.lifted.psl to hg16.affyU133AB_all.lifted.pslReps.psl and out.psr
# Processed 60957 alignments
affyUclaMergePslData -pslFile=hg16.affyU133AB_all.lifted.pslReps.psl -affyFile=/projects/compbio/data/microarray/affyUcla/data/feature_biomaterial_chip_logratios_formatForTrack.txt -bedOut=hg16.affyUcla.bed -expRecordOut=hg16.affyUcla.expRecords -expFile=/projects/compbio/data/microarray/affyUcla/data/expNames.sorted.txt
# Reading psls from: hg16.affyU133AB_all.lifted.pslReps.psl
# Outputing beds:
# ............................................
# Freeing Memory.
# Done.
addUclaAnnotations.pl hg16.affyUcla.expRecords /projects/compbio/data/microarray/affyUcla/data/normal_tissue_database_annotations2.txt > hg16.affyUcla.annotations.expRecords
# Load the databases
cp ~/jk/hg/lib/affyRatio.sql ./
sed -e 's/affyRatio/affyUclaNorm/' < affyRatio.sql > affyUclaNorm.sql
# Just use the hgLoadBed program specifying sqlFile
hgLoadBed hg16 affyUclaNorm hg16.affyUcla.bed -sqlTable=affyUclaNorm.sql
# Reading hg16.affyUcla.bed
# Loaded 44446 elements of size 15
# Sorted
# Saving bed.tab
# Loading hg16
cp ~/jk/hg/lib/expRecord.sql ./
sed -e 's/expRecord/affyUclaNormExps/' < expRecord.sql > affyUclaNormExps.sql
hgFixedS -A < affyUclaNormExps.sql
echo "load data local infile 'hg16.affyUcla.annotations.expRecords' into table affyUclaNormExps" | hgFixedS -A
# Cleanup
rm HG-U133AB_all
# DO FAMILY BROWSER VERSIONS OF AFFYUCLANORMAL TRACK (In Progress -jk 3/2/2004)
# (This is suspended because GNF Gene Atlas data is available and public!)
# Create affyU133Orient table data
ssh eieio
cd /cluster/data/hg16/bed/affyUcla.2004-02-04
pslSortAcc nohead chrom temp affyU133.psl
rm -r temp
cd chrom
#This loop takes about 15 minutes
foreach i (*.psl)
polyInfo $i /cluster/data/hg16/nib/$i:r.nib \
/projects/compbio/data/microarray/affyUcla/sequences/HG-U133AB_all.fa \
$i:r.polyInfo
echo done $i
end
cat *.polyInfo > ../affyU133OrientInfo.bed
rm *.polyInfo
# Load orientation table data
ssh hgwdev
cd /cluster/data/hg16/bed/affyUcla.2004-02-04
sed 's/mrnaOrientInfo/affyU133OrientInfo/' \
$HOME/kent/src/hg/lib/mrnaOrientInfo.sql > affyU133OrientInfo.sql
hgLoadBed hg16 affyU133OrientInfo affyU133OrientInfo.bed \
-sqlTable=affyU133OrientInfo.sql > /dev/null
# Do clustering (this takes about 10 minutes to run)
clusterRna hg16 u133Cluster.bed /dev/null -noEst -noRefSeq -group=u133Group.tab -mRNAOrient=affyU133OrientInfo -rna=affyU133
# GNF ATLAS 2 [Done jk 3/29/2004]
# Align probes from GNF1H chip.
ssh kk
cd /cluster/data/hg16/bed
mkdir -p geneAtlas2/run/psl
cd geneAtlas2/run
mkdir -p /cluster/bluearc/geneAtlas2
cp /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /cluster/bluearc/geneAtlas2
ls -1 /scratch/hg/gs.17/build34/trfFa/ > genome.lst
ls -1 /cluster/bluearc/geneAtlas2/gnf1h.fa > mrna.lst
echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/11.ooc /scratch/hg/gs.17/build34/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub
gensub2 genome.lst mrna.lst gsub spec
para create spec
para try
para check
para push
para time
#Completed: 491 of 491 jobs
#CPU time in finished jobs: 10718s 178.63m 2.98h 0.12d 0.000 y
#IO & Wait Time: 1499s 24.99m 0.42h 0.02d 0.000 y
#Average job time: 25s 0.41m 0.01h 0.00d
#Longest job: 652s 10.87m 0.18h 0.01d
#Submission to last job: 723s 12.05m 0.20h 0.01d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create gnf1h.psl.
pslSort dirs raw.psl tmp psl
pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp ../affyGnf1h.psl ../../../jkStuff/liftAll.lft warn contig.psl
rm -r contig.psl raw.psl psl
# Load probes and alignments from GNF1H into database.
ssh hgwdev
cd /cluster/data/hg16/bed/geneAtlas2
ln -s /projects/compbio/data/microarray/geneAtlas2/human/gnf1h.fa /gbdb/hgFixed/affyProbes
hgLoadPsl hg16 affyGnf1h.psl
hgLoadSeq hg16 /gbdb/hgFixed/affyProbes/gnf1h.fa
grep -v U133B ../affyUcla.2004-02-04/affyU133.psl | sed 's/exemplar://' \
| sed 's/consensus://' \
| sed 's/HG-U133A://' | sed 's/;//' > affyU133A.psl
hgMapMicroarray gnfAtlas2.bed hgFixed.gnfHumanAtlas2MedianRatio \
affyU133A.psl /cluster/data/hg16/bed/geneAtlas2/affyGnf1h.psl
# Note that the ~11,000 unmapped records are from all-N sequences.
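# A sanity check might look like this (a sketch; faCount reports
# per-sequence base counts, so all-N records have N count == length):
# faCount /cluster/bluearc/geneAtlas2/gnf1h.fa | awk '$7 == $2'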
hgLoadBed hg16 gnfAtlas2 gnfAtlas2.bed
# GENE BOUNDS (RNACLUSTER) (DONE 10-05-03 Chuck)
# Create rnaCluster table (depends on {est,mrna}OrientInfo created but not checked in)
cd /cluster/store4/gs.17/build34/
# Create a list of accessions that come from RAGE libraries and need to
# be excluded. (added by Chuck Wed Nov 27 13:09:07 PST 2002)
~/kent/src/hg/geneBounds/clusterRna/generateRageAccList.csh hg16 \
rage.libs
mkdir -p bed/rnaCluster/chrom
# Exclude accesions in the RAGE file
foreach f (?{,?}/chr*.fa)
set c = $f:t:r
set out = bed/rnaCluster/chrom/$c.bed
echo clusterRna -mrnaExclude=hg16.rage.libs hg16 /dev/null $out -chrom=$c
clusterRna -mrnaExclude=hg16.rage.libs hg16 /dev/null $out -chrom=$c
end
cd bed/rnaCluster
hgLoadBed hg16 rnaCluster chrom/*.bed > /dev/null
# MAKE UNIGENE ALIGNMENTS (DONE - 2003-10-09 - Hiram)
# Download of the latest UniGene version is now automated by a
# cron job -- see /cluster/home/angie/crontab ,
# /cluster/home/angie/unigeneVers/unigene.csh .
# If hgwdev gets rebooted, that needs to be restarted... maybe there's
# a more stable place to set up that cron job.
# substitute XXX -> the uniGene version used by SAGE, if building the
# uniGene/SAGE track; or just the latest uniGene version in
# /projects/cc/hg/sugnet/uniGene/ , if doing uniGene alignments only.
# set Version = XXX
set Version = 162    # (bash: export Version=162)
cd /projects/cc/hg/sugnet/uniGene/uniGene.$Version
gunzip Hs.seq.uniq.gz Hs.data.gz
../countSeqsInCluster.pl Hs.data counts.tab
../parseUnigene.pl Hs.seq.uniq Hs.seq.uniq.simpleHeader.fa leftoverData.tab
# Distribute UniGene sequence to /iscratch/i/ (kkstore can see /projects)
ssh kkstore
set Version = 162 # same as above
mkdir -p /iscratch/i/uniGene.$Version
cp -p \
/projects/cc/hg/sugnet/uniGene/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
/iscratch/i/uniGene.$Version
ssh kkr1u00
~kent/bin/iSync
ssh kk
set Version = 162 # same as above
mkdir -p /cluster/data/hg16/bed/uniGene.$Version
cd /cluster/data/hg16/bed/uniGene.$Version
ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > allctg.lst
ls -1S /iscratch/i/uniGene.$Version/Hs.seq.uniq.simpleHeader.fa \
> uniGene.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 allctg.lst uniGene.lst template.sub para.spec
para create para.spec
mkdir psl
para try
para check
para push
# Checking finished jobs
# Completed: 491 of 491 jobs
# CPU time in finished jobs: 39689s 661.49m 11.02h 0.46d 0.001 y
# IO & Wait Time: 38269s 637.81m 10.63h 0.44d 0.001 y
# Average job time: 159s 2.65m 0.04h 0.00d
# Longest job: 1805s 30.08m 0.50h 0.02d
# Submission to last job: 1972s 32.87m 0.55h 0.02d
# ssh eieio
set Version = 162 # same as above
cd /cluster/data/hg16/bed/uniGene.$Version
pslSort dirs raw.psl tmp psl >& pslSort.log
liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
| pslReps -minCover=0.2 -sizeMatters -minAli=0.965 -nearTop=0.002 \
stdin hg16.uniGene.lifted.pslReps.psl /dev/null
# use hg16.uniGene.lifted.pslReps.psl for building SAGE track (next).
# LOAD SAGE DATA (TBD)
ssh hgwdev
cd ~/kent/src/hg/sage
make
# XXX = uniGene build for which SAGE was built -- not necessarily current!
# Figure out the build number by peeking at this file:
wget -O - ftp://ftp.ncbi.nih.gov/pub/sage/map/info.txt 2> /dev/null
# Or, look at the contents of this directory:
ls /projects/cc/hg/sugnet/uniGene
# set Version = XXX
set Version=162
mkdir /projects/cc/hg/sugnet/sage/sage.$Version
cd /projects/cc/hg/sugnet/sage/sage.$Version
ncftp ftp://ftp.ncbi.nih.gov/pub/sage
mget -R map/readme.txt map/info.txt extr info map/Hs
quit
# That downloaded about 380 Mb of data
mkdir map
mv Hs map
cd map/Hs/NlaIII
unzip -j SAGEmap_tag_ug-rel.zip
cd ../../../extr/
../../scripts/summarizeCounts.pl expCounts.tab ./SAGE_*
../../scripts/countGenesPerTag.pl expCounts.tab allTags.count.tab
../../scripts/createArraysForTags.pl allTags.count.tab tagExpArrays.tab \
./SAGE_*
../../scripts/countsPerExp.pl expCounts.tab expList.tab
cd ../map/Hs/NlaIII/
cat << '_EOF_' > /tmp/t.pl
#!/usr/local/bin/perl
while (<>) {
chomp($_);
@p = split(/\t/, $_);
print "$p[2]\t$p[3]\t$p[0]\n";
}
'_EOF_'
chmod +x /tmp/t.pl
cat SAGEmap_tag_ug-rel | /tmp/t.pl | sort | sed -e 's/ /_/g' \
> SAGEmap_ug_tag-rel_Hs
cd ../../../extr
createSageSummary ../map/Hs/NlaIII/SAGEmap_ug_tag-rel_Hs \
tagExpArrays.tab sageSummary.sage
# Create the uniGene alignments
# /cluster/data/hg16/uniGene/hg16.uniGene.lifted.pslReps.psl
# -- see "MAKE UNIGENE ALIGNMENTS" above
# continuing from above, we are already in this extr directory
cd /projects/cc/hg/sugnet/sage/sage.$Version/extr
addAveMedScoreToPsls \
/cluster/data/hg16/bed/uniGene.$Version/hg16.uniGene.lifted.pslReps.psl \
sageSummary.sage uniGene.wscores.bed
hgLoadBed hg16 uniGene_2 uniGene.wscores.bed
hgsql hg16 < ~kent/src/hg/lib/sage.sql
echo "load data local infile 'sageSummary.sage' into table sage" \
| hgsql hg16
cd ../info
../../scripts/parseRecords.pl ../extr/expList.tab > sageExp.tab
hgsql hg16 < ~/kent/src/hg/lib/sageExp.sql
echo "load data local infile 'sageExp.tab' into table sageExp" | hgsql hg16
# update ~/kent/src/hg/makeDb/trackDb/human/hg16/uniGene_2.html
# with current uniGene date.
# MAKING FOLDUTR TABLES (DONE - jk - 2003-10-14, REDONE jk 2004-04-07)
# First set up directory structure and extract UTR sequence on hgwdev
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/rnaStruct
cd /cluster/data/hg16/bed/rnaStruct
mkdir -p utr3/split utr5/split utr3/fold utr5/fold
utrFa hg16 knownGene utr3 utr3/utr.fa
utrFa hg16 knownGene utr5 utr5/utr.fa
# Split up files and make files that define job.
ssh kk
cd /cluster/data/hg16/bed/rnaStruct
faSplit sequence utr3/utr.fa 50000 utr3/split/s
faSplit sequence utr5/utr.fa 50000 utr5/split/s
ls -1 utr3/split > utr3/in.lst
ls -1 utr5/split > utr5/in.lst
cd utr3
cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
cp gsub ../utr5
# Do cluster run for 3' UTRs
gensub2 in.lst single gsub spec
para create spec
para try
para push
#CPU time in finished jobs: 842416s 14040.26m 234.00h 9.75d 0.027 y
#IO & Wait Time: 78541s 1309.02m 21.82h 0.91d 0.002 y
#Average job time: 32s 0.53m 0.01h 0.00d
#Longest job: 3318s 55.30m 0.92h 0.04d
#Submission to last job: 4282s 71.37m 1.19h 0.05d
#
#Completed: 37250 of 37250 jobs
#CPU time in finished jobs: 1028594s 17143.24m 285.72h 11.91d 0.033 y
#IO & Wait Time: 125807s 2096.78m 34.95h 1.46d 0.004 y
#Average job time: 31s 0.52m 0.01h 0.00d
#Longest job: 3396s 56.60m 0.94h 0.04d
#Submission to last job: 4422s 73.70m 1.23h 0.05d
# Do cluster run for 5' UTRs
cd ../utr5
gensub2 in.lst single gsub spec
para create spec
para try
para push
#Completed: 25808 of 25808 jobs
#CPU time in finished jobs: 51700s 861.67m 14.36h 0.60d 0.002 y
#IO & Wait Time: 114430s 1907.16m 31.79h 1.32d 0.004 y
#Average job time: 6s 0.11m 0.00h 0.00d
#Longest job: 1044s 17.40m 0.29h 0.01d
#Submission to last job: 1164s 19.40m 0.32h 0.01d
#
#Completed: 29770 of 29770 jobs
#CPU time in finished jobs: 100407s 1673.45m 27.89h 1.16d 0.003 y
#IO & Wait Time: 93019s 1550.32m 25.84h 1.08d 0.003 y
#Average job time: 6s 0.11m 0.00h 0.00d
#Longest job: 2209s 36.82m 0.61h 0.03d
#Submission to last job: 2596s 43.27m 0.72h 0.03d
# Load database
ssh hgwdev
cd /cluster/data/hg16/bed/rnaStruct/utr5
hgLoadRnaFold hg16 foldUtr5 fold
cd ../utr3
hgLoadRnaFold hg16 foldUtr3 fold
# Clean up
rm -r split fold err batch.bak
cd ../utr5
rm -r split fold err batch.bak
# TBA (Webb Miller's Threaded Blockset Aligner) Alignments (CFTR region) 2003-10-17 kate
# 9-way alignment: human, chimp, baboon, mouse, rat, dog, cat, cow, pig
# Using sequences from browser (human, mouse, rat), and from
# Elliot Margulies at NISC (via Webb)
# unrolled sequences and ran TBA in /cluster/data/nisc/targets/cftr/tba9Mammal
ssh kksilo
mkdir -p /cluster/data/hg16/bed/nisc/cftr
ln -s /cluster/data/nisc/targets/cftr/tba9Mammal/human.out \
/cluster/data/hg16/bed/nisc/cftr/tba9Mammal.maf
# setup external files for database reference
ssh hgwdev
set table = tba9MammalCFTR
mkdir -p /gbdb/hg16/$table
cd /gbdb/hg16/$table
ln -s /cluster/data/hg16/bed/nisc/cftr/tba9Mammal.maf tba.maf
# load into database
cd /cluster/data/hg16/bed/nisc/cftr
/cluster/bin/i386/hgLoadMaf -WARN hg16 $table
# TBA with Non-mammalian species included (Fugu & Chicken) 2003-10-20 kate
ssh hgwdev
ln -s /cluster/data/nisc/targets/cftr/CFTR.non-mammal/human.out \
/cluster/data/hg16/bed/nisc/cftr/tbaFishBird.maf
set table = tbaFishBirdCFTR
mkdir -p /gbdb/hg16/$table
cd /gbdb/hg16/$table
ln -s /cluster/data/hg16/bed/nisc/cftr/tbaFishBird.maf tba.maf
cd /cluster/data/hg16/bed/nisc/cftr
/cluster/bin/i386/hgLoadMaf -WARN hg16 $table
# 1072 warnings (mostly score=0's, a few negative scores)
# 4377 rows
# TBA 25-species CFTR region (DONE 2003-10-28 kate)
# run in /cluster/data/nisc/targets/cftr/25way, using makefile
ssh hgwdev
ln -s /cluster/data/nisc/targets/cftr/25way/human.maf \
/cluster/data/hg16/bed/nisc/cftr/tba25.maf
set table = tba25CFTR
mkdir -p /gbdb/hg16/$table
cd /gbdb/hg16/$table
ln -s /cluster/data/hg16/bed/nisc/cftr/tba25.maf tba.maf
cd /cluster/data/hg16/bed/nisc/cftr
/cluster/bin/i386/hgLoadMaf -WARN hg16 $table
# 22267 rows
# 24 warnings
# MAKE HG16-PANTRO1 MAF FOR MULTIZ/TBA (DONE 3/8/04 angie)
ssh kolossus
mkdir /cluster/data/hg16/bed/blastz-blat.panTro1.lifted
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted
# use the combined blastz-blat reciprocal best human-pt0 chain, but
# assign unique IDs:
chainSort /cluster/data/pt0/bed/blastz-blatHg16/human.best.2.chain stdout \
| chainMergeSort stdin \
| chainSplit pt0RBestChain stdin
# re-net with the new IDs:
mkdir pt0RBestNet
foreach f (pt0RBestChain/*.chain)
echo chaining $f
chainNet $f /cluster/data/hg16/chrom.sizes \
/cluster/data/pt0/scaffold.sizes pt0RBestNet/$f:t:r.net /dev/null
end
# Now lift chain to panTro1 coords:
mkdir rBestChain
foreach f (pt0RBestChain/*.chain)
liftUp -chainQ rBestChain/$f:t \
/cluster/data/panTro1/jkStuff/scaffolds.lft warn $f
end
# re-net with panTro1 coords (liftUp -netQ doesn't like - strand lifting):
mkdir rBestNet
foreach f (rBestChain/*.chain)
echo chaining $f
chainNet $f /cluster/data/hg16/chrom.sizes \
/cluster/data/panTro1/chrom.sizes rBestNet/$f:t:r.net /dev/null
end
# make axt and maf from the hg16-panTro1 net:
mkdir axtRBestNet mafRBestNet
foreach f (rBestNet/chr*.net)
set chr = $f:t:r
netToAxt $f rBestChain/$chr.chain /cluster/data/hg16/nib \
/cluster/data/panTro1/nib stdout \
| axtSort stdin axtRBestNet/$chr.axt
axtToMaf axtRBestNet/$chr.axt /cluster/data/hg16/chrom.sizes \
/cluster/data/panTro1/chrom.sizes mafRBestNet/$chr.maf \
-tPrefix=hg16. -qPrefix=panTro1.
end
# copy reciprocal net axt's for download (2004-10-04 kate)
cd axtRBestNet
gzip *.axt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg16/vsPanTro1
mkdir axtRBestNet
cd axtRBestNet
cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/axtRBestNet/*.gz .
md5sum *.gz > md5sum.txt
# load renumbered chains into database (2004-03-14 kate)
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBestChain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_rBestChainPanTro1 $i
echo done $c
end
# save for download (2004-05-14 kate)
ssh kksilo
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBestChain
chainMergeSort -saveId *.chain > ../rBest.chain
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/rBestChain
set dir = /usr/local/apache/htdocs/goldenPath/hg16/vsPanTro1
mkdir -p $dir
cp -p ../rBest.chain $dir/human.best.chain
cd $dir
gzip *.chain
# copy README file
# load net into database (2004-03-14 kate)
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted
cat rBestNet/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
netClass noClass.net hg16 panTro1 human.net
netFilter -chimpSyn human.net > rBest.net
hgLoadNet -warn hg16 rBestNetPanTro1 rBest.net
# EXPERIMENT: TBA WHOLE CHROM 5 SPECIES (DONE ENOUGH 3/8/04 angie)
# Put 2-ways in /cluster/bluearc
ssh eieio
mkdir /cluster/bluearc/hg16/tba
mkdir /cluster/bluearc/hg16/tba/{hp,hg}
cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/chr*.maf \
/cluster/bluearc/hg16/tba/hp
# hg16-mm3 already in /cluster/bluearc/hg16/bed/blastz.mm3/mafNet300
# hg16-rn3 already in /cluster/bluearc/hg16/bed/blastz.rn3/mafNet300
cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf \
/cluster/bluearc/hg16/tba/hg
ssh kolossus
mkdir /cluster/data/hg16/bed/tbaExperiment
cd /cluster/data/hg16/bed/tbaExperiment
# tba needs to run multiz, so make sure they're in $PATH:
set path = (/cluster/bin/penn $path)
# Try just one chromosome:
set chr = chr16
# tba needs filenames to correspond to its tree input, so make links to
# maf and fasta:
rm -f human.chimp.maf human.mouse.maf human.rat.maf human.chicken.maf \
human
mafSort /cluster/bluearc/hg16/tba/hp/$chr.maf > human.chimp.maf
mafSort /cluster/bluearc/hg16/bed/blastz.mm3/mafNet300/$chr.mm3.maf > \
human.mouse.maf
mafSort /cluster/bluearc/hg16/bed/blastz.rn3/mafNet300/$chr.rn3.maf > \
human.rat.maf
mafSort /cluster/bluearc/hg16/tba/hg/$chr.hg.maf > human.chicken.maf
ln -s /cluster/data/hg16/?{,?}/$chr.fa human
tba "(((human chimp) (mouse rat)) chicken)" \
human.chimp.maf human.mouse.maf human.rat.maf human.chicken.maf
# Doh -- looks like tba wants *all* pairwise inputs, and how do we
# tell which rat-chicken alignments to include for a given human chr??
# The error that tba is dying with is this:
# pair2tb.v4: alignments of human out of order around 172596-175110
# ... even though inputs are sorted...? Oh well, clean up:
rm human*
rm -r /cluster/bluearc/hg16/tba/
# CREATING KNOWNtOsUPER (which enables superFamily stuff in hgNear/hgGene)
# First see if need to update superfamily data from
# ftp server at supfam.mrc-lmb.cam.ac.uk following instructions
# in /cluster/store1/superFamily/genomes/README.ucsc. Then
# make sure that knownToEnsembl and ensGtp tables are created, then:
zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper hg16 hs stdin
# BLASTZ CHICKEN (done, 11/3/2003, Adam)
# (adapted from BLASTZ mouse/rat, above)
# NOTE: this first time we're using the contigs that Terry has
# installed at /cluster/bluearc/gg0 (see fa and split100
# subdirectories). When we have an assembly, things should be able to
# proceed more as with mouse and rat
ssh kk
mkdir -p /cluster/data/hg16/bed/blastz.gg0
cd /cluster/data/hg16/bed/blastz.gg0
# first it looks like we need to run TRF on the contigs (realizing
# this on second time through!)
mkdir trf
cd trf
rm -rf jobList
foreach file (/cluster/bluearc/gg0/split100/*.fa)
set root=$file:t:r
echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $file /dev/null -bedAt=/cluster/data/hg16/bed/blastz.gg0/trf/${root}.bed -tempDir=/tmp" >> jobList
end
#(run jobList on cluster) -- took 2.5 min.
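# i.e., roughly (standard parasol usage assumed; jobList is in this trf dir):
# para create jobList
# para try
# para push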
# add new softmasking to reflect TRF output
mkdir /cluster/bluearc/gg0/split100_with_trf
rm -rf jobList
foreach file (/cluster/bluearc/gg0/split100/*.fa)
set root=$file:t:r
echo "/cluster/bin/i386/maskOutFa -softAdd $file /cluster/data/hg16/bed/blastz.gg0/trf/${root}.bed /cluster/bluearc/gg0/split100_with_trf/${root}.fa" >> jobList
end
# (run jobList on cluster) -- took <1 min.
# now set up for BLASTZ (picking up with instructions above for
# mouse and rat)
cat << '_EOF_' > DEF
# chicken vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Q=/cluster/data/penn/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Chicken
SEQ2_DIR=/cluster/bluearc/gg0/split100_with_trf
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=
SEQ2_LAP=
BASE=/cluster/store4/gs.17/build34/bed/blastz.gg0
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place
DS=`date -I`
cp DEF ~angie/hummus/DEF.gg0-hg16.$DS
# source the DEF file to establish environment for following commands
. ./DEF
# follow the next set of directions slavishly
mkdir -p $BASE/run
# give up on avoiding angie's directories
# tcl script
# creates xdir.sh and joblist run/j
~angie/hummus/make-joblist $DEF > $BASE/run/j
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
sh $BASE/xdir.sh
cd $BASE/run
# now edit j to prefix path to executable name
# NOTE: we should have a controlled version of schwartz bin executables
sed -e 's#^#/cluster/bin/penn/#' j > j2
wc -l j*
head j2
# make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
para try
para check
para push
# ... etc ...
#Completed: 33561 of 33561 jobs
#CPU time in finished jobs: 11426279s 190437.98m 3173.97h 132.25d 0.362 y
#IO & Wait Time: 212940s 3549.01m 59.15h 2.46d 0.007 y
#Average job time: 347s 5.78m 0.10h 0.00d
#Longest job: 4036s 67.27m 1.12h 0.05d
#Submission to last job: 16433s 273.88m 4.56h 0.19d
# post-process blastz
ssh kk
cd /cluster/data/hg16/bed/blastz.gg0
# source the DEF file again in case you are coming back to this
# (must be bash shell)
. ./DEF
# a new run directory
mkdir -p run.1
mkdir -p $BASE/lav
# create a new job list to convert out files to lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
> run.1/jobList
cd run.1
# make sure the job list is OK
wc -l jobList
# 339 jobs
head jobList
# run on cluster
ssh kk
cd /cluster/data/hg16/bed/blastz.gg0/run.1
para create jobList
para try
para check
para push
# etc.
#Completed: 339 of 339 jobs
#CPU time in finished jobs: 8611s 143.52m 2.39h 0.10d 0.000 y
#IO & Wait Time: 106450s 1774.17m 29.57h 1.23d 0.003 y
#Average job time: 339s 5.66m 0.09h 0.00d
#Longest job: 456s 7.60m 0.13h 0.01d
#Submission to last job: 465s 7.75m 0.13h 0.01d
# convert lav files to axt
ssh kk
cd /cluster/data/hg16/bed/blastz.gg0
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
# create custom version of blastz-chromlav2axt with -fa option,
# because nibs aren't available for chicken
cp /cluster/bin/scripts/blastz-chromlav2axt .
# (hand edit: add -fa option to call to lavToAxt)
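# A sketch of that hand edit (the script's internal call is assumed):
#   lavToAxt     <lav-file> <seq1-dir> <seq2-dir> <out.axt>   # before
#   lavToAxt -fa <lav-file> <seq1-dir> <seq2-dir> <out.axt>   # after
# (-fa says the query is a fasta file rather than a nib directory)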
# create template file for gensub2
# usage: blastz-chromlav2axt lav-dir axt-file seq1-dir seq2-dir
cat << '_EOF_' > gsub
#LOOP
/cluster/store4/gs.17/build34/bed/blastz.gg0/run.2/blastz-chromlav2axt /cluster/store4/gs.17/build34/bed/blastz.gg0/lav/$(root1) {check out line+ /cluster/store4/gs.17/build34/bed/blastz.gg0/axtChrom/$(root1).axt} /iscratch/i/gs.17/build34/bothMaskedNibs /cluster/bluearc/gg0/fa/chicken_with_trf.fa
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1S /cluster/store4/gs.17/build34/bed/blastz.gg0/lav > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
# 42 jobs
head jobList
para create jobList
para try
para check
para push
# ... etc ...
#Completed: 39 of 42 jobs
#Crashed: 3 jobs
#CPU time in finished jobs: 32763s 546.05m 9.10h 0.38d 0.001 y
#IO & Wait Time: 48182s 803.03m 13.38h 0.56d 0.002 y
#Average job time: 2076s 34.59m 0.58h 0.02d
#Longest job: 5291s 88.18m 1.47h 0.06d
#Submission to last job: 5291s 88.18m 1.47h 0.06d
# The crashes are three of the "randoms" (chr8, 18, 19) -- parasol
# thinks they crashed because of 0-length output files
# This run took quite a bit longer than with mouse and rat, presumably
# because of the use of the fa file
# Remove the empty axtChrom/chr*_random.axt files to avoid future
# processing errors
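# e.g. (a sketch; the three randoms named in the crash report above):
# rm axtChrom/chr8_random.axt axtChrom/chr18_random.axt axtChrom/chr19_random.axt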
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.gg0
mkdir -p pslChrom
set tbl = "blastzGg0"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# (~5 minutes)
# Load database tables
ssh hgwdev
set tbl = "blastzGg0"
cd /cluster/data/hg16/bed/blastz.gg0/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_${tbl}.psl
# New entry in human/hg16/trackDb.ra
# track blastzGg0
# shortLabel Chicken Blastz
# longLabel Blastz Chicken (Gg0-contigs, 5.2x coverage)
# group compGeno
# priority 145.9
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno
# MAKE BLASTZ BEST CHICKEN (finished, Adam, 11/3/03)
# Consolidate AXT files to chrom level, sort, pick best, make psl.
ssh eieio
cd /cluster/data/hg16/bed/blastz.gg0/axtChrom
mkdir -p /cluster/bluearc/hg16/bed/blastz.gg0/axtChrom
# copy chrom axt's to bluearc, to avoid hitting fileserver too hard
cp -p *.axt /cluster/bluearc/hg16/bed/blastz.gg0/axtChrom
ssh kk
cd /cluster/data/hg16/bed/blastz.gg0
mkdir -p axtBest pslBest
mkdir run.3
cd run.3
# create script to filter files
cat << '_EOF_' > doBestAxt
#!/bin/csh -f
# usage: doBestAxt chr axt-file best-file psl-file
/cluster/bin/i386/axtBest $2 $1 $3 -minScore=300
sleep 1
/cluster/bin/i386/axtToPsl $3 /cluster/data/hg16/bed/blastz.gg0/S1.len \
/cluster/data/hg16/bed/blastz.gg0/S2.len $4
'_EOF_'
# << this line makes emacs coloring happy
# NOTE: in a subsequent run, we have used -minScore=6000 and added
# the -matrix option to use HoxD55.q (need to add a line with gap
# penalties to the bottom of the score matrix file, e.g., "O =
# 400, E = 30"; see
# /cluster/data/hg16/bed/blastz.gg0/run.3.2003-11-11). These new
# options should be considered part of the standard procedure, at
# least for now.
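# A sketch of that modified invocation (the matrix filename here is an
# assumption -- HoxD55.q with the gap-penalty line appended):
# /cluster/bin/i386/axtBest in.axt chrN out.axt -minScore=6000 \
#     -matrix=HoxD55.withGaps.q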
chmod +x doBestAxt
cd ../axtChrom
ls -1S | sed 's/.axt$//' > ../run.3/chrom.list
cd ../run.3
# create template for cluster job
cat << '_EOF_' > gsub
#LOOP
doBestAxt $(root1) {check in line+ /cluster/bluearc/hg16/bed/blastz.gg0/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.gg0/axtBest/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.gg0/pslBest/$(root1)_blastzBestGg0.psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
cd /cluster/data/hg16/bed/blastz.gg0
cd run.3
para create jobList
para try
para check
para push
#Checking finished jobs
#Completed: 39 of 39 jobs
#CPU time in finished jobs: 1111s 18.52m 0.31h 0.01d 0.000 y
#IO & Wait Time: 7775s 129.58m 2.16h 0.09d 0.000 y
#Average job time: 228s 3.80m 0.06h 0.00d
#Longest job: 1375s 22.92m 0.38h 0.02d
#Submission to last job: 1375s 22.92m 0.38h 0.02d
# create human/chicken mafs
cd /cluster/data/hg16/bed/blastz.gg0
mkdir maf
foreach file (axtBest/*.axt)
set root=$file:t:r
echo $root
/cluster/bin/i386/axtToMaf $file S1.len S2.len maf/${root}.maf.unfixed -tPrefix=hg16. -qPrefix=gg0.
/cluster/bin/scripts/fixmaf.pl < maf/${root}.maf.unfixed > maf/${root}.maf
end
# MULTIZ HUMAN/MOUSE/RAT/CHICKEN (Finished, Adam, 11/3)
# (chicken added to human/mouse/rat alignments described above [HUMOR])
ssh kk
mkdir /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0
mkdir hmrc
# wrapper script for multiz
cat << EOF > mz
#!/bin/csh
/cluster/bin/penn/tbaBin/multiz \$1 \$2 - > \$3
EOF
chmod +x mz
# put the MAFs on bluearc
ssh eieio
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3gg0/hmr
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3gg0/hc
cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf /cluster/bluearc/multiz.hg16mm3rn3gg0/hmr
cp /cluster/data/hg16/bed/blastz.gg0/maf/*.maf /cluster/bluearc/multiz.hg16mm3rn3gg0/hc
logout # back to kk
# set up joblist
rm -f jobList
foreach file (/cluster/bluearc/multiz.hg16mm3rn3gg0/hmr/*.maf)
set root=`echo $file:t:r | sed 's/\.hmr//'`
echo "/cluster/data/hg16/bed/multiz.hg16mm3rn3gg0/mz /cluster/bluearc/multiz.hg16mm3rn3gg0/hc/${root}.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0/hmrc/${root}.maf" >> jobList
end
# (run on cluster) 41 jobs, ~10 min
# FIXME: maybe should run on the common denominator of the two
# sets, then copy over remaining MAFs (?) In this case, copied
# chr8_random and chr18_random from hmr
# clean up bluearc (these are big files!)
rm -r /cluster/bluearc/multiz.hg16mm3rn3gg0
# setup external files for database reference
ssh hgwdev
mkdir -p /gbdb/hg16/multizMm3Rn3Gg0
ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0/hmrc/*.maf /gbdb/hg16/multizMm3Rn3Gg0
# load into database
# cd $multizDir/hmr/*.maf
/cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3Gg0
# add dummy entry to dbDb so that name shows up as "Chicken"
echo 'insert into dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName) values ("gg0", "November 2003", "", "Chicken", "", 0, 0, "Chicken", "Gallus gallus");' | hgsql -h genome-testdb hgcentraltest
# BLASTZ Mm4 (DONE - 2003-10-31 - Hiram)
ssh kk
mkdir -p /cluster/data/hg16/bed/blastz.mm4.2003-10-29
cd /cluster/data/hg16/bed
ln -s blastz.mm4.2003-10-29 blastz.mm4
cd blastz.mm4
cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# RMSK not currently used
SEQ1_RMSK=/iscratch/i/gs.17/build34/rmsk
# FLAG not currently used
SEQ1_FLAG=-primate
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm4/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm4/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm4/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg16/bed/blastz.mm4
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# prepare first cluster run
ssh kk
cd /cluster/data/hg16/bed/blastz.mm4
source DEF
/cluster/data/mm4/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# Completed: 43390 of 43392 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 15770466s 262841.10m 4380.69h 182.53d 0.500 y
# IO & Wait Time: 626227s 10437.11m 173.95h 7.25d 0.020 y
# Average job time: 378s 6.30m 0.10h 0.00d
# Longest job: 8052s 134.20m 2.24h 0.09d
# Submission to last job: 45886s 764.77m 12.75h 0.53d
# the two crashed jobs:
# /cluster/home/angie/schwartzbin/blastz-run chr10.nib 40000001 50010000 chrX.nib 120000001 150000000 /cluster/data/hg16/bed/blastz.mm4/DEF
# blastz: Illegal character '@' in sequence file.
# /cluster/home/angie/schwartzbin/blastz-run chr18.nib 1 10010000 chr15.nib 60000001 90000000 /cluster/data/hg16/bed/blastz.mm4/DEF
# seq_read(/tmp/blastz.zstcGa/s1.fa): Input/output error
# unusual errors. Simply try them again and they work
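# i.e., just re-push; parasol retries the crashed jobs:
# para push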
# Second cluster run to convert the .out's to .lav's
# You do NOT want to run this on the big cluster. It brings
# the file server to its knees. Run this on the small cluster.
ssh kkr1u00
cd /cluster/data/hg16/bed/blastz.mm4
source DEF
/cluster/data/mm4/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# Completed: 339 of 339 jobs
# CPU time in finished jobs: 15434s 257.23m 4.29h 0.18d 0.000 y
# IO & Wait Time: 2393s 39.89m 0.66h 0.03d 0.000 y
# Average job time: 53s 0.88m 0.01h 0.00d
# Longest job: 1128s 18.80m 0.31h 0.01d
# Submission to last job: 2561s 42.68m 0.71h 0.03d
# Third cluster run to convert lav's to axt's
source DEF
cd /cluster/data/hg16/bed/blastz.mm4
/cluster/data/mm4/jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# Completed: 38 of 42 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 1826s 30.44m 0.51h 0.02d 0.000 y
# IO & Wait Time: 9781s 163.01m 2.72h 0.11d 0.000 y
# Average job time: 305s 5.09m 0.08h 0.00d
# Longest job: 1489s 24.82m 0.41h 0.02d
# Submission to last job: 5125s 85.42m 1.42h 0.06d
# FAILED: chr1, chr19, chr19_random, chr5
# try these on kolossus
ssh kolossus
cd /cluster/data/hg16/bed/blastz.mm4/run.2
/cluster/data/mm4/jkStuff/x86_64-chromlav2axt \
/cluster/data/hg16/bed/blastz.mm4/lav/chr1 \
/cluster/data/hg16/bed/blastz.mm4/axtChrom/chr1.axt \
/cluster/data/hg16/nib /cluster/data/mm4/nib
/cluster/data/mm4/jkStuff/x86_64-chromlav2axt \
/cluster/data/hg16/bed/blastz.mm4/lav/chr19 \
/cluster/data/hg16/bed/blastz.mm4/axtChrom/chr19.axt \
/cluster/data/hg16/nib /cluster/data/mm4/nib
/cluster/data/mm4/jkStuff/x86_64-chromlav2axt \
/cluster/data/hg16/bed/blastz.mm4/lav/chr19_random \
/cluster/data/hg16/bed/blastz.mm4/axtChrom/chr19_random.axt \
/cluster/data/hg16/nib /cluster/data/mm4/nib
/cluster/data/mm4/jkStuff/x86_64-chromlav2axt \
/cluster/data/hg16/bed/blastz.mm4/lav/chr5 \
/cluster/data/hg16/bed/blastz.mm4/axtChrom/chr5.axt \
/cluster/data/hg16/nib /cluster/data/mm4/nib
# about 26 minutes total time for those four
# chr19_random.axt is still empty, remove it to avoid errors later
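# e.g. (a sketch):
# rm -f /cluster/data/hg16/bed/blastz.mm4/axtChrom/chr19_random.axt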
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4
mkdir -p pslChrom
set tbl = "blastzMm4"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# That takes about 30 minutes
# Load database tables
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzMm4.psl
# this is a 55 minute job
# featureBits on blastzMm3 or 4 will not work on hgwdev, runs out of
# memory. But if you reset your ~/.hg.conf to use the read-only
# user and contact the hgwdev host, and build featureBits as a
# x86_64 binary, you can run it on kolossus:
# featureBits hg16 blastzMm3
# 1050190071 bases of 2865248791 (36.653%) in intersection
# featureBits hg16 blastzMm4
# 1056761609 bases of 2865248791 (36.882%) in intersection
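# A minimal sketch of such a ~/.hg.conf (host field is real; user/password
# here are placeholders, not the actual read-only account):
# db.host=hgwdev
# db.user=readonlyuser
# db.password=XXXXXX
# then on kolossus, with an x86_64 featureBits build:
# featureBits hg16 blastzMm4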
# CHAIN Mm4 BLASTZ (DONE - 2003-11-03 - Hiram)
# The axtChain is best run on the small kluster, or the kk9 kluster
# in this case, it was run on the kk kluster
ssh kkr1u00
mkdir -p /cluster/data/hg16/bed/blastz.mm4/axtChain/run1
cd /cluster/data/hg16/bed/blastz.mm4/axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg16/bed/blastz.mm4/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtFilter -notQ_random $1 | axtChain stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/mm4/softNib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
# 41 jobs
gensub2 input.lst single gsub jobList
para create jobList
para try
para push # ... etc ...
# Completed: 41 of 41 jobs
# CPU time in finished jobs: 24547s 409.12m 6.82h 0.28d 0.001 y
# IO & Wait Time: 3955s 65.91m 1.10h 0.05d 0.000 y
# Average job time: 695s 11.59m 0.19h 0.01d
# Longest job: 7336s 122.27m 2.04h 0.08d
# Submission to last job: 8251s 137.52m 2.29h 0.10d
# now on the file server, sort chains
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# real 10m5.525s
# user 8m9.350s
# sys 0m48.450s
time chainSplit chain all.chain
# real 10m23.201s
# user 7m51.930s
# sys 0m53.910s
# these steps take ~20 minutes
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_chainMm4 $i
echo done $c
end
# NET Mm4 (DONE - 2003-11-03 - Hiram)
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/hg16/chrom.sizes \
/cluster/data/mm4/chrom.sizes ../preNet/$i
end
# real 11m58.018s
# user 4m10.390s
# sys 2m10.780s
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
/cluster/data/mm4/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 2505211904, utime 15891 s/100, stime 3245
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/axtChain
time netClass hNoClass.net hg16 mm4 mouse.net \
-tNewR=/cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/scratch/mus/mm4/linSpecRep.notInHuman
# real 14m2.042s
# user 10m6.450s
# sys 1m46.950s
# If things look good do
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4/axtChain
rm -r n1 hNoClass.net
# Make a 'syntenic' subset of these with
netFilter -syn mouse.net > mouseSyn.net
# real 9m44.445s
# user 6m42.660s
# sys 1m10.100s
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/axtChain
netFilter -minGap=10 mouse.net | hgLoadNet hg16 netMm4 stdin
netFilter -minGap=10 mouseSyn.net | hgLoadNet hg16 syntenyNetMm4 stdin
# real 12m53.070s
# user 6m6.540s
# sys 0m50.580s
# check results
# featureBits hg16 netMm4
# 2823565051 bases of 2865248791 (98.545%) in intersection
# featureBits hg16 netMm3
# 2834484276 bases of 2865248791 (98.926%) in intersection
# featureBits hg16 syntenyNetMm3
# 2804467412 bases of 2865248791 (97.879%) in intersection
# featureBits hg16 syntenyNetMm4
# 2786960572 bases of 2865248791 (97.268%) in intersection
# Add entries for net and chain to mouse/hg16 trackDb
# make net
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4/axtChain
mkdir mouseNet
time netSplit mouse.net mouseNet
# real 10m44.479s
# user 6m43.680s
# sys 1m20.860s
mkdir ../axtNet
foreach n (mouseNet/chr*.net)
set c=$n:t:r
echo "netToAxt: $c.net -> $c.axt"
rm -f ../axtNet/$c.axt
netToAxt mouseNet/$c.net chain/$c.chain \
/cluster/data/hg16/nib \
/cluster/data/mm4/nib \
../axtNet/$c.axt
echo "Complete: $c.net -> axtNet/$c.axt"
end
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/blastz.mm4/axtBest
cd /cluster/data/hg16/bed/blastz.mm4/axtBest
ln -s ../axtNet/chr*.axt .
# copy net axt's to download area
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/axtNet
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtNet
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtNet
cd /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtNet
gzip *.axt
# add README.txt file to dir, if needed
# Convert those axt files to psl
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4
mkdir pslBest
foreach a (axtBest/chr*.axt)
set c=$a:t:r
echo "processing $c.axt -> ${c}_blastzBestMm4.psl"
/cluster/bin/i386/axtToPsl axtBest/${c}.axt \
S1.len S2.len pslBest/${c}_blastzBestMm4.psl
echo "Done: ${c}_blastzBestMm4.psl"
end
# Load tables
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/pslBest
time /cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*_blastzBestMm4.psl
# real 10m47.853s
# user 2m48.700s
# sys 0m24.250s
# check results
# featureBits hg16 blastzBestMm4
# 996722004 bases of 2865248791 (34.787%) in intersection
# featureBits hg16 blastzBestMm3
# 1007362800 bases of 2865248791 (35.158%) in intersection
# Make /gbdb links and add them to the axtInfo table:
mkdir -p /gbdb/hg16/axtBestMm4
cd /gbdb/hg16/axtBestMm4
ln -s /cluster/data/hg16/bed/blastz.mm4/axtNet/chr*.axt .
cd /cluster/data/hg16/bed/blastz.mm4/axtNet
rm -f axtInfoInserts.sql
foreach f (/gbdb/hg16/axtBestMm4/chr*.axt)
set chr=$f:t:r
echo "INSERT INTO axtInfo (species, alignment, chrom, fileName) \
VALUES ('mm4','Blastz Best in Genome','$chr','$f');" \
>> axtInfoInserts.sql
end
hgsql hg16 < ~/kent/src/hg/lib/axtInfo.sql
# table axtInfo may already exist, ignore create error.
hgsql hg16 < axtInfoInserts.sql
# MAKING THE AXTTIGHT FROM AXTBEST (DONE - 2003-11-04 - Hiram)
# After creating axtBest alignments above, use subsetAxt to get axtTight:
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4/axtNet
mkdir ../axtTight
tcsh
foreach i (*.axt)
echo subsetAxt $i ../axtTight/$i
subsetAxt $i ../axtTight/$i \
~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
end
# translate to psl
cd ../axtTight
mkdir ../pslTight
foreach i (*.axt)
set c = $i:r
axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMm4.psl
echo "Done: $i"
end
# Load tables into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm4/pslTight
hgLoadPsl -noTNameIx hg16 chr*_blastzTightMm4.psl
# check results
# featureBits hg16 blastzTightMm4
# 162641577 bases of 2865248791 (5.676%) in intersection
# featureBits hg16 blastzTightMm3
# 164148288 bases of 2865248791 (5.729%) in intersection
# copy to axt's to download area
cd /cluster/data/hg16/bed/blastz.mm4/axtTight
mkdir -p /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtTight
cp -p *.axt /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtTight
cd /usr/local/apache/htdocs/goldenPath/hg16/vsMm4/axtTight
gzip *.axt
# add README.txt file to dir, if needed
# RUNNING AXTBEST (DONE 12/2/03 angie)
# Penn State complained of a loss in coverage when using axtNet instead
# of axtBest. So run axtBest for them, and axtToMaf in prep for multiz.
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4.2003-10-29
# I removed links from axtBest/* to axtNet/*
foreach f (axtChrom/chr*.axt)
set chr=$f:t:r
echo axtBesting $chr
axtBest $f $chr axtBest/$chr.axt -minScore=300
end
# As usual, ran out of mem on chr19, so use kolossus & 2 passes:
ssh kolossus
cd /cluster/data/hg16/bed/blastz.mm4.2003-10-29
set chr = chr19
foreach d (lav/$chr/*.lav)
set smallout=$d.axt
lavToAxt $d /cluster/data/hg16/nib /cluster/data/mm4/nib stdout \
| axtSort stdin $smallout
axtBest $smallout $chr $smallout:r.axtBest
end
cat `ls -1 lav/$chr/*.axtBest | sort -g` \
> lav/$chr/$chr.axtBestPieces
axtBest lav/$chr/$chr.axtBestPieces $chr axtBest/$chr.axt
rm lav/$chr/*.axt*
# MAKE MAF FROM AXTBEST FOR PENN STATE (DONE 12/2/03 angie)
ssh eieio
cd /cluster/data/hg16/bed/blastz.mm4.2003-10-29
mkdir mafBest
foreach f (axtBest/chr*.axt)
set maf = mafBest/$f:t:r.hm.maf
echo translating $f to $maf
axtToMaf $f \
/cluster/data/hg16/chrom.sizes /cluster/data/mm4/chrom.sizes \
$maf -tPrefix=hg16. -qPrefix=mm4.
end
# MAKING MOUSE MM4 SYNTENY (DONE 2003-11-05 - Hiram)
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/syntenyMm4
cd /cluster/data/hg16/bed/syntenyMm4
# updating the scripts in use here from
# /cluster/data/hg16/bed/syntenyMm3
cp -p /cluster/data/hg16/bed/syntenyMm3/*.pl .
# fix the syntenicBest script to not try and work on empty
# results from its queries. Also, set the db and table name
# in the script itself so the arguments are not needed
./syntenicBest.pl
# on the order of 3 to 4 hours to complete syntenicBest
# almost no time, or only a few minutes at most for any of
# the rest
../syntenyMm3/smooth.pl
../syntenyMm3/joinsmallgaps.pl
# set db and table name in fillgap.pl
./fillgap.pl
../syntenyMm3/synteny2bed.pl
hgLoadBed hg16 syntenyMm4 ucsc100k.bed
# featureBits hg16 syntenyMm3
# 2651945520 bases of 2865248791 (92.556%) in intersection
# featureBits hg16 syntenyMm4
# 2560252977 bases of 2865248791 (89.355%) in intersection
# hgTracks.c needed to be updated to recognize syntenyMm4 so it
# would color properly.
# TIGR GENE INDEX (DONE 2004-05-20 Fan)
mkdir -p /cluster/data/hg16/bed/tigr
cd /cluster/data/hg16/bed/tigr
wget ftp://ftp.tigr.org/pub/data/tgi/Homo_sapiens/TGI_track_HumanGenome_hg16_05-2004.tgz
tar xvzf TGI*.tgz
foreach f (*cattle*)
set f1 = `echo $f | sed -e 's/cattle/cow/g'`
mv $f $f1
end
foreach o (mouse cow human pig rat)
echo $o
setenv O $o
foreach f (chr*_$o*s)
tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
end
end
ssh hgwdev
cd /cluster/data/hg16/bed/tigr
hgsql hg16 -e "drop table tigrGeneIndex"
hgsql hg16 < ~/kent/src/hg/lib/tigrGeneIndex.sql
foreach f (*.gff)
echo Processing $f ...
/cluster/home/fanhsu/bin/i386/ldHgGene -oldTable -exon=TC hg16 tigrGeneIndex $f
hgsql hg16 -e "select count(*) from tigrGeneIndex"
end
# Total of 354491 entries created in tigrGeneIndex table.
hgsql hg16 -e "update tigrGeneIndex set cdsStart = txStart;"
hgsql hg16 -e "update tigrGeneIndex set cdsEnd = txEnd;"
checkTableCoords hg16 tigrGeneIndex
gzip *.gff *TCs
# LOAD VEGA GENES AND PSEUDOGENES (DONE 2003-11-11 braney )
#####
##### WARNING: vega procedure changed, use process later in file
#####
mkdir ~/hg16/bed/vega
cd ~/hg16/bed/vega
wget "http://www.sanger.ac.uk/Users/keenan/vega_homo_sapiens_core_4_0.gtf.gz"
gunzip vega_homo_sapiens_core_4_0.gtf.gz
# Load genes and Immunoglobulin/Pseudogenes into 2 separate tracks
awk '$2 != "Pseudogene" && $2 != "Ig_Pseudogene_Segment" && $2 != "Ig_Segment" {print "chr"$0}' \
vega_homo_sapiens_core_4_0.gtf > vega_fixed.gtf
awk '$2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment" {print "chr"$0}' \
vega_homo_sapiens_core_4_0.gtf > vega_pseudo.gtf
ldHgGene hg16 vegaGene vega_fixed.gtf -gtf
ldHgGene hg16 vegaPseudoGene vega_pseudo.gtf -gtf
wget "http://www.sanger.ac.uk/Users/keenan/vega_pep_dump_ncbi34.fa.gz"
hgPepPred hg16 generic vegaPep vega_pep_dump_ncbi34.fa
vegaBuildInfo vega_homo_sapiens_core_4_0.gtf vegaInfo.tab
hgsql hg16 < ~/kent/src/hg/lib/vegaInfo.sql
echo "load data local infile 'vegaInfo.tab' into table vegaInfo" | hgsql hg16
# Set cdsStart and cdsEnd to 0 if method is Novel_Transcript
foreach ntname (`echo 'select name from vegaGene,vegaInfo \
where vegaGene.name = vegaInfo.transcriptId AND \
vegaInfo.method = "Novel_Transcript"' \
| hgsql -N hg16`)
echo "update vegaGene set cdsStart = 0 where name = '$ntname'" \
| hgsql hg16
echo "update vegaGene set cdsEnd = 0 where name = '$ntname'" \
| hgsql hg16
end
# LOAD FIRSTEF TRACK Done 2003-07-31 braney
# Create firstEF track from Zhang lab at CSHL
# contacts
# Gengxin Chen <cheng@cshl.edu>
# Ivo Grosse <grosse@ipk-gatersleben.de>
# Michael Zhang <mzhang@cshl.edu>
mkdir /cluster/data/hg16/bed/firstEF
cd /cluster/data/hg16/bed/firstEF
# Got firstEF.txt from Gengxin 7/30/03
hgLoadBed hg16 firstEF firstEF.txt
# Load chicken sequence loaded & processed by booch & acs (2003-11-4 kate)
hgLoadSeq hg16 /gbdb/gg0/chicken.fa
# 73234 sequences
# LOAD ENSEMBL GENES (DONE 2003-11-07 angie)
mkdir /cluster/data/hg16/bed/ensembl
cd /cluster/data/hg16/bed/ensembl
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. Choose gzip compression. Hit export.
# Save as ensemblGene.gtf.gz
# Ensembl handles random chromosomes differently than us, so we
# strip this data. Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
gunzip -c ensemblGene.gtf.gz \
| grep -v ^6_DR51 \
| grep -v ^DR51 \
| grep -v _NT_ \
| perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/ \
|| die "Line $. doesnt start with human chrom:\n$_"' \
| sed -e 's/\..\"/\"/g' \
> ensGene.gtf
ssh hgwdev
/cluster/bin/i386/ldHgGene hg16 ensGene \
/cluster/data/hg16/bed/ensembl/ensGene.gtf
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql
echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
gunzip ensemblPep.fa.gz
hgPepPred hg16 ensembl ensemblPep.fa
# LOAD GENOMIC DUPES (DONE - 2003-11-11 - Hiram)
ssh hgwdev
mkdir /cluster/data/hg16/bed/genomicDups
cd /cluster/data/hg16/bed/genomicDups
# pick up Build34GenomicDups.gz from
# http://humanparalogy.cwru.edu/build34/files_for_ucsc/build34_ucsc.htm
# it has a user and password login. you can use this wget command
# with the user/password:
wget --http-user=X --http-passwd=X \
"http://humanparalogy.cwru.edu/build34/files_for_ucsc/Build34GenomicDups.gz"
gunzip *.gz
# awk -f filter.awk oo33_dups_for_kent > genomicDups.bed
hgsql hg16 < ~/kent/src/hg/lib/genomicDups.sql
hgLoadBed hg16 -oldTable genomicDups Build34GenomicDups
# load of genomicDups did not go as planned: 57702 record(s), 0 row(s) skipped, 57702 warning(s) loading bed.tab
# There was an error in this data delivery. To fixup:
hgsql -e \
'update genomicDups set name = concat(otherChrom,":",otherStart);' \
hg16
# LOAD CHIMP NET (2003-11-20 kate)
# NOTE: Net preparation doc'ed in makePt0.doc
ssh hgwdev
cd /cluster/data/pt0/bed/blastz.hg16/axtChain
netFilter -minGap=10 chimp.net | hgLoadNet hg16 netPt0 stdin
netFilter -minGap=10 chimpSyn.net | hgLoadNet hg16 syntenyNetPt0 stdin
# CHIMP BEST CHAINS, IN CHROMOSOME COORDINATES (2004-02-25 kate)
# NOTE: start with scaffold-based human-reference reciprocal best chains
# doc'ed in makePt0.doc, then lift using scaffold lift file in panTro1
# NOTENOTENOTE: Angie redid this with chain renumbering
ssh kksilo
mkdir -p /cluster/data/hg16/bed/blastz-blat.panTro1
cd /cluster/data/hg16/bed/blastz-blat.panTro1
liftUp -chainQ best.chain \
/cluster/data/panTro1/jkStuff/scaffolds.lft \
warn /cluster/data/pt0/bed/blastz-blatHg16/human.best.chain
chainSplit bestChain best.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1/bestChain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg16 ${c}_bestChainPanTro1 $i
end
# CHIMP ALL CHAINS, IN CHROMOSOME COORDINATES (2004-02-25 kate)
ssh kksilo
cd /cluster/data/hg16/bed/blastz-blat.panTro1
liftUp -chainQ all.chain \
/cluster/data/panTro1/jkStuff/scaffolds.lft \
warn /cluster/data/pt0/bed/blastz-blatHg16/all.chain
chainSplit chain all.chain
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_chainPanTro1 $i
echo done $c
end
# CHIMP RECIPROCAL BEST NET, IN CHROMOSOME COORDS (kate)
# Redo the netting on chrom-based chain files
ssh kolossus
cd /cluster/data/hg16/bed/blastz-blat.panTro1
~/bin/x86_64/chainNet all.chain -minSpace=10 \
/cluster/data/hg16/chrom.sizes /cluster/data/panTro1/chrom.sizes \
human.net chimp.net
ssh kksilo
cd /cluster/data/hg16/bed/blastz-blat.panTro1
chainSwap all.chain all.swap.chain
~/bin/i386/netChainSubset chimp.net all.swap.chain stdout | \
chainSort stdin chimpNet.chain
# UPDATE WOODY BINARIES (see PHYLOHMM CONSERVATION entries below)
# done, acs, 2003-11-19
ssh hgwdev
cd /cluster/data/woody # better place? don't have permission in /cluster/install
cvs update -dP
cd src
make
# make sure Makefile has INSTALLDIR = /cluster/bin/woody
make install
# CFTR PHYLOHMM CONSERVATION
# done, acs, 2003-11-19 (currently using 9-way alignment)
# NOTE: essentially the same procedure applies for any Zoo or ENCODE
# target, as long as a suitable tree topology is available for the
# species in question (when distant species are included, e.g.,
# chicken and fish, the branch-length estimation procedure may need to
# be adapted slightly -- details to come)
ssh hgwdev
# (update woody binaries, if necessary -- see above)
# make sure /cluster/bin/penn/tbaBin and /cluster/bin/woody in path
mkdir -p /cluster/data/nisc/targets/cftr/phyloHMMcons
cd /cluster/data/nisc/targets/cftr/phyloHMMcons
# extract sufficient stats for phylog. analysis from MAF file
CFTR_START=115365025 # boundaries of CFTR region in hg16 coords
CFTR_END=117242450 # (these don't have to be perfect)
maf_project /cluster/data/nisc/targets/cftr/tba9way.maf /cluster/data/nisc/targets/cftr/tba9Mammal/human > cftr9_humanref.maf
msa_view cftr9_humanref.maf -i MAF -o SS -s ${CFTR_START} -e ${CFTR_END} -r 1 -O hg16,chimp,baboon,mm3,rn3,cow,pig,cat,dog > cftr9.ss
head cftr9.ss
#NSEQS = 9
#LENGTH = 2063003
#TUPLE_SIZE = 1
#NTUPLES = 57302
#NAMES = hg16,chimp,baboon,mm3,rn3,cow,pig,cat,dog
#ALPHABET = ACGTN
#IDX_OFFSET = 115365024
#NCATS = -1
#
#0 C-------- 26480
# fit a phylogenetic model to the data, with rate variation
echo "((((1,2),3),(4,5)),((6,7),(8,9)))" > cftr9.nh
# (indexes refer to sequences in the order of the NAMES line in
# the *.ss file)
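# (e.g., here 1=hg16, 2=chimp, 3=baboon, 4=mm3, 5=rn3, 6=cow,
# 7=pig, 8=cat, 9=dog, per the NAMES line shown above)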
fit_tree_model -m cftr9.ss -i SS -t cftr9.nh -s REV -o cftr9_rev -E -l fit.log -k 5 -a 4.8 -T -p MED
# (takes about 5 min. Watch log file for convergence --
# single lines correspond to outer maximization algorithm,
# interleaved sets of lines correspond to inner maximization
# algorithms [see http://www.cse.ucsc.edu/~acs/Siepel-03-0304.pdf
# for background])
# Note: k=5 is adequate for a good estimate of the alpha
# parameter, even though we'll use k=10 in the next step. The -a
# argument just provides a reasonable starting value for alpha, to
# speed convergence
# (check estimated branch lengths to be sure they make sense)
cat cftr9_rev.nh
#((((hg16:0.005601,chimp:0.005707):0.019356,baboon:0.034458):0.080743,(mm3:0.072487,rn3:0.079445):0.287368):0.035643,((cow:0.107791,pig:0.102431):0.040419,(cat:0.074444,dog:0.104476):0.053251):0.035643);
# (small deviations from one data set to the next are normal)
# you can also do "draw_tree cftr9_rev.nh > cftr9_rev.ps" to get a
# simple postscript rendering of the tree. Zero- or
# near-zero-length branches usually indicate a problem, e.g.,
# incorrect topology
# (also check cftr9_rev.mod; look in particular at ALPHA)
cat cftr9_rev.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 5
#ALPHA: 4.778715
#TRAINING_LNL: -6471907.615171
#BACKGROUND: 0.304536 0.191156 0.191907 0.312401
#RATE_MAT:
# -0.848833 0.150792 0.552489 0.145552
# 0.240232 -1.259134 0.166198 0.852704
# 0.876738 0.165547 -1.285792 0.243507
# 0.141887 0.521764 0.149586 -0.813238
#TREE: ((((1:0.005601,2:0.005707):0.019356,3:0.034458):0.080743,(4:0.072487,5:0.079445):0.287368):0.035643,((6:0.107791,7:0.102431):0.040419,(8:0.074444,9:0.104476):0.053251):0.035643);
# now compute the posterior probabilities of interest, according to
# a phylo-HMM
label -m cftr9.ss -d cftr9_rev.mod -i SS -o cftr9 -k 10 -L 0.9 -A -p 0 -j 1 -x -s chr7
# (takes 12 min)
# (check postprob file)
wc cftr9.postprob
#1752168 3504336 31539024 cftr9.postprob
head cftr9.postprob
#115370785 0.0664
#115370786 0.0583
#115370787 0.0448
#115370788 0.0271
#115370789 0.0217
#115370790 0.0232
#115370791 0.0331
#115370792 0.0396
#115370793 0.0417
#115370794 0.0557
# load as a (Hiramesque) wiggle track
cd /cluster/data/nisc/targets/cftr/phyloHMMcons
zcat cftr9.postprob.gz | wigAsciiToBinary -chrom=chr7 -binsize=1024 \
-dataSpan=1 -wibFile=chr7_phyloHMMcons_CFTR -name=cftr9 stdin
rm -r /gbdb/hg16/wib/chr7_phyloHMMcons_CFTR.wib
ln -s \
/cluster/data/nisc/targets/cftr/phyloHMMcons/chr7_phyloHMMcons_CFTR.wib \
/gbdb/hg16/wib/chr7_phyloHMMcons_CFTR.wib
hgLoadWiggle hg16 chr7_phyloHMMcons_CFTR chr7_phyloHMMcons_CFTR.wig
chmod 664 chr7_phyloHMMcons_CFTR.wib
chmod 775 .
# add trackDb.ra entry, e.g.,
#track phyloHMMcons_CFTR
#shortLabel phyloHMMcons CFTR
#longLabel phylo-HMM-based conservation, CFTR (post. prob. of slowest of 10 rates)
#group compGeno
#priority 150
#visibility hide
#color 175,150,128
#altColor 255,128,0
#type wig 0.0 1.0
#autoScaleDefault Off
# adapt HTML for details page, if necessary (e.g., copy an existing
# phyloHMMcons*.html page to phyloHMMcons_CFTR.html, edit to
# reflect data set, do "make update", don't forget to cvs add and
# commit)
# cleanup
rm cftr9.ss cftr9_humanref.maf # easy to regenerate
gzip cftr9.postprob
# CFTR PHYLOHMM CONSERVATION, 25-way alignment
# done, acs, 2003-11-21
# This can be done exactly as above for the 9-way alignment, except
# that the tree estimation procedure has to be adjusted to circumvent
# the problem that the distant species align only in conserved regions
# (so that a tree estimated from the whole data set will have
# disproportionally short branches to and among these species). The
# procedure I've used is semi-manual and somewhat ad hoc, but I'll
# record the main steps here for completeness. I'll only cover the
# tree estimation procedure (running 'label' and loading the track is
# the same as before).
ssh hgwdev
mkdir /cluster/data/nisc/targets/cftr/phyloHMMcons25
cd /cluster/data/nisc/targets/cftr/phyloHMMcons25
# extract sufficient statistics for two data sets: all sites for
# mammals and sites in 3rd codon positions for all species. I'm
# not including platypus with the mammals (it's technically a
# mammal, but a monotreme, and quite distant) because it seems to
# align mostly in conserved regions
maf_project /cluster/data/nisc/targets/cftr/25way/tba.maf /cluster/data/nisc/targets/cftr/25way/human > cftr25_humanref.maf
setenv CFTR_START 115365025
setenv CFTR_END 117242450
setenv SPEC_ORDER hg16,chimp,orangutan,baboon,macaque,vervet,lemur,rabbit,rn3,mm3,cow,pig,horse,cat,dog,ajbat,cpbat,hedgehog,opossum,dunnart,platypus,chicken,zfish,tetra,fr1
msa_view cftr25_humanref.maf -i MAF -o SS -s $CFTR_START -e $CFTR_END -r 1 -O $SPEC_ORDER > cftr25.ss
# whole data set, ordered suff stats -- use this for 'label'
msa_view cftr25.ss -i SS -o SS -z -l 21,22,23,24,25 -x > cftr20.ss
# exclude non-mammals (plus platypus)
/bin/echo -e 'NCATS = 3\ncds 1-3' > cats.cm # category map for cds sites
/cluster/home/acs/woody/scripts/refFlat2gff.pl -S -P -A hg16 -w 'chrom="chr7" and cdsStart > 115365025 and cdsEnd < 117242450' | sed 's/chr7/hg16/' | egrep -v 'NM_152829|NM_001233|NM_018412' > cftr.gff
# gets refseq annotations for this region as a gff; the egrep
# explicitly removes some duplicate entries (should have a
# better way of doing this); the sed changes the seq name so
# that msa_view recognizes it's the same as the name in the
# alignment
msa_view cftr25_humanref.maf -i MAF -o SS -z -c cats.cm -g cftr.gff -O $SPEC_ORDER > cftr25.3.ss
# now fit a tree model to each data set
echo "((((((((((1,2),3),((4,5),6)),7),(8,(9,10))),((((11,12),(13,(14,15))),(16,17)),18)),(19,20)),21),22),(23,(24,25)))" > cftr25.nh
fit_tree_model -m cftr25.3.ss -C 3 -i SS -t cftr25.nh -s REV -o cftr25 -E -l cftr25.3.log -T -p MED -k 5 -a 1.8
# (this next one may take an hour or two -- run it on a fast
# workstation or be sure to nice if on hgwdev; you can speed it up
# by giving it a good starting *.mod file based on the above [-M option])
echo "(((((((1,2),3),((4,5),6)),7),(8,(9,10))),((((11,12),(13,(14,15))),(16,17)),18)),(19,20))" > cftr20.nh
fit_tree_model -m cftr20.ss -i SS -t cftr20.nh -s REV -o cftr20 -E -l cftr20.log -T -p MED -k 5 -a 4
cp cftr20.mod cftr25_hybrid.mod
# Now edit cftr25_hybrid.mod by hand. Copy the tail end of the
# TREE line from cftr25.3.mod, corresponding to all nodes and
# branches outside of the clade for the non-monotreme mammals, and
# append it to the TREE line in cftr25_hybrid.mod (adjusting
# parens as necessary). Then multiply each one of these new
# branch lengths by a factor of 1.2 (computed as the sum of all
# branch lengths in cftr25.mod divided by the sum of the
# corresponding branch lengths in cftr25.3.mod). The resulting
# tree is well supported within the clade of non-monotreme mammals and
# includes a reasonable approximation of the non-mammal branch
# lengths. Proceed with 'label' using cftr25_hybrid.mod.
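# (hypothetical helpers for the manual edit above -- a sketch only,
# assuming branch lengths always follow a ':' on the TREE line as in
# the *.mod files shown earlier; 'nonmammal_piece.txt' is an
# illustrative name for a file holding just the copied tail of the
# TREE line)
# sum the branch lengths on a TREE line, for computing the scale factor:
perl -wne 'if (/^TREE:/) { my $s = 0; $s += $1 while /:(\d+\.\d+)/g; print "$s\n" }' cftr20.mod
# scale every branch length in the copied piece by 1.2:
perl -wpe 's/:(\d+\.\d+)/sprintf(":%.6f", 1.2*$1)/ge' nonmammal_piece.txt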
# cleanup
rm cftr25_humanref.maf cftr*.ss # easy to regenerate
# HMR PHYLOHMM CONSERVATION
# (started, acs, 2003-11-11, finished 11-19)
ssh hgwdev
# (update woody binaries, if necessary -- see above)
# (also, make sure /cluster/bin/woody in path)
mkdir /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
# estimate a phylog. model using the entire genome-wide alignments
# first extract sufficient statistics by chromosome
ssh eieio
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
foreach file (/cluster/data/hg16/bed/humor/hmr/*.maf)
set prefix = $file:t:r
msa_view -i MAF $file -o SS -z -O hg16,mm3,rn3 > $prefix.ss
end
logout
# NOTE: may be worth doing the above as a small cluster job instead
# (put the mafs on bluearc -- end up doing this below anyway)
# now combine suff stats across chromosomes
# (back on hgwdev)
ls chr*.ss > files
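# (the '*files' argument apparently tells msa_view to read its list
# of input filenames from ./files, written by the ls above)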
msa_view -i SS -o SS -A hg16,mm3,rn3 '*files' > all.ss
# estimate the model (very fast, now that suff stats are avail)
echo "(1,(2,3));" > tree.nh
fit_tree_model -i SS -m all.ss -t tree.nh -s REV -k 10 -o rev_dg
cat rev_dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.428803
#TRAINING_LNL: -448054115.568696
#BACKGROUND: 0.286083 0.213573 0.213691 0.286652
#RATE_MAT:
# -0.891523 0.166770 0.574850 0.149902
# 0.223389 -1.146311 0.153784 0.769137
# 0.769591 0.153699 -1.147159 0.223869
# 0.149605 0.573055 0.166888 -0.889548
#TREE: (1:0.192598,(2:0.076303,3:0.083043):0.192598);
# now, break up the genome-wide MAFs into pieces; it's worth doing
# this as a little cluster job
ssh eieio
mkdir -p /cluster/bluearc/hg16/bed/humor
cp /cluster/data/hg16/bed/humor/hmr/*.maf /cluster/data/hg16/?{,?}/chr*.fa /cluster/bluearc/hg16/bed/humor
logout
ssh kk
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
cat << 'EOF' > doSplit
#!/bin/sh
WOODY=/cluster/bin/woody
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11/WINDOWS
maf=$1
prefix=`echo $maf | awk -F\/ '{print $NF}' | awk -F\. '{print $1}'`
mkdir -p /scratch/msa_split
${WOODY}/msa_split $maf -i MAF -M ${FA_SRC}/$prefix.fa -O hg16,mm3,rn3 -w 1000000,0 -r /scratch/msa_split/$prefix -o SS -I 1000 -d 1 -B 5000
cd /scratch/msa_split
for file in ${prefix}.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
rm -f /scratch/msa_split/${prefix}.*.ss
EOF
chmod +x doSplit
mkdir -p WINDOWS
rm -f WINDOWS/* jobs.lst
foreach file (/cluster/bluearc/hg16/bed/humor/*.maf)
echo "doSplit $$file" >> jobs.lst
end
para create jobs.lst
# etc ... (run cluster job)
# now setup and run the cluster job to compute the conservation scores
# NOTE: the TMP dir should perhaps be set to something other than
# /scratch, since /scratch is local to each node and is not shared
# between cluster nodes
cat << 'EOF' > doPostProbs
#!/bin/sh
WOODY=/cluster/bin/woody
TMP=/scratch/phyloHMMcons
file=$1
root=`echo $file | awk -F\/ '{print $NF}' | sed 's/\.ss\.gz//'`
chrom=`echo $root | awk -F\. '{print $1}'`
mkdir -p $TMP
zcat $file | $WOODY/label -m - -d rev_dg.mod -i SS -o $TMP/$root -k 10 -L 0.9 -A -p 0 -j 1 -s $chrom -x
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.postprob > POSTPROBS/$chrom/$root.postprob.gz
rm $TMP/$root.postprob
EOF
chmod +x doPostProbs
mkdir -p POSTPROBS
rm -f jobs2.lst
foreach file (WINDOWS/chr*.ss.gz)
echo "doPostProbs $file" >> jobs2.lst
end
para create jobs2.lst
# etc ... (run cluster job)
logout
# finally, make the track (working in the
# phyloHMMcons.hg16mm3rn3.2003-11-11 dir)
ssh eieio
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
mkdir wibLimits
mkdir wib
foreach dir (POSTPROBS/*)
set chrom = $dir:t
echo $chrom
zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
wigAsciiToBinary -chrom=$chrom -binsize=1024 \
-dataSpan=1 -wibFile=wib/${chrom}_phyloHMMcons -name=hmr \
stdin > wibLimits/${chrom}
end
ssh hgwdev
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
hgLoadWiggle hg16 phyloHMMcons_HMR wib/*_phyloHMMcons.wig
ln -s `pwd`/wib/chr*_phyloHMMcons.wib /gbdb/hg16/wib
chmod 775 . wib
chmod 664 wib/*.wib
# add entry to trackDb.ra
#track phyloHMMcons_HMR
#shortLabel phyloHMMcons HMR
#longLabel phylo-HMM-based conservation, human-mouse-rat (post. prob. of slowest of 10 rates)
#group compGeno
#priority 150
#visibility hide
#color 175,150,128
#altColor 255,128,0
#type wig 0.0 1.0
#autoScaleDefault Off
# cleanup (only when you're pretty sure you're done!)
rm -r chr*.ss WINDOWS wiggle.tab para.results batch*
# CHICKEN BLAT (translated)
# (done, acs, 2003-11-19)
# (using repeat- and TRF-masked files already created -- see
# CHICKEN BLASTZ, above)
ssh kk
# set up main dir
cd /cluster/data/hg16/bed
mkdir blat.gg0.2003-11-19
ln -s blat.gg0.2003-11-19 blat.gg0
cd blat.gg0
# warning: I'm writing this up in a rush -- watch for errors!
# set up cluster job
cat << 'EOF' > make-joblist.pl
#!/usr/bin/perl
# script to create a job list for translated blat of human
# vs. another species; assumes directory of fa files for the xeno
# species. Output directories are set up as a side effect.
# USAGE: make-joblist.pl <hg-nibs-dir> <xeno-fa-dir> <hg-chr-lengths-file> <output_root_dir>
$SIZE=10000000; # partitioning params for human
$OVERLAP=10000;
# read lengths of chromosomes
open(LENF, $ARGV[2]);
while (<LENF>) { ($chr, $l) = split(/\s+/); $length{$chr} = $l;}
close(LENF);
@falist = <$ARGV[1]/*.fa>;
foreach $nib (<$ARGV[0]/*.nib>) {
$nib =~ /.*(chr.*)\.nib/ || die();
$chr = $1;
$l = $length{$chr};
for ($start = 1; $start <= $l; $start += $SIZE) {
$end = $start + $SIZE + $OVERLAP - 1;
if ($end > $l) { $end = $l; }
$dir = sprintf("%s/%s/%d_%d", $ARGV[3], $chr, $start, $end);
foreach $fa (@falist) {
$fa =~ /.*\/([^\/]+)\.fa/ || die();
$name = $1;
printf "/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax %s:%d-%d %s {check out line+ %s/%s_%d_%d_%s.psl}\\n", $nib, $start, $end, $fa, $dir, $chr, $start, $end, $name;
}
`mkdir -p $dir`; # set up output directories
}
}
EOF
# NOTE: there's a slight error above with indexing. Next time use
# something like:
# for ($start = 0; $start < $l; $start += $SIZE) {
# $end = $start + $SIZE + $OVERLAP;
# if ($end >= $l) { $end = $l; }
# The "make-lift.pl" script below should be changed also to be
# consistent (should be enough to change exactly the same lines)
chmod +x make-joblist.pl
cp /cluster/data/hg16/bed/blastz.gg0/S1.len . # just borrow existing lens
mkdir -p run
./make-joblist.pl /iscratch/i/gs.17/build34/bothMaskedNibs /cluster/bluearc/gg0/split100_with_trf S1.len /cluster/data/hg16/bed/blat.gg0/psl > run/jobs.lst
# make sure directory structure is created under psl
cd run
para create jobs.lst ; para try ; para check ; para push ; # etc...
#33561 jobs in batch
#0 jobs (including everybody's) in Parasol queue.
#Checking finished jobs
#Completed: 33561 of 33561 jobs
#CPU time in finished jobs: 14432527s 240542.12m 4009.04h 167.04d 0.458 y
#IO & Wait Time: 147210s 2453.50m 40.89h 1.70d 0.005 y
#Average job time: 434s 7.24m 0.12h 0.01d
#Longest job: 14117s 235.28m 3.92h 0.16d
#Submission to last job: 31483s 524.72m 8.75h 0.36d
# post process psl files
cd .. # back to main blat.gg0 dir
cat << 'EOF' > make-lift.pl
#!/usr/bin/perl
# create a lift spec to map psl files for windows to chromosome coords
# USAGE: make-lift.pl <hg-nibs-dir> <hg-chr-lengths-file>
$SIZE=10000000;
$OVERLAP=10000;
open(LENF, $ARGV[1]);
while (<LENF>) { ($chr, $l) = split(/\s+/); $length{$chr} = $l;}
close(LENF);
foreach $nib (<$ARGV[0]/*.nib>) {
$nib =~ /.*(chr.*)\.nib/ || die();
$chr = $1;
$l = $length{$chr};
for ($start = 1; $start <= $l; $start += $SIZE) {
$end = $start + $SIZE + $OVERLAP - 1;
if ($end > $l) { $end = $l; }
printf "%d\t%s:%d-%d\t%d\t%s\t%d\n", $start, $chr, $start, $end, $end-$start, $chr, $l;
}
}
EOF
chmod +x make-lift.pl
./make-lift.pl /iscratch/i/gs.17/build34/bothMaskedNibs S1.len > psl.lft
mkdir -p pslChrom
foreach dir ( psl/* )
set chrom = $dir:t
echo $chrom
/cluster/bin/i386/pslCat -dir $dir/* | /cluster/bin/i386/liftUp pslChrom/${chrom}_blatGg0.psl psl.lft warn stdin
end
# Load database tables
ssh hgwdev
cd /cluster/data/hg16/bed/blat.gg0/pslChrom
/cluster/bin/i386/hgLoadPsl -noTNameIx hg16 chr*.psl
# New entry in human/hg16/trackDb.ra
# track blatGg0
# shortLabel Chicken Blat
# longLabel Chicken Translated Blat (Gg0-contigs, 5.2x coverage)
# group compGeno
# priority 145.95
# visibility hide
# color 100,50,0
# altColor 255,240,200
# spectrum on
# type psl xeno
# look at coverage
featureBits hg16 blatGg0 knownGene:CDS
#18205137 bases of 2865248791 (0.635%) in intersection
featureBits hg16 knownGene:CDS
#31268809 bases of 2865248791 (1.091%) in intersection
# RELOAD ENSEMBL GENES WITH VERSION 34a (DONE 2003/12/16 markd)
# save current tables, just in case (in hgsql hg16):
rename table ensGene to ensGene_old;
rename table ensGtp to ensGtp_old;
rename table ensPep to ensPep_old;
mkdir /cluster/data/hg16/bed/ensembl34a
cd /cluster/data/hg16/bed/ensembl34a
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. Choose gzip compression. Hit export.
# Save as ensemblGene.gtf.gz
# Ensembl handles random chromosomes differently than we do, so we
# strip this data. Fortunately this loses only a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
zcat ensemblGene.gtf.gz \
| grep -v ^6_DR51 \
| grep -v ^DR51 \
| grep -v _NT_ \
| perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/ \
|| die "Line $. doesnt start with human chrom:\n$_"' \
| sed -e 's/\..\"/\"/g' \
> ensGene.gtf
ssh hgwdev
/cluster/bin/i386/ldHgGene hg16 ensGene \
/cluster/data/hg16/bed/ensembl34a/ensGene.gtf
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql
echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16
gzip ensGtp.txt
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
zcat ensemblPep.fa.gz | hgPepPred hg16 ensembl stdin
# compare size of old and new tables as a sanity check
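# e.g., a minimal row-count comparison (not in the original log):
hgsql -N -e 'select count(*) from ensGene; select count(*) from ensGene_old;' hg16
# then, in hgsql hg16, drop the saved tables: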
drop table ensGene_old;
drop table ensGtp_old;
drop table ensPep_old;
# Create knownToEnsembl column and knownToSuperfamily column
hgMapToGene hg16 ensGene knownGene knownToEnsembl
zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper hg16 hs stdin
# LOAD ECgene tables (redone with existing data - braney, 2004-01-30)
cd /cluster/data/hg16/bed
rm -f ECgene
mkdir ECgene.2003-12-18
ln -s ECgene.2003-12-18 ECgene
cd ECgene
wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_hg16_v1.1_25oct2003_genes.txt.gz"
wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_hg16_v1.1_25oct2003_genepep.txt.gz"
gunzip *.gz
ldHgGene -predTab hg16 ECgene ECgene_hg16_v1.1_25oct2003_genes.txt
hgPepPred hg16 tab ECgenePep ECgene_hg16_v1.1_25oct2003_genepep.txt
rm genePred.tab
gzip *
# QA NOTE: [ASZ, 2007-10-01] mytouch to ECGenePep table 200401301000.00
# contents were fine. passed -keys rule.
# MULTIZ HUMAN/MOUSE/RAT/CHIMP (kpollard, 12/16/03)
# chimp added to human/mouse/rat (HUMOR) alignment described above
# for now, human referenced and no new BLASTZ runs
ssh kk
#fix order in human/chimp BLASTZ MAF files
#use Kate's new files in humanBestAxt.2
cd /cluster/data/pt0/bed/blastz-blatHg16
mkdir humanBestAxt.ord
mkdir maf.ord
foreach file (humanBestAxt.2/*.axt)
set root=$file:t:r
echo $root
/cluster/bin/i386/axtSort $file humanBestAxt.ord/${root}.axt
/cluster/bin/i386/axtToMaf humanBestAxt.ord/${root}.axt ../blastz.hg16/S1.len /cluster/data/pt0/scaffold.sizes maf.ord/${root}.maf.unfixed -tPrefix=hg16. -qPrefix=pt0.
/cluster/bin/scripts/fixmaf.pl < maf.ord/${root}.maf.unfixed > maf.ord/${root}.maf
end
#test on chr11 with HMR
ssh eieio
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hp.ord
cp /cluster/data/pt0/bed/blastz-blatHg16/maf.ord/chr11.maf /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hp.ord
logout # back to kk
/cluster/data/hg16/bed/multiz.hg16mm3rn3gg0pt0/mz /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hp.ord/chr11.maf /cluster/bluearc/multiz.hg16mm3rn3gg0pt0/hmr/chr11.hmr.maf /cluster/data/hg16/bed/multiz.hg16mm3rn3gg0pt0/hmrp/chr11.ord.maf
#looks good, go ahead with HMRP multiz
mkdir -p /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0
mkdir hmrp
# wrapper script for multiz
cat << EOF > mz
#!/bin/csh
/cluster/bin/penn/tbaBin/multiz \$1 \$2 - > \$3
EOF
chmod +x mz
ssh eieio
# clean up bluearc
rm -r /cluster/bluearc/multiz.hg16mm3rn3gg0pt0
# move MAFS to bluearc
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3pt0/hmr
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3pt0/hp
cp /cluster/data/hg16/bed/humor/hmr/*.maf /cluster/bluearc/multiz.hg16mm3rn3pt0/hmr
cp /cluster/data/pt0/bed/blastz-blatHg16/maf.ord/*.maf /cluster/bluearc/multiz.hg16mm3rn3pt0/hp
logout
# set up joblist (common denominator set: no chr19_random in hmr)
foreach file (/cluster/bluearc/multiz.hg16mm3rn3pt0/hmr/*.maf)
set root=`echo $file:t:r | sed 's/\.hmr//'`
echo "/cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/mz /cluster/bluearc/multiz.hg16mm3rn3pt0/hp/${root}.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp/${root}.maf" >> jobList
end
#run MULTIZ
chmod +x jobList
para create jobList
#submit 10 jobs
para try
#keep an eye on them
para check
para finished
para running
#once these are done, submit rest
para push
para check
para time
#ran on cluster: 41 jobs, longest 42 min
#copy over chr19_random.maf from human/chimp
cp /cluster/bluearc/multiz.hg16mm3rn3pt0/hp/chr19_random.maf /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp/chr19_random.maf
# clean up bluearc
ssh eieio
rm -r /cluster/bluearc/multiz.hg16mm3rn3pt0
logout
# setup external files for database reference
ssh hgwdev
mkdir -p /gbdb/hg16/multizMm3Rn3Pt0
ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp/*.maf /gbdb/hg16/multizMm3Rn3Pt0
#load into database
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3pt0/hmrp
/cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3Pt0
# 5385226 mafs in 42 files
# 0-2594 warnings/file
#NOTE: only added track to hgwdev-kpollard (for now).
# LIFTOVER RNAGENE FROM HG15 (DONE CIRCA 12/27/03 schattner)
# Replaced below by new RNAGENES (2004-03-09)
cd /cluster/data/hg16/bed/bedOver
mkdir rnaGene
cd rnaGene
hgsql -N -e 'select * from rnaGene' hg15 > rnaGeneHg15.bed
liftOver rnaGeneHg15.bed ../over.chain rnaGeneLiftGene.bed \
rnaGeneLiftGeneMiss.bed
hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/rnaGene.sql hg16 rnaGene \
/cluster/data/hg16/bed/bedOver/rnaGene/rnaGeneLiftGene.bed
# LOAD RNAGENES (DONE - 2004-03-09 - Hiram)
# http://www.genetics.wustl.edu/eddy
# Sean Eddy, eddy@genetics.wustl.edu
# Dept. of Genetics, Washington University School of Medicine
ssh hgwdev
mkdir -p /cluster/data/hg16/bed/rnaGene
cd /cluster/data/hg16/bed/rnaGene
mkdir rnaGenePrevious
# save previous rnaGene track for reference
hgsqldump -T rnaGenePrevious hg16 rnaGene
wget --timestamping \
ftp://ftp.genetics.wustl.edu/pub/eddy/annotation/human-hg16/*
grep -v "^#" ncrna-hg16-mito.gff | sed -e "s/^NT_999999/chrM/" > mito.gff
grep -v "^#" ncrna-hg16-chrom.gff > chrom.gff
cat chrom.gff mito.gff > all.gff
hgsql -e 'drop table rnaGene;' hg16
hgsql hg16 < ~/kent/src/hg/lib/rnaGene.sql
hgRnaGenes hg16 all.gff
# rpMm3Rn3 3-way Regulatory Potential Score track (DONE - 2004-01-14 - Hiram)
# Data from: James Taylor james@bx.psu.edu
# Track description from: Francesca Chiaromonte chiaro@stat.psu.edu
ssh eieio
# Right now we are out of space on this /cluster/store4 filesystem,
# so send the data to the bluearc
mkdir /cluster/bluearc/hg16/bed/regPotential3X
ln -s /cluster/bluearc/hg16/bed/regPotential3X \
/cluster/data/hg16/bed/regPotential3X
cd /cluster/data/hg16/bed/regPotential3X
mkdir data
cd data
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
wget --timestamping \
"http://www.bx.psu.edu/~james/rp/hg16_mm3_rn3/chr${c}.hmr.maf.gz_rpscores.txt.truncated.bz2"
wget --timestamping \
"http://www.bx.psu.edu/~james/rp/hg16_mm3_rn3/chr${c}.hmr.maf.gz_rpscores.txt.bz2"
end
# The truncated files were a test. They want to see the raw data.
ssh eieio
cd /cluster/data/hg16/bed/regPotential3X
mkdir wigRawData
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
bzcat data/chr${c}.hmr.maf.gz_rpscores.txt.bz2 | sort -n | \
wigAsciiToBinary -chrom=chr${c} -dataSpan=5 \
-verbose -wibFile=wigRawData/chr${c}_rpMm3Rn3_Data \
-name=${c} stdin > chr${c}.out
echo chr${c} done
end
ssh hgwdev
cd /cluster/data/hg16/bed/regPotential3X/wigRawData
hgLoadWiggle hg16 regPotential3X chr*_rpMm3Rn3_Data.wig
ln -s `pwd`/chr*_rpMm3Rn3_Data.wib /gbdb/hg16/wib
# rpMm4 2-way Regulatory Potential Score track (DONE - 2004-01-14 - Hiram)
# Data from: James Taylor james@bx.psu.edu
# Track description from: Francesca Chiaromonte chiaro@stat.psu.edu
ssh eieio
# Right now we are out of space on this /cluster/store4 filesystem,
# so send the data to the bluearc
mkdir /cluster/bluearc/hg16/bed/regPotential2X
ln -s /cluster/bluearc/hg16/bed/regPotential2X \
/cluster/data/hg16/bed/regPotential2X
cd /cluster/data/hg16/bed/regPotential2X
mkdir data
cd data
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y)
wget --timestamping \
"http://www.bx.psu.edu/~james/rp/hg16_mm4/chr${c}.axt_rpscores.txt.truncated"
wget --timestamping \
"http://www.bx.psu.edu/~james/rp/hg16_mm4/chr${c}.axt_rpscores.txt"
end
gzip *.truncated *.txt
# I'll bet you could gzip the .wig files too and zcat them
# into hgLoadWiggle?
# The truncated files were a test; it turns out the full scores
# are what should be displayed.
# The data are at 5-base intervals but don't appear to be in order,
# so sort them before piping into wigAsciiToBinary.
ssh eieio
cd /cluster/data/hg16/bed/regPotential2X
mkdir wigFiles
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y)
zcat data/chr${c}.axt_rpscores.txt.gz | sort -n | \
wigAsciiToBinary -chrom=chr${c} -dataSpan=5 \
-wibFile=wigFiles/chr${c}_rpMm4 stdin > chr${c}.limits
echo chr${c} done
end
# To load the data
# (some day in the future the above wigAsciiToBinary function
# will be folded into hgLoadWiggle and thus one command)
ssh hgwdev
cd /cluster/data/hg16/bed/regPotential2X/wigFiles
hgLoadWiggle hg16 regPotential2X chr*_rpMm4.wig
ln -s `pwd`/chr*_rpMm4.wib /gbdb/hg16/wib
# an optional data load to check a display problem
mkdir wigTrunc
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 M X Y)
zcat data/chr${c}.axt_rpscores.txt.truncated.gz | sort -n | \
wigAsciiToBinary -chrom=chr${c} -dataSpan=5 \
-wibFile=wigTrunc/chr${c}_rpMm4t stdin > chr${c}t.limits
end
ssh hgwdev
cd /cluster/data/hg16/bed/regPotential2X/wigTrunc
hgLoadWiggle hg16 regPotential2XTrunc chr*_rpMm4t.wig
ln -s `pwd`/chr*_rpMm4t.wib /gbdb/hg16/wib
# CREATE chimpSimpleDiff TRACK AND TABLE
# Convert chimp quality scores from uncompressed contig to compressed
# supercontig format. This will take half an hour or so.
cd /cluster/data/pt0
zcat contigs.quals.gz | qaToQac stdin stdout | \
chimpSuperQuals assembly.agp stdin scaffolds.qac
# Make single base pair high quality differences into a bed file
# and load into database
cd /cluster/data/hg16/bed
mkdir chimpSimpleDiff
cd chimpSimpleDiff
chimpHiQualDiffs /cluster/data/pt0/bed/blastz-blatHg16/axtBest \
/cluster/data/pt0/scaffolds.qac chimpSimpleDiff.bed
sed 's/simpleNucDiff/chimpSimpleDiff/' ~/kent/src/hg/lib/simpleNucDiff.sql > \
chimpSimpleDiff.sql
hgLoadBed -sqlTable=chimpSimpleDiff.sql hg16 chimpSimpleDiff chimpSimpleDiff.bed
### chimpFixedDiff -- panTro1 (Daryl, July 8, 2005)
# Convert chimp quality scores from uncompressed to compressed
# chromosome format. This took 22 minutes on crow.
cd /cluster/data/panTro1
cat */chr*.qa | qaToQac stdin chrom.qac
# Make single base pair high quality differences into a bed file
# and load into database
cd /cluster/data/hg16/bed
mkdir chimpFixedDiff
cd chimpFixedDiff
sed 's/simpleNucDiff/chimpFixedDiff/' ~/kent/src/hg/lib/simpleNucDiff.sql > chimpFixedDiffs.sql
# chimpHiQualDiffs was changed to allow different quality
# parameters as command line options
## FIRST ATTEMPT:
set axtDir = cluster/data/hg16/bed/blastz-blat.panTro1.lifted/axtRBestNet
## time chimpFixedDiffs /$axtDir /cluster/data/panTro1/chrom.qac chimpFixedDiffs.bed >& chimpFixedDiffs.log
# This crashed twice at the same place, but ran successfully when
# each chromosome was run separately.
mkdir chroms; cd chroms
ls -1 /$axtDir | grep chr | grep axt | sed 's/.axt//' | xargs mkdir
rmdir chr*random
foreach f (chr*)
echo -n $f " "
ln -s /$axtDir/$f.axt $f/$f.axt
time nice chimpFixedDiffs $f /cluster/data/panTro1/chrom.qac $f.chimpFixedDiffs.bed >>& cfd.log
end
cat chr*bed > ../chimpFixedDiffs.bed
## The load (sort) ran out of memory on hgwdev, so I sorted the
## file first on kolossus (3 minutes) and then loaded it on hgwdev
ssh kolossus
hgLoadBed -strict -sqlTable=chimpFixedDiffs.sql -noLoad hg16 chimpFixedDiff chimpFixedDiffs.bed
exit
## hgwdev (37 minutes)
hgLoadBed -hasBin -noSort -sqlTable=chimpFixedDiffs.sql hg16 chimpFixedDiff bed.tab
# TODO: need to filter out polymorphic sites (SNPs)
## LS-SNP links [load data only] (Daryl Thomas; November 3, 2005)
# Data from Rachel Karchin in the Andrej Sali lab at UCSF
# /cluster/data/hg16/bed/lssnp
hgsql hg16 < ${HOME}/kent/src/hg/lib/lsSnpFunction.sql
hgsql hg16 < ${HOME}/kent/src/hg/lib/lsSnpStructure.sql
mysql> load data local infile "snp-human2-function-predictions.txt" into table lsSnpFunction;
Query OK, 7689 rows affected (0.52 sec)
mysql> load data local infile "snp-human2-structure-predictions.txt" into table lsSnpStructure;
Query OK, 28144 rows affected (2.39 sec)
# gc5Base wiggle TRACK (DONE - 2004-03-12 - Hiram)
# reloaded wib files 2005-05-17 to place them in /gbdb/hg16/wib/gc5Base
# a demonstration wiggle track. Perform a GC count with a 5-base
# window. Also compute a "zoomed" view for display efficiency.
mkdir /cluster/data/hg16/bed/gc5Base
cd /cluster/data/hg16/bed/gc5Base
# in the script below, the 'grep -w GC' selects the lines of
# output from hgGcPercent that are real data and not just some
# information from hgGcPercent. The awk computes the number
# of bases that hgGcPercent claimed it measured, which is not
# necessarily always 5 if it ran into gaps, and then the division
# by 10.0 scales down the numbers from hgGcPercent to the range
# [0-100]. Two columns come out of the awk print statement:
# <position> and <value> which are fed into wigAsciiToBinary through
# the pipe. It is set at a dataSpan of 5 because each value
# represents the measurement over five bases beginning with
# <position>. The result files end up in ./wigData5.
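# (for illustration only, with made-up numbers: an hgGcPercent line
# like "chr1 1000 1005 GC 600" becomes "1001 60.0" after the awk --
# the 1-based start, and the per-mille GC value scaled to [0-100])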
cat << '_EOF_' > runGcPercent.sh
#!/bin/sh
mkdir -p wigData5
mkdir -p dataLimits5
for n in ../../nib/*.nib
do
c=`basename ${n} | sed -e "s/.nib//"`
C=`echo $c | sed -e "s/chr//"`
echo -n "working on ${c} - ${C} ... "
hgGcPercent -chr=${c} -doGaps \
-file=stdout -win=5 hg16 ../../nib | grep -w GC | \
awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \
wigAsciiToBinary \
-dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \
-name=${C} stdin 2> dataLimits5/${c}
echo "done"
done
'_EOF_'
chmod +x runGcPercent.sh
# This is going to take perhaps two hours to run. It is a lot of
# data. make sure you do it on the fileserver:
ssh eieio
cd /cluster/data/hg16/bed/gc5Base
./runGcPercent.sh
# load the .wig files back on hgwdev:
ssh hgwdev
cd /cluster/data/hg16/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/hg16/wib/gc5Base hg16 gc5Base wigData5/*.wig
# and symlink the .wib files into /gbdb
mkdir /gbdb/hg16/wib/gc5Base
ln -s `pwd`/wigData5/*.wib /gbdb/hg16/wib/gc5Base
# to speed up display for whole chromosome views, compute a "zoomed"
# view and load that on top of the existing table. The savings
# comes from the number of data table rows the browser needs to load
# for a full chromosome view. Without the zoomed view there are
# over 43,000 data rows for chrom 1. With the zoomed view there are
# only 222 rows needed for the display. If your original data was
# at 1 value per base the savings would be even greater.
# Pretty much the same data calculation
# situation as above, although this time note the use of the
# 'wigZoom -dataSpan=1000 stdin' in the pipeline. This will average
# together the data points coming out of the awk print statement over
# a span of 1000 bases. Thus each <position> coming out of wigZoom
# will represent the measurement of GC in the next 1000 bases. Note
# the use of -dataSpan=1000 on the wigAsciiToBinary to account for
# this type of data. You want your dataSpan here to be an exact
# multiple of your original dataSpan (5*200=1000) and on the order
# of at least 1000, doesn't need to go too high. For data that is
# originally at 1 base per value, a convenient span is: -dataSpan=1024
# A new set of result files ends up in ./wigData5_1K/*.wi[gb]
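# (arithmetic behind the row counts above: each data row covers
# binsize*dataSpan bases, so at dataSpan=5 chr1 (~246 Mb) needs about
# 246e6/(1024*5) ~= 48,000 rows, versus about 246e6/(1024*1000) ~= 240
# rows at dataSpan=1000 -- roughly the 43,000 and 222 quoted above)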
cat << '_EOF_' > runZoom.sh
#!/bin/sh
mkdir -p wigData5_1K
mkdir -p dataLimits5_1K
for n in ../../nib/*.nib
do
c=`basename ${n} | sed -e "s/.nib//"`
C=`echo $c | sed -e "s/chr//"`
echo -n "working on ${c} - ${C} ... "
hgGcPercent -chr=${c} -doGaps \
-file=stdout -win=5 hg16 ../../nib | grep -w GC | \
awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \
wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \
-dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \
-name=${C} stdin 2> dataLimits5_1K/${c}
echo "done"
done
'_EOF_'
chmod +x runZoom.sh
# This is going to take even longer than above, certainly do this
# on the fileserver
ssh eieio
time ./runZoom.sh
# real 232m3.265s
# user 302m37.050s
# sys 16m13.770s
# Then load these .wig files into the same database as above
ssh hgwdev
hgLoadWiggle -pathPrefix=/gbdb/hg16/wib/gc5Base -oldTable hg16 gc5Base \
wigData5_1K/*.wig
# and symlink these .wib files into /gbdb
mkdir -p /gbdb/hg16/wib/gc5Base
ln -s `pwd`/wigData5_1K/*.wib /gbdb/hg16/wib/gc5Base
# KNOWN GENES TRACK (STARTED - 2004-01-15 - with Gene Sorter complete
# 2004-02-17 Hiram)
# you will probably need to make the programs in kent/src/hg/protein
cd ~/kent/src/hg/protein
make
# The scripts run below will check for programs and let you know
# which ones are missing
# obtain new SwissProt database (should be done about once a month)
# the swiss prot data is currently living on store5, first step is
# on the fileserver. This script was used once as it was created,
# it may need to be verified and improved as it is used again. See
# comments at the top of the script.
ssh eieio
cd /cluster/data/swissprot
~/kent/src/hg/protein/mkSwissProtDB.sh
# that obtains the data and unpacks it, second step is on hgwdev
# to create the database
ssh hgwdev
cd /cluster/data/swissprot
~/kent/src/hg/protein/mkSwissProtDB.sh
# Now the proteins database can be created from that. Must be on hgwdev.
# Again, this is a script that has been used only once, upon creation;
# see the comments in it. For example, it currently assumes these two
# scripts have been run on the same day. In this case, 03112
ssh hgwdev
cd /cluster/data/proteins
~/kent/src/hg/protein/mkProteinsDB.sh
# with those two databases existing, we are ready for the actual known genes
# track build. Must be on hgwdev since it is all mostly database
# operations. The {Date} argument is the date stamp created by the
# above two scripts. Something of the form YYMMDD, e.g.: 031112
# Again, a script that has been used only once at creation, see
# comments at top of script.
ssh hgwdev
mkdir /cluster/data/hg16/bed/knownGenes
cd /cluster/data/hg16/bed/knownGenes
DateStamp=040115
~/kent/src/hg/protein/KGprocess.sh ${DateStamp}
# that runs to a point where it prepares data and jobList for a
# cluster run. Continue with a cluster run on kk
ssh kk
cd /cluster/data/hg16/bed/knownGenes/kgBestMrna
para create jobList
para try
para check
para push
# this is a quick cluster job. Less than five minutes. e.g.:
# Completed: 43580 of 43580 jobs
# CPU time in finished jobs: 114636s 1910.60m 31.84h 1.33d 0.004 y
# IO & Wait Time: 111889s 1864.82m 31.08h 1.30d 0.004 y
# Average job time: 5s 0.09m 0.00h 0.00d
# Longest job: 9s 0.15m 0.00h 0.00d
# Submission to last job: 282s 4.70m 0.08h 0.00d
# Continuing back on hgwdev, run the same script again
ssh hgwdev
cd /cluster/data/hg16/bed/knownGenes
DateStamp=040115
~/kent/src/hg/protein/KGprocess.sh ${DateStamp}
# that should run to completion and the known genes track is ready
# Add the proteins link into gdbPdb.hgcentral:
hgsql -e 'INSERT INTO gdbPdb (genomeDb, proteomeDb) \
VALUES ("hg16","proteins040115");' \
-h genome-testdb hgcentraltest
# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProAlias tables. The hgKgGetText takes
# about 5 minutes when the database is not too busy. The rest
# is real quick.
ssh hgwdev
cd /cluster/data/hg16/bed/knownGenes.2004-01-29
mkdir index
cd index
hgKgGetText hg16 knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
ln -s /cluster/data/hg16/bed/knownGenes.2004-01-29/index/knownGene.ix /gbdb/hg16/knownGene.ix
ln -s /cluster/data/hg16/bed/knownGenes.2004-01-29/index/knownGene.ixx /gbdb/hg16/knownGene.ixx
# VEGA GENES UPDATE from 2004/01/15 below (2004-02-04 - Hiram)
mv ~/hg16/bed/vega ~/hg16/bed/vega.badcds
mkdir /cluster/data/hg16/bed/vegaUpdate
cd /cluster/data/hg16/bed/vegaUpdate
wget --timestamping ftp://ftp.sanger.ac.uk/pub/searle/*.gtf.gz
# Load genes and Immunoglobulin/Pseudogenes into 2 separate tracks. Just
# omit snoRNAs, as there are so few of them
zcat *.gtf.gz | awk '!(/small nucleolar RNA/ || $2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}' > vegaGene.gtf
zcat *.gtf.gz | awk '($2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}' > vegaPseudoGene.gtf
ldHgGene -gtf hg16 vegaGeneUpdate vegaGene.gtf
ldHgGene -gtf hg16 vegaPseudoGeneUpdate vegaPseudoGene.gtf
wget http://www.sanger.ac.uk/Users/keenan/vega_pep_dump_ncbi34.fa.gz
hgPepPred hg16 generic vegaPep vega_pep_dump_ncbi34.fa
vegaBuildInfo vega_homo_sapiens_core_4_0.gtf vegaInfo.tab
hgsql hg16 < ~/kent/src/hg/lib/vegaInfo.sql
hgsql -e "load data local infile 'vegaInfo.tab' into table vegaInfo" hg16
# LOAD VEGA GENES AND PSEUDOGENES (reloaded 2004/01/15 markd)
# reloaded due to bug in creating bogus CDS
mv ~/hg16/bed/vega ~/hg16/bed/vega.badcds
mkdir ~/hg16/bed/vega
cd ~/hg16/bed/vega
wget http://www.sanger.ac.uk/Users/keenan/vega_homo_sapiens_ncbi34.gtf.gz
# Load genes and Immunoglobulin/Pseudogenes into 2 separate tracks. Just
# omit snoRNAs, as there are so few of them
zcat vega_homo_sapiens_ncbi34.gtf.gz | awk '!(/small nucleolar RNA/ || $2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}' > vegaGene.gtf
zcat vega_homo_sapiens_ncbi34.gtf.gz | awk '($2 == "Pseudogene" || $2 == "Ig_Pseudogene_Segment" || $2 == "Ig_Segment") {print "chr"$0}' > vegaPseudoGene.gtf
ldHgGene -gtf hg16 vegaGene vegaGene.gtf
ldHgGene -gtf hg16 vegaPseudoGene vegaPseudoGene.gtf
wget http://www.sanger.ac.uk/Users/keenan/vega_pep_dump_ncbi34.fa.gz
hgPepPred hg16 generic vegaPep vega_pep_dump_ncbi34.fa
vegaBuildInfo vega_homo_sapiens_core_4_0.gtf vegaInfo.tab
hgsql hg16 < ~/kent/src/hg/lib/vegaInfo.sql
hgsql -e "load data local infile 'vegaInfo.tab' into table vegaInfo" hg16
# KNOWN GENES UPDATE (DONE - 2004-01-29 - Hiram)
# RELOADED THE cgapBiocDesc AND cgapAlias TABLES TO REMOVE REPLICATED ROWS
# (DONE, 2005-07-26, hartera)
# RELOADED cgapAlias AGAIN AS TOO MANY ROWS REMOVED BEFORE (hartera, 2005-10-06)
# update swissProt and proteins databases
# You want to run these two scripts on the same day to keep the
# date stamp consistent. In this case the date stamp is 040115
ssh eieio
cd /cluster/data/swissprot
~kent/src/hg/protein/mkSwissProtDB.sh
# that obtains the data and unpacks it, second step is on hgwdev
# to create the database
ssh hgwdev
cd /cluster/data/swissprot
~/kent/src/hg/protein/mkSwissProtDB.sh
# Now the proteins database can be created from that. Must be on
# hgwdev
ssh hgwdev
cd /cluster/data/proteins
~/kent/src/hg/protein/mkProteinsDb.sh 040115
# prepare all the tables in a temporary database, then move
# into Hg16. Leave a link in hg16/bed so it can be found
mkdir /cluster/data/kgDB/bed/hg16
ln -s /cluster/data/kgDB/bed/hg16 \
/cluster/data/hg16/bed/knownGenes.2004-01-29
cd /cluster/data/kgDB/bed/hg16
~/kent/src/hg/protein/KGprocess.sh kgDB hg16 040115
# That runs to a point that prepares a cluster job, continuing on kk
ssh kk
cd /cluster/data/kgDB/bed/hg16/kgBestMrna
para create jobList
para try
para push
... etc ...
# on a busy cluster, takes almost an hour:
# Completed: 46583 of 46583 jobs
# CPU time in finished jobs: 127351s 2122.51m 35.38h 1.47d 0.004 y
# IO & Wait Time: 119182s 1986.37m 33.11h 1.38d 0.004 y
# Average job time: 5s 0.09m 0.00h 0.00d
# Longest job: 14s 0.23m 0.00h 0.00d
# Submission to last job: 3513s 58.55m 0.98h 0.04d
# Continuing back on hgwdev, run the same script again
ssh hgwdev
cd /cluster/data/kgDB/bed/hg16
~/kent/src/hg/protein/KGprocess.sh kgDB hg16 040115
# should continue to completion, all tables are in kgDB and can be
# moved if they check out to be similar to existing tables in hg16
# You can verify table sizes with the script:
~kent/src/hg/protein/checkTbls.pl kgDB
~kent/src/hg/protein/checkTbls.pl hg16 kg
# should have similar row counts in each of these outputs
# This rename can be done more simply with the 'rename' command
# instead of the 'alter table' used here.
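# e.g., a hypothetical one-line equivalent of the alter-table line below:
#   hgsql -e "rename table ${SOURCE}.${T} to ${TARGET}.${T}" mysql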
cat << '_EOF_' > renameTables.sh
#!/bin/sh
SOURCE=kgDB
TARGET=hg16
for T in cgapAlias cgapBiocDesc cgapBiocPathway dupSpMrna \
keggMapDesc keggPathway kgAlias kgProtAlias kgXref \
knownGene knownGeneLink knownGeneMrna knownGenePep mrnaRefseq spMrna
do
hgsql -e "drop table ${T};" ${TARGET}
hgsql -e "alter table ${SOURCE}.${T} rename ${TARGET}.${T}" mysql
echo "done $T"
done
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x renameTables.sh
./renameTables.sh
# RELOAD THE cgapBiocDesc AND cgapAlias TABLES (hartera, 2005-07-26)
# Reload the cgapBiocDesc and cgapAlias tables as they have replicated
# rows. Need to sort and unique the file before loading into the database.
cd /cluster/data/kgDB/bed/hg16
sort -u cgapBIOCARTAdesc.tab > cgapBIOCARTAdescSorted.tab
# for cgapAlias, the number of rows in the table is different from
# the tab file here, so dump the table first.
# RELOAD cgapAlias AGAIN AS sort -nu REMOVES MORE ROWS THAN sort -u
# OR sort -n | uniq.
# USE sort -n then uniq TO SORT ON THE IDs AND THEN UNIQ (hartera, 2005-10-06)
# hgsql -N -e 'select * from cgapAlias;' hg16 > cgapAliasDump.txt
# above command used to get alias file from hg16 before sorting
sort -n cgapAliasDump.txt | uniq > cgapAliasDumpSorted.tab
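# (toy demonstration of the sort -nu pitfall noted above: under -n the
# comparison key is only the leading numeric value, so -u collapses
# rows that share an ID but differ in later fields)
#   printf '1\tfoo\n1\tbar\n' | sort -nu        # one row survives
#   printf '1\tfoo\n1\tbar\n' | sort -n | uniq  # both rows survive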
hgsql hg16 -e "drop table cgapBiocDesc"
hgsql hg16 -e "drop table cgapAlias"
hgsql hg16 < ~/kent/src/hg/lib/cgapBiocDesc.sql
hgsql hg16 < ~/kent/src/hg/lib/cgapAlias.sql
hgsql hg16 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" \
into table cgapBiocDesc'
hgsql hg16 -e 'load data local infile "cgapAliasDumpSorted.tab" \
into table cgapAlias'
# the following extra process will be included in the next version
# of KGprocess.sh to create the kgProtMap table:
mkdir /cluster/data/kgDB/bed/hg16/kgProtMap
cd /cluster/data/kgDB/bed/hg16/kgProtMap
awk '{print ">" $1;print $2}' ../refMrna.tab > kgMrna.fa
/scratch/blast/formatdb -i kgMrna.fa -p F
echo "`date` creating kgPep.fa"
hgsql -N -e 'select spID,seq from kgXref,knownGenePep where kgID=name' hg16 \
| awk '{print ">" $1;print $2}' >kgPep.fa
rm -fr kgPep
rm -f jobList
mkdir kgPep
faSplit sequence kgPep.fa 5000 kgPep/kgPep
for f in kgPep/*.fa
do
echo ./kgProtBlast.csh $f >> jobList
done
awk '{printf "%s\t%s\n", $3,$2}' ../kgXref.tab > kgProtMrna.pairs
# run a cluster job
ssh kk9
cd /cluster/data/kgDB/bed/hg16/kgProtMap
para create jobList
para try
para push ... etc
# Completed: 4949 of 4949 jobs
# CPU time in finished jobs: 1061454s 17690.90m 294.85h 12.29d 0.034 y
# IO & Wait Time: 13400s 223.33m 3.72h 0.16d 0.000 y
# Average job time: 217s 3.62m 0.06h 0.00d
# Longest job: 996s 16.60m 0.28h 0.01d
# Submission to last job: 12152s 202.53m 3.38h 0.14d
# back to hgwdev
ssh hgwdev
cd /cluster/data/kgDB/bed/hg16/kgProtMap
find ./psl.tmp -name '*.psl.gz' | xargs zcat | \
pslReps -nohead stdin psl.tmp/kgProtMrna.psl /dev/null
cd psl.tmp
(pslMap kgProtMrna.psl ../../tight_mrna.psl stdout | \
sort -k 14,14 -k 16,16n -k 17,17n > kgProtMap.psl) > kgProtMap.out 2>&1
# this table data is ready to load, verify it by comparison with
# existing kgProtMap data, then load:
hgLoadPsl hg16 kgProtMap.psl
# MAKE LINEAGE-SPECIFIC REPEATS FOR CHICKEN (DONE 2/18/04 angie)
# In an email 2/13/04, Arian said we could treat all human repeats as
# lineage-specific for human-chicken blastz. Scripts expect *.out.spec
# filenames, so set that up:
ssh kkr1u00
cd /cluster/data/hg16
mkdir /iscratch/i/gs.17/build34/linSpecRep.Chicken
foreach f (/scratch/hg/gs.17/build34/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/gs.17/build34/linSpecRep.Chicken/$f:t:r:r.out.spec
end
iSync
# Use these the next time we run human-chicken blastz.
# BLASTZ CHICKEN (GALGAL2) (DONE 2/26/04 angie)
ssh kk
# space is awful tight on store4 -- use store7.
mkdir -p /cluster/store7/hg16/bed/blastz.galGal2.2004-02-25
ln -s /cluster/store7/hg16/bed/blastz.galGal2.2004-02-25 \
/cluster/data/hg16/bed/
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
# Set L=10000 (higher threshold on blastz's outer loop) and abridge
# repeats.
cat << '_EOF_' > DEF
# human vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.Chicken
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken
SEQ2_DIR=/iscratch/i/galGal2/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/store7/hg16/bed/blastz.galGal2.2004-02-25
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# first cluster run: raw blastz alignments
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, check, push, check, ....
#Completed: 51189 of 51189 jobs
#Average job time: 477s 7.95m 0.13h 0.01d
#Longest job: 2318s 38.63m 0.64h 0.03d
#Submission to last job: 29598s 493.30m 8.22h 0.34d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 339 of 339 jobs
#Average job time: 6s 0.11m 0.00h 0.00d
#Longest job: 21s 0.35m 0.01h 0.00d
#Submission to last job: 150s 2.50m 0.04h 0.00d
# third run: lav -> axt
ssh kki
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/galGal2/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 42 of 42 jobs
#Average job time: 38s 0.64m 0.01h 0.00d
#Longest job: 147s 2.45m 0.04h 0.00d
#Submission to last job: 147s 2.45m 0.04h 0.00d
# RUN AXTBEST AND GENERATE MAF FOR MULTIZ (DONE 2/26/04 angie)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
mkdir axtBest pslBest
foreach chrdir (lav/chr*)
set chr=$chrdir:t
echo axtBesting $chr
axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/$chr.psl
end
mkdir mafBest
foreach f (axtBest/chr*.axt)
set maf = mafBest/$f:t:r.hg.maf
axtToMaf $f \
/cluster/data/hg16/chrom.sizes /cluster/data/galGal2/chrom.sizes \
$maf -tPrefix=hg16. -qPrefix=galGal2.
end
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
cat pslBest/chr*.psl | hgLoadPsl -table=blastzBestGalGal2 hg16 stdin
# CHAIN CHICKEN BLASTZ (DONE 2/26/04 angie)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Make our own linear gap file with reduced gap penalties,
# in hopes of getting longer chains:
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtFilter -notQ=chrUn $1 \
| axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../chickenHumanTuned.gap \
-minScore=5000 stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/galGal2/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# axtChrom/chr1{8,9}_random.axt are empty, so the {out line +} checks
# failed:
#Completed: 40 of 42 jobs
#Crashed: 2 jobs
#Average job time: 28s 0.46m 0.01h 0.00d
#Longest job: 76s 1.27m 0.02h 0.00d
#Submission to last job: 92s 1.53m 0.03h 0.00d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg16 ${c}_chainGalGal2 $i
end
# RESCORE CHICKEN BLASTZ (DONE 3/1/04 angie)
# Webb noticed low scores in latest runs with repeats abridged --
# PSU's restore_rpts program rescored alignments with default matrix
# instead of BLASTZ_Q matrix. Rescore them here so the chainer sees
# the higher scores:
ssh kolossus
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
mkdir axtChrom.rescore
foreach f (axtChrom/chr*.axt)
axtRescore -scoreScheme=/cluster/data/blastz/HoxD55.q \
$f axtChrom.rescore/$f:t
end
mv axtChrom axtChrom.orig
mv axtChrom.rescore axtChrom
# NET HUMAN BLASTZ (DONE 2/26/04 angie)
ssh kksilo
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
netClass noClass.net hg16 galGal2 human.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg16 netGalGal2 stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg16 netSyntenyGalGal2 stdin
# Add entries for chainGalGal2, netGalGal2, syntenyGalGal2 to
# human/hg16 trackDb
# MAKE VSGALGAL2 DOWNLOADABLES (DONE 3/1/04 angie)
ssh kksilo
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
# Webb asked for axtChrom/chr22.axt... since axtChrom is rel. small
# this time, just put it all out there.
zip /cluster/data/hg16/zip/GGaxtChrom.zip axtChrom/chr*.axt
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
cp all.chain chicken.chain
zip /cluster/data/hg16/zip/chicken.chain.zip chicken.chain
rm chicken.chain
cp human.net chicken.net
zip /cluster/data/hg16/zip/chicken.net.zip chicken.net
rm chicken.net
cp humanSyn.net chickenSyn.net
zip /cluster/data/hg16/zip/chickenSyn.net.zip chickenSyn.net
rm chickenSyn.net
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/hg16/vsGalGal2
cd /usr/local/apache/htdocs/goldenPath/hg16/vsGalGal2
mv /cluster/data/hg16/zip/GGaxtChrom.zip axtChrom.zip
mv /cluster/data/hg16/zip/chicken*.zip .
md5sum *.zip > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# MULTIZ HUMAN/MOUSE/RAT/GALGAL2 (DONE 3/8/04 angie)
# (galGal2 added to human/mouse/rat alignments described above [HUMOR])
# put the MAFs on bluearc
ssh eieio
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2/hmr
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2/hg
cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf \
/cluster/bluearc/multiz.hg16mm3rn3galGal2/hmr
cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf \
/cluster/bluearc/multiz.hg16mm3rn3galGal2/hg
ssh kki
mkdir /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2
mkdir hmrg
# Wrapper script required because of stdout redirect:
cat << '_EOF_' > doMultiz
#!/bin/csh
/cluster/bin/penn/multiz $1 $2 - > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doMultiz
rm -f jobList
foreach file (/cluster/bluearc/multiz.hg16mm3rn3galGal2/hmr/*.maf)
set root=$file:t:r:r
echo "doMultiz /cluster/bluearc/multiz.hg16mm3rn3galGal2/hg/${root}.hg.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/hmrg/${root}.maf" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 41 of 41 jobs
#Average job time: 88s 1.47m 0.02h 0.00d
#Longest job: 276s 4.60m 0.08h 0.00d
#Submission to last job: 278s 4.63m 0.08h 0.00d
# clean up bluearc (these are big files!)
rm -r /cluster/bluearc/multiz.hg16mm3rn3galGal2
# setup external files for database reference
ssh hgwdev
mkdir -p /gbdb/hg16/multizMm3Rn3GalGal2
ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/hmrg/*.maf \
/gbdb/hg16/multizMm3Rn3GalGal2
# load into database
/cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3GalGal2
# LOAD SOFTBERRY GENES (DONE - 2004-02-10 - Hiram)
mkdir -p /cluster/data/hg16/bed/softberry
cd /cluster/data/hg16/bed/softberry
set file = Soft_fgenesh_jul03.tar.gz
wget --timestamping ftp://www.softberry.com/pub/SC_HUM_JUL03/$file
tar xzvf $file
ldHgGene hg16 softberryGene fgenesh_jul03/chr*.gff
hgPepPred hg16 softberry fgenesh_jul03/*.protein
hgSoftberryHom hg16 fgenesh_jul03/*.protein
# CHIMP (panTro1) ALIGNMENTS (2004-02-12 kate)
# lift scaffold-based reciprocal best chains to chrom coordinates
ssh eieio
cd /cluster/data/hg16
mkdir -p bed/blastz-blat.panTro1
cd bed/blastz-blat.panTro1
cp /cluster/data/pt0/bed/blastz-blatHg16/human.best.chain \
best.scaffolds.chain
cp /cluster/data/panTro1/jkStuff/scaffolds.lft scaffolds.lft
~kate/bin/i386/liftUp -chainQ best.chain scaffolds.lft \
warn best.scaffolds.chain
#Make a track from Tarjei's chimp deletions file (2/12/04, kpollard)
# 80-12000 bp indels in human/chimp alignments
#make .bed files from Tarjei's .fa files
cd /cluster/data/panTro1/bed/indels
/cluster/bin/i386/faSimplify indels.human.fa , , temp.fa
/cluster/bin/i386/faSize detailed=on temp.fa > human.start.txt
/cluster/bin/i386/faSimplify indels.human.fa ">" , temp.fa
/cluster/bin/i386/faSize detailed=on temp.fa > human.chr.txt
R
#Commands in R
chr<-read.table("human.chr.txt") #read in chromosome and size
start<-read.table("human.start.txt") #read in start and size
both<-cbind(chr,start) #concatenate: chrN size start size
sum(both[,2]!=both[,4]) #check that the size columns are identical
#0
both[,4]<-both[,2]+both[,3] #add start and size to get stop
both<-both[,c(1,3,4,2)] #reorder columns to get chrN start stop size
both[,4]<-paste("CD",1:length(both[,4]),"_",both[,4],sep="") #make name like CDN_size
write(t(both),"indels.human.bed",ncol=4) #write bed file
q() #quit
#delimit with tabs
cat indels.human.bed | gawk '{print $1"\t"$2"\t"$3"\t"$4}' > indels.human.tab.bed
#load track into browser
mkdir -p /gbdb/hg16/hg_insert
ln -s /cluster/data/panTro1/bed/indels/indels.human.tab.bed /gbdb/hg16/hg_insert
cd /cluster/data/panTro1/bed/indels
/cluster/bin/i386/hgLoadBed hg16 hg_insert indels.human.tab.bed
#change name to chimpDels
hgsql hg16
rename table hg_insert to chimpDels;
exit
#add description file chimpDels.html
# to ~/kent/src/hg/makeDb/trackDb/human/hg16
#add a track entry to trackDb.ra
# in ~/kent/src/hg/makeDb/trackDb/human/hg16
# FAMILY BROWSER UPDATE (DONE - 2004-02-17 - Hiram)
# to be done after knownGene tables are complete from known gene
# process.
#
# Cluster together various alt-splicing isoforms.
# Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/hg16/bed/famBro.2004-02-17
ln -s /cluster/data/hg16/bed/famBro.2004-02-17 /cluster/data/hg16/bed/famBro
cd /cluster/data/hg16/bed/famBro
hgClusterGenes hg16 knownGene knownIsoforms knownCanonical
# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/hg16/bed/famBro/blastp
cd /cluster/data/hg16/bed/famBro/blastp
pepPredToFa hg16 knownGenePep known.faa
# You may need to build this binary in src/hg/near/pepPredToFa
/scratch/blast/formatdb -i known.faa -t known -n known
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy over database to bluearc
rm -fr /cluster/bluearc/hg16/blastp
mkdir -p /cluster/bluearc/hg16/blastp
cp -p /cluster/data/hg16/bed/famBro/blastp/known.* /cluster/bluearc/hg16/blastp
# Load up cluster/bluearc with blastp and related files
# if necessary
if (! -e /cluster/bluearc/blast/blastall) then
mkdir -p /cluster/bluearc/blast
cp /projects/compbio/bin/i686/blastall /cluster/bluearc/blast
mkdir -p /cluster/bluearc/blast/data
cp /projects/compbio/bin/i686/data/* /cluster/bluearc/blast/data
endif
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/hg16/bed/famBro/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg
# Make parasol run directory (this would not work on kk, use kk9 instead)
# Need to check the difference between the blast in /scratch/blast
# and this /cluster/bluearc/blast
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/self
cd /cluster/data/hg16/bed/famBro/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data
export BLASTMAT
/cluster/bluearc/blast/blastall -p blastp \
-d /cluster/bluearc/hg16/blastp/known -i $1 -o $2 \
-e 0.01 -m 8 -b 1000
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
# 'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# This should finish in ~15 minutes if the cluster is free.
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 73213s 1220.22m 20.34h 0.85d 0.002 y
# IO & Wait Time: 20054s 334.23m 5.57h 0.23d 0.001 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest job: 118s 1.97m 0.03h 0.00d
# Submission to last job: 1117s 18.62m 0.31h 0.01d
# Load into database. This takes about an hour.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/self/run/out
hgLoadBlastTab hg16 knownBlastTab *.tab
# Scanning through 7748 files
# Loading database with 11376875 rows
cd /cluster/data/hg16/bed/famBro
# Create table that maps between known genes and RefSeq
hgMapToGene hg16 refGene knownGene knownToRefSeq
# may need to build this command in src/hg/near/hgMapToGene
# row count changed from 32674 to 35416
# Create table that maps between known genes and LocusLink
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg16 \
> refToLl.txt
hgMapToGene hg16 refGene knownGene knownToLocusLink -lookup=refToLl.txt
# row count went from 32845 to 35146
# Create table that maps between known genes and Pfam domains
hgMapViaSwissProt hg16 knownGene name proteinID Pfam knownToPfam
# row count went from 31201 to 32225
# JK Fixed bug that let multiple identical columns happen in knownToPfam
# on April 15, 2004. Row count now 30467
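    # A quick duplicate check after that fix (sketch; assumes the usual
    # knownTo* name/value schema):
    hgsql hg16 -e 'select name,value,count(*) as n from knownToPfam group by name,value having n>1 limit 10'
    # no rows expected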
# Create table to map between known genes and GNF Atlas2
# expression data.
hgMapToGene hg16 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
# Create expression distance table - takes about an hour
# (Regenerated April 16, 2004 in response to knownToGnfAtlas2 update)
hgExpDistance hg16 hgFixed.gnfHumanAtlas2MedianRatio \
hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
-lookup=knownToGnfAtlas2
# Create a table that maps between known genes and
# the nice affy expression data.
hgMapToGene "-type=bed 12" hg16 affyUcla knownGene knownToU133
# row count went from 34148 to 36818
# Create expression distance table. This will take about an hour.
cd ~/kent/src/hg/near/hgExpDistance
time hgExpDistance hg16 affyUcla affyUclaExp knownExpDistance \
-weights=affyUcla.weight -lookup=knownToU133
# 42 genes, 42 weights, 26.500000 total wieght
# Got 36818 unique elements in affyUcla
# Made knownExpDistance.tab
# Loaded knownExpDistance
# Made query index
# real 80m50.113s
# user 62m33.290s
# sys 2m15.200s
# This command should be done elsewhere, /tmp or something like that
# It makes a temporary .tab file of almost 1 Gb
# row count went from 34148000 to 36818000
# Create table that maps between known genes and
# the GNF data.
hgMapToGene hg16 affyU95 knownGene knownToU95
cd /tmp
# the hgFixed.gnfHumanU95Exps argument is unused, so the table need not exist
hgExpDistance hg16 hgFixed.gnfHumanU95MedianRatio hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95
# row count went from 11718000 to 17330000
# original makeNear.doc had this as:
# hgExpDistance hg16 affyGnfU95 affyGnfU95Exps knownGnfDistance -lookup=knownToU95
# Make sure that GO database is up to date.
# See README in /cluster/store1/geneOntology.
# I update this GO database very carefully, checking that all
# structures in it remain the same from release to release and
# backing up the current go DB in a backup database. In this case
# the backup is go040107 - when it was loaded for Mm4, and the new
# go database is based on data from Dec 17th 2003 and Feb 2004 according
# to the time stamp on the fetched data. This build was done in
# /cluster/store1/geneOntology/20040217
cd /cluster/data/hg16/bed/famBro
# Create knownToEnsembl column
hgMapToGene hg16 ensGene knownGene knownToEnsembl
# table row count went from previous version: 36068 to 38251
# Make knownToCdsSnp column. This is a little complicated by
# having to merge data from the snpTsc and the snpNih tracks.
hgMapToGene hg16 snpTsc knownGene knownToCdsSnp -createOnly -all -cds
hgMapToGene hg16 snpTsc knownGene snp1 -noLoad -all -cds
hgMapToGene hg16 snpNih knownGene snp2 -noLoad -all -cds
sort snp1.tab snp2.tab > knownToCdsSnp.tab
rm snp1.tab snp2.tab
hgsql \
-e 'load data local infile "knownToCdsSnp.tab" into table knownToCdsSnp;' \
hg16
# row count went from 87273 to 106199
# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/ce1/blastp should have data
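    # A quick sanity check before launching the run (sketch; assumes the
    # database was built with formatdb, which writes .phr/.pin/.psq files):
    foreach ext (phr pin psq)
        if (! -e /cluster/bluearc/ce1/blastp/wormPep.$ext) echo "missing wormPep.$ext"
    end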
# Create the ceBlastTab (the blastall binary only works on kk9 for now ...)
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/ce1
cd /cluster/data/hg16/bed/famBro/blastp/ce1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
-p blastp -d /cluster/bluearc/ce1/blastp/wormPep \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# This should finish in ~10 minutes if the cluster is free.
# Here's the para time results
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 28869s 481.16m 8.02h 0.33d 0.001 y
# IO & Wait Time: 20454s 340.89m 5.68h 0.24d 0.001 y
# Average job time: 6s 0.11m 0.00h 0.00d
# Longest job: 52s 0.87m 0.01h 0.00d
# Submission to last job: 584s 9.73m 0.16h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/ce1/run/out
hgLoadBlastTab hg16 ceBlastTab -maxPer=1 *.tab
# row count went from 25599 to 26958
# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This already exists. See makeMm4.doc for procedure
# the directory: /cluster/bluearc/mm4/blastp should have data
# Make parasol run directory
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/mm4
cd /cluster/data/hg16/bed/famBro/blastp/mm4
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
-p blastp -d /cluster/bluearc/mm4/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
# (wordLine wouldn't run on kk9:
# wordLine: /lib/i686/libc.so.6: version `GLIBC_2.3' not found
# run this echo statement on hgwdev
# this echo trick is used because otherwise the command line is
# too long and you cannot do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# takes about 15 minutes:
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 54179s 902.98m 15.05h 0.63d 0.002 y
# IO & Wait Time: 20428s 340.47m 5.67h 0.24d 0.001 y
# Average job time: 10s 0.16m 0.00h 0.00d
# Longest job: 76s 1.27m 0.02h 0.00d
# Submission to last job: 2031s 33.85m 0.56h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/mm4/run/out
hgLoadBlastTab hg16 mmBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# Loading database with 35611 rows
# row count went from 33191 to 35611
# REFSEQ HOMOLOGS (DONE 6/18/04 angie)
# Translate mmBlastTab's knownGene acc's into RefSeq where possible,
# since our users frequently ask for help in determining homologs for
# human/mouse RefSeq accs...
ssh hgwdev
hgsql hg16 -e \
'create table mmRefSeqHomolog \
select hg16.knownToRefSeq.value as name, \
mm3.knownToRefSeq.value as homolog, \
mmBlastTab.identity, mmBlastTab.aliLength, mmBlastTab.mismatch, \
mmBlastTab.gapOpen, mmBlastTab.qStart, mmBlastTab.qEnd, \
mmBlastTab.tStart, mmBlastTab.tEnd, mmBlastTab.eValue , \
mmBlastTab.bitScore \
from mmBlastTab, hg16.knownToRefSeq, mm3.knownToRefSeq \
where hg16.knownToRefSeq.name = mmBlastTab.query and \
mm3.knownToRefSeq.name = mmBlastTab.target;'
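    # Example use of the new table (sketch; NM_000546 is the human TP53
    # RefSeq acc, used here only for illustration):
    hgsql hg16 -e "select homolog, bitScore from mmRefSeqHomolog where name='NM_000546' order by bitScore desc limit 3"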
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dr1/blastp should have data
# Make parasol run directory
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/dr1
cd /cluster/data/hg16/bed/famBro/blastp/dr1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
-p blastp -d /cluster/bluearc/dr1/blastp/ensembl \
-i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 40575s 676.24m 11.27h 0.47d 0.001 y
# IO & Wait Time: 19781s 329.69m 5.49h 0.23d 0.001 y
# Average job time: 8s 0.13m 0.00h 0.00d
# Longest job: 95s 1.58m 0.03h 0.00d
# Submission to last job: 2036s 33.93m 0.57h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/dr1/run/out
hgLoadBlastTab hg16 drBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# Loading database with 32204 rows
# row count went from 30339 to 32204
# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/sc1/blastp should have data
# Make parasol run directory
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/sc1
cd /cluster/data/hg16/bed/famBro/blastp/sc1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 8577s 142.96m 2.38h 0.10d 0.000 y
# IO & Wait Time: 19756s 329.26m 5.49h 0.23d 0.001 y
# Average job time: 4s 0.06m 0.00h 0.00d
# Longest job: 15s 0.25m 0.00h 0.00d
# Submission to last job: 1172s 19.53m 0.33h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/sc1/run/out
hgLoadBlastTab hg16 scBlastTab -maxPer=1 *.tab
# row count went from 17089 to 17886
# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# if it doesn't exist already
# This is already done, see makeMm3.doc for procedure
# the directory: /cluster/bluearc/dm1/blastp should have data
# Make parasol run directory
ssh kk9
mkdir /cluster/data/hg16/bed/famBro/blastp/dm1
cd /cluster/data/hg16/bed/famBro/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast/data /cluster/bluearc/blast/blastall \
-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# Completed: 7748 of 7748 jobs
# CPU time in finished jobs: 33371s 556.18m 9.27h 0.39d 0.001 y
# IO & Wait Time: 19546s 325.77m 5.43h 0.23d 0.001 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 53s 0.88m 0.01h 0.00d
# Submission to last job: 1657s 27.62m 0.46h 0.02d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/famBro/blastp/dm1/run/out
hgLoadBlastTab hg16 dmBlastTab -maxPer=1 *.tab
# Scanning through 7748 files
# Loading database with 28645 rows
# row count went from 27173 to 28645
# LOAD SNPS (Done. Daryl Thomas; February 18, 2004)
# SNP processing has been condensed into a single script,
# which makes snpNih, snpTsc, and snpMap
# ${HOME}/kent/src/hg/snp/locations/processSnpLocations.csh
# snpBuild = 119
# Run from directory $oo/bed/snp/build$snpBuild/snpMap
mkdir -p $oo/bed/snp/build$snpBuild/snpMap
cd $oo/bed/snp/build$snpBuild/snpMap
processSnpLocations.csh hg16 human 34_2 119 >& log &
# check data:
# wc -l snpTsc.bed; hgsql hg16 -e "select count(*) from snpTsc"
# wc -l snpNih.bed; hgsql hg16 -e "select count(*) from snpNih"
# wc -l snpMap.bed; hgsql hg16 -e "select count(*) from snpMap"
# hgsql hg16 -e "select * from snpNih limit 5; desc snpNih; show indexes from snpNih"
# hgsql hg16 -e "select * from snpTsc limit 5; desc snpTsc; show indexes from snpTsc"
# hgsql hg16 -e "select * from snpMap limit 5; desc snpMap; show indexes from snpMap"
# remove temp files
# rm human* *bed.gz
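# A compact form of the count comparison above (sketch; assumes the .bed
# files are still in the current directory):
foreach t (snpTsc snpNih snpMap)
    echo -n "$t bed lines: "; wc -l < $t.bed
    hgsql hg16 -e "select count(*) from $t"
end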
# LOAD SNP DETAILS (Done. Daryl Thomas; February 18, 2004)
# SNP processing has been condensed into a single script,
# which makes dbSnpRsHg
# ${HOME}/kent/src/hg/snp/details/processSnpDetails.csh
# snpBuild = 119
# Run from directory $oo/bed/snp/build$snpBuild/snpMap
mkdir -p $oo/bed/snp/build$snpBuild/details/Done
mkdir -p $oo/bed/snp/build$snpBuild/details/Observed
cd $oo/bed/snp/build$snpBuild/details
processSnpDetails.csh hg16 human 119 >& log &
# for each output chunk the script runs:
#   load data local infile "$fileBase.out" into table $database.$table
#   gzip $fileBase.out
# check data:
# hgsql hgFixed -e "select count(*) from dbSnpRsHg"
# hgsql hgFixed -e "select * from dbSnpRsHg limit 5; desc dbSnpRsHg; show indexes from dbSnpRsHg"
# remove temp files
# rm dbSnpRs*
# LOAD SNPS ( Daryl Thomas; February ??, 2005)
set db = hg16
set org = human
set build = 122
set dir = /cluster/bluearc/snp/$db/build$build
# ssh to some quiet machine with fast access to the bluearc
# it takes ~4.5 hours to download the data
# (build 124 directly to /cluster/bluearc/... from eieio)
# Check to make sure the chrMT file is included
mkdir -p $dir $dir/ds_ch.xml $dir/det $dir/str $dir/loc $dir/seq
cd $dir
ln -s /cluster/data/$db/jkStuff/liftAll.lft .
screen
ftp ftp.ncbi.nih.gov
cd snp/$org/XML
prompt
mget ds_ch*.xml.gz
exit # screen
exit # machine
# TODO: check chromStart for each locType
cp -f ${HOME}/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts
chmod 775 /cluster/bin/scripts/parseDbSnpXML
ssh kk
touch jobList
foreach file ( /cluster/bluearc/snp/$db/build$build/ds_ch*.xml.gz )
set out = $file:t:r
echo /cluster/bin/scripts/parseDbSnpXML $file /cluster/bluearc/snp/$db/build$build $out.contig >> jobList
end
# para create jobList; para push; para check ...
# CPU time in finished jobs: 28235s 470.58m 7.84h 0.33d 0.001 y
# IO & Wait Time: 1986s 33.10m 0.55h 0.02d 0.000 y
# Average job time: 1119s 18.65m 0.31h 0.01d
# Longest job: 2339s 38.98m 0.65h 0.03d
exit # kk
mv $dir /cluster/data/$db/bed/snp/build$build
set dir = /cluster/data/$db/bed/snp/build$build
cd $dir
ssh eieio # or wherever data is local
# concatenate the details files to make it easier to lift (and load)
time zcat det/ds_ch*.xml.contig.det.gz > $db.build$build.contig.bed
# 16.120u 13.070s 1:35.26 30.6% 0+0k 0+0io 86pf+0w (hgwdev)
time gzip $db.build$build.contig.bed
# 102.307u 5.524s 1:48.97 98.9% 0+0k 0+0io 1pf+0w (eieio/store5)
# some of the NT contigs are not in the liftSpec - this is expected as snps that map to
# alternate assemblies (Celera) are in the original files, but we disregard their mappings.
time liftUp $db.build$build.bed liftAll.lft warn $db.build$build.contig.bed.gz
# 190.473u 18.873s 3:52.33 90.1% 0+0k 0+0io 1pf+0w (eieio/store5)
time gzip $db.build$build.bed
# 107.476u 5.286s 1:54.25 98.6% 0+0k 0+0io 0pf+0w
ssh hgwdev # or wherever database is located
# hgLoadBed is the important step - check to make sure there are no warnings
time hgLoadBed $db snp $db.build$build.bed.gz -sqlTable=${HOME}/kent/src/hg/lib/snp.sql
# Loaded 8722437 elements of size 16
# 206.170u 48.370s 35:59.52 11.7% 0+0k 0+0io 82994pf+0w
# basic snp table is now loaded, but exception column needs to be updated
# ~ 3 hours wall clock time from here to end
# run queries from snpException.query against snp table
mkdir -p /usr/local/apache/htdocs/qa/test-results/snpException/build$build
cd /usr/local/apache/htdocs/qa/test-results/snpException/build$build
time snpException $db 0 ${db}snpException > ${db}snpException.log
chmod o+rx .
chmod o+r *
# 24.590u 34.150s 41:04.48 2.3% 0+0k 0+0io 191pf+0w
# check alignment of flanking sequences
time snpValid $db /cluster/data/$db/bed/snp/build$build/seq > ${db}snpValid.log
# 4688.790u 172.770s 1:28:45.62 91.2% 0+0k 0+0io 23000pf+0w
# 5205.860u 216.570s 1:55:10.27 78.4% 0+0k 0+0io 72408pf+0w (hgwdev)
### NOTE: the pseudoautosomal snps are reported in the chrX files
### only, which causes problems for snpValid when checking the
### chrY snp mappings. I got around this by confirming that all
### of the 'missing flank' errors (#23) were in pseudoautosomal
### regions and ignoring them. I manually truncated the
### hg17snpException.23.bed file before continuing with the next
### step. This could/should be fixed in the next iteration.
# update snpExceptions table to match the number of exceptions found in the snpValid results
# these numbers come from counting the numbers of lines in the output files without headers
mysql> update snpExceptions set num=60797 where exceptionId=21;
mysql> update snpExceptions set num=5657 where exceptionId=22;
mysql> update snpExceptions set num=284098 where exceptionId=23;
mysql> update snpExceptions set num=173 where exceptionId=24;
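# The counts can be regenerated from the output files (sketch; assumes
# per-exception files named ${db}snpException.<id>.bed with two header
# lines, as implied by the tail +3 below):
foreach e (21 22 23 24)
    echo -n "exceptionId $e: "; tail +3 ${db}snpException.$e.bed | wc -l
end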
# create list of statements to update the snp table and run them
time tail +3 ${db}snpException.* | awk '/rs/ {printf "%s\t%d\t%d\n",$4,$2,$5}' | sort -k1,2n > exceptionList.txt
# ~10 seconds
time updateExceptionList.pl < exceptionList.txt > updateExceptionList.sql
# 36.270u 1.980s 0:38.27 99.9% 0+0k 0+0io 337pf+0w
time hgsql $db < updateExceptionList.sql
# 18.130u 26.680s 58:39.97 1.2% 0+0k 0+0io 413pf+0w build122 (had to optimize table during run)
# 8.420u 10.370s 11:58.44 2.6% 0+0k 0+0io 413pf+0w build123 (this is mostly a mysql process)
# 6.550u 9.370s 14:34.17 1.8% 0+0k 0+0io 413pf+0w build124
# > wc -l build12*/updateExceptionList.sql
# 1110994 build122/updateExceptionList.sql
# 387166 build123/updateExceptionList.sql
# 383759 build124/updateExceptionList.sql
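# updateExceptionList.pl itself is not reproduced here; in outline it
# turns each (name, chromStart, exceptionId) line of exceptionList.txt
# into an update statement, roughly like this (sketch only - the real
# script also has to merge multiple exceptions for the same snp):
# awk '{printf "update snp set exception=\"%s\" where name=\"%s\" and chromStart=%s;\n", $3, $1, $2}' \
#     exceptionList.txt > updateExceptionList.sketch.sql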
# Add Affy SNPs from new submission
#!/bin/csh -fe
set db = hg16
cd /cluster/data/$db/bed/snp/affy/latest
touch affy.txt affy.bed Affy.bed bed.tab
rm -f affy*.txt affy*.bed Affy.bed* bed.tab
# datafile was provided by Valmeekam, Venu [Venu_Valmeekam@affymetrix.com]
tar xfz affyhg16maps.tgz
wc -l affy*txt
awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10K\t0\n", $1,$2,$3,$4,$6,$7);}' < affy10K.txt > affy10K.bed
awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy10Kv2\t0\n", $1,$2,$3,$4,$6,$7);}' < affy10Kv2.txt > affy10Kv2.bed
awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_HindIII\t0\n",$1,$2,$3,$4,$6,$7);}' < affy50K_HindIII.txt > affy50K_HindIII.bed
awk '$1 !~ /^chrom/ {printf("%s\t%d\t%d\t%s\t0\t%s\t%s\tunknown\tsnp\tunknown\t0\t0\tunknown\texact\tAffy50K_XbaI\t0\n", $1,$2,$3,$4,$6,$7);}' < affy50K_XbaI.txt > affy50K_XbaI.bed
# this is a temporary kluge to fix some bad input data.
cat affy*.bed | sed 's/_par//' > Affy.bed
# the source enum for 'dbSnp' is 2; all of the affy* values are higher.
hgsql $db -e "delete from snp where source > 2 "
hgLoadBed $db snp Affy.bed -oldTable -tab
rm -f affy*.txt affy*.bed bed.tab
gzip Affy.bed
#mysql> select source, count(*) from snp group by source;
#+-----------------+----------+
#| source | count(*) |
#+-----------------+----------+
#| dbSnp | 8722437 |
#| Affy10K | 11464 |
#| Affy10Kv2 | 10128 |
#| Affy50K_HindIII | 56965 |
#| Affy50K_XbaI | 58646 |
#+-----------------+----------+
#5 rows in set (52.96 sec)
# March 7, 2005: fixed pseudoautosomal snps:
#affy10Kv2.txt:chrX_par 1920780 1920781 SNP_A-1606360 0 ? C/T
#affy10Kv2.txt:chrX_par 2047561 2047562 SNP_A-1510197 0 ? G/T
#affy10Kv2.txt:chrX_par 2047486 2047487 SNP_A-1510243 0 ? A/G
#affy10Kv2.txt:chrX_par 2060858 2060859 SNP_A-1606356 0 ? A/G
#affy10Kv2.txt:chrX_par 2163964 2163965 SNP_A-1606329 0 ? C/T
delete from snp where chrom = 'chrY' and name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329');
update snp set chrom = 'chrX' where name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329');
insert into snp
select bin, 'chrY' as chrom, chromStart, chromEnd, name, score, strand,
observed, molType, class, valid, avHet, avHetSE, func, locType, source, exception
from snp
where name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329');
select chrom, count(*) from snp where name in ('SNP_A-1606360','SNP_A-1510197','SNP_A-1510243','SNP_A-1606356','SNP_A-1606329') group by chrom;
### hapmapRecombRate (Daryl; September 19, 2005)
# updated coordinates (Daryl; December 8, 2005)
mkdir -p /cluster/data/hg16/bed/hapmap/recombination/HapMap_PhaseI/20051115
cd /cluster/data/hg16/bed/hapmap/recombination/HapMap_PhaseI/20051115
wget -N http://www.stats.ox.ac.uk/~cfreeman/HapMap_Phase1/genetic_map_HapMap_Phase1_UCSC.tar.gz
tar xvfz genetic_map_HapMap_Phase1_UCSC.tar.gz
tail --lines=+2 -q Gen_map_chr*_COMBINED_UCSC.txt | sed 's/_non_par//;s/_par1//;s/_par2//' | awk '{printf "%s\t%d\t%d\t%0.3f\n",$1,$2,$3,$4}' >! hg16.hapmapRecombRate.bed
liftOver hg16.hapmapRecombRate.bed /cluster/data/hg16/bed/liftOver/hg16ToHg17.over.chain.gz hg17.hapmapRecombRate.bed hg16ToHg17.unmapped
hgLoadBed -bedGraph=4 hg16 hapmapRecombRate hg16.hapmapRecombRate.bed
hgLoadBed -bedGraph=4 hg17 hapmapRecombRate hg17.hapmapRecombRate.bed
rm -f bed.tab Gen_map_chr*.txt
### hapmapRecombHotspot (Daryl; September 19, 2005; chr X data update October 21, 2005)
wget -N http://www.stats.ox.ac.uk/~mcvean/HapMap/hotspots/Genomewidehots16a.txt
wget -N http://www.stats.ox.ac.uk/~mcvean/HapMap/hotspots/chrX_non_par_hotspots.txt
wget -N http://www.stats.ox.ac.uk/~mcvean/HapMap/hotspots/chrX_par1_hotspots.txt
# this takes about 3 seconds to run
rm -f hg*.hapmapRecombHotspots.bed
tail +2 Genomewidehots16a.txt | awk -F " " '{printf "chr%s\t%d\t%d\n",$1, $3-1, $4}' > hg16.hapmapRecombHotspots.bed
tail +2 chrX_non_par_hotspots.txt | sed s/_non_par// | awk -F " " '{printf "chr%s\t%d\t%d\n",$1, $3-1, $4}' >> hg16.hapmapRecombHotspots.bed
tail +2 chrX_par1_hotspots.txt | sed s/_par1// | awk -F " " '{printf "chr%s\t%d\t%d\n",$1, $3-1, $4}' >> hg16.hapmapRecombHotspots.bed
liftOver hg16.hapmapRecombHotspots.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.hapmapRecombHotspots.bed hg16ToHg17.unmapped
hgLoadBed hg16 hapmapRecombHotspots hg16.hapmapRecombHotspots.bed
hgLoadBed hg17 hapmapRecombHotspots hg17.hapmapRecombHotspots.bed
rm -f bed.tab
### encodeRecombHotspot (Daryl; December 8, 2005)
mkdir -p /cluster/data/hg16/bed/hapmap/recombination/ENCODE_16c.1/hotspots
cd /cluster/data/hg16/bed/hapmap/recombination/ENCODE_16c.1/hotspots
wget -N http://www.stats.ox.ac.uk/~cfreeman/ENCODE_16c.1/Hotspots16c1.txt
wget -N http://www.stats.ox.ac.uk/~cfreeman/ENCODE_16c.1/Readme_rates_hotspots.txt
tail +2 Hotspots16c1.txt | sed 's/ENm010\.7p15\.2/chr7/;s/ENm013\.7q21\.13/chr7/;s/ENm014\.7q31\.33/chr7/;s/ENr112\.2p16\.3/chr2/;s/ENr113\.4q26/chr4/;s/ENr123\.12q12/chr12/;s/ENr131\.2q37\.1/chr2/;s/ENr213\.18q12\.1/chr18/;s/ENr232\.9q34\.11/chr9/;s/ENr321\.8q24\.11/chr8/' | awk '{printf "%s\t%d\t%d\n", $1, $3, $4}' > hg16.encodeRecombHotspot.bed
liftOver hg16.encodeRecombHotspot.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.encodeRecombHotspot.bed hg16ToHg17.unmapped
hgLoadBed hg16 encodeRecombHotspot hg16.encodeRecombHotspot.bed
hgLoadBed hg17 encodeRecombHotspot hg17.encodeRecombHotspot.bed
rm -f bed.tab *bed *unmapped
### Perlegen Recombination Rates and Hotspots (Daryl; December 9, 2005)
# Home page: http://www.stats.ox.ac.uk/mathgen/Recombination.html
mkdir -p /cluster/data/hg16/bed/hapmap/recombination/Perlegen
cd /cluster/data/hg16/bed/hapmap/recombination/Perlegen
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/README.txt
mkdir -p /cluster/data/hg16/bed/hapmap/recombination/Perlegen/hotspots
cd /cluster/data/hg16/bed/hapmap/recombination/Perlegen/hotspots
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/README.txt
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/hotspots.zip
unzip hotspots.zip
tail +2 hotspots.txt | grep -v 1.51000 | awk '{printf "chr%s\t%d\t%d\n",$1,$3-1,$4}' > hg16.perlegenRecombHotspots.bed
tail +2 coldspots.txt | grep -v "-" | awk '{printf "chr%s\t%d\t%d\n",$1,$3-1,$4}' > hg16.perlegenRecombColdspots.bed
liftOver hg16.perlegenRecombHotspots.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.perlegenRecombHotspots.bed hg16ToHg17.hots.unmapped
liftOver hg16.perlegenRecombColdspots.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.perlegenRecombColdspots.bed hg16ToHg17.cold.unmapped
hgLoadBed hg16 perlegenRecombHotspots hg16.perlegenRecombHotspots.bed
hgLoadBed hg17 perlegenRecombHotspots hg17.perlegenRecombHotspots.bed
hgLoadBed hg16 perlegenRecombColdspots hg16.perlegenRecombColdspots.bed
hgLoadBed hg17 perlegenRecombColdspots hg17.perlegenRecombColdspots.bed
rm -f bed.tab hg1*ed *spots*txt
mkdir -p /cluster/data/hg16/bed/hapmap/recombination/Perlegen/rates
cd /cluster/data/hg16/bed/hapmap/recombination/Perlegen/rates
cp ../makeBed.pl .
chmod ug+x makeBed.pl
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/README.txt
wget -nv -N http://www.stats.ox.ac.uk/mathgen/oxstats_map/recombination_rates.zip
unzip recombination_rates.zip
rm -f hg16.perlegenRecombRate.bed
time ./makeBed.pl > hg16.perlegenRecombRate.bed
cut -f1 hg16.perlegenRecombRate.bed | sort -u
wc -l hg16.perlegenRecombRate.bed
liftOver hg16.perlegenRecombRate.bed /cluster/data/hg16/bed/bedOver/hg16ToHg17.over.chain hg17.perlegenRecombRate.bed hg16ToHg17.rates.unmapped
hgLoadBed hg16 perlegenRecombRate hg16.perlegenRecombRate.bed
hgLoadBed hg17 perlegenRecombRate hg17.perlegenRecombRate.bed
rm -f bed.tab chr*_rates.txt hg1*ed
# HapMap Linkage Disequilibrium (Daryl; January 2006)
mkdir -p /cluster/data/hg16/bed/hapmap/ld_data/2005-10/data
cd /cluster/data/hg16/bed/hapmap/ld_data/2005-10/data
screen
ftp www.hapmap.org
cd ld_data/2005-10
prompt
mget ld_chr*.txt.gz
# look for consistency in max LD distance
set out = maxDist.txt
rm -f $out
touch $out
foreach f (ld_*.txt.gz)
echo -n "$f " >> $out
zcat $f | awk '{if ($2-$1>max) max=$2-$1} END {print max}' >> $out
end
# most should be 249999
grep -v 249999 maxDist.txt
# look for consistency in line counts
# ssh eieio; screen
set out = wcList.txt
rm -f $out
touch $out
# this takes about 2 hours to run completely on eieio (local disk)
foreach f (*.txt.gz)
echo -n $f:r:r " " | sed 's/ld_//;s/chr//;s/_/\t/' >> $out
zcat $f | cut -f1 -d " " | uniq | wc -l >> $out
end
# plot the sizes from wcList.txt by population (lines)
# with chrom on the X axis and size on the Y axis.
# look for anomalies
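# In lieu of the plot, a rough screen for outliers (sketch): report the
# min and max line count per population from wcList.txt
awk '{if (!($2 in min) || $3 < min[$2]) min[$2]=$3; if ($3 > max[$2]) max[$2]=$3} END {for (p in min) print p, min[p], max[p]}' wcList.txt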
mkdir ../bed
cd ../bed
# from the raw LD values, compute colors and encode
cat << 'EOF' > makeLdBed.pl
#!/usr/bin/perl -W
sub min ($$)
{
my $a = shift @_;
my $b = shift @_;
if ($a<$b) {return $a;}
return $b;
}
sub encodeDprime($)
{
my $val = shift @_;
if ( ($val > 1) || ($val < -1) ) { die "Dprime value ($val) is out of range [-1,1]";}
elsif ($val>=0) { $ret = ord('a') + $val*9;}
else { $ret = ord('A') - $val*9;}
return chr($ret);
}
sub encodeRsquared($)
{
my $val = shift @_;
if ( ($val > 1) || ($val < 0) ) { die "R^2 value ($val) is out of range [0,1]";}
return encodeDprime($val);
}
sub encodeLod($$)
{
my $lod = shift @_;
my $dPrime = shift @_;
$ret = ord('a');
if ($lod>=2) # high LOD
{
if (abs($dPrime)<0.5) { $ret = ord('y'); } # high LOD, low D' -> pink
else { $ret += min((int($lod-abs($dPrime)-1.5)), 9) ;}
}
elsif (abs($dPrime)>0.99) { $ret = ord('z'); } # high D', low LOD -> blue
return chr($ret);
}
$inDir = shift||"data";
$outDir = shift||"bed";
$foo = "";
$bar = "";
@rest = ();
@pops = ("CEU", "CHB", "JPT", "YRI");
foreach $pop (@pops)
{
opendir(DIR, $inDir) || die "can't open $inDir";
@hmFiles = grep {/^ld_/ && /_${pop}.txt.gz$/} readdir(DIR); #ld_chr22_CEU.txt.gz
closedir(DIR);
printf "\nPOP:\t$pop\t$#hmFiles\n";
foreach $hmFile (sort @hmFiles)
{
($foo, $chrom, $bar) = split /_/, $hmFile;
$chrom =~ s/chrx/chrX/;
$chrom =~ s/chry/chrY/;
$outfile = "$outDir/${pop}_${chrom}.bed";
if ((-e $outfile)||(-e "$outfile.gz")) { next; }
$tmpFile = "/tmp/${pop}_${chrom}.bed";
printf("$inDir/$hmFile => $outfile.gz\t" . `date`);
open(OUT, "> $tmpFile" ) || die "can't open $tmpFile";
open(IN, "zcat $inDir/$hmFile | " ) || die "can't open $inDir/$hmFile";
$line = <IN>;
chomp($line);
($chromStart, $chromEnd, $pop, $name, $marker2, $dprime, $rsquared, $lod, @rest) = split / /, $line;
$ldCount = 1;
$dprimeList = encodeDprime($dprime);
$rsquaredList = encodeRsquared($rsquared);
$lodList = encodeLod($lod, $dprime);
while (<IN>)
{
chomp();
($chromStartNew, $chromEndNew, $pop, $nameNew, $marker2, $dprime, $rsquared, $lod, @rest) = split / /;
if ($chromStart ne $chromStartNew)
{
$chromStart--;
printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
$chromStart = $chromStartNew;
$chromEnd = $chromEndNew;
$name = $nameNew;
$ldCount = 1;
$dprimeList = encodeDprime($dprime);
$rsquaredList = encodeRsquared($rsquared);
$lodList = encodeLod($lod, $dprime);
}
elsif ($chromEndNew-$chromStartNew<250000)
{
$chromEnd = $chromEndNew;
$ldCount++;
$dprimeList .= encodeDprime($dprime);
$rsquaredList .= encodeRsquared($rsquared);
$lodList .= encodeLod($lod, $dprime);
}
}
close(IN);
$chromStart--;
printf(OUT "$chrom\t$chromStart\t$chromEnd\t$name\t$ldCount\t$dprimeList\t$rsquaredList\t$lodList\n");
close(OUT);
system("gzip $tmpFile");
system("mv $tmpFile.gz $outDir");
}
}
EOF
#
chmod ug+x ./makeLdBed.pl
ssh eieio
screen
time ./makeLdBed.pl
# look for consistency in line counts
# ssh eieio
set out = wcList.txt
rm -f $out
touch $out
foreach f (*.bed.gz)
echo -n $f:r:r " " | sed 's/chr//g;s/_/\t/g' >> $out
zcat $f | wc -l >> $out
end
# plot the sizes from wcList.txt by population (lines)
# with chrom on the X axis and size on the Y axis.
# look for anomalies
# load data
sed 's/hapmapLd/hapmapLdCeu/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
sed 's/hapmapLd/hapmapLdChb/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
sed 's/hapmapLd/hapmapLdJpt/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
sed 's/hapmapLd/hapmapLdYri/' ${HOME}/kent/src/hg/lib/hapmapLd.sql | hgsql hg16
# The length of each of the three value vectors (rsquared, dprime,
# and lod) is the same and is stored in the score field.
# 30-40 minutes
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X)
echo
echo -n loading CEU chr${c}
zcat CEU_chr${c}.bed.gz | wc -l
hgLoadBed -noSort -oldTable -strict hg16 hapmapLdCeu CEU_chr${c}.bed.gz
echo
echo -n loading CHB chr${c}
zcat CHB_chr${c}.bed.gz | wc -l
hgLoadBed -noSort -oldTable -strict hg16 hapmapLdChb CHB_chr${c}.bed.gz
echo
echo -n loading JPT chr${c}
zcat JPT_chr${c}.bed.gz | wc -l
hgLoadBed -noSort -oldTable -strict hg16 hapmapLdJpt JPT_chr${c}.bed.gz
echo
echo -n loading YRI chr${c}
zcat YRI_chr${c}.bed.gz | wc -l
hgLoadBed -noSort -oldTable -strict hg16 hapmapLdYri YRI_chr${c}.bed.gz
end
rm -f bed.tab
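# Spot-check the vector-length invariant described above (sketch;
# assumes the column names dprime, rsquared and lod from hapmapLd.sql):
hgsql hg16 -e 'select count(*) from hapmapLdCeu where length(dprime) != score or length(rsquared) != score or length(lod) != score'
# 0 expected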
# Tajima's D (DONE -- 2005-06-04 -- Daryl)
# Data from Chris Carlson in Debbie Nickerson's lab
# Chris Carlson [csc47<AT>u<DOT>washington<DOT>edu]
set db=hg16
set dir=/cluster/data/$db/bed/tajdpoly/latest
cd $dir
set chain = "/gbdb/hg17/liftOver/hg17ToHg16.over.chain"
foreach p (AD ED XD)
# lift SNP tracks
set f = $p.SNP.track
set in = /cluster/data/hg17/bed/tajdpoly/latest/$f.bed4
set out = /cluster/data/hg16/bed/tajdpoly/latest/$f.$db
liftOver $in $chain $out.bed4 $out.unmapped
# lift tajd tracks
set f = $p.tajd.track
set in = /cluster/data/hg17/bed/tajdpoly/latest/$f.bedGraph
set out = /cluster/data/hg16/bed/tajdpoly/latest/$f.$db
liftOver $in $chain $out.bedGraph $out.unmapped
# load SNP tracks
set f = $p.SNP.track.hg16
echo `date` $f "=>" $f.bed4
hgLoadBed $db tajdSnp$p $f.bed4
head -3 $f*
hgsql -e "select * from tajdSnp$p limit 3" $db
# load tajd tracks
set f = $p.tajd.track.$db
echo `date` $f "=>" $f.bedGraph
hgLoadBed -bedGraph=4 $db tajd$p $f.bedGraph
head -3 $f*
hgsql -e "select * from tajd$p limit 3" $db
end
# deleting elements that overlap with gaps -- tajd files have overlaps due to the windowing scheme (snps are not found in gaps)
rm -f delete.sql
touch delete.sql
set where = "where t.chrom=g.chrom and (t.chromStart between g.chromStart and g.chromEnd or t.chromEnd between g.chromStart and g.chromEnd)"
foreach p (AD ED XD SnpAD SnpED SnpXD)
foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y)
echo "select 'tajd$p' as pop, t.chrom, t.chromStart from tajd${p} t, chr${c}_gap g $where " | \
hgsql $db | grep -v pop | \
awk '{printf "delete from %s where chrom=\"%s\" and chromStart=%d;\n", $1, $2, $3}' >> delete.sql
end
end
hgsql $db < delete.sql
# cleanup elements that didn't get deleted properly
## create cleanup.pl:
cat << '_EOF_' > cleanup.pl
#!/usr/bin/perl -W
$pop=shift;
while (<>)
{
if (/^(chr..?)\s+(\d+)/)
{ print "delete from tajd$pop where chrom='$1' and chromStart<$2 and chromEnd>$2;\n"; }
}
'_EOF_'
chmod +x cleanup.pl
foreach p (AD ED XD)
featureBits $db tajd$p gap -bed=$p.inGaps.bed
./cleanup.pl $p < $p.inGaps.bed | hgsql $db
featureBits $db tajd$p gap -bed=$p.inGaps.bed ## should be empty now
end
# JAX ORTHOLOG (WORKING hiram 2004-02-20 )
# Add Jackson labs info
cd /cluster/data/hg16/bed
mkdir jaxOrtholog
cd jaxOrtholog
wget --timestamping ftp://ftp.informatics.jax.org/pub/reports/HMD_Human4.rpt
# save a little space
gzip HMD_Human4.rpt
# this is a tricky one to parse. This .rpt file is plain text, no
# tabs, with expected text columns to contain the data. We need to
# convert this. Beware of table changes, you may need to rework
# this each time if they change the data. Here is what we have
# today, an example first line with text columns numbered:
# 1234567 101234567 201234567 301234567 401234567 501234567 601234567 701234567 801234567 90123456 100123456 110123456 120123456 130123456 140123456 150123456 160123456 170123456 180123456 170
# MGI:1918914 71664 0610006F02Rik 10 syntenic D3 196410 MGC17301 12q13.13
# ^ mgiId
# ^ mouse chr
# ^ mouseCm position
# ^ possible Mouse band
# Mouse-Human Symbol ^
# Human Symbol ^
# ^ Human Band(s)
# This awk script picks out the correct columns, removes spaces,
# picks the first of possibly several human band designations,
# and decides if a mouse band has been specified
cat << '_EOF_' > jaxToUCSC.awk
/^MGI:/ {
LAST=NF
PREV=LAST-1
humanSymbol = substr($0,153,26)
gsub(" ","",humanSymbol)
Band = substr($0,179)
gsub(" *$","",Band)
gsub("^ *","",Band)
mgiId = substr($0,1,31)
gsub(" ","",mgiId)
mouseSym = substr($0,63,26)
gsub(" ","",mouseSym)
mouseChr = substr($0,89,13)
gsub(" ","",mouseChr)
mouseCm = substr($0,102,9)
gsub(" ","",mouseCm)
mouseBand = substr($0,111,11)
gsub(" ","",mouseBand)
if (length(mouseBand) < 1) { mouseBand = "N/A" }
printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n", humanSymbol,Band,
mgiId,mouseSym,mouseChr,mouseCm,mouseBand
}
'_EOF_'
# << this line makes emacs coloring happy
# then using that script to fix it:
zcat HMD_Human4.rpt.gz | awk -f jaxToUCSC.awk > jaxOrtholog.tab
# Drop (just in case), create and load the table:
hgsql -e 'drop table jaxOrtholog;' hg16
hgsql hg16 < ~/kent/src/hg/lib/jaxOrtholog.sql
hgsql -e \
'load data local infile "jaxOrtholog.tab" into table jaxOrtholog;' hg16
# save a little space
gzip jaxOrtholog.tab
# LOAD ACEMBLY (DONE - 2004-03-30 - Hiram)
mkdir -p /cluster/data/hg16/bed/acembly
cd /cluster/data/hg16/bed/acembly
# Data is obtained from:
# Danielle et Jean Thierry-Mieg mieg@ncbi.nlm.nih.gov
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.genes.proteins.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.genes.gff.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.mrnas.fasta.tar.gz
wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_34.human.genes/acembly.ncbi_34.genes.pfamhits.tar.gz
tar xvzf acembly.ncbi_34.genes.gff.tar.gz
tar xvzf acembly.ncbi_34.genes.proteins.fasta.tar.gz
cd acembly.ncbi_34.genes.gff
# chrom 6.gff is broken, it has a bogus number in the first column
# where a 6 should be. Fix-up until I hear from the authors:
mv x1.acemblygenes.6.gff x1.acemblygenes.6.gff.broken
sed -e "s/^28212469/6/" x1.acemblygenes.6.gff.broken > x1.acemblygenes.6.gff
# There are a number of start and end coordinates that are
# in reversed order. Until I hear from the authors, I have
# switched those coords:
cat << '_EOF_' > fixupReversedBlocks
#!/bin/sh
for i in x1*.gff
do
echo -n "$i working ..."
awk -F"\t" '
{
if ($4 > $5) {
printf "%s\t%s\t%s\t%s\t%s", $1, $2, $3, $5, $4
for ( i = 6; i <= NF; ++i ) {
printf "\t%s", $i
}
printf "\n"
} else
print
}
' $i > $i.fixed
echo " done"
done
'_EOF_'
# << this line makes emacs coloring happy
chmod +x fixupReversedBlocks
./fixupReversedBlocks
# Save just the floating-contig features to different files for lifting
# and lift up the floating-contig features to chr*_random coords:
# NOTE: file prefix (x1) has been added since build 31
foreach f (x1.acemblygenes.*.gff.fixed)
set c=$f:r:r:e
egrep '^[a-zA-Z0-9]+\|NT_[0-9][0-9][0-9][0-9][0-9][0-9]' $f | \
perl -wpe 's/^(\w+)\|(\w+)/$1\/$2/' > ctg-chr${c}_random.gff
if (-e ../../../$c/lift/random.lft) then
liftUp chr${c}_random.gff ../../../$c/lift/random.lft warn \
ctg-chr${c}_random.gff
endif
grep -v ^$c\| $f | grep -v ^Hs | perl -wpe 's/^/chr/;' | \
grep -v "^chr//" > chr$c.gff
echo "done $c"
end
# that last grep strips out _random or floating contig lines from the
# normal chrom gff, and add the "chr" prefix
# Three of them end up empty, check for this and remove them
# if necessary
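# sketch: list any empty gff files before removing them
foreach f (chr*.gff)
    if (-z $f) echo "$f is empty"
end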
rm -f chr19_random.gff chr18_random.gff chrUn.gff
# There was one error in a coordinate on chr17_random:
# chr17_random acembly stop_codon -2 0 . + 1 gene_id M17S2; transcript_id M17S2.cDec03;
# This line was removed (shows up as first line) from
# chr17_random.gff before the database load
#- Load into database:
cd ..
ldHgGene -gtf hg16 acembly acembly.ncbi_34.genes.gff/chr*.gff
hgPepPred hg16 generic acemblyPep \
acembly.ncbi_34.genes.proteins.fasta/*.fasta
# check that the track is OK
checkTableCoords hg16 acembly
# should display no errors
# MAKE HUMAN-CHIMP OVER.CHAIN FOR LIFTOVER (DONE 3/2/04 angie)
ssh kolossus
mkdir /cluster/data/hg16/bed/bedOver/hg16toPt0
cd /cluster/data/hg16/bed/bedOver/hg16toPt0
# use the combined blastz-blat best human chain, but assign unique IDs
# so that netChainSubset doesn't die:
chainSort /cluster/data/pt0/bed/blastz-blatHg16/human.best.2.chain stdout \
| chainMergeSort stdin \
| chainSplit chain stdin
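    # Sanity check that the re-assigned chain IDs really are unique
    # (sketch; the id is the last (13th) field of each chain header line):
    cat chain/*.chain | awk '$1 == "chain" {print $13}' | sort -n | uniq -d
    # no output expected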
# re-net with the new IDs:
mkdir net
foreach f (chain/*.chain)
echo netting $f
chainNet $f /cluster/data/hg16/chrom.sizes \
/cluster/data/pt0/scaffold.sizes net/$f:t:r.net /dev/null
end
# Now get a single-cov subset as usual:
mkdir subset
foreach f (chain/*.chain)
echo subsetting net/$f:t:r.net, $f to subset/$f:t
netChainSubset net/$f:t:r.net $f subset/$f:t
end
cat subset/*.chain > /cluster/data/hg16/bed/bedOver/hg16Topt0.chain
# make it available:
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/hg16/liftOver/
zip -j hg16Topt0.zip /cluster/data/hg16/bed/bedOver/hg16Topt0.chain
# update README.txt
# lift scaffold-based over.chain to chrom-based (2004-07-09 kate)
ssh kksilo
cd /cluster/data/hg16/bed/bedOver
liftUp -chainQ hg16TopanTro1.chain /cluster/data/panTro1/jkStuff/scaffolds.lft warn hg16Topt0.chain
# NOTE: these chains appear to be broken up -- try using all chains,
# instead of reciprocal best
ssh kolossus
cd /cluster/data/hg16/bed/blastz-blat.panTro1
netChainSubset human.net all.chain over.chain
# load just for ENCODE dev
hgLoadChain hg16 liftOverPanTro1Chain over.chain
# TODO: delete table
ssh kolossus
cd /cluster/data/hg16/bed/blastz-blat.panTro1
chainSwap \
/cluster/data/panTro1/bed/blastz-blatHg16.pt0.swap/all.newId.chain \
all.newId.swp.chain
chainSplit chain.newId all.newId.swp.chain
mkdir preNet
cd chain.newId
cat > preNet.csh << 'EOF'
foreach i (*.chain)
echo pre-netting $i
chainSort $i stdout | \
chainPreNet stdin /cluster/data/hg16/chrom.sizes \
/cluster/data/panTro1/chrom.sizes ../preNet/$i
end
'EOF'
csh preNet.csh >&! preNet.log &
tail -100f preNet.log
cd ..
# << for emacs
mkdir n1
cd preNet
cat > net.csh << 'EOF'
foreach i (*.chain)
set n = $i:r.net
echo netting $i
chainNet $i -minSpace=1 /cluster/data/hg16/chrom.sizes \
/cluster/data/panTro1/chrom.sizes ../n1/$n /dev/null
end
'EOF'
csh net.csh >&! net.log &
tail -100f net.log
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# GOT HERE
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1
netClass hNoClass.net hg16 panTro1 chimp.newId.net
# chain files from the net
ssh kolossus
cd /cluster/data/hg16/bed/blastz-blat.panTro1
netChainSubset chimp.newId.net all.newId.swp.chain over.newId.chain
cp over.newId.chain \
/cluster/data/hg16/bed/liftOver/hg16ToPanTro1.newId.over.chain
mv hg16TopanTro1.chain hg16TopanTro1.chain.old
cd /cluster/data/hg16/bed/liftOver
ln -s hg16ToPanTro1.newId.over.chain hg16TopanTro1.chain
ssh hgwdev
cd /cluster/data/hg16/bed/blastz-blat.panTro1
hgLoadChain hg16 liftOverPanTro1NewIdChain over.newId.chain
# MAKE HUMAN-CHICKEN OVER.CHAIN FOR LIFTOVER (DONE 3/2/04 angie)
ssh kolossus
mkdir /cluster/data/hg16/bed/bedOver/hg16TogalGal2
cd /cluster/data/hg16/bed/bedOver/hg16TogalGal2
set chainDir = /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
netSplit $chainDir/human.net net
mkdir subset
foreach f ($chainDir/chain/*.chain)
echo subsetting $f:t:r
netChainSubset net/$f:t:r.net $f subset/$f:t
end
cat subset/*.chain > /cluster/data/hg16/bed/bedOver/hg16TogalGal2.chain
# HUMAN/MOUSE/RAT/CHICKEN (HMRG) PHYLOHMM CONSERVATION (IN PROGRESS 2004-03-8 kate)
# Set path
set path = ($path /cluster/bin/woody)
# Obtain phylogenetic model (hmrc_rev_dg.mod)
# from Adam (hand-tuned, instead of fit_model)
# then, create New Hampshire tree for data (.nh file)
cat hmrc_rev_dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.4
#BACKGROUND: 0.286083 0.213573 0.213691 0.286652
#RATE_MAT:
#-0.891523 0.166770 0.574850 0.149902
#0.223389 -1.146311 0.153784 0.769137
#0.769591 0.153699 -1.147159 0.223869
#0.149605 0.573055 0.166888 -0.889548
#TREE: ((1:0.192598,(2:0.076303,3:0.083043):0.192598):0.47,4:0.47);
/cluster/data/woody/scripts/extract-tree.pl human,mouse,rat,chicken \
hmrc_rev_dg.mod
#((human:0.192598,(mouse:0.076303,rat:0.083043):0.192598):0.47,chicken:0.47);
ssh eieio
set path = ($path /cluster/bin/woody)
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2
cd phyloHMM
# now, break up the genome-wide MAFs into pieces; it's worth doing
# this as a little cluster job
# NOTE: using the hg16 chr fasta files stashed on bluearc for hg16 humor run
# NOTE: next time add "check out" lines to assure files are created
ssh eieio
mkdir -p /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3galGal2
cp /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/hmrg/*.maf /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3galGal2
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
cat << 'EOF' > doSplit
#!/bin/sh
WOODY=/cluster/bin/woody
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM/WINDOWS
maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${WOODY}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg16,mm3,rn3,galGal2 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -d 1 -B 5000
echo "Copying..."
cd /scratch/msa_split
for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
'EOF'
chmod +x doSplit
mkdir -p WINDOWS
rm -f WINDOWS/* jobs.lst
foreach file (/cluster/bluearc/hg16/bed/multiz.hg16mm3rn3galGal2/*.maf)
echo "doSplit $file" >> jobs.lst
end
ssh kk
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
para create jobs.lst
# para try, para push, etc.
# now setup and run the cluster job to compute the conservation scores
# NOTE: need to use gensub2 / check out+ facilities to check for
# failures (see the sketch after the jobs2.lst loop below). Will want
# to chunk msa_split output (above) into chr dirs to make the gensub
# template reasonable.
cat << 'EOF' > doPostProbs
#!/bin/sh
WOODY=/cluster/bin/woody
TMP=/tmp/phyloHMMcons
file=$1
root=`basename $file .ss.gz`
chrom=`echo $root | awk -F\. '{print $1}'`
echo $chrom
mkdir -p $TMP
zcat $file | $WOODY/label -m - -d hmrc_rev_dg.mod -i SS -o $TMP/$root -k 10 -L 0.9 -A -p 0 -j 1 -s $chrom -x
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.postprob > POSTPROBS/$chrom/$root.postprob.gz
rm $TMP/$root.postprob
'EOF'
chmod +x doPostProbs
mkdir -p POSTPROBS
rm -f jobs2.lst
foreach file (WINDOWS/chr*.ss.gz)
echo "doPostProbs $file" >> jobs2.lst
end
wc -l jobs2.lst
para create jobs2.lst
# etc ... (run cluster job)
# Create wiggle (.wib) files and load them into the database
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
mkdir wibLimits
mkdir wib
cat > makeWig.csh << 'EOF'
foreach dir (POSTPROBS/*)
set chrom = $dir:t
echo $chrom
zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
wigAsciiToBinary -chrom=$chrom \
-dataSpan=1 -wibFile=wib/${chrom}_hmrg_phyloHMM -name=hmrg \
stdin > wibLimits/${chrom}
end
'EOF'
csh makeWig.csh >&! makeWig.log &
ssh hgwdev
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
hgLoadWiggle hg16 multizMm3Rn3GalGal2_phyloHMM_wig wib/*_hmrg_phyloHMM.wig
ln -s `pwd`/wib/chr*_hmrg_phyloHMM.wib /gbdb/hg16/wib
chmod 775 . wib
chmod 664 wib/*.wib
# Add zoom records to table to speed display of large regions (>600Kbp)
# NOTE: this doesn't work -- the rows were dropped
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
mkdir -p wib1K wibLimits1K
cat > wigZoom1K.csh << 'EOF'
foreach dir (POSTPROBS/*)
set chrom = $dir:t
echo $chrom
zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
wigZoom stdin | wigAsciiToBinary -chrom=$chrom \
-dataSpan=1024 -wibFile=wib1K/${chrom}_hmrg_phyloHMM_1K \
-name=hmrg stdin > wibLimits1K/${chrom}
end
'EOF'
csh wigZoom1K.csh >&! wigZoom1K.log &
tail -100f wigZoom1K.log
ssh hgwdev
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM/wib1K
hgLoadWiggle -oldTable hg16 multizMm3Rn3GalGal2_phyloHMM_wig *.wig
# create symlinks for .wib files
ln -s `pwd`/*.wib /gbdb/hg16/wib
# NOTE: this doesn't work -- the rows were dropped
# setup external files for database reference
# reuse mafs loaded in the maf track (just symlink the /gbdb dir before
# loading
ssh hgwdev
ln -s /gbdb/hg16/multizMm3Rn3GalGal2 /gbdb/hg16/multizMm3Rn3GalGal2_phyloHMM
# load into database
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2/phyloHMM
/cluster/bin/i386/hgLoadMaf -warn hg16 multizMm3Rn3GalGal2_phyloHMM
# create trackDb entry
# track multizMm3Rn3GalGal2_phyloHMM
# type wigMaf 0.0 1.0
# wiggle multizMm3Rn3GalGal2_phyloHMM_wig
# etc.
# Load pairwise mafs
ssh hgwdev
cd /gbdb/hg16
mkdir -p mouse_hmrg rat_hmrg chicken_hmrg
foreach f (/cluster/data/hg16/bed/humor/maf/*.mm3.maf)
ln -s $f /gbdb/hg16/mouse_hmrg
end
cd /tmp
hgLoadMaf -WARN hg16 mouse_hmrg
foreach f (/cluster/data/hg16/bed/humor/maf/*.rn3.maf)
ln -s $f /gbdb/hg16/rat_hmrg
end
cd /tmp
hgLoadMaf -WARN hg16 rat_hmrg
foreach f (/cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf)
ln -s $f /gbdb/hg16/chicken_hmrg
end
cd /tmp
hgLoadMaf -WARN hg16 chicken_hmrg
# copy files to download area
set dir = /usr/local/apache/htdocs/goldenPath/hg16/multizMm3Rn3GalGal2
mkdir $dir
ln -s $dir multiz
cp -p /gbdb/hg16/multizMm3Rn3GalGal2_phyloHMM/*.maf $dir
cd $dir
gzip *
# As the 5-way alignment is imminent, this wasn't completed
# edit downloads page to add links
# add pairwise mafs to downloads page
mkdir $dir/{rn3,mm3}
cd /cluster/data/hg16/bed/humor/maf
cp *.mm3.maf $dir/mm3
cp *.rn3.maf $dir/rn3
gzip $dir/mm3/*
gzip $dir/rn3/*
# also add human/chicken maf's
# Create upstream files
ssh hgwdev
echo hg16 mm3 rn3 galGal2 > org.txt
foreach i (1000 2000 5000)
featureBits hg16 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
mafFrags hg16 multizMm3Rn3GalGal2 up.bed upstream$i.maf -orgs=org.txt
rm up.bed
end
# miRNA track (DONE - 2004-05-04 - Hiram)
# data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
# and Michel.Weber@ibcg.biotoul.fr
# notify them if this assembly updates to renew this track
ssh hgwdev
mkdir /cluster/data/rn3/bed/miRNA
cd /cluster/data/rn3/bed/miRNA
wget --timestamping \
hgLoadBed rn3 miRNA rn3.bed
# entry in trackDb/trackDb.ra already there
# miRNA track (UPDATED - 2004-05-04 - Hiram)
# (first version done 2004-03-02)
# data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
# and Michel.Weber@ibcg.biotoul.fr
# notify them if this assembly updates to renew this track
cd /cluster/data/hg16/bed
mv miRNA miRNA.2004_03_02
mkdir miRNA
cd miRNA
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/hsa_ncbi34.*"
grep -v "^track " hsa_ncbi34.bed | sed -e "s/ /\t/g" > hg16.bed
# check existing track for comparison after update load
# featureBits hg16 miRNA
# 15385 bases of 2865248791 (0.001%) in intersection
hgLoadBed hg16 miRNA hg16.bed
# featureBits hg16 miRNA
# 16923 bases of 2865248791 (0.001%) in intersection
# added an entry to trackDb/trackDb.ra: (good for Mm4 and Ce1 too)
track miRNA
shortLabel miRNA
longLabel MicroRNAs from the miRNA Registry
group genes
priority 63
visibility hide
useScore 1
color 255,64,64
type bed 8
url http://www.sanger.ac.uk/cgi-bin/Rfam/mirna/mirna_entry.pl?id=$$
# Note the useScore item. This colors plus strand items in black
# and minus strand items in gray. A rarely used option.
# This same track is in Rn3, Mm4 and Ce2 too. Added
# findBedPos(query, hgp, "miRNA");
# to lib/hgFind.c to allow searching for these items.
#5-WAY MULTIZ & PHYLO-HMM HUMAN/CHIMP/MOUSE/RAT/CHICKEN (3/19/04, kpollard)
# UPDATE WOODY BINARIES
ssh hgwdev
cd /cluster/data/woody
cvs update -dP
cd src
make
# make sure Makefile has INSTALLDIR = /cluster/bin/woody
make install
#MULTIZ to add chimp, then chicken to HUMOR (see above)
ssh kk
set fiveDir = /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
mkdir -p $fiveDir/hmrp
mkdir -p $fiveDir/hmrpg
cd $fiveDir
#wrapper script for multiz
cat << EOF > mz
#!/bin/csh
/cluster/bin/penn/tbaBin/multiz \$1 \$2 - > \$3
EOF
chmod +x mz
#CHIMP
# put the MAFs on bluearc
ssh eieio
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
mkdir -p $clustDir/hp
mkdir -p $clustDir/hmr
cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf $clustDir/hmr
cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/*.maf $clustDir/hp
logout # back to kk
#set up joblist (common denominator set: no chr19_random in hmr)
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
cd $fiveDir
rm -f jobList
foreach file ($clustDir/hmr/*.maf)
set root=`echo $file:t:r | sed 's/\.hmr//'`
echo "mz $clustDir/hp/${root}.maf $file $fiveDir/hmrp/${root}.maf" >> jobList
end
#run on kk
chmod +x jobList
para create jobList
#para try, para check, para push, etc.
#add chr19_random from hp to hmrp
cp /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/chr19_random.maf $fiveDir/hmrp
#clean up bluearc
ssh eieio
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
rm -r $clustDir/hp
rm -r $clustDir/hmr
#CHICKEN
# put the MAFs on bluearc
ssh eieio
set fiveDir = /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
mkdir -p $clustDir/hmrp
mkdir -p $clustDir/hg
cp $fiveDir/hmrp/*.maf $clustDir/hmrp
cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest/*.maf \
    $clustDir/hg
logout # back to kk
logout #move to kki
#set up job list 2
ssh kki
set fiveDir = /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
set clustDir = /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
cd $fiveDir
rm -f jobList.2
foreach file ($clustDir/hg/*.maf)
set root=`echo $file:t:r | sed 's/\.hg//'`
echo "mz $file $clustDir/hmrp/${root}.maf $fiveDir/hmrpg/${root}.maf" >> jobList.2
end
#run on kki
chmod +x jobList.2
para create jobList.2
#para try, para check, para push, etc.
# clean up bluearc
ssh eieio
rm -r /cluster/bluearc/multiz.hg16mm3rn3panTro1galGal2
logout
#PHYLO-HMM CONSERVATION
#Set path
set path = ($path /cluster/bin/woody)
#Create "sufficient statistics" (SS) file from maf
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
mkdir phyloHMM
cd phyloHMM
# create script to run msa_view.
cat > makeSS.csh << 'EOF'
set path = ($path /cluster/bin/woody)
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg
foreach f (chr*.maf)
set c = $f:r
echo "$c"
msa_view $f -i MAF -o SS -s 1 -r 1 -O hg16,mm3,rn3,panTro1,galGal2 > \
../phyloHMM/$c.ss
end
'EOF'
csh makeSS.csh >&! makeSS.log &
tail -100f makeSS.log
head phyloHMM/chr1.ss
head phyloHMM/chrY.ss
#model hpmrc_rev_dg.mod (from Adam)
set path = ($path /cluster/bin/woody)
cat hpmrc_rev_dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 4.4
#BACKGROUND: 0.286083 0.213573 0.213691 0.286652
#RATE_MAT:
# -0.891523 0.166770 0.574850 0.149902
# 0.223389 -1.146311 0.153784 0.769137
# 0.769591 0.153699 -1.147159 0.223869
# 0.149605 0.573055 0.166888 -0.889548
#TREE: (((1:0.0056,2:0.0057):0.1043,(3:0.076303,4:0.083043):0.2753):0.47,5:0.47);
/cluster/data/woody/scripts/extract-tree.pl human,chimp,mouse,rat,chicken \
hpmrc_rev_dg.mod
#(((human:0.0056,chimp:0.0057):0.1043,(mouse:0.076303,rat:0.083043):0.2753):0.47,chicken:0.47);
#order is human-chimp-mouse-rat-chicken, so fix maf order in next step
#break up the genome-wide MAFs into pieces
# NOTE: using the hg16 chr fasta files stashed on bluearc for hg16 humor
mkdir -p /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
cp /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg/*.maf /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
cat << 'EOF' > doSplit
#!/bin/sh
WOODY=/cluster/bin/woody
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM/WINDOWS
maf=$1
c=`basename $maf .maf`
echo $c
mkdir -p /scratch/msa_split
${WOODY}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O hg16,panTro1,mm3,rn3,galGal2 -w 1000000,0 -r /scratch/msa_split/$c -o SS -I 1000 -d 1 -B 5000
echo "Copying..."
cd /scratch/msa_split
for file in $c.*.ss ; do gzip -c $file > ${WINDOWS}/$file.gz ; done
rm -f /scratch/msa_split/$c.*.ss
echo "Done copying"
'EOF'
chmod +x doSplit
mkdir -p WINDOWS
rm -f WINDOWS/* jobs.lst
foreach file (/cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/*.maf)
echo "doSplit $file" >> jobs.lst
end
#run on kki
ssh kki
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
para create jobs.lst
# para try, para check, para push, etc.
logout
#compute the conservation scores
# NOTE: need to use gensub2, check out+ facilities to check for
# failures. Will want to chunk msa_split output (above) into chr dirs.
# to make the gensub template reasonable.
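#   A sketch of the gensub2 setup that note has in mind (not used here;
#   modeled on the doChain/gensub2 runs later in this doc -- the output
#   path is illustrative only):
#     #LOOP
#     doPostProbs {check in exists $(path1)} {check out exists+ POSTPROBS/$(root1).postprob.gz}
#     #ENDLOOP
#   then: gensub2 windowList single template jobList ; para create jobList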
ssh kk
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
cat << 'EOF' > doPostProbs
#!/bin/sh
WOODY=/cluster/bin/woody
TMP=/tmp/phyloHMMcons
file=$1
root=`basename $file .ss.gz`
chrom=`echo $root | awk -F\. '{print $1}'`
echo $chrom
mkdir -p $TMP
zcat $file | $WOODY/label -m - -d hpmrc_rev_dg.mod -i SS -o $TMP/$root -k 10 -L 0.9 -A -p 0 -j 1 -s $chrom -x
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.postprob > POSTPROBS/$chrom/$root.postprob.gz
rm $TMP/$root.postprob
'EOF'
# << this line makes emacs coloring happy
chmod +x doPostProbs
mkdir -p POSTPROBS
rm -f jobs2.lst
foreach file (WINDOWS/chr*.ss.gz)
echo "doPostProbs $file" >> jobs2.lst
end
wc -l jobs2.lst
para create jobs2.lst
#para try, para check, para push, etc.
#1 problem: chr19_random crashed - due to no alignments in HMR. Leave out.
# Create wiggle (.wib) file and load into database
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
mkdir wibLimits
mkdir wib
cat > makeWig.csh << 'EOF'
foreach dir (POSTPROBS/*)
set chrom = $dir:t
echo $chrom
zcat `ls POSTPROBS/$chrom/*postprob.gz | sort -t\. -k2,2n` | \
wigAsciiToBinary -chrom=$chrom \
-dataSpan=1 -wibFile=wib/${chrom}_hpmrg_phyloHMM -name=hpmrg \
stdin > wibLimits/${chrom}
end
'EOF'
# << this line makes emacs coloring happy
csh makeWig.csh >&! makeWig.log &
#load tables
ssh hgwdev
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
hgLoadWiggle hg16 mzPt1Mm3Rn3Gg2_pHMM_wig wib/*_hpmrg_phyloHMM.wig
ln -s `pwd`/wib/chr*_hpmrg_phyloHMM.wib /gbdb/hg16/wib
chmod 775 . wib
chmod 664 wib/*.wib
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg
mkdir -p /gbdb/hg16/mzPt1Mm3Rn3Gg2_pHMM
ln -s /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/hmrpg/*.maf /gbdb/hg16/mzPt1Mm3Rn3Gg2_pHMM
hgLoadMaf hg16 -warn mzPt1Mm3Rn3Gg2_pHMM
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/
mkdir -p /gbdb/hg16/chimp_hmrg
ln -s /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/*.maf /gbdb/hg16/chimp_hmrg
hgLoadMaf hg16 -warn chimp_hmrg
#cleanup bluearc
ssh eieio
rm -r /cluster/bluearc/hg16/bed/multiz.hg16mm3rn3panTro1galGal2
logout
#Add description file: mzPt1Mm3Rn3Gg2_pHMM.html
#Add track to trackDb.ra: mzPt1Mm3Rn3Gg2_pHMM
#Copy files to download area
cd /gbdb/hg16
set dir = /usr/local/apache/htdocs/goldenPath/hg16/mzPt1Mm3Rn3Gg2
mkdir $dir
ln -s $dir multiz
cp -p /gbdb/hg16/mzPt1Mm3Rn3Gg2_pHMM/*.maf $dir
cd $dir
gzip *
# edit downloads page to add links
# add pairwise mafs to downloads page
mkdir $dir/{rn3,mm3,pt1,gg2}
cd /cluster/data/hg16/bed/humor/maf
cp *.mm3.maf $dir/mm3
cp *.rn3.maf $dir/rn3
gzip $dir/mm3/*
gzip $dir/rn3/*
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafBest
cp *.maf $dir/gg2
gzip $dir/gg2/*
cd /cluster/data/hg16/bed/blastz-blat.panTro1.lifted/mafRBestNet/
cp *.maf $dir/pt1
gzip $dir/pt1/*
# EXONIPHY HMR
# (started, acs, 2004-03-23)
# (redone 2004-07-01, with new version of software; have revised
# docs accordingly)
# Warning: some commands here require bash shell
ssh hgwdev
# (make sure /cluster/bin/phast is in path)
mkdir /cluster/store6/exoniphy.hg16mm3rn3.2004-03-23
cd /cluster/data/hg16/bed
ln -s /cluster/store6/exoniphy.hg16mm3rn3.2004-03-23
ln -s exoniphy.hg16mm3rn3.2004-03-23 exoniphy.hg16mm3rn3
# first, break up the genome-wide MAFs into pieces; it's worth doing
# this as a little cluster job
ssh eieio
mkdir -p /cluster/bluearc/hg16/bed/humor
cp /cluster/data/hg16/bed/humor/hmr/*.maf /cluster/data/hg16/?{,?}/chr*.fa /cluster/bluearc/hg16/bed/humor
logout
ssh kk
cd /cluster/data/hg16/bed/exoniphy.hg16mm3rn3
cat << '_EOF_' > doSplit
#!/bin/sh
PHAST=/cluster/bin/phast
FA_SRC=/cluster/bluearc/hg16/bed/humor
WINDOWS=/cluster/data/hg16/bed/exoniphy.hg16mm3rn3/WINDOWS
maf=$1
prefix=`basename $maf .hmr.maf`
chr=`echo $prefix | sed 's/chr//g ; s/_random//g'`
mkdir -p /scratch/msa_split
${PHAST}/msa_split $maf --in-format MAF --refseq ${FA_SRC}/$prefix.fa --order hg16,mm3,rn3 --windows 50000,2000 --out-root /scratch/msa_split/$prefix --out-format SS --min-informative 1000 --between-blocks 1000 --tuple-size 3
mkdir -p ${WINDOWS}/$chr
cd /scratch/msa_split
for file in `ls | egrep -w ${prefix}` ; do gzip -c $file > ${WINDOWS}/$chr/$file.gz ; rm $file ; done
_EOF_
# << this line makes emacs coloring happy
chmod +x doSplit
mkdir -p WINDOWS
rm -rf WINDOWS/* jobs.lst
for file in /cluster/bluearc/hg16/bed/humor/*.maf ; do echo "doSplit $file" >> jobs.lst ; done
para create jobs.lst
# etc ... (run cluster job)
# now set up cluster job for exoniphy.
cat << '_EOF_' > doExoniphy
#!/bin/bash
zcat $1 | /cluster/bin/phast/exoniphy - ${*:3} > $2
_EOF_
# << this line makes emacs coloring happy
chmod +x doExoniphy
rm -f jobs.lst
for dir in WINDOWS/* ; do
chrNo=`basename $dir`
mkdir -p OUTPUT/$chrNo
for file in $dir/* ; do
base=`basename $file .ss.gz`
chrStr=`echo $base | awk -F\. '{print $1}'`
echo "doExoniphy $file OUTPUT/$chrNo/$base.gff --seqname $chrStr --idpref $base --score --indels --quiet " >> jobs.lst
done
done
#[acs@kk exoniphy.hg16mm3rn3]$ wc jobs.lst
# 59175 591750 7179445 jobs.lst
para create jobs.lst
# etc... (run cluster job)
#Completed: 59175 of 59175 jobs
#CPU time in finished jobs: 49361849s 822697.48m 13711.62h 571.32d 1.565 y
#IO & Wait Time: 258451s 4307.52m 71.79h 2.99d 0.008 y
#Average job time: 839s 13.98m 0.23h 0.01d
#Longest job: 1868s 31.13m 0.52h 0.02d
#Submission to last job: 75584s 1259.73m 21.00h 0.87d
# create track
logout
ssh hgwdev
cd /cluster/data/hg16/bed/exoniphy.hg16mm3rn3
for dir in OUTPUT/* ; do
chrNo=`basename $dir`
echo $chrNo
find $dir -name "*.gff" | grep -v random > files
if [ -s files ] ; then cat `cat files` | refeature - --unique --sort --include-only CDS,start_codon,stop_codon > chr$chrNo.gff ; fi
find $dir -name "*.gff" | grep random > files
if [ -s files ] ; then cat `cat files` | refeature - --unique --sort --include-only CDS,start_codon,stop_codon > chr${chrNo}_random.gff ; fi
done
ldHgGene -gtf -frame hg16 exoniphy chr*.gff
#track exoniphy
#shortLabel Exoniphy
#longLabel Exoniphy: Conserved Exon Predictions (Human/Mouse/Rat)
#group genes
#priority 50.9
#visibility hide
#color 173,17,162
#type genePred
#
# Load tfbsCons track DONE 2004-03-31 braney
#
ssh hgwdev
set humordir=/gbdb/hg16/humorMm3Rn3
set transfacdir=/projects/compbio/data/transfac
set outdir=hg16_tfbsCons
mkdir /cluster/data/hg16/bed/tfbsCons
cd /cluster/data/hg16/bed/tfbsCons
# Get tfbsConsUtils.tar.gz from Matt Weirauch with Perl scripts weirauch@soe.ucsc.edu
set tarfile=/cluster/data/hg15/bed/tfbsCons/tfbsConsUtils.tar.gz
tar zxf $tarfile
# the following takes days (says Matt)
nice getTfbsConsData.pl `pwd` $humordir $transfacdir ./IDS.txt $outdir -over &
cd $outdir
rm chr*.bed
hgLoadBed -noSort hg16 tfbsCons -sqlTable=$HOME/kent/src/hg/lib/tfbsCons.sql tfbsCons.bed -tab
# Get mapping of ID's from Matt so we can link into the TRANSFAC database
set idmap=/cluster/data/hg16/bed/tfbsCons/tfbsConsMap
hgsql hg16 < ~/kent/src/hg/lib/tfbsConsMap.sql
echo "load data local infile '$idmap' into table tfbsConsMap;" | hgsql hg16
# PREP FOR LIFTOVER CHAINS TO HG16 (2004-04-12 kate)
# split into 3K chunks
ssh eieio
set tempDir = /cluster/bluearc/hg/gs.17/build34/liftOver
cd $tempDir
mkdir lift
cat > split.csh << 'EOF'
set scratch = /iscratch/i/gs.17/build34/liftOver/split
mkdir -p $scratch
foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M)
echo chr$i
faSplit -lift=lift/chr$i.lft size /cluster/data/hg16/$i/chr$i.fa -oneFile 3000 $scratch/chr$i
end
'EOF'
csh split.csh >&! split.log &
tail -100f split.log
/cluster/bin/iSync
# ECORES FROM GENOSCOPE [DONE, hartera, 2004-03-31]
# download data from http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecores
# ecotigHF - ecores on Human, genome conserved with Fugu, Fr1
# ecotigHT - ecores on Human, genome conserved with Tetraodon (March 2004)
ssh hgwdev
mkdir /cluster/data/hg16/bed/ecores/
# add parse_ecotig.pl to this directory
# FUGU
mkdir /cluster/data/hg16/bed/ecores/fr1
cd /cluster/data/hg16/bed/ecores/fr1/
# download data for ecotigHF to this directory
# parse ecotig files to produce a bed format file
perl ../parse_ecotig.pl < ecotigHF > ecotigHF.bed
# change from upper to lower case for "CHR"
perl -pi.bak -e 's/CHR/chr/g' ecotigHF.bed
hgLoadBed -tab hg16 ecoresFr1 ecotigHF.bed
# clean up
rm *.bak
# TETRAODON
mkdir /cluster/data/hg16/bed/ecores/tetraodon
cd /cluster/data/hg16/bed/ecores/tetraodon/
# download data for ecotigHT to this directory
# parse ecotig files to produce a bed format file
perl ../parse_ecotig.pl < ecotigHT > ecotigHT.bed
# change from upper to lower case for "CHR"
perl -pi.bak -e 's/CHR/chr/g' ecotigHT.bed
hgLoadBed -tab hg16 ecoresTetraodon ecotigHT.bed
# clean up
rm *.bak
# add entries in kent/src/hg/makeDb/trackDb/human/hg16/trackDb.ra
# add html for details pages to this directory:
# ecoresFr1.html and ecoresTetraodon.html
# VNTR MICROSATELLITE REPEATS FROM GEROME BREEN (DONE 4/28/04 angie)
ssh hgwdev
mkdir /cluster/data/hg16/bed/vntr
cd /cluster/data/hg16/bed/vntr
# saved email attachment from Gerome Breen <g.breen@iop.kcl.ac.uk>
# as HumJuly2003microsats_finished_for_angieH.txt
# Replace 1-based start coords with 0-based, tweak n/a distance values:
tail +2 HumJuly2003microsats_finished_for_angieH.txt \
| perl -wpe 's/(first|last) in chromosome\/sequence/-1/i' \
| awk '{printf "%s\t%d\t%d\t%s\t%s\t%d\t%s\t%s\t%s\t%s\n", $1, $2-1, $3, $4, $5, $6, $7, $8, $9, $10;}' \
> vntr.bed
hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/vntr.sql hg16 \
vntr vntr.bed
# WEBB'S PUTATIVE NON-EXONIC CONSERVED REGIONS (DONE 4/6/04 angie)
ssh hgwdev
mkdir /cluster/data/hg16/bed/webbNonExonic
cd /cluster/data/hg16/bed/webbNonExonic
wget http://bio.cse.psu.edu/~webb/nonexonic.tar.gz
tar xvzf nonexonic.tar.gz
# Score should really be scaled from the raw 5k..276k range to the
# browser's 200-1000 score range (a sketch follows the load below).
cat chr* \
| awk '{printf "%s\t%d\t%d\t%s:%d-%d\t%d\t%c\n", $2, $3-1, $4, $5, $6, $7, $9, $8;}' \
> webbNonExonic.bed
hgLoadBed hg16 webbNonExonic webbNonExonic.bed
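# A sketch of that rescaling (not run for this track): linearly map raw
# scores in [5000,276000] onto [200,1000], clamping at the ends; the
# columns assume the bed 6 layout created above.
#   awk 'BEGIN {OFS="\t";} \
#     { s = 200 + ($5 - 5000) * 800 / 271000; \
#       if (s < 200) s = 200; if (s > 1000) s = 1000; \
#       print $1, $2, $3, $4, int(s), $6; }' \
#     webbNonExonic.bed > webbNonExonicScaled.bed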
# phylo HMM data quintile calculation
ssh eieio
cat << '_EOF_' > /tmp/allpHMMdata.sh
#!/bin/sh
# there is only an empty file in chr13_random, it causes all
# files following it on the xargs zcat line to be missed.
# Eliminate it from the processing
find ./POSTPROBS -type f | grep -v chr13_random | sort -t\. -k2,2n | \
xargs zcat | awk '{print $2}' > /tmp/pHMM.data
'_EOF_'
chmod +x /tmp/allpHMMdata.sh
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
time /tmp/allpHMMdata.sh
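# For reference, a cutoff like the 0.450 used in the top-5% step below
# could be read off this file roughly as follows (a sketch; assumes the
# full genome's worth of values fits through sort):
#   sort -n /tmp/pHMM.data > /tmp/pHMM.sorted
#   n=`wc -l < /tmp/pHMM.sorted`
#   sed -n "`awk -v n=$n 'BEGIN{printf "%d", n*0.95}'`p" /tmp/pHMM.sorted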
# Create top 5 % set of data for phyloHMMcons.hg16mm3rn3.2003-11-11
# (DONE - 2004-05-15 - Hiram)
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
cat << '_EOF_' > top5.sh
#!/bin/sh
#
# Do not work on chr13_random, it has no data
# this for loop should have been:
# ls POSTPROBS/chr* | sort -t\. -k2,2n | while read i
# to get the data in properly sorted order. With this as is,
# we will need to sort the coords later to make any wiggle
# track out of this data
#
mkdir top5_data
for i in POSTPROBS/chr*
do
c=${i/POSTPROBS\//}
echo $i $c
if [ "$c" != "chr13_random" ]; then
if [ ! -f top5_data/$c.ascii.gz ]; then
find ${i} -type f | sort -t\. -k2,2n | while read FN
do
zcat ${FN}
done | awk '{if ($2 > 0.450) print}' > top5_data/$c.ascii
rm -f top5_data/$c.ascii.gz
gzip top5_data/$c.ascii &
else
ls -og top5_data/$c.ascii.gz
fi
fi
done
'_EOF_'
chmod +x top5.sh
# running this script takes several hours, make sure you do it
# on the file server
ssh eieio
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11
# Then, to make the histogram data:
cd /cluster/data/hg16/bed/phyloHMMcons.hg16mm3rn3.2003-11-11/top5_data
cat << '_EOF_' > mkHisto.sh
#!/bin/sh
for f in chr*.ascii.gz
do
zcat $f
done | textHistogram -real -col=2 -binSize=0.001 -maxBinCount=1000 stdin
'_EOF_'
chmod +x mkHisto.sh
./mkHisto.sh > histoGram.data
# BLASTZ FUGU (FR1) (DONE 4/19/04 angie)
ssh kk
# space is awful tight on store4 -- use store7.
mkdir -p /cluster/store7/hg16/bed/blastz.fr1.2004-04-19
ln -s /cluster/store7/hg16/bed/blastz.fr1.2004-04-19 \
/cluster/data/hg16/bed/
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
# Set L=6000 (more relaxed than chicken) and abridge repeats.
# Treat all repeats as lineage-specific (reuse linSpecRep.Chicken).
cat << '_EOF_' > DEF
# human vs. fugu
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from human-chicken.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.Chicken
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Fugu
SEQ2_DIR=/iscratch/i/fr1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/fr1/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/store7/hg16/bed/blastz.fr1.2004-04-19
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
bash # if a csh/tcsh user
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
para try, check, push, check, ....
#Completed: 11865 of 11865 jobs
#Average job time: 414s 6.90m 0.11h 0.00d
#Longest job: 709s 11.82m 0.20h 0.01d
#Submission to last job: 5678s 94.63m 1.58h 0.07d
# second cluster run: lift raw alignments -> lav dir
ssh kki
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
bash # if a csh/tcsh user
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 339 of 339 jobs
#Average job time: 4s 0.07m 0.00h 0.00d
#Longest job: 19s 0.32m 0.01h 0.00d
#Submission to last job: 91s 1.52m 0.03h 0.00d
# third run: lav -> axt
ssh kki
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/fr1/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 42 of 42 jobs
#Average job time: 16s 0.26m 0.00h 0.00d
#Longest job: 75s 1.25m 0.02h 0.00d
#Submission to last job: 80s 1.33m 0.02h 0.00d
# CHAIN FUGU BLASTZ (REDONE 10/1/04 angie)
# NOTE: originally done 4/19, but with a buggy axtChain.
# axtChain dir moved aside to axtChain.orig before rebuilding.
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
# Check size>0 for .axt files (empty inputs cause out line+ check to fail):
cp /dev/null input.lst
foreach f (`ls -1S /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChrom/*.axt`)
if (-s $f) then
echo $f >> input.lst
endif
end
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Reuse gap penalties from chicken run.
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=/cluster/data/blastz/chickenHumanTuned.gap \
-minScore=5000 $1 \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/fr1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
#Completed: 41 of 41 jobs
#Average job time: 26s 0.44m 0.01h 0.00d
#Longest job: 121s 2.02m 0.03h 0.00d
#Submission to last job: 121s 2.02m 0.03h 0.00d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain hg16 ${c}_chainFr1 $i
end
# NET FUGU BLASTZ (REDONE 10/1/04 angie)
# NOTE: originally done 4/19, but with results of a buggy axtChain.
ssh kksilo
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
netClass noClass.net hg16 fr1 fugu.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn fugu.net > fuguSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19/axtChain
netFilter -minGap=10 fugu.net | hgLoadNet hg16 netFr1 stdin
netFilter -minGap=10 fuguSyn.net | hgLoadNet hg16 netSyntenyFr1 stdin
# LIFTOVER CHAIN TO FUGU FR1 (DONE 2004-09-28 kate)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.fr1/axtChain
time netChainSubset human.net all.chain \
/cluster/data/hg16/bed/liftOver/hg16ToFr1.chain
# RUN AXTBEST (DONE 4/20/04 angie)
# Webb asked for axtBest too...
ssh kolossus
cd /cluster/data/hg16/bed/blastz.fr1.2004-04-19
mkdir axtBest
foreach f (axtChrom/*.axt)
set chr=$f:t:r
echo axtBesting $chr
axtBest $f $chr axtBest/$chr.axt -minScore=300
end
# H-INVITATIONAL GENE ANNOTATION DATABASE (2004-04-29 kate)
# https://www.jbirc.aist.go.jp/hinv/top.html
# Create knownGene table to reference HINV gene ID's
# for link on knownGenes details page
# Also, create an HINV gene track, just to look at
# (probably not publish, as these are just mRNA alignments
# already visible on browser).
# download CDNA file (release 1.0)
ssh kksilo
mkdir /cluster/data/hinv
cd /cluster/data/hinv
wget http://www.jbirc.aist.go.jp/hinv/download/alldata/flatfile/FCDNA.gz
gunzip FCDNA.gz
mv FCDNA FCDNA.1.0
# set up assembly work area
ssh eieio
cd /cluster/data/hg16
mkdir -p bed/hinv
cd bed/hinv
# extract H-INV ID's and Genbank accessions of mRNAs
awk '/CDNA_ACCESSION-NO:/ {print $2}' < /cluster/data/hinv/FCDNA.1.0 \
> accessions.txt
awk '/CDNA_H-INVITATIONAL-ID:/ {print $2}' < /cluster/data/hinv/FCDNA.1.0 \
> ids.txt
paste accessions.txt ids.txt > queries.txt
# create PSL file from alignments for these mRNA's, extracted from the
# table of all aligned mRNA's
hgsql hg16 -s -e "SELECT * FROM all_mrna" | cut -f 2- > all_mrna.tab
pslReps /dev/null stdout /dev/null | cat - all_mrna.tab > all_mrna.psl
# using pslReps to generate the PSL file header
pslSelect -queryPairs=queries.txt all_mrna.psl hinv_mrna.psl
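# queries.txt is tab-separated, one mRNA accession paired with its H-INV
# id per line, e.g. (illustrative values only):
#   AK000001        HIT000000001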
# load track of mrna alignments
ssh hgwdev
cd /cluster/data/hg16/bed/hinv
hgLoadPsl hg16 -table=HInvGeneMrna hinv_mrna.psl
# also make a gene track using the genomic exon coordinates for build34
# in the FCDNA file. NOTE: not all of the genes have these
ssh kksilo
cd /cluster/data/hg16/bed/hinv
/cluster/data/hinv/hinvToGff.pl < /cluster/data/hinv/FCDNA.1.0 > hinv.gff
ssh hgwdev
cd /cluster/data/hg16/bed/hinv
ldHgGene hg16 HInvGene hinv.gff
# Read 40140 transcripts
# TrackDb for this
# track HInvGene
# shortLabel H-INV Gene
# longLabel H-Invitational Genes
# group genes
# priority 37
# visibility hide
# color 0,100,180
# type genePred .
# also make a table with various useful items for each gene
ssh hgwdev
hgsql hg16 < ~/kent/src/hg/lib/HInv.sql
cd /cluster/data/hg16/bed/hinv
/cluster/data/hinv/hinvToTable.pl < /cluster/data/hinv/FCDNA.1.0 > HInv.tab
echo 'load data local infile "HInv.tab" into table HInv' | hgsql hg16
# create table for knownGenes detail page
ssh hgwdev
cd /cluster/data/hg16/bed/hinv
hgMapToGene hg16 HInvGeneMrna knownGene knownToHInv
# GENERATE GALGAL2 MAF FOR MULTIZ FROM NET (DONE 5/10/04 angie)
ssh kksilo
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/axtChain
netSplit human.net net
ssh kolossus
cd /cluster/data/hg16/bed/blastz.galGal2.2004-02-25
mkdir axtNet
foreach f (axtChain/net/*)
set chr = $f:t:r
netToAxt $f axtChain/chain/$chr.chain /cluster/data/hg16/nib \
/cluster/data/galGal2/nib stdout \
| axtSort stdin axtNet/$chr.axt
end
mkdir mafNet
foreach f (axtNet/chr*.axt)
set maf = mafNet/$f:t:r.hg.maf
axtToMaf $f \
/cluster/data/hg16/chrom.sizes /cluster/data/galGal2/chrom.sizes \
$maf -tPrefix=hg16. -qPrefix=galGal2.
end
# MULTIZ HUMAN/MOUSE/RAT/GALGAL2 WITH NET MAF FOR ALL (DONE 5/10/04 angie)
# (galGal2 net maf added to human/mouse/rat alignments described above [HUMOR])
# put the MAFs on bluearc
ssh eieio
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hmr
mkdir -p /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hg
cp /cluster/data/hg16/bed/humor.2003-09-08/hmr/*.maf \
/cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hmr
cp /cluster/data/hg16/bed/blastz.galGal2.2004-02-25/mafNet/*.maf \
/cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hg
ssh kki
mkdir /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet
mkdir hmrg
# Wrapper script required because of stdout redirect:
cat << '_EOF_' > doMultiz
#!/bin/csh
/cluster/bin/penn/multiz $1 $2 - > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doMultiz
rm -f jobList
foreach file (/cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hmr/*.maf)
set root=$file:t:r:r
echo "doMultiz /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet/hg/${root}.hg.maf $file /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet/hmrg/${root}.maf" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 40 of 41 jobs
#Crashed: 1 jobs
#Average job time: 84s 1.40m 0.02h 0.00d
#Longest job: 267s 4.45m 0.07h 0.00d
#Submission to last job: 290s 4.83m 0.08h 0.00d
# The crash was due to empty hg/chr18_random.hg.maf -- OK.
# clean up bluearc (these are big files!)
rm -r /cluster/bluearc/multiz.hg16mm3rn3galGal2.allNet
# put this out there for Glenn Tesler (not a browser track!)
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet/hmrg
gzip *
ssh hgwdev
mkdir /usr/local/apache/htdocs/angie/hg16.multizMm3Rn3GalGal2.allNet
foreach f (/cluster/data/hg16/bed/multiz.hg16mm3rn3galGal2.allNet/hmrg/*)
ln -s $f /usr/local/apache/htdocs/angie/hg16.multizMm3Rn3GalGal2.allNet
end
# EPONINE TSS PREDICTION (DONE 5/21/04 angie)
# Eponine runs fine on 2.5Mb contig, but barfs on much larger contig;
# chop up sequence at gaps into ~2.5Mb chunks for cluster run.
ssh eieio
mkdir /cluster/bluearc/hg16/chunks
cd /cluster/data/hg16
# Note: faSplit seems to ignore the ".chunk_" suffix below:
foreach f (?{,?}/NT_*/NT_??????.fa)
set ctg = $f:t:r
faSplit -minGapSize=10 -lift=/cluster/bluearc/hg16/chunks/$ctg.lft \
gap $f 2500000 /cluster/bluearc/hg16/chunks/$ctg.chunk_
end
mkdir /cluster/data/hg16/bed/eponine
cd /cluster/data/hg16/bed/eponine
wget http://www.sanger.ac.uk/Software/analysis/eponine/eponine-scan.jar
cat << '_EOF_' > doEpo
#!/bin/csh
set path=(/usr/java/j2re1.4.1_01/bin $path)
java -jar ./eponine-scan.jar -threshold 0.999 -seq $1 > $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doEpo
cp /dev/null jobList
foreach f (/cluster/bluearc/hg16/chunks/NT*.fa)
echo "./doEpo {check in line+ $f} {check out exists out/$f:t:r.gff}" \
>> jobList
end
mkdir out
ssh kk9
cd /cluster/data/hg16/bed/eponine
para create jobList
para try, check, push, check, ...
#Completed: 1588 of 1588 jobs
#Average job time: 208s 3.47m 0.06h 0.00d
#Longest job: 447s 7.45m 0.12h 0.01d
#Submission to last job: 3591s 59.85m 1.00h 0.04d
# lift chunks -> contigs
mkdir contigs/
foreach l (/cluster/bluearc/hg16/chunks/*.lft)
set ctg = $l:t:r
liftUp contigs/$ctg.gff $l warn out/${ctg}*.gff
end
# lift contigs -> chrom
liftUp eponine.gff ../../jkStuff/liftAll.lft warn contigs/NT_*.gff
# Translate to bed 4 + float-score -- it would be a shame to lose
# those scores in genePred or bed 5 (int score)
awk 'BEGIN {i=0;} \
{printf "%s\t%d\t%d\t%s.%d\t%s\t%s\n", $1, $4-1, $5, $1, i, $6, $7; \
i = i + 1;}' \
eponine.gff > eponine.bed
# load up
ssh hgwdev
cd /cluster/data/hg16/bed/eponine
sed -e 's/bed6FloatScore/eponine/g' \
$HOME/kent/src/hg/lib/bed6FloatScore.sql > eponine.sql
hgLoadBed hg16 eponine eponine.bed -tab -sqlTable=eponine.sql
# RELOAD ENSEMBL GENES WITH VERSION 34d (DONE 2004/05/20 baertsch)
# save current tables, just in case.
rename table ensGene to ensGene_old;
rename table ensGtp to ensGtp_old;
rename table ensPep to ensPep_old;
mkdir /cluster/data/hg16/bed/ensembl34d
cd /cluster/data/hg16/bed/ensembl34d
# Get the ensembl protein data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the ouput. choose gzip compression. hit export.
# Save as ensbuild34d.gff.gz
# Ensembl handles random chromosomes differently than us, so we
# strip this data. Fortunately it just loses a couple of genes.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Finally, get rid of the ".1" or ".2" after the name
zcat ensbuild34d.gff.gz \
| grep -v ^6_DR51 \
| grep -v ^DR51 \
| grep -v _NT_ \
| perl -wpe 's/^([0-9]|X|Y|Un)/chr$1/ \
|| die "Line $. doesnt start with human chrom:\n$_"' \
| sed -e 's/\..\"/\"/g' \
> ensGene.gtf
ssh hgwdev
/cluster/bin/i386/ldHgGene -gtf -genePredExt hg16 ensGene \
/cluster/data/hg16/bed/ensembl34d/ensGene.gtf
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql
echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16
gzip ensGtp.txt
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# Page 3) Choose the "Sequences" box.
# Page 4) Transcripts/Proteins. Peptide. Format = FASTA.
# Save file as ensemblPep.fa.gz
zcat ensemblPep.fa.gz | hgPepPred hg16 ensembl stdin
# compare size of old and new tables as a sanity check
drop table ensGene_old;
drop table ensGtp_old;
drop table ensPep_old;
# Create knownToEnsembl column
hgMapToGene hg16 ensGene knownGene knownToEnsembl
#### BUILD Ensembl cross-reference table, ensemblXref3 (DONE - 2004-05-24 - Fan)
# Get the ensembl gene/protein cross-reference data from
# http://www.ensembl.org/Homo_sapiens/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Homo_sapiens choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Feature" box, select gene, transcript, protein,
SPTrEMBL ID, SWISSPROT ID, and SWISSPROT AC
# Page 4) Choose "Text, tab separated". choose gzip compression. hit export.
# Save as ensXref.txt
sed ensXref.txt -e 's/\./\t/g' > ensemblXref3.tab
hgsql hg16 -e "drop table ensemblXref3"
hgsql hg16 < ~/src/hg/lib/ensemblXref3.sql
hgsql hg16 -e 'load data local infile "ensemblXref3.tab" into table ensemblXref3 ignore 1 lines'
#### REBUILD SUPERFAMILY RELATED TABLES (DONE - 2004-05-21 - Fan)
# Download Superfamily data files and build the Superfamily DB
# from supfam.mrc-lmb.cam.ac.uk
mkdir /cluster/store1/superFamily/040516
cd /cluster/store1/superFamily/040516
# ftp over the following two files:
ass_16-May-2004.tab.gz
supfam_16-May-2004.sql.gz
# This may take about an hour.
hgsql hg16 -e "create database superfam040516"
hgsql superfam040516 < supfam_16-May-2004.sql
# Make sure to add an index on id of the des table of superfam040516.
hgsql superfam040516 < ~/src/hg/lib/sfAssign.sql
hgsql superfam040516 -e 'load data local infile "ass_16-May-2004.tab" into table superfam040516.sfAssign;'
# Build or rebuild Superfamily track and create sf tables needed for PB
hgsql hg16 < ~/src/hg/lib/sfAssign.sql
cd /cluster/store1/superFamily/040516
hgsql hg16 -e 'load data local infile "ass_16-May-2004.tab" into table hg16.sfAssign;'
# If hg16.sfDes already exists, drop it.
hgsql superfam040516 -e "select * from des" >sfDes.tab
hgsql hg16 < ~/src/hg/lib/sfDes.sql
hgsql hg16 -e 'load data local infile "sfDes.tab" into table hg16.sfDes ignore 1 lines;'
# If hg16.superfamily already exists, drop it.
hgSuperfam hg16 > sf.log
# It is normal that many proteins do not have corresponding Superfamily entries.
# If hg16.sfDescription exists, drop it.
hgsql hg16 < ~/src/hg/lib/sfDescription.sql
hgsql hg16 -e 'LOAD DATA local INFILE "sfDescription.tab" into table hg16.sfDescription;'
# Finally, load the superfamily table.
hgLoadBed hg16 superfamily superfamily.tab -tab
# Create knownToSuperfamily table
cat /cluster/store1/superFamily/040516/ass_16-May-2004.tab \
| hgKnownToSuper hg16 hs stdin
# creates 32542 rows in knownToSuper
# seq table acc field is too small; up the max to match new hgLoadSeq
# schema (2004/05/22 markd)
alter table seq modify column `acc` varchar(128) NOT NULL default '';
#### Blat knownGene proteins to determine exons (braney 2004-06-02)
ssh kk
mkdir -p /cluster/data/hg16/bed/blat.hg16KG.2004-05-27
cd /cluster/data/hg16/bed
rm blat.hg16KG
ln -s blat.hg16KG.2004-05-27 blat.hg16KG
cd blat.hg16KG
pepPredToFa hg16 knownGenePep known.fa
hgPepPred hg16 generic blastKGPep00 known.fa
cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
chmod +x blatSome
ls -1S /scratch/hg/gs.17/build34/bothMaskedNibs/*.nib > human.lst
mkdir kgfa
cd kgfa
faSplit sequence ../known.fa 300 kg
ls -1S kgfa/*.fa > kg.lst
cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 human.lst kg.lst blatGsub blatSpec
mkdir psl
cd psl
foreach i (`cat ../human.lst`)
mkdir `basename $i .nib`
end
para create blatSpec
para push
# Completed: 12222 of 12222 jobs
# CPU time in finished jobs: 23286365s 388106.09m 6468.43h 269.52d 0.738 y
# IO & Wait Time: 710342s 11839.03m 197.32h 8.22d 0.023 y
# Average job time: 1963s 32.72m 0.55h 0.02d
# Longest job: 106239s 1770.65m 29.51h 1.23d
# Submission to last job: 106248s 1770.80m 29.51h 1.23d
pslSort dirs raw.psl /tmp psl/*
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
sort -rn cooked.psl | pslUniq stdin hg16KG.psl
pslxToFa hg16KG.psl hg16KG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
kgName hg16 hg16KG.psl blastKGRef00
ssh hgwdev
cd /cluster/data/hg16/bed/blat.hg16KG
hgsql hg16 < ~/kent/src/hg/lib/blastRef.sql
echo "rename table blastRef to blastKGRef00" | hgsql hg16
echo "load data local infile 'blastKGRef00' into table blastKGRef00" | hgsql hg16
### RUN BLASTZ VS. MACACA MULATTA
#get sequence from trace repository
cd /cluster/bluearc/macaca
for i in 01 02 03 04 05 06 07 08 09 10 11 12 13 14 ; do echo $i ; wget ftp://ftp.ncbi.nih.gov/pub/TraceDB/macaca_mulatta/fasta.macaca_mulatta.0$i.gz ; done
# distribute contigs to bluearc and /iscratch/i for cluster run
#split the sequence into 1mb chunks (about 13k reads per file)
ssh kksilo
mkdir -p /cluster/bluearc/macaca/split
for i in 001 002 003 004 005 006 007 008 009 010 011 012 013 014 ; do faSplit about macaca_mulatta.$i.fa 10000000 split/$i/mac ; done
find split -name \*.fa > mac.lst
hgsql hg16 -N < chromLen.sql > S1.len
ssh kkr1u00
mkdir -p /iscratch/i/macaca/
df /iscratch/i
cp -r /cluster/bluearc/macaca/split/* /iscratch/i/macaca
#flatten directory structure for Angie's scripts
cd /iscratch/i/macaca
for i in `ls` ; do cd /iscratch/i/macaca/$i ; for j in `ls` ; do mv $j ../$i.$j ; done ; done
/cluster/bin/scripts/iSync
# make DEF file for blastz
ssh kksilo
cd /cluster/bluearc/macaca
# NOTE: need schwartzbin below for utils still not in penn bin
cat << '_EOF_' > DEF
# human vs. macaca mulatta
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/blastz/human_mulatta.q
BLASTZ_ABRIDGE_REPEATS=0
SEQ1_DIR=/scratch/hg/gs.17/build34/bothMaskedNibs/
SEQ1_RMSK=/scratch/hg/gs.17/build34/rmsk/
SEQ1_SMSK=
SEQ1_FLAG=-primate
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ2_DIR=/iscratch/i/macaca/
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=-primate
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/bluearc/macaca
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
DEBUG=0
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place
cp DEF ~angie/hummus/DEF.hg16-rm0.`date -I`
ssh kk
cd /cluster/bluearc/macaca
# source the DEF file to establish environment for following commands
bash
source ./DEF
cp /cluster/data/mm4/jkStuff/BlastZ_run0.sh .
./BlastZ_run0.sh
cd run.1
para try
para check
para push
# Second cluster run to convert the .out's to .lav's
cp /cluster/data/mm4/jkStuff/BlastZ_run1.sh .
ssh kk
cd /cluster/bluearc/macaca
bash
source DEF
./BlastZ_run1.sh
cd run.2
para try
para check
para push
# Prepare third cluster run script to convert lav's to axt's
cd /cluster/bluearc/macaca/
cat << '_EOF_' > ../../jkStuff/BlastZ_run2.sh
#!/bin/sh
# prepare third cluster run for blastz processing
# NOTE: should run this on iservers (4G),
# with chr19 and chr1 on kolossus (8G)
M=`uname -n`
if [ "$M" != "kk" ]; then
echo "ERROR: you are on machine: '$M'"
echo -e "\tthis script expects machine kk"
exit 255
fi
source DEF
mkdir axtChrom
mkdir run.2
cd run.2
# usage: blastz-contiglav2axt lav-dir axt-file seq1-dir seq2-file
echo '#LOOP' > gsub
echo '/cluster/bin/scripts/blastz-contiglav2axt '${BASE}'/lav/$(root1) {check out line+ '${BASE}'/axtChrom/$(root1).axt} '${SEQ1_DIR}' /cluster/bluearc/macaca/split/'${path2} >> gsub
echo '#ENDLOOP' >> gsub
ls -1S ${BASE}/lav > chrom.list
gensub2 chrom.list ../mac.lst gsub jobList
wc -l jobList
echo "running 'para create'"
para create jobList
echo "Ready for cluster run. para try, check, push, etc ..."
'_EOF_'
chmod +x ../../jkStuff/BlastZ_run2.sh
# Third cluster run to convert lav's to axt's
source DEF
../../jkStuff/BlastZ_run2.sh
cd run.2
para try, check, push, etc ...
# NOTE: ran this on kolossus and mini-cluster
# 30 min. to 2 hrs. per chrom
# Wrapper script required because of stdout redirect:
cd /cluster/bluearc/macaca
cat << '_EOF_' > doMultiz
#!/bin/csh
/cluster/bin/penn/multiz $1 $2 - > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doMultiz
rm -f jobList
foreach file (/cluster/data/pt0/bed/blastz-blatHg16.2003-11-24/maf/*.maf)
set root=$file:t:r:r
echo "doMultiz /cluster/data/pt0/bed/blastz-blatHg16.2003-11-24/maf/${root}.maf $file /cluster/bluearc/macaca/blastz.hg16/${root}.maf" >> jobList
end
para create jobList
para try, check, push, check
## end of blastz macaca mulatta alignment
# seq table acc field is too small; up the max to match new hgLoadSeq
# schema (2004/05/22 markd)
alter table seq modify column `acc` varchar(128) NOT NULL default '';
#### Blat knownGene proteins to determine exons (braney 2004-06-02)
ssh kk
cd /cluster/data/hg16/bed
mkdir blat.hg16KG.2004-05-27
rm blat.hg16KG
ln -s blat.hg16KG.2004-05-27 blat.hg16KG
pepPredToFa hg16 knownGenePep known.fa
grep ">" known.fa | sed "s/>//" > kgName.lst
kgName hg16 kgName.lst kg.mapNames
cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
chmod +x blatSome
ls -1S /scratch/hg/gs.17/build34/bothMaskedNibs/*.nib > human.lst
mkdir kgfa
cd kgfa
faSplit sequence ../known.fa 300 kg
ls -1S kgfa/*.fa > kg.lst
cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 human.lst kg.lst blatGsub blatSpec
mkdir psl
cd psl
foreach i (`cat ../human.lst`)
mkdir `basename $i .nib`
end
para create blatSpec
para push
# Completed: 12222 of 12222 jobs
# CPU time in finished jobs: 23286365s 388106.09m 6468.43h 269.52d 0.738 y
# IO & Wait Time: 710342s 11839.03m 197.32h 8.22d 0.023 y
# Average job time: 1963s 32.72m 0.55h 0.02d
# Longest job: 106239s 1770.65m 29.51h 1.23d
# Submission to last job: 106248s 1770.80m 29.51h 1.23d
pslSort dirs raw.psl /tmp psl/*
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
# (a pslUniq step producing uniq.psl evidently belongs here, as in the
# earlier pass above: sort -rn cooked.psl | pslUniq stdin uniq.psl)
pslxToFa uniq.psl uniq_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
# LIFTOVER CHAINS TO HG17 (DONE 2004-07-14 kate)
# run alignment
# NOTE: split hg17 to /iscratch/i is doc'ed in makeHg17.doc
ssh kk
cd /cluster/data/hg16
makeLoChain-align hg16 /scratch/hg/gs.17/build34/bothMaskedNibs \
hg17 /iscratch/i/hg17/liftOver/split
# Created parasol job in bed/blat.hg17.2004-07-14/run
cd bed
rm blat.hg17
ln -s blat.hg17.2004-07-14 blat.hg17
cd blat.hg17/run
para try
para check
para push
# lift results
# the lift directory was defined in makeHg17.doc when split was performed
# this expects data in bed/blat.hg17, so symlink must be there
# use kolossus for speed
ssh kolossus
cd /cluster/data/hg16/bed/blat.hg17
makeLoChain-lift hg16 hg17 /cluster/data/hg17/bed/liftOver/liftSplit \
>&! lift.log &
tail -100f lift.log
# 25 minutes
# chain alignments
ssh kk
makeLoChain-chain hg16 /cluster/data/hg16/nib hg17 /cluster/data/hg17/nib
# Created parasol job in /cluster/data/hg16/bed/blat.hg17/chainRun
cd /cluster/data/hg16/bed/blat.hg17/chainRun
para try
# 46 jobs
para check
para push
# make alignment net
ssh kolossus
makeLoChain-net hg16 hg17
# load into database and copy to download directory
ssh hgwdev
makeLoChain-load hg16 hg17
cp /cluster/data/hg16/bed/blat.hg17/over.chain \
/cluster/data/hg16/bed/liftOver/hg16ToHg17.chain
# Finished loading hg16ToHg17.over.chain
# Now, add download link for /usr/local/apache/htdocs/goldenPath/hg16/liftOver/hg16ToHg17.over.chain.gz
# LIFTOVER CHAIN FROM HG17 TO HG16 (IN PROGRESS 2005-01-03 kate)
ssh kolossus
cd /cluster/data/hg16/bed/blat.hg17
mkdir net.hg17
cd chain
chainMergeSort
chainNet stdin /cluster/data/hg16/chrom.sizes \
/cluster/data/hg17/chrom.sizes \
/dev/null ../net.hg17
time chainSwap
netChainSubset net.hg17
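# The commands above were left unfinished; a sketch of how the pieces
# would typically fit together (file names hypothetical, not what was
# actually run):
#   chainMergeSort chain/*.chain | chainSwap stdin stdout \
#     | chainSort stdin hg17ToHg16.all.chain
#   chainNet hg17ToHg16.all.chain /cluster/data/hg17/chrom.sizes \
#     /cluster/data/hg16/chrom.sizes hg17ToHg16.net /dev/null
#   netChainSubset hg17ToHg16.net hg17ToHg16.all.chain hg17ToHg16.over.chain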
# ENCODE Regions (kate)
# NOTE: these instructions are not yet complete (scripts and datafiles
# are currently in ~kate/encode)
mkRegionsBed.pl build34_regions.txt > encodeRegionsHg16.bed
hgLoadBed hg16 encodeRegions encodeRegionsHg16.bed -noBin
mkdir -p /cluster/data/hg16/bed/encodeRegions
cp encodeRegionsHg16.bed /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed
# Create hgFixed table for name+description
hgsql -D hgFixed < ${HOME}/kent/src/hg/lib/encodeRegionInfo.sql
sed -e 's/^/INSERT INTO encodeRegionInfo (name, descr) VALUES (\"/' \
-e 's/|/\",\"/' \
-e 's/$/\");/' < regionInfo.txt | hgsql -D hgFixed
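# regionInfo.txt is name|description, one region per line; a
# (hypothetical) line like
#   ENm001|CFTR region
# becomes
#   INSERT INTO encodeRegionInfo (name, descr) VALUES ("ENm001","CFTR region");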
# create frameset for region display
make
# create sequence downloads
set dir = /usr/local/apache/htdocs/ENCODE/sequences
rm sizes.txt
foreach b (hg12 hg13 hg15 hg16)
encodeSequence.pl regions.$b.txt /cluster/data/$b/nib > $b.fa
cp $b.fa $dir
faCount $b.fa | awk '{print $1, $2}' > $dir/${b}_count.txt
echo $b >> sizes.txt
faSize $b.fa >> sizes.txt
echo "" >> sizes.txt
end
cp sizes.txt $dir
cd $dir
md5sum *.fa > md5sum.txt
# QA
checkEncodeRegions.pl regions.hg12.txt /cluster/data/hg12/nib > hg12.check
cp sizes.txt $dir
# etc.
csh printRegionDiffs.csh > regionDiffs.out
# UN-ANNOTATED (EXCEPT FOR CROSS-SPECIES) REGIONS (DONE 6/8/04 angie)
# Anton Nekrutenko asked for this... easy to do with featureBits!
# NOTE: excluding mRNAs this time because of the controversial
# just-submitted-to-GenBank intronic BV* "mRNA" seqs.
ssh hgwdev
mkdir /cluster/data/hg16/bed/unAnnotated
cd /cluster/data/hg16/bed/unAnnotated
nice featureBits hg16 -minSize=12 \
\!gap \
\!knownGene \!refGene \!mgcGenes \
\!vegaGene \!vegaPseudoGene \!ensGene \!acembly \!ECgene \
\!geneid \!genscan \!twinscan \!slamMouse \!sgpGene \!softberryGene \
\!rnaGene \!superfamily \
\!est \!xenoMrna \!HInvGene \!tigrGeneIndex \
\!uniGene_2 \
\!cpgIsland \!rmsk \!simpleRepeat \
-bed=unAnnotated.bed
#905732944 bases of 2865248791 (31.611%) in intersection
hgLoadBed hg16 unAnnotated unAnnotated.bed
# not much of a drop in coverage with the -minSize:
nice featureBits hg16 unAnnotated
#903585585 bases of 2865248791 (31.536%) in intersection
# ANDY LAW CPG ISLANDS (DONE 6/15/04 angie)
# See notes about this in makeGalGal2.doc.
ssh eieio
mkdir /cluster/data/hg16/bed/cpgIslandGgfAndy
cd /cluster/data/hg16/bed/cpgIslandGgfAndy
cp /dev/null cpgIslandAndy.bed
cp /dev/null cpgIslandGgfAndy.bed
foreach f (../../?{,?}/chr*.fa)
set chr = $f:t:r
echo preproc $chr
/cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.preproc
echo running original on $chr
awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.preproc \
| /cluster/home/angie/andy-cpg-island.pl \
| perl -wpe '$i=0 if (not defined $i); \
chomp; ($s,$e) = split("\t"); $s--; \
$_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \
>> cpgIslandAndy.bed
echo running modified on $chr
/cluster/home/angie/ggf-andy-cpg-island.pl $chr.preproc \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndy.bed
end
# load into database:
ssh hgwdev
cd /cluster/data/hg16/bed/cpgIslandGgfAndy
# this one is a bed 4:
hgLoadBed hg16 cpgIAndy -tab -noBin cpgIslandAndy.bed
# this one is a cpgIslandExt but with a different table name:
sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql
hgLoadBed hg16 cpgIslandGgfAndy -tab -noBin \
-sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed
# WOW, even masking out repeat bases from the results, there's a huge
# increase in reported islands!!
featureBits hg16 cpgIsland
#21077002 bases of 2865248791 (0.736%) in intersection
featureBits hg16 cpgIslandGgfAndy
#135249416 bases of 2865248791 (4.720%) in intersection
featureBits hg16 cpgIslandGgfAndy \!rmsk
#68714633 bases of 2865248791 (2.398%) in intersection
wc -l ../cpgIsland/cpgIsland.bed *bed
# 27596 ../cpgIsland/cpgIsland.bed
# 376478 cpgIslandAndy.bed
# 260761 cpgIslandGgfAndy.bed
# http://www.pnas.org/cgi/content/full/99/6/3740
# Takai D Jones PA
# Comprehensive analysis of CpG islands in human chromosomes 21 and 22
#
# Regions of DNA of greater than 500 bp with a G+C equal to or
# greater than 55% and observed CpG/expected CpG of 0.65 were more
# likely to be associated with the 5' regions of genes and this
# definition excluded most Alu-repetitive elements.
#
# Also, our description reduced the number of CpG islands located
# on these chromosomes from 14,062 to 1,101, which is more
# consistent with the expected number of genes (750) located on
# these two chromosomes.
#
# To exclude "mathematical CpG islands" (for example, a 300-bp
# sequence containing one G, 150 Cs, and only one CpG, which would
# meet the criteria of a CpG island), we added one more condition:
# that there are at least seven CpGs in these 200 bp. This number
# was selected on the basis that there would be 200/16 (i.e.,
# 12.5) CpGs in a random DNA fragment containing no suppression of
# CpG. Because Gardiner-Garden and Frommer's criterion (1) of
# ObsCpG/ExpCpG of 0.6 would accommodate (0.6 × 12.5) CpGs (i.e.,
# 7.5), we selected seven CpGs as being a reasonable cutoff for
# the initial analysis.
#
egrep -w '^chr2[12]' ../cpgIsland/cpgIsland.bed | wc -l
# 1033
egrep -w '^chr2[12]' cpgIslandAndy.bed | wc -l
# 16462
# Hmm, how did I find fewer with looser params?? Better run Takai and
# Jones's script on chr21 and chr22 for comparison...
egrep -w '^chr2[12]' cpgIslandGgfAndy.bed |wc -l
# 10680
# OK, I just have to try again with masked sequence:
ssh eieio
cd /cluster/data/hg16/bed/cpgIslandGgfAndy
cp /dev/null cpgIslandMaskedAndy.bed
cp /dev/null cpgIslandMaskedGgfAndy.bed
foreach f (../../?{,?}/chr*.fa.masked.gz)
set chr = $f:t:r:r:r
echo preproc $chr
zcat $f \
| /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \
> $chr.masked.preproc
echo running original on $chr
awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.masked.preproc \
| /cluster/home/angie/andy-cpg-island.pl \
| perl -wpe '$i=0 if (not defined $i); \
chomp; ($s,$e) = split("\t"); $s--; \
$_ = "'$chr'\t$s\t$e\tcpg$i\n"; $i++' \
>> cpgIslandMaskedAndy.bed
echo running modified on $chr
/cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandMaskedGgfAndy.bed
end
ssh hgwdev
cd /cluster/data/hg16/bed/cpgIslandGgfAndy
hgLoadBed hg16 cpgIAndyMasked -tab -noBin cpgIslandMaskedAndy.bed
# this one is a cpgIslandExt but with a different table name:
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandMaskedGgfAndy.sql
hgLoadBed hg16 cpgIslandGgfAndyMasked -tab -noBin \
-sqlTable=cpgIslandMaskedGgfAndy.sql cpgIslandMaskedGgfAndy.bed
featureBits hg16 cpgIAndyMasked
#93307698 bases of 2865248791 (3.257%) in intersection
featureBits hg16 cpgIslandGgfAndyMasked
#56180461 bases of 2865248791 (1.961%) in intersection
wc -l *ed
# 376478 cpgIslandAndy.bed
# 260761 cpgIslandGgfAndy.bed
# 125851 cpgIslandMaskedAndy.bed
# 80350 cpgIslandMaskedGgfAndy.bed
# 6/28/04 -- masking simpleRepeats, and even repeats other than Alu's,
# might not be the right thing to do (?). Give it a try with less-masked
# sequence.
ssh eieio
cd /cluster/data/hg16/bed/cpgIslandGgfAndy
cp /dev/null cpgIslandGgfAndyOnlyRM.bed
cp /dev/null cpgIslandGgfAndyOnlyRMAlu.bed
foreach f (../../?{,?}/chr*.fa)
set chr = $f:t:r
echo preproc, ggf-andy $chr onlyRM
zcat $f.out.gz > /tmp/tmp.fa.out
maskOutFa $f /tmp/tmp.fa.out stdout \
| /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \
| /cluster/home/angie/ggf-andy-cpg-island.pl \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndyOnlyRM.bed
echo preproc, ggf-andy $chr onlyRMAlu
head -3 /tmp/tmp.fa.out > /tmp/tmp2.fa.out
awk '$11 == "SINE/Alu" {print;}' /tmp/tmp.fa.out >> /tmp/tmp2.fa.out
maskOutFa $f /tmp/tmp2.fa.out stdout \
| /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \
| /cluster/home/angie/ggf-andy-cpg-island.pl \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
$gc = $c + $g; $pCpG = (100.0 * 2 * $cpg / $n); \
$pGc = (100.0 * $gc / $n); \
$_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
"$pCpG\t$pGc\t$oE\n";' \
>> cpgIslandGgfAndyOnlyRMAlu.bed
end
# 80314 cpgIslandGgfAndyOnlyRM.bed
# 110598 cpgIslandGgfAndyOnlyRMAlu.bed
ssh hgwdev
cd /cluster/data/hg16/bed/cpgIslandGgfAndy
sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRM/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql
hgLoadBed hg16 cpgIslandGgfAndyOnlyRM -tab -noBin -sqlTable=/tmp/c.sql \
cpgIslandGgfAndyOnlyRM.bed
sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRMAlu/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql
hgLoadBed hg16 cpgIslandGgfAndyOnlyRMAlu -tab -noBin -sqlTable=/tmp/c.sql \
cpgIslandGgfAndyOnlyRMAlu.bed
featureBits hg16 cpgIslandGgfAndyOnlyRM
#56275308 bases of 2865248791 (1.964%) in intersection
featureBits hg16 cpgIslandGgfAndyOnlyRMAlu
#78743130 bases of 2865248791 (2.748%) in intersection
#### mrnaBlastz track - all mrnas aligned using blastz Robert 2/20/2004
mkdir /cluster/data/hg16/bed/mrnaBlastz
cd /cluster/data/hg16/bed/mrnaBlastz
/cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank genbank mrna mrna.fa -db=hg16 -native
faTrimPolyA mrna.fa hg16Mrna.fa
faSize hg16Mrna.fa -detailed=on > S2.len
mkdir /cluster/bluearc/hg/mrnaHg16
faSplit sequence hg16Mrna.fa 100 /cluster/bluearc/hg/mrnaHg16/mrna
ls -1 /cluster/bluearc/hg/mrnaHg16/ > mrna.lst
hgsql hg16 < chromInfo.sql > S1.len
awk '{print $1}' S1.len |grep -v random > S1.lst
cd /cluster/bluearc/hg/gs.17/build34/mrnaBlastz
make-joblist
para create spec
para push
~angie/hummus/do.out2lav DEF > j
para create j
para push
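# lav -> psl translation, pasted here as an inline tcsh script: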
#!/bin/tcsh
set base="/cluster/bluearc/hg/gs.17/build34/mrnaBlastz"
cd $base
mkdir -p pslRaw
foreach c (lav/*)
pushd $c
set chr=$c:t
set out=$base/pslRaw/$chr.psl
echo "Translating $chr lav to $out"
cat `ls -1 *.lav | sort -g` \
| lavToPsl stdin stdout \
| sed -e 's@scratch/hg/gs.17/build34/bothMaskedNibs//@@' | sed -e 's/\.nib:[0-9]*-[0-9]*//' > $out
popd
end
for i in `ls pslRaw/` ; do echo sortIt.sh pslRaw/$i pslSort/$i >> spec.sort ; done
para create spec.sort     # sorts pslRaw to pslSort
for i in `awk '{print $1}' S1.len` ; do echo pslFilterDups pslSort/$i.psl pslFilter/$i.psl >> spec.dup ; done
para create spec.dup      # filters pslSort to pslFilter using pslFilterDups
for i in `awk '{print $1}' S1.len` ; do echo axtChain -linearGap=linearGap.txt -psl pslFilter/$i.psl /scratch/hg/gs.17/build34/bothMaskedNibs/ -faQ /cluster/data/hg16/bed/mrnaBlastz/hg16Mrna.fa chain/$i.chain >> spec.chain ; done
para create spec.chain    # chains pslFilter to chain
mkdir chainFilter
for i in `awk '{print $1}' S1.len` ; do echo doFilter ../chain/$i.chain ../chainFilter/$i.chain >> spec.filter ; done
para create spec.filter   # filters chain to chainFilter using doFilter
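# sortIt.sh and doFilter themselves are not recorded in this doc; given
# that hgLoadPsl wants tName,tStart order (psl columns 14 and 16),
# sortIt.sh was presumably something like this (a guess):
#   #!/bin/sh
#   sort -k 14,14 -k 16,16n $1 > $2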
mkdir -p preNet
cd chainFilter
foreach i ( *.chain)
chainPreNet $i ../S1.len ../S2.len ../preNet/$i
end
ls /cluster/data/hg16/nib/*.nib > S1.lst
for i in `awk '{print $1}' S1.len`; do chainToPsl ../preNet/$i.chain ../S1.len ../S2.len ../S1.lst /cluster/data/hg16/bed/mrnaBlastz/hg16Mrna.fa ../psl/$i.psl >> spec.chain2psl.new ; echo $i done chainToPsl ; done
ssh kk9-10
para create spec.chain2psl.new
for i in `awk '{print $1}' S1.len`; do hgLoadPsl -noTNameIx hg16 -table=${i}_mrnaBlastz psl/$i.psl ; echo $i done ; done
## end of blastz Mrna track
#### BUILD RETROGENE TRACK ( done Robert 6/15/2004)
cp /cluster/data/genbank/data/aligned/genbank.137.0/hg16/full/mrna.native.rawPsl.gz .
gunzip mrna.native.rawPsl.gz
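# (the substr() below strips the .version suffix from qName, column 10)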
awk '{OFS="\t";print $1,$2,$3,$4,$5,$6,$7,$8,$9,substr($10,1,index($10,".")-1),$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23}' mrna.native.rawPsl > mrnaBlat.psl
hgLoadPsl hg16 mrnaBlat.psl
hgsql hg16 -N -B < refGene.sql > refGene.tab
cd /cluster/bluearc/hg/gs.17/build34/mrnaBlastz/
netToBed /cluster/data/hg16/bed/blastz.mm3/axtChain/mouseSynNet.net mouseSyn.bed
ssh eieio
pslCat -nohead -check all_mrna.psl /cluster/bluearc/hg/gs.17/build34/mrnaBlastz/psl/*.psl |awk '{print $0, $1*3-$2}' | sort -k 10,10 -k 22nr -T /tmp | awk '{OFS="\t"; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21}' > blatBlastz.psl
awk '{OFS="\t"; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21}' /scratch/blatBlastz.psl > /scratch/x.psl
hgsql hg16 < mrna.sql | grep -v matches | awk '{OFS="\t"; print $2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18,$19,$20,$21,$22}' > all_mrna.psl
tawk '$12 > 1 && $12 < 9999999{x=$11;$11=$12;$12=x;print $0}' /cluster/data/kgDB/bed/hg16/kgBestMrna/sortedKnownGene.tab > sortedKnownGene.tab
ssh kkr1u00
cd /cluster/data/hg16/bed/pseudo
cp refGene.tab /iscratch/i/hg/gs.17/build34/pseudo
cp /cluster/data/hg16/bed/simpleRepeat.bed /iscratch/i/hg/gs.17/build34/pseudo
cp mrnaHg16.fa /iscratch/i/hg/gs.17/build34/pseudo
cp mouseSyn.bed /iscratch/i/hg/gs.17/build34/pseudo
cp sortedKnownGene.tab /iscratch/i/hg/gs.17/build34/pseudo
pslSplit nohead -chunkSize=121 /iscratch/i/hg/gs.17/build34/pseudo blatBlastz.psl
cd /iscratch/i/hg/gs.17/build34/pseudo
iSync
ssh kk
cd /cluster/data/hg16/bed/pseudo
para create spec.kk
para push
#post process and load track
./buildSort.sh
### PHASTCONS HUMAN/CHIMP/MOUSE/RAT/CHICKEN (6/20/04, acs)
# this is an addendum to Katie's '5-WAY MULTIZ & PHYLO-HMM' (see above)
# just redoing the 'label' step with the new 'phastCons' program
# picking up where it says "compute the conservation scores"
ssh hgwdev
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
# set up wrapper for phastCons
cat << '_EOF_' > doPhastCons
#!/bin/sh
PHAST=/cluster/bin/phast
TMP=/tmp/phastCons
file=$1
root=`basename $file .ss.gz`
chrom=`echo $root | awk -F\. '{print $1}'`
mkdir -p $TMP PREDICTIONS/$chrom PHASTCONS/$chrom
zcat $file | $PHAST/phastCons - hpmrc_rev_dg.mod --nrates 20 --transitions 0.018,0.002 --viterbi PREDICTIONS/$chrom/$root.bed --score --seqname $chrom --quiet > ${TMP}/$root.pp
gzip -c $TMP/$root.pp > PHASTCONS/$chrom/$root.pp.gz
rm $TMP/$root.pp
'_EOF_'
chmod u+x doPhastCons
# the --transitions arguments are approximate maximum likelihood
# estimates obtained by running the program *without* --transitions
# (causes estimation by EM) on five randomly selected 1M bp
# windows. All estimates were in the same ballpark (took a rough average)
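# A sketch of one such estimation run (window file name hypothetical):
# leaving --transitions off causes phastCons to fit those parameters by
# EM on the window it is given; the fitted values were then averaged:
#   zcat WINDOWS/chr7.115000000-116000000.ss.gz \
#     | /cluster/bin/phast/phastCons - hpmrc_rev_dg.mod --nrates 20 --quiet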
# set up cluster job
ssh eieio
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
cp WINDOWS/*.ss.gz /cluster/bluearc/hg16/bed/hg16mm3rn3panTro1galGal2-SS/
logout
rm -f jobs.lst
for file in /cluster/bluearc/hg16/bed/hg16mm3rn3panTro1galGal2-SS/*.ss.gz ; do echo doPhastCons $file >> jobs.lst ; done
ssh kk
cd /cluster/data/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/phyloHMM
para create ; para try ; para push ... etc.
# now create tracks
mkdir -p PHASTCONS/wib
for dir in PHASTCONS/chr* ; do \
echo $dir ;\
chr=`basename $dir` ;\
zcat `ls $dir/*.pp.gz | sort -t\. -k2,2n` | \
wigAsciiToBinary -chrom=$chr \
-wibFile=PHASTCONS/wib/${chr}_phastCons stdin ;\
done
hgLoadWiggle hg16 phastCons PHASTCONS/wib/chr*_phastCons.wig
mkdir -p /gbdb/hg16/wib
rm -f /gbdb/hg16/wib/chr*phastCons.wib
ln -s `pwd`/PHASTCONS/wib/*.wib /gbdb/hg16/wib
chmod 775 . PHASTCONS PHASTCONS/wib
chmod 664 PHASTCONS/wib/*.wib
# tweak scores and names of predictions
cat PREDICTIONS/*/*.bed | sed 's/id //' | \
awk '{printf "%s\t%s\t%s\tlod=%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", \
$1, $2, $3, $5, 147.49 * log($5) - 240.34, $6, $7, $8, $9, \
$10, $11, $12}' > all.bed
hgLoadBed hg16 phastConsElements all.bed
# Scores are transformed as follows, for a reasonable-looking
# "spectrum". Let x_max be the maximum score (here
# x_max = 4490) and let x_med be the median score (here x_med =
# 39). The scores are transformed via the function f(x) = a *
# log x + b, s.t. f(x_med) = 300 and f(x_max) = 1000. Solving
# for a and b, you get b = (300 log x_max - 1000 log x_med) /
# (log x_max - log x_med), a = (1000 - b) / log x_max. Here a =
# 147.49, b = -240.34
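# A quick check of those constants (a minimal sketch; x_med and x_max
# hard-coded from the values above):
awk 'BEGIN { xMax = 4490; xMed = 39;
  b = (300*log(xMax) - 1000*log(xMed)) / (log(xMax) - log(xMed));
  a = (1000 - b) / log(xMax);
  printf "a = %.2f, b = %.2f\n", a, b; }'
# a = 147.49, b = -240.34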
#track phastCons
#shortLabel phastCons
#longLabel phastCons Conservation Score, Human/Chimp/Mouse/Rat/Chicken
#group compGeno
#priority 103
#visibility hide
#color 0,10,100
#maxHeightPixels 40
#type wig 0.0 1.0
#autoScaleDefault off
#track phastConsElements
#shortLabel phastConsElements
#longLabel phastCons Conserved Elements, Human/Chimp/Mouse/Rat/Chicken
#group compGeno
#priority 104
#visibility hide
#spectrum on
#color 0,60,120
#altColor 200,220,255
#exonArrows off
#type bed 12 .
# Ensembl 34d GENE PREDICTIONS (2004-07-13 baertsch)
## reloaded ensGene to add frame info, no change to data
/cluster/bin/i386/ldHgGene -gtf -genePredExt hg16 ensGene \
/cluster/data/hg16/bed/ensembl34d/ensGene.gtf
# TWINSCAN 1.3 GENE PREDICTIONS (2004-07-13 baertsch)
## reloaded twinscan to add frame info, no change to data
ldHgGene hg16 twinscan chr_gtf/chr*.gtf -gtf -genePredExt
#### AFFYTRANSFRAG AND AFFYTRANSCRIPTION TRACKS - (2004-07-21 sugnet)
# tracks covering about 1/3 of genome with probes
# every 5bp and hybridized to RNA from SK-N-AS cell line.
# Lifted from genome version hg15.
# affyTransfrag track: lift transfrags to hg16
cd /cluster/store6/weber/affy/transfrags/transfragsLabeled/
mkdir hg16
cd hg16
liftOver ../SK_phase2_tfgs_final.biggerThan50bp.tab /cluster/store4/gs.17/build34/bed/bedOver/33to34.chain \
SK_phase2_tfgs_final.hg16.bed SK_phase2_tfgs_final.err.bed
# check to make sure that most lifted...
wc *.bed
# 12 49 346 SK_phase2_tfgs_final.err.bed
# 170749 853745 6936780 SK_phase2_tfgs_final.hg16.bed
# 170761 853794 6937126 total
hgLoadBed hg16 affyTransfrags SK_phase2_tfgs_final.hg16.bed
# Reading SK_phase2_tfgs_final.hg16.bed
# Loaded 170749 elements of size 5
# Sorted
# Creating table definition for
# Saving bed.tab
# Loading hg16
# affyTranscription track:
cd /cluster/store6/weber/affy/graph/hg15/gz
gunzip *.gz
mkdir hg16
cd hg16
ln -s ../*.signal ./
# remapGraphs.pl just makes a quick bed file for each signal file with 1bp spans
# and then lifts via liftOver to new genome.
remapGraphs.pl -liftChain /cluster/store4/gs.17/build34/bed/bedOver/33to34.chain \
-oldGenome hg15 -newGenome hg16 *.signal
# Lifting chr13.hg16.signal.
# Lifting chr13.sk.signal.
# Lifting chr14.sk.signal.
# Lifting chr19.sk.signal.
# Lifting chr20.sk.signal.
# Lifting chr21.sk.signal.
# Lifting chr22.hg16.signal.
# Lifting chr22.sk.signal.
# Lifting chr6.sk.signal.
# Lifting chr7.sk.signal.
# Lifting chrX.sk.signal.
# Lifting chrY.sk.signal.
# runWiggles.sh just calls wigAsciiToBinary for each signal file.
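# in essence, one call per file (a sketch; file naming assumed from the log above):
#   wigAsciiToBinary -chrom=chrN -wibFile=chrN_affyTranscription chrN.hg16.signal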
cat ../runWiggles.sh | sed -e 's/hg15/hg16/g' | sed -e 's/sk/hg16/g' > runWiggles.sh
./runWiggles.sh
hgLoadWiggle -pathPrefix=/gbdb/hg16/wib/affyTranscription hg16 affyTranscription *.wig
# Connected to database hg16 for track affyTranscription
# Creating table definition with 13 columns in hg16.affyTranscription
# Saving wiggle.tab
# Loading hg16
cp *.wib /cluster/data/hg16/bed/affyTranscription/wib/
cd /gbdb/hg15/wib/affyTranscription/
ln -s /cluster/data/hg16/bed/affyTranscription/wib/*.wib ./
cd /cluster/data/hg16/bed/affyTranscription/wib/
chmod 664 *.wib
cd /cluster/store6/weber/affy/graph/hg15/gz/hg16
rm *.wib *.wig *.bed
gzip *hg16.signal &
# EXTRACT LINEAGE-SPECIFIC REPEATS FOR DOG (DONE 2004/08/11 markd)
cd /cluster/bluearc/scratch/hg/gs.17/build34/rmsk
# Run Arian's DateRepsinRMoutput.pl to add extra columns telling
# whether repeats in -query are also expected in -comp species.
# Even though we already have the human-mouse linSpecReps,
# extractLinSpecReps requires two columns of DateRepsinRMoutput.pl
# additions. So add mouse, then ignore it.
# Dog in extra column 1, Mouse in extra column 2
foreach outfl ( *.out )
echo "$outfl"
/cluster/bluearc/RepeatMasker/DateRepsinRMoutput.pl \
${outfl} -query human -comp dog -comp mouse
end
# Now extract dog (extra column 1), ignore mouse.
cd /cluster/bluearc/scratch/hg/gs.17/build34
mkdir linSpecRep.notInDog
foreach f (rmsk/*.out_dog_mus)
set base = $f:t:r:r
echo $base.out.spec
/cluster/bin/scripts/extractLinSpecReps 1 $f > \
linSpecRep.notInDog/$base.out.spec
end
# Clean up.
rm /cluster/bluearc/scratch/hg/gs.17/build34/rmsk/*.out_dog_mus
# copy to iservers
ssh kkr1u00
cp -r /cluster/bluearc/scratch/hg/gs.17/build34/linSpecRep.notInDog /iserver/kkr1u00/i/gs.17/build34/
iSync
# BLASTZ DOG (CANFAM1) (DONE 2004/08/12 markd)
ssh kk
# store4 low on disk space; symlink to store7
mkdir -p /cluster/store7/hg16/bed/blastz.canFam1.2004-08-10
ln -s /cluster/store7/hg16/bed/blastz.canFam1.2004-08-10 /cluster/data/hg16/bed
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
# Use default (Human-Mouse) settings for starters.
cat << '_EOF_' > DEF
# human vs. dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Default
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Human
SEQ1_DIR=/scratch/hg/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInDog
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog
SEQ2_DIR=/scratch/hg/canFam1/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/scratch/hg/canFam1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg16/bed/blastz.canFam1.2004-08-10
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
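# (SEQ1_CHUNK/SEQ1_LAP: the target is cut into 10 Mb pieces overlapping by
# 10 kb for the cluster run; the query is cut into 10 Mb pieces with no overlap)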
# first cluster run: raw blastz alignments
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
source DEF
mkdir -p $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j 2>log
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
# edit jobList to do chr19 first; hg17 run notes indicated
# this might save around 4 hours
para create jobList
para try, check, push, check, ....
#Completed: 93225 of 93225 jobs
#CPU time in finished jobs: 18459718s 307661.97m 5127.70h 213.65d
#IO & Wait Time: 429193s 7153.21m 119.22h 4.97d
#Average job time: 203s 3.38m 0.06h 0.00d
#Longest job: 18951s 315.85m 5.26h 0.22d
#Submission to last job: 58889s 981.48m 16.36h 0.68d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
para try, check, push, etc ...
#Completed: 339 of 339 jobs
#CPU time in finished jobs: 3771s 62.85m 1.05h 0.04d 0.000 y
#IO & Wait Time: 6671s 111.18m 1.85h 0.08d 0.000 y
#Average job time: 31s 0.51m 0.01h 0.00d
#Longest job: 334s 5.57m 0.09h 0.00d
#Submission to last job: 1464s 24.40m 0.41h 0.02d
# third run: lav -> axt
ssh kki
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
mkdir axtChrom pslChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| /cluster/bin/x86_64/lavToAxt stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/canFam1/nib stdout \
| /cluster/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
/cluster/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line keeps emacs coloring happy
chmod a+x do.csh
cp /dev/null jobList
foreach d (../lav/chr*)
echo "do.csh $d" >> jobList
end
para create jobList
para try, check, push, check
#Completed: 42 of 42 jobs
#CPU time in finished jobs: 1297s 21.62m 0.36h 0.02d 0.000 y
#IO & Wait Time: 15428s 257.13m 4.29h 0.18d 0.000 y
#Average job time: 398s 6.64m 0.11h 0.00d
#Longest job: 1714s 28.57m 0.48h 0.02d
#Submission to last job: 1723s 28.72m 0.48h 0.02d
# axtChrom/chr19_random.axt is empty, probably ok
# CHAIN DOG BLASTZ (DONE)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/canFam1/nib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
# edit to remove chr19_random
para create jobList
para try, check, push, check...
#Completed: 41 of 41 jobs
#CPU time in finished jobs: 8233s 137.22m 2.29h 0.10d 0.000 y
#IO & Wait Time: 11718s 195.29m 3.25h 0.14d 0.000 y
#Average job time: 487s 8.11m 0.14h 0.01d
#Longest job: 4623s 77.05m 1.28h 0.05d
#Submission to last job: 4971s 82.85m 1.38h 0.06d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
rm run1/chain/*.chain
# hg17 said:
# Lots of chaff with scores in the 3000's. Many very-high-scoring
# chains. So filter the chain down somewhat...
# didn't bother rechecking, just filtered.
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
rm chain/*
chainSplit chain all.chain
gzip all.chain.unfiltered
# Load chains into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain hg16 ${c}_chainCanFam1 $i
end
# Coverage is significantly higher than mouse:
featureBits hg16 -chrom=chr1 chainCanFam1Link
# 123343602 bases of 221562941 (55.670%) in intersection
# NET DOG BLASTZ (DONE 2004/08/15)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
netClass noClass.net hg16 canFam1 dog.net
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn dog.net > dogSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.canFam1.2004-08-10/axtChain
netFilter -minGap=10 dog.net | hgLoadNet hg16 netCanFam1 stdin
netFilter -minGap=10 dogSyn.net | hgLoadNet hg16 syntenyNetCanFam1 stdin
# Add entries for chainCanFam1, netCanFam1 to human/hg16 trackDb
# LIFTOVER CHAIN TO DOG CANFAM1 (DONE 2004-09-16 kate)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.canFam1/axtChain
time netChainSubset dog.net all.chain \
/cluster/data/hg16/bed/liftOver/hg16ToCanFam1.chain
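# netChainSubset extracts just the chain fragments used in the net, giving
# the single-coverage chain file that liftOver requires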
# LOAD ENSEMBL ESTS (DONE 2004-09-07 braney)
cd /cluster/data/hg16/bed
mkdir ensEst
cd ensEst
# Get the ensembl EST data from http://www.ensembl.org/
# Go to the Martview link
# Choose Homo sapiens as the organism
# Follow this sequence through the pages:
# Page 1) Choose the Ensembl ESTs choice. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output, choose gzip compression and then hit Export.
# Name file ensEst.gff.gz
# Ensembl handles random chromosomes differently than we do: they give the
# contig name. We can lift these up to our chrN_random chromosomes.
gunzip ensEst.gff.gz
sed "/^[0-9XY]*\t/d" ensEst.gff | sed "s/^.*_NT/NT/" > random.gff
liftUp -type=".gff" liftRandom.gff /cluster/data/hg16/jkStuff/liftAll.lft warn random.gff
sed "/_NT_/d" ensEst.gff | sed "s/^/chr/" > unrandom.gff
cat liftRandom.gff unrandom.gff > fixed.gff
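# (random.gff: contig-coordinate lines, lifted to chrN_random coords;
# unrandom.gff: chromosome lines with a "chr" prefix added; fixed.gff is the union)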
ldHgGene hg16 ensESTGene fixed.gff
# Get the ensembl protein data from http://www.ensembl.org/
# Go to the Martview link
# Choose Homo sapiens as the organism
# Follow this sequence through the pages:
# Page 1) Choose the Ensembl ESTs choice. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Sequences" box.
# Page 4) Choose Transcripts/Proteins and Gene sequence Only as the output,
# choose text/fasta and gzip compression and then hit Export. Name file ensEstPep.fasta
gunzip ensEstPep.fasta.gz
sed "s/|.*//" ensEstPep.fasta > fixedPep.fa
hgPepPred hg16 generic ensESTPep fixedPep.fa
# ensGtp associates geneId/transcriptId/proteinId for name searches
# Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format.
# Save file as ensGtp.tsv.gz
gunzip ensGtp.tsv.gz
sed "s/ensGtp/ensESTGtp/" ~/kent/src/hg/lib/ensGtp.sql | hgsql hg16
echo "load data local infile 'ensESTGtp.tsv' into table ensESTGtp ignore 1 lines" | hgsql hg16
# QA Note - table ensGtp was updated on 2004-08-18 to remove a header line that
# was included in the actual table data. This was never pushed out to the rr.
# Table fix (push) done on 2006-01-31 (Jen). Original push on 2004-06. No other
# pushQ entries exist for table change on 2004-08.
# BLASTZ MOUSE MM5 (DONE 2004-09-10 kate)
ssh kk
# use store7 (lots of space)
mkdir -p /cluster/store7/hg16/bed/blastz.mm5.2004-09-10
ln -s /cluster/store7/hg16/bed/blastz.mm5.2004-09-10 \
/cluster/data/hg16/bed
cd /cluster/data/hg16/bed
ln -s blastz.mm5.2004-09-10 blastz.mm5
cd blastz.mm5
cat << '_EOF_' > DEF
# human vs. mouse
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInMouse
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Mouse
SEQ2_DIR=/scratch/mus/mm5/softNib
# RMSK not currently used
SEQ2_RMSK=/scratch/mus/mm5/rmsk
# FLAG not currently used
SEQ2_FLAG=-rodent
SEQ2_SMSK=/scratch/mus/mm5/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg16/bed/blastz.mm5.2004-09-10
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# first cluster run: blastz alignments
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
# 44060 jobs
para try, check, push, check, ....
# Average job time: 382s 6.37m 0.11h 0.00d
# Longest job: 4510s 75.17m 1.25h 0.05d
# Submission to last job: 26324s 438.73m 7.31h 0.30d
# second cluster run: lift raw alignments -> lav dir
ssh kki
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
# 339 jobs
para try, check, push, etc ...
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest job: 112s 1.87m 0.03h 0.00d
# Submission to last job: 401s 6.68m 0.11h 0.00d
# convert lav files to axt
ssh kki
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10
mkdir axtChrom pslChrom
# a new run directory
mkdir run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| /cluster/bin/x86_64/lavToAxt -dropSelf stdin \
/iscratch/i/gs.17/build34/bothMaskedNibs /iscratch/i/mus/mm5/softNib stdout \
| /cluster/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt
/cluster/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
../../pslChrom/$chr.psl
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChrom/$(root1).axt} {check out line+ /cluster/data/hg16/bed/blastz.mm5.2004-09-10/pslChrom/$(root1).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
# 42 jobs
head jobList
para create jobList
para try, check, push, check,...
# Load database tables
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm5/pslChrom
foreach f (*.psl)
set c = $f:r
hgLoadPsl -noTNameIx hg16 -table=${c}_blastzMm5 $f
end
# takes 30-60 min
# CHAIN MOUSE MM5 BLASTZ (DONE 2004-09-15 kate)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in line+ $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
/iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/mus/mm5/softNib $2 > $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
# edit to remove chr19_random
para create jobList
# 41 jobs
para try, check, push, check...
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
time chainMergeSort run1/chain/*.chain > all.chain
# 5 min -- 230.070u 58.980s 5:07.13 94.1% 0+0k 0+0io 117pf+0w
time chainSplit chain all.chain
# 5 min -- 208.490u 56.360s 4:48.81 91.7% 0+0k 0+0io 125pf+0w
rm run1/chain/*.chain
# Load chains into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain/chain
foreach i (*.chain)
set c = $i:r
echo $c
hgLoadChain hg16 ${c}_chainMm5 $i
end
# compare with previous mouse, and with this assembly on later human
featureBits hg16 -chrom=chr1 chainMm5
featureBits hg17 -chrom=chr1 chainMm5
featureBits hg16 -chrom=chr1 chainMm3
featureBits hg16 -chrom=chr1 chainMm5Link
# 83288228 bases of 221562941 (37.591%) in intersection
featureBits hg17 -chrom=chr1 chainMm5Link
# 83773012 bases of 222827847 (37.595%) in intersection
featureBits hg16 -chrom=chr1 chainMm3Link
# 82665800 bases of 221562941 (37.310%) in intersection
# NET MOUSE MM5 BLASTZ (DONE 2004-09-16 kate)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
chainPreNet all.chain ../S1.len ../S2.len stdout \
| chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# < 10 minutes
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
time netClass noClass.net hg16 mm5 human.net
# 15 minutes
# Make a 'syntenic' subset:
ssh kksilo
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
rm noClass.net
# Make a 'syntenic' subset of these with
netFilter -syn human.net > humanSyn.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/hg16/bed/blastz.mm5.2004-09-10/axtChain
netFilter -minGap=10 human.net | hgLoadNet hg16 netMm5 stdin
netFilter -minGap=10 humanSyn.net | hgLoadNet hg16 netSyntenyMm5 stdin
# GOT HERE
# Add entries for chainMm5, netMm5, netSyntenyMm5
# human/hg16 trackDb
# LIFTOVER CHAIN TO MOUSE MM5 (DONE 2004-09-16 kate)
ssh kolossus
cd /cluster/data/hg16/bed/blastz.mm5/axtChain
time netChainSubset human.net all.chain \
/cluster/data/hg16/bed/liftOver/hg16ToMm5.chain
# 7 mins.
# TIGHT FOR MOUSE MM5 (TBD kate)
# BEST FOR MOUSE MM5 (TBD kate)
# SYNTENIC NET FOR MOUSE MM5 (TBD kate)
# DOWNLOADS FOR MOUSE MM5 (TBD kate)
# BLASTZ FOR ZEBRAFISH DANRER1 (WORKING 2004-09-29 kate)
# Treat all repeats as lineage-specific
ssh kkr1u00
mkdir /iscratch/i/gs.17/build34/linSpecRep.notInZebrafish
foreach f (/iscratch/i/gs.17/build34/rmsk/chr*.fa.out)
cp -p $f \
/iscratch/i/gs.17/build34/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
end
iSync
ssh kk
# use store7 (lots of space)
mkdir -p /cluster/store7/hg16/bed/blastz.danRer1.2004-09-29
ln -s /cluster/store7/hg16/bed/blastz.danRer1.2004-09-29 \
/cluster/data/hg16/bed
cd /cluster/data/hg16/bed
ln -s blastz.danRer1.2004-09-29 blastz.danRer1
cd blastz.danRer1
cat << '_EOF_' > DEF
# human vs zebrafish (danRer1)
# params for zebrafish -- L=6000 (threshold for gapped alignments)
# (same params as used for Fugu)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
# Reuse parameters from hg16-fr1.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# Target: Human
SEQ1_DIR=/iscratch/i/gs.17/build34/bothMaskedNibs
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/gs.17/build34/linSpecRep.notInZebrafish
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# Query: Zebrafish (danRer1)
SEQ2_DIR=/iscratch/i/danRer1/nib/
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/danRer1/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/hg16/bed/blastz.danRer1.2004-09-29
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line keeps emacs coloring happy
# Save the DEF file in the current standard place
cp DEF ~angie/hummus/DEF.hg16-danRer1.2004-09-29
# prepare first cluster run
ssh kk
bash # if a csh/tcsh user
cd /cluster/data/hg16/bed/blastz.danRer1
source DEF
mkdir $RAW run.0
/cluster/home/angie/hummus/make-joblist $DEF > $BASE/run.0/j
sh ./xdir.sh
cd run.0
sed -e 's@^blastz-run@/cluster/bin/penn/blastz-run@' j > jobList
para create jobList
# 57630 jobs
para try, check, push, check, ....
# Average job time: 477s 7.95m 0.13h 0.01d
# Longest job: 12147s 202.45m 3.37h 0.14d
# second cluster run: lift raw alignments -> lav dir
ssh kki
cd /cluster/data/hg16/bed/blastz.danRer1
bash # if a csh/tcsh user
source DEF
mkdir run.1 lav
/cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > $BASE/run.1/jobList
cd run.1
wc -l jobList
para create jobList
# 339 jobs
para try
para check
para push
# GOT HERE
# third run: lav -> axt
ssh kki
cd /cluster/data/hg16/bed/blastz.danRer1
mkdir axtChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /iscratch/i/gs.17/build34/bothMaskedNibs \
/iscratch/i/danRer1/nib stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/hg16/bed/blastz.danRer1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
# 42 jobs
# GOT HERE
# CHAIN TETRAODON (tetNig1) BLASTZ (DONE, 2004-08-26, hartera)
# Make chains with rescored blastz
# Run axtChain on little cluster
ssh kki
cd /cluster/data/hg17/bed/blastz.tetNig1
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/hg17/bed/blastz.tetNig1/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Make our own linear gap file with reduced gap penalties,
# in hopes of getting longer chains - works well for species at
# chicken-human distance or greater
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
'_EOF_'
# << this line makes emacs coloring happy
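# (the gap file defines gap-open penalties at the listed gap sizes; axtChain
# interpolates piecewise-linearly between the position entries)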
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -linearGap=../../chickenHumanTuned.gap $1 \
/iscratch/i/gs.18/build35/bothMaskedNibs \
/iscratch/i/tetNig1/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check,...
# 29 jobs
# SEGMENTAL DUPLICATIONS (DONE 10/21/04 angie)
ssh hgwdev
mkdir /cluster/data/hg16/bed/genomicSuperDups
cd /cluster/data/hg16/bed/genomicSuperDups
wget http://humanparalogy.gs.washington.edu/segDupDb.tar
# This tar file contains files for both hg16 and hg17. A note
# from Xinwei She about the contents:
#Build34 contains 4 tables: 3 of them are already in the genome browser source code:
#genomicSuperDups, celeraCoverage and celeraDupPositive. A new table, vanillaTrack,
#which displays the Celera assembly overlay in the public assembly build34, is added.
#Their trackDb entries can be found in the file trackDb.add.
#
#Build35 contains only 2 tables: genomicSuperDups and celeraDupPositive.
tar xvf segDupDb.tar
cd bd34
# use tail +2 to skip past the header line:
zcat celeraCoverage.tab.gz | tail +2 \
| hgLoadBed -tab hg16 celeraCoverage stdin
zcat celeraDupPositive.tab.gz | tail +2 \
| hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/celeraDupPositive.sql \
hg16 celeraDupPositive stdin
zcat genomicSuperDups.tab.gz | tail +2 \
| hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql \
hg16 genomicSuperDups stdin
# Change the name of "vanillaTrack" to celeraOverlay:
zcat vanillaTrack.mysqldump.gz | sed -e 's/vanillaTrack/celeraOverlay/g' \
| hgsql hg16
# It needs a new index, and it needs a bin field, so dump out its
# contents and load them back in using hgLoadBed and an edited
# SQL definition:
hgsql hg16 -N -e 'select * from celeraOverlay' > celeraOverlay.bed
# Make a ~/kent/src/hg/lib/celeraOverlay.as and run autoSql.
# Add bin and indices to celeraOverlay.sql, and reload with hgLoadBed:
hgLoadBed -tab -sqlTable=$HOME/kent/src/hg/lib/celeraOverlay.sql \
hg16 celeraOverlay celeraOverlay.bed
# clean up
rm celeraOverlay.bed bed.tab
# YALE PSEUDOGENES (started Robert Baertsch, finished JK 2/21/05)
ssh hgwdev
cd /cluster/data/hg16/bed
mkdir pseudoYale
cd pseudoYale
# Place file obtained from Mark Gerstein at yale in pseudoYale.gtf
ldHgGene hg16 pseudoYale pseudoYale.gtf
# Note - I'm guessing how this goes. Robert left no record. -jk
## refresh vega tracks with vega build30 (done 5/4/04 Robert)
##download vega mysql tables
cd /cluster/store8/ensembl
mkdir vega30_35c
cd vega30_35c
ln /cluster/store8/ensembl/vega30_35c /cluster/data/hg17/bed/vega30 -s
for i in `cat tables` ; do wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/$i.gz ; done
wget -N ftp://ftp.ensembl.org/pub/human-30.35c/data/mysql/homo_sapiens_vega_30_35c/homo_sapiens_vega_30_35c_mysql40_compatible.sql.gz
gunzip *.gz
##create mysql database
mysql
create database vega30;
use vega30
source homo_sapiens_vega_30_35c_mysql40_compatible.sql
source dropMt.sql
source load.sql
exit
hgsql vega30 -N -B < vegaGene.sql > vegaGene.tab
awk -f vegaGene.awk < vegaGene.tab > vegaGene.gp
ldHgGene hg17 vegaGene -predTab vegaGene.gp -gtf -genePredExt
hgsql vega30 -N -B < vegaPseudo.sql > vegaPseudo.tab
awk -f vegaPseudo.awk < vegaPseudo.tab > vegaPseudo.gp
ldHgGene hg17 vegaPseudoGene -predTab vegaPseudo.gp -gtf -genePredExt
#load processed pseudogenes
grep Processed vegaPseudo.tab > vegaProcPseudo.tab
awk -f vegaPseudo.awk < vegaProcPseudo.tab > vegaProcPseudo.gp
ldHgGene hg17 vegaProcessedPseudo -predTab vegaProcPseudo.gp -gtf -genePredExt
#load vegaInfo
hgsql vega30 -N -B < vegaGeneInfo.sql > vegaInfo.tab
hgsql vega30 -N -B < vegaPseudoInfo.sql >> vegaInfo.tab
hgsql hg17 -N -B < /cluster/home/baertsch/kent/src/hg/lib/vegaInfo.sql
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg17 -N -B
#load down to hg16
liftOver vegaGene.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaGeneHg16.gp unMapped.gp -genePred
liftOver vegaPseudo.gp /gbdb/hg17/liftOver/hg17ToHg16.over.chain vegaPseudoGeneHg16.gp unMappedPseudo.gp -genePred
ldHgGene hg16 vegaGene -predTab vegaGeneHg16.gp -gtf
ldHgGene hg16 vegaPseudoGene -predTab vegaPseudoGeneHg16.gp -gtf
echo 'truncate table vegaInfo' | hgsql hg16 -N -B
echo 'load data local infile "vegaInfo.tab" into table vegaInfo' | hgsql hg16 -N -B
# QA note - table vegaPep dropped during this update. Not dropped from rr at
# time of initial push, creating a -times error in joinerCheck. Table vegaPep
# dropped from hgwbeta and rr/mgc on 2006-01-31.
#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)
##########################################################################
# CNPs from University of Washington (Done, Heather and Daryl, June/July 2005)
# data from http://humanparalogy.gs.washington.edu/structuralvariation
ssh hgwdev
cd /cluster/data/hg16/bed
mkdir cnp
cd cnp
# Sharp data
cp dupArray.txt cnpSharp.bed.orig
# change CNP type to match Iafrate data (with permission from Andy)
sed -e "s/dup/Gain" cnpSharp.bed.orig > cnpSharp.bed.2
sed -e "s/del/Loss/" cnpSharp.bed.2 > cnpSharp.bed.3
sed -e "s/Both Loss and Gain/Gain and Loss/" cnpSharp.bed.3 > cnpSharp.bed
hgLoadBed hg16 cnpSharp -tab -sqlTable=cnpSharp.sql cnpSharp.bed
# Loaded 160 elements of size 14
# note: 11 names with special characters: CTD-2183E4*, RP11-111A4?, RP11-325E8#, RP11-1000I9*, RP11-159F11*,
# RP11-177L24*, RP11-136P13*, RP11-1151C19*, RP11-1008M3*, RP11-379N11?, CTD-3185D7#
# no apparent problems with these
hgsql hgFixed < cnpSharpCutoff.sql
echo 'load data local infile "sampleCUTOFF.txt" into table cnpSharpCutoff' | hgsql hgFixed
hgsql hg16 < cnpSharpSamples.sql
echo 'load data local infile "andyArraySample.txt" into table cnpSharpSamples' | hgsql hg16
hgsql hg16 < cnpSharpSampleCount.sql
hgsql hg16 < sampleCount.sql
# fosmid discordants
# don't need the id column
cp fosmidDiscordant.txt fosmidDiscordant.bed
hgLoadBed hg16 fosmidDiscordantPrelim -tab -sqlTable=fosmidDiscordantPrelim.sql fosmidDiscordant.bed
hgsql hg16 < fosmidDiscordant.sql
echo 'insert into fosmidDiscordant select bin, chrom, chromStart, chromEnd, name from fosmidDiscordantPrelim' | hgsql hg16
echo 'drop table fosmidDiscordantPrelim' | hgsql hg16
# Iafrate data
cp Iafrate.txt cnpIafrate.bed
hgLoadBed hg16 cnpIafrate -tab -sqlTable=cnpIafrate.sql cnpIafrate.bed
# Sebat data
cp Sebat.txt cnpSebat.bed
hgLoadBed hg16 cnpSebat -tab -sqlTable=cnpSebat.sql cnpSebat.bed
# deletions added May 2006
# From mccarroll@molbio.mgh.harvard.edu
genId.pl < mcCarrolldels.txt > mcCarrolldels.bed
hgLoadBed hg16 -noBin -tab delMccarroll mcCarrolldels.bed
# Hinds data via Andy Sharp
sort -n hindsDels.txt > hindsDels.sort
genId.pl < hindsDels.sort > hindsDels.bed
hgLoadBed hg16 -noBin -tab delHinds hindsDels.bed
# From conrad@uchicago.edu
conrad.pl < conradDels.txt > conradDels.bed
hgLoadBed hg16 -noBin -tab delConrad conradDels.bed
##########################################################################
# sno/miRNA track from Michel Weber (DONE - 2005-06-16 - Hiram)
# received the data file UCSC_snotrack_hg16.txt via email
ssh hgwdev
cd /cluster/data/hg16/bed/wgRna
# As a quick first pass at classification, take a look at the
# items in the hg17.wgRna table and use those as a guide
hgsql -N -e "select * from wgRna;" hg17 > hg17.wgRna.txt
awk '{print $5,$10}' hg17.wgRna.txt > name.type.hg17
# combine this new sno data with the existing miRNA data
hgsql -N -e "select * from miRNA;" hg16 > hg16.miRNA.txt
cat << '_EOF_' > addTypes.pl
#!/usr/bin/env perl
use warnings;
use strict;
my %types; # key is name, value is the type
open (FH, "name.type.hg17") or die "Can not open name.type.hg17";
while (my $line=<FH>)
{
chomp $line;
my ($name, $type) = split('\s+',$line);
$types{$name} = $type;
}
close (FH);
open (FH,"grep ^chr UCSC_snotrack_hg16.txt | sort -k1,1 -k2,2n|") or
die "can not open UCSC_snotrack_hg16.txt";
while (my $line=<FH>)
{
chomp $line;
my $type="unknown";
my ($chrom, $start, $end, $name, $score, $strand) = split('\s+',$line);
if (exists($types{$name})) { $type = $types{$name}; }
else { if ($name =~ m/^HBII/) { $type = "CDBox"; } }
print "$chrom\t$start\t$end\t$name\t$score\t$strand\t0\t0\t$type\n";
}
close (FH);
'_EOF_'
# happy emacs
chmod +x addTypes.pl
awk '{print $2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t""miRna"}' \
hg16.miRNA.txt > hg16.wgRna.tab
./addTypes.pl >> hg16.wgRna.tab
hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/wgRna.sql hg16 wgRna \
hg16.wgRna.tab
# this leaves 16 items classified as unknown; a request was sent to
# Michel Weber for proper classification
################################################################################
# Build hg17Kg table for KG II for hg16, using hg17 KG data (DONE 2005-07-11 Fan).
ssh hgwdev
cd /cluster/data/hg16/bed
mkdir hg17Kg
cd hg17Kg
hgsql hg16 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl
hgsql hg16 -N -e \
'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and gbCdnaInfo.cds=cds.id' \
|sort -u > all_mrna.cds
bash
mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
exit
hgsql hg16 -e 'drop table mrnaGp'
hgsql hg16 < ~/src/hg/lib/mrnaGp.sql
hgsql hg16 -e 'load data local infile "all_mrna.gp" into table mrnaGp'
hgsql hg16 -N -e \
'select mrnaGp.* from mrnaGp,hg17.knownGene where mrnaGp.name = knownGene.name and mrnaGp.chrom=knownGene.chrom' \
|sort -u > mrnaGp2.tab
hgsql hg16 -e 'drop table mrnaGp2'
hgsql hg16 < ~/src/hg/lib/mrnaGp2.sql
hgsql hg16 -e 'load data local infile "mrnaGp2.tab" into table mrnaGp2'
# Create hg16Kg table in hg17 to get around the hurdle that we cannot
# join between MySQL DBs
hgsql hg17 -e 'drop table hg16Kg'
hgsql hg17 < ~/src/hg/lib/hg16Kg.sql
hgsql hg16 -N -e 'select * from knownGene' >hg16Kg.tab
hgsql hg17 -e 'load data local infile "hg16Kg.tab" into table hg16Kg'
hgsql hg17 -N -e \
'select hg16Kg.* from hg16Kg, knownGene where hg16Kg.name=knownGene.name and knownGene.name not like "NM_%" and hg16Kg.chrom=knownGene.chrom '\
>j
cut -f 1-10 j >j1
# j1 are mRNA records through old KG process.
# j2 are RefSeq records based on hg17 KG
# mrnaGp2 are mRNA records based on hg17 KG non-Refseq entries and GenBank CDS data (which is incomplete).
hgsql hg16 -N -e \
'select refGene.* from refGene, hg17.knownGene where hg17.knownGene.name=refGene.name' >j2
cat j1 j2 mrnaGp2.tab |sort -u >j.tab
~/kent/src/hg/protein/sortKg.pl j.tab >hg17Kg.tab
wc hg17Kg.tab
hgsql hg16 -e "delete from hg17Kg"
hgsql hg16 -e 'load data local infile "hg17Kg.tab" into table hg17Kg'
####################################################################
# Make mouse ortholog column using blastp on mm6 known genes. (DONE 7/12/05, Fan).
# First make mouse protein database and copy it to /cluster/panasas
# if it doesn't exist already
# This already exists. See makeMm6.doc for procedure
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/hg16/bed/blastp/mm6
cd /cluster/data/hg16/bed/blastp/mm6
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
-p blastp -d /cluster/panasas/home/store/mm6/blastp/known \
-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
# << keep emacs happy
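# (-m 8 requests tabular output, -b 1 reports alignments for only the top
# database sequence per query, -e 0.001 is the E-value cutoff)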
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
# this echo trick is used because otherwise the command line is
# too long and you cannot do a simple ls
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try
para check
para push ... etc ...
# Completed: 5812 of 5812 jobs
# CPU time in finished jobs: 96031s 1600.52m 26.68h 1.11d 0.003 y
# IO & Wait Time: 15641s 260.68m 4.34h 0.18d 0.000 y
# Average job time: 19s 0.32m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 168s 2.80m 0.05h 0.00d
# Submission to last job: 766s 12.77m 0.21h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/hg16/bed/blastp/mm6/run/out
hgLoadBlastTab hg16 mmBlastTab -maxPer=1 *.tab
# Scanning through 5812 files
# Loading database with 35707 rows
# Update otherOrg.ra under hg/hgGene/hgGeneData/Human/hg16 to mm6 instead of
# mm4.
##########################################################################
# EVOFOLD - RNA secondary structure predictions lifted from hg17 (Jakob Skou Pedersen)
# Jakob Skou Pedersen, July 12, 2005
ssh -C hgwdev
mkdir -p /cluster/data/hg16/bed/evofold
cd /cluster/data/hg16/bed/evofold
# lifting folds from hg17 to hg16
echo "select chrom, chromStart, chromEnd, name, score, strand, size, secStr, conf from evofold;" | hgsql hg17 | sed -e 1d > foldsHg17.bed
liftOver -minMatch=1.0 foldsHg17.bed /cluster/data/hg17/bed/liftOver/hg17ToHg16.over.chain tmp.bed unmapped.bed
# remove elements which are wrong size after lifting
awk '$3-$2 == $7' tmp.bed > foldsHg16.bed
hgLoadBed -notItemRgb -sqlTable=/cluster/home/jsp/prog/kent/src/hg/lib/evofold.sql hg16 evofold foldsHg16.bed
# clean up
rm foldsHg17.bed unmapped.bed tmp.bed
# Tajima's D (DONE -- 2005-09-20 -- Daryl)
# Data from Chris Carlson in Debbie Nickerson's lab
# Chris Carlson [csc47<AT>u<DOT>washington<DOT>edu]
# lifted down from hg17. See makeHg17.doc for details
# AFFYHUEX1 track (sugnet Wed Oct 5 12:18:18 PDT 2005)
mkdir hg16
cd hg16
pwd
# /cluster/store1/sugnet/affymetrixHumanAllExon/hg16
mkdir gff beds annot
cd gff
# download gff design files
cp ../../hg17/gff/parseGff.pl .
# parse gff script...
#!/usr/bin/perl -w
if(scalar(@ARGV) == 0) {
print STDERR "parseGff.pl - Parse out affymetrixes gff annotation
probesets for human all exon design.
usage:
parseGff.pl file1.design.gff file2.design.gff ... fileN.design.gff
";
exit(1);
}
sub splitField($) {
my $l = shift(@_);
my @w = split / /, $l;
return $w[1];
}
while($file = shift(@ARGV)) {
if(!($file =~ /(.+)\.gff/)) {
die "$file doesn't have .gff suffix\n";
}
$prefix = $1;
print STDERR "Doing file $file.\n";
open(IN, $file) or die "Can't open $file to read.";
open(BED, ">../beds/$prefix.pset.bed") or die "Can't open ../beds/$prefix.pset.bed to write.";
open(ANNOT, ">../annot/$prefix.tab") or die "Can't open ../annot/$prefix.tab to write.";
while($line = <IN>) {
# Only want the probeset records.
if($line =~ /\tprobeset\t/) {
$score = 0;
$cds = 0;
$bounded = 0;
chomp($line);
# pop off an microsoft line endings.
$line =~ s/\r$//;
@words = split /\t/, $line;
# This makes the evidence comma-separated.
$words[8] =~ s/\" \"/,/g;
# This gets rid of pesky quotes.
$words[8] =~ s/\"//g;
# Set the score based on the annotation type
if($words[8] =~ /full/) {
$score = 200;
}
elsif($words[8] =~ /extended/) {
$score = 500;
}
elsif($words[8] =~ /core/) {
$score = 900;
}
if($words[8] =~ /bounded/) {
$score -= 200;
}
if($words[8] =~ /cds/) {
$score += 100;
}
if($score <= 0) {
$score = 100;
}
# Print out the annotation fields.
@fields = split /; /,$words[8];
$id = splitField($fields[1]);
$f = shift(@fields);
$f = splitField($f);
print ANNOT "$f";
while($f = shift(@fields)) {
if($f =~ /^bounded/) {
$bounded = 1;
}
if($f =~ /^cds/) {
$cds = 1;
}
if(!($f =~ /^bounded/ || $f =~ /^cds/)) {
$f = splitField($f);
print ANNOT "\t$f";
}
}
print ANNOT "\t$bounded\t$cds";
print ANNOT "\n";
print BED "$words[0]\t$words[3]\t$words[4]\t$id\t$score\t$words[6]\n";
}
}
close(IN);
close(BED);
close(ANNOT);
}
./parseGff.pl *.gff
cat beds/*.bed > affyHuEx1.bed
hgLoadBed hg16 affyHuEx1 affyHuEx1.bed -strict
cat annot/*.tab > affyHuEx1.annot.tab
cp ../hg17/affyHuEx1Annot.sql ./
# Contents of affyHuEx1Annot.sql file
CREATE TABLE affyHuEx1Annot (
numIndependentProbes smallint not null,
probesetId int(11) not null,
exonClustId int(11) not null,
numNonOverlapProbes smallint not null,
probeCount smallint not null,
transcriptClustId int(11) not null,
probesetType smallint not null,
numXHybeProbe smallint not null,
psrId int(11) not null,
level varchar(10) not null,
evidence varchar(255) not null,
bounded smallint not null,
cds smallint not null,
PRIMARY KEY (probesetId)
);
hg16S -A < affyHuEx1Annot.sql
echo "load data local infile 'affyHuEx1.annot.tab' into table affyHuEx1Annot;" | hg16S -A
# end AFFYHUEX1 track
##########################################################################
# NHGRI DNASE I HYPERSENSITIVE SITES (2005-10-05 kate)
# Submitted by Greg Crawford via web site,
# http://research.nhgri.nih.gov/DNaseHS/May2005/
# In addition, a file containing the 'randoms' was FTP'ed by Greg
# NOTE: bad chr8_random entry removed, as per G. Crawford
# Same display as ENCODE track by Angie...
# Jim asked to add scores for grayscale-coloring:
# clusters of 2 drawn in 50%, clusters of 3 drawn in 75%,
# and clusters of 4 or more drawn in 100% black.
mkdir /cluster/data/hg16/bed/nhgri/lab
cd /cluster/data/hg16/bed/nhgri/lab
foreach c (`cut -f 1 /cluster/data/hg16/chrom.sizes`)
echo $c
wget -nd http://research.nhgri.nih.gov/DNaseHS/May2005/clusters/$c.LynxClusters.bed
end
cd ..
# special handling for ID's on chrM (they are preceded by 'M_')
ls lab/chr*.bed lab/randoms.txt \
| grep -v chrM | xargs cat | grep '^chr' \
| perl -wpe 'if (/500bp_(\d+)_(\d+)/) { \
$id = $1 . "_" . $2; \
$score = ($2 >= 4) ? 1000 : $2 * 250; \
s/500bp.+/$id\t$score/; } else { die "parse"; }' > hs.bed
cat lab/chrM*.bed | grep '^chr' \
| perl -wpe 'if (/500bp_(M_\d+)_(\d+)/) { \
$id = $1 . "_" . $2; \
$score = ($2 >= 4) ? 1000 : $2 * 250; \
s/500bp.*/$id\t$score/; } else { die "parse"; }' >> hs.bed
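# (scoring per the grayscale scheme above: cluster size 2 -> 500,
# size 3 -> 750, size 4 or more -> 1000)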
hgLoadBed hg16 nhgriDnaseHs hs.bed
# Loaded 14224 elements of size 5
checkTableCoords hg16 nhgriDnaseHs
# MYTOUCH FIX - jen - 2006-01-24
sudo mytouch hg16 superfamily 0407141100.00
sudo mytouch hg16 acemblyPep 0406151200.00
sudo mytouch hg16 twinscanPep 0407141200.00
sudo mytouch hg16 ensPep 0407141100.00
sudo mytouch hg16 knownToEnsembl 0407141100.00
sudo mytouch hg16 sfDescription 0407141100.00
sudo mytouch hg16 ensEstGtp 0409081800.00
sudo mytouch hg16 ensEstPep 0409081800.00
##########################################################################
# AFFY HUEX1 OFF-BY-ONE FIX (Andy 2006-12-14)
ssh hgwdev
cd /cluster/data/hg16/bed/affyHumanExon
liftOver /cluster/data/hg17/bed/affyHumanExon/affyHuEx1.fixed.bed \
/gbdb/hg17/liftOver/hg17ToHg16.over.chain.gz affyHuEx1.fixed.bed affyHuEx1.unmapped
awk 'BEGIN{OFS="\t"}{print $4,$3-$2}' affyHuEx1.fixed.bed | sort -k2,2nr | head
#2325773 204918
#2402134 204802
#3645108 60419
#2366900 52086
#3016074 9552
#3641787 8061
#2321649 8054
# So there's 4 of them with problems this time:
egrep -v "\b(2325773|2402134|3645108|2366900)\b" affyHuEx1.fixed.bed > alreadyok.bed
egrep "\b(2325773|2402134|3645108|2366900)\b" affyHuEx1.fixed.bed \
/cluster/data/hg17/bed/affyHumanExon/affyHuEx1.fixed.bed > good.hg17.bed
bedToFa /cluster/data/hg17/hg17.2bit good.hg17.bed good.hg17.fa
gfClient blat6 17785 /cluster/data/hg16/nib good.hg17.fa bad.hg16.psl
tail +6 bad.hg16.psl | awk '$11==$13{print}' > good.hg16.psl
pslToBed good.hg16.psl good.hg16.bed
# Scores were lost in the transformations. Put em back in.
egrep "\b(2325773|2402134|3645108|2366900)\b" affyHuEx1.fixed.bed
#chr1 24924744 25129662 2325773 500 +
#chr1 24924872 25129674 2402134 900 -
#chr1 168139941 168192027 2366900 1000 +
#chr16 2600606 2661025 3645108 200 +
awk 'BEGIN{OFS="\t"}
$4=="2325773"{score="500";}
$4=="2402134"{score="900";}
$4=="3645108"{score="200";}
{print $1,$2,$3,$4,score,$6}' good.hg16.bed > good.bed
cat alreadyok.bed good.bed > affyHuEx1.fixed.bed
bedSort affyHuEx1.fixed.bed tmp.bed
rm good.* bad.* alreadyok.bed
hgLoadBed hg16 affyHuEx1 affyHuEx1.fixed.bed
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna hg16
################################################
# SPLIT EXPRESSION & REGULATION GROUPS
# (2008-09-09 kate)
echo "insert into grp (name, label, priority) values ('expression', 'Expression', 4.5)" | hgsql hg16
echo "update grp set label='Regulation' where name='regulation'" | hgsql hg16
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
echo hg16 panTro1 mm3 rn3 galGal2 > /hive/data/genomes/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/species.lst
# update genbank.conf:
hg16.upstreamGeneTbl = refGene
hg16.upstreamMaf = mzPt1Mm3Rn3Gg2_pHMM /hive/data/genomes/hg16/bed/multiz.hg16mm3rn3panTro1galGal2/species.lst
#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08 angie)
ssh hgwdev
mkdir /cluster/data/hg16/bed/mrnaPcr
cd /cluster/data/hg16/bed/mrnaPcr
# First, get consistent FA and PSL for UCSC Genes.
genePredToBed /cluster/data/hg16/bed/kgHg16C/kgBestMrna/knownGene.tab \
> ucscGenes.bed
hgsql hg16 -NBe 'select kgId,geneSymbol from kgXref' \
| perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
> idSub.txt
subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
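# (names become kgId__geneSymbol so that PCR results can show the gene
# symbol alongside the UCSC Gene id)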
sequenceForBed -keepName -db=hg16 -bedIn=ucscGenesIdSubbed.bed \
-fastaOut=stdout \
| faToTwoBit -ignoreDups stdin kgTargetSeq.2bit
cut -f 1-10 /cluster/data/hg16/bed/kgHg16C/kgBestMrna/knownGene.tab \
| genePredToFakePsl hg16 stdin kgTargetAli.psl /dev/null
# Load up the UCSC Genes target PSL table and put 2bit in /gbdb:
cd /cluster/data/hg16/bed/mrnaPcr
hgLoadPsl hg16 kgTargetAli.psl
mkdir /gbdb/hg16/targetDb
ln -s /cluster/data/hg16/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/hg16/targetDb/
# Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
# /gbdb/hg16/targetDb/kgTargetSeq.2bit .
ssh hgwdev
# Add records to hgcentraltest blatServers and targetDb:
hgsql hgcentraltest -e \
'INSERT into blatServers values ("hg16Kg", "blat13", 17795, 0, 1);'
hgsql hgcentraltest -e \
'INSERT into targetDb values("hg16Kg", "UCSC Genes", \
"hg16", "kgTargetAli", "", "", \
"/gbdb/hg16/targetDb/kgTargetSeq.2bit", 1, now(), "");'
#############################################################################
# LIFTOVER TO Hg19 (DONE - 2009-04-24 - Hiram)
    mkdir /hive/data/genomes/hg16/bed/blat.hg19.2009-04-24
    cd /hive/data/genomes/hg16/bed/blat.hg19.2009-04-24
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -buildDir=`pwd` -debug hg16 hg19
    # Real run:
    time nice -n +19 \
        $HOME/kent/src/hg/utils/automation/doSameSpeciesLiftOver.pl \
        -buildDir=`pwd` -verbose=2 \
        -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
        hg16 hg19 > do.log 2>&1 &
    # real 93m11.093s

#############################################################################