src/hg/makeDb/doc/danRer6.txt 1.5

1.5 2009/07/23 19:12:38 galt
minor correction, thanks Hiram
Index: src/hg/makeDb/doc/danRer6.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/danRer6.txt,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 1000000 -r1.4 -r1.5
--- src/hg/makeDb/doc/danRer6.txt	10 Jul 2009 02:42:07 -0000	1.4
+++ src/hg/makeDb/doc/danRer6.txt	23 Jul 2009 19:12:38 -0000	1.5
@@ -1,719 +1,719 @@
 ##################################################
 #
 #  danRer6 = Danio rerio Zv8   Dec. 2008
 #
 #  by Galt Barber 2009-06-17
 #
 
 # Danio Rerio (zebrafish) from Sanger, version Zv8 (released 2008-12-12)
 #  Project website:
 #    http://www.sanger.ac.uk/Projects/D_rerio/
 #  Assembly notes:
 #    http://www.sanger.ac.uk/Projects/D_rerio/
 #    http://www.sanger.ac.uk/Projects/D_rerio/Zv8_assembly_information.shtml
 
 # TODO this comment is copied over and may need removal
 #  NOTE:  this doc may have genePred loads that fail to include
 #  the bin column.  Please correct that for the next build by adding
 #  a bin column when you make any of these tables:
 #
 
 ###############################################
 # set up main genome directory
 
     ssh hgwdev
     cd /hive/data/genomes
     mkdir danRer6
     cd danRer6
 
 
 ###########################################################################
 # DOWNLOAD SEQUENCE (DONE, 2009-06-17, galt)
 
     cd /hive/data/genomes/danRer6
     mkdir download
     cd download
 
     # get sequence from Sanger  
     # (See below, they later give us corrected files at another location)
 
     wget --timestamping -r -np -l 2 -nd -L 'ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv8release'
 
     # matches ftp site sizes	
     ls -l --block-size=K
 -rw-rw-r-- 1 galt protein       1K Apr  9 15:56 README
 -rw-rw-r-- 1 galt protein    3999K Dec 12  2008 Zv8_chr.agp
 -rw-rw-r-- 1 galt protein 1553865K Dec 12  2008 Zv8_contigs.fa
 -rw-rw-r-- 1 galt protein    6475K Dec 12  2008 Zv8_scaffold.agp
 -rw-rw-r-- 1 galt protein 1470539K Dec 12  2008 Zv8_scaffolds.fa
 -rw-rw-r-- 1 galt protein 1471257K Apr  9 15:55 Zv8_toplevel.fa
 
     # Zv8_chr.agp maps contigs (in Zv8_contigs.fa) to chr1-25 
     # Oddly enough, the contig names are 
     # all in the form of scaffoldName.1, ... 
     # so that any original contig name is lost.
 
     # Zv8_scaffold.agp maps contigs to scaffolds, 
     #  producing Zv8_scaffolds.fa
 
     # Not entirely sure yet about Zv8_toplevel.fa
     #  but it seems to be more scaffolds only.
     # I think only the configs.fa is useful fasta.
 
     # There is nothing that maps the scaffolds to
     # the chroms except that they gave the contig names
     # like scaffold1.1 etc.  This connection via name 
     # parts is not official agp structure, it's a hack.
 
     # People want to see the detailed internal gap structure
     # of the scaffolds even inside their chroms, so that
     # creates a minor problem.
 
     # Additionally, we want to include the chroms for sure,
     # but we want to also include any other scaffolds not
     # already included in the chroms. I am going to write
     # a c program to merge the chrom and scaffold agps
     # so that we don't duplicate scaffolds that are in 
     # the chroms.  It will have to rely on the name 
     # hack they provide to connect scaffold to chrom.
 
     # This agp/fasta structure has existed at least since Zv7
     # and I don't know if Sanger or others use it more 
     # generally.
 
     # I created a new program agpSangerUnfinished in kent/src/hg
     # because Sanger wants to use unfinished clones but can't
     # officially report it that way, so they hack their agp file
     # but they don't change contigs.fa to match.
     # compile it
     cd kent/src/hg/agpSangerUnfinished
     make
     rehash
 
     cd /hive/data/genomes/danRer6/download
 
     # fix unfinished names and coordinates
     agpSangerUnfinished Zv8_chr.agp Zv8_contigs.fa Zv8_chr.fix.agp
     agpSangerUnfinished Zv8_scaffold.agp Zv8_contigs.fa Zv8_scaffold.fix.agp
 
     # I created a new program agpMergeChromScaf in kent/src/hg
     # as mentioned above for merging scaffold data 
     # nog alreadhromScaf Zv8_
     # compile it
     cd kent/src/hg/agpMergeChromScaf
     make
     rehash
 
     cd /hive/data/genomes/danRer6/download
     agpMergeChromScaf Zv8_chr.fix.agp Zv8_scaffold.fix.agp danRer6.agp
 
     # Now make final fasta from fixed/merged agp
     agpAllToFaFile danRer6.agp Zv8_contigs.fa danRer6.fa
 
     # check it out!
     faToTwoBit danRer6.fa danRer6.2bit
     checkAgpAndFa danRer6.agp danRer6.2bit >& checkAgpAndFa.log
     tail -1 checkAgpAndFa.log
 #All AGP and FASTA entries agree - both files are valid
 
 -------------------------------------------------------------------
 #RE-DOING with their corrected clone names:  (done 2009-06-24 galt)
 
 
 ftp://ftp.sanger.ac.uk/pub/zfish/jt8/rt
 
 "
 I should stress that the sequence content of the FASTA file has not changed
 from the versions you
 already have; I have just altered the headers for the first set of clones
 mentioned in the previous
 email. Only the contig-level FASTA file is affected; the scaffold-level and
 top-level FASTA files are
 not concerned with the names of clones, so they remain as before.
 "
 
     # move old version out of the way
     cd /hive/data/genomes/danRer6
     mv download download0
     mkdir download
     cd download
 
     # get sequence from Sanger
     # James Torrance <jt8@sanger.ac.uk> 
     # created this version of Zv8 with fixed clone-ids.
     wget --timestamping -r -np -l 2 -nd -L 'ftp://ftp.sanger.ac.uk/pub/zfish/jt8/rt'
 
 [hgwdev:download> ll
 -rw-rw-r-- 1 galt protein   1051781 Jun 23 17:23 Zv8_chr.agp.gz
 -rw-rw-r-- 1 galt protein 468741052 Jun 23 22:43 Zv8_contigs.fa.gz
 -rw-rw-r-- 1 galt protein   1683358 Jun 23 17:23 Zv8_scaffold.agp.gz
 -rw-rw-r-- 1 galt protein       155 Jun 23 22:44 md5sum.dat
 
 [hgwdev:download> cat md5sum.dat 
 1bb31f1f3290038066f74680a854ff69  Zv8_chr.agp.gz
 c4cc4c9574721ce57fde0c79357a4a13  Zv8_contigs.fa.gz
 fc4f9261b77fd177096f0cc9c8718a8d  Zv8_scaffold.agp.gz
 
 [hgwdev:download> md5sum *.gz
 1bb31f1f3290038066f74680a854ff69  Zv8_chr.agp.gz
 c4cc4c9574721ce57fde0c79357a4a13  Zv8_contigs.fa.gz
 fc4f9261b77fd177096f0cc9c8718a8d  Zv8_scaffold.agp.gz
 # they match! no corruption during transfer
 
 
     # decompress
     gunzip *.gz
 
 [hgwdev:download> ll
 -rw-rw-r-- 1 galt protein    4094225 Jun 23 17:23 Zv8_chr.agp
 -rw-rw-r-- 1 galt protein 1591157262 Jun 23 22:43 Zv8_contigs.fa
 -rw-rw-r-- 1 galt protein    6630120 Jun 23 17:23 Zv8_scaffold.agp
 
     # repeat all of the above steps from the first section
     # (after the wget) down to where it says
 #All AGP and FASTA entries agree - both files are valid
 
 
 #######################################################################
 # MAKE GENOME DB FROM CHROMOSOMES AND UNMAPPED SCAFFOLDS 
 # (DONE, 2009-06-24, galt)
 # CHANGE dbDb ENTRY MADE BY SCRIPT TO DISPLAY SAME DEFAULT POSITION 
 # AS FOR DANRER5, GENE vdac2 (DONE, 2007-09-10, hartera)
 
     # To find the accession number for chrM, go to http://www.ncbi.nih.gov/ and search
     # Nucleotide for "Danio mitochondrion genome". 
     # That shows accession: NC_002333.2, 16596 bp
     # wget for mitochondron genome so:
     # http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id=NC_002333.2&retmode=text
     ssh hgwdev
     cd /hive/data/genomes/danRer6/
     cat << 'EOF' > danRer6.config.ra
 db danRer6
 clade vertebrate
 scientificName Danio rerio
 commonName Zebrafish
 assemblyDate Dec. 2008
 assemblyLabel Wellcome Trust Sanger Institute, Zv8 assembly (CAAK00000000.5)
-orderKey 449
+orderKey 448
 genomeCladePriority 90
 dbDbSpeciesDir zebrafish
 mitoAcc NC_002333.2
 agpFiles /hive/data/genomes/danRer6/download/danRer6.agp
 fastaFiles /hive/data/genomes/danRer6/download/danRer6.fa
 taxId 7955
 'EOF'    
  # << keep emacs coloring happy
 
     ssh hgwdev
     cd /hive/data/genomes/danRer6/
 
     # makeGenomeDb.pl creates the db for the genome, makes the hgcentraltest
     # entries and loads the gold and gap tables. Creates the GC
     # percent, short match and restriction enzyme tracks. It also creates
     # a danRer6 directory in the trackDb/zebrafish directory with a trackDb.ra 
     # file and a template for the assembly descripton, and for the 
     # gold and gap track tracks 
 
     # Run makeGenomeDb.pl
     makeGenomeDb.pl danRer6.config.ra > & makeGenomeDb.pl.out &
 
     # Follow the directions at the end of the log 
     tail -20 makeGenomeDb.log
 
 #----
 #NOTES -- STUFF THAT YOU WILL HAVE TO DO --
 #cd /hive/data/genomes/danRer6/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/zebrafish/danRer6
 #
 #Search for '***' notes in each file in and make corrections (sometimes the
 #files used for a previous assembly might make a better template):
 #  description.html /hive/data/genomes/danRer6/html/{trackDb.ra,gap.html,gold.html}
 #
 #Then cd ../.. (to trackDb/) and
 # - edit makefile to add danRer6 to DBS.
 # - (if necessary) cvs add zebrafish
 # - cvs add zebrafish/danRer6
 # - cvs add zebrafish/danRer6/*.{ra,html}
 # - cvs ci -m "Added danRer6 to DBS." makefile
 # - cvs ci -m "Initial descriptions for danRer6." zebrafish/danRer6
 # - (if necessary) cvs ci zebrafish
 # - Run make update DBS=danRer6 and make alpha when done.
 # - (optional) Clean up /hive/data/genomes/danRer6/TemporaryTrackDbCheckout
 # - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there.
 #----
 
 
     cd /hive/data/genomes/danRer6/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/zebrafish/danRer6
     # save in case
     mkdir orig
     cp * orig/
     # start with version from danRer5
     cp ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer5/description.html .
     cp ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer5/gap.html .
     cp ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer5/gold.html .
     # edit them for danRer6
     vi description.html
     vi gap.html
     vi gold.html
 
     cd ../.. 
     vi makefile    # - edit makefile to add danRer6 to DBS.
     cvs add zebrafish/danRer6
     cvs add zebrafish/danRer6/*.{ra,html}
     cvs ci -m "Added danRer6 to DBS." makefile
     cvs ci -m "Initial descriptions for danRer6." zebrafish/danRer6
     make update DBS=danRer6
     make alpha  DBS=danRer6
     
    
     # (2009-06-24, galt), Change the default Browser position in dbDb
     # so that it displays vdac2 - chr13:14,786,820-14,801,993 - same as for 
     # danRer5.
     ssh hgwdev
     hgsql -h genome-testdb \
  	-e 'update dbDb set defaultPos = "chr13:14,786,820-14,801,993" where name = "danRer6";' \
 	hgcentraltest
 
 
     # temporarily add in a 2bit file so we can run the browser:
     ln -s `pwd`/danRer6.unmasked.2bit /gbdb/danRer6/danRer6.2bit 
     
 
 ###########################################################
 # Repeatmasking  (DONE 2009-06-26 galt)
 # 
 #
 
     ssh hgwdev
     cd /hive/data/genomes/danRer6/
 
     screen
 
     # repeat mask
     doRepeatMasker.pl danRer6 > & doRepeatMasker.pl.out &
 
     # about 12 hours.
 
 #/cluster/bin/scripts/extractNestedRepeats.pl danRer6.fa.out
 #RepeatMasker bug?: Undefined id, line 330298 of input:
 #  261  27.7  3.1  1.2  Zv8_NA4692   12290   12450   (10778) C ERV1-N3-I_DR-int LTR (824) 3767   3604
 #At least one ID was missing (see warnings above) -- please report to Robert
 #Hubley.  -continue at your disgression.
 
     # Seems to only be one line that is a problem, and it may even have
     # been duplicated, so remove the problem line from danRer6.fa.out
 #261  27.7  3.1  1.2  Zv8_NA4692   12290   12450   (10778) C ERV1-N3-I_DR-int LTR                  (824) 3767   3604
 #267  24.9  1.8  1.2  Zv8_NA4692   12300   12313   (10915) C ERV1-N3-I_DR-int LTR                  (824) 3722   3604 298373
     # keep the second of these lines, remove the first (line 330298) which is missing the last column.
 
     # since danRer6.nestedRepeats.bed did get created,
     #  and doCat.csh appears to have finished,
     # I am going to ignore this error and continue
 
     doRepeatMasker.pl -continue mask  \
       -buildDir /hive/data/genomes/danRer6/bed/RepeatMasker.2009-06-26/ danRer6 \
       >> & doRepeatMasker.pl.out &
 
     cat bed/RepeatMasker.2009-06-26/faSize.rmsk.txt 
     #1512402306 bases (5525484 N's 1506876822 real 729112148 upper 777764674 lower)
     #in 11724 sequences in 1 files
 
 
     [hgwdev:danRer6> featureBits -countGaps danRer6 rmsk
     #777997590 bases of 1512402306 (51.441%) in intersection
 
     # simple repeat masker trf
     doSimpleRepeat.pl danRer6 > & doSimpleRepeat.pl.out &
     #[1]    Exit 25   doSimpleRepeat.pl danRer6 >& doSimpleRepeat.pl.out
     #oops messed up on chrM producing no output, liftUp produced nothing
     #this is not a problem, and the bug is in the automated process,
     #they should either tolerate nothing to do and create an empty output,
     #or just filter chrM out of the input.
 
     doSimpleRepeat.pl -continue filter  \
       -buildDir /hive/data/genomes/danRer6/bed/simpleRepeat.2009-06-29/ danRer6 \
       >> & doSimpleRepeat.pl.out &
 
     featureBits -countGaps danRer6 simpleRepeat
     #111749881 bases of 1512402306 (7.389%) in intersection
 
     # make final masked .2bit
     twoBitMask danRer6.rmsk.2bit -add bed/simpleRepeat.2009-06-29/trfMask.bed danRer6.2bit
     #Warning: BED file bed/simpleRepeat.2009-06-29/trfMask.bed has >=13 fields
     #which means it might contain block coordinates, but this program uses only the
     #first three fields (the entire span -- no support for blocks).
     #(this seems to be normal/ok)
 
 ############################################################################
 #  prepare cluster data (DONE - 2009-07-06 - Galt)
     
     ssh hgwdev
     cd /hive/data/genomes/danRer6
 
     # create gbdb symlink
     rm /gbdb/danRer6/danRer6.2bit
     ln -s `pwd`/danRer6.2bit /gbdb/danRer6/
 
     calc '1024*1.14/2.88'
     #405
 
     blat danRer6.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=405
     #Wrote 71088 overused 11-mers to 11.ooc
 
     mkdir /hive/data/staging/data/danRer6
     cp -p danRer6.2bit /hive/data/staging/data/danRer6
     cp -p 11.ooc /hive/data/staging/data/danRer6
     cp -p chrom.sizes /hive/data/staging/data/danRer6
 
     # ask admin to sync this directory: /hive/data/staging/data/danRer6/
     #       to the kluster nodes /scratch/data/danRer6/
 
 ############################################################################
 # running cpgIsland (DONE - 2009-07-07 - Galt)
     
     ssh hgwdev
     cd /hive/data/genomes/danRer6
 
     mkdir bed/cpgIsland
     cd bed/cpgIsland
     cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
     cd hg3rdParty/cpgIslands
     # comment out the following two lines if it compiles cleanly
     # some day  (there were some other fixups too, adding include lines)
     sed -i -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c
     make
     #warning: incompatible implicit declaration of built-in function
     # ignore the warnings
     cd ../../ 
     ln -s hg3rdParty/cpgIslands/cpglh.exe
 
     # make hardmasked fasta files
     mkdir -p hardMaskedFa
     bash
     cut -f1 ../../chrom.sizes | while read C
     do
     echo ${C}
     twoBitToFa ../../danRer6.2bit:$C stdout \
 	| maskOutFa stdin hard hardMaskedFa/${C}.fa
     done
 
     #exit bash 
     exit
 
     cut -f1 ../../chrom.sizes > chr.list
     cat << '_EOF_' > template
     #LOOP
     ./runOne $(root1) {check out line results/$(root1).cpg}
     #ENDLOOP
     '_EOF_'
     # << happy emacs
 
     cat << '_EOF_' > runOne
     #!/bin/csh -fe
     ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
     mv /scratch/tmp/$1.$$ $2
     '_EOF_'
     # << happy emacs
 
     chmod +x runOne
     mkdir results
 
     pk
     cd /hive/data/genomes/danRer6/bed/cpgIsland
 
     gensub2 chr.list single template jobList
     para create jobList
     para try
     para check
     para push
     para time
     #Completed: 11724 of 11724 jobs
     #CPU time in finished jobs:        111s       1.85m     0.03h    0.00d  0.000 y
     #IO & Wait Time:                 73830s    1230.50m    20.51h    0.85d  0.002 y
     #Average job time:                   6s       0.11m     0.00h    0.00d
     #Longest finished job:              41s       0.68m     0.01h    0.00d
     #Submission to last job:          1141s      19.02m     0.32h    0.01d
 
     exit # return to hgwdev
     # should be in /hive/data/genomes/danRer6/bed/cpgIsland
 
     # Transform cpglh output to bed +
     catDir results | awk '{ \
     $2 = $2 - 1; \
     width = $3 - $2; \
     printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", \
        $1, $2, $3, $5,$6, width, \
        $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); \
     }' > cpgIsland.bed
 
     # took around 2 minutes
 
     hgLoadBed danRer6 cpgIslandExt -tab \
       -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
 
     #Loaded 13361 elements of size 10
 
 ###########################################################################
 # HUMAN (hg18) PROTEINS TRACK (DONE 2009-07-09 braney )
     # bash  if not using bash shell already
 
     cd /cluster/data/danRer6
     mkdir /cluster/data/danRer6/blastDb
 
     awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
     twoBitToFa -seqList=1meg.lst  danRer6.2bit temp.fa
     faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
     rm temp.fa 1meg.lst
 
     awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
     twoBitToFa -seqList=less1meg.lst  danRer6.2bit temp.fa
     faSplit about temp.fa 1000000 blastDb/y 
     rm temp.fa less1meg.lst
 
     cd blastDb
     for i in *.fa
     do
 	/hive/data/outside/blast229/formatdb -i $i -p F
     done
     rm *.fa
     ls *.nsq | wc -l
 # 1513
 
     mkdir -p /cluster/data/danRer6/bed/tblastn.hg18KG
     cd /cluster/data/danRer6/bed/tblastn.hg18KG
     echo  ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
     wc -l query.lst
 # 1513 query.lst
 
    # we want around 250000 jobs
    calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(250000/`wc query.lst | awk '{print $1}'`\)
 
 # 36727/(250000/1513) = 222.271804
 
    mkdir -p kgfa
    split -l 222 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  kgfa/kg
    cd kgfa
    for i in *; do 
      nice pslxToFa $i $i.fa; 
      rm $i; 
    done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    wc kg.lst
 # 166  166 2158 kg.lst
 
    mkdir -p blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cd /cluster/data/danRer6/bed/tblastn.hg18KG
    cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
 #ENDLOOP
 '_EOF_'
 
    cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/hive/data/outside/blast229/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
 	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/danRer6/blastDb.lft carry $f.2
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
         if pslCheck -prot $3.tmp
         then                  
             mv $3.tmp $3     
             rm -f $f.1 $f.2 $f.3 $f.4
         fi
         exit 0               
     fi                      
 fi                         
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
     # << happy emacs
     chmod +x blastSome
     exit 
     
     ssh swarm
     cd /cluster/data/danRer6/bed/tblastn.hg18KG
     gensub2 query.lst kg.lst blastGsub blastSpec
     para create blastSpec
 #    para try, check, push, check etc.
 
     para time
 # Completed: 251158 of 251158 jobs
 # CPU time in finished jobs:    8587545s  143125.76m  2385.43h   99.39d  0.272 y
 # IO & Wait Time:                885281s   14754.68m   245.91h   10.25d  0.028 y
 # Average job time:                  38s       0.63m     0.01h    0.00d
 # Longest finished job:            1167s      19.45m     0.32h    0.01d
 # Submission to last job:          9852s     164.20m     2.74h    0.11d
 
     ssh swarm
     cd /cluster/data/danRer6/bed/tblastn.hg18KG
     mkdir chainRun
     cd chainRun
     tcsh
     cat << '_EOF_' > chainGsub
 #LOOP
 chainOne $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainOne
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=12000 stdin ../c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainOne
     ls -1dS ../blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
     # do the cluster run for chaining
     para create chainSpec
     para try, check, push, check etc.
 
 # Completed: 166 of 166 jobs
 # CPU time in finished jobs:     107801s    1796.69m    29.94h    1.25d  0.003 y
 # IO & Wait Time:                 23945s     399.08m     6.65h    0.28d  0.001 y
 # Average job time:                 794s      13.23m     0.22h    0.01d
 # Longest finished job:           25056s     417.60m     6.96h    0.29d
 # Submission to last job:         25085s     418.08m     6.97h    0.29d
 
 
     cd /cluster/data/danRer6/bed/tblastn.hg18KG/blastOut
     for i in kg??
     do
        cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
        echo $i
     done
     sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl
     cd ..
     pslCheck blastHg18KG.psl
 # checked: 43305 failed: 0 errors: 0
 
     # load table 
     ssh hgwdev
     cd /cluster/data/danRer6/bed/tblastn.hg18KG
     hgLoadPsl danRer6 blastHg18KG.psl
 
     # check coverage
     featureBits danRer6 blastHg18KG 
 # 19115900 bases of 1506896106 (1.269%) in intersection
 
     featureBits danRer6 blastHg18KG refGene  -enrichment
 # blastHg18KG 1.269%, refGene 1.827%, both 0.659%, cover 51.97%, enrich 28.45x
 
     rm -rf blastOut
 #end tblastn
 
 ############################################################################
 # create lift file on unBridged gaps for genbank splits (2009-07-07 - Hiram)
     mkdir /hive/data/genomes/danRer6/bed/gap
     cd /hive/data/genomes/danRer6/bed/gap
     gapToLift danRer6 danRer6.unBridged.lift -bedFile=unBridged.lift.bed
     cp -p danRer6.unBridged.lift ../../jkStuff
     cp -p danRer6.unBridged.lift /hive/data/staging/data/danRer6
 
     # ask admin to sync this directory: /hive/data/staging/data/danRer6/
     #       to the kluster nodes /scratch/data/danRer6/
 
 ###########################################################################
 # AUTO UPDATE GENBANK MRNA AND EST AND ZGC GENES RUN
 # (DONE, 2009-07-07, galt)
     ssh hgwdev  
     cd $HOME/kent/src/hg/makeDb/genbank
     cvs update -dP
     # edit etc/genbank.conf to add danRer6 and commit to CVS
     # done already for first run so no need to update.
 # danRer6 (zebrafish)
 danRer6.serverGenome = /hive/data/genomes/danRer6/danRer6.2bit
 danRer6.clusterGenome = /scratch/data/danRer6/danRer6.2bit
 danRer6.ooc = /scratch/data/danRer6/11.ooc
 danRer6.lift = /hive/data/genomes/danRer6/jkStuff/danRer6.unBridged.lift
 danRer6.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
 danRer6.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
 danRer6.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
 danRer6.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
 danRer6.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
 danRer6.downloadDir = danRer6
 danRer6.perChromTables = no
 danRer6.refseq.mrna.xeno.load  = yes
 
     # end of section added to etc/genbank.conf
     cvs commit -m "Added danRer6." etc/genbank.conf
     # also added the perChromTables line afterwards. This means that there
     # will not be one table per chromosome. Would be too many tables as 
     # there are 5010 scaffolds.
     # update /cluster/data/genbank/
     make etc-update
 
     # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
     # danRer genome information
 
     ssh genbank
     cd /cluster/data/genbank
     nice bin/gbAlignStep -initial danRer6 &
 
     #var/build/logs/2009.07.07-20:23:34.danRer6.initalign.log
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     nice ./bin/gbDbLoadStep -drop -initialLoad danRer6 &
 
     #var/dbload/hgwdev/logs/2009.07.08-09:59:18.dbload.log
 
     # enable daily alignment and update of hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # add danRer6 to:
         etc/align.dbs
         etc/hgwdev.dbs
     cvs ci -m "Added danRer6." etc/align.dbs etc/hgwdev.dbs
     make etc-update
 
 
 #########################################################################
 #  BLATSERVERS ENTRY (DONE - 2009-07-08 - Galt)
 #	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev
 
     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("danRer6", "blat14", "17790", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("danRer6", "blat14", "17791", "0", "1");' \
 	    hgcentraltest
     #	test it with some sequence
 
 ############################################################################
 # Making download files (DONE - 2009-07-08 - Galt)
 
     ssh hgwdev
     cd /hive/data/genomes/danRer6
 
     ln -s bed/RepeatMasker.2009-06-26/danRer6.fa.out .
     cd bed
     ln -s simpleRepeat.2009-06-29 simpleRepeat
     cd ..
 
     makeDownloads.pl danRer6 >& downloads.log
 
 # *** Please take a look at the downloads for danRer6 using a web browser.
 # *** The downloads url is: http://hgwdev.cse.ucsc.edu/goldenPath/danRer6. 
 # *** Edit each README.txt to resolve any notes marked with "***":
 #     /hive/data/genomes/danRer6/goldenPath/database/README.txt
 #     /hive/data/genomes/danRer6/goldenPath/bigZips/README.txt
 #     (The htdocs/goldenPath/danRer6/*/README.txt "files" are just links to those.)