src/hg/makeDb/doc/danRer5.txt 1.17
1.17 2009/08/10 16:38:35 hartera
Documented production of dog canFam2 chains, nets and downloads.
Index: src/hg/makeDb/doc/danRer5.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/danRer5.txt,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 1000000 -r1.16 -r1.17
--- src/hg/makeDb/doc/danRer5.txt 21 Jul 2009 21:01:41 -0000 1.16
+++ src/hg/makeDb/doc/danRer5.txt 10 Aug 2009 16:38:35 -0000 1.17
@@ -1,3147 +1,3162 @@
# for emacs: -*- mode: sh; -*-
# Danio rerio (zebrafish) from Sanger, version Zv7 (released 07/13/07)
# Project website:
# http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
# http://www.sanger.ac.uk/Projects/D_rerio/
# ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_assembl_information.shmtl
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
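# For example, a hedged sketch (table and file names below are placeholders,
# not from this build): loading with hgLoadGenePred creates the table with
# a bin column, so loads done that way avoid the problem:
# hgLoadGenePred danRer5 someGeneTable someGenes.gp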
###########################################################################
# DOWNLOAD SEQUENCE (DONE, 2007-07-16, hartera)
# MOVE FILES TO SEPARATE DIRECTORY (DONE, 2007-07-20, hartera)
ssh kkstore06
mkdir /cluster/store4/danRer5
ln -s /cluster/store4/danRer5 /cluster/data
cd /cluster/data/danRer5
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/README
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_chr.agp
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_contigs.fa
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_scaffold.agp
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_scaffolds.fa
# move assembly files to a separate directory
mkdir Zv7release
mv *.agp *.fa README ./Zv7release/
###########################################################################
# CREATE AGP FILES FOR RANDOMS (chrNA and chrUn)
# (2007-08-16 - 2007-08-18, hartera)
ssh kkstore06
mkdir /cluster/data/danRer5/Zv7release/randoms
cd /cluster/data/danRer5/Zv7release/randoms
# first find the contigs assigned to chromosomes:
awk '{if ($5 !~ /N/) print $6}' ../Zv7_chr.agp | sort | uniq \
> chromContigs.txt
# get list of all contigs:
awk '{if ($5 !~ /N/) print $6}' ../Zv7_scaffold.agp | sort | uniq \
> allContigs.txt
# find contigs not assigned to a chromosome
comm -23 allContigs.txt chromContigs.txt > contigsRandomsAndAccs.txt
# get all those that are in the Zv7_scaffold.agp and get a list of
# scaffold names.
foreach c (`cat contigsRandomsAndAccs.txt`)
grep -w $c ../Zv7_scaffold.agp >> contigsRandomsScafs.agp
end
# check that all the contigs/clones names in contigsRandomsAndAccs.txt
# are also in contigsRandomsScafs.agp
awk '{if ($6 != 100) print $6}' contigsRandomsScafs.agp \
| sort | uniq > names.sort
wc -l names.sort contigsRandomsAndAccs.txt
# 19845 names.sort
# 19845 contigsRandomsAndAccs.txt
comm -12 contigsRandomsAndAccs.txt names.sort | wc -l
# 19845
# get list of scaffold names
awk '{print $1}' contigsRandomsScafs.agp | sort -k 1.7,1n | uniq \
> scaffoldsRandoms.txt
# make an AGP of the randoms scaffolds only
grep -w -f scaffoldsRandoms.txt ../Zv7_scaffold.agp > randomsScaffold.agp
# check that we have all the scaffolds to be selected
awk '{print $1}' randomsScaffold.agp | sort | uniq > names2.sort
sort scaffoldsRandoms.txt > scafsTmp.sort
wc -l scafsTmp.sort names2.sort
# 5010 scafsTmp.sort
# 5010 names2.sort
comm -12 scafsTmp.sort names2.sort | wc -l
# 5010
# all scaffolds are in the new AGP file that occur in scaffoldsRandoms.txt
# get the list of contigs from the agp
awk '{if ($5 !~ /N/) print $6}' randomsScaffold.agp | sort | uniq \
> randomContigs.list
# extract the FASTA sequences for just these contigs
faSomeRecords ../Zv7_contigs.fa randomContigs.list Zv7contigs_random.fa
# remove excess part of the names for the contigs with accessions
perl -pi.bak -e 's/^(>[A-Z]{2}[0-9]+\.[0-9]+)\s.+$/$1/' Zv7contigs_random.fa
# check that all the contigs from the list are in the FASTA file
grep '>' Zv7contigs_random.fa | sed -e 's/^>//' | sort | uniq \
> Zv7contigs.list
wc -l *.list
# 19736 Zv7contigs.list
# 19845 randomContigs.list
comm -13 Zv7contigs.list randomContigs.list > notFound
wc -l notFound
# 109 notFound
# (2007-08-06, hartera)
# RE-DONE (2007-08-18, hartera)
# These are present in the original, but there are several sequences for
# each accession since there are Ns in the sequence. Need to fix the names
# in the AGP to match each part of the sequence in the FASTA file.
# First, get the list of headers from the FASTA for all contigs
grep '>' ../Zv7_contigs.fa > contigs.headers
# remove ">"
perl -pi.bak -e 's/>//' contigs.headers
foreach a (`cat notFound`)
grep $a contigs.headers >> contigsHeaders.notFound
end
awk '{print $1}' contigsHeaders.notFound > contigsHeaders.notFound.accs
perl -pi.bak -e 's/_.+$//' contigsHeaders.notFound.accs
sort contigsHeaders.notFound.accs | uniq \
> contigsHeaders.notFound.accs.sort
sort notFound | uniq > notFound.sort
wc -l notFound.sort
# 109 notFound.sort
wc -l contigsHeaders.notFound.accs.sort
# 109 contigsHeaders.notFound.accs.sort
comm -12 notFound.sort contigsHeaders.notFound.accs.sort | wc -l
# 109
# So all the notFound accessions are in the list of contig headers
# accessions: contigsHeaders.notFound.accs.sort
# Then extract the names for the accession parts
# e.g. BX649254.13_01364 zK228P6.01364 BX649254 1 32480
# and add them to the AGP file for the correct lines for these
# components. The last 2 fields above are the start and end coordinates
# relative to the accession (BX649254.13 in this case). Also, add
# 50,000 Ns between scaffolds.
# Wrote program to do this: agpAddCtgNamesAndGaps.c
/cluster/home/hartera/bin/x86_64/agpAddCtgNamesAndGaps \
contigsHeaders.notFound randomsScaffold.agp \
randomsScafFixed.agp >& agpScafs.log
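# As an aside, a minimal awk sketch of the renaming step this program
# performs (an illustration only: the real program also renumbers parts and
# inserts the 50,000 N inter-scaffold gaps, which this sketch does not do).
# Each header line is keyed on accession + start + end, and matching AGP
# component lines are renamed to the fragment name:
cat << '_EOF_' > renameFrags.awk
BEGIN { OFS = "\t" }
# pass 1 (headers file): map accession+start+end to the fragment name
NR == FNR { split($1, a, "_"); frag[a[1] SUBSEP $4 SUBSEP $5] = $1; next }
# pass 2 (AGP file): rename matching component lines (field 6)
$5 != "N" && (($6 SUBSEP $7 SUBSEP $8) in frag) {
    $6 = frag[$6 SUBSEP $7 SUBSEP $8]
}
{ print }
'_EOF_'
awk -f renameFrags.awk contigsHeaders.notFound randomsScaffold.agp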
awk '{print $1}' randomsScaffold.agp | sort | uniq > scafs.lst.uniq
wc -l scafs.lst.uniq
# 5010 scafs.lst.uniq
wc -l *.agp
# 46688 contigsRandomsScafs.agp
# 40641 randomsScafFixed.agp
# 35633 randomsScaffold.agp
# 35633 + 5010 = 40643, but there are 2 fewer gap rows since there are none
# at the ends of the random chromosomes. So the number of lines in the
# AGP file is correct: 40641.
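# quick sanity check of that arithmetic:
echo '35633 + 5010 - 2' | bc
# 40641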
# get the list of contigs again for the randoms from the FASTA headers
# and check all of these are present in the AGP file and vice versa.
cp contigs.headers contigs.names
# remove excess part of the names for the contigs with accessions
perl -pi.bak -e \
's/^([A-Z]{2}[0-9]+\.[0-9]+_?[0-9]*\.?[0-9]*)\s.+$/$1/' \
contigs.names
sort contigs.names | uniq > contigs.names.sort
awk '{print $6}' randomsScafFixed.agp | sort | uniq > contigsFromAgp.sort
wc -l contigs*.sort
# 60092 contigs.names.sort
# 20351 contigsFromAgp.sort
comm -13 contigs.names.sort contigsFromAgp.sort
# CR293502.4
# CR356227.33
# CR388165.16
# CR854948.10
# CR931788.11
# CR954226.7
# CT573234.3
# CT573263.4
# CT737182.2
# CT997808.4
# These accessions are not matched from the headers to the AGP file.
# e.g. CR293502.4
# CR293502.4_00001 zK31A1.00001 CR293502 1 27131
# CR293502.4_02478 zK31A1.02478 CR293502 27232 29631
# CR293502.4_01210.0 zK31A1.01210.0 CR293502 119649 233137
# The last 2 fields above are coordinates relative to the accession and should
# match fields 7 and 8 in the relevant lines of the AGP file but, in this
# case, they do not.
# in the AGP file:
# Zv7_scaffold2558 1944730 1960624 49 D CR293502.4 11237
# 27131 +
# Zv7_scaffold2558 1960625 1960724 50 N 100 fragment no
# Zv7_scaffold2558 1960725 1963124 51 D CR293502.4 27232
# 29631 +
# Zv7_scaffold2558 1963125 1963224 52 N 100 fragment no
# Zv7_scaffold2558 1963225 1995007 53 D CR293502.4 201355
# 233137 -
# Co-ordinates are relative to CR293502.4 in fields 7 and 8
grep CR293502.4 randomsScaffold.agp > CR293502.4.agp
# E-mailed Tina Eyre (te3@sanger.ac.uk) and Ian Sealy (is1@sanger.ac.uk) at
# Sanger to ask them about these discrepancies and how to fix them (2007-08-09).
# Received the Zv7_all_scaffolds.agp file on 2007-08-14 from Ian Sealy
# (is1@sanger.ac.uk). This contains the unfinished clones and should not be
# shared with the public. The contig/clone names match up to those for clone
# name fragments in the Zv7_contigs.fa file.
# Received Zv7_all_scaffolds.agp:
grep CR293502.4 ../Zv7_all_scaffolds.agp > CR293502.4.allScafs.agp
# Zv7_scaffold2558 1944730 1960624 49 U CR293502.4_00001
# 11237 27131 + zK31A1.00001 15895 27131
# Zv7_scaffold2558 1960725 1963124 51 U CR293502.4_02478
# 12400 + zK31A1.02478 2400 2400
# Zv7_scaffold2558 1963225 1995007 53 U CR293502.4_01210.0
# 81707 113489 - zK31A1.01210.0 31783 113489
# Coordinates in fields 7 and 8 of this file are relative to the clone
# fragment names in field 6.
foreach f (CR293502*.agp)
awk \
'{if ($5 !~ /N/) print "faFrag ", $6".fa", $7-1, $8, $6$7"-"$8".fa";}' \
$f >> faFrag${f}
end
chmod +x faFrag*
awk '{print $6}' CR293502.4.allScafs.agp > ctgList.txt
foreach f (`cat ctgList.txt`)
echo $f > list
faSomeRecords ../Zv7_contigs.fa list ${f}.fa
end
./faFragCR293502.4.agp
# Wrote 15895 bases to CR293502.411237-27131.fa
# Wrote 2400 bases to CR293502.427232-29631.fa
# Wrote 31783 bases to CR293502.4201355-233137.fa
./faFragCR293502.4.allScafs.agp
# Wrote 15895 bases to CR293502.4_0000111237-27131.fa
# Wrote 2400 bases to CR293502.4_024781-2400.fa
# Wrote 31783 bases to CR293502.4_01210.081707-113489.fa
# When diff on each pair of files of the same size, the sequence is
# identical, only the headers are different.
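# For instance, for the first pair above (a hedged check; both files come
# from faFrag so the line wrapping is identical):
tail -n +2 CR293502.411237-27131.fa > a.seq
tail -n +2 CR293502.4_0000111237-27131.fa > b.seq
diff a.seq b.seq
# no output, so the sequences are identical; only the headers differ
rm a.seq b.seq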
# Decided to base assembly on scaffolds not contigs (2007-08-22)
##########################################################################
# ALL CHROMS AGP (2007-08-18, hartera)
ssh kkstore06
cd /cluster/data/danRer5/Zv7release
awk '{if ($5 !~ /N/) print $6;}' Zv7_all_scaffolds.agp | sort | uniq \
> contigNamesFromAllScafs.sort
# compare to contig names from FASTA file
comm -13 contigNamesFromAllScafs.sort ./randoms/contigs.names.sort
# no output: all contig names from the FASTA file are in the AGP file
comm -23 contigNamesFromAllScafs.sort ./randoms/contigs.names.sort \
> notInAllScafs
wc -l notInAllScafs
# 3924 notInAllScafs
grep "Zv7_NA" notInAllScafs | wc -l
# 3924
# So the only ones not in FASTA file are the 3924 Zv7_NA contigs
# get clone names without underscore and extension
# remove excess part of the names for the contigs with accessions
cp contigNamesFromAllScafs.sort contigNamesFromAllScafs2.sort
perl -pi.bak -e \
's/^([A-Z]{2}[0-9]+\.[0-9]+)_?[0-9]*\.?[0-9]*$/$1/' \
contigNamesFromAllScafs2.sort
grep -v "Zv7_NA" contigNamesFromAllScafs2.sort \
| sort | uniq > contigNamesFromAllScafs2NoNAs.sort
# get list of contigs and clones in randoms only
awk '{if ($5 !~ /N/) print $6;}' ./randoms/randomsScaffold.agp \
| sort | uniq > randomsContigsNames.sort
# remove randoms scaffolds from the list of contigs/clones from
# Zv7_all_scaffolds.agp
comm -13 randomsContigsNames.sort contigNamesFromAllScafs2NoNAs.sort \
> contigNamesFromAllScafs2NoRandoms.txt
sort contigNamesFromAllScafs2NoRandoms.txt | uniq \
> contigNamesFromAllScafs2NoRandoms.sort
# then compare this list to a list of clones/contigs
# from Zv7_chr.agp
awk '{if ($5 !~ /N/) print $6;}' Zv7_chr.agp | sort | uniq \
> chromsAgpContigs.sort
comm -23 contigNamesFromAllScafs2NoRandoms.sort chromsAgpContigs.sort \
| wc -l
# 0
# So there are no new contigs in the Zv7_all_scaffolds.agp file that
# are not randoms or Zv7_NA.
# Try agpAddCtgNamesAndGaps.c on Zv7_chr.agp and see if all
# clone fragments can be found in the FASTA file.
cp ./randoms/contigs.headers .
# get the names from the headers
awk '{print $1}' contigs.headers | sort | uniq > contigsNames.headers.sort
# get the contig/clone names from the Zv7_chr.agp file: sorted file is
# chromContigs.txt
comm -13 contigsNames.headers.sort ./randoms/chromContigs.txt \
> contigsInChromAgpOnly.txt
wc -l contigsInChromAgpOnly.txt
# 575 contigsInChromAgpOnly.txt
# Get FASTA file headers for just this set of contigs. These are the ones
# with fragments named XXNNNN_NN e.g. BX511136.3_00285.0
grep -f contigsInChromAgpOnly.txt contigs.headers \
> contigsInChromAgpOnly.headers
/cluster/home/hartera/bin/x86_64/agpAddCtgNamesAndGaps \
contigsInChromAgpOnly.headers Zv7_chr.agp \
chrFixed.agp >& agpChroms.log
# check if all the contig/clone names in the AGP file have now been
# found in the FASTA file.
sort ./randoms/contigs.names | uniq > contigs.names.sort
# get contig/clone names from fixed AGP file
awk '{if ($5 !~ /N/) print $6}' chrFixed.agp | sort | uniq \
> contigsFromChrFixedAgp.sort
# get list of names in the FASTA contig headers
cp ./randoms/contigs.names.sort .
wc -l contigs*.sort
# 60092 contigs.names.sort
# 39659 contigsFromChrFixedAgp.sort
comm -13 contigs.names.sort contigsFromChrFixedAgp.sort \
> notFoundInChrFixedAgp.txt
wc -l notFoundInChrFixedAgp.txt
# 334 notFoundInChrFixedAgp.txt
echo BX005112.16 > list
nice faSomeRecords Zv7_contigs.fa list BX005112.16_02613.fa
faSize BX005112.16_02613.fa
# 171673 bases (0 N's 171673 real 171673 upper 0 lower) in 1 sequences in
# 1 files
grep BX005112.16 Zv7_chr.agp
# chr21 31909678 32080531 1296 D BX005112.16 820
# 171673 +
# chr21 32080632 32084318 1298 D BX005112.16 171774
# 175460 +
grep BX005112.16 chrFixed.agp
# chr21 1104388161 1104559014 1296 D BX005112.16 820
# 171673 +
# chr21 1104559115 1104562801 1298 D BX005112.16_02934
# 171774 175460 +
grep BX005112.16 Zv7_all_scaffolds.agp
# Zv7_scaffold2077 678529 849382 54 U BX005112.16_02613
# 820 171673 + zK85G15.02613 170854 171673
# Zv7_scaffold2077 849483 853169 56 U BX005112.16_02934
# 1 3687 + zK85G15.02934 3687 3687
grep BX005112.16 contigs.headers
# BX005112.16_02613 zK85G15.02613 BX005112 1 171673
# BX005112.16_02934 zK85G15.02934 BX005112 171774 175460
echo BX005112.16 > list2
nice faSomeRecords Zv7_scaffolds.fa list2 BX005112.fa
# not found, these accessions are not in the FASTA file.
# In order to create the chroms, need to extract the relevant coords from
# the accessions to create the FASTA file.
# Now basing assembly on scaffolds instead of contigs so create
# AGP files for scaffolds.
##########################################################################
# CREATE A SCAFFOLDS AGP WITH SCAFFOLDS ONLY FOR RANDOMS
# (2007-08-22 and 2007-08-24, hartera)
# Make AGP using just the unmapped scaffolds and not making virtual
# chromosomes for unmapped scaffolds so that this is the same as the
# way Ensembl handles these. Therefore there will be 25 chromosomes, chrM,
# plus 5010 unmapped scaffolds.
ssh kkstore06
# make agps and fasta directories
mkdir -p /cluster/data/danRer5/Zv7release/agps
mkdir -p /cluster/data/danRer5/Zv7release/fasta
# move assembly FASTA files to this directory
cd /cluster/data/danRer5/Zv7release
mv Zv7_*.fa ./fasta/
cd /cluster/data/danRer5/Zv7release/randoms
# get list of scaffolds for randoms - one for Un_random and one for
# NA_random scaffolds.
awk '{if ($1 ~ /Zv7_scaffold/) print $1;}' randomsScaffold.agp \
| sort -k 1.7,1n | uniq > Un_randomScafs.list
awk '{if ($1 ~ /Zv7_NA/) print $1;}' randomsScaffold.agp \
| sort -k 1.7,1n | uniq > NA_randomScafs.list
wc -l *randomScafs.list
# 4844 NA_randomScafs.list
# 166 Un_randomScafs.list
# 5010 total
# get sequences for just these scaffolds
foreach f (NA Un)
faSomeRecords ../fasta/Zv7_scaffolds.fa ${f}_randomScafs.list \
Zv7${f}_randomScafs.fa
end
# check that they are all there
foreach f (NA Un)
grep '>' Zv7${f}_randomScafs.fa > ${f}_Random.headers
end
wc -l *Random.headers
# 4844 NA_Random.headers
# 166 Un_Random.headers
# 5010 total
perl -pi.bak -e 's/>//' *Random.headers
foreach f (NA Un)
sort -k 1.7,1n ${f}_Random.headers | uniq > ${f}_Random.headers.sort
end
comm -12 NA_Random.headers.sort NA_randomScafs.list | wc -l
# 4844
comm -12 Un_Random.headers.sort Un_randomScafs.list | wc -l
# 166
# Total is 4844 + 166 = 5010
# so all the sequences in the scaffolds lists are in the FASTA files.
# Make an AGP from these scaffolds FASTA sequences. Use a scaffold gap
# size of 0 since the unmapped scaffolds stay separate sequences; the
# gap lines are removed below anyway.
foreach c (NA_random Un_random)
scaffoldFaToAgp -scaffoldGapSize=0 Zv7${c}Scafs.fa
end
# scaffold gap size is 0, total scaffolds: 4844
# chrom size is 117689868
# writing Zv7NA_randomScafs.agp
# writing Zv7NA_randomScafs.gap
# writing Zv7NA_randomScafs.lft
# scaffold gap size is 0, total scaffolds: 166
# chrom size is 45800611
# writing Zv7Un_randomScafs.agp
# writing Zv7Un_randomScafs.gap
# writing Zv7Un_randomScafs.lft
# Create AGP with just the scaffolds
# sort NA by scaffold number:
# first remove gap lines:
grep -w -v "N" Zv7NA_randomScafs.agp > Zv7NA_randomScafsNoGaps.agp
grep -w -v "N" Zv7Un_randomScafs.agp > Zv7Un_randomScafsNoGaps.agp
sort -k 6.7n -k 6.8n -k 6.9,6.10n -k 6.10,6.11n \
Zv7NA_randomScafsNoGaps.agp > Zv7NA_randomScafsSorted.agp
foreach f (Zv7Un_randomScafsNoGaps.agp Zv7NA_randomScafsSorted.agp)
set g = $f:r
echo $g
awk 'BEGIN {OFS="\t"} {print $6, $7, $8, "1", "W", $6, $7, $8, "+"}' \
$f > ${g}2.agp
end
wc -l Zv7*_randomScafs*agp
# 9687 Zv7NA_randomScafs.agp
# 4844 Zv7NA_randomScafsNoGaps.agp
# 4844 Zv7NA_randomScafsSorted.agp
# 4844 Zv7NA_randomScafsSorted2.agp
# 331 Zv7Un_randomScafs.agp
# 166 Zv7Un_randomScafsNoGaps.agp
# 166 Zv7Un_randomScafsNoGaps2.agp
# 4844 + 166 = 5010 -> total unmapped scaffolds
cat Zv7NA_randomScafsSorted2.agp Zv7Un_randomScafsNoGaps2.agp \
> Zv7AllRandomScafs.agp
wc -l Zv7AllRandomScafs.agp
# 5010 Zv7AllRandomScafs.agp
# move scaffolds AGP to agps directory:
mv Zv7AllRandomScafs.agp /cluster/data/danRer5/Zv7release/agps/
# move sequences to FASTA directory
mv Zv7*_randomScafs.fa \
/cluster/data/danRer5/Zv7release/fasta/
#######################################################################
# PROCESS CHROMOSOME AGP AND SCAFFOLDS AGP TO CREATE A SCAFFOLDS TO
# CHROMOSOMES AGP FILE (DONE, 2007-08-24, hartera)
# The Zv7_chr.agp file contains a mapping of contigs to chromosomes
# and the Zv7_scaffold.agp file contains a mapping of contigs to scaffolds.
# To build the Browser and assemble chromosomes, we need an AGP file
# that maps scaffolds to chromosomes.
# A program, createScaffoldsAgp.c, in kent/src/hg/oneShot/createScaffoldsAgp/,
# was written to create this AGP file.
ssh kkstore06
cd /cluster/data/danRer5/Zv7release/agps
createScaffoldsAgp ../Zv7_scaffold.agp ../Zv7_chr.agp \
Zv7ScafToChrom.agp >& createAgp.log
wc -l *.agp
# 5010 Zv7AllRandomScafs.agp
# 4943 Zv7ScafToChrom.agp
# 9953 total
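# As an illustration only, a rough awk sketch of the core idea (the real
# createScaffoldsAgp.c also validates contig coordinates, assigns part
# numbers, handles strand and emits gap lines; this just derives each
# scaffold's span on its chromosome from the outermost chromosome
# coordinates of its contigs, in arbitrary output order):
cat << '_EOF_' > scafSpans.awk
NR == FNR { if ($5 != "N") scaf[$6] = $1; next }  # pass 1: contig -> scaffold
$5 != "N" && ($6 in scaf) {                       # pass 2: chromosome lines
    s = scaf[$6]
    if (!(s in chromStart) || $2 < chromStart[s]) chromStart[s] = $2
    if ($3 > chromEnd[s]) chromEnd[s] = $3
    chrom[s] = $1
}
END {
    for (s in chromStart)
        print chrom[s], chromStart[s], chromEnd[s], s
}
'_EOF_'
awk -f scafSpans.awk ../Zv7_scaffold.agp ../Zv7_chr.agp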
###########################################################################
# CHECK AGP FILES AND FASTA CONTENTS AND SIZE CONSISTENCY
# (DONE, 2007-08-22, hartera)
#
# Check that all the scaffolds in AGPs are in the FASTA file and vice versa
# get list of scaffolds in AGP files: Zv7ScafToChrom.agp and
# Zv7AllRandomScafs.agp
ssh kkstore06
cd /cluster/data/danRer5/Zv7release/agps
foreach f (*.agp)
awk '{if ($5 !~ /N/) print $6}' $f >> allScafsFromAgps.txt
end
sort allScafsFromAgps.txt | uniq > allScafsFromAgps.sort
grep '>' ../fasta/Zv7_scaffolds.fa | sed -e 's/>//' | sort | uniq \
> scafsHeaders.sort
wc -l *.sort
# 7494 allScafsFromAgps.sort
# 7494 scafsHeaders.sort
comm -12 allScafsFromAgps.sort scafsHeaders.sort | wc -l
# 7494
# 7494 are common to both files so the AGP files contain all the
# scaffolds in the FASTA file and vice versa.
# create one AGP file for the assembly
cat Zv7ScafToChrom.agp Zv7AllRandomScafs.agp > Zv7ScafToChromAndRandom.agp
cd /cluster/data/danRer5/Zv7release/fasta
# make a scaffolds directory to work in
mkdir scaffolds
cd scaffolds
faSize -detailed ../Zv7_scaffolds.fa > Zv7.scaffolds.sizes
# Check that these sizes correspond to the sizes in the scaffolds agp file
# use script compareSizes2.pl
cat << '_EOF_' > compareSizes2.pl
#!/usr/bin/perl -w
# compareSizes2.pl - check that scaffold sizes (from faSize -detailed)
# match the scaffold end coordinates (field 8) in an AGP file.
# Usage: compareSizes2.pl <sizes file> <AGP file>; results go to log.txt
use strict;
my ($file, $agp);
$file = $ARGV[0];
$agp = $ARGV[1];
open(FILE, $file) || die "Can not open $file: $!\n";
open(AGP, $agp) || die "Can not open $agp: $!\n";
open(OUT, ">log.txt") || die "Can not create log.txt: $!\n";
# read the scaffold sizes into a hash keyed by scaffold name
my ($l, @f, $name, $size, %scafsHash);
while (<FILE>)
{
    $l = $_;
    @f = split(/\t/, $l);
    $name = $f[0];
    $size = $f[1];
    $scafsHash{$name} = $size;
}
close FILE;
# for each scaffold line in the AGP, check that the end coordinate
# (field 8) equals the sequence size
while (<AGP>)
{
    my ($line, @fi, $scaf, $end);
    $line = $_;
    if ($line =~ /Zv/)
    {
        @fi = split(/\t/, $line);
        $scaf = $fi[5];
        $end = $fi[7];
        if (exists($scafsHash{$scaf}))
        {
            if ($scafsHash{$scaf} == $end)
            {
                print OUT "$scaf - ok\n";
            }
            else
            {
                print OUT "$scaf - different size to sequence\n";
            }
        }
        else
        {
            print OUT "$scaf - does not exist in list of sizes\n";
        }
    }
}
close AGP;
close OUT;
'_EOF_'
# << happy emacs
chmod +x compareSizes2.pl
perl compareSizes2.pl Zv7.scaffolds.sizes \
../../agps/Zv7ScafToChromAndRandom.agp
grep different log.txt
grep not log.txt
# these are all consistent with the sequence sizes
# check that the co-ordinates in the agp files are consistent:
# field 2 is the start position, field 3 is the end and field 8 is the size
# so check that this is consistent.
cd /cluster/data/danRer5/Zv7release/fasta/scaffolds
awk '{if ($5 !~ /N/ && (($3-$2+1) != $8)) print $6;}' \
../../agps/Zv7AllRandomScafs.agp > Zv7.scaffolds.coordCheck
# this file is empty so they are ok. do the same for the contigs to
# chromsomes .agp file
awk '{if ($5 !~ /N/ && (($3-$2+1) != ($8-$7 +1))) print $6;}' \
../../Zv7_chr.agp > Zv7.contigsToChroms.coordCheck
# this file is empty so ok
# in order to create the scaffolds to chroms AGP file with
# createScaffoldsAgp, the coordinates were checked between the
# Zv7_scaffold.agp and Zv7_chr.agp files.
cd ..
rm -r scaffolds
#######################################################################
# MAKE GENOME DB FROM CHROMOSOMES AND UNMAPPED SCAFFOLDS
# (DONE, 2007-08-24, hartera)
# CHANGE dbDb ENTRY MADE BY SCRIPT TO DISPLAY SAME DEFAULT POSITION
# AS FOR DANRER4, GENE vdac2 (DONE, 2007-09-10, hartera)
# CHANGE dbDb ENTRY SO THAT THE FULL NAME OF THE SANGER INSTITUTE IS GIVEN
# IN THE SOURCENAME FIELD (DONE, 2007-09-22, hartera)
# CHANGED FRAG IN GOLD TABLE FROM GI NUMBER TO ACCESSION FOR chrM
# (DONE, 2007-10-01, hartera)
# Since there will be a mixture of chroms and unmapped scaffolds, the
# chroms must be built first before inputting the assembly sequence to
# makeGenomeDb.pl
ssh kkstore06
cd /cluster/data/danRer5/Zv7release/
cd fasta
agpToFa -simpleMultiMixed ../agps/Zv7ScafToChrom.agp all \
Zv7_chromosomes.fa ./Zv7_scaffolds.fa
# check the chroms are all there
grep '>' Zv7_chromosomes.fa | sed -e 's/>//' > headers
# all chroms 1-25 are there
rm headers
# check agp and FASTA
checkAgpAndFa ../agps/Zv7ScafToChrom.agp Zv7_chromosomes.fa
# All AGP and FASTA entries agree - both files are valid
# cat together chromosomes FASTA and scaffolds FASTA:
cat Zv7_chromosomes.fa Zv7NA_randomScafs.fa Zv7Un_randomScafs.fa \
> Zv7ChromsAndRandomScafs.fa
# To find the gi number for chrM, go to http://www.ncbi.nih.gov/ and search
# Nucleotide for "Danio mitochondrion genome".
# That shows gi number: 15079186, accession: NC_002333.2, 16596 bp
# also GI:8576324 and accession: AC024175
cd /cluster/data/danRer5/
cat << 'EOF' > danRer5.config.ra
db danRer5
clade vertebrate
scientificName Danio rerio
commonName Zebrafish
assemblyDate Jul. 2007
assemblyLabel Sanger Centre, Danio rerio Sequencing Project Zv7
orderKey 449
dbDbSpeciesDir zebrafish
# NC_002333.2
mitoAcc 15079186
agpFiles /cluster/data/danRer5/Zv7release/agps/Zv7ScafToChromAndRandom.agp
fastaFiles /cluster/data/danRer5/Zv7release/fasta/Zv7ChromsAndRandomScafs.fa
'EOF'
# << keep emacs coloring happy
# corrected the wget URL for the mitochondrion genome to:
# http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id=15079186&retmode=text
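# e.g. (quote the URL so the shell does not interpret the '&'s):
wget -O chrM.fa \
'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id=15079186&retmode=text'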
# remove directories created
rm -r TemporaryTrackDbCheckout bed html jkStuff
rm dbDbInsert.sql
ssh hgwdev
cd /cluster/data/danRer5/
hgsql -e 'delete from dbDb where name="danRer5";' hgcentraltest
ssh kkstore06
cd /cluster/data/danRer5
# Run with -debug to see what it would do / check for issues:
makeGenomeDb.pl danRer5.config.ra -debug -verbose=3
makeGenomeDb.pl danRer5.config.ra >& makeGenomeDb.log &
# Took about 10 minutes.
tail -f makeGenomeDb.log
# Follow the directions at the end of the log after
#NOTES -- STUFF THAT YOU WILL HAVE TO DO --
# Search for '***' notes in each file and make corrections (sometimes the
# files used for a previous assembly might make a better template):
# description.html /cluster/data/danRer5/html/{trackDb.ra,gap.html,gold.html}
# Then cd ../.. (to trackDb/) and
# - edit makefile to add danRer5 to DBS.
# - (if necessary) cvs add zebrafish
# - cvs add zebrafish/danRer5
# - cvs add zebrafish/danRer5/*.{ra,html}
# - cvs ci -m "Added danRer5 to DBS." makefile
# - cvs ci -m "Initial descriptions for danRer5." zebrafish/danRer5
# - (if necessary) cvs ci zebrafish
# - Run make update DBS=danRer5 and make alpha when done.
# - (optional) Clean up /cluster/data/danRer5/TemporaryTrackDbCheckout
# - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there.
# makeGenomeDb.pl creates the db for the genome, makes the hgcentraltest
# entries and loads the gold (scaffolds) and gap tables. Creates the GC
# percent, short match and restriction enzyme tracks. It also creates
# a danRer5 directory in the trackDb/zebrafish directory with a trackDb.ra
# file and a template for the assembly description, and for the
# gold and gap tracks.
# Then need to load in fragment gaps manually for gap table.
# (2007-09-10, hartera), Change the default Browser position in dbDb
# so that it displays vdac2 - chr13:14,786,820-14,801,993 - same as for
# danRer4.
ssh hgwdev
hgsql -h genome-testdb -e \
'update dbDb set defaultPos = "chr13:14,786,820-14,801,993" \
where name = "danRer5";' hgcentraltest
# (2007-09-22, hartera), Change the source name for danRer5 to give
# Sanger's full name and use similar format to other genome assembly
# entries in dbDb.
hgsql -h genome-testdb -e \
'update dbDb set sourceName = "Wellcome Trust Sanger Institute, Zv7
assembly" where name = "danRer5";' \
hgcentraltest
# (2007-10-01, hartera), Added the accession instead of the gi number
# for the frag field for chrM in the gold table. gi number is gi|15079186
# and accession is NC_002333.2. This is the same as AC024175.3 which was
# used previously.
ssh hgwdev
hgsql -e \
'update gold set frag = "NC_002333.2" where chrom = "chrM";' danRer5
##########################################################################
# REBUILD GAP TABLE WITH FRAGMENT GAPS AS WELL AS CONTIG GAPS
# (DONE, 2007-08-24, hartera)
# It is confusing not to display the fragment gaps in the gap track -
# these are gaps within contigs.
ssh kkstore06
cd /cluster/data/danRer5/Zv7release/
# need to use the contigs to chroms AGP
# also copy over the contigs to scaffolds AGP for the unmapped scaffolds
cp /cluster/data/danRer5/Zv7release/randoms/randomsScaffold.agp \
/cluster/data/danRer5/Zv7release/agps/
mkdir /cluster/data/danRer5/gap
awk '{if ($5 ~ /N/) print;}' Zv7_chr.agp \
> /cluster/data/danRer5/gap/chroms.gap
# 34449 gaps
awk '{if ($5 ~ /N/) print;}' ./agps/randomsScaffold.agp \
> /cluster/data/danRer5/gap/randomScafs.gap
# 15278 gaps
cd /cluster/data/danRer5/gap
cat chroms.gap randomScafs.gap > danRer5.gap
wc -l /cluster/data/danRer5/gap/*.gap
# 34449 /cluster/data/danRer5/gap/chroms.gap
# 15278 /cluster/data/danRer5/gap/randomScafs.gap
# 49727 /cluster/data/danRer5/gap/danRer5.gap
rm chroms.gap randomScafs.gap
# 2459 is current count in table.
ssh hgwdev
cd /cluster/data/danRer5/
hgLoadGap -unsplit danRer5 /cluster/data/danRer5/gap/danRer5.gap
hgsql -e 'select count(*) from gap;' danRer5
# 49727
# So all the gaps were loaded.
###########################################################################
# REPEATMASKER RUN (DONE, 2007-08-25 - 2007-08-28, hartera)
# RE-RUN REPEATMASKER (DONE, 2007-08-30 - 2007-09-05, hartera)
# The sequence was under-RepeatMasked because the new zebrafish repeats
# library with the unclassified repeats had to be distributed first to all
# the nodes for pk.
# NESTED REPEATS TRACK RE-CREATED AFTER FIX TO extractNestedRepeats.pl
# SCRIPT (DONE, 2007-09-19, angie)
# RepeatMasker,v 1.19 2007/05/23 (May 17 2007 (open-3-1-8) version of
# RepeatMasker) with RepeatMasker library RELEASE 20061006.
# Added zebunc.ref (Zebrafish Unclassified) repeats from RepBase12.07
# from the Genetic Information Research Institute (GIRI,
# http://www.girinst.org/server/RepBase/index.php). Some repeats were
# removed from zebunc.ref because they mask out real genes (advised by
# Zebrafish bioinformatics group at Sanger) - more details below.
# Download the zebunc.ref unclassified repeats file for zebrafish
# from RepBase:
ssh kkstore06
cd /cluster/data/danRer5/
gunzip repeatmaskerlibraries-20061006.tar.gz
# no unclassified zebrafish repeats here
# Download zebunc.ref from
# http://www.girinst.org/server/RepBase/RepBase12.07.fasta/zebunc.ref
# last updated 21-Aug-2007 16:21 451K
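# a hedged fetch of that file (RepBase may require a registered login):
wget --timestamp \
    http://www.girinst.org/server/RepBase/RepBase12.07.fasta/zebunc.ref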
# copy the one used for danRer4 and see if it is any different
cp /cluster/bluearc/RepeatMasker060320/Libraries/zebunc.ref.txt \
zebuncref2006.txt
diff zebunc.ref zebuncref2006.txt
# no difference so check format is still correct for the current
# version of RepeatMasker on pk /scratch/data/RepeatMasker/Libraries/
# format still looks the same.
ssh pk
mkdir /tmp/danRer5
cd /tmp/danRer5
faOneRecord /cluster/data/danRer5/Zv7release/fasta/Zv7_contigs.fa \
Zv7_scaffold1.1 > Zv7scaffold1.1.fa
# Run RepeatMasker on this to get it to create the danio library
/scratch/data/RepeatMasker/RepeatMasker -ali -s -species danio \
Zv7scaffold1.1.fa
cp /cluster/bluearc/RepeatMasker060320/Libraries/zebunc.ref.format \
/scratch/data/RepeatMasker/Libraries/
# Add this to the specieslib created for danio
cd /scratch/data/RepeatMasker/Libraries/
cat zebunc.ref.format \
>> /scratch/data/RepeatMasker/Libraries/20061006/danio/specieslib
grep '>' 20061006/danio/specieslib | wc -l
# 1184
# then run the RepeatMasker script
# Need to re-do this and add the extra zebrafish repeats to the danio
# libraries on /cluster/bluearc/scratch/data/RepeatMasker/
# and get it rsynced to all the parasol hubs, nodes, workhorse machines
# and fileservers.
# E-mail from Kerstin Howe about zebrafish repeats on Aug 31, 2007
# states that some of these unknown repeats were removed because they
# align to real genes. Save this e-mail as README.Repeats
# (2007-09-03, hartera)
ssh kkstore06
cd /cluster/data/danRer5/
# get list of zebunc.ref sequence names
grep '>' zebunc.ref | sed -e 's/>//' > zebunc.ref.names
# make a list of the repeats that Sanger removed: unknownRepsRemoved.txt
# Sanger removed some of these repeats because they masked out real genes
# get this list and remove those repeats from the list of repeat names.
grep -w -v -f unknownRepsRemoved.txt zebunc.ref.names \
> zebuncFixed.ref.names
wc -l *.names
# 958 zebunc.ref.names
# 943 zebuncFixed.ref.names
# select just those remaining names from the zebunc.ref FASTA file
faSomeRecords zebunc.ref zebuncFixed.ref.names zebuncFixed.ref.fa
grep '>' zebuncFixed.ref.fa | wc -l
# 943
# then add these repeats to the library for zebrafish on bluearc
ssh hgwdev
# do a dummy run first to create the specieslib for zebrafish
/cluster/bluearc/scratch/data/RepeatMasker/RepeatMasker \
-species 'danio' /dev/null
# then cat the new library to the specieslib for danio
cat /cluster/data/danRer5/zebuncFixed.ref.fa >> \
/cluster/bluearc/scratch/data/RepeatMasker/Libraries/20061006/danio/specieslib
cd /cluster/bluearc/scratch/data/RepeatMasker/Libraries/20061006/danio
grep '>' specieslib | grep "Dr" | wc -l
# 943
# (2007-09-04, hartera)
# also copy to /cluster/bluearc/RepeatMasker
# do a dummy run first to create the specieslib for zebrafish
/cluster/bluearc/RepeatMasker/RepeatMasker -species 'danio' /dev/null
cp -p \
/cluster/bluearc/scratch/data/RepeatMasker/Libraries/20061006/danio/specieslib \
/cluster/bluearc/RepeatMasker/Libraries/20061006/danio/
# ask cluster-admins for an rsync from
# /cluster/bluearc/scratch/data/RepeatMasker/
# to /scratch/data/RepeatMasker/ on all machines (parasol hubs, nodes,
# fileservers, workhorse machines)
ssh hgwdev
# cleanup
rm /gbdb/danRer5/danRer5.2bit
rm -r /san/sanvol1/scratch/danRer5/RMPart
rm /san/sanvol1/scratch/danRer5/danRer5.2bit
rm /san/sanvol1/scratch/danRer5/*.ooc
hgsql -e 'drop table rmsk;' danRer5
ssh kkstore06
# clean up
cd /cluster/data/danRer5
rm danRer5.rmsk.2bit danRer5.rmskTrf.2bit danRer5.2bit
mv RepeatMasker.2007-08-25/ OldRepeatMask
## use screen for this
time nice doRepeatMasker.pl -bigClusterHub=pk -species danio \
danRer5 >& repeatmask.log &
tail -f repeatmask.log
# 0.338u 0.267s 20:33:39.53 0.0% 0+0k 0+0io 5pf+0w
# Checked part of danRer5.rmsk.2bit with twoBitToFa and it looks masked.
## From previous run of RepeatMasker, this is fixed now:
# Output is in
# /scratch/tmp/doRepeatMasker.cluster.s15477/chr3:16000000-16500000.fa.out
# Checked in danRer4 RM run files and some of them have "*" at the end of
# a line e.g. /cluster/data/danRer4/1/chr1_11/chr1_11_10.fa.out
# Also, the IDs in the current run do not run consecutively
# e.g. 1,2,3,4,5,6,2,7,8 etc.
# E-mailed Angie to ask for advice
# (2007-08-27, hartera)
# liftUp is fixed so it will skip this repeat in the liftUp step when
# it finds a "*" but no ID.
# After the repeat DNA11TA1_DR with ID 821, there is a line with a
# repeat but no ID, just a "*" at the end which confuses liftUp.
# Angie sent this problem as an example to Robert Hubley to fix the bug.
# Here is the offending line:
# 294 9.8 0.0 0.0 chr3:16000000-16500000 372065 372105 (127895) +
# DNA11TA1_DR DNA/Tc1 164 204 (0) *
# Angie fixed liftUp to just warn when it finds a "*" instead of an ID.
# NOTE: the Nested Repeats track is created automatically by the
# doRepeatMasker.pl script. When the script was run to create the
# RepeatMasker track it was found that the extractNestedRepeats.pl
# needed fixing which was done by angie.
# The Nested Repeats track was then re-created (angie, 2007-09-19)
cd /cluster/data/danRer5/bed/RepeatMasker.2007-09-04
extractNestedRepeats.pl danRer5.fa.out > danRer5.nestedRepeats.bed
hgLoadBed danRer5 nestedRepeats danRer5.nestedRepeats.bed \
-sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql
###########################################################################
# SIMPLE REPEAT (TRF) TRACK (DONE, 2007-08-25 - 2007-08-27, hartera)
# TRF can be run in parallel with RepeatMasker on the file server
# since it doesn't require masked input sequence.
ssh kolossus
## use screen for this
mkdir /cluster/data/danRer5/bed/simpleRepeat
cd /cluster/data/danRer5/bed/simpleRepeat
time nice twoBitToFa ../../danRer5.unmasked.2bit stdout \
| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
-bedAt=simpleRepeat.bed -tempDir=/scratch/tmp \
>& trf.log &
# Took about 8 hours to run
tail -f trf.log
# Crashed on kolossus so re-run as before on kki:
# Broken pipe twoBitToFa ../../danRer5.unmasked.2bit
# stdout |
# 12.260u 4.233s 8:05:02.78 0.0% 0+0k 0+0io 0pf+0w
# Segmentation fault trfBig -trf=/cluster/bin/i386/trf stdin
# /dev/null -bedAt=simpleRepeat.bed ...
# This time split the sequence up and run on kki
ssh kkstore06
cd /cluster/data/danRer5/
mkdir splitNoMask
cd splitNoMask
twoBitToFa ../danRer5.unmasked.2bit danRer5.unmasked.fa
faSplit byname danRer5.unmasked.fa .
rm danRer5.unmasked.fa
# Copy the split sequences to iscratch dir on kkr1u00
ssh kkr1u00
rm -r /iscratch/i/danRer5/splitNoMask
mkdir -p /iscratch/i/danRer5/splitNoMask
foreach s (/cluster/data/danRer5/splitNoMask/*.fa)
echo "Copying $s ..."
cp $s /iscratch/i/danRer5/splitNoMask/
end
# Count files transferred
foreach f (/iscratch/i/danRer5/splitNoMask/*.fa)
ls $f >> seq.lst
end
wc -l seq.lst
# 5036 seq.lst
# correct: 25 chroms, 1 chrM, and 5010 unmapped scaffolds.
rm seq.lst
# rsync to cluster machines
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer5/ kkr${R}u00:/iscratch/i/danRer5/
end
## use screen
ssh kki
mkdir -p /cluster/data/danRer5/bed/simpleRepeat
cd /cluster/data/danRer5/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
# run trfBig on one input sequence file, working in /tmp and copying
# the bed result back to the given output path
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << keep emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
foreach f (/iscratch/i/danRer5/splitNoMask/*.fa)
ls -1S $f >> genome.lst
end
gensub2 genome.lst single gsub jobList
/parasol/bin/para create jobList
# 5036 jobs written to batch
/parasol/bin/para try, check, push, check etc...
/parasol/bin/para time
# still crashing on one sequence:
# sh: line 1: 23369 Segmentation fault /cluster/bin/i386/trf
# /tmp/Zv7_scaffold2487.tf 2 7 7 80 10 50 2000 -m -d
# can't open /tmp/Zv7_scaffold2487.tf.2.7.7.80.10.50.2000.mask
# Completed: 5035 of 5036 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 31429s 523.82m 8.73h 0.36d 0.001 y
# IO & Wait Time: 12949s 215.81m 3.60h 0.15d 0.000 y
# Average job time: 9s 0.15m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3818s 63.63m 1.06h 0.04d
# Submission to last job: 5013s 83.55m 1.39h 0.06d
# test this sequence
ssh kkstore06
cd /cluster/data/danRer5/bed/simpleRepeat
mkdir noMaskSplit
mkdir run2
cd noMaskSplit
faSplit -minGapSize=100 -lift=scaf2487.lft gap \
/cluster/data/danRer5/splitNoMask/Zv7_scaffold2487.fa \
10000 scaf2487_
ssh kkr1u00
mkdir /iscratch/i/danRer5/splitScaf2487/
cp /cluster/data/danRer5/bed/simpleRepeat/noMaskSplit/*.fa \
/iscratch/i/danRer5/splitScaf2487/
ls /iscratch/i/danRer5/splitScaf2487/*.fa | wc -l
# 250
# rsync to all iServers
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer5/ kkr${R}u00:/iscratch/i/danRer5/
end
ssh kki
cd /cluster/data/danRer5/bed/simpleRepeat/run2
cp ../runTrf .
cp ../gsub .
mkdir trf
ls -1S /iscratch/i/danRer5/splitScaf2487/*.fa > genome.lst
gensub2 genome.lst single gsub jobList
/parasol/bin/para create jobList
# 250 jobs written to batch
/parasol/bin/para try, check, push, check etc...
/parasol/bin/para time
# Completed: 249 of 250 jobs
# Crashed: 1 jobs
#CPU time in finished jobs: 73s 1.22m 0.02h 0.00d 0.000 y
#IO & Wait Time: 622s 10.36m 0.17h 0.01d 0.000 y
#Average job time: 3s 0.05m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 23s 0.38m 0.01h 0.00d
#Submission to last job: 63s 1.05m 0.02h 0.00d
/parasol/bin/para problems >& problems
# trf/scaf2487_082.bed does not exist
# stderr:
# sh: line 1: 7469 Segmentation fault /cluster/bin/i386/trf
# /tmp/scaf2487_082.tf 2 7 7 80 10 50 2000 -m -d
# can't open /tmp/scaf2487_082.tf.2.7.7.80.10.50.2000.mask
# Failed sequence looks fine - just A,C,G,T or N.
# Downloaded the latest version of trf (v4.0, previous was v3.21)
# for both 64 bit and i386. Tried 64 bit version on the scaf2487_082.fa
# and it works fine. So now run on kolossus.
mv /cluster/data/danRer5/bed/simpleRepeat \
/cluster/data/danRer5/bed/OldsimpleRepeat
# run on the pk cluster, too slow on kolossus using the 2bit sequence file.
ssh pk
mkdir -p /san/sanvol1/scratch/danRer5/splitNoMask
# copy the split sequence files to the san - 1 for each chrom and 1
# for each unmapped scaffold
foreach f (/cluster/data/danRer5/splitNoMask/*.fa)
cp $f /san/sanvol1/scratch/danRer5/splitNoMask
end
mkdir /cluster/data/danRer5/bed/simpleRepeat
cd /cluster/data/danRer5/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
# as above, but using the new 64-bit trf binary
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/x86_64/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << keep emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
foreach f (/san/sanvol1/scratch/danRer5/splitNoMask/*.fa)
ls -1S $f >> genome.lst
end
/parasol/bin/gensub2 genome.lst single gsub jobList
/parasol/bin/para create jobList
# 5036 jobs written to batch
/parasol/bin/para try, check, push, check etc...
/parasol/bin/para time
# Completed: 5036 of 5036 jobs
# CPU time in finished jobs: 45132s 752.20m 12.54h 0.52d 0.001 y
# IO & Wait Time: 16660s 277.67m 4.63h 0.19d 0.001 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 5804s 96.73m 1.61h 0.07d
# Submission to last job: 6042s 100.70m 1.68h 0.07d
# (2007-08-27, hartera)
# the input to trf was the chromosomes and scaffolds so no liftUp needed.
# cat all the files together
cat ./trf/*.bed > simpleRepeat.bed
# load table
ssh hgwdev
cd /cluster/data/danRer5/bed/simpleRepeat
hgLoadBed danRer5 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 966523 elements of size 16
###########################################################################
# PROCESS SIMPLE REPEATS INTO A MASK (DONE, 2007-08-28, hartera)
# RE-DONE AFTER RE-RUNNING REPEATMASKER (DONE, 2007-09-05, hartera)
# The danRer5.rmsk.2bit is already masked with RepeatMasker output.
# After the simpleRepeats track has been built, make a filtered
# version of the trf output for masking: keep trfs with period <= 12.
ssh kkstore06
cd /cluster/data/danRer5/bed/simpleRepeat
rm -r trfMask
rm trfMask.bed
mkdir trfMask
foreach f (trf/*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
cat ./trfMask/*.bed > trfMask.bed
# don't need to lift up. these are already in chrom and unmapped
# scaffold coordinates.
## Add trfMask to repeat masked sequence
ssh kkstore06
cd /cluster/data/danRer5
# cleanup old sequence:
rm danRer5.rmskTrf.2bit danRer5.2bit
cat << '_EOF_' > addTrf.csh
#!/bin/csh -efx
#This script will fail if any of its commands fail.
set DB = danRer5
set WORK_DIR = /cluster/data/${DB}
cd ${WORK_DIR}
set inputTwoBit = ${WORK_DIR}/${DB}.rmsk.2bit
set outputTwoBit = ${WORK_DIR}/${DB}.rmskTrf.2bit
cat /cluster/data/${DB}/bed/simpleRepeat/trfMask.bed \
| twoBitMask -add -type=.bed ${inputTwoBit} stdin ${outputTwoBit}
twoBitToFa ${outputTwoBit} stdout | faSize stdin > faSize.${DB}.rmskTrf.txt
'_EOF_'
# << happy emacs
chmod +x ./addTrf.csh
time ./addTrf.csh
# Warning: BED file stdin has >=13 fields which means it might contain
# block coordinates, but this program uses only the first three fields
# (the entire span -- no support for blocks).
# This is ok, there are no blocks, only the first 3 fields are
# standard BED format.
# 44.550u 6.030s 0:50.66 99.8% 0+0k 0+0io 0pf+0w
cat faSize.danRer5.rmskTrf.txt
# 1440582308 bases (4986636 N's 1435595672 real 724596899 upper 710998773
# lower) in 5036 sequences in 1 files
# Total size: mean 286056.9 sd 3643515.2 min 2000 (Zv7_NA5415) max 70371393
# (chr5) median 7336
# N count: mean 990.2 sd 9834.4
# U count: mean 143883.4 sd 1862761.1
# L count: mean 141183.2 sd 1772056.1
# %49.35 masked total, %49.53 masked real
# this is similar to previous masking of zebrafish assembly sequence
# for danRer4: %44.10 masked total, %48.94 masked real
ln -s danRer5.rmskTrf.2bit danRer5.2bit
# add symlink to /gbdb/danRer5 to symlink the danRer5.2bit file
ssh hgwdev
rm /gbdb/danRer5/danRer5.2bit
ln -s /cluster/data/danRer5/danRer5.rmskTrf.2bit \
/gbdb/danRer5/danRer5.2bit
# copy over to the san
rm /san/sanvol1/scratch/danRer5/danRer5.2bit
cp -p /cluster/data/danRer5/danRer5.rmskTrf.2bit \
/san/sanvol1/scratch/danRer5/danRer5.2bit
# also copy to iscratch
ssh kkr1u00
rm /iscratch/i/danRer5/danRer5.2bit
cp -p /cluster/data/danRer5/danRer5.rmskTrf.2bit \
/iscratch/i/danRer5/danRer5.2bit
# rsync to iServers
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer5/ \
kkr${R}u00:/iscratch/i/danRer5/
end
###########################################################################
# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2007-08-28, hartera)
# RE-RUN AFTER RE-RUNNING REPEATMASKER AND RE-MASKING SEQUENCE
# (DONE, 2007-09-05, hartera)
# Use -repMatch=512 (based on size -- for human we use 1024, and
# the zebrafish genome is ~50% of the size of the human genome:
# zebrafish is about 1.44 Gb and human is 3.1 Gb).
ssh kolossus
rm -r /cluster/data/danRer5/bed/ooc
mkdir /cluster/data/danRer5/bed/ooc
cd /cluster/data/danRer5/bed/ooc
time blat /cluster/data/danRer5/danRer5.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=danRer5_11.ooc -repMatch=512
# 75.647u 3.159s 1:21.04 97.2% 0+0k 0+0io 2pf+0w
# Wrote 40297 overused 11-mers to danRer5_11.ooc
# then make 10.ooc file for overused 10mers, repMatch = 4096
# for humans so use 2048 for danRer5
time blat /cluster/data/danRer5/danRer5.2bit \
/dev/null /dev/null -tileSize=10 -makeOoc=danRer5_10.ooc -repMatch=2048
# 72.078u 2.587s 1:14.83 99.7% 0+0k 0+0io 0pf+0w
# Wrote 9589 overused 10-mers to danRer5_10.ooc
# copy the *.ooc files over to the san for cluster runs and Genbank run
rm /san/sanvol1/scratch/danRer5/*.ooc
cp -p *.ooc /san/sanvol1/scratch/danRer5/
# also copy to iscratch
ssh kkr1u00
rm /iscratch/i/danRer5/*.ooc
cp -p /cluster/data/danRer5/bed/ooc/*.ooc /iscratch/i/danRer5/
# rsync to iServers
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer5/*.ooc \
kkr${R}u00:/iscratch/i/danRer5/
end
###########################################################################
# CREATE LIFT FILE SEPARATED BY NON-BRIDGED GAPS
# (DONE, 2007-08-28, hartera)
# This is needed for the GenBank run. Use Hiram's gapToLift program
ssh hgwdev
cd /cluster/data/danRer5/jkStuff
gapToLift danRer5 nonBridgedGap.lft
# copy over to the san
cp -p nonBridgedGap.lft /san/sanvol1/scratch/danRer5
# there are 14973 rows in this file, compared to only 370 for mouse,
# but that's ok.
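# e.g. to check the row count:
wc -l nonBridgedGap.lft
# 14973 nonBridgedGap.lft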
###########################################################################
# AUTO UPDATE GENBANK MRNA AND EST AND ZGC GENES RUN
# (DONE, 2007-08-28, hartera)
# RE-RUN AFTER RE-DOING REPEATMASKING OF GENOME SEQUENCE
# (DONE, 2007-09-05, hartera)
ssh hgwdev
cd $HOME/kent/src/hg/makeDb/genbank
cvs update -dP
# edit etc/genbank.conf to add danRer5 and commit to CVS
# done already for first run so no need to update.
# danRer5 (zebrafish)
danRer5.serverGenome = /cluster/data/danRer5/danRer5.2bit
danRer5.clusterGenome = /iscratch/i/danRer5/danRer5.2bit
danRer5.ooc = /iscratch/i/danRer5/danRer5_11.ooc
danRer5.lift = /cluster/data/danRer5/jkStuff/nonBridgedGap.lft
danRer5.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
danRer5.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
danRer5.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
danRer5.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
danRer5.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
danRer5.downloadDir = danRer5
danRer5.mgcTables.default = full
danRer5.mgcTables.mgc = all
danRer5.perChromTables = no
# end of section added to etc/genbank.conf
cvs commit -m "Added danRer5." etc/genbank.conf
# also added the perChromTables line afterwards. This means that there
# will not be one table per chromosome. Would be too many tables as
# there are 5010 scaffolds.
# update /cluster/data/genbank/
make etc-update
# ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
# danRer genome information
ssh kkstore06
cd /cluster/data/genbank
# for re-run, remove everything under these directories:
rm -r /cluster/data/genbank/data/aligned/genbank.161.0/danRer5/*
rm -r /cluster/data/genbank/data/aligned/refseq.24/danRer5/*
time nice bin/gbAlignStep -initial danRer5 &
# 3691.965u 996.052s 4:08:37.03 31.4% 0+0k 0+0io 32pf+0w
# logFile: var/build/logs/2007.09.05-10:04:19.danRer5.initalign.log
# check log file - looks ok, it has finished (last line in log file)
# kkstore06.cse.ucsc.edu 2007.09.05-14:12:56 danRer5.initalign: finish
# then load database
ssh hgwdev
cd /cluster/data/genbank
# -drop parameter will ensure that old tables are dropped before loading
time nice ./bin/gbDbLoadStep -drop -initialLoad danRer5 &
# logFile: var/dbload/hgwdev/logs/2007.09.05-14:36:03.dbload.log
# 1108.286u 310.389s 41:23.51 57.1% 0+0k 0+0io 16pf+0w
# This time ran smoothly and the only problem was an NFS issue with
# removing directories at the end - the load went ok though.
# On the first time loading the GenBank alignments, it failed
# because there were too many open files when trying to load a table
# per chrom and per scaffold, so add the danRer5.perChromTables = no
# line to the genbank.conf file (see above) so that it will all be loaded
# into one table. Too many tables otherwise, as there are 25 chroms, chrM
# and 5010 scaffolds.
# Solution: remove lock file:
# rm /cluster/data/genbank/var/dbload/hgwdev/run/dbload.lock
# then run again
###########################################################################
# ENSEMBL GENES TRACK FOR PROTEIN-CODING GENES FROM
# ENSEMBL VERSION 46 (AUG 2007) (DONE, 2007-09-05, markd)
# Compare peptides translated from genome to those downloaded from
# BioMart for these transcripts and LOAD ensPep TABLE WITH PEPTIDE
# SEQUENCES FOR THE PROTEIN-CODING ENSEMBL GENES
# (DONE, 2007-09-24 - 2007-09-25, hartera)
# ADD AUTOSQL CODE FOR ensInfo TABLE AND ADD CODE TO HGC TO PRINT
# THIS ADDITIONAL INFORMATION ON THE DETAILS PAGE FOR EACH GENE
# (DONE, 2007-09-28 and 2007-10-03, hartera)
# Used the programs created by Robert Baertsch to download Ensembl
# genes from Ensembl's ftp site, load it into the danRer5 database.
ssh hgwdev
mkdir /cluster/data/danRer5/bed/ensembl46
cd /cluster/data/danRer5/bed/ensembl46
# Download Ensembl genes from ftp site and create tracks in browser:
# current version is danio_rerio core_46_7
hgLoadEnsembl danio_rerio core_46_7 danRer5
# these problems were detected, add to problem.ids file:
genePredCheck -db=danRer5 ensGene.gp
# Error: ensGene.gp:2720: ENSDART00000019661 exonFrame on non-CDS exon 14
# Error: ensGene.gp:3874: ENSDART00000027605 exonFrame on non-CDS exon 0
# Error: ensGene.gp:9416: ENSDART00000062681 exonFrame on non-CDS exon 12
# checked: 31743 failed: 3
hgLoadEnsembl -f problem.ids danio_rerio core_46_7 danRer5
# hgLoadEnsembl creates the ensGene, ensGeneXref, ensGtp and ensInfo
# tables but no ensPep peptide sequence table.
# Compare the peptides as translated from the genomic sequence of the
# CDS for each transcript and the peptides for the transcripts downloaded
# from BioMart (2007-09-23 - 2007-09-24, hartera)
ssh kkstore06
mkdir /cluster/data/danRer5/bed/ensembl46/peptides
ssh hgwdev
cd /cluster/data/danRer5/bed/ensembl46/peptides
hgsql -N -e 'select distinct(name) from ensGene;' danRer5 | sort \
> ensGene.txIds.sort
# First get peptides from the genome translation of the genes defined in
# ensGene. This does not work for the mitochondrial genome. The ability
# to deal with vertebrate mitochondrial genetic code for translation
# of genes on chrM was added (markd).
getRnaPred -peptides -genePredExt danRer5 ensGene all ensGeneTablePep.fa
# Took about 10 minutes
# Then download the peptide sequences from BioMart and compare
# differences may include selenocysteines and translational frameshifts
# Get the ensembl peptide sequences from
# http://www.ensembl.org/biomart/martview
# Follow this sequence:
# 1) Choose the Ensembl Genes 46 as the database and then
# Danio Rerio genes (ZFISH7) as the dataset.
# 2) Click on the Attributes link in the left panel. Select sequences.
# 3) Expand the SEQUENCES section and choose Peptide as type of sequence
# to export and then expand the Header Information section and select
# Ensembl Gene ID from Gene Attributes and Ensembl Transcript ID
# and Ensembl Peptide ID from Transcript Attributes
# 4) Click on the Filters link in the left panel and expand the GENE
# section. Select the Gene type box and then select protein_coding as
# these are the only genes with an associated protein sequence.
# 5) Click on the Results link in the top black menu bar and
# choose FASTA for the output and export all results to
# Compressed file (.gz). Select unique results only.
# save the file as ensembl46Pep.fasta.gz and move to
# /cluster/data/danRer5/bed/ensembl46
ssh kkstore06
cd /cluster/data/danRer5/bed/ensembl46/peptides
# gunzip the Ensembl peptides file downloaded from BioMart
gunzip ensembl46Pep.fasta.gz
grep '>' ensembl46Pep.fasta | wc -l
# 31743
grep '>' ensembl46Pep.fasta > headers
awk 'BEGIN {FS="|"} {print $1;}' headers | sort | uniq \
> pepBioMart.txIds.sort
perl -pi.bak -e 's/>//' pepBioMart.txIds.sort
# compare list of transcript IDs from the ensGene table to those for the
# downloaded peptides from BioMart.
comm -13 ensGene.txIds.sort pepBioMart.txIds.sort
# from the BioMart peptide download there were 3 extra:
# ENSDART00000019661
# ENSDART00000027605
# ENSDART00000062681
# These were the problem IDs that were removed from the set - see above.
comm -23 ensGene.txIds.sort pepBioMart.txIds.sort
# no difference
# change headers so that they only have the Ensembl transcript ID.
awk 'BEGIN{FS="|"} {if ($1 ~ /ENSDART/) print $1; else print;}' \
ensembl46Pep.fasta > ensembl46PepTxIdsOnly.fasta
# use faCmp (~/kent/src/utils/faCmp/faCmp.c) to compare the two files.
# it requires the same number of sequences in both files.
# Use faFilter to remove the ones not in ensGeneTablePep.fa
faFilter -v -name=ENSDART00000019661 ensembl46PepTxIdsOnly.fasta tmp1.fa
faFilter -v -name=ENSDART00000027605 tmp1.fa tmp2.fa
faFilter -v -name=ENSDART00000062681 \
tmp2.fa ensembl46PepTxIdsOnlyFilt.fasta
grep '>' ensembl46PepTxIdsOnlyFilt.fasta | wc -l
# 31740
rm tmp*
# faCmp assumed that DNA sequences were being compared. An option was
# added so that protein sequences could be compared (markd added this).
faCmp -sortName -peptide \
ensGeneTablePep.fa ensembl46PepTxIdsOnlyFilt.fasta \
>& ensTablevsBioMartFasta.diff
# ENSDART00000038434 in ensGeneTablePep.fa differs from ENSDART00000038434 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 114 (X != V)
# ENSDART00000046903 in ensGeneTablePep.fa differs from ENSDART00000046903 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 71 (X != T)
# ENSDART00000047515 in ensGeneTablePep.fa differs from ENSDART00000047515 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 149 (X != L)
# ENSDART00000049170 in ensGeneTablePep.fa differs from ENSDART00000049170 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 172 (X != S)
# ENSDART00000080260 in ensGeneTablePep.fa differs from ENSDART00000080260 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 353 (X != G)
# ENSDART00000081978 in ensGeneTablePep.fa differs from ENSDART00000081978 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 21 (X != R)
# ENSDART00000082954 in ensGeneTablePep.fa differs from ENSDART00000082954 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 747 (X != A)
# ENSDART00000083022 in ensGeneTablePep.fa differs from ENSDART00000083022 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 344 (X != L)
# ENSDART00000105255 in ensGeneTablePep.fa differs from ENSDART00000105255 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 427 (X != A)
# There are differences in just 9 of the protein sequences where we have an
# "X" because there is an "N" in the genome sequence in the codon for the
# amino acid whereas Ensembl used transcript evidence to predict the
# missing base. Since there are so few differences, the predicted
# translation from the genome is good enough without loading an ensPep table.
# Load anyway for completeness since there are some differences. Use
# "generic" as the type for loading using hgPepPred, not "ensembl" since
# all the IDs were removed from the header line but the transcript ID.
# If type "ensembl" is defined, it expects at least 3 IDs separated by "|".
ssh hgwdev
cd /cluster/data/danRer5/bed/ensembl46/peptides
# load the BioMart downloaded Ensembl 46 peptide sequences into database:
hgPepPred danRer5 generic ensPep ensembl46PepTxIdsOnlyFilt.fasta
hgsql -e 'select count(*) from ensPep;' danRer5
# 31740
# All the sequences were loaded.
# Added a zebrafish/danRer5/trackDb.ra entry for ensGene with the
# archive setting, aug2007, so that URLs for Ensembl Genes point to the
# correct archive. Added the Ensembl version number to the long label and
# as a dataVersion setting (Ensembl Release 46).
# clean up
ssh kkstore06
cd /cluster/data/danRer5/bed/ensembl46/peptides
rm tmp*.fa headers ensPep.tab ensembl46PepTxIdsOnly.fasta *.bak
rm pepBioMart.txIds.sort ensGene.txIds.sort
# (2007-09-28, hartera)
# Add code to use the ensInfo table, use autoSql. There is already an
# ensInfo.sql table definition in ~/kent/src/hg/lib
ssh hgwdev
cat << 'EOF' > $HOME/kent/src/hg/lib/ensInfo.as
table ensInfo
"ensembl Genes track additional information"
(
string name; "Ensembl transcript ID"
string otherId; "other ID"
string geneId; "Ensembl gene ID"
string class; "Ensembl gene type"
string geneDesc; "Ensembl gene description"
string status; "Ensembl gene confidence"
)
'EOF'
cd $HOME/kent/src/hg/lib
# move the current ensInfo.sql out of the way
mv ensInfo.sql tmp.sql
autoSql ensInfo.as ensInfo
# rename the autogenerated file and reinstate the original file as
# ensInfo.sql. The difference is that the original file has a longblob for
# the description whereas the description field in the autoSql-generated
# code just has a varchar(255) which may not be big enough in all cases.
mv ensInfo.sql ensInfoTemp.sql
mv tmp.sql ensInfo.sql
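# Sanity check on the description field types (a sketch; assumes both files
# name the column geneDesc as in the .as definition above):
grep -i geneDesc ensInfo.sql ensInfoTemp.sql
# ensInfo.sql should show longblob, ensInfoTemp.sql only varchar(255)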
mv ensInfo.h ../inc/
# commit ../inc/ensInfo.h, ensInfo.c and ensInfo.as to CVS
# add ensInfo.o to makefile and commit to CVS
cd $HOME/kent/src/hg/
# add code to hgc.c to doEnsemblGene function to write information
# from the ensInfo table on the details page for each gene.
# (2007-10-03, hartera). Check whether the name2 field in ensGene is
# always the same as otherId in the ensInfo table.
ssh hgwdev
cd /cluster/data/danRer5/bed/ensembl46
hgsql -N -e 'select name, name2 from ensGene;' danRer5 \
| sort | uniq > names.ensGene.sort
hgsql -N -e 'select name, otherId from ensInfo;' danRer5 \
| sort | uniq > names.ensInfo.sort
wc -l names*.sort
# 31740 names.ensGene.sort
# 31991 names.ensInfo.sort
comm -12 names.ensGene.sort names.ensInfo.sort | wc -l
# 31740
# So the name, name2 pair in ensGene is the same as the name, otherId
# pair in ensInfo. ensInfo has more entries than ensGene because
# it includes pseudogene and non-coding RNAs. hgc already prints the
# Alternate Name using name2 from ensGene so do not need to print out
# the otherId field from ensInfo.
###########################################################################
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR danRer5 (DONE, 2007-09-11, hartera)
ssh hgwdev
# DNA port is "0", trans prot port is "1"
echo 'insert into blatServers values("danRer5", "blat8", "17784", "1",
"0"); insert into blatServers values("danRer5", "blat8", "17785", "0",
"1");' \
| hgsql hgcentraltest
# this enables blat and isPcr; isPcr is enabled by loading the blat server
# with tileSize=5 (ask for this when requesting blat servers from
# cluster admin).
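# quick sanity check on the new entries (sketch):
hgsql -e 'select * from blatServers where db="danRer5";' hgcentraltest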
# if you need to delete those entries
echo 'delete from blatServers where db="danRer5";' | hgsql hgcentraltest
###########################################################################
# VEGA GENES (DONE, 2007-09-05 - 2007-09-12, hartera)
# Data provided by Tina Eyre from Sanger: te3@sanger.ac.uk
# GTF file sent on 2007-08-30, Vegav27
ssh kkstore06
mkdir -p /cluster/data/danRer5/bed/vega27/data/
cd /cluster/data/danRer5/bed/vega27/data
# 2 files: vegav27.gtf.zip vegav27_information.txt.zip
unzip vegav27.gtf.zip
mv ./lustre/work1/ensembl/te3/otter_vega_builds/vega/070523/vegav27.gtf .
rm -r lustre
unzip vegav27_information.txt.zip
mv \
./lustre/work1/ensembl/te3/otter_vega_builds/vega/070523/vegav27_information.txt \
.
rm -r lustre
# asked for the vegav27_information.txt file to be in the format
# required to load the table directly.
# Check the format then prepare data files to load database tables.
# Can grep for "otter" to get only gene description lines from the GTF
# file - the rest is output and header lines from the program that
# generated it.
# 2007-09-06, hartera
ssh kkstore06
cd /cluster/data/danRer5/bed/vega27
# get just lines describing genes and remove "chr" prefix from scaffolds
# NOTE: there are no NA scaffolds.
grep "otter" ./data/vegav27.gtf | sed -e 's/chrZv7_/Zv7_/' > vegav27Fixed.gtf
# vegaInfo is transcriptId, otterId, geneId, method and geneDesc
# Get otter transcript ID and otter gene ID:
awk 'BEGIN{FS=";"} {OFS="\t"} \
{if (($5 ~ /otter_gene_id/) && ($6 ~ /otter_transcript_id/)) \
print $5, $6;}' vegav27Fixed.gtf | sort | uniq > vega27OtterIds.txt
# remove the otter_gene_id and otter_transcript_id and
# extra spaces and quotation marks.
perl -pi.bak -e 's/\sotter_gene_id\s\"//;' vega27OtterIds.txt
perl -pi.bak -e 's/"\t\sotter_transcript_id\s"(OTTDART[0-9]+)"/\t$1/' \
vega27OtterIds.txt
wc -l vega27OtterIds.txt
# 12819 vega27OtterIds.txt
# get list of the otter gene IDs only
awk '{print $1}' vega27OtterIds.txt | sort | uniq > otterGeneIds.only
# then get list of otter gene IDs from information file
awk '{if ($1 ~ /OTTDARG/) print $1}' data/vegav27_information.txt \
| sort | uniq > infoOtterGeneIds.only
comm -12 otterGeneIds.only infoOtterGeneIds.only | wc -l
# 8649
wc -l *.only
# 10374 infoOtterGeneIds.only
# 8649 otterGeneIds.only
# All the gene IDs from the GTF file are in the information file, but the
# information file contains 1725 gene IDs (10374 - 8649) that are not in
# the GTF file.
comm -13 otterGeneIds.only infoOtterGeneIds.only > geneIds.InfoOnly.txt
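# sanity check (sketch): the count should be 10374 - 8649 = 1725
wc -l geneIds.InfoOnly.txt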
cat << 'EOF' > mergeTransIdAndInfo.pl
#!/usr/bin/perl -w
use strict;
my ($idsFile, $infoFile, %idsHash);
$idsFile = $ARGV[0];
$infoFile = $ARGV[1];
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
# Read in the otter gene IDs and transcript IDs
while (<IDS>)
{
my (@f, $line);
chomp;
$line = $_;
# split line on tab
@f = split(/\t/, $line);
# hash keyed by gene ID, stores transcript ID
# more than one ID so store as an array
if ($f[0] =~ /OTTDARG/)
{
# add to array in hash if it is an otter gene ID
push @{$idsHash{$f[0]}}, $f[1];
}
}
close IDS;
while (<INFO>)
{
my ($line, @info, $geneId, @transIds);
# read in information file and get the otter gene ID
$line = $_;
# split line on tab
@info = split(/\t/, $line);
if ($info[0] =~ /OTTDARG/)
{
$geneId = $info[0];
# look up transcript ID in hash
# if gene ID exists in the hash then print out the information
# file line with the correct transcript ID
if (exists($idsHash{$geneId}))
{
@transIds = @{$idsHash{$geneId}};
# rewrite line with transcript ID
foreach my $t (@transIds)
{
print "$t\t$line";
}
}
}
}
close INFO;
'EOF'
# << happy emacs
chmod +x mergeTransIdAndInfo.pl
mergeTransIdAndInfo.pl vega27OtterIds.txt ./data/vegav27_information.txt \
> vegav27InfoWithTransIds.txt
wc -l vega27OtterIds.txt
# 12819 vega27OtterIds.txt
wc -l vegav27InfoWithTransIds.txt
# 12820 vegav27InfoWithTransIds.txt
awk 'BEGIN {OFS="\t"} {print $2, $1}' vegav27InfoWithTransIds.txt \
| sort | uniq > infoWithTrans.Ids
sort vega27OtterIds.txt | uniq > otterIds.sort
comm -13 otterIds.sort infoWithTrans.Ids
comm -23 otterIds.sort infoWithTrans.Ids
# No difference - there must be an extra duplicated line
wc -l infoWithTrans.Ids
# 12819 infoWithTrans.Ids
awk 'BEGIN {OFS="\t"} {print $2, $1}' vegav27InfoWithTransIds.txt \
| sort | uniq -c | sort -nr > count
# 2 OTTDARG00000014371 OTTDART00000027056
# this appears twice in info file
grep OTTDARG00000014371 ./data/vegav27_information.txt
# OTTDARG00000014371 DKEY-26M21.1 ZDB-GENE-041210-211
# si:ch211-162i19.2 protein_coding novel protein NOVEL
# OTTDARG00000014371 DKEY-26M21.1 ZDB-GENE-041210-211
# si:ch211-162i19.2 protein_coding novel protein NOVEL
# remove one copy from the vegav27_information.txt file.
# so just sort and uniq the resulting information file with transcript IDs
sort vegav27InfoWithTransIds.txt | uniq > vegav27InfoWithTransIdsUniq.txt
wc -l vegav27InfoWithTransIdsUniq.txt
# 12819 vegav27InfoWithTransIdsUniq.txt
# There are extra IDs in the information file as it includes all Vega
# genes (including clones not in Ensembl). Those actually in the Vega
# v27 GTF are those that are found on clones that are in Ensembl.
# Vega includes only clone sequence and Zv7_NA scaffolds are WGS so
# these are not annotated in Vega. This information is from
# an e-mail from Tina Eyre (te3@sanger.ac.uk) on 2007-09-07.
# then use the info file to grab those genes that are pseudogenes, get the
# transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
# to a separate file.
# Next step is to find which transcripts are pseudogenes.
grep pseudogene vegav27InfoWithTransIdsUniq.txt | sort | uniq | wc -l
# found 58 pseudogenes so need to create the pseudogene track
# Get transcript IDs for pseudogenes.
grep pseudogene vegav27InfoWithTransIdsUniq.txt | awk '{print $1}' \
> pseudogenes.ids
grep -w -f pseudogenes.ids vegav27Fixed.gtf > vegav27Pseudogene.gtf
awk '{print $20}' vegav27Pseudogene.gtf | sort | uniq | wc -l
# 58
# Need to make the GTF file for the vegaGene table by subtracting pseudogenes:
grep -vw -f pseudogenes.ids vegav27Fixed.gtf > vegaGenev27.gtf
wc -l vega*gtf
# 168381 vegaGenev27.gtf
# 168602 vegav27Fixed.gtf
# 221 vegav27Pseudogene.gtf
# Need to relabel IDs to get the name to be the otter transcript ID
# and name2 to be the transcript_id (which needs to be labeled as gene_id).
# Also, relabel the otter_transcript_id to be transcript_id as ldHgGene
# groups the rows by this ID.
sed -e 's/gene_id/tmp_id/' vegaGenev27.gtf \
| sed -e 's/transcript_id/gene_id/' \
| sed -e 's/otter_transcript_id/transcript_id/' > vegaGenev27Format.gtf
# remove an extra line
grep -v "rachels_data" vegaGenev27Format.gtf > tmp
mv tmp vegaGenev27Format.gtf
# Do the same for the pseudogene GTF file:
sed -e 's/gene_id/tmp_id/' vegav27Pseudogene.gtf \
| sed -e 's/transcript_id/gene_id/' \
| sed -e 's/otter_transcript_id/transcript_id/' \
> vegaPseudogenev27Format.gtf
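# The net effect of the relabeling on one (hypothetical) attribute list:
#   before: gene_id "A"; transcript_id "B"; otter_transcript_id "OTTDART...";
#   after:  tmp_id "A"; gene_id "B"; transcript_id "OTTDART...";
# so the rows group by the otter transcript ID and name2 picks up the
# original transcript_id. Eyeball one line to confirm:
head -1 vegaGenev27Format.gtf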
gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
vegaGenev27Format.gtf vegaGenev27.gp
genePredCheck vegaGenev27.gp
# genepred ok checked: 12761 failed: 0
gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
vegaPseudogenev27Format.gtf vegaPseudoGenev27.gp
genePredCheck vegaPseudoGenev27.gp
# genepred ok checked: 58 failed: 0
# load GTF files for Vega genes and pseudogenes:
ssh hgwdev
cd /cluster/data/danRer5/bed/vega27
# load vegaGene table
hgLoadGenePred -genePredExt danRer5 vegaGene vegaGenev27.gp
# load vegaPseudoGene table
hgLoadGenePred -genePredExt danRer5 vegaPseudoGene vegaPseudoGenev27.gp
hgsql -N -e 'select distinct(chrom) from vegaGene;' danRer5
# There are 36: chr1-25 plus these 11 annotated scaffolds:
# Zv7_scaffold2491
# Zv7_scaffold2498
# Zv7_scaffold2509
# Zv7_scaffold2516
# Zv7_scaffold2523
# Zv7_scaffold2533
# Zv7_scaffold2537
# Zv7_scaffold2551
# Zv7_scaffold2560
# Zv7_scaffold2633
# Zv7_scaffold2650
hgsql -N -e 'select distinct(chrom) from vegaPseudoGene;' danRer5
# 12 chroms and 1 scaffold have pseudogenes annotated
# chr2, chr3, chr4, chr5, chr9, chr12, chr13, chr18, chr19, chr20, chr24,
# Zv7_scaffold2509
# Only finished sequence is annotated by Vega
# Vega information tables:
# mySQL table definition and autosql-generated files created previously
# for zebrafish-specific information (vegaInfoZfish).
# Add clone_id to a separate table instead of this one. A tab-separated
# file of transcript ID and clone ID was provided by Tina Eyre
# (te3@sanger.ac.uk) at Sanger.
# Need to grep for the transcript IDs
# created a second table for the cloneId accessions since there
# are multiple ids for some VEGA genes. Otherwise, there would be
# a comma separated list in this field or many rows repeated but just
# different in the cloneId field. Associate transcript ID to clone IDs.
# Table definitions are in danRer4.txt.
# first check that the clone IDs file is complete.
# this is from Tina Eyre sent on 2007-09-07 and the file is
# vegav27_transcript_to_clone.txt.zip
ssh kkstore06
cd /cluster/data/danRer5/bed/vega27
unzip vegav27_transcript_to_clone.txt.zip
awk '{print $1}' vegav27_transcript_to_clone.txt | sort | uniq \
> transWithCloneId.txt
wc -l transWithCloneId.txt
# 12819 transWithCloneId.txt
awk '{print $1}' vegav27InfoWithTransIdsUniq.txt | sort | uniq \
> vegaInfoTransIds.txt
wc -l vegaInfoTransIds.txt
# 12819 vegaInfoTransIds.txt
# compare to vega27OtterIds.txt
comm -12 transWithCloneId.txt vegaInfoTransIds.txt | wc -l
# 12819
# so all the transcript IDs are in this list.
# sort this table:
sort vegav27_transcript_to_clone.txt | uniq > vegav27TransToClone.txt
# load these tables:
ssh hgwdev
cd /cluster/data/danRer5/bed/vega27
hgLoadSqlTab danRer5 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
vegav27InfoWithTransIdsUniq.txt
hgLoadSqlTab danRer5 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \
vegav27TransToClone.txt
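# Some transcripts map to several clones; a quick look at the one-to-many
# rows (a sketch - column names assumed to match the danRer4 definitions):
hgsql -N -e 'select transcriptId, count(*) from vegaToCloneId
    group by transcriptId having count(*) > 1 limit 5;' danRer5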
# Add trackDb.ra entry. Keep Vega Genes and Vega Pseudogenes separate as for
# human and add version number to description.
###########################################################################
# BLASTZ FOR MEDAKA (oryLat1) (DONE, 2007-09-10, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# No lineage-specific repeats for this species pair.
# Medaka also has no species-specific repeats in the RepeatMasker
# library so run this using dynamic masking.
ssh kkstore06
mkdir /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
cd /cluster/data/danRer5/bed
ln -s blastz.oryLat1.2007-09-10 blastz.oryLat1
cd /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
# Use Blastz parameters as for tetNig1 in danRer4.txt. Using scaffolds makes this run
# slower so it is best to have the scaffolds in the query. Use HoxD55.q
# matrix as medaka is quite distant from zebrafish. Blastz uses
# lineage-specific repeats but there are none for these two species.
# Use soft-masked scaffolds for oryLat1 (and dynamic masking) and also use
# windowMasker plus SDust repeats since there are no species-specific
# RepeatMasker repeats for medaka. The random scaffolds were used in Blastz
# separately (oryLat1UnScaffolds.2bit) rather than as an artificial unordered
# chromosome to avoid getting Blastz alignments across non-contiguous
# scaffolds.
cat << '_EOF_' > DEF
# zebrafish (danRer5) vs. Medaka (oryLat1)
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - zebrafish (danRer5)
SEQ1_DIR=/san/sanvol1/scratch/danRer5/danRer5.2bit
SEQ1_LEN=/cluster/data/danRer5/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - Medaka (oryLat1)
# (40M chunks cover the largest chroms in one gulp)
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=40000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
## use screen
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust \
-blastzOutRoot /san/sanvol1/scratch/danRer5OryLat1 \
`pwd`/DEF >& doBlastz.log &
# 0.154u 0.096s 6:13:59.67 0.0% 0+0k 0+0io 4pf+0w
# TEST PARAMETERS:
# RUN 1 - parameters above used for fugu queried against zebrafish before.
# BLASTZ_H=2500
# BLASTZ_M=50
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
#
# TRY WITH PARAMETERS USED FOR DANRER4 - Run 2:
# use parameters for oryLat1 in danRer4.txt. Using scaffolds makes this run
# slower so it is best to have the scaffolds in the query. Use HoxD55.q
# matrix as medaka is quite distant from zebrafish. Blastz uses
# lineage-specific repeats but there are none for these two species.
# Use soft-masked scaffolds and dynamic masking.
# zebrafish (danRer5) vs. Medaka (oryLat1)
# BLASTZ=blastz.v7.x86_64
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_M=50
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# Run 1:
featureBits danRer5 refGene:cds chainOryLat1Run1Link -enrichment
# refGene:cds 1.019%, chainOryLat1Run1Link 10.478%, both 0.895%, cover 87.79%,
# enrich 8.38x
# Run 2:
featureBits danRer5 refGene:cds chainOryLat1Link -enrichment
# refGene:cds 1.019%, chainOryLat1Link 11.164%, both 0.894%, cover 87.77%,
# enrich 7.86x
# Chains danRer4, parameters like Run 2:
featureBits danRer4 refGene:cds chainOryLat1Link -enrichment
# refGene:cds 0.952%, chainOryLat1Link 12.366%, both 0.813%, cover 85.43%,
# enrich 6.91x
# Run 1:
hgsql -e 'select count(*) from chainOryLat1Run1Link;' danRer5
+----------+
| count(*) |
+----------+
| 32786056 |
+----------+
# Run 2:
hgsql -e 'select count(*) from chainOryLat1Link;' danRer5
+----------+
| count(*) |
+----------+
| 46341892 |
+----------+
# Run 1:
hgsql -e 'select count(*) from chainOryLat1Run1;' danRer5
+----------+
| count(*) |
+----------+
| 2197571 |
+----------+
# Run 2:
hgsql -e 'select count(*) from chainOryLat1;' danRer5
+----------+
| count(*) |
+----------+
| 2961333 |
+----------+
# Nets Run 1:
featureBits danRer5 refGene:cds netOryLat1Run1 -enrichment
# refGene:cds 1.019%, netOryLat1Run1 65.343%, both 0.977%, cover 95.85%,
# enrich 1.47x
# Nets Run 2:
featureBits danRer5 refGene:cds netOryLat1 -enrichment
# refGene:cds 1.019%, netOryLat1 66.238%, both 0.974%, cover 95.63%, enrich
# 1.44x
# Nets danRer4, parameters like Run 2:
featureBits danRer4 refGene:cds netOryLat1 -enrichment
# refGene:cds 0.952%, netOryLat1 64.174%, both 0.888%, cover 93.29%, enrich
# 1.45x
# [1] Exit 25
# /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk
# 0.188u 0.132s 7:33:05.44 0.0% 0+0k 0+0io 3pf+0w
# Command failed:
# ssh -x hgwdev nice
# /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10/axtChain/installDownloads.csh
ssh hgwdev
# remove downloads and run install downloads again
cd /usr/local/apache/htdocs/goldenPath/danRer5/
rm -r vsOryLat1
rm -r liftOver
ssh kkstore06
cd /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
# Run script again starting at download step
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust -continue download \
-blastzOutRoot /san/sanvol1/scratch/danRer5OryLat1 \
`pwd`/DEF >& doBlastz2.log &
# 0.095u 0.047s 17:35.63 0.0% 0+0k 0+0io 0pf+0w
## DO BLASTZ SWAP TO CREATE THE DANRER5 CHAINS, NETS, AXTNET, MAFNET AND
# ALIGNMENT DOWNLOADS FOR DANRER5 ON ORYLAT1 - documented in oryLat1.txt
###########################################################################
# SPLIT MASKED SEQUENCE INTO CHROM AND SCAFFOLD FASTA FILES
# (DONE, 2007-09-10, hartera)
ssh kkstore06
cd /cluster/data/danRer5/
mkdir splitSoftMask
cd splitSoftMask
twoBitToFa ../danRer5.2bit danRer5.softMasked.fa
faSplit byname danRer5.softMasked.fa .
rm danRer5.softMasked.fa
# Copy the chrom and scaffold sequences to /san/sanvol1/scratch/danRer5
mkdir -p /san/sanvol1/scratch/danRer5/splitSoftMask
foreach f (/cluster/data/danRer5/splitSoftMask/*.fa)
cp -p $f /san/sanvol1/scratch/danRer5/splitSoftMask/
end
foreach f (/san/sanvol1/scratch/danRer5/splitSoftMask/*.fa)
ls $f >> seq.lst
end
wc -l seq.lst
# 5036 seq.lst
# correct: 25 chroms, 1 chrM, and 5010 unmapped scaffolds.
rm seq.lst
# Copy the split sequences to iscratch dir on kkr1u00
ssh kkr1u00
rm -r /iscratch/i/danRer5/splitSoftMask
mkdir -p /iscratch/i/danRer5/splitSoftMask
foreach s (/cluster/data/danRer5/splitSoftMask/*.fa)
echo "Copying $s ..."
cp $s /iscratch/i/danRer5/splitSoftMask/
end
# Count files transferred
foreach f (/iscratch/i/danRer5/splitSoftMask/*.fa)
ls $f >> seq.lst
end
wc -l seq.lst
# 5036 seq.lst
# correct: 25 chroms, 1 chrM, and 5010 unmapped scaffolds.
rm seq.lst
# rsync to cluster machines
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer5/ kkr${R}u00:/iscratch/i/danRer5/
end
###########################################################################
# AFFY ZEBRAFISH GENOME ARRAY CHIP TRACK (DONE, 2007-09-10 - 2007-09-12)
# With the first version of this track on danRer4, QA found a number of short
# alignments of <= 30 bp and there are a number in the <= 50bp range.
# These do not seem to be meaningful so filtering was changed to try to
# remove these alignments while retaining meaningful alignments.
# pslCDnaFilter was used with the same settings as used for the
# Genbank EST alignments for zebrafish.
# Also use -minIdentity=90 for Blat instead of -minIdentity=95 since the
# higher minIdentity causes alignments to be dropped that should not be.
# Blat's minIdentity seems to be more severe than that for pslReps or
# pslCDnaFilter as it takes insertions and deletions into account.
# These are Jim's recommendations.
# Load both consensus and target sequences to form a composite track. Users
# have asked for Affy target sequences in the past.
# Array chip consensus sequences already downloaded for danRer1
ssh hgwdev
cd /projects/compbio/data/microarray/affyZebrafish
mkdir -p /san/sanvol1/scratch/affy
# Affy Zebrafish consensus sequences already in this directory:
cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/san/sanvol1/scratch/affy/
# Set up cluster job to align Zebrafish consensus sequences to danRer5
mkdir -p /cluster/data/danRer5/bed/affyZebrafish.2007-09-10
ln -s /cluster/data/danRer5/bed/affyZebrafish.2007-09-10 \
/cluster/data/danRer5/bed/affyZebrafish
ssh pk
cd /cluster/data/danRer5/bed/affyZebrafish
ls -1 /san/sanvol1/scratch/affy/Zebrafish_consensus.fa > affy.lst
foreach f (/san/sanvol1/scratch/danRer5/splitSoftMask/*.fa)
ls -1 $f >> genome.lst
end
wc -l genome.lst
# 5036 genome.lst
# for output:
mkdir -p /san/sanvol1/scratch/danRer5/affy/psl
# use -repeats option to report matches to repeat bases separately
# from other matches in the PSL output.
echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90
-ooc=/san/sanvol1/scratch/danRer5/danRer5_11.ooc $(path1) $(path2) {check out
line+ /san/sanvol1/scratch/danRer5/affy/psl/$(root1)_$(root2).psl}\n#ENDLOOP'
> template.sub
/parasol/bin/gensub2 genome.lst affy.lst template.sub para.spec
/parasol/bin/para create para.spec
# 5036 jobs written to batch
/parasol/bin/para try, check, push ... etc.
/parasol/bin/para time
# Completed: 5036 of 5036 jobs
#CPU time in finished jobs: 22117s 368.61m 6.14h 0.26d 0.001 y
#IO & Wait Time: 14290s 238.17m 3.97h 0.17d 0.000 y
#Average job time: 7s 0.12m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 840s 14.00m 0.23h 0.01d
#Submission to last job: 988s 16.47m 0.27h 0.01d
# need to do pslSort
ssh pk
cd /san/sanvol1/scratch/danRer5/affy
# Do sort and then best in genome filter.
# only use alignments that have at least
# 95% identity in aligned region.
# Previously did not use minCover since a lot of sequence is in
# Un and NA so genes may be split up, and it is good to see all alignments.
# However, found a number of short alignments of <= 50 bp. These are
# not meaningful so maybe need to use minCover. If increased too much,
# then hits on poor parts of the assembly will be missed.
# use pslCDnaFilter with the same parameters as used for zebrafish
# Genbank EST alignments.
pslSort dirs raw.psl tmp psl
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl \
affyZebrafishConsensus.psl
# seqs aligns
# total: 15295 459469
# drop minNonRepSize: 2791 391655
# drop minIdent: 2590 30197
# drop minCover: 1712 7112
# weird over: 237 1038
# kept weird: 176 228
# drop localBest: 1393 13532
# kept: 14971 16973
# 96.6% of sequences aligned. There are 15502 Affy consensus sequences.
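# the percentage above (a sketch, using the same calc utility as elsewhere
# in this doc):
calc 14971/15502
# expect ~0.966, i.e. the 96.6% quoted above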
# FOR DANRER4:
# seqs aligns
# total: 15272 828202
#drop minNonRepSize: 2763 741674
# drop minIdent: 2656 39188
# drop minCover: 2550 10784
# weird over: 359 1439
# kept weird: 277 347
# drop localBest: 2830 17737
# kept: 14952 18819
# Kept 97.9% of the aligned sequences. There are 15502 Affy sequences in
# total so 96.5% of them remain aligned after filtering.
# rsync these psl files
rsync -a --progress /san/sanvol1/scratch/danRer5/affy/*.psl \
/cluster/data/danRer5/bed/affyZebrafish/
ssh kkstore06
cd /cluster/data/danRer5/bed/affyZebrafish
# shorten names in psl file
sed -e 's/Zebrafish://' affyZebrafishConsensus.psl > tmp.psl
mv tmp.psl affyZebrafishConsensus.psl
pslCheck affyZebrafishConsensus.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer5/bed/affyZebrafish
hgLoadPsl danRer5 affyZebrafishConsensus.psl
# Add consensus sequences for Zebrafish chip
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/gbdb/hgFixed/affyProbes
# these sequences were loaded previously so no need to reload.
hgLoadSeq -abbr=Zebrafish: danRer5 \
/gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
# Clean up
rm batch.bak raw.psl
# check number of short alignments:
hgsql -e \
'select count(*) from affyZebrafishConsensus where (qEnd - qStart) <= 50;'\
danRer5
# 6
# There were 7 for danRer4 - this is ok.
### TARGET SEQUENCES SUBTRACK: (2007-09-11 - 2007-09-12)
### Create subtrack of target sequences; this is just the region of the
# consensus sequence that is used for designing the probesets.
# Array chip target sequences downloaded from
# http://www.affymetrix.com/support/technical/byproduct.affx?product=zebrafish
ssh hgwdev
# copy sequences to /projects/compbio/data/microarray/affyZebrafish
cd /projects/compbio/data/microarray/affyZebrafish
unzip Zebrafish_target.zip
grep '>' Zebrafish_target > target.headers
wc -l target.headers
# 15617 target.headers
# includes the control sequences
# remove these
grep -v "AFFX" target.headers > targetNoControls.headers
wc -l targetNoControls.headers
# 15502 targetNoControls.headers
mv Zebrafish_target Zebrafish_target.fa
grep '>' Zebrafish_target.fa | wc -l
# 15617 sequences
awk 'BEGIN {FS=";"} {print $1;}' targetNoControls.headers \
| sed -e 's/>//' > targetNoControls.ids
wc -l *.ids
# 15502 targetNoControls.ids
# remove semi-colon after name in the target FASTA file (edit in place
# since the file was already renamed to Zebrafish_target.fa above):
perl -pi.bak -e 's/;//' Zebrafish_target.fa
faSomeRecords Zebrafish_target.fa targetNoControls.ids \
Zebrafish_targetNoControls.fa
grep '>' Zebrafish_targetNoControls.fa | wc -l
# 15502
# Remove "Zebrafish:" from name, leave in "tg:" to differentiate
# sequences from the consensus ones.
perl -pi.bak -e 's/>target:Zebrafish:/>tg:/' Zebrafish_targetNoControls.fa
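# e.g. a header like ">target:Zebrafish:Dr.1.1.S1_at" (hypothetical
# probeset name) becomes ">tg:Dr.1.1.S1_at". Spot-check:
grep '^>' Zebrafish_targetNoControls.fa | head -3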
# make affy directory on the san if it does not exist already:
mkdir -p /san/sanvol1/scratch/affy
cp -p \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_targetNoControls.fa \
/san/sanvol1/scratch/affy/
# Set up cluster job to align Zebrafish target sequences to danRer5
# Directories below were set up above
# mkdir -p /cluster/data/danRer5/bed/affyZebrafish.2007-09-10
# ln -s /cluster/data/danRer5/bed/affyZebrafish.2007-09-10 \
# /cluster/data/danRer5/bed/affyZebrafish
mkdir -p /cluster/data/danRer5/bed/affyZebrafish.2007-09-10/target
ssh pk
cd /cluster/data/danRer5/bed/affyZebrafish/target
ls -1 /san/sanvol1/scratch/affy/Zebrafish_targetNoControls.fa \
> affyTarget.lst
foreach f (/san/sanvol1/scratch/danRer5/splitSoftMask/*.fa)
ls -1 $f >> genome.lst
end
wc -l genome.lst
# 5036 genome.lst
# for output:
mkdir -p /san/sanvol1/scratch/danRer5/affy/pslTarget
# use -repeats option to report matches to repeat bases separately
# from other matches in the PSL output.
echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90
-ooc=/san/sanvol1/scratch/danRer5/danRer5_11.ooc $(path1) $(path2) {check out
line+ /san/sanvol1/scratch/danRer5/affy/pslTarget/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
/parasol/bin/gensub2 genome.lst affyTarget.lst template.sub para.spec
/parasol/bin/para create para.spec
# 5036 jobs written to batch
/parasol/bin/para try, check, push ... etc.
# Keeps crashing - can't write to directory - permissions look ok.
# Tried again (2007-09-12, hartera) and it worked fine.
/parasol/bin/para time
# Completed: 5036 of 5036 jobs
#CPU time in finished jobs: 17604s 293.40m 4.89h 0.20d 0.001y
#IO & Wait Time: 14325s 238.75m 3.98h 0.17d 0.000 y
#Average job time: 6s 0.11m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 767s 12.78m 0.21h 0.01d
#Submission to last job: 842s 14.03m 0.23h 0.01d
# need to do pslSort
ssh pk
cd /san/sanvol1/scratch/danRer5/affy
# Do sort and then best in genome filter.
pslSort dirs rawTarget.psl tmp pslTarget
# only use alignments that have at least 95% identity in aligned region.
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 rawTarget.psl \
affyZebrafishTarget.psl
# seqs aligns
# total: 15216 308151
# drop invalid: 1 1
# drop minNonRepSize: 2051 256676
# drop minIdent: 1917 20458
# drop minCover: 741 2951
# weird over: 143 607
# kept weird: 92 116
# drop localBest: 1306 11562
# kept: 14820 16503
# 95.6% of sequences aligned. There are 15502 Affy target sequences.
# rsync these psl files
rsync -a --progress /san/sanvol1/scratch/danRer5/affy/*Target.psl \
/cluster/data/danRer5/bed/affyZebrafish/target/
ssh kkstore06
cd /cluster/data/danRer5/bed/affyZebrafish/target
# no need to shorten names in this psl file - the target sequence names
# were already shortened to the "tg:" prefix before alignment.
pslCheck affyZebrafishTarget.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer5/bed/affyZebrafish/target
hgLoadPsl danRer5 affyZebrafishTarget.psl
# Add target sequences for Zebrafish chip
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_targetNoControls.fa \
/gbdb/hgFixed/affyProbes
# these sequences were loaded previously so no need to reload.
hgLoadSeq danRer5 \
/gbdb/hgFixed/affyProbes/Zebrafish_targetNoControls.fa
# Clean up
rm batch.bak rawTarget.psl
# check number of short alignments:
hgsql -e \
'select count(*) from affyZebrafishTarget where (qEnd - qStart) <= 50;'\
danRer5
# 33
# This is ok, these are shorter regions anyway.
# Added trackDb.ra entry to zebrafish/danRer5/trackDb.ra. This is
# a composite track consisting of 2 subtracks for the consensus
# sequence Blat alignments and the target sequence Blat alignments.
# The composite track is called affyZebrafish. Add searches for both
# tracks.
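# A rough sketch of the composite structure (abbreviated, from memory of
# the trackDb conventions of the time - not the exact entry committed):
#   track affyZebrafish
#   compositeTrack on
#   ...
#     track affyZebrafishConsensus
#     subTrack affyZebrafish
#     track affyZebrafishTarget
#     subTrack affyZebrafish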
###########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE, 2007-09-11, hartera)
# ADDED A CHROMOSOMES DOWNLOAD DIRECTORY (DONE, 2007-09-14, hartera)
ssh hgwdev
cd /cluster/data/danRer5
ln -s /cluster/data/danRer5/bed/RepeatMasker.2007-09-04/danRer5.fa.out \
/cluster/data/danRer5
/cluster/bin/scripts/makeDownloads.pl danRer5 -verbose=2 \
>& jkStuff/downloads.log &
tail -f jkStuff/downloads.log
# Edit README.txt files when done:
# *** All done!
# *** Please take a look at the downloads for danRer5 using a web browser.
# *** Edit each README.txt to resolve any notes marked with "***":
# /cluster/data/danRer5/goldenPath/database/README.txt
# /cluster/data/danRer5/goldenPath/bigZips/README.txt
# (The htdocs/goldenPath/danRer5/*/README.txt "files" are just links to
# those.)
# ADD CHROMOSOMES DOWNLOADS DIRECTORY
# Add chromosome and scaffolds downloads, these are already in
# /cluster/data/danRer5/splitSoftMask (2007-09-14, hartera)
# compress files:
ssh kkstore06
mkdir /cluster/data/danRer5/goldenPath/chromosomes
cd /cluster/data/danRer5/splitSoftMask
foreach c (chr*.fa)
gzip $c
end
# now scaffolds, compress together into one file
foreach s (Zv7_*.fa)
gzip -c $s >> unmappedScaffolds.fa.gz
end
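# concatenated gzip members decompress as a single stream, so the combined
# file can be checked directly (sketch):
zcat unmappedScaffolds.fa.gz | grep -c '>'
# expect 5010 (one record per unmapped scaffold)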
mv *.gz /cluster/data/danRer5/goldenPath/chromosomes/
ssh hgwdev
# make symbolic links to the goldenPath downloads directory for danRer5
mkdir /usr/local/apache/htdocs/goldenPath/danRer5/chromosomes
ln -s /cluster/data/danRer5/goldenPath/chromosomes/*.gz \
/usr/local/apache/htdocs/goldenPath/danRer5/chromosomes/
cd /usr/local/apache/htdocs/goldenPath/danRer5/chromosomes
md5sum *.gz > md5sum.txt
cp /usr/local/apache/htdocs/goldenPath/danRer4/chromosomes/README.txt \
/cluster/data/danRer5/goldenPath/chromosomes/
# edit README.txt to make it specific for the danRer5 (Sanger Zv7)
# assembly and then link it to the downloads directory.
ln -s /cluster/data/danRer5/goldenPath/chromosomes/README.txt \
/usr/local/apache/htdocs/goldenPath/danRer5/chromosomes/
########################################################################
# BLASTZ Swap Mouse mm9 (DONE - 2007-09-12 - Hiram)
ssh kkstore06
# the original
cd /cluster/data/mm9/bed/blastzDanRer5.2007-09-11
cat fb.mm9.chainDanRer5Link.txt
# 48497464 bases of 2620346127 (1.851%) in intersection
# the swap
mkdir /cluster/data/danRer5/bed/blastz.mm9.swap
cd /cluster/data/danRer5/bed/blastz.mm9.swap
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-chainMinScore=5000 \
/cluster/data/mm9/bed/blastzDanRer5.2007-09-11/DEF \
-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> swap.log 2>&1 &
# real 9m47.163s
cat fb.danRer5.chainMm9Link.txt
# 34017483 bases of 1435609608 (2.370%) in intersection
########################################################################
# ADD CONTIGS TRACK (DONE, 2007-09-14, hartera)
# make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from
# chunks (contigs) agp files.
ssh kkstore06
mkdir -p /cluster/data/danRer5/bed/ctgPos2
# get chrM information from danRer4:
ssh hgwdev
cd /cluster/data/danRer5/bed/ctgPos2
hgsql -N -e 'select * from ctgPos2 where chrom = "chrM";' danRer4 \
> chrM.ctgPos2.txt
ssh kkstore06
cd /cluster/data/danRer5/bed/ctgPos2
# update chrM accession to NC_002333.2
perl -pi.bak -e 's/AC024175\.3/NC_002333\.2/' chrM.ctgPos2.txt
# ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc
# get the contigs for the chroms, chr1-25:
awk 'BEGIN {OFS="\t"} \
{if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
/cluster/data/danRer5/Zv7release/Zv7_chr.agp >> ctgPos2.tab
# Add chrM to ctgPos2 file:
cat chrM.ctgPos2.txt >> ctgPos2.tab
# then do the same for the unmapped scaffolds:
awk 'BEGIN {OFS="\t"} \
{if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
/cluster/data/danRer5/Zv7release/randoms/randomsScaffold.agp \
>> ctgPos2.tab
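# The awk above shifts the 1-based AGP start to a 0-based chromStart.
# On a hypothetical AGP line:
#   chr1  1  40000  1  F  CR792456.7  1  40000  +
# the resulting ctgPos2 row is:
#   CR792456.7  40000  chr1  0  40000  F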
# load the ctgPos2 table
ssh hgwdev
cd /cluster/data/danRer5/bed/ctgPos2
# use hgLoadSqlTab as it gives more error messages than using
# "load data local infile ...".
hgLoadSqlTab danRer5 ctgPos2 \
~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab
# check that all chroms and scaffolds are there
hgsql -N -e 'select distinct(chrom) from ctgPos2;' danRer5 \
| sort > chromsAndScafsCtgPos2.txt
hgsql -N -e 'select distinct(chrom) from chromInfo;' danRer5 \
| sort > chromsAndScafsChromInfo.txt
wc -l chroms*
# 5036 chromsAndScafsChromInfo.txt
# 5036 chromsAndScafsCtgPos2.txt
comm -12 chromsAndScafsCtgPos2.txt chromsAndScafsChromInfo.txt | wc -l
# 5036
# So all chroms and unmapped scaffolds are represented in the table.
# cleanup
ssh kkstore06
cd /cluster/data/danRer5/bed/ctgPos2
rm *.bak chromsAndScafs*
# create trackDb.ra entry and html page for ctgPos2 track.
# add search for the track and make sure the termRegex will handle
# contigs named "Zv7_scaffoldN.N" where N is an integer and all the
# accessions in the *.agp files.
###########################################################################
# CREATE MICROARRAY DATA TRACK BY ADDING ZON LAB WILD TYPE MICROARRAY DATA TO
# AFFY ZEBRAFISH ALIGNMENTS (DONE, 2007-09-16, hartera)
# Array data is for whole embryos of five wild type zebrafish strains.
# Data is in hgFixed (see hgFixed.doc) - from Len Zon's lab at Children's
# Hospital Boston. Contact: adibiase@enders.tch.harvard.edu
# see danRer4.txt and hgFixed.txt for the history of this track and the
# expression data.
ssh hgwdev
mkdir -p /cluster/data/danRer5/bed/ZonLab/wtArray
cd /cluster/data/danRer5/bed/ZonLab/wtArray
# Map the data to the consensus sequence alignments first.
# use AllRatio table for mapping. There are not many arrays in this
# dataset so using AllRatio will allow the selection of All Arrays
# from the track controls on the track description page. Also set up the
# Zebrafish microarrayGroups.ra so that the Medians of replicates or
# Means of replicates can also be selected for display.
# Create mapped data in zebrafishZonWTAffyConsensus.bed.
hgMapMicroarray zebrafishZonWTAffyConsensus.bed \
hgFixed.zebrafishZonWTAllRatio \
/cluster/data/danRer5/bed/affyZebrafish/affyZebrafishConsensus.psl
# Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTAllRatio
# Mapped 14971, multiply-mapped 2002, missed 0, unmapped 646
hgLoadBed danRer5 affyZonWildType zebrafishZonWTAffyConsensus.bed
# Loaded 16973 elements of size 15
# then for the Affy Zebrafish chip target sequences, map these to the
# Wild Type expression data
hgMapMicroarray -pslPrefix="tg:" zebrafishZonWTAffyTarget.bed \
hgFixed.zebrafishZonWTAllRatio \
/cluster/data/danRer5/bed/affyZebrafish/target/affyZebrafishTarget.psl
# Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTAllRatio
# Mapped 14820, multiply-mapped 1683, missed 0, unmapped 797
hgLoadBed danRer5 affyTargetZonWildType zebrafishZonWTAffyTarget.bed
# Loaded 16503 elements of size 15
# add trackDb.ra entry at trackDb/zebrafish/danRer5 level to create
# a composite affyZonWildType with alignments and array data for both
# Affy Zebrafish chip consensus sequences and target sequences - see
# danRer4.txt for more details of history of trackDb.ra and
# microarrayGroups.ra entries for this data.
###########################################################################
# BLASTZ FOR TETRAODON (tetNig1) (DONE, 2007-09-16, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# No lineage-specific repeats for this species pair.
# Tetraodon also has no species-specific repeats in the RepeatMasker
# library so run this using dynamic masking.
# The tetraodon 2bit file of scaffolds was used
# (tetNig1.randomContigs.sdTrf.2bit) - this contains sequences for
# scaffolds from randoms. These were used for Blastz rather than the
# random chroms so that alignments do not occur across non-contiguous
# scaffolds.
## use screen to run
ssh kkstore06
mkdir /cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16
cd /cluster/data/danRer5/bed
ln -s blastz.tetNig1.2007-09-16 blastz.tetNig1
cd /cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16
# Use Blastz parameters for tetNig1 in danRer4.txt. Using scaffolds makes this run
# slower so it is best to have the scaffolds in the query. Use HoxD55.q
# matrix as tetraodon is quite distant from zebrafish. Blastz uses
# lineage-specific repeats but there are none for these two species.
# Use soft-masked scaffolds for tetNig1 (and dynamic masking) and also use
# windowMasker plus SDust repeats since there are no species-specific
# RepeatMasker repeats for tetraodon.
cat << '_EOF_' > DEF
# zebrafish (danRer5) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - zebrafish (danRer5)
SEQ1_DIR=/san/sanvol1/scratch/danRer5/danRer5.2bit
SEQ1_LEN=/cluster/data/danRer5/chrom.sizes
SEQ1_LIMIT=30
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - Tetraodon (tetNig1)
# soft-masked chroms and random scaffolds in 2bit format
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.lift
SEQ2_CHUNK=1000000000
SEQ2_LAP=0
BASE=/cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
mkdir /san/sanvol1/scratch/danRer5TetNig1
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust \
-blastzOutRoot /san/sanvol1/scratch/danRer5TetNig1 \
`pwd`/DEF >& doBlastz.log &
# Took about 5 hours 7 minutes to run.
featureBits danRer5 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 1.019%, chainTetNig1Link 6.359%, both 0.866%, cover 84.94%,
# enrich 13.36x
featureBits danRer4 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.953%, chainTetNig1Link 7.345%, both 0.836%, cover 87.72%,
# enrich 11.94x
featureBits danRer5 refGene:cds netTetNig1 -enrichment
# refGene:cds 1.019%, netTetNig1 63.685%, both 0.961%, cover 94.30%, enrich
# 1.48x
featureBits danRer4 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.953%, netTetNig1 63.389%, both 0.909%, cover 95.41%, enrich
# 1.51x
## DO BLASTZ SWAP TO CREATE THE DANRER5 CHAINS, NETS, AXTNET, MAFNET AND
# ALIGNMENT DOWNLOAD FOR DANRER5 ON TETNIG1 - documented in tetNig1.txt
###########################################################################
# BLASTZ FOR FUGU (fr2) (DONE, 2007-09-18 - 2007-09-19, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# No lineage-specific repeats for this species pair.
# Fugu also has no species-specific repeats in the RepeatMasker
# library so run this using dynamic masking.
# Use the Fugu 2bit file of scaffolds (fr2.scaffolds.2bit) - this
# contains sequences for all the scaffolds of chrUn. The whole assembly
# is unmapped, unordered scaffolds. The individual scaffolds were
# used for Blastz rather than the random chroms so that alignments do not
# occur across non-contiguous scaffolds.
## use screen to run
ssh kkstore06
mkdir /cluster/data/danRer5/bed/blastz.fr2.2007-09-18
cd /cluster/data/danRer5/bed
ln -s blastz.fr2.2007-09-18 blastz.fr2
cd /cluster/data/danRer5/bed/blastz.fr2.2007-09-18
# Use Blastz parameters for fr1 in danRer4.txt. Using scaffolds makes this run
# slower so it is best to have the scaffolds in the query. Use HoxD55.q
# matrix as Fugu is quite distant from zebrafish. Blastz uses
# lineage-specific repeats but there are none for these two species.
# Use soft-masked scaffolds for fr2 (and dynamic masking) and also use
# windowMasker plus SDust repeats since there are no species-specific
# RepeatMasker repeats for Fugu. Fugu is about 400.5 Mb of scaffolds plus
# chrM which is about 16.4 kb.
cat << '_EOF_' > DEF
# zebrafish (danRer5) vs. Fugu (fr2)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - zebrafish (danRer5)
SEQ1_DIR=/san/sanvol1/scratch/danRer5/danRer5.2bit
SEQ1_LEN=/cluster/data/danRer5/chrom.sizes
SEQ1_LIMIT=30
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - Fugu (fr2)
# soft-masked chroms and random scaffolds in 2bit format
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/danRer5/bed/blastz.fr2.2007-09-18
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
mkdir /san/sanvol1/scratch/danRer5Fr2
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust \
-blastzOutRoot /san/sanvol1/scratch/danRer5Fr2 \
`pwd`/DEF >& doBlastz.log &
# Took about 6 hours and 50 minutes to run.
cat fb.danRer5.chainFr2Link.txt
# 104476108 bases of 1435609608 (7.277%) in intersection
# Compare coverage to refGene CDS:
featureBits danRer5 refGene:cds chainFr2Link -enrichment
# refGene:cds 1.019%, chainFr2Link 7.277%, both 0.886%, cover 86.93%,
# enrich 11.95x
featureBits danRer4 refGene:cds chainFr2Link -enrichment
# refGene:cds 0.955%, chainFr2Link 8.543%, both 0.810%, cover 84.73%,
# enrich 9.92x
featureBits danRer5 refGene:cds netFr2 -enrichment
# refGene:cds 1.019%, netFr2 64.210%, both 0.968%, cover 94.95%, enrich 1.48x
featureBits danRer4 refGene:cds netFr2 -enrichment
# refGene:cds 0.955%, netFr2 62.980%, both 0.885%, cover 92.68%, enrich 1.47x
## DO BLASTZ SWAP TO CREATE THE DANRER5 CHAINS, NETS, AXTNET, MAFNET AND
# ALIGNMENT DOWNLOAD FOR DANRER5 ON FR2 - documented in fr2.txt
###########################################################################
# CREATE LIFTOVER CHAINS FROM danRer5 TO danRer4
# (DONE 2007-09-19 AND 2007-09-21, hartera)
ssh kkstore06
mkdir /cluster/data/danRer5/bed/blat.danRer4
cd /cluster/data/danRer5/bed/blat.danRer4
time nice doSameSpeciesLiftOver.pl danRer5 danRer4 \
-bigClusterHub pk \
-ooc /san/sanvol1/scratch/danRer5/danRer5_11.ooc \
-buildDir /cluster/data/danRer5/bed/blat.danRer4 >& do.log &
# 0.401u 0.253s 8:09:03.62 0.0% 0+0k 0+0io 11pf+0w
# Remove symbolic link to liftOver chains and copy over the file
rm ../liftOver/danRer5ToDanRer4.over.chain.gz
cp -p danRer5ToDanRer4.over.chain.gz ../liftOver
# a link in /usr/local/apache/htdocs/goldenPath/danRer5/liftOver has
# already been made to this file and md5sum.txt needs to be updated
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer5/liftOver
md5sum *.gz > md5sum.txt
###########################################################################
# VEGA GENES UPDATE TO v28 (DONE, 2007-09-28 - 2007-10-01, hartera)
# Data provided by Tina Eyre from Sanger: te3@sanger.ac.uk
# UPDATE SENT ON 2007-09-27, Vega v28. RE-LOAD TABLES TO UPDATE THE DATA.
ssh kkstore06
mkdir -p /cluster/data/danRer5/bed/vega28/data/
cd /cluster/data/danRer5/bed/vega28/data
# 3 files: vegav28.gtf.zip vegav28_information.txt.zip
# vegav28_transcript_to_clone.txt.zip
unzip vegav28.gtf.zip
unzip vegav28_information.txt.zip
unzip vegav28_transcript_to_clone.txt.zip
# asked for the vegav28_information.txt file to be in the format
# required to load the table directly.
# Check the format then prepare data files to load database tables.
cd /cluster/data/danRer5/bed/vega28
# this time all the lines describe genes and there are no extra
# header lines or output from the program that generated the file.
# NOTE: there are no NA scaffolds.
# Remove "chr" prefix from scaffolds
sed -e 's/chrZv7_/Zv7_/' ./data/vegav28.gtf > vegav28Fixed.gtf
# vegaInfo is transcriptId, otterId, geneId, method and geneDesc
# Get otter transcript ID and otter gene ID:
awk 'BEGIN{FS=";"} {OFS="\t"} \
{if (($5 ~ /otter_gene_id/) && ($6 ~ /otter_transcript_id/)) \
print $5, $6;}' vegav28Fixed.gtf | sort | uniq > vega28OtterIds.txt
# remove the otter_gene_id and otter_transcript_id and
# extra spaces and quotation marks.
perl -pi.bak -e 's/\sotter_gene_id\s\"//;' vega28OtterIds.txt
perl -pi.bak -e 's/"\t\sotter_transcript_id\s"(OTTDART[0-9]+)"/\t$1/' \
vega28OtterIds.txt
wc -l vega28OtterIds.txt
# 14001 vega28OtterIds.txt
# there were 12819 in Vega v27
# get list of the otter gene IDs only
awk '{print $1}' vega28OtterIds.txt | sort | uniq > otterGeneIds.only
# then get list of otter gene IDs from information file
awk '{if ($1 ~ /OTTDARG/) print $1}' data/vegav28_information.txt \
| sort | uniq > infoOtterGeneIds.only
comm -12 otterGeneIds.only infoOtterGeneIds.only | wc -l
# 8489
wc -l *.only
# 10374 infoOtterGeneIds.only
# 9318 otterGeneIds.only
comm -23 otterGeneIds.only infoOtterGeneIds.only > geneIdsGtfNotInfo.txt
# also check the transcripts in transcript to clone are all there.
awk 'BEGIN {FS="\t"} {print $2;}' vega28OtterIds.txt \
| sort | uniq > otterTxIds.only
wc -l otterTxIds.only
# 14001 otterTxIds.only
awk 'BEGIN {FS="\t"} {print $1;}' ./data/vegav28_transcript_to_clone.txt \
| sort | uniq > vegaCloneTxIds.sort
wc -l vegaCloneTxIds.sort
# 14001 vegaCloneTxIds.sort
comm -12 otterTxIds.only vegaCloneTxIds.sort | wc -l
# 14001
# All otter transcript IDs in the GTF vegav28 gene file are in the
# vegav28_transcript_to_clone.txt file as expected.
wc -l geneIdsGtfNotInfo.txt
# 829 geneIdsGtfNotInfo.txt
# E-mailed Tina Eyre at Sanger - on 2007-09-28 - to ask about
# these otter gene Ids that are in the GTF file but not in the
# vegav28_information.txt file.
# Received correct file: 2007-10-01
# Check again
ssh kkstore06
cd /cluster/data/danRer5/bed/vega28/data
unzip vegav28_information.txt.zip
cd ..
# then get list of otter gene IDs from information file
awk '{if ($1 ~ /OTTDARG/) print $1}' ./data/vegav28_information.txt \
| sort | uniq > infoOtterGeneIds.only
comm -12 otterGeneIds.only infoOtterGeneIds.only | wc -l
# 9318
wc -l *.only
# 11115 infoOtterGeneIds.only
# 9318 otterGeneIds.only
# All IDs from the GTF file are contained in the information file.
# The information file contains 1797 gene IDs (11115 - 9318) that are
# not in the GTF file.
# Extra genes are included that are found on clones that do not map
# to Ensembl.
comm -13 otterGeneIds.only infoOtterGeneIds.only > geneIds.InfoOnly.txt
cat << 'EOF' > mergeTransIdAndInfo.pl
#!/usr/bin/perl -w
use strict;
my ($idsFile, $infoFile, %idsHash);
$idsFile = $ARGV[0];
$infoFile = $ARGV[1];
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
# Read in the otter gene IDs and transcript IDs
while (<IDS>)
{
my (@f, $line);
chomp;
$line = $_;
# split line on tab
@f = split(/\t/, $line);
# hash keyed by gene ID, stores transcript ID
# more than one ID so store as an array
if ($f[0] =~ /OTTDARG/)
{
# add to array in hash if it is an otter gene ID
push @{$idsHash{$f[0]}}, $f[1];
}
}
close IDS;
while (<INFO>)
{
my ($line, @info, $geneId, @transIds);
# read in information file and get the otter gene ID
$line = $_;
# split line on tab
@info = split(/\t/, $line);
if ($info[0] =~ /OTTDARG/)
{
$geneId = $info[0];
# look up transcript ID in hash
# if gene ID exists in the hash then print out the information
# file line with the correct transcript ID
if (exists($idsHash{$geneId}))
{
@transIds = @{$idsHash{$geneId}};
# rewrite line with transcript ID
foreach my $t (@transIds)
{
print "$t\t$line";
}
}
}
}
close INFO;
'EOF'
# << happy emacs
chmod +x mergeTransIdAndInfo.pl
mergeTransIdAndInfo.pl vega28OtterIds.txt ./data/vegav28_information.txt \
> vegav28InfoWithTransIds.txt
wc -l vega28OtterIds.txt
# 14001 vega28OtterIds.txt
wc -l vegav28InfoWithTransIds.txt
# 14001 vegav28InfoWithTransIds.txt
# so just sort and uniq the resulting information file with transcript IDs
sort vegav28InfoWithTransIds.txt | uniq > vegav28InfoWithTransIdsUniq.txt
wc -l vegav28InfoWithTransIdsUniq.txt
# 14001 vegav28InfoWithTransIdsUniq.txt
# So no duplicated lines.
# There are extra IDs in the information file as it includes all Vega
# genes (including clones not in Ensembl). Those actually in the Vega
# v28 GTF are those that are found on clones that are in Ensembl.
# Vega includes only clone sequence and Zv7_NA scaffolds are WGS so
# these are not annotated in Vega. This information is from
# an e-mail from Tina Eyre (te3@sanger.ac.uk) on 2007-09-07.
# then use the info file to grab those genes that are pseudogenes, get the
# transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
# to a separate file.
# Next step is to find which transcripts are pseudogenes.
grep pseudogene vegav28InfoWithTransIdsUniq.txt | sort | uniq | wc -l
# found 68 pseudogenes so need to create the pseudogene track
# Get transcript IDs for pseudogenes.
grep pseudogene vegav28InfoWithTransIdsUniq.txt | awk '{print $1}' \
> pseudogenes.ids
grep -w -f pseudogenes.ids vegav28Fixed.gtf > vegav28Pseudogene.gtf
awk '{print $20}' vegav28Pseudogene.gtf | sort | uniq | wc -l
# 68
# Need to make the GTF file for the vegaGene table by subtracting pseudogenes:
grep -vw -f pseudogenes.ids vegav28Fixed.gtf > vegaGenev28.gtf
wc -l vega*gtf
# 178987 vegaGenev28.gtf
# 179249 vegav28Fixed.gtf
# 262 vegav28Pseudogene.gtf
# Need to relabel IDs to get the name to be the otter transcript ID
# and name2 to be the transcript_id (which needs to be labeled as gene_id).
# Also, relabel the otter_transcript_id to be transcript_id as ldHgGene
# groups the rows by this ID.
sed -e 's/gene_id/tmp_id/' vegaGenev28.gtf \
| sed -e 's/transcript_id/gene_id/' \
| sed -e 's/otter_transcript_id/transcript_id/' > vegaGenev28Format.gtf
# Do the same for the pseudogene GTF file:
sed -e 's/gene_id/tmp_id/' vegav28Pseudogene.gtf \
| sed -e 's/transcript_id/gene_id/' \
| sed -e 's/otter_transcript_id/transcript_id/' \
> vegaPseudogenev28Format.gtf
gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
vegaGenev28Format.gtf vegaGenev28.gp
genePredCheck vegaGenev28.gp
# checked: 13933 failed: 0
gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
vegaPseudogenev28Format.gtf vegaPseudogenev28.gp
genePredCheck vegaPseudogenev28.gp
# checked: 68 failed: 0
# load GTF files for Vega genes and pseudogenes:
ssh hgwdev
cd /cluster/data/danRer5/bed/vega28
hgsql -e 'drop table vegaGene;' danRer5
# load vegaGene table
hgLoadGenePred -genePredExt danRer5 vegaGene vegaGenev28.gp
hgsql -e 'drop table vegaPseudoGene;' danRer5
# load vegaPseudoGene table
hgLoadGenePred -genePredExt danRer5 vegaPseudoGene vegaPseudogenev28.gp
hgsql -N -e 'select distinct(chrom) from vegaGene;' danRer5
# For vegav28 there are 29: chr1-25 plus 4 scaffolds:
# Zv7_scaffold2498
# Zv7_scaffold2509
# Zv7_scaffold2516
# Zv7_scaffold2572
# For Vega v27, there were more scaffolds annotated so e-mailed Tina Eyre
# to ask why this is the case.
# In v27 there were 36: chr1-25 plus these 11 annotated scaffolds:
# Zv7_scaffold2491
# Zv7_scaffold2498
# Zv7_scaffold2509
# Zv7_scaffold2516
# Zv7_scaffold2523
# Zv7_scaffold2533
# Zv7_scaffold2537
# Zv7_scaffold2551
# Zv7_scaffold2560
# Zv7_scaffold2633
# Zv7_scaffold2650
hgsql -N -e 'select distinct(chrom) from vegaPseudoGene;' danRer5
# 13 chroms and 1 scaffold have pseudogenes annotated
# chr2, chr3, chr4, chr5, chr9, chr12, chr13, chr18, chr19, chr20, chr22, chr24,
# Zv7_scaffold2509
# Only finished sequence is annotated by Vega
# Vega information tables:
# mySQL table definition and autosql-generated files created previously
# for zebrafish-specific information (vegaInfoZfish).
# Add clone_id to a separate table instead of this one. A tab-separated
# file of transcript ID and clone ID was provided by Tina Eyre
# (te3@sanger.ac.uk) at Sanger.
# Need to grep for the transcript IDs
# created a second table for the cloneId accessions since there
# are multiple ids for some VEGA genes. Otherwise, there would be
# a comma separated list in this field or many rows repeated but just
# different in the cloneId field. Associate transcript ID to clone IDs.
# Table definitions are in danRer4.txt.
# Load the vega to clone ID information:
ssh kkstore06
cd /cluster/data/danRer5/bed/vega28
sort ./data/vegav28_transcript_to_clone.txt | uniq \
> vegav28TransToClone.txt
# load the vegaInfo and vegaToCloneId tables:
ssh hgwdev
cd /cluster/data/danRer5/bed/vega28
hgsql -e 'drop table vegaInfoZfish;' danRer5
hgLoadSqlTab danRer5 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
vegav28InfoWithTransIdsUniq.txt
hgsql -e 'drop table vegaToCloneId;' danRer5
hgLoadSqlTab danRer5 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \
vegav28TransToClone.txt
# A peptide file was requested this time (vegav28_protein.fa) so load
# it into vegaPep:
ssh kkstore06
cd /cluster/data/danRer5/bed/vega28
# There are more peptides in the FASTA file than there are transcript IDs
# in the GTF file so get the peptide sequences for transcripts in the GTF.
# otterTxIds.only is the list of transcript IDs (14,001 IDs).
faSomeRecords ./data/vegav28_protein.fa otterTxIds.only vega28Pep.fa
grep '>' vega28Pep.fa | wc -l
# 10110
grep '>' vega28Pep.fa | sed -e 's/>//' | sort | uniq > vega28PepTxIds.txt
comm -12 vega28PepTxIds.txt otterTxIds.only | wc -l
# 10110
comm -13 vega28PepTxIds.txt otterTxIds.only > otterTxNotInPeptides.txt
wc -l otterTxNotInPeptides.txt
# 3891 otterTxNotInPeptides.txt
# Checked a few and these appear to be novel processed transcripts or
# novel protein-coding genes, so probably the novel ones do not have an
# associated protein - check with Tina Eyre.
ssh hgwdev
cd /cluster/data/danRer5/bed/vega28
hgPepPred danRer5 generic vegaPep vega28Pep.fa
# hgsql -e 'select count(*) from vegaPep;' danRer5
# 10110
# Check which type of transcripts do not have a peptide translation.
hgsql -N -e 'select distinct(name) from vegaPep;' danRer5 | \
sort > vegaPep.names.sort
hgsql -N -e 'select distinct(transcriptId) from vegaInfoZfish;' danRer5 \
| sort > vegaInfo.txId.sort
comm -12 vegaPep.names.sort vegaInfo.txId.sort | wc -l
# 10110
wc -l *.sort
# 14001 vegaInfo.txId.sort
# 10110 vegaPep.names.sort
comm -13 vegaPep.names.sort vegaInfo.txId.sort > vegaTx.inInfoNotPep.txt
wc -l vegaTx.inInfoNotPep.txt
# 3891 vegaTx.inInfoNotPep.txt
foreach t (`cat vegaTx.inInfoNotPep.txt`)
hgsql -N -e "select transcriptId, method, confidence from vegaInfoZfish
where transcriptId = '${t}';" danRer5 >> vegaTx.inInfoNotPep.type
end
awk 'BEGIN {OFS="\t"} {print $2, $3}' vegaTx.inInfoNotPep.type \
| sort | uniq -c | sort -nr > vegaTx.inInfoNotPep.counts
# Add trackDb.ra entry for trackDb/zebrafish/danRer5/trackDb.ra if it is not
# there already. Keep Vega Genes and Vega Pseudogenes separate as for
# human and add version number to description.
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-10-18)
ssh kkstore06
# bash if not using bash shell already
mkdir /cluster/data/danRer5/blastDb
cd /cluster/data/danRer5
awk '{if ($2 > 1000000) print $1}' chrom.sizes > great1M.lst
twoBitToFa danRer5.2bit stdout | faSomeRecords stdin great1M.lst temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
rm temp.fa
twoBitToFa danRer5.2bit stdout | faSomeRecords -exclude stdin great1M.lst temp.fa
faSplit sequence temp.fa 156 blastDb/y
rm temp.fa
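# Optional sanity check (a sketch, not part of the original run): the
# split pieces together should add up to the assembly size.
# faSize blastDb/*.fa
# awk '{total += $2} END {print total}' chrom.sizes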
cd blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
mkdir -p /cluster/bluearc/danRer5/blastDb
cd /cluster/data/danRer5/blastDb
for i in nhr nin nsq
do
echo $i
cp *.$i /cluster/bluearc/danRer5/blastDb
done
mkdir -p /cluster/data/danRer5/bed/tblastn.hg18KG
cd /cluster/data/danRer5/bed/tblastn.hg18KG
echo /cluster/bluearc/danRer5/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 1456 query.lst
# we want around 200000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(200000/1456) = 267.372560
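# i.e. split the 36,727 KG alignments into chunks of ~267 so that
# (number of chunks) x (1456 database pieces) lands near the 200,000
# jobs wanted: ceil(36727/267) = 138 chunks, 138 x 1456 = 200,928.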
mkdir -p /cluster/bluearc/danRer5/bed/tblastn.hg18KG/kgfa
split -l 267 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/danRer5/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/danRer5/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/danRer5/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
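# run blastall; if a run fails, retry with progressively stricter e-values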
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
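# lift target coords from the blastDb chunks back to chromosomes, then
# lift query coords from the split protein files back to the full-length
# hg18 KG proteins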
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/danRer5/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
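# gensub2 pairs every database piece in query.lst with every protein chunk
# in kg.lst, writing one job per pair to blastSpec using the blastGsub
# template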
exit
ssh kk
cd /cluster/data/danRer5/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 75786 of 75786 jobs
# CPU time in finished jobs: 13289094s 221484.91m 3691.42h 153.81d 0.421 y
# IO & Wait Time: 6534562s 108909.36m 1815.16h 75.63d 0.207 y
# Average job time: 262s 4.36m 0.07h 0.00d
# Longest finished job: 1158s 19.30m 0.32h 0.01d
# Submission to last job: 37516s 625.27m 10.42h 0.43d
ssh kkstore06
cd /cluster/data/danRer5/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
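# here "single" stands in for the second list, so gensub2 emits one
# chainOne job per blastOut/kg?? directory in chain.lst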
# do the cluster run for chaining
ssh pk
cd /cluster/data/danRer5/bed/tblastn.hg18KG/chainRun
para create chainSpec
para maxNode 30
# para try, check, push, check etc.
# Completed: 138 of 138 jobs
# CPU time in finished jobs: 83388s 1389.80m 23.16h 0.97d 0.003 y
# IO & Wait Time: 43355s 722.58m 12.04h 0.50d 0.001 y
# Average job time: 918s 15.31m 0.26h 0.01d
# Longest finished job: 21693s 361.55m 6.03h 0.25d
# Submission to last job: 21713s 361.88m 6.03h 0.25d
ssh kkstore06
cd /cluster/data/danRer5/bed/tblastn.hg18KG/blastOut
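# Combine and filter the chained alignments. PSL column reminders:
# $1=matches, $11=qSize, $12=qStart, $13=qEnd. c60 keeps alignments
# whose query span covers >60% of the query; u.* keeps one alignment
# per query (best match count first via sort -rn, then pslUniq); m60
# keeps alignments where matches exceed 60% of the query size.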
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/danRer5/bed/tblastn.hg18KG/blastHg18KG.psl
cd ..
pslCheck blastHg18KG.psl
# load table
ssh hgwdev
cd /cluster/data/danRer5/bed/tblastn.hg18KG
hgLoadPsl danRer5 blastHg18KG.psl
# check coverage
featureBits danRer5 blastHg18KG
# 21056644 bases of 1435609608 (1.467%) in intersection
featureBits danRer4 blastHg18KG
# 21159392 bases of 1626093931 (1.301%) in intersection
featureBits danRer5 ensGene:cds blastHg18KG -enrichment
# ensGene:cds 2.114%, blastHg18KG 1.467%, both 1.077%, cover 50.92%, enrich 34.72x
featureBits danRer4 ensGene:cds blastHg18KG -enrichment
# ensGene:cds 2.230%, blastHg18KG 1.301%, both 1.058%, cover 47.46%, enrich 36.47x
ssh kkstore06
rm -rf /cluster/data/danRer5/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut
#end tblastn
##########################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and
loaded by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
See doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and
loaded by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
See doc/builds.txt for specific details.
############################################################################
# SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-28 - Hiram)
cd /cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27
cat fb.oryLat2.chainDanRer5Link.txt
# 138070427 bases of 700386597 (19.713%) in intersection
# and for the swap
mkdir /cluster/data/danRer5/bed/blastz.oryLat2.swap
cd /cluster/data/danRer5/bed/blastz.oryLat2.swap
time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
/cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27/DEF \
-swap -tRepeats=windowmaskerSdust \
-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
# real 58m15.303s
# Had to finish the net load step manually: -qRepeats=windowmaskerSdust
# was mistakenly included, but that option is not valid for danRer5.
cat fb.danRer5.chainOryLat2Link.txt
# 158709399 bases of 1435609608 (11.055%) in intersection
# Then, continuing:
doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
/cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27/DEF \
-continue=download -swap -tRepeats=windowmaskerSdust \
-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > download.log 2>&1 &
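# (-continue=download resumes the doBlastzChainNet.pl run at the download
# step, after the net load was fixed by hand above.)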
#########################################################################
############################################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
danRer5.upstreamGeneTbl = refGene
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and
loaded by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
See doc/builds.txt for specific details.
############################################################################
+# SWAP BLASTZ FOR DOG (canFam2) (DONE, 2009-08-08, hartera)
+# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
+# See canFam2.txt for creation of zebrafish chain and net tracks and downloads
+# on the dog genome assembly.
+ mkdir /hive/data/genomes/danRer5/bed/blastz.canFam2.swap
+ cd /hive/data/genomes/danRer5/bed/blastz.canFam2.swap
+
+    time nice doBlastzChainNet.pl -chainMinScore=5000 -chainLinearGap=loose \
+        -swap /hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07/DEF \
+        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
+        >& doBlastz.swap.log &
+ cat fb.danRer5.chainCanFam2Link.txt
+ # 32053647 bases of 1435609608 (2.233%) in intersection
+