src/hg/makeDb/doc/danRer2.txt 1.6

1.6 2009/11/25 21:48:38 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/danRer2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/danRer2.txt,v
retrieving revision 1.5
retrieving revision 1.6
diff -b -B -U 1000000 -r1.5 -r1.6
--- src/hg/makeDb/doc/danRer2.txt	28 Mar 2007 17:35:39 -0000	1.5
+++ src/hg/makeDb/doc/danRer2.txt	25 Nov 2009 21:48:38 -0000	1.6
@@ -1,5375 +1,5375 @@
 # for emacs: -*- mode: sh; -*-
 
                                                                                 
 # Danio rerio (zebrafish) from Sanger, version Zv4 (released 6/30/04)
 #  Project website:
 #    http://www.sanger.ac.uk/Projects/D_rerio/
 #  Assembly notes:
 #    http://www.sanger.ac.uk/Projects/D_rerio/Zv4_assembly_information.shtml
 #  NOTE: Error in scaffolds agp file. Notified Sanger and got new scaffolds
 # agp and recreated FASTA files from this new one (2004-11-29)
 # Previous agp file was missing a scaffold from the end of most chromosomes.
 # There is also a chrUn set of scaffolds that are in the chunks agp file -
 # these just have the identifier Zv4_scaffoldN instead of a chromosome number and 
 # they are scaffolds that correspond to FPC contigs but their position is 
 # unknown so they are not mapped to a chromosome.
 
 # DOWNLOAD SEQUENCE (DONE, 2004-10-18, hartera)
 # ERRORS IN SCAFFOLDS AGP SO GET NEW AGP FROM SANGER AND DOWNLOAD 
 # (hartera, 2004-11-29) from Mario Caccamo: mc2@sanger.ac.uk
      ssh kksilo
      mkdir /cluster/store8/danRer2
      ln -s /cluster/store8/danRer2 /cluster/data
      cd /cluster/data/danRer2
      wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/README
      wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/stats
      wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/Zv4.chunks.agp
      wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/Zv4.scaffolds.agp
      wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/Zv4.fasta
 # get new agp file and download to /cluster/data/danRer2 (hartera, 2004-11-29)
      # Remove all chrom directories to start processing with new agp file
      ssh kksilo
      cd /cluster/data/danRer2
      foreach c (`cat chrom.lst`)
         rm -r $c
      end
 
 # DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2004-10-18, hartera)
 # DOWNLOAD SEQUENCE AND CREATE FILES AGAIN (hartera, 2004-11-30)
 # ADD chrM.chunks.agp (DONE, 2004-12-06, hartera)
 # Error reported by a user: chrM.agp is space delimited rather
 # than tab delimited so correct this. NCBI defines the agp format as 
 # being tab delimited. (DONE, 2005-04-25, hartera)
      ssh kksilo
      mkdir -p /cluster/data/danRer2/M
      cd /cluster/data/danRer2/M
      # go to http://www.ncbi.nih.gov/ and search Nucleotide for
      # "Danio mitochondrion genome".  That shows the gi number:
      # 8576324 for the accession, AC024175
  # Use that number in the entrez linking interface to get fasta:
      wget -O chrM.fa \
       'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
      # Edit chrM.fa: make sure the header line says it is the
      # Danio rerio mitochondrion complete genome, and then replace the
      # header line with just ">chrM".
      perl -pi.bak -e 's/>.+/>chrM/' chrM.fa
      rm *.bak                                                         
      # Make a "pseudo-contig" for processing chrM too:
      mkdir ./chrM_1
      sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
      mkdir ./lift
      echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
      echo "chrM_1" > ./lift/ordered.lst
      echo "0     M/chrM_1        16596   chrM    16596" > ./lift/ordered.lft
 # create a .agp file for chrM as hgGoldGapGl and other
 # programs require a .agp file so create chrM.agp
     cat << '_EOF_' > ./chrM.agp
 chrM       1       16596   1       F       AC024175.3      1       16596   +
 '_EOF_'
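     # quick sanity check (a sketch): the last AGP end coordinate should
     # match the sequence length that faSize reports
     faSize chrM.fa
     # expect: 16596 bases ...
     tail -1 chrM.agp | awk '{print $3}'
     # expect: 16596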
      # Create a chrM.chunks.agp (hartera, 2004-12-06)
      cd /cluster/data/danRer2/M/agps
      awk 'BEGIN {OFS="\t"} \
         {print $1, $2, $3, $4, $5, $6, $7, $8, $1, $7, $8}' ../chrM.agp \
          > chrM.chunks.agp
      # chrM.agp is space delimited instead of tab delimited 
      # so correct this (hartera, 2005-04-25)
      cd /cluster/data/danRer2/M
      # edit chrM.agp and change field delimiters from spaces to tabs.
      # add this new file to zipped files of agp and get it pushed to the
      # downloads area - see "MAKE DOWNLOADABLE SEQUENCE FILES" section of 
      # this make doc. 
 
 # Create list of chromosomes (DONE, 2004-10-18, hartera)
 # Add "M" for mitochondrion chromosome (2004-10-25, hartera)
 # Add "Un" for chrUn (2004-11-29, hartera)
      ssh kksilo
      cd /cluster/data/danRer2
      awk '{print $1;}' Zv4.scaffolds.agp | sort -n | uniq > chrom.lst
      # add NA - these are contigs in the chunks agp
      echo "NA" >> chrom.lst
      # add chrM
      echo "M" >> chrom.lst
      # add chrUn
      echo "Un" >> chrom.lst
 
 # SPLIT AGP FILES BY CHROMOSOME (DONE, 2004-10-19, hartera)
 # AGP USED TO CREATE FASTA WAS SCAFFOLDS AGP 
 # RE-DO SPLITTING AGP FILES AFTER GETTING NEW SCAFFOLDS AGP 
 # (hartera, 2004-11-29)
      ssh kksilo
      cd /cluster/data/danRer2
      # There are 2 .agp files: one for scaffolds (supercontigs on danRer1) and 
      # then one for chunks (contigs on danRer1) showing how they map on to 
      # scaffolds.
      # add "chr" prefix for the agp files
      perl -pi -e 's/^([0-9]+)/chr$1/' ./*.agp
      # for chromosomes:
      foreach c (`cat chrom.lst`)
        mkdir $c
        perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
          ./Zv4.chunks.agp \
          > $c/chr$c.chunks.agp
        perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
          ./Zv4.scaffolds.agp \
          > $c/chr$c.scaffolds.agp
      end
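      # hedged sanity check: the per-chrom scaffold agp line counts should
      # sum to the line count of the original Zv4.scaffolds.agp
      cat */chr*.scaffolds.agp | wc -l
      wc -l Zv4.scaffolds.agp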
 
 # CREATE AGP AND FASTA FOR chrNA (DONE, 2004-10-25, hartera)
 # REMAKE AGP WITH NEW SCAFFOLDS AGP FILE AND CREATE chrUn AGP 
 # (hartera, 2004-11-29)
 
      ssh kksilo
      cd /cluster/data/danRer2
      # for NA make agp files
      grep "Zv4_NA" Zv4.chunks.agp > NA.chunks.agp
      # make a scaffolds agp for NA - use first 9 fields of chunks file
      # and remove ".1" from 6th field
      awk 'BEGIN {OFS="\t"} {print $1, $2, $3, $4, $5, $6, $7, $8, $9}' \
          NA.chunks.agp | perl -pe 's/(Zv4_NA[0-9]+)\.1+/$1/' \
          > NA.scaffolds.agp 
      # move agps to NA directory created above
      mv NA.scaffolds.agp NA.chunks.agp ./NA
      # from scaffolds agp, get name of scaffolds to get from FASTA file for NA
      foreach c (NA)
        awk '{print $1;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
        $HOME/bin/i386/faSomeRecords /cluster/data/danRer2/Zv4.fasta \
           $c/chr$c.scaffolds.lst $c/chr$c.fa
      end
      # create agp with 1000Ns between scaffolds as contig gaps for chrNA
      foreach c (NA)
         $HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
         mv $c/chr$c.fa $c/chr$c.scaffolds.fa
         perl -pi -e "s/chrUn/chr$c/" $c/chr$c.*
      end 
      # change the type to "W" in the agp for WGS
      perl -pi.bak -e "s/D/W/;" NA/chrNA.agp
      cd NA
      mv chrNA.agp chrNA.scaffolds.agp
      rm *.bak
      # use this chrNA.scaffolds.agp to create chrNA.chunks.agp
 cat << '_EOF_' > /cluster/data/danRer2/jkStuff/createChunksAgp.pl
 #!/usr/bin/perl -w
 use strict;
                                                                                 
 # input is list of scaffolds and chunks and file of chrN.scaffolds.agp with Ns.
                                                                                 
 my $accs = $ARGV[0];
 my $scaf = $ARGV[1];
                                                                                 
 open (ACCS, $accs) || die "Can not open $accs: $!";
 open (SCAFS, $scaf) || die "Can not open $scaf: $!";
 my %accsHash;
 while (<ACCS>) {
    chomp;
    my @fi = split(/\t/);
    $accsHash{$fi[0]} = $fi[1];
 }
 close ACCS;
                                                                                 
 while (my $line = <SCAFS>) {
    chomp $line;
    my @f = split(/\t/, $line);
    my $acc;
    if ($f[4] ne "N") {
       if (exists($accsHash{$f[5]}) ) {
          $acc = $accsHash{$f[5]};
       }
       else {
           print "$f[5] can not be found\n";
       }
       print "$f[0]\t$f[1]\t$f[2]\t$f[3]\t$f[4]\t$acc\t$f[6]\t$f[7]\t$f[8]";
       print "\t$f[5]\t$f[6]\t$f[7]";
    }
    else {
       print $line;
    }
    print "\n";
 }
 '_EOF_'
      chmod +x /cluster/data/danRer2/jkStuff/createChunksAgp.pl
      
      awk 'BEGIN {OFS="\t";} { if ($1 ~ /^Zv4_NA/) print $1,$6 }' \
          /cluster/data/danRer2/Zv4.chunks.agp > NA.accs 
      perl ../jkStuff/createChunksAgp.pl NA.accs chrNA.scaffolds.agp \
           > chrNA.chunks.agp
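      # spot check (a sketch, based on the script's print statements): non-gap
      # lines should now carry 12 tab-separated columns; gap lines are unchanged
      awk -F'\t' '$5 != "N" && NF != 12' chrNA.chunks.agp | wc -l
      # expect: 0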
 
    # Also create agp for chrUn - these are scaffolds that map to FPC contigs
    # but are unplaced on the chromosomes
      # for Un, make agp files
      ssh kksilo
      cd /cluster/data/danRer2
      mkdir Un
      # make a scaffolds agp for Un - use first 9 fields of chunks file
      # and remove ".1" from 6th field
      awk 'BEGIN {OFS="\t"} {if ($1 ~ /Zv4_scaffold/) \
          print $1, $2, $3, $4, $5, $6, $7, $8, $9}' \
          Zv4.chunks.agp | perl -pe 's/(Zv4_scaffold[0-9]+)\.1+/$1/' \
          > Un/Un.scaffolds.agp 
      # from scaffolds agp, get name of scaffolds to get from FASTA file for Un
      foreach c (Un)
        awk '{print $1;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
        $HOME/bin/i386/faSomeRecords /cluster/data/danRer2/Zv4.fasta \
           $c/chr$c.scaffolds.lst $c/chr$c.fa
      end
      # create agp with 1000Ns between scaffolds as contig gaps for chrUn
      foreach c (Un)
         $HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
         mv $c/chr$c.fa $c/chr$c.scaffolds.fa
      end 
      # change the type to "W" in the agp for WGS
      perl -pi.bak -e "s/D/W/;" Un/chrUn.agp
      cd Un
      mv chrUn.agp chrUn.scaffolds.agp
      rm *.bak
      # create chunks agp for chrUn
      # this includes accessions so need to get from Zv4.chunks.agp
      # modify perl script above to do this (2004-12-06, hartera)
      awk 'BEGIN {OFS="\t";} { if ($1 ~ /^Zv4_/) print $1,$6 }' \
          /cluster/data/danRer2/Zv4.chunks.agp > Un.accs 
      perl ../jkStuff/createChunksAgp.pl Un.accs chrUn.scaffolds.agp \
           > chrUn.chunks.agp
 
 # BUILD CHROM-LEVEL SEQUENCE (DONE, 2004-10-21, hartera)
 # Move scaffolds files for NA into scaffolds directory (2004-11-22, hartera)
 # RE-BUILD SEQUENCE WITH NEW AGPS FROM CORRECTED SCAFFOLDS AGP 
 # (2004-11-30, hartera)
      ssh kksilo
      cd /cluster/data/danRer2
      # Sequence is already in upper case so no need to change
      foreach c (`cat chrom.lst`)
        echo "Processing ${c}"
        $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp chr$c \
          $c/chr$c.fa ./Zv4.fasta
        echo "${c} - DONE"
      end
      # some Ns in sequence files are in lower case so change to upper case
      foreach c (`cat chrom.lst`) 
         cat $c/chr${c}.fa | tr 'n' 'N' > $c/chr${c}.fa.tr
         if ($c == "Un") then  
            perl -pi.bak -e 's/^>chrUN/>chrUn/' $c/chr${c}.fa.tr
         endif
         mv $c/chr${c}.fa.tr $c/chr${c}.fa
      end
      # move scaffolds agp to be chrom agp and clean up (2004-11-30)
      foreach c (`cat chrom.lst`)
         cd $c
         rm *.bak
         cp chr${c}.scaffolds.agp chr${c}.agp
         mkdir agps
         mv chr${c}.*.agp ./agps/
         cd ..
      end
 
      # move scaffolds files for NA into scaffolds directory (2004-11-22)
      # and again (2004-11-30)
      foreach c (NA Un)
         mkdir -p /cluster/data/danRer2/$c/scaffolds
         cd /cluster/data/danRer2/$c
         mv chr$c.scaffolds.* ./scaffolds
         rm $c.*.agp
         cd .. 
      end
 
 # CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2004-10-21, hartera)
 # CHECKED THESE ARE OK (hartera, 2004-11-30)
      # Check that the size of each chromosome .fa file is equal to the
      # last coord of the .agp:
      ssh hgwdev
      cd /cluster/data/danRer2
      foreach c (`cat chrom.lst`)
        foreach f ( $c/chr$c.agp )
          set agpLen = `tail -1 $f | awk '{print $3;}'`
          set h = $f:r
          set g = $h:r
          echo "Getting size of $g.fa"
          set faLen = `faSize $g.fa | awk '{print $1;}'`
          if ($agpLen == $faLen) then
            echo "   OK: $f length = $g length = $faLen"
          else
            echo "ERROR:  $f length = $agpLen, but $g length = $faLen"
          endif
        end
      end
      # all are OK so the FASTA files are the expected size
 
 # BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS
 # (DONE, 2004-10-25, hartera)
 # RE-DONE (2004-11-30, hartera)
                                                                                 
      ssh kksilo
      cd /cluster/data/danRer2
      foreach c (`cat chrom.lst`)
        foreach agp ($c/chr$c.agp)
          if (-e $agp) then
            set fa = $c/chr$c.fa
            echo splitting $agp and $fa
            cp -p $agp $agp.bak
            cp -p $fa $fa.bak
            splitFaIntoContigs $agp $fa . -nSize=5000000
          endif
        end
      end
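      # resulting layout (a sketch): each chrom directory now holds 5Mb
      # pseudo-contig subdirectories plus a lift directory, e.g.:
      ls 1
      # chr1_1  chr1_2  ...  chr1.agp  chr1.agp.bak  chr1.fa  chr1.fa.bak  lift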
 
 # MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2004-10-25, hartera)
     # This used to hold scripts -- better to keep them inline here
     # Now it should just hold lift file(s) and
     # temporary scripts made by copy-paste from this file.
     mkdir /cluster/data/danRer2/jkStuff
     # This is where most tracks will be built:
     mkdir /cluster/data/danRer2/bed
 
 # CREATING DATABASE (DONE, 2004-10-25, hartera)
     # Create the database.
     # next machine
     ssh hgwdev
     echo 'create database danRer2' | hgsql ''
     # if you need to delete that database:  !!! WILL DELETE EVERYTHING !!!
     echo 'drop database danRer2' | hgsql danRer2
     # Delete and re-create database as above (hartera, 2004-11-30)
     # Use df to make sure there is at least 10 gig free on /var/lib/mysql:
     df -h /var/lib/mysql
 # Before loading data:
 # Filesystem            Size  Used Avail Use% Mounted on
 # /dev/sdc1             1.8T  637G 1023G  39% /var/lib/mysql
 
 # CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2004-10-25, hartera)
 # RECREATE GRP TABLE (hartera, 2004-11-30)
     # next machine
     ssh hgwdev
     #  the following command copies all the data from the table
     #  grp in the database danRer1 to the new database danRer2
     echo "create table grp (PRIMARY KEY(NAME)) select * from danRer1.grp" \
       | hgsql danRer2
     # if you need to delete that table:   !!! WILL DELETE ALL grp data !!!
     echo 'drop table grp;' | hgsql danRer2
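     # verify the copy (a sketch):
     echo 'select name, label, priority from grp order by priority;' \
       | hgsql danRer2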
 
 # REPEAT MASKING - Run RepeatMasker on chroms (DONE, 2004-10-26, hartera)
 # There is a new Repeat library at WUSTL that has new repeats for Danio rerio
 # This is Dr.repeats.020501
 # Add a README about these repeats
     ssh kksilo
     cd /cluster/data/danRer2
     wget --timestamp \
          http://www.genetics.wustl.edu/fish_lab/repeats/Dr.repeats.020501
     mv Dr.repeats.020501 /cluster/bluearc/RepeatMasker/Libraries/danioRW.lib
     # add danioRW.lib to danio.lib
     cd /cluster/bluearc/RepeatMasker/Libraries
     cp danio.lib danioRMandRW.lib
     cat danioRW.lib >> danioRMandRW.lib
     # add type as "unknown" to this file as these repeats are not classified
     perl -pi.bak -e 's/^(>Dr[0-9]+)/$1#Unknown/' danioRMandRW.lib
     # these new repeats are not classified by type so add "DrRW" as type later
     # Add a README about these repeats from WUSTL
     wget --timestamp \
           http://www.genetics.wustl.edu/fish_lab/repeats/Readme.txt
 
 #- Split contigs into 500kb chunks, at gaps if possible:
     foreach c (`cat chrom.lst`)
       foreach d ($c/chr${c}*_?{,?})
         cd $d
         echo "splitting $d"
         set contig = $d:t
         ~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
             -minGapSize=100
         cd ../..
       end
     end
 
 #- Make the run directory and job list:
     cd /cluster/data/danRer2
     # use RepeatMasker from January 2004
     cat << '_EOF_' > jkStuff/RMZebrafish
 #!/bin/csh -fe
                                                                                 
 cd $1
 pushd .
 /bin/mkdir -p /tmp/danRer2/$2
 /bin/cp $2 /tmp/danRer2/$2/
 cd /tmp/danRer2/$2
 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -lib danioRMandRW.lib $2
 popd
 /bin/cp /tmp/danRer2/$2/$2.out ./
 if (-e /tmp/danRer2/$2/$2.align) /bin/cp /tmp/danRer2/$2/$2.align ./
 if (-e /tmp/danRer2/$2/$2.tbl) /bin/cp /tmp/danRer2/$2/$2.tbl ./
 if (-e /tmp/danRer2/$2/$2.cat) /bin/cp /tmp/danRer2/$2/$2.cat ./
 /bin/rm -fr /tmp/danRer2/$2/*
 /bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2/$2
 /bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2
 '_EOF_'
     chmod +x jkStuff/RMZebrafish
     mkdir RMRun
     cp /dev/null RMRun/RMJobs
     foreach c (`cat chrom.lst`)
       foreach d ($c/chr${c}_?{,?})
           set ctg = $d:t
           foreach f ( $d/${ctg}_?{,?}.fa )
             set f = $f:t
             echo /cluster/data/danRer2/jkStuff/RMZebrafish \
                  /cluster/data/danRer2/$d $f \
                '{'check out line+ /cluster/data/danRer2/$d/$f.out'}' \
               >> RMRun/RMJobs
           end
       end
     end
                                                                                 
     #- Do the run
     ssh kk
     cd /cluster/data/danRer2/RMRun
     para create RMJobs
     para try, para check, para check, para push, para check,...
 # para time
 # CPU time in finished jobs:   10326858s  172114.29m  2868.57h  119.52d  0.327 y
 # IO & Wait Time:                 33702s     561.71m     9.36h    0.39d  0.001 y
 # Average job time:                3081s      51.35m     0.86h    0.04d
 # Longest job:                     4065s      67.75m     1.13h    0.05d
 # Submission to last job:         39673s     661.22m    11.02h    0.46d
 
 
     #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
     ssh kksilo
     cd /cluster/data/danRer2
     foreach d (*/chr*_?{,?})
       set contig = $d:t
       echo $contig
       liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
         > /dev/null
     end
                                                                                 
     #- Lift pseudo-contigs to chromosome level
     foreach c (`cat chrom.lst`)
       echo lifting $c
       cd $c
       if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
         liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
         > /dev/null
       endif
       cd ..
     end
 
     #- Load the .out files into the database with:
     ssh hgwdev
     cd /cluster/data/danRer2
     hgLoadOut danRer2 */chr*.fa.out
 # When trying to mask the sequence (see below) there are 60 instances of 
 # negative co-ordinates:
 #      2 Dr000074
 #     48 Dr000158
 #      6 Dr000375
 #      1 Dr000511
 #      1 Dr000759
 #      1 Dr000975
 #      5 Dr001181
 # Sent sample output from chr1_3_21 to Arian Smit and he suggested that 
 # Dr000158 is a Satellite. When this classification is added to the FASTA
 # header: >Dr000158#Satellite, the negative co-ordinates disappeared.
 # If the classification is changed to "Unknown" then there are negative 
 # co-ordinates.
 # Took a look at these 7 repeats above and found overlapping matches
 # Yi Zhou at Boston Children's Hospital looked at the repeats and split 
 # them up into smaller chunks: danioRMandRWsplit.lib
 # Running RepeatMasker with this library removed a lot of negative co-ordinates
 # but some new ones appeared. There are 7 instances of negative co-ordinates.
 # TDR5, (TAGA)n, Dr000355, Dr001182
 # Dr001182 has two repeats, the second being an imperfect replicate of the
 # first so this was split into two repeats and RepeatMasker run again (11/18/04)
 # This time there were TDR5, (TAGA)n, Dr000355, Dr000509 with negative repeats
 # but only 5 instances.
 # 11/13/04
 # try RepeatMasker with the 7 repeats that had negative co-ords split into
 # smaller repeats: get a list of repeat names excluding these, then fetch
 # those sequences
     ssh kksilo
     cd /cluster/data/danRer2/bed/
     $HOME/bin/i386/faSomeRecords \
           /cluster/bluearc/RepeatMasker/Libraries/danioRMandRW.lib \
           rmandrw.txt danioRMandRWsplit.lib
 # splitreps is list of split repeats
     cat splitreps >> danioRMandRWsplit.lib
     mv danioRMandRWsplit.lib /cluster/bluearc/RepeatMasker/Libraries/
     # then run repeat masker on problem repeat areas e.g. chr1_3_21.fa
     mkdir /cluster/data/danRer2/RMRun/testSplitReps
     cd /cluster/data/danRer2/RMRun/testSplitReps
     nice /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -lib danioRMandRWsplit.lib chr1_3_21.fa 
   # works well so change all the classes to unknown and re-run with this library
    perl -pi.bak -e 's/^>(Dr[0-9a-z\-]+)#DrRW/>$1#Unknown/' danioRMandRWsplit.lib
  # then run RepeatMasker as above with new split library (DONE, 2004-11-16)
  # edit RMZebrafish to use this library (library is danioRMandRWsplit.lib) 
  # and run from RMRun2 directory
    # need to remove chrN.fa.masked files
    # then convert chrN.fa back to upper case
     ssh kksilo
     cd /cluster/data/danRer2
     foreach c (`cat chrom.lst`)
        cd $c
        echo "Processing $c ..."
        rm chr${c}.fa.masked
        cat chr${c}.fa | tr '[a-z]' '[A-Z]' > chr${c}.tmp
        perl -pi.bak -e 's/^>CHR([0-9A-Z]+)/>chr$1/' chr${c}.tmp 
        mv chr${c}.tmp chr${c}.fa
        rm chr${c}.tmp.bak
        cd ..
     end
 # still get some negative co-ordinates when try masking
 # Dr001182 is a repeat sequence which contains two instances of a repeat with
 # the second one not being a perfect match to the first
 # split these into two repeats and then re-run RepeatMasker
 #
 # e-mailed Kerstin Jekosch at Sanger as it looks like they have used this
 # WUSTL Repeat library to mask the Zv4 assembly for Ensembl. Kerstin recommended
 # downloading this new library from RepBase as it has been properly 
 # formatted for RepBase version 10. In RepBase, it is the Zebrafish Unclassified
 # library and it consists of 958 unfinished consensus sequences of unclassified
 # repeats extracted from the library at 
 # http://www.genetics.wustl.edu/fish_lab/repeats/Dr.repeats.020501 
 # which has a total of 1225 repeats. 267 repeats present in the library have
 # been replaced by a set of consensus sequences of classified transposable 
 # elements that are reported in Repbase Update and Repbase Reports.
 
 # DOWNLOAD NEW VERSION OF THE WUSTL ZEBRAFISH REPEATS FROM REPBASE
 # (DONE, 2004-11-18, hartera)
 # Go to http://www.girinst.org/server/RepBase/
 # and select zebunc (Zebrafish unclassified library)
 # click on repeatmaskerlibraries and then download 
 # repeatmaskerlibrariesJuly2004.tar.gz
     gunzip repeatmaskerlibrariesJuly2004.tar.gz
     tar xvf repeatmaskerlibrariesJuly2004.tar
     perl -pi.bak -e 's/^>(Dr[0-9]+)/>$1#Unknown/' zebunc.ref
     cat danio.lib zebunc.ref >> danioandzebunc.lib
 
 # REDO REPEATMASKER RUN AND LOAD NEW RESULTS (DONE, 2004-11-22, hartera)
 # REDONE (hartera, 2004-12-01)
     # sequences already split into 500kb chunks - see above
     # use RepeatMasker open-3.0 version, Sep 2004, this was in 
     #  /cluster/bluearc/RepeatMasker.040909 and is now the default
     # new zebrafish library was downloaded from RepBase - zebunc.ref
     # copy to /cluster/bluearc/RepeatMasker.040909/Libraries
     # add "Unknown" as classification for these repeats
     perl -pi.bak -e 's/>(Dr[0-9]+)/>$1#Unknown \@danio [S:]/' zebunc.ref
     # add to RepeatMasker library
     mv RepeatMasker.lib RepeatMasker.lib.old
     cat RepeatMasker.lib.old zebunc.ref >> RepeatMasker.lib
 
     cat << '_EOF_' > jkStuff/RMZebrafish
 #!/bin/csh -fe
                                                                                 
 cd $1
 pushd .
 /bin/mkdir -p /tmp/danRer2/$2
 /bin/cp $2 /tmp/danRer2/$2/
 cd /tmp/danRer2/$2
 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -species danio $2
 popd
 /bin/cp /tmp/danRer2/$2/$2.out ./
 if (-e /tmp/danRer2/$2/$2.align) /bin/cp /tmp/danRer2/$2/$2.align ./
 if (-e /tmp/danRer2/$2/$2.tbl) /bin/cp /tmp/danRer2/$2/$2.tbl ./
 if (-e /tmp/danRer2/$2/$2.cat) /bin/cp /tmp/danRer2/$2/$2.cat ./
 /bin/rm -fr /tmp/danRer2/$2/*
 /bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2/$2
 /bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2
 '_EOF_'
     chmod +x jkStuff/RMZebrafish
     rm -r RMRun
     mkdir -p RMRun
 
     cp /dev/null RMRun/RMJobs
     foreach c (`cat chrom.lst`)
       foreach d ($c/chr${c}_?{,?})
           set ctg = $d:t
           foreach f ( $d/${ctg}_?{,?}.fa )
             set f = $f:t
             echo /cluster/data/danRer2/jkStuff/RMZebrafish \
                  /cluster/data/danRer2/$d $f \
                '{'check out line+ /cluster/data/danRer2/$d/$f.out'}' \
               >> RMRun/RMJobs
           end
       end
     end
                                                                                 
     #- Do the run
     ssh kk
     cd /cluster/data/danRer2/RMRun
     para create RMJobs
     para try, para check, para check, para push, para check,...
 # para time
 # Completed: 3899 of 3899 jobs
 # CPU time in finished jobs:   11636116s  193935.26m  3232.25h  134.68d  0.369 y
 # IO & Wait Time:                 39078s     651.31m    10.86h    0.45d  0.001 y
 # Average job time:                2994s      49.91m     0.83h    0.03d
 # Longest job:                     4064s      67.73m     1.13h    0.05d
 # Submission to last job:         19022s     317.03m     5.28h    0.22d
 
     #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
     ssh kksilo
     cd /cluster/data/danRer2
     foreach d (*/chr*_?{,?})
       set contig = $d:t
       echo $contig
       liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
         > /dev/null
     end
                                                                                 
     #- Lift pseudo-contigs to chromosome level
     foreach c (`cat chrom.lst`)
       echo lifting $c
       cd $c
       if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
         liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
         > /dev/null
       endif
       cd ..
     end
 
     #- Load the .out files into the database with:
     ssh hgwdev
     cd /cluster/data/danRer2
     hgLoadOut danRer2 */chr*.fa.out
 # Note: 23 records dropped due to repStart > repEnd (2004-12-01)
 # Processing 1/chr1.fa.out
 # bad rep range in: 1354 157 0 0 chr1 7313059 7313288 -54894735 - (0) (917) (917) 689 5 3
 # bad rep range in: 794 247 63 0 chr1 22624284 22624474 -39583549 - (0) (860) (860) 659 0 1 *
 # bad rep range in: 841 234 44 0 chr1 27730487 27730692 -34477331 - (0) (555) (555) 342 5 1
 # Processing 11/chr11.fa.out
 # bad rep range in: 2007 124 74 108 chr11 6853360 6853376 -30760002 + HATN10_DR DNA hAT 167 166 -289 2
 # Processing 13/chr13.fa.out
 # bad rep range in: 939 65 0 158 chr13 11182894 11182923 -26476056 + DIRS1_DR LTR DIRS1 6133 6132 0 1
 # Processing 14/chr14.fa.out
 # bad rep range in: 350 125 29 137 chr14 39592288 39592300 -20407524 + HAT1_DR DNA hAT 612 611 -2681 8
 # Processing 16/chr16.fa.out
 # bad rep range in: 311 225 0 136 chr16 38708823 38708835 -4054346 + Dr000294 Unknown Unknown 403 400 -549 6
 # Processing 18/chr18.fa.out
 # bad rep range in: 9780 128 42 58 chr18 12479604 12480762 -35227702 + Dr000158 Unknown Unknown 45 -2229 -3930 1
 # Processing 19/chr19.fa.out
 # bad rep range in: 249 169 0 167 chr19 25584255 25584266 -26439321 + Dr000331 Unknown Unknown 314 311 -844 3
 # Processing 2/chr2.fa.out
 # bad rep range in: 596 206 44 0 chr2 24978519 24978655 -27097055 - (1) (317) (317) 176 5 4
 # bad rep range in: 326 56 19 0 chr2 40153004 40153058 -11922652 - (129) (79) (79) 25 5 2
 # bad rep range in: 1454 56 0 0 chr2 50993722 50993901 -1081809 - (0) (1155) (1155) 977 5 8
 # Processing 3/chr3.fa.out
 # bad rep range in: 605 76 0 11 chr3 42820214 42820307 -1974931 - (6) (5062) (5062) 4971 5 3
 # Processing 4/chr4.fa.out
 # bad rep range in: 1072 143 21 100 chr4 920087 920366 -32359418 - (1) (540) (540) 284 5 1
 # bad rep range in: 330 194 109 29 chr4 1398685 1398823 -31880961 - (204) (375) (375) 227 5 14
 # bad rep range in: 978 212 58 90 chr4 24351201 24351580 -8928204 - (594) (553) (553) 187 5 1
 # Processing 5/chr5.fa.out
 # bad rep range in: 4380 113 49 44 chr5 4937637 4937659 -62846677 + TDR23 DNA DNA 889 888 -259 1
 # Processing 6/chr6.fa.out
 # bad rep range in: 649 14 0 0 chr6 7782737 7782809 -25429807 - (956) (419) (419) 348 5 1
 # Processing 7/chr7.fa.out
 # bad rep range in: 272 158 0 50 chr7 19882140 19882200 -42635991 - (800) (419) (419) 363 5 7
 # Processing NA/chrNA.fa.out
 # bad rep range in: 1068 181 113 122 chrNA 75025954 75025966 -243271514 + TDR18 DNA DNA 188 187 -385 5
 # bad rep range in: 493 109 3 153 chrNA 202800523 202800585 -115496895 + Dr000876 Unknown Unknown 1 -32 -157 1
 # bad rep range in: 444 271 4 164 chrNA 249521533 249521548 -68775932 + CR1-1_DR LINE L2 4833 4832 -490 9
 # Processing Un/chrUn.fa.out
 # bad rep range in: 237 149 0 152 chrUn 96855194 96855206 -76596190 + Dr000331 Unknown Unknown 312 311 -844 1
 
 # To display the new repeats which are classed as "Unknown", add this class
 # to $HOME/kent/src/hg/hgTracks/rmskTrack.c 
 # to the repeatClassNames and repeatClasses arrays
 # check coverage of repeats masked
 # featureBits -chrom=chr1 danRer1 rmsk
 # 11589712 bases of 40488791 (28.624%) in intersection
 # featureBits -chrom=chr1 danRer2 rmsk
 # 26879295 bases of 61678023 (43.580%) in intersection
 
 # MAKE LIFTALL.LFT (DONE, 2004-10-26, hartera)
 # RE-DONE (hartera, 2004-12-01)
     ssh kksilo
     cd /cluster/data/danRer2
     cat */lift/ordered.lft > jkStuff/liftAll.lft
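     # lift file columns are: offset  oldName  oldSize  newName  newSize
     # e.g. the chrM entry created earlier:
     #   0     M/chrM_1        16596   chrM    16596
     wc -l jkStuff/liftAll.lft
     # expect one line per pseudo-contig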
 
 # SIMPLE REPEAT [TRF] TRACK  (DONE, 2004-10-26, hartera)
 # RE-DONE (DONE, hartera, 2004-12-02)
     # TRF runs pretty quickly now... it takes a few hours total runtime,
     # so instead of binrsyncing and para-running, just do this on the
     # local fileserver
     ssh kksilo
     rm -r /cluster/data/danRer2/bed/simpleRepeat
     mkdir -p /cluster/data/danRer2/bed/simpleRepeat
     cd /cluster/data/danRer2/bed/simpleRepeat
     mkdir trf
     cp /dev/null jobs.csh
     foreach d (/cluster/data/danRer2/*/chr*_?{,?})
       set ctg = $d:t
       foreach f ($d/${ctg}.fa)
         set fout = $f:t:r.bed
         echo $fout
         echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
         >> jobs.csh
       end
     end
                                                                                 
     chmod a+x jobs.csh
     csh -ef jobs.csh >&! jobs.log &
     # check on this with
     tail -f jobs.log
     wc -l jobs.csh
     ls -1 trf | wc -l
     endsInLf trf/*
    # kksilo was overloaded, so jobs from chrNA_35 onwards were split off into
    # jobs2.csh and run on kolossus
     liftUp simpleRepeat.bed /cluster/data/danRer2/jkStuff/liftAll.lft warn \
       trf/*.bed
     # Load into database
     ssh hgwdev
     cd /cluster/data/danRer2/bed/simpleRepeat
     hgLoadBed danRer2 simpleRepeat simpleRepeat.bed \
       -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
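     # sanity check (a sketch): simple repeats typically cover only a few
     # percent of the genome
     featureBits danRer2 simpleRepeat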
 
 # PROCESS SIMPLE REPEATS INTO MASK (DONE, 2004-10-26, hartera)
 # RE-DONE (2004-12-02, hartera)
     # After the simpleRepeats track has been built, make a filtered version
     # of the trf output: keep trf's with period <= 12:
     ssh kksilo
     cd /cluster/data/danRer2/bed/simpleRepeat
     mkdir -p trfMask
     foreach f (trf/chr*.bed)
       awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
     end
     # Lift up filtered trf output to chrom coords as well:
     cd /cluster/data/danRer2
     mkdir bed/simpleRepeat/trfMaskChrom
     foreach c (`cat chrom.lst`)
       if (-e $c/lift/ordered.lst) then
         perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/ordered.lst > $c/lift/oTrf.lst
         liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
           jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
       endif
       if (-e $c/lift/random.lst) then
         perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
            $c/lift/random.lst > $c/lift/rTrf.lst
         liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
           jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
       endif
     end
 
 # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF 
 # (DONE, 2004-11-22, hartera)
 # RE-DONE (2004-12-02, hartera)
 # RE-MAKE MASKED CHR1 SEQUENCE AS INCOMPLETELY MASKED (2005-06-06, hartera)
 # Although there was an error in the RepeatMasker output for chr1, this 
 # loaded ok for the chr1_rmsk table so that is complete. It is just the 
 # actual sequence that was not masked properly.
     ssh kksilo
     cd /cluster/data/danRer2
     # Soft-mask (lower-case) the contig and chr .fa's,
     # then make hard-masked versions from the soft-masked.
     set trfCtg=bed/simpleRepeat/trfMask
     set trfChr=bed/simpleRepeat/trfMaskChrom
     # errors in the RepeatMasker output with negative co-ordinates 
     # (60 instances) - mainly for specific sequences in the new zebrafish 
     # library so took a look at these together with Yi Zhou from Boston 
     # Children's Hospital who split these repeats into smaller sequences.
     # These were used to replace the original repeats in the repeat library and
     # RepeatMasker was re-run. Finally, it was found that a newer version of 
     # RepeatMasker open-3.0.5 version reduced the negative co-ordinates to 
     # 2 instances and a cleaned up version of the new zebrafish library 
     # from RepBase was also used - see above.
     foreach f (*/chr*.fa)
       echo "repeat- and trf-masking $f"
       maskOutFa -soft $f $f.out $f
       set chr = $f:t:r
       maskOutFa -softAdd $f $trfChr/$chr.bed $f
       echo "hard-masking $f"
       maskOutFa $f hard $f.masked
     end
 # with the new version of RepeatMasker, there are 2 instances of negative
 # co-ordinates which can just be ignored. 
 # WARNING: negative rEnd: -2229 chr18:12479605-12480762 Dr000158
 # WARNING: negative rEnd: -32 chrNA:202800524-202800585 Dr000876
 # with new agp and chrom sequence, there is also an instance of a * instead
 # of a co-ordinate (2004-12-02)
 # repeat- and trf-masking 1/chr1.fa
 # invalid signed number: "*"
     # mask contigs also
     foreach c (`cat chrom.lst`)
       echo "repeat- and trf-masking contigs of chr$c"
       foreach d ($c/chr*_?{,?})
         set ctg=$d:t
         set f=$d/$ctg.fa
         maskOutFa -soft $f $f.out $f
         maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
         maskOutFa $f hard $f.masked
       end
     end
 # same warnings here too:
 # WARNING: negative rEnd: -2229 chr18_3:2110746-2111903 Dr000158
 # WARNING: negative rEnd: -32 chrNA_41:1844381-1844442 Dr000876
 # repeat- and trf-masking contigs of chr1
 # invalid signed number: "*"  (2004-12-02) and the co-ordinates for chr18_3
 # are: WARNING: negative rEnd: -2229 chr18_3:1862490-1863647 Dr000158
 
     # Build nib files, using the soft masking in the fa
     mkdir nib
     foreach f (*/chr*.fa)
       faToNib -softMask $f nib/$f:t:r.nib
     end
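     # spot check the soft-masking (a sketch): the "lower" count reported by
     # faSize should roughly match the combined rmsk and trf coverage
     faSize */chr*.fa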
 
     # Re-do masking for chr1.fa. A user noticed that the masking for this 
     # is very low compared to other chroms (2% vs almost 50%). Re-running
     # maskOutFa and debugging revealed that the error reported above
     # "invalid signed number: "*" " is causing the program to abort 
     # so the chrom is incompletely masked. This row of the RepeatMasker file
     # (chr1.fa.out) is incorrect and the maskOutFa program fails when it 
     # tries to load this line into a data structure. 
     # Solution: remove the erroneous line (line 52575) and re-run maskOutFa
     ssh kkstore01
     mkdir /cluster/data/danRer2/tmpMask
     cd  /cluster/data/danRer2/tmpMask
     cp /cluster/data/danRer2/1/chr1.fa .
     cp /cluster/data/danRer2/1/chr1.fa.out .
     # make sure it is all upper case
     cat chr1.fa | tr '[a-z]' '[A-Z]' > chr1.tmp
     sed -e 's/>CHR1/>chr1/' chr1.tmp > chr1.fa
     # remove line 52575 from chr1.fa.out
     # re-run maskOutFa
     set trfCtg=/cluster/data/danRer2/bed/simpleRepeat/trfMask
     set trfChr=/cluster/data/danRer2/bed/simpleRepeat/trfMaskChrom
     
     foreach f (chr1.fa)
       echo "repeat- and trf-masking $f"
       maskOutFa -soft $f $f.out $f
       set chr = $f:t:r
       maskOutFa -softAdd $f $trfChr/$chr.bed $f
       echo "hard-masking $f"
       maskOutFa $f hard $f.masked
     end
     # then add chr1New as FASTA heading and copy to the directory for chr1
     sed -e 's/>chr1/>chr1New/' chr1.fa > chr1New.fa
     sed -e 's/>chr1/>chr1New/' chr1.fa.masked > chr1New.fa.masked
     mv chr1.fa.out /cluster/data/danRer2/1/chr1New.fa.out
     mv chr1New.fa /cluster/data/danRer2/1/
     mv chr1New.fa.masked /cluster/data/danRer2/1/
     cd ..
     rm -r tmpMask
     cd 1
     faSize chr1.fa
     # 62208023 bases (3421437 N's 58786586 real 57507587 upper 1278999 lower)
     # only 2.1% masked as lower case
     faSize chr1New.fa
     # 62208023 bases (3421437 N's 58786586 real 31874160 upper 26912426 lower)
     # 43% masked as lower case
     faSize chr1.fa.masked
     # 62208023 bases (4700436 N's 57507587 real 57507587 upper 0 lower)
     # 7.6% are Ns
     faSize chr1New.fa.masked
     # 62208023 bases (30333863 N's 31874160 real 31874160 upper 0 lower)
     # 49% are Ns
 
     # just use this new masked sequence for downloads and replace the
     # masked chr1 sequences for downloads - see DOWNLOADABLE SEQUENCE
     # section below
  
 # STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE, 2004-11-22, hartera)
 # RE-DONE (2004-12-02, hartera)
 # MOVE danRer2.2bit out of gbdb nib directory (2004-12-15, hartera)
     # Make symbolic links from /gbdb/danRer2/nib to the real nibs.
     ssh hgwdev
     cd /cluster/data/danRer2
     mkdir -p /gbdb/danRer2/nib
     foreach f (/cluster/data/danRer2/nib/chr*.nib)
       ln -s $f /gbdb/danRer2/nib
     end
     # Load /gbdb/danRer2/nib paths into database and save size info
     # hgNibSeq creates chromInfo table
     hgNibSeq -preMadeNib danRer2 /gbdb/danRer2/nib */chr*.fa
     echo "select chrom,size from chromInfo" | hgsql -N danRer2 > chrom.sizes
     # take a look at chrom.sizes, should be 28 lines
     wc chrom.sizes
                                              
     # Make one big 2bit file as well, and make a link to it in
     # /gbdb/danRer2/nib because hgBlat looks there:
     faToTwoBit */chr*.fa danRer2.2bit
     ln -s /cluster/data/danRer2/danRer2.2bit /gbdb/danRer2/nib/
     # Move this link out of nib directory (2004-12-15, hartera)
     rm /gbdb/danRer2/nib/danRer2.2bit
     ln -s /cluster/data/danRer2/danRer2.2bit /gbdb/danRer2/
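     # verify the 2bit (a sketch): names and sizes should match chromInfo
     twoBitInfo danRer2.2bit stdout | sort > /tmp/2bit.sizes
     sort chrom.sizes | diff - /tmp/2bit.sizes
     # expect no output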
     
 # MAKE GOLD AND GAP TRACKS (DONE, 2004-11-22, hartera)
 # RE-DONE (2004-12-03, hartera)
     ssh hgwdev
     cd /cluster/data/danRer2
     # the gold and gap tracks are created from the chrN.agp file and this is
     # the scaffolds or supercontigs agp 
     # move other agp files out of the way to an agps directory
     foreach c (`cat chrom.lst`)
        mkdir ./$c/agps
        mv ./$c/chr${c}.chunks.agp ./$c/agps/
        mv ./$c/chr${c}.scaffolds.agp ./$c/agps/
     end
     # move the scaffolds agp for NA to agps directory
    mv ./NA/scaffolds/chrNA.scaffolds.agp ./NA/agps/
     # the gold and gap tracks are created from the chrN.agp file
     hgGoldGapGl -noGl -chromLst=chrom.lst danRer2 /cluster/data/danRer2 .
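    # sanity check (a sketch): gold and gap together should tile each chrom
    featureBits danRer2 gold
    featureBits danRer2 gap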
 
 # MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR DANRER2
 # (DONE, 2004-11-22, hartera)
 # Correct nibPath for danRer2 in dbDb table (2004-12-15, hartera)
     # Make trackDb table so browser knows what tracks to expect:
     ssh hgwdev
     mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer2
                                                                                 
     cd $HOME/kent/src/hg/makeDb/trackDb
     cvs up -d -P
     # Edit that makefile to add danRer2 in all the right places and do
     make update
     make alpha
     cvs commit -m "Added danRer2." makefile
     # Add dbDb and defaultDb entries:
     echo 'insert into dbDb (name, description, nibPath, organism,  \
           defaultPos, active, orderKey, genome, scientificName,  \
           htmlPath, hgNearOk, hgPbOk, sourceName)  \
           values("danRer2", "June 2004", \
           "/gbdb/danRer1/nib", "Zebrafish", "chr2:16,330,443-16,335,196", 1, \
           37, "Zebrafish", "Danio rerio", \
           "/gbdb/danRer2/html/description.html", 0, 0, \
           "Sanger Centre, Danio rerio Sequencing Project Zv4");' \
     | hgsql -h genome-testdb hgcentraltest
     # set danRer2 to be the default assembly for Zebrafish
     echo 'update defaultDb set name = "danRer2" \
           where genome = "Zebrafish";' \
           | hgsql -h genome-testdb hgcentraltest
     # dbDb table has danRer1 instead of danRer2 in nib path. However, this
     # should point to the position of the danRer2.2bit file
     # (2004-12-15, hartera)
     echo 'update dbDb set nibPath="/gbdb/danRer2" \
          where name = "danRer2";' | hgsql hgcentraltest
 
 # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2004-11-22, hartera)
     ssh hgwdev
     mkdir /cluster/data/danRer2/html
     # make a symbolic link from /gbdb/danRer2/html to /cluster/data/danRer2/html
     ln -s /cluster/data/danRer2/html /gbdb/danRer2/html
     # Add a description page for zebrafish
     cd /cluster/data/danRer2/html
     cp $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer1/description.html .
     # Edit this for zebrafish danRer2
                                                                                 
     # create a description.html page here
     mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer2
     cd ~/kent/src/hg/makeDb/trackDb/zebrafish
     cvs add danRer2
     cvs commit danRer2
     # Add description page here too
     cd danRer2 
     cp /cluster/data/danRer2/html/description.html .
     cvs add description.html
     cvs commit -m "First draft of description page for danRer2." \
         description.html
 
 # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE, 2004-11-22, hartera)
 # RE-DONE (2004-12-03, hartera)
 # PUT MASKED SEQUENCE ON BLUEARC (DONE, 2004-12-22 and 2004-12-23, hartera)
     ssh kkr1u00
     # Chrom-level mixed nibs that have been repeat- and trf-masked:
     rm -rf /iscratch/i/danRer2/nib
     mkdir -p /iscratch/i/danRer2/nib
     cp -p /cluster/data/danRer2/nib/chr*.nib /iscratch/i/danRer2/nib
     # Pseudo-contig fa that have been repeat- and trf-masked:
     rm -rf /iscratch/i/danRer2/trfFa
     mkdir /iscratch/i/danRer2/trfFa
     foreach d (/cluster/data/danRer2/*/chr*_?{,?})
       cp -p $d/$d:t.fa /iscratch/i/danRer2/trfFa
     end
     rm -rf /iscratch/i/danRer2/rmsk
     mkdir -p /iscratch/i/danRer2/rmsk
     cp -p /cluster/data/danRer2/*/chr*.fa.out /iscratch/i/danRer2/rmsk
     cp -p /cluster/data/danRer2/danRer2.2bit /iscratch/i/danRer2/
     iSync
     # add to the bluearc
     ssh kksilo
     mkdir -p /cluster/bluearc/danRer2/nib
     cp -p /cluster/data/danRer2/nib/chr*.nib /cluster/bluearc/danRer2/nib
     mkdir -p /cluster/bluearc/danRer2/trfFa
     foreach d (/cluster/data/danRer2/*/chr*_?{,?})
       cp -p $d/$d:t.fa /cluster/bluearc/danRer2/trfFa
     end
 
 # CREATE gc5Base wiggle TRACK (DONE, 2004-12-03, hartera)
     ssh kksilo
     mkdir -p /cluster/data/danRer2/bed/gc5Base
     cd /cluster/data/danRer2/bed/gc5Base
     # The number of bases that hgGcPercent claimed it measured is calculated,
     # which is not necessarily always 5 if it ran into gaps, and then the 
     # division by 10.0 scales down the numbers from hgGcPercent to the range
     # [0-100].  wigEncode now replaces wigAsciiToBinary and the previous 
     # processing step between these two programs. The result file is *.wig.
     # Each value represents the measurement over five bases beginning with
     # <position>. wigEncode also calculates the zoomed set of data.
     
     nice hgGcPercent -wigOut -doGaps -file=stdout -win=5 danRer2 \
         /iscratch/i/danRer2/nib | \
         wigEncode stdin gc5Base.wig gc5Base.wib
     # load the .wig file back on hgwdev:
     ssh hgwdev
     cd /cluster/data/danRer2/bed/gc5Base
     hgLoadWiggle -pathPrefix=/gbdb/danRer2/wib/gc5Base \
                  danRer2 gc5Base gc5Base.wig
     # and symlink the .wib file into /gbdb
     mkdir -p /gbdb/danRer2/wib/gc5Base
     ln -s `pwd`/gc5Base.wib /gbdb/danRer2/wib/gc5Base
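     # quick check (a sketch): the wiggle table should be non-empty and the
     # wib file visible through /gbdb
     hgsql -N -e "select count(*) from gc5Base" danRer2
     ls -l /gbdb/danRer2/wib/gc5Base/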
 
 # MAKE 10.OOC, 11.OOC FILE FOR BLAT (DONE, 2004-12-03, hartera)
     # Use -repMatch=512 (based on size -- for human we use 1024, and
     # the zebrafish genome is ~50% of the size of the human genome).
     ssh kkr1u00
     mkdir /cluster/data/danRer2/bed/ooc
     cd /cluster/data/danRer2/bed/ooc
     mkdir -p /cluster/bluearc/danRer2
     ls -1 /cluster/data/danRer2/nib/chr*.nib > nib.lst
     blat nib.lst /dev/null /dev/null -tileSize=11 \
       -makeOoc=/cluster/bluearc/danRer2/11.ooc -repMatch=512
     # Wrote 44008 overused 11-mers to /cluster/bluearc/danRer2/11.ooc
     # For 10.ooc, repMatch = 4096 for human, so use 2048
     blat nib.lst /dev/null /dev/null -tileSize=10 \
       -makeOoc=/cluster/bluearc/danRer2/10.ooc -repMatch=2048
     # Wrote 10639 overused 10-mers to /cluster/bluearc/danRer2/10.ooc
     # keep copies of ooc files in this directory and copy to iscratch
     cp /cluster/bluearc/danRer2/*.ooc .
     cp -p /cluster/bluearc/danRer2/*.ooc /iscratch/i/danRer2/
     iSync
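     # repMatch arithmetic (a sketch): scale the human settings by genome size.
     # zebrafish (~1.6Gb) is about half of human (~3.1Gb), so:
     #   11-mers: 1024 * 0.5 = 512        10-mers: 4096 * 0.5 = 2048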
 
 # ADD CONTIGS TRACK (DONE, 2004-12-06, hartera)
 # make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from lift
 # RELOAD CONTIGS TRACK - NOT ALL CONTIGS HAD LOADED INTO DATABASE TABLE
 # (DONE, 2005-06-15, hartera)
     ssh kksilo
     mkdir -p /cluster/data/danRer2/bed/ctgPos2
     cd /cluster/data/danRer2/bed/ctgPos2
     # ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc
     foreach c (`cat /cluster/data/danRer2/chrom.lst`) 
          awk 'BEGIN {OFS="\t"} \
          {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
          /cluster/data/danRer2/$c/agps/chr${c}.chunks.agp >> ctgPos2.tab
     end
 
     ssh hgwdev
     cd /cluster/data/danRer2/bed/ctgPos2
     # Reload table (2005-06-16, hartera) - not all the contigs had loaded 
     # into the table because the unique primary key for ctgPos2.sql consisted
     # of only the first 14 characters of the contig name. However, contigs for 
     # danRer2 have names such as "Zv4_scaffold99.1" and "Zv4_scaffold99.3" for
     # which the first 14 characters are not unique so only the first one is
     # loaded. Also, there are cases where the contig has an accession and more
     # than one contig share the same accession as their name so names are not
     # unique. Therefore, this was changed in ctgPos2.sql so that the first
     # 20 characters of the contig name and the chromStart are the composite
     # primary key: PRIMARY KEY(contig(20),chromStart),
     echo "drop table ctgPos2" | hgsql danRer2
     hgsql danRer2 < ~/kent/src/hg/lib/ctgPos2.sql
     echo "load data local infile 'ctgPos2.tab' into table ctgPos2" \
          | hgsql danRer2
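     # verify the reload (a sketch): the row count should match ctgPos2.tab
     hgsql -N -e "select count(*) from ctgPos2" danRer2
     wc -l ctgPos2.tab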
 
 # Add trackDb.ra entry and ctgPos2.html
 
 # ENSEMBL GENES (DONE, 2004-12-06, hartera)
     ssh hgwdev
     mkdir -p /cluster/data/danRer2/bed/ensembl
     cd /cluster/data/danRer2/bed/ensembl
     # Get the ensembl protein data from
     # http://www.ensembl.org/Danio_rerio/martview
     # Follow this sequence through the pages:
     # Page 1) Make sure that the Danio_rerio choice is selected. Hit next.
     # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
     # Page 3) Choose the "Structures" box.
     # Page 4) Choose GTF as the ouput.  choose gzip compression.  hit export.
     # Save as ensemblGene.gtf.gz
     
     # the Ensembl gene predictions are mapped to chromosomes except for 
     # chrNA and chrUn. So, a lift file needs to be created to lift from the
     # scaffold to chromosome co-ordinates
    
     ssh kksilo
     mkdir -p /cluster/data/danRer2/bed/ensembl/liftSupertoChrom
     cd /cluster/data/danRer2/bed/ensembl/liftSupertoChrom
     # the lift files for scaffolds to chrom for NA and Un are already created
     cp /cluster/data/danRer2/Un/chrUn.lft .
     cp /cluster/data/danRer2/NA/chrNA.lft .
     cat *.lft >> liftUnScaffoldToChrom.lft    
 
     # get chrUn and chrNA records 
     cd /cluster/data/danRer2/bed/ensembl
     awk '$1 ~ /^Zv4_NA[0-9]+/ || $1 ~ /^Zv4_scaffold[0-9]+/' ensemblGene.gtf \
                     > ensemblGenechrUns.gtf
     # get records for all other chroms
     awk '$1 ~ /^[0-9]+/' ensemblGene.gtf > ensemblGenechroms.gtf
                                     
     liftUp -type=.gtf ensemblGenechrUns.lifted \
          ./liftSupertoChrom/liftUnScaffoldToChrom.lft warn ensemblGenechrUns.gtf
     # Got 38882 lifts in ./liftSupertoChrom/liftUnScaffoldToChrom.lft
     sed -e "s/^/chr/" ensemblGenechroms.gtf > ensGene.gtf
     cat ensemblGenechrUns.lifted >> ensGene.gtf
     # check some of the lifted co-ordinates
     # load into database
     ssh hgwdev
     cd /cluster/data/danRer2/bed/ensembl
     /cluster/bin/i386/ldHgGene danRer2 ensGene \
             /cluster/data/danRer2/bed/ensembl/ensGene.gtf
     # Read 32062 transcripts in 592139 lines in 1 files
     # 32062 groups 27 seqs 1 sources 4 feature types
     # 32062 gene predictions
 
     # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
     # hgKnownToSuper.  Use ensMart to create it as above, except:
     # Page 3) Choose the "Features" box. In "Ensembl Attributes", check
     # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
     # Choose Text, tab-separated as the output format.  Result name ensGtp.
     gunzip ensGtp.tsv.gz
     hgsql danRer2 < ~/kent/src/hg/lib/ensGtp.sql
     # remove the header line from ensGtp.tsv before loading
     echo "load data local infile 'ensGtp.tsv' into table ensGtp" \
          | hgsql -N danRer2
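     # cross-check (a sketch; assumes the ensGene name column holds transcript
     # IDs): every loaded transcript should have an ensGtp row
     hgsql -N -e "select count(*) from ensGene g left join ensGtp t on g.name = t.transcript where t.transcript is null" danRer2
     # expect: 0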
 
     # Get the ensembl peptide sequences from
     # http://www.ensembl.org/Danio_rerio/martview
     # Choose Danio rerio as the organism
     # Follow this sequence through the pages:
     # Page 1) Choose the Ensembl Genes choice. Hit next.
     # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
     # Page 3) Choose the "Sequences" tab.
     # Page 4) Choose Transcripts/Proteins and peptide Only as the output,
     # choose text/fasta and gzip compression,
     # name the file ensemblPep.fa.gz and then hit export.
 
     gunzip ensemblPep.fa.gz
     hgPepPred danRer2 ensembl ensemblPep.fa
 
 # AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2004-12-06, hartera)
     # sequences already downloaded for danRer1
     ssh hgwdev
     cd /projects/compbio/data/microarray/affyZebrafish
 
     cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
        /cluster/bluearc/affy/
     # Set up cluster job to align Zebrafish consensus sequences to danRer2
     ssh kkr1u00
     mkdir -p /cluster/data/danRer2/bed/affyZebrafish.2004-12-06
     ln -s /cluster/data/danRer2/bed/affyZebrafish.2004-12-06 \
           /cluster/data/danRer2/bed/affyZebrafish
     cd /cluster/data/danRer2/bed/affyZebrafish
     mkdir -p /iscratch/i/affy
     cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy
     iSync
 
     ssh kk
     cd /cluster/data/danRer2/bed/affyZebrafish
     ls -1 /iscratch/i/affy/Zebrafish_consensus.fa > affy.lst
     ls -1 /iscratch/i/danRer2/trfFa/ > genome.lst
                                                                                
     cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/danRer2/11.ooc /iscratch/i/danRer2/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
     # << this line keeps emacs coloring happy
     
     gensub2 genome.lst affy.lst template.sub para.spec
     mkdir psl
     para create para.spec
     para try, check, push ... etc.
 # para time
 # Completed: 299 of 299 jobs
 # CPU time in finished jobs:       4702s      78.37m     1.31h    0.05d  0.000 y
 # IO & Wait Time:                  3077s      51.28m     0.85h    0.04d  0.000 y
 # Average job time:                  26s       0.43m     0.01h    0.00d
 # Longest job:                      128s       2.13m     0.04h    0.00d
 # Submission to last job:           685s      11.42m     0.19h    0.01d
     ssh kksilo
     cd /cluster/data/danRer2/bed/affyZebrafish
     # Do sort, best in genome filter, and convert to chromosome coordinates
     # to create affyZebrafish.psl
     pslSort dirs raw.psl tmp psl
     # only use alignments that have at least
     # 95% identity in aligned region.
     # do not use minCover since a lot of sequence is in Un, NA and Finished,
     # so genes may be split up and it is good to see all alignments
     pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
     liftUp affyZebrafish.psl ../../jkStuff/liftAll.lft warn contig.psl
     # shorten names in psl file
     sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.bak
     mv affyZebrafish.psl.bak affyZebrafish.psl
     pslCheck affyZebrafish.psl
     # psl is good
     # load track into database
     ssh hgwdev
     cd /cluster/data/danRer2/bed/affyZebrafish
     hgLoadPsl danRer2 affyZebrafish.psl
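     # quick check (a sketch): how many consensus sequences aligned
     hgsql -N -e "select count(distinct qName) from affyZebrafish" danRer2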
                                                                                
     # Add consensus sequences for Zebrafish chip
     # Copy sequences to gbdb if they are not there already
     mkdir -p /gbdb/hgFixed/affyProbes
     ln -s \
        /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
        /gbdb/hgFixed/affyProbes
                                                                                
     hgLoadSeq -abbr=Zebrafish: danRer2 \
               /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
     # Clean up
     rm batch.bak contig.psl raw.psl
     # add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/zebrafish/danRer2
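     # a minimal stanza for trackDb.ra (a sketch; group, priority and labels
     # are placeholders to adjust):
     #   track affyZebrafish
     #   shortLabel Affy Zebrafish
     #   longLabel Alignments of Affymetrix Consensus Sequences from Zebrafish Chip
     #   group regulation
     #   priority 89
     #   visibility hide
     #   type psl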
 
 # RH MAP TRACK (DONE, 2004-12-07, hartera)
 # create sym links to RH sequences in /gbdb/danRer2 (2004-12-17, hartera)
 # RE-DO RH MAP TRACK WITH MORE STRINGENT POST-BLAT FILTERING 
 # (DONE, 2005-05-15, hartera)
 
     # Data from Leonard Zon's lab at the Childrens Hospital, Boston
     # Radiation hybrid (RH) map data consists of 8707 genomic sequences:
     #   (1) 2835 BAC clone ends (Max Planck Inst., Robert
     #   Geisler's lab, Tuebingen, Germany),
     #   (2) 2015 NCBI ESTs (submitted from around the planet),
     #   (3) 171 RH shotgun sequences (Max Planck Inst. & Children's
     #   Hosp. Boston),
     #   (4) 3686 WZ compiled ESTs from WashU
     #
     ssh kkr1u00
     mkdir -p /cluster/data/danRer2/bed/ZonLab/rhMap
     cd /cluster/data/danRer2/bed/ZonLab/rhMap
     
   # sequences for the RH map are in /cluster/data/danRer1/bed/ZonLab/rhmap/seq/rh.fa
   # rhcontigs.fa is the sequences all in one file with formatted headers
     # copy to iscratch if not there already
     mkdir -p /iscratch/i/danRer2/rhmap
     cp -p /cluster/data/danRer1/bed/ZonLab/rhmap/seq/rhcontigs.fa \
           /iscratch/i/danRer2/rhmap    
     iSync
     # do the blat run to map RH map sequences to danRer2
     ssh kk
     cd /cluster/data/danRer2/bed/ZonLab/rhMap
     mkdir psl
     ls -1S  /iscratch/i/danRer2/rhmap/rhcontigs.fa > rhmap.lst
     ls -1S /iscratch/i/danRer2/trfFa/*.fa > genome.lst
 # try same parameters as for bacEnds
     cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -tileSize=10 -ooc=/iscratch/i/danRer2/10.ooc {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << this line keeps emacs coloring happy
      gensub2 genome.lst rhmap.lst gsub spec
      para create spec
      para try
      para check
      para push, try ... etc.
 # Completed: 299 of 299 jobs
 # CPU time in finished jobs:       7977s     132.94m     2.22h    0.09d  0.000 y
 # IO & Wait Time:                  3114s      51.91m     0.87h    0.04d  0.000 y
 # Average job time:                  37s       0.62m     0.01h    0.00d
 # Longest job:                       54s       0.90m     0.01h    0.00d
 # Submission to last job:           155s       2.58m     0.04h    0.00d
 
     ssh kksilo
     cd /cluster/data/danRer2/bed/ZonLab/rhMap
     # Make & check the psl table
     # Do sort, best in genome filter, and convert to chromosome coordinates
     # to create rhmap.psl
     pslSort dirs raw.psl tmp psl
     # only use alignments that have at least 80% identity in aligned region.
     # these are the parameters used for STS markers
     # pslReps -nearTop=0.0001 -minAli=0.8 -noIntrons raw.psl \
     #        contig.psl /dev/null
     # Processed 667164 alignments
     # try more stringent filtering for these:  minAli=0.96 and minCover=0.40
     # was suggested by Jim (DONE, 2005-05-15, hartera) - these parameters
     # worked well for danRer1 - see makeDanRer1.doc
     mv rhMap.psl rhMap.psl.old
     mv contig.psl contig.psl.old
     pslReps -nearTop=0.0001 -minAli=0.96 -minCover=0.40 raw.psl \
             contig.psl /dev/null
     # Processed 667164 alignments
     liftUp rhMap.psl /cluster/data/danRer2/jkStuff/liftAll.lft warn contig.psl
     pslCheck rhMap.psl
     # psl is ok.      
     # With the previous parameters, the sequence with the most alignments
     # had 1128; over 1400 sequences had multiple alignments and 7064 had
     # a single alignment. Now the most is 8: 78 sequences have 3-8
     # alignments, 820 have 2 and 6709 have 1 alignment.
     # The new filtered psl has 8609 alignments; there were 13821 with the
     # previous parameters (8492 sequences aligned, 98%).
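     # (sketch) the per-sequence alignment counts above can be recomputed
     # from the filtered psl, assuming it has no psLayout header lines:
     cut -f 10 rhMap.psl | sort | uniq -c | awk '{print $1}' | sort -n | uniq -c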
     # Load sequence alignments into database.
     # Reload rhMap alignments - more stringent filtering (hartera, 2005-05-15)
     ssh hgwdev
     cd /cluster/data/danRer2/bed/ZonLab/rhMap
     echo "drop table rhMap;" | hgsql danRer2
     hgLoadPsl danRer2 rhMap.psl
     hgsql -N -e "select qName from rhMap;" danRer2 | sort | uniq | wc
     # 7607 out of 8690 sequences aligned (88%)
     # Add RH Map sequences
     # Copy sequences to gbdb if they are not there already
     # create sym link in /gbdb/danRer2 directory instead of /gbdb/danRer1
     # (2004-12-17, hartera)
     mkdir -p /gbdb/danRer2/rhMap
     ln -s \
       /cluster/data/danRer1/bed/ZonLab/rhmap/seq/rhcontigs.fa \
        /gbdb/danRer2/rhMap
     cd /cluster/data/danRer2/bed/ZonLab/rhMap
     hgLoadSeq danRer2 /gbdb/danRer1/rhmap/rhcontigs.fa
     # change extFile entry
     echo 'update extFile set path = "/gbdb/danRer2/rhMap/rhcontigs.fa" where id = 15513;' | hgsql danRer2
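     # (sketch) the extFile id used above can be looked up first rather
     # than hard-coded:
     hgsql -N -e 'select id, path from extFile where path like "%rhcontigs%";' danRer2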
 
 # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR DANRER2
 # (DONE, 2004-12-08, hartera)
    # hgcentraltest is now on hgwdev                                            
     ssh hgwdev
    # DNA port is "0", trans prot port is "1"
    echo 'insert into blatServers values("danRer2", "blat14", "17789", "0", "0"); insert into blatServers values("danRer2", "blat14", "17788", "1", "0");' \
     | hgsql hgcentraltest
     # if you need to delete those entries
     echo 'delete from blatServers where db="danRer2";' \
     | hgsql hgcentraltest
     # to check the entries:
     echo 'select * from blatServers where db="danRer2";' \
     | hgsql hgcentraltest
 
 # AUTO UPDATE GENBANK MRNA AND EST RUN  (DONE, 2004-12-09, hartera)
     ssh eieio
     cd /cluster/data/genbank
     # This is a new assembly, edit the etc/genbank.conf file and add:
 # danRer2 (zebrafish)
 danRer2.genome = /iscratch/i/danRer2/nib/chr*.nib
 danRer2.lift = /cluster/data/danRer2/jkStuff/liftAll.lft
 danRer2.downloadDir = danRer2
     # Default includes native genbank mRNAs and ESTs,
     # genbank xeno mRNAs but no xenoESTs, native RefSeq mRNAs but
     # not xeno RefSeq
     cvs commit -m "Added danRer2" etc/genbank.conf
     # No need to edit ~/kent/src/hg/makeDb/genbank/src/align/gbBlat
     # since the 11.ooc file made for danRer1 is good for both assemblies:
     # no new sequence was added, just improved mapping of previously
     # unmapped sequence to chromosomes.
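     # For reference, an ooc file of this kind is built with blat's
     # -makeOoc option; sketch only (the nib path and repMatch value here
     # are assumptions, not necessarily what was used for danRer1):
     # blat /cluster/data/danRer2/nib/*.nib /dev/null /dev/null \
     #      -tileSize=11 -makeOoc=11.ooc -repMatch=1024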
 
     # ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
     # danRer genome information
     # Install to /cluster/data/genbank:
     make install-server
 
     ssh eieio
     cd /cluster/data/genbank
     # This is an -initial run, all sequences:
     nice bin/gbAlignStep -verbose=1 -initial danRer2
  
     # Load results for all sequences:
     ssh hgwdev
     cd /cluster/data/genbank
     nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad danRer2
     # Results need to be moved out of the way:
     cd /cluster/data/genbank/work
     mv initial.danRer2 initial.danRer2.allseqs
 
 # BLASTZ SWAP FOR hg17 vs. danRer2 BLASTZ TO CREATE danRer2 vs. hg17
 # (DONE, 2004-12-09, hartera)
 # USE RESCORED ALIGNMENTS (see makeHg17.doc)
     ssh kolossus
     mkdir -p /cluster/data/danRer2/bed/blastz.hg17.swap
     cd /cluster/data/danRer2/bed/blastz.hg17.swap
     # use rescored axtChrom from blastzDanRer2 on hg17
     set aliDir = /cluster/data/hg17/bed/blastz.danRer2
     cp $aliDir/S1.len S2.len
     cp $aliDir/S2.len S1.len
     mkdir unsorted axtChrom
     cat $aliDir/axtChrom/chr*.axt \
     | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
     | axtSplitByTarget stdin unsorted
     # Sort the shuffled .axt files.
     foreach f (unsorted/*.axt)
       echo sorting $f:t:r
       axtSort $f axtChrom/$f:t
     end
     du -sh $aliDir/axtChrom unsorted axtChrom
     # 1.1G    /cluster/data/hg17/bed/blastz.danRer2/axtChrom
     # 1.1G    unsorted
     # 1.1G    axtChrom
 
     rm -r unsorted
     # translate sorted axt files into psl
     cd /cluster/data/danRer2/bed/blastz.hg17.swap
     mkdir -p pslChrom
     set tbl = "blastzHg17"
     foreach f (axtChrom/chr*.axt)
       set c=$f:t:r
       echo "Processing chr $c"
       /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
     end
     # Load database tables
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.hg17.swap/pslChrom
     foreach f (./*.psl)
       /cluster/bin/i386/hgLoadPsl danRer2 $f
       echo "$f Done"
     end
 
 # CHAIN HUMAN (hg17) BLASTZ (DONE, 2004-12-10, hartera)
 # APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS
 # AND DEGENERATE DNA (DONE, 2004-12-21, hartera)
     # Run axtChain on little cluster
     ssh kki
     cd /cluster/data/danRer2/bed/blastz.hg17.swap
     mkdir -p axtChain/run1
     cd axtChain/run1
     mkdir out chain
     # Reuse gap penalties from hg16 vs chicken run.
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize       11
smallSize       111
position        1       2       3       11      111     2111    12111   32111   72111   152111  252111
qGap    325     360     400     450     600     1100    3600    7600    15600   31600   56600
tGap    325     360     400     450     600     1100    3600    7600    15600   31600   56600
bothGap 625     660     700     750     900     1400    4000    8000    16000   32000   57000
'_EOF_'
     # << this line makes emacs coloring happy
     cat << '_EOF_' > doChain
 #!/bin/csh
 axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
     -linearGap=../../chickenHumanTuned.gap $1 \
     /iscratch/i/danRer2/nib \
     /iscratch/i/gs.18/build35/bothMaskedNibs $2 >& $3
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x doChain
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # create input list and batch for cluster run
     ls -1S /cluster/data/danRer2/bed/blastz.hg17.swap/axtChrom/*.axt \
         > input.lst
     gensub2 input.lst single gsub jobList
     para create jobList
     para try, check, push, check...
 # para time
 # Completed: 28 of 28 jobs
 # CPU time in finished jobs:       2416s      40.27m     0.67h    0.03d  0.000 y
 # IO & Wait Time:                   351s       5.85m     0.10h    0.00d  0.000 y
 # Average job time:                  99s       1.65m     0.03h    0.00d
 # Longest job:                      144s       2.40m     0.04h    0.00d
 # Submission to last job:           524s       8.73m     0.15h    0.01d
     # now on the cluster server, sort chains
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
     chainMergeSort run1/chain/*.chain > all.chain
     chainSplit chain all.chain
     # take a look at score distr's
     foreach f (chain/*.chain)
       grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r >> hist5000.out
       textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
       echo ""
     end
 # only chrNA has a large number of chains with score <= 5000 (>200,000 chains)
 # chrUn has about 90,000 chains with score <=5000
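    # (sketch) the low-score counts can be confirmed directly from the
    # chain headers, which look like "chain <score> <tName> ...":
    foreach f (chain/*.chain)
      echo $f:t:r
      awk '$1 == "chain" && $2 <= 5000' $f | wc -l
    end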
     
     # apply filter with minScore=5000
     rm -r chain
     mv all.chain all.chain.unfiltered
     chainFilter -minScore=5000 all.chain.unfiltered > all.chain
     chainSplit chain all.chain
 
     # remove repeats from chains and reload into database
     # (2004-12-21, hartera)
     
     mv chain chainRaw
     mkdir chain
     cd chainRaw
     foreach f (*.chain)
        set c = $f:r
        echo $c 
        nice chainAntiRepeat /iscratch/i/danRer2/nib \
                        /iscratch/i/gs.18/build35/bothMaskedNibs $f \
                        ../chain/$c.chain
     end
     cd ..  
     chainMergeSort ./chain/*.chain > all.chain.antirepeat
     chainSplit chainAR all.chain.antirepeat
     # load filtered chains and check
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain/chainAR
     foreach i (*.chain)
         set c = $i:r
         hgLoadChain danRer2 ${c}_chainHg17 $i
         echo done $c
     end
 # featureBits -chrom=chr1 danRer2 refGene:cds chainHg17 -enrichment
 # refGene:cds 0.512%, chainHg17 34.242%, both 0.444%, cover 86.63%, 
 # enrich 2.53x
 # featureBits -chrom=chr1 danRer2 refGene:cds chainHg17Link -enrichment
 # refGene:cds 0.512%, chainHg17Link 4.760%, both 0.399%, cover 77.80%, 
# enrich 16.3x
 # featureBits -chrom=chr1 danRer1 refGene:cds chainHg17Link -enrichment
 # refGene:cds 0.529%, chainHg17Link 3.924%, both 0.409%, cover 77.24%, 
 # enrich 19.69x
 # number of rows for chr1:
 # chainHg17Link 185694
 # after using chainAntiRepeat
 # chainHg17ARLink 176455
 # after chainAntiRepeat done on filtered chains:
 # featureBits -chrom=chr1 danRer2 refGene:cds chainHg17ARLink -enrichment
 # refGene:cds 0.512%, chainHg17ARLink 4.625%, both 0.398%, cover 77.79%, 
 # enrich 16.82x
 
 # NET HUMAN (hg17) BLASTZ (DONE, 2004-12-10,hartera)
 # RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-21,hartera)
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
     mkdir preNet
     cd chainAR
     foreach i (*.chain)
        echo preNetting $i
        /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
                                      ../preNet/$i
     end
     cd ..
     mkdir n1
     cd preNet
     foreach i (*.chain)
       set n = $i:r.net
       echo primary netting $i
       /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
                                  ../n1/$n /dev/null
     end
     cd ..
     cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
     # memory usage 126734336, utime 657 s/100, stime 69
     # Add classification info using db tables:
     # netClass looks for ancient repeats in one of the databases
     # hg17 has this table - hand-curated by Arian but this is for
     # human-rodent comparisons so do not use here, use -noAr option
     # linSpecRep.notInHuman and linSpecRep.notInZebrafish exist 
     # (see makeHg17.doc)
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
     time netClass noClass.net danRer2 hg17 humanhg17.net \
          -tNewR=/cluster/bluearc/danRer2/linSpecRep.notInHuman \
          -qNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish -noAr
     # 87.940u 51.110s 4:05.95 56.5%   0+0k 0+0io 631pf+0w 
     # load net into database
     netFilter -minGap=10 humanhg17.net |  hgLoadNet danRer2 netHg17 stdin
 # featureBits danRer1 refGene:cds netHg17 -enrichment
 # refGene:cds 0.462%, netHg17 27.868%, both 0.384%, cover 83.21%, enrich 2.99x
 # featureBits danRer2 refGene:cds netHg17 -enrichment
 # refGene:cds 0.468%, netHg17 30.608%, both 0.406%, cover 86.82%, enrich 2.84x
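# Note: in these featureBits -enrichment lines, "cover" is both/refGene:cds
# and "enrich" is cover divided by the track's genome fraction, e.g.
# 86.82 / 30.608 = 2.84x for netHg17 on danRer2.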
 
 # BLASTZ SWAP FOR mm5 vs danRer2 BLASTZ TO CREATE danRer2 vs mm5 BLASTZ
 # (DONE, 2004-12-13, hartera)
 
     ssh kolossus
     mkdir -p /cluster/data/danRer2/bed/blastz.mm5.swap
     cd /cluster/data/danRer2/bed/blastz.mm5.swap
     # use rescored axtChrom from blastzDanRer2 on mm5
     set aliDir = /cluster/data/mm5/bed/blastz.danRer2
     cp $aliDir/S1.len S2.len
     cp $aliDir/S2.len S1.len
     mkdir unsorted axtChrom
     cat $aliDir/axtChrom/chr*.axt \
     | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
     | axtSplitByTarget stdin unsorted
     # Sort the shuffled .axt files.
     foreach f (unsorted/*.axt)
       echo sorting $f:t:r
       axtSort $f axtChrom/$f:t
     end
     du -sh $aliDir/axtChrom unsorted axtChrom
     # 919M    /cluster/data/mm5/bed/blastz.danRer2/axtChrom
     # 921M    unsorted
     # 919M    axtChrom
     rm -r unsorted
     # translate sorted axt files into psl
     ssh kolossus
     cd /cluster/data/danRer2/bed/blastz.mm5.swap
     mkdir -p pslChrom
     set tbl = "blastzMm5"
     foreach f (axtChrom/chr*.axt)
       set c=$f:t:r
       echo "Processing chr $c"
       /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
     end
     # Load database tables
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.mm5.swap/pslChrom
     foreach f (./*.psl)
       /cluster/bin/i386/hgLoadPsl danRer2 $f
       echo "$f Done"
     end
 
 # CHAIN MOUSE (mm5) BLASTZ (DONE, 2004-12-13, hartera)
 # APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS
 # AND DEGENERATE DNA (DONE, 2004-12-21, hartera)
     # Run axtChain on little cluster
     ssh kki
     cd /cluster/data/danRer2/bed/blastz.mm5.swap
     mkdir -p axtChain/run1
     cd axtChain/run1
     mkdir out chain
     ls -1S /cluster/data/danRer2/bed/blastz.mm5.swap/axtChrom/*.axt \
                     > input.lst
     cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
 #ENDLOOP
 '_EOF_'
 
     # << this line makes emacs coloring happy
 
 # Reuse gap penalties from hg16 vs chicken run.
    cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize       11
smallSize       111
position        1       2       3       11      111     2111    12111   32111   72111   152111  252111
qGap    325     360     400     450     600     1100    3600    7600    15600   31600   56600
tGap    325     360     400     450     600     1100    3600    7600    15600   31600   56600
bothGap 625     660     700     750     900     1400    4000    8000    16000   32000   57000
'_EOF_'
     # << this line makes emacs coloring happy
 # create chains with only default filtering on score - minScore = 1000
 cat << '_EOF_' > doChain
 #!/bin/csh
 axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
          -linearGap=../../chickenHumanTuned.gap $1 \
     /iscratch/i/danRer2/nib \
     /cluster/bluearc/scratch/mus/mm5/softNib $2 >& $3
 '_EOF_'
     chmod a+x doChain
     gensub2 input.lst single gsub jobList
     para create jobList
     para try, check, push, check...
 # para time
 # Completed: 28 of 28 jobs
 # CPU time in finished jobs:       2405s      40.08m     0.67h    0.03d  0.000 y
 # IO & Wait Time:                   447s       7.46m     0.12h    0.01d  0.000 y
 # Average job time:                 102s       1.70m     0.03h    0.00d
 # Longest job:                      147s       2.45m     0.04h    0.00d
 # Submission to last job:           523s       8.72m     0.15h    0.01d
 
    # now on the cluster server, sort chains
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain
     chainMergeSort run1/chain/*.chain > all.chain
     chainSplit chain all.chain
 
     # take a look at score distr's
     foreach f (chain/*.chain)
       grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r >> hist5000.out
       textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
       echo ""
     end
     # only chrUn and chrNA have >100,000 chains with score <=5000
     # filter on minScore=5000
     mv all.chain all.chain.unfiltered
     chainFilter -minScore=5000 all.chain.unfiltered > all.chainFilt5k
     rm -r chain
     chainSplit chain all.chainFilt5k
 
     # remove repeats from chains and reload into database
     # (2004-12-21, hartera)
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain
     mv chain chainRaw
     mkdir chain
     cd chainRaw
     foreach f (*.chain)
        set c = $f:r
        echo $c 
        nice chainAntiRepeat /iscratch/i/danRer2/nib \
                        /cluster/bluearc/scratch/mus/mm5/softNib $f \
                        ../chain/$c.chain
     end
     cd ..  
     chainMergeSort ./chain/*.chain > all.chain.antirepeat
     chainSplit chainAR all.chain.antirepeat
     # load chains into database
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain/chainAR
     foreach i (*.chain)
         set c = $i:r
         hgLoadChain danRer2 ${c}_chainMm5 $i
         echo done $c
     end
 
 # featureBits -chrom=chr1 danRer2 refGene:cds chainMm5 -enrichment
 # refGene:cds 0.512%, chainMm5 34.341%, both 0.436%, cover 85.08%, enrich 2.48x
 # featureBits -chrom=chr1 danRer2 refGene:cds chainMm5Link -enrichment
 # refGene:cds 0.512%, chainMm5Link 4.588%, both 0.395%, cover 77.08%, 
 # enrich 16.80x
 # featureBits -chrom=chr1 danRer1 refGene:cds chainMm5 -enrichment
 # refGene:cds 0.529%, chainMm5 42.554%, both 0.459%, cover 86.70%, enrich 2.04x
 # featureBits -chrom=chr1 danRer1 refGene:cds chainMm5Link -enrichment
 # refGene:cds 0.529%, chainMm5Link 7.948%, both 0.412%, cover 77.80%, 
 # enrich 9.79x
 # after chainAntiRepeats:
 # featureBits -chrom=chr1 danRer2 refGene:cds chainMm5Link -enrichment
 # refGene:cds 0.512%, chainMm5Link 4.499%, both 0.395%, cover 77.08%, 
 # enrich 17.13x
  
 # before chainAntiRepeat:
 # chr1_chainMm5Link 167661
 # after chainAntiRepeat:
 # chr1_chainMm5Link 162853
 
 # NET MOUSE (mm5) BLASTZ (DONE, 2004-12-13, hartera)
 # RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-21,hartera)
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain
     mkdir preNet
     cd chainAR 
     foreach i (*.chain)
        echo preNetting $i
        /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
                                      ../preNet/$i
     end
     cd ..
     mkdir n1
     cd preNet
     foreach i (*.chain)
       set n = $i:r.net
       echo primary netting $i
       /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
                                  ../n1/$n /dev/null
     end
     cd ..
     cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
     # memory usage 119873536, utime 849 s/100, stime 81
     # Add classification info using db tables:
     # netClass looks for ancient repeats in one of the databases
     # hg17 has this table - hand-curated by Arian but this is for
     # human-rodent comparisons so do not use here, use -noAr option
     mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish
     mkdir -p /cluster/bluearc/danRer2/linSpecRep.notInMouse
     cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \
        /cluster/bluearc/mm5/linSpecRep.notInZebrafish
     cp /iscratch/i/danRer2/linSpecRep.notInMouse/* \
        /cluster/bluearc/danRer2/linSpecRep.notInMouse
 
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain
     time netClass noClass.net danRer2 mm5 mouseMm5.net \
           -tNewR=/cluster/bluearc/danRer2/linSpecRep.notInMouse \
           -qNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish -noAr
     # 86.210u 51.710s 4:02.29 56.9%   0+0k 0+0io 206pf+0w
     netFilter -minGap=10 mouseMm5.net |  hgLoadNet danRer2 netMm5 stdin
 
 # featureBits danRer2 refGene:cds netMm5 -enrichment
 # refGene:cds 0.468%, netMm5 30.706%, both 0.404%, cover 86.40%, enrich 2.81x
 # featureBits danRer1 refGene:cds netMm5 -enrichment
 # refGene:cds 0.461%, netMm5 36.622%, both 0.395%, cover 85.70%, enrich 2.34x
 # add html and trackDb.ra entries 
 # after chainAntiRepeat:
 # featureBits danRer2 refGene:cds netMm5 -enrichment
 # refGene:cds 0.468%, netMm5 30.565%, both 0.405%, cover 86.40%, enrich 2.83x  
 
 # MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2004-12-14, hartera)
 # CORRECTION MADE TO chrM.agp FILE - MADE TAB DELIMITED INSTEAD OF
 # SPACE DELIMITED SO REPLACE OLD AGP FILE IN DOWNLOADS 
 # (DONE, 2005-04-25, hartera)
 # REPLACE chr1 WITH chr1New IN chromFa and chromFaMasked - chr1 was
 # incompletely masked due to an error in the RepeatMasker output for chr1.
 # See section on "MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF"
 # (2005-06-06, hartera)
 # UPDATED README FILES WITH NEW SEQUENCE FTP FOR SANGER (2005-08-04,hartera)
  
     ssh kksilo
     cd /cluster/data/danRer2
     #- Build the .zip files
     cat << '_EOF_' > jkStuff/zipAll.csh
 rm -rf zip
 mkdir zip
 # chrom AGP's
 zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp
 # chrom RepeatMasker out files
 zip -j zip/chromOut.zip */chr*.fa.out
 # soft masked chrom fasta
 zip -j zip/chromFa.zip */chr*.fa
 # hard masked chrom fasta
 zip -j zip/chromFaMasked.zip */chr*.fa.masked
 # chrom TRF output files
 cd bed/simpleRepeat
 zip ../../zip/chromTrf.zip trfMaskChrom/chr*.bed
 cd ../..
 # get GenBank native mRNAs
 cd /cluster/data/genbank
 ./bin/i386/gbGetSeqs -db=danRer2 -native GenBank mrna \
         /cluster/data/danRer2/zip/mrna.fa
 # get GenBank xeno mRNAs
 ./bin/i386/gbGetSeqs -db=danRer2 -xeno GenBank mrna \
         /cluster/data/danRer2/zip/xenoMrna.fa
 # get native RefSeq mRNAs
 ./bin/i386/gbGetSeqs -db=danRer2 -native refseq mrna \
 /cluster/data/danRer2/zip/refMrna.fa
 # get native GenBank ESTs
 ./bin/i386/gbGetSeqs -db=danRer2 -native GenBank est \
 /cluster/data/danRer2/zip/est.fa
                                                                                 
 cd /cluster/data/danRer2/zip
 # zip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs
 zip -j mrna.zip mrna.fa
 zip -j xenoMrna.zip xenoMrna.fa
 zip -j refMrna.zip refMrna.fa
 zip -j est.zip est.fa
 '_EOF_'
     # << this line makes emacs coloring happy
     csh ./jkStuff/zipAll.csh |& tee ./jkStuff/zipAll.log
     cd zip
     
     #- Look at zipAll.log to make sure all file lists look reasonable.
     # Make upstream files and Copy the .zip files to
     # hgwdev:/usr/local/apache/...
     ssh hgwdev
     cd /cluster/data/danRer2/zip
     # make upstream files for zebrafish RefSeq
     featureBits danRer2 refGene:upstream:1000 -fa=upstream1000.fa
     zip upstream1000.zip upstream1000.fa
     featureBits danRer2 refGene:upstream:2000 -fa=upstream2000.fa
     zip upstream2000.zip upstream2000.fa
     #- Check zip file integrity:
     foreach f (*.zip)
       unzip -t $f > $f.test
       tail -1 $f.test
     end
     wc -l *.zip.test
     set gp = /usr/local/apache/htdocs/goldenPath/danRer2
     mkdir -p $gp/bigZips
     cp -p *.zip $gp/bigZips
     mkdir -p $gp/chromosomes
     foreach f (../*/chr*.fa)
        zip -j $gp/chromosomes/$f:t.zip $f
     end
     cd $gp/bigZips
     md5sum *.zip > md5sum.txt
     cd $gp/chromosomes
     md5sum *.zip > md5sum.txt
     # Take a look at bigZips/* and chromosomes/*, update their README.txt's
 
     # Remake just the chromAgp.zip with correct chrM.agp (2005-04-25,hartera)
     ssh kksilo
     cd /cluster/data/danRer2
     rm /cluster/data/danRer2/zip/chromAgp.zip
 # chrom AGP's
 zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp
     ssh hgwdev
     cd /cluster/data/danRer2/zip
     #- Check zip file integrity:
     foreach f (chromAgp.zip)
       unzip -t $f > $f.test
       tail -1 $f.test
     end
     wc -l chromAgp.zip.test
     # looks good
     set gp = /usr/local/apache/htdocs/goldenPath/danRer2
     rm $gp/bigZips/chromAgp.zip
     rm $gp/bigZips/md5sum.txt
     cp -p chromAgp.zip $gp/bigZips
     # go to directory with zip files and remake the md5sum.txt file
     cd $gp/bigZips
     md5sum *.zip > md5sum.txt
 
     # Remake chr1 zips for masked file with chr1New.fa (2005-06-06, hartera)
     ssh kkstore01
     cd /cluster/data/danRer2
     # move old chr1 files temporarily out of the way
     mkdir 1/tmp
     mv 1/chr1.fa.out 1/tmp
     mv 1/chr1.fa 1/tmp
     mv 1/chr1.fa.masked 1/tmp
     rm zip/chromOut.zip zip/chromFa.zip zip/chromFaMasked.zip
     # chrom RepeatMasker out files
     zip -j zip/chromOut.zip */chr*.fa.out
     # soft masked chrom fasta
     zip -j zip/chromFa.zip */chr*.fa
     # hard masked chrom fasta
     zip -j zip/chromFaMasked.zip */chr*.fa.masked
  
     cd /cluster/data/danRer2/zip
     #- Check zip file integrity:
     foreach f (*.zip)
       unzip -t $f > $f.test
       tail -1 $f.test
     end
     wc -l *.zip.test
     ssh hgwdev
     cd /cluster/data/danRer2/zip
     set gp = /usr/local/apache/htdocs/goldenPath/danRer2
     # copy over just those modified zip files
     cp -p chromOut.zip $gp/bigZips
     cp -p chromFa.zip $gp/bigZips
     cp -p chromFaMasked.zip $gp/bigZips
     # remove chr1.fa.zip and zip chr1New
     rm $gp/chromosomes/chr1.fa.zip
     zip -j $gp/chromosomes/chr1New.fa.zip ../1/chr1New.fa
     # replace md5sum 
     cd $gp/bigZips
     rm md5sum.txt
     md5sum *.zip > md5sum.txt
     cd $gp/chromosomes
     rm md5sum.txt
     md5sum *.zip > md5sum.txt
     # Update README.txt for each to state that a new masked chr1 was produced
     # updated bigZips and chromosomes README.txt files with new ftp site
     # for the assembly sequence at Sanger (2005-08-04, hartera):
     # now ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/
 
 # MAKE VSHG17 DOWNLOADABLES (DONE, 2004-12-14, hartera)
 # REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat 
 # (DONE, 2004-12-21, hartera)
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChrom
     set gp = /usr/local/apache/htdocs/goldenPath/danRer2
     mkdir -p $gp/vsHg17/axtChrom
     cp -p *.axt $gp/vsHg17/axtChrom
     cd $gp/vsHg17/axtChrom
     gzip *.axt
     md5sum *.gz > md5sum.txt
     
     # copy chains and net to downloads area
     # re-make chains and net downloadables (2004-12-21, hartera)
     rm $gp/vsHg17/human*.gz $gp/vsHg17/md5sum.txt
     cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
     gzip -c all.chain.antirepeat > /cluster/data/danRer2/zip/humanHg17.chain.gz
     gzip -c humanhg17.net > /cluster/data/danRer2/zip/humanHg17.net.gz
     cd $gp/vsHg17
     mv /cluster/data/danRer2/zip/human*.gz .
     md5sum *.gz > md5sum.txt
     # Copy over & edit README.txt w/pointers to chain, net formats.
  
 # MAKE VSMM5 DOWNLOADABLES (DONE, 2004-12-14, hartera)
 # REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat 
 # (DONE, 2004-12-21, hartera)
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChrom
     set gp = /usr/local/apache/htdocs/goldenPath/danRer2
     mkdir -p $gp/vsMm5/axtChrom
     cp -p *.axt $gp/vsMm5/axtChrom
     cd $gp/vsMm5/axtChrom
     gzip *.axt
     md5sum *.gz > md5sum.txt
     
     # copy chains and nets to downloads area
     # re-make chains and net downloadables (2004-12-21, hartera)
     rm $gp/vsMm5/mouse*.gz $gp/vsMm5/md5sum.txt
     cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain
     gzip -c all.chain.antirepeat > /cluster/data/danRer2/zip/mouseMm5.chain.gz
     gzip -c mouseMm5.net > /cluster/data/danRer2/zip/mouseMm5.net.gz
     cd $gp/vsMm5
     mv /cluster/data/danRer2/zip/mouse*.gz .
     md5sum *.gz > md5sum.txt
     # Copy over & edit README.txt w/pointers to chain, net formats.
 
 # BLASTZ HG17 CLEANUP (DONE, 2004-12-14, hartera)
 # RE-DONE (DONE, 2004-12-21, hartera)
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.hg17.swap
     nice rm axtChain/run1/chain/* &
     nice rm -fr axtChain/n1 axtChain/noClass.net &
     nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net &
     nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain &
     nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet &
     
 # BLASTZ MM5 CLEANUP (DONE, 2004-12-14, hartera)
 # RE-DONE (DONE, 2004-12-21, hartera)
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.mm5.swap
     nice rm axtChain/run1/chain/* &
     nice rm -fr axtChain/n1 axtChain/noClass.net &
     nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net &
     nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain & 
     nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet &
 
 # BLASTZ FOR FUGU (fr1) (DONE, 2004-12-14, hartera)
 # Fugu is quite distant from zebrafish, more distant than human/chicken
 # so use the HoxD55.q matrix in future for blastz (2005-06-01, hartera)
 # Blastz requires lineage-specific repeats but there are none 
 # for these two fish.
 
     ssh kk
     mkdir -p /cluster/data/danRer2/bed/blastz.fr1.2004-12-13
     ln -s /cluster/data/danRer2/bed/blastz.fr1.2004-12-13 \
           /cluster/data/danRer2/bed/blastz.fr1
     cd /cluster/data/danRer2/bed/blastz.fr1
 # use same parameters as for danRer1 vs fr1
 cat << '_EOF_' > DEF
 # zebrafish (danRer2) vs. Fugu (fr1)
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
                                                                                 
 ALIGN=blastz-run
 BLASTZ=blastz
 BLASTZ_H=2000
 #BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
 BLASTZ_ABRIDGE_REPEATS=0
                                                                                 
 # TARGET - zebrafish (danRer2)
 SEQ1_DIR=/iscratch/i/danRer2/nib
 SEQ1_RMSK=
 # lineage-specific repeats
 # we don't have that information for these species
 SEQ1_SMSK=
 SEQ1_FLAG=
 SEQ1_IN_CONTIGS=0
 # 10 MB chunk for target
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY - Fugu (fr1)
 # soft-masked chrom nib
 SEQ2_DIR=/cluster/bluearc/fugu/fr1/chromNib
 SEQ2_RMSK=
 SEQ2_SMSK=
 SEQ2_FLAG=
 SEQ2_IN_CONTIGS=0
 # 10 Mbase for query
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
                                                                                 
 BASE=/cluster/data/danRer2/bed/blastz.fr1
                                                                                 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
                                                                                 
 #DEBUG=1
 '_EOF_'
     # << this line keeps emacs coloring happy
                                                                                 
     # save the DEF file in the current standard place
     chmod +x DEF
     cp DEF ~angie/hummus/DEF.danRer2-fr1.2004-12-13
     # setup cluster run
     # source the DEF file
     bash
     . ./DEF
     /cluster/data/danRer2/jkStuff/BlastZ_run0.sh
     cd run.0
     # check batch looks ok then
     para try, check, push, check, ....
 # para time
 # Completed: 6055 of 6055 jobs
 # CPU time in finished jobs:    1440485s   24008.08m   400.13h   16.67d  0.046 y
 # IO & Wait Time:                132074s    2201.24m    36.69h    1.53d  0.004 y
 # Average job time:                 260s       4.33m     0.07h    0.00d
 # Longest job:                     2054s      34.23m     0.57h    0.02d
 # Submission to last job:          4060s      67.67m     1.13h    0.05d
          
     ssh kki
     cd /cluster/data/danRer2/bed/blastz.fr1
     bash # if a csh/tcsh user
     . ./DEF
     /cluster/data/danRer2/jkStuff/BlastZ_run1.sh
     cd run.1
     para try, check, push, etc ...
 # para time
 # Completed: 173 of 173 jobs
 # CPU time in finished jobs:        249s       4.16m     0.07h    0.00d  0.000 y
 # IO & Wait Time:                   695s      11.58m     0.19h    0.01d  0.000 y
 # Average job time:                   5s       0.09m     0.00h    0.00d
 # Longest job:                       16s       0.27m     0.00h    0.00d
 # Submission to last job:           296s       4.93m     0.08h    0.00d
 
     #   Third cluster run to convert lav's to axt's
     ssh kki
     cd /cluster/data/danRer2/bed/blastz.fr1
     mkdir axtChrom
     # a new run directory
     mkdir run.2
     cd run.2
 cat << '_EOF_' > do.csh
 #!/bin/csh
 cd $1
 cat `ls -1 *.lav | sort -g` \
 | lavToAxt stdin /iscratch/i/danRer2/nib \
 /cluster/bluearc/fugu/fr1/chromNib stdout \
 | axtSort stdin $2
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x do.csh
     cat << '_EOF_' > gsub
 #LOOP
 ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/danRer2/bed/blastz.fr1/axtChrom/$(root1).axt}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     \ls -1Sd ../lav/chr* > chrom.list
     gensub2 chrom.list single gsub jobList
     wc -l jobList
     head jobList
     para create jobList
     para try, check, push, check,...
 # para time
 # Completed: 28 of 28 jobs
 # CPU time in finished jobs:         71s       1.19m     0.02h    0.00d  0.000 y
 # IO & Wait Time:                   765s      12.75m     0.21h    0.01d  0.000 y
 # Average job time:                  30s       0.50m     0.01h    0.00d
 # Longest job:                       66s       1.10m     0.02h    0.00d
 # Submission to last job:           223s       3.72m     0.06h    0.00d
 
     # translate sorted axt files into psl
     ssh kolossus
     cd /cluster/data/danRer2/bed/blastz.fr1
     mkdir -p pslChrom
     set tbl = "blastzFr1"
     foreach f (axtChrom/chr*.axt)
       set c=$f:t:r
       echo "Processing chr $c"
       /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
     end
                                                                                
     # Load database tables
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.fr1/pslChrom                                                   
     foreach f (./*.psl)
       /cluster/bin/i386/hgLoadPsl danRer2 $f
       echo "$f Done"
     end
 # featureBits -chrom=chr1 danRer2 refGene:cds blastzFr1 -enrichment
 # refGene:cds 0.512%, blastzFr1 10.880%, both 0.439%, cover 85.71%, enrich 7.88x
 # featureBits -chrom=chr1 danRer1 refGene:cds blastzFr1 -enrichment
 # refGene:cds 0.529%, blastzFr1 5.542%, both 0.463%, cover 87.50%, enrich 15.79x
 
 # CHAIN FUGU (fr1) BLASTZ (DONE, 2004-12-14, hartera)
 # APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS
 # AND DEGENERATE DNA (DONE, 2004-12-21, hartera)
     # Run axtChain on little cluster
     ssh kki
     cd /cluster/data/danRer2/bed/blastz.fr1
     mkdir -p axtChain/run1
     cd axtChain/run1
     mkdir out chain
     ls -1S /cluster/data/danRer2/bed/blastz.fr1/axtChrom/*.axt \
         > input.lst
  cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
                                                                                 
     cat << '_EOF_' > doChain
 #!/bin/csh
 axtChain $1 /iscratch/i/danRer2/nib \
     /cluster/bluearc/fugu/fr1/chromNib $2 >& $3
 '_EOF_'
     # << this line makes emacs coloring happy
                                                                                 
     chmod a+x doChain
     gensub2 input.lst single gsub jobList
     para create jobList
     para try, check, push, check...
 # para time
 # Completed: 28 of 28 jobs
 # CPU time in finished jobs:        402s       6.71m     0.11h    0.00d  0.000 y
 # IO & Wait Time:                   276s       4.59m     0.08h    0.00d  0.000 y
 # Average job time:                  24s       0.40m     0.01h    0.00d
 # Longest job:                       56s       0.93m     0.02h    0.00d
 # Submission to last job:           101s       1.68m     0.03h    0.00d
 
     # now on the cluster server, sort chains
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
     chainMergeSort run1/chain/*.chain > all.chain
     chainSplit chain all.chain
     # take a look at score distr's
     foreach f (chain/*.chain)
       grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r >> hist5000.out
       textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
       echo ""
     end
     # filter with minScore=5000
     mv all.chain all.chain.unfiltered
     chainFilter -minScore=5000 all.chain.unfiltered > all.chainFilt5k
     rm -r chain
     chainSplit chain all.chainFilt5k
                              
     # remove repeats from chains and reload into database
     # (2004-12-21, hartera)
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
     gunzip all.chainFilt5k.gz
     chainSplit chainRaw all.chainFilt5k 
     mkdir chain
     cd chainRaw
     foreach f (*.chain)
        set c = $f:r
        echo $c 
        nice chainAntiRepeat /iscratch/i/danRer2/nib \
                        /cluster/bluearc/fugu/fr1/chromNib $f \
                        ../chain/$c.chain
     end
     cd ..  
     chainMergeSort ./chain/*.chain > all.chain.antirepeat
     chainSplit chainAR all.chain.antirepeat
     # Load chains into database
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.fr1/axtChain/chainAR
     foreach i (*.chain)
         set c = $i:r
         hgLoadChain danRer2 ${c}_chainFr1 $i
         echo done $c
     end
 # featureBits -chrom=chr1 danRer2 refGene:cds chainFr1 -enrichment
 # refGene:cds 0.512%, chainFr1 23.334%, both 0.451%, cover 88.07%, enrich 3.77x
 # featureBits -chrom=chr1 danRer2 refGene:cds chainFr1Link -enrichment
 # refGene:cds 0.512%, chainFr1Link 7.794%, both 0.426%, cover 83.16%, enrich 10.67x
 
 # featureBits -chrom=chr1 danRer1 refGene:cds chainFr1 -enrichment
 # refGene:cds 0.529%, chainFr1 19.003%, both 0.479%, cover 90.41%, enrich 4.76x
 # featureBits -chrom=chr1 danRer1 refGene:cds chainFr1Link -enrichment
 # refGene:cds 0.529%, chainFr1Link 4.686%, both 0.450%, cover 85.11%, enrich 18.16x
 # after chainAntiRepeat:
 # featureBits -chrom=chr1 danRer2 refGene:cds chainFr1Link -enrichment
 # refGene:cds 0.512%, chainFr1Link 7.726%, both 0.426%, cover 83.16%, 
 # enrich 10.76x 
 
 # NET FUGU (fr1) BLASTZ (DONE, 2004-12-14, hartera)
 # REMADE NET FOR FUGU (2004-12-15, hartera)
 # RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-21,hartera)
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
     mkdir preNet
     cd chainAR
     foreach i (*.chain)
        echo preNetting $i
        /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
                                      ../preNet/$i
     end
                                                                                 
     cd ..
     mkdir n1
     cd preNet
     foreach i (*.chain)
       set n = $i:r.net
       echo primary netting $i
       /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
                                  ../n1/$n /dev/null
     end
                                                                                 
     cd ..
     cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
     # memory usage 108306432, utime 690 s/100, stime 58
     # Add classification info using db tables:
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
     # use -noAr option - don't look for ancient repeats
    # redo net: danRer1 had mistakenly been used for netClass instead of
    # danRer2 (2004-12-15); load the new net into the database
     netClass -noAr noClass.net danRer2 fr1 fuguFr1.net
                                                                                 
     # Load the nets into database
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
     netFilter -minGap=10 fuguFr1.net | hgLoadNet danRer2 netFr1 stdin
 # featureBits danRer2 refGene:cds netFr1 -enrichment
 # refGene:cds 0.468%, netFr1 19.389%, both 0.411%, cover 87.95%, enrich 4.54x
 # featureBits danRer1 refGene:cds netFr1 -enrichment
 # refGene:cds 0.461%, netFr1 17.530%, both 0.391%, cover 84.78%, enrich 4.84x
 # after chainAntiRepeat:
 # featureBits danRer2 refGene:cds netFr1 -enrichment
 # refGene:cds 0.468%, netFr1 19.372%, both 0.412%, cover 87.97%, enrich 4.54x 
 
 # MAKE VSFR1 DOWNLOADABLES (DONE, 2004-12-15, hartera)
 # REPLACE FR1 NET WITH NEW VERSION IN DOWNLOADS (2004-12-15, hartera)
 # REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat 
 # (DONE, 2004-12-21, hartera)
 
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.fr1/axtChrom
     set gp = /usr/local/apache/htdocs/goldenPath/danRer2
     mkdir -p $gp/vsFr1/axtChrom
     cp -p *.axt $gp/vsFr1/axtChrom
     cd $gp/vsFr1/axtChrom
     gzip *.axt
     md5sum *.gz > md5sum.txt
     
     # copy chains and nets to downloads area
     # re-make chains and net downloadables (2004-12-21, hartera)
     rm $gp/vsFr1/fugu*.gz $gp/vsFr1/md5sum.txt
     cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
     gzip -c all.chain.antirepeat > /cluster/data/danRer2/zip/fuguFr1.chain.gz
     gzip -c fuguFr1.net > /cluster/data/danRer2/zip/fuguFr1.net.gz
     cd $gp/vsFr1
     mv /cluster/data/danRer2/zip/fugu*.gz .
     md5sum *.gz > md5sum.txt
     # Copy over & edit README.txt w/pointers to chain, net formats.
 
 # BLASTZ FR1 CLEANUP (DONE, 2004-12-15, hartera)
 # RE-DONE (DONE, 2004-12-21, hartera)
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.fr1
     nice rm axtChain/run1/chain/* &
     nice rm -fr axtChain/n1 axtChain/noClass.net &
     nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/all.chainFilt5k axtChain/*.net &
     nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain &
     nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet &
 
 # BLASTZ FOR TETRAODON (tetNig1) (DONE, 2004-12-21, hartera)
 # Tetraodon is quite distant from zebrafish, more distant than human/chicken
 # so use the HoxD55.q matrix in future for blastz (2005-06-01, hartera)
     # blastz requires lineage-specific repeats but there are none
     # available between these two fish species 
     ssh kk
     mkdir -p /cluster/data/danRer2/bed/blastz.tetNig1.2004-12-21
     ln -s /cluster/data/danRer2/bed/blastz.tetNig1.2004-12-21 \
           /cluster/data/danRer2/bed/blastz.tetNig1
     cd /cluster/data/danRer2/bed/blastz.tetNig1
 # use tetraodon sequence in contigs for dynamic masking - see below
 # for dynamic masking: M=50
 cat << '_EOF_' > DEF
 # zebrafish (danRer2) vs. tetraodon (tetNig1)
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
 
 # use BLASTZ_M=50 in blastz-run1
 ALIGN=blastz-run1
 BLASTZ=blastz
 BLASTZ_H=2500
 
 #BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET - zebrafish (danRer2)
 SEQ1_DIR=/iscratch/i/danRer2/nib
 SEQ1_RMSK=
 # lineage-specific repeats
 # we don't have that information for these species
 SEQ1_SMSK=
 SEQ1_FLAG=
 SEQ1_IN_CONTIGS=0
# 500 kb chunk for target
 SEQ1_CHUNK=500000
 SEQ1_LAP=500
 
 # QUERY - Tetraodon (tetNig1)
# soft-masked tetNig1 sequence in contigs
 SEQ2_DIR=/iscratch/i/tetNig1/contigs
 SEQ2_RMSK=
 SEQ2_SMSK=
 SEQ2_FLAG=
 SEQ2_IN_CONTIGS=1
 SEQ2_CHUNK=
 SEQ2_LAP=
 
 BASE=/cluster/data/danRer2/bed/blastz.tetNig1
 
 DEF=$BASE/DEF
 RAW=$BASE/raw 
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 
 #DEBUG=1
 '_EOF_'
     # << this line keeps emacs coloring happy
                                                                                 
     # save the DEF file in the current standard place
     chmod +x DEF
     cp DEF ~angie/hummus/DEF.danRer2-tetNig1.2004-12-21
 
     # setup cluster run
     # source the DEF file
     bash
     . ./DEF
     /cluster/data/danRer2/jkStuff/BlastZ_run0.sh
     cd run.0
     # check batch looks ok then
     para try, check, push, check, ....
 # para time
 # Completed: 3193 of 3193 jobs
 # CPU time in finished jobs:    4460310s   74338.50m  1238.97h   51.62d  0.141 y
 # IO & Wait Time:                 41176s     686.27m    11.44h    0.48d  0.001 y
 # Average job time:                1410s      23.50m     0.39h    0.02d
 # Longest job:                     2398s      39.97m     0.67h    0.03d
 # Submission to last job:         12372s     206.20m     3.44h    0.14d
  
     ssh kki
     cd /cluster/data/danRer2/bed/blastz.tetNig1
     bash # if a csh/tcsh user
     . ./DEF
     /cluster/data/danRer2/jkStuff/BlastZ_run1.sh
     cd run.1
     para try, check, push, etc ...
 # para time
 # Completed: 3193 of 3193 jobs
 # CPU time in finished jobs:        327s       5.46m     0.09h    0.00d  0.000 y
 # IO & Wait Time:                  8483s     141.38m     2.36h    0.10d  0.000 y
 # Average job time:                   3s       0.05m     0.00h    0.00d
 # Longest job:                        9s       0.15m     0.00h    0.00d
 # Submission to last job:           560s       9.33m     0.16h    0.01d
 
     #   Third cluster run to convert lav's to axt's
     ssh kki
     cd /cluster/data/danRer2/bed/blastz.tetNig1
     mkdir axtChrom
     # a new run directory
     mkdir run.2
     cd run.2
 cat << '_EOF_' > do.csh
 #!/bin/csh
 cd $1
 cat `ls -1 *.lav | sort -g` \
 | lavToAxt -fa stdin /iscratch/i/danRer2/nib \
 /iscratch/i/tetNig1/contigs/tetNig1Contigs.fa stdout \
 | axtSort stdin $2
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod a+x do.csh
     cat << '_EOF_' > gsub
 #LOOP
 ./do.csh {check in exists $(path1)} {check out line+ /cluster/data/danRer2/bed/blastz.tetNig1/axtChrom/$(root1).axt}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
     \ls -1Sd ../lav/chr* > chrom.list
     gensub2 chrom.list single gsub jobList
     wc -l jobList
     head jobList
     para create jobList
     para try, check, push, check,...
 # para time
 # Completed: 28 of 28 jobs
 # CPU time in finished jobs:        224s       3.74m     0.06h    0.00d  0.000 y
 # IO & Wait Time:                  1240s      20.66m     0.34h    0.01d  0.000 y
 # Average job time:                  52s       0.87m     0.01h    0.00d
 # Longest job:                      148s       2.47m     0.04h    0.00d
 # Submission to last job:           157s       2.62m     0.04h    0.00d
 
    # lift up query sequences
    #- Lift the query (tetNig1) side of the axt's from the 500 kb chunks
    #  up to contig level
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.tetNig1
 
     foreach d (axtChrom/chr*.axt)
       echo $d
       liftUp -axtQ ${d}.out.axt \
       /cluster/data/tetNig1/bed/blastzSelf/contigSeqs/500kbcontigs.lft warn $d \
         > /dev/null
     end
 
     #- Lift pseudo-contigs to chromosome level
     # check this is working correctly
     set dr = "/cluster/data/danRer2"
     foreach c (`cat ${dr}/chrom.lst`)
       echo lifting $c
       liftUp -axtQ ./axtChrom/chr${c}.out2.axt \
          /cluster/data/tetNig1/jkStuff/liftAll.lft warn \
         ./axtChrom/chr${c}.axt.out.axt > /dev/null
     end
     cd axtChrom
     foreach c (`cat ${dr}/chrom.lst`)
        mv chr${c}.axt chr${c}.axt.old
        mv chr${c}.out2.axt chr${c}.axt
        rm chr${c}.axt.out.axt
     end
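    # (sketch) spot-check the lift: query names in the lifted axt should
    # now be tetNig1 chrom names (axt header lines have 9 fields; qName
    # is field 5):
    awk 'NF == 9 {print $5}' axtChrom/chr1.axt | sort -u | head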
    # translate sorted axt files into psl
     ssh kolossus
     cd /cluster/data/danRer2/bed/blastz.tetNig1
     mkdir -p pslChrom
     # need to copy S2.len for whole tetNig1 genome - S2contigs.len
     cp /cluster/data/tetNig1/bed/blastzSelf/S1.len S2contigs.len
     set tbl = "blastzTetNig1"
     foreach f (axtChrom/chr*.axt)
       set c=$f:t:r
       echo "Processing chr $c"
       /cluster/bin/i386/axtToPsl $f \
               S1.len S2contigs.len pslChrom/${c}_${tbl}.psl
     end 
     # Load database tables
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.tetNig1/pslChrom                                                   
     foreach f (./*.psl)
       /cluster/bin/i386/hgLoadPsl danRer2 $f
       echo "$f Done"
     end
 # H=2000
 # featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1 -enrichment
 # refGene:cds 0.512%, blastzTetNig1 22.057%, both 0.435%, cover 84.94%, 
 # enrich 3.85x
 # H=2000 and L=8000
 # featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1L8k -enrichment
 # refGene:cds 0.512%, blastzTetNig1L8k 16.326%, both 0.401%, cover 78.28%, 
 # enrich 4.80x
# use query in contigs in one file and use dynamic masking with M=100;
# this should mask out more sequence in the query, since tetraodon has no
# species-specific repeat library and so is not fully repeat masked.
# H=2500, M=100
 # featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1Contigs -enrichment
 # refGene:cds 0.512%, blastzTetNig1Contigs 13.052%, both 0.427%, cover 83.43%, 
 # enrich 6.39x
 # contigs used as above but H=2000
 #featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1ContigsH2k -enrichment
 # refGene:cds 0.512%, blastzTetNig1ContigsH2k 17.517%, both 0.433%, 
 # cover 84.51%, enrich 4.82x
 # contigs used with H=2500, L=6000
 # refGene:cds 0.512%, blastzTetNig1ContigsL6k 11.380%, both 0.415%,
 # cover 80.94%, enrich 7.11x
 # use M=50 and H=2500
# featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1ContigsM50 -enrichment
# refGene:cds 0.512%, blastzTetNig1ContigsM50 12.643%, both 0.427%,
# cover 83.42%, enrich 6.60x
# Using contigs and dynamic masking improves the enrichment and makes
# little difference to coverage. The blastz track % is lower since more
# sequence is being masked; as coverage is barely changed, the extra
# masking is probably falling in non-coding regions.
# At chr1:6,128,109-6,135,734 there are a lot of short alignments in low
# complexity regions that are removed using L=6000.
# Use contigs with H=2500 and keep lower-scoring alignments until after
# chaining.
 # Number of rows:
 # blastzTetNig1	2360350 
 # blastzTetNig1L8k 1177874
 # blastzTetNig1Contigs 725076 
 # blastzTetNig1ContigsH2k 825927
 # blastzTetNig1ContigsL6k 538149
 # blastzTetNig1ContigsM50 479228
 # Using M=50 reduces the number of alignments but coverage is the same as 
 # for using M=100 so use the dynamic masking with M=50 and H=2500
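# (sketch) row counts like those above can be reproduced by summing the
# psl files before loading, or by counting rows in a loaded table:
#    cat pslChrom/*_blastzTetNig1.psl | wc -l
#    hgsql -N -e 'select count(*) from chr1_blastzTetNig1;' danRer2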
 
 # CHAIN TETRAODON (tetNig1) BLASTZ (DONE, 2004-12-21, hartera)
     # Run axtChain on little cluster
     ssh kki
     cd /cluster/data/danRer2/bed/blastz.tetNig1
     mkdir -p axtChain/run1
     cd axtChain/run1
     mkdir out chain
     ls -1S /cluster/data/danRer2/bed/blastz.tetNig1/axtChrom/*.axt \
         > input.lst
  cat << '_EOF_' > gsub
 #LOOP
 doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # << this line makes emacs coloring happy
                                                                                 
     cat << '_EOF_' > doChain
 #!/bin/csh
 axtChain $1 /iscratch/i/danRer2/nib \
     /iscratch/i/tetNig1/nib $2 >& $3
 '_EOF_'
     # << this line makes emacs coloring happy
                                                                                 
     chmod a+x doChain
     gensub2 input.lst single gsub jobList
     para create jobList
     para try, check, push, check...
 # para time
 # Completed: 28 of 28 jobs
 # CPU time in finished jobs:        446s       7.43m     0.12h    0.01d  0.000 y
 # IO & Wait Time:                   355s       5.92m     0.10h    0.00d  0.000 y
 # Average job time:                  29s       0.48m     0.01h    0.00d
 # Longest job:                       98s       1.63m     0.03h    0.00d
 # Submission to last job:            98s       1.63m     0.03h    0.00d
 
     # now on the cluster server, sort chains
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
     mkdir chain
     # remove repeats from chains
     cd run1/chain
     foreach f (*.chain)
        set c = $f:r
        echo $c 
        nice chainAntiRepeat /iscratch/i/danRer2/nib \
                        /iscratch/i/tetNig1/nib $f \
                        ../../chain/$c.chain
     end
     cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
     chainMergeSort ./chain/*.chain > all.chain.antirepeat
     chainSplit chainAR all.chain.antirepeat
     # take a look at score distr's
     foreach f (chain/*.chain)
       grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
       echo $f:t:r >> hist5000.out
       textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
       echo ""
     end
     # filter with minScore=5000
     mv all.chain.antirepeat all.chainAR.unfiltered
     chainFilter -minScore=5000 all.chainAR.unfiltered > all.chainARFilt5k
     rm -r chain
     chainSplit chainAR all.chainARFilt5k
    # from the histogram, only chr1 had more than 100,000 chains with
    # score of 5000 or less before filtering
     # Load chains into database
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain/chainAR
     foreach i (*.chain)
         set c = $i:r
         hgLoadChain danRer2 ${c}_chainTetNig1 $i
         echo done $c
     end
 # featureBits -chrom=chr1 danRer2 refGene:cds chainTetNig1Link -enrichment
 # refGene:cds 0.512%, chainTetNig1Link 11.005%, both 0.416%, cover 81.22%, 
 # enrich 7.38x 
 
 # NET TETRAODON (tetNig1) BLASTZ (DONE, 2004-12-21, hartera)
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
     mkdir preNet
     cd chainAR
     foreach i (*.chain)
        echo preNetting $i
        /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2contigs.len \
                                      ../preNet/$i
     end
     cd ..
     mkdir n1
     cd preNet
     foreach i (*.chain)
       set n = $i:r.net
       echo primary netting $i
       /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len \
                                  ../../S2contigs.len ../n1/$n /dev/null
     end
     cd ..
     cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
     # memory usage 102502400, utime 670 s/100, stime 69
     # Add classification info using db tables:
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
     # use -noAr option - don't look for ancient repeats
     netClass -noAr noClass.net danRer2 tetNig1 tetraTetNig1.net
                                                                                 
     # Load the nets into database
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
     netFilter -minGap=10 tetraTetNig1.net | hgLoadNet danRer2 netTetNig1 stdin
 # featureBits danRer2 refGene:cds netTetNig1 -enrichment
 # refGene:cds 0.468%, netTetNig1 19.068%, both 0.410%, cover 87.50%, 
 # enrich 4.59x
 
 # BACENDS (DONE, 2005-05-20, hartera)
 # Removed incorrect row from bacCloneXRef table (DONE, 2006-04-19, hartera)
# Data provided by Anthony DiBiase, Yi Zhou and Leonard Zon at the Boston
 # Children's Hospital. Collaborated with them on this track.
 # Anthony DiBiase:adibiase@enders.tch.harvard.edu
 # Yi Zhou:yzhou@enders.tch.harvard.edu
 # BAC clone end sequences are from Robert Geisler's lab,
 # Max Planck Institute for Developmental Biology, Tuebingen, Germany
 # REMAKE AND RELOAD bacCloneAlias AND bacCloneXRef TABLES AFTER FIXING
 # BUGS IN THE zfishBacClonesandSts PROGRAM TO STORE ALL THE ALIASES FOR EACH
 # SANGER STS NAME AND TO STORE CORRECT INTERNAL AND EXTERNAL NAMES IN ALL
 # CASES. DATA FILE OF UniSTS IDS AND SANGER STS NAMES WAS ALSO RE-CREATED
 # AS THIS WAS INCORRECT. (DONE, 2005-05-24, hartera)
 # ALSO DO TESTING OF THE DATA IN THESE TABLES TO MAKE SURE IT IS CORRECT - SEE
 # SECTION ON TESTING BELOW.
     ssh kksilo
     mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends
     cd /cluster/data/danRer2/bed/ZonLab/bacends 
     # copy BAC ends sequence from previous assembly
     cp /cluster/data/danRer1/bed/ZonLab/bacends/bacends.fa .
     faSize bacends.fa
     # 486978153 bases (39070196 N's 447907957 real 447907957 upper 0 lower) 
     # in 594614 sequences in 1 files
     # Total size: mean 819.0 sd 230.3 min 0 (zKp108-H09.za) max 5403 (zC259G13.zb) median 796
     # N count: mean 65.7 sd 154.7
     # U count: mean 753.3 sd 302.6
     # L count: mean 0.0 sd 0.0
 
     ssh kkr1u00
     cd /cluster/data/danRer2/bed/ZonLab/bacends
     mkdir -p /iscratch/i/danRer2/bacends
     # if not split already, split up sequence for cluster runs
     faSplit sequence bacends.fa 20 /iscratch/i/danRer2/bacends/bacends
     # iSync bacends to kilokluster
     /cluster/bin/iSync
 
     ssh kk
     cd /cluster/data/danRer2/bed/ZonLab/bacends
     ls -1S /iscratch/i/danRer2/bacends/*.fa > bacends.lst
     ls -1S /iscratch/i/danRer2/trfFa/*.fa > genome.lst
     cat << '_EOF_' > template
 #LOOP
 /cluster/bin/i386/blat $(path1) $(path2) -tileSize=10 -ooc=/iscratch/i/danRer2/10.ooc {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
    # << this line keeps emacs coloring happy
     mkdir psl
     gensub2 genome.lst bacends.lst template jobList
     para create jobList
     para try, check, push, check, ...
 # para time
 # Completed: 5980 of 5980 jobs
 # CPU time in finished jobs:    5317533s   88625.55m  1477.09h   61.55d  0.169 y
 # IO & Wait Time:                 34446s     574.10m     9.57h    0.40d  0.001 y
 # Average job time:                 895s      14.92m     0.25h    0.01d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            9892s     164.87m     2.75h    0.11d
 # Submission to last job:         12309s     205.15m     3.42h    0.14d
   
     # back on kksilo (now kkstore01), filter and lift results:
     ssh kksilo
     cd /cluster/data/danRer2/bed/ZonLab/bacends
     pslSort dirs raw.psl temp psl
     # took 9 hours
     # With minCover=0.4, 78% of sequences are aligned
     # No minCover, aligns 90% of sequences but with a large number
     # of extra alignments - a lot of noise even compared to using minCover of 
     # 0.05.
     # Total BAC ends sequences: 594614
     # minCover  minAli  #alignments  #sequences aligned  %total seqs aligned
     #   0       0.85    13969946     535465              90
     #   0.05    0.85    8140775      531299              89
     #   0.10    0.75    5099823      519238              87
     #   0.10    0.85    5098714      519084              87
     #   0.10    0.90    5096205      518616              87
     #   0.10    0.92    4984170      515024              87
     #   0.10    0.95    4264695      498380              84
     #   0.20    0.85    4515869      501415              84
     #   0.40    0.85    3806487      464067              78
     # minCover of 0.10 looks good at first: compared to minCover of 0.40,
     # an extra 55017 BAC ends are aligned, but with an extra 1292227
     # alignments - an average of 23.5 alignments per extra sequence,
     # compared to 8 alignments per sequence on average for minCover of 0.40.
     # For minCover of 0.10, using minAli of 0.90 gives 468 fewer sequences
     # aligned and 2509 fewer alignments than minAli of 0.85. Using minAli
     # of 0.75 increases the number of alignments by about 1100 compared to
     # using 0.85, while the sequences aligned only increase by about 150.
     # Using minAli of 0.95 for minCover of 0.10 gives 834,019 fewer
     # alignments with 20704 fewer sequences aligned than minAli of 0.85.
     # For minAli=0.92 and minCover=0.10, there are 114,544 fewer alignments
     # than for minAli=0.85 and 4060 fewer sequences are aligned.
     # Tried minAli=0.85 with minCover=0.40 and also minAli=0.92 with
     # minCover=0.10 and compared: more alignments are gained (see below),
     # but minCover=0.10 means there are a lot of alignments where < 40% of
     # the BAC end aligns, which does not seem reliable, so stick with
     # minCover=0.4.
 # Stats for minCover=0.10, minAli=0.92:
 # Pairs: there are 205360 orphans and 97458 pairs, 1014 long, 19738 mismatch 
 # and 239 slop
# Singles: 17829 bacEnds.orphan and 205360 from pair analysis, so a total of
# 223189 orphans - 7828 more than for danRer1 and more than for using
# minCover=0.40 and minAli=0.85.
 # for minCov=0.10, minAli=0.92:
 # 90450 pairs.txt
 # 69699 pairs.txt.uniq - about 4500 more than for minCover=0.40 and minAli=0.85
 # 202661 singles.txt
 # 178337 singles.txt.uniq - about 5399 more than for minCover=0.40,minAli=0.85
 
    # Below, used same parameters as for danRer1: minCover=0.40 and minAli=0.85
     # all files below and tables are derived from the alignments using
     # these parameters for pslReps filtering.
     # location of store8 has changed to kkstore01 instead of kksilo so now 
     # login there (2005-04-28)
     ssh kkstore01 
     cd /cluster/data/danRer2/bed/ZonLab/bacends
     nice pslReps -nearTop=0.02 -minCover=0.40 -minAli=0.85 -noIntrons raw.psl \
       bacEnds.psl /dev/null
     liftUp bacEnds.lifted.psl /cluster/data/danRer2/jkStuff/liftAll.lft \
            warn bacEnds.psl
     # Got 299 lifts in /cluster/data/danRer2/jkStuff/liftAll.lft
     wc -l bacEnds.lifted.psl
     # 3806457 bacEnds.lifted.psl
     # 4984175 (for minCov=0.10, minAli=0.92)
     rm -r temp
     pslCheck bacEnds.lifted.psl >& pslCheck.log
     # 536 problems with overlapping exons reported by pslCheck
 
     awk '{print $10;}' bacEnds.lifted.psl | sort | uniq > bacEnds.names.uniq
     # edit to remove header lines    
     
     wc -l bacEnds.names.uniq
     # 464067 bacEnds.names.uniq
     grep '>' bacends.fa | wc -l
     # 594614 bacends.fa
     # 78% of BAC ends have aligned
     
 # Need to add accession information and BAC end sequence pairs
     mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
     cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
     grep ">" ../bacends.fa > bacEnds
     # remove '>' at beginning of each line
     perl -pi.bak -e 's/>//' bacEnds
     # remove ^M char from end of lines
     ssh hgwdev
     cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
     # copy over necessary files from danRer1
     set dr1Bacs = "/cluster/data/danRer1/bed/ZonLab/bacends"
     cp $dr1Bacs/bacends.1/stripEndChar.pl.gz .
     cp $dr1Bacs/bacends.1/BACEnd_accessions.txt.gz .
     cp $dr1Bacs/bacends.1/zfish_accs.txt .
     cp $dr1Bacs/bacends.1/getBacEndInfo.pl.gz .
     gunzip *.gz
     
     ssh kkstore01
     cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
     perl stripEndChar.pl < bacEnds > allBacEnds.names
     rm bacEnds bacEnds.bak
     # Kerstin Jekosch at the Sanger Centre: kj2@sanger.ac.uk
     # provided accession data for BAC End clones - zfish_accs.txt
     # and for BAC End pairs- BACEnd_accessions.txt - put these in bacends.1 dir
     # write and use perl script to split BAC Ends into pairs and singles
     # bacEndPairs.txt and bacEndSingles.txt
    # For pslPairs, the reverse primer (T7 or .z*) should be in the first
    # column and the forward primer (SP6 or .y*) should be in the second
    # column. There are several reads for some sequences and these have
    # similar names. Reads for the same sequence should be in a
    # comma-separated list.
     # Sequence read names can be translated to external clone names as found
     # in NCBI's clone registry using the name without the suffix and the 
     # translation of prefixes as follows after removal of dashes 
     # and extra zeros in the name:
    # library           external prefix         internal prefix
    # CHORI-211         CH211-                  zC
    # DanioKey          DKEY-                   zK
    # DanioKey Pilot    DKEYP-                  zKp
    # RZPD-71           RP71-                   bZ
    # BUSM1 (PAC)       BUSM1-                  dZ
     # e.g. zC001-A03.za becomes CH211-1A03
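    # The translation is mechanical, so it can be sketched as a small
    # script. This is a hypothetical sketch only (not the conversion code
    # actually used below by getBacEndInfo.pl); it assumes names look like
    # <prefix><plate>-<well>[.<suffix>] with the prefix map above:
    cat << '_EOF_' > intToExtNameSketch.pl
#!/usr/bin/perl -w
# hypothetical sketch: translate internal BAC read names to external
# clone names, e.g. zC001-A03.za -> CH211-1A03
use strict;

my %prefix = (zC => 'CH211-', zK => 'DKEY-', zKp => 'DKEYP-',
              bZ => 'RP71-',  dZ => 'BUSM1-');
while (<STDIN>)
{
chomp;
# <internal prefix><plate, leading zeros dropped>-?<well>[.<read suffix>]
if (/^([a-z]+[A-Z][a-z]?)0*(\d+)-?([A-Z]\d+)(\..*)?$/)
    {
    my ($p, $plate, $well) = ($1, $2, $3);
    print "$prefix{$p}$plate$well\n" if (exists($prefix{$p}));
    }
}
'_EOF_'
    # e.g. echo zC001-A03.za | perl intToExtNameSketch.pl gives CH211-1A03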
     # make table of accessions for BAC ends to load into bacEndAlias table
    # find all suffixes used for sequence names
     cat << '_EOF_' > getSuffixes.pl
 #!/usr/bin/perl -w
 use strict;
 
 while(<STDIN>)
 {
 chomp;
 my $l = $_;
if ($l =~ /^[t1_]*[zZbd][CKZ]p?[0-9]+\-?[a-zA-Z]+[0-9]+\.?([A-Z0-9a-z]+)/)
     {
     my $suffix = $1;
     print "$suffix\n";
     }
 }
 '_EOF_'
     chmod +x getSuffixes.pl
     perl getSuffixes.pl < allBacEnds.names > allBacEnds.suffixes
     sort allBacEnds.suffixes | uniq > allBacEnds.suffixes.uniq
     wc -l allBacEnds.suffixes.uniq
     # 22 different suffixes
    # SP6, SP6A, SP6W, T7, T7A, T7W, Za, p1kSP6, p1kSP6w, q1kT7, q1kT7w,
    # sp6, t7, ya, yb, yc, yd, yt, za, zb, zc, zd
     # get suffixes used in BAC ends accessions file
     perl getSuffixes.pl < BACEnd_accessions.txt > BACEnd.names.suffixes
     sort BACEnd.names.suffixes | uniq
    # SP6, SP6A, T7 and T7A - all of these endings occur in the list of BAC
    # ends in the FASTA file, but if the ending is something different, "A"
    # needs to be added to SP6 and T7 and this ending also checked in the
    # BAC end accessions list.
    # Edit getBacEndInfo.pl to make sure it includes all these suffixes and
    # also treats zK32A7SP6 and zK32A7SP6A as different.
     perl getBacEndInfo.pl allBacEnds.names BACEnd_accessions.txt > bacEnds.log
     wc -l *.txt
    #  31317 BACEnd_accessions.txt
    # 180280 bacEndPairs.txt
    #  16020 bacEndSingles.txt
    # 594614 direction.txt
     wc -l bacEndAccs.aliases
     # 47558 bacEndAccs.aliases
     # bacEndAccs.aliases contains sequence read names and their
     # Genbank accessions.  
     # checked that all names from allBacEnds.names appear in either
     # bacEndPairs.txt or bacEndSingles.txt
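    # (a sketch of how such a check might be done - hypothetical; it
    # assumes the read names in the two files are comma-separated within
    # whitespace-delimited columns, so clone names also land in the second
    # set, which comm -23 ignores)
    cat bacEndPairs.txt bacEndSingles.txt | tr ', \t' '\n\n\n' \
        | sort -u > inPairsOrSingles.names
    sort -u allBacEnds.names > allBacEnds.names.sorted
    comm -23 allBacEnds.names.sorted inPairsOrSingles.names
    # empty output means every read name is in one of the two files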
     # First process BAC end alignments
     ssh kkstore01
     mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/pairs
     cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs
     set bacDir = /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
    # the BAC clones vary in size from around 2800 bp to 626,000 bp
     ~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
      -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
      -mismatch -verbose ../bacEnds.lifted.psl $bacDir/bacEndPairs.txt \
      all_bacends bacEnds
      wc -l bacEnds.*
 
      # There are 1411 more orphans and 18147 more pairs than for danRer1
      #  980 bacEnds.long
      #  18083 bacEnds.mismatch
      #  201646 bacEnds.orphan
      #  90967 bacEnds.pairs
      #  0 bacEnds.short
      #  190 bacEnds.slop
 
     # create header required by "rdb" tools
     ssh hgwdev
     cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs
     echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\
           > ../header
     echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
     # make pairs bed file
     cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
                | headchg -del > bacEndPairs.bed
    # also need to process bacEndSingles.txt into a database table.
    # for singles in bacEndSingles.txt, create a dummy file where they
    # are given zJA11B12T7 as a dummy sequence pair. If the single is a
    # forward sequence, put the dummy sequence in the second column; if the
    # single is a reverse sequence, put it in the first column. Use a perl
    # script to do this - a hypothetical sketch of the idea follows.
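    # (hypothetical sketch only - the real formatSingles.pl is copied from
    # danRer1 below; this assumes input lines are "readNames<TAB>rest" and
    # that read direction can be judged from the suffix)
    cat << '_EOF_' > formatSinglesSketch.pl
#!/usr/bin/perl -w
# hypothetical sketch: pair each single read with the dummy zJA11B12T7,
# per the note above - forward reads get the dummy in the second column,
# reverse reads get it in the first column
use strict;

my $dummy = "zJA11B12T7";
while (<STDIN>)
{
chomp;
my ($reads, $rest) = split(/\t/, $_, 2);
my $tail = defined($rest) ? "\t$rest" : "";
if ($reads =~ /SP6|sp6|\.y/)
    {
    print "$reads\t$dummy$tail\n";    # forward single
    }
else
    {
    print "$dummy\t$reads$tail\n";    # reverse single
    }
}
'_EOF_'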
     ssh hgwdev
     cd /cluster/data/danRer2/bed/ZonLab/bacends
     set bacDir = /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
     mkdir singles
     cd singles
     cp /cluster/data/danRer1/bed/ZonLab/bacends/singles/formatSingles.pl.gz .
     gunzip formatSingles.pl.gz
     perl formatSingles.pl $bacDir/bacEndSingles.txt > \
                           $bacDir/bacEndSingles.format
     # then run pslPairs on this formatted sequence
     ~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
      -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
      -mismatch -verbose ../bacEnds.lifted.psl $bacDir/bacEndSingles.format \
      all_bacends bacEnds
      wc -l bacEnds*
     # 0 bacEnds.long
     # 0 bacEnds.mismatch
     # 15853 bacEnds.orphan
     # 0 bacEnds.pairs
     # 0 bacEnds.short
     # 0 bacEnds.slop
    # there are 15853 orphans here (14126 in danRer1) and 201646 from
    # pair analysis, so a total of 217499 orphans (2138 more than danRer1)
     cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles
     wc -l bacEnds.singles
     # 217499 bacEnds.singles
     # make singles bed file
     cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
                   | headchg -del > bacEndSingles.bed
     cp bacEndSingles.bed ../pairs
     cd ../pairs
     # all slop, short, long, mismatch and orphan pairs go into bacEndPairsBad
     # since orphans are already in bacEndSingles, do not add these
     cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
         bacEnds.orphan | row score ge 300 | sorttbl chr start \
         | headchg -del > bacEndPairsBad.bed
     # add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans 
     # twice so create a bed file of bacEndPairsBadNoOrphans.bed without orphans
  
     cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
         | row score ge 300 | sorttbl chr start \
         | headchg -del > bacEndPairsBadNoOrphans.bed
     extractPslLoad -noBin ../bacEnds.lifted.psl bacEndPairs.bed \
                 bacEndPairsBadNoOrphans.bed  bacEndSingles.bed | \
                         sorttbl tname tstart | headchg -del > bacEnds.load.psl
    # There are rows where the alignments were the same but the lfNames are
    # different. This is due to the presence of multiple reads for the
    # same BAC end sequence. Sometimes they are slightly different lengths
    # so the alignments are a little different. It would be good to
    # consolidate all of these. Firstly, the identical rows were merged into
    # one with a list of all the lfNames corresponding to that alignment.
    
     # load all alignments into temporary database
     ssh hgwdev
     echo "create database bacsDr2_rah;" | hgsql danRer2
     cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs
     hgLoadBed bacsDr2_rah bacEndPairs bacEndPairs.bed \
                 -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
     # Loaded 84405 elements of size 11
     # Loaded 72272 elements of size 11 (danRer1)
     # create a bacEndSingles table like bacEndPairs if not created already
     cp $HOME/kent/src/hg/lib/bacEndPairs.sql ../singles/bacEndSingles.sql
     # edit to give correct table name
     # or copy over table definition from danRer1
     cp /cluster/data/danRer1/bed/ZonLab/bacends/singles/bacEndSingles.sql.gz \ 
       ../singles/
     gunzip ../singles/bacEndSingles.sql.gz
     
     hgLoadBed bacsDr2_rah bacEndSingles bacEndSingles.bed \
                  -sqlTable=../singles/bacEndSingles.sql
     # Loaded 196492 elements of size 11
     # Loaded 203683 elements of size 11 (danRer1)
     # load all_bacends later
 
     # NOTE - this track isn't pushed to RR, just used for assembly QA
     # Use bacEndPairsBadNoOrphans.bed as orphans are in the singles bed file
     hgLoadBed bacsDr2_rah bacEndPairsBad bacEndPairsBadNoOrphans.bed \
                  -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
     # Loaded 18544 elements of size 11
     # Need to consolidate similar rows for bacEndPairs and bacEndSingles - same
     # name, different lfNames and same alignments.
     mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/duplicates
     cd /cluster/data/danRer2/bed/ZonLab/bacends/duplicates
     hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \
           bacEndPairs order by name, chrom, chromStart;" \
           bacsDr2_rah > pairs.txt
     sort pairs.txt | uniq -c > pairs.txt.uniq
     wc -l pairs*
 
     # 84405 pairs.txt
     # 65105 pairs.txt.uniq
 
     # for danRer1:
     # 72272 pairs.txt
     # 53992 pairs.txt.uniq
 
     # for replicate rows, find all the unique lfNames and put these
     # in one row with the relevant lfStarts, lfSizes and correct lfCount
     cp \
    /cluster/data/danRer1/bed/ZonLab/bacends/duplicates/removeReplicates.pl.gz .
     gunzip removeReplicates.pl.gz
     # modify script so that the database to use is supplied as an argument
     nice perl removeReplicates.pl pairs.txt.uniq bacEndPairs bacsDr2_rah \
          > pairsNoReps.bed
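    # (a hypothetical sketch of the merging idea in removeReplicates.pl -
    # the real script works from the uniq-count file and queries the given
    # table in the given database; this sketch just groups bed 11 rows
    # read from stdin and assumes the lfStarts/lfSizes/lfNames lists are
    # comma-terminated)
    cat << '_EOF_' > mergeDupRowsSketch.pl
#!/usr/bin/perl -w
# hypothetical sketch: merge bed rows with identical position, name and
# strand, pooling their linked-feature columns; the scores are corrected
# later (see the scores section below)
use strict;

my %rows;
while (<STDIN>)
{
chomp;
# chrom start end name score strand pslTable lfCount lfStarts lfSizes lfNames
my @f = split /\t/;
my $key = join("\t", @f[0..3], $f[5]);
if (!exists($rows{$key}))
    {
    $rows{$key} = [@f];
    }
else
    {
    my $r = $rows{$key};
    $r->[7] += $f[7];      # lfCount
    $r->[8] .= $f[8];      # lfStarts
    $r->[9] .= $f[9];      # lfSizes
    $r->[10] .= $f[10];    # lfNames
    }
}
print join("\t", @{$rows{$_}}), "\n" foreach (sort keys %rows);
'_EOF_'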
     # repeat for singles
     hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \
        bacEndSingles order by name, chrom, chromStart;" bacsDr2_rah \
        > singles.txt
     sort singles.txt | uniq -c > singles.txt.uniq
     wc -l singles*
     # 196492 singles.txt
     # 173025 singles.txt.uniq
     
     # 203683 singles.txt (danRer1)
     # 179170 singles.txt.uniq (danRer1)
     
     nice perl removeReplicates.pl singles.txt.uniq bacEndSingles bacsDr2_rah \
                              > singlesNoReps.bed   
     # Using badPairs with no orphans                                
     hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \
           bacEndPairsBad order by name, chrom, chromStart;" bacsDr2_rah \
           > badPairs.txt
     sort badPairs.txt | uniq -c > badPairs.txt.uniq
     wc -l badPairs*
     # 18544 badPairs.txt
     # 13803 badPairs.txt.uniq
 
     nice perl removeReplicates.pl badPairs.txt.uniq bacEndPairsBad bacsDr2_rah \
          > badPairsNoReps.bed
     # sort by chrom, chromStart - done already
    # Wrote a perl script to choose 2 BAC ends to represent each BAC clone.
    # Since there is often more than one read for each BAC end in this set,
    # 2 reads were chosen for each BAC pair, or 1 for the singles, based on
    # which ones had the largest region aligned (using lfSizes) - the
    # selection rule is sketched below.
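    # (a hypothetical sketch of that selection rule only - the real
    # pickLfNames.pl works directly on the bed files; input here is
    # assumed to be pre-digested "clone read alignedBases" lines)
    cat << '_EOF_' > pickBestReadSketch.pl
#!/usr/bin/perl -w
# hypothetical sketch: per BAC clone and end (reverse T7-type read vs
# forward SP6-type read), keep the read with the largest aligned region
use strict;

my %best;
while (<STDIN>)
{
chomp;
my ($clone, $read, $bases) = split;
my $end = ($read =~ /T7|t7|\.z/) ? "rev" : "fwd";
my $b = $best{"$clone\t$end"};
$best{"$clone\t$end"} = [$read, $bases] if (!defined($b) || $bases > $b->[1]);
}
foreach my $k (sort keys %best)
{
print "$k\t$best{$k}->[0]\n";
}
'_EOF_'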
     ssh kkstore01
     cd /cluster/data/danRer2/bed/ZonLab/bacends/duplicates
     # run perl script: input bed file, pairs or singles, name of output file
     perl pickLfNames.pl pairsNoReps.bed pairs pairs2lfNames.bed
     mv error.log log.pairs
     # this log.pairs lists the 30 cases where alignments for a BAC clone use 
     # a different pair of sequence reads for the ends. These were checked out
     # and, in most cases, the alignments are almost identical or overlap for 
     # the most part so it is ok to have these extra alignments missing. 
     # CH211-69C14: zC069-C14.ya hits almost 100,000 bp downstream
     # of the .yb version. It is slightly longer and it has a good hit by BLAT
     # also to chrom 15. This is the alignment that is missing of the two for 
     # this clone.
     # run script for singles:
     perl pickLfNames.pl singlesNoReps.bed singles singles1lfName.bed
    mv error.log log.singles
     # Singles: there are 80 cases where the alignments for the BAC clone use 
     # a different sequence for the reads. badPairs: there are 13 cases.
    # These were not included in the new bed file and they were checked out -
    # the BAC clone names for these are in the log file.
     # For Singles: In all but 3 cases, the alignments were almost
     # identical or mostly overlapped the alignments for the other sequence
     # read chosen to represent that BAC end in the bed file.
     # CH211-98J20, CH211-98O21 and CH211-98G4 have alignments for other
     # sequence reads for the same end represented but the alignments are to
     # different chromosomes (see singlesNoReps.bed).                       
     perl pickLfNames.pl badPairsNoReps.bed pairs badPairs2lfNames.bed
     mv error.log log.badPairs
     # for each of these new bed files, checks were made that there are 
    # only 2 BAC ends per alignment for pairs and 1 for singles.
     # For each pair, there should only be 2 ends which can appear either
     # way round depending on the orientation and there should be 1 end for 
     # the beginning (suffix T7, t7 or z) and one end for the end 
     # (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g.
     # either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite
     # orientation. For singles, there should be a single BAC end for each 
     # alignment and for each BAC clone, a sequence for either or both types
     # of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate
     # alignments. 
     # Finally overlaps in BAC clone names were checked. All BAC clones
     # represented in each of the pairs, badPairs and singles bed files are
     # unique to that file. Between all three bed files, 173188 BAC clones
     # have alignments.
 
     ssh hgwdev
     # NOTE: using sort and uniq on hgwdev produces tab delimited output
     # after merging rows with the same BAC name, the scoring is now
     # wrong in the bed files.
     # Scores should be 1000 if there is 1 row for that name, else
     # 1500/number of rows for that sequence name - calculated by pslPairs.
     # Correct the scores.
 
     mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/scores
     cd /cluster/data/danRer2/bed/ZonLab/bacends/scores
     # copy over correctScores.pl script from danRer1
     cp /cluster/data/danRer1/bed/ZonLab/bacends/scores/correctScores.pl.gz .
     cp /cluster/data/danRer1/bed/ZonLab/bacends/scores/checkScores.pl.gz .
     gunzip *.gz
    # modify correctScores.pl to correctScores2.pl with extra argument to
     # specify if there is a bin field.
     awk '{print $4}' ../duplicates/pairs2lfNames.bed \
                  | sort | uniq -c > pairs.hits
     perl correctScores2.pl ../duplicates/pairs2lfNames.bed pairs.hits noBin \
                            > bacEndPairsGoodScores.bed
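    # (the scoring rule itself is tiny - a hypothetical sketch in awk over
    # the "count name" lines of pairs.hits; the real correctScores2.pl
    # also rewrites the bed rows with the new score)
    awk '{score = ($1 == 1) ? 1000 : 1500 / $1; print $2, score;}' pairs.hits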
 
     # same for singles
     awk '{print $4}' ../duplicates/singles1lfName.bed \
                  | sort | uniq -c > singles.hits
     
     perl correctScores2.pl ../duplicates/singles1lfName.bed singles.hits noBin \
                          > bacEndSinglesGoodScores.bed
     
     # and for badPairs
     awk '{print $4}' ../duplicates/badPairs2lfNames.bed \
                  | sort | uniq -c > badPairs.hits
     perl correctScores2.pl ../duplicates/badPairs2lfNames.bed badPairs.hits \
                  noBin > bacEndPairsBadGoodScores.bed
     # check that the scores are correct now:
     awk 'BEGIN {OFS = "\t"} {print $4, $5}' bacEndPairsGoodScores.bed \
          | sort | uniq -c > pairs.count
     perl checkScores.pl < pairs.count
     # all the BAC clones should be in good.txt and none in bad.txt
     # wc -l should give same number of lines in good.txt as in pairs.hits
     # check all the GoodScores bed files in this manner - all are correct
     # for singles, 3 end up in bad.txt but it is a rounding error,
     # there are 7 alignments for each of these: CH211-210N9, CH211-30H16
     # and DKEY-31H18. Score is: 214.285714285714
     # this will be rounded to 214 when loading the database table
 
     # Now load database tables:
     hgLoadBed danRer2 bacEndPairs bacEndPairsGoodScores.bed \
                  -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
     # Loaded 65075 elements of size 11
     hgLoadBed danRer2 bacEndSingles bacEndSinglesGoodScores.bed \
                  -sqlTable=../singles/bacEndSingles.sql
     # Loaded 172945 elements of size 11
     # load of bacEndSingles did not go as planned: 172945 record(s), 
     # 0 row(s) skipped, 29 warning(s) loading bed.tab
    # the cause of the warnings is unknown, but the whole bed file loaded
    # and the number of warnings is small, so ignore them
     hgLoadBed danRer2 bacEndPairsBad bacEndPairsBadGoodScores.bed \
                  -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
     # Loaded 13790 elements of size 11
     # load BAC end sequences into seq table so alignments may be viewed
     # symlink to bacends.fa sequences in danRer1
     mkdir -p /gbdb/danRer2/bacends
     ln -s /cluster/data/danRer1/bed/ZonLab/bacends/bacends.fa \
                                 /gbdb/danRer2/bacends/BACends.fa
     hgLoadSeq danRer2 /gbdb/danRer2/bacends/BACends.fa
     ssh kkstore01
     cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs
     # for all_bacends table, just load the alignments for those sequences
     # represented in the bacEndPairs, bacEndSingles and bacEndPairsBad tables
     # bacEnds.load.psl is the file of alignments
     # get all the names of sequences 
     foreach f (../scores/*GoodScores.bed)
        echo $f
        awk '{print $11;}' $f >> allBacEnds.names
     end
     wc -l allBacEnds.names
     # 251810 allBacEnds.names
     # this is the total number of lines in the *GoodScores.bed files 
     perl -pi.bak -e 's/,/\n/g' allBacEnds.names
     sort allBacEnds.names | uniq > allBacEnds.names.uniq
     # write script to go through and pick alignments
 cat > getAlignments.pl << 'EOF'
 #!/usr/bin/perl -w
 use strict;
 
 my $bacAligns = $ARGV[0]; # psl alignments of BAC ends
 my $bacs = $ARGV[1]; # list of BAC ends for which to select alignments
 
 open(ALIGNS, $bacAligns) || die "Can not open $bacAligns: $!\n";
 open(BACS, $bacs) || die "Can not open $bacs:$!\n";
 
 my %alignsHash;
 
 # read alignments and store
 while (<ALIGNS>)
 {
 my $l = $_;
 my @fields = split(/\t/, $l);
 # hash key is BAC end name
 push (@{$alignsHash{$fields[9]}}, $l);
 }
 close ALIGNS;
 
 while (<BACS>)
 {
 chomp;
 my $end = $_;
 if (exists($alignsHash{$end}))
    {
    my @als = @{$alignsHash{$end}};
    foreach my $a (@als)
       {
       print $a;
       }
    }
 }
 close BACS;
 'EOF'
 
     chmod +x getAlignments.pl
     # get alignments for just the BAC ends that are in the database tables
     perl getAlignments.pl bacEnds.load.psl allBacEnds.names.uniq \
          > bacEnds.load.inTables.psl
     # check that alignments are present for all the BAC ends in the list
     # alignments for all BAC ends in allBacEnds.names.uniq are there
     ssh hgwdev
     cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs
     # load all_bacends table
     hgLoadPsl danRer2 -table=all_bacends bacEnds.load.inTables.psl
     #  load of all_bacends did not go as planned: 2615155 record(s), 0 row(s) 
     #  skipped, 81 warning(s) loading psl.tab
     ssh kkstore01
     cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
     # make bacEndAlias table with Genbank accessions for ends
     # need to run getBacEndInfo.pl for the BAC end names in the 
     # BAC tables.
     # in the pairs directory, there is the allBacEnds.names.uniq file
     # so use this.
     mkdir bacEndAccs
     cd bacEndAccs
     perl ../getBacEndInfo.pl ../../pairs/allBacEnds.names.uniq \
          ../BACEnd_accessions.txt > bacEndAccs.aliases.log
     mv bacEndAccs.aliases bacEndAccs.aliases.intables
     
     ssh hgwdev
     cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1/bacEndAccs
     echo "drop table bacEndAlias" | hgsql danRer2
     hgsql danRer2 < $HOME/kent/src/hg/lib/bacEndAlias.sql
     echo "load data local infile 'bacEndAccs.aliases.intables' into table \
          bacEndAlias" | hgsql danRer2
     # clonemarkers.02.12.04.txt, ctgnames.02.12.04.txt and markers.02.12.04.txt
     # already downloaded from Sanger for danRer1 (see makeDanRer1.doc)
     ssh kkstore01
     mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
     cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
     set dr1=/cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases
     set dr2=/cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
     ln -s $dr1/clonemarkers.02.12.04.txt $dr2
     ln -s $dr1/ctgnames.02.12.04.txt $dr2
     ln -s $dr1/markers.02.12.04.txt $dr2
     ln -s $dr1/zfish_accs.txt $dr2
 
     ssh hgwdev
     cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
     hgsql -N -e 'select name from bacEndPairs;' danRer2 > bacEnds.names
     hgsql -N -e 'select name from bacEndSingles;' danRer2 >> bacEnds.names
     hgsql -N -e 'select name from bacEndPairsBad;' danRer2 >> bacEnds.names
     sort bacEnds.names | uniq > bacEnds.names.uniq
     # 173188 bacEnds.names.uniq
     # 594614 bacEnd sequences to start with 
     # some of the bacEnd sequences are replicates so count lfNames from table
     hgsql -N -e 'select lfNames from bacEndPairs;' danRer2 > bacEnds.lfNames
     hgsql -N -e 'select lfNames from bacEndSingles;' danRer2 >> bacEnds.lfNames
     hgsql -N -e 'select lfNames from bacEndPairsBad;' danRer2 >> bacEnds.lfNames
     perl -pi.bak -e 's/,/\n/g' bacEnds.lfNames
     sort bacEnds.lfNames | uniq > bacEnds.lfNames.uniq
     # 303946 bacEnds.lfNames.uniq
    # pslCheck reported 536 bad alignments in the psl file
     awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names
     # edit to remove first few lines with no names
     sort bacEndsPsl.names | uniq > bacEndsPsl.names.uniq
     # 464067 bacEndsPsl.names.uniq
     # about 78% align - after filtering
     # Add an alias table for BAC clones 
     # bacCloneAlias.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
    # Add an xref table to give external clone registry names, internal
    # names, sanger name, relationship between STS and BAC clone (method of
    # finding STS), UniSTS ID, chromosome(s) to which BAC clone is mapped
    # by BLAT, Genbank accession and STS primer sequences
     # bacCloneXRef.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
     
     ssh hgwdev
     cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
     wc -l *.02.12.04.txt
    #  28305 clonemarkers.02.12.04.txt
    # 167441 ctgnames.02.12.04.txt
    #  12008 markers.02.12.04.txt
    
     # create list of BAC clones in the tables with chromosome aligned to
     hgsql -N -e 'select name, chrom from bacEndPairs;' danRer2 \
           > bacClones.namesandchrom
     hgsql -N -e 'select name, chrom from bacEndSingles;' danRer2 \
           >> bacClones.namesandchrom
     sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq
     # use zfish_accs.txt provided by the Sanger Centre to create a list
     # of BAC clone names, internal names and Genbank accessions
     ssh kkstore01
     cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
     # get list of UniSTS IDs using aliases to search alias file
     # print Sanger name, alias and UniSTS ID, use print_markers2.pl
     # modified version of program written by Tony DiBiase (Boston 
     # Children's Hospital)
     # create find_markers3.pl as the previous version was finding too
     # many UniSTS IDs for some markers as using grep (2005-05-24, hartera)
 
     cat << '_EOF_' > find_markers3.pl
# example:
# perl find_markers3.pl UniSTS.aliases markers.02.12.04.txt
 use strict;
 my $verbose = 0;
 my ($a, $b, $f, $m, $s, $t, $aliases, @alias, @rest);
 my $aliasFile = $ARGV[0];
 my $markersFile = $ARGV[1];
 open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n";
 open(MARKERS, $markersFile) || die "Can not open $markersFile\n";
 # store aliases from aliasFile
 my ($id, $al, @alsArray, %aliasHash);
 while (<ALIAS>)
 {
    chomp;
    ($id, $al) = split /\t/;
    @alsArray = split(/;/, $al);
    foreach my $as (@alsArray)
       {
       push (@{$aliasHash{$as} }, $id);
       }
 }
 close ALIAS;
 
 while (<MARKERS>) {
     my @idArray;
     ($f, $t, $m, $idArray[0]) = 0;
     my @ids;
     chomp; ($a, $b, $aliases, @rest) = split /\|/;
     if ($verbose > 3) { printf "aliases $aliases \n"; }
     @alias = split /;/, $aliases;
     ALIAS: foreach $s (@alias) {
         if ($s =~ /[\D]+/) {
             if ($verbose > 5) { printf "this $s \n"; }
             if (exists($aliasHash{$s}))
                {
                @idArray = @{$aliasHash{$s}};
                }
             if ($idArray[0]) {
                 $f = 1; $t = $s; @ids = @idArray;
                 if ($verbose) { printf "this $s found $m \n"; }
                 last ALIAS;
             }
         }
     }
     if ($f) 
      { 
      my @sNames = split(/;/, $b);
      foreach my $sn (@sNames)
         {
         foreach my $i (@ids)
            { 
            printf "$sn\t$i\n"; 
            }
         }
     }
 }
 close MARKERS;
 '_EOF_'
     chmod +x find_markers3.pl
     perl find_markers3.pl /cluster/data/ncbi/UniSTS.2005-04-12/UniSTS.aliases \
          markers.02.12.04.txt > sangerandUniSTSId.txt
     # No need to reformat this for zfishBacClonesandSts
 
     # FPC contig information (i.e. FPC contig number) from ctgnames file is 
     # not included in the tables as these are dynamic and constantly 
     # changing with the assembly.
     # Received a new version of the zfish_accs.txt file:
     # zfish_accs050605.txt (from Mario Caccamo at Sanger), move this to 
     # /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
     awk 'BEGIN {OFS="\t"} {print $1, $2}' zfish_accs.txt | sort \
         > zfishOld.accs.sort
     awk 'BEGIN {OFS="\t"} {print $1, $2}' zfish_accs050605.txt \  
         | sort > zfishNew.accs.sort
     comm -13 zfishOld.accs.sort zfishNew.accs.sort > zfishNewAccs.only
     comm -23 zfishOld.accs.sort zfishNew.accs.sort > zfishOldAccs.only
     # 1974 zfishNewAccs.only
     # 303 zfishOldAccs.only
     # this has 1974 new BAC clone and accessions pairs and 303 of the previous 
     # BAC internal name and accession pairs are missing compared to 
     # zfish_accs.txt
     awk '{print $1};' zfishOldAccs.only | sort > zfishOld.namesonly.sort
     awk '{print $1};' zfishNewAccs.only | sort > zfishNew.namesonly.sort
     # also uniq and check no name has more than one accession - each BAC
     # name only appears once in the file
     comm -12 zfishOld.namesonly.sort zfishNew.namesonly.sort \
          > accs.OldandNew.common
     wc -l accs.OldandNew.common
     # 24 accs.OldandNew.common
     # therefore 24 BAC names have new accessions in the new file
     # zfishBacClonesandSts.c only reads the first two fields of each line
     # in the zfish_accs.txt file so create a new file with just these 2 fields.
     # Take the 303 BAC clones and accessions from zfish_accs.txt that are 
     # not in zfish_accs050605.txt and add to 9000 accs in zfishNew.accs.sort 
     cat zfishNew.accs.sort zfishOldAccs.only > zfish_accsMerged.txt
     # 9303 zfish_accsMerged.txt - BAC clone internal names and accessions
        
     # use zfishBacClonesandSts to create tab files for loading into
     # bacCloneAlias and bacCloneXRef tables
     mkdir -p /cluster/bluearc/danRer2/bacEnds/out
     # added code so that output directory now must be specified
     # fixed bug in code so that all aliases are now stored for each Sanger
     # STS name and the correct BAC clone external name is stored for internal 
     # names in all cases.
     # remake tab files using the new file mapping Sanger names and UniSTS IDs
     # and with the corrected program (2005-05-24, hartera)
  
     nice $HOME/bin/i386/zfishBacClonesandSts ctgnames.02.12.04.txt \
       clonemarkers.02.12.04.txt markers.02.12.04.txt \
       bacClones.namesandchrom.uniq zfish_accsMerged.txt \
       sangerandUniSTSId.txt \
       /cluster/bluearc/danRer2/bacEnds/out > zfishBacs.out
 
    # output is in /cluster/bluearc/danRer2/bacEnds/out so copy over
     # sort alias tab file by sangerName
     sort -k2 /cluster/bluearc/danRer2/bacEnds/out/bacAlias.tab \
          > bacAlias.sort.tab
     cp /cluster/bluearc/danRer2/bacEnds/out/bacXRef.tab .
     wc -l *.tab
     #  55836 bacAlias.sort.tab
     # 233418 bacXRef.tab
     # there are 234056 entries for danRer1 bacXRef.tab - the difference
     # reflects differences in BAC ends mapped to the assemblies
 
     ssh hgwdev
     cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
     # Remove and reload tables after bug fixes in the zfishBacClonesandSts
     # program and the new UniSTS IDs and Sanger names mapping file which 
     # were used to create these tab files (2005-05-25, hartera)
     hgsql danRer2 -e 'drop table bacCloneXRef'
     hgsql danRer2 -e 'drop table bacCloneAlias'
 
     hgsql danRer2 < $HOME/kent/src/hg/lib/bacCloneAlias.sql
     hgsql danRer2 < $HOME/kent/src/hg/lib/bacCloneXRef.sql
     nice hgsql danRer2 -e \
           'load data local infile "bacAlias.sort.tab" into table bacCloneAlias'
     nice hgsql danRer2 -e \
           'load data local infile "bacXRef.tab" into table bacCloneXRef'
     cd $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer2
     # checked data in tables to check that everything was correctly loaded
     # from the files and errors were corrected - see TEST section below
# edit trackDb.ra to add bacEnds tracks and searches for the bacEndPairs
# and bacEndSingles tracks as for danRer1. Copy over html from danRer1
# for bacEndPairs and bacEndSingles tracks. Edit to include a description
# of the filtering of alignments so that there is only one pair of sequence
# reads for each BAC pair and only one sequence read for each BAC single
# alignment.
 
     # Remove incorrect row from bacCloneXRef table (hartera, 2006-04-19)
     # There is one row where the name is "CH211-155M11__" and the 
     # intName is "zC155M11__" which is incorrect.
     # There is a correct row for intName zC155M11 and zC155M11__ and 
     # CH211-155M11__ is not in bacEnd{Singles,Pairs,PairsBad} tables and
     # zC155M11__ is not an alias in bacEndAlias.
     hgsql -e 'delete from bacCloneXRef where name = "CH211-155M11__";' danRer2
     # corrected termRegex for some bacCloneXRef searches so that they work
     # correctly (bacPairsSangerSts and bacSinglesSangerSts)
     # (2006-04-19, hartera)
  
 # BACENDS: TESTING FOR bacCloneAlias and bacCloneXRef TABLES
 # (DONE, 2005-05-25, hartera)
 # ADDED TEST TO CHECK SANGERNAMES IN ALIAS AND XREF TABLES 
 # (DONE, 2005-05-26, hartera)
     # The following tests were carried out to check that all the data
     # in the bacCloneAlias and bacCloneXRef tables is correct.
     mkdir -p \
         /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases/testTables
     cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases/testTables
 
 # Check that the correct aliases are associated with their Sanger STS names
     awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \
         ../markers.02.12.04.txt > sNameandaliases
     # write script to get one Sanger name and one alias on each line
     chmod +x getSangerAndAlias.pl
     perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format
     sort sNameandaliases.format | uniq > sNameandaliases.sort
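    # (getSangerAndAlias.pl is not shown above; a hypothetical sketch,
    # assuming both tab-separated fields are ;-separated lists)
    cat << '_EOF_' > getSangerAndAliasSketch.pl
#!/usr/bin/perl -w
# hypothetical sketch: expand "sangerNames<TAB>aliases" lines into one
# name/alias pair per line
use strict;

while (<STDIN>)
{
chomp;
my ($names, $aliases) = split /\t/;
next unless (defined($aliases));
foreach my $n (split(/;/, $names))
    {
    foreach my $a (split(/;/, $aliases))
        {
        print "$n\t$a\n";
        }
    }
}
'_EOF_'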
     # get Sanger names and aliases from database
     hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer2 \
           | sort | uniq > alias.db.sort
     wc -l alias.db.sort
     # 55836 alias.db.sort
     diff sNameandaliases.sort alias.db.sort
     # No difference between data file and data from database so ok
 
 # Check Sanger STS names correspond in bacAlias and bacCloneXRef tables
 # (2005-05-26, hartera)
     # get Sanger names from alias table
     hgsql -N -e 'select sangerName from bacCloneAlias;' danRer2 \
              | sort | uniq > sName.alias.sort  
     wc -l sName.alias.sort 
     # 14940 sName.alias.sort
     # get Sanger names from xRef table
     hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \
           is not null;' danRer2 | sort | uniq > sName.xRef.sort
     wc -l sName.xRef.sort
     # 15153 sName.xRef.sort
     comm -23 sName.alias.sort sName.xRef.sort 
     # nothing in alias file only so all sanger names in the alias table are
     # also in the xRef table
     comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias 
     wc -l sNamexRefNotAlias
    # 213 sNamexRefNotAlias - 213 sangerNames in xRef table not in alias table
     # get Sanger names from clonemarkers file
     awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.02.12.04.txt | sort | uniq \         > clonemarkers.sNames.sort
     # get Sanger names from markers file
     awk 'BEGIN {FS="|"}{print $2}' ../markers.02.12.04.txt > markers.sNames
     # remove semi-colons and sort 
     sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort
     # sanger names unique to markers file
     comm -13 clonemarkers.sNames.sort markers.sNames.sort
     # there are none
     comm -23 clonemarkers.sNames.sort markers.sNames.sort \
          > sNames.clonemarkersOnly
     wc -l sNames.clonemarkersOnly
     # 213 sNames.clonemarkersOnly
     diff sNames.clonemarkersOnly sNamexRefNotAlias
     # no difference so all the extra Sanger Names in the xRef table are 
     # from the clonemarkers file and these have no aliases in the markers file
     # so they are not in the alias table so this is all ok.
 
 # Check that Sanger STS names and primers are associated correctly
     cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases/testTables
     # get sanger names and primers from markers file
     awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \
         ../markers.02.12.04.txt > sNameandPrimers
     # write script to reformat and write with one Sanger name per line
     chmod +x getSangerandPrimers.pl
     perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format
     sort sNameandPrimers.format > sNameandPrimers.format.sort
     wc -l sNameandPrim*
     # 12008 sNameandPrimers
     # 14940 sNameandPrimers.format
     # 14940 sNameandPrimers.format.sort
     # get Sanger names and primers from database
     hgsql -N -e \
       'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \
       where sangerName is not null and leftPrimer is not null and \
       rightPrimer is not null;' danRer2 | sort | uniq \
       > sNamesandprimers.fromdb.sort
    wc -l sNamesandprimers.fromdb.sort
    # 14940 sNamesandprimers.fromdb.sort
    diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort
    # No difference so ok
 
 # Check that UniSTS IDs and Sanger STS names are associated correctly
    # get Sanger names and UniSTS IDs from the database
    hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \
          uniStsId is not null;' danRer2 | sort | uniq > sNameUniSTS.fromdb.sort
    wc -l sNameUniSTS.fromdb.sort
    #  5456 sNameUniSTS.fromdb.sort
    # Need to reformat the sNameUniSTS.fromdb.sort
    chmod +x formatUniSts.pl 
    perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \
         > sNameUniSTS.fromdb.format.sort
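     # (formatUniSts.pl is not shown above; a hypothetical one-line
     # equivalent, assuming the second column is a comma-separated list)
     perl -ne 'chomp; my ($n,$i) = split /\t/; print "$n\t$_\n" for split /,/,$i;' \
          < sNameUniSTS.fromdb.sort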
    # get Sanger names from data file and see how many UniSTS IDs there are
    # for each name
    awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \
        > sangerandUniSTSId.count
    # the most is 3 UniSTS IDs
    # 3 etID9056.23
    # 3 etID9042.2
    # 3 etID8627.2
    # 3 etID8281.9
    # 3 etID11096.5
    # 2 etID9986.14
    # 2 etID9968.17
  
    sort ../sangerandUniSTSId.txt > sangerandUniSTSId.txt.sort
    diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort \
        > sangerandUniSTSIdvsdb
    # No difference between data from original file and that in database so ok
 
 # Check that chrom mappings and external BAC clone names are correct
    # get extNames and chroms they map to from the database
    hgsql -N -e 'select name, chroms from bacCloneXRef where \
          chroms is not null;' danRer2 | sort | uniq \
          > nameandchromsfromdb.sort
    # reformat nameandchromsfromdb.sort
    perl formatUniSts.pl < nameandchromsfromdb.sort | sort \
         > nameandchromsfromdb.format.sort
    # compare extNames and chroms from db to those in data file
    diff ../bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort
    # no difference - all ok
 
 # Check Genbank accessions and internal BAC clone names
    hgsql -N -e 'select intName,genbank from bacCloneXRef where \
          genbank is not null;' danRer2 | sort | uniq \
          > intNamesandAccs.fromdb.sort
    # this should be a subset of zfish_accsMerged.txt - not all BAC clones
    # listed here appear in either our BAC ends tracks or the markers files.
    sort ../zfish_accsMerged.txt > zfish_accsMerged.sort
    comm -23 intNamesandAccs.fromdb.sort zfish_accsMerged.sort
    # there is nothing in the database that is not in zfish_accsMerged.sort 
    comm -13 intNamesandAccs.fromdb.sort zfish_accsMerged.sort > onlyinzfishAccs
    wc -l onlyinzfishAccs
    # 87 onlyinzfishAccs
     hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' \
           danRer2 | sort | uniq > intNamesNoAcc.fromdb.sort
    awk '{print $1;}' zfish_accsMerged.sort | sort > intNames.withAccs.sort
    comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \
         > indbNoAccsandAccs.out
    # none of these names are common to both so all accessions from
    # zfish_accsMerged.txt are in the database for the internal names stored
    # where there are accessions available.
   
 # Test Sanger STS names, internal names and external names are all correct
 # Test Sanger STS name and internal BAC clone names are associated correctly
    # get internal names and Sanger names from data file 
    awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.02.12.04.txt \
        | sort | uniq > intNameandSanger.sort
    hgsql -N -e 'select intName, sangerName from bacCloneXRef \
        where sangerName is not null;' danRer2 \
        | sort | uniq > intNameandSanger.fromdb.sort
    diff intNameandSanger.sort intNameandSanger.fromdb.sort 
    # No difference between data from file and that from database so ok
 
 # Check BAC clone internal name and relationship fields
    # get internal names and relationships from data file
    awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.02.12.04.txt \
        | sort | uniq > intNameandRelation.sort
    # get internal names and relationships from database
    hgsql -N -e 'select intName, relationship from bacCloneXRef \
        where relationship != 0;' danRer2 \
        | sort | uniq > intNameandrelation.fromdb.sort
    # differences unique to database file
    comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \
        > intNameRelation.indbonly
    # differences unique to data file
    comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \
        > intNameRelation.incloneMarkersonly
    wc -l intNameRelation*
    # 4345 intNameRelation.incloneMarkersonly
    # 4345 intNameRelation.indbonly
 
    awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names
    awk '{print $1}' intNameRelation.incloneMarkersonly \
        > intNameRelation.incloneMarkersonly.names
    diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names
     # there is no difference in the internal names with relationship fields:
     # no difference in names, and the only place these should differ is that
     # the second column should all be 3 in the data from the database only.
     # this is because all the relationship entries that were blank in the
     # clonemarkers file were changed to 3 when entered into the database.
    awk '{print $2}' intNameRelation.indbonly | sort | uniq
    # 3 - correct so all ok
    # all the differences should be that those that are blank in clonemarkers
    # are 3 in the database.
     # check that those that have 0 in the database bacCloneXRef relationship
    # field are not in the list from cloneMarkers
    # select these internal names with 0 relationship from the database
    hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
          danRer2 | sort | uniq > intNameNoRelation.fromdb.sort  
    # get all the internal names from the data file
    awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.02.12.04.txt \
        | sort | uniq > intNamefromCloneMarkers.sort
    comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
    # nothing in common between these two files as expected so there are
    # no internal names in the db with 0 in the relationship field that
    # appear in the clonemarkers file.
 
 # Check all BAC clone internal names and external names from the
 # ctgnames file are in the database
    # get intName and extName from ctgnames file 
    awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.02.12.04.txt \
        | sort | uniq > intNameandextNamefromCtgNames.sort
    # get intName and extName from database
    hgsql -N -e 'select intName,name from bacCloneXRef;' danRer2 \
        | sort | uniq > intNameandextName.fromdb.sort
    wc -l intNameandextName*
    # 218100 intNameandextName.fromdb.sort
    # 167441 intNameandextNamefromCtgNames.sort
    comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
         > intandextindbAndCtgNames
    wc -l intandextindbAndCtgNames 
    # 167441 intandextindbAndCtgNames
    # there are 167441 name pairs common between the file and the database
    # and this is the same number of name pairs as in the data file
    diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort
    # no difference between those name pairs from the data file and those that
    # are common between the data file and the database so all internal and
    # external names from ctgNames file are in the database
    # get the list of extra ones from db
    comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
         > intandextNamesindbNotinCtgNames
    wc -l intandextNamesindbNotinCtgNames
    # 50659 intandextNamesindbNotinCtgNames
    # get list of internal names from the clonemarkers file
    awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.02.12.04.txt | sort | uniq \
        > clonemarkers.intName.sort
    wc -l clonemarkers.intName.sort
    # 12987 clonemarkers.intName.sort
    # compare these intNames to those from the database not in the ctgnames file
    comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames
    # none of these clone markers internal names are in this list so they
    # must all be in the ctgnames file too. These extra internal names will be
    # translations of external names found in the list of mappings of BAC clones
    # to chroms.
 
 # Check that all the BAC clone external names from the list of chromosome
 # mappings and from the ctgnames file are in the database.
    # get all extNames from baclones.namesandchrom.uniq and from ctgnames
    awk '{print $1}' ../bacClones.namesandchrom.uniq > \
        extNames.ctgnamesandbacClones
    awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.02.12.04.txt \
        >> extNames.ctgnamesandbacClones
    wc -l extNames.ctgnamesandbacClones
    # 384421 extNames.ctgnamesandbacClones
    sort extNames.ctgnamesandbacClones | uniq \
         > extNames.ctgnamesandbacClones.sort
    wc -l extNames.ctgnamesandbacClones.sort
    # 218100 extNames.ctgnamesandbacClones.sort
    # get extNames from the database
    hgsql -N -e 'select name from bacCloneXRef;' danRer2 | sort | uniq \
          > extNames.fromdb.sort
    wc -l extNames.fromdb.sort
    # 218100 extNames.fromdb.sort
    comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \
          > extNames.fromdbandfiles
    wc -l extNames.fromdbandfiles
    # 218100 extNames.fromdbandfiles
    # find extNames in common from data files and database
    diff extNames.fromdb.sort extNames.fromdbandfiles
    # no difference, all extNames from files are in db
 
 # Check that all BAC clone internal names from the ctgnames and clonemarkers
 # files are in the database
    # get internal names from ctgnames and clonemarkers files
    awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.02.12.04.txt \
        > intNames.ctgnamesandclonemarkers
    awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.02.12.04.txt \
        >> intNames.ctgnamesandclonemarkers
    wc -l intNames.ctgnamesandclonemarkers
    # 195746 intNames.ctgnamesandclonemarkers
    sort intNames.ctgnamesandclonemarkers | uniq \
         > intNames.ctgnamesandclonemarkers.sort
    wc -l intNames.ctgnamesandclonemarkers.sort
    # 167441 intNames.ctgnamesandclonemarkers.sort
    # get internal names from database
    hgsql -N -e 'select intName from bacCloneXRef;' danRer2 | sort | uniq \
         > intNames.fromdb.sort
    wc -l intNames.fromdb.sort
    #  218100 intNames.fromdb.sort
    # some of these intNames are derived from the corresponding extNames
    # all of the intNames from the file should be in the db
    comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \
         > intNames.fromdbandfiles
    wc -l intNames.fromdbandfiles
    # 167441 intNames.fromdbandfiles
    diff intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
    # no difference, all intNames from files are in db
    
 # Check that all translations are correct between BAC clone
 # external and internal names.
    # write script to get the prefixes from internal and external names
    chmod +x getNamePrefixes.pl
    hgsql -N -e 'select name, intName from bacCloneXRef;' danRer2 \
          | sort | uniq > extandintNames.fromdb.sort
    perl getNamePrefixes.pl < extandintNames.fromdb.sort \
          > extandintNames.prefixes
    sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq
    # these all look good 
    # BUSM1   dZ
    # CH211   zC
    # CH211   zc
    # CH73    zH
    # CT7     bP
    # DKEY    zK
    # DKEY    zk
    # DKEYP   zKp
    # RP71    bZ
    # XX      bY
     # zk is an internal name prefix for the external name prefix, DKEY-.
     # There is only one example where this is used (DKEY-81G7); it is in the
     # ctgnames file and in the bacCloneXRef table so that is ok.
    # All data looks good in these tables now.
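     # (getNamePrefixes.pl is not shown above; a hypothetical one-line
     # equivalent - external prefix taken as the text before the dash,
     # internal prefix as the leading letters)
     perl -ne 'my ($e,$i) = split; my ($ep) = $e =~ /^([^-]+)-/; my ($ip) = $i =~ /^([A-Za-z]+)/; print "$ep\t$ip\n" if ($ep && $ip);' < extandintNames.fromdb.sort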
 
 # Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram)
     ssh hgwdev
     cd /tmp
    wget -O zebrafish_image.jpg \
	'http://www.genome.gov/Pages/News/Photos/Science/zebrafish_image.jpg'
     #	no crop images for this one, make the thumbnail directly:
     convert -geometry 300x200 -quality 80 -sharpen 0 -normalize \
 	zebrafish_image.jpg Danio_rerio.jpg
 
     cp -p Danio_rerio.jpg /usr/local/apache/htdocs/images
     #	add links to this image in the description.html page, request push
 
 # EXTRACT AXTs and MAFs FROM FUGU (fr1) NET (DONE, 2004-12-22, hartera)
 # GZIP AXTs and MAFs (DONE, 2005-05-16, hartera)
     ssh kksilo
     # create axts
     cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
     gunzip fuguFr1.net.gz
     gunzip chainAR/*.chain.gz
     netSplit fuguFr1.net fuguFr1Net
     mkdir -p ../axtNet
 cat > axtNet.csh << 'EOF'
     foreach f (fuguFr1Net/chr*.net)
         set c = $f:t:r
         echo "axtNet on $c"
         netToAxt fuguFr1Net/$c.net chainAR/$c.chain \
                  /cluster/data/danRer2/nib \
                  /cluster/data/fr1/nib ../axtNet/$c.axt
     echo "Complete: $c.net -> $c.axt"
     end
 'EOF'
     chmod +x axtNet.csh
     csh axtNet.csh >&! axtNet.log &
     tail -100f axtNet.log
     # sort axts before making mafs - must be sorted for multiz
     cd /cluster/data/danRer2/bed/blastz.fr1
     mv axtNet axtNet.unsorted
     mkdir axtNet
     foreach f (axtNet.unsorted/*.axt)
         set c = $f:t:r
         echo "Sorting $c"
         axtSort $f axtNet/$c.axt
     end
     # create maf
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.fr1
     cd axtNet
     mkdir ../mafNet
 cat > makeMaf.csh << 'EOF'
     foreach f (chr*.axt)
       set maf = $f:t:r.fr1.maf
       echo translating $f to $maf
       axtToMaf $f \
             /cluster/data/danRer2/chrom.sizes /cluster/data/fr1/chrom.sizes \
             ../mafNet/$maf -tPrefix=danRer2.  -qPrefix=fr1.
     end
 'EOF'
     chmod +x makeMaf.csh
     csh makeMaf.csh >&! makeMaf.log &
     tail -100f makeMaf.log
     cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
     nice gzip fuguFr1.net chainAR/*.chain &
     # gzip axts and mafs (hartera, 2005-05-16)
     ssh kkstore01 
     cd /cluster/data/danRer2/bed/blastz.fr1
     nice gzip axtNet/*.axt mafNet/*.maf &
 
 # EXTRACT AXTs and MAFs FROM TETRAODON (tetNig1) NET (DONE, 2004-12-22, hartera)
 # GZIP AXTs and MAFs AND CHAINS (DONE, 2005-05-16, hartera)
     ssh kksilo
     # create axts
     cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
     netSplit tetraTetNig1.net tetraTetNig1Net
     mkdir -p ../axtNet
 cat > axtNet.csh << 'EOF'
     foreach f (tetraTetNig1Net/chr*.net)
         set c = $f:t:r
         echo "axtNet on $c"
         netToAxt tetraTetNig1Net/$c.net chainAR/$c.chain \
                  /cluster/data/danRer2/nib \
                  /cluster/data/tetNig1/nib ../axtNet/$c.axt
     echo "Complete: $c.net -> $c.axt"
     end
 'EOF'
     chmod +x axtNet.csh
     csh axtNet.csh >&! axtNet.log &
     tail -100f axtNet.log
     # sort axts before making mafs - must be sorted for multiz
     cd /cluster/data/danRer2/bed/blastz.tetNig1
     mv axtNet axtNet.unsorted
     mkdir axtNet
     foreach f (axtNet.unsorted/*.axt)
         set c = $f:t:r
         echo "Sorting $c"
         axtSort $f axtNet/$c.axt
     end
     # create maf
     ssh kksilo
     cd /cluster/data/danRer2/bed/blastz.tetNig1
     cd axtNet
     mkdir ../mafNet
 cat > makeMaf.csh << 'EOF'
     foreach f (chr*.axt)
       set maf = $f:t:r.tetNig1.maf
       echo translating $f to $maf
       axtToMaf $f \
           /cluster/data/danRer2/chrom.sizes /cluster/data/tetNig1/chrom.sizes \
           ../mafNet/$maf -tPrefix=danRer2.  -qPrefix=tetNig1.
     end
 'EOF'
     chmod +x makeMaf.csh
     csh makeMaf.csh >&! makeMaf.log &
     tail -100f makeMaf.log
     # gzip axts and mafs (hartera, 2005-05-16)
     ssh kkstore01 
     cd /cluster/data/danRer2/bed/blastz.tetNig1
     nice gzip axtNet/*.axt mafNet/*.maf &
     # gzip chains again
     cd axtChain
     nice gzip chainAR/*.chain &
 
 # TIGR GENE INDEX (DONE, 2004-12-22, hartera)
 # Data from rsultana@tigr.org (Razvan Sultana at TIGR)
     ssh kksilo
     mkdir -p /cluster/data/danRer2/bed/tigr
     cd /cluster/data/danRer2/bed/tigr
 wget ftp://ftp.tigr.org/pub/data/tgi/Danio_rerio/TGI_track_danRer2_12-2004.tgz    
     tar xvzf TGI*.tgz
     foreach f (*g_gallus*)
        set f1 = `echo $f | sed -e 's/g_gallus/chicken/g'`
        mv $f $f1
     end
 
    foreach f (*drosoph*)
       set f1 = `echo $f | sed -e 's/drosoph/Dmelano/g'`
       mv $f $f1
    end
 
     foreach o (Dmelano chicken elegans human mouse rat zfish)
       echo $o
       setenv O $o
       foreach f (chr*_$o*s)
         tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
       end
     end
 
     ssh hgwdev
     cd /cluster/data/danRer2/bed/tigr
     hgsql danRer2 -e "drop table tigrGeneIndex"
 
     nice ldHgGene -exon=TC danRer2 tigrGeneIndex *.gff
     # Read 114013 transcripts in 395310 lines in 196 files
     # 114013 groups 28 seqs 1 sources 1 feature types
     # 114013 gene predictions
 
     hgsql danRer2 -e "update tigrGeneIndex set cdsStart = txStart;"
     hgsql danRer2 -e "update tigrGeneIndex set cdsEnd = txEnd;"
 
     checkTableCoords danRer2 tigrGeneIndex
     /cluster/bin/scripts/runGeneCheck /cluster/data/danRer2/bed/tigr
     # 135739 badUtrSplice
     # 114013 noCDS - fixed these in table as above
     # 30191 gap
 
     gzip *.gff *TCs
     # make changes in doTigrGeneIndex function in hgc to add original species
     # name back when constructing URL to TIGR web site.
 
 # 3-WAY MULTIZ MULTIPLE ALIGNMENT 
 # (zebrafish danRer2, fugu fr1 and tetraodon tetNig1)
 # (DONE, 2004-12-23, hartera)
 # use v.8 of multiz (see makeHg17.doc for 8-WAY alignment)
     ssh kksilo
     set multizDir = multiz.2004-12-22
     set workingDir = /cluster/bluearc/danRer2/$multizDir
     ln -s $workingDir /cluster/bluearc/danRer2/multiz3way
     mkdir -p $workingDir
     mkdir -p /cluster/data/danRer2/bed/$multizDir
     ln -s /cluster/data/danRer2/bed/$multizDir \
           /cluster/data/danRer2/bed/multiz3way
     cd /cluster/data/danRer2/bed/multiz3way
 
 # wrapper script for multiz
     # NOTE: first arg is pairwise, 2nd arg is multiple (to add to)
     # NOTE: next time, modify script so it only needs one arg -- saves the
     # multiple dirname in a file for use by the next run
 cat << 'EOF' > doMultiz.csh
 #!/bin/csh -fe
 mkdir -p $3:h
 /cluster/bin/penn/multiz $1 $2 - > $3
 'EOF'
 # << for emacs
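    # a minimal sketch of the one-arg wrapper suggested above (hypothetical,
    # untested): keep the current multiple dirname in a file, multiDir.txt,
    # seeded with the first pairwise dirname (e.g. tetNig1), so each run
    # needs only the pairwise maf as its argument
cat << 'EOF' > doMultizOneArg.csh
#!/bin/csh -fe
# $1 = pairwise maf, e.g. /cluster/bluearc/danRer2/multiz3way/fr1/chr1.maf
set pair = $1
set mult = `cat multiDir.txt`    # current multiple dirname, e.g. tetNig1
set c = $pair:t:r                # chrom root, e.g. chr1
set pdir = $pair:h:t             # pairwise dirname, e.g. fr1
set out = ${mult}${pdir}         # output dirname, e.g. tetNig1fr1
mkdir -p $out
/cluster/bin/penn/multiz $pair $mult/$c.maf - > $out/$c.maf
echo $out > multiDir.txt         # save the new multiple dirname for next run
'EOF'
# << for emacs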
     cat << 'EOF' > gsub
 #LOOP
 ../doMultiz.csh {check in line /cluster/bluearc/danRer2/multiz3way/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/danRer2/multiz3way/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/danRer2/multiz3way/$(root1)$(dir1)/$(root2).maf}
 #ENDLOOP
 'EOF'
 # << for emacs
     chmod +x doMultiz.csh
 
     ssh kksilo
     set workingDir = /cluster/bluearc/danRer2/multiz3way
     # copy mafs to bluearc for tetraodon (tetNig1)
     mkdir $workingDir/tetNig1
     cd /cluster/data/danRer2/bed/blastz.tetNig1/mafNet
     foreach f (*.maf)
        set c = $f:t:r
        set g = $c:t:r
        echo $g
        cp $f $workingDir/tetNig1/${g}.maf
     end
     cd /cluster/data/danRer2/bed/multiz3way
     ls $workingDir/tetNig1/*.maf > chrom.lst
 
     # fugu (fr1)
     mkdir $workingDir/fr1
     cd /cluster/data/danRer2/bed/blastz.fr1/mafNet
     foreach f (*.maf)
        set c = $f:t:r
        set g = $c:t:r
        echo $g
        cp $f $workingDir/fr1/${g}.maf
     end
 
     # one multiz - add in fugu to zebrafish/tetraodon
     ssh kki
     set workingDir = /cluster/bluearc/danRer2/multiz3way
     cd /cluster/data/danRer2/bed/multiz3way
     mkdir run.fr1
     cd run.fr1
     echo "fr1/tetNig1" > species.lst
     gensub2 species.lst ../chrom.lst ../gsub jobList
     para create jobList
     # 28 jobs in batch 
     para try, check, push, check ...
 # para time
 # CPU time in finished jobs:        337s       5.62m     0.09h    0.00d  0.000 y
 # IO & Wait Time:                   153s       2.55m     0.04h    0.00d  0.000 y
 # Average job time:                  18s       0.29m     0.00h    0.00d
 # Longest job:                       66s       1.10m     0.02h    0.00d
 # Submission to last job:           134s       2.23m     0.04h    0.00d
 
     cd ..
     # copy 3-way mafs to build directory
     ssh kksilo
     set workingDir = /cluster/bluearc/danRer2/multiz3way
     ln -s $workingDir/tetNig1fr1 $workingDir/maf
     cd /cluster/data/danRer2/bed/multiz3way
     mkdir maf
     cp $workingDir/maf/*.maf maf
 
 # WZ EST CLUSTERS ALIGNMENTS (DONE, 2004-12-23, hartera)
 # WZ ESTs are compiled ESTs from WashU. They were provided by
 # Anthony DiBiase from Leonard Zon's lab at the Children's Hospital, Harvard
 # Contact: adibiase@enders.tch.harvard.edu
 # http://zon.tchlab.org
      ssh kksilo
      mkdir -p /cluster/data/danRer2/bed/ZonLab/wzESTs
      cd /cluster/data/danRer2/bed/ZonLab/wzESTs
      # put WZ ESTs in this directory as wzcontigs.txt -
      # obtained from the Zon Lab, these are unmasked.
      # There are 42857 ESTs in this file.
      # Translated to upper case for danRer1
      cp /cluster/data/danRer1/bed/ZonLab/wzESTs/wzcontigs.fa .
      cd /cluster/data/danRer2/bed/ZonLab/wzESTs
      mkdir -p /cluster/bluearc/danRer2/wzESTs
      faSplit sequence wzcontigs.fa 20 /cluster/bluearc/danRer2/wzESTs/wzcontigs
 
      ssh kki
      cd /cluster/data/danRer2/bed/ZonLab/wzESTs
      mkdir psl
      ls -1 /cluster/bluearc/danRer2/wzESTs/*.fa > est.lst
      ls -1S /cluster/bluearc/danRer2/trfFa/*.fa > genome.lst
      cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -ooc={check in exists /iscratch/i/danRer2/11.ooc} {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
      gensub2 genome.lst est.lst gsub spec
      para create spec
      para try, check, push, try ....
 # para time
 # Completed: 5980 of 5980 jobs
 # CPU time in finished jobs:      17176s     286.27m     4.77h    0.20d  0.001 y
 # IO & Wait Time:                 22870s     381.16m     6.35h    0.26d  0.001 y
 # Average job time:                   7s       0.11m     0.00h    0.00d
 # Longest job:                       49s       0.82m     0.01h    0.00d
 # Submission to last job:          2890s      48.17m     0.80h    0.03d
 
      # Do sort, best in genome filter, and convert to chromosome coordinates
      # to create wzEsts.psl
      ssh kksilo
      cd /cluster/data/danRer2/bed/ZonLab/wzESTs
      pslSort dirs raw.psl tmp psl
      # only use alignments that have at least
      # 96% identity in aligned region. use parameters used by auto
      # GenBank update for native ESTs
     pslReps -minAli=0.96 -nearTop=0.005 raw.psl contig.psl /dev/null
     liftUp wz_ests.psl /cluster/data/danRer2/jkStuff/liftAll.lft warn contig.psl
     # check psl
     pslCheck wz_ests.psl >& pslCheck.log
     # looks good 
     # Load EST alignments into database.
     ssh hgwdev
     cd /cluster/data/danRer2/bed/ZonLab/wzESTs
     hgLoadPsl danRer2 wz_ests.psl
 
     # Add WZ EST sequences
     # Copy sequences to gbdb if they are not there already
     mkdir -p /gbdb/danRer2/wzESTs
     ln -s \
        /cluster/data/danRer1/bed/ZonLab/wzESTs/wzcontigs.fa \
        /gbdb/danRer2/wzESTs
 
     hgLoadSeq danRer2 /gbdb/danRer2/wzESTs/wzcontigs.fa
 
 # MAKE Human Proteins track (hg17 DONE 2004-12-15 braney)
     ssh kksilo
     mkdir -p /cluster/data/danRer2/blastDb
     cd /cluster/data/danRer2/blastDb
     cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA/d" | sed "/Un/d" > chrom.list
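    # the sed below keeps only names with at least three underscores (the
    # split contig .fa pieces); NA and Un scaffolds are handled separately below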
     for i in `cat chrom.list`; do ls -1 ../$i/*/*.fa . ; done | sed -n "/.*_.*_.*_.*/p" > list
     ln -s `cat list` .
     for i in *.fa
     do
 	formatdb -i $i -p F
     done
     rm *.log *.fa list
     cd ..
     for i in `cat blastDb/chrom.list`; do cat  $i/chr*/*.lft  ; done > jkStuff/subChr.lft
     rm blastDb/chrom.list
 
     mkdir /cluster/data/danRer2/scaffoldBlastDb
     cd /cluster/data/danRer2/scaffoldBlastDb
     cat ../Un/scaffolds/*.fa ../NA/scaffolds/*.fa |  faSplit sequence stdin 500 scaf
     for i in *.fa
     do
 	formatdb -i $i -p F
     done
     rm *.log *.fa
 
     ssh kkr1u00
     mkdir -p /iscratch/i/danRer2/blastDb
     cd /cluster/data/danRer2/blastDb
     for i in nhr nin nsq; do cp *.$i /iscratch/i/danRer2/blastDb     ; echo $i; done
 
     mkdir -p /iscratch/i/danRer2/scaffoldBlastDb
     cd /cluster/data/danRer2/scaffoldBlastDb
     for i in nhr nin nsq; do cp *.$i /iscratch/i/danRer2/scaffoldBlastDb     ; echo $i; done
 
     cd
     (iSync) > sync.out
 
     mkdir -p /cluster/data/danRer2/bed/tblastn.hg17KG
     cd /cluster/data/danRer2/bed/tblastn.hg17KG
     echo  /iscratch/i/danRer2/blastDb/*.nsq  | xargs ls -S | sed "s/\.nsq//"  > query.lst  
     echo  /iscratch/i/danRer2/scaffoldBlastDb/*.nsq  | xargs ls -S | sed "s/\.nsq//"  > scaffold.lst  
     # back to kksilo
     exit
 
     # we want around 250000 jobs
     cd /cluster/data/danRer2/bed/tblastn.hg17KG
     calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
     # 42156/(250000/3899) = 657.464976
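    # i.e. chunks of 657 proteins -> 42156/657 = 65 query files, and
    # 65 files x 3899 blast DBs = 253435 cluster jobs (see para time below)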
     mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa
     split -l 657 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa/kg
     ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa kgfa
     cd kgfa
     for i in *; do pslxToFa $i $i.fa; rm $i; done
     cd ..
     ls -1S kgfa/*.fa > kg.lst
     mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut
     ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut
     for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
     tcsh
     cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
 #ENDLOOP
 '_EOF_'
     cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/iscratch/i/blast/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
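# e-values run from loosest to strictest; the first blastall run that
# succeeds is kept, so the stricter cutoffs only act as fallbacks when a
# looser run fails (e.g. because the output is too large)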
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
 	liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2       
         liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3       
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4       
         if pslCheck -prot $3.tmp                                                                          
         then                                                                                              
             mv $3.tmp $3                                                                                  
             rm -f $f.1 $f.2 $f.3 $f.4                                                                              
         fi
         exit 0                                                                                            
     fi                                                                                                    
 fi                                                                                                        
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4      
 exit 1
 '_EOF_'
 
     chmod +x blastSome
     gensub2 query.lst kg.lst blastGsub blastSpec
     # I ended up doing the scaffolds separately from the chopped up chrom segments
     # but next time I wouldn't
     gensub2 scaffold.lst kg.lst blastGsub scaffoldBlastSpec
 
     ssh kk
     cd /cluster/data/danRer2/bed/tblastn.hg17KG
     para create blastSpec
     para push
 # Completed: 253435 of 253435 jobs
 # CPU time in finished jobs:   58003988s  966733.13m 16112.22h  671.34d  1.839 y
 # IO & Wait Time:              13196517s  219941.96m  3665.70h  152.74d  0.418 y
 # Average job time:                 281s       4.68m     0.08h    0.00d
 # Longest job:                     6238s     103.97m     1.73h    0.07d
 # Submission to last job:        174503s    2908.38m    48.47h    2.02d
 
     cat << '_EOF_' > chainGsub
 #LOOP
 chainSome $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainSome
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin ../c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainSome
 
     ls -1dS `pwd`/blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
 
     ssh kki
     cd /cluster/data/danRer2/bed/tblastn.hg17KG
     para create chainSpec
     para push
 # Completed: 1287 of 1287 jobs
 # CPU time in finished jobs:      72236s    1203.94m    20.07h    0.84d  0.002 y
 # IO & Wait Time:                 22886s     381.43m     6.36h    0.26d  0.001 y
 # Average job time:                  74s       1.23m     0.02h    0.00d
 # Longest job:                     3374s      56.23m     0.94h    0.04d
 # Submission to last job:          5982s      99.70m     1.66h    0.07d
 
     exit
     # back to kksilo
     cd /cluster/data/danRer2/bed/tblastn.hg17KG/blastOut
     # again some weirdness because I did the NA and Un scaffolds separately
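    # in these filters $11,$12,$13 are the psl qSize,qStart,qEnd and $1 is
    # matches: keep alignments spanning >60% of the protein, take the
    # best-scoring unique ones (pslUniq), and also keep any alignment whose
    # matching bases cover >60% of the protein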
     for i in kg??
     do 
 	cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > cs60.$i.psl
 	sort -rn cs60.$i.psl | pslUniq stdin us.$i.psl
 	awk "((\$1 / \$11) ) > 0.60 { print   }" cs60.$i.psl > ms60.$i.psl
 	echo $i
     done
 
     for i in kg??
     do 
 	cat $i/c.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
 	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
 	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
 	echo $i
     done
 
    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n  | uniq  > /cluster/data/danRer2/bed/tblastn.hg17KG/blastHg17KG.psl
     cd ..
     # lift the chrUn and chrNA scaffolds 
 
     ssh hgwdev
     cd /cluster/data/danRer2/bed/tblastn.hg17KG
     hgLoadPsl danRer2 blastHg17KG.psl
 
     # back to kksilo
     rm -rf blastOut
 # End tblastn
 
 # ACCESSIONS FOR CONTIGS (WORKING 2005-03-17 kate)
 #  Used for Assembly track details, and also ENCODE AGP's
     cd /cluster/data/danRer2/bed
     mkdir -p contigAcc
     cd contigAcc
     
     # extract WGS contig to accession mapping from Entrez Nucleotide summaries
     # To get the summary file, access the Genbank page for the project 
     # by searching:
     #       genus[ORGN] AND WGS[KYWD]
     # At the end of the page, click on the list of accessions for the contigs.
     # Select summary format, and send to file.
     # output to file <species>.acc
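    # (untested sketch, eutils endpoint and params assumed) the same
    # summaries could be fetched non-interactively:
cat > fetchAcc.sh << '_EOF_'
#!/bin/sh
# esearch stores the hit list server-side; efetch then pulls the summaries
EUTILS=http://eutils.ncbi.nlm.nih.gov/entrez/eutils
wget -q -O search.xml \
  "$EUTILS/esearch.fcgi?db=nucleotide&term=Danio+rerio[ORGN]+AND+WGS[KYWD]&usehistory=y"
KEY=`sed -n 's#.*<QueryKey>\(.*\)</QueryKey>.*#\1#p' search.xml`
WEB=`sed -n 's#.*<WebEnv>\(.*\)</WebEnv>.*#\1#p' search.xml`
wget -q -O zfish.acc \
  "$EUTILS/efetch.fcgi?db=nucleotide&query_key=$KEY&WebEnv=$WEB&rettype=docsum&retmode=text"
'_EOF_'
    chmod +x fetchAcc.sh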
 
     grep scaffold zfish.acc | wc -l
     # 21333
 
     # edit hg/encode/regionAgp/contigAccession.pl to recognize zfish format
     # and install in /cluster/bin/scripts
     contigAccession zfish.acc > contigAcc.tab
     wc -l contigAcc.tab
     # 21333
     grep Zv4 /cluster/data/danRer2/?{,?}/*.agp | wc -l
     # 21333
     hgsql danRer2 < $HOME/kent/src/hg/lib/contigAcc.sql
     echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
         hgsql danRer2
     hgsql danRer2 -e "SELECT COUNT(*) FROM contigAcc"
         # 21333
 
 # KNOWN GENES TRACK (in progress, 2005-02-14, hartera)
 # Found number of loci for Ensembl and mRNA by clustering
     cd $HOME/kent/src/hg/near/hgClusterGenes
     hgClusterGenes -noProt danRer2 ensGene ensCluster ensCanonical 
     # creates 23675 clusters, there are 32062 entries in ensGene
     # remove extra tables created by this program
     echo 'drop table ensCluster;' | hgsql danRer2
     echo 'drop table ensCanonical;' | hgsql danRer2
    cd $HOME/kent/src/hg/geneBounds/clusterRna
     clusterRna danRer2 rna.bed est.bed -noEst
     wc -l rna.bed
     # 10,383 clusters, RefGene has 8397 accessions so that is about right
     rm *.bed
     # Therefore Ensembl should be the basis for the Known Genes tracks
     # since it represents the most loci.
     # move Ensembl to be the first track on the browser display and 
     # with default visibility of pack 
     # edit zebrafish/danRer2/trackDb.ra
     # track ensGene
     # shortLabel Ensembl Genes
     # longLabel Ensembl Gene Predictions
     # group genes
     # priority 32.8
     # visibility pack
     # color 150,0,0
     # type genePred ensPep
     # url http://www.ensembl.org/perl/transview?transcript=$$
     # hgGene on
     
     cd kent/src/hg/hgGene
     # edit hgGene.c to add special case of ensGene table for Zebrafish 
     # from kent/src/hg/near/makeNear.doc
     ssh hgwdev
     cd kent/src/hg/near/hgNear/hgNearData
     mkdir Zebrafish
     cvs add Zebrafish
     cd Zebrafish
     cp ../C_elegans/genome.ra .
     # edit this to be specific for zebrafish and commit to CVS
     cvs add genome.ra
     cvs commit genome.ra
     cd $HOME/kent/src/hg/near/hgClusterGenes
     # Cluster together various alt splice forms
     hgClusterGenes -noProt danRer2 ensGene ensIsoforms ensCanonical
     # Got 23675 clusters, from 32062 genes in 28 chromosomes
     # need to add genome.ra in hgGeneData
    cd $HOME/kent/src/hg/hgGene/hgGeneData
    mkdir Zebrafish
    cd Zebrafish
    cp ../C_elegans/genome.ra .
     # edit genome.ra and commit to CVS
     cp ../C_elegans/links.ra .
     # edit links.ra and commit to CVS
     # download protein accessions from Ensembl
     # go to http://www.ensembl.org/Multi/martview
     # select Ensembl 29 and Danio rerio genes (ZFISH 4)
     # go to Features and select Ensembl Gene ID from Ensembl Attributes, 
    # then from External References, select UniProt/Swiss-Prot AC,
    # UniProt AC and RefSeq DNA accession
    # copy to this directory and unzip
     gunzip dr2EnsemblUniProt.gz
     wc -l dr2EnsemblUniProt
     # 34115 dr2EnsemblUniProt
     awk 'BEGIN{FS="\t"} {print $3;}' dr2EnsemblUniProt > ensembl.uniprot
     sort ensembl.uniprot | uniq > ensembl.uniprot.uniq
     # 4361 
     # download uniProt AC, UniProt/SPTREMBL ID, UniProt/Swiss-Prot ID
     # remove blank lines from ensembl.uniprot
     sed -e '/^$/d' ensembl.uniprot > ensembl.uniprot.accsonly
     # there are 6171 UniProt accessions so some Ensembl IDs share accessions
 
 
 
 #########################################################################
 # MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)
 
 # EXTRACT AXTs and MAFs FROM MOUSE (mm6) NET - already done, see
 # /cluster/data/danRer2/bed/blastz.mm6.swap/mafNet and makeMm6.doc 
 # EXTRACT AXTs and MAFs FROM OPOSSUM (monDom1) NET 
 # (DONE, 2005-05-13, hartera) - see makeMonDom1.doc
 
 # EXTRACT AXTs AND MAF's FROM HUMAN NET (DONE, 2005-05-15, hartera)    
     ssh kkstore01
     # create axts
     cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
     gunzip humanhg17.net.gz
     gunzip chainAR/*.chain.gz
     netSplit humanhg17.net humanHg17Net
     mkdir -p ../axtNet
 cat > axtNet.csh << 'EOF'
     foreach f (humanHg17Net/chr*.net)
         set c = $f:t:r
         echo "axtNet on $c"
         netToAxt humanHg17Net/$c.net chainAR/$c.chain \
                  /cluster/data/danRer2/nib \
                  /cluster/data/hg17/nib ../axtNet/$c.axt
     echo "Complete: $c.net -> $c.axt"
     end
 'EOF'
     chmod +x axtNet.csh
     csh axtNet.csh >&! axtNet.log &
     tail -100f axtNet.log
     # sort axts before making mafs - must be sorted for multiz
     cd /cluster/data/danRer2/bed/blastz.hg17.swap
     mv axtNet axtNet.unsorted
     mkdir axtNet
     foreach f (axtNet.unsorted/*.axt)
         set c = $f:t:r
         echo "Sorting $c"
         axtSort $f axtNet/$c.axt
     end
     # create maf
     ssh kkstore01
     cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtNet
     mkdir ../mafNet
 cat > makeMaf.csh << 'EOF'
     foreach f (chr*.axt)
       set maf = $f:t:r.hg17.maf
       echo translating $f to $maf
       axtToMaf $f \
             /cluster/data/danRer2/chrom.sizes /cluster/data/hg17/chrom.sizes \
             ../mafNet/$maf -tPrefix=danRer2.  -qPrefix=hg17.
     end
 'EOF'
     chmod +x makeMaf.csh
     csh makeMaf.csh >&! makeMaf.log &
     tail -100f makeMaf.log
     cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
     nice gzip humanhg17.net chainAR/*.chain &
     cd /cluster/data/danRer2/bed/blastz.hg17.swap
     nice gzip axtNet/*.axt mafNet/*.maf &
 
 # PREP FOR LIFTOVER CHAINS FOR THIS ASSEMBLY (DONE, 2005-05-15, hartera)
     # split into 3k chunks
     ssh kkr1u00
     cd /cluster/data/danRer2
     set liftDir = /iscratch/i/danRer2/liftOver/liftSplit
     mkdir -p $liftDir
     cd $liftDir
     mkdir -p split lift
 cat > split.csh << 'EOF'
     set liftDir = /iscratch/i/danRer2/liftOver/liftSplit
     cd /cluster/data/danRer2
     foreach n (`ls ?{,?}/*.fa`)
         set d = $n:h
         set c = $n:t:r
         echo $c
         faSplit -lift=$liftDir/lift/$c.lft size \
             /cluster/data/danRer2/$d/$c.fa -oneFile 3000 $liftDir/split/$c
     end
 'EOF'
 # << for emacs
     chmod +x split.csh
     csh split.csh >&! split.log &
     tail -100f split.log
     mkdir /iscratch/i/danRer2/liftOver/liftSplit/split/UnandNA
     mv /iscratch/i/danRer2/liftOver/liftSplit/split/chrUn.fa \
        /iscratch/i/danRer2/liftOver/liftSplit/split/chrNA.fa \
        /iscratch/i/danRer2/liftOver/liftSplit/split/UnandNA
     iSync
  
# MAKE VSTETNIG1 DOWNLOADABLES (DONE, 2005-05-16, hartera)
     ssh hgwdev
     cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChrom
     set gp = /usr/local/apache/htdocs/goldenPath/danRer2
     mkdir -p $gp/vsTetNig1/axtChrom
     cp -p *.axt $gp/vsTetNig1/axtChrom
     cd $gp/vsTetNig1/axtChrom
     gzip *.axt
     md5sum *.gz > md5sum.txt
     
     # copy chains and net to downloads area
     cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
     gzip -c all.chainARFilt5k > /cluster/data/danRer2/zip/tetraTetNig1.chain.gz
     gzip -c tetraTetNig1.net > /cluster/data/danRer2/zip/tetraTetNig1.net.gz
     cd $gp/vsTetNig1
     mv /cluster/data/danRer2/zip/tetra*.gz .
     md5sum *.gz > md5sum.txt
     # Copy over & edit README.txt w/pointers to chain, net formats.
 
 # 6-WAY VAR_MULTIZ ALIGNMENTS (DONE, 2005-05-20, hartera)
 # Species: zebrafish(danRer2), human (hg17), mouse(mm6), opossum(monDom1), 
 # fugu(fr1) and tetraodon(tetNig1)
 # Tetraodon and Fugu are quite distant from zebrafish, more distant than 
 # human/chicken so use the HoxD55.q matrix in future for blastz with these 
 # species. Here the pairwise alignments of zebrafish with Tetraodon and Fugu
 # were done using the default matrix for blastz. (2005-06-01, hartera)
 # zebrafish(danRer2), human (hg17), mouse(mm6), opossum(monDom1), 
 # fugu(fr1) and tetraodon(tetNig1)
 # Prepared tree image for Conservation track details page (2005-06-07, hartera)
     ssh kkstore01
     mkdir -p /cluster/data/danRer2/bed/multiz6way
     cd /cluster/data/danRer2/bed/multiz6way
     mkdir mafLinks
     # set up directories for links to mafs for each pairwise alignment
     mkdir mafLinks/hg17
     mkdir mafLinks/mm6
     mkdir mafLinks/monDom1
     mkdir mafLinks/fr1
     mkdir mafLinks/tetNig1
    
     set dir=/cluster/data/danRer2/bed
     set dirMonDom=/cluster/data/monDom1/bed
     # need to make all the links so that they are just chrN.maf.gz 
     # without the species db in the middle e.g. chrN.hg17.maf.gz
     ln -s $dir/blastz.mm6.swap/mafNet/*.maf.gz ./mafLinks/mm6
     echo "$dir/blastz.hg17.swap" > dir.lst
     echo "$dirMonDom/blastz.monDom1.to.danRer2" >> dir.lst
     echo "$dir/blastz.fr1" >> dir.lst
     echo "$dir/blastz.tetNig1" >> dir.lst
 
     foreach d (`cat dir.lst`)
       foreach m ($d/mafNet/*.maf.gz)
          foreach c (`cat /cluster/data/danRer2/chrom.lst`)
             foreach sp (hg17 monDom1 fr1 tetNig1)
                set maf = ${d}/mafNet/chr${c}.${sp}.maf.gz
                if ($m == $maf) then
                   set s = $sp
                   echo $s
                endif
             end     
             ln -s $d/mafNet/chr${c}.${s}.maf.gz ./mafLinks/$s/chr${c}.maf.gz
          end
       end
     end
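    # a simpler equivalent for next time (untested sketch): derive species
    # and chrom from the filename with csh modifiers instead of the
    # quadruple loop above, e.g.:
    #  foreach d (`cat dir.lst`)
    #     foreach m ($d/mafNet/chr*.maf.gz)
    #        set f = $m:t              # e.g. chr1.hg17.maf.gz
    #        set s = $f:r:r:e          # species db, e.g. hg17
    #        set c = $f:r:r:r          # chrom, e.g. chr1
    #        ln -s $m ./mafLinks/$s/$c.maf.gz
    #     end
    #  end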
      
     # Copy MAFS to Iservers for kluster run
     ssh kkr1u00
     mkdir /iscratch/i/danRer2/multiz6way
     cd /iscratch/i/danRer2/multiz6way
     rsync -a --copy-links --progress \
           /cluster/data/danRer2/bed/multiz6way/mafLinks/ .
     # 321 Mb of data - takes seconds
     mkdir penn
     cp -p /cluster/bin/penn/psuCVS/multiz-tba/multiz penn
     cp -p /cluster/bin/penn/maf_project penn
     /cluster/bin/iSync
 
 #       Progressive alignment up the tree w/o stager,
 #       using multiz.v10 (var_multiz)
 #       Method: align internal subtrees (using 0 flag to var_multiz)
#               Then, align these to the reference (using 1 flag to var_multiz)
 #       NOTE: must use maf_project after each multiz run, in order
 #       to order output.  Single-cov guaranteed by use of net MAF's,
 #       so it is not necessary to run single_cov2.
 
     # make output dir and run dir
     ssh kki
     cd /cluster/data/danRer2/bed/multiz6way
     mkdir -p maf
     mkdir -p run
     cd run
 
     # create scripts to run var_multiz on cluster
 cat > oneMultiz.csh << 'EOF'
 #!/bin/csh -fe
     set c = $1
     set multi = /scratch/danRer2/multiz6way.$c
     set pairs = /iscratch/i/danRer2/multiz6way
     
     # special mode --
     # with 1 arg, cleanup
     if ($#argv == 1) then
         rm -fr $multi
         exit
     endif
     
     # special mode --
     # with 3 args, saves an alignment file
     if ($#argv == 3) then
         cp $multi/$2/$c.maf $3
         exit
     endif
     
     set s1 = $2
     set s2 = $3
     set flag = $4
     
     # locate input files -- in pairwise dir, or multiple dir
     set d1 = $multi
     set d2 = $multi
     if (-d $pairs/$s1) then
         set d1 = $pairs
         set f1 = $d1/$s1/$c.maf.gz
         set t1 = /tmp/$s1.$c.maf
         zcat $f1 > $t1
     else
         set f1 = $d1/$s1/$c.maf
         set t1 = /tmp/$s1.$c.maf
         cp -p $f1 $t1
     endif
     if (-d $pairs/$s2) then
         set d2 = $pairs
         set f2 = $d2/$s2/$c.maf.gz
         set t2 = /tmp/$s2.$c.maf
         zcat $f2 > $t2
     else
         set f2 = $d2/$s2/$c.maf
         set t2 = /tmp/$s2.$c.maf
         cp -p $f2 $t2
     endif
     # write to output dir
     set out = $multi/${s1}${s2}
     mkdir -p $out
     
     # check for empty input file
     if (-s $t1 && -s $t2) then
         echo "Aligning $f1 $f2 $flag"
         /iscratch/i/danRer2/multiz6way/penn/multiz $t1 $t2 $flag $out/$c.unused1.maf $out/$c.unused2.maf > $out/$c.full.maf
         cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \
                 $out/$c.tmp.maf
         echo "Ordering $c.maf"
         /iscratch/i/danRer2/multiz6way/penn/maf_project $out/$c.tmp.maf danRer2.$c > $out/$c.maf
         rm -f $t1 $t2
     else if (-s $t1) then
         cp -p $t1 $out/$c.maf
         rm -f $t1
     else if (-s $t2) then
         cp -p $t2 $out/$c.maf
         rm -f $t2
     endif
 'EOF'
 # << keep emacs coloring happy
     chmod +x oneMultiz.csh
 
     #   Copy this script to iscratch
     ssh kkr1u00
     cd /iscratch/i/danRer2/multiz6way/penn
     cp -p /cluster/data/danRer2/bed/multiz6way/run/oneMultiz.csh .
     /cluster/bin/iSync
     #   back to run the job
     ssh kki
     cd /cluster/data/danRer2/bed/multiz6way/run
     
     #   This tree.nh was used in the distant past for early versions
     #   of phastCons.  Now, this is merely a convenient reference to the
     #   tree under construction.  This is also used to draw a graphic
     #   tree as species.nh, see below.
     cat << '_EOF_' > tree.nh
(((hg17,mm6),monDom1),((tetNig1,fr1),danRer2))
 '_EOF_'
     # << this line keeps emacs coloring happy
 
 cat > allMultiz.csh << 'EOF'
 #!/bin/csh -fe
     # multiple alignment steps:
 set c = $1
 /iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c hg17 mm6  0
 /iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c monDom1 hg17mm6  0
 /iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c tetNig1 fr1  1
 /iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c tetNig1fr1 monDom1hg17mm6  1
 # get final alignment file
 /iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c tetNig1fr1monDom1hg17mm6 /cluster/data/danRer2/bed/multiz6way/maf/$c.maf
 #cleanup
 /iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c
 'EOF'
 # << keep emacs coloring happy
     chmod +x allMultiz.csh
 
 cat  << 'EOF' > template
 #LOOP
 ./allMultiz.csh $(root1) {check out line+ /cluster/data/danRer2/bed/multiz6way/maf/$(root1).maf}
 #ENDLOOP
 'EOF'
 
     cd /cluster/data/danRer2/bed/multiz6way/run
     awk '{print $1}' ../../../chrom.sizes > chrom.lst
     
     gensub2 chrom.lst single template jobList
     para create jobList
     para try; para check
     para push
 # para time
 # Completed: 28 of 28 jobs
 # CPU time in finished jobs:       5903s      98.39m     1.64h    0.07d  0.000 y
 # IO & Wait Time:                   227s       3.78m     0.06h    0.00d  0.000 y
 # Average job time:                 219s       3.65m     0.06h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            1328s      22.13m     0.37h    0.02d
 # Submission to last job:          2098s      34.97m     0.58h    0.02d
 
     # do not filter mafs as only removes a small fraction of alignments
     # better to keep them all. check for single column alignments (these
     # just have a single base for each species in the alignment), if 
     # present need to do glueing step. 
     ssh hgwdev
     # create tab file for chr1.maf
     cd /cluster/data/danRer2/bed/multiz6way
     hgLoadMaf danRer2 multiz6way -test=./maf/chr1.maf
     awk 'BEGIN {OFS="\t"} {print $2, $3, $4, $5, $6, $4-$3;}' \
         multiz6way.tab > chr1.tab.lengths
     sort -n -k6 chr1.tab.lengths > chr1.tab.lengths.sort
     # chr1 - 10 single column alignments
     # also looked at chrUn - 14 single column alignments 
     # there are also a small number of 2,3,4 column alignments.
     # This is ok as it is such a small number.
     # use mafAddIRows to 'chain' the alignments 
     ssh kkstore01
     cd /cluster/data/danRer2/bed/multiz6way
     mkdir mafAddIRows
     foreach m (./maf/*.maf)
        set n=$m:t
        echo "Processing $n ..."
        mafAddIRows $m /cluster/data/danRer2/danRer2.2bit ./mafAddIRows/$n
     end
     # cat maf files into one file for those alignments without iRows, 
     # do not use alignments with iRows in browser at the moment
 
     catDir maf > multiz6way.maf
     # 978 Mb of data
     # -rw-rw-r--  1 hartera protein 977796776 May 20 12:06 multiz6way.maf
     # Load into database  
     ssh hgwdev
     cd /cluster/data/danRer2/bed/multiz6way
     mkdir /gbdb/danRer2/multiz6way
     ln -s /cluster/data/danRer2/bed/multiz6way/multiz6way.maf \
         /gbdb/danRer2/multiz6way
     hgLoadMaf danRer2 multiz6way
     # Loaded 1230321 mafs in 1 files from /gbdb/danRer2/multiz6way
     # took about 2 minutes to load
     hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 danRer2 \
         multiz6waySummary multiz6way.maf
     # Processed 2565228 components in 1230321 mafs from multiz6way.maf
     # took about 2 minutes to load
 
     # Dropped unused indexes (2006-05-09 kate)
     # NOTE: this is not required in the future, as the loader
     # has been fixed to not generate these indexes
     hgsql danRer2 -e "alter table multiz6waySummary drop index chrom_2"
     hgsql danRer2 -e "alter table multiz6waySummary drop index chrom_3"
 
     # create tree image - like tree.nh but with common names, added zebrafish
     # to tree (2005-06-07, hartera):
     cat << '_EOF_' > species6.nh
 (((human,mouse),opossum),((tetraodon,fugu),zebrafish))
 '_EOF_'
     /cluster/bin/phast/draw_tree -b -s species6.nh > species6.ps
     # used GIMP to reduce the amount of whitespace to make it
     # smaller, then save as jpg
     display species6.jpg
     # use the Transform->Chop and then Direction->horizontal or vertical
     # to reduce the tree size while keeping label sizes the same
     cp species6.jpg /usr/local/apache/htdocs/images/phylo/DanRer2_6way.jpg
     # change permissions for display
     chmod +r /usr/local/apache/htdocs/images/phylo/DanRer2_6way.jpg
 # add all.joiner entry for danRer2 multiz6way (2005-05-31, hartera)
     # add trackDb.ra entry
 # track multiz6way
 # shortLabel 6-Way Conservation
 # longLabel 6-Way Vertebrate Multiz Alignment & Conservation
 # group compGeno
 # priority 109
 # visibility hide
 # color 0, 10, 100
 # altColor 0,90,10
 # type wigMaf 0.0 1.0
 # maxHeightPixels 100:40:11
 # #wiggle phastCons6way
 # yLineOnOff Off
# autoScale Off
 # summary multiz6waySummary
 # speciesGroups mammal vertebrate
 # sGroup_mammal hg17 mm6 monDom1
 # sGroup_vertebrate fr1 tetNig1
 # treeImage phylo/DanRer2_6way.jpg
 
 # PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 6-WAY ALIGNMENT 
 # (DONE, 2005-06-01, hartera)
 # Tetraodon and Fugu are quite distant from zebrafish, more distant than 
 # human/chicken so use the HoxD55.q matrix in future for blastz with these 
 # species. Here the pairwise alignments of zebrafish with Tetraodon and Fugu 
 # that were used for the 6-way multiz were done using the default matrix 
 # for blastz. (2005-06-01, hartera)
     ssh kkstore01
     mkdir /cluster/data/danRer2/bed/multiz6way/cons
     cd /cluster/data/danRer2/bed/multiz6way/cons
 
    # create a starting-tree.mod based on chr5 (67 Mb), the largest
    # chrom apart from the NA and Un virtual chroms
     /cluster/bin/phast/msa_split ../maf/chr5.maf \
         --refseq ../../../5/chr5.fa --in-format MAF \
         --windows 100000000,1000 --out-format SS \
         --between-blocks 5000 --out-root s1
     # takes about 2 minutes
     /cluster/bin/phast/phyloFit -i SS s1.*.ss \
         --tree "((tetNig1,fr1),((mm6,hg17),monDom1))" \
         --out-root starting-tree
     # only takes about 5 minutes
     rm s1.*.ss
     # Get genome-wide average GC content (for all species together,
     # not just the reference genome).  If you have a globally
     # estimated tree model, as above, you can get this from the
     # BACKGROUND line in the .mod file.  E.g.,
 # ALPHABET: A C G T
 # ...
 # BACKGROUND: 0.309863 0.189473 0.189412 0.311252
     # This implies a GC content of 0.1895 + 0.1895 = 0.379
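    # e.g. a one-liner (sketch) to pull GC straight from the .mod file:
    # awk '/^BACKGROUND:/ {printf "%.4f\n", $3 + $4}' starting-tree.mod
    # 0.3789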
     # If you do *not* have a global tree model and you do not know
     # your GC content, you can get it directly from the MAFs with
     # a command like:
     # /cluster/bin/phast/msa_view \
     # --aggregate danRer2,tetNig1,fr1,mm6,hg17,monDom1 -i MAF \ 
     # --summary-only /cluster/data/danRer2/bed/multiz6way/maf/chr*.maf \
     # > maf_summary.txt
     # this gives GC content of 0.4026 so similar
 # break up the genome-wide MAFs into pieces
     ssh kkstore01
     mkdir -p /cluster/bluearc/danRer2/cons/ss
     cd /cluster/bluearc/danRer2/cons/ss
     bash
     for C in `awk '{print $1}' /cluster/data/danRer2/chrom.sizes`
     do
       if [ -s /cluster/data/danRer2/bed/multiz6way/maf/${C}.maf ]; then
         echo msa_split $C
         chrN=${C/chr/}
         /cluster/bin/phast/msa_split \
             /cluster/data/danRer2/bed/multiz6way/maf/${C}.maf \
             --refseq /cluster/data/danRer2/${chrN}/${C}.fa \
             --in-format MAF --windows 1000000,0 --between-blocks 5000 \
             --out-format SS -I 1000 --out-root ${C}
       fi
     done
     # took 15 minutes to run
     # Create a random list of 50 1 mb regions  (do not use the NA and Un)
     ls -l | grep -v "NA" > fileList
     cat fileList | grep -v "Un" > fileList2
     mv fileList2 fileList
     cat fileList | awk '$5 > 4000000 {print $9;}' | \
         randomLines stdin 50 ../randomSs
     rm fileList
 
     # Set up parasol directory to calculate trees on these 50 regions
     ssh kk9
     mkdir -p /cluster/bluearc/danRer2/cons/treeRun1
     cd /cluster/bluearc/danRer2/cons/treeRun1
     mkdir tree log
 
     # now set up cluster job to estimate model parameters.  Parameters
     # will be estimated separately for each alignment fragment then
     # will be combined across fragments
     #   Tuning this loop should come back to here to recalculate
     # Create little script that calls phastCons with right arguments
     cat > makeTree << '_EOF_'
     /cluster/bin/phast/phastCons ../ss/$1.ss \
       /cluster/data/danRer2/bed/multiz6way/cons/starting-tree.mod \
       --gc 0.403 --nrates 1,1 --no-post-probs --ignore-missing \
       --expected-lengths 12 --target-coverage 0.17 \
       --quiet --log log/$1 --estimate-trees tree/$1
 '_EOF_'
     #   emacs happy
     chmod a+x makeTree
 
    # Use the GC content from msa_view as this is from the whole genome.
    # Also, notice the
     # target coverage of 0.17.  Here we are going to aim for 65% coverage
     # of coding regions by conserved elements.
     # Create gensub file
     cat > template << '_EOF_'
 #LOOP
 makeTree $(root1)
 #ENDLOOP
 '_EOF_'
     #   happy emacs
                                                                              
     # Make cluster job and run it
     gensub2 ../randomSs single template jobList
     para create jobList
     para try/push/check/etc
 # para time
 # Completed: 50 of 50 jobs
 # CPU time in finished jobs:       4013s      66.89m     1.11h    0.05d  0.000 y
 # IO & Wait Time:                   139s       2.31m     0.04h    0.00d  0.000 y
 # Average job time:                  83s       1.38m     0.02h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:             135s       2.25m     0.04h    0.00d
 # Submission to last job:          5260s      87.67m     1.46h    0.06d
 
     # Now combine parameter estimates.  We can average the .mod files
     # using phyloBoot.  This must be done separately for the conserved
     # and nonconserved models
     ls tree/*.cons.mod > cons.txt
     /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
         --output-average ../ave.cons.mod > cons_summary.txt
     ls tree/*.noncons.mod > noncons.txt
     /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
         --output-average ../ave.noncons.mod > noncons_summary.txt
     cd ..
     cp -p ave.*.mod /cluster/data/danRer2/bed/multiz6way/cons
     #   measuring entropy
     #   consEntropy <target coverage> <expected lengths>
     #            ave.cons.mod ave.noncons.mod --NH 9.78
     #   never stops with the --NH argument
    # target entropy should be L_min*H=9.8 bits; the expected length
    # that produces this entropy is the one to use for phastCons.
     /cluster/bin/phast/consEntropy 0.17 12 \
                         ave.cons.mod ave.noncons.mod
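    # to compare several parameter pairs in one go, a small sweep like this
    # (untested sketch) saves retyping; each unquoted $p expands to the two
    # consEntropy arguments:
cat > sweepEntropy.csh << '_EOF_'
#!/bin/csh -f
foreach p ("0.17 12" "0.20 10" "0.32 18" "0.35 14")
    echo "== target-coverage expected-lengths: $p =="
    /cluster/bin/phast/consEntropy $p ave.cons.mod ave.noncons.mod
end
'_EOF_'
    csh sweepEntropy.csh >&! sweepEntropy.log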
 
 # the above steps were repeated with the target coverage and expected lengths
 # parameters set as below:
 
 # -target-coverage=0.17 -expected-lengths 12
 #Transition parameters:gamma=0.170000,omega=12.000000, mu=0.083333, nu=0.017068
 # Relative entropy: H=0.768727 bits/site
 # Expected min. length: L_min=13.932145 sites
 # Expected max. length: L_max=8.869545 sites
 # Total entropy: L_min*H=10.710017 bits
 
 # -target-coverage=0.20 -expected-lengths 10
 #Transition parameters:gamma=0.200000,omega=10.000000, mu=0.100000, nu=0.025000
 # Relative entropy: H=0.799627 bits/site
 # Expected min. length: L_min=12.358880 sites
 # Expected max. length: L_max=7.593192 sites
 # Total entropy: L_min*H=9.882496 bits
 
 # -target-coverage=0.20 -expected-lengths=9
 #Transition parameters:gamma=0.200000, omega=9.000000, mu=0.111111, nu=0.027778
 # Relative entropy: H=0.805459 bits/site
 # Expected min. length: L_min=12.022440 sites
 # Expected max. length: L_max=7.111297 sites
 # Total entropy: L_min*H=9.683580 bits
 
 # -target-coverage=0.25 -expected-lengths=10
 #Transition parameters:gamma=0.250000, omega=10.000000, mu=0.100000, nu=0.033333
 # Relative entropy: H=0.843206 bits/site
 # Expected min. length: L_min=10.846871 sites
 # Expected max. length: L_max=6.973477 sites
 # Total entropy: L_min*H=9.146148 bits
 
 # -target-coverage=0.35 -expected-lengths=12
 #Transition parameters:gamma=0.350000, omega=12.000000, mu=0.083333, nu=0.044872
 # Relative entropy: H=0.917684 bits/site
 # Expected min. length: L_min=9.169807 sites
 # Expected max. length: L_max=6.687135 sites
 # Total entropy: L_min*H=8.414989 bits
 
 # -target-coverage=0.35 -expected-lengths=14
 #Transition parameters:gamma=0.350000,omega=14.000000, mu=0.071429, nu=0.038462
 # Relative entropy: H=0.903639 bits/site
 # Expected min. length: L_min=9.778765 sites
 # Expected max. length: L_max=7.300778 sites
 # Total entropy: L_min*H=8.836478 bits
 
 # -target-coverage=0.35 -expected-lengths=18
 #Transition parameters:gamma=0.350000,omega=18.000000, mu=0.055556, nu=0.029915
 # Relative entropy: H=0.881986 bits/site
 # Expected min. length: L_min=10.798324 sites
 # Expected max. length: L_max=8.320602 sites
 # Total entropy: L_min*H=9.523968 bits
 
 # -target-coverage=0.35 -expected-lengths=19
 #Transition parameters:gamma=0.350000,omega=19.000000, mu=0.052632, nu=0.028340
 # Relative entropy: H=0.877572 bits/site
 # Expected min. length: L_min=11.021340 sites
 # Expected max. length: L_max=8.542773 sites
 # Total entropy: L_min*H=9.672025 bits
 
 # -target-coverage=0.32 -expected-lengths=20
 #Transition parameters:gamma=0.320000, omega=20.000000, mu=0.050000, nu=0.023529
 # Relative entropy: H=0.853466 bits/site
 # Expected min. length: L_min=11.824480 sites
 # Expected max. length: L_max=9.084203 sites
 # Total entropy: L_min*H=10.091797 bits
 
 #### !!! THESE PARAMETERS BELOW WERE THOSE THAT WERE FINALLY USED ####
 # -target-coverage=0.32 -expected-lengths=18
 #Transition parameters:gamma=0.320000,omega=18.000000, mu=0.055556, nu=0.026144
 # Relative entropy: H=0.860401 bits/site
 # Expected min. length: L_min=11.402969 sites
 # Expected max. length: L_max=8.648243 sites
 # Total entropy: L_min*H=9.811131 bits
 
 # need to iterate and get the right coverage and parameters
 # try running phastCons below with parameters used above and check the 
 # coverage of coding regions by the most conserved elements
     # Create cluster dir to do main phastCons run
     ssh kk9
     mkdir /cluster/bluearc/danRer2/cons/consRun1
     cd /cluster/bluearc/danRer2/cons/consRun1
     mkdir ppRaw bed
     # Create script to run phastCons with right parameters
     # This job is I/O intensive in its output files.  To make this
     # cluster safe, it would be better to do this work somewhere over
     # in /tmp/... and copy the final result back.  kk9 can do this
     # run, but kk cannot.
     # Run whole genome, this goes fast in this case.
 cat > doPhast << '_EOF_'
 mkdir -p ppRaw/$2
 /cluster/bin/phast/phastCons ../ss/$1.ss ../ave.cons.mod,../ave.noncons.mod \
    --expected-lengths 12 --target-coverage 0.17 --quiet --seqname $2 \
    --idpref $2 --viterbi bed/$1.bed --score --require-informative 0 > \
    ppRaw/$2/$1.pp
 '_EOF_'
     # emacs happy
     chmod a+x doPhast
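    # a sketch of the /tmp variant suggested above (hypothetical, untested):
    # same phastCons arguments, but the heavy output lands on local disk
    # first and only the final files are copied back over the network
cat > doPhastTmp << '_EOF_'
#!/bin/sh
CONS=/cluster/bluearc/danRer2/cons
TMP=/tmp/phast.$2.$1
mkdir -p $TMP
/cluster/bin/phast/phastCons $CONS/ss/$1.ss \
    $CONS/ave.cons.mod,$CONS/ave.noncons.mod \
    --expected-lengths 12 --target-coverage 0.17 --quiet --seqname $2 \
    --idpref $2 --viterbi $TMP/$1.bed --score --require-informative 0 \
    > $TMP/$1.pp
mkdir -p $CONS/consRun1/ppRaw/$2 $CONS/consRun1/bed
cp -p $TMP/$1.pp $CONS/consRun1/ppRaw/$2/$1.pp
cp -p $TMP/$1.bed $CONS/consRun1/bed/$1.bed
rm -rf $TMP
'_EOF_'
    chmod a+x doPhastTmp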
 
     # Create gsub file
     cat > template << '_EOF_'
 #LOOP
 doPhast $(file1) $(root1)
 #ENDLOOP
 '_EOF_'
     #   happy emacs
                                                                                 
     # Create parasol batch and run it for the whole genome
     ls -1 ../ss | sed 's/.ss//' > in.lst
     gensub2 in.lst single template jobList
     para create jobList
     para try/check/push/etc.
 # para time
 # Completed: 1604 of 1604 jobs
 # CPU time in finished jobs:      11005s     183.42m     3.06h    0.13d  0.000 y
 # IO & Wait Time:                 30912s     515.19m     8.59h    0.36d  0.001 y
 # Average job time:                  26s       0.44m     0.01h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:              55s       0.92m     0.02h    0.00d
 # Submission to last job:           608s      10.13m     0.17h    0.01d
 
     # combine predictions and transform scores to be in 0-1000 interval
     #   it uses a lot of memory, so on kolossus:
     ssh kolossus
     cd /cluster/bluearc/danRer2/cons/consRun
     catDir bed | awk ' \
         {printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
         /cluster/bin/scripts/lodToBedScore /dev/stdin > \
         /cluster/data/danRer2/bed/multiz6way/mostConserved.bed
                                                                                 
     # Figure out how much is actually covered by the bed files as so:
     ssh kkstore01
     cd /cluster/data/danRer2/bed/multiz6way
     foreach f ( `cat /cluster/data/danRer2/chrom.lst` )
        cat $f/chr${f}.fa >> allChroms.fa
     end
     faSize allChroms.fa
     # 1589273282 bases (101846391 N's 1487426891 real 813899141 upper 
     # 673527750 lower) in 28 sequences in 1 files
 
     # Total size: mean 56759760.1 sd 58536773.4 min 16596 (chrM) 
     # max 318297480 (chrNA) median 42763181 
     
     awk '{sum+=$3-$2} \
 END{printf "%% %.2f = 100.0*%d/1487426891\n",100.0*sum/1487426891,sum}' \
         mostConserved.bed
     -target-coverage 0.17: % 1.99 = 100.0*29631918/1487426891 length 12
     -target-coverage 0.20: % 1.96 = 100.0*29183170/1487426891 length 10
     -target-coverage 0.25: % 2.11 = 100.0*31391360/1487426891 length 10
     -target-coverage 0.35  % 2.69 = 100.0*39940068/1487426891 length 19
     -target-coverage 0.32  % 2.60 = 100.0*38730791/1487426891 length 18
     #   the non-N genome size, from faSize on all chroms: 1487426891
 
 # check with featureBits
     ssh hgwdev
     cd /cluster/data/danRer2/bed/multiz6way
     # get an or of refGene and mgcGenes CDS regions 
     featureBits danRer2 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
     # 9745850 bases of 1560497282 (0.625%) in intersection
     featureBits danRer2 refSeqOrMgcCds.bed mostConserved.bed -enrichment
     # featureBits danRer2 refSeqOrMgcCds.bed mostConserved.bed -enrichment
     # -target-coverage=0.17 -expected-lengths=12
     # refSeqOrMgcCds.bed 0.625%, mostConserved.bed 1.899%, both 0.350%, 
     # cover 56.07%, enrich 29.53x
     # -target-coverage=0.20 -expected-lengths=10
     # refSeqOrMgcCds.bed 0.625%, mostConserved.bed 1.870%, both 0.347%, 
     # cover 55.54%, enrich 29.70x
     # -target-coverage=0.25 -expected-length=10
     # refSeqOrMgcCds.bed 0.625%, mostConserved4.bed 2.012%, both 0.364%, 
     # cover 58.35%, enrich 29.01x
     # -target-coverage=0.35 -expected-lengths=19
     # refSeqOrMgcCds.bed 0.625%, mostConserved5.bed 2.559%, both 0.431%, 
     # cover 68.98%, enrich 26.95x
     # -target-coverage=0.32 -expected-lengths=18
     # refSeqOrMgcCds.bed 0.625%, mostConserved6.bed 2.482%, both 0.423%, 
     # cover 67.75%, enrich 27.30x
     # looking at ensGene:
     # featureBits danRer2 ensGene:cds mostConserved.bed -enrichment
     # ensGene:cds 2.135%, mostConserved.bed 2.482%, both 1.210%, 
     # cover 56.67%, enrich 22.83x
 
 # so use this result with -target-coverage=0.32 -expected-lengths=18
 # total entropy is 9.81 which is good, aiming for 9.8 and 65% coverage
 # of coding regions with most conserved elements, here it is 67.8%
 
     # Load most conserved track into database
     ssh hgwdev
     cd /cluster/data/danRer2/bed/multiz6way
     hgLoadBed danRer2 phastConsElements mostConserved.bed
     # Loaded 393654 elements of size 5
     # less than 1 minute load time
     featureBits danRer2 -enrichment refSeqOrMgcCds.bed phastConsElements
 # refSeqOrMgcCds.bed 0.625%, phastConsElements 2.482%, both 0.423%, 
 # cover 67.75%, enrich 27.30x
     featureBits danRer2 mgcGenes:cds phastConsElements -enrichment
 # mgcGenes:cds 0.486%, phastConsElements 2.482%, both 0.336%, 
 # cover 69.06%, enrich 27.82x
     featureBits danRer2 refGene:cds phastConsElements -enrichment
 # refGene:cds 0.597%, phastConsElements 2.482%, both 0.400%, 
 # cover 66.97%, enrich 26.98x
 
    # Create merged posterior probability file and wiggle track data files
     ssh kkstore01
     cd /cluster/bluearc/danRer2/cons/consRun
     # interesting sort here on the chr name and position.
     # first convert all . and - characters to special strings x/ and x_/
     #   to get a consistent delimiter of / for all fields to be sorted.
     #   Then do the sort on the chrom name and the start position, after
    #   the sort convert the special strings x_/ and x/ back to - and .
     #   respectively.  This gets everything in order by chrom name and
     #   chrom start.
     find ./ppRaw -type f | sed -e "s#\.#x/#g; s#-#x_/#g" | \
         sort -t"/" -k4,4 -k6,6n | sed -e "s#x_/#-#g; s#x/#.#g" | xargs cat | \
             wigEncode stdin phastCons6way.wig phastCons6way.wib
     # about 3 minutes for above
 
     ssh kkstore01
     cd /cluster/bluearc/danRer2/cons/consRun
     cp -p phastCons6way.wi? /cluster/data/danRer2/bed/multiz6way/cons
                                                                                 
     # Load gbdb and database with wiggle.
     ssh hgwdev
     cd /cluster/data/danRer2/bed/multiz6way/cons
     mkdir -p /gbdb/danRer2/wib
     ln -s `pwd`/phastCons6way.wib /gbdb/danRer2/wib/phastCons6way.wib
     # use this if need to reload table
     hgsql -e 'drop table phastCons6way;' danRer2
     # load table
     hgLoadWiggle danRer2 phastCons6way phastCons6way.wig
 
     #  Create histogram to get an overview of all the data
     ssh hgwdev
     cd /cluster/data/danRer2/bed/multiz6way/cons
     bash
     time hgWiggle -doHistogram \
         -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
             -db=danRer2 phastCons6way > histogram.data 2>&1
     # real    3m1.185s  user    2m19.440s  sys     0m11.850s
  
         #   create plot of histogram:
     cat << '_EOF_' > histo.gp
 set terminal png small color \
         x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " Zebrafish danRer2 Histogram phastCons6 track"
 set xlabel " phastCons6 score"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
 set yrange [0:0.02]
                                                                                 
 plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
         "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
 '_EOF_'
 
     #   happy emacs
     gnuplot histo.gp > histo.png
     display histo.png &
 
 # add line: wiggle phastCons6way to trackDb.ra for multiz6way to display the 
 # wiggle for the conservation track.
 # copy over html for multiz and edit.
 
 # PHASTCONS SCORES DOWNLOADABLES (DONE, 2005-05-31, hartera)
     #   prepare compressed copy of ascii data values for downloads
     ssh kkstore01
     cd /cluster/bluearc/danRer2/cons/consRun
     cat << '_EOF_' > gzipAscii.sh
 #!/bin/sh
                                                                                 
 TOP=`pwd`
 export TOP
                                                                                 
 mkdir -p phastCons6wayScores
                                                                                 
 ls ppRaw | while read D
 do
     out=${TOP}/phastCons6wayScores/${D}.gz
     echo -n "$out ... "
     cd ${TOP}/ppRaw/${D}
     gzip -c `ls *.pp  | sed -e "s#-#.x-x.#g;" | \
         sort -t"." -k1,1 -k2,2n | sed -e "s#.x-x.#-#g;"` > ${out}
     echo "done"
 done
 '_EOF_'
     #   happy emacs
     chmod +x gzipAscii.sh
     time ./gzipAscii.sh
     # 104.769u 8.789s 3:00.18 63.0%   0+0k 0+0io 11pf+0w
     # 182 Mb of data
     #  copy scores for downloads
     ssh kkstore01
     mkdir /cluster/data/danRer2/bed/multiz6way/phastCons6wayScores
     cd /cluster/data/danRer2/bed/multiz6way/phastCons6wayScores
     rsync -a --progress \
           /cluster/bluearc/danRer2/cons/consRun/phastCons6wayScores/ .
 
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/danRer2/phastCons6wayScores
     cd /usr/local/apache/htdocs/goldenPath/danRer2/phastCons6wayScores
     ln -s /cluster/data/danRer2/bed/multiz6way/phastCons6wayScores/*.gz .
     md5sum *.gz > md5sum.txt
     # copy over and edit README.txt.
 
 # MULTIZ 6-WAY DOWNLOADABLES (DONE, 2005-05-31, hartera)
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath/danRer2
     mkdir -p multiz6way
     cd multiz6way
     foreach f (/cluster/data/danRer2/bed/multiz6way/maf/*.maf)
         set c = $f:r:t
         echo $c
         nice gzip -c $f > $c.maf.gz
     end
     # copy README and edit
                                                                                 
     # Create upstream files for download
     ssh hgwdev
     cd /cluster/data/danRer2/bed/multiz6way
     echo danRer2 tetNig1 fr1 mm6 hg17 monDom1 > org.txt
     # mafFrags is quick for these
     foreach i (1000 2000 5000)
         echo "making upstream$i.maf"
         nice featureBits danRer2 refGene:upstream:$i -fa=/dev/null -bed=up.bad
        awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
         rm up.bad
         nice mafFrags danRer2 multiz6way up.bed upstream$i.maf -orgs=org.txt
         rm up.bed
     end
     # took less than 3 minutes
  
     ssh kkstore01
     cd /cluster/data/danRer2/bed/multiz6way
     nice gzip upstream{1000,2000,5000}.maf
 
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath/danRer2
     mv /cluster/data/danRer2/bed/multiz6way/upstream*.maf.gz multiz6way
     cd multiz6way
     md5sum *.gz > md5sum.txt
 
 ###########################################################################
 # LIFTOVER CHAINS TO DANRER3 (DONE, 2006-04-18, hartera)
     # Split (using makeLoChain-split) of danRer3 is doc'ed in makeDanRer3.doc
     # Do what makeLoChain-split says to do next (start blat alignment)
     ssh kk
     mkdir -p /cluster/data/danRer2/bed/liftOver
     cd /cluster/data/danRer2/bed/liftOver
     makeLoChain-align danRer2 /iscratch/i/danRer2/nib danRer3 \
         /iscratch/i/danRer3/split10k \
         /iscratch/i/danRer3/danRer3_11.ooc >&! align.log &
     # Took about 4 minutes.
     # Do what its output says to do next (start cluster job)
     cd /cluster/data/danRer2/bed/blat.danRer3.2006-04-17/run
     para try, check, push, check, ...
     para time >&! run.time
 # Completed: 784 of 784 jobs
 # CPU time in finished jobs:    4022758s   67045.97m  1117.43h   46.56d  0.128 y
 # IO & Wait Time:                 71872s    1197.86m    19.96h    0.83d  0.002 y
 # Average job time:                5223s      87.05m     1.45h    0.06d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:           42935s     715.58m    11.93h    0.50d
 # Submission to last job:         42935s     715.58m    11.93h    0.50d
 
     # lift alignments
     ssh kkr1u00
     cd /cluster/data/danRer2/bed/liftOver
     makeLoChain-lift danRer2 danRer3 >&! lift.log &
     # Took about 10 minutes to run.
 
     # chain alignments
     ssh kki
     cd /cluster/data/danRer2/bed/liftOver
     makeLoChain-chain danRer2 /iscratch/i/danRer2/nib \
                 danRer3 /iscratch/i/danRer3/nib >&! chain.log &
     # Do what its output says to do next (start cluster job)
     cd /cluster/data/danRer2/bed/blat.danRer3.2006-04-17/chainRun
     para try, check, push ...
     para time >&! run.time
 # Completed: 28 of 28 jobs
 # CPU time in finished jobs:       2697s      44.95m     0.75h    0.03d  0.000 y
 # IO & Wait Time:                   548s       9.13m     0.15h    0.01d  0.000 y
 # Average job time:                 116s       1.93m     0.03h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:             439s       7.32m     0.12h    0.01d
 # Submission to last job:           439s       7.32m     0.12h    0.01d
     # net alignment chains
     ssh kkstore04
     cd /cluster/data/danRer2/bed/liftOver
     makeLoChain-net danRer2 danRer3 >&! net.log &
     # Took about 20 minutes to run.
     # load reference to over.chain into database table,
     # and create symlinks  /gbdb  and download area
     ssh hgwdev
     cd /cluster/data/danRer2/bed/liftOver
     makeLoChain-load danRer2 danRer3 >&! load.log &
     # test by converting a region using the "convert" link on
     # the browser, and comparing to blat of the same region
    
 ###########################################################################
 # SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM DANRER3 (DONE, 2006-04-25, hartera)
     # followed instructions used in makePanTro2.doc
     ssh kkr1u00
     cd /cluster/data/danRer2/bed
     mkdir -p liftOver
     cd liftOver
     makeLoChain-split danRer2 /cluster/data/danRer2/nib >&! split.log &
     # Took about 30 minutes to run.
  
 #########################################################################
 ##  Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
     hgsql -h genome-testdb hgcentraltest \
 	-e "update dbDb set orderKey = 452 where name = 'danRer2';"
 
 ##########################################################################
 # GenBank gbMiscDiff table (markd 2007-01-10)
 # Supports `NCBI Clone Validation' section of mgcGenes details page
 
    # genbank release 157.0 now contains misc_diff fields for MGC clones
    # reloading mRNAs results in gbMiscDiff table being created.
    ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer2
 ##########################################################################
 # BACENDS CLEANUP (DONE, 2007-03-27, hartera)
    ssh kkstore04  
    cd /cluster/data/danRer2/bed/
    du -sh ZonLab
    # 150G     ZonLab/
    cd ZonLab
    # Remove old BAC ENDS directories
    nice rm -r bacends2 bacends3
    cd bacends
    rm bacEndsCov* bacEndsMinCov* bl2seq.out blast.out diff bacEnds \
       bacEnds.names.uniq jobList
    nice rm -r psl err minCov10MinAli92
    # bacEnds.lifted.psl is lifted version of bacEnds.psl so remove latter
    rm bacEnds.psl batch batch.bak *.lst para.results names.test template *.log
    cd bacends.1
    rm accs* bacEndAccs.aliases *.log allBacEnds* zfishaccs* diffaccs bacEnds \
       BACEnd*
    rm -r test test2
    nice gzip *.psl
    cd ../pairs
    # bacEndSingles.bed is already in singles directory 
    rm bacEnds.* bed.tab bacEndSingles.bed psl.tab bacEndSingles.sql *.bak 
    rm -r old test
    cd ../singles
    rm bacEnds.*
    rm -r old
    cd ../scores
    rm *.counts *.count *.hits good.txt error.log bed.tab bad.txt
    rm -r test
    cd ../duplicates
    rm -r old2 test test1
    rm error* sing *.counts singles.log.may10 log* out* bed.tab err
    rm list list.aligns singles* *.uniq *.tmp *.tmp2 100names* pairs.* \
       pairsTest*
    rm -r badPrsTest pairsTest
    cd ../cloneandStsAliases
    rm *.bak dr1* dr2* xref.names.uniq zfishNew* zfishOld* *.tab accs.*
    mv testTables/*.pl .
    rm -r test testTables
    cd /cluster/data/danRer2/bed/
    du -sh ZonLab
    # 46G     ZonLab/
    # removed 104 GB 
 ##########################################################################