src/hg/makeDb/doc/danRer3.txt 1.17

1.17 2009/11/25 21:48:38 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/danRer3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/danRer3.txt,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 1000000 -r1.16 -r1.17
--- src/hg/makeDb/doc/danRer3.txt	17 Oct 2008 01:06:31 -0000	1.16
+++ src/hg/makeDb/doc/danRer3.txt	25 Nov 2009 21:48:38 -0000	1.17
@@ -1,8705 +1,8705 @@
 # for emacs: -*- mode: sh; -*-
 
                                                                                 
 # Danio Rerio (zebrafish) from Sanger, version Zv5 (released 5/20/05)
 #  Project website:
 #    http://www.sanger.ac.uk/Projects/D_rerio/
 #  Assembly notes:
 #    http://www.sanger.ac.uk/Projects/D_rerio/Zv5_assembly_information.shtml
 
 # DOWNLOAD SEQUENCE (DONE, 2005-06-06, hartera)
 # MOVE DANRER3 DIRECTORY AND CONTENTS TO STORE11 AS STORE9 IS FULL
 # (DONE, 2005-07-22, hartera)
      ssh kkstore01
      mkdir /cluster/store9/danRer3
      ln -s /cluster/store9/danRer3 /cluster/data
      cd /cluster/data/danRer3
      wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/README
      wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.stats
      wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.chunks.agp
      wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.scaffolds.agp
      wget --timestamp \
        ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.fa
      # 2005-07-22 MOVE danRer3 
      # store9 is 100% full, move danRer3 to store11 which is 10% full
      ssh kkstore02
      cd /cluster/store9
      nohup nice mv danRer3 /cluster/store11 &
      # make link to /cluster/data/danRer3
      ln -s /cluster/store11/danRer3 /cluster/data
      
 # DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2005-06-13, hartera)
      ssh kkstore01
      mkdir -p /cluster/data/danRer3/M
      cd /cluster/data/danRer3/M
      # go to http://www.ncbi.nih.gov/ and search Nucleotide for
      # "Danio mitochondrion genome".  That shows the gi number:
      # 8576324 for the accession, AC024175
      # Use that number in the entrez linking interface to get fasta:
      wget -O chrM.fa \
       'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
      # Edit chrM.fa: make sure the header line says it is the
      # Danio Rerio mitochondrion complete genome, and then replace the
      # header line with just ">chrM".
      perl -pi.bak -e 's/>.+/>chrM/' chrM.fa
      rm *.bak
      # Make a "pseudo-contig" for processing chrM too:
      mkdir ./chrM_1
      sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
      mkdir ./lift
      echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
      echo "chrM_1" > ./lift/ordered.lst
      echo "0     M/chrM_1        16596   chrM    16596" > ./lift/ordered.lft
      # make sure this is tab delimited
      # create a .agp file for chrM, as hgGoldGapGl and other
      # programs require an .agp file
     cat << '_EOF_' > ./chrM.agp
 chrM       1       16596   1       F       AC024175.3      1       16596   +
 '_EOF_'
      # Create a chrM.chunks.agp
      mkdir -p /cluster/data/danRer3/M/agps
      cd /cluster/data/danRer3/M/agps
      awk 'BEGIN {OFS="\t"} \
         {print $1, $2, $3, $4, $5, $6, $7, $8, $1, $7, $8}' ../chrM.agp \
          > chrM.chunks.agp
      # make sure that all these above files are tab delimited
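      # quick sanity check (a sketch): cat -t shows tabs as ^I, and the awk
      # line prints nothing if every row has the expected field count
      cat -t ../lift/ordered.lft ../chrM.agp chrM.chunks.agp
      awk 'BEGIN {FS="\t"} NF < 9 {print FILENAME": line "NR" has "NF" fields"}' \
          ../chrM.agp chrM.chunks.agp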
 
 # Create list of chromosomes (DONE, 2005-06-08, hartera)
      ssh kkstore01
      cd /cluster/data/danRer3
      awk '{if ($1 !~ /Zv5/) print $1;}' Zv5.scaffolds.agp \
          | sort -n | uniq > chrom.lst
      cp chrom.lst chrom1to25.lst
      # add chrM
      echo "M" >> chrom.lst
      # add chrUn
      echo "Un" >> chrom.lst
      # add NA
      echo "NA" >> chrom.lst
 
 # MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2005-06-09, hartera)
     ssh kkstore01
     cd /cluster/data/danRer3
     # This used to hold scripts -- better to keep them inline here 
     # Now it should just hold lift file(s) and
     # temporary scripts made by copy-paste from this file.
     mkdir /cluster/data/danRer3/jkStuff
     # This is where most tracks will be built:
     mkdir /cluster/data/danRer3/bed
 
 # GET ADDITIONAL ZEBRAFISH REPBASE LIBRARY FOR REPEATMASKER 
 # (DONE, 2005-05-10, hartera)
 # Go to http://www.girinst.org/server/RepBase/RepBase10.04.fasta
 # and download zebunc.ref containing unclassified zebrafish repeats.
 # Need username and password. Copy to /cluster/bluearc/RepeatMasker/Libraries/
      ssh hgwdev
      cd /cluster/bluearc/RepeatMasker/Libraries/
      perl -pi.bak -e 's/>(Dr[0-9]+)/>$1#Unknown \@danio [S:]/' zebunc.ref
      # add to RepeatMasker library
      cat zebunc.ref >> RepeatMasker.lib
 
     # This is all in: /cluster/bluearc/RepeatMasker050305/Libraries
 
 # CHECK AGP FILES AND FASTA SIZE CONSISTENCY (DONE, 2005-06-10, hartera)
 
      # The script, createAgpWithGaps.pl (see next section for creating
      # agps and FASTAs for chrNA and chrUn), was used to create a scaffolds 
      # agp file for chrUn to test the program. The agp output was compared to 
      # that from scaffoldFaToAgp, and a difference was found: scaffoldFaToAgp
      # used 990568 as the end co-ordinate for Zv5_scaffold1475 instead of
      # 976101 as in the output from the script, so the co-ordinate numbering
      # is different from there on. The program, 
      # scaffoldFaToAgp is creating the agp file from the FASTA file
      # so perhaps the sequence is a different size than stated in the agp file.
      # Get sequence and find the size:
      ssh kkstore01
      cd /cluster/data/danRer3
      mkdir test
      cd test
      faOneRecord ../Zv5.fa Zv5_scaffold1475 > Zv5_scaffold1475.fa
      faSize Zv5_scaffold1475.fa
      # 990568 bases
      rm Zv5_scaffold1475.fa 
      # reported this inconsistency to Mario Caccamo at Sanger
      # mc2@sanger.ac.uk (2005-06-09) and new scaffolds and chunks agp files
      # were sent on 2005-06-10. There was a chunk (contig) missing from the 
      # chunks agp file and the scaffold therefore had the wrong end 
      # co-ordinate in the agp files.
      # check all sizes of scaffold sequences against those in the agp files
      ssh kkr1u00
      cd /cluster/data/danRer3 
      mkdir -p /iscratch/i/danRer3/scaffolds
      cp Zv5.fa /iscratch/i/danRer3/scaffolds/
      iSync
      
      ssh kk
      mkdir -p /cluster/data/danRer3/scaffolds/run
      cd /cluster/data/danRer3/scaffolds/run
      # make sure the cluster output directory used by getSizes.csh exists
      mkdir -p /cluster/bluearc/danRer3/scaffolds
      grep '>' ../../Zv5.fa | sed -e 's/>//' > Zv5.scaffolds.lst
 cat << '_EOF_' > getSizes.csh
      #!/bin/csh -fe
      set dir=/cluster/bluearc/danRer3/scaffolds
      faOneRecord /iscratch/i/danRer3/scaffolds/Zv5.fa $1 > $dir/$1.fa
      echo $1 >> $dir/$1.size
      faSize $dir/$1.fa >> $dir/$1.size
      rm $dir/$1.fa
 '_EOF_'
      # << this line makes emacs coloring happy
      chmod +x getSizes.csh
 cat << '_EOF_' > gsub
 #LOOP
 getSizes.csh $(path1)
 #ENDLOOP
 '_EOF_'
      # << this line makes emacs coloring happy 
      gensub2 Zv5.scaffolds.lst single gsub jobList
      para create jobList 
      para try,check,push,check etc...
     
      ssh kkstore01
      cd /cluster/bluearc/danRer3/scaffolds
      foreach f (*.size)
         cat $f >> Zv5.scaffolds.sizes
      end	  
      cd /cluster/data/danRer3/scaffolds
      mv /cluster/bluearc/danRer3/scaffolds/Zv5.scaffolds.sizes .
      # Check that these sizes correspond to the sizes in the scaffolds agp file
      # use script compareSizes.pl
      cat << '_EOF_' > compareSizes.pl
 #!/usr/bin/perl -w
 use strict;
 
 my ($file, $agp);
 
 $file = $ARGV[0];
 $agp = $ARGV[1];
 
 open(FILE, $file) || die "Can not open $file: $!\n";
 open(AGP, $agp) || die "Can not open $agp: $!\n";
 open(OUT, ">log.txt") || die "Can not create log.txt: $!\n";
 
 my ($l, $name, $size, %scafsHash);
 while (<FILE>)
 {
 $l = $_;
 if ($l =~ /^(Zv5_(scaffold|NA)[0-9]+)/)
    {
    $name = $1;
    }
 elsif ($l =~ /^([0-9]+)\sbases/)
    {
    $size = $1;  
    $scafsHash{$name} = $size;
    }
 }
 close FILE;
 
 while (<AGP>)
 {
 my ($line, @fi, $scaf, $end);
 $line = $_;
 
 @fi = split(/\t/, $line);
 $scaf = $fi[5];
 $end = $fi[7];
 
 if (exists($scafsHash{$scaf}))
    {
    if ($scafsHash{$scaf} eq $end)
       {
       print OUT "$scaf - ok\n";
       }
    else
       {
       print OUT "$scaf - different size to sequence\n";
       }
    }
 else
    {
    print OUT "$scaf - does not exist in list of sizes\n";
    }
 }
 close AGP;
 close OUT;
 '_EOF_'
    # << happy emacs
    chmod +x compareSizes.pl
    perl compareSizes.pl Zv5.scaffolds.sizes ../Zv5.scaffolds.agp
    # the only lines where no ID was found in the list of scaffolds with sizes
    # were those lines for gaps.
    grep "different" Zv5_scaffold1475
    # Zv5_scaffold1475 - different size to sequence
    # so only this scaffold is a different size in the agp to the sequence
    # need to check that sizes are consistent between agp files 
    # check also new agp file for scaffolds - newAgps/Zv5.scaffolds.agp
    perl compareSizes.pl Zv5.scaffolds.sizes ../newAgps/Zv5.scaffolds.agp
    # these are all consistent with the sequence sizes
    cd /cluster/data/danRer3/newAgps/
    # print out scaffold names where the co-ordinates are not consistent
    # with sizes given
    awk '{if ($6 ~ /^Zv5/ && (($3-$2+1) != $8)) print $6;}' Zv5.scaffolds.agp \
        > Zv5.scaffolds.coordCheck 
    # this file is empty so they are ok. do the same for the chunks.agp file
    awk '{if ($6 ~ /^Zv5/ && (($3-$2+1) != $8)) print $6;}' Zv5.chunks.agp \
        > Zv5.chunks.coordCheck
    # also empty so ok. check that the difference between $7 and $8 is the
    # same as the difference between $11 and $12 fields
    awk '{if ($6 != 5000 && (($8 - $7) != ($12 - $11))) print $6;}' \
        Zv5.chunks.agp > Zv5.chunks.coordCheck2
    # these are all ok
    rm Zv5.*.coord*
 cat << '_EOF_' > checkSizesInAgps.pl
 #!/usr/bin/perl -w
 use strict;
 
 my ($ch, $sc, %scafsHash);
 $sc = $ARGV[0]; # scaffolds agp
 $ch = $ARGV[1]; # chunks or contigs agp
 
 open(SCAFS, $sc) || die "Can not open $sc: $!\n";
 open(CHUNKS, $ch) || die "Can not open $ch: $!\n";
 
 while (<SCAFS>)
 {
 my ($l, @f, $name, $e);
 $l = $_;
 @f = split(/\t/, $l);
 if ($f[5] =~ /^Zv5/)
    {
    $name = $f[5];
    $e = $f[2];
    $scafsHash{$name} = $e;
    }
 }
 close SCAFS;
 
 my $scaf = "";
 my $prev = "";
 my $prevEnd = 0;
 
 while (<CHUNKS>)
 {
 my ($line, @fi);
 $line = $_;
 @fi = split(/\t/, $line);
 
 if ($fi[5] ne "5000")
    {
    $scaf = $fi[9];
    if (($scaf ne $prev) && ($prev ne ""))
       {
       checkCoords($prev, $prevEnd);
       }
 $prev = $scaf;
 $prevEnd = $fi[2];
    }
 }
 # check last entry in file
 checkCoords($prev, $prevEnd);
 close CHUNKS;
 
 sub checkCoords {
 my ($name, $end) = @_;
 if (exists($scafsHash{$name}))
    {
    if ($scafsHash{$name} != $end)
       {
       print "Scaffold $name is not consistent between agps\n";
       }
    else
       {
       print "Scaffold $name - ok\n";
       }
    }
 }
 '_EOF_'
    # << happy emacs
    chmod +x checkSizesInAgps.pl
    perl checkSizesInAgps.pl Zv5.scaffolds.agp Zv5.chunks.agp \
          > Zv5.scafsvschunks
    grep "not consistent" Zv5.scafsvschunks
    # no lines, so no inconsistency was reported
    wc -l Zv5.scafsvschunks
    # 16214 Zv5.scafsvschunks
    grep "Zv5" Zv5.scaffolds.agp | wc -l
    # 16214
    # so all the scaffolds were checked and were ok.
    cd /cluster/data/danRer3
    mv ./newAgps/Zv5.scaffolds.agp .
    mv ./newAgps/Zv5.chunks.agp .
    mv ./scaffolds/compareSizes.pl ./jkStuff/
    mv ./newAgps/checkSizesInAgps.pl ./jkStuff/
    rm -r newAgps
 
 # SPLIT AGP FILES BY CHROMOSOME (DONE, 2005-06-13, hartera)
 # FASTA WAS CREATED USING SCAFFOLDS AGP
      ssh kkstore01
      cd /cluster/data/danRer3
      # There are 2 .agp files: one for scaffolds (supercontigs on danRer1) and
      # then one for chunks (contigs on danRer1) showing how they map on to
      # scaffolds.
 
      # get list of scaffolds from FASTA file and check these are in agp
      grep '>' Zv5.fa | sed -e 's/>//' | sort | uniq > Zv5FaScafs.lst
      # get list of scaffolds from agp - do not print from gap lines
      awk '{if ($7 !~ /contig/) print $6;}' Zv5.scaffolds.agp \
         | sort | uniq > Zv5AgpScafs.lst
      diff Zv5FaScafs.lst Zv5AgpScafs.lst
      # no difference so all scaffolds are in the FASTA file
      # add "chr" prefix for the agp files
      perl -pi -e 's/^([0-9]+)/chr$1/' ./*.agp
      # for chromosomes:
      foreach c (`cat chrom1to25.lst`)
        echo "Processing $c ..."
        mkdir $c
        perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
          ./Zv5.chunks.agp \
          > $c/chr$c.chunks.agp
        perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
          ./Zv5.scaffolds.agp \
          > $c/chr$c.scaffolds.agp
      end
 
 # CREATE AGP FILES FOR chrNA AND chrUn (DONE, 2005-06-13, hartera)
      ssh kkstore01
      # chrNA consists of WGS contigs that could not be related to any 
      # FPC contig and the scaffolds and contigs are named Zv5_NAN in the 
      # first field of the agp files
      cd /cluster/data/danRer3
      mkdir ./NA
      awk '{if ($1 ~ /Zv5_NA/) print;}' Zv5.chunks.agp \
          > ./NA/NA.chunks.agp
      awk '{if ($1 ~ /Zv5_NA/) print;}' Zv5.scaffolds.agp \
          > ./NA/NA.scaffolds.agp
      # change the first field to "chrUn" then can use agpToFa to process
      perl -pi.bak -e 's/Zv5_NA[0-9]+/chrNA/' ./NA/*.agp
      # check files and remove backup files
      rm ./NA/*.bak
      # then process chrUn.
      # Re-make chrUn with new agp files - this is made from scaffolds and  
      # contigs where the name is Zv5_scaffoldN in the first field of the 
      # agp files. These scaffolds and contigs are unmapped to chromosomes
      # in the agp file. chrUn is made up of WGS scaffolds that mapped to 
      # FPC contigs, but the chromosome is unknown.
      rm -r Un
      mkdir ./Un
      awk '{if ($1 ~ /Zv5_scaffold/) print;}' Zv5.chunks.agp \
          > ./Un/Un.chunks.agp
      awk '{if ($1 ~ /Zv5_scaffold/) print;}' Zv5.scaffolds.agp \
          > ./Un/Un.scaffolds.agp
      # change the first field to "chrUn" then can use agpToFa to process
      perl -pi.bak -e 's/Zv5_scaffold[0-9]+/chrUn/' ./Un/*.agp
      # check files and remove backup files
      rm ./Un/*.bak
 
      # get FASTA file of sequences for NA and Un and create agp with 
      # Ns between scaffolds
      # from scaffolds agp, get name of scaffolds to retrieve from the FASTA 
      # file to make the NA and Un chromosomes.
      foreach c (NA Un)
        awk '{print $6;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
        $HOME/bin/i386/faSomeRecords /cluster/data/danRer3/Zv5.fa \
           $c/chr$c.scaffolds.lst $c/chr$c.fa
      end
      # check that all scaffolds in list are in FASTA file for NA and Un - ok
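      # one way to do that check (a sketch; it prints any listed scaffold
      # missing from the FASTA, so no output means ok):
      foreach c (NA Un)
         grep '>' $c/chr$c.fa | sed -e 's/>//' | sort > /tmp/chr$c.falst
         grep Zv5 $c/chr$c.scaffolds.lst | sort -u \
            | comm -23 - /tmp/chr$c.falst
      end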
      # edit scaffoldFaToAgp.c so that it creates agp with 500 Ns between 
      # scaffolds as contig gaps for chrNA and compile. chrNA is already large
      # so the number of Ns is reduced to keep down the size.
      foreach c (NA Un)
         $HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
         mv $c/chr$c.fa $c/chr$c.scaffolds.fa
      end
      # change chrUn to chrNA for NA and D to W for NA and Un
      sed -e 's/chrUn/chrNA/' ./NA/chrNA.agp | sed -e 's/D/W/' \
          > ./NA/chrNA.scaffolds.agp
      sed -e 's/D/W/' ./Un/chrUn.agp > ./Un/chrUn.scaffolds.agp
      # edit ./NA/chrNA.scaffolds.agp and ./Un/chrUn.scaffolds.agp and 
      # remove last line as this just adds an extra 500 Ns at the 
      # end of the sequence.
      rm ./NA/chrNA.agp ./Un/chrUn.agp
 
 cat << '_EOF_' > /cluster/data/danRer3/jkStuff/createAgpWithGaps.pl
 #!/usr/bin/perl
 use strict;
 
 # This script takes a chunks agp and inserts Ns between scaffolds for 
 # the chunks (contigs) agp file. Could also insert Ns between scaffolds
 # for scaffolds agp.
 
 my ($name, $prev, $st, $end, $prevEnd, $id);
 my $chrom = $ARGV[0]; # chromosome name
 my $numN = $ARGV[1];  # number of Ns to be inserted 
 my $type = $ARGV[2];  # contigs or scaffolds
 
 $prev = "";
 $st = 1;
 $prevEnd = 0;
 $id = 0;
 
 while (<STDIN>)
 {
 my $l = $_;
 my @f = split(/\t/, $l);
 
 if ($type eq "contigs")
    {
    $name = $f[9];
    }
 else 
    {
    $name = $f[5];
    }
 
 my $currSt = $f[1];
 my $currEnd = $f[2];
 my $size = $currEnd - $currSt;
 
 $id++;
 $st = $prevEnd + 1;
 $end = $st + $size;
 
 if (($prev ne "") && ($prev ne $name))
    {
    $st = $prevEnd + 1;
    $end = ($st + $numN) - 1;
    print "$chrom\t$st\t$end\t$id\tN\t$numN\tcontig\tno\n";
    $prevEnd = $end;
    $id++;
    }
 
 $st = $prevEnd + 1;
 $end = $st + $size;
 print "$chrom\t$st\t$end\t$id\t$f[4]\t$f[5]\t$f[6]\t$f[7]\t$f[8]";
 if ($type eq "contigs")
    {
    print "\t$f[9]\t$f[10]\t$f[11]";
    }
 
 $prevEnd = $end;
 $prev = $name;
 }
 '_EOF_'
      chmod +x /cluster/data/danRer3/jkStuff/createAgpWithGaps.pl
      cd /cluster/data/danRer3
      foreach c (NA Un)
         cd $c
         perl ../jkStuff/createAgpWithGaps.pl chr${c} 500 contigs \
              < ${c}.chunks.agp > chr${c}.chunks.agp
         cd ..
      end
      # check co-ordinates
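      # a minimal continuity check (a sketch): in each generated agp, every
      # row should start one base after the previous row ends; no output
      # means the co-ordinates are contiguous
      foreach c (NA Un)
         awk '$2 != prevEnd + 1 {print FILENAME": break at line "NR} \
             {prevEnd = $3}' $c/chr${c}.chunks.agp
      end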
      # clean up
      foreach c (NA Un)
         rm $c/${c}.scaffolds.agp $c/${c}.chunks.agp $c/chr${c}.scaffolds.fa \
            $c/${c}.scaffolds.lst
      end
    
 # BUILD CHROM-LEVEL SEQUENCE (DONE, 2005-06-13, hartera)
      ssh kkstore01
      cd /cluster/data/danRer3
      # Sequence is already in upper case so no need to change
      foreach c (`cat chrom.lst`)
        echo "Processing ${c}"
        $HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp chr$c \
          $c/chr$c.fa ./Zv5.fa
        echo "${c} - DONE"
      end
      # move scaffolds agp to be chrom agp and clean up
      foreach c (`cat chrom.lst`)
         cd $c
         rm *.bak
         cp chr${c}.scaffolds.agp chr${c}.agp
         mkdir -p agps
         mv chr${c}.*.agp ./agps/
         cd ..
      end
 
 # CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2005-06-13, hartera)
      # Check that the size of each chromosome .fa file is equal to the
      # last coord of the .agp:
      ssh hgwdev
      cd /cluster/data/danRer3
      foreach c (`cat chrom.lst`)
        foreach f ( $c/chr$c.agp )
          set agpLen = `tail -1 $f | awk '{print $3;}'`
          set h = $f:r
          set g = $h:r
          echo "Getting size of $g.fa"
          set faLen = `faSize $g.fa | awk '{print $1;}'`
          if ($agpLen == $faLen) then
            echo "   OK: $f length = $g length = $faLen"
          else
            echo "ERROR:  $f length = $agpLen, but $g length = $faLen"
          endif
        end
      end
      # all are OK so the FASTA files are the expected size
 
 # CREATING DATABASE (DONE, 2005-06-13, hartera)
     # Create the database.
     # next machine
     ssh hgwdev
     echo 'create database danRer3' | hgsql ''
     # if you need to delete that database:  !!! WILL DELETE EVERYTHING !!!
     echo 'drop database danRer3' | hgsql danRer3
     # Delete and re-create database as above (hartera, 2004-11-30)
     # Use df to make sure there is at least 10 gig free on /var/lib/mysql:
     df -h /var/lib/mysql
 # Before loading data:
 # Filesystem            Size  Used Avail Use% Mounted on
 # /dev/sdc1             1.8T  927G  734G  56% /var/lib/mysql
 
 # CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2005-06-13, hartera)
     # next machine
     ssh hgwdev
     #  the following command copies all the data from the table
     #  grp in the database danRer2 to the new database danRer3
     echo "create table grp (PRIMARY KEY(NAME)) select * from danRer2.grp" \
       | hgsql danRer3
     # if you need to delete that table:   !!! WILL DELETE ALL grp data !!!
     echo 'drop table grp;' | hgsql danRer3
 
 # BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS
 # (DONE, 2005-06-14, hartera)
 
      ssh kkstore01
      cd /cluster/data/danRer3
      foreach c (`cat chrom.lst`)
        foreach agp ($c/chr$c.agp)
          if (-e $agp) then
            set fa = $c/chr$c.fa
            echo splitting $agp and $fa
            cp -p $agp $agp.bak
            cp -p $fa $fa.bak
            splitFaIntoContigs $agp $fa . -nSize=5000000
          endif
        end
      end
 
 # MAKE LIFTALL.LFT (DONE, 2005-06-14, hartera)
     ssh kkstore01
     cd /cluster/data/danRer3
     cat */lift/ordered.lft > jkStuff/liftAll.lft 
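    # quick sanity check on the combined lift file (a sketch): for each
    # chrom, the furthest contig end should equal the recorded chrom size;
    # no output means every chrom is fully covered
    awk '{e = $1 + $3; if (e > max[$4]) max[$4] = e; size[$4] = $5} \
        END {for (c in size) if (max[c] != size[c]) print c, max[c], size[c]}' \
        jkStuff/liftAll.lft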
 
 # SIMPLE REPEAT [TRF] TRACK  (DONE, 2005-06-14, hartera)
     # TRF can be run in parallel with RepeatMasker on the file server
     # since it doesn't require masked input sequence.
     # Run this on the kilokluster. Need to mask contig and chromosome 
     # sequences so run trf using contig sequences.
     # First copy over contig sequences to iscratch and then iSync to cluster.
     ssh kkr1u00
     mkdir -p /iscratch/i/danRer3/contigsNoMask
     cd /cluster/data/danRer3
     foreach d (/cluster/data/danRer3/*/chr*_?{,?})
        set ctg = $d:t
        foreach f ($d/${ctg}.fa)
           echo "Copyig $f ..."
           cp $f /iscratch/i/danRer3/contigsNoMask/
        end
     end
     # 288 sequence files
     /cluster/bin/iSync
 
     ssh kk
     mkdir -p /cluster/data/danRer3/bed/simpleRepeat
     cd /cluster/data/danRer3/bed/simpleRepeat
     mkdir trf
 cat << '_EOF_' > runTrf
 #!/bin/csh -fe
 #
 set path1 = $1
 set inputFN = $1:t
 set outpath = $2
 set outputFN = $2:t
 mkdir -p /tmp/$outputFN
 cp $path1 /tmp/$outputFN
 pushd .
 cd /tmp/$outputFN
 /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
 popd
 rm -f $outpath
 cp -p /tmp/$outputFN/$outputFN $outpath
 rm -fr /tmp/$outputFN/*
 rmdir --ignore-fail-on-non-empty /tmp/$outputFN
 '_EOF_'
     # << keep emacs coloring happy
     chmod +x runTrf
                                                                                 
 cat << '_EOF_' > gsub
 #LOOP
 ./runTrf {check in line+ $(path1)}  {check out line trf/$(root1).bed}
 #ENDLOOP
 '_EOF_'
     # << keep emacs coloring happy
                                                                                 
     ls -1S /iscratch/i/danRer3/contigsNoMask/chr*.fa > genome.lst
     gensub2 genome.lst single gsub jobList
     # 288 jobs
     para create jobList
     para try, check, push, check etc...
     para time
 # Completed: 288 of 288 jobs
 # CPU time in finished jobs:      70742s    1179.03m    19.65h    0.82d  0.002 y
 # IO & Wait Time:                  1263s      21.05m     0.35h    0.01d  0.000 y
 # Average job time:                 250s       4.17m     0.07h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            6722s     112.03m     1.87h    0.08d
 # Submission to last job:         10037s     167.28m     2.79h    0.12d
 
     # lift up to chrom level
     liftUp simpleRepeat.bed /cluster/data/danRer3/jkStuff/liftAll.lft warn \
            trf/*.bed
 
     # Load into the database
     ssh hgwdev
     cd /cluster/data/danRer3/bed/simpleRepeat
     hgLoadBed danRer3 simpleRepeat simpleRepeat.bed \
       -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
     # Loaded 757119 elements of size 16
 
 # PROCESS SIMPLE REPEATS INTO MASK (DONE, 2005-06-14, hartera)
     # After the simpleRepeats track has been built, make a filtered version
     # of the trf output: keep trf's with period <= 12:
     ssh kkstore01
     cd /cluster/data/danRer3/bed/simpleRepeat
     mkdir -p trfMask
     foreach f (trf/chr*.bed)
       awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
     end
 
     # Lift up filtered trf output to chrom coords as well:
     cd /cluster/data/danRer3
     mkdir bed/simpleRepeat/trfMaskChrom
     foreach c (`cat chrom.lst`)
       if (-e $c/lift/ordered.lst) then
         perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/ordered.lst > $c/lift/oTrf.lst
         liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
           jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
       endif
       if (-e $c/lift/random.lst) then
         perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
            $c/lift/random.lst > $c/lift/rTrf.lst
         liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
           jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
       endif
     end
 
 # REPEAT MASKING - Run RepeatMasker on chroms (DONE, 2005-06-15, hartera)
     # When a new library is added for this version of repeatMasker, need to 
     # check in /cluster/bluearc/RepeatMasker/Libraries for a directory made 
     # up of a date e.g. 20050112 here and inside this are species directories
     # for which RepeatMasker has already been run. In this directory it creates
     # a specieslib of the danio repeats. If this exists, this is used for the
     # RepeatMasker run for that species. So if new repeats are added to the
     # library, they will not get used unless this is deleted; a new specieslib
     # is then created from the new library on the first run for danio.
     ssh kkstore01
     rm -r /cluster/bluearc/RepeatMasker/Libraries/20050112/danio/
     cd /cluster/data/danRer3
     #- Split contigs into 500kb chunks, at gaps if possible:
     foreach c (`cat chrom.lst`)
       foreach d ($c/chr${c}*_?{,?})
         cd $d
         echo "splitting $d"
         set contig = $d:t
         ~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
             -minGapSize=100
         cd ../..
       end
     end
 
     # For RepeatMasking, use RepeatMasker "open-3.0" with repeat library
     # version RepBase Update 9.11, RM database version 20050112 with the 
     # addition of the zebrafish unclassified repeats (zebunc.ref) - see above
     # section on getting this additional zebrafish RepeatMasker library. 
     #- Make the run directory and job list:
     cd /cluster/data/danRer3
 cat << '_EOF_' > jkStuff/RMZebrafish
 #!/bin/csh -fe
                                                                                 
 cd $1
 pushd .
 /bin/mkdir -p /tmp/danRer3/$2
 /bin/cp $2 /tmp/danRer3/$2/
 cd /tmp/danRer3/$2
 /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -species danio $2
 popd
 /bin/cp /tmp/danRer3/$2/$2.out ./
 if (-e /tmp/danRer3/$2/$2.align) /bin/cp /tmp/danRer3/$2/$2.align ./
 if (-e /tmp/danRer3/$2/$2.tbl) /bin/cp /tmp/danRer3/$2/$2.tbl ./
 if (-e /tmp/danRer3/$2/$2.cat) /bin/cp /tmp/danRer3/$2/$2.cat ./
 /bin/rm -fr /tmp/danRer3/$2/*
 /bin/rmdir --ignore-fail-on-non-empty /tmp/danRer3/$2
 /bin/rmdir --ignore-fail-on-non-empty /tmp/danRer3
 '_EOF_'
     chmod +x jkStuff/RMZebrafish
     mkdir -p RMRun
     cp /dev/null RMRun/RMJobs
     foreach c (`cat chrom.lst`)
       foreach d ($c/chr${c}_?{,?})
           set ctg = $d:t
           foreach f ( $d/${ctg}_?{,?}.fa )
             set f = $f:t
             echo /cluster/data/danRer3/jkStuff/RMZebrafish \
                  /cluster/data/danRer3/$d $f \
                '{'check out line+ /cluster/data/danRer3/$d/$f.out'}' \
               >> RMRun/RMJobs
           end
       end
     end
     # Do the run
     ssh kk 
     cd /cluster/data/danRer3/RMRun
     para create RMJobs
     para try, para check, para check, para push, para check,...
     para time
 # Completed: 4069 of 4069 jobs
 # CPU time in finished jobs:   13726314s  228771.90m  3812.87h  158.87d  0.435 y
 # IO & Wait Time:                 45762s     762.70m    12.71h    0.53d  0.001 y
 # Average job time:                3385s      56.41m     0.94h    0.04d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            4549s      75.82m     1.26h    0.05d
 # Submission to last job:         56947s     949.12m    15.82h    0.66d
 # This is slow. It should have taken about 5 hours.
 
     #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
     ssh kkstore01
     cd /cluster/data/danRer3
     foreach d (*/chr*_?{,?})
       set contig = $d:t
       echo $contig
       liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
         > /dev/null
     end
                                                                                 
     #- Lift pseudo-contigs to chromosome level
     foreach c (`cat chrom.lst`)
       echo lifting $c
       cd $c
       if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
         liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
         > /dev/null
       endif
       cd ..
     end
 
     #- Load the .out files into the database with:
     ssh hgwdev
     cd /cluster/data/danRer3
     hgLoadOut danRer3 */chr*.fa.out -verbose=2
 # bad rep range [689, 602] line 105524 of 16/chr16.fa.out 
 # bad rep range [147, 146] line 124027 of 16/chr16.fa.out
 # bad rep range [280, 258] line 754 of 17/chr17.fa.out 
 # bad rep range [280, 258] line 76417 of 17/chr17.fa.out
 # bad rep range [314, 311] line 99427 of 19/chr19.fa.out
 # bad rep range [367, 366] line 88398 of 23/chr23.fa.out 
 # bad rep range [41, 40] line 51509 of 25/chr25.fa.out
 # bad rep range [1133, 1132] line 62610 of 9/chr9.fa.out
 # bad rep range [6133, 6132] line 122359 of NA/chrNA.fa.out 
 # bad rep range [6133, 6132] line 160183 of NA/chrNA.fa.out 
 # bad rep range [292, 291] line 252829 of NA/chrNA.fa.out 
 # bad rep range [751, 599] line 261276 of NA/chrNA.fa.out 
 # bad rep range [360, 359] line 259794 of Un/chrUn.fa.out 
 # bad rep range [360, 359] line 259796 of Un/chrUn.fa.out 
 # bad rep range [360, 359] line 259798 of Un/chrUn.fa.out 
 # bad rep range [1, -56] line 379516 of Un/chrUn.fa.out
 # note: 16 records dropped due to repStart > repEnd
 
 # check coverage of repeats masked
 # featureBits -chrom=chr1 danRer1 rmsk
 # 11589712 bases of 40488791 (28.624%) in intersection
 # featureBits -chrom=chr1 danRer2 rmsk
 # 26879295 bases of 61678023 (43.580%) in intersection
 # featureBits -chrom=chr1 danRer3 rmsk
 # 25822888 bases of 55805710 (46.273%) in intersection
 
 # MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF AND BUILD NIB FILES
 # (DONE, 2005-06-15, hartera)
     ssh kkstore01
     cd /cluster/data/danRer3
     # Soft-mask (lower-case) the contig and chr .fa's,
     # then make hard-masked versions from the soft-masked.
     set trfCtg=bed/simpleRepeat/trfMask
     set trfChr=bed/simpleRepeat/trfMaskChrom
     # for the chromosomes:
     foreach f (*/chr*.fa)
       echo "repeat- and trf-masking $f"
       maskOutFa -soft $f $f.out $f
       set chr = $f:t:r
       maskOutFa -softAdd $f $trfChr/$chr.bed $f
       echo "hard-masking $f"
       maskOutFa $f hard $f.masked
     end
 # This warning is extremely rare -- if it indicates a problem, it is only with
 # the repeat annotation and does not affect the masking:
 # repeat- and trf-masking Un/chrUn.fa
 # WARNING: negative rEnd: -56 chrUn:153329594-153329609 MOSAT_DR
     # for the contigs:
     foreach c (`cat chrom.lst`)
       echo "repeat- and trf-masking contigs of chr$c"
       foreach d ($c/chr*_?{,?})
         set ctg=$d:t
         set f=$d/$ctg.fa
         maskOutFa -soft $f $f.out $f
         maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
         maskOutFa $f hard $f.masked
       end
     end
 # same warning here too:
 # repeat- and trf-masking contigs of chrUn
 # WARNING: negative rEnd: -56 chrUn_26:1159145-1159160 MOSAT_DR
     # check percent sequence masked
     faSize /cluster/data/danRer3/1/chr1.fa
     # 55805710 bases (1047706 N's 54758004 real 28887275 upper 25870729 lower)
     # 46% is in lower case so masked
     # for danRer2:
     faSize /cluster/data/danRer2/1/chr1New.fa
     # 62208023 bases (3421437 N's 58786586 real 31874160 upper 26912426 lower)
     # 43% is in lower case so masked
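     # a one-liner to turn the faSize summary into a masked percentage
     # (a sketch, assuming the one-line output format shown above):
     faSize /cluster/data/danRer3/1/chr1.fa \
         | awk '/bases/ {printf "%.1f%% masked\n", 100*$9/$1}'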
     # Build nib files, using the soft masking in the fa
     mkdir nib
     foreach f (*/chr*.fa)
       faToNib -softMask $f nib/$f:t:r.nib
     end
 
 # STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE, 2005-06-15, hartera)
 # Added link from danRer3.2bit file to the danRer3 gbdb directory
 # (2005-06-17, hartera)
     # Make symbolic links from /gbdb/danRer3/nib to the real nibs
     ssh hgwdev
     cd /cluster/data/danRer3
     mkdir -p /gbdb/danRer3/nib
     foreach f (/cluster/data/danRer3/nib/chr*.nib)
       ln -s $f /gbdb/danRer3/nib
     end
 
 # Load /gbdb/danRer3/nib paths into database and save size info
     # hgNibSeq creates chromInfo table
     hgNibSeq -preMadeNib danRer3 /gbdb/danRer3/nib */chr*.fa
     echo "select chrom,size from chromInfo" | hgsql -N danRer3 > chrom.sizes
     # take a look at chrom.sizes, should be 28 lines
     wc chrom.sizes
     # 28      56     409 chrom.sizes
     
     # Make one big 2bit file as well, and make a link to it in
     # /gbdb/danRer3/nib because hgBlat looks there:
     faToTwoBit */chr*.fa danRer3.2bit
     # add link to this 2bit file from gbdb danRer3 directory (2005-06-17)
     ln -s /cluster/data/danRer3/danRer3.2bit /gbdb/danRer3/
     # also make 2 bit files for chrUn and chrNA later on - need masked seq
     # make 2 bit files for chrUn and chrNA scaffolds (2005-06-17)
     ssh kkstore01
     cd /cluster/data/danRer3
     # make scaffolds files
     foreach c (NA Un)
        cd $c
        echo "Processing $c ..."
        mkdir scafSeqs
        awk '{if ($5 != "N") print $6;}' chr${c}.agp > scafSeqs/scaffolds.lst
        cd ..
     end 
     cd /cluster/data/danRer3/NA/scafSeqs
 cat << '_EOF_' > getSeqs.csh
      #!/bin/csh -fe
      set dir=/cluster/bluearc/danRer3/scaffolds
      faOneRecord /iscratch/i/danRer3/scaffolds/Zv5.fa $1 > $dir/$1.fa
 '_EOF_'
      # << this line makes emacs coloring happy
      chmod +x getSeqs.csh
 cat << '_EOF_' > gsub
 #LOOP
 getSeqs.csh $(path1)
 #ENDLOOP
 '_EOF_'
      # << this line makes emacs coloring happy 
      ssh kk
      cd /cluster/data/danRer3/NA/scafSeqs
      gensub2 scaffolds.lst single gsub jobList
      para create jobList 
      para try,check,push,check etc...
     
      ssh kkstore01
      cd /cluster/bluearc/danRer3/scaffolds
      faToTwoBit ./chrNA/scafSeqs/*.fa danRer3ChrNA.2bit
      faToTwoBit ./chrUn/scafSeqs/*.fa danRer3ChrUn.2bit
 
 # MAKE GOLD AND GAP TRACKS (DONE, 2005-06-15, hartera)
 # Add trackDb entry and html page for gold and gap tracks (2005-06-16, hartera)
     ssh hgwdev
     cd /cluster/data/danRer3
     # the gold and gap tracks are created from the chrN.agp file and this is
     # the scaffolds or supercontigs agp 
     hgGoldGapGl -noGl -chromLst=chrom.lst danRer3 /cluster/data/danRer3 .
     # featureBits danRer3 gold
     # 1630323462 bases of 1630323462 (100.000%) in intersection
     # featureBits danRer2 gold
     # 1560497282 bases of 1560497282 (100.000%) in intersection
     # featureBits danRer1 gold
     # 1459132082 bases of 1459132082 (100.000%) in intersection
 
     # featureBits danRer3 gap
     # 13709500 bases of 1630323462 (0.841%) in intersection
     # featureBits danRer2 gap
     # 28776000 bases of 1560497282 (1.844%) in intersection
     # featureBits danRer1 gap
     # 64174000 bases of 1459132082 (4.398%) in intersection
 # Add trackDb.ra entries for gold and gap tracks and also create
 # gap.html and gold.html pages.
 
 # MAKE TRACKDB ENTRY FOR DANRER3 (DONE, 2005-06-16, hartera)
     ssh hgwdev
     # Make trackDb table so browser knows what tracks to expect:
     mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer3
     cd ~/kent/src/hg/makeDb/trackDb/zebrafish
     cvs add danRer3
     cvs commit danRer3
     cd ~/kent/src/hg/makeDb/trackDb
     cvs up -d -P
     # Edit that makefile to add danRer3 in all the right places and do
     make update
     make alpha
     cvs commit -m "Added danRer3." makefile
     
 # MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2005-06-16, hartera)
     ssh hgwdev
     mkdir /cluster/data/danRer3/html
    # make a symbolic link from /gbdb/danRer3/html to /cluster/data/danRer3/html
     ln -s /cluster/data/danRer3/html /gbdb/danRer3/html
     # Add a description page for zebrafish
     cd /cluster/data/danRer3/html
     cp $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer2/description.html .
     # Edit this for zebrafish danRer3
                                                                                 
     # create a description.html page here
     cd ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer3
     # Add description page here too
     cp /cluster/data/danRer3/html/description.html .
     cvs add description.html
     cvs commit -m "First draft of description page for danRer3." \
         description.html
     cd ~/kent/src/hg/makeDb/trackDb
     make update
     make alpha
 
 # MAKE HGCENTRALTEST ENTRY FOR DANRER3 (DONE, 2005-06-16, hartera)
 # UPDATE ENTRY TO ADD DANRER3 TO GENE SORTER (DONE, 2006-06-09, hartera)
     # Make dbDb and defaultDb entries so test browser knows about it:
     ssh hgwdev
     # Add dbDb and defaultDb entries:
     echo 'insert into dbDb (name, description, nibPath, organism,  \
           defaultPos, active, orderKey, genome, scientificName,  \
           htmlPath, hgNearOk, hgPbOk, sourceName)  \
           values("danRer3", "May 2005", \
           "/gbdb/danRer3", "Zebrafish", "chr2:15,906,734-15,926,406", 1, \
           37, "Zebrafish", "Danio rerio", \
           "/gbdb/danRer3/html/description.html", 0,  0, \
           "Sanger Centre, Danio rerio Sequencing Project Zv5");' \
     | hgsql -h genome-testdb hgcentraltest
     # set danRer3 to be the default assembly for Zebrafish
     echo 'update defaultDb set name = "danRer3" \
           where genome = "Zebrafish";' \
           | hgsql -h genome-testdb hgcentraltest
     # Update dbDb entry for danRer3 to add it to Gene Sorter 
     # (hartera, 2006-06-09)
     echo 'update dbDb set hgNearOk = 1 where name = "danRer3";' \
          | hgsql -h genome-testdb hgcentraltest
 
 # PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS AND ON BLUEARC
 # (DONE, 2005-06-16, hartera)
     ssh kkr1u00
     # Chrom-level mixed nibs that have been repeat- and trf-masked:
     rm -rf /iscratch/i/danRer3/nib
     mkdir -p /iscratch/i/danRer3/nib
     cp -p /cluster/data/danRer3/nib/chr*.nib /iscratch/i/danRer3/nib
     # Pseudo-contig fa that have been repeat- and trf-masked:
     rm -rf /iscratch/i/danRer3/trfFa
     mkdir /iscratch/i/danRer3/trfFa
     foreach d (/cluster/data/danRer3/*/chr*_?{,?})
       cp -p $d/$d:t.fa /iscratch/i/danRer3/trfFa
     end
     rm -rf /iscratch/i/danRer3/rmsk
     mkdir -p /iscratch/i/danRer3/rmsk
     cp -p /cluster/data/danRer3/*/chr*.fa.out /iscratch/i/danRer3/rmsk
     cp -p /cluster/data/danRer3/danRer3.2bit /iscratch/i/danRer3/
     /cluster/bin/iSync
     # add to the bluearc
     ssh kkstore01
     mkdir -p /cluster/bluearc/danRer3/nib
     cp -p /cluster/data/danRer3/nib/chr*.nib /cluster/bluearc/danRer3/nib
     mkdir -p /cluster/bluearc/danRer3/trfFa
     foreach d (/cluster/data/danRer3/*/chr*_?{,?})
       cp -p $d/$d:t.fa /cluster/bluearc/danRer3/trfFa
     end
     cp /cluster/data/danRer3/danRer3.2bit /cluster/bluearc/danRer3/
 
 # ADD CONTIGS TRACK (DONE, 2005-06-16, hartera)
 # make ctgPos2 (contig name, size, chrom, chromStart, chromEnd, type) from 
 # chunks (contigs) agp files.
     ssh kkstore01
     mkdir -p /cluster/data/danRer3/bed/ctgPos2
     cd /cluster/data/danRer3/bed/ctgPos2
     # ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc
     foreach c (`cat /cluster/data/danRer3/chrom.lst`)
          awk 'BEGIN {OFS="\t"} \
          {if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
          /cluster/data/danRer3/$c/agps/chr${c}.chunks.agp >> ctgPos2.tab
     end
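     # quick format check before loading (a sketch): every row should have
     # six tab-separated fields; no output means the file is clean
     awk 'BEGIN {FS="\t"} NF != 6' ctgPos2.tab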
                                                                                 
     ssh hgwdev
     cd /cluster/data/danRer3/bed/ctgPos2
     hgsql danRer3 < ~/kent/src/hg/lib/ctgPos2.sql
     echo "load data local infile 'ctgPos2.tab' into table ctgPos2" \
          | hgsql danRer3
 # create trackDb.ra entry and html page for ctgPos2 track.
     # Changed termRegEx for ctgPos2 in trackDb.ra so that it handles 
     # contigs named "Zv5_scaffold*". (2006-04-19, hartera)
 
 # CREATE gc5Base WIGGLE TRACK (DONE, 2005-06-16, hartera)
 # FIX LINK FOR WIB FILES TO POINT TO danRer3 ON store11 (2005-07-25, hartera)
     ssh kkstore01
     mkdir -p /cluster/data/danRer3/bed/gc5Base
     cd /cluster/data/danRer3/bed/gc5Base
     # hgGcPercent reports the number of bases it measured in each window,
     # which is not necessarily always 5 if it ran into gaps, and the
     # division by 10.0 scales its output down to the range [0-100].
     # wigEncode now replaces wigAsciiToBinary and the previous
     # processing step between these two programs. The result file is *.wig.
     # Each value represents the measurement over five bases beginning with
     # <position>. wigEncode also calculates the zoomed set of data.
     # Uses the 2bit file in /cluster/data/danRer3 as sequence input.
                                                                                 
     nice hgGcPercent -wigOut -doGaps -file=stdout -win=5 danRer3 \
         /cluster/data/danRer3 | \
         wigEncode stdin gc5Base.wig gc5Base.wib
     # load the .wig file back on hgwdev:
     ssh hgwdev
     cd /cluster/data/danRer3/bed/gc5Base
     hgLoadWiggle -pathPrefix=/gbdb/danRer3/wib/gc5Base \
                  danRer3 gc5Base gc5Base.wig
     # and symlink the .wib file into /gbdb
     # fix link as danRer3 is now in store 11 (2005-07-25, hartera)
     rm -r /gbdb/danRer3/wib/gc5Base
     mkdir -p /gbdb/danRer3/wib/gc5Base
     ln -s `pwd`/gc5Base.wib /gbdb/danRer3/wib/gc5Base
 
 # MAKE 10.OOC, 11.OOC FILE FOR BLAT (DONE, 2005-06-17, hartera)
    # Use -repMatch=512 (based on size -- for human we use 1024, and
    # the zebrafish genome is ~50% of the size of the human genome).
     ssh kkr1u00
     mkdir /cluster/data/danRer3/bed/ooc
     cd /cluster/data/danRer3/bed/ooc
     mkdir -p /cluster/bluearc/danRer3
     ls -1 /cluster/data/danRer3/nib/chr*.nib > nib.lst
     blat nib.lst /dev/null /dev/null -tileSize=11 \
       -makeOoc=/cluster/bluearc/danRer3/danRer3_11.ooc -repMatch=512
    # Wrote 50575 overused 11-mers to /cluster/bluearc/danRer3/danRer3_11.ooc
     # For 10.ooc, repMatch = 4096 for human, so use 2048
     blat nib.lst /dev/null /dev/null -tileSize=10 \
       -makeOoc=/cluster/bluearc/danRer3/danRer3_10.ooc -repMatch=2048
    # Wrote 12574 overused 10-mers to /cluster/bluearc/danRer3/danRer3_10.ooc
     # keep copies of ooc files in this directory and copy to iscratch
     cp /cluster/bluearc/danRer3/*.ooc .
     cp -p /cluster/bluearc/danRer3/*.ooc /iscratch/i/danRer3/
     /cluster/bin/iSync
 
 # MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR danRer3 (DONE, 2005-07-20, kuhn)
     # hgcentraltest is now on hgwdev                                            
     ssh hgwdev
    # DNA port is "0", trans prot port is "1"
  echo 'insert into blatServers values("danRer3", "blat2", "17778", "1", "0");    insert into blatServers values("danRer3", "blat2", "17779", "0", "1");' \
     | hgsql hgcentraltest
     # this enables blat and isPcr, isPcr is enabled by loading blat server
     # with tilesize=5 (ask for this when request blat servers from 
     # cluster admin).
     # if you need to delete those entries
     echo 'delete from blatServers where db="danRer3";' \
     | hgsql hgcentraltest
     # to check the entries:
     echo 'select * from blatServers where db="danRer3";' \
     | hgsql hgcentraltest
 
 # AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2005-07-22, hartera)
 # REMAKE THIS TRACK USING chrUn AND chrNA SCAFFOLDS (DONE, 2005-08-19, hartera)
 # UPDATED (2006-09-27) - see separate section, UPDATE AFFY ZEBRAFISH TRACK.
     # array chip sequences already downloaded for danRer1
     ssh hgwdev
     cd /projects/compbio/data/microarray/affyZebrafish
     mkdir /cluster/bluearc/affy
     cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
        /cluster/bluearc/affy/
     # Set up cluster job to align Zebrafish consensus sequences to danRer3
     ssh kkr1u00
     mkdir -p /cluster/data/danRer3/bed/affyZebrafish.2005-08-19
     ln -s /cluster/data/danRer3/bed/affyZebrafish.2005-08-19 \
           /cluster/data/danRer3/bed/affyZebrafish
     cd /cluster/data/danRer3/bed/affyZebrafish
     mkdir -p /iscratch/i/affy
     cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy
     /cluster/bin/iSync
 
     # the kilokluster is down, so run on the pitakluster
     ssh pk
     cd /cluster/data/danRer3/bed/affyZebrafish
     ls -1 /cluster/bluearc/affy/Zebrafish_consensus.fa > affy.lst
     ls -1 /cluster/bluearc/danRer3/trfFa/chr[0-9M]*.fa > genome.lst
     # for output:
     mkdir -p /san/sanvol1/danRer3/affy/pslChrom
     echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/danRer3/affy/pslChrom/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
 
     gensub2 genome.lst affy.lst template.sub para.spec
     para create para.spec
     para try, check, push ... etc.
 # para time
 # Completed: 208 of 208 jobs
 # CPU time in finished jobs:       1355s      22.59m     0.38h    0.02d  0.000 y
 # IO & Wait Time:                  9988s     166.46m     2.77h    0.12d  0.000 y
 # Average job time:                  55s       0.91m     0.02h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:              74s       1.23m     0.02h    0.00d
 # Submission to last job:           217s       3.62m     0.06h    0.00d
 
     # then run the 2bit file of scaffolds
     ssh pk 
     cd /cluster/data/danRer3/bed/affyZebrafish
     mkdir scaffoldsNAandUnRun
     cd scaffoldsNAandUnRun
     ls -1 /cluster/bluearc/affy/Zebrafish_consensus.fa > affy.lst
     foreach f (/cluster/bluearc/scratch/danRer3/scaffoldsSoftMask/*.fa)
        ls -1 $f >> scafs.lst
     end
     mkdir -p /san/sanvol1/danRer3/affy/pslScaffoldsNAandUn
     echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/danRer3/affy/pslScaffoldsNAandUn/$(root1)_$(root2).psl}\n#ENDLOOP' > template2.sub
 
     gensub2 scafs.lst affy.lst template2.sub para.spec
     para create para.spec
     para try, check, push ... etc.
 # para time
 # Completed: 14941 of 14941 jobs
 # CPU time in finished jobs:      27574s     459.57m     7.66h    0.32d  0.001 y
 # IO & Wait Time:                 47642s     794.03m    13.23h    0.55d  0.002 y
 # Average job time:                   5s       0.08m     0.00h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:              35s       0.58m     0.01h    0.00d
 # Submission to last job:           339s       5.65m     0.09h    0.00d
 
     
     # need to do pslSort and lift up for each separate run
     cd /cluster/data/danRer3/bed/affyZebrafish
     cd /san/sanvol1/danRer3/affy/pslScaffoldsNAandUn
     # Do sort, best in genome filter, and convert to chromosome coordinates
     # to create affyZebrafish.psl
     # only use alignments that have at least 95% identity in the aligned
     # region. Do not use minCover since a lot of sequence is in Un, NA and
     # Finished, where genes may be split up, so it is good to see all
     # alignments.
     # first do the chr1-25 and chrM alignments
     pslSort dirs raw.psl tmp pslChrom
     pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
     # Processed 27408 alignments
     pslSort dirs rawNAandUn.psl tmp pslScaffoldsNAandUn
     pslReps -minAli=0.95 -nearTop=0.005 rawNAandUn.psl scafNAandUn.psl /dev/null
     # Processed 9888 alignments
     # lift up chrom contigs to chrom level
     liftUp affyZfishChroms.psl \
         /cluster/data/danRer3/jkStuff/liftAll.lft warn contig.psl
     liftUp affyZfishScafsNAandUn.psl \
       /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
       warn scafNAandUn.psl
     # sort and merge these files
     mkdir psl
     cp affyZfish* ./psl/
     pslSort dirs affyZebrafish.psl tmp1 psl
     
     # rsync these psl files 
     rsync -a --progress /san/sanvol1/danRer3/affy/*.psl \
          /cluster/data/danRer3/bed/affyZebrafish/
     ssh kkstore02
     cd /cluster/data/danRer3/bed/affyZebrafish
     # shorten names in psl file
     sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp
     mv affyZebrafish.psl.tmp affyZebrafish.psl
     pslCheck affyZebrafish.psl
     # psl is good
     # load track into database
     ssh hgwdev
     cd /cluster/data/danRer3/bed/affyZebrafish
     hgLoadPsl danRer3 affyZebrafish.psl
     # Add consensus sequences for Zebrafish chip
     # Copy sequences to gbdb if they are not there already
     mkdir -p /gbdb/hgFixed/affyProbes
     ln -s \
        /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
       /gbdb/hgFixed/affyProbes
                                                                                 
     hgLoadSeq -abbr=Zebrafish: danRer3 \
               /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
     # Clean up
     rm batch.bak contig.psl raw.psl
     # moved affyZebrafish.html description and trackDb.ra track entry and
     # search for Affy Zebrafish track to
     # ~/kent/src/hg/makeDb/trackDb/zebrafish since it is common to all 
     # danRer assemblies. 
 
 # LIFT FILES FROM SCAFFOLDS TO chrUn AND chrNA (DONE, 2005-07-27, hartera)
     ssh kkstore02
     mkdir -p /cluster/data/danRer3/liftSupertoChrom
     cd /cluster/data/danRer3/liftSupertoChrom
     # lift files were already created when scaffoldFaToAgp was run for chrUn.fa
     # and chrNA.fa. These need to be edited since the last 500 Ns were removed
     # from the agp files, making the sequence 184125739 bp (not 184126239 bp)
     # for chrUn and 253521007 bp (not 253521507 bp) for chrNA. chrUn also
     # needs to be changed to chrNA in the chrNA lift file.
     cp /cluster/data/danRer3/Un/tmp/chrUn.lft .
     cp /cluster/data/danRer3/NA/tmp/chrNA.lft .
     # edit to remove last lines of each file first
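      # one way to do that edit non-interactively (a sketch, assuming GNU sed):
      sed -i '$d' chrUn.lft
      sed -i '$d' chrNA.lft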
     # then use perl to change co-ordinates
     perl -pi.bak -e 's/184126239/184125739/' chrUn.lft
     perl -pi.bak -e 's/253521507/253521007/' chrNA.lft
     perl -pi.bak -e 's/chrUn/chrNA/' chrNA.lft
     cat *.lft >> liftNAandUnScaffoldsToChrom.lft
     # clean up 
     rm *.bak
 
 # ENSEMBL GENES (DONE, 2005-07-29, hartera) 
     ssh hgwdev  
     mkdir -p /cluster/data/danRer3/bed/ensembl
     cd /cluster/data/danRer3/bed/ensembl
     # Get the Ensembl gene data from
     # http://www.ensembl.org/Multi/martview
     # Follow this sequence through the pages: (NOTE: this interface has changed
     # a little since danRer2)
     # Page 1) Select the Ensembl dataset (v32 here) and the 
     # Danio_rerio choice (ZFISH5 here). Hit next. 22877 entries total.
     # Ensembl 35 now (2005-11-23) and this is the same as for the version 32
     # downloaded as above. Ensembl 36 (Dec 2005) is the same as for 32 for
     # Zebrafish. Ensembl 38 (April 2006) Protein Coding genes is the same 
     # as for Ensembl 32. (Select Gene type as protein_coding on page 2).
     # Page 2) Then hit next.
     # Page 3) Choose the "Structures" Attribute Page from the pulldown menu
     # at the top. Make sure that under the GENE section, the Ensembl 
     # Attributes checked include the Ensembl Gene ID and Ensembl 
     # Transcript ID. Choose GTF as the output. Choose gzip compression.  
     # Hit export. Save as ensemblGene.gtf.gz
 
     # the Ensembl gene predictions are mapped to chromosomes except for 
     # chrNA and chrUn. Use lift files for scaffolds to these chroms.
     # get chrUn and chrNA Ensembl records 
     ssh kkstore02
     cd /cluster/data/danRer3/bed/ensembl
     gunzip ensemblGene.gtf.gz
     awk '$1 ~ /^Zv5_NA[0-9]+/ || $1 ~ /^Zv5_scaffold[0-9]+/' ensemblGene.gtf \
                     > ensemblGenechrUns.gtf
     # get records for all other chroms
     awk '$1 ~ /^[0-9]+/' ensemblGene.gtf > ensemblGenechroms.gtf
     wc -l *.gtf
     # 513421 ensemblGenechroms.gtf
     # 125319 ensemblGenechrUns.gtf
     # 638740 ensemblGene.gtf
     # total lines of files made equal to original file so ok
     liftUp -type=.gtf ensemblGenechrUns.lifted \
      /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
      warn ensemblGenechrUns.gtf
      # Got 29880 lifts in 
      # /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft
      sed -e "s/^/chr/" ensemblGenechroms.gtf > ensGene.gtf
      cat ensemblGenechrUns.lifted >> ensGene.gtf
      # check file sizes -ok and some of the lifted co-ordinates
       # there were some erroneous lines with "1;" or "2;" - 8 lines total.
       # Notified Ensembl and they fixed it, so the file was downloaded again
       # and reloaded into the database.
      # Also remove the suffix that denotes the transcript version number. 
      # This is not in the ensGtp or ensPep tables.
       perl -pi.bak -e 's/\.[0-9]+//g' ensGene.gtf
  
      # load into database
      ssh hgwdev
      cd /cluster/data/danRer3/bed/ensembl
      hgsql -e 'drop table ensGene;' danRer3
      /cluster/bin/i386/ldHgGene danRer3 ensGene ensGene.gtf
      # Read 32143 transcripts in 638732 lines in 1 files
      # 32143 groups 27 seqs 1 sources 4 feature types
      # 32143 gene predictions
 
      # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
      # hgKnownToSuper.  Use ensMart to create it as above, except:
      # Page 3) Choose the "Features" box. In "Ensembl Attributes", check
      # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
      # Choose Text, tab-separated as the output format and gzip compression.  
      # Result name: ensGtp.
      gunzip ensGtp.tsv.gz
       hgsql danRer3 < ~/kent/src/hg/lib/ensGtp.sql
       # remove the first (header) line from ensGtp.tsv before loading
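       # one way to drop that header line (a sketch, assuming GNU sed):
       sed -i '1d' ensGtp.tsv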
      echo "load data local infile 'ensGtp.tsv' into table ensGtp" \
          | hgsql -N danRer3
 
     # Get the ensembl peptide sequences from
     # http://www.ensembl.org/Multi/martview
     # Choose Danio Rerio as the organism
     # Follow this sequence through the pages:
     # Page 1) Choose the Ensembl Genes choice. Hit next.
     # Page 2) Then hit next.
     # Page 3) Choose "Sequences" from the Attributes pulldown menu at the top.
     # Page 4) Choose Peptide as type of sequence to export and select 
     # Ensembl Gene ID from Gene Attributes and 
     # Ensembl Transcript ID and Ensembl Peptide Stable ID from 
     # Transcript Attributes as the output,
     # choose text/fasta and gzip compression,
     # name the file ensemblPep.fa.gz and then hit export.
     gunzip ensemblPep.fa.gz
     hgPepPred danRer3 ensembl ensemblPep.fa
     # added code to hgc.c so that the link to the Ensembl Protein
     # is also displayed on the description page.
 
 
 # FOR MGC GENES:
 #  - wait one day for the nightly genbank build to align and load them
 #    into the database
 #  - rebuild trackDb
 
 # SPLIT UP ZEBRAFISH MASKED SEQUENCE FROM chrUn and chrNA INTO SCAFFOLDS
 # (DONE, 2005-08-04, hartera)
 # ADD SOFT-MASKED SCAFFOLDS TO ISERVERS FOR CLUSTER RUNS 
 # (DONE, 2005-08-15, hartera) AND TO BLUEARC (DONE, 2005-08-19)
     ssh kkstore02
     cd /cluster/data/danRer3
     # for chrUn and chrNA, get masked sequence for soft and hard-masked 
     foreach c (Un NA)
       cd $c
       mkdir scaffoldsSoftMask scaffoldsHardMask
       awk 'BEGIN {FS="\t"}{if ($5 != "N") \
        print "faFrag -mixed chr'${c}'.fa",$2-1, $3, $6".fa";}' chr${c}.agp \
        >> ./scaffoldsSoftMask/faFragSoftMask.csh
       awk 'BEGIN {FS="\t"}{if ($5 != "N") \
         print "faFrag -mixed chr'${c}'.fa.masked",$2-1, $3, $6".fa.masked";}' \
         chr${c}.agp >> ./scaffoldsHardMask/faFragHardMask.csh
       cd ..
     end 
 
     # change permissions run scripts to get sequences
     foreach d (Un NA)
        chmod +x $d/scaffoldsSoftMask/faFragSoftMask.csh
        chmod +x $d/scaffoldsHardMask/faFragHardMask.csh
     end
 
     cat << '_EOF_' > jkStuff/getMaskedScaffolds.csh
 #!/bin/csh
 foreach c (Un NA)
    set dir=/cluster/data/danRer3
    echo "Processing $c"
    cd $dir/$c/scaffoldsSoftMask
    cp ../chr${c}.fa .
    echo "Getting soft-masked sequences ..." 
   nice ./faFragSoftMask.csh >& faFrag.log
    echo "Getting hard-masked sequences ..." 
    cd $dir/$c/scaffoldsHardMask
    cp ../chr${c}.fa.masked .
   nice ./faFragHardMask.csh >& faFrag.log
 end 
 '_EOF_'
    chmod +x jkStuff/getMaskedScaffolds.csh
    nice ./jkStuff/getMaskedScaffolds.csh &
     # spot-check a few sequences to confirm they are correct
     # add the scaffold name to each sequence's FASTA header and cat together
    foreach c (Un NA)
       set dir = /cluster/data/danRer3
       foreach d (scaffoldsSoftMask scaffoldsHardMask)
          cd $dir/$c/$d
          foreach f (Zv5*)
            if ($d == "scaffoldsHardMask") then
               set b=$f:r
               set g=$b:r
               set sc=scaffoldMasked${c}.fa
            else
               set g=$f:r
               set sc=scaffold${c}.fa
            endif 
            perl -pi.bak -e "s/>chr[0-9A-Za-z\-\:]+/>$g/" $f
            cat $f >> $sc
            rm *.bak
          end
          cp scaffold* $dir/$c/
       end
    end
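   # e.g. a faFrag-style header such as ">chrNA:0-149626" becomes
   # ">Zv5_NA1" (hypothetical scaffold name)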
   # check the size of the final FASTA file with all sequences, and
   # spot-check a few sequence files to confirm they are correct - ok 
    # Add soft-masked scaffolds to the iservers for cluster runs 
    # (2005-08-15, hartera)
    ssh kkr1u00
    mkdir -p /iscratch/i/danRer3/scaffoldsSoftMask
    cd /cluster/data/danRer3
    foreach c (NA Un)
       foreach f (/cluster/data/danRer3/$c/scaffoldsSoftMask/Zv5_*.fa)
       cp -p $f /iscratch/i/danRer3/scaffoldsSoftMask
       end
    end
    /cluster/bin/iSync
    # Add soft-masked scaffolds to the bluearc for cluster runs 
    # (2005-08-19, hartera)
    ssh kkr1u00
    cd /cluster/data/danRer3/
    mkdir -p /cluster/bluearc/scratch/danRer3/scaffoldsSoftMask
    foreach c (NA Un)
       foreach f (/cluster/data/danRer3/$c/scaffoldsSoftMask/Zv5_*.fa)
          rsync -a --progress $f \
          /cluster/bluearc/scratch/danRer3/scaffoldsSoftMask/
       end 
    end 
 
 # MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2005-08-05, hartera)
     ssh kkstore02
     cd /cluster/data/danRer3
     #- Build the compressed tar files
     cat << '_EOF_' > jkStuff/gzipAll.csh
 rm -rf gzip
 mkdir gzip
 # chrom AGP's
 tar cvzf gzip/chromAgp.tar.gz [0-9A-Z]*/chr*.agp
 # chrom RepeatMasker out files
 tar cvzf gzip/chromOut.tar.gz */chr*.fa.out
 # soft masked chrom fasta
 tar cvzf gzip/chromFa.tar.gz */chr*.fa
 # soft masked chrNA and chrUn scaffolds
 tar cvzf gzip/scaffoldUnsFa.tar.gz NA/scaffoldNA.fa \
     Un/scaffoldUn.fa
 # hard masked chrom fasta
 tar cvzf gzip/chromFaMasked.tar.gz */chr*.fa.masked
 # hard masked chrNA and chrUn scaffolds
 tar cvzf gzip/scaffoldUnsFaMasked.tar.gz \
     NA/scaffoldMaskedNA.fa \
     Un/scaffoldMaskedUn.fa
 # chrom TRF output files
 cd bed/simpleRepeat
 tar cvzf ../../gzip/chromTrf.tar.gz trfMaskChrom/chr*.bed
 cd ../..
 
 # get GenBank native mRNAs
 cd /cluster/data/genbank
 ./bin/i386/gbGetSeqs -db=danRer3 -native GenBank mrna \
         /cluster/data/danRer3/gzip/mrna.fa
 # get GenBank xeno mRNAs
 ./bin/i386/gbGetSeqs -db=danRer3 -xeno GenBank mrna \
         /cluster/data/danRer3/gzip/xenoMrna.fa
 # get native RefSeq mRNAs
 ./bin/i386/gbGetSeqs -db=danRer3 -native refseq mrna \
 /cluster/data/danRer3/gzip/refMrna.fa
 # get native GenBank ESTs
 ./bin/i386/gbGetSeqs -db=danRer3 -native GenBank est \
 /cluster/data/danRer3/gzip/est.fa
                                                                                 
 cd /cluster/data/danRer3/gzip
 # gzip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs
 gzip mrna.fa
 gzip xenoMrna.fa
 gzip refMrna.fa
 gzip est.fa
 '_EOF_'
     # << this line makes emacs coloring happy
     chmod +x ./jkStuff/gzipAll.csh
     csh ./jkStuff/gzipAll.csh |& tee ./jkStuff/gzipAll.log
     #- Look at gzipAll.log to make sure all file lists look reasonable.
     # Make upstream files and Copy the .zip files to
     # hgwdev:/usr/local/apache/...
     ssh hgwdev
     cd /cluster/data/danRer3/gzip
     # make upstream files for zebrafish RefSeq
     featureBits danRer3 refGene:upstream:1000 -fa=upstream1000.fa
     gzip upstream1000.fa
     featureBits danRer3 refGene:upstream:2000 -fa=upstream2000.fa
     gzip upstream2000.fa
     set gp = /usr/local/apache/htdocs/goldenPath/danRer3
     mkdir -p $gp/bigZips
     cp -p *.gz $gp/bigZips
     mkdir -p $gp/chromosomes
     foreach f (../*/chr*.fa)
        cp $f $gp/chromosomes
     end
     foreach c (NA Un)
        cd /cluster/data/danRer3/$c
       cp scaffold${c}.fa $gp/chromosomes
     end
     cd $gp/bigZips
     md5sum *.gz > md5sum.txt
     cd $gp/chromosomes
     # gzip the chromosome and scaffold FASTAs individually
     foreach f (*.fa)
       gzip $f
     end
     md5sum *.gz > md5sum.txt
     # Take a look at bigZips/* and chromosomes/*
     # copy README.txt's from danRer2 and update
 
 # MAKE NIB FILES AND 2BIT FILE FOR SOFT MASKED chrUn AND chrNA SCAFFOLDS
 # (DONE, 2005-08-06, hartera)
 # ADD chrUn AND chrNA SCAFFOLDS 2BIT FILE TO BLUEARC (DONE, 2005-08-19, hartera)
 
     ssh kkstore02
     cd /cluster/data/danRer3
     mkdir scaffoldsNAandUnNib
     # Build nib files, using the soft masking in the fa
     foreach c (NA Un)
        echo "Processing $c"
        foreach f ($c/scaffoldsSoftMask/Zv5*.fa)
          faToNib -softMask $f scaffoldsNAandUnNib/$f:t:r.nib
        end
     end
     # check correct number of nib files in directory: 14941
     # there are 14676 chrNA scaffolds and 265 chrUn scaffolds
     # copy chromosome 1-25 and chrNA and chrUn scaffolds nibs to a directory
     # on iscratch and iSync for use in cluster runs
     ssh kkr1u00
     mkdir -p /iscratch/i/danRer3/chromandScafNib
     cp -p /cluster/data/danRer3/nib/chr[0-9]*.nib \
        /iscratch/i/danRer3/chromandScafNib
     foreach f (/cluster/data/danRer3/scaffoldsNAandUnNib/Zv5*.nib)
        cp -p $f /iscratch/i/danRer3/chromandScafNib
     end
     ssh kkstore02
     # make a 2 bit file of all the scaffolds for chrNA and chrUn
     # for blastz cluster runs
     cd /cluster/data/danRer3/
     cat NA/scaffoldNA.fa Un/scaffoldUn.fa > danRer3NAandUnScaffolds.fa
     grep '>' danRer3NAandUnScaffolds.fa | wc -l
     # 14941
     faToTwoBit danRer3NAandUnScaffolds.fa danRer3NAandUnScaf.2bit
     ssh kkr1u00
     mkdir -p /iscratch/i/danRer3/NAandUnScafs
     cp /cluster/data/danRer3/danRer3NAandUnScaf.2bit \
        /iscratch/i/danRer3/NAandUnScafs
     /cluster/bin/iSync
     
     # get sizes of scaffolds for the .len file used by blastz
     ssh kolossus
     mkdir -p /panasas/store/danRer3/NAandUnScafSizes
     cd /cluster/data/danRer3
 cat << '_EOF_' > jkStuff/getNAandUnScafSizes.csh
 #!/bin/csh -fe
 foreach c (NA Un)
   set sizeDir=/panasas/store/danRer3/NAandUnScafSizes
   cd /cluster/data/danRer3/$c/scaffoldsSoftMask
   foreach f (Zv5*.fa)
      set g=$f:r
      faSize detailed=on $f >> $sizeDir/NAandUnScafs.sizes
   end
 end
 '_EOF_'
     chmod +x jkStuff/getNAandUnScafSizes.csh
     nice jkStuff/getNAandUnScafSizes.csh >& size.log &
     # took about 1 minute
     wc -l /panasas/store/danRer3/NAandUnScafSizes/NAandUnScafs.sizes
     # 14941 /panasas/store/danRer3/NAandUnScafSizes/NAandUnScafs.sizes
     # so correct number of scaffolds
     cp /panasas/store/danRer3/NAandUnScafSizes/NAandUnScafs.sizes \
        /cluster/data/danRer3
     # add 2 bit to bluearc for cluster runs (2005-08-19, hartera)
     ssh kkr1u00
     mkdir -p /cluster/bluearc/scratch/danRer3
     cp /cluster/data/danRer3/danRer3NAandUnScaf.2bit \
        /cluster/bluearc/scratch/danRer3/
 
 # BLASTZ SWAP FOR MOUSE (mm6) (DONE, 2005-08-10, hartera)
 # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
 # REMAKE AXTNET AND COPY TO DOWNLOADS. REMAKE MAFNET (DONE, 2005-08-17, hartera)
 # DROPPED THE CHAIN AND NET TABLES FROM HGWDEV AS THERE WERE 3 SETS OF 
 # MOUSE ALIGNMENTS: mm6, mm7 and mm8 (DONE, 2006-03-28, hartera)
     ssh kkr1u00
     # blastz requires lineage-specific repeats
     # Treat all repeats as lineage-specific
     # if not done already, get lineage-specific repeats
     mkdir -p /iscratch/i/mm6/linSpecRep.notInZebrafish
     foreach f (/panasas/store/mm6/rmsk/chr*.fa.out)
       cp -p $f /iscratch/i/mm6/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
     end
 
     mkdir -p /iscratch/i/danRer3/linSpecRep.notInMouse
     foreach f (/iscratch/i/danRer3/rmsk/chr*.fa.out)
       cp -p $f /iscratch/i/danRer3/linSpecRep.notInMouse/$f:t:r:r.out.spec
     end
     /cluster/bin/iSync
 
     # NOTE: the "mouse/human/etc." lineage-specific repeat files are now in
     # /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
     # however, the files for chrNA and chrUn were missing, so I'm
     # adding them here.  (2005-12-19 kate)
     ssh kkstore02
     cd /cluster/data/danRer3
     cp -p Un/chrUn.fa.out  \
         /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers/chrUn.out.spec
     cp -p NA/chrNA.fa.out \
         /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers/chrNA.out.spec
 
     # do swap of mm6 vs. danRer3 chain and net alignments to 
     # create danRer3 vs. mm6. see makeMm6.doc for details.
     ssh kk
     cd /cluster/data/mm6/bed/blastz.danRer3
     mkdir -p /panasas/store/danRer3vsmm6Out
     nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
         -stop download -blastzOutRoot /panasas/store/danRer3vsmm6Out \
         -swap -chainMinScore=5000 >& doSwap.log &
     # Start: Aug 10 16:30
     # Finish: Aug 10 16:54
     # Blastz parameters are as for mm6 vs. danRer3 - see makeMm6.doc
 # BLASTZ_H=2000
 # BLASTZ_Y=3400
 # BLASTZ_L=6000
 # BLASTZ_K=2200
 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 # BLASTZ_ABRIDGE_REPEATS=1
  # do cleanup step and specify a different file server since the panasas
  # can not be accessed from kkstore02.
   nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
         -continue cleanup -fileServer eieio \
         -blastzOutRoot /panasas/store/danRer3vsmm6Out \
         -swap -chainMinScore=5000 >& doSwapCleanUp.log &
   # make html files and trackDb.ra entry for chain and net tracks.
   # check README.txt for downloads.
 # featureBits -chrom=chr1 danRer3 refGene:cds chainMm6Link -enrichment
 # refGene:cds 0.688%, chainMm6Link 8.193%, both 0.543%, cover 78.94%, 
 # enrich 9.64x
 # featureBits -chrom=chr1 danRer2 refGene:cds chainMm5Link -enrichment
 # refGene:cds 0.642%, chainMm5Link 4.499%, both 0.492%, cover 76.60%, 
 # enrich 17.02x
 # featureBits -chrom=chr2 danRer3 refGene:cds chainMm6Link -enrichment 
 # refGene:cds 0.705%, chainMm6Link 8.219%, both 0.557%, cover 79.04%, 
 # enrich 9.62x
 # featureBits -chrom=chr2 danRer2 refGene:cds chainMm5Link -enrichment 
 # refGene:cds 0.739%, chainMm5Link 4.539%, both 0.579%, cover 78.37%, 
 # enrich 17.26x
 # looks good, although enrichment is lower than for danRer2 and mm5. There are 
 # more chains in the score <10000 range for danRer3 than for danRer2, but 
 # this does not account for all the extra chains in danRer3 over danRer2. 
 # Maybe there are more high-scoring alignments to the chrUn and chrNA chains 
 # due to the scaffolds being used for the alignments.
 # danRer3 has extra sequence compared to danRer2, although danRer3 chr2 is 
 # 48.2 Mb while danRer2 chr2 is 52 Mb, so in this case the chrom is smaller.
 # featureBits -chrom=chrNA danRer3 refGene:cds chainMm6Link -enrichment
 # refGene:cds 0.449%, chainMm6Link 10.952%, both 0.350%, cover 77.94%, 
 # enrich 7.12x
 # featureBits -chrom=chrNA danRer2 refGene:cds chainMm5Link -enrichment
 # refGene:cds 0.499%, chainMm5Link 4.176%, both 0.372%, cover 74.60%, 
 # enrich 17.86x
 
    # netToAxt was processing nets incorrectly so remake these with 
    # new version of netToAxt and transfer to downloads dir. 
    ssh kkstore02
    cd /cluster/data/danRer3/bed/blastz.mm6.swap
    rm -r axtNet
    # Make axtNet for download: one .axt per danRer3 seq.
    # remake noClass.net
    # Make nets("noClass", i.e. without rmsk/class stats which are added later):
    cd axtChain
 chainPreNet danRer3.mm6.all.chain.gz /cluster/data/mm6/bed/blastz.danRer3/S2.len /cluster/data/mm6/bed/blastz.danRer3/S1.len stdout \
 | chainNet stdin -minSpace=1 /cluster/data/mm6/bed/blastz.danRer3/S2.len /cluster/data/mm6/bed/blastz.danRer3/S1.len stdout /dev/null \
 | netSyntenic stdin noClass.net
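 # (chainPreNet drops chains that have no chance of making it into the net,
 # chainNet builds the net from the chains, and netSyntenic adds synteny
 # information to the net)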
 
    # create net for each chrom again
    netSplit noClass.net net
    # also split up chains again
    mkdir chain
    zcat danRer3.mm6.all.chain.gz | chainSplit chain stdin
    ssh hgwdev
    cd /cluster/data/danRer3/bed/blastz.mm6.swap
    mkdir axtNet
    foreach f (axtChain/net/*.net)
      netToAxt $f axtChain/chain/$f:t:r.chain \
     /cluster/bluearc/danRer3/nib /panasas/store/mm6/nib stdout \
      | axtSort stdin stdout \
      | gzip -c > axtNet/$f:t:r.danRer3.mm6.net.axt.gz
    end
 
    # cleanup 
    ssh kkstore02 
    cd /cluster/data/danRer3/bed/blastz.mm6.swap/axtChain
    rm noClass.net
    rm -r net
    rm -r chain
    # remake mafNet from the new axtNet
    cd /cluster/data/danRer3/bed/blastz.mm6.swap
    rm -r mafNet
    # Make mafNet for multiz: one .maf per danRer3 seq.
    mkdir mafNet
    foreach f (axtNet/*.danRer3.mm6.net.axt.gz)
     axtToMaf -tPrefix=danRer3. -qPrefix=mm6. $f \
     /cluster/data/mm6/bed/blastz.danRer3/S2.len /cluster/data/mm6/bed/blastz.danRer3/S1.len \
      stdout \
      | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
    end
 
    # copy the new axtNet files to downloads and replace old ones
    ssh hgwdev
    rm -r /usr/local/apache/htdocs/goldenPath/danRer3/vsMm6/axtNet
    cd /usr/local/apache/htdocs/goldenPath/danRer3/vsMm6
    mkdir -p /usr/local/apache/htdocs/goldenPath/danRer3/vsMm6/axtNet
    ln -s /cluster/data/danRer3/bed/blastz.mm6.swap/axtNet/*.axt.gz axtNet/
    # remake md5sum.txt 
    rm md5sum.txt
    md5sum *.gz */*.gz > md5sum.txt
    # Dropped mouse mm6 chain and net tables from hgwdev as there were 3 sets 
    # of mouse alignments for danRer3: mm6, mm7 and mm8 (hartera, 2006-03-29)
    hgsql -e 'drop table netMm6;' danRer3
    foreach c (`cat /cluster/data/danRer3/chrom.lst`)
       hgsql -e "drop table chr${c}_chainMm6;" danRer3
       hgsql -e "drop table chr${c}_chainMm6Link;" danRer3
    end
 
 # BLASTZ FOR FUGU (fr1) (DONE, 2005-08-18, hartera)
 # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
 # RECREATE DOWNLOADS AS THE FUGU DOWNLOADS DIRECTORY HAS BEEN DELETED
 # (DONE, 2005-11-17, hartera)
   ssh kk
   mkdir /cluster/data/danRer3/bed/blastz.fr1.2005-08-13
   cd /cluster/data/danRer3/bed
   ln -s blastz.fr1.2005-08-13 blastz.fr1
 # use parameters for fr1 in makeDanRer2.doc. Using scaffolds makes this run
 # slower so it is best to have the scaffolds in the query. Use HoxD55.q 
 # matrix as Fugu is quite distant from zebrafish. Blastz uses 
 # lineage-specific repeats but there are none for these two species.
 cat << '_EOF_' > DEF
 # zebrafish (danRer3) vs. Fugu (fr1)
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
 
 ALIGN=blastz-run
 BLASTZ=blastz
 BLASTZ_H=2000
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET - zebrafish (danRer3)
 # soft-masked chroms, 1-25 and M
 SEQ1_DIR=/iscratch/i/danRer3/chromNib
 SEQ1_RMSK=
 # lineage-specific repeats
 # we don't have that information for these species
 SEQ1_SMSK=
 SEQ1_FLAG=
 SEQ1_IN_CONTIGS=0
 # 10 MB chunk for target
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY - Fugu (fr1)
 # soft-masked scaffolds in 2bit format
 SEQ2_DIR=/iscratch/i/fr1/UnScaffolds/fr1UnScaffolds.2bit
 SEQ2_RMSK=
 SEQ2_SMSK=
 SEQ2_FLAG=
 SEQ2_IN_CONTIGS=0
 # 10 Mbase for query
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/danRer3/bed/blastz.fr1
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 
 #DEBUG=1
 '_EOF_'
     # << this line keeps emacs coloring happy
     chmod +x DEF
     cp /cluster/data/danRer3/chrom.sizes ./S1.len
     # make S2.len for fr1 scaffolds
     twoBitInfo /cluster/data/fr1/fr1UnScaffolds.2bit ./S2.len
     wc -l *.len
     # 28 S1.len
     # 20379 S2.len
     # make output directory
     mkdir -p /cluster/bluearc/danRer3vsfr1Out
     # do blastz and create chains for fr1 scaffolds on danRer3 chr1-25 and chrM 
     # chickenHumanTuned.gap scoring matrix is now used by default 
     # by axtChain.
     nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
        -blastzOutRoot /cluster/bluearc/danRer3vsfr1Out -chainMinScore=5000 \
        -stop chainMerge >& do.log &
     # Start: Aug 13 10:48 
     # Finish: Aug 13 13:35
     # then run the danRer3 NA and Un scaffolds against fugu scaffolds 
     mkdir NAandUnScaffolds
     cd NAandUnScaffolds
 cat << '_EOF_' > DEF
 # zebrafish (danRer3) vs. Fugu (fr1)
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
 
 ALIGN=blastz-run
 BLASTZ=blastz
 BLASTZ_H=2000
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET - zebrafish (danRer3)
 # soft-masked scaffolds for chrNA and chrUn in 2 bit format
 SEQ1_DIR=/iscratch/i/danRer3/NAandUnScafs/danRer3NAandUnScaf.2bit
 SEQ1_RMSK=
 # lineage-specific repeats
 # we don't have that information for these species
 SEQ1_SMSK=
 SEQ1_FLAG=
 SEQ1_IN_CONTIGS=0
 # 10 MB chunk for target
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY - Fugu (fr1)
 # soft-masked scaffolds in 2bit format
 SEQ2_DIR=/iscratch/i/fr1/UnScaffolds/fr1UnScaffolds.2bit
 SEQ2_RMSK=
 SEQ2_SMSK=
 SEQ2_FLAG=
 SEQ2_IN_CONTIGS=0
 # 10 Mbase for query
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/danRer3/bed/blastz.fr1/NAandUnScaffolds
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 
 #DEBUG=1
 '_EOF_'
     # << this line keeps emacs coloring happy
     chmod +x DEF
     twoBitInfo /cluster/data/danRer3/danRer3NAandUnScaf.2bit ./S1.len
     # make S2.len for fr1 scaffolds
     twoBitInfo /cluster/data/fr1/fr1UnScaffolds.2bit ./S2.len
     wc -l *.len
     # 14941 S1.len
     # 20379 S2.len
     # make output directory
     mkdir -p /cluster/bluearc/danRer3vsfr1Out/NAandUnScaffolds
     # do blastz and create chains for fr1 scaffolds on danRer3 
     # chrNA and chrUn scaffolds
     nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
        -blastzOutRoot /cluster/bluearc/danRer3vsfr1Out/NAandUnScaffolds \
        -chainMinScore=5000 -stop chainMerge >& do.log & 
     # Start: Aug 13 14:05
     # Finish: Aug 14 20:58
     # The log file says it is finished. chainSplit was not run as SEQ1 does
     # not have < 100 sequences. Need to do liftUp before running chainSplit.
     cd /cluster/data/danRer3/bed/blastz.fr1/NAandUnScaffolds/axtChain/run
     # Lifting up chains:
     # need to lift these chains up to chrom level for Fugu for chrom run and 
     # for danRer3 and Fugu for the NA and Un scaffolds run.
     # first for Fugu in the danRer3 chrom run
     ssh kkstore02
     cd /cluster/data/danRer3/bed/blastz.fr1/axtChain
     mkdir liftedChain
     foreach f (chain/*.chain)
        set c=$f:t:r
        echo $c
        liftUp -chainQ liftedChain/${c}.lifted.chain \
              /cluster/data/fr1/Un/lift/ordered.lft warn $f
     end
     # lift up for danRer3 scaffolds run.
     ssh kkstore02
     cd /cluster/data/danRer3/bed/blastz.fr1/NAandUnScaffolds/axtChain
     # first lift the Fugu fr1 query; there are no split chains here as there
     # were not < 100 sequences for the target.
  zcat danRer3.fr1.all.chain.gz | liftUp -chainQ danRer3.fr1.liftedQall.chain \
           /cluster/data/fr1/Un/lift/ordered.lft warn stdin
     # then liftUp target coords for danRer3 
     liftUp danRer3.fr1.liftedQandTall.chain \
       /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
       warn danRer3.fr1.liftedQall.chain
     # gzip lifted danRer3.fr1 chain file
     gzip danRer3.fr1.liftedQandTall.chain
     # merge the chains from the danRer3 chrom run and the danRer3
     # NA and Un scaffolds run. chains are sorted by score and IDs are uniqued.
     cd /cluster/data/danRer3/bed/blastz.fr1/axtChain
     mv danRer3.fr1.all.chain.gz danRer3.fr1.chroms.chain.gz
     set blastz=/cluster/data/danRer3/bed/blastz.fr1
     # copy over lifted chains for danRer3 scaffolds vs fr1 
     cp $blastz/NAandUnScaffolds/axtChain/danRer3.fr1.liftedQandTall.chain.gz \
        ./liftedChain
     gunzip ./liftedChain/*.gz
     nice chainMergeSort liftedChain/*.chain \
          | nice gzip -c > danRer3.fr1.all.chain.gz
     # then split up into chains again
     mv chain chromChain
     mkdir chain
     nice zcat danRer3.fr1.all.chain.gz | chainSplit chain stdin
     # then pick up the doBlastzChainNet.pl script at the net step.
     ssh kkstore02
     cd /cluster/data/danRer3/bed/blastz.fr1
     cp DEF DEF.chroms
     # edit DEF file to use all the nib files for danRer3 and the 
     # nib file for the chrUn of Fugu fr1. Since all the coords have now
     # been lifted to chrom level, these are the sequences needed.
     # SEQ1_DIR=/iscratch/i/danRer3/nib
     # SEQ2_DIR=/cluster/bluearc/fugu/fr1/chromNib
     # use kkr1u00 for computationally intensive steps as kolossus is down.
     # need to create new S2.len for whole chrUn for Fugu
     mv S2.len S2.scaffolds.len
     cp /cluster/data/fr1/chrom.sizes S2.len 
     nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
        -blastzOutRoot /cluster/bluearc/danRer3vsfr1Out -chainMinScore=5000 \
        -workhorse kkr1u00 -continue net >& doNet.log &
     # crashed at cleanup step when trying to access kkstore02 
     # The authenticity of host 'kkstore02 (128.114.50.155)' can't be
     # established.  Re-run from this step.
     nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
        -blastzOutRoot /cluster/bluearc/danRer3vsfr1Out -chainMinScore=5000 \
        -workhorse kkr1u00 -continue cleanup >& doNet2.log &
     # netToAxt was processing nets incorrectly so remake these with 
     # new version of netToAxt. 
     # and transfer to downloads dir.
     ssh kkstore02
     cd /cluster/data/danRer3/bed/blastz.fr1
     rm -r axtNet
     # Make axtNet for download: one .axt per danRer3 seq.
     # remake noClass.net
     # Make nets("noClass", i.e. without rmsk/class stats which are added later):
     cd axtChain
     chainPreNet danRer3.fr1.all.chain.gz \
 /cluster/data/danRer3/bed/blastz.fr1/S1.len /cluster/data/danRer3/bed/blastz.fr1/S2.len stdout \
 | chainNet stdin -minSpace=1 /cluster/data/danRer3/bed/blastz.fr1/S1.len \
 /cluster/data/danRer3/bed/blastz.fr1/S2.len stdout /dev/null \
 | netSyntenic stdin noClass.net
     # create net for each chrom again
     netSplit noClass.net net
     # also split up chains again
     mkdir chain
     zcat danRer3.fr1.all.chain.gz | chainSplit chain stdin
     ssh hgwdev
     cd /cluster/data/danRer3/bed/blastz.fr1
     mkdir axtNet
     foreach f (axtChain/net/*.net)
        netToAxt $f axtChain/chain/$f:t:r.chain \
        /cluster/bluearc/danRer3/nib /cluster/bluearc/fugu/fr1/chromNib stdout \
        | axtSort stdin stdout \
        | gzip -c > axtNet/$f:t:r.danRer3.fr1.net.axt.gz
     end
     # cleanup 
     ssh kkstore02 
     cd /cluster/data/danRer3/bed/blastz.fr1/axtChain
     rm noClass.net
     rm -r net
     rm -r chain
     # remake mafNet from the new axtNet
     cd /cluster/data/danRer3/bed/blastz.fr1
     rm -r mafNet
     mkdir mafNet
     foreach f (axtNet/*.danRer3.fr1.net.axt.gz)
       axtToMaf -tPrefix=danRer3. -qPrefix=fr1. $f \
      /cluster/data/danRer3/bed/blastz.fr1/S1.len /cluster/data/danRer3/bed/blastz.fr1/S2.len \
      stdout \
      | gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
     end
 
     # copy the new axtNet files to downloads and replace old ones
     ssh hgwdev
     rm -r /usr/local/apache/htdocs/goldenPath/danRer3/vsFr1/axtNet
     cd /usr/local/apache/htdocs/goldenPath/danRer3/vsFr1
     mkdir -p /usr/local/apache/htdocs/goldenPath/danRer3/vsFr1/axtNet
     ln -s /cluster/data/danRer3/bed/blastz.fr1/axtNet/*.axt.gz axtNet/
     # remake md5sum.txt 
     rm md5sum.txt
     md5sum *.gz */*.gz > md5sum.txt
 
     # Check README in downloads section and add a note about how the 
     # unordered chroms were split up into scaffolds.
     # Add trackDb entry for chain and net tracks to 
     # trackDb/zebrafish/danRer3/trackDb.ra 
     # Do swap to get danRer3 chains on Fugu, fr1 - see makeFr1.doc
 # featureBits -chrom=chr2 danRer3 refGene:cds chainFr1Link -enrichment
 # refGene:cds 0.705%, chainFr1Link 8.960%, both 0.645%, cover 91.53%, 
 # enrich 10.22x
 # featureBits -chrom=chr2 danRer2 refGene:cds chainFr1Link -enrichment
 # refGene:cds 0.739%, chainFr1Link 4.537%, both 0.620%, cover 83.90%, 
 # enrich 18.49x
 # featureBits -chrom=chrNA danRer3 refGene:cds chainFr1Link -enrichment
 # refGene:cds 0.449%, chainFr1Link 7.129%, both 0.399%, cover 88.78%, 
 # enrich 12.45x
 # featureBits -chrom=chrNA danRer2 refGene:cds chainFr1Link -enrichment
 # refGene:cds 0.499%, chainFr1Link 3.901%, both 0.409%, cover 81.90%, 
 # enrich 20.99x
     # Run directory files are already on /cluster/data. Remake downloads
     # for fugu alignments since these have been removed from
     # the downloads directory. (hartera, 2005-11-17)
     ssh hgwdev 
     # remake downloads using doBlastzChainNet.pl script
     cd /cluster/data/danRer3/bed/blastz.fr1
     nice /cluster/bin/scripts/doBlastzChainNet.pl \
         -continue download -stop download `pwd`/DEF >& doDownload.log &
     # Check README in downloads section and add a note about how the 
     # unordered chroms were split up into scaffolds.
 
 # VEGA
     # get transcripts in transcripts_coords, received by e-mail from 
     # Mario Caccamo at Sanger, 06/16/05.
     # also README for Vega
     ssh kkstore01
     mkdir -p /cluster/data/danRer3/bed/vegaGene
     cd /cluster/data/danRer3/bed/vegaGene 
 
 # AUTO UPDATE GENBANK MRNA AND EST AND MGC GENES RUN (DONE, 2005-08-22, markd)
     # align with revised genbank process
     cd ~kent/src/hg/makeDb/genbank
     cvs update -d etc
     # edit etc/genbank.conf to add danRer3, had to run on pk, due to kk
     # being down.  Set temporary locations for server files
 
 # danRer3 (zebrafish)
 # Lift file partitions unplaced sequence pseudo-chroms (disabled)
 danRer3.serverGenome = /cluster/data/danRer3/danRer3.2bit
 ##danRer3.clusterGenome = /iscratch/i/danRer3/danRer3.2bit
 ##danRer3.ooc = /iscratch/i/danRer3/danRer3_11.ooc
 danRer3.clusterGenome = /san/sanvol1/scratch/danRer3/danRer3.2bit
 danRer3.ooc = /san/sanvol1/scratch/danRer3/danRer3_11.ooc
 ##danRer3.align.unplacedChroms = chrNA chrUn
 ##danRer3.lift = /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft
 danRer3.lift = no
 danRer3.downloadDir = danRer3
 danRer3.mgcTables.default = full
 danRer3.mgcTables.mgc = all
 
     # update /cluster/data/genbank/
     make etc-update
 
     ssh kkstore02
     cd /cluster/data/genbank
     nice bin/gbAlignStep -initial danRer3 &
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     nice ./bin/gbDbLoadStep -drop -initialLoad  danRer3&
 
     # enable daily alignment and update of hgwdev
     cd ~kent/src/hg/makeDb/genbank
     cvs update -d etc
     # add danRer3 to:
         etc/align.dbs
         etc/hgwdev.dbs 
     cvs commit
     make etc-update
 
 # TIGR GENE INDEX (DONE, 2005-08-24, hartera)
 # Data from Razvan Sultana (rsultana@jimmy.harvard.edu or rsultana@tigr.org)
 # Includes data for chr1-25 and chrM, NOT chrNA and chrUn. Asked for these
 # on scaffolds and not on the virtual chroms, which are harder to generate. 
     ssh kkstore02
     mkdir -p /cluster/data/danRer3/bed/tigr
     cd /cluster/data/danRer3/bed/tigr
     wget --timestamping \
 ftp://ftp.tigr.org/pub/data/tgi/Danio_rerio/TGI_track_danRer3_chr1-25.tgz
     tar xvzf TGI*.tgz
     # this is data for just chr1-25 and chrM. Data for NA and Un are to follow.
     ls chr1_*
     # chr1_drosophTCs  chr1_g_gallusTCs  chr1_mouseTCs  chr1_zfishTCs
     # chr1_elegansTCs  chr1_humanTCs     chr1_ratTCs
     # so species are fly, chicken, mouse, zebrafish, C. elegans, human and rat
     foreach f (*g_gallus*)
        set f1 = `echo $f | sed -e 's/g_gallus/chicken/g'`
        mv $f $f1
     end 
 
     foreach f (*drosoph*)
     set f1 = `echo $f | sed -e 's/drosoph/Dmelano/g'`
        mv $f $f1
     end
 
     foreach o (Dmelano chicken elegans human mouse rat zfish)
       echo $o
       setenv O $o
       foreach f (chr*_$o*s)
        tail +2 $f | perl -wpe 's/THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
       end
     end
     ssh hgwdev
     cd /cluster/data/danRer3/bed/tigr
     hgsql danRer3 -e "drop table tigrGeneIndex"
 
     nice ldHgGene -exon=TC danRer3 tigrGeneIndex *.gff
     # Read 75388 transcripts in 288032 lines in 182 files
     # 75388 groups 26 seqs 1 sources 1 feature types
     # 75388 gene predictions
     checkTableCoords danRer3 tigrGeneIndex
     /cluster/bin/scripts/runGeneCheck /cluster/data/danRer3/bed/tigr
     # no CDS in these gene predictions so fix this:
     hgsql danRer3 -e "update tigrGeneIndex set cdsStart = txStart;"
     hgsql danRer3 -e "update tigrGeneIndex set cdsEnd = txEnd;"
     # compress all files
     gzip chr*
 
 # MAKE Human Proteins track  (DONE 2005-09-21 braney)
     ssh kkstore02
     mkdir -p /cluster/data/danRer3/blastDb
     cd /cluster/data/danRer3/blastDb
     cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA/d" | sed "/Un/d" > chrom.list
     for i in `cat chrom.list`; do ls -1 ../$i/*/*.fa . ; done | sed -n "/.*_.*_.*_.*/p" > list
     ln -s `cat list` .
     for i in *.fa
     do
 	/projects/compbio/bin/i686/formatdb -i $i -p F
     done
     rm *.log *.fa list
     cd ..
     for i in `cat blastDb/chrom.list`; do cat  $i/chr*/*.lft  ; done > jkStuff/subChr.lft
     rm blastDb/chrom.list
 
     mkdir /cluster/data/danRer3/scaffoldBlastDb
     cd /cluster/data/danRer3/scaffoldBlastDb
     cat ../Un/scaffoldsSoftMask/*.fa ../NA/scaffoldsSoftMask/*.fa |  faSplit sequence stdin 500 scaf
     for i in *.fa
     do
 	/projects/compbio/bin/i686/formatdb -i $i -p F
     done
     rm *.log *.fa
 
     mkdir -p /san/sanvol1/scratch/danRer3/comboBlastDb
     cd /cluster/data/danRer3/blastDb
     for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/danRer3/comboBlastDb; done
     cd /cluster/data/danRer3/scaffoldBlastDb
     for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/danRer3/comboBlastDb; done
 
     mkdir -p /cluster/data/danRer3/bed/tblastn.hg17KG
     cd /cluster/data/danRer3/bed/tblastn.hg17KG
     echo  /san/sanvol1/scratch/danRer3/comboBlastDb/*.nsq  | xargs ls -S | sed "s/\.nsq//"  > query.lst  
 
     # we want around 250000 jobs
     calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
 # 37365/(250000/3539) = 528.938940
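 # i.e. splitting 37365 proteins into chunks of 529 gives ~71 chunks, and
 # 71 chunks x 3539 database partitions is ~250000 cluster jobs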
 
     mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa
     split -l 529 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa/kg
     ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa kgfa
     cd kgfa
     for i in *; do pslxToFa $i $i.fa; rm $i; done
     cd ..
     ls -1S kgfa/*.fa > kg.lst
     mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut
     ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut
     for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
 
     tcsh
     cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
 #ENDLOOP
 '_EOF_'
     cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/iscratch/i/blast/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
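 # try progressively stricter e-value cutoffs: blastall can fail on a chunk
 # at the more permissive cutoffs (presumably from the volume of hits), so
 # retry until one of the runs succeeds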
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
 	liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2       
         liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3       
 	liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4       
 
         if pslCheck -prot $3.tmp                                                                          
         then                                                                                              
             mv $3.tmp $3                                                                                  
             rm -f $f.1 $f.2 $f.3  $f.4
         fi
         exit 0                                                                                            
     fi                                                                                                    
 fi                                                                                                        
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
 
     chmod +x blastSome
     gensub2 query.lst kg.lst blastGsub blastSpec
 
     ssh kk
     cd /cluster/data/danRer3/bed/tblastn.hg17KG
     para create blastSpec
     para push
 
 # Completed: 203170 of 203170 jobs
 # CPU time in finished jobs:   17875092s  297918.20m  4965.30h  206.89d  0.567 y
 # IO & Wait Time:               4092508s   68208.46m  1136.81h   47.37d  0.130 y
 # Average job time:                 108s       1.80m     0.03h    0.00d
 # Longest finished job:            1778s      29.63m     0.49h    0.02d
 # Submission to last job:         64970s    1082.83m    18.05h    0.75d
 
     tcsh
     cat << '_EOF_' > chainGsub
 #LOOP
 chainOne $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainOne
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin ../c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainOne
 
     ls -1dS `pwd`/blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
 
     para create chainSpec
     para push
 
 # Completed: 71 of 71 jobs
 # CPU time in finished jobs:      89115s    1485.25m    24.75h    1.03d  0.003 y
 # IO & Wait Time:                 35631s     593.85m     9.90h    0.41d  0.001 y
 # Average job time:                1757s      29.28m     0.49h    0.02d
 # Longest finished job:           15587s     259.78m     4.33h    0.18d
 # Submission to last job:         23380s     389.67m     6.49h    0.27d
 
     ssh kkstore02
     cd /cluster/data/danRer3/bed/tblastn.hg17KG/blastOut
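     # filtering below: c60 keeps alignments whose aligned span covers >60%
     # of the query protein (psl fields: $11=qSize, $12=qStart, $13=qEnd),
     # pslUniq keeps the best alignment per query, and m60 keeps alignments
     # with matches/qSize > 0.60 ($1=matches)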
     for i in kg??
     do 
 	cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
 	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
 	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
 	echo $i
     done
 
     liftUp -nohead -type=.psl stdout /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft carry u.*.psl m60* | \
     sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > /cluster/data/danRer3/bed/tblastn.hg17KG/blastHg17KG.psl
         
     ssh hgwdev
     cd /cluster/data/danRer3/bed/tblastn.hg17KG
     hgLoadPsl danRer3 blastHg17KG.psl
     featureBits danRer3 blastHg17KG
 # 21063005 bases of 1630323462 (1.292%) in intersection
 
     # back to kkstore02
     rm -rf blastOut
 # End tblastn
 
 # BACENDS TRACK (DONE, 2005-09-28, hartera)
 # Track display is very slow on large regions. Split all_bacends table by
 # chromosome (DONE, 2006-04-19, hartera)
 # REDO BACENDS FOR PAIRS, SINGLES, BAD PAIRS AND ALL BACENDS TABLES
 # (see separate section on REDO BACENDS, 2006-05-01 - 2006-05-08, hartera)    
     ssh kkstore01
     # BAC ends sequence files provided by Mario Caccamo at Sanger
     # mc2@sanger.ac.uk
     mkdir -p /cluster/data/danRer3/bed/bacends
     cd /cluster/data/danRer3/bed/bacends
 
     wget --timestamp ftp://ftp.sanger.ac.uk/pub/mc2/zf_bacends.fa.gz
     wget --timestamp ftp://ftp.sanger.ac.uk/pub/mc2/DH_bacends.fa.gz
     wget --timestamp ftp://ftp.sanger.ac.uk/pub/mc2/bacend_placement.txt.gz
     gunzip *.gz  
     # DH_bacends.fa are from the new library from a doubled haploid zebrafish
     # zf_bacends.fa are from the existing libraries used in danRer2 and danRer1
     # Several reads are present for some of the BAC ends and these have
     # names like p1kaSP6 or q1kaT7 for duplicated reads and p1kSP6w or q1kT7w
     # for multiple reads. In the trace repository, the most recent sequence
     # is stored and the 'a' or 'w' is dropped from the name.
     # for the DH_bacends.fa from the CHORI73 library, the names are 
     # experiment file name                  trace_name
     # ========================              ================
     # CHORI73_139g06.p1kSP6                 CHORI73_139G6SP6
     # CHORI73_165b21.q1kT7                  CHORI73_165B21T7
     # The trace name is that stored in the trace archive with leading zeros
     # dropped and ".p1k" or ".q1k" and lower case changed to upper. 
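     # a sketch of that name mapping on one read (the fuller perl
     # substitution used further below also handles the a/w read suffixes):
     echo "CHORI73_139g06.p1kSP6" | tr '[a-z]' '[A-Z]' \
        | perl -pe 's/(CHORI73_[0-9]+[A-Z])0?([0-9]+)\.(P1K|Q1K)/$1$2/'
     # CHORI73_139G6SP6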
     ssh kkstore02
     cd /cluster/data/danRer3/bed/bacends
     # check list of prefixes in zf_bacends.fa
     grep '>' zf_bacends.fa > zf.names
     perl -pi.bak -e 's/>//' zf.names
     perl -pi.bak -e 's/^([A-Za-z]+)[0-9]+.+/$1/' zf.names
     sort -u zf.names
     # bZ
     # zC
     # zK
     # zKp
     # in DH_bacends.fa, all are CHORI73_
     # For DH_bacends.fa, need to clean up and change names to Trace archive
     # format as above, then choose the most recent read. Bad reads 
     # with lots of Ns should be removed at the alignment stage as they will 
     # not pass the Blat or pslReps criteria. 
    #  cat zf_bacends.fa DH_bacends.fa >> Zv5Bacends.fa
   #  faSize Zv5Bacends.fa
     # 680121953 bases (11160014 N's 668961939 real 668961939 upper 0 lower) 
     # in 729101 sequences in 1 files
     # Total size: mean 932.8 sd 242.6 min 26 (CHORI73_189m04.p1kSP6) 
     # max 5717 (CHORI73_255a17.q1kT7) median 882
     # N count: mean 15.3 sd 75.7
     # U count: mean 917.5 sd 242.2
     # L count: mean 0.0 sd 0.0
     wc -l *.fa
     # 6412741 DH_bacends.fa
     # 14700258 Zv5Bacends.fa
     # 8287517 zf_bacends.fa
     grep '>' DH_bacends.fa | wc -l
     # 304252
     grep '>' zf_bacends.fa | wc -l
     # 424849
     # for DH_bacends.fa there are replicate reads. If duplicate plates 
     # have been made (i.e. read names like ..p1kaSP6 or ..q1kaT7) or plates 
     # have been sequenced multiple times (i.e. read names like ..p1kSP6w or 
     # ..q1kT7w), the Sanger trace repository has the most recent read and 
     # dropped the 'a' or 'w' from the trace name.
     # some are not in the repository. They had bad quality reads with a lot 
     # of Ns or runs of the same base. These should be dropped in the 
     # alignment filtering. 
     
     # now download sequence files from Sanger ftp site - these are the 
     # ones from the Sanger sequence repository
     ssh kkstore02
     mkdir -p /cluster/data/danRer3/bed/bacends/seqs
     cd /cluster/data/danRer3/bed/bacends/seqs
     # get contents of ftp directory
     wget --timestamp \
  ftp://ftp.ensembl.org/pub/traces/danio_rerio/fasta/
     # from index.html, grep lines with cloneEnd 
     grep "cloneEnd" index.html > cloneEnds
     awk 'BEGIN {FS="\""} {print "wget --timestamp",$2;}' cloneEnds \
         > getCloneEnds.csh
     chmod +x getCloneEnds.csh 
     cat getCloneEnds.csh
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1025270298.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1025273988.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1025278580.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1035416745.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1035417824.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1040215846.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1048006071.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1114727127.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115222417.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115226483.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115230498.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115234585.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115238038.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115240957.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1039514906.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1039603426.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1039604741.fasta.gz
 wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1040231265.fasta.gz
     nice getCloneEnds.csh >& bac.log &
     # check log to see everything downloaded ok and then remove
     rm bac.log index.html
     # unzip files
     gunzip *.gz
     # cat together CHORI73 FASTA files
     cat sanger-zfish-CHORI*.fasta > CHORI73_bacends.fa
     grep '>' CHORI73_bacends.fa > CHORI73.names
     perl -pi.bak -e 's/>//' CHORI73.names
     sort CHORI73.names | uniq > CHORI73.names.sort
     wc -l CHORI73.names.sort
     # 265235 CHORI73.names.sort
     cat sanger-zfish-cloneEnd*.fasta > zfish_bacends.fa
     cat mpgeb-zfish-cloneEnd*.fasta > zfishmpgeb_bacends.fa
     grep '>' zfish_bacends.fa | wc -l
     # 164302
     grep '>' zfishmpgeb_bacends.fa | wc -l
     # 264633
     cp CHORI73.names.sort /cluster/data/danRer3/bed/bacends/
    # compared this list of sequence names to those in zf_bacends.fa and got
    # more sequences in zf_bacends.fa - checked and some are in the trace
    # repository and some are not. 
    # for CHORI_73 there are 394 extra sequences in the downloaded file
    # and over 7000 in the original file sent by Mario. Just use the original 
    # file here since bad sequences will probably be filtered out by the
    # alignment filtering. Get the list of clones for which there are more 
    # than 2 end reads; some end sequences have multiple reads. 
     cd /cluster/data/danRer3/bed/bacends
     # look at file of CHORI73_ sequences sent by Mario at Sanger:
     grep '>' DH_bacends.fa > DH.names
     perl -pi.bak -e 's/>//' DH.names 
     perl -pi.bak -e 's/(CHORI73_[0-9a-z]+)\.[a-z0-9]+.+/$1/' DH.names
     sort DH.names | uniq -c | sort -nr > DH.names.counts 
     awk '{if ($1 > 2) print $2;}' DH.names.counts > DH.names.morethan2
     # translate to upper case and remove leading zeros
     cat DH.names.morethan2 | tr '[a-z]' '[A-Z]' > DH.names.morethan2.upper
     # remove leading 0
     perl -pi.bak -e 's/(CHORI73_[0-9]+[A-Z])0([0-9]+)/$1$2/' \
         DH.names.morethan2.upper
     sort DH.names.morethan2.upper | uniq > DH.names.morethan2.upper.sort
     wc -l *.sort
     # 265235 CHORI73.names.sort
     # 6020 DH.names.morethan2.upper.sort
     comm -12 CHORI73.names.sort DH.names.morethan2.upper.sort | wc
     # 5299
     # so 721 are not in this list so they are probably not in the repository
     # but align these anyway.
     # for those that are then use the versions in CHORI73.names
     comm -12 CHORI73.names.sort DH.names.morethan2.upper.sort \
         > CHORI73.names.touse
     comm -13 CHORI73.names.sort DH.names.morethan2.upper.sort \
         > DHmorethan2.DHonly
     awk '{if ($1 <= 2) print $2;}' DH.names.counts > DH.names.2orless
     # this is list of sequences to get from DH_bacends.fa
     # need to back translate the list in DHmorethan2.DHonly
     cat DHmorethan2.DHonly | tr '[A-Z]' '[a-z]' > DHtmp
     sed -e 's/chori/CHORI/' DHtmp > DHmorethan2.DHonly.format
     # need to put leading zeros back and "." at the end to help
     # pattern matching with grep.
 cat << '_EOF_' > addZeros.pl
 #!/usr/bin/perl -w
 use strict;
 
 my ($file);
 $file = $ARGV[0];
 
 open (FILE, $file) || die "Can not open $file: $!\n";
 
 while (<FILE>)
 {
 chomp;
 my ($l,$id);
 $l = $_;
 if ($l =~ /^CHORI73_[0-9]+[a-z][0-9]{2,}/)
    {
    print "$l\\.\n";
    }
 elsif($l =~ /^(CHORI73_[0-9]+[a-z])([0-9]{1})/)
   {
   $id = $1 . "0" . $2 . "\\.";
   print "$id\n";
   }
 }
 close FILE;
 '_EOF_'
     chmod +x addZeros.pl
     perl addZeros.pl DHmorethan2.DHonly.format > DHmorethan2.DHonly.format2
     wc -l DHmorethan2.DHonly*
     # 721 DHmorethan2.DHonly
     # 721 DHmorethan2.DHonly.format
     # 721 DHmorethan2.DHonly.format2
     # need to get full sequence names
     grep '>' DH_bacends.fa > DHBacs.fullnames
     perl -pi.bak -e 's/>//' DHBacs.fullnames
     perl -pi.bak -e 's/(CHORI73_[0-9a-z]+\.[a-z0-9A-Z]+) bases.+/$1/' \
          DHBacs.fullnames
     grep -f DHmorethan2.DHonly.format2 DHBacs.fullnames \
             > DHmorethan2.DHonly.fullnames
     wc -l DHmorethan2.DHonly.fullnames
     # 2352 DHmorethan2.DHonly.fullnames
     sort DHmorethan2.DHonly.fullnames > DHmorethan2.DHonly.fullnames.sort
     # do for those with less than 2 sequences to get the full names
 cat << '_EOF_' > getFullNames.pl
 #!/usr/bin/perl -w
 use strict;
 
 my ($file, $patterns, %idsHash);
 $file = $ARGV[0];
 $patterns = $ARGV[1];
 open (FILE, $file) || die "Can not open $file: $!\n";
 open (PATTERNS, $patterns) || die "Can not open $patterns: $!\n";
 
 while (<FILE>)
 {
 chomp;
 my ($l, $pref, $dir);
 $l = $_;
 if ($l =~ /^(CHORI73_[0-9a-z]+)\./)
    {
    $pref = $1;
    push(@{$idsHash{$pref}}, $l);
    }
 }
 close FILE;
 
 while (<PATTERNS>)
 {
 my ($line, @ids, $i);
 chomp;
 $line = $_;
 if (exists($idsHash{$line}))
    {
    @ids = @{$idsHash{$line}};
    foreach $i (@ids)
        {
        print "$i\n";
        }
    }
 }
 close PATTERNS;
 '_EOF_'
     chmod +x getFullNames.pl
     perl getFullNames.pl DHBacs.fullnames DH.names.2orless \
          > DH.fullnames.2orless
    
     # do the same for CHORI73.names.touse to get full names
     awk '{print $1"SP6"}' CHORI73.names.touse > CHORI73.namesSP6.touse
     awk '{print $1"T7"}' CHORI73.names.touse > CHORI73.namesT7.touse
     cat CHORI73.namesSP6.touse CHORI73.namesT7.touse \
         > CHORI73.namesSP6andT7.touse
     wc -l CHORI73.names*
     # 265235 CHORI73.names.sort
     # 10598 CHORI73.namesSP6andT7.touse
     # 5299 CHORI73.namesSP6.touse
     # 5299 CHORI73.namesT7.touse
     # 5299 CHORI73.names.touse
 
     grep '>' CHORI73_bacends.fa > CHORI73.fullnames
     perl -pi.bak -e 's/>//' CHORI73.fullnames
     grep -f CHORI73.namesSP6andT7.touse CHORI73.fullnames \
          > CHORI73.fullnames.touse
     # so get all the sequence records together in one file
     ssh kkstore02
     cd /cluster/data/danRer3/bed/bacends
     mkdir bacSeqs
     # get all sequences from DH_bacends.fa that have 2 or fewer reads for the
     # clone. This might include cases where there are duplicate reads for one
     # end only, but these will go into the singles track anyway.
     faSomeRecords DH_bacends.fa DH.fullnames.2orless ./bacSeqs/DHBacs.2orless.fa
     # get all sequences with more than 2 sequences for that clone but
     # with no sequence in the new downloaded BAC ends sequence file that 
     # has only one sequence for each BAC end.
     faSomeRecords DH_bacends.fa DHmorethan2.DHonly.fullnames.sort \
          ./bacSeqs/DHBacs.2ormore.orig.fa
     # get all sequences for BAC ends where there are more than 2 reads for 
     # the ends of one clone, so there are replicate reads for at least one end.
     # use the sequence in the downloaded CHORI73 set of clone ends for these.
     faSomeRecords CHORI73_bacends.fa CHORI73.fullnames.touse \
          ./bacSeqs/CHORI73.fromDH.morethan2.fa
     cd bacSeqs
     # translate to upper case and remove leading zeros
     cat DHBacs.2orless.fa | tr '[a-z]' '[A-Z]' > DHBacs.2orless.format.fa
     cat DHBacs.2ormore.orig.fa | tr '[a-z]' '[A-Z]' \
         > DHBacs.2ormore.orig.format.fa
     # remove leading 0 and just use name as FASTA header
     # need to leave in a or w as in p1kaSP6 or q1kaT7 or p1kSP6w or q1kT7w
     # these will distinguish replicate reads from the same sequence and will
     # be removed later when the best alignment is selected.
     perl -pi.bak -e \
     's/(CHORI73_[0-9]+[A-Z]{1})0?([0-9]+)\.(P1K|Q1K)(ASP6|SP6|SP6W|AT7|T7|T7W) BASES.+/$1$2$4/' \
         DHBacs*format.fa
     cat CHORI73.*.fa DHBacs*.format.fa > CHORI73BACends.fa
     grep '>' CHORI73BACends.fa | wc -l
     # 295722
     # then combine these with the zf_bacends.fa from Sanger which contain
     # the rest of the BAC end sequences.
     cat ../zf_bacends.fa CHORI73BACends.fa > Zv5BACends.fa
     grep '>' Zv5BACends.fa | wc -l
     # 720571
     faSize Zv5BACends.fa 
     # 674252474 bases (10674972 N's 663577502 real 663577502 upper 0 lower) in 
     # 720571 sequences in 1 files Total size: mean 935.7 sd 239.8 
     # min 26 (CHORI73_189M4SP6) max 5403 (zC259G13.zb) median 882
     # N count: mean 14.8 sd 72.4
     # U count: mean 920.9 sd 239.6
     # L count: mean 0.0 sd 0.0
     # check Zv5BACends.fa has unique sequence names
     grep '>' Zv5BACends.fa | sed 's/>//' > names
     sort names | uniq -c | sort -nr > names.count
     # all unique names so cleanup
     rm names names.count *.bak
     # Now the BAC end sequences file has been made, align the sequences 
     # to danRer3 using Blat.
 
     ssh pk
     # problems running these on kk using input from bluearc - it slowed down
     # kkstore02 with heavy load. So move everything to the san as it 
     # scales better than the bluearc, especially from the pk. The run 
     # directory is on the san also.  
     cd /cluster/data/danRer3/bed/bacends/bacSeqs
     # first split up bacends sequence and add to directory on the san
     mkdir -p /san/sanvol1/scratch/danRer3/bacends/Zv5bacends
     # split up sequence for cluster runs
     faSplit sequence Zv5BACends.fa 20 \
             /san/sanvol1/scratch/danRer3/bacends/Zv5bacends/bacends
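      # (faSplit sequence splits on record boundaries into ~20 numbered
      # bacends*.fa files of about the same size, so no BAC end is cut)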
     # get all the chrom contig files onto the san
     mkdir -p /san/sanvol1/scratch/danRer3/trfFaChroms
     rsync -a --progress /cluster/bluearc/danRer3/trfFa/chr[0-9M]*.fa \
          /san/sanvol1/scratch/danRer3/trfFaChroms/
 
     cd /cluster/data/danRer3/bed/bacends
     mkdir -p /san/sanvol1/scratch/danRer3/bacends/chromsRun
     ln -s /san/sanvol1/scratch/danRer3/bacends/chromsRun
     # make directory for output, do not have output going to /cluster/data dir
     # as it is very large.
     mkdir -p /san/sanvol1/scratch/danRer3/bacends/chromsPsl
     ln -s /san/sanvol1/scratch/danRer3/bacends/chromsPsl
     # also copy over the 11.ooc file for danRer3 if not there already
     cp -p /cluster/bluearc/danRer3/danRer3_11.ooc \
        /san/sanvol1/scratch/danRer3/ 
     # make input file lists
     cd /cluster/data/danRer3/bed/bacends/chromsRun
     ls -1S /san/sanvol1/scratch/danRer3/bacends/Zv5bacends/*.fa > bacends.lst
     # do blat just for chr1-25 and chrM
     ls -1S /san/sanvol1/scratch/danRer3/trfFaChroms/*.fa > seqs.lst
     # 64 bit blat used for pk. This version of blat recently had a bug fix
     # so should give the same result as i386 blat on kk. use absolute path for
     # output dir rather than symlink as that would increase I/O.
 # use Blat parameters as for mm5 and hg17
 cat << '_EOF_' > template
 #LOOP
 /cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc {check out line+ /san/sanvol1/scratch/danRer3/bacends/chromsPsl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
    # << this line keeps emacs coloring happy
     /cluster/bin/i386/gensub2 seqs.lst bacends.lst template jobList
     /cluster/bin/i386/para create jobList
     /cluster/bin/i386/para try, check, push, check, ...
 # /cluster/bin/i386/para time
 # Completed: 4160 of 4160 jobs
 # CPU time in finished jobs:     746878s   12447.96m   207.47h    8.64d  0.024 y
 # IO & Wait Time:                 11166s     186.11m     3.10h    0.13d  0.000 y
 # Average job time:                 182s       3.04m     0.05h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:             915s      15.25m     0.25h    0.01d
 # Submission to last job:          5100s      85.00m     1.42h    0.06d
 
     # run jobs to do blat of NA and Un scaffolds vs BAC end sequences
     ssh pk
     # copy scaffolds to the san
     mkdir -p /san/sanvol1/scratch/danRer3/scaffoldsSoftMask
     foreach f (/cluster/bluearc/scratch/danRer3/scaffoldsSoftMask/Zv5_*.fa)
       rsync -a --progress $f /san/sanvol1/scratch/danRer3/scaffoldsSoftMask/
     end 
     cd /cluster/data/danRer3/bed/bacends
     mkdir -p /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnRun
     ln -s /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnRun
     # make directory for output, do not have output going to /cluster/data dir
     # as it is very large.
     mkdir -p /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl
     ln -s /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl
     # make input file lists
     cd /cluster/data/danRer3/bed/bacends/scaffoldsNAandUnRun
     ls -1S /san/sanvol1/scratch/danRer3/bacends/Zv5bacends/*.fa > bacends.lst
     # do blat just for NA and Un scaffolds
     foreach f (/san/sanvol1/scratch/danRer3/scaffoldsSoftMask/Zv5_*.fa)
        echo $f >> scafs.lst
     end
    # 64 bit blat used for pk. This version of blat recently had a bug fix
    # so should give the same result as i386 blat on kk. use absolute path for
    # output dir rather than symlink as that would increase I/O.
 # use Blat parameters as for mm5 and hg17
 cat << '_EOF_' > template
 #LOOP
 /cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc {check out line+ /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
    # << this line keeps emacs coloring happy
     /cluster/bin/i386/gensub2 scafs.lst bacends.lst template jobList
     /cluster/bin/i386/para create jobList
     /cluster/bin/i386/para try, check, push, check, ...
 # para time
 # Completed: 298820 of 298820 jobs
 # CPU time in finished jobs:    1232495s   20541.58m   342.36h   14.26d  0.039 y
 # IO & Wait Time:                923511s   15391.85m   256.53h   10.69d  0.029 y
 # Average job time:                   7s       0.12m     0.00h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            1008s      16.80m     0.28h    0.01d
 # Submission to last job:         37494s     624.90m    10.41h    0.43d
  
     ssh kolossus
     cd /cluster/data/danRer3/bed/bacends
     # need to sort psl files, filter and liftUp
     # first do the chr1-25 and chrM alignments
     nice pslSort dirs rawChroms.psl tmp chromsPsl >& chromSort.log
     # Time taken: 2 hours 42 minutes 
     pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
             rawChroms.psl bacEndsChroms.psl /dev/null >& pslRepsChroms.log
     # Took 19 minutes
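     # pslReps filtering: -nearTop keeps alignments scoring within 2% of the
     # best for a query, -minCover requires >=60% of the BAC end aligned,
     # -minAli requires an alignment ratio of >=0.85, and -noIntrons avoids
     # penalizing alignments that lack introns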
     # then lift up NA and Un scaffolds to chrom level
     nice pslSort dirs rawNAandUn.psl tmp scaffoldsNAandUnPsl \
          >& scafsNAandUnSort.log
     # took 1 hour 50 minutes
     pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
             rawNAandUn.psl  bacNAandUnScafs.psl /dev/null >& pslRepsNAandUn.log
     # took 18 minutes
     # lift results:
     liftUp bacEnds.liftedChroms.psl /cluster/data/danRer3/jkStuff/liftAll.lft \
            warn bacEndsChroms.psl
     liftUp bacEnds.liftedNAandUn.psl \
       /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
            warn bacNAandUnScafs.psl
      
     # sort and merge these files
     mkdir liftedPsl
     mv *.lifted*.psl ./liftedPsl/
     nice pslSort dirs bacEnds.psl tmp1 liftedPsl >& pslSortAll.log
     # Took 4 minutes
     pslCheck bacEnds.psl >& pslCheck.log
     # there are 520 BAC ends with overlapping block errors - 1385 alignments
 
     # use pslReps parameters used for mm6
     pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons raw.psl \
             bacEnds.psl /dev/null
     # those for hg17
     pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
             raw.psl  bacEnds2.psl /dev/null
     # see how many align in each case
     awk '{print $10;}' bacEnds.psl | sort | uniq -c \
         | sort -nr > bacEnds.qNames.sort
     awk '{print $10;}' bacEnds2.psl | sort | uniq -c \
         | sort -nr > bacEnds2.qNames.sort
     wc -l bacEnds*qNames.sort
     # 549086 bacEnds2.qNames.sort
     # 519773 bacEnds.qNames.sort
     grep '>' Zv5Bacends.fa | wc -l
     # 729101
     # so 71% of sequences aligned in bacEnds.psl
     # and 75% of sequences aligned in bacEnds2.psl
     # use textHistogram to look at the number of alignments per sequence
     # (see the sketch after these notes)
     # bacEnds.psl has 374002 with only 1 alignment
     # bacEnds2.psl has 362364 with only 1 alignment
     # bacEnds.psl - most alignments for 1 sequence is 515, 
     # for bacEnds2.psl - most alignments for 1 sequence is 1272
     # when these are split up into bacEndPairs, bacEndPairsBad and 
     # bacEndSingles, the number of alignments per sequence is reduced
     # so use bacEnds2.psl
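     # the textHistogram step above, as a sketch (column 1 of the
     # *.qNames.sort files holds the per-sequence alignment count
     # produced by "uniq -c"; the exact binning flags are a judgment call):
     textHistogram -col=1 -binSize=1 -maxBinCount=25 bacEnds.qNames.sort
     textHistogram -col=1 -binSize=1 -maxBinCount=25 bacEnds2.qNames.sort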
      
      # Process BAC end alignments
      ssh kkstore02
      mkdir -p /cluster/data/danRer3/bed/bacends/pairs
      mkdir -p /cluster/data/danRer3/bed/bacends/bacends.1
      # Downloaded BAC ends accessions from SRS
      # Go to http://srs.sanger.ac.uk
      # Go to "Select Databanks" tab and check DBGSS
      # Go to "Query Form" tab
      # Select Organism as field and enter "Danio*" as search term
      # Select AllText as field and enter "*Sanger*" as search term
       # Select AllText as field and enter "T7|SP6" as search term
      # Select a view
       # Download as BACEndAccs.txt to the bacends.1 directory 
      cd /cluster/data/danRer3/bed/bacends/bacends.1
      cp /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1/getBacEndInfo.pl .
      # get lists of SP6 and T7 accessions and merge lists
      awk 'BEGIN {FS="\t"}{OFS="\t"} {if ($7 ~ /SP6/) print $3"SP6",$4}' \
          BACEndAccs.txt > BACEndSP6.accs
      awk 'BEGIN {FS="\t"}{OFS="\t"} {if ($7 ~ /T7/) print $3"T7",$4}' \
          BACEndAccs.txt > BACEndT7.accs
      cat BACEndSP6.accs BACEndT7.accs > BACEndExtNames.accs
      # change external names to internal names
 cat << '_EOF_' > extToIntNames.pl
 #!/usr/bin/perl -w
 use strict;
 
 my @clonePrefixes = ("CH211-", "ch211-", "DKEY-", "DKEYP-", "RP71-", "BUSM1-", "CH73-", "CHORI-");
 my %cloneHash = qw {
    CH211-  zC
    DKEY-   zK
    DKEYP-  zKp
    RP71-   bZ
    BUSM1-  dZ
    CH73-   CHORI73_
 };
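
# note: prefixes listed in @clonePrefixes but missing from %cloneHash
# (ch211-, CHORI-) will match the loop below but are never printed,
# i.e. those lines are skipped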
 
 while (<STDIN>) 
 {
 my ($l, $c, $intPref);
 $l = $_;
 foreach $c (@clonePrefixes)
    {
    if ($l =~ /$c/)
        {
        # get internal name
        if (exists($cloneHash{$c}))
           {
           $intPref = $cloneHash{$c};
           $l =~ s/$c/$intPref/; 
           print $l;
           }
        }
    }
 }
 '_EOF_'
      chmod +x extToIntNames.pl
      perl extToIntNames.pl < BACEndExtNames.accs > BACEnd_accessions.txt
       # get BAC clone accessions from Genbank. They can be obtained from EMBL
       # through SRS but it is harder to separate the BAC end accessions from
       # the BAC clone accessions:
      # go to http://www.ncbi.nlm.nih.gov
      # 1) select "Nucleotide" as the search database.
      # 2) Search string: Danio rerio[ORGN] AND clone[TITL] NOT survey[TITL]
      # Those sequences with "genomic survey" in the title appear to be 
      # BAC clone end accessions. Here, we want only BAC clone accessions.
      # 3) There are 628991 sequences (2005-09-19). Select File from Send To 
      # pulldown menu and name file "BACClones.gbAccs.txt".
      # create script to parse out clone ID and the accession:
 cat << '_EOF_' > getAccsandIdsFromGb.pl
 #!/usr/bin/perl -w
 use strict;
 
 my @clonePrefixes = ("CH211-", "ch211-", "DKEY-", "DKEYP-", "RP71-", "BUSM1-", "CH73-", "CHORI-");
 my %cloneHash = qw {  
    CH211-  zC
    DKEY-   zK
    DKEYP-  zKp 
    RP71-   bZ
    BUSM1-  dZ
    CH73-   CHORI73_
 };
 
 my $found = "FALSE";
 my $acc = "";
 my $id = "";
 while (<STDIN>)
 {
 my ($l, @f, $intId, $extPref, $intPref);
 $intPref = "";
 $extPref = "";
 
 chomp;
 $l = $_;
 if ($l =~ /^[0-9]+:\s+([A-Z]+[0-9]{3,})/)
    {
    $acc = "";
    $acc = $1;
    $found = "FALSE";
    }
 elsif ($l =~ /clone/)
    {
    $id = "";
    # check for clone name in this line
    foreach my $p (@clonePrefixes)
       {
       if ($l =~ /clone:?\s?($p[0-9-A-Za-z]+)/)
          {
          $id = $1;
          # translate to upper case
          $id =~ tr/a-z/A-Z/;
          $extPref = $p;
          $found = "TRUE";
          }
       }
    }
 if ($found eq "TRUE")
    {
    if (exists($cloneHash{$extPref}))
       {
       $intPref = $cloneHash{$extPref};
       }
    $intId = $id;
    # translate this to internal ID
    $intId =~ s/$extPref/$intPref/;
    print "$intId\t$acc\t$id\n";
    $found = "FALSE";
    }
 }
 '_EOF_'
       # An earlier version of the script was run as:
       # chmod +x getAccsandIds.pl
       # perl getAccsandIds.pl < BACClones.accs.txt > BACClonesIdsandAccs.txt
       # That run took 36 minutes; it was superseded by getAccsandIdsFromGb.pl:
      chmod +x getAccsandIdsFromGb.pl
      # CHORI73_ is a new prefix, this is for the internal name of 
      # BAC clones from the CHORI73 doubled haploid library.
      nice perl getAccsandIdsFromGb.pl < BACClones.gbAccs.txt \
           > BACClonesIdsandAccs.txt &
      
      # Took under 3 minutes. The output file here has internal BAC clone name, 
      # Genbank accession and external BAC clone name.
      grep '>' ../bacSeqs/Zv5BACends.fa | sed -e 's/>//' > allBacEnds.names
       # modify getBacEndInfo.pl to handle these sequence names and rename it
       # getBacEndInfov2.pl
       # need to make the pairs file (a simplified sketch follows the run)
      perl getBacEndInfov2.pl allBacEnds.names BACEnd_accessions.txt \
           > bacEnds.log
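       # For orientation, a much-simplified sketch of what building the
       # pairs file involves (assumes plain T7/SP6 read names only; the
       # real getBacEndInfov2.pl also handles the .y/.z read suffixes and
       # joins multiple reads per end with commas):
       awk 'BEGIN {OFS="\t"} \
            /T7$/  { c = $1; sub(/T7$/, "", c); t7[c] = $1 } \
            /SP6$/ { c = $1; sub(/SP6$/, "", c); sp6[c] = $1 } \
            END { for (c in t7) if (c in sp6) print t7[c], sp6[c], c }' \
           allBacEnds.names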
      # check that all the BAC end sequence names from allBacEnds.names
      # appear in either bacEndPairs.txt or bacEndSingles.txt
      wc -l bacEnd*
      # 159319 bacEndAccs.aliases
      # 333356 bacEndPairs.txt
      # 19788 bacEndSingles.txt
      # bacEndAccs.aliases contains sequence read names and their
      # Genbank accessions. 
      awk 'BEGIN {OFS="\n"} {print $1, $2;}' bacEndPairs.txt \
          | sed -e 's/,/\n/g' > bacPrs.names
      awk '{print $1;}' bacEndSingles.txt | sed -e 's/,/\n/g' > bacSingles.names
      cat bacPrs.names bacSingles.names | sort > bacEnds.names.sort
      sort allBacEnds.names > allBacEnds.names.sort
      wc -l *.sort
      # 720571 allBacEnds.names.sort
      # 720571 bacEnds.names.sort
      # so all the BAC ends from the FASTA file have been accounted for either
      # as pairs or singles.
      # process BAC end alignments
      cd /cluster/data/danRer3/bed/bacends/pairs
      set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
      # try different parameters
       /cluster/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
      -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
      -mismatch -verbose ../bacEnds.psl $bacDir/bacEndPairs.txt \
      all_bacends bacEnds
      wc -l bacEnds.*
      # 426 bacEnds.long
      # 14875 bacEnds.mismatch
      # 229139 bacEnds.orphan
      # 164778 bacEnds.pairs
      # 0 bacEnds.short
      # 100 bacEnds.slop
      # 409318 total
       # compared to danRer2, there is less slop (190), more pairs (90967)
       # and orphans (229139), less mismatch (18083) and fewer long (980)
       # size of sequence should be 100-200 kb but, since the assembly is not 
       # complete, there are misassemblies so the distance between pairs could
       # be larger. If -max=200000 -slopval=10000 -hardMax=500000 is used, then
       # there are 18377 bacEnds.long, 250243 bacEnds.orphan and
       # 131209 bacEnds.pairs (413243 total), and over 3000 fewer just drop out
      # try -max=300000 -slopval=10000 -hardMax=500000
      # wc -l bacEnds.*
      # 3343 bacEnds.long
      # 11731 bacEnds.mismatch
      # 243500 bacEnds.orphan
      # 154981 bacEnds.pairs
      #  0 bacEnds.short
      # 509 bacEnds.slop
      # 414064 total
      # try -min=25000 -max=350000 -slopval=10000 -hardMax=500000 as for human
      # wc -l bacEnds.*
      # 1725 bacEnds.long
      # 12081 bacEnds.mismatch
      # 242235 bacEnds.orphan
      # 156444 bacEnds.pairs
      # 616 bacEnds.short
      # 1017 bacEnds.slop
      # 414118 total
      # this would be good to use but for direct comparison between danRer2 
      # and danRer3, it would be good to use the same parameters as before
      # so stick with those above: 
      # -min=2000 -max=650000 -slopval=10000 -hardMax=800000 
      # create header required by "rdb" tools
 
      # NOTE: there are overlapping BAC clone ends for danRer3. Some of these
      # are only a few kb apart (from beginning of one to end of the other)
      # so use stricter pslPairs parameters as for human and mouse.
      ssh kkstore02
      mkdir /cluster/data/danRer3/bed/bacends/pairs
      cd /cluster/data/danRer3/bed/bacends/pairs
      set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
      /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
 -mismatch -verbose ../bacEnds.psl \
         $bacDir/bacEndPairs.txt all_bacends bacEnds
      wc -l bacEnds.*
 
      echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\
           > ../header
      echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
      # make pairs bed file
      cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
                | headchg -del > bacEndPairs.bed
      # also need to process bacEndSingles.txt into a database table
       # for singles in bacEndSingles.txt, create a dummy file where each is
       # given zJA11B12T7 as a dummy sequence pair. If the single is a forward
       # sequence, put the dummy sequence in the second column; if the single
       # is a reverse sequence, put it in the first column. Use a perl script
       # to do this (sketched just below).
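       # a minimal sketch of that reformatting (the real formatSingles.pl is
       # copied from danRer2 below and may differ; assumes the end name is in
       # column 1 of bacEndSingles.txt and forward reads end in T7/t7/z*):
       awk 'BEGIN {OFS="\t"} \
            { if ($1 ~ /(T7|t7|z[a-z]?)$/) print $1, "zJA11B12T7", $2; \
              else print "zJA11B12T7", $1, $2; }' bacEndSingles.txt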
      cd /cluster/data/danRer3/bed/bacends
      set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
      mkdir singles
      cd singles
      cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl .
      perl formatSingles.pl $bacDir/bacEndSingles.txt > \
                            $bacDir/bacEndSingles.format
      # then run pslPairs on this formatted file
      /cluster/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
      -max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
      -mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \
      all_bacends bacEnds
      wc -l bacEnds.*
      # 0 bacEnds.long
      # 0 bacEnds.mismatch
      # 11439 bacEnds.orphan
      # 0 bacEnds.pairs
      # 0 bacEnds.short
      # 0 bacEnds.slop
      # there are 11439 orphans here and 229139 from pair analysis so 
      # a total of 240578 orphans
      cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles
      wc -l bacEnds.singles
      # 240578 bacEnds.singles
      # make singles bed file
      cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
                   | headchg -del > bacEndSingles.bed
      cp bacEndSingles.bed ../pairs
      cd ../pairs
      # all slop, short, long, mismatch and orphan pairs go into bacEndPairsBad
      # since orphans are already in bacEndSingles, do not add these
      cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
         bacEnds.orphan | row score ge 300 | sorttbl chr start \
         | headchg -del > bacEndPairsBad.bed
      # add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans 
      # twice so create a bed file of bacEndPairsBadNoOrphans.bed without orphans
 
      cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
         | row score ge 300 | sorttbl chr start \
         | headchg -del > bacEndPairsBadNoOrphans.bed
      # use extractPslLoad later to get all_bacends.psl for database
 
     # There are rows where the alignments were the same but the lfNames are 
      # different. This is due to the presence of multiple reads for the 
      # same BAC end sequence. Sometimes they are slightly different lengths 
      # so the alignments are a little different. It would be good to 
      # consolidate all of these. Firstly, the identical rows were merged into 
      # one with a list of all the lfNames corresponding to that alignment.
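     # the idea, as an awk sketch (the real merging is done with lfsOverlap
     # below): key on the alignment columns and concatenate the lfNames of
     # identical rows ("in.bed" is just a placeholder name):
     awk 'BEGIN {FS=OFS="\t"} \
          { key = $1 FS $2 FS $3 FS $4 FS $5; \
            if (key in names) names[key] = names[key] "," $11; \
            else { order[++n] = key; row[key] = $0; names[key] = $11 } } \
          END { for (i = 1; i <= n; i++) { k = order[i]; \
                $0 = row[k]; $11 = names[k]; print } }' in.bed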
      
      ssh kkstore02
      #echo "create database bacsDr3_rah;" | hgsql danRer3
      cd /cluster/data/danRer3/bed/bacends/pairs
      #hgLoadBed bacsDr3_rah bacEndPairs bacEndPairs.bed \
     #       -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
      # Loaded 163174 elements of size 11
      # create a bacEndSingles table like bacEndPairs if not created already
      # hgLoadBed bacsDr3_rah bacEndSingles bacEndSingles.bed \
        #          -sqlTable=../singles/bacEndSingles.sql -notItemRgb
      # Loaded 212775 elements of size 11
      # NOTE - this track isn't pushed to RR, just used for assembly QA
      # Use bacEndPairsBadNoOrphans.bed as orphans are in the singles bed file
     # hgLoadBed bacsDr3_rah bacEndPairsBad bacEndPairsBadNoOrphans.bed \
      #      -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
      # Loaded 15169 elements of size 11
      # Need to consolidate similar rows for bacEndPairs and bacEndSingles - same
      # name, different lfNames and same alignments.
      mkdir -p /cluster/data/danRer3/bed/bacends/duplicates
      cd /cluster/data/danRer3/bed/bacends/duplicates
      mkdir -p /cluster/bluearc/danRer3/bacends/duplicates/overlapRun
      cd /cluster/data/danRer3/bed/bacends/duplicates
      ln -s /cluster/bluearc/danRer3/bacends/duplicates/overlapRun
      # write program to do this for linked feature series (lfs) which
      # is the type of data structure used for BAC ends.
      # Need a bed file sorted by chrom and chromStart 
      cd overlapRun
      foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
         sort -k1,2 /cluster/data/danRer3/bed/bacends/pairs/${f}.bed > ${f}.lfs
      end
      wc -l *.lfs
      # 15169 bacEndPairsBadNoOrphans.lfs
      # 163174 bacEndPairs.lfs
      # 212775 bacEndSingles.lfs
  
     # remove replicate rows where the names match and the fraction of the
     # region (chromEnd - chromStart) that overlaps is at least 0.999.
      ssh kolossus
      cd /cluster/data/danRer3/bed/bacends/duplicates/overlapRun
      foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
          echo "Processing $f"
          nohup nice /cluster/home/hartera/bin/i386/lfsOverlap ${f}.lfs \
                ${f}.bed -name -minOverlap=1.0 -notBlocks
      end
      # Started: Tue Sep 27 21:51 Finished: Sep 28 06:29 
      ssh kkstore02
      cd /cluster/data/danRer3/bed/bacends/duplicates/overlapRun
      # check the numbers of lines are correct
     
      foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
          awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5}' ${f}.lfs \
              | sort | uniq -c | sort -nr > ${f}.uniqCount
      end
      wc -l *
      # 163116 bacEndPairs.bed
      # 163174 bacEndPairs.lfs
      # 163116 bacEndPairs.uniqCount
      # 15163 bacEndPairsBad.bed
      # 15169 bacEndPairsBad.lfs
      # 15163 bacEndPairsBad.uniqCount
      # 212754 bacEndSingles.bed
      # 212775 bacEndSingles.lfs
      # 212754 bacEndSingles.uniqCount
     # the number of lines after uniquing by coords, name and score is the
     # same as after using lfsOverlap to remove those lines, so this is correct.
      cd /cluster/data/danRer3/bed/bacends/duplicates
      mv ./overlapRun/* .
      rm -r overlapRun /cluster/bluearc/danRer3/bacends/duplicates/overlapRun
     # Use a perl script to choose 2 BAC ends to represent each BAC clone.
     # Since there is often more than one read for each BAC end in this set,
     # 2 reads were chosen for each BAC pair, or 1 for the singles, based on
     # which ones had the largest region aligned (using lfSizes) - see the
     # sketch below.
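     # a sketch of that criterion: sum the comma-separated lfSizes (column
     # 10 of these lfs bed files) so rows for the same clone can be ranked
     # by total aligned bases:
     awk 'BEGIN {OFS="\t"} \
          { n = split($10, sz, ","); tot = 0; \
            for (i = 1; i <= n; i++) tot += sz[i]; \
            print $4, tot, $11; }' bacEndPairs.bed | sort -k1,1 -k2,2nr | head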
      # copy perl script over that was used for danRer2
      cp /cluster/data/danRer2/bed/ZonLab/bacends/duplicates/pickLfNames.pl \
         pickLfNamesv2.pl 
      # edit so that regular expression for matching BAC end names is the 
      # same as that used in ../bacends.1/getBacEndInfov2.pl
      # need to sort by chrom, chromStart
 
      foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
          sort -k1 -k2 -k3 ${f}.bed > ${f}Sort.bed
      end
      # run perl script: input bed file, pairs or singles, name of output file
      perl pickLfNamesv2.pl bacEndPairsSort.bed pairs pairs2lfNames.bed
      mv error.log log.pairs
      # log.pairs lists the 18 cases where alignments for a BAC clone use
      # a different pair of sequence reads for the ends than the previous
      # alignment for ends for that BAC clone. These were all checked and in
      # each case, the extra alignments are almost identical or overlap for
      # the most part so it does not matter if the extra alignments are 
      # removed.
      # run script for singles:
      perl pickLfNamesv2.pl bacEndSinglesSort.bed singles singles1lfName.bed
      mv error.log log.singles
      # log.singles has 34 cases where alignments for a BAC clone use 
      # different sequence reads for either the T7 or SP6 BAC end.
     # singles may include both BAC ends for a clone in the case
     # where they aligned to different chromosomes or a long way apart on 
     # the same chromosome (orphans). Mostly, those that have a different read
     # align to an almost identical or largely overlapping region.
      # some sequences appear to be different: CH211-98J20 - zC98J20.yb and
      # zC98J20.ya do not align to each other. DKEYP-107B4 - zKp107B4.ya looks
      # like it has low complexity sequence, this is discarded and zKp107B4.yb 
      # is kept. zKp107B4.za and zKp107B4.zb only align in the first ~ 59bp.
      # zKp107B4.zb is kept in this case. DKEYP-114B4 - zKp114B4.za: 15-61 bp 
      # on zKp114B4.za align to 11-58 bp on zKp114B4.zb. zKp114B4.za is kept.
      # In these cases, the 2 sequences align to different regions.
      perl pickLfNamesv2.pl bacEndPairsBadNoOrphansSort.bed pairs \
           badPairs2lfNames.bed
      mv error.log log.badPairs
      # only 3 alignments have a different pair of ends to other alignments
      # but alignment region is almost the same in each case.
     
     # for each of these new bed files, checks were made that there are
     # only 2 BAC ends per alignment for pairs and 1 for singles.
      # For each pair, there should only be 2 ends which can appear either
      # way round depending on the orientation and there should be 1 end for
      # the beginning (suffix T7, t7 or z) and one end for the end
      # (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g.
      # either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite
      # orientation. For singles, there should be a single BAC end for each
      # alignment and for each BAC clone, a sequence for either or both types
      # of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate
      # alignments.
      # Finally overlaps in BAC clone names were checked. All BAC clones
      # represented in each of the pairs, badPairs and singles bed files are
      # unique to that file. Between all three bed files, 300323 BAC clones
      # have alignments. 512886 clone ends are aligned in these three bed files. 
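     # e.g. a quick check of the pairs invariant (a sketch): every row of
     # the new pairs bed should have exactly 2 comma-separated lfNames
     awk '{ s = $11; sub(/,$/, "", s); \
            if (split(s, a, ",") != 2) print; }' pairs2lfNames.bed | wc -l
     # expect 0 (and, likewise, 1 lfName per row in singles1lfName.bed)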
      
      # NOTE: using sort and uniq on hgwdev produces tab delimited output
      # after merging rows with the same BAC name, the scoring is now
      # wrong in the bed files.
      # Scores should be 1000 if there is 1 row for that name, else
      # 1500/number of rows for that sequence name - calculated by pslPairs.
      # Correct the scores.
                                                                                 
      mkdir -p /cluster/data/danRer3/bed/bacends/scores
      cd /cluster/data/danRer3/bed/bacends/scores
     # copy over the correctScores2.pl and checkScores.pl scripts from danRer2
     # and edit both scripts so that the hits file is split on spaces, not tabs
      cp /cluster/data/danRer2/bed/ZonLab/bacends/scores/correctScores2.pl .
      cp /cluster/data/danRer2/bed/ZonLab/bacends/scores/checkScores.pl .
      awk '{print $4}' ../duplicates/pairs2lfNames.bed \
                  | sort | uniq -c > pairs.hits
      perl correctScores2.pl ../duplicates/pairs2lfNames.bed pairs.hits noBin \
                            > bacEndPairsGoodScores.bed
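     # the scoring rule itself, as an awk sketch (correctScores2.pl
     # implements it): with N rows for a name, score = 1000 if N == 1,
     # else int(1500/N), e.g. N=2 -> 750, N=7 -> 214
     awk -v OFS='\t' 'NR == FNR { hits[$2] = $1; next } \
          { n = hits[$4]; $5 = (n == 1) ? 1000 : int(1500/n); print; }' \
         pairs.hits ../duplicates/pairs2lfNames.bed | head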
      # same for singles
      awk '{print $4}' ../duplicates/singles1lfName.bed \
                  | sort | uniq -c > singles.hits
                                                                                 
      perl correctScores2.pl ../duplicates/singles1lfName.bed singles.hits \
                  noBin > bacEndSinglesGoodScores.bed
                                                                                 
      # and for badPairs
      awk '{print $4}' ../duplicates/badPairs2lfNames.bed \
                  | sort | uniq -c > badPairs.hits
      perl correctScores2.pl ../duplicates/badPairs2lfNames.bed badPairs.hits \
                  noBin > bacEndPairsBadGoodScores.bed
      # check that the scores are now correct  
      awk '{print $4, $5}' bacEndPairsGoodScores.bed \
          | sort | uniq -c > pairs.count
      perl checkScores.pl < pairs.count
      # all the BAC clones should be in good.txt and none in bad.txt
      # wc -l should give same number of lines in good.txt as in pairs.hits
      # repeat for other bed files
      awk '{print $4, $5}' bacEndPairsBadGoodScores.bed \
          | sort | uniq -c > badPairs.count
      perl checkScores.pl < badPairs.count
      awk '{print $4, $5}' bacEndSinglesGoodScores.bed \
          | sort | uniq -c > singles.count
      perl checkScores.pl < singles.count
     # for the singles, 6 ended up in bad.txt because their scores 
     # were 214.285714285714, which is correct for 7 alignments (1500/7);
     # rounding the score caused the discrepancy.
      ssh hgwdev
      cd /cluster/data/danRer3/bed/bacends/scores
      # copy over table definition from danRer2
      cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/bacEndSingles.sql \
         ../singles/
      # Now load database tables:
      hgLoadBed danRer3 bacEndPairs bacEndPairsGoodScores.bed \
                -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
      # Loaded 163098 elements of size 11
      hgLoadBed danRer3 bacEndSingles bacEndSinglesGoodScores.bed \
                -sqlTable=../singles/bacEndSingles.sql -notItemRgb
      # Loaded 212720 elements of size 11
     # 212720 record(s), 0 row(s) skipped, 50 warning(s) loading bed.tab
     # the cause of the warnings is unknown, but all of the bed file loaded
     # and the number of warnings is small, so ignore them
      hgLoadBed danRer3 bacEndPairsBad bacEndPairsBadGoodScores.bed \
                -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
      # Loaded 15160 elements of size 11
      # load BAC end sequences into seq table so alignments may be viewed
     # symlink to the Zv5 BAC end sequences for danRer3
      mkdir -p /gbdb/danRer3/bacends
      ln -s /cluster/data/danRer3/bed/bacends/bacSeqs/Zv5BACends.fa \
                                 /gbdb/danRer3/bacends/Zv5BACends.fa
      hgLoadSeq danRer3 /gbdb/danRer3/bacends/Zv5BACends.fa
 
      # create file for loading all_bacends table
      ssh kkstore02
      cd /cluster/data/danRer3/bed/bacends/scores
      # for all_bacends table, just load the alignments for those sequences
      # represented in the bacEndPairs, bacEndSingles and bacEndPairsBad tables
      # bacEnds.load.psl is the file of alignments
      # get all the names of sequences 
      foreach f (*GoodScores.bed)
        echo $f
        awk '{print $11;}' $f >> allBacEnds.names
      end
      wc -l allBacEnds.names
      # 390978 allBacEnds.names
      # this is the total number of lines in the *GoodScores.bed files
      perl -pi.bak -e 's/,/\n/g' allBacEnds.names
      sort allBacEnds.names | uniq > allBacEnds.names.uniq
      wc -l allBacEnds.names.uniq
      # 512886 allBacEnds.names.uniq
      # get alignments for just the BAC ends that are in the database tables
      # make bacEnds.load.psl
      cd /cluster/data/danRer3/bed/bacends/scores 
      extractPslLoad -noBin ../bacEnds.psl bacEndPairsGoodScores.bed \
             bacEndPairsBadGoodScores.bed bacEndSinglesGoodScores.bed | \
             sorttbl tname tstart | headchg -del > bacEnds.load.psl
     # check that alignments are present for all BAC ends in 
     # allBacEnds.names.uniq
     awk '{print $10}' bacEnds.load.psl | sort | uniq > bacEnds.names
     comm -12 bacEnds.names allBacEnds.names.uniq | wc -l
     # 512886
     ssh hgwdev
     cd /cluster/data/danRer3/bed/bacends/scores
     # load all_bacends table
     hgLoadPsl danRer3 -table=all_bacends bacEnds.load.psl
     # load of all_bacends did not go as planned: 7584708 record(s), 
     # 0 row(s) skipped, 526 warning(s) loading psl.tab
     
     # (hartera, 2006-04-19)
     # Display is very slow for BAC ends on large regions. Try splitting
     # all_bacends by chromosome.
     ssh hgwdev
     mkdir /cluster/data/danRer3/bed/bacends/all_bacends
     cd /cluster/data/danRer3/bed/bacends/all_bacends
     foreach c (`cat /cluster/data/danRer3/chrom.lst`)
         echo "Processing $c ..."
         awk '{if ($14 == "'chr${c}'") print;}' \
             /cluster/data/danRer3/bed/bacends/scores/bacEnds.load.psl \
             > chr${c}.bacEnds.load.psl
     end
     # rename old table
     hgsql -e 'alter table all_bacends rename allBacendsOld;' danRer3
     # load new tables
     foreach c (`cat /cluster/data/danRer3/chrom.lst`)
        hgLoadPsl danRer3 -table=chr${c}_all_bacends chr${c}.bacEnds.load.psl
     end
     # There are still warnings on loading, most (510) are for chrUn.
     # This improves the performance a lot.
     # The chrom-parsing code is confused by the double underscores in the
     # chrN_all_bacends tables so change the names to chrN_allBacends
     foreach c (`cat /cluster/data/danRer3/chrom.lst`)
        hgsql -e "alter table chr${c}_all_bacends rename chr${c}_allBacends;" \
              danRer3
     end
     # Then add correct table name to each of the bacEnd* tables
     foreach t (bacEndPairs bacEndPairsBad bacEndSingles)
        hgsql -e "update $t set pslTable = 'allBacends';" danRer3
     end
     # corrected termRegex for some bacCloneXRef searches in trackDb.ra so 
     # that they work correctly (bacPairsIntName, bacSinglesIntName, 
     # bacPairsSangerSts and bacSinglesSangerSts). (2006-04-19, hartera)
 
 # CREATE BAC CLONES ALIAS AND CROSS-REFERENCE TABLES 
 # (bacEndAlias, bacCloneAlias and bacCloneXRef) (DONE, 2005-10-06, hartera)
 # RECREATE TABLES AFTER REMAKING THE SINGLES AND PAIRS TABLES 
 # (see REDO BACENDS SECTION) (DONE, 2006-06-08, hartera)
 # REPLICATE ROWS IN TABLES SO REMOVE AND RELOAD (DONE, 2006-08-04, hartera)
     # Process data and create bacEndAlias table 
     ssh kkstore02
     cd /cluster/data/danRer3/bed/bacends/bacends.1
     #  make bacEndAlias table with Genbank accessions for ends
     # need to run getBacEndInfo.pl for the BAC end names in the 
     # BAC tables.
     # in the pairs directory, there is the allBacEnds.names.uniq file
     # so use this.
     # Already made the bacEndAccs.aliases file with getBacEndInfov2.pl 
     # This has none of the BAC ends whose names end in ASP6 or AT7 as 
     # these are all from the CHORI73 library and they do not have BAC end
     # accessions in Genbank at the moment. This contains accessions for 
     # all BAC ends even those without alignments.
     hgsql danRer3 < $HOME/kent/src/hg/lib/bacEndAlias.sql
     echo "load data local infile 'bacEndAccs.aliases' into table \
          bacEndAlias" | hgsql danRer3
     ssh kkstore02
     # get the latest versions of the clonemarkers, contig names and markers
     # files from Sanger
     mkdir -p /cluster/data/danRer3/bed/bacends/cloneandStsAliases
     cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
     wget --timestamp \
       ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/README
     wget --timestamp \
       ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/clonemarkers.27.07.05.txt
     wget --timestamp \
       ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/ctgnames.27.07.05.txt
     wget --timestamp \
          ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/markers.27.07.05.txt
     wc -l *27.07.05.txt
     # 29885 clonemarkers.27.07.05.txt
     # 167858 ctgnames.27.07.05.txt
     # 12250 markers.27.07.05.txt
     # Recreate tables as bacEndPairs, bacEndSingles, bacEndPairsBad and
     # chrN_allBacends tables have changed (2006-06-08, hartera)
     # get list of BAC end names, lfNames
     cp /cluster/data/danRer3/bed/bacends/scoresAndCoords/allBacEnds.names.uniq .
     # get list of BAC clone names 
     foreach f (bacEndPairs bacEndPairsBad bacEndSingles)
       awk '{print $4}' \
       /cluster/data/danRer3/bed/bacends/scoresAndCoords/${f}GoodScores.bed >> bacs.names
     end
     sort -u bacs.names > bacs.names.uniq
     wc -l *.uniq
     # 512321 allBacEnds.names.uniq
     # 300290 bacs.names.uniq
 
     # from psl file
     awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names
     # edit to remove first few lines with no names
     sort bacEndsPsl.names | uniq > bacEndsPsl.names.uniq
     wc -l bacEndsPsl.names.uniq
     # 545920 bacEndsPsl.names.uniq
     # this is all the BAC ends that originally had alignments
     # Add an alias table for BAC clones
     # bacCloneAlias.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
     # Add a xref table to give external clone registry names, internal names
     # sanger name, relationship between STS and BAC clone (method of finding
     # STS), UniSTS ID, chromosomes(s) to which BAC clone is mapped by BLAT,
     # Genbank accession and STS primer sequences
     # bacCloneXRef.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
     set dir=/cluster/data/danRer3/bed/bacends/
     awk 'BEGIN {OFS="\t"}{print $4, $1}' \
      $dir/scoresAndCoords/bacEndPairsGoodScores.bed > bacClones.namesandchrom
     awk 'BEGIN {OFS="\t"}{print $4, $1}' \
     $dir/scoresAndCoords/bacEndSinglesGoodScores.bed >> bacClones.namesandchrom
     sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq
     # use a list of internal names,Genbank accessions, and BAC clone names
     # use BACClonesIdsandAccs.txt.
     # get list of UniSTS IDs using aliases to search alias file
     # print Sanger name, alias and UniSTS ID, use find_markers3.pl
 cat << '_EOF_' > find_markers3.pl
#!/usr/bin/perl -w
# example:
# perl find_markers3.pl UniSTS.aliases markers.02.12.04.txt
 use strict;
 my $verbose = 0;
 my ($a, $b, $f, $m, $s, $t, $aliases, @alias, @rest);
 my $aliasFile = $ARGV[0];
 my $markersFile = $ARGV[1];
 open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n";
 open(MARKERS, $markersFile) || die "Can not open $markersFile\n";
 # store aliases from aliasFile
 my ($id, $al, @alsArray, %aliasHash);
 while (<ALIAS>)
 {
    chomp;
    ($id, $al) = split /\t/;
    @alsArray = split(/;/, $al);
    foreach my $as (@alsArray)
       {
       push (@{$aliasHash{$as} }, $id);
       }
 }
 close ALIAS;
                                                                                 
 while (<MARKERS>) {
     my @idArray;
    ($f, $t, $m, $idArray[0]) = 0;  # reset found flag ($f = 0; the rest become undef)
     my @ids;
     chomp; ($a, $b, $aliases, @rest) = split /\|/;
     if ($verbose > 3) { printf "aliases $aliases \n"; }
     @alias = split /;/, $aliases;
     ALIAS: foreach $s (@alias) {
         if ($s =~ /[\D]+/) {
             if ($verbose > 5) { printf "this $s \n"; }
             if (exists($aliasHash{$s}))
                {
                @idArray = @{$aliasHash{$s}};
                }
             if ($idArray[0]) {
                 $f = 1; $t = $s; @ids = @idArray;
                 if ($verbose) { printf "this $s found $m \n"; }
                 last ALIAS;
             }
         }
     }
     if ($f)
      {
      my @sNames = split(/;/, $b);
      foreach my $sn (@sNames)
         {
         foreach my $i (@ids)
            {
            printf "$sn\t$i\n";
            }
         }
     }
 }
 close MARKERS;
 '_EOF_'
     chmod +x find_markers3.pl
     perl find_markers3.pl /cluster/data/ncbi/UniSTS.2005-09-29/UniSTS.aliases \
          markers.27.07.05.txt > sangerandUniSTSId.txt
     # No need to reformat this for zfishBacClonesandSts
     # FPC contig information (i.e. FPC contig number) from ctgnames file is
     # not included in the tables as these are dynamic and constantly
     # changing with the assembly.
     # FILE OF BAC CLONE ACCESSIONS
     # http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
 
     # copy over file of BAC internal names, accessions and external names 
     cp /cluster/data/danRer3/bed/bacends/bacends.1/BACClonesIdsandAccs.txt .
     # use zfishBacClonesandSts to create tab files for loading into
     # bacCloneAlias and bacCloneXRef tables
     # make output directory
     rm -r /cluster/bluearc/danRer3/bacEnds/out
     mkdir -p /cluster/bluearc/danRer3/bacEnds/out
     # edit zfishBacClonesandSts.c to add prefixes for CHORI73 library:
     # CHORI73_ for internal name, CH73- for external name
     # in ctgnames.27.07.05.txt and clonemarkers.27.07.05.txt
     perl -pi.bak -e 's/zH([0-9]+)/CHORI73_$1/' *.27.07.05.txt 
     mv ctgnames.27.07.05.txt.bak ctgnames.27.07.05.orig
     mv clonemarkers.27.07.05.txt.bak clonemarkers.27.07.05.txt.orig
     # no change to markers file so remove .bak file
     rm markers.27.07.05.txt.bak 
     nice $HOME/bin/x86_64/zfishBacClonesandSts ctgnames.27.07.05.txt \
       clonemarkers.27.07.05.txt markers.27.07.05.txt \
       bacClones.namesandchrom.uniq BACClonesIdsandAccs.txt \
       sangerandUniSTSId.txt ./out > ./out/zfishBacs.out &
     # output is in /cluster/bluearc/danRer3/bacends/out so copy over
     # sort alias tab file by sangerName
     sort -k2 ./out/bacAlias.tab > bacAlias.sort.tab
     cp ./out/bacXRef.tab .
     wc -l *.tab
     # 110961 bacAlias.sort.tab
     # 540800 bacXRef.tab
  
     ssh hgwdev 
     cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
     hgsql -e 'drop table bacCloneAlias;' danRer3
     hgsql -e 'drop table bacCloneXRef;' danRer3
      
     hgLoadSqlTab danRer3 bacCloneAlias \
           $HOME/kent/src/hg/lib/bacCloneAlias.sql bacAlias.sort.tab
     hgLoadSqlTab danRer3 bacCloneXRef \
           $HOME/kent/src/hg/lib/bacCloneXRef.sql bacXRef.tab
 # edit trackDb.ra to add bacEnds tracks and searches for the bacEndPairs
 # and bacEndSingles tracks as for danRer1. copy over html from danRer2
 # for bacEndPairs and bacEndSingles tracks.
     # Replicate rows in table so reload after removing these
     # (hartera, 2006-08-04)
     ssh hgwdev 
     cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
 
     sort bacAlias.sort.tab | uniq | sort -k2 > bacAlias.sort.tab.uniq
     sort bacXRef.tab | uniq > bacXRef.tab.uniq
     wc -l *.tab.uniq
     # 57656 bacAlias.sort.tab.uniq
     # 356453 bacXRef.tab.uniq
  
     # Drop old tables and reload:
     hgsql -e 'drop table bacCloneAlias;' danRer3
     hgsql -e 'drop table bacCloneXRef;' danRer3
     
     hgLoadSqlTab danRer3 bacCloneAlias \
           $HOME/kent/src/hg/lib/bacCloneAlias.sql bacAlias.sort.tab.uniq
     hgLoadSqlTab danRer3 bacCloneXRef \
           $HOME/kent/src/hg/lib/bacCloneXRef.sql bacXRef.tab.uniq
 
 # BACENDS: TESTING OF bacCloneAlias AND bacCloneXRef TABLES
 # (DONE, 2005-10-06, hartera)
 # REDONE AFTER REMAKING bacCloneAlias AND bacCloneXRef TABLES - both ok.
 # (DONE, 2006-06-12, hartera)
 # REDONE AFTER REMAKING bacCloneAlias AND bacCloneXRef TABLES
 # (DONE, 2006-08-04, hartera)
     # The following tests were carried out to check that all the data
     # in the bacCloneAlias and bacCloneXRef tables is correct.
     ssh hgwdev
     cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
     cp ./testTablesNew/*.pl .
     rm -r testTablesNew
     mkdir -p testTablesNew
     cd testTablesNew
                                                                                 
 # Check that the correct aliases are associated with their Sanger STS names
     awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \
         ../markers.27.07.05.txt > sNameandaliases
     # write script to get one Sanger name and one alias on each line
     cp ../*.pl .
     perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format
     sort sNameandaliases.format | uniq > sNameandaliases.sort
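      # the expansion could be done with awk along these lines (a sketch;
      # the real getSangerAndAlias.pl may differ - both columns of
      # sNameandaliases can hold ";"-separated lists):
      awk 'BEGIN {FS=OFS="\t"} \
           { nn = split($1, names, ";"); na = split($2, als, ";"); \
             for (i = 1; i <= nn; i++) \
                for (j = 1; j <= na; j++) print names[i], als[j]; }' \
          sNameandaliases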
     # get Sanger names and aliases from database
     hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer3 \
           | sort | uniq > alias.db.sort
     wc -l alias.db.sort
     # 57656 alias.db.sort
     diff sNameandaliases.sort alias.db.sort
     # No difference between data file and data from database so ok
     # Check Sanger STS names correspond in bacAlias and bacCloneXRef tables
     # get Sanger names from alias table
     hgsql -N -e 'select sangerName from bacCloneAlias;' danRer3 \
              | sort | uniq > sName.alias.sort
     wc -l sName.alias.sort
     # 15309 sName.alias.sort
     # get Sanger names from xRef table
     hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \
           is not null;' danRer3 | sort | uniq > sName.xRef.sort
     wc -l sName.xRef.sort
     # 15522 sName.xRef.sort
     comm -23 sName.alias.sort sName.xRef.sort
     # nothing unique to alias file so all Sanger names in the alias table are
     # also in the xRef table
     comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias
     wc -l sNamexRefNotAlias
     # 213 sNamexRefNotAlias
     awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.27.07.05.txt | sort | uniq \
         > clonemarkers.sNames.sort
     # get Sanger names from markers file
     awk 'BEGIN {FS="|"}{print $2}' ../markers.27.07.05.txt > markers.sNames
     # remove semi-colons and sort
     sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort
     # sanger names unique to markers file
     comm -13 clonemarkers.sNames.sort markers.sNames.sort
     # there are none
     comm -23 clonemarkers.sNames.sort markers.sNames.sort \
          > sNames.clonemarkersOnly
     wc -l sNames.clonemarkersOnly
     # 213 sNames.clonemarkersOnly
     diff sNames.clonemarkersOnly sNamexRefNotAlias
     # No difference, so all the extra Sanger names in the xRef 
     # table are from the clonemarkers file; these have no aliases in 
     # the markers file, so they are not in the alias table. This is all ok.
   
 # Check that Sanger STS names and primers are associated correctly
     cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases/testTablesNew
     # get sanger names and primers from markers file
     awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \
         ../markers.27.07.05.txt > sNameandPrimers
     # use script to reformat and write with one Sanger name per line
     chmod +x getSangerandPrimers.pl
     perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format
     sort sNameandPrimers.format > sNameandPrimers.format.sort
     wc -l sNameandPrim*
     # 12250 sNameandPrimers
     # 15309 sNameandPrimers.format
     # 15309 sNameandPrimers.format.sort
     # get Sanger names and primers from database
     hgsql -N -e \
       'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \
       where sangerName is not null and leftPrimer is not null and \
       rightPrimer is not null;' danRer3 | sort | uniq \
       > sNamesandprimers.fromdb.sort
     wc -l sNamesandprimers.fromdb.sort
     # 15309 sNamesandprimers.fromdb.sort
     diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort
     # No difference so ok.
 
 # Check that UniSTS IDs and Sanger STS names are associated correctly
    # get Sanger names and UniSTS IDs from the database
    hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \
        uniStsId is not null;' danRer3 | sort | uniq > sNameUniSTS.fromdb.sort
    wc -l sNameUniSTS.fromdb.sort
    #  5634 sNameUniSTS.fromdb.sort
    # Need to reformat the sNameUniSTS.fromdb.sort
    chmod +x formatUniSts.pl
    perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \
         > sNameUniSTS.fromdb.format.sort
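    # formatUniSts.pl presumably expands the comma-separated uniStsId lists
    # into one "sangerName<tab>id" line each; an awk sketch of the same idea:
    awk 'BEGIN {FS=OFS="\t"} \
         { n = split($2, ids, ","); \
           for (i = 1; i <= n; i++) print $1, ids[i]; }' \
        sNameUniSTS.fromdb.sort | sort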
    # get Sanger names from data file and see how many UniSTS IDs there are
    # for each name
    awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \
        > sangerandUniSTSId.count
    # the most is 3 
    # 3 etID9786.21
    # 3 etID9056.23
    # 3 etID9042.2
    # 3 etID8627.2
    # 3 etID8281.9
    # 3 etID11096.5
    sort ../sangerandUniSTSId.txt > sangerandUniSTSId.txt.sort
    diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort \
        > sangerandUniSTSIdvsdb
    # No difference between data from original file and that in database so ok
 
 # Check that chrom mappings and external BAC clone names are correct
    # get extNames and chroms they map to from the database
    hgsql -N -e 'select name, chroms from bacCloneXRef where \
          chroms is not null;' danRer3 | sort | uniq \
          > nameandchromsfromdb.sort
    # reformat nameandchromsfromdb.sort
    perl formatUniSts.pl < nameandchromsfromdb.sort | sort \
         > nameandchromsfromdb.format.sort
    # compare extNames and chroms from db to those in data file
    cp ../bacClones.namesandchrom .
    sort -u bacClones.namesandchrom > bacClones.namesandchrom.uniq
    diff bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort
    # no difference - all ok
 
 # Check Genbank accessions and internal BAC clone names
    hgsql -N -e 'select intName,genbank from bacCloneXRef where \
          genbank is not null;' danRer3 | sort | uniq \
          > intNamesandAccs.fromdb.sort
    # this should be a subset of zfish_accsMerged.txt - not all BAC clones
    # listed here appear in either our BAC ends tracks or the markers files.
    awk 'BEGIN {OFS="\t"} {print $1,$2}' ../BACClonesIdsandAccs.txt \
        | sort -u > BACClonesIntandAccs.sort
    comm -23 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort
    # there is nothing in the database that is not in BACClonesIntandAccs.sort
    comm -13 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort \
             > onlyinzfishAccs
    wc -l onlyinzfishAccs
    # 86 onlyinzfishAccs
    hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' \
          danRer3 | sort | uniq > intNamesNoAcc.fromdb.sort
    awk '{print $1;}' BACClonesIntandAccs.sort > intNames.withAccs.sort
    comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \
         > indbNoAccsandAccs.out
    # none of these names are common to both so all accessions from
    # BACClonesIdsandAccs.txt are in the database for the internal names stored
    # where there are accessions available.
 
 # Test Sanger STS names, internal names and external names are all correct
 # Test Sanger STS name and internal BAC clone names are associated correctly
    # get internal names and Sanger names from data file
    awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.27.07.05.txt \
        | sort | uniq > intNameandSanger.sort
    hgsql -N -e 'select intName, sangerName from bacCloneXRef \
        where sangerName is not null;' danRer3 \
        | sort | uniq > intNameandSanger.fromdb.sort
    diff intNameandSanger.sort intNameandSanger.fromdb.sort
    # No difference between data from file and that from database so ok
 
 # Check BAC clone internal name and relationship fields
    # get internal names and relationships from data file
    awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.27.07.05.txt \
        | sort | uniq > intNameandRelation.sort
    # get internal names and relationships from database
    hgsql -N -e 'select intName, relationship from bacCloneXRef \
        where relationship != 0;' danRer3 \
        | sort | uniq > intNameandrelation.fromdb.sort
    # differences unique to database file
    comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \
        > intNameRelation.indbonly
    # differences unique to data file
    comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \
        > intNameRelation.incloneMarkersonly
    wc -l intNameRelation*
    # 4650 intNameRelation.incloneMarkersonly
    # 4650 intNameRelation.indbonly
   
    awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names
    awk '{print $1}' intNameRelation.incloneMarkersonly \
        > intNameRelation.incloneMarkersonly.names
    diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names
     # there is no difference in the internal names, and the only place these
     # should differ is that the second column should all be 3 in the data
     # from the database only. This is because all the relationship entries
     # that were blank in the clonemarkers file were changed to 3 when
     # entered into the database.
    awk '{print $2}' intNameRelation.indbonly | sort | uniq
    # 3 - correct so all ok
    # all the differences should be that those that are blank in clonemarkers
    # are 3 in the database.
     # check that those that have 0 in the database bacCloneXRef relationship
     # field are not in the list from cloneMarkers
    # select these internal names with 0 relationship from the database
    hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
          danRer3 | sort | uniq > intNameNoRelation.fromdb.sort
    # get all the internal names from the data file
    awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.07.05.txt \
        | sort | uniq > intNamefromCloneMarkers.sort
    comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
    # nothing in common between these two files as expected so there are
    # no internal names in the db with 0 in the relationship field that
    # appear in the clonemarkers file.
 
 # Check all BAC clone internal names and external names from the
 # ctgnames file are in the database
    # get intName and extName from ctgnames file
    awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.27.07.05.txt \
        | sort | uniq > intNameandextNamefromCtgNames.sort
    # get intName and extName from database
    hgsql -N -e 'select intName,name from bacCloneXRef;' danRer3 \
        | sort | uniq > intNameandextName.fromdb.sort
    wc -l intNameandextName*
    # 340039 intNameandextName.fromdb.sort
    # 167858 intNameandextNamefromCtgNames.sort
    comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
         > intandextindbAndCtgNames
    wc -l intandextindbAndCtgNames
    # 167858 intandextindbAndCtgNames
    # there are 167858 name pairs common between the file and the database
    # and this is the same number of name pairs as in the data file
    diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort
    # no difference between those name pairs from the data file and those that
    # are common between the data file and the database so all internal and
    # external names from ctgNames file are in the database
    # get the list of extra ones from db
    comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
         > intandextNamesindbNotinCtgNames
    wc -l intandextNamesindbNotinCtgNames
    # 172181 intandextNamesindbNotinCtgNames
    # get list of internal names from the clonemarkers file
    awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.07.05.txt | sort | uniq \
        > clonemarkers.intName.sort
    wc -l clonemarkers.intName.sort
    # 13471 clonemarkers.intName.sort
    # compare these intNames to those from the database not in the ctgnames file
    comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames
    # none of these clone markers internal names are in this list so they
    # must all be in the ctgnames file too. These extra internal names will be
    # translations of external names found in the list of mappings of BAC clones
    # to chroms.
 
 # Check that all the BAC clone external names from the list of chromosome
 # mappings and from the ctgnames file are in the database.
    # get all extNames from baclones.namesandchrom.uniq and from ctgnames
    awk '{print $1}' ../bacClones.namesandchrom.uniq > \
        extNames.ctgnamesandbacClones
    awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.27.07.05.txt \
        >> extNames.ctgnamesandbacClones
    wc -l extNames.ctgnamesandbacClones
    # 510169 extNames.ctgnamesandbacClones
    sort extNames.ctgnamesandbacClones | uniq \
         > extNames.ctgnamesandbacClones.sort
    wc -l extNames.ctgnamesandbacClones.sort
    # 340039 extNames.ctgnamesandbacClones.sort
    # get extNames from the database
    hgsql -N -e 'select name from bacCloneXRef;' danRer3 | sort | uniq \
          > extNames.fromdb.sort
    wc -l extNames.fromdb.sort
    # 340039 extNames.fromdb.sort
    comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \
          > extNames.fromdbandfiles
    wc -l extNames.fromdbandfiles
    # 340039 extNames.fromdbandfiles
    # find extNames in common from data files and database
    diff extNames.fromdb.sort extNames.fromdbandfiles
    # no difference, all extNames from files are in db
 
 # Check that all BAC clone internal names from the ctgnames and clonemarkers
 # files are in the database
    # get internal names from ctgnames and clonemarkers files
    awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.27.07.05.txt \
        > intNames.ctgnamesandclonemarkers
    awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.27.07.05.txt \
        >> intNames.ctgnamesandclonemarkers
    wc -l intNames.ctgnamesandclonemarkers
    # 197743 intNames.ctgnamesandclonemarkers
    sort intNames.ctgnamesandclonemarkers | uniq \
         > intNames.ctgnamesandclonemarkers.sort
    wc -l intNames.ctgnamesandclonemarkers.sort
    # 167858 intNames.ctgnamesandclonemarkers.sort
    # get internal names from database
    hgsql -N -e 'select intName from bacCloneXRef;' danRer3 | sort | uniq \
         > intNames.fromdb.sort
    wc -l intNames.fromdb.sort
    # 340039 intNames.fromdb.sort
    # some of these intNames are derived from the corresponding extNames
    # all of the intNames from the file should be in the db
    comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \
         > intNames.fromdbandfiles
    wc -l intNames.fromdbandfiles
    # 167858 intNames.fromdbandfiles
    diff intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
    # no difference, all intNames from files are in db
                                                                                 
 # Check that all translations are correct between BAC clone
 # external and internal names.
    # write script to get the prefixes from internal and external names
    chmod +x getNamePrefixes.pl
    hgsql -N -e 'select name, intName from bacCloneXRef;' danRer3 \
          | sort | uniq > extandintNames.fromdb.sort
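     # a sketch of what such a script does (the real getNamePrefixes.pl was
     # used; assumes external names look like PREFIX-rest and internal names
     # are a letter prefix followed by digits):
     awk 'BEGIN {FS=OFS="\t"} \
          { e = $1; sub(/-.*/, "", e); \
            p = $2; sub(/[0-9].*/, "", p); \
            print e, p; }' extandintNames.fromdb.sort | sort -u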
    perl getNamePrefixes.pl < extandintNames.fromdb.sort \
          > extandintNames.prefixes
    sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq
    # these all look good
    # BUSM1   dZ
    # CH211   zC
    # CH211   zc
    # CH73    CHORI
    # CT7     bP
    # DKEY    zK
    # DKEY    zk
    # DKEYP   zKp
    # RP71    bZ
    # XX      bY
     # zk is an internal name prefix for the external name prefix, DKEY-. There
     # is only one example where this is used (DKEY-81G7); it is in the
     # ctgnames file and in the bacCloneXRef table so that is ok.
    # All data looks good in these tables now.
 
 # BLASTZ TETRAODON (tetNig1) (DONE, 2005-10-20, hartera)
 # REMADE DOWNLOADS FOR net, all.chain AND over.chain AS THEY HAD BEEN DELETED.
 # MOVE ALL THE RUN FILES AND OUTPUT FROM THE SAN RUN DIRECTORY TO A DIRECTORY
 # ON /cluster/data AS THIS IS MORE PERMANENT. (DONE, 2005-11-17, hartera).
     # Tetraodon is quite distant from zebrafish, more distant than human/chicken
     # so use the HoxD55.q matrix for the Blastz alignments.
      # Blastz's repeat abridging requires lineage-specific repeats but there
      # are none available between these two fish species 
     
     ssh kkstore02
     mkdir -p /cluster/data/danRer3/bed/blastz.tetNig1.2005-10-11
     cd /cluster/data/danRer3/bed
     ln -s blastz.tetNig1.2005-10-11 blastz.tetNig1
     cd /cluster/data/danRer3/bed/blastz.tetNig1
     # create a 2bit file for danRer3 with all chroms (1-25 and M) and the
     # scaffolds for NA and Un if it does not exist already
     cd /cluster/data/danRer3
     faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \
                Un/scaffoldUn.fa NA/scaffoldNA.fa danRer3ChrUnNAScafs.2bit
     ssh hgwdev
     # move the 2 bit file for danRer3 to the san if not there already
     mkdir -p /san/sanvol1/scratch/danRer3/
     mv /cluster/data/danRer3/danRer3ChrUnNAScafs.2bit \
        /san/sanvol1/scratch/danRer3/
     # also copy over the danRer3 2 bit file for all chroms and the
     # lift file for NA and Un scaffolds to chrNA and chrUn.
      cp /cluster/data/danRer3/danRer3.2bit /san/sanvol1/scratch/danRer3/
     cp /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
        /san/sanvol1/scratch/danRer3/
 
      # also copy over tetraodon sequences to the san
      mkdir -p /san/sanvol1/scratch/tetNig1/contigs
      cp /cluster/bluearc/tetNig1/contigs/tetNig1ChrContigsRandomScafs.2bit \
         /san/sanvol1/scratch/tetNig1/contigs/
      # see makeTetNig1.doc for making tetNig1ChrContigsRandomScafs.2bit
     # make output and run directories
     mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
     mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut
     cd /cluster/data/danRer3/bed/blastz.tetNig1
     ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
     ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut
 # use tetraodon sequence in contigs for dynamic masking - see below
 # for dynamic masking: M=50. Each time a base is hit at least 50 times, it
 # is masked out.
 # Blastz danRer3 chroms and scaffolds vs tetNig1 ordered chrom contigs and 
 # scaffolds from random chromosomes. lift up the tetNig1 contigs to chrom 
 # level. Then make the chains and then liftUp all the scaffolds to chrom 
 # level before sorting and merging chains and then netting.
     # get all contigs from mapped ordered chroms and make 2bit file
     # see makeTetNig1.doc
 
     cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
 cat << '_EOF_' > DEF
 # zebrafish (danRer3) vs. tetraodon (tetNig1)
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
 
 ALIGN=blastz-run
 BLASTZ=blastz.v7.x86_64
 BLASTZ_M=50
 BLASTZ_H=2500
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 #BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET - zebrafish (danRer3) soft-masked chr1-25 and chrM and scaffolds
 SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
 SEQ1_CTGDIR=/san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit
 SEQ1_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
 SEQ1_RMSK=
 # lineage-specific repeats
 # we don't have that information for these species
 SEQ1_SMSK=
 SEQ1_FLAG=
 SEQ1_LIMIT=30
 SEQ1_IN_CONTIGS=0
 # 0.5 Mb chunk for target with 5 kb overlap
 SEQ1_CHUNK=500000
 SEQ1_LAP=5000
 
 # QUERY - Tetraodon (tetNig1)
 # soft-masked 500 kb contigs for chroms, scaffolds for randoms
 SEQ2_DIR=/san/sanvol1/scratch/tetNig1/contigs/tetNig1ChrContigsRandomScafs.2bit
 SEQ2_RMSK=
 SEQ2_SMSK=
 SEQ2_FLAG=
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=1000000000
 SEQ2_LAP=0
 
 BASE=/san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ1_CTGLEN=$BASE/chromsUnNAScafs.sizes
 SEQ2_LEN=$BASE/S2.len
 TMPDIR=/scratch/tmp
 
 #DEBUG=1
 '_EOF_'
     # << this line keeps emacs coloring happy
     chmod +x DEF
     cp /cluster/data/danRer3/chrom.sizes ./S1.len
     twoBitInfo /san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit \
                chromsUnNAScafs.sizes
     twoBitInfo \
 /san/sanvol1/scratch/tetNig1/contigs/tetNig1ChrContigsRandomScafs.2bit ./S2.len
     nice /cluster/bin/scripts/doBlastzChainNet.pl \
   -bigClusterHub=pk -smallClusterHub=pk -workhorse=pk -stop cat \
   -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut \
   `pwd`/DEF >& do.log &
    # PID 32339  Start: Tue Oct 11 14:55
    # use Hiram's script to kill 4 empty shell commands on Thurs Oct 13th
    # /cluster/bin/scripts/findEmpty.sh -r to find
    # /cluster/bin/scripts/findEmpty.sh -K to kill
 # Fri Oct 14 10:41
 # Checking finished jobs
 # crashed: 32
 # running: 20
 # ranOk: 3716
 # failed 4 times: 32
 # total jobs in batch: 3768
 # check problems:
 
# 141 jobs crashed on host kkr10u19.kilokluster.ucsc.edu.
# Removed this machine with "parasol remove machine" since over 9000 jobs
# from the opossum run had crashed on it.
# Ran again with para push -retries=20.
# By 16:00 on Fri Oct 14, all jobs had finished except 2 that failed 4 times,
# so repushed with para push -retries=20.
 # para time
 # Completed: 3768 of 3768 jobs
 # CPU time in finished jobs:   12465019s  207750.32m  3462.51h  144.27d  0.395 y
 # IO & Wait Time:                873594s   14559.90m   242.66h   10.11d  0.028 y
 # Average job time:                3540s      59.00m     0.98h    0.04d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:           19777s     329.62m     5.49h    0.23d
 # Submission to last job:        264857s    4414.28m    73.57h    3.07d
     ssh pk
     cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/run.blastz
     para time > run.time
     # run doBlastzChainNet.pl to continue with cat step since the script
     # crashed when some of the jobs failed 4 times.
     ssh hgwdev
     cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
     nice /cluster/bin/scripts/doBlastzChainNet.pl \
   -bigClusterHub=pk -smallClusterHub=pk -workhorse=pk -continue cat -stop cat \
   -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut \
   `pwd`/DEF >& doCat.log &
     # Took about 7 minutes.
     # Now need to liftUp the contigs for tetNig1 to chrom-level but
     # not the scaffolds. All the scaffolds will be lifted after the 
     # chaining step.
     ssh kolossus
     # liftUp contigs for tetraodon query: 
     cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
     mv pslParts pslPartsNotLifted
     mkdir /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun/liftedPsl
     set dir=/san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
    # use carry for "how" as this carries items not in the liftSpec to the
    # dest file without translation; the lift file covers only contigs, not
    # scaffolds. Use the nohead option, otherwise a psl header is added at
    # the top of each file; capture the blastz params header so it can be
    # added back:
     zcat ./pslPartsNotLifted/part958.lst.psl.gz | head -3 > header
 
     # first lift to pseudo-contig level and then to chroms
   foreach f (./pslPartsNotLifted/*.psl.gz) 
      set g=$f:r:t
      zcat $f | liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted.psl \
   /cluster/data/tetNig1/bed/blastzSelf/contigSeqs/500kbcontigs.lft carry stdin
      liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted2.psl \
   /cluster/data/tetNig1/jkStuff/liftAll.lft carry $dir/liftedPsl/${g}.lifted.psl
      cat header $dir/liftedPsl/${g}.lifted2.psl > $dir/liftedPsl/${g}
      rm $dir/liftedPsl/${g}.lifted*
   end
     # check a couple of files and see that they have the correct number of lines
     # then move the contents of this directory to pslParts
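    # e.g. a spot-check sketch (not part of the recorded run; the part number
    # is illustrative) - the alignment-line counts before and after lifting
    # should match:
    zcat pslPartsNotLifted/part001.lst.psl.gz | grep -c -v '^#'
    grep -c -v '^#' $dir/liftedPsl/part001.lst.psl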
     mkdir $dir/pslParts
     foreach f ($dir/liftedPsl/*.psl)
        gzip $f 
        mv ${f}.gz $dir/pslParts/
     end
     # carry on with doBlastzChainNet.pl from the chaining step
     ssh hgwdev
     cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
     cp DEF DEF.tetraContigs
     # edit DEF file so that tetNig1 now has a 2bit file of the chroms and 
     # scaffolds for randoms in the CTGDIR and also there is a lift file
     # for the scaffolds.
 cat << '_EOF_' > DEF
 # zebrafish (danRer3) vs. tetraodon (tetNig1)
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
                                                                                 
 ALIGN=blastz-run
 BLASTZ=blastz.v7.x86_64
 BLASTZ_M=50
 BLASTZ_H=2500
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 #BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
 BLASTZ_ABRIDGE_REPEATS=0
                                                                                 
 # TARGET - zebrafish (danRer3) soft-masked chr1-25 and chrM and scaffolds
 SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
 SEQ1_CTGDIR=/san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit
 SEQ1_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
 SEQ1_RMSK=
 # lineage-specific repeats
 # we don't have that information for these species
 SEQ1_SMSK=
 SEQ1_FLAG=
 SEQ1_LIMIT=30
 SEQ1_IN_CONTIGS=0
 # 0.5 Mb chunk for target with 5 kb overlap
 SEQ1_CHUNK=500000
 SEQ1_LAP=5000
                                                                                 
 # QUERY - Tetraodon (tetNig1)
 # soft-masked chroms, and scaffolds for randoms
 SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
 SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit
 SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft
 SEQ2_RMSK=
 SEQ2_SMSK=
 SEQ2_FLAG=
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=1000000000
 SEQ2_LAP=0
                                                                                 
 BASE=/san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
                                                                                 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ1_CTGLEN=$BASE/chromsUnNAScafs.sizes
 SEQ2_LEN=$BASE/S2.len
 SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes
 TMPDIR=/scratch/tmp
                                                                                 
 #DEBUG=1
 '_EOF_'
     # if it does not exist already, make the file of sizes for the tetNig1
     # chroms and scaffolds.
     twoBitInfo \
 /san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit \
     /san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes
     # Also, need to change the sequence sizes file for tetNig1 to the 
     # chrom sizes and not the scaffolds and contigs sizes.
     cp S2.len S2contigsAndScafs.len
     cp /cluster/data/tetNig1/chrom.sizes S2.len
     # then run doBlastzChainNet.pl script again
     nice /cluster/bin/scripts/doBlastzChainNet.pl \
          -bigClusterHub=pk \
          -smallClusterHub=pk \
          -workhorse=pk \
          -fileServer=kolossus \
          -continue chainRun \
          -chainMinScore=5000 \
          `pwd`/DEF >& doChains.log &
     # Start: Fri Oct 14 17:47 Finished: Oct 14 17:57
     # crashed as one job failed after 4 retries, problem is that 
     # part958.lst.psl.gz is not recognized as a psLayout file. It is empty
     # except for parameter comment lines so it can be ignored.
     ssh pk
     cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain/run/
     para time > run.time
     ssh hgwdev
     cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
    # Crashed while doing chainMerge, so add a flag to the DEF file to
    # indicate that the genome is in scaffolds and there is therefore a large
    # number of chain files. Changed doBlastzChainNet.pl so that if this flag
    # is seen, the chain files are concatenated, chainSort sorts the
    # resulting chain file by score, and chainMergeSort renumbers the chain
    # IDs so that they are unique (see the sketch below). chainMergeSort
    # expects chain files sorted by score as input.
     # add this line to the DEF file: GENOME_IN_SCAFFOLDS=1 
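    # For reference, the scaffold-safe merge the script now performs is
    # equivalent to this manual pipeline (a sketch, not run here; the
    # sortedByScore.chain file name is illustrative):
    find ./axtChain/run/chain -name '*.chain' | xargs cat \
        | chainSort stdin sortedByScore.chain
    chainMergeSort sortedByScore.chain | nice gzip -c \
        > danRer3.tetNig1.all.chain.gz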
     nice ./doBlastzChainNet.pl \
          -bigClusterHub=pk \
          -smallClusterHub=pk \
          -workhorse=pk \
          -fileServer=kolossus \
          -continue chainMerge \
          -chainMinScore=5000 \
          `pwd`/DEF >& doChainMergeNet.log &
     # Start: Wed Oct 19 12:52 Finish: Oct 19 13:13   
    # Add a trackDb.ra entry for chainTetNig1 and netTetNig1 and add html
    # pages. Modify the track descriptions to describe the use of scaffolds
    # for danRer3 chrNA and chrUn and the use of dynamic masking for the
    # Blastz alignments. Edit the README for the downloads to explain that
    # scaffolds were used for Blastz for danRer3 chrNA and chrUn and for the
    # tetNig1 random unordered chroms, that the tetNig1 genome was aligned
    # as a file of contigs for the chroms and scaffolds for the randoms, and
    # that each danRer3 chunk was aligned against the whole tetraodon
    # genome to take advantage of dynamic masking (M=50).
    # Finally, run a doBlastzChainNet.pl swap to create danRer3
    # chain and net tracks on tetNig1 - see makeTetNig1.doc.
  
 # featureBits -chrom=chr2 danRer3 refGene:cds chainTetNig1Link -enrichment
 # refGene:cds 0.746%, chainTetNig1Link 7.167%, both 0.672%, cover 90.17%, 
 # enrich 12.58x
 # featureBits -chrom=chr2 danRer2 refGene:cds chainTetNig1Link -enrichment
 # refGene:cds 0.750%, chainTetNig1Link 4.463%, both 0.621%, cover 82.84%, 
 # enrich 18.56x
 # so better coverage for danRer3 but less enrichment than for danRer2.
 
 # Make the download files for all.chain, over.chain and net again as these
 # files have been removed. Put the files on /cluster/data rather than the 
 # san so that they are not moved again. (hartera, 2005-11-17)
     ssh kolossus
     cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain
     chainMergeSort ./run/chain/*.chain | nice gzip -c \
                    > danRer3.tetNig1.all.chain.gz
     # copy over.chain file from bedOver directory to axtChain directory
     cp /cluster/data/danRer3/bed/bedOver/danRer3.tetNig1.over.chain.gz \
        /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain/
     # recreate net file
     # make noClass.net
     #Make nets ("noClass", i.e. without rmsk/class stats which are added later) 
     chainPreNet danRer3.tetNig1.all.chain.gz ../S1.len ../S2.len \
          stdout | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout \
          /dev/null | netSyntenic stdin noClass.net 
     # memory usage 251383808, utime 562 s/100, stime 41
     # create net file 
     ssh hgwdev
     cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain
     netClass -verbose=0 -noAr noClass.net danRer3 tetNig1 danRer3.tetNig1.net
     # compress net file
     gzip danRer3.tetNig1.net
 
     # Move these files to /cluster/data and remake download links as the 
     # san is not a permanent storage space.
     mv /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun \
        /cluster/data/danRer3/bed/blastz.tetNig1/
     # Then change the symlinks in the downloads directory to point to the files
     # on /cluster/data
     cd /usr/local/apache/htdocs/goldenPath/danRer3/vsTetNig1/axtNet
     set runDir=/cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
     rm *.gz
     foreach f ($runDir/axtNet/*.axt.gz)
       ln -s $f .
     end
     cd ..
     rm *.gz
     foreach f ($runDir/axtChain/*.gz)
       ln -s $f
     end
     # remake the md5sum file
     rm md5sum.txt
     md5sum *.gz */*.gz > md5sum.txt
     
     # Test Runs for chr2 and chrUn
     cd /cluster/data/danRer3/bed/blastz.tetNig1
     mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
     ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
     # create blastz output directory
     mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out
     ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out
     mkdir /san/sanvol1/scratch/danRer3/chrUnand2
     cd /san/sanvol1/scratch/danRer3/chrUnand2
     cp ../nib/chr2.nib ../nib/chrUn.nib .
     rsync -a --progress /cluster/bluearc/tetNig1/contigs/tetNig1Contigs.2bit \
        /san/sanvol1/scratch/tetNig1/contigs/
     cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run
 
 cat << '_EOF_' > DEF
 # zebrafish (danRer3) vs. tetraodon (tetNig1)
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
 
 ALIGN=blastz-run
 BLASTZ=blastz.v7.x86_64
 BLASTZ_M=50
 BLASTZ_H=2500
 BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
 #BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
 BLASTZ_ABRIDGE_REPEATS=0
 
 # TARGET - zebrafish (danRer3) soft-masked chr1-25 and chrM
 SEQ1_DIR=/san/sanvol1/scratch/danRer3/chrUnand2
 SEQ1_RMSK=
 # lineage-specific repeats
 # we don't have that information for these species
 SEQ1_SMSK=
 SEQ1_FLAG=
 SEQ1_IN_CONTIGS=0
 # 0.5 Mb chunk for target
 SEQ1_CHUNK=500000
 SEQ1_LAP=500
 
 # QUERY - Tetraodon (tetNig1)
 # soft-masked 500 kb contigs for chroms, scaffolds for randoms
 SEQ2_DIR=/san/sanvol1/scratch/tetNig1/contigs/tetNig1Contigs.2bit
 SEQ2_RMSK=
 SEQ2_SMSK=
 SEQ2_FLAG=
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=1000000000
 SEQ2_LAP=0
 
 BASE=/san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 
 #DEBUG=1
 '_EOF_'
     # << this line keeps emacs coloring happy
     chmod +x DEF
    
     cp /cluster/data/danRer3/chrom.sizes ./S1.len 
     twoBitInfo \
     /san/sanvol1/scratch/tetNig1/contigs/tetNig1Contigs.2bit ./S2.len
     nice /cluster/bin/scripts/doBlastzChainNet.pl \
       -bigClusterHub=pk \
       -smallClusterHub=pk \
       -workhorse=pk \
       -fileServer=kolossus \
       -stop cat \
       -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out \
       -chainMinScore=5000 \
       `pwd`/DEF >& do.log &
       # PID: 4890 Start: Thu Sep 29 14:50
       # ran quickly, 30 mins
       # crashed as some jobs crashed and failed after 4 retries so 
       # push them again. 
     nice /cluster/bin/scripts/doBlastzChainNet.pl \
       -bigClusterHub=pk \
       -smallClusterHub=pk \
       -workhorse=pk \
       -fileServer=kolossus \
       -continue cat \
       -stop cat \
       -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out \
       -chainMinScore=5000 \
       `pwd`/DEF >& doCat.log &
     # Took a couple of minutes
    # need to lift up the contigs to chrom level for tetNig1
    # liftUp contig files for tetraodon query:
    # if a file is empty, liftUp gets stuck reading commented lines,
    # so first rename pslParts, then make a list of the files which contain
    # alignment data and not just comment lines starting with # (the blastz
    # parameters):
    cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run
    mv pslParts pslPartsNotLifted
    foreach f (./pslPartsNotLifted/*.psl.gz)
        zcat $f | awk '{if ($1 !~ /#/) print "'$f'";}' >> pslParts.lst
    end
    sort pslParts.lst | uniq > pslPartsNotEmpty.lst
     mkdir /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run/liftedPsl
     set dir=/san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
    # the lift files cover only contigs, not scaffolds; "warn" is used for
    # "how" here since the query was aligned as contigs only.
    # use the nohead option, otherwise a psl header is added at the top of
    # each file; capture the blastz params header so it can be added back:
     zcat \
     ./pslPartsNotLifted/chrUn.nib:chrUn:99500000-100000500.psl.gz \
     | head -3 > header
 
     # first lift to pseudo-contig level and then to chroms
   foreach f (`cat pslPartsNotEmpty.lst`) 
      set g=$f:r:t
      zcat $f | liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted.psl \
   /cluster/data/tetNig1/bed/blastzSelf/contigSeqs/500kbcontigs.lft warn stdin
      liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted2.psl \
   /cluster/data/tetNig1/jkStuff/liftAll.lft warn $dir/liftedPsl/${g}.lifted.psl
      cat header $dir/liftedPsl/${g}.lifted2.psl > $dir/liftedPsl/${g}
      rm $dir/liftedPsl/${g}.lifted*
   end
     mv liftedPsl pslParts
     # need to gzip these again
     foreach f (./pslParts/*.psl)
        gzip $f
     end
     # then carry on with chaining for these danRer3 NA and Un scaffolds
     # tetNig1.2bit has full chroms for ordered chroms
     # and randoms as scaffolds
     cp DEF DEF.contigs
     # copy over 2bit file with chroms for tetNig1 if not
     # there already.
     mv S2.len S2.contigs
     twoBitInfo \
     /san/sanvol1/scratch/tetNig1/tetNig1.2bit ./S2.len
     nice /cluster/bin/scripts/doBlastzChainNet.pl \
       -bigClusterHub=pk \
       -smallClusterHub=pk \
       -workhorse=pk \
       -fileServer=kolossus \
       -continue chainRun \
       -stop net \
       -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out \
       -chainMinScore=5000 \
       `pwd`/DEF >& doNet.log &
     # PID 1117    Start: Thu Sep 29 16:20 Finished: 16:24
     # crashed: says it can't find [danRer3.tetNig1.]all.chain[.gz] but it 
     # is there.
     nice /cluster/bin/scripts/doBlastzChainNet.pl \
       -bigClusterHub=pk \
       -smallClusterHub=pk \
       -workhorse=pk \
       -fileServer=kolossus \
       -continue net \
       -stop net \
       -chainMinScore=5000 \
       `pwd`/DEF >& doNet2.log &
     # Took 1 minute
     # TO DO: load tables
     cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run/axtChain/chain
     foreach f (*.chain)
        set c=$f:r
        hgLoadChain danRer3 ${c}_chainTetNig1NoScafs $f
     end
     cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run/axtChain
     # add gap/repeat stats to net file using db tables
     netClass -verbose=0 -noAr noClass.net danRer3 tetNig1 danRer3.tetNig1.net
     # load nets
     netFilter -minGap=10 danRer3.tetNig1.net \
               | hgLoadNet -verbose=0 danRer3 netTetNig1NoScafs stdin
 
     # then need to load chains and net into browser with a different name
 # featureBits -chrom=chr2 danRer3 refGene:cds chainTetNig1Link -enrichment
 # refGene:cds 0.742%, chainTetNig1Link 7.166%, both 0.670%, cover 90.26%, 
 # enrich 12.60x
 # featureBits -chrom=chr2 danRer3 refGene:cds chainTetNig1NoScafsLink -enrichment
 # refGene:cds 0.742%, chainTetNig1NoScafsLink 7.171%, both 0.670%, cover 90.30%, enrich 12.59x
 # featureBits -chrom=chrUn danRer3 refGene:cds chainTetNig1Link -enrichment
 # refGene:cds 0.497%, chainTetNig1Link 6.175%, both 0.441%, cover 88.68%, enrich 14.36x
 # featureBits -chrom=chrUn danRer3 refGene:cds chainTetNig1NoScafsLink -enrichment
 # refGene:cds 0.497%, chainTetNig1NoScafsLink 6.179%, both 0.441%, cover 88.67%, enrich 14.35x
# Rows in chainTetNig1Link:
# 	tetNig1		tetNig1NoScafs
# chr2	308576		303236
# chrUn	1133922		1114061
 
 #nets:
 # featureBits -chrom=chr2 danRer3 refGene:cds netTetNig1 -enrichment
 # refGene:cds 0.742%, netTetNig1 62.053%, both 0.715%, cover 96.34%, enrich 1.55x
 # featureBits -chrom=chr2 danRer3 refGene:cds netTetNig1NoScafs -enrichment
 # refGene:cds 0.742%, netTetNig1NoScafs 63.095%, both 0.717%, cover 96.63%, enrich 1.53x
 # featureBits -chrom=chrUn danRer3 refGene:cds netTetNig1 -enrichment
 # refGene:cds 0.497%, netTetNig1 48.803%, both 0.477%, cover 95.87%, enrich 1.96x
 # featureBits -chrom=chrUn danRer3 refGene:cds netTetNig1NoScafs -enrichment
 # refGene:cds 0.497%, netTetNig1NoScafs 49.207%, both 0.478%, cover 96.01%, enrich 1.95x
#  Rows in netTetNig1:
# 	tetNig1		tetNig1NoScafs
# chr2	17370		17415
# chrUn	56259		56360
 
 # featureBits -chrom=chr2 danRer2 refGene:cds chainTetNig1Link -enrichment
 # refGene:cds 0.739%, chainTetNig1Link 4.463%, both 0.617%, cover 83.44%, 
 # enrich 18.69x
 # featureBits -chrom=chr2 danRer3 refGene:cds chainNoHoxD55TetNig1Link -enrichment
 # refGene:cds 0.668%, chainNoHoxD55TetNig1Link 4.815%, both 0.587%, 
 # cover 87.95%,enrich 18.27x
 
 # featureBits -chrom=chr2 danRer3 refGene:cds chainHoxD55TetNig1Link -enrichment
 # refGene:cds 0.668%, chainHoxD55TetNig1Link 7.846%, both 0.612%, cover 91.71%, enrich 11.69x
 # HoxD55.q with mm6 parameters but H=2500:
 # featureBits -chrom=chr2 danRer3 refGene:cds chainHoxD55v2TetNig1Link -enrichment
 # refGene:cds 0.668%, chainHoxD55v2TetNig1Link 7.400%, both 0.601%, 
 # cover 90.10%,enrich 12.18x
 
# If H=2000 is used, one blastz job does not finish even after a day.
# Using the mm6 parameters makes little difference.
#  Database   	Table			Number of chains
#  danRer2	chr2_chainTetNig1		21176
#  danRer3	chr2_chainNoHoxD55TetNig1	16076
#  danRer3	chr2_chainHoxD55TetNig1		23951
#  danRer3	chr2_chainHoxD55v2TetNig1	21378
# There are also more low-scoring chains with HoxD55 alone than with
# no HoxD55 or with the mm6 parameters plus HoxD55. However, using HoxD55
# seems to increase the number of higher-scoring chains.
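# For reference, chain counts like those above can be tallied with a query
# of this form (a sketch; table name illustrative):
#   hgsql -e 'select count(*) from chr2_chainHoxD55TetNig1;' danRer3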
 
 # BLASTZ, CHAIN AND NET FOR OPOSSUM (monDom2) (DONE, 2005-10-18, hartera)
 # MOVE ALL THE RUN FILES AND OUTPUT FROM THE SAN RUN DIRECTORY TO A DIRECTORY
 # ON /cluster/data AS THIS IS MORE PERMANENT. (DONE, 2005-11-17, hartera).
     ssh kkstore02
     mkdir -p /cluster/data/danRer3/bed/blastz.monDom2.2005-10-07
     cd /cluster/data/danRer3/bed
     ln -s blastz.monDom2.2005-10-07 blastz.monDom2
     # create a 2 bit for danRer3 with all chroms (1-25 and M) and the
     # scaffolds for NA and Un.
     cd /cluster/data/danRer3
     faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \
                Un/scaffoldUn.fa NA/scaffoldNA.fa danRer3ChrUnNAScafs.2bit
     ssh hgwdev
     mkdir -p /san/sanvol1/scratch/danRer3/
     mv /cluster/data/danRer3/danRer3ChrUnNAScafs.2bit \
        /san/sanvol1/scratch/danRer3/
     # make output and run directories
     mkdir -p /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun
     mkdir -p /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut
     cd /cluster/data/danRer3/bed/blastz.monDom2
     ln -s /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun
     ln -s /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut
     cd chromsAndScafsRun
     cat << '_EOF_' > DEF
 # zebrafish (danRer3) vs opossum (monDom2)
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
                                                                                 
 ALIGN=blastz-run
 BLASTZ=blastz.v7.x86_64
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=10000
 BLASTZ_K=2200
 BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
 #BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
 BLASTZ_ABRIDGE_REPEATS=0
                                                                                 
 # TARGET - zebrafish (danRer3) soft-masked chroms 1-25 and chrM, and
 # scaffolds for NA and Un
 SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
 SEQ1_CTGDIR=/san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit
 SEQ1_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
 SEQ1_RMSK=
 # lineage-specific repeats
 # we don't have that information for these species
 SEQ1_SMSK=
 SEQ1_FLAG=
 SEQ1_LIMIT=30
 SEQ1_IN_CONTIGS=0
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY - Opossum (monDom2)
 # soft-masked sequence in scaffolds
 SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit
 SEQ2_SMSK=
 SEQ2_FLAG=
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
                                                                                 
 BASE=/san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun
                                                                                 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ1_CTGLEN=$BASE/chromsUnNAScafs.sizes
 SEQ2_LEN=$BASE/S2.len
 TMPDIR=/scratch/tmp
                                                                                 
 #DEBUG=1
 '_EOF_'
     # << this line keeps emacs coloring happy
     chmod +x DEF
     cp /cluster/data/danRer3/chrom.sizes S1.len
     twoBitInfo /san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit \
                chromsUnNAScafs.sizes
     cp /cluster/data/monDom2/chrom.sizes S2.len
     # now do the run
   nice /cluster/bin/scripts/doBlastzChainNet.pl \
   -bigClusterHub=pk \
   -smallClusterHub=pk \
   -workhorse=pk \
   -fileServer=kolossus \
   -stop cat \
   -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut \
   -chainMinScore=5000 \
   `pwd`/DEF >& do.log &
     # chromsAndScafs PID 19811      Start: Fri Oct  7 15:16
     # Friday Oct 14th 10:30 - 
 # Checking finished jobs
 # crashed: 3271
 # ranOk: 90399
 # failed 4 times: 3271
 # total jobs in batch: 93670
 # more than 9000 crashed on one machine: kkr10u19.kilokluster.ucsc.edu
 # so remove this machine.
 # run again with para push -retries=20
 
    # Still 7 jobs crashed, so repush again with para push -retries=20.
    # Now try using the SEQ1_LIMIT option in the DEF file to limit the
    # number of sequences in a partition file to 30. Before, there would
    # be a lot of small sequences in a partition file that would take a long
    # time to run.
    # Finished around 21:40 Fri Oct 14. Took about 7 days, maybe a little
    # less, as a number of jobs crashed the night before.
     # carry on from the cat step to the end
     ssh pk
     cd /cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun/run.blastz
     para time > run.time
 # para time
 # Completed: 93670 of 93670 jobs
 # CPU time in finished jobs:   55738486s  928974.77m 15482.91h  645.12d  1.767 y
 # IO & Wait Time:               1276213s   21270.22m   354.50h   14.77d  0.040 y
 # Average job time:                 609s      10.14m     0.17h    0.01d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            1470s      24.50m     0.41h    0.02d
 # Submission to last job:        627367s   10456.12m   174.27h    7.26d
 
     ssh hgwdev
     cd /cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun
   nice /cluster/bin/scripts/doBlastzChainNet.pl \
   -bigClusterHub=pk \
   -smallClusterHub=pk \
   -workhorse=pk \
   -fileServer=kolossus \
   -continue cat \
   -blastzOutRoot /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut \
   -chainMinScore=5000 \
   `pwd`/DEF >& doCatChainNet.log &
  # Took 13 minutes to cat then chain. 70 jobs crashed at the chaining
  # step. These are empty files - when axtChain opens them using
  # pslxFileOpenWithMeta (in psl.c), it aborts because the file is empty
  # apart from meta data and is therefore not psLayout format. Ignore these
  # crashed jobs for now and then modify psl.c so it will skip over these
  # empty files (a sketch for listing such header-only parts follows below).
  # Next, the script crashed on the chainMergeSort step
  # since there are too many chain files due to opossum being scaffold-based;
  # chainMergeSort opens all the files at once.
  # Added a flag to the DEF file to show if an assembly is scaffold-based:
  # GENOME_IN_SCAFFOLDS=1
  # and modified doBlastzChainNet.pl so that if it sees this flag, the
  # chains are merged into one file, chainSort sorts that file by score,
  # and chainMergeSort renumbers the IDs so they are unique.
  # chainMergeSort assumes that the input files are already sorted.
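  # A sketch (not part of the recorded run) to list the header-only psl
  # parts, run from the blastz pslParts output directory:
  foreach f (*.psl.gz)
    if (`zcat $f | grep -c -v '^#'` == 0) echo $f
  end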
   nice ./doBlastzChainNet.pl \
   -bigClusterHub=pk \
   -smallClusterHub=pk \
   -workhorse=pk \
   -fileServer=kolossus \
   -continue chainMerge \
   -chainMinScore=5000 \
   `pwd`/DEF >& doChainMergeNet.log &
   # Start: Tue Oct 18 12:55 Finished: 15:02
 # add trackDb.ra entries for monDom2 chain and net tracks and add html for
 # these tracks too. Modified html pages to describe the process using 
 # scaffolds for chrUn and chrNA for danRer3.
 # Modify the downloads README.txt to include a description of the process
 # of running blastz with scaffolds for the chrUn and chrNA unordered chroms.
 # Finally run the swap for this to get danRer3 chains and net tracks 
 # on monDom2 - see makeMonDom2.doc. 
     # Move the run directory files to /cluster/data and remake download links
     # as the san is not a permanent storage space (hartera, 2005-11-17)
     ssh hgwdev 
     mv /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun \
        /cluster/data/danRer3/bed/blastz.monDom2/
     # then change the symlinks in the downloads directory to point to the files
     # on /cluster/data
     cd /usr/local/apache/htdocs/goldenPath/danRer3/vsMonDom2/axtNet
     set runDir=/cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun
     rm *.gz
     foreach f ($runDir/axtNet/*.axt.gz)
       ln -s $f .
     end
     cd ..
     rm *.gz
     foreach f ($runDir/axtChain/*.gz)
       ln -s $f
     end
     # remake the md5sum file
     rm md5sum.txt
     md5sum *.gz */*.gz > md5sum.txt
    
 # RADIATION HYBRID (RH) MAP TRACK (DONE, 2005-09-06, hartera)
    # Data from Leonard Zon's lab at Children's Hospital, Boston
     # Provided by Anhua Song: asong@enders.tch.harvard.edu
     # Updated data provided on 2006-02-23
     ssh kkstore02
     mkdir -p /cluster/data/danRer3/bed/ZonLab/rhMap
     cd /cluster/data/danRer3/bed/ZonLab/rhMap
     # download data from e-mail to this directory
     # new sequences (2006-02-23) are available
     unzip rhSequenceSubmit022306.zip
     # sequences are in rhSequenceSubmit022306/rhSequenceSubmitSeq022306.txt
     # primer information is in rhSequenceSubmit022306/rhSequenceSubmit022306.txt
     mv rhSequenceSubmitSeq022306.txt rhMap022306.fa
     mv rhSequenceSubmit022306.txt rhMapPrimers022306.txt
     # first remove ^M from end of lines
     dos2unix rhMap022306.fa
     dos2unix rhMapPrimers022306.txt
     grep '>' rhMap022306.fa | wc -l
     # 11514
     wc -l rhMapPrimers022306.txt
     # 13438 rhMapPrimers022306.txt
     grep '>' rhMap022306.fa > rhMap.names
    
     # remove '>' from names and grab first field
     perl -pi.bak -e 's/>//' rhMap.names
     awk 'BEGIN {FS="|"} {print $1;}' rhMap.names | sort | uniq \
         > rhMap.namesOnly.sort
     awk 'BEGIN {FS="|"} {print $1;}' rhMapPrimers022306.txt | sort | uniq \
         > rhMapPrimers.namesOnly.sort
     wc -l *.sort
     # 11514 rhMap.namesOnly.sort
     # 13436 rhMapPrimers.namesOnly.sort (after removing blank line)
    # There are no replicates this time for the rhMap sequences; check
    # whether there are any in the primer set:
    awk 'BEGIN {FS="|"} {print $1;}' rhMapPrimers022306.txt | sort | uniq -c \
        | sort -nr > rhMapPrimers.names.count
    # The only repeated "names" are blank lines, so there are no replicates
    # there either.
    # Total of 11514 sequences in rhMap, but 13436 primer sets.
 
     # get a list of headers from the FASTA file
     grep '>' rhMap022306.fa > rhMap.headers
     awk 'BEGIN {FS="|"} {print $5;}' rhMap.headers | sort | uniq
     # BAC_END
     # EST
     # GENE
     # SSLP
     # STS
     # 5 types of sequence
     awk 'BEGIN {FS="|"} {print $9;}' rhMap.headers | sort | uniq
     # BACends
     # Custom
     # Insertion_Mutant
     # Insertion_Mutants
     # MGH
     # NCBI
     # Sanger SG
     # Sequencing_Project
     # ThisseClone
     # Thisse_Clone
     # other_zfEst
     # wu_zfEst
     # wz
     # Insertion_Mutant = Insertion_Mutants; ThisseClone = Thisse_Clone;
     # So there are 11 different sources.
     awk 'BEGIN {FS="|"} {print $10;}' rhMap.headers | sort | uniq
     # CHBG
     # MPIEB
     
     # There are 2 sequences with problem primers. E-mailed Peter Song about
    # these and he suggested deleting those primers:
     # >fb33f01.u1|5|388|5615|EST|f|cR|f|wu_zfEst|CHBG|+++33333333333333333333.|
     # >zfishb-a976e04.p1c|14|16|158|STS|f|cR|f|Sequencing_Project|CHBG|A|A| 
     # edit rhMap022306.fa and rhMapPrimers022306.txt and delete these primers.
    # need to reformat FASTA headers so they are in the format:
    # NAME.SOURCE (see the rhFix script below)
     # Insertion_Mutant=Insertion_Mutants; Thisse_Clone=ThisseClone
     # so change these to have the same name. Also shorten Sanger SG to Shotgun.
 
     perl -pi.bak -e 's/Insertion_Mutant/InsertMut/' rhMap022306.fa
     perl -pi.bak -e 's/Insertion_Mutants/InsertMut/' rhMap022306.fa
     perl -pi.bak -e 's/Sanger SG/Shotgun/' rhMap022306.fa
     perl -pi.bak -e 's/ThisseClone/Thisse/' rhMap022306.fa
     perl -pi.bak -e 's/Thisse_Clone/Thisse/' rhMap022306.fa
     perl -pi.bak -e 's/Sequencing_Project/Seqproj/' rhMap022306.fa
    
     # use a script to reformat the names for the FASTA headers to the format 
     # >NAME.SOURCE where name is the first field separated by "|" and source
     # is the 9th field. The source is used to make the name unique. Some
     # of these names are BAC ends that occur in the BAC ends track so there
     # are name clashes in the seq table if the names are not made unique.
     # Also make the name upper case as for those for the danRer1 and danRer2
     # RH map. 
 cat << '_EOF_' > rhFix
 #!/usr/bin/awk -f 
 
 #>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
 /^>/ {
     split(toupper($0), a, "\\|");
     print a[1]"."a[9];
     next;
 }
 
 /^[0-9]+ / {
     $0 = $2;
 }
 
 {
     print $0;
 }
 
 '_EOF_'
 # << keep emacs coloring happy
     chmod +x rhFix
     rhFix rhMap022306.fa > rhMap.fa
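    # a quick sanity-check sketch (not part of the recorded run): the
    # renaming should leave the number of FASTA records unchanged (11514):
    grep -c '>' rhMap022306.fa rhMap.fa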
     # Blat sequences vs danRer3 genome
     ssh pk
     mkdir -p /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
     # make output directory
     mkdir -p /san/sanvol1/scratch/danRer3/rhMap/psl
     cd /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
     ln -s /san/sanvol1/scratch/danRer3/rhMap/psl .
     # copy input to the san
     cp \
   /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/rhMap.fa \
     /san/sanvol1/scratch/danRer3/rhMap/
     # do the blat run to align RH map sequences to danRer3 and do separate
     # runs for chroms and scaffolds from chrUn and chrNA
     ls -1S /san/sanvol1/scratch/danRer3/rhMap/rhMap.fa > rhMap.lst
     ls -1S /san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/chr[0-9M]*.fa \
           > genome.lst
     # use the individual scaffolds for chrUn and chrNA alignments
     foreach f (/san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/Zv5_*.fa)
         ls -1S $f >> genome.lst
     end
     wc -l genome.lst
     # 15149 genome.lst
     cp -p /cluster/data/danRer3/bed/ooc/danRer3_10.ooc \
           /san/sanvol1/scratch/danRer3
 # try same parameters as for BAC ends
 cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/x86_64/blat {check in line+ $(path1)} {check in line+ $(path2)} -tileSize=10 -ooc=/san/sanvol1/scratch/danRer3/danRer3_10.ooc {check out line+ /san/sanvol1/scratch/danRer3/rhMap/psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << this line keeps emacs coloring happy
     gensub2 genome.lst rhMap.lst gsub spec
     para create spec
     para try, check, push, check etc.
 # para time
 # Completed: 15149 of 15149 jobs
 # CPU time in finished jobs:      16326s     272.09m     4.53h    0.19d  0.001 y
 # IO & Wait Time:                 41360s     689.34m    11.49h    0.48d  0.001 y
 # Average job time:                   4s       0.06m     0.00h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:              61s       1.02m     0.02h    0.00d
 # Submission to last job:           263s       4.38m     0.07h    0.00d
     
     cd /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
     # Make & check the psl table
     # Do sort, best in genome filter, and convert to chromosome coordinates
     # to create rhmap.psl
     pslSort dirs raw.psl tmp psl
     pslReps -nearTop=0.0001 -minAli=0.80 -minCover=0.20 raw.psl \
             contig.psl /dev/null
     # There are 11514 sequences in total in rhMap.fa
     # Experimented with different parameters:
    # there was little difference when the STS marker BLAT parameters were
    # used, i.e. -ooc=11.ooc and -stepSize=5.
     # For Blat parameters used above (-ooc=10.ooc and -tileSize=10), try
     # different pslReps parameters using minCover=0.40 and nearTop=0.0001:
     # minAli=0.96, 83%, most aligned sequence has 11 alignments.
     # minAli=0.90, 88% align, most aligned seq has 11 alignments
     # minAli=0.80, 88%, 10120 sequences aligned. 
    # at minAli=0.50, there are still 10120 sequences aligned, so those that
    # are not aligning must have very low sequence identity. Took a look at
    # some that are not aligning, e.g. 2217C, 2791C, and these are not
    # passing the minCover=0.40 criterion. Some sequences have Ns in them
    # too, e.g. ZC92E13.YBF, which therefore has a lot of short alignments
    # that do not pass the minCover parameter. Lowering minCover increases
    # the number of sequences aligned:
    # minAli=0.80, minCover=0.20: 10850 (94%) of sequences aligned.
    # minAli=0.90, minCover=0.20: 10837 (94%) of sequences aligned,
    # with 21 fewer alignments than for minAli=0.80.
    # The most alignments for one sequence is 99; the second most is 11.
    # There are about 1851 sequences with more than 1 alignment (many of
    # these have 2 alignments), while for minAli=0.80 and minCover=0.40
    # there were 1266 sequences with more than 1 alignment. With lower
    # minCover, more sequences align, but more sequences have high numbers
    # of multiple alignments. At minCover=0.0, there is 1 sequence with 1353
    # alignments; the second largest number of alignments for 1 sequence is
    # 532, then 329 etc. So use minAli=0.80 and minCover=0.20 to get the
    # most sequences aligned without having sequences align too many times.
    # At minAli=0.80 and minCov=0.20, there are 10850 sequences aligned
    # (94%). 88% of sequences were aligned for danRer2.
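    # The alignments-per-sequence counts above were tallied along these
    # lines (a sketch; the awk guard skips any psl header lines, and qName
    # is psl field 10):
    awk '$1 ~ /^[0-9]+$/ {print $10}' contig.psl | sort | uniq -c \
        | sort -nr | head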
     # merge together liftAll and scaffolds lift then lift psl to chrom level.
    cat /cluster/data/danRer3/liftSuperToChrom/liftNAandUnScaffoldsToChrom.lft \
        /cluster/data/danRer3/jkStuff/liftAll.lft \
        > /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft
    liftUp rhMap.psl \
           /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft \
           warn contig.psl
     # Got 30168 lifts
     pslCheck rhMap.psl
     # psl is ok
     # Load sequence alignments into database.
     ssh hgwdev
     cd /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
     # drop old table and reload (hartera, 2006-03-26)
     echo "drop table rhMap;" | hgsql danRer3
     hgLoadPsl danRer3 rhMap.psl
     # cleanup
     rm -r /san/sanvol1/scratch/danRer3/rhMap/psl 
     rm psl para.results batch batch.bak spec
     rm -r err  
     gzip *.psl
     # Copy sequences to gbdb if they are not already there.
     mkdir -p /gbdb/danRer3/rhMap
     ln -s \
       /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/rhMap.fa \
       /gbdb/danRer3/rhMap/rhMap022306.fa 
     # then add sequences to database:
     # reloaded (hartera, 2006-03-26)
     hgLoadSeq danRer3 /gbdb/danRer3/rhMap/rhMap022306.fa
    # Note: the first time these sequences were loaded there was a problem:
    # 2215 were not loaded into the database. These all have names with
    # extensions like .YB, .YC etc., so remove them from extFile and seq.
    # Sequences with the same IDs are already in the seq table for the BAC
    # ends tracks, so these RH map names must be made unique.
    hgsql -e 'delete from seq where extFile = 736113;' danRer3
    hgsql -e 'delete from extFile where id = 736113;' danRer3
    hgsql -e 'update history set errata = "Removed sequences. Error so not all sequences loaded." where ix = 23;' danRer3
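    # For reference, the extFile id used above can be looked up with a query
    # of this form (a sketch):
    hgsql -e 'select id, path from extFile where path like "%rhMap%";' danRer3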
     
     # Check that all the headers from rhMap.headers are also in the primers
     # file which seems to contain the same headers from the FASTA file
     # as well as additional markers.
     ssh kkstore02
     cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306
     perl -pi.bak -e 's/>//' rhMap.headers
     sort rhMap.headers > rhMap.headers.sort
     sort rhMapPrimers022306.txt > rhMapPrimers.sort
     wc -l *.sort
     # 11514 rhMap.headers.sort
     # 13437 rhMapPrimers.sort
     comm -12 rhMap.headers.sort rhMapPrimers.sort | wc -l
     # 11514 in common
     # so all FASTA headers from rhMap022306.fa are in the primers file
    # Get headers again from the rhMap022306.fa file, as the source names
    # have been changed. Parse out information from the headers for an rhMapInfo
     # table so that this information can be displayed on the details page for
     # the RH map markers.
    # Fields: 1 - name, 2 - linkage group (chrom), 3 - position number on the
    # RH map for that linkage group, 4 - distance (in cR) from the
    # top of a linkage group (also the position number in the entire RH map,
    # ordered from LG1 to LG25), 5 - type of marker (SSLP, BAC_END, EST,
    # GENE, STS), 9 - source, 10 - institute that mapped the marker,
    # 11 - 5' forward primer, 12 - 3' reverse primer.
    # Sort headers by linkage group and by position.
     grep '>' rhMap022306.fa > rhMap.headers2
     # then use the rhMap.headers2 file to extract the marker information
     # and to reformat the names for the FASTA headers to the format 
     # >NAME.SOURCE where name is the first field separated by "|" and source
     # is the 9th field so that names in the rhMap and rhMapInfo tables are 
     # the same. The source is used to make the name unique. 
 cat << '_EOF_' > getRhInfo
 #!/usr/bin/awk -f 
 
 #>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
 /^>/ {
     sub(/>/,"",$0);
     split(toupper($0), a, "\\|");
     print a[1]"."a[9]"\tLG"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[9]"\t"a[10]"\t"a[11]"\t"a[12];
     next;
 }
 '_EOF_'
 # << keep emacs coloring happy
     chmod +x getRhInfo
     getRhInfo rhMap.headers2 > rhMapInfo.tab
     # Sort headers by linkage group (LG) and by position
     sort -k 2,2 -k 3,3n rhMapInfo.tab > rhMapInfoSorted.tab
     wc -l rhMapInfoSorted.tab
     # 11514 rhMapInfoSorted.tab
  
     ssh hgwdev 
     # Create a table with RH map item information including type, source,
     # origin and primer sequences.
     cat << 'EOF' > ~/kent/src/hg/lib/rhMapInfo.as
 table rhMapInfo
 "Radiation Hybrid map information"
 (
 string name;		"Name of Radiation Hybrid (RH) map marker"
 string linkageGp;	"Linkage group to which the marker was mapped"
 uint position;  	"Position number in RH map for this linkage group"
 uint distance;  	"Distance from the top of linkage group (cR)"
 string markerType;      "Type of marker"
 string source;    	"Source of marker"
 string mapSite;   	"Institution that mapped the marker"
 string leftPrimer; 	"Forward primer sequence"
 string rightPrimer; 	"Reverse primer sequence"
 )
 'EOF'
 # << happy emacs
    # create .sql, .c and .h files using autoSql (run in ~/kent/src/hg/lib,
    # where rhMapInfo.as was created)
    cd ~/kent/src/hg/lib
    autoSql rhMapInfo.as rhMapInfo
    mv rhMapInfo.h ../inc
     # rhMapInfo.sql - name is the primary key
     # commit rhMapInfo.as, .sql, .c and .h files to CVS.   
     # create and load table (Reloaded: hartera, 2006-03-26)
     cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306 
     echo "drop table rhMapInfo;" | hgsql danRer3
     hgsql danRer3 < ~/kent/src/hg/lib/rhMapInfo.sql
     hgsql -e \
     'load data local infile "rhMapInfoSorted.tab" into table rhMapInfo' danRer3
     
     # edit danRer3/trackDb.ra to add rhMap track and the search spec.  
     # add and edit rhMap.html to describe the info data.
     # edit ~/kent/src/hg/hgc/hgc.c so that the rhMapInfo data is displayed 
     # on the details page for each marker - edit doRHmap function.
    # Add a rule to all.joiner to check that all names in rhMap also appear
    # in rhMapInfo.
    # commit these to CVS.
     # Changed termRegex for  rhMap search in trackDb.ra so that it works 
     # for all IDs. (2006-04-19, hartera)
 
 # SELF BLASTZ, CHAIN, NET, AXTNET, MAFNET AND DOWNLOADS
 # (DONE, 2005-12-02, hartera)
     ssh pk
     mkdir -p /cluster/data/danRer3/bed/blastzSelf.2005-11-30
     cd /cluster/data/danRer3/bed
     ln -s blastzSelf.2005-11-30 blastzSelf
     cd /cluster/data/danRer3/bed/blastzSelf
     # make run directory on the san
     mkdir -p /san/sanvol1/scratch/danRer3/blastzSelf/chromsRun
     ln -s /san/sanvol1/scratch/danRer3/blastzSelf/chromsRun
     # make 2 bit file of chr1-25 and chrM
     cd /cluster/data/danRer3
     faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \
         /san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit
     cd /cluster/data/danRer3/bed/blastzSelf/chromsRun
     twoBitInfo /san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit S1.len
     cp S1.len S2.len
     cat << '_EOF_' > DEF
 # zebrafish vs zebrafish
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
                                                                                 
 BLASTZ=blastz.v7.x86_64
 BLASTZ_L=5000
 BLASTZ_H=2500
 BLASTZ_M=50
 BLASTZ_ABRIDGE_REPEATS=0
                                                                                 
 # TARGET: Zebrafish danRer3
 SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit
 SEQ1_IN_CONTIGS=0
 SEQ1_LIMIT=30
 SEQ1_CHUNK=500000
 SEQ1_LAP=5000
                                                                                 
 # QUERY: Zebrafish danRer3
 SEQ2_DIR=/san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit
 SEQ2_SELF=1
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=1800000000
 SEQ2_LAP=0
 
 BASE=/san/sanvol1/scratch/danRer3/blastzSelf/chromsRun
                                                                                 
 DEF=$BASE/DEF
 RAW=$BASE/raw
 CDBDIR=$BASE
 SEQ1_LEN=$BASE/S1.len
 SEQ2_LEN=$BASE/S2.len
 TMPDIR=/scratch/tmp
 '_EOF_'
     chmod +x DEF
     ssh hgwdev
     cd /cluster/data/danRer3/bed/blastzSelf/chromsRun
     nice /cluster/bin/scripts/doBlastzChainNet.pl \
          -bigClusterHub=pk \
          -smallClusterHub=pk \
          -workhorse=pk \
          -fileServer=kolossus \
          -chainMinScore=5000 \
          -chainLinearGap=medium \
          `pwd`/DEF >& do.log &
     # Start: Wed Nov 30 17:07 Finish: Thur Dec  1 06:51
    # Crashed at the downloads step since downloads exist from a previous
    # run, so remove them:
     rm -r /usr/local/apache/htdocs/goldenPath/danRer3/vsSelf 
 # para time (blastz)
 # Completed: 2425 of 2425 jobs
 # CPU time in finished jobs:    4783120s   79718.66m  1328.64h   55.36d  0.152 y
 # IO & Wait Time:                108014s    1800.24m    30.00h    1.25d  0.003 y
 # Average job time:                2017s      33.62m     0.56h    0.02d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            2762s      46.03m     0.77h    0.03d
 # Submission to last job:         14993s     249.88m     4.16h    0.17d
 
 # para time (axtChain)
 # Completed: 26 of 26 jobs
 # CPU time in finished jobs:      96405s    1606.74m    26.78h    1.12d  0.003 y
 # IO & Wait Time:                   731s      12.19m     0.20h    0.01d  0.000 y
 # Average job time:                3736s      62.27m     1.04h    0.04d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            7405s     123.42m     2.06h    0.09d
 # Submission to last job:          7411s     123.52m     2.06h    0.09d
 
     # Carry on from downloads step.
     cd /cluster/data/danRer3/bed/blastzSelf/chromsRun
     nice /cluster/bin/scripts/doBlastzChainNet.pl \
          -bigClusterHub=pk \
          -smallClusterHub=pk \
          -workhorse=pk \
          -fileServer=kolossus \
          -continue download \
          -chainMinScore=5000 \
          -chainLinearGap=medium \
          `pwd`/DEF >& doDownloads.log &
     # Took 2 minutes. 
 # check trackDb entry exists. Put html at danRer3 level of trackDb and edit
 # these and the downloads README to state that chrNA and chrUn were not 
 # aligned for this track.
     # Remove extra downloads made by script:
     # Only chain track is pushed to the RR so remove the net and axtNet 
     # downloads, re-make md5sum.txt and edit README.txt accordingly.
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath/danRer3/vsSelf
     rm danRer3.danRer3.net.gz md5sum.txt
     rm -r axtNet
     md5sum *.gz > md5sum.txt
     
 # Original run with loose linear gap matrix and scaffolds for chrNA and chrUn
 # done 2005-10-26.
 # filtering chains from above on minScore 10,000. done 2005-11-18
 # Using the medium linear gap matrix for axtChain. minScore=5,000. 
 # done 2005-11-30.
 # chainSelf - loose linearGap matrix, filtered minScore=5000
 # chainSelfFilt10k - loose linearGap matrix, filtered minScore=10000
 # chainSelfMedGap - medium linearGap matrix, filtered minScore=5000
 # featureBits -chrom=chr1 danRer3 refGene:cds chainSelfLink -enrichment
 # refGene:cds 0.743%, chainSelfLink 65.056%, both 0.560%, cover 75.29%, 
 # enrich 1.16x
 
 # featureBits -chrom=chr1 danRer3 refGene:cds chainSelfFilt10kLink -enrichment
 # refGene:cds 0.743%, chainSelfFilt10kLink 64.019%, both 0.554%, cover 74.54%, 
 # enrich 1.16x
 # number of rows in tables for chr1:
 
 # chainSelf          	941416
 # chainSelfFilt10k	530292
 # chainSelfMedGap	997525			
 # chainSelfLink		9110071
 # chainSelfFilt10kLink	7226815
 # chainSelfMedGapLink	9149100
 
 # featureBits -chrom=chr1 danRer3 refGene:cds chainSelfMedGapLink -enrichment
 # refGene:cds 0.743%, chainSelfMedGapLink 64.525%, both 0.549%, cover 73.80%, 
 # enrich 1.14x
 
 # so the medium linearGap matrix increases the number of chains by about 5% 
 # but coverage is little different.
 # for the chains filtered with  minScore=10000
 # 12192577 chains out of 17592225 do not have chrNA or chrUn as query or 
 # target which is about 69%. 
 # 12192577 out of 12807964 do not have chrNA or chrUn as the query for just
 # chr1-25 and chrM which is about 95%.
 # so make the chains without chrNA and chrUn and using the medium linearGap
 # matrix which is for species that are not so distant.
 # 2005-12-02
 # medium linearGap matrix for axtChain, minScore=5000 and no chrNA or chrUn.
 # number of rows in tables for chr1:
 # chainSelf 	943482
 # chainSelfLink 8707208
 # featureBits -chrom=chr1 danRer3 refGene:cds chainSelfLink -enrichment
 # refGene:cds 0.743%, chainSelfLink 60.876%, both 0.503%, cover 67.65%, 
 # enrich 1.1
# coverage dropped about 8% without chrNA and chrUn alignments, so not a
# huge difference.
 
 # BLASTZ SWAP FOR HUMAN (hg18) (DONE, 2005-12-24, hartera)
 # CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
     ssh hgwdev
     # Blastz requires lineage-specific repeats
     # Treat all repeats as lineage-specific for all alignments except those
     # involving danRer3 chrUn and chrNA where the dynamic masking 
     # functionality of Blastz was used. hg18 random chroms were aligned
     # as contigs and danRer3 chrNA and chrUn were aligned as scaffolds -
     # see zebrafish (danRer3) chain and net track section in makeHg18.doc
     # for further details. 
 
     # do swap of hg18 vs. danRer3 chain and net alignments to 
     # create danRer3 vs. hg18 see makeHg18.doc for details.
     cd /cluster/data/hg18/bed/blastz.danRer3/chromsRun
     # edit DEF file and add location of danRer3 and hg18 lineage-specific
     # repeats - move chrUn and chrNA lineage-specific repeats into a tmp
     # directory as they were not used.
     nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \
         -bigClusterHub=pk -swap -chainMinScore=5000 \
         -chainLinearGap loose `pwd`/DEF >& doSwap.log &
     # Took about 27 minutes.
     # Blastz parameters are as for hg18 vs. danRer3 - see makeHg18.doc
 # BLASTZ_H=2000
 # BLASTZ_Y=3400
 # BLASTZ_L=6000
 # BLASTZ_K=2200
 # BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 # BLASTZ_ABRIDGE_REPEATS=1
   # make html files and trackDb.ra entry for chain and net tracks.
   # check README.txt for downloads.
 # featureBits -chrom=chr2 danRer3 refGene:cds chainHg18Link -enrichment 
 # refGene:cds 0.767%, chainHg18Link 4.370%, both 0.607%, cover 79.15%,
 # enrich 18.11x
 # featureBits -chrom=chr2 danRer2 refGene:cds chainHg17Link -enrichment 
 # refGene:cds 0.769%, chainHg17Link 4.576%, both 0.605%, cover 78.69%,
 # enrich 17.20x
# Similar coverage and enrichment as for danRer2 vs. hg17, but the chain
# counts differ: 7057 for hg18 on danRer3, 1111 for hg17 on danRer2 (chr1).
 
 # 5-WAY VAR_MULTIZ ALIGNMENTS (DONE, 2006-02-06, hartera)
# MAF ANNOTATION ADDED (DONE, 2006-02-06, braney)
 # FINISHED MAKING TREE IMAGE FOR TRACK DESCRIPTION PAGE 
 # (DONE, 2006-02-07, hartera)
 # Species: zebrafish(danRer3), human (hg18), mouse(mm7), 
 # fugu(fr1) and tetraodon(tetNig1)
 # Opossum (monDom2) was dropped since there were many more alignments
 # for monDom2 than monDom1 and the chains were shorter on average. The
 # reason for this is unknown so they will not be included in the 
 # conservation track at this time.
 # rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
 
 
     ssh kkstore02
     mkdir -p /cluster/data/danRer3/bed/multiz5way
     cd /cluster/data/danRer3/bed/multiz5way
     mkdir mafLinks
     # set up directories for links to mafs for each pairwise alignment
     mkdir mafLinks/hg18
     mkdir mafLinks/mm7
     mkdir mafLinks/fr1
     mkdir mafLinks/tetNig1
   
     set dir=/cluster/data/danRer3/bed
     # need to make links to all the mafNet files for pairwise blastz 
     # alignments for each species. Make sure files are all called chrN.maf.gz
     ln -s $dir/blastz.hg18.swap/mafNet/*.maf.gz ./mafLinks/hg18
     ln -s $dir/blastz.mm7.swap/mafNet/*.maf.gz ./mafLinks/mm7
     ln -s $dir/blastz.fr1/mafNet/*.maf.gz ./mafLinks/fr1
     ln -s $dir/blastz.tetNig1.2005-10-11/chromsAndScafsRun/mafNet/*.maf.gz \
           ./mafLinks/tetNig1
     # copy files over to the san for the pitakluster cluster run
     ssh pk
     mkdir /san/sanvol1/scratch/danRer3/multiz5way
     cd /san/sanvol1/scratch/danRer3/multiz5way
     rsync -a --copy-links --progress \
           /cluster/data/danRer3/bed/multiz5way/mafLinks/ .
     # 277 Mb of data - took less than 1 minute 
     mkdir penn
     cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/multiz penn
     cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/maf_project penn
 
 #       Progressive alignment up the tree w/o stager,
 #       using multiz.v10 (var_multiz)
 #       Method: align internal subtrees (using 0 flag to var_multiz)
 #               Then, align these to human (using 1 flag to var_multiz)
 #       NOTE: must use maf_project after each multiz run, in order
 #       to order output.  Single-cov guaranteed by use of net MAF's,
 #       so it is not necessary to run single_cov2.
 
     # make output dir and run dir
 
     cd /cluster/data/danRer3/bed/multiz5way
     mkdir -p maf
     mkdir -p run
     cd run
 
     # create scripts to run var_multiz on cluster
 
 cat > oneMultiz.csh << 'EOF'
 #!/bin/csh -fe
     set c = $1
     set db = danRer3
     set multi = /scratch/tmp/$db/multiz5way.$c
     set pairs = /san/sanvol1/scratch/$db/multiz5way
     set penn = $pairs/penn
 
     # special mode --
     # with 1 arg, cleanup
     if ($#argv == 1) then
         echo "cleanup"
         echo "rm -fr $multi"
         rm -fr $multi
         echo "rmdir --ignore-fail-on-non-empty /scratch/tmp/$db"
         rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
         exit
     endif
 
     # special mode --
     # with 3 args, saves an alignment file
     if ($#argv == 3) then
         echo "cp $multi/$2/$c.maf $3"
         ls -og $multi/$2/$c.maf
         cp $multi/$2/$c.maf $3
         exit
     endif
 
     set s1 = $2
     set s2 = $3
     set flag = $4
 
     # locate input files -- in pairwise dir, or multiple dir
     set d1 = $multi
     set d2 = $multi
     if (-d $pairs/$s1) then
         set d1 = $pairs
         set f1 = $d1/$s1/$c.maf.gz
         set t1 = /tmp/$s1.$c.maf
         zcat $f1 > $t1
     else
         set f1 = $d1/$s1/$c.maf
         set t1 = /tmp/$s1.$c.maf
         cp -p $f1 $t1
     endif
     if (-d $pairs/$s2) then
         set d2 = $pairs
         set f2 = $d2/$s2/$c.maf.gz
         set t2 = /tmp/$s2.$c.maf
         zcat $f2 > $t2
     else
         set f2 = $d2/$s2/$c.maf
         set t2 = /tmp/$s2.$c.maf
         cp -p $f2 $t2
     endif
     # write to output dir
     set out = $multi/${s1}${s2}
     mkdir -p $out
 
     # check for empty input file
     if (-s $t1 && -s $t2) then
         echo "Aligning $f1 $f2 $flag"
         $penn/multiz $t1 $t2 $flag $out/$c.unused1.maf \
                 $out/$c.unused2.maf > $out/$c.full.maf
         cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \
                 $out/$c.tmp.maf
         echo "Ordering $c.maf"
         $penn/maf_project $out/$c.tmp.maf $db.$c > $out/$c.maf
         rm -f $t1 $t2
     else if (-s $t1) then
         cp -p $t1 $out/$c.maf
         rm -f $t1
     else if (-s $t2) then
         cp -p $t2 $out/$c.maf
         rm -f $t2
     endif
 'EOF'
 # << keep emacs coloring happy
     chmod +x oneMultiz.csh
     cp -p oneMultiz.csh \
          /san/sanvol1/scratch/danRer3/multiz5way/penn/oneMultiz.csh
    # Create the 5way.nh file of the tree. This was used in the distant past
    # for early versions of phastCons.  Now, this is merely a convenient
    # reference to the tree under construction.  This is also used to draw
    # a graphic tree as species5.nh, see below.
 
     cat << '_EOF_' > /cluster/data/danRer3/bed/multiz5way/5way.nh
 ((hg18,mm7),((tetNig1,fr1),danRer3))
 '_EOF_'
     # << this line keeps emacs coloring happy
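     # quick sanity check that the parentheses in the tree balance --
     # both counts should print 4 for this 5-species tree:
     tr -cd '(' < /cluster/data/danRer3/bed/multiz5way/5way.nh | wc -c
     tr -cd ')' < /cluster/data/danRer3/bed/multiz5way/5way.nh | wc -c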
     #   using the tree diagram as above, arrange these alignments
     #   in order of the tree branches
 cat > allMultiz.csh << 'EOF'
 #!/bin/csh -fe
     # multiple alignment steps:
 set c = $1
 set db = danRer3
 set s = "/san/sanvol1/scratch/$db/multiz5way/penn/oneMultiz.csh"
 
 $s $c hg18 mm7 0
 $s $c tetNig1 fr1 1
 $s $c tetNig1fr1 hg18mm7 1
 # get final alignment file
 $s $c tetNig1fr1hg18mm7 /cluster/data/$db/bed/multiz5way/maf/$c.maf
 #cleanup
 $s $c
 'EOF'
 # happy emacs
     chmod +x allMultiz.csh
 
 cat  << 'EOF' > template
 #LOOP
 ./allMultiz.csh $(root1) {check out line+ /cluster/data/danRer3/bed/multiz5way/maf/$(root1).maf}
 #ENDLOOP
 'EOF'
 
     awk '{print $1}' ../../../chrom.sizes > chrom.lst
     
     gensub2 chrom.lst single template jobList
     para create jobList
     para try, para check, para push, para check ... etc
     para time
 # Completed: 28 of 28 jobs
 #CPU time in finished jobs:       3546s      59.10m     0.98h    0.04d  0.000 y
 # IO & Wait Time:                   115s       1.92m     0.03h    0.00d  0.000 y
 # Average job time:                 131s       2.18m     0.04h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:             553s       9.22m     0.15h    0.01d
 # Submission to last job:           709s      11.82m     0.20h    0.01d
 
     # do not filter these mafs: filtering removes only a small fraction of
     # alignments, so it is better to keep them all. Check for single-column
     # alignments (blocks with just a single base for each species). There
     # should be none now; previous builds needed a gluing step to deal with
     # these, but there are none here (a quick check is sketched below).
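     # A sketch of such a check, assuming the per-chrom mafs written above:
     # a single-column block has alignment text of length 1 on its "s"
     # lines, so this should print 0:
     awk '/^s/ {if (length($7) == 1) n++} END {print n+0}' \
         /cluster/data/danRer3/bed/multiz5way/maf/chr1.maf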
 
 # Build maf annotation and load database (braney, 2006-02-06)
 cd /cluster/data/danRer3/bed/multiz5way
 mkdir anno 
 cd anno
 cat ../maf/chr1.maf | awk "/^s/ {print \$2}" | sed "s/\..*$//"  | sort -u > species.names
 mkdir maf run
 cd run
 rm sizes nBeds
 for i in `cat species.names`
 do
     ln -s  /cluster/data/$i/chrom.sizes $i.len
     ln -s  /cluster/data/$i/$i.N.bed $i.bed
     echo $i.bed  >> nBeds
     echo $i.len  >> sizes
 done 
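 # (the chrom.sizes and N-gap beds linked above are what let mafAddIRows,
 # run below, annotate each maf block with "i" lines that roughly classify
 # the sequence between blocks for each species, e.g. assembly gap vs.
 # unaligned sequence)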
 
 for i in ../../maf/*.maf
 do
     echo mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/danRer3/danRer3.2bit ../maf/`basename $i`
 done > jobs
 sh -x jobs
 
 ssh hgwdev
 
 cd /cluster/data/danRer3/bed/multiz5way/anno/maf
 cat *.maf | hgLoadMafSummary danRer3 multiz5waySummary stdin
 
 # Dropped unused indexes (2006-05-09 kate)
 # NOTE: this is not required in the future, as the loader
 # has been fixed to not generate these indexes
 hgsql danRer3 -e "alter table multiz5waySummary drop index chrom_2"
 hgsql danRer3 -e "alter table multiz5waySummary drop index chrom_3"
 
 mkdir /gbdb/danRer3/multiz5way
 for i in *.maf
 do
     ln -s `pwd`/$i /gbdb/danRer3/multiz5way
 done
 hgLoadMaf danRer3 multiz5way
 rm *.tab
 
 cd /cluster/data/danRer3/bed/multiz5way
 mkdir frames
 cd frames
 cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
 cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .
 
 #edit Makefile to correct species names 
 
 mkdir -p /san/sanvol1/scratch/danRer3/multiz5wayFrames/maf
 for i in ../../maf/*.maf; do echo $i; cp $i /san/sanvol1/scratch/danRer3/multiz5wayFrames/maf/`basename $i`; done
 
 make getGenes
 make getFrames
 make loadDb
 
 ###
 # rebuild frames to get bug fix, using 1-pass maf methodology
 # (2006-06-09 markd)
 ssh kkstore02
 cd /cluster/data/danRer3/bed/multiz5way/frames
 mv mafFrames/ mafFrames.old
 nice tcsh # easy way to get process niced
 (zcat  ../maf/*.maf.gz | time genePredToMafFrames danRer3 stdin stdout danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz hg18 genes/hg18.gp.gz mm7 genes/mm7.gp.gz tetNig1 genes/tetNig1.gp.gz | gzip >multiz5way.mafFrames.gz)>&log&
 ssh hgwdev
 cd /cluster/data/danRer3/bed/multiz5way/frames
 
 hgLoadMafFrames danRer3 multiz5wayFrames multiz5way.mafFrames.gz >&log&
 #end of multiz5way annotation and load
 
     # create tree image - like tree.nh but with common names
     # (hartera, 2006-02-07)
     ssh hgwdev
     cd /cluster/data/danRer3/bed/multiz5way
     cat << '_EOF_' > species5.nh
 ((human,mouse),((tetraodon,fugu),zebrafish))
 '_EOF_'
     /cluster/bin/phast/$MACHTYPE/draw_tree -b -s species5.nh > species5.ps
     convert species5.ps 5way.jpg
     # using GIMP, edit tree and remove whitespace
     # Photoshop used to edit the image (kuhn, 2006-02-07)
     cp 5way.jpg /usr/local/apache/htdocs/images/phylo/danRer3_5way.jpg 
     # change permissions for display
     chmod +r /usr/local/apache/htdocs/images/phylo/danRer3_5way.jpg
 
 # check for all.joiner entry for multiz5way - ok
 # add trackDb.ra entry in ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer3:
 # track multiz5way
 # shortLabel 5-Way Conservation
 # longLabel 5-Way Vertebrate Multiz Alignment & Conservation
 # group compGeno
 # priority 104
 # visibility pack
 # color 0, 10, 100
 # altColor 0,90,10
 # type wigMaf 0.0 1.0
 # maxHeightPixels 100:40:11
 # yLineOnOff Off
-# autoScaleDefault Off
+# autoScale Off
 # summary multiz5waySummary
 # speciesGroups vertebrate mammal
 # sGroup_mammal hg18 mm7
 # sGroup_vertebrate tetNig1 fr1
 
 # add this line to trackDb entry as above for the tree image (2006-02-07):
 # treeImage phylo/danRer3_5way.jpg
 
 # PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 5-WAY ALIGNMENT 
 # (DONE, 2006-02-06, hartera)
     ssh kkstore02
     mkdir /cluster/data/danRer3/bed/multiz5way/cons
     cd /cluster/data/danRer3/bed/multiz5way/cons
     # create a starting-tree.mod based on chr5 (73 Mb), the largest
     # chrom apart from chrNA and chrUn
     /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr5.maf \
         --refseq ../../../5/chr5.fa --in-format MAF \
         --windows 100000000,1000 --out-format SS \
         --between-blocks 5000 --out-root s1
     # takes about 30 seconds
     /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
         --tree "((danRer3,(tetNig1,fr1)),(mm7,hg18))" \
         --out-root starting-tree
     # took less than 1 minute
     rm s1.*ss
     # Get genome-wide average GC content (for all species together,
     # not just the reference genome).  If you have a globally
     # estimated tree model, as above, you can get this from the
     # BACKGROUND line in the .mod file.  E.g.,
 # ALPHABET: A C G T
 # ...
 # BACKGROUND: 0.307629 0.191708 0.192177 0.308486
     # add up the C and G:
     grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
     # 0.384 is the GC content. This is used in the -gc argument below.
     # If you do *not* have a global tree model and you do not know your
     # GC content, you can get it directly from the MAFs with a command
     # like:
     # /cluster/bin/phast/$MACHTYPE/msa_view \
     # --aggregate danRer3,tetNig1,fr1,mm7,hg18 -i MAF \ 
     # -S /cluster/data/danRer3/bed/multiz5way/maf/chr*.maf > maf_summary.txt
     # This gives a GC content of 0.438
     # break up the genome-wide MAFs into pieces on the san filesystem
     ssh kkstore02
     mkdir -p /san/sanvol1/scratch/danRer3/cons/ss
     cd /san/sanvol1/scratch/danRer3/cons/ss
     bash
     for C in `awk '{print $1}' /cluster/data/danRer3/chrom.sizes`
     do
       if [ -s /cluster/data/danRer3/bed/multiz5way/maf/${C}.maf ]; then
         mkdir ${C}
         echo msa_split $C
         chrN=${C/chr/}
         /cluster/bin/phast/$MACHTYPE/msa_split \
             /cluster/data/danRer3/bed/multiz5way/maf/${C}.maf \
             --refseq /cluster/data/danRer3/${chrN}/${C}.fa \
             --in-format MAF --windows 1000000,0 --between-blocks 5000 \
             --out-format SS -I 1000 --out-root ${C}/${C}
       fi
     done
     # took about 20 minutes to run
     # Create a random list of 50 x 1 Mb regions (do not use chrNA and chrUn)
 
     ls -1l chr*/chr*.ss | grep -v NA | grep -v Un | \
        awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
     
     # Set up parasol directory to calculate trees on these 50 regions
     ssh pk
     mkdir /san/sanvol1/scratch/danRer3/cons/treeRun1
     cd /san/sanvol1/scratch/danRer3/cons/treeRun1
     mkdir tree log
     
     # now set up cluster job to estimate model parameters.  Parameters
     # will be estimated separately for each alignment fragment and then
     # combined across fragments.  When tuning target-coverage and
     # expected-length, come back to here and recalculate.
     # Create little script that calls phastCons with right arguments
 cat > makeTree.csh << '_EOF_'
 #!/bin/csh -fe
 set C=$1:h
 mkdir -p log/${C} tree/${C}
 /cluster/bin/phast/x86_64/phastCons ../ss/$1 \
    /cluster/data/danRer3/bed/multiz5way/cons/starting-tree.mod \
    --gc 0.438 --nrates 1,1 --no-post-probs --ignore-missing \
    --expected-length 12 --target-coverage 0.17 \
    --quiet --log log/$1 --estimate-trees tree/$1
 '_EOF_'
     #   emacs happy
     chmod a+x makeTree.csh
 
     # Make sure that the correct GC content is substituted in here. Notice 
     # the target coverage of 0.17. Here we are aiming 
     # for 65% coverage of coding regions by conserved elements.
     # Create gensub file
     cat > template << '_EOF_'
 #LOOP
 makeTree.csh $(path1)
 #ENDLOOP
 '_EOF_'
     #   happy emacs
     # Make cluster job and run it
     gensub2 ../randomSs.list single template jobList
     para create jobList
     para try,check,push,check etc.
 # para time
 # Completed: 50 of 50 jobs
 # CPU time in finished jobs:        714s      11.90m     0.20h    0.01d  0.000 y
 # IO & Wait Time:                   132s       2.20m     0.04h    0.00d  0.000 y
 # Average job time:                  17s       0.28m     0.00h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:              26s       0.43m     0.01h    0.00d
 # Submission to last job:           353s       5.88m     0.10h    0.00d
 
     # Now combine parameter estimates.  We can average the .mod files
     # using phyloBoot.  This must be done separately for the conserved
     # and nonconserved models
     ssh kkstore02
     cd /san/sanvol1/scratch/danRer3/cons/treeRun1
     ls tree/chr*/*.cons.mod > cons.txt
     /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \
         --output-average ../ave.cons.mod > cons_summary.txt
     ls tree/chr*/*.noncons.mod > noncons.txt
     /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \
         --output-average ../ave.noncons.mod > noncons_summary.txt
     cd ..
     cp -p ave.*.mod /cluster/data/danRer3/bed/multiz5way/cons
     #   measuring entropy
     #   consEntropy <target coverage> <expected length> \
     #            ave.cons.mod ave.noncons.mod --NH 9.78
     #   it never finishes when given the --NH argument, so run without it
     # target entropy should be L_min*H=9.8 bits (between 9.5 and 10.5 is ok)
     # the expected length that produces this entropy is the one 
     # to use for phastCons.
     /cluster/bin/phast/$MACHTYPE/consEntropy 0.17 12 \
                         ave.cons.mod ave.noncons.mod
 
 # -target-coverage=0.17 -expected-lengths 12
 #Transition parameters:gamma=0.170000,omega=12.000000, mu=0.083333, nu=0.017068
 # Relative entropy: H=0.618383 bits/site
 # Expected min. length: L_min=17.978234 sites
 # Expected max. length: L_max=10.983828 sites
 # Phylogenetic information threshold: PIT=L_min*H=11.117434 bits
 
 # then the above steps from creating the treeRun directory onwards were
 # repeated with the target coverage and expected lengths parameters set as
 # below:
 
 # -target-coverage=0.25 -expected-lengths 12
 #Transition parameters:gamma=0.250000, omega=12.000000, mu=0.083333,nu=0.027778
 #Relative entropy: H=0.637721 bits/site
 #Expected min. length: L_min=15.535855 sites
 #Expected max. length: L_max=10.157133 sites
 #Phylogenetic information threshold: PIT=L_min*H=9.907536 bits
 
 #### !!! THESE PARAMETERS BELOW WERE THOSE THAT WERE FINALLY USED ####
 
 # Parameters used for danRer2 6-way conservation track:
 # -target-coverage=0.35 -expected-lengths 18
 #Transition parameters:gamma=0.350000,omega=18.000000, mu=0.055556, nu=0.029915
 # Relative entropy: H=0.592725 bits/site
 # Expected min. length: L_min=16.435656 sites
 # Expected max. length: L_max=12.564154 sites
 # Phylogenetic information threshold: PIT=L_min*H=9.741828 bits
 
 # need to iterate and get the right coverage and parameters
 # try running phastCons below with parameters used above and check the 
 # coverage of coding regions by the most conserved elements
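     # a sketch of that iteration, assuming ave.cons.mod/ave.noncons.mod
     # were re-estimated (via a fresh treeRun) for each parameter pair;
     # the pairs below are the ones tried in this build:
     bash
     for params in "0.17 12" "0.25 12" "0.35 18"
     do
         /cluster/bin/phast/x86_64/consEntropy $params \
             ave.cons.mod ave.noncons.mod | grep PIT
     done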
     # Create cluster dir to do main phastCons run
     ssh pk
     mkdir -p /san/sanvol1/scratch/danRer3/cons/consRun1
     cd /san/sanvol1/scratch/danRer3/cons/consRun1
     mkdir ppRaw bed
     cp -p /san/sanvol1/scratch/danRer3/cons/ave.*.mod .
     # Create script to run phastCons with right parameters
     #   This job is I/O intensive in its output files, thus it is all
     #   working over in /scratch/tmp/
     cat > doPhast.csh << '_EOF_'
 #!/bin/csh -fe
 mkdir /scratch/tmp/${2}
 cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2}
 pushd /scratch/tmp/${2} > /dev/null
 /cluster/bin/phast/x86_64/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
    --expected-length 18 --target-coverage 0.35 --quiet \
         --seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
 popd > /dev/null
 mkdir -p ppRaw/${1}
 mkdir -p bed/${1}
 mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
 mv /scratch/tmp/${2}/${2}.bed bed/${1}
 rm /scratch/tmp/${2}/ave.*.mod
 rm /scratch/tmp/${2}/${2}.ss
 rmdir /scratch/tmp/${2}
 '_EOF_'
     # emacs happy
     chmod a+x doPhast.csh
 
     #   root1 == chrom name, file1 == ss file name without .ss suffix
     # Create gsub file
 cat > template << '_EOF_'
 #LOOP
 doPhast.csh $(root1) $(file1)
 #ENDLOOP
 '_EOF_'
     #   happy emacs
 
     # Create parasol batch and run it
     ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
 
     gensub2 in.list single template jobList
     para create jobList
     para try/check/push/etc.
 
 # combine predictions and transform scores to be in 0-1000 interval
     ssh kkstore02
     cd /san/sanvol1/scratch/danRer3/cons/consRun1
 
     #   The sed's and the sort get the file names in chrom,start order
     find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
         | sort -k7,7 -k9,9n \
         | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
         | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
         | /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
     #   ~ 1 minute
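     #   e.g. a name of the form ./bed/chr1/chr1.3000001-4000000.bed becomes
     #   " y  x bed x chr1 x chr1 y 3000001 z 4000000 y bed", so field 7 is
     #   the chrom and field 9 is the numeric start; the inverse seds then
     #   restore the original file names for xargs cat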
     cp -p mostConserved.bed /cluster/data/danRer3/bed/multiz5way
 # Figure out how much is actually covered by the mostConserved data like so:
     cd /cluster/data/danRer3
     faSize */chr*.fa  
     # 1644032962 bases (48201758 N's 1595831204 real 816464533 upper 
     # 779366671 lower) in 28 sequences in 28 files
     # The non-N size is 1595831204 bases
     cd /cluster/data/danRer3/bed/multiz5way
     awk '{sum+=$3-$2}
 END{printf "%% %.2f = 100.0*%d/1595831204\n",100.0*sum/1595831204,sum}' \
         mostConserved.bed
     -target-coverage 0.35: % 3.06 = 100.0*48883581/1595831204 length=18
     
     ssh hgwdev
     cd /cluster/data/danRer3/bed/multiz5way
     # get an or of refGene and mgcGenes CDS regions 
     featureBits danRer3 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
     # 11338034 bases of 1630323462 (0.695%) in intersection
     featureBits danRer3 refSeqOrMgcCds.bed mostConserved.bed -enrichment
     # refSeqOrMgcCds.bed 0.695%, mostConserved.bed 2.998%, both 0.464%, 
     # cover 66.71%, enrich 22.25x 
     # so use this result for -target-coverage=0.35 -expected-lengths=18
     # with entropy (PIT) value of 9.74 (aiming for around 9.8) and 
     # 66.7% coverage of coding regions with most conserved elements 
     # (aiming for about 65%)
 
     # Load most conserved track into database
     ssh hgwdev
     cd /cluster/data/danRer3/bed/multiz5way
     hgLoadBed danRer3 phastConsElements mostConserved.bed
     # Loaded 552331 elements of size 5
     featureBits danRer3 mgcGenes:cds phastConsElements -enrichment
     # mgcGenes:cds 0.531%, phastConsElements 2.998%, both 0.363%, 
     # cover 68.39%, enrich 22.81x
     featureBits danRer3 refGene:cds phastConsElements -enrichment
     # refGene:cds 0.658%, phastConsElements 2.998%, both 0.440%, cover 66.82%,
     # enrich 22.28x
     # Create merged posterior probability file and wiggle track data files
     # the sed business gets the names sorted by chromName, chromStart
     # so that everything goes in numerical order into wigEncode
     ssh kkstore02
     cd /san/sanvol1/scratch/danRer3/cons/consRun1
     find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
         | sort -k7,7 -k9,9n \
         | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
         | wigEncode stdin phastCons5way.wig phastCons5way.wib
     # takes a few minutes
     ls -l phastCons*
     # -rw-rw-r--  1 hartera protein 198399845 Feb  6 16:05 phastCons5way.wib
     # -rw-rw-r--  1 hartera protein  45304940 Feb  6 16:05 phastCons5way.wig
     cp -p phastCons5way.wi? /cluster/data/danRer3/bed/multiz5way/cons
 
     # Load gbdb and database with wiggle.
     ssh hgwdev
     cd /cluster/data/danRer3/bed/multiz5way/cons
     mkdir -p /gbdb/danRer3/wib
     ln -s `pwd`/phastCons5way.wib /gbdb/danRer3/wib/phastCons5way.wib
     # use this if need to reload table
     hgsql -e 'drop table phastCons5way;' danRer3
     # load table
     hgLoadWiggle danRer3 phastCons5way phastCons5way.wig
 
     #  Create histogram to get an overview of all the data
     ssh hgwdev
     cd /cluster/data/danRer3/bed/multiz5way/cons
     bash
     time hgWiggle -doHistogram \
         -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
             -db=danRer3 phastCons5way > histogram.data 2>&1
 # real    2m33.069s
 # user    1m58.310s
 # sys     0m16.170s
 
         #   create plot of histogram:
     cat << '_EOF_' > histo.gp
 set terminal png small color \
         x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " Zebrafish danRer3 Histogram phastCons5 track"
 set xlabel " phastCons5 score"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
 set yrange [0:0.02]
 
 plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
      "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
 '_EOF_'
 
     #   happy emacs
     gnuplot histo.gp > histo.png
     display histo.png &
 
 # add line: wiggle phastCons5way to trackDb.ra for multiz5way to display the 
 # wiggle for the conservation track.
 # check all.joiner for entries for phastCons5way and phastConsElements5way -ok
 # copy over html for multiz and edit.
 
 # PHASTCONS SCORES DOWNLOADABLES (DONE, 2006-02-07, hartera)
     #   prepare compressed copy of ascii data values for downloads
     ssh kkstore02
     cd /san/sanvol1/scratch/danRer3/cons/consRun1
 cat << '_EOF_' > gzipAscii.sh
 #!/bin/sh
 
 TOP=`pwd`
 export TOP
 
 mkdir -p phastCons5Scores
 
 for D in ppRaw/chr*
 do
     C=${D/ppRaw\/}
     out=phastCons5Scores/${C}.data.gz
     echo "========================== ${C} ${D}"
     find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
         | sort -k7,7 -k9,9n \
         | sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
             gzip > ${out}
 done
 '_EOF_'
     chmod +x gzipAscii.sh
     time ./gzipAscii.sh
     # 192.852u 8.835s 4:04.05 82.6%   0+0k 0+0io 1pf+0w
     # creates 331 Mb of data.
     # copy data for downloads
     ssh kkstore02
     mkdir /cluster/data/danRer3/bed/multiz5way/phastCons5wayScores
     cd /cluster/data/danRer3/bed/multiz5way/phastCons5wayScores
     rsync -a --progress \
         pk:/san/sanvol1/scratch/danRer3/cons/consRun1/phastCons5Scores/ .
 
     ssh hgwdev
     mkdir /usr/local/apache/htdocs/goldenPath/danRer3/phastCons5wayScores
     cd /usr/local/apache/htdocs/goldenPath/danRer3/phastCons5wayScores
     ln -s /cluster/data/danRer3/bed/multiz5way/phastCons5wayScores/*.gz .
     md5sum *.gz > md5sum.txt
     # copy over and edit README.txt from the hg17 phastCons.
 
 # MULTIZ 5-WAY DOWNLOADABLES (DONE, 2006-02-22, hartera)
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath/danRer3
     mkdir -p multiz5way
     cd multiz5way
     foreach f (/cluster/data/danRer3/bed/multiz5way/maf/*.maf)
         set c = $f:r:t
         echo $c
         nice gzip $f
         ln -s $f.gz .
     end
     md5sum *.gz > md5sum.txt
     # copy over README and edit for this 5-way multiple alignment
 
 ##################################################################
 # HGNEAR TABLES (also used by the Known Genes details page links)
 # GET LATEST PROTEIN SEQUENCE FOR ALL HGNEAR SPECIES (DONE, 2006-02-10, hartera)
     # For species with knownGene, use that; otherwise, download the latest 
     # version of the main model organism database for this species.
     # Human: use knownGene proteins.
 # need to get hg18 peptide sequence:
      mkdir -p /cluster/data/hg18/bed/blastp
      cd /cluster/data/hg18/bed/blastp
      pepPredToFa hg18 knownGenePep known.faa
 # Mouse: use knownGene proteins.
 # already done:
 #    mkdir -p  /cluster/data/mm7/bed/geneSorter/blastp
 #    cd /cluster/data/mm7/bed/geneSorter/blastp
 #    pepPredToFa mm7 knownGenePep known.faa
     # Rat: use knownGene proteins.
 # already done:
 #    mkdir /cluster/data/rn3/bed/blastp
 #    cd /cluster/data/rn3/bed/blastp
 #    pepPredToFa rn3 knownGenePep known.faa
     # Fly: use FlyBase proteins - already done 
     # /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
     # Worm: use WormBase proteins.
     mkdir -p /cluster/data/ce2/bed/blastp
     cd /cluster/data/ce2/bed/blastp
     # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
     # to find out the latest version. It is WormPep 154 so use that.
     wget --timestamping -O wormPep154.faa \
        ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep154/wormpep154
     # Yeast: use SGD proteins.
     mkdir -p /cluster/data/sacCer1/bed/blastp
     cd /cluster/data/sacCer1/bed/blastp
     # get latest version - from Jan 26, 2006
     wget -O orf_trans.fasta.jan26.gz \
          ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
     # rename old version of peptide sequences
     mv sgdPep.faa sgdPep.jan9.faa
     zcat orf_trans.fasta.jan26.gz > sgdPep.faa
 
 # HGNEAR PROTEIN BLAST TABLES (DONE, 2006-02-10, hartera)
 # RENAME SELF BLASTP TABLE AND CHANGE CONFIG.RA FILE (DONE, 2006-04-19, hartera)
 # NOTE: mmBlastTab was updated to mm8 as a result of running doHgNearBlastp.pl
 # for mm8 on 2006-03-13 (see makeMm8.doc).
 # RECREATE THE HGNEAR TABLES FOR RAT AND MOUSE TO UPDATE THEM 
 # (DONE, 2006-05-31, hartera)
 # RE-MADE THE ZEBRAFISH BLASTP TABLES USING THE TRANSCRIPT ID INSTEAD OF THE
 # PEPTIDE ID FOR EACH SEQUENCE - FOR ALL OTHER SPECIES THE PEPTIDE SEQUENCES
 # ARE REPRESENTED BY THEIR KNOWN GENES TRANSCRIPT ID
 # (DONE, 2006-07-03, hartera)
 # CHANGED INDEX ON ensZfishBlastTab (DONE, 2006-11-03, hartera)
     ssh hgwdev
     mkdir -p /cluster/data/danRer3/bed/hgNearBlastp
     cd /cluster/data/danRer3/bed/hgNearBlastp
     
     # zebrafish vs fly table has already been created as a result of 
     # creating the blastp table for dm2 (see makeDm2.doc)
 
 cat << _EOF_ > config.ra
 # Latest zebrafish vs. other Gene Sorter orgs:
 # human, mouse, rat, worm, yeast
 # zebrafish vs fly already done (dm2)
 
 targetGenesetPrefix ensZfish
 targetDb danRer3
 queryDbs hg18 mm7 rn3 ce2 sacCer1
 
 danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
 hg18Fa /cluster/data/hg18/bed/blastp/known.faa
 mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa
 rn3Fa /cluster/data/rn3/bed/blastp/known.faa
 ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
 sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
 
 buildDir /cluster/data/danRer3/bed/hgNearBlastp
 scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp
 _EOF_
      # << this line makes emacs coloring happy
     nice doHgNearBlastp.pl config.ra >& do.log &
     tail -f do.log
     # Took about 2 hours to finish.
     # The target geneset (self Blastp) should be prefixed with ensZfish
     # so change the config.ra and rename the table (2006-04-19, hartera)
     hgsql -e 'alter table flyBaseBlastTab rename ensZfishBlastTab;' danRer3
     # Update mouse to mm8 and rat to rn4
     mkdir updates
     cd updates
     hgsql -e 'drop table mmBlastTab;' danRer3
     hgsql -e 'drop table rnBlastTab;' danRer3
 
 cat << _EOF_ > config.ra
 # Update of zebrafish vs. other Gene Sorter orgs:
 # mouse mm8 and rat rn4
 targetGenesetPrefix ensZfish
 targetDb danRer3
 queryDbs mm8 rn4
 
 danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
 mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
 rn4Fa /cluster/data/rn4/bed/blastp/known.faa
 buildDir /cluster/data/danRer3/bed/hgNearBlastp/updates
 scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp/updates
 _EOF_
      # << this line makes emacs coloring happy
     nice doHgNearBlastp.pl config.ra >& do.log &
     tail -f do.log
     # Took about 25 minutes.
 
     # Need to remake all the BlastTab tables using the transcript Id instead
     # of the protein ID for zebrafish Ensembl Genes.
     # create ensZfishBlastTab and drBlastTab tables using the Ensembl 
     # transcript Ids for the tables instead of the peptide Ids
     # (2006-07-03, hartera)
     ssh hgwdev
     # create the FASTA file of Ensembl peptide sequences with transcript IDs;
     # there is a one-to-one relationship between these IDs.
     cd /cluster/data/danRer3/bed/blastp
     # then create a fasta file of the sequences:
     pepPredToFa danRer3 ensPep ensPep.faa
     mkdir /cluster/data/danRer3/bed/hgNearBlastp/updates2 
     cd /cluster/data/danRer3/bed/hgNearBlastp/updates2 
 cat << _EOF_ > config.ra
 # Latest zebrafish vs. other Gene Sorter orgs:
 # human, mouse, rat, fly, worm, yeast
 
 targetGenesetPrefix ensZfish
 targetDb danRer3
 queryDbs hg18 mm8 rn4 dm2 ce2 sacCer1
 
 danRer3Fa /cluster/data/danRer3/bed/blastp/ensPep.faa
 hg18Fa /cluster/data/hg18/bed/blastp/known.faa
 mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
 rn4Fa /cluster/data/rn4/bed/blastp/known.faa
 dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
 ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
 sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
 
 buildDir /cluster/data/danRer3/bed/hgNearBlastp/updates2
 scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp/updates2
 _EOF_
      # << this line makes emacs coloring happy
     nice doHgNearBlastp.pl config.ra >& do.log &
     tail -f do.log
     # Took about 45 minutes
     # update sacCer1 otherOrgs.ra to use danRer3 instead of danRer1 
     # for drBlastTab.
     
 # also need to update:
 # dm1, hg{15,16,17}, mm{5,6,7}, rn{2,3}
 # Human (hg15 and hg16), Drosophila, mouse mm5 and rat all use danRer1.
 # Human hg17 and mouse mm6 and mm7 uses danRer2.
   # Update these all to use the Zv5 (danRer3) Ensembl proteins.
   # Ensembl 38 (April 2006)
   ssh hgwdev
   cd /cluster/data/danRer3/bed/hgNearBlastp/updates2
 cat << _EOF_ > config2.ra
 # Latest zebrafish vs. other Gene Sorter orgs:
 # human, mouse, rat, fly - older databases
 
 targetGenesetPrefix ensZfish
 targetDb danRer3
 queryDbs hg17 hg16 hg15 mm7 mm6 mm5 rn3 rn2 dm1
 
 danRer3Fa /cluster/data/danRer3/bed/blastp/ensPep.faa
 hg17Fa /cluster/data/hg17/bed/blastp/known.faa
 hg16Fa /cluster/data/hg16/bed/blastp/known.faa
 hg15Fa /cluster/data/hg15/bed/blastp/known.faa
 mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa
 mm6Fa /cluster/data/mm6/bed/geneSorter/blastp/known.faa
 mm5Fa /cluster/data/mm5/bed/geneSorter/blastp/known.faa
 rn3Fa /cluster/data/rn3/bed/blastp/known.faa
 rn2Fa /cluster/data/rn2/bed/blastp/known.faa
 dm1Fa /cluster/data/dm1/bed/blastp/bdgp.faa
 
 buildDir /cluster/data/danRer3/bed/hgNearBlastp/updates2
 scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp/updates2
 _EOF_
      # << this line makes emacs coloring happy
     # create BlastTab tables for all queries vs target and no self blastp
     nice doHgNearBlastp.pl config2.ra -noSelf -queryOnly >& do2.log &
     tail -f do2.log
     # Took about 30 minutes
     # Update and commit hgGeneData and hgNearData files to make sure that 
     # all queries and links now work for the transcript ID instead of 
     # peptide ID for ensZfishBlastTab and drBlastTab tables.
 
     # Gene Sorter is very slow for danRer3. ensZfishBlastTab has an index
     # on both the query and target. All the other BlatTab tables have only
     # an index on the query so try dropping the index on the target.
     hgsql -e 'alter table ensZfishBlastTab drop index target;' danRer3
     # Gene Sorter still loads slowly.
     # The index is too short: hgLoadBlastTab, used to load the table, creates
     # the index as query(12), and the first 12 characters are not unique for
     # the Ensembl IDs, so extend the index to query(20).
     hgsql -e 'alter table ensZfishBlastTab drop index query;' danRer3
     hgsql -e 'create index query on ensZfishBlastTab (query(20));' danRer3
     # Much faster now.
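     # one way to confirm the longer prefix index is in use -- the "key"
     # column of the EXPLAIN output should name the query index:
     hgsql danRer3 -e 'explain select * from ensZfishBlastTab where query = "ENSDART00000012253";'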
 
 # END OF HGNEAR STUFF
 ####################################################
 # GENE SET BASED ON ENSEMBL GENES (PROTEIN CODING GENES) 
 # (in progress, 2005-11-23, hartera)
     # see ENSEMBL GENES section for documentation of creation of
     # the ensGene, ensGtp and ensPep tables and the track.
     # compare the Ensembl and Human Proteins tracks
     featureBits danRer3 refGene:cds ensGene:cds -enrichment
 # refGene:cds 0.658%, ensGene:cds 1.994%, both 0.589%, cover 89.60%, 
 # enrich 44.94x
     featureBits danRer3 refGene:cds blastHg17KG -enrichment
 # refGene:cds 0.658%, blastHg17KG 1.292%, both 0.385%, cover 58.52%, 
 # enrich 45.30x
     # little difference in enrichment and less coverage for Human Proteins so
     # it seems like Ensembl is the best choice in terms of genome coverage
     # and intersection with RefSeq CDS regions.
     ssh kkstore02
     mkdir -p /cluster/data/danRer3/bed/ensGenes
     cd /cluster/data/danRer3/bed/ensGenes
     # use Ensembl's BioMart to download the Ensembl Genes UniProt IDs and
     # descriptions. For genes with no description, use the InterPro domain.
     # Go to http://www.ensembl.org/Multi/martview
     # Follow this sequence through the pages: 
     # Page 1) Select the Ensembl dataset (now v38 here; v36 and v37 are the
     # same for Zv5 Danio rerio protein-coding genes) and the Danio_rerio 
     # choice (ZFISH5 here). 
     # Hit next. 25541 entries total.
     # Ensembl 37 from Feb 2006 - this dataset is the same as for the 
     # version 32 downloaded as above for the Ensembl Genes track.
     # (Checked on 2006-03-09, hartera)
     # Ensembl 38 from April 2006 - this dataset is the same as for the 
     # version 32 downloaded as above for the Ensembl Genes track.
     # (Checked on 2006-05-31, hartera)
     # Page 2) In the GENE section, select Gene type as protein_coding. 
     # Then hit next. There are now 22877 entries in this filtered version.
     # Page 3) Choose the "Features" Attribute Page from the pulldown menu
     # at the top. Make sure that under the GENE section, the Ensembl 
     # Attributes checked are the Ensembl Transcript ID, External Gene ID and the
     # Description. Under External References, select Unified UniProt 
     # accession, and ZFIN Primary ID. Under the Protein section, select 
     # InterPro Description and InterPro ID under InterPro 
     # Attributes. Select text, tab-separated for output. Choose gzip 
     # compression. Hit export. Save as ensGeneInfo37Coding.tsv.gz. Same as for
     # Ensembl v36 so update to Ensembl v37. Ensembl v38 is the same too
     # so update to this version (2006-05-31, hartera). Also add External Gene
     # ID for the Ensembl Attributes.
     gunzip ensGeneInfo38Coding.txt.gz
     # this file has some errors in it - there is a newline character in the
     # middle of the descriptions for the genes with the following UniProt 
     # IDs: Q5TYV0, Q5SPG7, Q5SPG5, Q5RIJ2, Q5RID3. This causes the table
     # to be loaded incorrectly. Edit the ensGeneInfo38Coding.txt file manually
     # to remove these extra newlines.
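     # a quick way to spot such broken records, assuming the 7 tab-separated
     # fields per line that the ens38Zfish table definition below expects --
     # lines with fewer fields are suspect fragments:
     awk -F'\t' 'NF < 7 {print NR": "NF" fields"}' ensGeneInfo38Coding.txt \
         | head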
 
     # Repeat above steps and get the Ensembl transcript ID from Ensembl 
     # Attributes and then get EntrezGene ID, RefSeq DNA ID, and RefSeq 
     # Peptide ID and from the External References section. Select text, 
     # tab-separated for output. Choose gzip compression. Hit export. Again 
     # Ensembl v36 gives the same result for Danio rerio. 
     # Save as ensGeneInfo38Coding2.txt.gz
     cd /cluster/data/danRer3/bed/ensGenes
     gunzip ensGeneInfo38Coding2.txt.gz  
     wc -l ensGeneInfo38*
     # 85607 ensGeneInfo38Coding.txt
     # 32457 ensGeneInfo38Coding2.txt
 
     # 85607 ensGeneInfo37Coding.tsv
     # 33233 ensGeneInfo37Coding2.tsv
 
     # find how many Transcripts have multiple SWISS-PROT IDs
     tail +2 ensGeneInfo38Coding.txt | awk 'BEGIN {FS="\t"; OFS="\t"} \
          {print $1, $2, $4}' > ensGene38UniProtandExtId.txt
     tail +2 ensGeneInfo38Coding.txt | awk 'BEGIN {FS="\t"; OFS="\t"} \
          {if ($2 != "") print $1, $4}' \
          > ensGene38UniProt.txt
     sort ensGene38UniProt.txt | uniq > ensGene38UniProt.txt.uniq
     awk '{print $1}' ensGene38UniProt.txt.uniq | sort | uniq -c | sort -nr \
         > ens38UniProt.count
     awk '{if ($1 > 1) print $2}' ens38UniProt.count \
         > ens38UniProtMorethanOne.txt    
     wc -l ens38UniProtMorethanOne.txt
     # 2257 ens38UniProtMorethanOne.txt
     awk '{if ($1 == 1) print $2}' ens38UniProt.count \
         > ens38UniProtOnlyOne.txt    
     wc -l ens38UniProtOnlyOne.txt
     # 8172
     # get list of Ensembl transcripts with more than 1 UniProt ID and
     # the list of UniProt IDs.
     grep -f ens38UniProtMorethanOne.txt ensGene38UniProt.txt.uniq \
             > ens38UniProtMorethanOne.uniProtIds
     # get list of Ensembl transcripts with more than 1 UniProt ID and
     # the list of UniProt IDs and external database IDs.
     sort ensGene38UniProtandExtId.txt | uniq \
          > ensGene38UniProtandExtId.txt.uniq
     grep -f ens38UniProtMorethanOne.txt ensGene38UniProtandExtId.txt.uniq \
          > ens38UniProtMorethanOne.uniProtandExtIds
      
     # to do blastp of Ensembl Proteins vs UniProt 
     # (last uniProt update 2006-01-23):
     ssh hgwdev
     mkdir -p /cluster/data/danRer3/bed/ensGenes/blastDb
     cd /cluster/data/danRer3/bed/ensGenes/blastDb
     # create a table of Danio Rerio (Brachydanio rerio in UniProt)
     # SWISS-PROT sequences (2006-05-31)
     hgsql uniProt -e ' \
       create table test.danioProt select protein.* from protein,accToTaxon \
       where accToTaxon.taxon = 7955 and accToTaxon.acc = protein.acc;'
     # then create a fasta file of the sequences:
     pepPredToFa test danioProt danioUniProt.fa
     grep '>' danioUniProt.fa | wc -l
     # 14297
     # then select just those UniProt IDs for the Ensembl Transcript IDs that
     # have multiple UniProt IDs associated with them.
     ssh kkstore02
     cd /cluster/data/danRer3/bed/ensGenes/blastDb
     # get list of UniProt IDs
     awk '{print $2}' ../ens38UniProtMorethanOne.uniProtIds \
         > ens38MultiUniProtIds.idsOnly
     sort ens38MultiUniProtIds.idsOnly | uniq \
          > ens38MultiUniProtIds.idsOnly.uniq
     faSomeRecords danioUniProt.fa ens38MultiUniProtIds.idsOnly.uniq \
           ens38DanioUniProt.fa 
     # 4410 UniProt IDs but 4293 in the FASTA file so 117 are missing.
     grep '>' ens38DanioUniProt.fa | sort > uniProtSeq.ids
     perl -pi.bak -e 's/>//' uniProtSeq.ids
     comm -13 uniProtSeq.ids ens38MultiUniProtIds.idsOnly.uniq > uniProtMissing
     # these sequences are missing because their UniProt IDs are
     # secondary IDs. Find the primary ID.
     hgsql -N -e 'select o.acc, o.val from otherAcc as o, accToTaxon as a \
       where o.acc = a.acc and a.taxon = 7955;' uniProt > otherAccs.zfish.txt
     wc -l otherAccs.zfish.txt
     # 321 otherAccs.zfish.txt
     grep -f uniProtMissing otherAccs.zfish.txt > uniProtMissing.otherAccs.txt  
     # found 83 of them
     awk '{print $2}' uniProtMissing.otherAccs.txt | sort | uniq > otherAccsFound
     comm -13 otherAccsFound uniProtMissing > stillMissing
     # check list of deleted TrEMBL IDs - delac_tr.txt from Expasy site.
     sort delac_tr.txt > delac_tr.sort
     sort stillMissing > stillMissing.sort
     comm -12 delac_tr.sort stillMissing.sort | wc
     # 34. There are 34 in the stillMissing file and these are all in the
     # delac_tr.txt file.
 #This file lists the accession numbers of TrEMBL entries which have
 #been deleted from the database. Most deletions are due to the deletion of
 #the corresponding CDS in the source nucleotide sequence databases EMBL-
 #Bank/DDBJ/GenBank. In addition, some entries are recognised to be Open
 #Reading frames (ORFs) that have been wrongly predicted to code for
 #proteins. When there is enough evidence that these hypothetical proteins
 #are not real, we take the decision to remove them from TrEMBL.
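     # note on the comm idiom used heavily below (inputs must be sorted):
     # comm -12 a b prints lines common to both files, comm -13 prints
     # lines only in b, and comm -23 prints lines only in a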
 
     # Get the sequences for otherAccsFound from danioUniProt.fa
     awk '{print $1}' uniProtMissing.otherAccs.txt | sort | uniq \
         > otherAccsFound.altAccs
     faSomeRecords danioUniProt.fa otherAccsFound.altAccs ens38DanioOtherAccs.fa
     grep '>' ens38DanioOtherAccs.fa | wc
     # 73
     wc -l otherAccsFound.altAccs
     # 73 otherAccsFound.altAccs
     cat ens38DanioUniProt.fa ens38DanioOtherAccs.fa > ens38DanioAllUniProt.fa
     # create blastDb database
     ssh pk
     cd /cluster/data/danRer3/bed/ensGenes/blastDb
     mkdir format
     cd format
     mv ../ens38DanioAllUniProt.fa .
     /scratch/blast/formatdb -i ens38DanioAllUniProt.fa \
             -t ensUniProt -n ensUniProt
     # Copy database over to the san
     mkdir -p /san/sanvol1/scratch/danRer3/ensGenes/blastDb
     cp ensUniProt* /san/sanvol1/scratch/danRer3/ensGenes/blastDb/
     ssh hgwdev
     mkdir /cluster/data/danRer3/bed/ensGenes/blastp
     cd /cluster/data/danRer3/bed/ensGenes/blastp
     # get FASTA file of Ensembl sequences
     
     pepPredToFa danRer3 ensPep ensPep.fa
     # get list of Ensembl transcripts to use in Blastp
     cp ../blastDb/stillMissing .
     # need to remove the missing ones (those no longer in TrEMBL) from list
     grep -v -f stillMissing ../ens38UniProtMorethanOne.uniProtIds \
             > ens38UniProt.uniProtIdsforBlastp
     # get final list of Ensembl Transcript Ids
     awk '{print $1}' ens38UniProt.uniProtIdsforBlastp | sort | uniq \
             > ens38IdsOnlyForBlastp.txt
     wc -l ens38IdsOnlyForBlastp.txt
     # 2252 ens38IdsOnlyForBlastp.txt
     # grab the protein sequences just for these Ensembl Transcripts:
     faSomeRecords ensPep.fa ens38IdsOnlyForBlastp.txt ens38ForBlastp.fa
     # check that there are 2252 records
 
     # set up the Blastp run
     ssh pk
     cd /cluster/data/danRer3/bed/ensGenes/blastp
     # split Ensembl peptide sequences FASTA file into chunks for cluster
     mkdir split
     faSplit sequence ens38ForBlastp.fa 200 split/ens38
     # make parasol run directory
     mkdir run
     cd run 
     mkdir out
     # Make blast script
 cat  << '_EOF_' > blastSome
 #!/bin/csh -ef
 setenv BLASTMAT /san/sanvol1/scratch/blast64/blast-2.2.11/data 
 /san/sanvol1/scratch/blast64/blast-2.2.11/bin/blastall \
     -p blastp -d /san/sanvol1/scratch/danRer3/ensGenes/blastDb/ensUniProt \
     -i $1 -o $2 -e 0.01 -m 8 -b 1000
 '_EOF_'
     # << keep emacs happy
     chmod +x blastSome
     # Make gensub2 file
 cat  << '_EOF_' > gsub
 #LOOP
 blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
 #ENDLOOP
 '_EOF_'
     # << keep emacs happy
     # Create parasol batch
     echo ../split/*fa | wordLine stdin > split.lst
     gensub2 split.lst single gsub jobList
     para create jobList
     para try, check, push, check ... etc.
 # Completed: 190 of 190 jobs
 # CPU time in finished jobs:        279s       4.65m     0.08h    0.00d  0.000 y
 # IO & Wait Time:                  2293s      38.22m     0.64h    0.03d  0.000 y
 # Average job time:                  14s       0.23m     0.00h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:              30s       0.50m     0.01h    0.00d
 # Submission to last job:            37s       0.62m     0.01h    0.00d
     # Load these into a temporary database table. hgLoadBlastTab
     # picks the best hit for each of the queries (Ensembl peptide).
     ssh hgwdev
     cd /cluster/data/danRer3/bed/ensGenes/blastp/run/out
     time hgLoadBlastTab -maxPer=1 test ensUniProtBlastTab *.tab
     # 0.154u 0.008s 0:00.66 22.7%     0+0k 0+0io 0pf+0w
     # there were 2252 queries
 # BLASTP OF ALL ENS PEP VS ALL DANIO UNIPROT SEQS
     # Try doing Blastp again but this time using all the zebrafish UniProt
     # sequences as the database and all the Ensembl peptides as queries.
     # create blastDb database
     ssh pk
     cd /cluster/data/danRer3/bed/ensGenes/blastDb
     mkdir zfishUniProt
     cd zfishUniProt
     cp ../danioUniProt.fa .
     /san/sanvol1/scratch/blast64/blast-2.2.11/bin/formatdb \
         -i danioUniProt.fa -t danioUniProt -n danioUniProt
     # Copy database over to the san
     mkdir -p /san/sanvol1/scratch/danRer3/ensGenes/blastDb/uniProt
     cp danioUniProt* /san/sanvol1/scratch/danRer3/ensGenes/blastDb/uniProt
     # split Ensembl peptide sequences FASTA file into chunks for cluster
     cd /cluster/data/danRer3/bed/ensGenes/blastp
     mkdir splitAll
     grep '>' ensPep.fa | wc -l
     # 32143
     faSplit sequence ensPep.fa 8000 splitAll/ens38All
     # make parasol run directory
     mkdir runAll
     cd runAll
     mkdir out
     # Make blast script
 cat  << '_EOF_' > blastSome
 #!/bin/csh -ef
 setenv BLASTMAT /san/sanvol1/scratch/blast64/blast-2.2.11/data 
 /san/sanvol1/scratch/blast64/blast-2.2.11/bin/blastall \
     -p blastp \
     -d /san/sanvol1/scratch/danRer3/ensGenes/blastDb/uniProt/danioUniProt \
     -i $1 -o $2 -e 0.01 -m 8 -b 1000
 '_EOF_'
     # << keep emacs happy
     chmod +x blastSome
     # Make gensub2 file
 cat  << '_EOF_' > gsub
 #LOOP
 blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
 #ENDLOOP
 '_EOF_'
     # << keep emacs happy
     # Create parasol batch
     echo ../splitAll/*fa | wordLine stdin > split.lst
     gensub2 split.lst single gsub jobList
     para create jobList
     para try, check, push, check ... etc.
     para time
 #Completed: 7609 of 7609 jobs
 #CPU time in finished jobs:      11414s     190.23m     3.17h    0.13d  0.000 y
 #IO & Wait Time:                401489s    6691.48m   111.52h    4.65d  0.013 y
 #Average job time:                  54s       0.90m     0.02h    0.00d
 #Longest running job:                0s       0.00m     0.00h    0.00d
 #Longest finished job:              77s       1.28m     0.02h    0.00d
 #Submission to last job:          1096s      18.27m     0.30h    0.01d
     # Load these into a temporary database table. hgLoadBlastTab
     # picks the best hit for each of the queries (Ensembl peptide).
     ssh hgwdev
     cd /cluster/data/danRer3/bed/ensGenes/blastp/runAll/out
     # cat files together as argument list too long for hgLoadBlastTab
     foreach t (*.tab)
        cat $t >> ensAll.tab
     end
     time hgLoadBlastTab -maxPer=1 test ensUniProtAllBlastTab ensAll.tab
     # 4.168u 0.737s 0:06.03 81.0%     0+0k 0+0io 5pf+0w   
     # filter these and select just those with identity >= 95%
     # and eValue <= 0.00001
     hgsql -N -e 'select distinct(target) from ensUniProtAllBlastTab where \
          identity >= 95 and eValue <= 0.00001;' test | sort > out
     # get 11910 UniProt IDs mapping to Ensembl transcripts
     # there are 11343 unique UniProt IDs in ensGeneInfo38Coding.txt
     # load the ensGeneInfo38Coding.txt file into a table
 
 cat << 'EOF' > ens38Zfish.sql
 CREATE TABLE ens38Zfish (
     transcriptId varchar(255) not null,     
     extDbId varchar(255) not null,     
     description longblob not null, 
     uniProt varchar(255) not null,
     zfinId varchar(255) not null,
     interProDesc longblob not null,
     interProId varchar(255) not null
 );
 'EOF'
     # << emacs
     chmod a+r ensGeneInfo38Coding*
     tail +2 ensGeneInfo38Coding.txt > ens38Coding.tab
     hgLoadSqlTab test ens38Zfish ens38Zfish.sql ens38Coding.tab
     hgsql -N -e 'select distinct(uniProt) from ens38Zfish;' test \
          | sort > ens38Zfish.uniProt.uniq
     wc -l ens38Zfish.uniProt.uniq out 
     # 11344 ens38Zfish.uniProt.uniq
     # 9208 out
     comm -12 ens38Zfish.uniProt.uniq out | wc
     # 8526 in common
     comm -13 ens38Zfish.uniProt.uniq out > fromBlastPOnly
     comm -23 ens38Zfish.uniProt.uniq out > fromEns38Only
     wc -l from*
     # 682 fromBlastPOnly
     # 2817 fromEns38Only
     # find out how many from fromEns38Only are on the list of deleted from
     # TrEMBL IDs
     comm -12 fromEns38Only ./blastDb/delac_tr.sort > deletedFromTrEMBL
     comm -13 deletedFromTrEMBL fromEns38Only > fromEns38Only2
     # get list of transcripts matched to a UniProt by blastP that
     # are not in ens38Zfish
     hgsql -N -e 'select distinct(transcriptId) from ens38Zfish where uniProt = "";' test | sort > ens38Zfish.noUniProt
     hgsql -N -e 'select distinct(query) from ensUniProtAllBlastTab where \
          identity >= 95 and eValue <= 0.00001;' test | sort > queryBlast.sort
     comm -12 queryBlast.sort ens38Zfish.noUniProt | wc -l
     # 1967
     # (the other 9943 of the 11910 transcripts with Blastp hits do have a
     # UniProt ID in Ensembl 38)
     # delac_sp.txt in ./blastDb - list of deleted SWISS-PROT IDs
     # as of May 30, 2006. 331 IDs.
     sort blastDb/delac_sp.txt > blastDb/delac_sp.sort 
     # compare to list of SP IDs that are not in Blastp hits
     comm -12 blastDb/delac_sp.sort fromEns38Only2 
     # there are none in common
     # get list of Danio rerio UniProt IDs
     hgsql -N -e 'select distinct(acc) from danioProt;' test | sort \
           > danioProt.accs.uniq
     comm -13 danioProt.accs.uniq fromEns38Only2
     comm -12 danioProt.accs.uniq fromEns38Only2 > inuniProtAndfromEns38Only
     hgsql -e 'create table test.ensBlastp select * from ensUniProtAllBlastTab where identity >= 95 and eValue <= 0.00001;' test 
    ## wc -l in*Only
     # 1967 inBlastpOnly
     # 278 inEns38Only
     # these are transcript IDs
     # find the UniProt IDs for the 278 inEns38Only
     cd test6/tmp
     hgsql -N -e 'select distinct(query) from ensBlastp;' test | sort \
           > ensBlastp.tId.sort
     hgsql -N -e 'select distinct(transcriptId) from ens38Zfish where uniProt = "";' test | sort > ens38ZfishwithUniProt.tId.sort
     comm -13 ensBlastp.tId.sort ens38ZfishwithUniProt.tId.sort > inEns38Only
     comm -23 ensBlastp.tId.sort ens38ZfishwithUniProt.tId.sort > inBlastpOnly
     wc -l in*Only
     # 9943 inBlastpOnly
     # 19955 inEns38Only
     wc -l *.sort
     # 32143 ens38Zfish.tId.sort
     # 11910 ensBlastp.tId.sort
     # So there are 9943 that have Blastp hits assigned and 19955 in
     # Ensembl 38 that do not have Blastp hits
     # find those with no description and also no UniProt ID.
     # there are 21236 of these, the same number as have no description
     hgsql -N -e 'select distinct(transcriptId) from ens38Zfish where description = "" and uniProt = "";' test | sort > ens38ZfishNoDesc.tid.sort
     
     # 21236 ens38ZfishNoDesc.tid.sort
     # compare this to the set of transcript IDs in Ensembl 38 Only 
     # and for Blastp Only
     comm -12 inEns38Only ens38ZfishNoDesc.tid.sort > noBlastHitNoDesc
     comm -12 inBlastpOnly ens38ZfishNoDesc.tid.sort > blastHitNoDesc
     wc -l *NoDesc
     # 0 blastHitNoDesc
     # 19712 noBlastHitNoDesc
     # then get list of transcript IDs with no description in Ensembl 38 but
     # do have a Blastp hit
     comm -13 inEns38Only ens38ZfishNoDesc.tid.sort > blastpHitNoDesc.tid 
     wc -l blastpHitNoDesc.tid
     # 1524 blastpHitNoDesc.tid
     # These are sequences with a Blastp hit but no description
     hgsql -N -e 'select distinct(target) from ensBlastp;' test \
           | sort > blastp.uniProt.sort
     hgsql -N -e 'select distinct(uniProt) from ens38Zfish;' test \
           | sort > ens38.uniProt.sort
     wc -l *uniProt.sort
     # 9208 blastp.uniProt.sort
     # 11344 ens38.uniProt.sort
     # there are 8526 in common
     comm -13 blastp.uniProt.sort ens38.uniProt.sort > ens38Only.uniProt
     comm -23 blastp.uniProt.sort ens38.uniProt.sort > blastpOnly.uniProt
     wc -l *.uniProt
     # 682 blastpOnly.uniProt
     # 2817 ens38Only.uniProt
     # there are 80 in the ens38Only.uniProt list that are deleted from TrEMBL
     # there are 3 in the blastpOnly.uniProt list that are deleted from TrEMBL
     # Q503U2
     # Q7SY13
     # Q8AW80
     # Remove these from each list:
   comm -23 ens38Only.uniProt ../../blastDb/delac_tr.sort > ens38Only.uniProt2
   comm -23 blastpOnly.uniProt ../../blastDb/delac_tr.sort > blastpOnly.uniProt2
     # some of these will be ones where there were several SWISS-PROT IDs for
     # each transcript ID and only one is chosen so the others are dropped.
     # find how many of these ens38Only.uniProt2 are not in danioProt.accs.uniq
     comm -13 ../../danioProt.accs.uniq ens38Only.uniProt2 \
          > ens38Only.uniProt.notinDanioProt
     # there are 88 of these.
     # find list of zebrafish accs with alternative accs in uniProt
     hgsql -N -e 'select val from otherAcc as a, accToTaxon as t where a.acc = t.acc and taxon = 7955;' uniProt | sort | uniq > zfishVals.otherAccs.uniq
     comm -12 ens38Only.uniProt.notinDanioProt zfishVals.otherAccs.uniq | wc -l
     # 88, so all of these have alternate accessions.
     # remove these from list so:
     comm -13 ens38Only.uniProt.notinDanioProt ens38Only.uniProt2 \
              > ens38Only.uniProt3
     wc -l ens38Only.uniProt3
     # 2649 ens38Only.uniProt3
     # find number of uniProt IDs belonging to transcript IDs that have multiple 
     # uniProt IDs: ../../blastDb/ens38MultiUniProtIds.idsOnly.uniq is list of 
     # uniProt IDs for such transcripts.
   comm -12 ens38Only.uniProt3 ../../blastDb/ens38MultiUniProtIds.idsOnly.uniq \
     > ens38Only.multiUniProtIds
     # there are 2310 of these.
     comm -13 ens38Only.multiUniProtIds ens38Only.uniProt3 > ens38Only.uniProt4
     # 339 of these left 
     grep -f ens38Only.uniProt4 ../../ensGene38UniProt.txt \
          > ens38Only.uniProt4.tIdAndUpId
     awk '{print $1}' ens38Only.uniProt4.tIdAndUpId | sort | uniq \
         > ens38Only.uniProt4.tId.uniq
     wc -l ens38Only.uniProt4.tId.uniq
     # 368 ens38Only.uniProt4.tId.uniq
     # Do these all have SWISS-PROT IDs by Blastp?
     hgsql -N -e 'select distinct(query) from ensBlastp;' test | sort \
           > ensBlastp.query.sort
     comm -12 ens38Only.uniProt4.tId.uniq ensBlastp.query.sort 
     # 183 so remove these:
     comm -23 ens38Only.uniProt4.tId.uniq ensBlastp.query.sort \
          > ens38Only.uniProt4.tId.noBlastp
     wc -l ens38Only.uniProt4.tId.noBlastp
     # 185 ens38Only.uniProt4.tId.noBlastp
 
     # e.g. ENSDART00000002826: this has only 91% ID to Q6DBUS (Q6NYR4 in the
     # BioMart download). It is 91.7% ID to Q6DBUS in the Blastp table.
     hgsql -e 'create table test.ensBlastp90 select * from ensUniProtAllBlastTab where identity >= 90 and eValue <= 0.00001;' test 
     hgsql -N -e 'select distinct(query) from ensBlastp;' test | sort \
           > ensBlastp.tId.sort
     hgsql -N -e 'select distinct(query) from ensBlastp90;' test | sort \
           > ensBlastp90.tId.sort
     # transcript IDs in ensBlastp90 and not in ensBlastp
     comm -23 ensBlastp90.tId.sort ensBlastp.tId.sort > ensBlastp90Only.tId
     wc -l ensBlastp90Only.tId
     # 704 ensBlastp90Only.tId
     # check these against list of ens38 with no description
     comm -12 ens38ZfishNoDesc.tid.sort ensBlastp90Only.tId \
          > ensBlastp90Only.noUniProtInEns38
     # 416
     # also check against list of ens38Only.uniProt4.tId.noBlastp
     comm -12 ens38Only.uniProt4.tId.noBlastp ensBlastp90Only.tId
     # 140
     comm -23 ens38Only.uniProt4.tId.noBlastp ensBlastp90Only.tId \
          > ens38Only.uniProt4.tId.noBlastp90
     # 45 of these left
     # ENSDART00000009971 has only 48% Identity to Q5DTD0. maps to Q58EF8 on
     # Ensembl web page.
     # Check 10 alignments with >= 95% and 10 that have >= 90% and < 95%
     cd /cluster/data/danRer3/bed/ensGenes/blastp/runAll2/out
     # ens38Blastp.out has the alignments in NCBI format
     # 95-96% 226
     # 96-97% 322
     # 97-98% 526
     # 98-99% 1333
     # 99-100% 9503 (both inclusive)
     # lower score can be due to shorter query and target
 # for >= 95% identity (ensBlastp table in test db). Get BlastP results 
 # and check Ensembl. All Ensembl records show the UniProt ID given below 
 # except where noted. 
 # Query  Target  Identity qLen qAli tLen tAli  E-value  Score  misMatch Comment
 # ENSDART00000012253 Q9W6E8 99.51 609 609 609 609 0   978  3      
 # ENSDART00000013114 Q6NYT1 99.63 267 267 267 267 4e-143 502 1  
 # ENSDART00000067816 Q6NZZ8 95.78  433 433  471 460 0 838 2 query doesn't 
 # begin with Met, no associated UniProt ID in Ensembl
 # ENSDART00000018931 Q9DG41 99.42 346 346 552 346 0 709 2 query is partial, 
 # doesn't begin with Met
 # ENSDART00000023846 Q7ZUQ4 98.33 300 300 625 300 1e-179 624 5 query doesn't
 # begin with Met
 # ENSDART00000006095 Q6P2V4 99.32 443 443 443 443 0 941 3
 # ENSDART00000039597 Q5G9L7 100 146 146 146 146 3e-81 295 0 100% coverage 
 # ENSDART00000028930 Q90442 97.53 84 81 85 81 5e-42 164 2
 # ENSDART00000028255 Q8JHY2 100 63 63 63 63 2e-32 132 0
 # ENSDART00000042947 Q4QRH1 95.22 1849 456 479 452 0 808 10 alignment length =
 # 460 bp, Ensembl doesn't show a UniProt protein ID for this.
 # Maybe there is a coverage criterion.
 # >= 90% and < 95% identity from ensBlastP90 table in test db:
 #  There are 705 of these. 11911 have identity >= 95%
 # Query  Target  Identity qLen qAli tLen tAli  E-value  Score  misMatch Comment
 # ENSDART00000031211 Q6R5A4 94.21 779 779 846 789 0 1266 38  (gapOpen 6) 
 # bases 66-846 of target is aligning. Ensembl does not have a UniProt ID 
 # for this transcript.
 # ENSDART00000028390 Q5TKR3 90.87 241 240 243 241 1e-125 444 21 (gapOpen 1) 
 # ENSDART00000053312 Q5SYD9 92.64 325 325 322 322 8e-175 608 19 (gapOpen 2)
 # ENSDART00000056703 Q5CZR2 91.02 323 323 323 323 7e-124 605 29 (gapOpen 0)
 # Ensembl has no UniProt ID for this transcript. 91 % ID to NP_001013324.1,
 # also 323 bp.
 # ENSDART00000044490 Q3ZMH2 90.74 992 985 1082 994   0  1682 64 (gapOpen 7)
 # Ensembl has no UniProt ID, just InterPro domains.
 # ENSDART00000031487 Q5RHD6 92.81 320 320 319 319 7e-172 598 22 (gapOpen 1)
 # Ensembl has no UniProt ID, just InterPro domain.
 # ENSDART00000020233 Q6DHI1 91.72 298 298 299 299 6e-145 508 18 (gapOpen 2) 
 # ENSDART00000061435 Q6PBV8 93.72 76 76 76 76 2e-33 135 5 (gapOpen 0)
 # ENSDART00000056959 Q4V9F6 94.21 433 426 440 431 0 728 18 (gapOpen 2)
 # only InterPro domain given for Ensembl, no UniProt ID. 
 # ENSDART00000040220 Q504G5 90.12 172 172 174 172 3e-100 358 17 (gapOpen 0)
 # only InterPro domain given for Ensembl, no UniProt ID.
 # ENSDART00000066247 Q58EK5 90.08 767 231 485 251 3e-124 441 3 (gapOpen 3)
 # only InterPro domain given for Ensembl, no UniProt ID.
     # for 95% identity and above, there are only 18 proteins that have
     # mismatch > 40.
     # for between 90-95% then there are 62 with mismatch > 40. 
     # use grep -A 100 -w 
     # look at examples with high mismatch but identity < 95%.
     # columns: query | target | identity | aliLength | mismatch | gapOpen |
     #          qStart | qEnd | tStart | tEnd | eValue | bitScore
 
     # ENSDART00000012435 | Q6IQX1 | 91.2 | 1932 | 163 | 5 | 2 | 1931 | 3 | 1931 | 0 | 3093
     # this has a high number of mismatches but distributed throughout
     # the protein and the UniProt sequence aligns to the genome with the 
     # same exon structure as for ENSDART00000012435.
     # ENSDART00000050066 | Q7M558 | 91.69 | 3008 | 249 | 1 | 0 | 3008 | 0 | 3007 | 0 | 5543
     # this is a very large protein so the mismatch is small compared to
     # the protein size. has same exon structure as Ensembl protein at
     # chr17:18,247,969-18,259,468. Blats to several regions - could be a
     # processed pseudogene or assembly artifact. 
     # If identity < 95% and mismatch > 40 then size is at least around 450bp.
     # ENSDART00000028708 | Q7T296 |    90.12 |       486 |       45 |       1
     # |  0 |  486 |     18 |  501 |      0 |      907 |
     # The most gaps in a sequence is 9 - only 1 sequence < 95% identity and 
     # most have 0-2 gaps. Same for those >= 95% identity.
     #  ENSDART00000039735 | Q7T1C9 |    98.15 |      1406 |       12 |       9
     #  |  0 | 1394 |      0 | 1404 |      0 |     2175 |
    # Gaps are spread throughout the sequence and are short. Blat of this
     # UniProt sequence gives the same exon structure as for the Ensembl seq.
     # | ENSDART00000053813 | Q7M560 |    90.07 |      2275 |      104 |      9 
     # |  0 | 2178 |     99 | 2349 |      0 |     3966 |
     # There are several large gaps in the first third of the sequence. The
     # rest of the gaps are short. Ensembl does not have a UniProt ID for this
     # transcript. Blat aligns this sequence to several places on the genome
    # all in close proximity to each other. One alignment corresponds to
    # an Ensembl ID but not the one above. It does align to the region of
     # ENSDART00000053813 but with a different exon structure.
     # ENSDART00000044490 | Q3ZMH2 |    90.74 |      1004 |       64 |       7
     # |  0 |  985 |     88 | 1082 |      0 |     1682 |
     # This has a couple of larger gaps. The UniProt sequence aligns to the 
     # same region as ENSDART00000044490 which has 3 extra exons. There is 
     # another transcript with the same exon structure.
     # | ENSDART00000041503 | Q3ZMH2 |    91.42 |       991 |       63 |
     # 5 |  0 |  974 |     82 | 1068 |      0 |     1684 |
     # This has only slightly higher identity.
     # ENSDART00000025635 | Q4FE55 |    99.33 |      2545 |        6 |       7
     # |  0 | 2542 |      0 | 2537 |      0 |     4859 |
     # just short gaps. This Blats to the same region of ENSDART00000025635
     # and gives the same exon structure.
     # could filter more using pslReps but should not filter on minAli since
     # either the query or target could be partial.
     # Use identity >= 90% as the cutoff and then associate the RefSeqs with
     # ZFIN IDs and update the official ZFIN Gene symbols. 
     # 
     ssh hgwdev  # kkstore02
     cd /cluster/data/danRer3/bed/ensGenes
     mkdir alignments
     cd alignments
 # Add a proteinID column to the ensGene table:
     ssh hgwdev
     cd /cluster/data/danRer3/bed/ensGenes
     # Add protein ID column:
     hgsql -e 'alter table ensGene add proteinID varchar(40) NOT NULL;' danRer3
    # Add index to this column:
    hgsql -e 'alter table ensGene add index(proteinID);' danRer3
    # Next step: download the ZFIN IDs and UniProt IDs.
     hgsql -e 'select count(*) from ensGene;' danRer3
     # 32143
     hgsql -e 'update ensGene set proteinID = "";' danRer3
    # ensBlastp90 is the table in the test database where proteins have
    # >=90% identity to the Ensembl proteins.
     hgsql -e 'select count(*) from ensGene as g, test.ensBlastp90 as p \
           where g.name = p.query;' danRer3
     # for >= 90% there are
     # 12614
     # for >=95%, there are 
     # 11910
    
     # Use these UniProt IDs to fill in proteinID table.
     hgsql -e 'update ensGene as g, test.ensBlastp90 as p \
           set g.proteinID = p.target where g.name = p.query;' danRer3
     # check that there are 12614 rows with proteinID filled.
     hgsql -e 'select count(*) from ensGene where proteinID != "";' danRer3
     # 12614
     # once this is done, can create ensCanonical and ensIsoforms table -
     # see section on "BUILD GENE SORTER TABLES".
     
     # Add table for Ensembl 38 Ensembl Transcript IDs and RefSeq IDs
     # and Entrez Gene ID.
     ssh hgwdev 
     cd /cluster/data/danRer3/bed/ensGenes 
 cat << 'EOF' > ens38Zfish2.sql
 CREATE TABLE ens38Zfish2 (
     transcriptId varchar(255) not null,     
     entrezGeneId varchar(255) not null,     
     refSeqId varchar(255) not null,
     refSeqProtId varchar(255) not null
 );
 'EOF'
     # << emacs
     tail +2 ensGeneInfo38Coding2.txt > ens38Coding2.tab
     hgLoadSqlTab test ens38Zfish2 ens38Zfish2.sql ens38Coding2.tab
     # 24523 lines where there is no Entrez Gene Id so these are set to 0.
     hgsql -N -e 'select distinct(entrezGeneId) from ens38Zfish2;' test \
          | sort > ens38Zfish2.geneId.uniq
     wc -l ens38Zfish2.geneId.uniq
     # 6764 ens38Zfish2.geneId.uniq 
     hgsql -e 'select count(distinct extDbId) from ens38Zfish;' test
     # 9028
     hgsql -N -e 'select distinct(extDbId) from ens38Zfish;' test \
           | sort > ens38Zfish.extDbId.sort
     grep -v NM ens38Zfish.extDbId.sort > ens38Zfish.extDbIdNoNM.sort
     # 8982 left
     grep -v BRARE ens38Zfish.extDbIdNoNM.sort \
             > ens38Zfish.extDbIdNoNMandNoSP.sort
     grep -v NP ens38Zfish.extDbIdNoNMandNoSP.sort \
             > ens38Zfish.extDbIdNoNMNoSPNoNP.sort
     wc -l ens38Zfish.extDbIdNoNMNoSPNoNP.sort
     # 5284 ens38Zfish.extDbIdNoNMNoSPNoNP.sort
     awk '{print $2}' ens38/ensToRefSeqvsZFIN.txt | sort | uniq \
         > ensToRefSeqvsZFIN.names.uniq
     # how many in common
     comm -12 ens38Zfish.extDbIdNoNMNoSPNoNP.sort ensToRefSeqvsZFIN.names.uniq \
         > common
     wc -l common
     # 4176 common
     comm -23 ens38Zfish.extDbIdNoNMNoSPNoNP.sort ensToRefSeqvsZFIN.names.uniq \
         > extDbIdNotfromZFINviaRefSeq 
    hgsql -N -e 'select mrnaAcc from refLink where locusLinkId != "";' \
          danRer3 | sort | uniq > mrnaAcc.refLink.dr3.uniq
     wc -l mrnaAcc.refLink.dr3.uniq
     # 8811 mrnaAcc.refLink.dr3.uniq
     comm -12 mrnaAcc.refLink.dr3.uniq ensToRefSeq.refseq | wc
     # 7738 
     wc -l ensToRefSeq.refseq
     # 7738
    # merge the ens38Zfish2 table with ens38ZfishNew for the Known Genes
    # details pages. Changed the table name from ensGeneXRef to
    # ensXRefZfish as there are already a number of tables with names
    # similar to ensGeneXRef, which would be confusing.
     # create a table definition for ensXRefZfish:
     # (updated 2006-11-08, hartera)
     cd ~/kent/src/hg/lib
 cat << 'EOF' > ensXRefZfish.as
 table ensXRefZfish
 "Link from an Ensembl Transcript ID to other database IDs and description."
     (
     string ensGeneId;   "Ensembl Transcript ID"
     string zfinId;   "ZFIN ID"
     string uniProtId;   "Unified UniProt protein accession"
     string spDisplayId; "UniProt Display ID"
    string geneId;	"Entrez Gene (formerly LocusLink) ID"
     string geneSymbol;  "Official ZFIN Gene Symbol"
     string refSeq;      "RefSeq DNA Accession"
     string protAcc;     "RefSeq Protein Accession"
     string description; "Description"
     )
 'EOF'
 
     autoSql ensXRefZfish.as ensXRefZfish
     mv ensXRefZfish.h ../inc
     # commit ensXRefZfish* files to CVS.
     # add zfinId, uniProtId, spDisplayId, geneId, geneSymbol, refSeq and 
     # protAcc as keys. ensGeneId is already the primary key.
     # description field is not long enough so it must be changed to a
     # longblob.
     perl -pi.bak -e 's/description varchar\(255\)/description longblob/' \
          ensXRefZfish.sql
 
     # get the gene2refseq file from NCBI to give the Entrez Gene ID
     # and symbol for refSeq accessions. Taxonomy ID is 7955 for Danio rerio.
     # columns in file are tax_id, GeneID, status, 
     # RNA nucleotide accession.version, RNA nucleotide gi, 
     # protein accession.version, protein gi, genomic nucleotide 
     # accession.version, genomic nucleotide gi, start position on the genomic 
     # accession, end position on the genomic accession, orientation.
     # for the gene_info file, column headings are:
     # tax_id, GeneID, Symbol, LocusTag, Synonyms, dbXrefs, chromosome,
     # map location, description, type of gene, Symbol from nomenclature
     # authority, Full name from nomenclature authority, Nomenclature status.
     # DOWNLOAD LATEST versions (from Nov. 8, 2006)
     ssh kkstore02
     mkdir /cluster/data/danRer3/bed/ensGenes/downloads
     cd /cluster/data/danRer3/bed/ensGenes/downloads
     wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
     wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
     gunzip gene2refseq.gz
     gunzip gene_info.gz
     # get records for taxon ID: 7955
     awk '{if ($1 == 7955) print;}' gene2refseq > zfish.gene2refseq
     wc -l zfish.gene2refseq
     # 14659 zfish.gene2refseq
     # 50465 zfish.gene2refseq - in March 
     # Most of the ones no longer in the gene2refseq file are 
     # PREDICTED, PROVISIONAL AND MODEL.
 # 37206 MODEL
 # 6278 PREDICTED
 # 6174 PROVISIONAL
 # 43 NA
 # 13 Reviewed
 # 5 REVIEWED
 # 1 VALIDATED
 # New sequences added:
 # 7021 PROVISIONAL
 # 6801 PREDICTED
 # 52 NA
 # 13 Reviewed
 # 12 VALIDATED
 # 10 INFERRED
 # 5 REVIEWED
 
     awk '{if ($1 == 7955) print;}' gene_info > zfish.gene_info
     wc -l zfish.gene_info
     # 38915 zfish.gene_info
     # 38126 zfish.gene_info - in March
     # checked that the Ensembl 38 genes for zebrafish are the same as 
     # for Ensembl 35 for which these files were downloaded (see above - 
     # updated file names to reflect v38).
     # also download the file from ZFIN that gives gene Symbols, ZFIN IDs
     # and RefSeq accessions. ZFIN associates more than one ZFIN ID with
     # UniProt IDs but there is a one to one relationship for ZFIN IDs 
     # and RefSeq accessions. Therefore the RefSeq accessions can be used
     # to identify a ZFIN ID and gene name and vice versa.
     wget --timestamping http://zfin.org/data_transfer/Downloads/refseq.txt
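    # A minimal sketch of using that one-to-one relationship, assuming
    # refseq.txt is tab-separated with the columns in the order described
    # here (ZFIN ID, gene name, RefSeq accession), writing a hypothetical
    # RefSeq -> ZFIN lookup file:
    awk 'BEGIN {FS="\t"; OFS="\t"} {print $3, $1, $2;}' refseq.txt \
        | sort -k1,1 > refSeqToZfin.tab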
    # already loaded the ensGeneInfo38Coding.txt and ensGeneInfo38Coding2.txt
    # files into tables so that the information can be put together.
     # these are ens38Zfish and ens38Zfish2 in the test database.
     # first copy the ens38Zfish table and then replace the uniProtId column
     # with the best hits from the ensBlastp90 table. 
     ssh hgwdev
     cd /cluster/data/danRer3/bed/ensGenes
     sed -e 's/ens38Zfish/ens38ZfishNew/' ens38Zfish.sql > ens38ZfishNew.sql
     # create table
     hgsql test < ens38ZfishNew.sql
     hgsql -e 'insert into ens38ZfishNew select * from ens38Zfish;' test
     # Add spDisplayId column:
     hgsql -e \
       'alter table ens38ZfishNew add spDisplayId varchar(255) NOT NULL;' test
     # add some indices
     hgsql -e 'create index uniProt on ens38ZfishNew (uniProt);' test
     hgsql -e 'create index query on ens38ZfishNew (transcriptId(20));' test
     # first remove uniProt IDs and add those found by Blastp:
     hgsql -e 'update ens38ZfishNew set uniProt = "";' test
     # add displayIds from uniProt to this table
     hgsql -e 'select count(*) from ens38ZfishNew as g, ensBlastp90 as p \
           where g.transcriptId = p.query;' test
     # 37362
     hgsql -e 'update ens38ZfishNew as g, ensBlastp90 as p \
           set g.uniProt = p.target where g.transcriptId = p.query;' test
     # check that 37362 rows have an entry for uniProt - ok
     hgsql -e 'select count(*) from ens38ZfishNew as g, uniProt.displayId as p \
           where g.uniProt = p.acc;' test
     # 36647
     # 36647 have display IDs in UniProt
     hgsql -e 'update ens38ZfishNew as g, uniProt.displayId as p \
           set g.spDisplayId = p.val where g.uniProt = p.acc;' test
     # check that 36647 of the rows have spDisplayId - ok.
     # add new columns for ens38ZfishNew
     hgsql -e \
       'alter table ens38ZfishNew add entrezGeneId varchar(255) NOT NULL;' test
     hgsql -e \
       'alter table ens38ZfishNew add refSeqId varchar(255) NOT NULL;' test
     hgsql -e \
       'alter table ens38ZfishNew add refSeqProtId varchar(255) NOT NULL;' test
 
     # merge together the tables. 
     hgsql -e 'update ens38ZfishNew as g, ens38Zfish2 as e \
           set g.entrezGeneId = e.entrezGeneId \
           where g.transcriptId = e.transcriptId;' test
     
     hgsql -e 'update ens38ZfishNew as g, ens38Zfish2 as e \
           set g.refSeqId = e.refSeqId \
           where g.transcriptId = e.transcriptId;' test
     
     hgsql -e 'update ens38ZfishNew as g, ens38Zfish2 as e \
           set g.refSeqProtId = e.refSeqProtId \
           where g.transcriptId = e.transcriptId;' test
     
     cd /cluster/data/danRer3/bed/ensGenes/downloads/
     hgsql -N -e 'select * from ens38ZfishNew;' test > ens38ZfishNew.txt
     ssh kkstore04
     cd /cluster/data/danRer3/bed/ensGenes/downloads/
     
     # There are 308 cases where there is a RefSeq ID but no Entrez Gene ID.
     # There are 1046 cases where there is an Entrez Gene ID but no RefSeq ID.
     # Use the NCBI files to fill in the gaps where needed.
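    # Counts like those can be re-derived with queries of this shape
    # (a sketch; missing Entrez Gene IDs may have been loaded as "0" -
    # see the note above - or left as empty strings by the merge):
    hgsql -N -e 'select count(*) from ens38ZfishNew where refSeqId != "" \
          and (entrezGeneId = "" or entrezGeneId = "0");' test
    hgsql -N -e 'select count(*) from ens38ZfishNew where refSeqId = "" \
          and entrezGeneId != "" and entrezGeneId != "0";' test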
     # get ZFIN file of ZFIN IDs, gene name and GenBank accession 
     # refseq.txt has ZFIN IDs, gene name and RefSeq ID.
     wget --timestamping http://zfin.org/data_transfer/Downloads/gene_seq.txt
     awk '{print $1, $2}' gene_seq.txt | sort | uniq > geneSeq.genes
     awk '{print $1, $2}' refseq.txt | sort | uniq > refSeq.genes
     comm -23 refSeq.genes geneSeq.genes > refSeqOnly
     comm -13 refSeq.genes geneSeq.genes > geneSeqOnly
     wc -l *SeqOnly
     # 9542 geneSeqOnly
     # 827 refSeqOnly
     # get certain fields from each file and merge
     awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1, $2, $3, $4, $6;}' \
         zfish.gene2refseq > zfish.gene2refseqSubset.txt
     awk 'BEGIN {FS="\t"} {OFS="\t"} \
         {print $2, $3, $5, $6, $9, $10, $11, $12;}' \
         zfish.gene_info > zfish.gene_infoSubset.txt
     # need to sort on the GeneID field (second field in refseq file and 
     # first field in gene_info file):
     sort -n -k2 zfish.gene2refseqSubset.txt | uniq \
          > zfish.gene2refseqSubset.sort
     sort -n -k1 zfish.gene_infoSubset.txt | uniq > zfish.gene_infoSubset.sort
     # join the two files based on the GeneID (Entrez Gene ID) which is 
     # the second field in refseq file and first field in gene_info file.
     # Need to set the $tab variable in .tcshrc file:
     # set tab = "	"
     join -t "$tab" -1 2 -2 1 zfish.gene2refseqSubset.sort \
          zfish.gene_infoSubset.sort \
          > zfish.gene2refSeqPlusInfo.txt 
    # The program needs to be written to fill in these gaps for RefSeq ID,
    # Entrez Gene ID and RefSeq Peptide ID. It should then look up the
    # ZFIN ID via the RefSeq ID and use that to check the gene symbol.
     # write program taking ensGene38Coding.tsv and ensGene38Coding2.tsv as 
     # input and also the RefSeq files to find Entrez Gene IDs and Gene Symbols.
     # and give the tabbed output for loading into the ensXRefZfish table.
     # hgEnsGeneXRef.c in ~/kent/src/hg/near/hgZfishEnsXRef
    /cluster/home/hartera/bin/x86_64/hgZfishEnsXRef \
       ensGeneInfo38.txt zfish.gene2refSeqPlusInfo.txt refseq.txt \
       ens38XRefZfish.tab >& ens38XRefZfish.log
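    # For reference, the core gap-filling step can be sketched in awk
    # (a sketch, not the real hgZfishEnsXRef logic; assumes GeneID and the
    # versioned RNA accession are fields 1 and 4 of the joined NCBI file,
    # as constructed above, that the refSeqId values in ens38Coding2.tab
    # are unversioned, and a hypothetical output file name):
    awk 'BEGIN {FS="\t"; OFS="\t"} \
         NR == FNR {acc = $4; sub(/\..*$/, "", acc); geneId[acc] = $1; next} \
         {if ($2 == "0" && ($3 in geneId)) $2 = geneId[$3]; print;}' \
        zfish.gene2refSeqPlusInfo.txt ens38Coding2.tab > ens38Coding2.filled.tab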
     # load this tabbed file into ensXRefZfish table 
     ssh hgwdev 
     cd /cluster/data/danRer3/bed/ensGenes
     # remove old table:
     hgsql -e 'drop table ensXRefZfish;' danRer3
     hgLoadSqlTab danRer3 ensXRefZfish ~/kent/src/hg/lib/ensXRefZfish.sql \
           ens38XRefZfish.tab 
     # loaded with no problems.
     # Now need to check its contents:
     mkdir testing
     cd testing
     hgsql -N -e 'select zfinId, geneSymbol, refSeq from ensXRefZfish where \
            zfinId != "" AND refSeq != "";' test > zfinIdsymbAndrefseq.txt
     sort zfinIdsymbAndrefseq.txt | uniq > zfinIdsymbAndrefseq.sort
     sort ../refseq.txt | uniq > refseq.sort
     perl -pi.bak -e 's/\t\n/\n/' refseq.sort
     comm -23 zfinIdsymbAndrefseq.sort refseq.sort | wc 
     comm -12 zfinIdsymbAndrefseq.sort refseq.sort | wc 
     cd /cluster/data/danRer3/bed/ensGenes/testProgram/tmp3
    awk 'BEGIN {FS="\t"} {print $5}' ens38ZfishNew.sort | sort | uniq \
        > ensFile.zfinIds.sort
     # There are 7321 zfin IDs
     # 7284 ZFIN IDs in table and 6499 with a RefSeq.
     hgsql -N -e 'select distinct(zfinId) from ensXRefZfish where refseq = "" \
          and zfinId != "" and geneSymbol = "";' test \
          | sort > zfinIdwithNoRefSeqNoSymb.sort
     # There are 853 with no refseq but a zfinId and no gene symbol and 690
     # are unique ZFIN IDs.
     # compare these to ZFIN IDs in the zfish.gene2refSeqPlusInfo.txt from
     # NCBI files:
     awk 'BEGIN {FS="\t"} {print $8;}' zfish.gene2refSeqPlusInfo.txt \
         | sort | uniq > zfinIds.fromNcbiFile.sort
     # remove first line and "ZFIN:" prefix
     tail +2 zfinIds.fromNcbiFile.sort | sed -e 's/ZFIN://' \
          > zfinIds.fromNcbiFile.sort2
     comm -13 zfinIds.fromNcbiFile.sort2 zfinIdwithNoRefSeqNoSymb.sort | wc
     # 251 of these with no symbols are not found in the NCBI file
     comm -12 zfinIds.fromNcbiFile.sort2 zfinIdwithNoRefSeqNoSymb.sort \
          > zfinIds.inNcibFile.noRefSeqOrSymbinXRef
     awk '{print $1}' refseq.txt | sort | uniq > refseq.zfId.sort
     comm -13 refseq.zfId.sort zfinIdwithNoRefSeqNoSymb.sort | wc
     # 176 of these with no symbols are not found in the ZFIN RefSeq file
     comm -12 refseq.zfId.sort zfinIdwithNoRefSeqNoSymb.sort \
          > zfinIds.inZfinFile.noRefSeqOrSymbinXRef
     # 435 are in both of these lists
     wc -l *.noRefSeqOrSymbinXRef
     # 439 zfinIds.inNcibFile.noRefSeqOrSymbinXRef
     # 514 zfinIds.inZfinFile.noRefSeqOrSymbinXRef
 
     
     # edit ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/genome.ra to give 
     # mySQL queries to ensGtp and ensXRefZfish to retrieve name, protein and
     # description. Changed XRef table name to new name. 
 cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/genome.ra
 name global
 knownGene ensGene
 knownGenePep ensPep
 nameSql select gene from ensGtp where transcript = '%s'
 descriptionSql select description from ensXRefZfish where ensGeneId = '%s'
 proteinSql select uniProtId from ensXRefZfish where ensGeneId = '%s'
 _EOF_
 # << happy emacs
 
     # created blastp hgNear tables by alignment of Zebrafish Ensembl peptide
     # sequences to the equivalent "Known Genes" peptide sets for other species
     # - see hgNear sections above. Then create an otherOrg.ra file for 
     # zebrafish specifying the species and databases for these organisms 
     # with blastp homolog tables. 
 cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/otherOrgs.ra
 name human
 db hg18
 
 name mouse
 db mm8
 
 name rat
 db rn4
 
 name drosophila
 db dm1
 
 name cElegans
 db ce2
 
 name yeast
 db sacCer1
 _EOF_
      # << this line makes emacs coloring happy
      # add Zebrafish-specific section.ra file
 cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/section.ra
 name method
 shortLabel Methods
 longLabel Ensembl Genes Methods, Credits, and Data Use Restrictions
 priority 140
 _EOF_
      # << this line makes emacs coloring happy
      # added links to the Zebrafish links.ra file
      # update links.ra so that link for Ensembl Genes is to the correct
      # stable archive link for Ensembl37 (feb 2006) and change XRef
      # table name to new name.
 cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/links.ra
 # Zebrafish-specific link info.
 # This contains info to construct the quick links. 
 
 name genome
 tables ensGene 
 idSql select chrom,txStart+1,txEnd from ensGene where name = '%s'
 
 name family
 tables ensGene
 idSql select name from ensGene where name = '%s'
 
 name ensemblGenes
 shortLabel Ensembl Genes
 tables ensGene
 idSql select name from ensGene where name = '%s'
 url http://feb2006.archive.ensembl.org/Danio_rerio/transview?transcript=%s
 priority 25
 
 name zfin
 shortLabel ZFIN
 tables ensXRefZfish
 idSql select zfinId from ensXRefZfish where ensGeneId = '%s'
 url http://zfin.org/cgi-bin/webdriver?MIval=aa-markerview.apg&OID=%s
 priority 28
 
 name tbSchema
 shortLabel Table Schema
 tables ensGene
 
 name uniProt
 shortLabel UniProt
 tables ensXRefZfish
 idSql select uniProtId from ensXRefZfish where ensGeneId = '%s'
 priority 30
 
 name refSeq
 shortLabel RefSeq
 tables ensXRefZfish
 idSql select refSeq from ensXRefZfish where ensGeneId = '%s'
 url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=Nucleotide&term=%s&doptcmdl=GenBank&tool=genome.ucsc.edu
 priority 40
 
 name refSeqPep
 shortLabel RefSeq Peptide
 tables ensXRefZfish
 idSql select protAcc from ensXRefZfish where ensGeneId = '%s'
 url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=protein&term=%s&doptcmdl=GenPept&tool=genome.ucsc.edu
 priority 42
 
 name entrezGene
 shortLabel Entrez Gene
 tables ensXRefZfish
 idSql select geneId from ensXRefZfish where ensGeneId = '%s'
 url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene&cmd=Retrieve&dopt=Graphics&list_uids=%s&tool=genome.ucsc.edu
 priority 45
 
 name genBank
 hide
 
 name pubMed
 hide
 
 name geneCards
 hide
 
 name stanfordSource
 hide
 
 name cgap
 hide
 
 name ensembl
 hide 
 
 name aceView
 hide
 _EOF_
      # << this line makes emacs coloring happy
     # then run "make my" to view the changes in my own sandbox
      cd ~/kent/src/hg/hgGene
      make my
      # commit *.ra files for Zebrafish to CVS.
      # edited hgGene.c so that the Gene Symbol (if available) is displayed 
      # in the description section of the details page.
      # added ensXRefZfish to ensemblTranscriptId rules in all.joiner.
      # add entry to danRer3/trackDb.ra:
 # track ensGene
 # shortLabel Ensembl Genes
 # longLabel Ensembl v37 Gene Predictions (Protein Coding Genes)
 # group genes
 # priority 32.8
 # visibility pack
 # color 150,0,0
 # type genePred ensPep
 # hgGene on 
  
 # STS MARKERS (in progress, 2005-10-13, hartera)
     # DOWNLOADED RECENTLY FROM NCBI
     ssh kkstore02
     mkdir -p /cluster/data/danRer3/bed/stsMarkers
     cd /cluster/data/danRer3/bed/stsMarkers
    # UniSTS is a unique subset of markers that are STS markers from the
    # six zebrafish mapping panels: GAT, HS, LN54, MGH, MOP, T51, and also
    # ZMAP which contains markers from the other panels. Among the markers
    # in these maps, the subset that are STSs with available primer
    # sequences was imported to UniSTS. These include submitted maps and
    # those from the Zebrafish Information Network (ZFIN).
 
 ############################################################################
 ##  BLASTZ swap from mm8 alignments (DONE - 2006-02-28 - Hiram)
     ssh pk
     cd /cluster/data/mm8/bed/blastzDanRer3.2006-02-28
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
         -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
         `pwd`/DEF > swap.out 2>&1 &
 
     time nice -n +19 featureBits danRer3 chainMm8Link
     #   54831876 bases of 1630323462 (3.363%) in intersection
 
 
 # SWAP CHAINS/NET RN4 (DONE 4/2/06 angie)
     ssh kkstore02
     mkdir /cluster/data/danRer3/bed/blastz.rn4.swap
     cd /cluster/data/danRer3/bed/blastz.rn4.swap
     doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.danRer3/DEF \
       -workhorse kkr7u00 >& do.log & tail -f do.log
     ln -s blastz.rn4.swap /cluster/data/danRer3/bed/blastz.rn4
 
 
 ############################################################################
 ##  BLASTZ swap from hg17 alignments (DONE 2006-04-09 markd)
     ssh pk  
     mkdir /cluster/data/danRer3/bed/blastz.hg17.swap
     ln -s blastz.hg17.swap /cluster/data/danRer3/bed/blastz.hg17
     cd /cluster/data/danRer3/bed/blastz.hg17.swap
     
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \
 	-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	/cluster/data/hg17/bed/blastz.danRer3/DEF >& swap.out&
    # failed due to netChains: looks like previous stage was not 
    # successful (can't find [danRer3.hg17.]all.chain[.gz]).
    #
     mv swap.out swap.out.1
    # rerun with -continue=net
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -continue=net -stop=net \
 	-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	/cluster/data/hg17/bed/blastz.danRer3/DEF >& swap.out&
 
   # create the net file (DONE 2006-04-09 markd)
     ssh hgwdev
     cd /cluster/data/danRer3/bed/blastz.hg17.swap/axtChain
     nice netClass -verbose=0 -noAr noClass.net danRer3 hg17 danRer3.hg17.net
     nice gzip danRer3.hg17.net
 
 ###########################################################################
 # SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM OTHER ASSEMBLIES
 # (DONE, 2006-04-17, hartera)
 # ADD TO SAN FOR PK RUNS (DONE, 2006-05-30, hartera)
 
     # followed instructions used in makePanTro2.doc
     ssh kkr1u00
     cd /cluster/data/danRer3/bed
     mkdir -p liftOver
     cd liftOver
     makeLoChain-split danRer3 /cluster/data/danRer3/nib >&! split.log &
     # Took about 30 minutes.
     # add split10k to san for pk runs (2006-05-30, hartera)
     ssh kk
     rsync -a --progress /iscratch/i/danRer3/split10k \
          /san/sanvol1/scratch/danRer3/
 
 ###########################################################################
 # LIFTOVER CHAINS TO DANRER2 (DONE, 2006-04-25 - 2006-05-03, hartera)
 # CLEANUP BLAT DIRECTORY (DONE, 2006-12-14, hartera)
     # Split (using makeLoChain-split) of danRer2 is doc'ed in makeDanRer2.doc
     # Do what makeLoChain-split says to do next (start blat alignment)
     ssh kk
     mkdir -p /cluster/data/danRer3/bed/liftOver
     cd /cluster/data/danRer3/bed/liftOver
     makeLoChain-align danRer3 /iscratch/i/danRer3/nib danRer2 \
         /iscratch/i/danRer2/split10k \
         /iscratch/i/danRer2/11.ooc >&! align.log &
     # Took about 5 minutes.
     # Do what its output says to do next (start cluster job)
     cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25/run
     para try, check, push, check, ...
     para time >&! run.time
 # Completed: 782 of 784 jobs
 # Crashed: 2 jobs
 # CPU time in finished jobs:    4324484s   72074.73m  1201.25h   50.05d  0.137 y
 # IO & Wait Time:                 35200s     586.67m     9.78h    0.41d  0.001 y
 # Average job time:                5575s      92.92m     1.55h    0.06d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:           62741s    1045.68m    17.43h    0.73d
 # Submission to last job:        355469s    5924.48m    98.74h    4.11d
 
     # 2 jobs keep crashing so try them on the pk: chrUn_chrUn and chrUn_chr20
     # need to copy the danRer2 split10k over to the pk
     ssh kkr1u00
     mkdir -p /san/sanvol1/scratch/danRer2/split10k
     rsync -a --progress /iscratch/i/danRer2/split10k/* \
           /san/sanvol1/scratch/danRer2/split10k/
     # copy over 11.ooc file for danRer2
     cp /iscratch/i/danRer2/11.ooc /san/sanvol1/scratch/danRer2
     ssh pk
     cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25/run
     mkdir extraRun raw
     cd extraRun
     grep chrUn_chrUn ../spec > spec
     grep chrUn_chr20 ../spec >> spec
     # change directories for spec file
     perl -pi.bak -e 's#/iscratch/i#/san/sanvol1/scratch#g' spec
     rm spec.bak 
     para create spec
     para push, check etc.
     para time >& run.time
 # Completed: 2 of 2 jobs
 # CPU time in finished jobs:     263163s    4386.05m    73.10h    3.05d  0.008 y
 # IO & Wait Time:                    62s       1.04m     0.02h    0.00d  0.000 y
 # Average job time:              131613s    2193.54m    36.56h    1.52d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:          147104s    2451.73m    40.86h    1.70d
 # Submission to last job:        147104s    2451.73m    40.86h    1.70d
 
     ssh kkr1u00
     # merge all raw output:
     cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25
     mv ./run/raw/*.psl ./raw/
     # lift alignments
     cd /cluster/data/danRer3/bed/liftOver
     makeLoChain-lift danRer3 danRer2 >&! lift.log &
     # Took about 8 minutes to run.
 
     # chain alignments
     ssh kki
     cd /cluster/data/danRer3/bed/liftOver
     makeLoChain-chain danRer3 /iscratch/i/danRer3/nib \
                 danRer2 /iscratch/i/danRer2/nib >&! chain.log &
     # Do what its output says to do next (start cluster job)
     cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25/chainRun
     para try, check, push, check etc. ...
     para time >&! run.time
 # Completed: 28 of 28 jobs
 # CPU time in finished jobs:       2751s      45.86m     0.76h    0.03d  0.000 y
 # IO & Wait Time:                   879s      14.64m     0.24h    0.01d  0.000 y
 # Average job time:                 130s       2.16m     0.04h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:             598s       9.97m     0.17h    0.01d
 # Submission to last job:          1520s      25.33m     0.42h    0.02d
 
     # net alignment chains
     ssh kkstore02
     cd /cluster/data/danRer3/bed/liftOver
     makeLoChain-net danRer3 danRer2 >&! net.log &
     # Took about 24 minutes to run.
     # load reference to over.chain into database table,
     # and create symlinks  /gbdb  and download area
     ssh hgwdev
     cd /cluster/data/danRer3/bed/liftOver
     makeLoChain-load danRer3 danRer2 >&! load.log &
     # clean up
     rm *.log
     # test by converting a region using the "convert" link on
     # the browser, and comparing to blat of the same region
 
     # CLEANUP for LiftOver blat directory (2006-12-14, hartera)
     ssh kkstore02
     rm -r blat.danRer2.2006-04-25
 
 # REDO BACENDS - bacEndPairs, bacEndSingles, bacEndBadPairs and all_bacends
 # (split as chrN_allBacends) ONLY (DONE, 2006-05-01 - 2006-05-08, hartera) 
 # RELOADED chrN_allBacends TABLES (DONE, 2006-06-08, hartera)
 # RECREATED all_bacends table WITH ONLY RELEVANT PSLS FOR THE LFS BED 
 # TABLES FOR PAIRS, PAIRSBAD AND SINGLES (DONE, 2006-08-04, hartera)
      # NOTE: there are overlapping BAC clone ends for danRer3. Some of these
      # are only a few kb apart (from beginning of one to end of the other)
      # so use stricter pslPairs parameters as for human and mouse.
      # These BAC Ends should be about 150-200 kb. Typically, they are
      # 50 - 300 kb apart.
      # NOTE: IN FUTURE, IF SPLITTING all_bacends TABLE BY CHROM AND
      # RENAMING AS chrN_allBacends THEN USE allBacends INSTEAD OF
      # all_bacends AS ARGUMENT TO pslPairs. THIS WILL THEN AUTOMATICALLY
      # ADD THE CORRECT PSL TABLE NAME TO THE BED (LFS) TABLES
      ssh kkstore02
      mkdir /cluster/data/danRer3/bed/bacends/pairsNew
      cd /cluster/data/danRer3/bed/bacends/pairsNew
      set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
     /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
         -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
         -mismatch -verbose ../bacEnds.psl \
         $bacDir/bacEndPairs.txt all_bacends bacEnds
      wc -l bacEnds.*
      # 1725 bacEnds.long
      # 12081 bacEnds.mismatch
      # 242235 bacEnds.orphan
      # 156444 bacEnds.pairs
      # 616 bacEnds.short
      # 1017 bacEnds.slop
 
      echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\
           > ../header
      echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
      # make pairs bed file
      cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
                | headchg -del > bacEndPairs.bed
     # also need to process bacEndSingles.txt into a database table.
     # For the singles in bacEndSingles.txt, create a dummy file where each
     # is given zJA11B12T7 as a dummy sequence pair. If the single is a
     # forward sequence, put the dummy sequence in the second column; if
     # the single is a reverse sequence, put it in the first column. Use a
     # perl script to do this (sketched below).
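     # The reformatting idea, sketched in awk (a sketch, not the real
     # formatSingles.pl; hypothetical input layout: end-sequence name in
     # column 1, clone name in column 2, F/R orientation flag in
     # column 3):
     awk 'BEGIN {FS="\t"; OFS="\t"} \
          {if ($3 == "F") print $1, "zJA11B12T7", $2; \
           else print "zJA11B12T7", $1, $2;}' bacEndSingles.txt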
      cd /cluster/data/danRer3/bed/bacends
      set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
      mkdir singlesNew
      cd singlesNew
      cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl .
      perl formatSingles.pl $bacDir/bacEndSingles.txt > \
                            $bacDir/bacEndSingles.format
      # then run pslPairs on this formatted file
      /cluster/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
      -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
      -mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \
      all_bacends bacEnds
      wc -l bacEnds.*
      # 0 bacEnds.long
      # 0 bacEnds.mismatch
      # 11439 bacEnds.orphan
      # 0 bacEnds.pairs
      # 0 bacEnds.short
      # 0 bacEnds.slop
      # there are 11439 orphans here and 242235 from pair analysis so 
      # a total of 253674 orphans
      cat bacEnds.orphan ../pairsNew/bacEnds.orphan > bacEnds.singles
      wc -l bacEnds.singles
      # 253674 bacEnds.singles
      # make singles bed file
      cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
                   | headchg -del > bacEndSingles.bed
      cp bacEndSingles.bed ../pairsNew
      cd ../pairsNew
      # all slop, short, long, mismatch and orphan pairs go into bacEndPairsBad
      # since orphans are already in bacEndSingles, do not add these
      cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
         bacEnds.orphan | row score ge 300 | sorttbl chr start \
         | headchg -del > bacEndPairsBad.bed
      # add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans 
      # twice so create a bed file of bacEndPairsBadNoOrphans.bed without orphans
 
      cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
         | row score ge 300 | sorttbl chr start \
         | headchg -del > bacEndPairsBadNoOrphans.bed
      # use extractPslLoad later to get all_bacends.psl for database
 
     # There are rows where the alignments were the same but the lfNames
     # are different. This is due to the presence of multiple reads for
     # the same BAC end sequence. Sometimes they are slightly different
     # lengths so the alignments are a little different. It would be good
     # to consolidate all of these. Firstly, the identical rows were
     # merged into one with a list of all the lfNames corresponding to
     # that alignment.
      
      ssh kkstore02
      cd /cluster/data/danRer3/bed/bacends/pairsNew
      mkdir -p /cluster/data/danRer3/bed/bacends/duplicatesNew
      cd /cluster/data/danRer3/bed/bacends/duplicatesNew
      mkdir -p /cluster/bluearc/danRer3/bacends/duplicatesNew/overlapRun
      cd /cluster/data/danRer3/bed/bacends/duplicatesNew
      ln -s /cluster/bluearc/danRer3/bacends/duplicatesNew/overlapRun
      # write program to do this for linked feature series (lfs) which
      # is the type of data structure used for BAC ends.
      # Need a bed file sorted by chrom and chromStart 
      cd overlapRun
      foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
         sort -k1,2 /cluster/data/danRer3/bed/bacends/pairsNew/${f}.bed \
               > ${f}.lfs
      end
      wc -l *.lfs
      # 155242 bacEndPairs.lfs
      # 15311  bacEndPairsBadNoOrphans.lfs
      # 221821 bacEndSingles.lfs
 
     # remove replicate rows where the names match and the alignments
     # overlap by at least 99.9% of their length (minOverlap=0.999).
      ssh kolossus
      cd /cluster/data/danRer3/bed/bacends/duplicatesNew/overlapRun
      foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
          echo "Processing $f"
          nohup nice /cluster/bin/x86_64/lfsOverlap ${f}.lfs \
                ${f}.bed -name -minOverlap=0.999 -notBlocks
      end
      # Started: May 3 23:30 PID: 9199
      # pairs started: May 5 18:10, PID: 13232
      # Segmentation fault with bacEndSingles. This is a very large file so
      # run again using the file split into two
      # chr24 starts at line 109407
      head -109406 bacEndSingles.lfs > bacEndSinglesPart1.lfs
      tail +109407 bacEndSingles.lfs > bacEndSinglesPart2.lfs
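     # the split point was found by locating the first chr24 row, e.g.
     # (a sketch):
     grep -n '^chr24' bacEndSingles.lfs | head -1
     # 109407:chr24...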
      # then try again:
      foreach f (bacEndSinglesPart1 bacEndSinglesPart2)
          echo "Processing $f"
          nohup nice /cluster/home/hartera/bin/i386/lfsOverlap ${f}.lfs \
                ${f}.bed -name -minOverlap=0.999 -notBlocks
      end
      # merge results
      cat bacEndSinglesPart*.bed > bacEndSingles.bed
 
      ssh kkstore02
      cd /cluster/data/danRer3/bed/bacends/duplicatesNew/overlapRun
      # check the numbers of lines are correct
     
      foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
          awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5}' ${f}.lfs \
              | sort | uniq -c | sort -nr > ${f}.uniqCount
      end
      wc -l *
      # 155164 bacEndPairs.bed
      # 155242 bacEndPairs.lfs
      # 155189 bacEndPairs.uniqCount
      # 15293 bacEndPairsBadNoOrphans.bed
      # 15311 bacEndPairsBadNoOrphans.lfs
      # 15303 bacEndPairsBadNoOrphans.uniqCount
      # 221771 bacEndSingles.bed
      # 221821 bacEndSingles.lfs
      # 221799 bacEndSingles.uniqCount
      # 109390 bacEndSinglesPart1.bed
      # 109406 bacEndSinglesPart1.lfs
      # 112381 bacEndSinglesPart2.bed
      # 112415 bacEndSinglesPart2.lfs
      # different numbers for unique count since some of these alignments 
      # were not identical but very close to identical (>0.999 overlap) 
      cd /cluster/data/danRer3/bed/bacends/duplicatesNew
      mv ./overlapRun/* .
      rm -r overlapRun /cluster/bluearc/danRer3/bacends/duplicatesNew/overlapRun
      # Use perl script to choose 2 BAC ends to represent each BAC clone.
      # since there are often more than one read for each BAC end in this set,
      # 2 were chosen for each BAC pair or 1 for the singles. This was based on
      # the ones that had the largest region aligned (using lfSizes).
      # copy perl script over that was used for danRer2
      cp /cluster/data/danRer2/bed/ZonLab/bacends/duplicates/pickLfNames.pl \
         pickLfNamesv2.pl 
      # edit so that regular expression for matching BAC end names is the 
      # same as that used in ../bacends.1/getBacEndInfov2.pl
      # need to sort by chrom, chromStart
 
      foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
          sort -k1 -k2 -k3 ${f}.bed > ${f}Sort.bed
      end
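     # For reference, the "largest aligned region" criterion can be
     # inspected like this (a sketch; lfSizes and lfNames are the
     # comma-separated columns 10 and 11 of the lfs bed files):
     awk 'BEGIN {FS="\t"} \
          {n = split($10, sz, ","); tot = 0; \
           for (i = 1; i <= n; i++) tot += sz[i]; \
           print $4, $11, tot;}' bacEndPairsSort.bed \
          | sort -k1,1 -k3,3nr | head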
      # run perl script: input bed file, pairs or singles, name of output file
      perl pickLfNamesv2.pl bacEndPairsSort.bed pairs pairs2lfNames.bed
      mv error.log log.pairs
      # log.pairs is empty
      perl pickLfNamesv2.pl bacEndSinglesSort.bed singles singles1lfName.bed
      mv error.log log.singles
      sort log.singles | uniq > log.singles.uniq
      cp bacEndSinglesSort.bed bacEndSingles2Sort.bed
      # log.singles has 15 cases where alignments for a BAC clone use 
      # different sequence reads for either the T7 or SP6 BAC end.
     # singles may include both BAC ends for a clone in the case
     # where they aligned to different chromosomes or a long way apart on
     # the same chromosome (orphans). Mostly those that have a different
     # read align to an almost identical or largely overlapping region.
      # CH211-189J23: zC189J23.ya and zC189J23.yb align to overlapping regions.
      # Use zC189J23.yb as aligns to a longer region and remove the other one.
      # CH211-42D5
      # some sequences appear to be different: CH211-98J20 - zC98J20.yb and
      # zC98J20.ya do not align to each other. DKEYP-107B4 - zKp107B4.ya looks
      # like it has low complexity sequence, this is discarded and zKp107B4.yb 
      # is kept. zKp107B4.za and zKp107B4.zb only align in the first ~ 59bp.
      # zKp107B4.zb is kept in this case. DKEYP-114B4 - zKp114B4.za: 15-61 bp 
      # on zKp114B4.za align to 11-58 bp on zKp114B4.zb. zKp114B4.za is kept.
      # In these cases, the 2 sequences align to different regions.
      # Some sequences have overlapping alignments as one sequence is a bit
      # longer than the other.
      perl pickLfNamesv2.pl bacEndPairsBadNoOrphansSort.bed pairs \
           badPairs2lfNames.bed
      mv error.log log.badPairs
      # no alignments have a different pair of ends to other alignments
     
      # for each of these new bed files, checks were made that there are
      # only 2 BAC ends per alignments for pairs and 1 for singles.
      # For each pair, there should only be 2 ends which can appear either
      # way round depending on the orientation and there should be 1 end for
      # the beginning (suffix T7, t7 or z) and one end for the end
      # (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g.
      # either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite
      # orientation. For singles, there should be a single BAC end for each
      # alignment and for each BAC clone, a sequence for either or both types
      # of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate
      # alignments.
     # e.g.
      wc -l pairs2lfNames.bed
      grep ',' pairs2lfNames.bed
      # should be the same number, every line should have a comma
      awk '{print $11}' pairs2lfNames.bed | sort | uniq > pairs.ends
      sed -e 's/,/\n/g' pairs.ends > pairs.ends2
      wc -l pairs.ends2
      # should be twice the number of above, just 2 end names per line
      perl -pi.bak -e \
 's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?,?.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1,$2/g' pairs.ends
      sort pairs.ends | uniq > pairs.ends.uniq
     # check that each of these has the correct pair type
 
      # Finally overlaps in BAC clone names were checked. All BAC clones
      # represented in each of the pairs, badPairs and singles bed files are
      # unique to that file. Between all three bed files, 300323 BAC clones
      # have alignments. 512886 clone ends are aligned in these three bed files. 
      foreach f (*.bed)
         awk '{print $4}' $f | sort | uniq > ${f}.names
      end
      comm -12 pairs2lfNames.bed.names badPairs2lfNames.bed.names
      comm -12 pairs2lfNames.bed.names singles1lfName.bed.names
      comm -12 badPairs2lfNames.bed.names singles1lfName.bed.names
      # None of these files should have any BAC clone names in common and
      # they do not so they are ok.
      # clean up:
      rm *Part1.bed *Part2.bed *.names *.ends *.ends2 *.Part1.lfs *Part2.lfs
      rm *.uniqCount
     # NOTE: using sort and uniq on hgwdev produces tab-delimited output.
     # After merging rows with the same BAC name, the scoring is now
     # wrong in the bed files.
      # Scores should be 1000 if there is 1 row for that name, else
      # 1500/number of rows for that sequence name - calculated by pslPairs.
      # Correct the scores. The co-ordinates for the singles also need to be
      # corrected.
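     # The scoring rule, sketched in awk (a sketch, not the real
     # correctScores2.pl; the .hits files are the "uniq -c" name counts
     # made below):
     awk 'BEGIN {OFS="\t"} \
          NR == FNR {count[$2] = $1; next} \
          {$5 = (count[$4] == 1) ? 1000 : 1500 / count[$4]; print;}' \
         pairs.hits ../duplicatesNew/pairs2lfNames.bed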
                                                                                 
      mkdir -p /cluster/data/danRer3/bed/bacends/scoresAndCoords
      cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
      # copy over correctScores2.pl and checkscores.pl scripts from danRer2 and 
      # edit so both scripts so that hits file is split on space,not on tabs
      cp \
    /cluster/data/danRer2/bed/ZonLab/bacends/scoresAndCoords/correctScores2.pl .
      cp \
      /cluster/data/danRer2/bed/ZonLab/bacends/scoresAndCoords/checkScores.pl .
      awk '{print $4}' ../duplicatesNew/pairs2lfNames.bed \
                  | sort | uniq -c > pairs.hits
      perl correctScores2.pl ../duplicatesNew/pairs2lfNames.bed pairs.hits \
           noBin > bacEndPairsGoodScores.bed
      # same for singles
      awk '{print $4}' ../duplicatesNew/singles1lfName.bed \
                  | sort | uniq -c > singles.hits
                                                                                 
      perl correctScores2.pl ../duplicatesNew/singles1lfName.bed singles.hits \
                  noBin > bacEndSinglesGoodScores.bed
                                                                                 
      # and for badPairs
      awk '{print $4}' ../duplicatesNew/badPairs2lfNames.bed \
                  | sort | uniq -c > badPairs.hits
      perl correctScores2.pl ../duplicatesNew/badPairs2lfNames.bed \
           badPairs.hits noBin > bacEndPairsBadGoodScores.bed
      # check that the scores are now correct  
      awk '{print $4, $5}' bacEndPairsGoodScores.bed \
          | sort | uniq -c > pairs.count
      perl checkScores.pl < pairs.count
      # all the BAC clones should be in good.txt and none in bad.txt
      # wc -l should give same number of lines in good.txt as in pairs.hits
      # repeat for other bed files
      awk '{print $4, $5}' bacEndPairsBadGoodScores.bed \
          | sort | uniq -c > badPairs.count
      perl checkScores.pl < badPairs.count
      awk '{print $4, $5}' bacEndSinglesGoodScores.bed \
          | sort | uniq -c > singles.count
      perl checkScores.pl < singles.count
     # for the singles, 7 ended up in bad.txt because their scores
     # were 214.285714285714, which is correct for 7 alignments; rounding
     # the score caused the discrepancy.
      # For singles, the co-ordinates in the lfs table are wrong. The
      # chromStart should be the same as the lfsStart and chromEnd - chromStart
      # should be the same as lfSizes. Need to correct these:
      # pslPairs has added min/2 to the end or subtracted min/2 from the start
      # depending on whether it is a left or a right BAC end and the 
      # alignment orientation. min used here was 25000.
      awk 'BEGIN {FS="\t"} {OFS="\t"} \
       {if ($2 != $9) print $1,$9,$3,$4,$5,$6,$7,$8,$9,$10,$11; \
       else print $1,$2,$3 - 12500,$4,$5,$6,$7,$8,$9,$10,$11;}' \
       bacEndSinglesGoodScores.bed \
       > bacEndSinglesGoodScoresAndCoords.bed
      # clean up
     rm error.log *.txt *.count *.hits bacEndSinglesGoodScores.bed
 
      ssh hgwdev
      cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
      # copy over table definition from danRer2
      cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/bacEndSingles.sql \
         ../singlesNew/
      # Now load database tables:
      hgLoadBed danRer3 bacEndPairs bacEndPairsGoodScores.bed \
                -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
      # Loaded 155164 elements of size 11
      hgLoadBed danRer3 bacEndSingles bacEndSinglesGoodScoresAndCoords.bed \
                -sqlTable=../singlesNew/bacEndSingles.sql -notItemRgb
      # Loaded 221754 elements of size 11
      # 221754 record(s), 0 row(s) skipped, 57 warning(s) loading bed.tab
     # the warnings are of unknown cause, but all of the bed file loaded
     # and the number of warnings is small, so ignore them
      hgLoadBed danRer3 bacEndPairsBad bacEndPairsBadGoodScores.bed \
                -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
      # Loaded 15293 elements of size 11
      # load BAC end sequences into seq table so alignments may be viewed
      mkdir -p /gbdb/danRer3/bacends
      ln -s /cluster/data/danRer3/bed/bacends/bacSeqs/Zv5BACends.fa \
                                 /gbdb/danRer3/bacends/Zv5BACends.fa
      hgLoadSeq danRer3 /gbdb/danRer3/bacends/Zv5BACends.fa
 
      # create file for loading all_bacends table
      ssh kkstore02
      cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
      # for all_bacends table, just load the alignments for those sequences
      # represented in the bacEndPairs, bacEndSingles and bacEndPairsBad tables
      # bacEnds.load.psl is the file of alignments
      # get all the names of sequences 
      foreach f (*.bed)
        echo $f
        awk '{print $11;}' $f >> allBacEnds.names
      end
      wc -l allBacEnds.names
      # 392211 allBacEnds.names
      # this is the total number of lines in the *.bed files
      perl -pi.bak -e 's/,/\n/g' allBacEnds.names
      sort allBacEnds.names | uniq > allBacEnds.names.uniq
      wc -l allBacEnds.names.uniq
      # 512321 allBacEnds.names.uniq
      # get alignments for just the BAC ends that are in the database tables
      # make bacEnds.load.psl
      cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
      extractPslLoad -noBin ../bacEnds.psl bacEndPairsGoodScores.bed \
          bacEndPairsBadGoodScores.bed bacEndSinglesGoodScoresAndCoords.bed | \
          sorttbl tname tstart | headchg -del > bacEnds.load.psl
     # check that alignments are present for all BAC ends in 
     # allBacEnds.names.uniq
     awk '{print $10}' bacEnds.load.psl | sort | uniq > bacEnds.names
     comm -12 bacEnds.names allBacEnds.names.uniq | wc -l
     # 512321
     wc -l *
     # 512321 allBacEnds.names.uniq
     # 512321 bacEnds.names
    
     # Reloaded split tables. Old bacEnds.load.psl was used 
     # last time. (2006-06-08, hartera)
     ssh hgwdev
     cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
     # remove old all_bacends table. This was moved over from hgwbeta after
     # the recent crash of hgwdevold after the power failure.
     hgsql -e 'drop table all_bacends;' danRer3 
     # Display is very slow for BAC ends on large regions. Try splitting
     # bacEnds.load.psl and load tables as chrN_allBacends. The parsing
     # code is confused if there are two underscores in the table name.
     foreach c (`cat /cluster/data/danRer3/chrom.lst`)
         echo "Processing $c ..."
         awk '{if ($14 == "'chr${c}'") print;}' \
            /cluster/data/danRer3/bed/bacends/scoresAndCoords/bacEnds.load.psl \
            > chr${c}.bacEnds.load.psl
     end
     # drop old tables
     foreach c (`cat /cluster/data/danRer3/chrom.lst`)
        echo $c
        hgsql -e "drop table chr${c}_allBacends;" danRer3
     end
     # load new tables
     foreach c (`cat /cluster/data/danRer3/chrom.lst`)
      nice hgLoadPsl danRer3 -table=chr${c}_allBacends chr${c}.bacEnds.load.psl
     end
     # load of chr5_allBacends did not go as planned: 326147 record(s), 
     # 0 row(s) skipped, 1 warning(s) loading psl.tab
     # load of chr8_allBacends did not go as planned: 212665 record(s), 
     # 0 row(s) skipped, 5 warning(s) loading psl.tab
     # load of chr12_allBacends did not go as planned: 156947 record(s), 
     # 0 row(s) skipped, 1 warning(s) loading psl.tab
     # load of chr15_allBacends did not go as planned: 181721 record(s), 
     # 0 row(s) skipped, 1 warning(s) loading psl.tab
     # load of chr19_allBacends did not go as planned: 282423 record(s), 
     # 0 row(s) skipped, 1 warning(s) loading psl.tab
     # load of chr20_allBacends did not go as planned: 315248 record(s), 
     # 0 row(s) skipped, 7 warning(s) loading psl.tab
     # load of chrUn_allBacends did not go as planned: 1524765 record(s), 
     # 0 row(s) skipped, 487 warning(s) loading psl.tab    
 
     # There are still warnings on loading, most (487) are for chrUn.
     # alter lfs (BED) tables so that pslTable field is "allBacends"
     # instead of all_bacends (this was set by the pslPairs program).
     foreach t (bacEndPairs bacEndSingles bacEndPairsBad)
        hgsql -e "update $t set pslTable = 'allBacends';" danRer3
     end
     # This improves the performance a lot.
     # corrected termRegex for some bacCloneXRef searches in trackDb.ra so 
     # that they work correctly (bacPairsIntName, bacSinglesIntName, 
     # bacPairsSangerSts and bacSinglesSangerSts). (2006-04-19, hartera)
 
     # Remake the all_bacends table. extractPslLoad extracts psl alignments
     # by name so even those that are filtered out end up in the all_bacends
     # table. Wrote a program that matches BAC end psl alignments from the
     # bacEnd{Pairs, PairsBad, Singles} tables by name, chrom, chromStart and
     # chromEnd.
     ssh kkstore02
     cd /cluster/data/danRer3/bed/bacends
     mkdir extractPsl
     cd extractPsl
    # Some scores in bacEndSinglesGoodScoresAndCoords.bed are not integers
    # so fix these and also for the other bacEnd files just in case.
cat << '_EOF_' > roundPslScore.pl
#!/usr/bin/perl -w
# Round the score field (column 5) of a bed (lfs) file to the nearest
# integer. Rebuild the line from its fields rather than substituting the
# score back into the whole line: with s/$num/$score/ the score is
# treated as a regex (the decimal point matches any character) and the
# first match anywhere in the line would be replaced.
use strict;

my $file = $ARGV[0];

open(FILE, $file) || die "Can not open $file: $!\n";
while (<FILE>)
{
    chomp;
    my @f = split(/\t/);
    $f[4] = round($f[4]);
    print join("\t", @f), "\n";
}
close(FILE);

sub round {
    my($number) = shift;
    return int($number + .5);
}
'_EOF_'
     chmod +x roundPslScore.pl
     set bacDir=/cluster/data/danRer3/bed/bacends
     perl roundPslScore.pl $bacDir/scoresAndCoords/bacEndPairsGoodScores.bed \
          > bacEndPairsRoundScore.bed
     perl roundPslScore.pl $bacDir/scoresAndCoords/bacEndPairsBadGoodScores.bed \
          > bacEndPairsBadRoundScore.bed
     perl roundPslScore.pl \
          $bacDir/scoresAndCoords/bacEndSinglesGoodScoresAndCoords.bed \
          > bacEndSinglesRoundScore.bed
     
     nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
       $bacDir/bacEnds.psl bacEndPairsRoundScore.bed bacPairs.psl   
     nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
       $bacDir/bacEnds.psl bacEndPairsBadRoundScore.bed bacPairsBad.psl   
     nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
      $bacDir/bacEnds.psl \
      bacEndSinglesRoundScore.bed bacSingles.psl   
     cat bac*.psl > allBacends.load.psl
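    # For reference, the name-plus-coordinate matching that
    # extractPslForLfs performs can be sketched in awk (a sketch; assumes
    # lfStarts/lfSizes/lfNames are bed columns 9-11 and that they line up
    # exactly with psl qName/tName/tStart/tEnd in columns 10, 14, 16 and
    # 17; hypothetical output name):
    awk 'BEGIN {FS="\t"} \
         NR == FNR {n = split($11, nm, ","); split($9, st, ","); \
                    split($10, sz, ","); \
                    for (i = 1; i <= n; i++) \
                        keep[nm[i] "|" $1 "|" st[i] "|" (st[i] + sz[i])] = 1; \
                    next} \
         {key = $10 "|" $14 "|" $16 "|" $17; if (key in keep) print;}' \
        bacEndPairsRoundScore.bed $bacDir/bacEnds.psl > bacPairs.sketch.psl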
     
     # Now load database tables:
     # Do not need to reload singles table as it is still the same, the 
     # scores were rounded to 214 on loading. These are the only scores that
     # are floats rather than integers. 
     # Drop old split bacends tables and reload new one with only those psls 
     # relevant to alignments in the lfs tables. 
     ssh hgwdev
     cd /cluster/data/danRer3/bed/bacends/extractPsl
     foreach c (`cat /cluster/data/danRer3/chrom.lst`)
        hgsql -e "drop table chr${c}_allBacends;" danRer3
     end
     # change the bacEnd{Pairs, PairBad, Singles} tables so that the 
     # pslTable is all_bacends again.
     foreach b (Pairs PairsBad Singles)
        hgsql -e "update bacEnd${b} set pslTable = 'all_bacends';" \
                 danRer3
     end
    # Then load all_bacends table. Now there are many fewer alignments
    # than before, so they can all go in one table; the large table size
    # was previously slowing down the Browser at zoomed-out display levels
    # due to slow access of the very large all_bacends table.
     wc -l allBacends.load.psl
     # 549408 allBacends.load.psl
     hgLoadPsl danRer3 -table=all_bacends allBacends.load.psl
     hgsql -e 'select count(*) from all_bacends;' danRer3
     # 549408
     # Table contains the correct number of rows.
     # Get all the lfNames from the bed files and check that these are all
     # represented in allBacends.load.psl
     ssh kkstore02
     cd /cluster/data/danRer3/bed/bacends/extractPsl
     foreach p (*RoundScore.bed)
         awk '{print $11}' $p >> bedFiles.names
     end
     perl -pi.bak -e 's/,/\n/' bedFiles.names
     sort bedFiles.names | uniq > bedFiles.names.uniq
     # get psl file names
     awk '{print $10}' allBacends.load.psl | sort | uniq > pslFile.names.uniq
     wc -l *.uniq
     # 512321 bedFiles.names.uniq
     # 512321 pslFile.names.uniq
     comm -12 bedFiles.names.uniq pslFile.names.uniq | wc -l
     # 512321
     # Therefore all names from BED files are in PSL file.
     rm bedFiles* pslFile*
     cd /cluster/data/danRer3/bed/bacends
     rm -r all_bacends
     rm ./scoresAndCoords/*.bacEnds.load.psl
     # Duplicate rows in bacCloneXRef and bacCloneAlias tables so remove
     # these, reload tables and test - see sections on  
     # CREATE BAC CLONES ALIAS AND CROSS-REFERENCE TABLES and 
     # BACENDS: TESTING OF bacCloneAlias AND bacCloneXRef TABLES
 
 #######################################################################
     # RE-DO RH MAP:
     # isPcr of sequences. 
     # 1) Make a list from FASTA file of sequences.
     # 2) get one record per file. - need to just split on '>' 
     # 3) use rhFix to adapt to get primers, one set per file and name
     # after sequence
     # run isPcr as cluster job - one per sequence and primers set
     # get RHmap info again. need to remove spaces in primers seqs
 
 cat << '_EOF_' > getRhInfo
 #!/usr/bin/awk -f 
 
 #>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
 /^>/ {
     sub(/>/,"",$0);
     sub(/\//,"_", $0);
     gsub(/ /,"",$0);
     split(toupper($0), a, "\\|");
     print a[1]"."a[9]"\tLG"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[9]"\t"a[10]"\t"a[11]"\t"a[12];
     next;
 }
 '_EOF_'
 # << keep emacs coloring happy
     chmod +x getRhInfo
     getRhInfo ../../rhMap.headers2 > rhMapInfo.tab
      
     ssh hgwdev
     cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306
     mkdir -p isPcr/primers
     cd isPcr/primers
     # create primers files
     ssh kkstore02
     cd \
 /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/primers
     awk 'BEGIN {FS="\t"} {OFS="\t"} {if ($8 != "") print $1,$8,$9 \
         > $1".primers.fa"}' rhMapInfo.tab
     # there are 7519 primer sets which is correct.
     # get list of sequences
     cd ..
     mkdir markerSeqs
     cd markerSeqs
     grep '>' ../../rhMap.fa | wc -l
     # 11514
     # get all the sequences; there are 11514 in total.
     # rhMap.fa is the input file. Need to fix the one name with a "/" first:
     perl -pi.bak -e 's/\//_/' ../../rhMap.fa
     # faSplit byname splits the sequences into one file per record, named
     # after the sequence:
     faSplit byname ../../rhMap.fa rhMap
     ls | wc -l
     # 11514
     ssh pk
     # make run dir on the san and link to isPcr dir
     mkdir -p /san/sanvol1/scratch/danRer3/bacends/isPcrRun
     cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr
     ln -s /san/sanvol1/scratch/danRer3/bacends/isPcrRun .
     # get list of sequences with primers
     cd \
 /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/isPcrRun
     awk 'BEGIN {FS="\t"} {OFS="\t"} {if ($8 != "") print $1 \
         > "primerSeqs.lst"}' \
        /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/primers/rhMapInfo.tab
    
     foreach m (`cat primerSeqs.lst`)
        echo /cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 -ooc=/san/sanvol1/scratch/danRer3/danRer3_10.ooc -stepSize=5 /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/markerSeqs/${m}.fa /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/primers/${m}.primers.fa '{'check out line+ /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/isPcrRun/out/${m}.psl'}' >> jobList
     end
     para create jobList
     para try, check, push, check etc. ...
     # there are 654 that do not have isPcr results. Checked Z4664.MGH and 
     # found that the primers would not align with Blat either.
     # These are listed in unmatchedPrimers. They crashed even with
     # maxSize=50000 and with -flipReverse.
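     # One way to list markers with no isPcr output (a sketch; the actual
     # unmatchedPrimers file evidently listed .fa names, hence the perl
     # cleanup below):
     foreach m (`cat primerSeqs.lst`)
        if (! -s out/${m}.psl) echo ${m} >> unmatchedPrimers
     end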
     mkdir notMatchedPrimers notMatchedSeqs 
     perl -pi.bak -e 's/\.fa//' unmatchedPrimers
     foreach f (`cat unmatchedPrimers`)
      set d=/cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr
      cp ${d}/primers/${f}.primers.fa ./notMatchedPrimers/
      cp ${d}/markerSeqs/${f}.fa ./notMatchedSeqs
     end
     
     tar cvzf primers.tar.gz notMatchedPrimers/*primers.fa
     tar cvzf markers.tar.gz notMatchedSeqs/*.fa
     # sent these to Yi Zhou by e-mail to see if they can look at them;
     # include the isPcr parameters.
     # To extract the sequence from a PSL, the tName, tStart and tEnd fields
     # (14, 16 and 17) are needed; then faFrag was used to get the sequence
     # from the FASTA file.
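     # A minimal sketch of that extraction (marker name and coordinates are
     # hypothetical):
     awk '{print $14, $16, $17}' isPcrRun/out/Z800.MGH.psl
     # then cut one of the printed regions out of the marker FASTA with
     # faFrag (usage: faFrag in.fa start end out.fa):
     faFrag markerSeqs/Z800.MGH.fa 120 320 Z800.frag.fa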
 
 ############################################################################
 ##  BLASTZ swap from panTro2 alignments (DONE 2006-05-07 markd)
     ssh hgwdev64
     mkdir /cluster/data/danRer3/bed/blastz.panTro2.swap
     ln -s blastz.panTro2.swap /cluster/data/danRer3/bed/blastz.panTro2
     cd /cluster/data/danRer3/bed/blastz.panTro2.swap
     
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \
 	-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	/cluster/data/panTro2/bed/blastz.danRer3/DEF >& swap.out&
 
    # create the net files
     ssh hgwdev
     cd /cluster/data/danRer3/bed/blastz.panTro2.swap/axtChain
     nice netClass -verbose=0 -noAr noClass.net danRer3 panTro2 danRer3.panTro2.net
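    # if the net is to be loaded into the database, it would be along these
    # lines (a sketch; only the netClass step was recorded here):
    netFilter -minGap=10 danRer3.panTro2.net \
        | hgLoadNet -verbose=0 danRer3 netPanTro2 stdin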
 
 ###########################################################################
 # LIFTOVER CHAINS TO DANRER4 (DONE, 2006-05-31 - 2006-06-06, hartera)
 # CLEANUP BLAT DIRECTORY (DONE, 2006-12-14, hartera)
    # Split (using makeLoChain-split) of danRer4 is doc'ed in makeDanRer4.doc
    # Do what makeLoChain-split says to do next (start blat alignment)
   # Use pk as it runs faster than kk. The makeLoChain scripts only run on
   # kk, so run the steps manually.
    ssh pk
    mkdir -p /cluster/data/danRer3/bed/liftOver
    cd /cluster/data/danRer3/bed/liftOver
 cat << '_EOF_' > align.csh
 #!/bin/csh -fe
 set oldAssembly = $1
 set oldNibDir = $2
 set newAssembly = $3
 set newSplitDir = $4
 set ooc = $5
 if ("$ooc" != "") then
     set ooc = '-ooc='$ooc
 endif
 
 set blatDir = /cluster/data/$oldAssembly/bed/blat.$newAssembly.`date +%Y-%m-%d`
 echo "Setting up blat in $blatDir"
 rm -fr $blatDir
 mkdir $blatDir
 cd $blatDir
 mkdir raw psl run
 cd run
 
 echo '#LOOP' > gsub
 echo 'blat $(path1) $(path2) {check out line+ ../raw/$(root1)_$(root2).psl} ' \
        '-tileSize=11 '$ooc' -minScore=100 -minIdentity=98 -fastMap' \
   >> gsub
 echo '#ENDLOOP' >> gsub
 
 # target
 ls -1S $oldNibDir/*.{nib,2bit} > old.lst
 # query
 ls -1S $newSplitDir/*.{nib,fa} > new.lst
 
 gensub2 old.lst new.lst gsub spec
 /parasol/bin/para create spec
 
 echo ""
 echo "First two lines of para spec:"
 head -2 spec
 echo ""
 echo "DO THIS NEXT:"
 echo "    cd $blatDir/run"
 echo "    para try, check, push, check, ..."
 echo ""
 exit 0
 '_EOF_'
    # << emacs
    chmod +x align.csh
    align.csh danRer3 /san/sanvol1/scratch/danRer3/nib danRer4 \
        /san/sanvol1/scratch/danRer4/split10k \
        /san/sanvol1/scratch/danRer4/danRer4_11.ooc >&! align.log &
    # Took a few seconds.
    # Do what its output says to do next (start cluster job)
    cd /cluster/data/danRer3/bed/blat.danRer4.2006-05-31/run
    para try, check, push, check, ...
    para time
 # Completed: 784 of 784 jobs
 # CPU time in finished jobs:    2011355s   33522.59m   558.71h   23.28d  0.064 y
 # IO & Wait Time:                  3926s      65.43m     1.09h    0.05d  0.000 y
 # Average job time:                2571s      42.84m     0.71h    0.03d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:          205412s    3423.53m    57.06h    2.38d
 # Submission to last job:        219860s    3664.33m    61.07h    2.54d
    
    ssh pk
    cd /cluster/data/danRer3/bed/liftOver
 
 cat << '_EOF_' > lift.csh
 #!/bin/csh -ef
 set oldAssembly = $1
 set newAssembly = $2
 set newLiftDir = /san/sanvol1/scratch/$newAssembly/split10k
 
 set prefix = /cluster/data/$oldAssembly/bed/blat.$newAssembly
 set blatDir = `ls -td $prefix.20* | head -1`
 echo "using dir $blatDir"
 
if ( ! -e $blatDir/raw ) then
    echo "Can't find $blatDir/raw"
    exit 1
endif
 
 if (`ls -1 $newLiftDir/*.lft | wc -l` < 1) then
     echo "Can't find any .lft files in $newLiftDir"
     exit 1
 endif
 cd $blatDir/raw
 
 foreach chr (`awk '{print $1;}' /cluster/data/$newAssembly/chrom.sizes`)
     echo $chr
     liftUp -pslQ ../psl/$chr.psl $newLiftDir/$chr.lft warn chr*_$chr.psl
 end
 
 set execDir = $0:h
 echo ""
 echo "DO THIS NEXT:"
 echo "    ssh pk"
 echo "    $execDir/makeLoChain-chain $oldAssembly <$oldAssembly-nibdir> $newAssembly <$newAssembly-nibdir>"
 echo ""
 exit 0
 '_EOF_'
    # << emacs
    chmod +x lift.csh
    lift.csh danRer3 danRer4 >&! lift.log &
   # makeLoChain-chain can be run on pk. Chain the alignments:
 
    makeLoChain-chain danRer3 /san/sanvol1/scratch/danRer3/nib \
                      danRer4 /san/sanvol1/scratch/danRer4/nib >&! chain.log &
    cd /cluster/data/danRer3/bed/blat.danRer4.2006-05-31/chainRun
    para try, check, push, check, ...
    para time
 # Completed: 28 of 28 jobs
 # CPU time in finished jobs:       3414s      56.91m     0.95h    0.04d  0.000 y
 # IO & Wait Time:                  3256s      54.26m     0.90h    0.04d  0.000 y
 # Average job time:                 238s       3.97m     0.07h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:             280s       4.67m     0.08h    0.00d
 # Submission to last job:           280s       4.67m     0.08h    0.00d
 
    # net alignment chains
    ssh kkstore02
    cd /cluster/data/danRer3/bed/liftOver
    makeLoChain-net danRer3 danRer4 >&! net.log &
    # load reference to over.chain into database table,
    # and create symlinks  /gbdb  and download area
    ssh hgwdev
    cd /cluster/data/danRer3/bed/liftOver
    makeLoChain-load danRer3 danRer4 >&! load.log &
    # clean up
    rm *.log
   # remake md5sum.txt to include this new liftOver file
    cd /usr/local/apache/htdocs/goldenPath/danRer3/liftOver
    rm md5sum.txt
    md5sum *.gz > md5sum.txt
    # copy README.txt from another liftOver directory if it is not there already.
    # test by converting a region using the "convert" link on
    # the browser, and comparing to blat of the same region
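   # a quick command-line spot check of the chain (coordinates are
   # hypothetical):
   printf "chr1\t1000000\t1001000\n" > test.bed
   liftOver test.bed \
       /usr/local/apache/htdocs/goldenPath/danRer3/liftOver/danRer3ToDanRer4.over.chain.gz \
       test.danRer4.bed test.unmapped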
    
    # CLEANUP blat directory (2006-12-14, hartera)
    ssh kkstore02
    rm -r /cluster/data/danRer3/bed/blat.danRer4.2006-05-31
 
 ###########################################################################
 # CREATE MICROARRAY DATA TRACK BY ADDING ZON LAB WILD TYPE MICROARRAY DATA TO 
 # AFFY ZEBRAFISH ALIGNMENTS (DONE, 2006-06-10, hartera)
 # UPDATE ARRAY DATA TRACK AFTER PROCESSING ARRAY DATA DIFFERENTLY AND
 # RELOADING INTO hgFixed (see hgFixed.txt for details).
 # (DONE, 2006-10-20, hartera)
 # UPDATE ARRAY DATA TRACK AFTER REPROCESSING ARRAY DATA TO ANTILOG THE LOG2
 # VALUES FROM NORMALISATION TO GET THE ABSOLUTE VALUES AND
 # RELOADING INTO hgFixed (see hgFixed.txt for details).
 # (DONE, 2007-01-08, hartera)
 # RE-ORDERED DISPLAY IN TRACK (DONE, hartera, 2007-04-09)
 # Array data is for whole embryos of five wild type zebrafish strains. 
 # Data is in hgFixed (see hgFixed.doc) - from Len Zon's lab at Children's 
 # Hospital Boston. Contact: adibiase@enders.tch.harvard.edu
     ssh hgwdev
     mkdir /cluster/data/danRer3/bed/ZonLab/wtArray
     cd /cluster/data/danRer3/bed/ZonLab/wtArray
    
     # use AllRatio table for mapping. There are not many arrays in this
     # dataset so using AllRatio will allow the selection of All Arrays
     # from the track controls on the track description page. Also set up the
     # Zebrafish microarrayGroups.ra so that the Medians of replicates or
     # Means of replicates can also be selected for display.
     # Create mapped data in zebrafishZonWT.bed.
     # For the track updates, first remove the old bed file and drop the
     # old table:
     rm zebrafishZonWT.bed
     hgsql -e 'drop table affyZonWildType;' danRer3
     hgMapMicroarray zebrafishZonWT.bed hgFixed.zebrafishZonWTAllRatio \
          /cluster/data/danRer3/bed/affyZebrafish/affyZebrafish.psl
     # Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTAllRatio
     # Mapped 14494,  multiply-mapped 4102, missed 0, unmapped 1123
 
     # Load mapped data into database:
     hgLoadBed danRer3 affyZonWildType zebrafishZonWT.bed
     # Loaded 18596 elements of size 15
     # add trackDb.ra entry at trackDb/zebrafish level
 
     # look at range of scores:
     hgsql -N -e 'select expScores from zebrafishZonWTAllRatio;' hgFixed \
           > ratioExps.out
     perl -pi.bak -e 's/,/\n/g' ratioExps.out
     sort ratioExps.out | uniq -c > ratioExps.uniq.count
     textHistogram -binSize=0.5 -real -maxBinCount=40 -minVal=-10 \
         ratioExps.out > expRatios.hist
     # Most values are between -3 and +2.
     # Therefore use the following trackDb entry:
 
 # track affyZonWildType
 # shortLabel Wild Type Array
 # longLabel Zon Lab Expression data for Wild Type Zebrafish strains
 # group regulation
 # priority 80
 # visibility hide
 # type expRatio
 # expScale 2.0
 # expStep 0.2
 # groupings affyZonWildTypeGroups
     # The .ra file in /usr/local/apache/cgi-bin/hgCgiData/Zebrafish
     # (from ~/kent/src/hg/makeDb/hgCgiData/Zebrafish in the source tree)
     # which is microarrayGroups.ra defines how the array data is
     # displayed and also grouped for the Medians and Means of Replicates.
     # It also defines the labels for the track controls for showing
     # All Arrays, Arrays Grouped By Replicate Means or
     # Arrays Grouped By Replicate Medians. This is in the description field.
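     # For illustration only, a grouping stanza in microarrayGroups.ra has
     # this shape (the key names are those manipulated by the formatArray
     # script below; the values here are made up):
     # description Arrays Grouped By Replicate Medians
     # expIds 0,1,2
     # groupSizes 9,6,4
     # names AB-36-hpf,AB-14-somites,TL-36-hpf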
 
     # RE-ORDER DISPLAY IN TRACK - (hartera, 2007-04-09)
     ssh hgwdev
     cd ~/kent/src/hg/makeDb/hgCgiData/Zebrafish
     # 14 somites and 15 somites should come before 36 hpf
     # 14-19 somites stage is 16-19h.
     # from hgFixed.zebrafishZonWTAllExps
     # for AB, 0-8 should go after 14, 
     # for TL, 16-22 should go after 24
     # for TU, 25-27 should go after 32
     # re-order accordingly in the config file:
     cd /cluster/data/danRer3/bed/ZonLab/wtArray
 cat << '_EOF_' > formatArray
 #!/usr/bin/awk -f
 BEGIN {FS=","} {OFS=","}
 /expIds/ {
     sub(/expIds /,"",$0);
     print "expIds "$10,$11,$12,$13,$14,$15,$1,$2,$3,$4,$5,$6,$7,$8,$9,$16,$24,$25,$17,$18,$19,$20,$21,$22,$23,$29,$30,$31,$32,$33,$26,$27,$28,$34;
     next;
 }
 /names AB-36-hpf,AB-36-hpf 2/ {
     sub(/names /,"",$0);
     print "names "$10,$11,$12,$13,$14,$15,$1,$2,$3,$4,$5,$6,$7,$8,$9,$16,$24,$25,$17,$18,$19,$20,$21,$22,$23,$29,$30,$31,$32,$33,$26,$27,$28,$34;
     next;
 }
 /names AB-36-hpf,AB-14-somites/ {
     sub(/names /,"",$0);
     print "names "$2,$1,$3,$5,$4,$7,$8,$6,$9;
     next;
 }
 /groupSizes 9/ {
     sub(/groupSizes /,"",$0);
     print "groupSizes "$2,$1,$3,$5,$4,$7,$8,$6,$9;
     next;
 }
 {
     print $0;
 }
 '_EOF_'
     chmod +x formatArray
     formatArray ~/kent/src/hg/makeDb/hgCgiData/Zebrafish/microarrayGroups.ra \
                 > microarrayGroups2.ra
     cp microarrayGroups2.ra \
        ~/kent/src/hg/makeDb/hgCgiData/Zebrafish/microarrayGroups.ra
     cd ~/kent/src/hg/makeDb/hgCgiData/
     make my 
     # after doing make, check the display on hgwdev-hartera;
     # it works fine, so commit the change to CVS.
 
 ###########################################################################
 # BUILD GENE SORTER TABLES (AKA FAMILY BROWSER) 
 # (DONE, 2006-06-08 - 2006-06-12, hartera)
 # Zon Lab WT Affy data tables in hgFixed renamed to reflect that the data 
 # is log2 transformed (DONE, 2006-07-30, hartera)
 # Recreate the ensToAffyZebrafish and ensToAffyZonWildType tables after 
 # updating the Affy Zebrafish track with different filtering used for the 
 # Blat alignments - see UPDATE AFFY ZEBRAFISH TRACK section. Also the 
 # Affy Zon Lab Wild Type Array data was updated with a different method of 
 # processing - see hgFixed.txt (DONE, 2006-10-25, hartera)
 # Recreated the ensCanonical and ensIsoforms table after updating proteinID
 # in ensGene table (DONE, 2006-11-06, hartera) 
 #  This should be done after creating ensGene, ensGtp and ensPep tables
 #  for the Ensembl Genes track.
 #  The BlastTab tables are already built - see HGNEAR PROTEIN BLAST TABLES
 #  Blastp of self is ensZfishBlastTab table.
 #  Other blastp ortholog tables are: hgBlastTab (hg18), mmBlastTab(mm8), 
 #  rnBlastTab (rn4), dmBlastTab (dm2), ceBlastTab (ce2), 
 #  sacCerBlastTab (sacCer1).
    ssh hgwdev
    mkdir /cluster/data/danRer3/bed/geneSorter.2006-06-08
    ln -s /cluster/data/danRer3/bed/geneSorter.2006-06-08 \
          /cluster/data/danRer3/bed/geneSorter
    cd /cluster/data/danRer3/bed/geneSorter
    # Create table that maps between known genes and RefSeq
   # The index is only on the first 16 characters, which is too short for
   # Ensembl names, so hgMapToGene was changed by hand to create a 20
   # character index on name, and a local copy of the program was used.
    $HOME/bin/x86_64/hgMapToGene danRer3 refGene ensGene ensToRefSeq
    # hgsql -e 'select count(*) from ensToRefSeq;' danRer3
    # 9707
 
    # Create table that maps between Ensembl genes and LocusLink 
    # LocusLink is now called Entrez Gene.
    hgsql -N -e "select mrnaAcc,locusLinkId from refLink" danRer3 > refToLl.txt
    $HOME/bin/x86_64/hgMapToGene danRer3 refGene ensGene \
          ensToLocusLink -lookup=refToLl.txt
 
    # Update the following three tables after update of Affy Zebrafish and
    # Affy Zon Lab Wild Type data (2006-10-25):
    hgsql -e 'drop table ensToAffyZebrafish;' danRer3
    hgsql -e 'drop table ensToAffyZonWildType;' danRer3
    hgsql -e 'drop table zebrafishZonWTDistance;' danRer3
 
   # Create table that maps between Ensembl genes and the Affy Zebrafish
   # probeset consensus sequences.
    $HOME/bin/x86_64/hgMapToGene danRer3 affyZebrafish ensGene \
            ensToAffyZebrafish
 
    # Create a table that maps between Ensembl genes and 
    # the Zon lab microarray expression data.
    $HOME/bin/x86_64/hgMapToGene "-type=bed 12" danRer3 affyZonWildType \
                ensGene ensToAffyZonWildType 
 
    # Create expression distance table.
    nice hgExpDistance danRer3 hgFixed.zebrafishZonWTMedianRatio \
         hgFixed.zebrafishZonWTMedianExps zebrafishZonWTDistance  \
         -lookup=ensToAffyZebrafish &
   # Have 15617 elements in hgFixed.zebrafishZonWTMedianRatio
   # Got 8911 unique elements in hgFixed.zebrafishZonWTMedianRatio
    # Made zebrafishZonWTDistance.tab
    # Loaded zebrafishZonWTDistance
    # Made query index
    # Took 2 minutes.
    # To allow data to be viewed in Gene Sorter, add the hgNearOk=1 
    # to the dbDb table entry for danRer3 on hgcentraltest -
    # see section on MAKE HGCENTRALTEST ENTRY FOR DANRER3.
 
   # added a protein ID field to ensGene before running hgClusterGenes.
   # Cluster together various alt-splicing isoforms; this creates the
   # isoforms and canonical tables (here ensIsoforms and ensCanonical).
   # Rebuild these after updating the ensGene table with protein IDs
   # from UniProt with >= 90% identity to Ensembl proteins.
   # (2006-11-06, hartera)
    hgsql -e 'drop table ensIsoforms;' danRer3
    hgsql -e 'drop table ensCanonical;' danRer3
    hgClusterGenes danRer3 ensGene ensIsoforms ensCanonical
    # Got 22877 clusters, from 32143 genes in 28 chromosomes
    # There are also 22877 genes in the ensGtp table so this is in agreement.
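   # cross-check against ensGtp (its columns are gene, transcript, protein):
   hgsql -N -e 'select count(distinct gene) from ensGtp;' danRer3
   # 22877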
 
 #######################################################################
 # UPDATE AFFY ZEBRAFISH TRACK USING BLAT WITHOUT -mask OPTION AND 
 # USING -repeats OPTION AND DIFFERENT FILTERING TO REMOVE SHORT 
 # ALIGNMENTS (DONE, 2006-09-27, hartera)
 # With the previous version of this track, QA found a number of short 
 # alignments of <= 30 bp and there are a number in the <= 50bp range.
 # These do not seem to be meaningful so filtering was changed to try to 
 # remove these alignments while retaining meaningful alignments.
 # pslCDnaFilter was used with the same settings as used for the
 # Genbank EST alignments for zebrafish. 
# Also use -minIdentity=90 for Blat instead of -minIdentity=95, since the
# higher minIdentity causes alignments to be dropped that should not be.
 # Blat's minIdentity seems to be more severe than that for pslReps or 
 # pslCDnaFilter as it takes insertions and deletions into account.
 # These are Jim's recommendations. 
 # Remove old Affy zebrafish directories (DONE, 2006-12-13, hartera)    
     # Array chip sequences already downloaded for danRer1
     ssh hgwdev
     cd /projects/compbio/data/microarray/affyZebrafish
     mkdir -p /san/sanvol1/scratch/affy
     cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
        /san/sanvol1/scratch/affy/
     # Set up cluster job to align Zebrafish consensus sequences to danRer3
     # remove old link and create new one
     rm /cluster/data/danRer3/bed/affyZebrafish
     mkdir -p /cluster/data/danRer3/bed/affyZebrafish.2006-09-27
     ln -s /cluster/data/danRer3/bed/affyZebrafish.2006-09-27 \
           /cluster/data/danRer3/bed/affyZebrafish
 
     # Align sequences on the pitakluster. Scaffolds were aligned for NA
     # and Un and lifted to chrom level afterwards. Chroms 1-25 and M
     # were aligned as ~5 Mb chunks.
 
     ssh pk
     cd /cluster/data/danRer3/bed/affyZebrafish
     mv /san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/scaffold*.fa \
        /san/sanvol1/scratch/danRer3/
     ls -1 /san/sanvol1/scratch/affy/Zebrafish_consensus.fa > affy.lst
     foreach f (/san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/*.fa)
        ls -1 $f >> genome.lst
     end
     wc -l genome.lst
     # 15149 genome.lst
     # for output:
     mkdir -p /san/sanvol1/scratch/danRer3/affy/psl
     # use -repeats option to report matches to repeat bases separately 
     # from other matches in the PSL output.
     echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer3/affy/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
 
     gensub2 genome.lst affy.lst template.sub para.spec
     para create para.spec
     para try, check, push ... etc.
     para time
 # Completed: 15149 of 15149 jobs
 #CPU time in finished jobs:      34672s     577.87m     9.63h    0.40d  0.001y
 #IO & Wait Time:                 41580s     692.99m    11.55h    0.48d  0.001 y
 #Average job time:                   5s       0.08m     0.00h    0.00d
 #Longest running job:                0s       0.00m     0.00h    0.00d
 #Longest finished job:             145s       2.42m     0.04h    0.00d
 #Submission to last job:          1400s      23.33m     0.39h    0.02d
 
     # need to do pslSort and lift up 
     ssh pk
     cd /san/sanvol1/scratch/danRer3/affy
     # Do sort, liftUp and then best-in-genome filter.
     # Only use alignments that have at least 95% identity in the aligned
     # region. Previously minCover was not used, since a lot of sequence is
     # in Un and NA, genes may be split up, and it is good to see all
     # alignments. However, a number of short alignments of <= 50 bp were
     # found; these are not meaningful, so minCover is needed. If it is
     # increased too much, hits on poor parts of the assembly will be missed.
     # Use pslCDnaFilter with the same parameters as used for the zebrafish
     # Genbank EST alignments.
     pslSort dirs raw.psl tmp psl
     pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl
 #                         seqs    aligns
 #             total:     14886   830753
 # drop minNonRepSize:     2753    745330
 #     drop minIdent:     2645    38916
 #     drop minCover:     2472    10516
 #        weird over:     384     1529
 #        kept weird:     308     403
 #    drop localBest:     2559    17395
 #              kept:     14494   18596
 # 97.3% were kept. 
 # There are 15502 Affy sequences originally aligned so there are now
 # 93.5% remaining.
     
     # lift up the coordinates to chrom level
     #pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
     # lift up chrom contigs to chrom level
     cat /cluster/data/danRer3/jkStuff/liftAll.lft \
      /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
      > allLift.lft
     liftUp affyZebrafish.psl allLift.lft warn contig.psl
     # Got 30168 lifts in allLift.lft
     # Lifting contig.psl
 
     # rsync these psl files 
     rsync -a --progress /san/sanvol1/scratch/danRer3/affy/*.psl \
          /cluster/data/danRer3/bed/affyZebrafish/
     ssh kkstore02
     cd /cluster/data/danRer3/bed/affyZebrafish
     # shorten names in psl file
     sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp
     mv affyZebrafish.psl.tmp affyZebrafish.psl
     pslCheck affyZebrafish.psl
     # psl is good
     # load track into database
     ssh hgwdev
     cd /cluster/data/danRer3/bed/affyZebrafish
     hgsql -e 'drop table affyZebrafish;' danRer3
     hgLoadPsl danRer3 affyZebrafish.psl
     # Add consensus sequences for Zebrafish chip
     # Copy sequences to gbdb if they are not there already
     mkdir -p /gbdb/hgFixed/affyProbes
     ln -s \
        /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
       /gbdb/hgFixed/affyProbes
     # these sequences were loaded previously so no need to reload. 
     hgLoadSeq -abbr=Zebrafish: danRer3 \
               /gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
     # Clean up
     rm batch.bak contig.psl raw.psl
     # check number of short alignments:
     hgsql -e \
      'select count(*) from affyZebrafish where (qEnd - qStart) <= 50;' danRer3
     # 6 
     # for previous filtered set, there were 1195 alignments of <= 50 bp so
     # this has improved. 
     hgsql -e 'select count(distinct(qName)) from affyZebrafish;' danRer3
     # 14494
     # Previously 14335 distinct affy sequences were aligned. Many of the 
     # short alignments may also have longer alignments to different regions 
     # of the genome that are good.
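     # to eyeball the few remaining short alignments (qName, tName, tStart
     # and tEnd are standard psl columns):
     hgsql -e \
      'select qName, tName, tStart, tEnd from affyZebrafish where (qEnd - qStart) <= 50;' danRer3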
 
     # CLEANUP:
     # remove old Affy Zebrafish alignment directories (hartera, 2006-12-13)
     ssh kkstore02
     cd /cluster/data/danRer3/bed
     rm -r affyZebrafish.2005-08-19
     rm -r affyZebrafish.2005-09-25
 
 #########################################################################
 # NEW RH MAP SEQUENCES FOR TRACK (in progress, 2006-10-12, hartera)
 # Data from Yi Zhou at Boston Children's Hospital:
 # yzhou@enders.tch.harvard.edu
     ssh kkstore02
     mkdir /cluster/data/danRer3/bed/rhMap-2006-10-03
     cd /cluster/data/danRer3/bed
     ln -s rhMap-2006-10-03 rhMap
     # download data files from e-mail:
     # rhSequenceSubmit100306.zip and rhSequenceSubmitSeq100306.zip
     unzip rhSequenceSubmit100306.zip
     unzip rhSequenceSubmitSeq100306.zip
     dos2unix rhSequenceSubmit100306.txt
     dos2unix rhSequenceSubmitSeq100306.txt
     # need to convert the format of the FASTA file to remove the line
     # numbers - see the sketch below; this was later done properly in the
     # REMAKE RADIATION HYBRID (RH) MAP TRACK section.
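     # a minimal sketch of that conversion (output name hypothetical); the
     # numbered-line rule is the same one used by the rhFix script in the
     # REMAKE RADIATION HYBRID (RH) MAP TRACK section below:
     awk '/^[0-9]+ / {$0 = $2} {print}' rhSequenceSubmitSeq100306.txt \
         > rhMapNoNums.fa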
 
 ###########################################################################
 # BACENDS CLEANUP (DONE, 2006-12-13, hartera)
     ssh kkstore02
     cd /cluster/data/danRer3/bed/bacends
     mv ./seqs/getCloneEnds.csh .
     rm CHORI73.* DH.* DHBacs.fullnames DHmorethan2.*
     rm bacEndsChroms.psl bacNAandUnScafs.psl
     rm bacends.lst genome.lst names.psl namesPls.uniq header pslCheck.log \
        raw*
     rm -r /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl
     rm -r /cluster/data/danRer3/bed/bacends/scaffoldsNAandUnPsl
     rm -r newPairs2
     rm -r /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnRun
     rm -r /cluster/data/danRer3/bed/bacends/scaffoldsNAandUnRun 
     rm -r singles pairs scores
     rm -r ./cloneandStsAliases/tmp
     rm ./cloneandStsAliases/*.bak ./cloneandStsAliases/*.tab \
        ./cloneandStsAliases/*.sort ./cloneandStsAliases/*.uniq 
     rm DH_bacends.fa
     rm -r liftedPsl 
     # the psl directory is large, gzip the contents
     cd psl
     gzip *.psl
 
 #########################################################################
 ##  Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
     hgsql -h genome-testdb hgcentraltest \
 	-e "update dbDb set orderKey = 451 where name = 'danRer3';"
 
 ##########################################################################
 # GenBank gbMiscDiff table (markd 2007-01-10)
 # Supports `NCBI Clone Validation' section of mgcGenes details page
 
    # genbank release 157.0 now contains misc_diff fields for MGC clones
    # reloading mRNAs results in gbMiscDiff table being created.
    ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer3
 
 ###########################################################################
 # REMAKE RADIATION HYBRID (RH) MAP TRACK (DONE, 2007-02-14, hartera)
 # Use update of Radiation Hybrid map data from October 2006 and use method
 # as documented in danRer4.txt to map these sequences to danRer3.
 # Data from Yi Zhou at Boston Children's Hospital:
 # yzhou@enders.tch.harvard.edu
 # Latest RH map sequences and primers received on 2006-10-03 from
 # Anhua (Peter) Song - asong@enders.tch.harvard.edu
     ssh kkstore02
     mkdir /cluster/data/danRer3/bed/ZonLab/rhMap-2006-10-03
     cd /cluster/data/danRer3/bed/ZonLab
     ln -s rhMap-2006-10-03 rhMap
     cd rhMap
     # download data files from e-mail:
     # rhSequenceSubmit100306.zip and rhSequenceSubmitSeq100306.zip
     unzip rhSequenceSubmit100306.zip
     unzip rhSequenceSubmitSeq100306.zip
     dos2unix rhSequenceSubmit100306.txt
     dos2unix rhSequenceSubmitSeq100306.txt
     # Sequences are in rhSequenceSubmitSeq100306.txt and primers and other
     # information are in rhSequenceSubmit100306.txt
     grep '>' rhSequenceSubmitSeq100306.txt | wc -l
     # 11514
     wc -l  rhSequenceSubmit100306.txt
     # 13438 rhSequenceSubmit100306.txt
    
     grep '>' rhSequenceSubmitSeq100306.txt > rhMap.names
 
     # remove '>' from names and grab first field
     perl -pi.bak -e 's/>//' rhMap.names
     awk 'BEGIN {FS="|"} {print $1;}' rhMap.names | sort | uniq \
         > rhMap.namesOnly.sort
     awk 'BEGIN {FS="|"} {print $1;}' rhSequenceSubmit100306.txt | sort | uniq \
         > rhMapPrimers.namesOnly.sort
     wc -l *.sort
     # 11514 rhMap.namesOnly.sort
     # 13436 rhMapPrimers.namesOnly.sort (after removing blank line)
     
     # get a list of headers from the FASTA file
     grep '>' rhSequenceSubmitSeq100306.txt > rhMap.headers
     awk 'BEGIN {FS="|"} {print $5;}' rhMap.headers | sort | uniq
 # BAC_END
 # EST
 # GENE
 # SSLP
 # STS
     # There are 5 types of sequence here.
     awk 'BEGIN {FS="|"} {print $9;}' rhMap.headers | sort | uniq
 #BACends
 #Custom
 #Insertion_Mutant
 #Insertion_Mutants
 #MGH
 #NCBI
 #Sanger SG
 #Sequencing_Project
 #ThisseClone
 #Thisse_Clone
 #other_zfEst
 #wu_zfEst
 #wz
     awk 'BEGIN {FS="|"} {print $10;}' rhMap.headers | sort | uniq
     # CHBG
     # MPIEB
 
 # Insertion_Mutant = Insertion_Mutants; ThisseClone = Thisse_Clone;
 # So there are 11 different sources.
     # There are 2 sequences with problem primers. E-mailed Peter Song about
     # these and he suggested deleting those primers:
     # >fb33f01.u1|5|388|5615|EST|f|cR|f|wu_zfEst|CHBG|+++33333333333333333333.|
     # >zfishb-a976e04.p1c|14|16|158|STS|f|cR|f|Sequencing_Project|CHBG|A|A| 
     # edit the sequence and primers files to delete these primers.
     # need to reformat the FASTA headers so they are in the format:
     # NAME.SOURCE
     # Insertion_Mutant=Insertion_Mutants; Thisse_Clone=ThisseClone,
     # so change these to have the same name. Also shorten Sanger SG to
     # Shotgun.
     sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmitSeq100306.txt \
        | sed -e 's/Insertion_Mutant/InsertMut/' \
        | sed -e 's/Sanger SG/Shotgun/' \
        | sed -e 's/ThisseClone/Thisse/' \
        | sed -e 's/Thisse_Clone/Thisse/' \
        | sed -e 's/Sequencing_Project/Seqproj/' > rhMap100306.fa
     # Do the same for the primers and information file:
     sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmit100306.txt \
        | sed -e 's/Insertion_Mutant/InsertMut/' \
        | sed -e 's/Sanger SG/Shotgun/' \
        | sed -e 's/ThisseClone/Thisse/' \
        | sed -e 's/Thisse_Clone/Thisse/' \
        | sed -e 's/Sequencing_Project/Seqproj/' > rhMapPrimers100306.txt
     # edit these files to remove the extra newline char after the first primer
     # for 1942c and then change "/" in FJ34C05.Y1/FJ56G09.Y1.WU_ZFEST to 
     # an underscore (2007-02-14, hartera)
     perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \
          rhMap100306.fa
     perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \
          rhMapPrimers100306.txt
 
     # use a script to reformat the names for the FASTA headers to the format 
     # >NAME.SOURCE where name is the first field separated by "|" and source
     # is the 9th field. The source is used to make the name unique. Some
     # of these names are BAC ends that occur in the BAC ends track so there
     # are name clashes in the seq table if the names are not made unique.
     # Also make the name upper case as for those for the danRer1 and danRer2
     # RH map and remove base numbering on each sequence line of FASTA file.
 cat << '_EOF_' > rhFix
 #!/usr/bin/awk -f 
 
 #>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
 /^>/ {
     split(toupper($0), a, "\\|");
     print a[1]"."a[9];
     next;
 }
 
 /^[0-9]+ / {
     $0 = $2;
 }
 
 {
     print $0;
 }
 
 '_EOF_'
 # << keep emacs coloring happy
     chmod +x rhFix
     rhFix rhMap100306.fa > rhMap.fa
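     # e.g. the sample header in the script's comment becomes ">Z1396.MGH"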
     # Blat sequences vs danRer3 genome
     ssh pk
     mkdir -p /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
     cd /cluster/data/danRer3/bed/ZonLab/rhMap
     # put the rhMap sequences on the san 
     mkdir -p /san/sanvol1/scratch/danRer3/rhMap
     cp rhMap.fa /san/sanvol1/scratch/danRer3/rhMap/
     # do blat run to align the RH map sequences to danRer3, with
     # chrNA_random and chrUn_random separated into scaffolds.
     cd blatRun
     ls -1S /san/sanvol1/scratch/danRer3/rhMap/rhMap.fa > rhMap.lst
     foreach f (/san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/*.fa)
        ls -1S $f >> genome.lst
     end
     wc -l genome.lst 
     # 15149 genome.lst
     # for output:
     mkdir -p /san/sanvol1/scratch/danRer3/rhMap/psl
     # use -repeats option to report matches to repeat bases separately
     # from other matches in the PSL output.
     echo '#LOOP\n/cluster/bin/x86_64/blat -repeats=lower -minIdentity=80 -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer3/rhMap/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
 
     gensub2 genome.lst rhMap.lst template.sub para.spec
     para create para.spec
     para try, check, push ... etc.
     para time
 # Completed: 15149 of 15149 jobs
 #CPU time in finished jobs:      13684s     228.07m     3.80h    0.16d  0.000y
 #IO & Wait Time:                 38258s     637.63m    10.63h    0.44d  0.001 y
 #Average job time:                   3s       0.06m     0.00h    0.00d
 #Longest running job:                0s       0.00m     0.00h    0.00d
 #Longest finished job:              24s       0.40m     0.01h    0.00d
 #Submission to last job:           901s      15.02m     0.25h    0.01d
 
     # need to do pslSort and lift up
     ssh pk
     cd /san/sanvol1/scratch/danRer3/rhMap
     # Do sort, liftUp and then best in genome filter.
     pslSort dirs raw.psl tmp psl 
     # Only use alignments that have at least 85% identity in the aligned
     # region (-minId below). Previously minCover was not used, since a lot
     # of sequence is in Un and NA, genes may be split up, and it is good to
     # see all alignments. However, a number of short alignments of <= 50 bp
     # were found; these are not meaningful, so minCover is needed. If it is
     # increased too much, hits on poor parts of the assembly will be missed.
     # Use pslCDnaFilter with the same parameters as used for the zebrafish
     # Genbank EST alignments.
         
     # Use parameters as determined for danRer4
     pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
        -ignoreNs -bestOverlap -minId=0.85 -minCover=0.33 raw.psl contig.psl
 #                         seqs    aligns
 #             total:     11060   1767931
 #      drop invalid:     1       1
 #drop minNonRepSize:     3047    1297013
 #     drop minIdent:     763     3913
 #     drop minCover:     4065    420022
 #        weird over:     288     4267
 #        kept weird:     130     189
 #    drop localBest:     2188    34092
 #              kept:     10447   12890
     # Percent sequences aligned: 10447/11514 = 90.7%
     # This is a compromise between reducing the number of sequences piling
     # up and not losing all alignments for too many sequences.
     awk '{print $10}' contig.psl | sort | uniq -c | sort -nr > contig.count
     head contig.count
 #      33 ZKP106G9.YA.BACENDS
 #     21 BZ83M20.Z.BACENDS
 #     12 ZK4I5.T7.BACENDS
 #     10 ZC27I3.ZA.BACENDS
 #     10 ZC261G9.ZAF.BACENDS
 #     10 ZC261G9.ZA.BACENDS
 #      8 ZK8O7.T7.BACENDS
 #      8 ZC77P2.ZB.BACENDS
 #      8 FJ89A05.X1.WU_ZFEST
 #      8 FJ07G09.X1.WU_ZFEST
 
     cd /cluster/data/danRer3/bed/ZonLab/rhMap
     # lift up to genome level coordinates
     liftUp rhMap.psl \
            /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft warn \
            /san/sanvol1/scratch/danRer3/rhMap/contig.psl
     # Got 30168 lifts in
     # /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft
     pslCheck rhMap.psl
     # psl looks ok
     # cleanup 
     rm *.bak *.sort
     # Load sequence alignments into the database
     ssh hgwdev
     cd /cluster/data/danRer3/bed/ZonLab/rhMap   
     # drop old table and reload final psl file
     hgsql -e 'drop table rhMap;' danRer3 
     hgLoadPsl danRer3 rhMap.psl
      
     # Copy sequences to gbdb if they are not already there.
     mkdir -p /gbdb/danRer3/rhMap
     # remove old sequences
     rm /gbdb/danRer3/rhMap/rhMap022306.fa
     ln -s \
       /cluster/data/danRer3/bed/ZonLab/rhMap/rhMap.fa \
       /gbdb/danRer3/rhMap/rhMap20061003.fa
 
     # then add sequences to database:
     # remove old ones first
     hgsql -e 'select * from extFile where path like "%rhMap%";' danRer3
 #| id     | name           | path                               | size    |
 #+--------+----------------+------------------------------------+---------+
 #| 747628 | rhMap022306.fa | /gbdb/danRer3/rhMap/rhMap022306.fa | 7456861 |
 #+--------+----------------+------------------------------------+---------+
     hgsql -e 'select count(*) from seq where extFile = 747628;' danRer3
     hgsql -e 'delete from seq where extFile = 747628;' danRer3
     # delete from extFile:
     hgsql -e 'delete from extFile where id = 747628;' danRer3
     hgLoadSeq danRer3 /gbdb/danRer3/rhMap/rhMap20061003.fa
     # loaded successfully
     # Check in the Browser and see if there are many pileups.
     # Much reduced now on chr24. Took 10 random sequences in the pileup from
     # minCover=0.20 and found that 7 of them still align to danRer4 
     # with minCover=0.33, and 2 of those that do not align also have primers
     # that do not map using the hgPcr tool.
     # Add trackDb entry and also an rhMap.html for trackDb/zebrafish/danRer3;
     # also add the search specs for hgFindSpec to trackDb.ra
 
     # Add table of related information for the RH map details pages:
     
     # Check that all the headers from rhMap.headers are also in the primers
     # file which seems to contain the same headers from the FASTA file
     # as well as additional markers.
     ssh kkstore02
     cd /cluster/data/danRer3/bed/ZonLab/rhMap/
     # The same RH map version was used as for danRer4, so the data for the 
     # info table is the same; copy the file over. See 
     # kent/src/hg/makeDb/doc/danRer4.txt for details on how this file is 
     # produced.
     cp /cluster/data/danRer4/bed/ZonLab/rhMap/rhMapInfoWithZfinIds.tab . 
     # load the info table
     ssh hgwdev
     cd /cluster/data/danRer3/bed/ZonLab/rhMap
     hgLoadSqlTab danRer3 rhMapZfishInfo ~/kent/src/hg/lib/rhMapZfishInfo.sql \
           rhMapInfoWithZfinIds.tab
     
     # add code to hgc.c to print ZFIN ID, if available, on the details page
     # together with the other marker-related information.
     # added track to trackDb.ra in trackDb/zebrafish/danRer3 with a URL for 
     # the ZFIN IDs to link to the relevant page at http://www.zfin.org 
     # and added an html page for the track.
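     # for reference, the trackDb entry has the usual shape (illustrative
     # only; the real entry is in trackDb/zebrafish/danRer3/trackDb.ra):
     # track rhMap
     # shortLabel RH Map
     # longLabel Radiation Hybrid Map
     # group map
     # type psl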
 
 #########################################################################
 # BACENDS CLEANUP (DONE, 2007-03-27, hartera)
     ssh kkstore02
     cd /cluster/data/danRer3/bed/bacends
     # 23G in bacends directory
     # remove sequence file as already in bacSeqs dir
     rm Zv5Bacends.fa
     # du -sh psl   
     # 12G psl
     nice rm -r psl 
     cd bacends.1
     rm bacEndAccs.aliases bacEnds.log bacEnds.names.sort bacPrs.names bacs.log
     rm ch211 intNames intNames.count out test test.pl bacEndSingles.txt
     rm -r test2 bacEndAccs
     rm BACClones* BACEndAccs.txt *.accs allBacEnds* bacEndSingles.names
     cd ../scoresAndCoords
     rm allBacEnds.names.* bacEndSinglesGoodScores.bed error.log *.tab \
        singles.hits bacEnds.load.psl bacEnds.names 
     rm -r tmp
     cd ../pairsNew
     # bacEndSingles.bed is already in singlesNew
     rm bacEnds.* bed.tab bacEndSingles.bed
     cd ../singlesNew
     rm singles.coordcheck bed.tab bacEnds.*
     cd ../duplicates
     # duplicatesNew is the latest directory so remove everything else from
     # the duplicates directory
     rm *
     cd duplicatesNew
     rm log* *.lfs
     cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
     rm -r tmp
     rm bacClones* bacs.names log
     cd /cluster/data/danRer3/bed/
     du -sh bacends
     # 5.0G    bacends
     # BAC ENDS track was remade in May 2006 (see REDO BACENDS section)
     # so can remove newBacends, which is an old version from 2005
     du -sh newBacends
     # 37G     newBacends
     nice rm -r newBacends
 #########################################################################
 
 ################################################
 # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
echo "danRer3 fr1 tetNig1 mm7 hg18" > /hive/data/genomes/danRer3/bed/multiz5way/species.list
# then update genbank.conf with:
 danRer3.upstreamGeneTbl = refGene
 danRer3.upstreamMaf = multiz5way /hive/data/genomes/danRer3/bed/multiz5way/species.list