src/hg/makeDb/doc/danRer4.txt 1.35
1.35 2009/11/25 21:48:39 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/danRer4.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/danRer4.txt,v
retrieving revision 1.34
retrieving revision 1.35
diff -b -B -U 1000000 -r1.34 -r1.35
--- src/hg/makeDb/doc/danRer4.txt 17 Oct 2008 01:06:31 -0000 1.34
+++ src/hg/makeDb/doc/danRer4.txt 25 Nov 2009 21:48:39 -0000 1.35
@@ -1,10285 +1,10285 @@
# for emacs: -*- mode: sh; -*-
# Danio rerio (zebrafish) from Sanger, version Zv6 (released March 2006)
# Project website:
# http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
# http://www.sanger.ac.uk/Projects/D_rerio/
# ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6_assembl_information.shmtl
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-----------+-------------------------+
# | tableName | type |
# +-----------+-------------------------+
# | refGene | genePred refPep refMrna |
# | mgcGenes | genePred |
# | genscan | genePred genscanPep |
# +-----------+-------------------------+
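# For example (a sketch for a future build, not how the tables above were
# originally loaded; the file name is illustrative), hgLoadGenePred creates
# genePred tables with a bin column:
#    hgLoadGenePred danRer4 mgcGenes mgcGenes.gp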
###########################################################################
# DOWNLOAD SEQUENCE (DONE, 2006-03-29, hartera)
# CHANGED NAME OF SCAFFOLDS AGP FILE (DONE, 2006-04-13, hartera)
ssh kkstore01
mkdir /cluster/store8/danRer4
ln -s /cluster/store8/danRer4 /cluster/data
cd /cluster/data/danRer4
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/README
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6.chunks.agp
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6.scaffold.agp
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6_scaffolds.fa
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6_scaffolds.stats
# keep agp file name consistent with Zv5 (hartera, 2006-04-13)
mv Zv6.scaffold.agp Zv6.scaffolds.agp
###########################################################################
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2006-03-29, hartera)
# ADDED CHUNKS AGP FILE (DONE, 2006-04-13, hartera)
ssh kkstore01
mkdir -p /cluster/data/danRer4/M
cd /cluster/data/danRer4/M
# go to http://www.ncbi.nih.gov/ and search the Nucleotide database for
# "Danio mitochondrion genome". That shows the gi number:
# 8576324 for the accession, AC024175
# Use that number in the entrez linking interface to get fasta:
wget -O chrM.fa \
'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
# Edit chrM.fa: make sure the header line says it is the
# Danio rerio mitochondrion complete genome, and then replace the
# header line with just ">chrM".
perl -pi.bak -e 's/>.+/>chrM/' chrM.fa
rm *.bak
# Make a "pseudo-contig" for processing chrM too:
mkdir ./chrM_1
sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
mkdir ./lift
echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
echo "chrM_1" > ./lift/ordered.lst
# make sure this is tab delimited:
echo "0\tM/chrM_1\t16596\tchrM\t16596" > ./lift/ordered.lft
# create a .agp file for chrM as hgGoldGapGl and other
# programs require a .agp file so create chrM.agp
echo "chrM\t1\t16596\t1\tF\tAC024175.3\t1\t16596\t+" \
> chrM.agp
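# Note: csh echo does not expand "\t", so check these files and fix the
# tabs by hand if needed. A sketch of a way to guarantee literal tabs:
#    printf "chrM\t1\t16596\t1\tF\tAC024175.3\t1\t16596\t+\n" > chrM.agp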
# Create a chrM.chunks.agp (hartera, 2006-04-13)
mkdir -p /cluster/data/danRer4/M/agps
cd /cluster/data/danRer4/M/agps
awk 'BEGIN {OFS="\t"} \
{print $1, $2, $3, $4, $5, $6, $7, $8, $1, $7, $8}' \
../chrM.agp > chrM.chunks.agp
# make sure that all above *.agp files are tab delimited
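# a quick check (sketch): each file should report a single field count
#    awk -F'\t' '{print NF}' ../chrM.agp chrM.chunks.agp | sort | uniq -c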
###########################################################################
# CREATE LIST OF CHROMOSOMES (DONE, 2006-04-12, hartera)
# Change names of random chroms to chrNA_random and chrUn_random
# (DONE, hartera, 2006-04-21)
ssh kkstore01
cd /cluster/data/danRer4
awk '{if ($1 !~ /Zv6/) print $1;}' Zv6.scaffolds.agp \
| sort -n | uniq > chrom.lst
cp chrom.lst chrom1to25.lst
# add chrM, chrUn and chrNA
echo "M" >> chrom.lst
echo "NA" >> chrom.lst
echo "Un" >> chrom.lst
# Change names of random chroms to reflect that they are unordered
# collections of scaffolds
perl -pi.bak -e 's/NA/NA_random/' chrom.lst
perl -pi.bak -e 's/Un/Un_random/' chrom.lst
rm *.bak
###########################################################################
# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2006-04-12, hartera)
ssh kkstore01
cd /cluster/data/danRer4
# This used to hold scripts -- better to keep them inline here
# Now it should just hold lift file(s) and
# temporary scripts made by copy-paste from this file.
mkdir /cluster/data/danRer4/jkStuff
# This is where most tracks will be built:
mkdir /cluster/data/danRer4/bed
###########################################################################
# CHECK AGP FILES AND FASTA SIZE CONSISTENCY (DONE, 2006-04-13, hartera)
#
ssh kkstore01
cd /cluster/data/danRer4
mkdir -p /cluster/data/danRer4/scaffolds
cd /cluster/data/danRer4/scaffolds
faSize detailed=on ../Zv6_scaffolds.fa > Zv6.scaffolds.sizes
# Check that these sizes correspond to the sizes in the scaffolds agp file
# use script compareSizes2.pl
cat << '_EOF_' > ../jkStuff/compareSizes2.pl
#!/usr/bin/perl -w
use strict;
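# Compare each scaffold's size from the faSize detailed output (arg 1)
# with the component end coordinate recorded in the scaffolds agp (arg 2);
# write an ok/mismatch/missing line for each scaffold to log.txt.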
my ($file, $agp);
$file = $ARGV[0];
$agp = $ARGV[1];
open(FILE, $file) || die "Can not open $file: $!\n";
open(AGP, $agp) || die "Can not open $agp: $!\n";
open(OUT, ">log.txt") || die "Can not create log.txt: $!\n";
my ($l, @f, $name, $size, %scafsHash);
while (<FILE>)
{
$l = $_;
@f = split(/\t/, $l);
$name = $f[0];
$size = $f[1];
$scafsHash{$name} = $size;
}
close FILE;
while (<AGP>)
{
my ($line, @fi, $scaf, $end);
$line = $_;
if ($line =~ /Zv/)
{
@fi = split(/\t/, $line);
$scaf = $fi[5];
$end = $fi[7];
if (exists($scafsHash{$scaf}))
{
if ($scafsHash{$scaf} == $end)
{
print OUT "$scaf - ok\n";
}
else
{
print OUT "$scaf - different size to sequence\n";
}
}
else
{
print OUT "$scaf - does not exist in list of sizes\n";
}
}
}
close AGP;
close OUT;
'_EOF_'
# << happy emacs
chmod +x ../jkStuff/compareSizes2.pl
perl /cluster/data/danRer4/jkStuff/compareSizes2.pl \
Zv6.scaffolds.sizes ../Zv6.scaffolds.agp
grep different log.txt
grep not log.txt
# these are all consistent with the sequence sizes
# check that the co-ordinates in the agp files are consistent:
# field 2 is the start position, field 3 is the end and field 8 is the size
# so check that this is consistent.
cd /cluster/data/danRer4
awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' Zv6.scaffolds.agp \
> Zv6.scaffolds.coordCheck
# this file is empty so they are ok. do the same for the chunks.agp file
awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' Zv6.chunks.agp \
> Zv6.chunks.coordCheck
# this file is empty so ok
# check that the difference between 7th and 8th fields is the same as the
# difference between 11th and 12th fields.
awk '{if ($5 != "N" && (($8 - $7) != ($12 - $11))) print $6;}' \
Zv6.chunks.agp > Zv6.chunks.coordCheck2
# these are all ok
rm Zv6.*.coord*
cat << '_EOF_' > ./jkStuff/checkSizesInAgps.pl
#!/usr/bin/perl -w
use strict;
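# Build a hash of each scaffold's chromosome end coordinate from the
# scaffolds agp (arg 1), then walk the chunks agp (arg 2) and check that
# the last chunk of each scaffold ends at the same chromosome coordinate,
# printing "ok" or "not consistent" for every scaffold.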
my ($ch, $sc, %scafsHash);
$sc = $ARGV[0]; # scaffolds agp
$ch = $ARGV[1]; # chunks or contigs agp
open(SCAFS, $sc) || die "Can not open $sc: $!\n";
open(CHUNKS, $ch) || die "Can not open $ch: $!\n";
while (<SCAFS>)
{
my ($l, @f, $name, $e);
$l = $_;
@f = split(/\t/, $l);
if ($f[5] =~ /^Zv/)
{
$name = $f[5];
$e = $f[2];
$scafsHash{$name} = $e;
}
}
close SCAFS;
my $scaf = "";
my $prev = "";
my $prevEnd = 0;
while (<CHUNKS>)
{
my ($line, @fi);
$line = $_;
@fi = split(/\t/, $line);
# if it is not a gap line
if ($fi[4] ne "N")
{
$scaf = $fi[9];
if (($scaf ne $prev) && ($prev ne ""))
{
checkCoords($prev, $prevEnd);
}
$prev = $scaf;
$prevEnd = $fi[2];
}
}
# check last entry in file
checkCoords($prev, $prevEnd);
close CHUNKS;
sub checkCoords {
my ($name, $end) = @_;
if (exists($scafsHash{$prev}))
{
if ($scafsHash{$prev} != $prevEnd)
{
my $ed = $scafsHash{$prev};
print "Scaffold $prev is not consistent between agps\n";
}
else
{
my $ed = $scafsHash{$prev};
print "Scaffold $prev - ok\n";
}
}
}
'_EOF_'
# << happy emacs
chmod +x ./jkStuff/checkSizesInAgps.pl
cd scaffolds
perl /cluster/data/danRer4/jkStuff/checkSizesInAgps.pl \
Zv6.scaffolds.agp Zv6.chunks.agp > Zv6.scafsvschunks
grep "not consistent" Zv6.scafsvschunks
# no inconsistencies were reported
wc -l Zv6.scafsvschunks
# 6653 Zv6.scafsvschunks
grep "Zv6" Zv6.scaffolds.agp | wc -l
# 6653
# so all the scaffolds were checked and were ok.
cd ..
rm -r scaffolds
###########################################################################
# SPLIT AGP FILES BY CHROMOSOME (DONE, 2006-04-13, hartera)
# GENOME FASTA FROM SANGER WAS CREATED USING SCAFFOLDS AGP
ssh kkstore01
cd /cluster/data/danRer4
# There are 2 .agp files: one for scaffolds (supercontigs on danRer1) and
# then one for chunks (contigs on danRer1) showing how they map on to
# scaffolds.
# get list of scaffolds from FASTA file and check these are in agp
grep '>' Zv6_scaffolds.fa | sed -e 's/>//' | sort | uniq > Zv6FaScafs.lst
# get list of scaffolds from agp - do not print from gap lines
awk '{if ($7 !~ /contig/) print $6;}' Zv6.scaffolds.agp \
| sort | uniq > Zv6AgpScafs.lst
diff Zv6FaScafs.lst Zv6AgpScafs.lst
# no difference so all scaffolds are in the FASTA file
# add "chr" prefix for the agp files
perl -pi -e 's/^([0-9]+)/chr$1/' ./*.agp
# for chromosomes 1 to 25, create 2 agps for each chrom, one for scaffolds
# and one for chunks:
foreach c (`cat chrom1to25.lst`)
echo "Processing $c ..."
mkdir $c
perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
./Zv6.chunks.agp \
> $c/chr$c.chunks.agp
perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
./Zv6.scaffolds.agp \
> $c/chr$c.scaffolds.agp
end
###########################################################################
# CREATE AGP FILES FOR chrNA AND chrUn (DONE, 2006-04-13, hartera)
# RECREATE AGP FILES WITH chrNA and chrUn RENAMED AS chrNA_random
# AND chrUn_random (DONE, 2006-04-21, hartera)
# NOTE: IN THIS ASSEMBLY AND IN FUTURE, NAME chrNA AND chrUn AS
# chrNA_random AND chrUn_random TO REFLECT THAT THEY ARE UNORDERED
# COLLECTIONS OF SCAFFOLDS.
ssh kkstore01
# chrNA_random consists of WGS contigs that could not be related to any
# FPC contig and the scaffolds and contigs are named Zv6_NAN in the
# first field of the agp files where the second N is a number.
cd /cluster/data/danRer4
mkdir ./NA_random
awk '{if ($1 ~ /Zv6_NA/) print;}' Zv6.chunks.agp \
> ./NA_random/NA_random.chunks.agp
awk '{if ($1 ~ /Zv6_NA/) print;}' Zv6.scaffolds.agp \
> ./NA_random/NA_random.scaffolds.agp
# change the first field to "chrNA_random" then can use agpToFa to process
perl -pi.bak -e 's/Zv6_NA[0-9]+/chrNA_random/' ./NA_random/*.agp
wc -l ./NA_random/NA_random.scaffolds.agp
# 2898 ./NA_random/NA_random.scaffolds.agp
# check files and remove backup files
# these are not sorted numerically by scaffold number
rm ./NA_random/*.bak
# then process chrUn_random - this is made from scaffolds and
# contigs where the name is Zv6_scaffoldN in the first field of the
# agp files. These scaffolds and contigs are unmapped to chromosomes
# in the agp file. chrUn_random is made up of WGS scaffolds that mapped to
# FPC contigs, but the chromosome is unknown.
mkdir ./Un_random
awk '{if ($1 ~ /Zv6_scaffold/) print;}' Zv6.chunks.agp \
> ./Un_random/Un_random.chunks.agp
awk '{if ($1 ~ /Zv6_scaffold/) print;}' Zv6.scaffolds.agp \
> ./Un_random/Un_random.scaffolds.agp
# change the first field to "chrUn_random" then can use agpToFa to process
perl -pi.bak -e 's/Zv6_scaffold[0-9]+/chrUn_random/' ./Un_random/*.agp
wc -l ./Un_random/Un_random.scaffolds.agp
# 68 ./Un_random/Un_random.scaffolds.agp
# check files and remove backup files
rm ./Un_random/*.bak
# get FASTA file of sequences for NA_random and Un_random and create agp with
# Ns between scaffolds
# from scaffolds agp, get name of scaffolds to be retrieved from the
# FASTA file to make the NA_random and Un_random chromosomes.
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
awk '{print $6;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
$HOME/bin/i386/faSomeRecords /cluster/data/danRer4/Zv6_scaffolds.fa \
$c/chr$c.scaffolds.lst $c/chr$c.fa
end
# check that all scaffolds in the list are in the FASTA file for
# NA_random and Un_random.
# made a change to scaffoldFaToAgp.c so that the number of Ns to be
# inserted between scaffolds can be specified as an option.
# There are fewer and smaller random scaffolds than before so use 50,000 Ns
# between scaffolds as for the human random chromosomes.
foreach c (NA_random Un_random)
$HOME/bin/i386/scaffoldFaToAgp -scaffoldGapSize=50000 $c/chr$c.fa
mv $c/chr$c.fa $c/chr$c.scaffolds.fa
end
# change chrUn to chrNA_random for NA_random, and chrUn to chrUn_random
# for Un_random. Change D to W for NA_random and Un_random.
sed -e 's/chrUn/chrNA_random/' ./NA_random/chrNA_random.agp \
| sed -e 's/D/W/' > ./NA_random/chrNA_random.scaffolds.agp
# the scaffolds agp for chrNA_random is now sorted numerically by
# scaffold number
sed -e 's/chrUn/chrUn_random/' ./Un_random/chrUn_random.agp \
| sed -e 's/D/W/' > ./Un_random/chrUn_random.scaffolds.agp
# edit ./NA_random/chrNA_random.scaffolds.agp and
# ./Un_random/chrUn_random.scaffolds.agp and remove last line as this
# just adds an extra 50000 Ns at the
# end of the sequence.
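# (a sketch of making that edit non-interactively; sed '$d' deletes the
# final line)
#    sed '$d' NA_random/chrNA_random.scaffolds.agp > tmp.agp
#    mv tmp.agp NA_random/chrNA_random.scaffolds.agp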
rm ./NA_random/chrNA_random.agp ./Un_random/chrUn_random.agp
cat << '_EOF_' > ./jkStuff/createAgpWithGaps.pl
#!/usr/bin/perl
use strict;
# This script takes a chunks agp and inserts Ns between scaffolds for
# the chunks (contigs) agp file. Could also insert Ns between scaffolds
# for scaffolds agp.
my ($name, $prev, $st, $end, $prevEnd, $id);
my $chrom = $ARGV[0]; # chromosome name
my $numN = $ARGV[1]; # number of Ns to be inserted
my $type = $ARGV[2]; # contigs or scaffolds
$prev = "";
$st = 1;
$prevEnd = 0;
$id = 0;
while (<STDIN>)
{
my $l = $_;
my @f = split(/\t/, $l);
if ($type eq "contigs")
{
$name = $f[9];
}
else
{
$name = $f[5]
}
my $currSt = $f[1];
my $currEnd = $f[2];
my $size = $currEnd - $currSt;
$id++;
$st = $prevEnd + 1;
$end = $st + $size;
if (($prev ne "") && ($prev ne $name))
{
$st = $prevEnd + 1;
$end = ($st + $numN) - 1;
print "$chrom\t$st\t$end\t$id\tN\t$numN\tcontig\tno\n";
$prevEnd = $end;
$id++;
}
$st = $prevEnd + 1;
$end = $st + $size;
print "$chrom\t$st\t$end\t$id\t$f[4]\t$f[5]\t$f[6]\t$f[7]\t$f[8]";
if ($type eq "contigs")
{
print "\t$f[9]\t$f[10]\t$f[11]";
}
$prevEnd = $end;
$prev = $name;
}
'_EOF_'
chmod +x ./jkStuff/createAgpWithGaps.pl
cd /cluster/data/danRer4/NA_random
# for NA_random, sort the chunks.agp by contig number
perl -pi.bak -e 's/Zv6_NA//' NA_random.chunks.agp
sort -k6,6n NA_random.chunks.agp > NA_random.chunks2.agp
# then put back Zv6_NA
perl -pi.bak -e 's/([0-9]+\.[0-9]+)/Zv6_NA$1/' NA_random.chunks2.agp
mv NA_random.chunks2.agp NA_random.chunks.agp
# Un_random.chunks.agp is already sorted by scaffold number
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
cd $c
perl /cluster/data/danRer4/jkStuff/createAgpWithGaps.pl \
chr${c} 50000 contigs < ${c}.chunks.agp > chr${c}.chunks.agp
cd ..
end
# check co-ordinates
# field 2 is the start position, field 3 is the end and field 8 is the size
# so check that this is consistent in scaffolds and chunks agp.
# check that the difference between 7th and 8th fields is the same as the
# difference between 11th and 12th fields for chunks agp.
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' \
$c/chr${c}.scaffolds.agp > $c/chr${c}.scaffolds.coordCheck
awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' \
$c/chr${c}.chunks.agp > $c/chr${c}.chunks.coordCheck
awk '{if ($5 != "N" && (($8 - $7) != ($12 - $11))) print $6;}' \
$c/chr${c}.chunks.agp > $c/chr${c}.chunks.coordCheck2
end
# check the outputs are empty
wc -l NA_random/*.coord*
wc -l Un_random/*.coord*
rm NA_random/*.coord* Un_random/*.coord*
# check that the scaffolds and chunks agp files are consistent with
# each other.
cat << '_EOF_' > ./jkStuff/checkSizesInAgps.pl
#!/usr/bin/perl -w
use strict;
my ($ch, $sc, %scafsHash);
$sc = $ARGV[0]; # scaffolds agp
$ch = $ARGV[1]; # chunks or contigs agp
open(SCAFS, $sc) || die "Can not open $sc: $!\n";
open(CHUNKS, $ch) || die "Can not open $ch: $!\n";
while (<SCAFS>)
{
my ($l, @f, $name, $e);
$l = $_;
@f = split(/\t/, $l);
if ($f[5] =~ /^Zv/)
{
$name = $f[5];
$e = $f[2];
$scafsHash{$name} = $e;
}
}
close SCAFS;
my $scaf = "";
my $prev = "";
my $prevEnd = 0;
while (<CHUNKS>)
{
my ($line, @fi);
$line = $_;
@fi = split(/\t/, $line);
# if it is not a gap line
if ($fi[4] ne "N")
{
$scaf = $fi[9];
if (($scaf ne $prev) && ($prev ne ""))
{
checkCoords($prev, $prevEnd);
}
$prev = $scaf;
$prevEnd = $fi[2];
}
}
# check last entry in file
checkCoords($prev, $prevEnd);
close CHUNKS;
sub checkCoords {
my ($name, $end) = @_;
if (exists($scafsHash{$prev}))
{
if ($scafsHash{$prev} != $prevEnd)
{
my $ed = $scafsHash{$prev};
print "Scaffold $prev is not consistent between agps\n";
}
else
{
my $ed = $scafsHash{$prev};
print "Scaffold $prev - ok\n";
}
}
}
'_EOF_'
# << happy emacs
chmod +x jkStuff/checkSizesInAgps.pl
foreach c (NA_random Un_random)
perl /cluster/data/danRer4/jkStuff/checkSizesInAgps.pl \
$c/chr${c}.scaffolds.agp $c/chr${c}.chunks.agp \
> $c/${c}.scafsvschunks
end
foreach c (NA_random Un_random)
grep "not consistent" $c/${c}.scafsvschunks
end
wc -l NA_random/NA_random.scafsvschunks
wc -l Un_random/Un_random.scafsvschunks
# no inconsistencies were reported
rm NA_random/NA_random.scafsvschunks Un_random/Un_random.scafsvschunks
# clean up
foreach c (NA_random Un_random)
rm $c/${c}.scaffolds.agp $c/${c}.chunks.agp $c/chr${c}.scaffolds.fa \
$c/chr${c}.scaffolds.lst $c/*.bak
end
###########################################################################
# BUILD CHROM-LEVEL SEQUENCE (DONE, 2006-04-13, hartera)
# REPEAT THIS FOR chrNA_random AND chrUn_random (DONE, 2006-04-21, hartera)
ssh kkstore01
cd /cluster/data/danRer4
# Ignore warnings about chrM files not existing - this chrom has
# already been processed - see mitochondrion section above.
# Sequence is already in upper case so no need to change
foreach c (`cat chrom.lst`)
echo "Processing ${c}"
$HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp chr$c \
$c/chr$c.fa ./Zv6_scaffolds.fa
echo "${c} - DONE"
end
# move scaffolds agp to be chrom agp and clean up
foreach c (`cat chrom.lst`)
cd $c
cp chr${c}.scaffolds.agp chr${c}.agp
mkdir -p agps
mv chr${c}.*.agp ./agps/
cd ..
end
# Repeat just for chrNA_random and chrUn_random (2006-04-21, hartera)
foreach c (NA_random Un_random)
echo "Processing ${c}"
$HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp chr$c \
$c/chr$c.fa ./Zv6_scaffolds.fa
echo "${c} - DONE"
end
# move scaffolds agp to be chrom agp and clean up
foreach c (NA_random Un_random)
cd $c
cp chr${c}.scaffolds.agp chr${c}.agp
mkdir -p agps
mv chr${c}.*.agp ./agps/
cd ..
end
##########################################################################
# CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2006-04-14, hartera)
# RE-CHECK THESE AFTER CREATING chrNA_random AND chrUn_random SEQUENCE FILES
# (DONE, 2006-04-20, hartera)
# Check that the size of each chromosome .fa file is equal to the last
# co-ordinate of the corresponding agp file.
ssh hgwdev
cd /cluster/data/danRer4
foreach c (`cat chrom.lst`)
foreach f ( $c/chr$c.agp )
set agpLen = `tail -1 $f | awk '{print $3;}'`
set h = $f:r
set g = $h:r
echo "Getting size of $g.fa"
set faLen = `faSize $g.fa | awk '{print $1;}'`
if ($agpLen == $faLen) then
echo " OK: $f length = $g length = $faLen"
else
echo "ERROR: $f length = $agpLen, but $g length = $faLen"
endif
end
end
# all are OK so the FASTA files are the expected size
###########################################################################
# CREATING DATABASE (DONE, 2006-04-14, hartera)
# Create the database.
# next machine
ssh hgwdev
echo 'create database danRer4' | hgsql ''
# if you need to delete that database: !!! WILL DELETE EVERYTHING !!!
echo 'drop database danRer4' | hgsql danRer4
# Use df to make sure there is at least 10 gig free on /var/lib/mysql:
df -h /var/lib/mysql
# Before loading data:
# Filesystem Size Used Avail Use% Mounted on
# /dev/sdc1 1.8T 1.5T 173G 90% /var/lib/mysql
###########################################################################
# CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2006-04-14, hartera)
# next machine
ssh hgwdev
# the following command copies all the data from the table
# grp in the database mm8 to the new database danRer4. Use one of the
# newest databases to copy from to make sure that the groupings are
# up to date.
echo "create table grp (PRIMARY KEY(NAME)) select * from mm8.grp" \
| hgsql danRer4
# if you need to delete that table: !!! WILL DELETE ALL grp data !!!
echo 'drop table grp;' | hgsql danRer4
###########################################################################
# MAKE HGCENTRALTEST ENTRY FOR DANRER4 (DONE, 2006-04-14, hartera)
# CHANGE DATE FORMAT ON HGCENTRALTEST ENTRY (DONE, 2006-04-21, hartera)
# Make entry into dbDb and defaultDb so test browser knows about it.
ssh hgwdev
# Add dbDb and defaultDb entries:
echo 'insert into dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
values("danRer4", "March 2006", \
"/gbdb/danRer4", "Zebrafish", "chr2:15,906,734-15,926,406", 1, \
37, "Zebrafish", "Danio rerio", \
"/gbdb/danRer4/html/description.html", 0, 0, \
"Sanger Centre, Danio rerio Sequencing Project Zv6");' \
| hgsql -h genome-testdb hgcentraltest
# reformat the date (2006-04-21, hartera)
echo 'update dbDb set description = "Mar. 2006" where name = "danRer4";' \
| hgsql -h genome-testdb hgcentraltest
# Create /gbdb directory for danRer4
mkdir /gbdb/danRer4
# SET AS DEFAULT LATER WHEN READY FOR RELEASE
# set danRer4 to be the default assembly for Zebrafish
echo 'update defaultDb set name = "danRer4" \
where genome = "Zebrafish";' \
| hgsql -h genome-testdb hgcentraltest
###########################################################################
# BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS
# (DONE, 2006-04-14, hartera)
# RE-DONE JUST FOR chrNA_random AND chrUn_random (DONE, 2006-04-20, hartera)
ssh kkstore01
cd /cluster/data/danRer4
foreach c (`cat chrom.lst`)
foreach agp ($c/chr$c.agp)
if (-e $agp) then
set fa = $c/chr$c.fa
echo splitting $agp and $fa
cp -p $agp $agp.bak
cp -p $fa $fa.bak
splitFaIntoContigs $agp $fa . -nSize=5000000
endif
end
end
# Repeat just for chrNA_random and chrUn_random (2006-04-21, hartera)
ssh kkstore01
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
foreach agp ($c/chr$c.agp)
if (-e $agp) then
set fa = $c/chr$c.fa
echo splitting $agp and $fa
cp -p $agp $agp.bak
cp -p $fa $fa.bak
splitFaIntoContigs $agp $fa . -nSize=5000000
endif
end
end
###########################################################################
# MAKE LIFTALL.LFT (DONE, 2006-04-14, hartera)
# REMAKE LIFTALL.LFT WITH chrNA_random AND chrUn_random
# (DONE, 2006-04-21, hartera)
ssh kkstore01
cd /cluster/data/danRer4
rm jkStuff/liftAll.lft
foreach c (`cat chrom.lst`)
cat $c/lift/ordered.lft >> jkStuff/liftAll.lft
end
###########################################################################
# MAKE TRACKDB ENTRY FOR DANRER4 (DONE, 2006-04-14, hartera)
# Should add this later when adding gold/gap tracks. Angie created a
# temporary chromInfo table otherwise make update/alpha causes an error
# (2006-04-17)
# Make trackDb table so browser knows what tracks to expect.
ssh hgwdev
mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer4
cd ~/kent/src/hg/makeDb/trackDb/zebrafish
cvs add danRer4
cvs commit danRer4
cd ~/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add danRer4 in all the right places and do
make update DBS=danRer4
make alpha DBS=danRer4
cvs commit -m "Added danRer4." makefile
###########################################################################
# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2006-04-14, hartera)
ssh hgwdev
mkdir /cluster/data/danRer4/html
# make a symbolic link from /gbdb/danRer4/html to /cluster/data/danRer4/html
ln -s /cluster/data/danRer4/html /gbdb/danRer4/html
# Add a description page for zebrafish
cd /cluster/data/danRer4/html
cp $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer3/description.html .
# Edit this for zebrafish danRer4
# create a description.html page here
cd ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer4
# Add description page here too
cp /cluster/data/danRer4/html/description.html .
cvs add description.html
cvs commit -m "First draft of description page for danRer4." \
description.html
cd ~/kent/src/hg/makeDb/trackDb
make update DBS=danRer4
make alpha DBS=danRer4
###########################################################################
# SIMPLE REPEAT [TRF] TRACK (DONE, 2006-04-14, hartera)
# RE-RUN FOR chrNA AND chrUn RENAMED AS chrNA_random AND chrUn_random
# AND RELOAD THE TABLE (DONE, 2006-04-21, hartera)
# MADE A NOTE IN THE HISTORY TABLE TO EXPLAIN WHY THE simpleRepeats TABLE
# WAS RELOADED (DONE, 2006-04-22, hartera)
# TRF can be run in parallel with RepeatMasker on the file server
# since it doesn't require masked input sequence.
# Run this on the kilokluster. Need to mask contig and chromosome
# sequences so run trf using contig sequences.
# First copy over contig sequences to iscratch and then rsync to cluster.
ssh kkr1u00
rm -r /iscratch/i/danRer4/contigsNoMask
mkdir -p /iscratch/i/danRer4/contigsNoMask
cd /cluster/data/danRer4
foreach d (/cluster/data/danRer4/*/chr*_?{,?})
set ctg = $d:t
foreach f ($d/${ctg}.fa)
echo "Copyig $f ..."
cp $f /iscratch/i/danRer4/contigsNoMask/
end
end
ls /iscratch/i/danRer4/contigsNoMask/*.fa | wc -l
# 317 sequence files
# rsync to cluster machines
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/
end
ssh kki
mkdir -p /cluster/data/danRer4/bed/simpleRepeat
cd /cluster/data/danRer4/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
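# Stage the input sequence ($1) in a per-output directory under /tmp, run
# trfBig there writing a bed file (-bedAt), then copy the bed back to the
# requested output path ($2) and clean up the temporary directory.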
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << keep emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
ls -1S /iscratch/i/danRer4/contigsNoMask/chr*.fa > genome.lst
gensub2 genome.lst single gsub jobList
# 317 jobs
para create jobList
para try, check, push, check etc...
para time
# Completed: 317 of 317 jobs
# CPU time in finished jobs: 25083s 418.05m 6.97h 0.29d 0.001 y
# IO & Wait Time: 933s 15.55m 0.26h 0.01d 0.000 y
# Average job time: 82s 1.37m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2732s 45.53m 0.76h 0.03d
# Submission to last job: 4604s 76.73m 1.28h 0.05d
# Re-do only for chrNA_random and chrUn_random (2006-04-21, hartera)
ssh kki
cd /cluster/data/danRer4/bed/simpleRepeat
rm trf/chrNA*.bed
rm trf/chrUn*.bed
rm simpleRepeat.bed
mkdir -p randomsRun/trf
cd randomsRun
cp ../runTrf .
cp ../gsub .
ls -1S /iscratch/i/danRer4/contigsNoMask/chr*_random*.fa > genome.lst
gensub2 genome.lst single gsub jobList
para create jobList
# 46 jobs
para try, check, push, check etc...
para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 1904s 31.73m 0.53h 0.02d 0.000 y
# IO & Wait Time: 103s 1.72m 0.03h 0.00d 0.000 y
# Average job time: 44s 0.73m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 241s 4.02m 0.07h 0.00d
# Submission to last job: 269s 4.48m 0.07h 0.00d
cp ./trf/*.bed /cluster/data/danRer4/bed/simpleRepeat/trf/
# lift up to chrom level
cd /cluster/data/danRer4/bed/simpleRepeat
rm simpleRepeat.bed
liftUp simpleRepeat.bed /cluster/data/danRer4/jkStuff/liftAll.lft warn \
trf/*.bed
# Reload into the database
ssh hgwdev
cd /cluster/data/danRer4/bed/simpleRepeat
hgsql -e 'drop table simpleRepeat;' danRer4
hgLoadBed danRer4 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 759659 elements of size 16
# Make a note in the history table to explain why the simpleRepeats
# table was reloaded (2006-04-22, hartera)
hgsql -e 'update history set errata = \
"Dropped table for reloading after changing names of random chroms." \
where ix = 2;' danRer4
###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
ssh hgwdev
cd /cluster/data/danRer4/bed
mkdir microsat
cd microsat
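# keep only perfect di- and tri-nucleotide repeats from the TRF output:
# period ($5) of 2 or 3, at least 15 copies ($6), 100% match ($8) and
# no indels ($9); name each repeat as <copies>x<motif>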
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
/cluster/bin/i386/hgLoadBed danRer4 microsat microsat.bed
###########################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2005-06-14, hartera)
# RE-DO AFTER RENAMING RANDOM CHROMS AS chrNA_random AND chrUn_random
# (DONE, 2006-04-21, hartera)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kkstore01
cd /cluster/data/danRer4/bed/simpleRepeat
rm -r trfMask
mkdir -p trfMask
foreach f (trf/chr*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/danRer4
rm -r ./bed/simpleRepeat/trfMaskChrom
mkdir bed/simpleRepeat/trfMaskChrom
foreach c (`cat chrom.lst`)
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
###########################################################################
# GET ADDITIONAL ZEBRAFISH REPBASE LIBRARY FOR REPEATMASKER AND ADD TO
# DANIO LIBRARY FOR REPEATMASKER (DONE, 2006-04-14, hartera)
# Go to http://www.girinst.org/server/RepBase/RepBase11.02.fasta
# (03-15-2006) and download zebunc.ref.txt containing unclassified zebrafish
# repeats.
# Need username and password. Copy to /cluster/bluearc/RepeatMasker/Libraries/
ssh hgwdev
cd /cluster/bluearc/RepeatMasker/Libraries
# This is /cluster/bluearc/RepeatMasker060320/Libraries
# Do a dummy run of RepeatMasker with the -species option. This creates
# a zebrafish-specific library from the EMBL format RepBase library.
# Then the zebunc.ref unclassified repeats can be added to this library.
/cluster/bluearc/RepeatMasker/RepeatMasker -spec danio /dev/null
# RepeatMasker version development-$Id: RepeatMasker,v 1.13 2006/03/21
# This creates a specieslib in Libraries/20060315/danio
# Format the zebunc.ref library:
# Sequence is upper case, change to lower case like the specieslib
cat zebunc.ref.txt | tr '[A-Z]' '[a-z]' > zebunc.ref.format
perl -pi.bak -e 's/>dr([0-9]+)/>Dr$1#Unknown/' zebunc.ref.format
grep '>' zebunc.ref.format | wc -l
# 958
cd /cluster/bluearc/RepeatMasker/Libraries/20060315/danio
grep '>' specieslib | wc -l
# 219
mv specieslib danio.lib
cat danio.lib ../../zebunc.ref.format > specieslib
grep '>' specieslib | wc -l
# 1177
rm danio.lib
# make a copy in Libraries directory in case this directory of libraries
# is removed.
cp specieslib /cluster/bluearc/RepeatMasker/Libraries/danio.lib
###########################################################################
# SPLIT SEQUENCE FOR REPEATMASKER RUN (DONE, 2006-04-14, hartera)
# SPLIT SEQUENCE AGAIN JUST FOR chrNA_random AND chrUn_random AFTER RENAMING
# THESE RANDOM CHROMS (DONE, 2006-04-21, hartera)
ssh kkstore01
cd /cluster/data/danRer4
# break up into 500 kb sized chunks at gaps if possible
# for RepeatMasker runs
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}*_?{,?})
cd $d
echo "splitting $d"
set contig = $d:t
faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
cd ../..
end
end
# took about 3 minutes.
# split just for chrNA_random and chrUn_random (2006-04-21, hartera)
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
foreach d ($c/chr${c}*_?{,?})
cd $d
echo "splitting $d"
set contig = $d:t
faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
cd ../..
end
end
###########################################################################
# REPEATMASKER RUN (DONE, 2006-04-21, hartera)
# Originally run 2006-04-14. There was one sequence chr16_4_10.fa that
# failed with a division by zero error. Sent this as a test case with the
# danio library to Robert Hubley who fixed the bug and sent a new
# version of ProcessRepeats. Checked this into CVS for
# /cluster/bluearc/RepeatMasker on 2006-04-19.
# When a new library is added for this version of RepeatMasker, check
# /cluster/bluearc/RepeatMasker/Libraries for a directory named by date
# (20060315 here) containing subdirectories for species on which
# RepeatMasker has already been run. RepeatMasker creates a specieslib of
# the danio repeats in that directory and, if it already exists, reuses it
# for subsequent runs on that species. Check that it contains the
# unclassified zebrafish repeats with IDs beginning with Dr; the library
# with these repeats should have been created in the section above.
# Use sequence split into 500 kb chunks.
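# A quick sanity check (sketch): the merged library should contain the
# 958 unclassified repeats added above, with headers beginning ">Dr":
#    grep -c '>Dr' /cluster/bluearc/RepeatMasker/Libraries/20060315/danio/specieslib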
ssh kkstore01
cd /cluster/data/danRer4
mkdir RMRun
# Record RM version used:
ls -l /cluster/bluearc/RepeatMasker
# lrwxrwxrwx 1 angie protein 18 Mar 20 16:50 /cluster/bluearc/RepeatMasker -> RepeatMasker060320
# March 20 2006 (open-3-1-5) version of RepeatMasker
# get RM database version
grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl \
> RMdatabase.version
# RELEASE 20060315
cd /cluster/data/danRer4
cat << '_EOF_' > jkStuff/RMZebrafish
#!/bin/csh -fe
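# Run RepeatMasker on one sequence chunk: $1 is the contig directory and
# $2 is the chunk fa file name. Stage the chunk in /tmp/danRer4/$2, mask
# it there with the danio library, then copy the .out (and .align, .tbl,
# .cat if present) back to the contig directory and clean up /tmp.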
cd $1
pushd .
/bin/mkdir -p /tmp/danRer4/$2
/bin/cp $2 /tmp/danRer4/$2/
cd /tmp/danRer4/$2
/cluster/bluearc/RepeatMasker060320/RepeatMasker -ali -s -species danio $2
popd
/bin/cp /tmp/danRer4/$2/$2.out ./
if (-e /tmp/danRer4/$2/$2.align) /bin/cp /tmp/danRer4/$2/$2.align ./
if (-e /tmp/danRer4/$2/$2.tbl) /bin/cp /tmp/danRer4/$2/$2.tbl ./
if (-e /tmp/danRer4/$2/$2.cat) /bin/cp /tmp/danRer4/$2/$2.cat ./
/bin/rm -fr /tmp/danRer4/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer4/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer4
'_EOF_'
# << emacs
chmod +x jkStuff/RMZebrafish
# move old files out the way and re-run on 2006-04-19
cd /cluster/data/danRer4
mkdir RMOutOld
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
foreach c ($d/$contig*.fa.*)
set t=$c:t
mv $c /cluster/data/danRer4/RMOutOld/$t.bak
end
end
cp /dev/null RMRun/RMJobs
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/danRer4/jkStuff/RMZebrafish \
/cluster/data/danRer4/$d $f \
'{'check out line+ /cluster/data/danRer4/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
end
# Do the run again with new version of ProcessRepeats used
# for RepeatMasker.
ssh pk
cd /cluster/data/danRer4/RMRun
para create RMJobs
# 4382 jobs written to batch
para try, check, push, check ... etc.
para time
# Completed: 4382 of 4382 jobs
# CPU time in finished jobs: 11745656s 195760.94m 3262.68h 135.95d 0.372 y
# IO & Wait Time: 18953s 315.88m 5.26h 0.22d 0.001 y
# Average job time: 2685s 44.75m 0.75h 0.03d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3878s 64.63m 1.08h 0.04d
# Submission to last job: 41887s 698.12m 11.64h 0.48d
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kkstore01
cd /cluster/data/danRer4
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
> /dev/null
end
#- Lift pseudo-contigs to chromosome level
foreach c (`cat chrom.lst`)
echo lifting $c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
> /dev/null
endif
cd ..
end
# Re-run for just chrNA_random and chrUn_random (start on 2006-04-21)
ssh kkstore01
mkdir /cluster/data/danRer4/RMRun/randomsRun
cd /cluster/data/danRer4
cp /dev/null RMRun/randomsRun/RMJobs
foreach c (NA_random Un_random)
foreach d ($c/chr${c}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/danRer4/jkStuff/RMZebrafish \
/cluster/data/danRer4/$d $f \
'{'check out line+ /cluster/data/danRer4/$d/$f.out'}' \
>> RMRun/randomsRun/RMJobs
end
end
end
# Do the run again for chrNA_random and chrUn_random.
ssh pk
cd /cluster/data/danRer4/RMRun/randomsRun
para create RMJobs
# 468 jobs written to batch
para try, check, push, check ... etc.
para time
# Completed: 468 of 468 jobs
# CPU time in finished jobs: 551863s 9197.71m 153.30h 6.39d 0.017 y
# IO & Wait Time: 2217s 36.96m 0.62h 0.03d 0.000 y
# Average job time: 1184s 19.73m 0.33h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3836s 63.93m 1.07h 0.04d
# Submission to last job: 9086s 151.43m 2.52h 0.11d
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kkstore01
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
foreach d (${c}/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
> /dev/null
end
end
#- Lift pseudo-contigs to chromosome level
foreach c (NA_random Un_random)
echo lifting $c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
> /dev/null
endif
cd ..
end
# Load tables
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/danRer4
hgLoadOut danRer4 */chr*.fa.out -verbose=2 >& load.log
# bad rep range [5031, 4990] line 51895 of 14/chr14.fa.out
# bad rep range [4559, 4558] line 59431 of 16/chr16.fa.out
# bad rep range [1202, 1201] line 131633 of 16/chr16.fa.out
# bad rep range [280, 252] line 93608 of 17/chr17.fa.out
# bad rep range [429, 272] line 43230 of 22/chr22.fa.out
# bad rep range [262, 261] line 167346 of 3/chr3.fa.out
# bad rep range [889, 888] line 28495 of 5/chr5.fa.out
# bad rep range [349, 348] line 113404 of 5/chr5.fa.out
# bad rep range [1133, 1132] line 200654 of 5/chr5.fa.out
# bad rep range [965, 920] line 3567 of 8/chr8.fa.out
# bad rep range [292, 291] line 6354 of NA_random/chrNA_random.fa.out
# note: 11 records dropped due to repStart > repEnd
# Not too many errors so just ignore, but send examples to Arian Smit
# and Robert Hubley.
# check coverage of repeats masked
featureBits -chrom=chr1 danRer3 rmsk
# 25822888 bases of 55500710 (46.527%) in intersection
featureBits -chrom=chr1 danRer4 rmsk
# 32880041 bases of 70589895 (46.579%) in intersection
###########################################################################
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF AND BUILD NIB FILES
# (DONE, 2006-04-22, hartera)
# MASK PSEUDO-CONTIGS AS NOT DONE BEFORE (DONE, 2006-05-27, hartera)
ssh kkstore01
cd /cluster/data/danRer4
# Soft-mask (lower-case) the contig and chr .fa's,
# then make hard-masked versions from the soft-masked.
set trfCtg=bed/simpleRepeat/trfMask
set trfChr=bed/simpleRepeat/trfMaskChrom
# for the chromosomes:
foreach f (*/chr*.fa)
echo "repeat- and trf-masking $f"
maskOutFa -soft $f $f.out $f
set chr = $f:t:r
maskOutFa -softAdd $f $trfChr/$chr.bed $f
echo "hard-masking $f"
maskOutFa $f hard $f.masked
end
# check percent sequence masked
faSize /cluster/data/danRer4/1/chr1.fa
# 70589895 bases (904883 N's 69685012 real 36751306 upper
# 32933706 lower) in 1 sequences in 1 files
faSize /cluster/data/danRer3/1/chr1.fa
# 55805710 bases (1047706 N's 54758004 real 28887275 upper
# 25870729 lower) in 1 sequences in 1 files
# 47% of danRer4 chr1.fa is in lower case so masked
# Build nib files, using the soft masking in the fa
mkdir nib
foreach f (*/chr*.fa)
faToNib -softMask $f nib/$f:t:r.nib
end
ls ./nib/* | wc
# 28
# for the contigs (2006-05-27, hartera)
ssh kkstore04
cd /cluster/data/danRer4
set trfCtg=bed/simpleRepeat/trfMask
set trfChr=bed/simpleRepeat/trfMaskChrom
foreach c (`cat chrom.lst`)
echo "repeat- and trf-masking contigs of chr$c"
foreach d ($c/chr*_?{,?})
set ctg=$d:t
set f=$d/$ctg.fa
maskOutFa -soft $f $f.out $f
maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
maskOutFa $f hard $f.masked
end
end
###########################################################################
# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION AND CREATE 2BIT FILE
# (DONE, 2006-04-23, hartera)
# CHANGE FILENAME TO 2BIT FILE IN CHROMINFO AND REMOVE NIB DIR IN /gbdb
# (DONE, 2006-05-24, hartera)
# Make symbolic links from /gbdb/danRer4/nib to the real nibs
ssh hgwdev
cd /cluster/data/danRer4
mkdir -p /gbdb/danRer4/nib
foreach f (/cluster/data/danRer4/nib/chr*.nib)
ln -s $f /gbdb/danRer4/nib
end
# Load /gbdb/danRer4/nib paths into database and save size info
# hgNibSeq creates chromInfo table
hgNibSeq -preMadeNib danRer4 /gbdb/danRer4/nib */chr*.fa
echo "select chrom,size from chromInfo" | hgsql -N danRer4 > chrom.sizes
# take a look at chrom.sizes, should be 28 lines
wc chrom.sizes
# 28 56 422 chrom.sizes
# Make one big 2bit file as well, and make a link to it in
# /gbdb/danRer4 because hgBlat looks there:
faToTwoBit */chr*.fa danRer4.2bit
# check the 2bit file
twoBitInfo danRer4.2bit 2bit.tab
diff 2bit.tab chrom.sizes
# should be the same and they are so ok.
rm 2bit.tab
# add link to this 2bit file from gbdb danRer4 directory
ln -s /cluster/data/danRer4/danRer4.2bit /gbdb/danRer4/
# (hartera, 2006-05-24)
# change chromInfo table to have 2bit file for filename
hgsql -e 'update chromInfo set fileName = "/gbdb/danRer4/danRer4.2bit";' \
danRer4
# then remove nib directory in /gbdb/danRer4 as do not need both nibs
# and 2 bit file which is in /gbdb/danRer4.
rm -r /gbdb/danRer4/nib
###########################################################################
# MAKE GOLD AND GAP TRACKS (DONE, 2006-04-23, hartera)
ssh hgwdev
cd /cluster/data/danRer4
# the gold and gap tracks are created from the chrN.agp file and this is
# the scaffolds or supercontigs agp
hgGoldGapGl -noGl -chromLst=chrom.lst danRer4 /cluster/data/danRer4 .
# featureBits danRer4 gold
# 1626093931 bases of 1626093931 (100.000%) in intersection
# featureBits danRer3 gold
# 1630323462 bases of 1630323462 (100.000%) in intersection
# featureBits danRer4 gap
# 148566200 bases of 1626093931 (9.136%) in intersection
# featureBits danRer3 gap
# 13709500 bases of 1630323462 (0.841%) in intersection
# there are larger gaps now in chrNA and chrUn so compare just chr1
# featureBits -chrom=chr1 danRer4 gap
# 16000 bases of 70573895 (0.023%) in intersection
# featureBits -chrom=chr1 danRer3 gap
# 305000 bases of 55500710 (0.550%) in intersection
# without random or chrUn chroms:
# featureBits -noRandom danRer4 gap
# 366200 bases of 1546950119 (0.024%) in intersection
# featureBits -noRandom danRer3 gap
# 6240000 bases of 1200146216 (0.520%) in intersection
# Add trackDb.ra entries for gold and gap tracks and also create
# gap.html and gold.html pages.
###########################################################################
# PUT MASKED SEQUENCE OUT ON iSERVERS AND THE SAN FOR CLUSTER RUNS
# (DONE, 2006-04-23, hartera)
# TRFFA SEQUENCE WAS NOT MASKED SO ADD MASKED SEQUENCE TO iSERVERS AND
# THE SAN FOR CLUSTER RUNS (DONE, 2006-05-30, hartera)
ssh kkr1u00
# Chrom-level mixed nibs that have been repeat- and trf-masked:
rm -rf /iscratch/i/danRer4/nib
mkdir -p /iscratch/i/danRer4/nib
cp -p /cluster/data/danRer4/nib/chr*.nib /iscratch/i/danRer4/nib
# Pseudo-contig fa that have been repeat- and trf-masked:
# Add these pseudo-contigs that have been repeat- and trf-masked
# and rsync again. (2006-05-30, hartera)
rm -rf /iscratch/i/danRer4/trfFa
mkdir /iscratch/i/danRer4/trfFa
foreach d (/cluster/data/danRer4/*/chr*_?{,?})
cp -p $d/$d:t.fa /iscratch/i/danRer4/trfFa
end
rm -rf /iscratch/i/danRer4/rmsk
mkdir -p /iscratch/i/danRer4/rmsk
cp -p /cluster/data/danRer4/*/chr*.fa.out /iscratch/i/danRer4/rmsk
cp -p /cluster/data/danRer4/danRer4.2bit /iscratch/i/danRer4/
# rsync files - faster than using iSync
# rsync again - still can not rsync to kkr2u00 (hartera, 2006-05-30)
foreach R (2 3 4 5 6 7 8)
echo "rsync for kkr${R}u00 ..."
rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/
end
# error rsyncing to kkr2u00:
# connect to host kkr2u00 port 22: No route to host
# then add the same sequence files to the san
ssh kkstore01
# Chrom-level mixed nibs that have been repeat- and trf-masked:
mkdir -p /san/sanvol1/scratch/danRer4/nib
rm -rf /san/sanvol1/scratch/danRer4/nib
cp -p /cluster/data/danRer4/nib/chr*.nib /san/sanvol1/scratch/danRer4/nib
cp /cluster/data/danRer4/danRer4.2bit /san/sanvol1/scratch/danRer4
# Pseudo-contig fa that have been repeat- and trf-masked:
# Add these pseudo-contigs again (2006-05-30, hartera)
ssh kkstore04
rm -rf /san/sanvol1/scratch/danRer4/trfFa
mkdir /san/sanvol1/scratch/danRer4/trfFa
foreach d (/cluster/data/danRer4/*/chr*_?{,?})
cp -p $d/$d:t.fa /san/sanvol1/scratch/danRer4/trfFa
end
###########################################################################
# ADD CONTIGS TRACK (DONE, 2006-04-23, hartera)
# make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from
# chunks (contigs) agp files.
ssh kkstore01
mkdir -p /cluster/data/danRer4/bed/ctgPos2
cd /cluster/data/danRer4/bed/ctgPos2
# ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
awk 'BEGIN {OFS="\t"} \
{if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
/cluster/data/danRer4/$c/agps/chr${c}.chunks.agp >> ctgPos2.tab
end
# load the ctgPos2 table
ssh hgwdev
cd /cluster/data/danRer4/bed/ctgPos2
# use hgLoadSqlTab as it gives more error messages than using
# "load data local infile ...".
/cluster/bin/i386/hgLoadSqlTab danRer4 ctgPos2 \
~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab
# create trackDb.ra entry and html page for ctgPos2 track.
# add search for the track and make sure the termRegex will handle
# contigs named "Zv6_scaffoldN.N" where N is an integer and all the
# contig accessions in the *.chunks.agp files.
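# A hypothetical sketch of such a search stanza for trackDb.ra (values
# are illustrative only, not the entry actually committed):
#    searchTable ctgPos2
#    searchMethod exact
#    termRegex (Zv6_(NA|scaffold)[0-9]+(\.[0-9]+)?|[A-Z0-9]+\.[0-9]+)
#    query select chrom,chromStart,chromEnd,contig from %s where contig like '%s'
#    searchPriority 5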
###########################################################################
# CREATE gc5Base WIGGLE TRACK (DONE, 2006-04-23, hartera)
ssh kkstore01
mkdir -p /cluster/data/danRer4/bed/gc5Base
cd /cluster/data/danRer4/bed/gc5Base
nice hgGcPercent -wigOut -doGaps -file=stdout -win=5 danRer4 \
/cluster/data/danRer4 | wigEncode stdin gc5Base.wig gc5Base.wib
# Calculating gcPercent with window size 5
# Using twoBit: /cluster/data/danRer4/danRer4.2bit
# File stdout created
# Converted stdin, upper limit 100.00, lower limit 0.00
# runs for about 7 minutes
# load database with the .wig file and add .wib file to /gbdb/danRer4
ssh hgwdev
cd /cluster/data/danRer4/bed/gc5Base
mkdir /gbdb/danRer4/wib
ln -s `pwd`/gc5Base.wib /gbdb/danRer4/wib
time hgLoadWiggle -pathPrefix=/gbdb/danRer4/wib danRer4 gc5Base gc5Base.wig
# 17 second load time
# verify index is correct:
hgsql danRer4 -e "show index from gc5Base;"
# should see good numbers in Cardinality column
###########################################################################
# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2006-04-24, hartera)
# Use -repMatch=512 (based on size -- for human we use 1024, and
# the zebrafish genome is ~50% of the size of the human genome)
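# (roughly 1024 * (1.6 Gb zebrafish / ~3 Gb human) ~= 540, rounded down
# to the nearest power of two, 512)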
ssh kkr1u00
mkdir /cluster/data/danRer4/bed/ooc
cd /cluster/data/danRer4/bed/ooc
mkdir -p /san/sanvol1/scratch/danRer4
ls -1 /cluster/data/danRer4/nib/chr*.nib > nib.lst
blat nib.lst /dev/null /dev/null -tileSize=11 \
-makeOoc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc -repMatch=512
# Wrote 50424 overused 11-mers to /san/sanvol1/scratch/danRer4/danRer4_11.ooc
# For 10.ooc, repMatch = 4096 for human, so use 2048
blat nib.lst /dev/null /dev/null -tileSize=10 \
-makeOoc=/san/sanvol1/scratch/danRer4/danRer4_10.ooc -repMatch=2048
# Wrote 12231 overused 10-mers to /san/sanvol1/scratch/danRer4/danRer4_10.ooc
# keep copies of ooc files in this directory and copy to iscratch
cp /san/sanvol1/scratch/danRer4/*.ooc .
cp -p /san/sanvol1/scratch/danRer4/*.ooc /iscratch/i/danRer4/
# rsync to iServers
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer4/*.ooc \
kkr${R}u00:/iscratch/i/danRer4/
end
###########################################################################
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR danRer4 (DONE, 2006-04-27, hartera)
ssh hgwdev
# DNA port is "0", trans prot port is "1"
echo 'insert into blatServers values("danRer4", "blat17", "17788", "1", "0"); insert into blatServers values("danRer4", "blat17", "17789", "0", "1");' \
| hgsql hgcentraltest
# this enables blat and isPcr, isPcr is enabled by loading blat server
# with tilesize=5 (ask for this when request blat servers from
# cluster admin).
# if you need to delete those entries
echo 'delete from blatServers where db="danRer4";' | hgsql hgcentraltest
###########################################################################
# AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2006-04-24, hartera)
# UPDATED (2006-09-28) - see separate section, UPDATE AFFY ZEBRAFISH TRACK.
# NOTE: Jim recommends that, in the future, all AFFY blat alignments should drop
# -mask=lower for blat and drop -minIdentity=95 to -minIdentity=90 as the
# higher minIdentity is causing alignments to be dropped that should not be.
# e.g. blat -fine -minIdentity=90 -ooc=11.ooc
# $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
# pslReps can be used to handle filtering at a later step. Blat's minIdentity
# seems to be more severe than that for pslReps as it takes insertions and
# deletions into account.
# CHECKED ALIGNMENTS USING MASKED TRFFA AND RESULTS ARE THE SAME
# (DONE, 2006-05-30, hartera)
# array chip sequences already downloaded for danRer1
ssh hgwdev
# need to copy sequences to the bluearc first to transfer to the iServers
cd /projects/compbio/data/microarray/affyZebrafish
mkdir -p /cluster/bluearc/affy
cp -p \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/cluster/bluearc/affy/
# Set up cluster job to align Zebrafish consensus sequences to danRer4
ssh kkr1u00
mkdir -p /cluster/data/danRer4/bed/affyZebrafish.2006-04-24
ln -s /cluster/data/danRer4/bed/affyZebrafish.2006-04-24 \
/cluster/data/danRer4/bed/affyZebrafish
cd /cluster/data/danRer4/bed/affyZebrafish
mkdir -p /iscratch/i/affy
cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/affy/*.fa \
kkr${R}u00:/iscratch/i/affy/
end
# small cluster run to align sequences
ssh kki
cd /cluster/data/danRer4/bed/affyZebrafish
ls -1 /iscratch/i/affy/Zebrafish_consensus.fa > affy.lst
ls -1 /iscratch/i/danRer4/trfFa/chr[0-9M]*.fa > genome.lst
# for output:
mkdir -p psl
echo '#LOOP\n/cluster/bin/i386/blat -fine -minIdentity=90 -ooc=/iscratch/i/danRer4/danRer4_11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 genome.lst affy.lst template.sub para.spec
para create para.spec
para try, check, push, check .... etc.
# para time
# Completed: 271 of 271 jobs
# CPU time in finished jobs: 15331s 255.51m 4.26h 0.18d 0.000 y
# IO & Wait Time: 737s 12.29m 0.20h 0.01d 0.000 y
# Average job time: 59s 0.99m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 101s 1.68m 0.03h 0.00d
# Submission to last job: 1557s 25.95m 0.43h 0.02d
# do pslSort and liftUp
ssh kkstore04
cd /cluster/data/danRer4/bed/affyZebrafish
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyZebrafish.psl
pslSort dirs raw.psl tmp psl
# only use alignments that have at least 95% identity in aligned region.
# try minCover as now there is less sequence in chrUn and chrNA
# so less likely that genes are split up.
grep '>' /cluster/bluearc/affy/Zebrafish_consensus.fa | wc -l
# 15502
pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
# see how many sequences are aligned:
awk '{print $10;}' contig.psl > contigAligned
tail +6 contigAligned | sort | uniq -c | sort -nr > contigAligned.count
wc -l contigAligned.count
# 14819 contigAligned.count
tail +6 contig.psl | wc -l
# 21486
# 96% of sequences are aligned. The sequence with the most alignments
# aligns 177 times, then the next is 105, then 86, 85, 69, 69, 54, 54 etc.
# for danRer3, 14335 were aligned (92% aligned). The sequence with
# the most alignments aligned 96 times, then 31, 27, 22, 20, 19 times.
# also 854 sequences aligned for danRer4 that did not align for danRer3.
# 370 were aligned in danRer3 but not for danRer4.
# ALSO TRIED THESE pslReps PARAMETERS WITH minCover:
pslReps -minCover=0.30 -minAli=0.95 -nearTop=0.005 \
raw.psl contig2.psl /dev/null
# see how many sequences are aligned:
awk '{print $10;}' contig2.psl > contig2Aligned
tail +6 contig2Aligned | sort | uniq -c | sort -nr > contig2Aligned.count
wc -l contig2Aligned.count
# 14528 contig2Aligned.count
tail +6 contig2.psl | wc -l
# 18744
# danRer3 has 21196 total alignments and 14335 sequences aligned.
# 94% of sequences are aligned.
# 785 sequences were aligned for danRer4 using minCover but not for
# danRer3 after using pslReps. 592 sequences were aligned for danRer3
# but not for danRer4 using minCover after using pslReps.
# the sequence with the most alignments aligns 105 times, then 85, 69,
# 54, 50, 47, 44, 37, 36, 31, 29:
# No. of alignments Sequence Name
# 105 Zebrafish:Dr.15955.1.A1_at
# 85 Zebrafish:Dr.20178.1.A1_at
# 69 Zebrafish:Dr.885.1.S1_at
# 54 Zebrafish:Dr.15958.1.S1_at
# 50 Zebrafish:Dr.25427.1.A1_at
# 47 Zebrafish:Dr.16470.1.A1_at
# 44 Zebrafish:Dr.490.1.S1_at
# 37 Zebrafish:Dr.7806.1.A1_at
# 36 Zebrafish:Dr.19.1.A1_at
# 31 Zebrafish:Dr.2825.1.A1_at
# 29 Zebrafish:Dr.19556.1.A1_at
# aligning with the -mask=lower option doesn't make a difference to the
# number of alignments and sequences aligned.
# there are 291 extra sequences that align when minCover option is
# not used. Only 7 of these have 22 or more alignments.
# 86 Zebrafish:Dr.24316.1.S1_at
# 69 Zebrafish:Dr.14452.1.A1_at
# 39 Zebrafish:Dr.12372.1.S1_at
# 26 Zebrafish:Dr.18296.2.S1_a_at
# 23 Zebrafish:Dr.7519.1.A1_at
# 22 Zebrafish:Dr.8680.1.S1_at
# 22 Zebrafish:Dr.22175.1.S1_at
# clean up
rm contig*
# use pslReps without the minCover option as it does allow quite a lot
# more alignments and the number of total alignments/number of sequences
# aligned is still close to that for danRer3. Using nearTop=0.001 does
# decrease the number of alignments but also means that some good
# alignments are lost.
pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyZebrafish.psl ../../jkStuff/liftAll.lft warn contig.psl
# shorten names in psl file:
sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp
mv affyZebrafish.psl.tmp affyZebrafish.psl
pslCheck affyZebrafish.psl
# co-ordinates are ok. psl is good.
# load track into database
ssh hgwdev
cd /cluster/data/danRer4/bed/affyZebrafish
hgLoadPsl danRer4 affyZebrafish.psl
# Add consensus sequences for Zebrafish chip
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/gbdb/hgFixed/affyProbes
hgLoadSeq -abbr=Zebrafish: danRer4 \
/gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
# Clean up
rm batch.bak contig.psl raw.psl
# trackDb.ra entry and html are already there in trackDb/zebrafish/
###########################################################################
# CREATE ZEBRAFISH AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS DIRECTORY AND
# ADD CHROM SIZES FOR BLASTZ CLUSTER RUNS (DONE, 2006-04-24, hartera)
# There are no lineage-specific repeats for zebrafish and other species
# so use all repeats.
ssh pk
mkdir -p /san/sanvol1/scratch/danRer4/linSpecRep.notInOthers
foreach f (/cluster/data/danRer4/*/chr*.fa.out)
cp -p $f \
/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers/$f:t:r:r.out.spec
end
cp -p /cluster/data/danRer4/chrom.sizes \
/san/sanvol1/scratch/danRer4/
###########################################################################
# BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR
# HUMAN (hg18) (DONE, 2006-04-24 - 2006-04-25, hartera)
# LOAD BLASTZ PSLS INTO DATABASE AND CHECK FOR HUMAN CONTAMINATION
# (DONE, 2006-05-11, hartera)
ssh pk
# Blastz uses lineage-specific repeats. There are none defined for the
# human/zebrafish pair, so all repeats are used as lineage-specific.
# There is a lineage-specific repeats directory for zebrafish (see the
# CREATE ZEBRAFISH AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS DIRECTORY
# section above). The lineage-specific repeats for hg18 were already made -
# see makeHg18.doc (BLASTZ ZEBRAFISH section).
mkdir -p /cluster/data/danRer4/bed/blastz.hg18.2006-04-24
cd /cluster/data/danRer4/bed
ln -s blastz.hg18.2006-04-24 blastz.hg18
cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24
# Only 5% of the danRer4 genome is now in the random unordered chroms,
# so do not run those chroms as scaffolds - run them as virtual chroms
# and use the same parameters as for danRer2.
cat << 'EOF' > DEF
# danRer4 zebrafish target, human hg18 query
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# use parameters suggested for human-fish evolutionary distance
# recommended in doBlastzChainNet.pl help
# (previously used for hg16-fr1, danRer1-mm5)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
# TARGET: zebrafish (danRer4)
# Use all chroms, including both randoms (chrNA_random and chrUn_random)
SEQ1_DIR=/san/sanvol1/scratch/danRer4/nib
SEQ1_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers
SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: human (hg18) - single chunk big enough to run each chrom by itself
# Use all chroms, including all randoms
SEQ2_DIR=/san/sanvol1/scratch/hg18/nib
SEQ2_LEN=/san/sanvol1/scratch/hg18/hg18Chroms.len
SEQ2_SMSK=/san/sanvol1/scratch/hg18/linSpecRep.notInOthers
SEQ2_CHUNK=300000000
SEQ2_LAP=0
BASE=/cluster/data/danRer4/bed/blastz.hg18.2006-04-24
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
chmod +x DEF
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF >& doBlastz.log &
# Start: Mon Apr 24 19:20 Stop: Tues Apr 25 05:42
# Did not finish:
# netChains: looks like previous stage was not successful
# (can't find [danRer4.hg18.]all.chain[.gz]).
# The file is actually there, so run again. To restart at the chainMerge
# step, first remove the all.chain file and the chain directory.
# NOTE: alternatively, the chain files can be left in place and the run
# continued from the net step - that works too (see the sketch below).
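# A minimal sketch of that alternative (not what was run here):
#   nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
#     -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
#     -continue net `pwd`/DEF >& net.log &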
cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24
rm ./axtChain/*.all.chain.gz
rm -r ./axtChain/chain
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue chainMerge `pwd`/DEF >& chainMerge.log &
# Took about 10 minutes.
# Check results with featureBits and compare to those
# for danRer3 and danRer2:
ssh hgwdev
featureBits danRer4 chainHg18Link
# 64196991 bases of 1626093931 (3.948%) in intersection
featureBits danRer3 chainHg18Link
# 69559338 bases of 1630323462 (4.267%) in intersection
featureBits danRer2 chainHg17Link
# 70046373 bases of 1560497282 (4.489%) in intersection
# After Genbank tracks are loaded, (hartera, 2006-04-27)
featureBits -chrom=chr1 danRer4 refGene:cds chainHg18Link -enrichment
# refGene:cds 0.732%, chainHg18Link 4.140%, both 0.558%, cover 76.19%,
# enrich 18.40x
featureBits -chrom=chr1 danRer3 refGene:cds chainHg18Link -enrichment
# refGene:cds 0.769%, chainHg18Link 4.124%, both 0.604%, cover 78.49%,
# enrich 19.03x
featureBits -chrom=chr1 danRer4 refGene:cds netHg18 -enrichment
# refGene:cds 0.732%, netHg18 31.154%, both 0.624%, cover 85.21%,
# enrich 2.73x
featureBits -chrom=chr1 danRer3 refGene:cds netHg18 -enrichment
# refGene:cds 0.774%, netHg18 35.434%, both 0.679%, cover 87.73%,
# enrich 2.48x
# Similar coverage and enrichment as for hg18 chains and net on danRer3.
# do the swap for Blastz chains over to human (hg18) and create net,
# axtNet, mafNet, liftOver and Downloads. see also makeHg18.doc for
# featureBits on these alignments.
ssh pk
cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& doSwap.log &
# Took about 15 minutes.
# Load Blastz results into database (DONE, 2006-05-11, hartera)
ssh kkstore04
cd /cluster/data/danRer4/bed/blastz.hg18/pslParts
# cat together Blastz for each chrom
mkdir pslChrom
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
echo "Processing $c ..."
foreach p (chr${c}.nib*)
zcat $p >> ./pslChrom/chr${c}_blastzHg18.psl
end
end
# load Blastz psls into the database
ssh hgwdev
cd /cluster/data/danRer4/bed/blastz.hg18/pslParts/pslChrom
foreach f (*.psl)
/cluster/bin/i386/hgLoadPsl danRer4 $f
echo "$f Done"
end
# Then determine how much sequence has 100% identity to human with a
# stretch of at least 300 bp. Human contamination was also found in
# danRer1 and a user reported it more recently.
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
echo "chr$c" >> humanContamination.txt
hgsql -e "select count(*) from chr${c}_blastzHg18 where matches >= 300 and misMatches = 0;" danRer4 >> humanContamination.txt
end
# There are 4 alignments on chr11 that fit these criteria (the same set is
# found if the threshold is lowered to regions of >= 200 bp with 100% ID).
hgsql -e \
'select * from chr11_blastzHg18 where matches >= 300 and mismatches = 0;' \
danRer4 > chr11HumanSeq
# Only 2 of these also have no query inserts, and 1 of the others has only
# a 1 base insert: those regions are of size 303, 310 and 667 bp. The region
# of 330 bp has a 45 bp insert on the query side - see below.
#bin matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount blockSizes qStarts tStarts
#588 303 0 0 0 0 0 0 0 - chr4 191273063 69879746 69880049 chr11 52342180 502145 502448 1 303, 121393014, 502145,
#588 330 0 0 0 1 45 0 0 - chr4 191273063 69879319 69879694 chr11 52342180 502545 502875 2 1,329, 121393369,121393415, 502545,502546,
#588 310 0 0 0 0 0 0 0 - chr4 191273063 69878956 69879266 chr11 52342180 502928 503238 1 310, 121393797, 502928,
#588 667 0 0 0 1 1 0 0 - chr4 191273063 69878268 69878936 chr11 52342180 503258 503925 2 453,214, 121394127,121394581, 503258,503711,
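# (sketch, not part of the original run) if these regions need further
# review, they can be dumped as a BED-like file; chr11HumanRegions.bed is
# just an illustrative output name.
hgsql -N -e 'select tName, tStart, tEnd, qName from chr11_blastzHg18 where matches >= 300 and misMatches = 0' danRer4 > chr11HumanRegions.bed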
###########################################################################
# BLASTZ/CHAIN/NET PREP (DONE 4/25/06 angie)
ssh kkstore04
cd /cluster/data/danRer4
cp -p danRer4.2bit /san/sanvol1/scratch/danRer4/
# Create a 2bit file for danRer4 with all chroms (1-25 and M) and the
# scaffolds for NA and Un:
awk '$1 == $6 {print $1;}' Zv6.scaffolds.agp \
| faSomeRecords Zv6_scaffolds.fa stdin stdout \
| faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa stdin \
/san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit
twoBitInfo /san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit \
/san/sanvol1/scratch/danRer4/chromsUnNAScafs.sizes
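# (sketch, not part of the original run) eyeball the mixed chrom+scaffold
# set: sequence count and total bases; the count should be the 26 assembled
# chroms (1-25 and M) plus the chrNA_random/chrUn_random scaffolds.
awk '{n++; sum += $2} END {print n, sum}' \
    /san/sanvol1/scratch/danRer4/chromsUnNAScafs.sizes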
# Make a lift file for scaffolds --> {chrUn, chrNA}:
mkdir /cluster/data/danRer4/liftSupertoChrom
cd /cluster/data/danRer4/liftSupertoChrom
/cluster/bin/scripts/agpToLift \
< ../NA_random/agps/chrNA_random.scaffolds.agp \
> chrNA_random.lft
/cluster/bin/scripts/agpToLift \
< ../Un_random/agps/chrUn_random.scaffolds.agp \
> chrUn_random.lft
cat chr*.lft > liftNAandUnScaffoldsToChrom.lft
cp -p liftNAandUnScaffoldsToChrom.lft /san/sanvol1/scratch/danRer4/
# Distribute on /iscratch/i too (danRer4.2bit is already there):
ssh kkr1u00
cd /iscratch/i/danRer4
cp -p /san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit .
twoBitInfo danRer4ChrUnNAScafs.2bit chromsUnNAScafs.sizes
cp -p \
/cluster/data/danRer4/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft .
iSync
###########################################################################
# BLASTZ/CHAIN/NET XENTRO2 (DONE 4/26/06 angie)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/blastz.xenTro2.2006-04-25
cd /cluster/data/danRer4/bed/blastz.xenTro2.2006-04-25
cat << '_EOF_' > DEF
# zebrafish vs. frog
BLASTZ=/cluster/bin/penn/i386/blastz
# Use same params as used for danRer1-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Zebrafish danRer4
SEQ1_DIR=/iscratch/i/danRer4/danRer4.2bit
SEQ1_CTGDIR=/iscratch/i/danRer4/danRer4ChrUnNAScafs.2bit
SEQ1_LIFT=/iscratch/i/danRer4/liftNAandUnScaffoldsToChrom.lft
SEQ1_LEN=/cluster/data/danRer4/chrom.sizes
SEQ1_CTGLEN=/iscratch/i/danRer4/chromsUnNAScafs.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Frog xenTro2 - single chunk big enough to run two of the
# largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/cluster/bluearc/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/danRer4/bed/blastz.xenTro2.2006-04-25
'_EOF_'
# << emacs
# kkstore04 can't see /iscratch so use an iServer as fileServer:
doBlastzChainNet.pl -blastzOutRoot=/cluster/bluearc/danRer4XenTro2 \
-bigClusterHub=kk -fileServer=kkr8u00 -workhorse=kkr8u00 \
-chainMinScore=5000 -chainLinearGap=loose DEF \
>& do.log & tail -f do.log
ln -s blastz.xenTro2.2006-04-25 /cluster/data/danRer4/bed/blastz.xenTro2
###########################################################################
# CREATE LIFT FILES FOR RANDOM CHROMOSOMES' SCAFFOLDS
# (DONE, 2006-04-25, hartera)
# scaffolds lift files created by scaffoldFaToAgp when agp files created
# for chrNA_random and chrUn_random. remove last line as this is an extra
# gap line that was removed from the agp.
ssh kkstore01
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
mkdir -p /cluster/data/danRer4/$c/tmp
end
# NA_random doesn't have .lft and .gap files from scaffoldFaToAgp, so
# recreate them. It also had no tmp dir holding an NA_random.scaffolds.agp.
awk '{if ($1 ~ /Zv6_NA/) print;}' Zv6.scaffolds.agp \
> ./NA_random/tmp/NA_random.scaffolds.agp
# change the first field to "chrNA_random" then can use agpToFa to process
perl -pi.bak -e 's/Zv6_NA[0-9]+/chrNA_random/' ./NA_random/tmp/*.agp
wc -l ./NA_random/tmp/NA_random.scaffolds.agp
# 2898 ./NA_random/tmp/NA_random.scaffolds.agp
cd /cluster/data/danRer4
foreach c (NA_random)
awk '{print $6;}' $c/tmp/$c.scaffolds.agp > $c/tmp/chr$c.scaffolds.lst
$HOME/bin/i386/faSomeRecords /cluster/data/danRer4/Zv6_scaffolds.fa \
$c/tmp/chr$c.scaffolds.lst $c/tmp/chr$c.fa
end
cd /cluster/data/danRer4/NA_random/tmp
scaffoldFaToAgp -scaffoldGapSize=50000 chrNA_random.fa
# change chrUn to chrNA_random for NA_random, and chrUn to chrUn_random
# for Un_random. Change D to W for both NA_random and Un_random.
sed -e 's/chrUn/chrNA_random/' chrNA_random.agp \
| sed -e 's/D/W/' > chrNA_random.scaffolds.agp
mv chrNA_random.fa chrNA_random.scaffolds.fa
# also move the Un_random .lft and .gap files to Un_random/tmp
mv ./Un_random/chrUn_random.lft ./Un_random/tmp/chrUn_random.lft
mv ./Un_random/chrUn_random.gap ./Un_random/tmp/chrUn_random.gap
# for chrNA_random and chrUn_random: remove the last line, which is an extra
# gap line that was already removed from the chrN_random.agp. Add these
# scaffold lift files to liftAll.lft. The last column also needs to be
# changed so that it shows the correct total number of bases.
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
head -n -1 $c/tmp/chr${c}.lft > $c/tmp/chr${c}.scaffolds.lft
perl -pi.bak -e "s/chrUn/chr${c}/" $c/tmp/chr${c}.scaffolds.lft
if ($c == "NA_random") then
perl -pi.bak -e 's/208064280/208014280/' \
$c/tmp/chrNA_random.scaffolds.lft
else
perl -pi.bak -e 's/19379532/19329532/' \
$c/tmp/chrUn_random.scaffolds.lft
endif
cat $c/tmp/chr${c}.scaffolds.lft >> ./jkStuff/liftAll.lft
rm $c/tmp/chr${c}.lft $c/tmp/chr${c}.gap *.bak
end
###########################################################################
# AUTO UPDATE GENBANK MRNA AND EST AND MGC GENES RUN
# (DONE, 2006-04-25 - 2006-04-26, hartera)
ssh hgwdev
cd ~kent/src/hg/makeDb/genbank
cvs update -d -P etc
# edit etc/genbank.conf to add danRer4 and commit this to CVS.
# danRer4 (zebrafish)
# Lift file partitions unplaced sequence pseudo-chroms
danRer4.serverGenome = /cluster/data/danRer4/danRer4.2bit
danRer4.clusterGenome = /iscratch/i/danRer4/danRer4.2bit
danRer4.ooc = /iscratch/i/danRer4/danRer4_11.ooc
danRer4.align.unplacedChroms = chrNA_random chrUn_random
danRer4.lift = /cluster/data/danRer4/jkStuff/liftAll.lft
danRer4.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
danRer4.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
danRer4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
danRer4.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
danRer4.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
danRer4.downloadDir = danRer4
danRer4.mgcTables.default = full
danRer4.mgcTables.mgc = all
# end of section added to etc/genbank.conf
cvs commit -m "Added danRer4." etc/genbank.conf
# update /cluster/data/genbank/
make etc-update
# ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
# danRer genome information
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial danRer4 &
# Start: Tues Apr 25 12:53 Finish: Wed Apr 26 08:38
# logFile: var/build/logs/2006.04.25-12:53:39.danRer4.initalign.log
# check log file
tail -f var/build/logs/2006.04.25-12:53:39.danRer4.initalign.log
# check it has finished (last line in log file):
# kkstore02 2006.04.26-08:38:36 danRer4.initalign: finish
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad danRer4 &
# logFile: var/dbload/hgwdev/logs/2006.04.26-15:45:19.dbload.log
# check it is finished: hgwdev 2006.04.26-17:48:07 dbload: finish
# Took about 2 hours.
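# (sketch, not part of the original run) spot-check the load before moving
# on, e.g. confirm the mRNA and RefSeq tables exist and are populated:
hgsql -e 'show tables like "all_mrna"' danRer4
hgsql -e 'select count(*) from refGene' danRer4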
###########################################################################
# SPLIT UP ZEBRAFISH MASKED SEQUENCE FROM chrUn AND chrNA INTO SCAFFOLDS
# ADD SOFT-MASKED SCAFFOLDS TO ISERVERS AND THE SAN FOR CLUSTER RUNS
# (DONE, 2006-04-27, hartera)
ssh kkstore01
cd /cluster/data/danRer4
# for chrNA_random and chrUn_random, get soft-masked sequence.
foreach c (NA_random Un_random)
cd $c
mkdir scaffoldsSoftMask
awk 'BEGIN {FS="\t"}{if ($5 != "N") \
print "faFrag -mixed chr'${c}'.fa",$2-1, $3, $6".fa";}' chr${c}.agp \
>> ./scaffoldsSoftMask/faFragSoftMask.csh
cd ..
end
# change permissions on the generated scripts that extract the sequences
foreach d (NA_random Un_random)
chmod +x $d/scaffoldsSoftMask/faFragSoftMask.csh
end
# wrapper shell script to run script to get the soft-masked scaffolds
cat << '_EOF_' > jkStuff/getMaskedScaffolds.csh
#!/bin/csh
foreach c (NA_random Un_random)
set dir=/cluster/data/danRer4
echo "Processing $c"
cd $dir/$c/scaffoldsSoftMask
cp ../chr${c}.fa .
echo "Getting soft-masked sequences ..."
nice faFragSoftMask.csh >& faFrag.log
end
'_EOF_'
chmod +x jkStuff/getMaskedScaffolds.csh
nice ./jkStuff/getMaskedScaffolds.csh &
# Took about 2.5 hours.
# check a few sequences that they are correct
# add name of scaffold to sequence fasta and cat together
foreach c (NA_random Un_random)
set dir = /cluster/data/danRer4
cd $dir/$c/scaffoldsSoftMask
foreach f (Zv*)
set g=$f:r
set sc=scaffold${c}.fa
perl -pi.bak -e "s/>chr[0-9A-Za-z\-\:_]+/>$g/" $f
cat $f >> $sc
rm *.bak
end
cp scaffold* $dir/$c/
end
grep '>' NA_random/scaffoldNA_random.fa | wc -l
# 2898
grep '>' Un_random/scaffoldUn_random.fa | wc -l
# 68
# check sizes of final FASTA file with all sequences. check a few
# sequence files to see that they are correct - ok
cd /cluster/data/danRer4
cat << '_EOF_' > ./jkStuff/checkFastaSizes.csh
#!/bin/csh -fe
set scafName=$1
set agpLen=$2
set pref=`echo $scafName | cut -c1-2`
if ($pref == "Zv") then
set g=/cluster/data/danRer4/*/scaffoldsSoftMask/${scafName}.fa
set h=$g:t
echo "Getting size of $h"
set faLen = `faSize $g | awk '{print $1;}'`
if ($agpLen == $faLen) then
echo " OK: apg length = $h length = $faLen"
else
echo "ERROR: length = $agpLen, but $h length = $faLen"
endif
endif
'_EOF_'
# << happy emacs
chmod +x ./jkStuff/checkFastaSizes.csh
# use bash since reading the piped lines in C shell splits them on whitespace
bash
for c in NA_random Un_random
do
echo "Processing $c scaffolds ...";
cat $c/chr${c}.agp | while read line;
do
scaf=`echo $line | cut -d " " -f6`;
size=`echo $line | cut -d " " -f8`;
nice ./jkStuff/checkFastaSizes.csh $scaf $size >> checkFastaSizes.log;
done
done
exit # back to C shell
grep "ERROR:" checkFastaSizes.log | wc -l
# No errors, so all FASTA files are the expected size.
# Add soft-masked scaffolds to the Iservers and the san for cluster runs
ssh kkr1u00
cd /cluster/data/danRer4
mkdir /iscratch/i/danRer4/scaffoldsSoftMask
foreach c (NA_random Un_random)
foreach f (/cluster/data/danRer4/$c/scaffoldsSoftMask/Zv*.fa)
cp -p $f /iscratch/i/danRer4/scaffoldsSoftMask
end
cp -p /cluster/data/danRer4/$c/scaffold${c}.fa /iscratch/i/danRer4
end
ls /iscratch/i/danRer4/scaffoldsSoftMask/ | wc
# 2966
# all files are there
# rsync to cluster machines
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/
end
ssh pk
mkdir -p /san/sanvol1/scratch/danRer4/scaffoldsSoftMask
foreach c (NA_random Un_random)
foreach f (/cluster/data/danRer4/$c/scaffoldsSoftMask/Zv*.fa)
rsync -a --progress $f /san/sanvol1/scratch/danRer4/scaffoldsSoftMask/
end
rsync -a --progress /cluster/data/danRer4/${c}/scaffold${c}.fa \
/san/sanvol1/scratch/danRer4/
end
foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa)
echo $f >> files.log
end
wc -l files.log
# 2966 files.log
rm files.log
# All files have transferred.
###########################################################################
## SWAP MM8 blastz result (DONE - 2006-04-28 - Hiram)
# ADD SYMBOLIC LINK TO SWAP DIR (DONE, 2006-05-04, hartera)
# RE-MAKE MM8 CHAINS AND NET SWAP WITH DANRER4 RANDOM CHROMS
# (DONE, 2006-05-24, hartera) ADDED LINK TO SWAP DIR (2006-05-27, hartera)
ssh pk
cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
# blastz parameters used in blastz alignment of danRer4 on mm8:
# BLASTZ_ABRIDGE_REPEATS=1
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_M=50
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF > swap.out 2>&1 &
ssh hgwdev
cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
time nice -n +19 featureBits danRer4 chainMm8Link \
> fb.danRer4.chainDanRer4Link 2>&1 &
cat fb.danRer4.chainDanRer4Link
# 60721886 bases of 1626093931 (3.734%) in intersection
# Add symbolic link to new swap directory (2006-05-27, hartera)
ssh kkstore04
cd /cluster/data/danRer4/bed
ln -s blastz.mm8.swap blastz.mm8
###########################################################################
# MONDOM4 BLASTZ TESTS USING LINEAGE-SPECIFIC REPEATS OR DYNAMIC MASKING
# AND SWAP (DONE, 2006-04-28, hartera)
# used no lineage specific-repeats and M=50 for dynamic masking
featureBits danRer4 chainMonDom4
# 541863023 bases of 1626093931 (33.323%) in intersection
featureBits danRer4 chainMonDom4NoDyMsk
# 534445657 bases of 1626093931 (32.867%) in intersection
featureBits monDom4 chainDanRer4
# 856404995 bases of 3501643220 (24.457%) in intersection
featureBits monDom4 chainDanRer4NoDyMsk
# 812142533 bases of 3501643220 (23.193%) in intersection
featureBits -chrom=chr1 danRer4 refGene:cds chainMonDom4Link -enrichment
# refGene:cds 0.732%, chainMonDom4Link 5.573%, both 0.550%, cover 75.20%,
# enrich 13.49x
featureBits -chrom=chr1 danRer4 refGene:cds chainMonDom4NoDyMskLink \
  -enrichment
# refGene:cds 0.732%, chainMonDom4NoDyMskLink 4.083%, both 0.550%,
# cover 75.15%, enrich 18.40x
featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer4Link -enrichment
# refGene:cds 0.001%, chainDanRer4Link 2.448%, both 0.000%,
# cover 55.63%, enrich 22.73x
featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer4NoDyMskLink \
  -enrichment
# refGene:cds 0.001%, chainDanRer4NoDyMskLink 1.807%, both 0.000%,
# cover 43.85%, enrich 24.27x
# There are only 36 RefSeq genes for monDom4 so results are misleading.
# Try mrna and xenoRefGene table.
# for mrna tables, not much difference:
featureBits -chrom=chr1 monDom4 mrna chainDanRer4Link -enrichment
# mrna 0.004%, chainDanRer4Link 2.448%, both 0.002%, cover 54.59%,
# enrich 22.30x
featureBits -chrom=chr1 monDom4 mrna chainDanRer4NoDyMskLink -enrichment
# mrna 0.004%, chainDanRer4NoDyMskLink 1.807%, both 0.002%,
# cover 52.67%, enrich 29.15x
featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer4Link -enrichment
# xenoRefGene:cds 0.820%, chainDanRer4Link 2.448%, both 0.655%,
# cover 79.88%, enrich 32.63x
featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer4NoDyMskLink \
  -enrichment
# xenoRefGene:cds 0.820%, chainDanRer4NoDyMskLink 1.807%, both 0.661%,
# cover 80.63%, enrich 44.63x
# For the nets:
featureBits -chrom=chr1 danRer4 refGene:cds netMonDom4 -enrichment
# refGene:cds 0.732%, netMonDom4 31.056%, both 0.612%,
# cover 83.58%, enrich 2.69x
featureBits -chrom=chr1 danRer4 refGene:cds netMonDom4NoDyMsk -enrichment
# refGene:cds 0.732%, netMonDom4NoDyMsk 31.002%, both 0.617%,
# cover 84.31%, enrich 2.72x
featureBits -chrom=chr1 monDom4 refGene:cds netDanRer4 -enrichment
# refGene:cds 0.001%, netDanRer4 25.224%, both 0.000%,
# cover 66.95%, enrich 2.65x
featureBits -chrom=chr1 monDom4 refGene:cds netDanRer4NoDyMsk -enrichment
# refGene:cds 0.001%, netDanRer4NoDyMsk 24.539%, both 0.000%,
# cover 49.19%, enrich 2.00x
# rows in tables for chr1
# Assembly Table Number of rows
# danRer4 chainMonDom4 36931
# danRer4 chainMonDom4Link 426659
# danRer4 chainMonDom4NoDyMsk 34363
# danRer4 chainMonDom4NoDyMskLink 361572
# monDom4 chainDanRer4 170759
# monDom4 chainDanRer4Link 2552995
# monDom4 chainDanRer4NoDyMsk 139797
# monDom4 chainDanRer4NoDyMskLink 1806858
# all chroms:
# danRer4 netMonDom4 399531
# danRer4 netMonDom4NoDyMsk 346482
# monDom4 netDanRer4 395881
# monDom4 netDanRer4NoDyMsk 321288
# Use lineage-specific repeats and no dynamic masking: this gives better
# enrichment and coverage relative to gene CDS regions, and fewer chains
# are produced.
###########################################################################
# BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR
# OPOSSUM (monDom4) (DONE, 2006-04-28 - 2006-04-29, hartera)
ssh hgwdev
# Remove all test chain and net tables and start again
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
hgsql -e "drop table chr${c}_chainMonDom4;" danRer4
hgsql -e "drop table chr${c}_chainMonDom4Link;" danRer4
hgsql -e "drop table chr${c}_chainMonDom4NoDyMsk;" danRer4
hgsql -e "drop table chr${c}_chainMonDom4NoDyMskLink;" danRer4
end
hgsql -e "drop table netMonDom4;" danRer4
hgsql -e "drop table netMonDom4NoDyMsk;" danRer4
# remove downloads
rm -r /usr/local/apache/htdocs/goldenPath/danRer4/vsMonDom4
rm \
/usr/local/apache/htdocs/goldenPath/danRer4/liftOver/danRer4ToMonDom4.over.chain.gz
rm /cluster/data/danRer4/bed/liftOver/danRer4ToMonDom4.over.chain.gz
# remove old Blastz swap
rm -r /cluster/data/danRer4/bed/blastz.monDom4.swap
# remove link to old blastz directory
rm -r /cluster/data/danRer4/bed/blastz.monDom4
# see makeMonDom4.doc for removal of test tables and download files
# and swap directory on monDom4.
ssh pk
# Blastz uses lineage-specific repeats. There are none defined for the
# opossum/zebrafish pair, so all repeats are used as lineage-specific.
# There is a lineage-specific repeats directory for zebrafish (see the
# CREATE ZEBRAFISH AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS DIRECTORY
# section above). Lineage-specific repeats and nibs were made for monDom4 -
# see makeMonDom4.doc. Nib files are needed when running Blastz with
# lineage-specific repeats.
mkdir -p /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28
cd /cluster/data/danRer4/bed
ln -s blastz.monDom4.2006-04-28 blastz.monDom4
cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28
# Only 5% of the danRer4 genome is now in the random unordered chroms,
# so do not run those chroms as scaffolds - run them as virtual chroms.
# Use the same parameters as for danRer2, but with all repeats treated as
# lineage-specific, since monDom4 is now mapped to chroms.
cat << 'EOF' > DEF
# danRer4 zebrafish target, opossum monDom4 query
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# use parameters suggested for human-fish evolutionary distance
# recommended in doBlastzChainNet.pl help.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
# TARGET: zebrafish (danRer4)
# Use all chroms, including both randoms (chrNA_random and chrUn_random)
SEQ1_DIR=/san/sanvol1/scratch/danRer4/nib
SEQ1_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers
SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
# QUERY: opossum (monDom4)
SEQ2_DIR=/san/sanvol1/scratch/monDom4/nib
SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ2_SMSK=/san/sanvol1/scratch/monDom4/linSpecRep.notInOthers
SEQ2_CHUNK=50000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/danRer4/bed/blastz.monDom4.2006-04-28
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
chmod +x DEF
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF >& doBlastz.log &
# Start: Fri Apr 28 13:27 Finish: Apr 29 01:28
# Stopped after making and merging chains:
# netChains: looks like previous stage was not successful
# (can't find [danRer4.monDom4.]all.chain[.gz]).
# Start again with net step and continue:
cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue net `pwd`/DEF >& net.log &
# Took about 15 minutes to finish.
# Do swap to get danRer4 alignments on monDom4:
# see also makeMonDom4.doc
cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& doSwap.log &
# Took about 15 minutes.
###########################################################################
# BLASTZ FOR FUGU (fr1) (DONE, 2006-04-28 - 2006-04-29, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# No lineage-specific repeats for this species pair. fr1 is in scaffolds,
# so abridging repeats would not be straightforward for this run anyway.
# There is a 2bit file of the scaffolds on the Iservers.
# Run this with dynamic masking instead.
# copy masked fr1 scaffolds 2 bit file to the san - see makeFr1.doc
# size of scaffolds FASTA file:
ssh kkr1u00
faSize /panasas/store/fr1/scaffolds/scaffoldMaskedUnFr1.fa
# 329140338 bases
ssh pk
mkdir /cluster/data/danRer4/bed/blastz.fr1.2006-04-28
cd /cluster/data/danRer4/bed
ln -s blastz.fr1.2006-04-28 blastz.fr1
cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28
# Use the parameters used for fr1 in makeDanRer2.doc. Using scaffolds makes
# this run slower, so it is best to have the scaffolds on the query side.
# Use the HoxD55.q matrix as Fugu is quite distant from zebrafish. Blastz can
# use lineage-specific repeats, but there are none for these two species, so
# use soft-masked scaffolds and dynamic masking instead.
cat << '_EOF_' > DEF
# zebrafish (danRer4) vs. Fugu (fr1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=0
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - zebrafish (danRer4)
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit
SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
# 0.5 Mb chunk for target with 5 kb overlap
SEQ1_LIMIT=30
SEQ1_CHUNK=500000
SEQ1_LAP=5000
# QUERY - Fugu (fr1)
SEQ2_DIR=/san/sanvol1/scratch/fr1/fr1.2bit
# soft-masked scaffolds in 2bit format
SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/UnScaffolds/fr1UnScaffolds.2bit
SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/UnScaffolds/scaffolds.sizes
# large enough chunk to do whole genome at once
SEQ2_CHUNK=500000000
SEQ2_LAP=0
BASE=/cluster/data/danRer4/bed/blastz.fr1.2006-04-28
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF >& doBlastz.log &
# Start: Fri Apr 28 18:54 Finish: Apr 29 06:35
# Stopped after making and merging chains:
# netChains: looks like previous stage was not successful
# (can't find [danRer4.fr1.]all.chain[.gz]).
# Start again with net step and continue:
cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue net `pwd`/DEF >& net.log &
# Took about an hour to finish.
# check coverage:
featureBits danRer4 chainFr1Link
# 139280554 bases of 1626093931 (8.565%) in intersection
featureBits danRer3 chainFr1Link
# 137698495 bases of 1630323462 (8.446%) in intersection
featureBits -chrom=chr1 danRer4 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.732%, chainFr1Link 8.464%, both 0.660%,
# cover 90.18%, enrich 10.66x
featureBits -chrom=chr1 danRer3 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.774%, chainFr1Link 8.364%, both 0.713%,
# cover 92.09%, enrich 11.01x
featureBits -chrom=chr1 danRer4 refGene:cds netFr1 -enrichment
# refGene:cds 0.732%, netFr1 52.712%, both 0.710%,
# cover 96.97%, enrich 1.84x
featureBits -chrom=chr1 danRer3 refGene:cds netFr1 -enrichment
# refGene:cds 0.774%, netFr1 58.353%, both 0.759%,
# cover 97.95%, enrich 1.68x
# Do the Blastz swap to get danRer4 alignments on fr1
# see also makeFr1.doc for featureBits on these alignments.
ssh pk
cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& doSwap.log &
# Took about 30 minutes.
###########################################################################
# BLASTZ FOR TETRAODON (tetNig1) (DONE, 2006-04-29 - 2006-04-30, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# No lineage-specific repeats for this species pair.
# Tetraodon also has no species-specific repeats in the RepeatMasker
# library, so run this using dynamic masking instead, as for danRer2 and
# danRer3.
# The tetraodon 2bit file (tetNig1ChromsRandomScafs.2bit) contains the
# sequences for the assembled chroms plus the scaffolds of the random chroms.
ssh pk
mkdir /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29
cd /cluster/data/danRer4/bed
ln -s blastz.tetNig1.2006-04-29 blastz.tetNig1
cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29
# Use the parameters used for tetNig1 in makeDanRer3.doc. Using scaffolds
# makes this run slower, so it is best to have the scaffolds on the query
# side. Use the HoxD55.q matrix as tetraodon is quite distant from zebrafish.
# Blastz can use lineage-specific repeats, but there are none for these two
# species, so use soft-masked scaffolds and dynamic masking instead.
cat << '_EOF_' > DEF
# zebrafish (danRer4) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=0
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - zebrafish (danRer4)
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit
SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
# 0.5 Mb chunk for target with 5 kb overlap
SEQ1_LIMIT=30
SEQ1_CHUNK=500000
SEQ1_LAP=5000
# QUERY - Tetraodon (tetNig1)
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
# soft-masked chroms and random scaffolds in 2bit format
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes
# large enough chunk to do whole genome at once
SEQ2_CHUNK=1000000000
SEQ2_LAP=0
BASE=/cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF >& doBlastz.log &
# Start: Sat Apr 29 18:10 Finish: Apr 29 22:41
# Stopped after making and merging chains:
# netChains: looks like previous stage was not successful
# (can't find [danRer4.tetNig1.]all.chain[.gz]). However, this file
# is there so start again with net step and continue:
cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue net `pwd`/DEF >& net.log &
# Took about 20 minutes to finish.
# check coverage compared to danRer3:
featureBits danRer4 chainTetNig1Link
# 119439512 bases of 1626093931 (7.345%) in intersection
featureBits danRer3 chainTetNig1Link
# 109205244 bases of 1630323462 (6.698%) in intersection
featureBits -chrom=chr1 danRer4 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.732%, chainTetNig1Link 7.536%, both 0.645%,
# cover 88.08%, enrich 11.69x
featureBits -chrom=chr1 danRer3 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.774%, chainTetNig1Link 6.821%, both 0.692%,
# cover 89.34%, enrich 13.10x
featureBits -chrom=chr1 danRer4 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.732%, netTetNig1 55.116%, both 0.705%,
# cover 96.33%, enrich 1.75x
featureBits -chrom=chr1 danRer3 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.774%, netTetNig1 61.540%, both 0.753%,
# cover 97.24%, enrich 1.58x
# Similar coverage as for tetNig1 chains and nets on zebrafish danRer3.
# Do the Blastz swap to get danRer4 alignments on tetNig1
# see also makeTetNig1.doc for featureBits for these alignments on tetNig1.
ssh pk
cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& doSwap.log &
# Took about 22 minutes to run.
###########################################################################
# MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2006-05-01, hartera)
# RE-MAKE DOWNLOADS FOR AGP, SOFT AND HARD MASKED CHROMS, REPEATMASKER OUT
# BECAUSE THEY DID NOT INCLUDE NA_RANDOM AND UN_RANDOM
# (DONE, 2007-03-29, hartera)
# NOTE THAT zipAll.csh MUST BE ALTERED ACCORDINGLY IN FUTURE.
ssh kkstore01
cd /cluster/data/danRer4
#- Build the .tar.gz and *.gz files for bigZips
cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf bigZips
mkdir bigZips
tar cvzf bigZips/chromAgp.tar.gz ?{,?}/chr*.agp
tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out
tar cvzf bigZips/chromFa.tar.gz ?{,?}/chr*.fa
tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}/chr*.fa.masked
# soft masked chrNA and chrUn scaffolds
tar cvzf bigZips/scaffoldRandomsFa.tar.gz NA_random/scaffoldNA_random.fa \
Un_random/scaffoldUn_random.fa
cd bed/simpleRepeat
tar cvzf ../../bigZips/chromTrf.tar.gz trfMaskChrom/chr*.bed
cd ../..
# get GenBank native mRNAs
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=danRer4 -native GenBank mrna \
/cluster/data/danRer4/bigZips/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=danRer4 -xeno GenBank mrna \
/cluster/data/danRer4/bigZips/xenoMrna.fa
# get native RefSeq mRNAs
./bin/i386/gbGetSeqs -db=danRer4 -native refseq mrna \
/cluster/data/danRer4/bigZips/refMrna.fa
# get native GenBank ESTs
./bin/i386/gbGetSeqs -db=danRer4 -native GenBank est \
/cluster/data/danRer4/bigZips/est.fa
# gzip the GenBank sequences (the upstream sequence files for RefSeq are
# made later, on hgwdev).
cd /cluster/data/danRer4/bigZips
gzip *.fa
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/zipAll.csh
csh -ef ./jkStuff/zipAll.csh >& zipAll.log &
# Took about 35 minutes.
#- Look at zipAll.log to make sure all file lists look reasonable.
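# (sketch, not part of the original run) another quick check: list the
# top-level directories each tarball pulled files from, e.g. for the
# soft-masked chroms:
tar tzf bigZips/chromFa.tar.gz | awk -F/ '{print $1}' | sort -u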
# Make upstream files for zebrafish RefSeq and Copy the .gz files to
# hgwdev:/usr/local/apache/...
ssh hgwdev
cd /cluster/data/danRer4/bigZips
foreach I (1000 2000 5000)
featureBits danRer4 refGene:upstream:${I} -fa=stdout \
| gzip -c > upstream${I}.fa.gz
echo "upstream${I} done"
end
set gp = /usr/local/apache/htdocs/goldenPath/danRer4
mkdir -p $gp/bigZips
cp -p *.gz $gp/bigZips
mkdir -p $gp/chromosomes
# Add individual chromosomes and file of scaffolds for each random chrom
# to chromosomes downloads directory.
foreach f (../*/chr*.fa)
cp $f $gp/chromosomes
end
foreach c (NA_random Un_random)
cd /cluster/data/danRer4/$c
cp scaffold${c}.fa $gp/chromosomes
end
# create md5sum for bigZips
cd $gp/bigZips
md5sum *.gz > md5sum.txt
# gzip each chrom or scaffolds for chrom separately in chromosomes dir
cd $gp/chromosomes
foreach f (*.fa)
gzip $f
end
# create md5sum for chromosomes
md5sum *.gz > md5sum.txt
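# (sketch, not part of the original run) verify the checksums in both
# download directories before requesting a push:
md5sum -c md5sum.txt
(cd $gp/bigZips; md5sum -c md5sum.txt)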
# Take a look at bigZips/* and chromosomes/*
# copy README.txt's from danRer3 and update
# RE-MAKE DOWNLOADS FOR AGP, SOFT AND HARD MASKED CHROMS, REPEATMASKER OUT
# BECAUSE THEY DID NOT INCLUDE NA_RANDOM AND UN_RANDOM
# (DONE, 2007-03-29, hartera)
# NOTE THAT zipAll.csh MUST BE ALTERED ACCORDINGLY IN FUTURE.
ssh kkstore04
cd /cluster/data/danRer4
#- Rebuild the .tar.gz files (agp, RepeatMasker out, soft- and hard-masked
# chroms) for bigZips
cat << '_EOF_' > jkStuff/zip2.csh
rm -r bigZips/chromAgp.tar.gz
rm -r bigZips/chromOut.tar.gz
rm -r bigZips/chromFa.tar.gz
rm -r bigZips/chromFaMasked.tar.gz
tar cvzf bigZips/chromAgp.tar.gz ?{,?}{,_random}/chr*.agp
tar cvzf bigZips/chromOut.tar.gz ?{,?}{,_random}/chr*.fa.out
tar cvzf bigZips/chromFa.tar.gz ?{,?}{,_random}/chr*.fa
tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}{,_random}/chr*.fa.masked
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/zip2.csh
csh -ef ./jkStuff/zip2.csh >& zip2.log &
# Took about 10 minutes
# Links to these files already exist from the
# /usr/local/apache/htdocs/goldenPath/danRer4/bigZips directory.
# Recreate the md5sum there to include these new files.
cd /usr/local/apache/htdocs/goldenPath/danRer4/bigZips
rm md5sum.txt
md5sum *.gz > md5sum.txt
###########################################################################
# HUMAN (hg18) PROTEINS TRACK FOR danRer4 (DONE, 2006-04-28 - 2006-05-03, hartera)
ssh kkstore01
bash # if not using bash shell already
# make Blast database for non-random chrom sequences
mkdir -p /cluster/data/danRer4/blastDb
cd /cluster/data/danRer4/blastDb
cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA_random/d" \
| sed "/Un_random/d" > chrom.list
for i in `cat chrom.list`;
do ls -1 ../$i/*/*.fa . ; done | sed -n "/.*_.*_.*_.*/p" > list
ln -s `cat list` .
for i in *.fa
do
/projects/compbio/bin/i686/formatdb -i $i -p F
done
rm *.log *.fa list
cd /cluster/data/danRer4
for i in `cat blastDb/chrom.list`;
do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft
rm blastDb/chrom.list
# Now make Blast database for random scaffolds sequences.
mkdir /cluster/data/danRer4/scaffoldBlastDb
cd /cluster/data/danRer4/scaffoldBlastDb
# Take file of all scaffolds for NA_random and Un_random and cat together
cat ../NA_random/scaffoldNA_random.fa ../Un_random/scaffoldUn_random.fa \
> allRandomScafs.fasta
grep '>' allRandomScafs.fasta | wc -l
# 2966
faSplit sequence allRandomScafs.fasta 500 scaf
rm allRandomScafs.fasta
for i in *.fa
do
/projects/compbio/bin/i686/formatdb -i $i -p F
done
rm *.log *.fa
# combine databases for chroms and random chroms
mkdir -p /san/sanvol1/scratch/danRer4/comboBlastDb
cd /cluster/data/danRer4/blastDb
for i in nhr nin nsq;
do cp *.$i /san/sanvol1/scratch/danRer4/comboBlastDb;
done
cd /cluster/data/danRer4/scaffoldBlastDb
for i in nhr nin nsq;
do cp *.$i /san/sanvol1/scratch/danRer4/comboBlastDb;
done
mkdir -p /cluster/data/danRer4/bed/tblastn.hg18KG
cd /cluster/data/danRer4/bed/tblastn.hg18KG
echo /san/sanvol1/scratch/danRer4/comboBlastDb/*.nsq \
| xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 4377 query.lst
# we want around 250000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(250000/4377) = 643.016316
mkdir -p /cluster/bluearc/danRer4/bed/tblastn.hg18KG/kgfa
split -l 643 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl \
/cluster/bluearc/danRer4/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/danRer4/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice /cluster/home/braney/bin/x86_64/pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
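# (sketch, not part of the original run) sanity-check the planned job count:
# (number of blastDb chunks) x (number of kg fasta files) should come out
# close to the ~250000 jobs targeted above.
echo $(( `wc -l < query.lst` * `wc -l < kg.lst` ))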
mkdir -p /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
exit # back to tcsh
cd /cluster/data/danRer4/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/danRer4/jkStuff/subChr.lft carry $f.2
liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/danRer4/jkStuff/liftAll.lft carry $f.3
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.4
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
# then run the Blast cluster jobs
ssh kk
cd /cluster/data/danRer4/bed/tblastn.hg18KG
para create blastSpec
para try, check, push, check etc.
# pushed 100,000 jobs at a time so need to do para push again later
para time
# Completed: 253866 of 253866 jobs
# CPU time in finished jobs: 52410110s 873501.83m 14558.36h 606.60d 1.662 y
# IO & Wait Time: 5508786s 91813.10m 1530.22h 63.76d 0.175 y
# Average job time: 228s 3.80m 0.06h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2162s 36.03m 0.60h 0.03d
# Submission to last job: 147825s 2463.75m 41.06h 1.71d
# Took a while as had to repush some crashed jobs.
ssh kkstore01
cd /cluster/data/danRer4/bed/tblastn.hg18KG
tcsh
mkdir chainRun
cd chainRun
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS \
/cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh kk
cd /cluster/data/danRer4/bed/tblastn.hg18KG/chainRun
para create chainSpec
para try, check, push, check etc.
# Completed: 58 of 58 jobs
# CPU time in finished jobs: 759034s 12650.56m 210.84h 8.79d 0.024 y
# IO & Wait Time: 217724s 3628.74m 60.48h 2.52d 0.007 y
# Average job time: 16841s 280.68m 4.68h 0.19d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 208828s 3480.47m 58.01h 2.42d
# Submission to last job: 208891s 3481.52m 58.03h 2.42d
ssh kkstore01
cd /cluster/data/danRer4/bed/tblastn.hg18KG/blastOut
bash # if using another shell
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
liftUp -nohead -type=.psl stdout \
/cluster/data/danRer4/jkStuff/liftAll.lft carry u.*.psl m60* | \
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq \
> /cluster/data/danRer4/bed/tblastn.hg18KG/blastHg18KG.psl
pslCheck blastHg18KG.psl
# this is ok.
# load table
ssh hgwdev
cd /cluster/data/danRer4/bed/tblastn.hg18KG
hgLoadPsl danRer4 blastHg18KG.psl
# check coverage
featureBits danRer4 blastHg18KG
# 21159392 bases of 1626093931 (1.301%) in intersection
featureBits danRer3 blastHg17KG
# 21063005 bases of 1630323462 (1.292%) in intersection
featureBits -chrom=chr1 danRer4 refGene:cds blastHg18KG -enrichment
# refGene:cds 0.732%, blastHg18KG 1.333%, both 0.428%, cover 58.43%,
# enrich 43.83x
featureBits -chrom=chr1 danRer3 refGene:cds blastHg17KG -enrichment
# refGene:cds 0.774%, blastHg17KG 1.370%, both 0.450%, cover 58.05%,
# enrich 42.38x
# Similar coverage compared to refGene CDS as for hg17 proteins on danRer3.
# back to kkstore04 to clean up
ssh kkstore04
rm -rf /cluster/data/danRer4/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut
# Add a trackDb.ra entry in ~/kent/src/hg/makeDb/trackDb/trackDb.ra;
# the blastHg18KG.html description page was also added there.
# The blastKGPep04 and blastKGRef04 tables are required on hg18 - these have
# been created - see makeHg18.doc. An update of hgc.c, hgTrackUi.c and
# hgTracks.c was also required - done by Brian.
###########################################################################
# MULTIZ7WAY ALIGNMENTS FOR CONSERVATION TRACK
# (DONE, 2006-05-04 - 2006-05-10, hartera)
# RE-MAKE WITH DANRER4 RANDOMS FOR MM8 AND ADDED FRAMES TABLE AND
# MULTIZ7WAY DOWNLOADS (DONE, 2006-05-28 - 2006-05-29, hartera)
# for tetNig1, fr1, xenTro2, monDom4, mm8 and hg18.
ssh kkstore04
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28
cd /cluster/data/danRer4/bed
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28
# copy MAFs to a cluster-friendly server
rm -r /san/sanvol1/scratch/danRer4/mafNet
mkdir /san/sanvol1/scratch/danRer4/mafNet
foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18)
echo $s
rsync -av /cluster/data/danRer4/bed/blastz.$s/mafNet/* \
/san/sanvol1/scratch/danRer4/mafNet/$s/
end
# prune the hg18 17way tree to just these 7 and update db names:
/cluster/bin/phast/tree_doctor \
--prune-all-but=mouse_mm8,human_hg18,monodelphis_monDom4,xenopus_xenTro1,tetraodon_tetNig1,fugu_fr1,zebrafish_danRer3 \
--rename="xenopus_xenTro1 -> xenopus_xenTro2 ; zebrafish_danRer3 -> zebrafish_danRer4" \
/cluster/data/hg18/bed/multiz17way/17way.nh > 7way.nh
# carefully edit so that danRer4 is first. copy first to new file
cp 7way.nh 7way_zfishFirst.nh
# /cluster/bin/phast/draw_tree 7way_zfishFirst.nh > 7way.ps
# also made the ps file for the 7way.nh and compared to make sure
# that the tree with zebrafish at the top looks correct.
/cluster/bin/phast/all_dists 7way_zfishFirst.nh > 7way.distances
grep danRer4 7way.distances | sort -k3,3n | \
awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
cat distances.txt
# 1.4749 tetraodon_tetNig1
# 1.5154 fugu_fr1
# 1.7480 human_hg18
# 1.7782 monodelphis_monDom4
# 1.8771 xenopus_xenTro2
# 2.1058 mouse_mm8
# the order in the browser display will be by tree topology,
# not by distance, so they will be:
# danRer4
# 1.5154 fugu_fr1
# 1.4749 tetraodon_tetNig1
# 1.8771 xenopus_xenTro2
# 1.7782 monodelphis_monDom4
# 2.1058 mouse_mm8
# 1.7480 human_hg18
# create species list and stripped down tree for autoMZ
sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' \
7way_zfishFirst.nh > tree-commas.nh
sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst
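# (sketch, not part of the original run) check the stripped-down species
# list - danRer4 should come first, followed by the other six assemblies
# in tree order:
cat species.lst
# expect: danRer4 fr1 tetNig1 xenTro2 monDom4 mm8 hg18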
ssh pk
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28
mkdir maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = danRer4
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/mafNet
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << emacs
awk '{print $1}' /cluster/data/danRer4/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para create jobList
para try, check, push, check etc. ...
para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 7022s 117.03m 1.95h 0.08d 0.000 y
IO & Wait Time: 142s 2.37m 0.04h 0.00d 0.000 y
Average job time: 256s 4.26m 0.07h 0.00d
Longest running job: 0s 0.00m 0.00h 0.00d
Longest finished job: 368s 6.13m 0.10h 0.00d
Submission to last job: 705s 11.75m 0.20h 0.01d
# Make .jpg for tree and install in htdocs/images/phylo/... don't forget
# to request a push of that file. The treeImage setting in trackDb.ra
# is phylo/danRer4_7way.jpg (relative to htdocs/images).
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28
cat << '_EOF_' > species7.nh
((zebrafish,(Fugu,Tetraodon)),(X. tropicalis,(opossum,(mouse,human))))
'_EOF_'
/cluster/bin/phast/draw_tree species7.nh > species7way.ps
# ask Bob to resize image for Browser track description page and convert
# to JPEG and rename as danRer4_7way.jpg
# Build maf annotation and load database
ssh kolossus
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno
mkdir maf run
cd run
rm -f sizes nBeds
foreach db (`cat /cluster/data/danRer4/bed/multiz7way.2006-05-28/species.lst`)
ln -s /cluster/data/$db/chrom.sizes $db.len
if (! -e /cluster/data/$db/$db.N.bed) then
twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
endif
ln -s /cluster/data/$db/$db.N.bed $db.bed
echo $db.bed >> nBeds
echo $db.len >> sizes
end
echo date > jobs.csh
# do smaller jobs first:
foreach f (`ls -1rS ../../maf/*.maf`)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
/cluster/data/danRer4/danRer4.2bit ../maf/`basename $f` \
>> jobs.csh
echo "echo $f" >> jobs.csh
end
echo date >> jobs.csh
csh -efx jobs.csh >&! jobs.log &
tail -f jobs.log
# Took 27 minutes to run.
# Load anno/maf
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf
mkdir -p /gbdb/danRer4/multiz7way/anno/maf
ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf/*.maf \
/gbdb/danRer4/multiz7way/anno/maf
# delete old files from extFile table
hgsql -e 'delete from extFile where path like "%multiz7way/anno/maf%";' \
danRer4
cat > loadMaf.csh << 'EOF'
date
nice hgLoadMaf -pathPrefix=/gbdb/danRer4/multiz7way/anno/maf danRer4 multiz7way
date
'EOF'
# << emacs
csh -efx loadMaf.csh >&! loadMaf.log & tail -f loadMaf.log
# Took about 1 minute.
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kkr7u00
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf
cat *.maf \
| nice hgLoadMafSummary danRer4 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 -test multiz7waySummary stdin
# Created 820403 summary blocks from 4245668 components and
# 2120803 mafs from stdin
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf
sed -e 's/mafSummary/multiz7waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
> /tmp/multiz7waySummary.sql
time nice hgLoadSqlTab danRer4 multiz7waySummary \
/tmp/multiz7waySummary.sql multiz7waySummary.tab
# 0.000u 0.000s 2:05.26 0.0% 0+0k 0+0io 209pf+0w
rm *.tab /tmp/multiz7waySummary.sql
# zip mafs:
ssh kkstore04
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf
cat > zipMafs.csh << 'EOF'
date
foreach f (chr*.maf)
set c = $f:r
echo $c
nice gzip -c $f > $c.maf.gz
end
date
'EOF'
time csh -efx zipMafs.csh >&! zip.log
# 219.706u 1.939s 3:41.75 99.9% 0+0k 0+0io 0pf+0w
rm *.maf
# add Frames table:
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28/frames
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/frames
# The following is adapted from MarkD's Makefile used for mm7...
# and used in makeRn4.doc.
#------------------------------------------------------------------------
# get the genes for all genomes
# using mrna for danRer4
# using knownGene for mm8 hg18
# using mgcGenes for xenTro2
# using ensGene for fr1
# no genes for monDom4 and tetNig1
# targetDb = danRer4
# queryDbs = mm8 hg18 xenTro2 fr1 (to build frames for)
# genePreds; (must keep only the first 10 columns for knownGene)
# mRNAs with CDS. single select to get cds+psl, then split that up and
# create genePred
# using mrna table as genes: danRer4
mkdir genes
foreach queryDb (danRer4)
set tmpExt = `mktemp temp.XXXXXX`
set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
set tmpMrna = ${queryDb}.mrna.${tmpExt}
set tmpCds = ${queryDb}.cds.${tmpExt}
echo $queryDb
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
${queryDb} > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
stdout \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
# using knownGene for mm8 hg18
# using mgcGenes for xenTro2
# using ensGene for fr1
foreach queryDb (mm8 hg18 xenTro2 fr1)
if ($queryDb == "xenTro2") then
set geneTbl = mgcGenes
else if ($queryDb == "fr1") then
set geneTbl = ensGene
else
set geneTbl = knownGene
endif
hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f 1-10 \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
#------------------------------------------------------------------------
# create frames
set clusterDir = /cluster/bluearc/danRer4/multiz7wayFrames
set multizDir = /cluster/data/danRer4/bed/multiz7way.2006-05-28
set mafDir = $multizDir/maf
set geneDir = $multizDir/frames/genes
set clusterMafDir = ${clusterDir}/maf
set clusterGeneDir = ${clusterDir}/genes
set clusterFramesDir = ${clusterDir}/mafFrames.kki
# copy mafs to cluster storage
mkdir $clusterDir
ssh -x kkstore04 "rsync -av $mafDir/*.maf.gz $clusterMafDir/"
# copy genes to cluster storage
ssh -x kkstore04 "rsync -av $geneDir/*.gp.gz $clusterGeneDir/"
# run cluster jobs
set tmpExt = `mktemp temp.XXXXXX`
set paraDir = $multizDir/frames/para.${tmpExt}
mkdir mafFrames $paraDir
rm -f $paraDir/jobList
mkdir ${clusterFramesDir}
foreach queryDb (`cat /cluster/data/danRer4/bed/multiz7way.2006-05-28/species.lst`)
mkdir ${clusterFramesDir}/${queryDb}
foreach c (`awk '{print $1;}' /cluster/data/danRer4/chrom.sizes`)
if (-e ${clusterGeneDir}/${queryDb}.gp.gz) then
echo /cluster/bin/scripts/mkMafFrames.pl ${queryDb} danRer4 \
${clusterGeneDir}/${queryDb}.gp.gz ${clusterMafDir}/$c.maf.gz \
${clusterFramesDir}/${queryDb}/$c.mafFrames \
>> $paraDir/jobList
endif
end
end
rm -f $tmpExt
ssh -x kki "cd ${paraDir} && para make jobList && para time"
# Completed: 140 of 140 jobs
# CPU time in finished jobs: 255s 4.25m 0.07h 0.00d 0.000 y
# IO & Wait Time: 360s 6.00m 0.10h 0.00d 0.000 y
# Average job time: 4s 0.07m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 8s 0.13m 0.00h 0.00d
# Submission to last job: 55s 0.92m 0.02h 0.00d
# combine results from cluster
foreach queryDb (`cat ../species.lst`)
echo $queryDb
ssh -x kolossus "cat ${clusterFramesDir}/${queryDb}/*.mafFrames | gzip -2c > ${multizDir}/frames/mafFrames/${queryDb}.mafFrames.gz"
end
#------------------------------------------------------------------------
# load the database
hgLoadMafFrames danRer4 multiz7wayFrames mafFrames/*.mafFrames.gz
#------------------------------------------------------------------------
# clean up
rm -rf ${clusterDir}
###
# rebuild frames to get bug fix, using 1-pass maf methodology
# (2006-06-09 markd)
ssh kkstore04
cd /cluster/data/danRer4/bed/multiz7way/frames
mv mafFrames/ mafFrames.old
nice tcsh # easy way to get process niced
(zcat ../maf/*.maf.gz | time genePredToMafFrames danRer4 stdin stdout danRer4 genes/danRer4.gp.gz fr1 genes/fr1.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz xenTro2 genes/xenTro2.gp.gz | gzip >multiz7way.mafFrames.gz)>&log&
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way/frames
hgLoadMafFrames danRer4 multiz7wayFrames multiz7way.mafFrames.gz >&log&
# end of multiz7way frames and load
cd /cluster/data/danRer4/bed
ln -s multiz7way.2006-05-28 /cluster/data/danRer4/bed/multiz7way
# create and add the tree image for the description page
# Make .jpg for tree and install in htdocs/images/phylo/... don't forget
# to request a push of that file. The treeImage setting in trackDb.ra
# is phylo/danRer4_7way.jpg (relative to htdocs/images).
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28
cat << '_EOF_' > species7.nh
((zebrafish,(Fugu,Tetraodon)),(X. tropicalis,(opossum,(mouse,human))))
'_EOF_'
/cluster/bin/phast/draw_tree species7.nh > species7way.ps
# ask Bob to resize image for Browser track description page and convert
# to JPEG and rename as danRer4_7way.jpg
ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-28/danRer4_7way.jpg \
/usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg
# change permissions for display if not already readable to all
chmod +r /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg
# check for all.joiner entry for 7-way - it is there already.
# add html and trackDb.ra entry for danRer4:
# track multiz7way
# shortLabel Conservation
# longLabel Vertebrate Multiz Alignment & Conservation
# group compGeno
# priority 104
# visibility pack
# color 0, 10, 100
# altColor 0,90,10
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# wiggle phastCons7way
# pairwiseHeight 12
# spanList 1
# yLineOnOff Off
# autoScale Off
# windowingFunction mean
# summary multiz7waySummary
# frames multiz7wayFrames
# irows on
# speciesGroups vertebrate mammal
# sGroup_vertebrate fr1 tetNig1 xenTro2
# sGroup_mammal monDom4 mm8 hg18
# treeImage phylo/danRer4_7way.jpg
###########################################################################
# MAF DOWNLOADS FOR MULTIZ7WAY (DONE, 2006-05-29, hartera)
# GZIPPED UPSTREAM FILES AND ADDED TO DOWNLOADS AND RE-MADE md5sum.txt
# (DONE, 2006-06-02, hartera)
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28
mkdir mafDownloads
cd mafDownloads
# upstream mafs
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits danRer4 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags danRer4 multiz7way up.bed upstream$i.maf \
-orgs=../species.lst
rm up.bed
end
date
'EOF'
time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
# 57.823u 105.238s 4:13.15 64.4% 0+0k 0+0io 2pf+0w
# add maf downloads for annotated mafs
ssh kkstore04
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../anno/maf/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
time csh downloads.csh >&! downloads.log & tail -f downloads.log
# 446.734u 5.629s 7:38.09 98.7% 0+0k 0+0io 2pf+0w
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/danRer4/multiz7way
mkdir $dir
ln -s \
/cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads/{*.gz,md5sum.txt} \
$dir
cp /usr/local/apache/htdocs/goldenPath/danRer3/multiz5way/README.txt $dir
# edit README.txt
# gzip the upstream maf downloads and remake md5sum.txt
# (2006-06-02, hartera)
ssh kkstore04
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads
foreach f (upstream*.maf)
nice gzip -c $f > $f.gz
end
rm md5sum.txt
md5sum *.gz > md5sum.txt
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/danRer4/multiz7way
rm $dir/md5sum.txt
ln -s \
/cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads/{upstream*.gz,md5sum.txt} $dir
###########################################################################
# PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 7-WAY ALIGNMENT
# (DONE, 2006-05-17 - 2006-05-24, hartera)
# REMAKE CONSERVATION TRACK USING MULTIZ 7-WAY INCLUDING DANRER4 RANDOM CHROMS
# FOR MM8 ALIGNMENTS (DONE, 2006-05-29, hartera)
ssh kkstore04
# Need unzipped maf files for this.
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf
foreach f (*.maf.gz)
echo $f
gunzip -c $f > $f:r
end
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
# create a starting-tree.mod based on chr14 (92 Mb)
# chr14 is the largest chrom apart from chrNA_random
/cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr14.maf \
--refseq ../../../14/chr14.fa --in-format MAF \
--windows 100000000,1000 --out-format SS \
--between-blocks 5000 --out-root s1
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "`cat ../tree-commas.nh`" \
--out-root starting-tree
# took less than a minute
rm s1.*ss
# Get genome-wide average GC content (for all species together,
# not just the reference genome). If you have a globally
# estimated tree model, as above, you can get this from the
# BACKGROUND line in the .mod file. E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.305239 0.194225 0.194292 0.306244
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.389 is the GC content. This is used in the -gc argument below.
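# (e.g. from the BACKGROUND line above: 0.194225 + 0.194292 = 0.388517,
# which rounds to 0.389)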
# If you do *not* have a global tree model and you do not know your
# GC content, you can get it directly from the MAFs with a command
# like:
/cluster/bin/phast/$MACHTYPE/msa_view \
--aggregate danRer4,tetNig1,fr1,xenTro2,monDom4,mm8,hg18 -i MAF \
-S /cluster/data/danRer4/bed/multiz7way/maf/chr*.maf > maf_summary.txt
# This gives a GC content of 0.426 so use this as it is from mafs for
# the whole genome.
# break up the genome-wide MAFs into pieces on the san filesystem
ssh pk
set WINDOWS=/san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/ss
mkdir -p $WINDOWS
cd $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf
set WINDOWS=/san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
set N = `echo $c | sed -e 's/chr//'`
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M /cluster/data/danRer4/$N/$c.fa \
-o SS -w 10000000,0 -I 1000 -B 5000 -r $c/$c
echo "Done" >> $c.done
'EOF'
# << emacs
chmod +x doSplit.csh
rm -f jobList
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
echo "doSplit.csh chr${c} {check out line+ $WINDOWS/chr$c.done}" >> jobList
end
para create jobList
para push, check etc.
para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 831s 13.86m 0.23h 0.01d 0.000 y
# IO & Wait Time: 634s 10.56m 0.18h 0.01d 0.000 y
# Average job time: 52s 0.87m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 118s 1.97m 0.03h 0.00d
# Submission to last job: 118s 1.97m 0.03h 0.00d
# Create a random list of 50 of the 10 Mb regions (do not use chrNA and chrUn)
ls -1l chr*/chr*.ss | grep -v NA | grep -v Un | \
awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
# Set up parasol directory to calculate trees on these 50 regions
ssh pk
set dir = /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons
mkdir -p $dir
cd $dir
# now set up cluster job to estimate model parameters. Parameters
# will be estimated separately for each alignment fragment then
# will be combined across fragments. When tuning target-coverage and
# expected-length, come back to this step and recalculate.
# Create little script that calls phastCons with right arguments
cat > makeTree.csh << 'EOF'
#!/bin/csh -fe
set C = $1:h
set treeRun = $2
set cov = $3
set len = $4
set dir = /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons
mkdir -p $dir/$treeRun/log/${C} $dir/$treeRun/tree/${C}
/cluster/bin/phast/x86_64/phastCons $dir/ss/$1 \
/cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons/starting-tree.mod \
--gc 0.426 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-length $len --target-coverage $cov \
--quiet --log $dir/$treeRun/log/$1 --estimate-trees $dir/$treeRun/tree/$1
'EOF'
# << emacs
chmod a+x makeTree.csh
# Make sure that the correct GC content is substituted in here. Notice
# the target coverage of 0.17. Here we are going to aim
# for 65% coverage of coding regions by conserved elements.
# Create gensub file
# need to add cov and len parameters
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1) $(path2)
#ENDLOOP
'_EOF_'
# happy emacs
# Make cluster job and run it to try out a few parameters close
# to those used for danRer3 and danRer2 phastCons runs.
echo "treeRun1 0.17 12" > tree.lst
echo "treeRun2 0.32 18" >> tree.lst
echo "treeRun3 0.32 20" >> tree.lst
echo "treeRun4 0.35 18" >> tree.lst
gensub2 randomSs.list tree.lst template jobList
para create jobList
para try,check,push,check etc.
# para time
# Completed: 200 of 200 jobs
# CPU time in finished jobs: 68652s 1144.20m 19.07h 0.79d 0.002 y
# IO & Wait Time: 2521s 42.02m 0.70h 0.03d 0.000 y
# Average job time: 356s 5.93m 0.10h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 629s 10.48m 0.17h 0.01d
# Submission to last job: 2356s 39.27m 0.65h 0.03d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
set dir = /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons
foreach d ($dir/treeRun*)
cd $d
ls tree/chr*/*.cons.mod > cons.txt
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \
--output-average ave.cons.mod > cons_summary.txt
ls tree/chr*/*.noncons.mod > noncons.txt
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \
--output-average ave.noncons.mod > noncons_summary.txt
end
# measuring entropy
# usage: consEntropy <target coverage> <expected length> \
#            ave.cons.mod ave.noncons.mod [--NH 9.78]
# (it never finishes when run with the --NH argument, so leave that off)
# target entropy should be L_min*H=9.8 bits (between 9.5 and 10.5 is ok)
# the expected length that produces this entropy is the one
# to use for phastCons.
# for each treeRun, set the appropriate coverage and length
# (tree.lst format: treeRunN cov len); the perl substitution below expands
# each line into the corresponding consEntropy commands
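# for one parameter set the expanded commands look like this (treeRun2 shown
# as an example):
#   cd treeRun2
#   /cluster/bin/phast/x86_64/consEntropy 0.32 18 ave.cons.mod ave.noncons.mod
#   cd ..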
cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons
cp tree.lst entropy.csh
perl -pi.bak -e 's/^(treeRun[0-9]+)\s*([0-9\.]+)\s*([0-9]+)/echo \"Coverage = $2 Length = $3\"\ncd $1\n\/cluster\/bin\/phast\/x86_64\/consEntropy $2 $3 ave.cons.mod ave.noncons.mod\ncd \.\./' entropy.csh
chmod +x entropy.csh
entropy.csh >& entropy.out
# entropy.out
#Coverage = 0.17 Length = 12
#Transition parameters:gamma=0.170000,omega=12.000000, mu=0.083333, nu=0.017068
#Relative entropy: H=0.857449 bits/site
#Expected min. length: L_min=12.298748 sites
#Expected max. length: L_max=8.165741 sites
#Phylogenetic information threshold: PIT=L_min*H=10.545544 bits
#### !!! THESE PARAMETERS BELOW WERE THOSE THAT WERE FINALLY USED ####
# These are the same as for danRer2 and give the targeted L_min*H value.
# This is from treeRun2.
#Coverage = 0.32 Length = 18
#Transition parameters:gamma=0.320000,omega=18.000000, mu=0.055556, nu=0.026144
#Relative entropy: H=0.818130 bits/site
#Expected min. length: L_min=12.025818 sites
#Expected max. length: L_max=9.281106 sites
#Phylogenetic information threshold: PIT=L_min*H=9.838688 bits
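# (check: PIT = L_min*H = 12.025818 * 0.818130 = 9.8387 bits, as reported)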
###
#Coverage = 0.32 Length = 20
#Transition parameters:gamma=0.320000,omega=20.000000, mu=0.050000, nu=0.023529
#Relative entropy: H=0.795926 bits/site
#Expected min. length: L_min=12.724131 sites
#Expected max. length: L_max=9.927736 sites
#Phylogenetic information threshold: PIT=L_min*H=10.127467 bits
#Coverage = 0.35 Length = 18
#Transition parameters:gamma=0.350000,omega=18.000000, mu=0.055556, nu=0.029915
#Relative entropy: H=0.827604 bits/site
#Expected min. length: L_min=11.542637 sites
#Expected max. length: L_max=9.061627 sites
#Phylogenetic information threshold: PIT=L_min*H=9.552732 bits
# need to iterate and get the right coverage and parameters
# try running phastCons below with parameters used above and check the
# coverage of coding regions by the most conserved elements
# Create cluster dir to do main phastCons run
ssh pk
mkdir -p \
/san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun
cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun
cp -p ../treeRun2/ave.*.mod .
cp -p ../treeRun2/ave.*.mod \
/cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
mkdir ppRaw bed
# Create script to run phastCons with right parameters
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
# Use the expected length and target coverage determined above and
# the corresponding average conserved and nonconserved models
cat > doPhast.csh << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/x86_64/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
--expected-length 18 --target-coverage 0.32 --quiet \
--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.*.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
# emacs happy
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast.csh $(root1) $(file1)
#ENDLOOP
'_EOF_'
# happy emacs
# Create parasol batch and run it
ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
gensub2 in.list single template jobList
para create jobList
para try/check/push/etc.
para time
# Completed: 191 of 191 jobs
# CPU time in finished jobs: 4660s 77.67m 1.29h 0.05d 0.000 y
# IO & Wait Time: 2927s 48.78m 0.81h 0.03d 0.000 y
# Average job time: 40s 0.66m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 83s 1.38m 0.02h 0.00d
# Submission to last job: 2246s 37.43m 0.62h 0.03d
# combine predictions and transform scores to be in 0-1000 interval
ssh kkstore04
cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir
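# for example, a path like ./bed/chr1/chr1.1-10000000.bed (coordinates for
# illustration only) becomes " y  x bed x chr1 x chr1 y 1 z 10000000 y bed",
# so field 7 is the chrom name and field 9 is the numeric start coordinate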
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed \
/cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
# Figure out how much is actually covered by the mostConserved data like so:
cd /cluster/data/danRer4
faSize */chr*.fa
# 1774660131 bases (175779328 N's 1598880803 real 816338509 upper
# 782542294 lower) in 28 sequences in 28 files
# Total size: mean 63380719.0 sd 33877121.9 min 16596 (chrM)
# max 208014280 (chrNA_random) median 59765243
# The non-N size is 1598880803 bases
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
awk '{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/1598880803\n",100.0*sum/1598880803,sum}' \
mostConserved.bed
-target-coverage 0.32: % 3.18 = 100.0*50871950/1598880803 length=18
# want to aim for 65% coverage of coding regions
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way/phastCons
# get an or of refGene and mgcGenes CDS regions
featureBits danRer4 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
# 11770580 bases of 1626093931 (0.724%) in intersection
featureBits danRer4 refSeqOrMgcCds.bed mostConserved.bed -enrichment
# refSeqOrMgcCds.bed 0.724%, mostConserved.bed 3.128%, both 0.463%,
# cover 63.94%, enrich 20.44x
# for danRer3:
featureBits danRer3 refSeqOrMgcCdsDanRer3.bed \
/cluster/data/danRer3/bed/multiz5way/mostConserved.bed -enrichment
# refSeqOrMgcCdsDanRer3.bed 0.714%,
# /cluster/data/danRer3/bed/multiz5way/mostConserved.bed 2.998%,
# both 0.474%, cover 66.40%, enrich 22.14x
# so use this result for -target-coverage=0.32 -expected-lengths=18
# with L_min*H entropy (PIT) value of 9.84 (aiming for around 9.8) and
# 63.9% coverage of coding regions with most conserved elements
# (aiming for about 65%)
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
hgsql -e 'drop table phastConsElements;' danRer4
hgLoadBed danRer4 phastConsElements mostConserved.bed
# Loaded 676058 elements of size 5
featureBits danRer4 mgcGenes:cds phastConsElements -enrichment
# mgcGenes:cds 0.560%, phastConsElements 3.128%, both 0.366%,
# cover 65.36%, enrich 20.89x
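# (here "cover" = both/first = 0.366/0.560 = 65.4%, the fraction of mgcGenes
# CDS bases inside conserved elements, and "enrich" = 65.36/3.128 = 20.9x the
# genome-wide density of conserved elements)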
# Create merged posterior probability file and wiggle track data files
# the sed business gets the names sorted by chromName, chromStart
# so that everything goes in numerical order into wigEncode
ssh kkstore04
cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun
find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| wigEncode stdin phastCons7way.wig phastCons7way.wib
# takes a few minutes
ls -l phastCons*
# -rw-rw-r-- 1 hartera protein 255524779 May 29 19:49 phastCons7way.wib
# -rw-rw-r-- 1 hartera protein 61525690 May 29 19:49 phastCons7way.wig
cp -p phastCons7way.wi? /cluster/data/danRer4/bed/multiz7way/phastCons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
mkdir -p /gbdb/danRer4/wib
rm /gbdb/danRer4/wib/phastCons7way.wib
ln -s `pwd`/phastCons7way.wib /gbdb/danRer4/wib/phastCons7way.wib
# use this if need to reload table
hgsql -e 'drop table phastCons7way;' danRer4
# load table
hgLoadWiggle danRer4 phastCons7way phastCons7way.wig
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
bash
time hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=danRer4 phastCons7way > histogram.data 2>&1
# real 0m30.234s
# user 0m23.721s
# sys 0m3.234s
# create plot of histogram:
cat << '_EOF_' > histo.gp
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer4 Histogram phastCons7 track"
set xlabel " phastCons7 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# happy emacs
gnuplot histo.gp > histo.png
display histo.png &
# add line: wiggle phastCons7way to trackDb.ra for multiz7way to display the
# wiggle for the conservation track.
# check all.joiner for entries for phastCons7way and phastConsElements7way -ok
# copy over html for multiz and edit.
###########################################################################
# PHASTCONS SCORES DOWNLOADABLES FOR 7WAY (DONE, 2006-05-30, hartera)
# prepare compressed copy of ascii data values for downloads
ssh kolossus
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28
mkdir phastConsDownloads
cd phastConsDownloads
cat > downloads.csh << 'EOF'
date
cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun/ppRaw
foreach chr (`awk '{print $1}' /cluster/data/danRer4/chrom.sizes`)
echo $chr
cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastConsDownloads/$chr.gz
end
date
'EOF'
# << emacs
csh -efx downloads.csh >&! downloads.log & tail -f downloads.log
# Took ~5 minutes.
md5sum *.gz > md5sum.txt
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastConsDownloads
set dir = /usr/local/apache/htdocs/goldenPath/danRer4/phastCons7wayScores
mkdir $dir
ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastConsDownloads/{*.gz,md5sum.txt} $dir
# copy over and edit README.txt
cd $dir
cp \
/usr/local/apache/htdocs/goldenPath/danRer3/phastCons5wayScores/README.txt .
# Clean up after phastCons run.
ssh kkstore04
rm /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons/*.tab
rm -r /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons
###########################################################################
# CREATED RECIPROCAL BEST NETS AND MAF NETS FOR ALL SPECIES WITH PAIRWISE
# ALIGNMENTS USED FOR MULTIZ MULTIPLE ALIGNMENT
# (DONE, 2006-05-12 - 2006-05-15 , hartera)
# for tetNig1, fr1, xenTro2, monDom4, mm8 and hg18.
ssh kolossus
mkdir /cluster/data/danRer4/bed/rBestRunForMultiz/
cd /cluster/data/danRer4/bed/rBestRunForMultiz
# need to re-run chainNet and keep both the first output (target-referenced,
# target-centric nets) and the second output that we usually send to
# /dev/null (query-referenced, target-centric nets).
cat > rBestNet.csh << 'EOF'
#!/bin/csh -ef
foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18)
echo "Creating Reciprocal Best Net for $s..."
set binDir=/cluster/home/hartera/bin/i386
set dir=/cluster/data/danRer4/bed/blastz.$s/axtChain
cd $dir
# Run chainNet again, this time keeping the second output:
chainPreNet danRer4.$s.all.chain.gz /cluster/data/danRer4/chrom.sizes \
/cluster/data/$s/chrom.sizes stdout \
| $binDir/chainNet stdin /cluster/data/danRer4/chrom.sizes \
/cluster/data/$s/chrom.sizes /dev/null stdout | \
netSyntenic stdin $dir/$s.danRer4_ref.net
# get the other species chains from the other species-referenced
# (but danRer4-centric) net:
chainSwap danRer4.$s.all.chain.gz $s.danRer4.all.chain
netChainSubset -verbose=0 $s.danRer4_ref.net \
$s.danRer4.all.chain stdout \
| chainSort stdin $s.danRer4_ref.subset.chain
# Net those (sorted) danRer4 chains, and keep both outputs, to get
# reciprocal best nets referenced to both species:
chainPreNet $s.danRer4_ref.subset.chain \
/cluster/data/$s/chrom.sizes /cluster/data/danRer4/chrom.sizes stdout \
| $binDir/chainNet stdin /cluster/data/$s/chrom.sizes \
/cluster/data/danRer4/chrom.sizes tmp1 tmp2
netSyntenic tmp1 $s.danRer4.rbest.net
netSyntenic tmp2 danRer4.$s.rbest.net
rm tmp1 tmp2
nice gzip *.rbest.net
end
'EOF'
chmod +x rBestNet.csh
nice rBestNet.csh >& rBestNet.log &
# Took about 11 minutes to complete.
# Then make axtNet and mafNet
cat > makeMafRBestNet.csh << 'EOF'
#!/bin/csh -ef
foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18)
echo "Creating mafs for $s ..."
set dir=/cluster/data/danRer4/bed/blastz.$s/axtChain
set seqDir=/san/sanvol1/scratch
cd $dir
# extract reciprocal best chains from the zebrafish-other species rbest.net
echo "Get reciprocal best chains for zebrafish-$s"
netChainSubset danRer4.$s.rbest.net.gz danRer4.$s.all.chain.gz \
danRer4.$s.rbest.chain
# need to make sure this is sorted and assign unique chain IDs
chainSort danRer4.$s.rbest.chain stdout | chainMergeSort stdin \
> danRer4.$s.rbest.newids.chain
# need to re-net with new ids
chainNet danRer4.$s.rbest.newids.chain /cluster/data/danRer4/chrom.sizes \
/cluster/data/$s/chrom.sizes danRer4.$s.rbest.newids.net /dev/null
# split reciprocal best chains and net
chainSplit rBestChain danRer4.$s.rbest.newids.chain
netSplit danRer4.$s.rbest.newids.net rBestNet
mkdir ../axtRBestNet
# make axtNet for reciprocal best
echo "Making axtRBestNet for $s ..."
foreach f (rBestNet/*.net)
netToAxt $f rBestChain/$f:t:r.chain \
$seqDir/danRer4/danRer4.2bit $seqDir/$s/$s.2bit stdout \
| axtSort stdin stdout \
| gzip -c > ../axtRBestNet/$f:t:r.danRer4.$s.net.axt.gz
end
# make mafNet for reciprocal best
cd ..
mkdir mafRBestNet
echo "Making mafRBestNet for $s ..."
foreach f (axtRBestNet/*.danRer4.$s.net.axt.gz)
axtToMaf -tPrefix=danRer4. -qPrefix=$s. $f \
/cluster/data/danRer4/chrom.sizes /cluster/data/$s/chrom.sizes stdout \
| gzip -c > mafRBestNet/$f:t:r:r:r:r:r.maf.gz
end
end
'EOF'
chmod +x makeMafRBestNet.csh
nice makeMafRBestNet.csh >& mafRBestNet.log &
# Took about an hour.
# NOTE: Must use chainSort and chainMergeSort to reassign unique IDs
# to the chains extracted from the rbest.net and then re-net the chains
# with the new IDs, otherwise netToAxt crashes due to duplicate chain IDs.
# Now do the multiple alignment using reciprocal best mafNets as input
# for multiz.
# Load up nets and chains from rBestChain and rBestNet
ssh hgwdev
cd /cluster/data/danRer4/bed/rBestRunForMultiz
# Nets from Reciprocal Best have no type field or repeat/gap stats so need
# to add these.
cat > loadRBest.csh << 'EOF'
#!/bin/csh -ef
foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18)
set dir=/cluster/data/danRer4/bed/blastz.$s/axtChain
if ($s == "tetNig1") then
set g = TetNig1
else if ($s == "fr1") then
set g = Fr1
else if ($s == "xenTro2") then
set g = XenTro2
else if ($s == "monDom4") then
set g = MonDom4
else if ($s == "mm8") then
set g = Mm8
else if ($s == "hg18") then
set g = Hg18
endif
# load chains
echo "Loading chains for $s ..."
cd $dir/rBestChain
foreach f (*.chain)
set c = $f:r
hgLoadChain danRer4 ${c}_chainRBest${g} $f
end
# load nets
cd $dir
echo "Loading nets for $s ..."
# add type field
netSyntenic danRer4.${s}.rbest.newids.net noClassRBest.net
# add gap/repeat stats to net file using database tables
netClass -verbose=0 -noAr noClassRBest.net danRer4 $s \
danRer4.${s}.rbest.withClass.net
netFilter -minGap=10 danRer4.${s}.rbest.withClass.net \
| hgLoadNet -verbose=0 danRer4 netRBest${g} stdin
end
'EOF'
# << emacs
chmod +x loadRBest.csh
nohup nice loadRBest.csh >& loadRBest.log &
###########################################################################
# MULTIZ7WAY ALIGNMENTS FOR CONSERVATION TRACK - USING RECIPROCAL BEST NETS
# (DONE, 2006-05-18 - 2006-05-24, hartera)
# for tetNig1, fr1, xenTro2, monDom4, mm8 and hg18.
ssh kkstore04
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-18
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18
# copy MAFs to a cluster-friendly server
# use bluearc as the san is down
mkdir /cluster/bluearc/danRer4/mafRBestNet
foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18)
echo $s
rsync -av /cluster/data/danRer4/bed/blastz.$s/mafRBestNet/* \
/cluster/bluearc/danRer4/mafRBestNet/$s/
end
# prune the hg18 17way tree to just these 7 and update db names:
/cluster/bin/phast/tree_doctor \
--prune-all-but=mouse_mm8,human_hg18,monodelphis_monDom4,xenopus_xenTro1,tetraodon_tetNig1,fugu_fr1,zebrafish_danRer3 \
--rename="xenopus_xenTro1 -> xenopus_xenTro2 ; zebrafish_danRer3 -> zebrafish_danRer4" \
/cluster/data/hg18/bed/multiz17way/17way.nh > 7way.nh
# carefully edit so that danRer4 is first. copy first to new file
cp 7way.nh 7way_zfishFirst.nh
# DO THIS LATER AND CREATE FROM TREE WITHOUT DISTANCES
/cluster/bin/phast/draw_tree 7way_zfishFirst.nh > 7way.ps
# also made the ps file for the 7way.nh and compared to make sure
# that the tree with zebrafish at the top looks correct.
/cluster/bin/phast/all_dists 7way_zfishFirst.nh > 7way.distances
grep danRer4 7way.distances | sort -k3,3n | \
awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
cat distances.txt
# 1.4749 tetraodon_tetNig1
# 1.5154 fugu_fr1
# 1.7480 human_hg18
# 1.7782 monodelphis_monDom4
# 1.8771 xenopus_xenTro2
# 2.1058 mouse_mm8
# the order in the browser display will be by tree topology,
# not by distance, so they will be:
# danRer4
# 1.5154 fugu_fr1
# 1.4749 tetraodon_tetNig1
# 1.8771 xenopus_xenTro2
# 1.7782 monodelphis_monDom4
# 2.1058 mouse_mm8
# 1.7480 human_hg18
# create species list and stripped down tree for autoMZ
sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' \
7way_zfishFirst.nh > tree-commas.nh
sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst
cp tree-commas.nh 7way.nh
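# as a rough check, tree-commas.nh should come out looking something like
#   ((danRer4,(fr1,tetNig1)),(xenTro2,(monDom4,(mm8,hg18))))
# (matching the topology described above), tree.nh is the same with commas
# replaced by spaces, and species.lst should read:
#   danRer4 fr1 tetNig1 xenTro2 monDom4 mm8 hg18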
ssh pk
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18
mkdir maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = danRer4
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /cluster/bluearc/$db/mafRBestNet
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/danRer4/bed/multiz7way.2006-05-18/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << emacs
awk '{print $1}' /cluster/data/danRer4/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para create jobList
para try, check, push, check etc. ...
# Took less than 10 minutes to run
# Make .jpg for tree and install in htdocs/images/phylo/... don't forget
# to request a push of that file. The treeImage setting in trackDb.ra
# is phylo/danRer4_7way.jpg (relative to htdocs/images).
# ssh hgwdev
# DO LATER
# cd /cluster/data/danRer4/bed/multiz7way.2006-05-04
# pstopnm -stdout 7way.ps | pnmtojpeg > danRer4_7way.jpg
# ask Bob to resize image for Browser track description page.
# Build maf annotation and load database
ssh kolossus
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno
mkdir maf run
cd run
rm -f sizes nBeds
foreach db (`cat /cluster/data/danRer4/bed/multiz7way.2006-05-18/species.lst`)
ln -s /cluster/data/$db/chrom.sizes $db.len
if (! -e /cluster/data/$db/$db.N.bed) then
twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
endif
ln -s /cluster/data/$db/$db.N.bed $db.bed
echo $db.bed >> nBeds
echo $db.len >> sizes
end
echo date > jobs.csh
# do smaller jobs first:
foreach f (`ls -1rS ../../maf/*.maf`)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
/cluster/data/danRer4/danRer4.2bit ../maf/`basename $f` \
>> jobs.csh
echo "echo $f" >> jobs.csh
end
echo date >> jobs.csh
csh -efx jobs.csh >&! jobs.log &
tail -f jobs.log
# Load anno/maf
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf
mkdir -p /gbdb/danRer4/multiz7wayRBest/anno/maf
ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf/*.maf \
/gbdb/danRer4/multiz7wayRBest/anno/maf
# Reload as the first load was not working correctly.
hgsql -e 'drop table multiz7wayRBest;' danRer4
hgsql -e 'delete from extFile where path like "%multiz7wayRBest%";' \
danRer4
cat > loadMaf.csh << 'EOF'
date
nice hgLoadMaf -pathPrefix=/gbdb/danRer4/multiz7wayRBest/anno/maf danRer4 multiz7wayRBest
date
'EOF'
# << emacs
csh -efx loadMaf.csh >&! loadMaf.log & tail -f loadMaf.log
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kkr7u00
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf
cat *.maf \
| nice hgLoadMafSummary danRer4 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 -test multiz7wayRBestSummary stdin
# Created 526386 summary blocks from 1972659 components and 1105457 mafs
# from stdin
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf
sed -e 's/mafSummary/multiz7wayRBestSummary/' \
~/kent/src/hg/lib/mafSummary.sql \
> /tmp/multiz7wayRBestSummary.sql
time nice hgLoadSqlTab danRer4 multiz7wayRBestSummary \
/tmp/multiz7wayRBestSummary.sql multiz7wayRBestSummary.tab
# 0.000u 0.000s 0:07.56 0.0% 0+0k 0+0io 4pf+0w
rm *.tab /tmp/multiz7wayRBestSummary.sql
# ln -s multiz7way.2006-05-18 /cluster/data/danRer4/bed/multiz7way
# ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-18/danRer4_7way.jpg \
# /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg
# change permissions for display if not already readable to all
# chmod +r /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg
# check for all.joiner entry for 7-way - it is there already.
# add trackDb.ra entry for danRer4:
###########################################################################
# PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 7-WAY ALIGNMENT USING MAFS
# FROM RECIPROCAL BEST NET (DONE, 2006-05-19 - 2006-05-24, hartera)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
# create a starting-tree.mod based on chr14 (92 Mb)
# chr14 is the largest chrom apart from chrNA_random
/cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr14.maf \
--refseq ../../../14/chr14.fa --in-format MAF \
--windows 100000000,1000 --out-format SS \
--between-blocks 5000 --out-root s1
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "`cat ../tree-commas.nh`" \
--out-root starting-tree
# took less than a minute
rm s1.*ss
# Get genome-wide average GC content (for all species together,
# not just the reference genome). If you have a globally
# estimated tree model, as above, you can get this from the
# BACKGROUND line in the .mod file. E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.309665 0.189697 0.189720 0.310918
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.379 is the GC content. This is used in the -gc argument below.
# If you do *not* have a global tree model and you do not know your
# GC content, you can get it directly from the MAFs with a command
# like:
/cluster/bin/phast/$MACHTYPE/msa_view \
--aggregate danRer4,tetNig1,fr1,xenTro2,monDom4,mm8,hg18 -i MAF \
-S /cluster/data/danRer4/bed/multiz7way/maf/chr*.maf > maf_summary.txt
# This gives a GC content of 0.426 so use this as it is from mafs for
# the whole genome.
# break up the genome-wide MAFs into pieces on the san filesystem
ssh pk
# should use a directory on the san but it is down and para create is
# not working on kk.
set WINDOWS=/cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/ss
mkdir -p $WINDOWS
cd $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/danRer4/bed/multiz7way.2006-05-18/maf
set WINDOWS=/cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
set N = `echo $c | sed -e 's/chr//'`
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M /cluster/data/danRer4/$N/$c.fa \
-o SS -w 10000000,0 -I 1000 -B 5000 -r $c/$c
echo "Done" >> $c.done
'EOF'
# << emacs
chmod +x doSplit.csh
rm -f jobList
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
echo "doSplit.csh chr${c} {check out line+ $WINDOWS/chr$c.done}" >> jobList
end
para create jobList
para push, check etc.
para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 847s 14.12m 0.24h 0.01d 0.000 y
# IO & Wait Time: 9741s 162.35m 2.71h 0.11d 0.000 y
# Average job time: 378s 6.30m 0.11h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 539s 8.98m 0.15h 0.01d
# Submission to last job: 581s 9.68m 0.16h 0.01d
# Create a random list of 50 of the 10 Mb regions (do not use chrNA and chrUn)
ls -1l chr*/chr*.ss | grep -v NA | grep -v Un | \
awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
# Set up parasol directory to calculate trees on these 50 regions
ssh pk
set dir = /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons
mkdir -p $dir
cd $dir
# now set up cluster job to estimate model parameters. Parameters
# will be estimated separately for each alignment fragment then
# will be combined across fragments. When tuning target-coverage and
# expected-length, come back to this step and recalculate.
# Create little script that calls phastCons with right arguments
cat > makeTree.csh << 'EOF'
#!/bin/csh -fe
set C = $1:h
set treeRun = $2
set cov = $3
set len = $4
set dir = /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons
mkdir -p $dir/$treeRun/log/${C} $dir/$treeRun/tree/${C}
/cluster/bin/phast/x86_64/phastCons $dir/ss/$1 \
/cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons/starting-tree.mod \
--gc 0.426 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-length $len --target-coverage $cov \
--quiet --log $dir/$treeRun/log/$1 --estimate-trees $dir/$treeRun/tree/$1
'EOF'
# << emacs
chmod a+x makeTree.csh
# Make sure that the correct GC content is substituted in here. Notice
# the target coverage of 0.17. Here we are going to aim
# for 65% coverage of coding regions by conserved elements.
# Create gensub file
# need to add cov and len parameters
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1) $(path2)
#ENDLOOP
'_EOF_'
# happy emacs
# Make cluster job and run it
echo "treeRun1 0.17 12" > tree.lst
echo "treeRun2 0.32 18" >> tree.lst
echo "treeRun3 0.32 20" >> tree.lst
echo "treeRun4 0.35 18" >> tree.lst
gensub2 randomSs.list tree.lst template jobList
para create jobList
para try,check,push,check etc.
# para time
# Completed: 200 of 200 jobs
# CPU time in finished jobs: 45500s 758.33m 12.64h 0.53d 0.001 y
# IO & Wait Time: 31478s 524.64m 8.74h 0.36d 0.001 y
# Average job time: 385s 6.41m 0.11h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 622s 10.37m 0.17h 0.01d
# Submission to last job: 821s 13.68m 0.23h 0.01d
# try again, using mkdir test2. If we aim for about 5% coverage and, for
# chr1 on hg18, netDanRer4 covers about 31% of bases, then 0.05/0.31 is
# roughly 0.16, so try a target coverage of 0.156.
# want a length of about 20 bp to bias the model towards detecting
# shorter conserved regions such as TFBSs.
cd test2
echo "treeRun5 0.156 20" > tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test3
echo "treeRun6 0.156 15" > tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test4
# increase coverage and compensate a bit by lowering the expected length
echo "treeRun7 0.25 8" > tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test5
echo "treeRun8 0.35 12" > tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test6
echo "treeRun9 0.5 20" > tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test7
echo "treeRun10 0.5 24" > tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test8
echo "treeRun11 0.45 22" > tree.lst
echo "treeRun12 0.5 26" >> tree.lst
echo "treeRun13 0.5 28" >> tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test9
echo "treeRun14 0.45 24" > tree.lst
echo "treeRun15 0.45 20" >> tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test10
echo "treeRun16 0.40 24" > tree.lst
echo "treeRun17 0.40 20" >> tree.lst
echo "treeRun18 0.42 20" >> tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test11
echo "treeRun19 0.38 24" > tree.lst
echo "treeRun20 0.38 22" >> tree.lst
echo "treeRun21 0.38 20" >> tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
set dir = /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons
foreach d ($dir/treeRun*)
cd $d
ls tree/chr*/*.cons.mod > cons.txt
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \
--output-average ave.cons.mod > cons_summary.txt
ls tree/chr*/*.noncons.mod > noncons.txt
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \
--output-average ave.noncons.mod > noncons_summary.txt
end
# measuring entropy
# usage: consEntropy <target coverage> <expected length> \
#            ave.cons.mod ave.noncons.mod [--NH 9.78]
# (it never finishes when run with the --NH argument, so leave that off)
# target entropy should be L_min*H=9.8 bits (between 9.5 and 10.5 is ok)
# the expected length that produces this entropy is the one
# to use for phastCons.
# for each treeRun, set the appropriate coverage and length
# (tree.lst format: treeRunN cov len); the perl substitution below expands
# each line into the corresponding consEntropy commands
cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons
cp tree.lst entropy.csh
perl -pi.bak -e 's/^(treeRun[0-9]+)\s*([0-9\.]+)\s*([0-9]+)/echo \"Coverage = $2 Length = $3\"\ncd $1\n\/cluster\/bin\/phast\/x86_64\/consEntropy $2 $3 ave.cons.mod ave.noncons.mod\ncd \.\./' entropy.csh
chmod +x entropy.csh
entropy.csh >& entropy.out
# entropy.out
#Coverage = 0.17 Length = 12
#Transition parameters:gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
#Relative entropy: H=0.782279 bits/site
#Expected min. length: L_min=13.655129 sites
#Expected max. length: L_max=8.801144 sites
#Phylogenetic information threshold: PIT=L_min*H=10.682123 bits
#Coverage = 0.32 Length = 18
#Transition parameters:gamma=0.320000, omega=18.000000, mu=0.055556, nu=0.026144
#Relative entropy: H=0.757117 bits/site
#Expected min. length: L_min=13.055080 sites
#Expected max. length: L_max=9.912578 sites
#Phylogenetic information threshold: PIT=L_min*H=9.884225 bits
#Coverage = 0.32 Length = 20
#Transition parameters:gamma=0.320000, omega=20.000000, mu=0.050000, nu=0.023529
#Relative entropy: H=0.736191 bits/site
#Expected min. length: L_min=13.815340 sites
#Expected max. length: L_max=10.615242 sites
#Phylogenetic information threshold: PIT=L_min*H=10.170732 bits
#Coverage = 0.35 Length = 18
#Transition parameters:gamma=0.350000, omega=18.000000, mu=0.055556, nu=0.029915
#Relative entropy: H=0.768872 bits/site
#Expected min. length: L_min=12.471015 sites
#Expected max. length: L_max=9.642561 sites
#Phylogenetic information threshold: PIT=L_min*H=9.588610 bits
#Coverage = 0.156 Length = 20
#Transition parameters:gamma=0.156000, omega=20.000000, mu=0.050000, nu=0.009242
#Relative entropy: H=0.676147 bits/site
#Expected min. length: L_min=17.857722 sites
#Expected max. length: L_max=12.694666 sites
#Phylogenetic information threshold: PIT=L_min*H=12.074436 bits
#Coverage = 0.156 Length = 15
#Transition parameters:gamma=0.156000, omega=15.000000, mu=0.066667, nu=0.012322
#Relative entropy: H=0.726430 bits/site
#Expected min. length: L_min=15.713919 sites
#Coverage = 0.25 Length = 8
#Transition parameters: gamma=0.250000, omega=8.000000, mu=0.125000, nu=0.041667
#Relative entropy: H=0.950194 bits/site
#Expected min. length: L_min=8.951612 sites
#Expected max. length: L_max=5.560228 sites
#Phylogenetic information threshold: PIT=L_min*H=8.505767 bits
#Coverage = 0.5 Length = 20
#Transition parameters:gamma=0.500000, omega=20.000000, mu=0.050000, nu=0.050000
#Relative entropy: H=0.817081 bits/site
#Expected min. length: L_min=10.397809 sites
#Expected max. length: L_max=9.006386 sites
#Phylogenetic information threshold: PIT=L_min*H=8.495855 bits
# Coverage = 0.5 Length = 24
#Transition parameters:gamma=0.500000, omega=24.000000, mu=0.041667, nu=0.041667
#Relative entropy: H=0.772807 bits/site
#Expected min. length: L_min=11.706841 sites
#Expected max. length: L_max=10.170845 sites
#Phylogenetic information threshold: PIT=L_min*H=9.047124 bits
# Coverage = 0.5 Length = 26
#Transition parameters:gamma=0.500000,omega=26.000000, mu=0.038462, nu=0.038462
#Relative entropy: H=0.755159 bits/site
#Expected min. length: L_min=12.299010 sites
#Expected max. length: L_max=10.697444 sites
#Phylogenetic information threshold: PIT=L_min*H=9.287712 bits
#Coverage = 0.5 Length = 28
#Transition parameters:gamma=0.500000,omega=28.000000, mu=0.035714, nu=0.035714
#Relative entropy: H=0.739661 bits/site
#Expected min. length: L_min=12.856932 sites
#Expected max. length: L_max=11.193931 sites
#Phylogenetic information threshold: PIT=L_min*H=9.509775 bits
######## USED THESE PARAMETERS (coverage = 0.45, length = 24) ########
#Coverage = 0.45 Length = 24
#Transition parameters:gamma=0.450000, omega=24.000000, mu=0.041667, nu=0.034091
#Relative entropy: H=0.749572 bits/site
#Expected min. length: L_min=12.663020 sites
#Expected max. length: L_max=10.634682 sites
#Phylogenetic information threshold: PIT=L_min*H=9.491841 bits
#Coverage = 0.40 Length = 24
#Transition parameters:gamma=0.400000, omega=24.000000, mu=0.041667, nu=0.027778
#Relative entropy: H=0.730161 bits/site
#Expected min. length: L_min=13.607002 sites
#Expected max. length: L_max=11.092981 sites
#Phylogenetic information threshold: PIT=L_min*H=9.935307 bits
#Coverage = 0.38 Length = 20
#Transition parameters:gamma=0.380000, omega=20.000000, mu=0.050000, nu=0.030645
#Relative entropy: H=0.758676 bits/site
#Expected min. length: L_min=12.652818 sites
#Expected max. length: L_max=10.063048 sites
#Phylogenetic information threshold: PIT=L_min*H=9.599385 bits
#Coverage = 0.38 Length = 24
#Transition parameters:gamma=0.380000, omega=24.000000, mu=0.041667, nu=0.025538
#Relative entropy: H=0.723105 bits/site
#Expected min. length: L_min=13.987286 sites
#Expected max. length: L_max=11.279443 sites
#Phylogenetic information threshold: PIT=L_min*H=10.114270 bits
# Create cluster dir to do main phastCons run
ssh pk
mkdir -p \
/cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun
cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun
cp -p ../treeRun1/ave.*.mod .
cp -p ../treeRun1/ave.*.mod \
/cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
mkdir ppRaw bed
# Create script to run phastCons with right parameters
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
# Use the expected length and target coverage determined above and
# the corresponding average conserved and nonconserved models
cat > doPhast.csh << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/x86_64/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
--expected-length 18 --target-coverage 0.32 --quiet \
--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.*.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
# emacs happy
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast.csh $(root1) $(file1)
#ENDLOOP
'_EOF_'
# happy emacs
# Create parasol batch and run it
ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
gensub2 in.list single template jobList
para create jobList
para try/check/push/etc.
para time
# Completed: 191 of 191 jobs
# CPU time in finished jobs: 4421s 73.69m 1.23h 0.05d 0.000 y
# IO & Wait Time: 121036s 2017.26m 33.62h 1.40d 0.004 y
# Average job time: 657s 10.95m 0.18h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 726s 12.10m 0.20h 0.01d
# Submission to last job: 874s 14.57m 0.24h 0.01d
# combine predictions and transform scores to be in 0-1000 interval
ssh kkstore04
cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
cp -p mostConserved.bed \
/cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
# Figure out how much is actually covered by the mostConserved data like so:
cd /cluster/data/danRer4
faSize */chr*.fa
# 1774660131 bases (175779328 N's 1598880803 real 816338509 upper
# 782542294 lower) in 28 sequences in 28 files
# Total size: mean 63380719.0 sd 33877121.9 min 16596 (chrM)
# max 208014280 (chrNA_random) median 59765243
# The non-N size is 1598880803 bases
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
awk '{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/1598880803\n",100.0*sum/1598880803,sum}' \
mostConserved.bed
-target-coverage 0.17: % 1.51 = 100.0*24186350/1598880803 length=12
-target-coverage 0.156: % 1.44 = 100.0*22973222/1598880803 length=20
-target-coverage 0.156: % 1.32 = 100.0*21177329/1598880803 length=15
-target-coverage 0.25: % 1.32 = 100.0*21104503/1598880803 length=8
-target-coverage 0.32: % 1.88 = 100.0*30014509/1598880803 length=20
-target-coverage 0.5: % 3.00 = 100.0*47931076/1598880803 length=20
-target-coverage 0.5: % 2.95 = 100.0*47170018/1598880803 length=24
-target-coverage 0.5: % 2.24 = 100.0*35801661/1598880803 length=28
-target-coverage 0.45: % 2.50 = 100.0*39965003/1598880803 length=24
-target-coverage 0.40: % 2.22 = 100.0*35436744/1598880803 length=24
-target-coverage 0.38: % 2.12 = 100.0*33911465/1598880803 length=20
-target-coverage 0.38: % 2.13 = 100.0*33986115/1598880803 length=24
# want to aim for 65% coverage of coding regions
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
# get an or of refGene and mgcGenes CDS regions
featureBits danRer4 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
# 11753378 bases of 1626093931 (0.723%) in intersection
# featureBits danRer3 refGene:cds mgcGenes:cds -or \
# -bed=refSeqOrMgcCdsDanRer3.bed
# 11633092 bases of 1630323462 (0.714%) in intersection
featureBits danRer4 refSeqOrMgcCds.bed mostConserved.bed -enrichment
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.487%, both 0.332%,
# cover 45.97%, enrich 30.90x
# for length = 12 and cov = 0.17 PIT=10.7
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.846%, both 0.388%,
# cover 53.74%, enrich 29.12x
# for length = 20 and cov = 0.156 PIT=12.1
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.413%, both 0.333%,
# cover 46.04%, enrich 32.59x
# for length = 15 and cov = 0.156 PIT=11.4
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.302%, both 0.313%,
# cover 43.36%, enrich 33.30x
# decrease length and increase coverage to compensate
# for length = 8 and cov = 0.25 PIT=8.5, PIT is too low
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.298%, both 0.304%,
# cover 42.06%, enrich 32.40x
# try length = 20 and cov = 0.32 PIT=10.8
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.846%, both 0.388%,
# cover 53.74%, enrich 29.12x
# length = 20 and cov = 0.5 PIT=8.5
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 2.948%, both 0.459%,
# cover 63.53%, enrich 21.55x
# coverage good, need to increase the PIT value so increase the length.
# length = 24 and cov = 0.5 PIT=9.05
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 2.901%, both 0.458%,
# cover 63.35%, enrich 21.84x
# length = 28 and cov = 0.5 PIT=9.5
# refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.202%, both 0.431%,
# cover 59.57%, enrich 27.06x
# length = 24 and cov = 0.45 PIT=9.5
featureBits danRer4 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
# 11770580 bases of 1626093931 (0.724%) in intersection
featureBits danRer4 refSeqOrMgcCds.bed mostConserved.bed -enrichment
# refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.458%, both 0.438%,
# cover 60.57% enrich 24.64x
# length = 20 and cov = 0.38 PIT=9.6
# refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.085%, both 0.411%,
# cover 56.76%, enrich 27.22x
# length = 24 and cov = 0.38 PIT=10.1
# refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.090%, both 0.413%,
# cover 57.07%, enrich 27.30x
# use consRun14: length = 24, cov = 0.45, which gives an L_min*H entropy
# (PIT) value of 9.49 (aiming for around 9.8) and 60.6% coverage of coding
# regions with most conserved elements (aiming for about 65%)
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
hgLoadBed danRer4 phastConsRBestElements mostConserved.bed
# Loaded elements of size 5
featureBits danRer4 mgcGenes:cds phastConsRBestElements -enrichment
# mgcGenes:cds 0.560%, phastConsRBestElements 2.458%, both 0.349%,
# cover 62.23%, enrich 25.32x
# Create merged posterior probability file and wiggle track data files
# the sed business gets the names sorted by chromName, chromStart
# so that everything goes in numerical order into wigEncode
ssh kkstore04
cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun14
find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| wigEncode stdin phastConsRBest7way.wig phastConsRBest7way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# takes a few minutes
ls -l phastCons*
#-rw-rw-r-- 1 hartera protein 133817339 May 24 22:48 phastConsRBest7way.wib
#-rw-rw-r-- 1 hartera protein 36947021 May 24 22:48 phastConsRBest7way.wig
cp -p phastConsRBest7way.wi? \
/cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
mkdir -p /gbdb/danRer4/wib
ln -s `pwd`/phastConsRBest7way.wib /gbdb/danRer4/wib/phastConsRBest7way.wib
# use this if need to reload table
hgsql -e 'drop table phastConsRBest7way;' danRer4
# load table
hgLoadWiggle danRer4 phastConsRBest7way phastConsRBest7way.wig
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
bash
time hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=danRer4 phastCons7way > histogram.data 2>&1
# real 2m33.069s
# user 1m58.310s
# sys 0m16.170s
# create plot of histogram:
cat << '_EOF_' > histo.gp
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer4 Histogram phastCons7 track"
set xlabel " phastCons7 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# happy emacs
gnuplot histo.gp > histo.png
display histo.png &
# add line: wiggle phastCons7way to trackDb.ra for multiz7way to display the
# wiggle for the conservation track.
# check all.joiner for entries for phastCons7way and phastConsElements7way -ok
# copy over html for multiz and edit.
###########################################################################
# BACENDS TRACK (DONE, 2006-08-25, hartera)
# Obtain these from the NCBI Trace archive
ssh kolossus
mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/sequences
cd /cluster/data/danRer4/bed/bacEnds/
ln -s /san/sanvol1/scratch/danRer4/bacEnds/sequences .
cd sequences
# go to NCBI Trace Archive
# http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?
cat << '_EOF_' > query_tracedb
#!/usr/bin/perl -w
use strict;
use LWP::UserAgent;
use HTTP::Request::Common 'POST';
$ENV{'LANG'}='C';
$ENV{'LC_ALL'}='C';
my $query = join ' ', @ARGV;
$query = 'help' if $query =~ /^(\-h|\-\-help|\-)$/;
$query = join('', <STDIN>) if ! $query;
my $req = POST 'http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=raw', [query=>$query];
my $res = LWP::UserAgent->new->request($req, sub { print $_[0] });
die "Couldn't connect to TRACE server\n" if ! $res->is_success;
'_EOF_'
chmod +x query_tracedb
# run './query_tracedb usage' to see the help screen with usage examples
# count number of entries for zebrafish
query_tracedb "query count species_code='DANIO RERIO' AND trace_type_code = 'CLONEEND'"
# 473060
# 428904 (08-16-06)
# Therefore this is 11 files of 40000 results each.
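# (428904 / 40000 = 10.7, so 11 pages numbered 0 through 10)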
# so fetch them in pages of 40000 via query_tracedb:
cat << '_EOF_' > getZfishSeqs.csh
#!/bin/csh -fe
foreach n (0 1 2 3 4 5 6 7 8 9 10)
echo "Fetching page $n ..."
(echo -n "retrieve_tgz all 0b"; query_tracedb "query page_size 40000 page_number $n binary species_code='DANIO RERIO' AND trace_type_code = 'CLONEEND'") | query_tracedb > data${n}.tgz
end
'_EOF_'
chmod +x getZfishSeqs.csh
mkdir -p downloads
cp query_tracedb getZfishSeqs.csh ./downloads
cd downloads
nohup nice getZfishSeqs.csh >& zfishSeqs.log &
# Took 5 hours 14 minutes.
## Start: Wed May 10 09:57 Finished: 14:51
# Start: May 2 21:43 Finish: May 3 03:08
ssh kkstore04
# unzip and untar the downloads
cd /cluster/data/danRer4/bed/bacEnds/sequences/downloads
gunzip *.tgz
cat << '_EOF_' > unTarBacs.csh
#!/bin/csh -fe
foreach t (0 1 2 3 4 5 6 7 8 9 10 11)
tar xvf data${t}.tar
end
'_EOF_'
chmod +x unTarBacs.csh
nohup unTarBacs.csh >& unTarBacs.log &
foreach d (2006*)
echo "Processing $d"
nice cat ${d}/TRACEINFO.xml >> allTraceInfo.xml
end
nice catBacs.csh >& catBacs.log &
# The last archive obtained is empty so try downloading from the ftp site
# to be sure to get everything.
# get BAC end sequences from NCBI Trace archive ftp site:
ssh kkstore04
mkdir /cluster/data/danRer4/bed/bacEnds/sequences2
mkdir /cluster/bluearc/danRer4/bacEndsDownloads
cd /cluster/data/danRer4/bed/bacEnds/sequences2
ln -s /cluster/bluearc/danRer4/bacEndsDownloads
cd /cluster/data/danRer4/bed/bacEnds/sequences2/bacEndsDownloads
# get index page and ftp for the trace server
wget --timestamping \
ftp://ftp.ncbi.nih.gov/pub/TraceDB/danio_rerio/
# grab just the ftp link for each file.
grep "anc" index.html > ancillary.lst
perl -pi.bak -e 's/.+<a href=\"(ftp.+)\">[a-zA-Z]+.+/$1/' ancillary.lst
rm *.bak
# this contains just the ftp link for each file to get the ancillary
# information files.
cat << '_EOF_' > getFtpFiles.csh
#!/bin/csh -fe
set s=$1
foreach f (`cat "${s}"`)
echo $f
nice wget --timestamping $f
end
'_EOF_'
chmod +x getFtpFiles.csh
nohup nice getFtpFiles.csh ancillary.lst >& ancillary.log &
# Took about 25 minutes.
grep "fasta" index.html > otherFiles.lst
grep "mate_pairs" index.html >> otherFiles.lst
grep "xml" index.html >> otherFiles.lst
perl -pi.bak -e 's/.+<a href=\"(ftp.+)\">[a-zA-Z]+.+/$1/' otherFiles.lst
rm *.bak
mkdir otherFiles
cd otherFiles
cp ../otherFiles.lst .
# then get these files by ftp
nice ../getFtpFiles.csh otherFiles.lst >& otherFiles.log &
# Took about 6 hours and 50 minutes.
# There are 181 files as expected.
foreach f (*.gz)
nice gunzip $f
end
cd ..
cat ./otherFiles/fasta* > danRerBacEnds.fa
# Took about 20 minutes
grep '>' danRerBacEnds.fa | wc -l
# 14566448
cat ./otherFiles/xml* > danRer.xml
# Took 4 hours and 40 minutes.
# find out which have CLONEEND information in them
cat << '_EOF_' > findCloneEnds.csh
#!/bin/csh -fe
foreach f (otherFiles/xml.*)
echo $f >> cloneEndsXml.txt
grep CLONEEND $f >> cloneEndsXml.txt
end
'_EOF_'
chmod +x findCloneEnds.csh
nice findCloneEnds.csh &
# Took 1.5 hours
# CLONEEND is only in xml.danio_rerio.024 and xml.danio_rerio.033
cd /cluster/data/danRer4/bed/bacEnds/sequences2/bacEndsDownloads
cat otherFiles/xml.danio_rerio.024 otherFiles/xml.danio_rerio.033 \
> cloneEnds.xml
# cleanup xml files
rm otherFiles/xml.*
# get list of libraries:
grep "LIBRARY_ID" cloneEnds.xml | sort | uniq > libraries.xml.txt
grep "TRACE_NAME" cloneEnds.xml | wc -l
# 985980
grep "TRACE_NAME" cloneEnds.xml | sort | uniq -c > traceName.xml.count
# Hard to tell which are the BAC clone end sequences. These ftp files
# contain a mixture of sequences from different sources.
# Try downloading the sequences from Sanger instead, since not all of the
# sequences may have been submitted to NCBI yet.
ssh kkstore04
cd /cluster/data/danRer4/bed/bacEnds
mkdir -p /san/sanvol1/danRer4/bacEnds/ensemblSeqs
ln -s /san/sanvol1/danRer4/bacEnds/ensemblSeqs
cd ensemblSeqs
wget --timestamping \
ftp://ftp.ensembl.org/pub/traces/danio_rerio/fasta/
# gets index.html page
# get list of cloneEnd FASTA files
grep cloneEnd index.html > cloneEndsFile
perl -pi.bak -e 's/.+<a href=\"(ftp.+)\">[a-zA-Z]+.+/$1/' cloneEndsFile
rm *.bak
foreach f (`cat cloneEndsFile`)
echo $f
wget --timestamping $f
end
# then do the same to get the trace info xml files:
wget --timestamping \
ftp://ftp.ensembl.org/pub/traces/danio_rerio/traceinfo/
grep cloneEnd index.html > cloneEndsXmlFile
perl -pi.bak -e 's/.+<a href=\"(ftp.+)\">[a-zA-Z]+.+/$1/' cloneEndsXmlFile
rm *.bak
foreach f (`cat cloneEndsXmlFile`)
echo $f
wget --timestamping $f
end
gunzip *.gz
# check for multiple occurrences of same sequence ID
grep trace_name *.xml | sort | uniq -c | sort -nr > traceNames.count
# top of list has count of 1 so the end names are unique.
grep clone_id *.xml | sort | uniq -c | sort -nr > cloneIds.count
# top of list has count of 4. All those clone IDs that appear 3 or 4 times
# do so in the CHORI-1073 library - this is the fosmid library.
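# that can be double-checked from the filename prefixes that grep added to
# cloneIds.count (a sketch; the Sanger xml file names include the library
# name):
awk '$1 > 2 {print $2}' cloneIds.count | cut -d: -f1 | sort | uniq -c
# all counts of 3 or 4 should fall in sanger-zfish-CHORI-1073-cloneEnd*.xml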
# move CHORI-1073 out of the way
mkdir fosmids
mv sanger-zfish-CHORI-1073-cloneEnd* ./fosmids
# FASTA files have clone end names as sequence names
# concatenate the 18 fasta files
cat *.fasta > Zv6BacEnds.fa
grep '>' Zv6BacEnds.fa | wc -l
# 694170
# Zv5 had 729101 but these were not unique reads for each sequence.
faSize Zv6BacEnds.fa >& Zv6.faSize.txt
# there are 31 sequence names with no sequence.
awk '{print $10}' Zv6.faSize.txt > cloneEnds.noSeq
# remove the extra non-name lines at the end of that file, then get the
# list of FASTA files that these empty records are in:
grep -f cloneEnds.noSeq *.fasta > cloneEnds.noSeq.files
# sent this list of sequence names and files to Kerstin Howe
# at Sanger: kj2@sanger.ac.uk . Sanger said that these are just missing
# sequences due to poor quality.
# these header-only records make the FASTA file format invalid,
# so remove them from the FASTA file:
grep -v -f cloneEnds.noSeq Zv6BacEnds.fa > tmp.fa
grep '>' tmp.fa | wc -l
# 694139
mv tmp.fa Zv6BacEnds.fa
faSize Zv6BacEnds.fa
# 728424771 bases (11822219 N's 716602552 real 716602552 upper 0 lower) in
# 694139 sequences in 1 files
# Total size: mean 1049.4 sd 277.3 min 4 (zKp108D7.za) max 5403 (zC259G13.zb)
# median 982
# N count: mean 17.0 sd 42.1
# U count: mean 1032.4 sd 265.3
# L count: mean 0.0 sd 0.0
# Blat these BAC ends vs the danRer4 genome assembly. Gaps between
# scaffolds in the NA_random and Un_random chroms are 50,000 bases, so
# alignments of BAC ends across adjacent scaffolds are unlikely, but the
# scaffolds are aligned separately just in case:
ssh pk
mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/sequences
cd /cluster/data/danRer4/bed/bacEnds/ensemblSeqs
cp Zv6BacEnds.fa /san/sanvol1/scratch/danRer4/bacEnds/sequences
mkdir -p /cluster/data/danRer4/bed/bacEnds/chromsRun
cd /cluster/data/danRer4/bed/bacEnds/chromsRun
ls -1S /san/sanvol1/scratch/danRer4/bacEnds/sequences/Zv6BacEnds.fa \
> bacends.lst
ls -1S /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > seqs.lst
# create out dir
mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/chromsPsl
# use Blat parameters as for mm5 and hg17
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc {check out line+ /san/sanvol1/scratch/danRer4/bacEnds/chromsPsl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 seqs.lst bacends.lst template jobList
para create jobList
para try, check, push, check, ...
# para time
# Completed: 271 of 271 jobs
# CPU time in finished jobs: 1063126s 17718.77m 295.31h 12.30d 0.034 y
# IO & Wait Time: 2531s 42.18m 0.70h 0.03d 0.000 y
# Average job time: 3932s 65.54m 1.09h 0.05d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 9404s 156.73m 2.61h 0.11d
# Submission to last job: 9891s 164.85m 2.75h 0.11d
# Repeat for random chroms, but use separate scaffolds:
mkdir -p /cluster/data/danRer4/bed/bacEnds/randomsRun
cd /cluster/data/danRer4/bed/bacEnds/randomsRun
ls -1S /san/sanvol1/scratch/danRer4/bacEnds/sequences/Zv6BacEnds.fa \
> bacends.lst
foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/Zv6*.fa)
ls -1S $f >> seqs.lst
end
# create out dir
mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/randomsPsl
# use Blat parameters as for mm5 and hg17
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc {check out line+ /san/sanvol1/scratch/danRer4/bacEnds/randomsPsl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 seqs.lst bacends.lst template jobList
para create jobList
para try, check, push, check, ...
# para time
# Completed: 2966 of 2966 jobs
# CPU time in finished jobs: 240259s 4004.31m 66.74h 2.78d 0.008 y
# IO & Wait Time: 84042s 1400.71m 23.35h 0.97d 0.003 y
# Average job time: 109s 1.82m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 997s 16.62m 0.28h 0.01d
# Submission to last job: 11925s 198.75m 3.31h 0.14d
# lift chrom alignments and randoms alignments and then merge and filter.
ssh kolossus
cd /cluster/data/danRer4/bed/bacEnds/
nice pslSort dirs rawChroms.psl tmp \
/san/sanvol1/scratch/danRer4/bacEnds/chromsPsl >& chromSort.log
# Took 2 hours
# very large output so do the randoms on the san
cd /san/sanvol1/scratch/danRer4/bacEnds/
nice pslSort dirs rawRandoms.psl tmp randomsPsl >& randomsSort.log
# Took 12 minutes
# move the rawChroms.psl over to the san
mv /cluster/data/danRer4/bed/bacEnds/rawChroms.psl \
/san/sanvol1/scratch/danRer4/bacEnds/
cd /san/sanvol1/scratch/danRer4/bacEnds/
# use the same pslReps parameters as for danRer3, hg18 etc.:
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
rawChroms.psl bacEndsChroms.psl /dev/null
# Took about 1 hour.
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
rawRandoms.psl bacEndsRandoms.psl /dev/null
# Took 2 minutes.
# merge files. There is a single liftOver file that works for both the
# pseudocontigs and the scaffolds.
# remove header for bacEndsRandoms.psl
tail +6 bacEndsRandoms.psl > tmp.psl
cat bacEndsChroms.psl tmp.psl > bacEndsNoLift.psl
# liftUp file to chrom coordinates.
liftUp bacEnds.psl \
/cluster/data/danRer4/jkStuff/liftAll.lft warn bacEndsNoLift.psl
# Took 2 minutes
# REPROCESS BACENDS - see section at end (2006-10-06 - 2006-10-11, hartera)
# Now put together the pairs information:
ssh kkstore04
cd /cluster/data/danRer4/bed/bacEnds
mv /san/sanvol1/danRer4/bacEnds/bacEnds.psl .
# cat together the xml files of BAC clone end information
cat ensemblSeqs/*.xml > danRerBacEnds.xml
# get mate-pair information from xml, forward is SP6, reverse is T7
# edit getBacInfo.pl used for canFam1 and adapt for use with zebrafish
# BAC ends. Not all entries in the xml file have clone_id or trace_end
# but sometimes they have trace_direction instead of trace_end.
# correct directions:
cat << '_EOF_' > getZfishBacInfo.pl
#!/usr/bin/perl -w
use strict;
my ($file, $outFile, $name, $clone, $library, $dir);
$file = $ARGV[0];
$outFile = $ARGV[1];
open (FILE, $file) || die "Can not open $file : $!\n";
open (OUT, ">$outFile") || die "Can not create $outFile : $!\n";
open (STDERR, ">error.log") || die "Can not create error.log : $!\n";
my %cloneHash = qw {
zC CH211-
zK DKEY-
zKp DKEYP-
bZ RP71-
dZ BUSM1-
CHORI73_ CH73-
};
$name = "";
$clone = "";
$dir = "";
while (<FILE>)
{
chomp;
my $l = $_;
if ($l =~ /<trace_name>([A-Za-z0-9\_\.]+)/)
{
$name = $1;
}
elsif ($l =~ /<clone_id>([A-Z0-9]+\-[0-9A-Z]+)/)
{
$clone = $1;
}
elsif ($l =~ /<library_id>([A-Z0-9a-z\s]+\-?[0-9A-Z]*)<\/library_id>/)
{
$library = $1;
if ($library eq "Daniokey Pilot")
{
$library = "DKEYP";
}
}
elsif ($l =~ /<trace_end>(F|R)/)
{
$dir = $1;
}
elsif ($l =~ /<trace_direction>(F|R)/)
{
$dir = $1;
}
# find end of record and print out end information
if ($l =~ /^\s+<\/trace>/)
{
printInfo($name, $clone, $library, $dir);
$name = $clone = $dir = $library = "";
}
}
close FILE;
close OUT;
close STDERR;
sub printInfo {
my ($name, $clone, $lib, $d) = @_;
# if no clone name read from file then create from trace name
if ($clone eq "")
{
foreach my $c (keys(%cloneHash))
{
if ($name =~ /$c/)
{
if (exists($cloneHash{$c}))
{
my $prefix = $cloneHash{$c};
$clone = $name;
# change to clone name
$clone =~ s/$c/$prefix/;
# remove suffix
$clone =~ s/\.[a-z]+|SP6|T7//;
}
}
}
}
# convert forward or reverse direction to T7 or SP6
if ($d ne "")
{
if ($d eq "F")
{
$d = "T7";
}
elsif ($d eq "R")
{
$d = "SP6";
}
}
else
{
print STDERR "No direction for $name found\n";
}
# print clone end information
print OUT "$clone\t$name\t0\t$lib\t0\t$d\n";
}
'_EOF_'
# << for emacs
chmod +x getZfishBacInfo.pl
perl getZfishBacInfo.pl danRerBacEnds.xml bacEndInfo.txt
# check all the names are there
grep '>' ./ensemblSeqs/Zv6BacEnds.fa > names
perl -pi.bak -e 's/>//' names
sort names | uniq > names.sort
awk '{print $2}' bacEndInfo.txt | sort | uniq > bacEndInfo.names.sort
comm -13 bacEndInfo.names.sort names.sort
# no difference so all clone ends in the FASTA file are also
# in the xml file.
rm *.bak *.sort names
# create mate-pair information
cp /cluster/bin/scripts/convertBacEndPairInfo convertZfishBacEndInfo
# comment out line 43 as this removes the suffix after a . from the
# trace names. In this case, we need to keep those.
# line 43: ($acc, $ver) = split(/\./,$acc);
# NOTE: the wrong script was used here - the old, unedited one. See the
# REPROCESS section below where this is redone.
./convertZfishBacEndInfo bacEndInfo.txt
# creates pairs and singles files
# 312901 pairs and 35479 singles
# looks like pairs were made for both DKEY-32B21A and DKEY-32B21
# need to find singles that could be used in pairs.
awk '{print $2}' bacEndSingles.txt > singles.names
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)W/$1/' singles.names
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)A/$1/' singles.names
sort singles.names | uniq -c | sort -nr > singles.names.count
# 209 have 2 ends for the BAC clone.
# some are duplicates of the same end e.g. .ya and .yb but these
# have the same BAC clone name.
head -209 singles.names.count | awk '{print $2}' > singles.withPairs.names
awk '{print $2}' bacEndPairs.txt > pairs.names
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)W/$1/' pairs.names
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)A/$1/' pairs.names
mkdir -p /cluster/data/danRer4/bed/bacEnds/pairs
cd /cluster/data/danRer4/bed/bacEnds/pairs
set dir = /cluster/data/danRer4/bed/bacEnds
# use parameters from REDO of danRer3 BAC ends
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose $dir/bacEnds.psl $dir/bacEndPairs.txt all_bacends bacEnds
wc -l *
# 1714 bacEnds.long
# 14889 bacEnds.mismatch
# 109213 bacEnds.orphan
# 105294 bacEnds.pairs
# 347 bacEnds.short
# 782 bacEnds.slop
# create header required by "rdb" tools
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' \
> ../header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
# edit the header file to make sure each \t becomes a real tab character
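# an alternative that avoids hand-editing the file (a sketch; printf
# interprets the \t escapes itself):
printf 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes\n' > ../header
printf '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10\n' >> ../header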
cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairs.bed
# create bad BAC ends set
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
bacEnds.orphan | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBad.bed
# Also create a bad BAC ends set with no orphans since orphans are
# already added to the singles track and do not want to add these orphans
# twice when extracting PSL. Use this bacEndPairsBadNoOrphans.bed
# file when extracting PSLs for adding to the all_bacends table.
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
| row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBadNoOrphans.bed
# To create singles set:
# also need to process bacEndSingles.txt into a database table
# for singles in bacEndSingles.txt, create a dummy file where they
# are given zJA11B12T7 as dummy sequence pair. If the single is a forward
# sequence, put the dummy sequence in the second column, if the single is
# a reverse sequence put in first column. use a perl script to do this.
cd /cluster/data/danRer4/bed/bacEnds
set bacDir = /cluster/data/danRer4/bed/bacEnds
mkdir singles
cd singles
cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl .
perl formatSingles.pl $bacDir/bacEndSingles.txt > \
$bacDir/bacEndSingles.format
# then run pslPairs on this formatted file
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \
all_bacends bacEnds
wc -l bacEnds.*
# 0 bacEnds.long
# 0 bacEnds.mismatch
# 22036 bacEnds.orphan
# 0 bacEnds.pairs
# 0 bacEnds.short
# 0 bacEnds.slop
cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles
wc -l bacEnds.singles
# 131249 bacEnds.singles
# Of these, 109213 are from pair analysis and 22036 from singles.
# For danRer3: there are 11439 orphans from singles and 242235 from
# pair analysis so a total of 253674 orphans so this has improved.
# Although for danRer3, some of these could be replicate reads for the
# same BAC clone end.
# make singles bed file
cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndSingles.bed
# check if there are any overlapping alignments that can be removed.
cd /cluster/data/danRer4/bed/bacEnds
mkdir -p duplicates/overlapRun
cd duplicates/overlapRun
sort -k1,2 /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairs.bed \
> bacEndPairs.lfs
wc -l *.lfs
# 104546 bacEndPairs.lfs
nice /cluster/bin/x86_64/lfsOverlap bacEndPairs.lfs bacEndPairs.bed \
-name -minOverlap=0.999 -notBlocks
# Loaded 104546 elements of size 11
# only 5 lines removed
sort -k1,2 /cluster/data/danRer4/bed/bacEnds/singles/bacEndSingles.bed \
> bacEndSingles.lfs
nice /cluster/bin/x86_64/lfsOverlap bacEndSingles.lfs bacEndSingles.bed \
-name -minOverlap=0.999 -notBlocks
# Loaded 125695 elements of size 11
# No lines removed.
sort -k1,2 \
/cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairsBadNoOrphans.bed \
> bacEndPairsBadNoOrphans.lfs
wc -l *.lfs
# 17611 bacEndPairsBadNoOrphans.lfs
nice /cluster/bin/x86_64/lfsOverlap bacEndPairsBadNoOrphans.lfs \
bacEndPairsBadNoOrphans.bed -name -minOverlap=0.999 -notBlocks
# Loaded 17611 elements of size 11
# Saving 17608 records to bacEndPairsBadNoOrphans.bed
# Only 3 alignments were removed.
# Therefore there is no point in using these files. Use the original bed
# files for pairs and singles. No further processing of BED files is
# needed as they have not been changed in any way.
# Remove duplicates directory.
rm -r /cluster/data/danRer4/bed/bacEnds/duplicates
# use new extract program that extracts PSLs using name and position:
ssh kkstore04
set bacDir=/cluster/data/danRer4/bed/bacEnds
cd $bacDir/pairs
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndPairs.bed bacPairs.psl
# for this, use bacEndPairsBadNoOrphans since pairs orphans are already
# included in bacEndSingles
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndPairsBadNoOrphans.bed bacPairsBadNoOrphans.psl
# then for singles
cd $bacDir/singles
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndSingles.bed bacSingles.psl
cd $bacDir
cat pairs/*.psl singles/bacSingles.psl > allBacends.load.psl
# try old program and compare
extractPslLoad -noBin bacEnds.psl pairs/bacEndPairs.bed \
pairs/bacEndPairsBadNoOrphans.bed singles/bacEndSingles.bed \
| sorttbl tname tstart | headchg -del > bacEnds.load.psl
wc -l *.load.psl
# 364457 allBacends.load.psl
# 4568907 bacEnds.load.psl
# Much reduced by using only BAC end alignments that are in BED files.
# load into database
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds/pairs
hgLoadBed danRer4 bacEndPairs bacEndPairs.bed -notItemRgb \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 104546 elements of size 11
# note - this next track isn't pushed to RR, just used for assembly QA
hgLoadBed danRer4 bacEndPairsBad bacEndPairsBad.bed -notItemRgb \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 121728 elements of size 11
cd /cluster/data/danRer4/bed/bacEnds/singles
cp /cluster/data/danRer3/bed/bacends/singles/bacEndSingles.sql .
hgLoadBed danRer4 bacEndSingles bacEndSingles.bed -notItemRgb \
-sqlTable=bacEndSingles.sql
# Loaded 125695 elements of size 11
cd /cluster/data/danRer4/bed/bacEnds
hgLoadPsl danRer4 -table=all_bacends allBacends.load.psl
# All alignments were loaded into the table - no problems.
# load BAC end sequences into seq table so alignments may be viewed
# symlink to FASTA sequence file in ncbi directory
# move BAC ends to the ncbi directory
mkdir -p /cluster/data/ncbi/bacends/zebrafish/bacends.1
# remove some files
cd ensemblSeqs
rm tmp clone* index.html
cd /cluster/data/danRer4/bed/bacEnds
mv /cluster/data/danRer4/bed/bacEnds/ensemblSeqs/* \
/cluster/data/ncbi/bacends/zebrafish/bacends.1
rm -r ensemblSeqs
mkdir -p /gbdb/danRer4/bacends
ln -s /cluster/data/ncbi/bacends/zebrafish/bacends.1/Zv6BacEnds.fa \
/gbdb/danRer4/bacends/Zv6BacEnds.fa
hgLoadSeq danRer4 /gbdb/danRer4/bacends/Zv6BacEnds.fa
# check trackDb.ra entry and description
# cleanup:
ssh kkstore04
cd /cluster/data/danRer4/bed/bacEnds/
rm -r sequences
rm -r /san/sanvol1/scratch/danRer4/bacEnds/sequences
rm -r sequences2
rm changes.txt bacEnds.load.psl *.log
du -sh /cluster/data/danRer4/bed/bacEnds
# 2.4G /cluster/data/danRer4/bed/bacEnds
gzip *.psl *.txt danRerBacEnds.xml
du -sh /cluster/data/danRer4/bed/bacEnds
# 599M /cluster/data/danRer4/bed/bacEnds
# (hartera, 2006-10-02)
# NOTE: Some BAC clones have duplicate reads and these end in the
# suffixes SP6A, T7A, SP6W and T7W. There is a corresponding read name
# without the W or A suffix. The names of the BAC clones
# are also suffixed with A or W for these reads, e.g. there is a BAC
# clone called DKEY-32M8. DKEY-32M8A is the same clone sequenced with
# different reads ending in SP6A and T7A. The BAC end names are
# zK32M8SP6A and zK32M8T7A.
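# e.g. (a sketch) the underlying clone name for a read can be derived by
# mapping the library prefix and stripping the vector and duplicate-read
# suffixes:
echo "zK32M8T7A" | perl -pe 's/^zK/DKEY-/; s/(SP6|T7)[AW]?$//'
# DKEY-32M8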
# Check if there are any cases where both the version without the W or A
# suffix and the version with it are in the same track:
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds
mkdir duplicates
cd duplicates
# found that there are some alignments in all_bacends where there
# is SP6W, SP6A, T7W, T7A suffixes for BAC ends. These are duplicate
# reads, there is a corresponding read name without the W or A suffix.
# Suffix Alignments Unique Names
# SP6W 179 153
# SP6A 254 245
# T7W 53 48
# T7A 247 238
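# counts of this form can be obtained per suffix, e.g. for SP6W
# (a sketch; qName is the BAC end read name in the psl table):
hgsql -N -e 'select count(*), count(distinct qName) from all_bacends
    where qName like "%SP6W";' danRer4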
hgsql -e 'select count(*) from bacEndPairs where lfNames like "%SP6A%";' \
danRer4
# 126 were found
hgsql -e \
'select count(distinct(name)) from bacEndPairs where lfNames like "%SP6A%";' \
danRer4
# 122 with distinct names
hgsql -N -e \
'select name, lfNames from bacEndPairs where lfNames like "%SP6A%";' \
danRer4 | sort > names.SP6A.txt
awk '{print $1}' names.SP6A.txt | sed -e 's/A$//' > names.SP6.txt
hgsql -N -e \
'select name, lfNames from bacEndPairs where lfNames not like "%SP6A%";' \
danRer4 | sort > pairs.nameswithoutA.txt
grep -w -f names.SP6.txt pairs.nameswithoutA.txt | sort | uniq \
> pairs.withAandwithout.txt
# there are 23 BAC clones in the bacEndPairs table where there are
# entries for both the clone names ending in A and that without the A.
hgsql -N -e 'select name, lfNames from bacEndSingles where (lfNames like
"%SP6A%") or (lfNames like "%SP6W%") or (lfNames like "%T7A%") or (lfNames
like "%T7W%");' danRer4 | sort | uniq > singles.names.sort
awk '{print $1}' singles.names.sort | sed -e 's/A$//' | sed -e 's/W$//' \
> names.SP6andT7.txt
wc -l names.SP6andT7.txt
# 372 names.SP6andT7.txt
sort names.SP6andT7.txt | uniq > names.SP6andT7.uniq
wc -l names.SP6andT7.uniq
# 309 names.SP6andT7.uniq
# Some may have both names ending in W and in A, or could be those
# where both the SP6 and T7 ends are present.
hgsql -N -e 'select name, lfNames from bacEndSingles;' danRer4 \
> singles.names.txt
grep -w -f names.SP6andT7.uniq singles.names.txt | sort | uniq \
> singles.withAorWandwithout.txt
wc -l singles.withAorWandwithout.txt
# 212 singles.withAorWandwithout.txt
ssh kkstore04
cd /cluster/data/danRer4/bed/bacEnds/duplicates/tmp/singles
# Check to see if any pairs can be made that do not have the same
# suffix: A, W or without. Only for cases where there is not a pair
# already.
awk '{print $2}' singles.names.sort | sort | uniq > bacEnds.namesAorW.sort
# also add the BAC ends for those with the same name but without A or W
awk '{print $2}' singles.withAorWandwithout.txt | sort | uniq \
> singles.withAorWandwithout.ends
cat bacEnds.namesAorW.sort singles.withAorWandwithout.ends \
| sort | uniq > bacEnds.namesAorWorwithout.sort
# make pairs where there is none with the same ending already. If an end
# has W and/or A suffix and/or no suffix, use just one and discard others.
# use a script to do this.
wc -l *.txt
# 93 diffSuffix.txt
# 69 sameSuffix.txt
# 212 singles.withAorWandwithout.txt
# 92 singlesEnds.txt
# changed program to do second pass using the extra ends.
# 76 diffSuffix.txt
# 78 extraEnds.txt
# 39 extraEnds2.txt
# 86 sameSuffix.txt
# 92 singlesEnds.txt
# /cluster/data/danRer4/bed/bacEnds/duplicates/tmp/singles/test2
# now check to see if any of the BACs represented by singles or pairs
# are already in the original file created.
# extraEnds2.txt are those to be removed
# diffSuffix.txt, sameSuffix.txt and singlesEnds.txt should all
# be checked against the entries in the bacEndPairs table since
# these are sequences that already passed all the criteria for
# being in the BAC end pairs track.
mkdir /cluster/data/danRer4/bed/bacEnds/duplicates/remove
cd /cluster/data/danRer4/bed/bacEnds/duplicates/remove
cp /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairs.bed .
# those that have the same suffix will have already been paired. It is
# the ones that are different that should be put into the pairs file
# and those that are singles should go into the singles file before
# processing the BAC ends.
# first remove the 23 that are duplicated in the bacEndPairs table.
cp ../pairs.withAandwithout.txt .
# cd /cluster/data/danRer4/bed/bacEnds/duplicates/remove
awk '{print $1"A"}' pairs.withAandwithout.txt > bacsToRemove.txt
# remove these from the BAC end pairs file
grep -wv -f bacsToRemove.txt bacEndPairs.bed > bacEndPairsRemBacA.bed
wc -l *.bed
# 104546 bacEndPairs.bed
# 104523 bacEndPairsRemBacA.bed
# then find out if there are any BACs with more than one set of pairs
# in each of the lists: sameSuffix.txt and diffSuffix.txt
cp ../*Suffix.txt .
# the first column has the stem of the BAC end names without the
# SP6 or T7 part of the suffix.
awk '{print $1;}' sameSuffix.txt | sort | uniq -c | sort -nr \
> sameSuff.count
# no duplicates within the file
awk '{print $1;}' diffSuffix.txt | sort | uniq -c | sort -nr \
> diffSuff.count
# no duplicates within the file
cat sameSuffix.txt diffSuffix.txt > allSuff.txt
awk '{print $1;}' allSuff.txt | sort | uniq -c | sort -nr \
> allSuff.count
# no duplicates between files
rm *.count
# then check if any of these are represented in the pairs table:
# All of these BAC end names begin with zK, these are DKEY- BAC clones
# translate names in column 1 to BAC clone names
awk '{print $1}' allSuff.txt | sed -e 's/zK/DKEY\-/' | sort \
> allSuff.BACclones.txt
grep -w -f allSuff.BACclones.txt bacEndPairsRemBacA.bed \
> newPairsDupsInPairsBed.txt
# only one is found: DKEY-32B21: zK32B21T7,zK32B21SP6
awk '{print $4}' newPairsDupsInPairsBed.txt \
> newPairsDupsInPairsBed.name
grep "zK32B21" *.txt
# found in sameSuffix.txt so delete from this file and from allSuff.txt
grep -wv "zK32B21" sameSuffix.txt > sameSuffix2.txt
grep -wv "zK32B21" allSuff.txt > allSuff2.txt
# in this case the zK32B21T7A alignment is much better than the
# zK32B21T7 alignment, also zK32B21SP6A is better than the zK32B21SP6
# alignment therefore it should be replaced with the SP6A and T7A
# versions.
cp /cluster/data/danRer4/bed/bacEnds/singles/bacEndSingles.bed .
grep "zK32B21" bacEndSingles.bed
# then repeat this for the singles and see if any of those already
# have pairs in the bacEndPairsRemBacA.bed file.
cp ../singlesEnds.txt .
cp ../extraEnds2.txt .
# all these ends begin with "zK" so from "DKEY-" library.
# get BAC end prefixes and convert to DKEY BAC clone names.
awk '{print $1}' singlesEnds.txt | sed -e 's/zK/DKEY\-/' | sort \
> singles.BACclones.txt
grep -w -f singles.BACclones.txt bacEndPairsRemBacA.bed \
> singlesInPairsBed.txt
wc -l singlesInPairsBed.txt
# 40 singlesInPairsBed.txt
# get those names from the clone name in bacEndPairsRemBacA.bed
awk '{print $4}' singlesInPairsBed.txt | sed -e 's/DKEY\-/zK/' \
| sort | uniq > singlesDupsInPairs.txt
wc -l singlesDupsInPairs.txt
# 37 singlesDupsInPairs.txt
# All of these versions are in Genbank.
cat newPairsDupsInPairsBed.name singlesDupsInPairs.txt \
| sed -e 's/zK/DKEY\-/' > allDupsInPairs.txt
# BEST WAY FORWARD IS TO START AGAIN WITH PROCESSING THE BAC ENDS AND
# PROCESS DUPLICATES AS FOR danRer3.
##############################################################################
# REPROCESS BAC ENDS TO DEAL WITH DUPLICATES AND REDO BACENDS TRACKS
# (2006-10-06 - 2006-10-11, hartera)
# The bacEnds.psl from the first BACENDS TRACK section is used so all
# processing is the same up to that point.
# Now put together the pairs information:
ssh kkstore04
# move old bacends dir out the way
mv /cluster/data/danRer4/bed/bacEnds /cluster/data/danRer4/bed/bacEndsOld
mkdir /cluster/data/danRer4/bed/bacEnds
cd /cluster/data/danRer4/bed/bacEnds
# mv /cluster/data/danRer4/bed/bacEndsOld/bacEnds.psl .
# cat together the xml files of BAC clone end information
cat ensemblSeqs/*.xml > danRerBacEnds.xml
# get mate-pair information from xml.
# In convertBacEndInfo, forward is T7 and reverse is SP6. Use this
# convention although the first pass above used the opposite. The choice is
# arbitrary as long as the same convention is used within a library. The
# CHORI73 library has it the opposite way round to the above.
# edit getBacInfo.pl used for canFam1 and adapt for use with zebrafish
# BAC ends. Not all entries in the xml file have clone_id or trace_end
# but sometimes they have trace_direction instead of trace_end.
# correct directions:
cat << '_EOF_' > getZfishBacInfo.pl
#!/usr/bin/perl -w
use strict;
my ($file, $outFile, $name, $clone, $library, $dir);
$file = $ARGV[0];
$outFile = $ARGV[1];
open (FILE, $file) || die "Can not open $file : $!\n";
open (OUT, ">$outFile") || die "Can not create $outFile : $!\n";
open (STDERR, ">error.log") || die "Can not create error.log : $!\n";
my %cloneHash = qw {
zC CH211-
zK DKEY-
zKp DKEYP-
bZ RP71-
dZ BUSM1-
CHORI73_ CH73-
};
$name = "";
$clone = "";
$dir = "";
while (<FILE>)
{
chomp;
my $l = $_;
if ($l =~ /<trace_name>([A-Za-z0-9\_\.]+)/)
{
$name = $1;
}
elsif ($l =~ /<clone_id>([A-Z0-9]+\-[0-9A-Z]+)/)
{
$clone = $1;
}
elsif ($l =~ /<library_id>([A-Z0-9a-z\s]+\-?[0-9A-Z]*)<\/library_id>/)
{
$library = $1;
if ($library eq "Daniokey Pilot")
{
$library = "DKEYP";
}
}
elsif ($l =~ /<trace_end>(F|R)/)
{
$dir = $1;
}
elsif ($l =~ /<trace_direction>(F|R)/)
{
$dir = $1;
}
# find end of record and print out end information
if ($l =~ /^\s+<\/trace>/)
{
printInfo($name, $clone, $library, $dir);
$name = $clone = $dir = $library = "";
}
}
close FILE;
close OUT;
close STDERR;
sub printInfo {
my ($name, $clone, $lib, $d) = @_;
# if no clone name read from file then create from trace name
if ($clone eq "")
{
foreach my $c (keys(%cloneHash))
{
if ($name =~ /$c/)
{
if (exists($cloneHash{$c}))
{
my $prefix = $cloneHash{$c};
$clone = $name;
# change to clone name
$clone =~ s/$c/$prefix/;
# remove suffix
$clone =~ s/\.[a-z]+|SP6|T7//;
}
}
}
}
# convert forward or reverse direction to T7 or SP6
if ($d ne "")
{
if ($d eq "F")
{
$d = "T7";
}
elsif ($d eq "R")
{
$d = "SP6";
}
}
else
{
print STDERR "No direction for $name found\n";
}
# print clone end information
print OUT "$clone\t$name\t0\t$lib\t0\t$d\n";
}
'_EOF_'
# << for emacs
chmod +x getZfishBacInfo.pl
perl getZfishBacInfo.pl danRerBacEnds.xml bacEndInfo.txt
# check all the names are there
grep '>' ./ensemblSeqs/Zv6BacEnds.fa > names
perl -pi.bak -e 's/>//' names
sort names | uniq > names.sort
awk '{print $2}' bacEndInfo.txt | sort | uniq > bacEndInfo.names.sort
comm -13 bacEndInfo.names.sort names.sort
# no difference so all clone ends in the FASTA file are also
# in the xml file.
rm *.bak *.sort names
# create mate-pair information
# convertBacEndPairInfo does not deal with replicate names. These can
# be in a comma separated list in the pairs and singles files.
# edit the script so that it does this and parses the bacEndInfo.txt file.
cp /cluster/bin/scripts/convertBacEndPairInfo convertZfishBacEndInfo
# comment out line 43 as this removes the suffix after a . from the
# trace names. In this case, we need to keep those.
# line 43: ($acc, $ver) = split(/\./,$acc);
cat << 'EOF' > convertZfishBacEndInfo
#!/usr/local/bin/perl
# File: convertBacEndPairZfishInfo
# Date: 10/2006
# Description: Converts bacends.cl_acc_gi_len_primer format file to
# bacEnds.pair file used for creating BAC End Pairs tracks
# Usage message
if ($#ARGV < 0) {
print stderr "USAGE: convertBacEndPairInfo <cl_acc_gi_len_primer>\n";
exit(1);
}
$file = shift(@ARGV);
open(FILE, "$file") || die("Could not open $file\n");
$pair = $single = 0;
# Read in and record end info
print stderr "Reading in end info\n";
while ($line = <FILE>) {
chomp($line);
($clone, $acc, $gi, $center, $length, $end) = split('\t',$line);
# ($acc, $ver) = split(/\./,$acc);
$end =~ tr/a-z/A-Z/;
$found{$clone} = 1;
$clone{$acc} = $clone;
$printa{$acc} = 0;
$print{$clone} = 0;
$end{$acc} = $end;
if (&isForward($end)) {
# print "Adding $acc for $clone as $end \n";
$t7{$clone} .= "$acc,";
# print "The entry for $clone is $t7{$clone}\n";
} elsif (&isReverse($end)) {
$sp6{$clone} .= "$acc,";
} elsif ($end) {
print stderr "End $end for $acc / $clone\n";
}
}
close(FILE);
# Print out pairs
open(OUT, ">bacEndPairs.txt");
print stderr "Writing out pair info\n";
foreach $clone (keys %found) {
if ($t7{$clone} && $sp6{$clone}) {
print OUT "$t7{$clone}\t$sp6{$clone}\t$clone\n";
$print{$clone} = 1;
@acc = split(/\,/,$t7{$clone});
for ($i = 0; $i <= $#acc; $i++) {
$printa{$acc[$i]} = 1;
}
@acc = split(/\,/,$sp6{$clone});
for ($i = 0; $i <= $#acc; $i++) {
$printa{$acc[$i]} = 1;
}
$pair++;
}
}
close(OUT);
# Print out singletons
print stderr "Writing out singleton info\n";
open(OUT, ">bacEndSingles.txt");
%sp6Singles;
%t7Singles;
foreach $acc (keys %printa) {
$clone = $clone{$acc};
# if not printed already then add to a new hash for singles
if (!$printa{$acc}) {
if (&isForward($end{$acc})) {
$t7Singles{$clone} .= "$acc,";
}
elsif (&isReverse($end{$acc})) {
$sp6Singles{$clone} .="$acc,";
}
else {
print stderr "$acc has unknown end\n";
}
}
}
# then print out the singles:
foreach $cl (keys %t7Singles) {
print OUT "$t7Singles{$cl}\t$cl\tT7\n";
$single++;
}
foreach $cl (keys %sp6Singles) {
print OUT "$sp6Singles{$cl}\t$cl\tSP6\n";
$single++;
}
close(OUT);
print stderr "$pair pairs and $single singles\n";
sub isForward {
$end = shift(@_);
if (($end =~ /FORWARD/) || ($end =~ /^T7/) || ($end eq "F") ||
($end eq "M13-21") || ($end eq "1") || ($end =~ /^TK/) ||
($end =~ /^EC1/) || ($end =~ /^RM1/)) {
return 1;
} else {
return 0;
}
}
sub isReverse {
if (($end =~ /REVERSE/) || ($end =~ /^SP6/) || ($end eq "R") ||
($end =~ /^TJ/)) {
return 1;
} else {
return 0;
}
}
'EOF'
# remove all W and A suffixes from the end of bacEndInfo.txt clone names
cp bacEndInfo.txt bacEndInfo2.txt
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)W/$1/' bacEndInfo2.txt
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)A/$1/' bacEndInfo2.txt
./convertZfishBacEndInfo bacEndInfo2.txt
# creates pairs and singles files
# 312850 pairs and 34935 singles
mkdir -p /cluster/data/danRer4/bed/bacEnds/pairs
cd /cluster/data/danRer4/bed/bacEnds/pairs
set dir = /cluster/data/danRer4/bed/bacEnds
# use parameters from REDO of danRer3 BAC ends
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose $dir/bacEnds.psl $dir/bacEndPairs.txt all_bacends bacEnds
wc -l *
# 2724 bacEnds.long
# 22959 bacEnds.mismatch
# 179405 bacEnds.orphan
# 156241 bacEnds.pairs
# 565 bacEnds.short
# 1196 bacEnds.slop
# create header required by "rdb" tools
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' \
> ../header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
# edit the header file to make sure each \t becomes a real tab character
cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairs.bed
# create bad BAC ends set
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
bacEnds.orphan | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBad.bed
# Also create a bad BAC ends set with no orphans since orphans are
# already added to the singles track and do not want to add these orphans
# twice when extracting PSL. Use this bacEndPairsBadNoOrphans.bed
# file when extracting PSLs for adding to the all_bacends table.
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
| row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBadNoOrphans.bed
# To create singles set:
# also need to process bacEndSingles.txt into a database table
# for singles in bacEndSingles.txt, create a dummy file where they
# are given zJA11B12T7 as dummy sequence pair. If the single is a forward
# sequence, put the dummy sequence in the second column, if the single is
# a reverse sequence put in first column. use a perl script to do this.
cd /cluster/data/danRer4/bed/bacEnds
set bacDir = /cluster/data/danRer4/bed/bacEnds
mkdir singles
cd singles
cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl .
perl formatSingles.pl $bacDir/bacEndSingles.txt > \
$bacDir/bacEndSingles.format
# then run pslPairs on this formatted file
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \
all_bacends bacEnds
wc -l bacEnds.*
# 0 bacEnds.long
# 0 bacEnds.mismatch
# 23398 bacEnds.orphan
# 0 bacEnds.pairs
# 0 bacEnds.short
# 0 bacEnds.slop
cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles
wc -l bacEnds.singles
# 202803 bacEnds.singles
# Of these, 179405 are from pair analysis and 23398 from singles.
# For danRer3: there are 11439 orphans from singles and 242235 from
# pair analysis so a total of 253674 orphans so this has improved.
# Although for danRer3, some of these could be more replicate reads for the
# same BAC clone end.
# make singles bed file
cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndSingles.bed
# check if there are any overlapping alignments that can be removed.
cd /cluster/data/danRer4/bed/bacEnds
mkdir -p duplicates/overlapRun
cd duplicates/overlapRun
sort -k1,2 /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairs.bed \
> bacEndPairs.lfs
wc -l *.lfs
# 154732 bacEndPairs.lfs
nice /cluster/bin/x86_64/lfsOverlap bacEndPairs.lfs bacEndPairs.bed \
-name -minOverlap=0.999 -notBlocks
# Loaded 154732 elements of size 11
# Took about 2.5 hours.
wc -l bacEndPairs*
# 154634 bacEndPairs.bed
# 154732 bacEndPairs.lfs
sort -k1,2 /cluster/data/danRer4/bed/bacEnds/singles/bacEndSingles.bed \
> bacEndSingles.lfs
nice /cluster/bin/x86_64/lfsOverlap bacEndSingles.lfs bacEndSingles.bed \
-name -minOverlap=0.999 -notBlocks
# Loaded 187638 elements of size 11
# Took about 4.5 hours
sort -k1,2 \
/cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairsBadNoOrphans.bed \
> bacEndPairsBadNoOrphans.lfs
wc -l *.lfs
# 27301 bacEndPairsBadNoOrphans.lfs
nice /cluster/bin/x86_64/lfsOverlap bacEndPairsBadNoOrphans.lfs \
bacEndPairsBadNoOrphans.bed -name -minOverlap=0.999 -notBlocks
# Loaded 27301 elements of size 11
# Took 5 minutes
# check the numbers of lines are correct
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5}' ${f}.lfs \
| sort | uniq -c | sort -nr > ${f}.uniqCount
end
wc -l *
# 154634 bacEndPairs.bed
# 154732 bacEndPairs.lfs
# 154656 bacEndPairs.uniqCount
# 27282 bacEndPairsBadNoOrphans.bed
# 27301 bacEndPairsBadNoOrphans.lfs
# 27293 bacEndPairsBadNoOrphans.uniqCount
# 187601 bacEndSingles.bed
# 187638 bacEndSingles.lfs
# 187624 bacEndSingles.uniqCount
# different numbers for unique count since some of these alignments
# were not identical but very close to identical (>0.999 overlap)
rm *.uniqCount
cd /cluster/data/danRer4/bed/bacEnds/duplicates
mv ./overlapRun/* .
rm -r overlapRun
# copy perl script used for danRer3 to choose 2 BAC ends to represent
# each BAC clone since there are often more than one read for each BAC end
# in this set, 2 were chosen for each BAC pair or 1 for the singles. This
# was based on the ones that had the largest region aligned (using lfSizes).
cp /cluster/data/danRer3/bed/bacends/duplicatesNew/pickLfNamesv2.pl .
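# the "largest region aligned" criterion can be computed from lfSizes
# (column 10 of the lfs bed); a sketch, not part of pickLfNamesv2.pl:
awk '{n=split($10,s,","); t=0; for(i=1;i<=n;i++) t+=s[i]; print $4, $11, t}' \
    bacEndPairs.bed | sort -k1,1 | head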
# need to sort by chrom, chromStart
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
sort -k1 -k2 -k3 ${f}.bed > ${f}Sort.bed
end
# run perl script: input bed file, pairs or singles, name of output file
perl pickLfNamesv2.pl bacEndPairsSort.bed pairs pairs2lfNames.bed
mv error.log log.pairs
perl pickLfNamesv2.pl bacEndSinglesSort.bed singles singles1lfName.bed
mv error.log log.singles
perl pickLfNamesv2.pl bacEndPairsBadNoOrphansSort.bed pairs \
badPairs2lfNames.bed
mv error.log log.badPairs
wc -l log*
# 1 log.badPairs
# 3 log.pairs
# 13 log.singles
# In future, the choice of which set of alignments to keep could be based
# on the Blat score computed by pslScore.
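# e.g. (a sketch; assumes pslScore reports the query name in column 4 and
# the score in column 5, and writes to a hypothetical file name):
pslScore ../bacEnds.psl | sort -k4,4 -k5,5nr > bacEnds.pslScores.txt
# then keep only the top-scoring read per BAC end when choosing.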
# For badPairs, CH211-115F14 has 2 sets of pairs: zC115F14.zb,zC115F14.ya
# has a longer region between ends than for zC115F14.za,zC115F14.ya.
# so the latter was removed.
# for Pairs, CH211-74D17: the alignment with zC74D17.zb,zC74D17.yb was
# removed but there is also one with zC74D17.zb,zC74D17.yb to the same
# region that was retained so remove this one as zC74D17.zb,zC74D17.ya
# covers a longer region.
# CH211-98O15 has zC98O15.ya,zC98O15.za aligning to chr3 and
# zC98O15.yb,zC98O15.zb align to chr17. There is no similarity between
# zC98O15.ya and zC98O15.yb by bl2seq.
# CH211-98E22 has zC98E22.ya,zC98E22.za aligning to chr3 and
# zC98E22.yb,zC98E22.zb aligning to chr14. zC98E22.ya and zC98E22.yb
# has no similarity by bl2seq.
# For singles, there are 13 with alignments to more than 1 read for a
# BAC end:
# CH211-66E17: remove zC66E17.za as it has more mismatches and inserts.
# CH211-74O5: remove zC74O5.ya as is has more mismatches.
# CH211-42B4: remove zC42B4.yb as it has a shorter alignment. Not much
# difference in mismatches or inserts between this and zC42B4.ya.
# CH211-98O3: zC98O3.yb aligns to chr13 and zC98O3.ya aligns to chr16 and
# they have no similarity to each other.
# CH211-89J7: remove zC89J7.zb as it has more mismatches and inserts.
# CH211-97A18: remove zC97A18.yb has more mismatches and inserts.
# CH211-48O20: zC48O20.zb aligns to chr22 and zC48O20.za aligns twice
# to chr16. No similarity by bl2seq.
# CH211-60H17: remove zC60H17.ya as it has a more mismatches.
# CH211-189J23: remove zC189J23.yb as it has a large tNumInsert.
# CH211-124G12: remove zC124G12.za as it has more mismatches and inserts.
# CH211-60P6: remove zC60P6.ya as it has more inserts.
# CH211-42A6: remove zC42A6.za as it has more inserts.
# CH211-69K2: remove zC69K22.za as it has more inserts.
# Reported discrepancies to Mario Caccamo at Sanger (mc2@sanger.ac.uk)
# Here is his reply:
# This looks like a clone swap problem where names were associated to
# the wrong clones. All the examples you mention below are from
# projects sequenced at Max Planck (Germany).
# CH211-98O15 - the right place for this one is in chr3. This clone is
# currently assigned to ctg247 in chr3.
# CH211-98O3 - should go to chr14 (there is a problem in Zv6 most
# probably). This clone is assigned to ctg3009. The b ends are correct.
# CH211-48O20 - unfortunately this clone is not fingerprinted so I don't
# have any independent information to confirm the right placement.
# So for pairs,
# CH211-98O15: retain zC98O15.ya,zC98O15.za aligning to chr3
# CH211-98O3: retain zC98O3.yb and zC98O3.zb (should go to chr14)
# NOTE: For some singles, the lfStart does not equal the chromStart.
# Also, chromEnd - chromStart no longer equals the sum of lfSizes.
# pslPairs has added min/2 to the end or subtracted min/2 from the start
# depending on whether it is a left or a right BAC end and the
# alignment orientation. min used here was 25000.
# That is ok. This is what gives the display where the aligning block is
# shown with a line with arrows on it showing the direction.
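# e.g. (illustrative numbers only): a read aligning at chr1:100000-101000
# may be stored with chromStart 100000-12500=87500 or with chromEnd
# 101000+12500=113500, so chromEnd-chromStart exceeds the summed lfSizes
# by min/2 = 12500.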
ssh kkstore04
cd /cluster/data/danRer4/bed/bacEnds/duplicates
# create remove lists for each set of alignments.
cat << 'EOF' > pairsToRemove
zC74D17.zb,zC74D17.yb
zC98O15.yb,zC98O15.zb
zC98E22.ya,zC98E22.za
zC98E22.yb,zC98E22.zb
'EOF'
cat << 'EOF' > singlesToRemove
zC66E17.za
zC74O5.ya
zC42B4.yb
zC98O3.ya
zC89J7.zb
zC97A18.yb
zC48O20.zb
zC48O20.za
zC60H17.ya
zC189J23.yb
zC124G12.za
zC60P6.ya
zC42A6.za
'EOF'
mv pairs2lfNames.bed pairs2lfNamesOld.bed
mv singles1lfName.bed singles1lfNameOld.bed
# recreate these files removing alignments for ends in lists above
grep -wv -f pairsToRemove bacEndPairsSort.bed > pairs2lfNames.bed
grep -wv -f singlesToRemove bacEndSinglesSort.bed > singles1lfName.bed
# for each of these new bed files, checks were made that there are
# only 2 BAC ends per alignments for pairs and 1 for singles.
# For each pair, there should only be 2 ends which can appear either
# way round depending on the orientation and there should be 1 end for
# the beginning (suffix T7, t7 or z) and one end for the end
# (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g.
# either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite
# orientation. For singles, there should be a single BAC end for each
# alignment and for each BAC clone, a sequence for either or both types
# of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate
# alignments.
# e.g. check the pairs bed file:
wc -l pairs2lfNames.bed
# 154632 pairs2lfNames.bed
grep ',' pairs2lfNames.bed | wc -l
# 154632
# should be the same number, every line should have a comma
# should be twice the number of above, just 2 end names per line
awk '{print $11}' pairs2lfNames.bed | sort | uniq > pairs.ends
wc -l pairs.ends
# 147668 pairs.ends
sed -e 's/,/\n/g' pairs.ends > pairs.ends2
wc -l pairs.ends2
# 295336 pairs.ends2
# should be twice the number of above, just 2 end names per line, so this
# is correct.
perl -pi.bak -e \
's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?,?.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1,$2/g' pairs.ends
sort pairs.ends | uniq > pairs.ends.uniq
# check that these have the right combination of ends - one forward and
# one reverse. all ok.
# repeat for badPairs and singles
# badPairs:
wc -l badPairs2lfNames.bed
# 27281 badPairs2lfNames.bed
grep ',' badPairs2lfNames.bed | wc -l
# 27281
# should be the same number, every line should have a comma
# should be twice the number of above, just 2 end names per line
awk '{print $11}' badPairs2lfNames.bed | sort | uniq > badPairs.ends
wc -l badPairs.ends
# 25795 badPairs.ends
sed -e 's/,/\n/g' badPairs.ends > badPairs.ends2
wc -l badPairs.ends2
# 51590 badPairs.ends2
# should be twice the number of above, just 2 end names per line, so this
# is correct.
perl -pi.bak -e \
's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?,?.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1,$2/g' badPairs.ends
sort badPairs.ends | uniq > badPairs.ends.uniq
# check that these have the right combination of ends - one forward and
# one reverse. all ok.
# for singles
wc -l singles1lfName.bed
# 187587 singles1lfName.bed
grep ',' singles1lfName.bed | wc -l
# 0
# should be 0 as there should only be one BAC end name per line.
awk '{print $11}' singles1lfName.bed | sort | uniq > singles.ends
wc -l singles.ends
# 172981 singles.ends
# some singles have more than 1 alignment so appear more than once.
perl -pi.bak -e \
's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1/g' singles.ends
sort singles.ends | uniq > singles.ends.uniq
# check that these have the right suffixes for the BAC ends. all ok.
# clean up
rm *.bak *.ends *.ends2 *.uniq
# Finally overlaps in BAC clone names were checked. All BAC clones
# represented in each of the pairs, badPairs and singles bed files are
# unique to that file. Between all three bed files, 302606 BAC clones
# have alignments.
foreach f (pairs2lfNames.bed badPairs2lfNames.bed singles1lfName.bed)
awk '{print $4}' $f | sort | uniq > ${f}.names
end
wc -l *.names
# 25421 badPairs2lfNames.bed.names
# 147501 pairs2lfNames.bed.names
# 129684 singles1lfName.bed.names
# 302606 total
comm -12 pairs2lfNames.bed.names badPairs2lfNames.bed.names
comm -12 pairs2lfNames.bed.names singles1lfName.bed.names
comm -12 badPairs2lfNames.bed.names singles1lfName.bed.names
# None of these files should have any BAC clone names in common and
# they do not so they are ok.
# NOTE: sort and uniq on hgwdev produce tab-delimited output. After
# merging rows with the same BAC name, the scoring is now wrong in the
# bed files.
# Scores should be 1000 if there is 1 row for that name, else
# 1500/number of rows for that sequence name - calculated by pslPairs.
# Correct the scores.
ssh kkstore04
mkdir -p /cluster/data/danRer4/bed/bacEnds/scoresAndCoords
cd /cluster/data/danRer4/bed/bacEnds/scoresAndCoords
# copy over the correctScores2.pl and checkScores.pl scripts from danRer3;
# the scripts were edited so that the hits file is split on space, not on tabs
cp \
/cluster/data/danRer3/bed/bacends/scoresAndCoords/correctScores2.pl .
cp \
/cluster/data/danRer3/bed/bacends/scoresAndCoords/checkScores.pl .
awk '{print $4}' ../duplicates/pairs2lfNames.bed \
| sort | uniq -c > pairs.hits
perl correctScores2.pl ../duplicates/pairs2lfNames.bed pairs.hits \
noBin > bacEndPairsGoodScores.bed
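# for reference, the rule correctScores2.pl applies (1000 for a single
# alignment of a name, otherwise 1500/alignmentCount, matching pslPairs)
# can be sketched from the hits file alone (hypothetical output name):
awk '{score = ($1 == 1 ? 1000 : int(1500 / $1)); print $2, score}' \
    pairs.hits > pairs.rescored.txt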
# same for singles
awk '{print $4}' ../duplicates/singles1lfName.bed \
| sort | uniq -c > singles.hits
perl correctScores2.pl ../duplicates/singles1lfName.bed singles.hits \
noBin > bacEndSinglesGoodScores.bed
# and for badPairs
awk '{print $4}' ../duplicates/badPairs2lfNames.bed \
| sort | uniq -c > badPairs.hits
perl correctScores2.pl ../duplicates/badPairs2lfNames.bed \
badPairs.hits noBin > bacEndPairsBadGoodScores.bed
# check that the scores are now correct
awk '{print $4, $5}' bacEndPairsGoodScores.bed \
| sort | uniq -c > pairs.count
perl checkScores.pl < pairs.count
# all the BAC clones should be in good.txt and none in bad.txt
# wc -l should give same number of lines in good.txt as in pairs.hits
# and therefore bad.txt should be empty.
# repeat for other bed files
awk '{print $4, $5}' bacEndPairsBadGoodScores.bed \
| sort | uniq -c > badPairs.count
perl checkScores.pl < badPairs.count
awk '{print $4, $5}' bacEndSinglesGoodScores.bed \
| sort | uniq -c > singles.count
perl checkScores.pl < singles.count
# for the singles, 6 ended up in bad.txt because their scores are
# 214.285714285714 which is correct for 7 alignments. Rounding the score
# caused the discrepancy.
# round these values otherwise get a loading error when loading database:
perl -pi.bak -e 's/214\.285714285714/214/' bacEndSinglesGoodScores.bed
# clean up
rm error.log *.txt *.count *.hits
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds/scoresAndCoords
# copy over table definition from danRer3
cp /cluster/data/danRer3/bed/bacends/singles/bacEndSingles.sql \
../singles
# Now load database tables:
hgsql -e 'drop table bacEndPairs;' danRer4
hgLoadBed danRer4 bacEndPairs bacEndPairsGoodScores.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
# Loaded 154632 elements of size 11
hgsql -e 'drop table bacEndSingles;' danRer4
hgLoadBed danRer4 bacEndSingles bacEndSinglesGoodScores.bed \
-sqlTable=../singles/bacEndSingles.sql -notItemRgb
# Loaded 187587 elements of size 11
hgsql -e 'drop table bacEndPairsBad;' danRer4
hgLoadBed danRer4 bacEndPairsBad bacEndPairsBadGoodScores.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
# Loaded 27281 elements of size 11
# clean up
rm *.tab *.bak error.log
# The Zv6 BAC end sequences are already in /gbdb/danRer4/bacends/ and
# they have been loaded into the seq table - this is from the first section
# on BACENDS tracks. No need to repeat this here.
# The BAC end sequences were loaded into the seq table (so alignments may
# be viewed) and the FASTA file was moved to the ncbi directory and
# symlinked previously; the commands used were:
mkdir -p /gbdb/danRer4/bacends
ln -s /cluster/data/ncbi/bacends/zebrafish/bacends.1/Zv6BacEnds.fa \
/gbdb/danRer4/bacends/Zv6BacEnds.fa
hgLoadSeq danRer4 /gbdb/danRer4/bacends/Zv6BacEnds.fa
# use new extract program that extracts PSLs using name and position:
ssh kkstore04
set bacDir=/cluster/data/danRer4/bed/bacEnds
cd $bacDir/scoresAndCoords
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndPairsGoodScores.bed bacPairs.psl
# for this, use bacEndPairsGoodScores.bed which was derived from
# bacEndPairsBadNoOrphans since pairs orphans are already
# included in bacEndSingles
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndPairsBadGoodScores.bed bacPairsBad.psl
# then for singles
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndSinglesGoodScores.bed bacSingles.psl
cd $bacDir
cat $bacDir/scoresAndCoords/*.psl > allBacends.load.psl
wc -l *.load.psl
# 542725 allBacends.load.psl
# load PSL file into database
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds/
hgsql -e 'drop table all_bacends;' danRer4
hgLoadPsl danRer4 -table=all_bacends allBacends.load.psl
# All alignments were loaded into the table - no problems.
# check trackDb.ra entry and modify description.
# Moved the searches up to the top level zebrafish trackDb.ra file
# in trackDb/zebrafish/ since the searches are common to all zebrafish
# assemblies. Deleted searches from each assembly trackDb.ra.
###########################################################################
# CREATE BAC CLONES ALIAS AND CROSS-REFERENCE TABLES
# (bacEndAlias, bacCloneAlias and bacCloneXRef)
# (DONE, 2006-09-29 - 2006-10-27, hartera)
# Process data and create bacEndAlias table
ssh kkstore04
# create a list of BAC end names and their accessions
# Downloaded BAC ends accessions from SRS
# SRS at Sanger is no longer available.
# Go to http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?CMD=search&DB=nucgss
# This is dbGSS at NCBI: GSS is Genomic Sequence Survey
# Search: Danio rerio[Organism] AND BAC
# There are 159020 entries. This is the same as for the BACEndAccs.txt
# for danRer3 in: /cluster/data/danRer3/bed/bacends/bacends.1
# getBacEndInfo.pl and extToIntNames.pl were used to create
# BACEnd_accessions.txt. Use this file from danRer3 to load the table.
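# the same count can be retrieved programmatically via NCBI E-utilities
# (a sketch, assuming the standard esearch interface and db name):
wget -q -O - 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucgss&term=Danio+rerio[Organism]+AND+BAC' \
    | grep -o '<Count>[0-9]*</Count>'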
cd /cluster/data/danRer4/bed/bacEnds
cp /cluster/data/danRer3/bed/bacends/bacends.1/BACEnd_accessions.txt .
grep '>' /cluster/data/ncbi/bacends/zebrafish/bacends.1/Zv6BacEnds.fa \
| sed -e 's/>//' > allBacEnds.names
# copy over getBacEndInfov2.pl - this produces the bacEndAccs.aliases file
cp /cluster/data/danRer3/bed/bacends/bacends.1/getBacEndInfov2.pl .
# edit to remove section that creates pairs and singles files
# and rename to getBacEndAliases.pl
cat << 'EOF' > getBacEndAliases.pl
#!/usr/bin/perl -w
use strict;
my $file = $ARGV[0]; # list of BAC end sequence read Sanger names
my $file2 = $ARGV[1]; # list of BAC ends and GenBank accessions
# translation for sequence prefixes from Sanger internal names to external names
my %cloneHash = qw {
zC CH211-
ZC CH211-
zK DKEY-
zKp DKEYP-
bZ RP71-
dZ BUSM1-
CHORI73_ CH73-
};
# need to get bacends into pairs and singles
# find duplicates also
# Get and store BAC ends and accessions
my %bacEnds;
open (BACENDS, $file2) || die "Can not open $file2: $!";
while (<BACENDS>) {
chomp;
my ($be, $a) = split(/\t/);
print "bac end $be and acc is $a\n";
$bacEnds{$be} = $a;
}
close BACENDS;
my %bacs;
my %bacAccs;
open(FILE, $file) || die "Can not open $file: $!";
open(STDERR, ">bacs.log") || die "Can not create bacs.log: $!";
open(OUT, ">direction.txt") || die "Can not create direction.txt:$!";
open(ACCS, ">bacEndAccs.aliases") || die "Can not create bacEndsAccs.aliases: $!";
while (<FILE>) {
chomp;
my $seqName = $_;
print "seqName is $seqName here\n";
$seqName =~ /^([CHORI73]*[|z|Z|b|d]?[C|K|Z|_]p?)([0-9]+[A-Z][0-9]+)\.?[pq1k]*(SP6|T7|ASP6|AT7|SP6W|T7W|y|z|Z)/;
my $prefix = $1;
my $rest = $2;
print "prefix is $prefix, rest is $rest\n";
my $dir = $3;
print STDERR "dir is $dir\n";
print OUT "$dir\n";
my $direction;
# forward or reverse direction
if (($dir =~ /SP6/) || ($dir =~ /T7/) ) {
$direction = $dir;
}
# reverse direction (as in convertZfishBacEndInfo)
elsif ($dir =~ /(sp6)/i || $dir =~ /y/i) {
$direction = "SP6";
}
# forward direction (as in convertZfishBacEndInfo)
elsif ($dir =~ /(t7)/i ||$dir =~ /z/i) {
$direction = "T7";
}
else {
print STDERR "seqName is $seqName - direction not found\n";
}
print "dir is $dir and direction is $direction\n";
my $extName = "";
my $intName = $prefix.$rest;
print "prefix is $prefix\n";
my $mid = "";
$mid = $rest;
$mid =~ s/\-//;
$mid =~ tr/a-z/A-Z/;
print "after trans, mid is $mid here\n";
if ($mid =~ /^([A-Z]*)0*([0-9]+[A-Z]+)0*([0-9]+$)/) {
print "matched mid $mid here\n";
my $new = $1.$2.$3;
$mid = $new;
print "new mid is $mid\n";
}
if (exists ($cloneHash{$prefix})) {
my $extPrefix = $cloneHash{$prefix};
$extName = $extPrefix.$mid;
print "External name is $extName\n";
}
else {
$extName = "";
}
# need to get duplicate clones, if switch to lower case and remove
# . and - and use as key to bacs hash
# add the internal and external name for BAC to hash
my $fullName = $seqName;
# my $intNameStem = $intName;
my $upDir = $dir;
$dir =~ tr/a-z/A-Z/;
# preserve prefix and change middle part of name to upper case
my $upperIntName = $prefix.$mid;
my $upperFullName = $prefix.$mid.$dir;
print "upper internal name is $upperIntName here\n";
# my $newFullName = "";
print "internal name is $intName, altered seq name is $upperIntName\n";
print "full name is now $upperFullName\n";
if (exists($bacEnds{$upperFullName})) {
my $ac = $bacEnds{$upperFullName};
print "seq is $upperFullName; acc is $ac\n";
$bacs{$upperIntName}->{$upperFullName}->{acc} = $ac;
}
push (@{$bacs{$upperIntName}->{$upperFullName}->{seqs} }, $seqName);
$bacs{$upperIntName}->{$upperFullName}->{extName} = $extName;
$bacs{$upperIntName}->{$upperFullName}->{direction} = $direction;
if (exists($bacAccs{$upperIntName}) ){
my $bacAcc = $bacAccs{$upperIntName};
print "bacacc is $bacAcc\n";
$bacs{$upperIntName}->{$upperFullName}->{bacAcc} = $bacAcc;
}
if (exists($bacEnds{$upperFullName} )) {
my $bacEndAcc = $bacEnds{$upperFullName};
print "bacendacc is $bacEndAcc\n";
$bacs{$upperIntName}->{$upperFullName}->{bacEndAcc} = $bacEndAcc;
}
}
close FILE;
# print accessions for BacEnds with BAC end aliases
my $count = 0;
print "printing accessions.\n";
foreach my $a (keys(%bacs)) {
print "$a is bac end from bacEnds hash\n";
foreach my $f (keys %{ $bacs{$a} } ) {
if (exists($bacs{$a}->{$f}->{acc} ) ) {
my $acc = $bacs{$a}->{$f}->{acc};
my @ids = @{$bacs{$a}->{$f}->{seqs} };
foreach my $i (@ids) {
$count++;
print ACCS "$i\t$count\t$acc\n";
}
}
}
}
'EOF'
chmod +x getBacEndAliases.pl
perl getBacEndAliases.pl allBacEnds.names BACEnd_accessions.txt \
> bacEnds.log
wc -l bacEndAccs.aliases
# 159370 bacEndAccs.aliases
# clean up
rm *.log direction.txt
# Only the DKEY- library clone ends have accessions in Genbank
# load this alias table and accessions for clone ends
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds
# Carry on and process this file into the bacEndAlias table.
hgLoadSqlTab danRer4 bacEndAlias ~/kent/src/hg/lib/bacEndAlias.sql \
bacEndAccs.aliases
# Loaded successfully.
# Get the latest versions of the clonemarkers, contig names and markers
# files from Sanger: Provided by Mario Caccamo (mc2@sanger.ac.uk)
# at the Sanger Institute.
ssh kkstore04
mkdir -p /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases
cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases
# Problem with the markers file - generated incorrectly. Contacted
# Sanger to ask for new markers file on 10/12/06 and new set of files
# were put up for ftp on 10/26/06. Another problem with markers file
# was found - there was a number in the second field instead of the
# Sanger STS name, which is an ID beginning with "et" or "st". Notified
# Sanger and new files put out for ftp on 10/27/06.
wget --timestamp \
ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/README
wget --timestamp \
ftp://ftp.sanger.ac.uk/pub/mc2/webfpc_dump/clonemarkers.27.10.06.txt
wget --timestamp \
ftp://ftp.sanger.ac.uk/pub/mc2/webfpc_dump/ctgnames.27.10.06.txt
wget --timestamp \
ftp://ftp.sanger.ac.uk/pub/mc2/webfpc_dump/markers.27.10.06.txt
wc -l *27.10.06.txt
# 32612 clonemarkers.27.10.06.txt
# 168828 ctgnames.27.10.06.txt
# 12407 markers.27.10.06.txt
# get list of BAC end names, lfNames
foreach f (../scoresAndCoords/*.bed)
echo $f
awk '{print $11;}' $f >> allBacEnds.names
end
wc -l allBacEnds.names
# 369500 allBacEnds.names
# this is the total number of lines in the *.bed files
perl -pi.bak -e 's/,/\n/g' allBacEnds.names
sort allBacEnds.names | uniq > allBacEnds.names.uniq
# get list of BAC clone names
foreach f (bacEndPairs bacEndPairsBad bacEndSingles)
awk '{print $4}' \
/cluster/data/danRer4/bed/bacEnds/scoresAndCoords/${f}GoodScores.bed \
>> bacs.names
end
sort bacs.names | uniq > bacs.names.uniq
wc -l *.uniq
# 518827 allBacEnds.names.uniq
# 302606 bacs.names.uniq
# from psl file
awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names
# remove first few lines with no names
tail +6 bacEndsPsl.names | sort | uniq > bacEndsPsl.names.uniq
wc -l bacEndsPsl.names.uniq
# 549034 bacEndsPsl.names.uniq
# this is all the BAC ends that originally had alignments
# Add an alias table for BAC clones
# bacCloneAlias.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
# Add a xref table to give external clone registry names, internal names
# sanger name, relationship between STS and BAC clone (method of finding
# STS), UniSTS ID, chromosomes(s) to which BAC clone is mapped by BLAT,
# Genbank accession and STS primer sequences
# bacCloneXRef.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
set dir=/cluster/data/danRer4/bed/bacEnds/
awk 'BEGIN {OFS="\t"}{print $4, $1}' \
$dir/scoresAndCoords/bacEndPairsGoodScores.bed > bacClones.namesandchrom
awk 'BEGIN {OFS="\t"}{print $4, $1}' \
$dir/scoresAndCoords/bacEndSinglesGoodScores.bed >> bacClones.namesandchrom
sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq
wc -l bacClones.namesandchrom.uniq
# 306079 bacClones.namesandchrom.uniq
# so created a list of names and chroms for BAC clones only in pairs
# and singles, exclude bad Pairs since this track is not shown on RR.
# use a list of internal names,Genbank accessions, and BAC clone names
# use BACClonesIdsandAccs.txt.
# get list of UniSTS IDs using aliases to search alias file
# print Sanger name, alias and UniSTS ID, use find_markers3.pl
cat << '_EOF_' > find_markers3.pl
# example:
# perl find_markers3.pl UniSTS.aliases markers.27.10.06.txt
use strict;
my $verbose = 0;
my ($a, $b, $f, $m, $s, $t, $aliases, @alias, @rest);
my $aliasFile = $ARGV[0];
my $markersFile = $ARGV[1];
open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n";
open(MARKERS, $markersFile) || die "Can not open $markersFile\n";
# store aliases from aliasFile
my ($id, $al, @alsArray, %aliasHash);
while (<ALIAS>)
{
chomp;
($id, $al) = split /\t/;
@alsArray = split(/;/, $al);
foreach my $as (@alsArray)
{
push (@{$aliasHash{$as} }, $id);
}
}
close ALIAS;
while (<MARKERS>) {
my @idArray;
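# reset the found flag and per-marker state before scanning this marker's aliases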
($f, $t, $m, $idArray[0]) = 0;
my @ids;
chomp; ($a, $b, $aliases, @rest) = split /\|/;
if ($verbose > 3) { printf "aliases $aliases \n"; }
@alias = split /;/, $aliases;
ALIAS: foreach $s (@alias) {
if ($s =~ /[\D]+/) {
if ($verbose > 5) { printf "this $s \n"; }
if (exists($aliasHash{$s}))
{
@idArray = @{$aliasHash{$s}};
}
if ($idArray[0]) {
$f = 1; $t = $s; @ids = @idArray;
if ($verbose) { printf "this $s found $m \n"; }
last ALIAS;
}
}
}
if ($f)
{
my @sNames = split(/;/, $b);
foreach my $sn (@sNames)
{
foreach my $i (@ids)
{
printf "$sn\t$i\n";
}
}
}
}
close MARKERS;
'_EOF_'
chmod +x find_markers3.pl
# download latest version of UniSTS (2006-10-26)
ssh kkstore02
mkdir -p /cluster/store5/sts.2006-10-26
ln -s /cluster/store5/sts.2006-10-26 /cluster/data/ncbi
cd /cluster/data/ncbi/sts.2006-10-26
wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
mkdir -p /cluster/store5/UniSTS.2006-10-26
ln -s /cluster/store5/UniSTS.2006-10-26 /cluster/data/ncbi
cd /cluster/data/ncbi/UniSTS.2006-10-26
wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
wget --timestamp -r -l1 \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio/
mv \
/cluster/data/ncbi/UniSTS.2006-10-26/ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio \
/cluster/data/ncbi/UniSTS.2006-10-26
rm -r /cluster/data/ncbi/UniSTS.2006-10-26/ftp.ncbi.nih.gov
# then back to danRer4 BAC ends tables:
ssh kkstore04
cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases
# change internal names in files to have CHORI73 instead of zH to
# keep names the same as those used in the BAC end tables.
perl -pi.bak -e 's/zH([0-9]+)/CHORI73_$1/' *.27.10.06.txt
perl find_markers3.pl /cluster/data/ncbi/UniSTS.2006-10-26/UniSTS.aliases \
markers.27.10.06.txt > sangerandUniSTSId.txt
# Need to sort and uniq this file since the UniSTS IDs are being
# replicated for each instance of the sanger name in field 2 of the
# markers file. In some cases the sanger name is replicated.
sort sangerandUniSTSId.txt | uniq > sangerandUniSTSId.uniq
# No need to reformat this for zfishBacClonesandSts
# FPC contig information (i.e. FPC contig number) from ctgnames file is
# not included in the tables as these are dynamic and constantly
# changing with the assembly.
# bacs.names.uniq has the list of BACS in this track
# Get accessions for BAC clones from Genbank (as for danRer3)
# go to http://www.ncbi.nlm.nih.gov
# 1) select "Nucleotide" as the search database.
# 2) Search string:
# Danio rerio[ORGN] AND clone[TITL] NOT survey[TITL]
# Requiring "BAC" in the record seems to exclude some of the BAC clones
# (as well as other types of sequence), so "BAC" was not included in the
# search term.
# Those sequences with "genomic survey" in the title appear to be
# BAC clone end accessions. Here, we want only BAC clone accessions.
# 3) There are 1148560 sequences. (2006-10-27). Select File from Send To
# pulldown menu and name file "BACClones.gbAccs.txt".
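# (The search above was run interactively through the Entrez web interface.
# For reference only, a scripted equivalent could use NCBI E-utilities; this
# was not part of the build and the record count changes over time:)
#   wget -O BACClones.esearch.xml \
#     'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=Danio+rerio[ORGN]+AND+clone[TITL]+NOT+survey[TITL]&usehistory=y'
#   # then fetch the records with efetch.fcgi using the returned
#   # WebEnv/query_key and save the text output as BACClones.gbAccs.txt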
# use script from danRer3 to parse out clone ID and the accession:
cat << '_EOF_' > getAccsandIdsFromGb.pl
#!/usr/bin/perl -w
use strict;
my @clonePrefixes = ("CH211-", "ch211-", "DKEY-", "DKEYP-", "RP71-", "BUSM1-", "CH73-", "CHORI-");
my %cloneHash = qw {
CH211- zC
DKEY- zK
DKEYP- zKp
RP71- bZ
BUSM1- dZ
CH73- CHORI73_
};
my $found = "FALSE";
my $acc = "";
my $id = "";
while (<STDIN>)
{
my ($l, @f, $intId, $extPref, $intPref);
$intPref = "";
$extPref = "";
chomp;
$l = $_;
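# record header lines of the saved Entrez output look like "1: AC024175";
# capture the accession from them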
if ($l =~ /^[0-9]+:\s+([A-Z]+[0-9]{3,})/)
{
$acc = "";
$acc = $1;
$found = "FALSE";
}
elsif ($l =~ /clone/)
{
$id = "";
# check for clone name in this line
foreach my $p (@clonePrefixes)
{
if ($l =~ /clone:?\s?($p[0-9-A-Za-z]+)/)
{
$id = $1;
# translate to upper case
$id =~ tr/a-z/A-Z/;
$extPref = $p;
$found = "TRUE";
}
}
}
if ($found eq "TRUE")
{
if (exists($cloneHash{$extPref}))
{
$intPref = $cloneHash{$extPref};
}
$intId = $id;
# translate this to internal ID
$intId =~ s/$extPref/$intPref/;
print "$intId\t$acc\t$id\n";
$found = "FALSE";
}
}
'_EOF_'
chmod +x getAccsandIdsFromGb.pl
nice perl getAccsandIdsFromGb.pl < BACClones.gbAccs.txt \
> BACClonesIdsandAccs.txt &
# Took about 1 minute
# compare the BAC clones for which accessions were found to those
# for danRer3:
awk '{print $3}' BACClonesIdsandAccs.txt | sort | uniq \
> clonesWithAccs.dr4
awk '{print $3}' \
/cluster/data/danRer3/bed/bacends/bacends.1/BACClonesIdsandAccs.txt \
| sort | uniq > clonesWithAccs.dr3
comm -13 clonesWithAccs.dr4 clonesWithAccs.dr3
# DKEY-188F22
# DKEY-30O13
# Checked these out for searching for each in the Nucleotide database
# at Genbank. DKEY-30O13 only has accessions for the
# end sequences. DKEY-188F22 has an accession for the BAC clone: AP007256
# For some reason this was not found by the search.
# Add this to list:
echo "zK188F22\tAP007256\tDKEY-188F22" >> BACClonesIdsandAccs.txt
# use zfishBacClonesandSts to create tab files for loading into
# bacCloneAlias and bacCloneXRef tables
# make output directory
mkdir out
# Asked Sanger for another version of the file with the Sanger sts aliases
# instead of these numbers in the second field of the markers file.
# (2006-10-26). Received new file (2006-10-27)
# Increased NUMSANGER from 5 to 40 and MAXSANGER from 50 to 60
# because there are multiple occurrences of Sanger names in the second
# field of the markers file and this can be quite a long list.
# clonemarkers file now has 0 for relationship where before it was blank.
# Change this back to blank, otherwise it is processed incorrectly.
perl -pi.bak -e 's/\|0/\|/' clonemarkers.27.10.06.txt
nice $HOME/bin/x86_64/zfishBacClonesandSts ctgnames.27.10.06.txt \
clonemarkers.27.10.06.txt markers.27.10.06.txt \
bacClones.namesandchrom.uniq BACClonesIdsandAccs.txt \
sangerandUniSTSId.uniq ./out > ./out/zfishBacs.out &
# output is in out directory so copy over
cp ./out/*.tab .
# sort alias tab file by sangerName
wc -l *.tab
# 120211 bacAlias.tab
# 507274 bacXRef.tab
# make sure there are no replicate lines:
# also sort alias tab file by sangerName
sort bacAlias.tab | uniq | sort -k2 > bacAlias.sort.tab.uniq
sort bacXRef.tab | uniq > bacXRef.tab.uniq
wc -l bac*.tab.uniq
# 58758 bacAlias.sort.tab.uniq
# 353042 bacXRef.tab.uniq
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases
hgsql -e 'drop table bacCloneAlias;' danRer4
hgsql -e 'drop table bacCloneXRef;' danRer4
hgLoadSqlTab danRer4 bacCloneAlias \
$HOME/kent/src/hg/lib/bacCloneAlias.sql bacAlias.sort.tab.uniq
hgLoadSqlTab danRer4 bacCloneXRef \
$HOME/kent/src/hg/lib/bacCloneXRef.sql bacXRef.tab.uniq
###########################################################################
# BACENDS: TESTING OF bacCloneAlias AND bacCloneXRef TABLES
# (DONE, 2006-10-27, hartera)
# The following tests were carried out to check that all the data
# in the bacCloneAlias and bacCloneXRef tables is correct.
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases
mkdir -p testTables
cd testTables
# copy scripts over from danRer3:
cp /cluster/data/danRer3/bed/bacends/cloneandStsAliases/getName*.pl .
cp /cluster/data/danRer3/bed/bacends/cloneandStsAliases/getSanger*.pl .
cp /cluster/data/danRer3/bed/bacends/cloneandStsAliases/formatUniSts.pl .
# scripts were created for danRer2 - see danRer2.txt
# Check that the correct aliases are associated with their Sanger STS names
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \
../markers.27.10.06.txt > sNameandaliases
# use script to get one Sanger name and one alias on each line
perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format
sort sNameandaliases.format | uniq > sNameandaliases.sort
# get Sanger names and aliases from database
hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer4 \
| sort | uniq > alias.db.sort
wc -l alias.db.sort
# 58758 alias.db.sort
diff sNameandaliases.sort alias.db.sort
# No difference between data file and data from database so ok
# Check Sanger STS names correspond in bacAlias and bacCloneXRef tables
# get Sanger names from alias table
hgsql -N -e 'select sangerName from bacCloneAlias;' danRer4 \
| sort | uniq > sName.alias.sort
wc -l sName.alias.sort
# 15595 sName.alias.sort
# get Sanger names from xRef table
hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \
is not null;' danRer4 | sort | uniq > sName.xRef.sort
wc -l sName.xRef.sort
# 15946 sName.xRef.sort
comm -23 sName.alias.sort sName.xRef.sort
# nothing unique to alias file so all Sanger names in the alias table are
# also in the xRef table
comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias
wc -l sNamexRefNotAlias
# 351 sNamexRefNotAlias
awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.27.10.06.txt | sort | uniq \
> clonemarkers.sNames.sort
# get Sanger names from markers file
awk 'BEGIN {FS="|"}{print $2}' ../markers.27.10.06.txt > markers.sNames
# remove semi-colons and sort
sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort
# sanger names unique to markers file
comm -13 clonemarkers.sNames.sort markers.sNames.sort
# there are none
comm -23 clonemarkers.sNames.sort markers.sNames.sort \
> sNames.clonemarkersOnly
wc -l sNames.clonemarkersOnly
# 351 sNames.clonemarkersOnly
diff sNames.clonemarkersOnly sNamexRefNotAlias
# No difference so all the extra Sanger Names in the xRef
# table are from the clonemarkers file and these have no aliases in
# the markers file so they are not in the alias table so this is all ok.
# Check that Sanger STS names and primers are associated correctly
cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases/testTables
# get sanger names and primers from markers file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \
../markers.27.10.06.txt > sNameandPrimers
# use script to reformat and write with one Sanger name per line
chmod +x getSangerandPrimers.pl
perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format
# Need to sort and uniq due to multiple occurrences of the same
# Sanger name in some lines of the markers file.
sort sNameandPrimers.format | uniq > sNameandPrimers.format.sort
wc -l sNameandPrim*
# 12407 sNameandPrimers
# 32098 sNameandPrimers.format
# 15595 sNameandPrimers.format.sort
# get Sanger names and primers from database
hgsql -N -e \
'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \
where sangerName is not null and leftPrimer is not null and \
rightPrimer is not null;' danRer4 | sort | uniq \
> sNamesandprimers.fromdb.sort
wc -l sNamesandprimers.fromdb.sort
# 15595 sNamesandprimers.fromdb.sort
diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort
# No difference so ok.
# Check that UniSTS IDs and Sanger STS names are associated correctly
# get Sanger names and UniSTS IDs from the database
hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \
uniStsId is not null;' danRer4 | sort | uniq > sNameUniSTS.fromdb.sort
wc -l sNameUniSTS.fromdb.sort
# 5699 sNameUniSTS.fromdb.sort
# Need to reformat the sNameUniSTS.fromdb.sort
chmod +x formatUniSts.pl
perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \
> sNameUniSTS.fromdb.format.sort
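# (formatUniSts.pl was copied from danRer3 above and is not reproduced here;
# assuming it just expands the comma-separated ID list in column 2 into one
# "name<TAB>id" pair per line, it is roughly equivalent to:
#   perl -lane 'print "$F[0]\t$_" foreach split(/,/, $F[1]);' \
#       sNameUniSTS.fromdb.sort | sort > sNameUniSTS.fromdb.format.sort )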
# get Sanger names from data file and see how many UniSTS IDs there are
# for each name
awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \
> sangerandUniSTSId.count
# the most is 160 - this is high due to replicate occurrences of sanger
# STS names (sangerName) in the markers file. Replicates are removed
# during processing.
# 160 etID9511.14
# 132 etID8743.18
# 124 etID9682.15
# 124 etID9681.15
# 96 etID10372.18
# 84 etID8170.14
# 76 etID10495.5
# 66 etID9328.14
# 56 etID9708.3
# use uniq'd file used to create database tables.
sort ../sangerandUniSTSId.uniq > sangerandUniSTSId.txt.sort
diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort
# No difference between data from original file and that in database so ok
# Check that chrom mappings and external BAC clone names are correct
# get extNames and chroms they map to from the database
hgsql -N -e 'select name, chroms from bacCloneXRef where \
chroms is not null;' danRer4 | sort | uniq \
> nameandchromsfromdb.sort
# reformat nameandchromsfromdb.sort
perl formatUniSts.pl < nameandchromsfromdb.sort | sort \
> nameandchromsfromdb.format.sort
# compare extNames and chroms from db to those in data file
cp ../bacClones.namesandchrom .
sort -u bacClones.namesandchrom > bacClones.namesandchrom.uniq
diff bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort
# no difference - all ok
# Check Genbank accessions and internal BAC clone names
hgsql -N -e 'select intName,genbank from bacCloneXRef where \
genbank is not null;' danRer4 | sort | uniq \
> intNamesandAccs.fromdb.sort
# this should be a subset of BACClonesIdsandAccs.txt - not all BAC clones
# listed there appear in either our BAC ends tracks or the markers files.
awk 'BEGIN {OFS="\t"} {print $1,$2}' ../BACClonesIdsandAccs.txt \
| sort -u > BACClonesIntandAccs.sort
comm -23 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort
# there is nothing in the database that is not in BACClonesIntandAccs.sort
comm -13 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort \
> onlyinzfishAccs
wc -l onlyinzfishAccs
# 86 onlyinzfishAccs
hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' \
danRer4 | sort | uniq > intNamesNoAcc.fromdb.sort
awk '{print $1;}' BACClonesIntandAccs.sort > intNames.withAccs.sort
comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \
> indbNoAccsandAccs.out
# none of these names are common to both so all accessions from
# BACClonesIdsandAccs.txt are in the database for the internal names stored
# where there are accessions available.
# Test Sanger STS names, internal names and external names are all correct
# Test Sanger STS name and internal BAC clone names are associated correctly
# get internal names and Sanger names from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.27.10.06.txt \
| sort | uniq > intNameandSanger.sort
hgsql -N -e 'select intName, sangerName from bacCloneXRef \
where sangerName is not null;' danRer4 \
| sort | uniq > intNameandSanger.fromdb.sort
diff intNameandSanger.sort intNameandSanger.fromdb.sort
# No difference between data from file and that from database so ok
# Check BAC clone internal name and relationship fields
# get internal names and relationships from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.27.10.06.txt \
| sort | uniq > intNameandRelation.sort
# get internal names and relationships from database, some internal names
# may have different relationships associated with each internal name
# and Sanger sts name pair
hgsql -N -e 'select intName, relationship from bacCloneXRef \
where relationship != 0;' danRer4 \
| sort | uniq > intNameandrelation.fromdb.sort
# differences unique to database file
comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \
> intNameRelation.indbonly
# differences unique to data file
comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \
> intNameRelation.incloneMarkersonly
wc -l intNameRelation*
# 5051 intNameRelation.incloneMarkersonly
# 5051 intNameRelation.indbonly
awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names
awk '{print $1}' intNameRelation.incloneMarkersonly \
> intNameRelation.incloneMarkersonly.names
diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names
# there is no difference in the internal names with relationship fields.
# The names match, and the only place these files should differ is that
# the second column should all be 3 in the data from the database only.
# This is because all the relationship entries that were blank in the
# clonemarkers file were changed to 3 when entered into the database.
awk '{print $2}' intNameRelation.indbonly | sort | uniq
# 3 - correct so all ok
# all the differences should be that those that are blank in clonemarkers
# are 3 in the database.
# check that those that have 0 in the database bacCloneXRef relationship
# field are not in the list from cloneMarkers
# select these internal names with 0 relationship from the database
hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
danRer4 | sort | uniq > intNameNoRelation.fromdb.sort
# get all the internal names from the data file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.10.06.txt \
| sort | uniq > intNamefromCloneMarkers.sort
comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
# nothing in common between these two files as expected so there are
# no internal names in the db with 0 in the relationship field that
# appear in the clonemarkers file.
# Check all BAC clone internal names and external names from the
# ctgnames file are in the database
# get intName and extName from ctgnames file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.27.10.06.txt \
| sort | uniq > intNameandextNamefromCtgNames.sort
# get intName and extName from database
hgsql -N -e 'select intName,name from bacCloneXRef;' danRer4 \
| sort | uniq > intNameandextName.fromdb.sort
wc -l intNameandextName*
# 334890 intNameandextName.fromdb.sort
# 168828 intNameandextNamefromCtgNames.sort
comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
> intandextindbAndCtgNames
wc -l intandextindbAndCtgNames
# 168828 intandextindbAndCtgNames
# there are 168828 name pairs common between the file and the database
# and this is the same number of name pairs as in the data file
diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort
# no difference between those name pairs from the data file and those that
# are common between the data file and the database so all internal and
# external names from ctgNames file are in the database
# get the list of extra ones from db
comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
> intandextNamesindbNotinCtgNames
wc -l intandextNamesindbNotinCtgNames
# 166062 intandextNamesindbNotinCtgNames
# get list of internal names from the clonemarkers file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.10.06.txt | sort | uniq \
> clonemarkers.intName.sort
wc -l clonemarkers.intName.sort
# 14460 clonemarkers.intName.sort
# compare these intNames to those from the database not in the ctgnames file
comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames
# none of these clone markers internal names are in this list so they
# must all be in the ctgnames file too. These extra internal names will be
# translations of external names found in the list of mappings of BAC clones
# to chroms.
# Check that all the BAC clone external names from the list of chromosome
# mappings and from the ctgnames file are in the database.
# get all extNames from bacClones.namesandchrom.uniq and from ctgnames
awk '{print $1}' ../bacClones.namesandchrom.uniq > \
extNames.ctgnamesandbacClones
awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.27.10.06.txt \
>> extNames.ctgnamesandbacClones
wc -l extNames.ctgnamesandbacClones
# 474907 extNames.ctgnamesandbacClones
sort extNames.ctgnamesandbacClones | uniq \
> extNames.ctgnamesandbacClones.sort
wc -l extNames.ctgnamesandbacClones.sort
# 334890 extNames.ctgnamesandbacClones.sort
# get extNames from the database
hgsql -N -e 'select name from bacCloneXRef;' danRer4 | sort | uniq \
> extNames.fromdb.sort
wc -l extNames.fromdb.sort
# 334890 extNames.fromdb.sort
comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \
> extNames.fromdbandfiles
wc -l extNames.fromdbandfiles
# 334890 extNames.fromdbandfiles
# find extNames in common from data files and database
diff extNames.fromdb.sort extNames.fromdbandfiles
# no difference, all extNames from files are in db
# Check that all BAC clone internal names from the ctgnames and clonemarkers
# files are in the database
# get internal names from ctgnames and clonemarkers files
awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.27.10.06.txt \
> intNames.ctgnamesandclonemarkers
awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.27.10.06.txt \
>> intNames.ctgnamesandclonemarkers
wc -l intNames.ctgnamesandclonemarkers
# 201440 intNames.ctgnamesandclonemarkers
sort intNames.ctgnamesandclonemarkers | uniq \
> intNames.ctgnamesandclonemarkers.sort
wc -l intNames.ctgnamesandclonemarkers.sort
# 168828 intNames.ctgnamesandclonemarkers.sort
# get internal names from database
hgsql -N -e 'select intName from bacCloneXRef;' danRer4 | sort | uniq \
> intNames.fromdb.sort
wc -l intNames.fromdb.sort
# 334890 intNames.fromdb.sort
# some of these intNames are derived from the corresponding extNames
# all of the intNames from the file should be in the db
comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \
> intNames.fromdbandfiles
wc -l intNames.fromdbandfiles
# 168828 intNames.fromdbandfiles
comm -13 intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
comm -23 intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
# no difference, all intNames from files are in db
# Check that all translations are correct between BAC clone
# external and internal names.
# write script to get the prefixes from internal and external names
chmod +x getNamePrefixes.pl
hgsql -N -e 'select name, intName from bacCloneXRef;' danRer4 \
| sort | uniq > extandintNames.fromdb.sort
perl getNamePrefixes.pl < extandintNames.fromdb.sort \
> extandintNames.prefixes
sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq
# these all look good
# BUSM1 dZ
# CH211 zC
# CH211 zc
# CH73 CHORI
# CT7 bP
# DKEY zK
# DKEY zk
# DKEYP zKp
# RP71 bZ
# XX bY
# zk is an internal name prefix for the external name prefix, DKEY-. There
# is only one example where this is used (DKEY-81G7); it is in the
# ctgnames file and in the bacCloneXRef table so that is ok.
# All data looks good in these tables now.
###########################################################################
# SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM OTHER DANRER ASSEMBLIES
# (DONE, 2006-06-27, hartera)
# ADD TO SAN FOR PK RUNS (DONE, 2006-05-30, hartera)
ssh kkr3u00
# change script to do this and only rsync to 4,5,6,7, and 8 as
# kkr1u00 and kkr2u00 are down.
cd /cluster/data/danRer4/bed
mkdir -p liftOver
cd liftOver
# commented out the lines in the local copy that make the script abort if
# kkr1u00 is not used; can not connect to kkr1u00 at the moment.
~/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh danRer4 \
/cluster/data/danRer4/nib >&! split.log &
# rsync didn't work properly so do manually
foreach R (4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/
end
ssh kk
# add split10k to san for pk runs (2006-05-30, hartera)
rsync -a --progress /iscratch/i/danRer4/split10k \
/san/sanvol1/scratch/danRer4/
###########################################################################
# LIFTOVER CHAINS TO DANRER3 (DONE, 2006-05-30 - 2006-05-31, hartera)
# Split (using makeLoChain-split) of danRer3 is doc'ed in makeDanRer3.doc
# Do what makeLoChain-split says to do next (start blat alignment)
# Took too long on kk. Try pk. Scripts only run on kk so run manually.
ssh pk
mkdir -p /cluster/data/danRer4/bed/liftOver
cd /cluster/data/danRer4/bed/liftOver
cat << '_EOF_' > align.csh
#!/bin/csh -fe
set oldAssembly = $1
set oldNibDir = $2
set newAssembly = $3
set newSplitDir = $4
set ooc = $5
if ("$ooc" != "") then
set ooc = '-ooc='$ooc
endif
set blatDir = /cluster/data/$oldAssembly/bed/blat.$newAssembly.`date +%Y-%m-%d`
echo "Setting up blat in $blatDir"
rm -fr $blatDir
mkdir $blatDir
cd $blatDir
mkdir raw psl run
cd run
echo '#LOOP' > gsub
echo 'blat $(path1) $(path2) {check out line+ ../raw/$(root1)_$(root2).psl} ' \
'-tileSize=11 '$ooc' -minScore=100 -minIdentity=98 -fastMap' \
>> gsub
echo '#ENDLOOP' >> gsub
# target
ls -1S $oldNibDir/*.{nib,2bit} > old.lst
# query
ls -1S $newSplitDir/*.{nib,fa} > new.lst
gensub2 old.lst new.lst gsub spec
/parasol/bin/para create spec
echo ""
echo "First two lines of para spec:"
head -2 spec
echo ""
echo "DO THIS NEXT:"
echo " cd $blatDir/run"
echo " para try, check, push, check, ..."
echo ""
exit 0
'_EOF_'
# << emacs
chmod +x align.csh
align.csh danRer4 /san/sanvol1/scratch/danRer4/nib danRer3 \
/san/sanvol1/scratch/danRer3/split10k \
/san/sanvol1/scratch/danRer3/danRer3_11.ooc >&! align.log &
# Took a few seconds.
# Do what its output says to do next (start cluster job)
cd /cluster/data/danRer4/bed/blat.danRer3.2006-05-30/run
para try, check, push, check, ...
para time >&! run.time
# Completed: 784 of 784 jobs
# CPU time in finished jobs: 1482693s 24711.54m 411.86h 17.16d 0.047 y
# IO & Wait Time: 2873s 47.89m 0.80h 0.03d 0.000 y
# Average job time: 1895s 31.58m 0.53h 0.02d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 11350s 189.17m 3.15h 0.13d
# Submission to last job: 13914s 231.90m 3.87h 0.16d
ssh pk
cd /cluster/data/danRer4/bed/liftOver
cat << '_EOF_' > lift.csh
#!/bin/csh -ef
set oldAssembly = $1
set newAssembly = $2
set newLiftDir = /san/sanvol1/scratch/$newAssembly/split10k
set prefix = /cluster/data/$oldAssembly/bed/blat.$newAssembly
set blatDir = `ls -td $prefix.20* | head -1`
echo "using dir $blatDir"
if ( ! -e $blatDir/raw ) then
echo "Can't find $blatDir/raw"
exit 1
endif
if (`ls -1 $newLiftDir/*.lft | wc -l` < 1) then
echo "Can't find any .lft files in $newLiftDir"
exit 1
endif
cd $blatDir/raw
foreach chr (`awk '{print $1;}' /cluster/data/$newAssembly/chrom.sizes`)
echo $chr
liftUp -pslQ ../psl/$chr.psl $newLiftDir/$chr.lft warn chr*_$chr.psl
end
set execDir = $0:h
echo ""
echo "DO THIS NEXT:"
echo " ssh pk"
echo " $execDir/makeLoChain-chain $oldAssembly <$oldAssembly-nibdir> $newAssembly <$newAssembly-nibdir>"
echo ""
exit 0
'_EOF_'
# << emacs
chmod +x lift.csh
lift.csh danRer4 danRer3 >&! lift.log &
# makeLoChain-chain can be run on pk. chain alignments
makeLoChain-chain danRer4 /san/sanvol1/scratch/danRer4/nib \
danRer3 /san/sanvol1/scratch/danRer3/nib >&! chain.log &
cd /cluster/data/danRer4/bed/blat.danRer3.2006-05-30/chainRun
para try, check, push, check, ...
para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 4030s 67.16m 1.12h 0.05d 0.000 y
# IO & Wait Time: 939s 15.66m 0.26h 0.01d 0.000 y
# Average job time: 177s 2.96m 0.05h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 797s 13.28m 0.22h 0.01d
# Submission to last job: 953s 15.88m 0.26h 0.01d
# net alignment chains
ssh kkstore04
cd /cluster/data/danRer4/bed/liftOver
makeLoChain-net danRer4 danRer3 >&! net.log &
# load reference to over.chain into database table,
# and create symlinks /gbdb and download area
ssh hgwdev
cd /cluster/data/danRer4/bed/liftOver
makeLoChain-load danRer4 danRer3 >&! load.log &
# clean up
rm *.log
# add md5sum.txt to include this new liftOver file
cd /usr/local/apache/htdocs/goldenPath/danRer4/liftOver
md5sum *.gz > md5sum.txt
# copy README.txt from another liftOver directory.
# test by converting a region using the "convert" link on
# the browser, and comparing to blat of the same region
###########################################################################
# PRODUCING GENSCAN PREDICTIONS (DONE, 2006-05-27, hartera)
# Use scaffolds for random chroms to avoid getting false predictions
# spanning scaffolds in chrNA_random and chrUn_random.
ssh kkstore04
cd /cluster/data/danRer4
# already have a file of soft-masked scaffolds for chrNA_random and
# chrUn_random. Use this to create hard-masked scaffolds FASTA file
# for Genscan run.
foreach c (NA_random Un_random)
cd /cluster/data/danRer4/$c
mkdir scaffoldsHardMask
echo "Hard-masking scaffolds for $c ..."
cd scaffoldsSoftMask
foreach f (*.fa)
maskOutFa $f hard ../scaffoldsHardMask/${f}.masked
end
end
ssh hgwdev
mkdir /cluster/data/danRer4/bed/genscan
cd /cluster/data/danRer4/bed/genscan
cvs co hg3rdParty/genscanlinux
ssh pk
cd /cluster/data/danRer4/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
cp /dev/null genome.list
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
echo $c
if (($c == "NA_random") || ($c == "Un_random")) then
foreach s (/cluster/data/danRer4/${c}/scaffoldsHardMask/Zv6_*.fa.masked)
egrep '[ACGT]' $s > /dev/null
if ($status == 0) echo $s >> genome.list
end
else
foreach f ( `ls -1S /cluster/data/danRer4/$c/chr*_*/chr*_?{,?}.fa.masked` )
egrep '[ACGT]' $f > /dev/null
if ($status == 0) echo $f >> genome.list
end
endif
end
wc -l genome.list
# 3237 genome.list
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para create jobList
para try, check, push, check ... etc.
para time
# Completed: 3236 of 3237 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 46601s 776.69m 12.94h 0.54d 0.001 y
# IO & Wait Time: 10409s 173.48m 2.89h 0.12d 0.000 y
# Average job time: 18s 0.29m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 363s 6.05m 0.10h 0.00d
# Submission to last job: 445s 7.42m 0.12h 0.01d
# If there are crashes, diagnose with "para problems" / "para crashed".
# If a job crashes due to genscan running out of memory, re-run it
# manually with "-window=1200000" instead of "-window=2400000".
para problems > problems
nice /cluster/bin/x86_64/gsBig /cluster/data/danRer4/8/chr8_5/chr8_5.fa.masked gtf/chr8_5.fa.gtf -trans=pep/chr8_5.fa.pep -subopt=subopt/chr8_5.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 >& chr8_5.fa.log &
# Took about 5 minutes to run
# check log and then remove it
rm chr8_5.fa.log
ssh kkstore04
cd /cluster/data/danRer4/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/danRer4/bed/genscan
ldHgGene danRer4 genscan genscan.gtf
# Read 44534 transcripts in 325488 lines in 1 files
# 44534 groups 28 seqs 1 sources 1 feature types
# 44534 gene predictions
hgPepPred danRer4 generic genscanPep genscan.pep
hgLoadBed danRer4 genscanSubopt genscanSubopt.bed
# Loaded 332782 elements of size 6
# compare to other assemblies:
featureBits danRer4 genscan
# 64448019 bases of 1626093931 (3.963%) in intersection
featureBits rn4 genscan
# 54781052 bases of 2571531505 (2.130%) in intersection
featureBits monDom4 genscan
# 45991425 bases of 3501643220 (1.313%) in intersection
featureBits tetNig1 genscan
# 30459626 bases of 342403326 (8.896%) in intersection
featureBits -chrom=chr1 danRer4 refGene genscan -enrichment
# refGene 1.129%, genscan 4.195%, both 0.653%, cover 57.80%, enrich 13.78x
# check CDS only
featureBits -chrom=chr1 danRer4 refGene:cds genscan:cds -enrichment
# refGene:cds 0.746%, genscan:cds 4.195%, both 0.631%, cover 84.52%,
# enrich 20.15x
###########################################################################
# BLASTZ/CHAIN/NET GALGAL3 (DONE 5/30/06 angie)
ssh pk
mkdir /cluster/data/danRer4/bed/blastz.galGal3.2006-05-30
cd /cluster/data/danRer4/bed/blastz.galGal3.2006-05-30
cat << '_EOF_' > DEF
# zebrafish vs. chicken
BLASTZ=/cluster/bin/penn/i386/blastz
# Use same params as used for danRer1-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Zebrafish danRer4
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/danRer4/liftNAandUnScaffoldsToChrom.lft
SEQ1_LEN=/cluster/data/danRer4/chrom.sizes
SEQ1_CTGLEN=/san/sanvol1/scratch/danRer4/chromsUnNAScafs.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Chicken galGal3 - single chunk big enough to run whole chroms
SEQ2_DIR=/san/sanvol1/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/danRer4/bed/blastz.galGal3.2006-05-30
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/scratch/danRer4GalGal3 \
-bigClusterHub=pk -smallClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose DEF \
>& do.log & tail -f do.log
ln -s blastz.galGal3.2006-05-30 /cluster/data/danRer4/bed/blastz.galGal3
###########################################################################
# CREATE MICROARRAY DATA TRACK BY ADDING ZON LAB WILD TYPE MICROARRAY DATA TO
# AFFY ZEBRAFISH ALIGNMENTS (DONE, 2006-06-10, hartera)
# UPDATE ARRAY DATA TRACK AFTER PROCESSING ARRAY DATA DIFFERENTLY AND
# RELOADING INTO hgFixed (see hgFixed.txt for details).
# (DONE, 2006-10-20, hartera)
# UPDATE ARRAY DATA TRACK AFTER REPROCESSING ARRAY DATA TO ANTILOG THE LOG2
# VALUES FROM NORMALISATION TO GET THE ABSOLUTE VALUES AND
# RELOADING INTO hgFixed (see hgFixed.txt for details).
# (DONE, 2007-01-08, hartera)
# RE-ORDERED DISPLAY IN TRACK - see ZON LAB WILD TYPE MICROARRAY DATA section
# in danRer3.txt make doc. (DONE, hartera, 2007-04-09)
# Array data is for whole embryos of five wild type zebrafish strains.
# Data is in hgFixed (see hgFixed.doc) - from Len Zon's lab at Children's
# Hospital Boston. Contact: adibiase@enders.tch.harvard.edu
ssh hgwdev
mkdir /cluster/data/danRer4/bed/ZonLab/wtArray
cd /cluster/data/danRer4/bed/ZonLab/wtArray
# use AllRatio table for mapping. There are not many arrays in this
# dataset so using AllRatio will allow the selection of All Arrays
# from the track controls on the track description page. Also set up the
# Zebrafish microarrayGroups.ra so that the Medians of replicates or
# Means of replicates can also be selected for display.
# Create mapped data in zebrafishZonWT.bed.
# (for the later updates, first remove the bed file and drop the table
# from the previous load)
rm zebrafishZonWT.bed
hgsql -e 'drop table affyZonWildType;' danRer4
hgMapMicroarray zebrafishZonWT.bed hgFixed.zebrafishZonWTAllRatio \
/cluster/data/danRer4/bed/affyZebrafish/affyZebrafish.psl
# Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTMedian
# Mapped 14952, multiply-mapped 3867, missed 0, unmapped 665
hgLoadBed danRer4 affyZonWildType zebrafishZonWT.bed
# Loaded 18819 elements of size 15
# add trackDb.ra entry at trackDb/zebrafish level
# look at range of scores:
hgsql -N -e 'select expScores from zebrafishZonWTAllRatio;' hgFixed \
> ratioExps.out
perl -pi.bak -e 's/,/\n/g' ratioExps.out
sort ratioExps.out | uniq -c > ratioExps.uniq.count
textHistogram -binSize=0.5 -real -maxBinCount=40 -minVal=-10 \
ratioExps.out > expRatios.hist
# Most values are between -3 and +2.
# Therefore use the following trackDb entry:
# track affyZonWildType
# shortLabel Wild Type Array
# longLabel Zon Lab Expression data for Wild Type Zebrafish strains
# group regulation
# priority 80
# visibility hide
# type expRatio
# expScale 2.0
# expStep 0.2
# groupings affyZonWildTypeGroups
# The .ra file in /usr/local/apache/cgi-bin/hgCgiData/Zebrafish
# (from ~/kent/src/hg/makeDb/hgCgiData/Zebrafish in the source tree)
# which is microarrayGroups.ra defines how the array data is
# displayed and also grouped for the Medians and Means of Replicates.
# It also defines the labels for the track controls for showing
# All Arrays, Arrays Grouped By Replicate Means or
# Arrays Grouped By Replicate Medians. This is in the description field.
# RE-ORDERED DISPLAY IN TRACK - see danRer3.txt make doc
# (hartera, 2007-04-09)
# 14 somites and 15 somites should come before 36 hpf
# 14-19 somites stage is 16-19h.
# from hgFixed.zebrafishZonWTAllExps
# for AB, 0-8 should go after 14,
# for TL, 16-22 should go after 24
# for TU, 25-27 should go after 32
# re-order accordingly in the config file:
# ~/kent/src/hg/makeDb/hgCgiData/Zebrafish/microarrayGroups.ra
###########################################################################
# HUMAN ORTHOLOGS ADDED TO AFFY ZEBRAFISH TRACK DETAILS
# (DONE, 2006-06-08, hartera)
# Human orthologs were mapped to Affy Zebrafish probes by
# Tony DiBiase (adibiase@enders.tch.harvard.edu) from Len Zon's group
# at Children's Hospital, Boston. They map to human hg16.
ssh kkstore04
mkdir -p /cluster/data/danRer4/bed/affyZebrafish/humanOrthologs
cd /cluster/data/danRer4/bed/affyZebrafish/humanOrthologs
sed -e 's/"//g' cumuList.gedi.2005oct12.txt > hg16Orthologs.txt
awk \
'BEGIN {FS="\t"} {OFS="\t"} {if ($2 == $1) print $1,"",""; else print;}' \
hg16Orthologs.txt > hg16Orthologs.tab
# create a table definition for this set:
cat << 'EOF' > orthologs.sql
# Link together an item with an ortholog
CREATE TABLE affyToHg16Orthologs (
name varchar(255) not null, # Item ID
geneSymbol longblob not null, # Gene Symbol of ortholog
description longblob not null, # Description of ortholog
# Indices
INDEX(name(20)),
INDEX(geneSymbol(20))
);
'EOF'
# load table
ssh hgwdev
cd /cluster/data/danRer4/bed/affyZebrafish/humanOrthologs
hgsql -e 'drop table affyToHg16Orthologs;' danRer4
hgLoadSqlTab danRer4 affyToHg16Orthologs orthologs.sql hg16Orthologs.tab
# edit hgc.c to use this table on the affyZebrafish details page and add
# a search by the human ortholog gene symbol:
# affyZebrafishHg16Ortholog, put in trackDb/zebrafish/trackDb.ra
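# The hgFindSpec search stanza itself is not reproduced here; an xref-style
# search of the following general shape is what is meant (field values are
# illustrative, not copied from the checked-in trackDb.ra):
# searchName affyZebrafishHg16Ortholog
# searchTable affyZebrafish
# searchType psl
# xrefTable affyToHg16Orthologs
# xrefQuery select name,geneSymbol from %s where geneSymbol like '%s%%'
# searchPriority 15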
###########################################################################
# SWAP rn4 BLASTZ CHAIN/NET (DONE, 2006-06-19, hartera)
# See also makeRn4.doc
ssh pk
cd /cluster/data/rn4/bed/blastzDanRer4.2006-06-19
# blastz parameters used in the rn4 vs. danRer4 blastz alignment (see makeRn4.doc):
# BLASTZ_ABRIDGE_REPEATS=0
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_M=50
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& swap.log &
ssh hgwdev
featureBits danRer4 chainRn4Link
# 68978593 bases of 1626093931 (4.242%) in intersection
featureBits danRer4 refGene:cds chainRn4Link -chrom=chr1 -enrichment
# refGene:cds 0.746%, chainRn4Link 4.333%, both 0.564%,
# cover 75.55%, enrich 17.43x
featureBits danRer3 refGene:cds chainRn4Link -chrom=chr1 -enrichment
# refGene:cds 0.786%, chainRn4Link 4.320%, both 0.604%,
# cover 76.87%, enrich 17.80x
featureBits danRer4 refGene:cds netRn4 -chrom=chr1 -enrichment
# refGene:cds 0.746%, netRn4 29.601%,both 0.623%,cover 83.49%,enrich 2.82x
featureBits danRer3 refGene:cds netRn4 -chrom=chr1 -enrichment
# refGene:cds 0.786%, netRn4 33.103%, both 0.671%,cover 85.33%,enrich 2.58x
# Add symbolic link to new swap directory
ssh kkstore04
cd /cluster/data/danRer4/bed
ln -s blastz.rn4.swap blastz.rn4
# Check README.txt for downloads.
#######################################################################
# VEGA GENES (DONE, 2006-08-14 - 2006-08-25, hartera)
# ADD DESCRIPTIONS FOR VEGA GENES (DONE, 2006-09-25 - 2006-09-26, hartera)
# Data provided by Kerstin Howe from Sanger: kj2@sanger.ac.uk
# and also Mario Caccamo: mc2@sanger.ac.uk
ssh kkstore04
mkdir /cluster/data/danRer4/bed/vega
cd /cluster/data/danRer4/bed/vega
wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/kj2/gff/vega_in_ensembl.gff
wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/genes_for_tom_new.txt
# checked the list of genes found in vega_in_ensembl.gff but not in
# genes_for_tom_new.txt (genesWithNoInfo.txt, created below) against the
# later file from Sanger, genes_for_tom_20060725.txt:
grep -f genesWithNoInfo.txt genes_for_tom_20060725.txt
# got a list of 20 that were not in this file: genesWithNoInfo2.txt
# e-mailed Kerstin at Sanger and got the information for these 20 genes:
# moreInfo.txt
# Need to rewrite this file (moreInfo.txt) using tabs - see addTabs.pl below.
# Checked the format for VEGA genes in hg17; it includes an alternate name.
cd /cluster/data/hg17/bed/vega30
# to look at human VEGA
# vegaInfo is transcriptId, otterId, geneId, method and geneDesc
# back in the danRer4 vega directory, pull the transcript/gene ID pairs:
cd /cluster/data/danRer4/bed/vega
awk '{if ((($9 ~ /^ID=OTTDART/) && ($9 ~ /Parent=OTTDARG/)) || \
(($9 ~ /^ID=OTTDART/) && ($9 ~ /Parent=OTTDART/))) print $9;}' \
vega_in_ensembl.gff | sort | uniq > vegaIDs.txt
perl -pi.bak -e 's/ID=//' vegaIDs.txt
# list of transcript ID and corresponding gene ID for Vega
perl -pi.bak -e 's/;Parent=/\t/' vegaIDs.txt
perl -pi.bak -e 's/;Note=Only//' vegaIDs.txt
# write a script to reformat the GFF3 file to GFF format.
# some exon and CDS items belong to more than one transcript ID so these
# lines can just be duplicated. Those items that are labelled as mRNA or
# gene can be ignored and not added to the GFF file. Some of these lines
# have an extra comment e.g. Note=" . These will be ignored anyway as
# they are on the lines with mRNA or gene in them so they will not be in
# the final GFF file.
cat << '_EOF_' > formatGff3ToGff.pl
#!/usr/bin/perl -w
use strict;
my (%idsHash, $gffFile, $idsFile);
$gffFile = $ARGV[0];
open(GFF, $gffFile) || die "Can not open $gffFile\n";
while (<GFF>)
{
my ($line, @f, $t, @trans, $r, $chr);
$line = $_;
if ($line !~ /^#/)
{
@f = split(/\t/, $line);
$chr = "chr" . $f[0];
if (($f[2] ne "gene") && ($f[2] ne "mRNA"))
{
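# the Parent attribute may list several transcript IDs, comma-separated;
# emit one GFF line per parent transcript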
$f[8] =~ /Parent=(OTTDART[0-9]+[A-Z0-9,]+)/;
$t = $1;
@trans = split(/,/, $t);
foreach $r (@trans)
{
print "$chr\t$f[1]\t$f[2]\t$f[3]\t$f[4]\t$f[5]\t$f[6]\t$f[7]\t$r\n";
}
}
}
else
{
# print lines beginning with "#"
print $line;
}
}
close GFF;
'_EOF_'
chmod +x formatGff3ToGff.pl
# Use script to format the GFF3 file to GFF format in order to load
# using ldHgGene
perl formatGff3ToGff.pl vega_in_ensembl.gff > vega.gff
# then use the info file to grab those genes that are pseudogenes, get the
# transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
# to a separate file. Create an info file. Remove the .KNOWN, .NOVEL,
# .PUTATIVE or .PREDICTED suffix from the method column and add it as a separate
# confidence column.
# check number of items on each line: there are 4 or 6.
# Some genes have more than one clone ID in a comma separated list
# so create two files for loading into two tables.
# Found that some of the clone ID fields have comma separated lists
# and for OTTDARG00000006367, there are 30. Therefore create two info
# tables where one is just for clone IDs.
# NOTE: in future, make sure each row of vegaInfoZfish.txt output has
# 8 fields. The pseudogene entries are missing an entry in the
# confidence field so this should be an empty field.
cat << '_EOF_' > formatVegaInfo.pl
#!/usr/bin/perl -w
use strict;
# format Vega additional information into one file for vegaInfoZfish table
# and another for the vegaToCloneIdZfish table which contains the
# geneId and cloneId for each gene since there are multiple clone IDs for
# some of the genes.
my ($idsFile, $infoFile, $outFile1, $outFile2, %idsHash);
$idsFile = $ARGV[0];
$infoFile = $ARGV[1];
$outFile1 = $ARGV[2];
$outFile2 = $ARGV[3];
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
open (OUT1, ">$outFile1") || die "Can not create $outFile1: $!\n";
open (OUT2, ">$outFile2") || die "Can not create $outFile2: $!\n";
open (STDERR, ">info.log") || die "Can not create info.log: $!\n";
while (<IDS>)
{
my ($line, @f);
chomp;
$line = $_;
@f = split(/\t/, $line);
$idsHash{$f[1]} = $f[0];
}
close IDS;
while (<INFO>)
{
my ($line,@fi,$id,$gene,$trans,@transIds, $tr,@clones, $c,@t, $method, $conf);
chomp;
$gene = "";
$line = $_;
@fi = split(/\t/, $line);
$id = $gene = $fi[1];
# get all the transcript IDs for a gene
while (exists($idsHash{$id}))
{
$trans = $idsHash{$id};
push(@transIds, $trans);
$id = $trans;
}
# push clone IDs into an array:
@clones = split(/,/, $fi[2]);
@t = split(/\./, $fi[3]);
$method = $t[0];
if ($#t > 0)
{
$conf = $t[1];
}
elsif ($#t == 0)
{
$conf = "";
}
else
{
print STDERR "Should be 4 or 6 items per row, found $#fi \n";
}
foreach $tr (@transIds)
{
print OUT1 "$tr\t$fi[1]\t$fi[0]";
if ($#fi == 5)
{
print OUT1 "\t$fi[4]\t$fi[5]\t$method\t\t$conf\n";
}
elsif ($#fi == 3)
{
print OUT1 "\t\t\t$method\t\t$conf\n";
}
# print out clone IDs for each transcript
foreach $c (@clones)
{
print OUT2 "$tr\t$c\n";
}
}
if($gene && !exists($idsHash{$gene}))
{
print STDERR "$gene\n";
}
}
close IDS;
close INFO;
close OUT1;
close OUT2;
close STDERR;
'_EOF_'
chmod +x formatVegaInfo.pl
wc -l genes_for_tom_new.txt
# 4822 genes_for_tom_new.txt
awk '{print $2}' genes_for_tom_new.txt | sort | uniq > genesWithInfo.txt
awk '{if ($2 ~ /OTTDARG/) print $2;}' vegaIDs.txt \
| sort | uniq > genesFromGff.txt
wc -l genesFromGff.txt
# 4947 genesFromGff.txt
comm -12 genesWithInfo.txt genesFromGff.txt | wc -l
# 4033
comm -13 genesWithInfo.txt genesFromGff.txt | wc -l
# 914
comm -13 genesWithInfo.txt genesFromGff.txt > genesWithNoInfo.txt
# sent this list to Sanger to ask about getting additional information
# for these genes.
comm -23 genesWithInfo.txt genesFromGff.txt | wc -l
# 789
# got another file from Sanger that should contain the information
# for the 914 genes missing information above.
ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/060725/genes_for_tom_20060725.txt
# check if this contains all of the list missing before
sort genesWithNoInfo.txt | uniq > genesWithNoInfo.sort
awk '{print $2}' genes_for_tom_20060725.txt | sort | uniq > genes.txt
comm -13 genes.txt genesWithNoInfo.sort > genesWithNoInfo2.txt
# there are 20 of these. Sent these to Sanger and received
# information for these. Copied and pasted these from e-mail into
# moreInfo.txt. Write script to reformat: addTabs.pl
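# (addTabs.pl is not reproduced in this doc; assuming moreInfo.txt is the
# e-mail paste with columns separated by runs of spaces, the reformatting
# amounts to roughly:
#   perl -lpe 's/\s{2,}/\t/g' moreInfo.txt > geneInfo3.txt )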
perl addTabs.pl < moreInfo.txt > geneInfo3.txt
grep -f genesWithInfo.txt genes_for_tom_20060725.txt > tmp
wc -l tmp
# 4738
wc -l genesWithInfo.txt
# 4822 genesWithInfo.txt
# Not all of these are in genes_for_tom_20060725.txt so merge all the
# info files and uniq:
cat genes_for_tom_new.txt genes_for_tom_20060725.txt geneInfo3.txt \
| sort | uniq > allGeneInfo.txt
awk '{print $2}' allGeneInfo.txt | sort | uniq -c | sort -nr > count
# counts gene names - often occur twice but with more information in
# one case than the other. Seems like newer file has most information for
# each gene.
grep -f genesFromGff.txt genes_for_tom_20060725.txt > info1.txt
# then list the genes in info1.txt (this step mirrors the info2.txt step below)
awk '{print $2}' info1.txt | sort | uniq > genesInInfo1.sort
comm -13 genesInInfo1.sort genesFromGff.txt > genes1
wc -l genes1
# 55 genes1
grep -f genes1 genes_for_tom_new.txt > info2.txt
awk '{print $2}' info2.txt | sort | uniq > genesInInfo2.sort
comm -13 genesInInfo2.sort genes1 > genes2
wc -l genes2
# 20 genes2
# genes2 is list of genes not found in either file. Should be 20 left.
awk '{print $2}' geneInfo3.txt | sort | uniq > genes3
comm -12 genes2 genes3 | wc -l
# 20 - so these are the same 20 that are in geneInfo3.txt
# These are in geneInfo3.txt. cat all these files together
cat info1.txt info2.txt geneInfo3.txt > allGeneInfo2.txt
# Recreate the tab file for loading into the vegaInfoZfish table:
rm vegaInfoZfish.txt
# Use new version that prints out one row for each accession in field 3.
perl formatVegaInfo.pl vegaIDs.txt allGeneInfo2.txt vegaInfoZfish.txt \
vegaToCloneId.txt
# info.log contains genes for which are not in the gff file of VEGA
# and this is empty as it should be.
wc -l vegaInfoZfish.txt
# 6606 vegaInfoZfish.txt
wc -l vegaToCloneId.txt
# 7245 vegaToCloneId.txt
awk '{print $1}' vegaInfoZfish.txt | sort | uniq -c | sort -nr > out2
# transcripts only have 1 entry
awk '{print $2}' vegaInfoZfish.txt | sort | uniq > infogenes.txt
comm -13 infogenes.txt genesFromGff.txt
# There are no genes in the GFF file that are not in vegaInfoZfish.txt
# Then remake the pseudogenes track from this.
# Next step is to find which transcripts are pseudogenes.
grep pseudogene vegaInfoZfish.txt | sort | uniq | wc -l
# There are only 51 in the info file, and all of these are in the GFF
# file. Anyway, this is too sparse for a separate track, but
# a subtrack could be created.
# Get transcript IDs for pseudogenes.
grep pseudogene vegaInfoZfish.txt | awk '{print $1}' > pseudogenes.ids
grep -f pseudogenes.ids vega.gff > vegaPseudoGene.gff
awk '{print $9}' vegaPseudoGene.gff |sort | uniq | wc -l
# 51
grep -v -f pseudogenes.ids vega.gff > vegaGene.gff
wc -l vega*ff
# 98170 vega.gff
# 97999 vegaGene.gff
# 171 vegaPseudoGene.gff
# load gff files:
ssh hgwdev
cd /cluster/data/danRer4/bed/vega
hgsql -e 'drop table vegaGene;' danRer4
hgsql -e 'drop table vegaPseudoGene;' danRer4
ldHgGene danRer4 vegaGene vegaGene.gff
# Read 6555 transcripts in 88104 lines in 1 files
# 6555 groups 25 seqs 1 sources 2 feature types
# 6555 gene predictions
ldHgGene danRer4 vegaPseudoGene vegaPseudoGene.gff
# Read 51 transcripts in 171 lines in 1 files
# 51 groups 9 seqs 1 sources 1 feature types
# 51 gene predictions
# Then create SQL table for adding the zebrafish-specific information
# Add clone_id to a separate table instead of this one.
cat << '_EOF_' > ~/kent/src/hg/lib/vegaInfoZfish.as
table vegaInfoZfish
"Vega Genes track additional information"
(
string transcriptId; "Vega transcript ID"
string geneId; "Vega gene ID (OTTER ID)"
string sangerName; "Sanger gene name"
string zfinId; "ZFIN ID"
string zfinSymbol; "ZFIN gene symbol"
string method; "GTF method field"
string geneDesc; "Vega gene description"
string confidence; "Status (KNOWN, NOVEL, PUTATIVE, PREDICTED)"
)
'_EOF_'
cd ~/kent/src/hg/lib/
autoSql vegaInfoZfish.as vegaInfoZfish
mv vegaInfoZfish.h ../inc/
# commit vegaInfoZfish{.h,.c,.as,.sql} files to CVS
# add INDEX(geneId) to vegaInfoZfish.sql
# Need to change geneDesc to longblob type because some descriptions
# are long (2006-09-26, hartera)
cd ~/kent/src/hg/lib
perl -pi.bak -e 's/geneDesc varchar\(255\)/geneDesc longblob/' \
vegaInfoZfish.sql
# create a second table for the cloneId accessions since there
# are multiple ids for some VEGA genes. Otherwise, there would be
# a comma separated list in this field or many rows repeated but just
# different in the cloneId field. Associate transcript ID to clone IDs.
grep ',' allGeneInfo2.txt | wc -l
# 378
cat << '_EOF_' > ~/kent/src/hg/lib/vegaToCloneId.as
table vegaToCloneId
"Vega Genes track cloneId information"
(
string transcriptId; "Vega transcript ID"
string cloneId; "clone ID"
)
'_EOF_'
cd ~/kent/src/hg/lib/
autoSql vegaToCloneId.as vegaToCloneId
# replace PRIMARY KEY(transcriptId) with indices on transcriptId and cloneId:
perl -pi.bak -e \
's/PRIMARY KEY\(transcriptId\)/INDEX\(transcriptId\),\nINDEX\(cloneId\)/' \
vegaToCloneId.sql
rm *.bak
# mv vegaInfoZfish.h ../inc/
cd /cluster/data/danRer4/bed/vega
hgsql -e 'drop table vegaInfoZfish;' danRer4
hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
vegaInfoZfish.txt
hgsql -e 'drop table vegaToCloneId;' danRer4
hgLoadSqlTab danRer4 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \
vegaToCloneId.txt
# Add code to hgc.c so that this works for Zebrafish and creates the
# relevant links. Add searches by vega transcript ID, ZFIN ID and
# clone ID (see the example search stanza after the trackDb entry below).
# Add a Vega zebrafish-specific description to
# trackDb/zebrafish. The Pseudogenes are a subtrack of the Genes track
# because it is too sparse to show as a separate track.
# Added entry in zebrafish/trackDb.ra to create these tracks as subtracks of
# a Vega Genes track.
# track vegaGeneZfish
# compositeTrack on
# shortLabel Vega Genes
# longLabel Vega Annotations
# group genes
# priority 37
# visibility hide
# chromosomes chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chr23,chr24,chr25
# type genePred
# url http://vega.sanger.ac.uk/Danio_rerio/geneview?transcript=$$
# track vegaGene
# subTrack vegaGeneZfish
# shortLabel Vega Genes
# longLabel Vega Gene Annotations
# priority 1
# color 0,100,180
# track vegaPseudoGene
# subTrack vegaGeneZfish
# shortLabel Vega Pseudogenes
# longLabel Vega Annotated Pseudogenes
# priority 2
# color 155,0,125
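# For the searches mentioned above (Vega transcript ID, ZFIN ID and clone ID),
# hgFindSpec-style search stanzas were added to trackDb/zebrafish/trackDb.ra.
# They are not reproduced here; the transcript ID search has this general
# shape (values illustrative, not copied from the checked-in file):
# searchTable vegaGene
# searchType genePred
# termRegex OTTDART[0-9]+
# searchPriority 50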
# ADD Descriptions for Vega Genes (2006-09-25 - 2006-09-26, hartera)
# Looked into using description from BioMart for VEGA genes but easier
# to get them all directly from Sanger. Kerstin sent a list of
# descriptions: for_rachel.txt
ssh kkstore04
mkdir /cluster/data/danRer4/bed/vega/description
# copy file here and rename
cd /cluster/data/danRer4/bed/vega/description
mv for_rachel.txt vegaDesc.txt
# get list of VEGA gene IDs in vegaInfoZfish
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
hgsql -e 'select distinct(geneId) from vegaInfoZfish;' danRer4 | sort \
> geneIdsFromInfo.sort
# get sorted list of gene IDs from description file:
awk '{print $2;}' vegaDesc.txt | sort | uniq > vegaDesc.ids.sort
wc -l *.sort
comm -12 geneIdsFromInfo.sort vegaDesc.ids.sort | wc
# 4892
comm -23 geneIdsFromInfo.sort vegaDesc.ids.sort > genesNoDesc.txt
# 55 with no description. sent this list to Sanger and got the
# descriptions for these too: descriptions_for_Rachel.txt
awk '{print $1}' descriptions_for_Rachel.txt | sort | uniq \
> geneIds.newDesc.sort
comm -12 genesNoDesc.txt geneIds.newDesc.sort | wc
# 55 gene names in common
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/description
cat vegaDesc.txt descriptions_for_Rachel.txt > vegaAllDesc.txt
wc -l vegaAllDesc.txt
# 6440 vegaAllDesc.txt
# clean up
rm genesNoDesc.txt geneIds* vegaDesc.ids.sort
# Then add these to the vegaInfoZfish table
cat << 'EOF' > addDesc.pl
#!/usr/bin/perl -w
use strict;
my ($infoFile, $descFile, %descHash);
$infoFile = $ARGV[0]; # vegaInfoZfish.txt file
$descFile = $ARGV[1]; # file of descriptions
open(INFO, $infoFile) || die "Can not open $infoFile : $!\n";
open(DESC, $descFile) || die "Can not open $descFile : $!\n";
while (<DESC>)
{
my($line, @f, $id, $desc);
chomp;
$line = $_;
@f = split(/\t/, $line);
if ($#f > 0 && $f[1] =~ /^OTTDARG/)
{
$id = $f[1];
$desc = $f[2];
}
elsif ($f[0] =~ /^(OTTDARG[0-9]+)\s*(.+)/)
{
# some lines have just id and description with only a space between
$id = $1;
$desc = $2;
}
else
{
print "OTTDARG ID not found \n";
}
$descHash{$id} = $desc;
}
close DESC;
while (<INFO>)
{
my ($li, @fi, $de, $i, $last);
$de = "";
chomp;
$li = $_;
@fi = split(/\t/, $li);
if ($fi[1] =~ /OTTDARG/)
{
if (exists($descHash{$fi[1]}))
{
$de = $descHash{$fi[1]};
}
else
{
print "There is no description for $fi[1] available.\n";
}
}
$last = $#fi;
for ($i = 0; $i <= 5; $i++ )
{
print "$fi[$i]\t";
}
print "$de\t";
if ($last == 5)
{
# if there are only 5 fields, the last one is missing so add extra tab
print "\t\n";
}
else
{
print "$fi[$last]\n";
}
}
close INFO;
'EOF'
chmod +x addDesc.pl
# add new descriptions to vegaInfoZfish.txt file
perl addDesc.pl ../vegaInfoZfish.txt vegaAllDesc.txt \
> vegaInfoZfishWithDesc.txt
# Reload vegaInfoZfish table
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
# 105 warnings when loading the table
# remove "\N" from desc
perl -pi.bak -e 's/\\N//' vegaInfoZfishWithDesc.txt
# this removed 3 warnings
# after dumping the contents of the table and diffing with the input
# file, found that the pseudogenes are missing the confidence field
# and so there is a tab missing from the file. Modified addDesc.pl to
# add the extra tab when only 7 tabbed fields instead of 8 are found
# in a row.
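# An illustrative sanity check (not part of the original run) for rows that
# lack the expected 8 tab-separated fields (i.e. 7 tabs) before reloading:
awk '{n = gsub(/\t/, "\t"); if (n != 7) print NR": "n" tabs"}' \
    vegaInfoZfishWithDesc.txt | head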
hgsql -e 'drop table vegaInfoZfish;' danRer4
hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
vegaInfoZfishWithDesc.txt
# Try loading GTF format file (2006-10-19)
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/new
wget ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/new
ldHgGene -bin -genePredExt danRer4 vegaNew vega.gtf
# Error: Read 6371 transcripts in 88275 lines in 1 files
# 6371 groups 25 seqs 4 sources 2 feature types
# invalid gffGroup detected on line: chr22 NOVEL exon 6782575
# 6783240 0.000000 - . gene_id "si:rp71-1i20.2"; transcript_id
# "si:rp71-1i20.2-001";
# GFF/GTF group si:rp71-1i20.2-001 on chr22+, this line is on chr22-, all group
# members must be on same seq and strand
# transcript_id is not unique. otter_transcript_id is unique so switch these.
cp vega.gtf vegaNew.gtf
# ldHgGene groups by transcript Id so use OTTER IDS instead
perl -pi.bak -e 's/transcript_id/other_transcript_id/' vegaNew.gtf
perl -pi.bak -e 's/otter_transcript_id/transcript_id/' vegaNew.gtf
ldHgGene -bin -genePredExt danRer4 vegaNew vegaNew.gtf
# worked ok
# Added this as a vegaGeneNew subtrack for Vega Genes
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/new
# find genes that have the same transcript IDs for different OTTER gene_ids
awk 'BEGIN {FS="\t"} {print $9}' vega.gtf > vegaAttributes
awk 'BEGIN {FS=";"} {print $2, $5}' vegaAttributes \
> vegaAttrib.transIdandotterId
sort vegaAttrib.transIdandotterId | uniq \
> vegaAttrib.transIdandotterId.uniq
awk '{print $2}' vegaAttrib.transIdandotterId.uniq | sed -e 's/\s//' \
| sort | uniq -c | sort -nr > vegaAttrib.transId.count
# 88 of these transcripts have more than one entry in gtf file. Need
# to check if they have different OTTER gene ids in each case.
head -88 vegaAttrib.transId.count | awk '{print $2}' > transIds.morethan1
grep -w -f transIds.morethan1 vegaAttrib.transIdandotterId.uniq \
> transIdswithDiffOtterGeneIds.txt
awk '{print $2}' transIdswithDiffOtterGeneIds.txt | sort | uniq \
> transIds.diffOtterGeneIds.txt
# send transIdswithDiffOtterGeneIds.txt to Kerstin at Sanger. List
# of transcript Ids with different instances of OTTER gene ids.
# WAITING NOW FOR VEGA GENE UPDATE (2006-10-19)
# Received e-mail from Ian Sealy at Sanger (is1@sanger.ac.uk) that
# Vega gene update is ready in gtf format (2006-11-02)
ssh kkstore04
cd /cluster/data/danRer4/bed/vega
mkdir update
cd update
wget --timestamping ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/update
ldHgGene -bin -genePredExt danRer4 vegaUpdate vega.gtf
# Read 6823 transcripts in 93253 lines in 1 files
# 6823 groups 25 seqs 4 sources 2 feature types
# invalid gffGroup detected on line: chr22 PUTATIVE exon
# 6790927 6791256 0.000000 - . gene_id "si:rp71-1i20.2";
# transcript_id "RP71-1I20.1-001";
# GFF/GTF group RP71-1I20.1-001 on chr22+, this line is on chr22-, all group
# members must be on same seq and strand
# Still has non-unique transcript IDs - need to wait for next release
# of VEGA genes and Ensembl for this to be fixed.
# Received new update of VEGA from Ian Sealy (is1@sanger.ac.uk) on
# 2007-02-14.
ssh kkstore04
cd /cluster/data/danRer4/bed/vega
wget --timestamping ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
# Load into database
# 2007-03-09
ssh hgwdev
cd /cluster/data/danRer4/bed/vega
ldHgGene -bin -genePredExt danRer4 vega vega.gtf
# invalid gffGroup detected on line: chr4 NOVEL exon 35259893
# 35259994 0.000000 + . gene_id "sinup"; transcript_id
# "siah2l-001";
# GFF/GTF group siah2l-001 on chr21-, this line is on chr4+, all group members
# must be on same seq and strand
# still get duplicate transcript IDs on different chromosomes.
# Below is what Kerstin Howe (kj2@sanger.ac.uk) advised on these cases:
# "this will continue to happen as long as the map still changes. The
# gene in question was annotated on two adjacent clones which were
# apparently then broken up and assigned to different chromosomes.
# Usually, this is not too alarming (just delete those cases, please)"
# Find other such cases:
awk 'BEGIN{OFS="\t"} {print $1, $12}' vega.gtf > vegachromAndId.txt
sort vegachromAndId.txt | uniq > vegachromAndId.uniq
awk '{print $2}' vegachromAndId.uniq | sort | uniq -c | sort -nr \
> vegaIds.count
# These transcript IDs all appear twice on different chromosomes. There could
# be cases where there are transcripts that are duplicated on the same
# chromosome.
# 2 "taf6-001";
# 2 "siah2l-001";
# 2 "rasgrf2-001";
# 2 "lmx1b-001";
# 2 "fvt1-001";
# 2 "ckmt2-002";
# 2 "ckmt2-001";
# 2 "accn2c-001";
# There are some cases where the gene is on the same chrom but different
# strands.
awk 'BEGIN{OFS="\t"} {print $1, $7, $12}' vega.gtf \
| sort | uniq > vegachromStrandAndId.uniq
awk '{print $1, $3}' vegachromStrandAndId.uniq | sort | uniq -c \
| sort -nr > vegaIdsAndChroms.count
# These occur twice on different strands of the same chromosome:
# 2 chr19 "DKEY-264N13.5-001";
# 2 chr14 "stx5a-001";
# Remove these from the GTF file as suggested by Kerstin Howe (Sanger)
head -8 vegaIds.count | awk '{print $2}' > transcriptIds.remove
head -2 vegaIdsAndChroms.count | awk '{print $3}' >> transcriptIds.remove
grep -v -f transcriptIds.remove vega.gtf > vega2.gtf
# reload into danRer4 database
hgsql -e 'drop table vegaUpdate;' danRer4
ldHgGene -bin -genePredExt danRer4 vegaUpdate vega2.gtf
# successfully loads now.
# ldHgGene groups by transcript Id so use OTTER IDS instead
sed -e 's/transcript_id/other_transcript_id/' vega.gtf > vegaFormat.gtf
perl -pi.bak -e 's/otter_transcript_id/transcript_id/' vegaFormat.gtf
# Now it loads ok without removing duplicate transcript IDs:
ldHgGene -bin -genePredExt danRer4 vegaFormat vegaFormat.gtf
# Read 8817 transcripts in 119707 lines in 1 files
# 8817 groups 29 seqs 4 sources 2 feature types
# 8817 gene predictions
hgsql -N -e 'select distinct(name2) from vegaFormat;' danRer4 > name2
# Extra information obtained from Sanger:
ssh kkstore04
cd /cluster/data/danRer4/bed/vega
wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/061111/noH/genes_for_tom.txt
sort name2 > name2.sort
awk '{print $1}' genes_for_tom.txt | sort | uniq > genesfortom.symbs.sort
comm -23 name2.sort genesfortom.symbs.sort > vegaGtfOnly
wc -l vegaGtfOnly
# 4021
awk '{print $6}' genes_for_tom.txt | sort | uniq > genesfortom.altsymb.sort
comm -23 vegaGtfOnly genesfortom.altsymb.sort
# rest of symbols found as alternate symbols in column 6 of this file
# subtract this from original list
comm -13 vegaGtfOnly name2.sort > genesincol1
# Also received descriptions file and additional information from Sanger.
# Now the track can be updated since the vega.gtf file loads into the
# database; see the VEGA UPDATE section below.
#######################################################################
# VEGA UPDATE (DONE, 2007-03-26 - 2007-03-28, hartera)
# Data provided by Kerstin Howe from Sanger: kj2@sanger.ac.uk
# and also Ian Sealy: is1@sanger.ac.uk
# GTF file sent on 2007-02-14
# Updated formatVegaInfo.pl script as not all transcripts were being included
# in the vegaInfoZfish and the vegaToCloneId tables so all tables were
# re-made (DONE, 2007-04-06, hartera)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/vega.2007-02-14
cd /cluster/data/danRer4/bed/vega
ln -s /cluster/data/danRer4/bed/vega.2007-02-14 \
/cluster/data/danRer4/bed/vega
wget --timestamping ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/061111/noH/genes_for_tom.txt
# list of gene descriptions by Kerstin Howe (2007-03-12)
mv for_rachel.txt vegaDescriptions.txt
mv genes_for_tom.txt vegaInformation.txt
# vegaInfo is transcriptId, otterId, geneId, method and geneDesc
# Get otter transcript ID and otter gene ID:
awk 'BEGIN{OFS="\t"} \
{if (($17 ~ /otter_gene_id/) && ($19 ~ /otter_transcript_id/)) \
print $20, $18;}' vega.gtf \
> vegaIDs.txt
perl -pi.bak -e 's/;//g' vegaIDs.txt
perl -pi.bak -e 's/\"//g' vegaIDs.txt
# list of transcript ID and corresponding gene ID for Vega
sort vegaIDs.txt | uniq > vegaIDs.uniq
# then use the info file to grab those genes that are pseudogenes, get the
# transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
# to a separate file. Create an info file. Remove the .NOVEL or .PUTATIVE
# or .KNOWN or .PREDICTED suffix from the method column and add it as a separate
# confidence column.
# check number of items on each line: there are 4 or 6.
# Some genes have more than one clone ID in a comma separated list
# so create two files for loading into two tables.
# Found that some of the clone ID fields have comma separated lists
# and for OTTDARG00000006367, there are 30. Therefore create two info
# tables where one is just for clone IDs.
# NOTE: in future, make sure each row of vegaInfoZfish.txt output has
# 8 fields. The pseudogene entries are missing an entry in the
# confidence field so this should be an empty field.
# Updated formatVegaInfo.pl as not getting all transcript IDs in the
# vegaInfoZfish table (hartera, 2007-04-06)
cat << '_EOF_' > formatVegaInfo.pl
#!/usr/bin/perl -w
use strict;
# format Vega additional information into one file for vegaInfoZfish table
# and another for the vegaToCloneIdZfish table which contains the
# geneId and cloneId for each gene since there are multiple clone IDs for
# some of the genes.
my ($idsFile, $infoFile, $outFile1, $outFile2, %idsHash);
$idsFile = $ARGV[0]; # list of Transcript IDs and Gene IDs
$infoFile = $ARGV[1]; # information file for Vega genes
$outFile1 = $ARGV[2]; # output1 is the formatted file of Vega info for table
$outFile2 = $ARGV[3]; # output2 is a vega to clone ID conversion table
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
open (OUT1, ">$outFile1") || die "Can not create $outFile1: $!\n";
open (OUT2, ">$outFile2") || die "Can not create $outFile2: $!\n";
open (STDERR, ">info.log") || die "Can not create info.log: $!\n";
while (<IDS>)
{
my ($line, @f);
chomp;
$line = $_;
@f = split(/\t/, $line);
# hash is keyed by gene ID but there could be more than one transcript
# associated with a gene ID so need to create an array for the hash
push @{$idsHash{$f[1]}}, $f[0];
}
close IDS;
while (<INFO>)
{
my ($line,@fi,$id,$gene,@transIds, $tr,@clones, $c,@t, $method, $conf);
chomp;
$gene = "";
$line = $_;
@fi = split(/\t/, $line);
$id = $gene = $fi[1];
# get all the transcript IDs for a gene
if (exists($idsHash{$id}))
{
@transIds = @{$idsHash{$id}};
}
# push clone IDs into an array:
@clones = split(/,/, $fi[2]);
@t = split(/\./, $fi[3]);
$method = $t[0];
if ($#t > 0)
{
$conf = $t[1];
}
elsif ($#t == 0)
{
$conf = "";
}
else
{
print STDERR "Should be 4 or 6 items per row, found $#fi \n";
}
foreach $tr (@transIds)
{
print OUT1 "$tr\t$fi[1]\t$fi[0]";
if ($#fi == 5)
{
print OUT1 "\t$fi[4]\t$fi[5]\t$method\t\t$conf\n";
}
elsif ($#fi == 3)
{
print OUT1 "\t\t\t$method\t\t$conf\n";
}
# print out clone IDs for each transcript
foreach $c (@clones)
{
print OUT2 "$tr\t$c\n";
}
}
if($gene && !exists($idsHash{$gene}))
{
print STDERR "$gene\n";
}
}
close IDS;
close INFO;
close OUT1;
close OUT2;
close STDERR;
'_EOF_'
chmod +x formatVegaInfo.pl
wc -l vegaInformation.txt
# 7169 vegaInformation.txt
awk '{print $2}' vegaInformation.txt | sort | uniq > genesWithInfo.txt
awk '{if ($2 ~ /OTTDARG/) print $2;}' vegaIDs.uniq \
| sort | uniq > genesFromGtf.txt
# Number of genes with info AND in gtf file:
wc -l genesFromGtf.txt
# 6171 genesFromGtf.txt
comm -12 genesWithInfo.txt genesFromGtf.txt | wc -l
# 6171
# Number of genes with no info:
comm -13 genesWithInfo.txt genesFromGtf.txt | wc -l
# 0
# Use perl script above to extract vegaInfo table information.
# Re-did this with updated perl script to get all transcript IDs
# (hartera, 2007-04-07)
perl formatVegaInfo.pl vegaIDs.uniq vegaInformation.txt \
vegaInfoZfish.txt vegaToCloneId.txt
# info.log contains genes that are not in the GFF file of VEGA,
# and it is empty as it should be.
wc -l vegaInfoZfish.txt
# 8817 vegaInfoZfish.txt
wc -l vegaToCloneId.txt
# 9698 vegaToCloneId.txt
# The vegaToCloneId.txt file is also larger than before as it now
# has all the transcript IDs (hartera, 2007-04-05).
awk '{print $1}' vegaInfoZfish.txt | sort | uniq -c | sort -nr > out2
# transcripts only have 1 entry
awk '{print $2}' vegaInfoZfish.txt | sort | uniq > infogenes.txt
comm -13 infogenes.txt genesFromGtf.txt
# There are no genes in the GFF file that are not in vegaInfoZfish.txt
# However, there are genes in the information file that do not have
# transcripts represented that are in the GFF file.
# Then remake the pseudogenes track from this.
# Next step is to find which transcripts are pseudogenes.
grep pseudogene vegaInfoZfish.txt | sort | uniq | wc -l
# Once vegaInfoZfish.txt updated, found 53 pseudogenes so need to update
# the pseudogene track
# There are only 53 in the info file, and all of these are in the GFF
# file. Anyway, this is too sparse for a separate track, but
# a subtrack could be created.
# Get transcript IDs for pseudogenes.
grep pseudogene vegaInfoZfish.txt | awk '{print $1}' > pseudogenes.ids
grep -w -f pseudogenes.ids vega.gtf > vegaPseudoGene.gtf
awk '{print $20}' vegaPseudoGene.gtf | sort | uniq | wc -l
# 53
# Need to remake the vegaGene table:
grep -vw -f pseudogenes.ids vega.gtf > vegaGene.gtf
wc -l vega*gtf
# 119707 vega.gtf
# 119529 vegaGene.gtf
# 178 vegaPseudoGene.gtf
# Need to relabel IDs to get the name to be the otter transcript ID
# and name2 to be the transcript_id (needs to be labeled as gene_id)
# Also, relabel the otter_transcript_id to be transcript_id as ldHgGene
# groups the rows by this ID.
sed -e 's/gene_id/tmp_id/' vegaGene.gtf > vegaGeneFormat.gtf
perl -pi.bak -e 's/transcript_id/gene_id/' vegaGeneFormat.gtf
perl -pi.bak -e 's/otter_transcript_id/transcript_id/' vegaGeneFormat.gtf
# Do the same for the pseudogene GTF files:
sed -e 's/gene_id/tmp_id/' vegaPseudoGene.gtf > vegaPseudoGeneFormat.gtf
perl -pi.bak -e 's/transcript_id/gene_id/' vegaPseudoGeneFormat.gtf
perl -pi.bak -e 's/otter_transcript_id/transcript_id/' \
vegaPseudoGeneFormat.gtf
rm *.bak
# load GTF files for Vega genes and pseudogenes:
# Reloaded all tables after updating as above (2007-04-06, hartera)
ssh hgwdev
cd /cluster/data/danRer4/bed/vega
hgsql -e 'drop table vegaGene;' danRer4
hgsql -e 'drop table vegaPseudoGene;' danRer4
ldHgGene -bin -genePredExt danRer4 vegaGene vegaGeneFormat.gtf
# Read 8764 transcripts in 119529 lines in 1 files
# 8764 groups 29 seqs 3 sources 2 feature types
# 8764 gene predictions
ldHgGene -bin -genePredExt danRer4 vegaPseudoGene vegaPseudoGeneFormat.gtf
# Read 53 transcripts in 178 lines in 1 files
# 53 groups 11 seqs 1 sources 1 feature types
# 53 gene predictions
hgsql -N -e 'select distinct(chrom) from vegaGene;' danRer4 \
| sort | uniq
hgsql -N -e 'select distinct(chrom) from vegaPseudoGene;' danRer4 \
| sort | uniq
# vegaGene includes several scaffolds so need to lift file to chrom
# level for these and reload vegaGene. vegaPseudoGene has no scaffolds.
# scaffolds in vegaGene:
# chrZv6_scaffold3697
# chrZv6_scaffold3723
# chrZv6_scaffold3731
# chrZv6_scaffold3734
# These are all on the chrUn_random virtual chrom
ssh kkstore04
cd /cluster/data/danRer4/bed/vega
sed -e 's/chrZv6_scaffold/Zv6_scaffold/g' vegaGeneFormat.gtf \
> vegaGeneFormat2.gtf
liftUp vegaGeneFormatLifted.gtf \
/cluster/data/danRer4/jkStuff/liftAll.lft carry vegaGeneFormat2.gtf
# Reload vegaGene table:
ssh hgwdev
cd /cluster/data/danRer4/bed/vega
hgsql -e 'drop table vegaGene;' danRer4
ldHgGene -bin -genePredExt danRer4 vegaGene vegaGeneFormatLifted.gtf
# Read 8764 transcripts in 119529 lines in 1 files
# 8764 groups 26 seqs 3 sources 2 feature types
# 8764 gene predictions
# Vega information tables:
# mySQL table definition and autosql-generated files created previously
# for zebrafish-specific information (vegaInfoZfish) in the VEGA GENES
# section above.
# Add clone_id to a separate table instead of this one.
# created a second table for the cloneId accessions since there
# are multiple ids for some VEGA genes. Otherwise, there would be
# a comma separated list in this field or many rows repeated but just
# different in the cloneId field. Associate transcript ID to clone IDs.
# see VEGA GENES section
# load these tables:
cd /cluster/data/danRer4/bed/vega
hgsql -e 'drop table vegaInfoZfish;' danRer4
hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
vegaInfoZfish.txt
hgsql -e 'drop table vegaToCloneId;' danRer4
hgLoadSqlTab danRer4 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \
vegaToCloneId.txt
# Add code to hgc.c so that this works for Zebrafish and creates the
# relevant links. Add searches by vega transcript ID, ZFIN ID and
# clone ID. trackDb entry added as in VEGA GENES section above.
# Added track handler to hgTracks.c for vegaGeneZfish so that the
# transcript names from the name2 column of the genePred table is
# used for the item name displayed in the track.
# Add a Vega zebrafish-specific html description to trackDb/zebrafish.
# The Pseudogenes are a subtrack of the Genes track
# because it is too sparse to show as a separate track.
# ADD Descriptions for Vega Genes
# Looked into using description from BioMart for VEGA genes but easier
# to get them all directly from Sanger. Kerstin sent a list of
# descriptions: for_rachel.txt
# Add these again to updated tables (2007-04-06, hartera)
ssh kkstore04
mkdir -p /cluster/data/danRer4/bed/vega/description
# copy file here and rename
cd /cluster/data/danRer4/bed/vega/description
mv ../vegaDescriptions.txt .
# get list of VEGA gene IDs in vegaInfoZfish
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
hgsql -N -e 'select distinct(geneId) from vegaInfoZfish;' danRer4 | sort \
> geneIdsFromInfo.sort
# get sorted list of gene IDs from description file:
awk '{print $1;}' vegaDescriptions.txt | sort | uniq > vegaDesc.ids.sort
wc -l *.sort
# 6171 geneIdsFromInfo.sort
# 14150 vegaDesc.ids.sort
comm -12 geneIdsFromInfo.sort vegaDesc.ids.sort | wc
# 6168
comm -23 geneIdsFromInfo.sort vegaDesc.ids.sort > genesNoDesc.txt
# There are 3 with no description
# OTTDARG00000004654
# OTTDARG00000018757
# OTTDARG00000018760
# Searched for these three at
# http://vega.sanger.ac.uk/Danio_rerio/index.html
# and found that these three do not have a description.
# add them to the descriptions list
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/description
# add the three with no description to the descriptions list
cat vegaDescriptions.txt genesNoDesc.txt > vegaAll.txt
# remove header
tail +2 vegaAll.txt | sort | uniq > vegaAllDesc.txt
wc -l vegaAll*
# 23058 vegaAll.txt
# 15460 vegaAllDesc.txt
# clean up
rm genesNoDesc.txt geneIds* vegaDesc.ids.sort
# Then add these to the vegaInfoZfish table
cat << 'EOF' > addDesc.pl
#!/usr/bin/perl -w
use strict;
my ($infoFile, $descFile, %descHash);
$infoFile = $ARGV[0]; # vegaInfoZfish.txt file
$descFile = $ARGV[1]; # file of descriptions
open(INFO, $infoFile) || die "Can not open $infoFile : $!\n";
open(DESC, $descFile) || die "Can not open $descFile : $!\n";
open(ERROR, ">error.log") || die "Can not create error.log : $!\n";
open(OUT, ">out.txt") || die "Can not create out.txt: $!\n";
while (<DESC>)
{
my($line, @f, $id, $desc);
chomp;
$line = $_;
@f = split(/\t/, $line);
if ($f[0] =~ /^OTTDARG/)
{
$id = $f[0];
$desc = $f[1];
}
else
{
print ERROR "OTTDARG ID is not found on a line of the descriptions file.\n";
}
$descHash{$id} = $desc;
}
close DESC;
while (<INFO>)
{
my ($li, @fi, $de, $i, $last);
$de = "";
chomp;
$li = $_;
@fi = split(/\t/, $li);
if ($fi[1] =~ /OTTDARG/)
{
if (exists($descHash{$fi[1]}))
{
$de = $descHash{$fi[1]};
}
else
{
print ERROR "There is no description for $fi[1] available.\n";
}
}
$last = $#fi;
for ($i = 0; $i <= 5; $i++ )
{
print OUT "$fi[$i]\t";
}
print OUT "$de\t";
if ($last == 5)
{
# if there are only 5 fields, the last one is missing so add extra tab
print OUT "\t\n";
}
else
{
print OUT "$fi[$last]\n";
}
}
close INFO;
close ERROR;
'EOF'
chmod +x addDesc.pl
# add new descriptions to vegaInfoZfish.txt file
perl addDesc.pl ../vegaInfoZfish.txt vegaAllDesc.txt
# check output in out.txt then rename
mv out.txt vegaInfoZfishWithDesc.txt
rm error.log # empty
# Reload vegaInfoZfish table
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
hgsql -e 'drop table vegaInfoZfish;' danRer4
hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
vegaInfoZfishWithDesc.txt
# No errors loading
# Added code already to hgc.c so that this works for Zebrafish and creates the
# relevant links. Add searches by vega transcript ID, ZFIN ID and
# clone ID. trackDb entry added as in VEGA GENES section above.
# Added track handler to hgTracks.c for vegaGeneZfish so that the
# transcript names from the name2 column of the genePred table are
# used for the item name displayed in the track.
# Add a Vega zebrafish-specific html description to trackDb/zebrafish.
# The Pseudogenes are a subtrack of the Genes track because it is too sparse
# to show as a separate track.
##########################################################################
# N-SCAN gene predictions (nscanGene) - (2006-08-30 markd)
cd /cluster/data/danRer4/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
wget -nv -r -np http://ardor.wustl.edu/jeltje/zebrafish/chr_gtf
wget -nv -r -np http://ardor.wustl.edu/jeltje/zebrafish/chr_ptx
# clean up and rename downloaded directories:
mv ardor.wustl.edu/jeltje/zebrafish/chr_gtf .
mv ardor.wustl.edu/jeltje/zebrafish/chr_ptx .
rm -rf ardor.wustl.edu
rm chr_*/index.html*
gzip chr_*/*
chmod a-w chr_*/*.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt danRer4 nscanGene chr_gtf/chr*.gtf.gz
# load protein, add .1 suffix to match transcript id
hgPepPred -suffix=.1 danRer4 generic nscanPep chr_ptx/chr*.fa.gz
rm *.tab
# update trackDb; need a danRer4-specific page to describe informants
#   zebrafish/danRer4/nscanGene.html (copy from mm8 and edit)
#   zebrafish/danRer4/trackDb.ra
# changed search regex to
#   termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]
#######################################################################
# UPDATE AFFY ZEBRAFISH TRACK USING BLAT WITHOUT -mask OPTION AND
# USING -repeats OPTION AND DIFFERENT FILTERING TO REMOVE SHORT
# ALIGNMENTS (DONE, 2006-09-27 - 2006-09-28, hartera)
# With the previous version of this track, QA found a number of short
# alignments of <= 30 bp and there are a number in the <= 50bp range.
# These do not seem to be meaningful so filtering was changed to try to
# remove these alignments while retaining meaningful alignments.
# pslCDnaFilter was used with the same settings as used for the
# Genbank EST alignments for zebrafish.
# Also use -minIdentity=90 for Blat instead of -minIdentity=95 since as the
# higher minIdentity is causing alignments to be dropped that should not be.
# Blat's minIdentity seems to be more severe than that for pslReps or
# pslCDnaFilter as it takes insertions and deletions into account.
# These are Jim's recommendations.
# NOTE: Also added alignments for NA_random and Un_random, these had not
# been done for the original affyZebrafish track but should have been.
# Array chip sequences already downloaded for danRer1
ssh hgwdev
cd /projects/compbio/data/microarray/affyZebrafish
mkdir -p /san/sanvol1/scratch/affy
cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/san/sanvol1/scratch/affy/
# Set up cluster job to align Zebrafish consensus sequences to danRer4
mkdir -p /cluster/data/danRer4/bed/affyZebrafish.2006-09-27
# remove old link and create new one
rm /cluster/data/danRer4/bed/affyZebrafish
ln -s /cluster/data/danRer4/bed/affyZebrafish.2006-09-27 \
/cluster/data/danRer4/bed/affyZebrafish
# Align sequences on the pitakluster. Scaffolds were aligned for NA_random
# and Un_random and lifted to chrom level afterwards. Chroms 1-25 and M
# were aligned as ~5 Mb chunks.
ssh pk
cd /cluster/data/danRer4/bed/affyZebrafish
ls -1 /san/sanvol1/scratch/affy/Zebrafish_consensus.fa > affy.lst
ls -1 /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > genome.lst
foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa)
ls -1 $f >> genome.lst
end
wc -l genome.lst
# 3237 genome.lst
# for output:
mkdir -p /san/sanvol1/scratch/danRer4/affy/psl
# use -repeats option to report matches to repeat bases separately
# to other matches in the PSL output.
echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90
-ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc $(path1) $(path2) {check out
line+ /san/sanvol1/scratch/danRer4/affy/psl/$(root1)_$(root2).psl}\n#ENDLOOP'
> template.sub
gensub2 genome.lst affy.lst template.sub para.spec
para create para.spec
para try, check, push ... etc.
para time
# Completed: 3237 of 3237 jobs
#CPU time in finished jobs: 19319s 321.98m 5.37h 0.22d 0.001 y
#IO & Wait Time: 9297s 154.95m 2.58h 0.11d 0.000 y
#Average job time: 9s 0.15m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 98s 1.63m 0.03h 0.00d
#Submission to last job: 3135s 52.25m 0.87h 0.04d
# need to do pslSort and lift up
ssh pk
cd /san/sanvol1/scratch/danRer4/affy
# Do sort, liftUp and then best in genome filter.
# only use alignments that have at least
# 95% identity in aligned region.
# Previously did not use minCover since a lot of sequence is in
# Un and NA so genes may be split up so good to see all alignments.
# However, found a number of short alignments of <= 50 bp. These are
# not meaningful so maybe need to use minCover. If increased too much,
# then hits on poor parts of the assembly will be missed.
# use pslCDnaFilter with the same parameters as used for zebrafish
# Genbank EST alignments.
pslSort dirs raw.psl tmp psl
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl
# seqs aligns
# total: 15272 828202
#drop minNonRepSize: 2763 741674
# drop minIdent: 2656 39188
# drop minCover: 2550 10784
# weird over: 359 1439
# kept weird: 277 347
# drop localBest: 2830 17737
# kept: 14952 18819
# Kept 97.9% of the sequences that aligned. There are 15502 Affy sequences
# in total, so 96.5% of them are now represented by alignments.
# lift up the coordinates to chrom level
liftUp affyZebrafish.psl \
/cluster/data/danRer4/jkStuff/liftAll.lft warn contig.psl
# Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft
# Lifting contig.psl
# rsync these psl files
rsync -a --progress /san/sanvol1/scratch/danRer4/affy/*.psl \
/cluster/data/danRer4/bed/affyZebrafish/
ssh kkstore04
cd /cluster/data/danRer4/bed/affyZebrafish
# shorten names in psl file
sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp
mv affyZebrafish.psl.tmp affyZebrafish.psl
pslCheck affyZebrafish.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer4/bed/affyZebrafish
hgsql -e 'drop table affyZebrafish;' danRer4
hgLoadPsl danRer4 affyZebrafish.psl
# Add consensus sequences for Zebrafish chip
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/gbdb/hgFixed/affyProbes
# these sequences were loaded previously so no need to reload.
hgLoadSeq -abbr=Zebrafish: danRer4 \
/gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
# Clean up
rm batch.bak contig.psl raw.psl
# check number of short alignments:
hgsql -e \
'select count(*) from affyZebrafish where (qEnd - qStart) <= 50;' danRer4
# 7
# for previous filtered set, there were 1272 alignments of <= 50 bp so
# this has improved.
hgsql -e 'select count(distinct(qName)) from affyZebrafish;' danRer4
# 14952
# Previously, there were 14819 so more sequences have aligned but less
# short alignments are retained. Many of the short alignments may also
# have longer alignments to different regions of the genome that are good.
#########################################################################
# COMPUGEN ZEBRAFISH OLIGOS TRACK (in progress, 2006-10-20, hartera)
# Align the zebrafish oligos from Compugen used to create the arrays
# used by GIS to study expression at different developmental stages.
ssh hgwdev
mkdir -p /projects/compbio/data/microarray/compugen/zebrafish
# save Compugen oligos FASTA file here. obtained from
# Sinnakaruppan Mathavan <mathavans@gis.a-star.edu.sg> at the
# Genome Institute of Singapore (GIS).
# Permission was obtained from Compugen to display the sequences
# along with a disclaimer. see README.txt
cd /projects/compbio/data/microarray/compugen/zebrafish
unzip Zebrafish\ Oligos_Compugen_XEBLIB96_pov_070704.zip
# this gives an Excel file, XEBLIB96_pov_070704.xls
# save as a tab separated text file using Excel: XEBLIB96_pov_070704.txt
# Remove quotation marks
sed -e 's/"//g' XEBLIB96_pov_070704.txt > GISArray.txt
# also remove other unwanted characters, ^@, which is ASCII for NULL
tr -d '\0' < GISArray.txt > GISArray.format.txt
awk 'BEGIN{FS="\t"} {if ($2 !~ /Serial/ && ($2 != "")) print ">"$2"\n"$4}' \
GISArray.format.txt > GISZfishArray.fa
grep '>' GISZfishArray.fa | wc -l
# 16399
# align sequences to the zebrafish genome on pk
mkdir -p /san/sanvol1/scratch/compugen
cp /projects/compbio/data/microarray/compugen/zebrafish/GISZfishArray.fa \
/san/sanvol1/scratch/compugen/
# Set up cluster job to align the Compugen oligo sequences to danRer4
mkdir -p /cluster/data/danRer4/bed/compugenZebrafish.2006-11-03
ln -s /cluster/data/danRer4/bed/compugenZebrafish.2006-11-03 \
/cluster/data/danRer4/bed/compugenZebrafish
# Align sequences on the pitakluster. Scaffolds were aligned for NA_random
# and Un_random and lifted to chrom level afterwards. Chroms 1-25 and M
# were aligned as ~5 Mb chunks.
ssh pk
cd /cluster/data/danRer4/bed/compugenZebrafish
ls -1 /san/sanvol1/scratch/compugen/GISZfishArray.fa > oligos.lst
ls -1 /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > genome.lst
foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa)
ls -1 $f >> genome.lst
end
wc -l genome.lst
# 3237 genome.lst
# oligos are 65 bp in length.
# for output:
mkdir -p /san/sanvol1/scratch/danRer4/compugen/psl
# use -repeats option to report matches to repeat bases separately
# to other matches in the PSL output.
echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90
-ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc $(path1) $(path2) {check out
line+ /san/sanvol1/scratch/danRer4/compugen/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 genome.lst oligos.lst template.sub para.spec
para create para.spec
para try, check, push ... etc.
para time
# Completed: 3237 of 3237 jobs
# CPU time in finished jobs: 1948s 32.46m 0.54h 0.02d 0.000 y
# IO & Wait Time: 11145s 185.75m 3.10h 0.13d 0.000 y
# Average job time: 4s 0.07m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 428s 7.13m 0.12h 0.00d
# Submission to last job: 621s 10.35m 0.17h 0.01d
# need to do pslSort and lift up
ssh pk
cd /san/sanvol1/scratch/danRer4/compugen
# Do sort, liftUp and then best in genome filter.
# only use alignments that have at least
# 95% identity in aligned region.
# Previously did not use minCover since a lot of sequence is in
# Un and NA so genes may be split up so good to see all alignments.
# However, found a number of short alignments of <= 50 bp. These are
# not meaningful so maybe need to use minCover. If increased too much,
# then hits on poor parts of the assembly will be missed.
# use pslCDnaFilter with the same parameters as used for zebrafish
# Genbank EST alignments.
pslSort dirs raw.psl tmp psl
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=8 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl
# for Compugen:
# Dropping minCover to 0.10 doesn't make a difference. Decreasing the minId to
# 0.92 increases the number of sequences aligned and does not increase
# the number of alignments for sequences with the most alignments.
# Removing the minimum non-repeat size filter does significantly increase
# the number of alignments for some sequences.
# 145 CGENZEB_456015402_0
#  79 CGENZEB_456008445_0
#  72 CGENZEB_456015991_0
#  53 CGENZEB_456012678_0
#  46 CGENZEB_456004521_0
# Total sequences: 16399
#            seqs  aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# weird over: 13 48
# kept weird: 8 16
# drop localBest: 1288 7040
# kept: 14632 19420
# 89.2% are kept.
# minCov = 0.10 minNonRepSize = 8
#            seqs  aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# weird over: 13 48
# kept weird: 8 16
# drop localBest: 1288 7040
# kept: 14632 19420
# 89.2% are kept.
# minCov=0.10 minNonRepSize = 10
#            seqs  aligns
#     total: 15544 102554
# drop minNonRepSize: 1015 72795
# drop minIdent: 811 3462
# weird over: 13 48
# kept weird: 8 16
# drop localBest: 1278 6901
# kept: 14616 19396
# 89.1% kept.
# minNonRepSize = 0
#            seqs  aligns
#     total: 15544 102554
# drop minIdent: 1344 23893
# weird over: 42 271
# kept weird: 24 44
# drop localBest: 1772 49794
# kept: 15338 28867
# 93.8% kept from total
# but there are large numbers of alignments for some probes:
# 62 CGENZEB_456005547_0
# 603 CGENZEB_456005221_0
# 454 CGENZEB_456010007_0
# 409 CGENZEB_456014900_0
# 372 CGENZEB_456009900_0
# try increase identity but low minReps
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=8 \
-ignoreNs -bestOverlap -minId=0.97 -minCover=0.15 raw.psl \
contigMinRep8minId97.psl
# seqs aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 1982 8772
# weird over: 9 29
# kept weird: 7 14
# drop localBest: 766 2915
# kept: 13715 18322
# this improves the highest number of hits a lot, but it is similar to
# that achieved with the higher identity setting alone;
# however, only 80% of sequences are kept.
# 145 CGENZEB_456015402_0
#  79 CGENZEB_456008445_0
#  72 CGENZEB_456015991_0
#  53 CGENZEB_456012678_0
#  46 CGENZEB_456004521_0
# lower minCov:
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=8 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.08 raw.psl \
contigMinCov8.psl
# seqs aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# weird over: 13 48
# kept weird: 8 16
# drop localBest: 1288 7040
# kept: 14632 19420
# 89.2%, now nearBest = 0.1%
pslCDnaFilter -localNearBest=0.001 -minQSize=20 -minNonRepSize=8 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.10 raw.psl \
contigMinCov10NearBest1percent.psl
# seqs aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# weird over: 13 48
# kept weird: 7 15
# drop localBest: 1350 7451
# kept: 14632 19009
# same number of sequences aligning but less overall alignments:
# 115 CGENZEB_456015402_0
# 71 CGENZEB_456015991_0
# 71 CGENZEB_456008445_0
# 46 CGENZEB_456004521_0
# 38 CGENZEB_456008610_0
# CGENZEB_456012678_0 now went down to 1.
# 89.2% aligned
# use minCover = 0.40
pslCDnaFilter -localNearBest=0.001 -minQSize=20 -minNonRepSize=8 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.40 raw.psl \
contig.psl
#            seqs  aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# weird over: 13 48
# kept weird: 7 15
# drop localBest: 1350 7451
# kept: 14632 19009
# little difference using minCover=0.60
cd /san/sanvol1/scratch/danRer4/compugen
rm contig*
# Use these parameters:
pslCDnaFilter -localNearBest=0.001 -minQSize=20 -minNonRepSize=8 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.40 raw.psl \
contig.psl
# use minCover = 0.40
#            seqs  aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# weird over: 13 48
# kept weird: 7 15
# drop localBest: 1350 7451
# kept: 14632 19009
# use minCover=0.60
#            seqs  aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# drop minCover: 198 507
# weird over: 9 39
# kept weird: 4 12
# drop localBest: 1285 7009
# kept: 14588 18944
# lift up the coordinates to chrom level
liftUp compugenZebrafish.psl \
/cluster/data/danRer4/jkStuff/liftAll.lft warn contig.psl
# Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft
# Lifting contig.psl
# rsync these psl files
rsync -a --progress /san/sanvol1/scratch/danRer4/compugen/*.psl \
/cluster/data/danRer4/bed/compugenZebrafish
ssh kkstore04
cd /cluster/data/danRer4/bed/compugenZebrafish
pslCheck compugenZebrafish.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer4/bed/compugenZebrafish
hgsql -e 'drop table compugenZebrafish;' danRer4
hgLoadPsl danRer4 compugenZebrafish.psl
# Add entry in trackDb/zebrafish/trackDb.ra and a search for hgFindSpec
# Add a description page.
# Need to add disclaimer for sequences.
# Add the Compugen oligo sequences for this track
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/compugenProbes
ln -s \
/projects/compbio/data/microarray/compugen/zebrafish/GISZfishArray.fa \
/gbdb/hgFixed/compugenProbes
hgLoadSeq danRer4 /gbdb/hgFixed/compugenProbes/GISZfishArray.fa
# Clean up
rm batch.bak contig.psl raw.psl
#########################################################################
# ENSEMBL GENES TRACKS FOR ENSEMBL VERSION 42
# ENSEMBL GENES (PROTEIN-CODING) AND ENSEMBL NON-CODING GENES
# (DONE, 2007-01-08 - 2007-01-09 hartera)
# Obtained from BioMart at Ensembl (The Wellcome Trust Sanger Institute)
# Starting downloading Ensembl v41 genes (2006-12-13)
# get "unexpected end of file" error with the peptide download.
# Notified Ensembl (2006-12-15).
# Ensembl helpdesk say that the files sometimes get terminated early
# for large downloads so try using this link to BioMart instead:
# http://www.biomart.org/biomart/martview
# Repeat above using this link. This has Ensembl42 though so e-mailed
# Ensembl to ask if they are releasing Ensembl42 soon (2006-12-18)
# Ensembl was updated to v42 in Dec. 2006 so use this new data set
# (2007-01-08):
ssh kkstore04
mkdir -p /cluster/data/danRer4/bed/ensembl42
cd /cluster/data/danRer4/bed/ensembl42
# Get the Ensembl gene data from BioMart at:
# http://www.biomart.org/biomart/martview
# Follow this sequence through the pages: (NOTE: this interface has changed
# significantly since danRer3). Ensembl version is 42 (Dec 2006).
# 1) The Dataset link in the left panel is selected. Select the
# Ensembl dataset (v42 here) and the Danio_rerio choice (ZFISH6 here).
# 2) Click on the Attributes link in the left panel.
# 3) Select Structures. Click on the + next to GENE to expand it
# and check the boxes for the Ensembl Gene ID and Ensembl
# Transcript ID.
# 4) Clicking on the "Count" link on the top black menu shows that there
# are 28,508 / 28,508 Genes selected in Danio rerio genes (ZFISH6)
# 5) Click on the "Results" link on the top black menu and then select GFF
# as the format and select to export all results to a
# "Compressed web file (notify by e-mail)" and hit the "Go" button and
# enter e-mail address as requested.
# When results are ready, you will receive an e-mail with a link to
# download the results, save as ensemblGene42.gff.gz
# Save as and move file to
# /cluster/data/danRer4/bed/ensembl42
gunzip ensemblGene42.gff.gz
# file unzips ok.
# Repeat above but at step 3, select the Features attribute and
# select Ensembl Transcript ID and Biotype under the GENE section.
# Select "Text, tab separated" as the output format and gzip
# compression. Biotype gives information to separate the genes into
# protein-coding and RNA genes and pseudogenes.
# For step 5, select CSV as the output and then select to export all
# results to a "Compressed web file (notify by e-mail)" and hit the
# "Go" button and enter e-mail address as requested.
# Save as ensemblGene42Biotype.tsv.gz and move to
# /cluster/data/danRer4/bed/ensembl42
gunzip ensemblGene42Biotype.tsv.gz
# file unzips ok.
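# A quick look at the biotype breakdown (an illustrative check, not part of
# the original run); the first line of the file is a header:
tail +2 ensemblGene42Biotype.tsv | awk 'BEGIN{FS="\t"} {print $2}' \
    | sort | uniq -c | sort -nr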
# The Ensembl gene predictions are mapped to chromosomes except for
# chrNA and chrUn. Use lift files for scaffolds to these chroms.
# get chrUn and chrNA Ensembl records.
ssh kkstore04
cd /cluster/data/danRer4/bed/ensembl42
# need to lift up the NA and Un scaffolds to chroms
liftUp -type=.gtf ensemblGene42.lifted \
/cluster/data/danRer4/jkStuff/liftAll.lft carry ensemblGene42.gff
# Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft
# Lifting ensemblGene42.gff
wc -l ensemblGene42*
# 807871 ensemblGene42.gff
# 807871 ensemblGene42.lifted
# 39626 ensemblGene42Biotype.tsv
# check there are no scaffolds left in lifted file:
grep Zv6_NA ensemblGene42.lifted
grep Zv6_scaffold ensemblGene42.lifted
# there are none so ok.
# add chr at beginning of each line. NA and Un already have "chr"
# prefix so then remove the extra one.
sed -e "s/^/chr/" ensemblGene42.lifted | sed -e "s/chrchr/chr/" \
> ensGene42.gff
# check file sizes -ok and some of the lifted co-ordinates
# Also remove the suffix that denotes the transcript version number.
# This is not in the ensGtp or ensPep tables.
perl -pi.bak -e 's/\.[0-9]+//'g ensGene42.gff
# Next split up the gff into a protein-coding gene set and a RNA gene and
# pseudogene set and load into different tracks.
# get transcript IDs only for protein coding transcripts
grep "protein_coding" ensemblGene42Biotype.tsv | awk '{print $1}' \
> ensGene42ProteinCoding.ids
# skip header line and grab everything else from the file
tail +2 ensemblGene42Biotype.tsv | grep -v "protein_coding" \
| awk '{print $1}' > ensGene42NonCoding.ids
wc -l ensGene42*ids
# 3560 ensGene42NonCoding.ids
# 36065 ensGene42ProteinCoding.ids
# 39625 total
wc -l ensemblGene42Biotype.tsv
# 39626 ensemblGene42Biotype.tsv
# extra line is the header line
# then get only the protein-coding transcripts from the GFF file
# write a script to do this as grep is slow
cat << 'EOF' > getIds.pl
#!/usr/bin/perl -w
use strict;
my ($in, $file, %ids);
$in = $ARGV[0]; # list of ids
$file = $ARGV[1]; # GFF file or other data file
open(IN, $in) || die "Can not open $in :$!\n";
open (FILE, $file) || die "Can not open $file :$!\n";
open (FOUND, ">found.log") || die "Can not create found.log: $!\n";
while (<IN>) {
chomp;
my $l = $_;
$ids{$l} = 1;
}
close(IN);
# read GFF file or other data file and check whether transcript ID is in
# the hash before printing out that line.
while (<FILE>){
my ($line, $transId);
$line = $_;
$transId = "";
if ($line =~ /(ENSDART[0-9]+)/){
$transId = $1;
}
if (exists($ids{$transId})){
print $line;
print FOUND "$transId\n";
}
}
close(FILE);
'EOF'
chmod +x getIds.pl
perl getIds.pl ensGene42ProteinCoding.ids ensGene42.gff \
> ensGene42ProteinCoding.gff
# uniq found.log and check against input ids
sort found.log | uniq > foundProtein.uniq
sort ensGene42ProteinCoding.ids > ens42ProteinIds.sort
comm -13 foundProtein.uniq ens42ProteinIds.sort
# All ids were found in the gff file
perl getIds.pl ensGene42NonCoding.ids ensGene42.gff \
> ensGene42NonCoding.gff
sort found.log | uniq > foundNonCoding.uniq
sort ensGene42NonCoding.ids > ens42NonCodingIds.sort
comm -13 foundNonCoding.uniq ens42NonCodingIds.sort
# All ids were found in the gff file
rm *.sort *.uniq *.bak found.log
wc -l ensGene42*.gff
# 807871 ensGene42.gff
# 3695 ensGene42NonCoding.gff
# 804176 ensGene42ProteinCoding.gff
# load into database
ssh hgwdev
cd /cluster/data/danRer4/bed/ensembl42
hgsql -e 'drop table ensGene;' danRer4
hgsql -e 'drop table ensGeneNonCoding;' danRer4
/cluster/bin/x86_64/ldHgGene danRer4 ensGene ensGene42ProteinCoding.gff
# Read 36065 transcripts in 804176 lines in 1 files
# 36065 groups 27 seqs 1 sources 4 feature types
# 36065 gene predictions
/cluster/bin/x86_64/ldHgGene danRer4 ensGeneNonCoding ensGene42NonCoding.gff
# Read 3560 transcripts in 3695 lines in 1 files
# 3560 groups 27 seqs 1 sources 1 feature types
# 3560 gene predictions
# The only difference between Ensembl v42 and v41 for zebrafish is two
# extra gene predictions in the non-coding category in v42.
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use BioMart to create it as above, except:
# Step 3) Choose "Features". Expand the GENE section and under
# "Ensembl Attributes", check boxes for Ensembl Gene ID,
# Ensembl Transcript ID, Ensembl Peptide ID.
# After clicking on the Results link in the top black menu,
# Choose CSV as the output format and Export all results to a
# "Compressed web file (notify by e-mail)" and hit the
# "Go" button and enter e-mail address as requested.
# Result name: ensembl42Gtp.tsv.gz
ssh kkstore04
cd /cluster/data/danRer4/bed/ensembl42
gunzip ensembl42Gtp.tsv.gz
# separate the IDs for protein-coding genes and the rest (RNA genes and
# pseudogenes).
# transcript ID and gene ID are in different columns than before so switch
# Gene ID should be in the first column and Transcript ID in the second column.
awk 'BEGIN {FS="\t"} {OFS="\t"} {print $2,$1,$3}' ensembl42Gtp.tsv \
> ens42GtpFormat.tsv
perl getIds.pl ensGene42ProteinCoding.ids ens42GtpFormat.tsv \
> ensGtpProteinCoding.txt
# uniq found.log and check against input ids
sort found.log | uniq > foundProtein.uniq
sort ensGene42ProteinCoding.ids > ens42ProteinIds.sort
comm -13 foundProtein.uniq ens42ProteinIds.sort
perl getIds.pl ensGene42NonCoding.ids ens42GtpFormat.tsv \
> ensGtpNonCoding.txt
# uniq found.log and check against input ids
sort found.log | uniq > foundNonCoding.uniq
sort ensGene42NonCoding.ids > ens42NonCodingIds.sort
comm -13 foundNonCoding.uniq ens42NonCodingIds.sort
# All ids were found in the gff file
rm *.sort *.uniq *.bak found.log
wc -l ensGtp*.txt
# 3560 ensGtpNonCoding.txt
# 36065 ensGtpProteinCoding.txt
# The non-coding set have only gene ids and transcript ids and
# no protein ids.
# Load database
ssh hgwdev
cd /cluster/data/danRer4/bed/ensembl42/
hgsql -e 'drop table ensGtp;' danRer4
# load ensGtp for protein-coding genes
hgLoadSqlTab danRer4 ensGtp ~/kent/src/hg/lib/ensGtp.sql \
ensGtpProteinCoding.txt
# only load IDs for the protein coding genes. The non-coding genes
# have no protein ID.
# Get the ensembl peptide sequences from
# http://www.biomart.org/biomart/martview
# Follow this sequence:
# 1) Choose the Ensembl Genes 42 as the database and then
# Danio rerio genes (ZFISH6) as the dataset.
# 2) Click on the Attributes link in the left panel. Select sequences.
# 3) Expand the SEQUENCES section and choose Peptide as type of sequence
# to export and then expand the Header Information section and select
# Ensembl Gene ID from Gene Attributes and
# Ensembl Transcript ID and Ensembl Peptide ID from
# Transcript Attributes
# 4) Click on the Filters link in the left panel and expand the GENE
# section. Select the Gene type box and then select protein_coding as
# these are the only genes with an associated protein sequence.
# 5) Click on the Results link in the top black menu bar and
# choose FASTA for the output and export all results to
# Compressed file (notify by e-mail).
# save the file as ensembl42Pep.fasta.gz and move to
# /cluster/data/danRer4/bed/ensembl42
# Got results URL by e-mail but BioMart seems to be currently inaccessible
ssh kkstore04
cd /cluster/data/danRer4/bed/ensembl42
gunzip ensembl42Pep.fasta.gz
grep '>' ensembl42Pep.fasta | wc -l
# 36048
grep '>' ensembl42Pep.fasta > headers
awk 'BEGIN {FS="|"} {print $2;}' headers > pepTranscript.ids
sort pepTranscript.ids | uniq > pepTranscript.ids.sort
sort ensGene42ProteinCoding.ids | uniq > proteinCoding.ids.sort
comm -13 proteinCoding.ids.sort pepTranscript.ids.sort
# no difference
comm -23 proteinCoding.ids.sort pepTranscript.ids.sort > noPep
# There are 17 of these.
# found some of them on the Ensembl zebrafish Genome Browser and found
# the peptide sequences. E-mailed Ensembl's helpdesk to ask how to get
# peptide sequences for these 17 transcript IDs (2007-01-09).
# Then downloaded peptide sequences for just this set of 17, but only got
# 16 of them. To do this, follow the instructions as above for the
# obtaining the peptide sequences but on the Filters page, expand the GENE
# section and check the box for ID list limit and select
# Ensembl Transcript ID(s) and paste in the list. Name output file
# otherIDs.fasta.gz
gunzip otherIDs.fasta.gz
grep '>' otherIDs.fasta > headers2
awk 'BEGIN {FS="|"} {print $2;}' headers2 > otherPepTranscript.ids
sort otherPepTranscript.ids | uniq > otherPepTranscript.ids.sort
comm -13 noPep otherPepTranscript.ids.sort
# no difference
comm -23 noPep otherPepTranscript.ids.sort
# ENSDART00000049311
# Repeat above procedure to query for peptide sequence for just this
# transcript ID and name file: otherIDs2.fasta.gz
# E-mailed helpdesk@ensembl.org to report all these problems (2007-01-09)
gunzip otherIDs2.fasta.gz
# Concatenate all sequences:
cat ensembl42Pep.fasta otherIDs.fasta otherIDs2.fasta > ens42Pep.fasta
grep '>' ens42Pep.fasta | wc
# 36065
grep '>' ens42Pep.fasta > all.headers
awk 'BEGIN {FS="|"} {print $2;}' all.headers | sort | uniq > allTxIds.sort
comm -13 proteinCoding.ids.sort allTxIds.sort
# no difference
comm -23 proteinCoding.ids.sort allTxIds.sort
# no difference so got all protein sequences for the protein-coding
# transcript IDs now.
# load into database
ssh hgwdev
cd /cluster/data/danRer4/bed/ensembl42
hgsql -e 'drop table ensPep;' danRer4
hgPepPred danRer4 ensembl ensembl42Pep.fasta
# edit trackDb/zebrafish/danRer4 to have an ensGene entry with the
# archive date for Ensembl v42 which is used for creating stable archive
# links for the transcript ID and protein ID to make sure that these
# always connect to the correct version of Ensembl Genes.
# added track handler to hgTracks.c for ensGeneNonCoding and added
# code to hgc.c to handle creating the correct stable archive link for
# a particular version of Ensembl.
# trackDb/zebrafish/danRer4/trackDb.ra entries for ensGene and
# ensGeneNonCoding include these lines for creating the correct URLs:
# url http://dec2006.archive.ensembl.org/Danio_rerio/transview?transcript=$$
# urlName gene
# archive dec2006
# Add Biotype and External Gene ID to the Ensembl Non-Coding genes table
# These can be retrieved from BioMart using the method as above for
# Biotype but also selecting the External Gene ID. Click on the Filter
# link on the left panel and expand the GENE section and check the box
# for Gene Type and select all types except for protein_coding.
# Select TSV as the output and Compressed file (*.gz) as the format.
# save as ensNonCoding.biotype.txt.gz
ssh hgwdev
cd /cluster/data/danRer4/bed/ensembl42
gunzip ensNonCoding.biotype.txt.gz
tail +2 ensNonCoding.biotype.txt > ensNonCoding.biotype.tab
cat << 'EOF' > ensBiotype.sql
CREATE TABLE ensBiotype (
transcriptId varchar(255) not null,
biotype varchar(255) not null,
extGeneId varchar(255) not null
);
'EOF'
hgLoadSqlTab danRer4 ensBiotype ensBiotype.sql ensNonCoding.biotype.tab
# Add extra fields to ensNonCoding genePred table:
hgsql -e \
'alter table ensGeneNonCoding add biotype varchar(255) NOT NULL;' \
danRer4
hgsql -e \
'alter table ensGeneNonCoding add extGeneId varchar(255) NOT NULL;' \
danRer4
# Add index to the extGeneId column:
hgsql -e 'alter table ensGeneNonCoding add index(extGeneId);' danRer4
hgsql -e 'select count(*) from ensGeneNonCoding;' danRer4
# 3560
hgsql -e 'update ensGeneNonCoding set biotype = "";' danRer4
hgsql -e 'update ensGeneNonCoding set extGeneId = "";' danRer4
# Now populate these columns with data from the ensBiotype table
hgsql -e 'select count(*) from ensGeneNonCoding as g, ensBiotype as b \
where g.name = b.transcriptId;' danRer4
# 3560
hgsql -e 'update ensGeneNonCoding as g, ensBiotype as b \
set g.biotype = b.biotype where g.name = b.transcriptId;' danRer4
hgsql -e 'select count(*) from ensGeneNonCoding where biotype != "";' \
danRer4
# 3560
# then set the External Gene ID:
hgsql -e 'update ensGeneNonCoding as g, ensBiotype as b \
set g.extGeneId = b.extGeneId where g.name = b.transcriptId;' danRer4
hgsql -e 'select count(*) from ensGeneNonCoding where extGeneId != "";' \
danRer4
# 3393
# This is correct since 167 rows in the ensNonCoding.biotype.tab have no
# external Gene ID:
awk '{if ($3 == "") print;}' ensNonCoding.biotype.tab | wc -l
# 167
# 3393 + 167 = 3560
# Now check code in hgc.c for handling the details page for this track.
#########################################################################
# RADIATION HYBRID (RH) MAP TRACK (DONE, 2007-01-12 - 2007-01-23, hartera)
# Data from Yi Zhou at Boston Children's Hospital:
# yzhou@enders.tch.harvard.edu
# Latest RH map sequences and primers received on 2006-10-03 from
# Anhua (Peter) Song - asong@enders.tch.harvard.edu
# Changed the name of rhMapInfo table and related files to rhMapZfishInfo
# to make the name more zebrafish-specific (2007-02-08, hartera)
# Remake track as one of the primer sequences was in the sequence for
# 1942C.INSERTMUT and also changed another marker name to remove a forward
# slash. Remade rhMapZfishInfo table and removed spaces from primer sequences.
# (2007-02-14, hartera)
# Collected stats on RH map alignments for Yi Zhou (DONE, 2007-03-28, hartera)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/ZonLab/rhMap-2006-10-03
cd /cluster/data/danRer4/bed/ZonLab
ln -s rhMap-2006-10-03 rhMap
cd rhMap
# download data files from e-mail:
# rhSequenceSubmit100306.zip and rhSequenceSubmitSeq100306.zip
unzip rhSequenceSubmit100306.zip
unzip rhSequenceSubmitSeq100306.zip
dos2unix rhSequenceSubmit100306.txt
dos2unix rhSequenceSubmitSeq100306.txt
# Sequences are in rhSequenceSubmitSeq100306.txt; primers and other
# information are in rhSequenceSubmit100306.txt
grep '>' rhSequenceSubmitSeq100306.txt | wc -l
# 11514
wc -l rhSequenceSubmit100306.txt
# 13438 rhSequenceSubmit100306.txt
grep '>' rhSequenceSubmitSeq100306.txt > rhMap.names
# remove '>' from names and grab first field
perl -pi.bak -e 's/>//' rhMap.names
awk 'BEGIN {FS="|"} {print $1;}' rhMap.names | sort | uniq \
> rhMap.namesOnly.sort
awk 'BEGIN {FS="|"} {print $1;}' rhSequenceSubmit100306.txt | sort | uniq \
> rhMapPrimers.namesOnly.sort
wc -l *.sort
# 11514 rhMap.namesOnly.sort
# 13436 rhMapPrimers.namesOnly.sort (after removing blank line)
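# (A minimal sketch, not in the original log.) The blank line noted above
# can be stripped before counting with a simple sed filter:
sed '/^$/d' rhMapPrimers.namesOnly.sort | wc -l
# 13436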
# get a list of headers from the FASTA file
grep '>' rhSequenceSubmitSeq100306.txt > rhMap.headers
awk 'BEGIN {FS="|"} {print $5;}' rhMap.headers | sort | uniq
# BAC_END
# EST
# GENE
# SSLP
# STS
# There are 5 types of sequence here.
awk 'BEGIN {FS="|"} {print $9;}' rhMap.headers | sort | uniq
#BACends
#Custom
#Insertion_Mutant
#Insertion_Mutants
#MGH
#NCBI
#Sanger SG
#Sequencing_Project
#ThisseClone
#Thisse_Clone
#other_zfEst
#wu_zfEst
#wz
awk 'BEGIN {FS="|"} {print $10;}' rhMap.headers | sort | uniq
# CHBG
# MPIEB
# Insertion_Mutant = Insertion_Mutants; ThisseClone = Thisse_Clone;
# So there are 11 different sources.
# There are 2 sequences with problem primers. E-mailed Peter Song about
# these and he suggested deleting those primers:
# >fb33f01.u1|5|388|5615|EST|f|cR|f|wu_zfEst|CHBG|+++33333333333333333333.|
# >zfishb-a976e04.p1c|14|16|158|STS|f|cR|f|Sequencing_Project|CHBG|A|A|
# edit rhMap100306.fa and rhMapPrimers100306.txt (created below) and delete these primers.
# need to reformat FASTA headers so they are in the format:
# NAME.SOURCE.TYPE.ORIGIN
# Insertion_Mutant=Insertion_Mutants; Thisse_Clone=ThisseClone
# so change these to have the same name. Also shorten Sanger SG to
# Shotgun.
sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmitSeq100306.txt \
| sed -e 's/Insertion_Mutant/InsertMut/' \
| sed -e 's/Sanger SG/Shotgun/' \
| sed -e 's/ThisseClone/Thisse/' \
| sed -e 's/Thisse_Clone/Thisse/' \
| sed -e 's/Sequencing_Project/Seqproj/' > rhMap100306.fa
# Do the same for the primers and information file:
sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmit100306.txt \
| sed -e 's/Insertion_Mutant/InsertMut/' \
| sed -e 's/Sanger SG/Shotgun/' \
| sed -e 's/ThisseClone/Thisse/' \
| sed -e 's/Thisse_Clone/Thisse/' \
| sed -e 's/Sequencing_Project/Seqproj/' > rhMapPrimers100306.txt
# edit these files to remove the extra newline char after the first primer
# for 1942c and then change "/" in FJ34C05.Y1/FJ56G09.Y1.WU_ZFEST to
# an underscore (2007-02-14, hartera)
perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \
rhMap100306.fa
perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \
rhMapPrimers100306.txt
# use a script to reformat the names for the FASTA headers to the format
# >NAME.SOURCE where name is the first field separated by "|" and source
# is the 9th field. The source is used to make the name unique. Some
# of these names are BAC ends that occur in the BAC ends track so there
# are name clashes in the seq table if the names are not made unique.
# Also make the name upper case, as was done for the danRer1 and danRer2
# RH maps, and remove the base numbering on each sequence line of the FASTA file.
cat << '_EOF_' > rhFix
#!/usr/bin/awk -f
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
split(toupper($0), a, "\\|");
print a[1]"."a[9];
next;
}
/^[0-9]+ / {
$0 = $2;
}
{
print $0;
}
'_EOF_'
# << keep emacs coloring happy
chmod +x rhFix
rhFix rhMap100306.fa > rhMap.fa
# Blat sequences vs danRer4 genome
ssh pk
mkdir -p /cluster/data/danRer4/bed/ZonLab/rhMap/blatRun
cd /cluster/data/danRer4/bed/ZonLab/rhMap
# put the rhMap sequences on the san
mkdir -p /san/sanvol1/scratch/danRer4/rhMap
cp rhMap.fa /san/sanvol1/scratch/danRer4/rhMap/
# do blat run to align RH map sequences to danRer4 and use
# chrNA_random and chrUn_random separated into scaffolds.
cd blatRun
ls -1S /san/sanvol1/scratch/danRer4/rhMap/rhMap.fa > rhMap.lst
ls -1 /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > genome.lst
foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa)
ls -1 $f >> genome.lst
end
wc -l genome.lst
# 3237 genome.lst
# for output:
mkdir -p /san/sanvol1/scratch/danRer4/rhMap/psl
# use -repeats option to report matches to repeat bases separately
# to other matches in the PSL output.
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=80 -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer4/rhMap/psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
gensub2 genome.lst rhMap.lst template.sub para.spec
para create para.spec
para try, check, push ... etc.
para time
# Completed: 3237 of 3237 jobs
#CPU time in finished jobs: 4787s 79.78m 1.33h 0.06d 0.000 y
#IO & Wait Time: 8080s 134.67m 2.24h 0.09d 0.000 y
#Average job time: 4s 0.07m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 18s 0.30m 0.01h 0.00d
#Submission to last job: 752s 12.53m 0.21h 0.01d
# need to do pslSort and lift up
ssh pk
cd /san/sanvol1/scratch/danRer4/rhMap
# Do sort, liftUp and then best in genome filter.
# only use alignments that have at least
# 95% identity in aligned region.
# Previously did not use minCover since a lot of sequence is in
# Un and NA so genes may be split up so good to see all alignments.
# However, found a number of short alignments of <= 50 bp. These are
# not meaningful so maybe need to use minCover. If increased too much,
# then hits on poor parts of the assembly will be missed.
# use pslCDnaFilter with the same parameters as used for zebrafish
# Genbank EST alignments.
pslSort dirs raw.psl tmp psl
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 3442 104586
# drop minCover: 2838 205568
# weird over: 163 1124
# kept weird: 107 172
# drop localBest: 3011 17130
# kept: 11121 14216
# 11514 sequences in total
# The percentage aligned is 11121/11514 = 96.6%
# Number of alignments for markers with most alignments after filtering:
# 35 BZ83M20.Z.BACENDS
# 17 ZKP63A5.YA.BACENDS
# 17 ZKP117C9.YA.BACENDS
# 16 ZK30E10.SP6.BACENDS
# 15 ZC133H17.ZA.BACENDS
# 12 Z13442.MGH
# 11 ZK105J10.T7.BACENDS
# 10 ZC261G9.ZAF.BACENDS
# 10 ZC261G9.ZA.BACENDS
# 9 ZK19H9.SP6.BACENDS
# 9 Z4910.MGH
# 9 FJ07G09.X1.WU_ZFEST
# 8 ZK4I5.T7.BACENDS
# 8 ZC27I3.ZA.BACENDS
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.92 -minCover=0.15 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 2740 60578
# drop minCover: 3083 223430
# weird over: 318 3132
# kept weird: 154 249
# drop localBest: 3480 43022
# kept: 11212 14470
# Percentage aligned is 11212/11514 = 97.4%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=10 \
-ignoreNs -bestOverlap -minId=0.92 -minCover=0.15 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
#drop minNonRepSize: 3026 1258275
# drop minIdent: 2902 72521
# drop minCover: 3256 231002
# weird over: 344 3365
# kept weird: 157 252
# drop localBest: 3604 51799
# kept: 11228 14560
# There isn't much difference: 11228/11514 = 97.5%
awk '{print $10}' contig.psl | sort | uniq -c | sort -nr
# Top numbers of hits:
# 35 BZ83M20.Z.BACENDS
# 17 ZKP63A5.YA.BACENDS
# 17 ZKP117C9.YA.BACENDS
# 16 ZK30E10.SP6.BACENDS
# 15 ZC133H17.ZA.BACENDS
# 13 FJ07G09.X1.WU_ZFEST
# 12 Z13442.MGH
# 11 ZK105J10.T7.BACENDS
# 10 ZC261G9.ZAF.BACENDS
# 10 ZC261G9.ZA.BACENDS
# 9 ZK19H9.SP6.BACENDS
# 9 Z4910.MGH
# 9 Z3157.MGH
# 8 ZK4I5.T7.BACENDS
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.90 -minCover=0.15 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 2306 34000
# drop minCover: 3166 230461
# weird over: 388 5030
# kept weird: 168 270
# drop localBest: 3647 62505
# kept: 11232 14534
# Percent sequences aligned: 11232/11514 = 97.6%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.90 -minCover=0.20 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 2306 34000
# drop minCover: 3418 245102
# weird over: 343 4235
# kept weird: 159 252
# drop localBest: 3206 48291
# kept: 11189 14107
# Percent sequences aligned: 11189/11514 = 97.2%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.80 -minCover=0.20 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
#drop minNonRepSize: 3068 1286657
# drop minIdent: 1 2
# drop minCover: 3599 256955
# weird over: 414 8594
# kept weird: 173 270
# drop localBest: 3410 70389
# kept: 11205 14154
# Percent sequences aligned: 11205/11514 = 97.3%
# 35 BZ83M20.Z.BACENDS
# 17 ZKP63A5.YA.BACENDS
# 17 ZKP117C9.YA.BACENDS
# 16 ZK30E10.SP6.BACENDS
# 15 ZC133H17.ZA.BACENDS
# 13 FJ07G09.X1.WU_ZFEST
# 11 ZK105J10.T7.BACENDS
# 10 ZC261G9.ZAF.BACENDS
# 10 ZC261G9.ZA.BACENDS
# 9 ZK19H9.SP6.BACENDS
# 9 Z4910.MGH
# 8 ZK4I5.T7.BACENDS
# 8 ZC27I3.ZA.BACENDS
# 8 Z7243.MGH
# 8 Z3157.MGH
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.80 -minCover=0.15 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
#drop minNonRepSize: 3068 1286657
# drop minIdent: 1 2
# drop minCover: 3322 238087
# weird over: 470 9995
# kept weird: 181 288
# drop localBest: 3876 88821
# kept: 11246 14590
# Percent sequences aligned: 11246/11514 = 97.7%
# Use lower minId and higher minCover (0.20) as for the BAC ends and for
# the RH map on other zebrafish assemblies.
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.20 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
#drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 3552 255528
# weird over: 403 7578
# kept weird: 171 268
# drop localBest: 3358 68020
# kept: 11203 14146
# 97.3% (11203/11514) of sequences are aligned using these filter criteria
# Loaded these sequences as below and then checked the rhMap track in the
# danRer4 Genome Browser to see if there are any pileups.
# there is one big pileup on chr24 that is in the same region as
# was found for danRer3 after using liftOver:
# i.e. chr13:8,112,962-8,113,055 on danRer3 which lifts over to
# chr24:8,191,404-8,191,497 on danRer4 and there is also a pileup
# of RH map sequences here. If you look at Z33743, it has 3 alignments
# to chr23, chr24 and chrNA_random. The chr23 alignment is the best and
# this is where its primers map to. If a higher threshold is taken
# for min coverage in the filtering, this may be avoided. Checked all the
# whole chromosome views in the Browser and chr24 is the only one that
# appears to have this large pileup.
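# One way to inspect a marker like Z33743 outside the Browser (a minimal
# sketch, not from the original log; assumes the trial rhMap table loaded
# above is in place and that names follow the NAME.SOURCE convention):
hgsql -e 'select tName, tStart, tEnd from rhMap where qName like "Z33743.%";' danRer4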
# try increasing the minCover parameter:
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.25 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
#drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 3754 271241
# weird over: 358 6379
# kept weird: 157 252
# drop localBest: 2916 52769
# kept: 11100 13684
# Percent sequences aligned: 11100/11514 = 96.4%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.30 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 3929 283124
# weird over: 310 5451
# kept weird: 145 236
# drop localBest: 2549 41325
# kept: 10938 13245
# Percent sequences aligned: 10938/11514 = 95.0%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.40 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
#drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 4293 298517
# weird over: 245 4052
# kept weird: 128 211
# drop localBest: 2079 26658
# kept: 10489 12519
# Percent sequences aligned: 10489/11514 = 91.1%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.35 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 4119 292022
# weird over: 274 4640
# kept weird: 137 227
# drop localBest: 2279 32801
# kept: 10724 12871
# Percent sequences aligned: 10724/11514 = 93.1%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.32 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 4001 287002
# weird over: 296 5113
# kept weird: 144 235
# drop localBest: 2437 37599
# kept: 10862 13093
# Percent sequences aligned: 10862/11514 = 94.3%
rm contig*
# Final parameters: use minCover=0.33
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.33 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 4045 288763
# weird over: 287 4946
# kept weird: 142 233
# drop localBest: 2375 35906
# kept: 10818 13025
# Percent sequences aligned: 10818/11514 = 94.0%
# This is a compromise between reducing the number of sequences piling
# up on chr24 but not losing all alignments for too many sequences.
cd /cluster/data/danRer4/bed/ZonLab/rhMap
# lift up to genome level coordinates
rm rhMap*psl
liftUp rhMap.psl \
/cluster/data/danRer4/jkStuff/liftAll.lft warn \
/san/sanvol1/scratch/danRer4/rhMap/contig.psl
# Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft
pslCheck rhMap.psl
# psl looks ok
# cleanup
rm *.bak rhMap.headers rhMap.names *.sort headers.new
# Load sequence alignments into the database
ssh hgwdev
cd /cluster/data/danRer4/bed/ZonLab/rhMap
# drop test tables and reload final psl file
# drop old rhMap table
hgsql -e 'drop table rhMap;' danRer4
hgLoadPsl danRer4 rhMap.psl
# Copy sequences to gbdb if they are not already there.
mkdir -p /gbdb/danRer4/rhMap
# remove old sequences
rm /gbdb/danRer4/rhMap/rhMap20061003.fa
ln -s \
/cluster/data/danRer4/bed/ZonLab/rhMap/rhMap.fa \
/gbdb/danRer4/rhMap/rhMap20061003.fa
# then add sequences to database:
# remove old sequences (2007-02-14, hartera)
hgsql -e 'select * from extFile where path like "%rhMap%";' danRer4
# | id     | name             | path                                 | size    |
# +--------+------------------+--------------------------------------+---------+
# | 709793 | rhMap20061003.fa | /gbdb/danRer4/rhMap/rhMap20061003.fa | 7456887 |
hgsql -e 'select count(*) from seq where extFile = 709793;' danRer4
# 11514
hgsql -e 'delete from seq where extFile = 709793;' danRer4
hgsql -e 'delete from extFile where id = 709793;' danRer4
# then reload the new sequence file
hgLoadSeq danRer4 /gbdb/danRer4/rhMap/rhMap20061003.fa
# loaded successfully
# Check in the Browser and see if there are many pileups
# Much reduced now on chr24. Took 10 random sequences from the pileup seen
# with minCover=0.20 and found that 7 of them still align to danRer4
# with minCover=0.33, and 2 of those that no longer align also have primers
# that do not map using the hgPcr tool.
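# To list what remains in the chr24 pileup region after the reload (a
# minimal sketch, not part of the original log; coordinates are the
# danRer3 liftOver region noted above):
hgsql -N -e 'select qName from rhMap where tName = "chr24" and tEnd > 8191404 and tStart < 8191497;' danRer4 | sort | uniq -c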
# Add trackDb entry and also an rhMap.html for trackDb/zebrafish/danRer4
# also add the search specs for hgFindSpec to trackDb.ra
# Add table of related information for the RH map details pages:
# Check that all the headers from rhMap.headers are also in the primers
# file which seems to contain the same headers from the FASTA file
# as well as additional markers.
# Remake the rhMapZfishInfo table too (hartera, 2007-02-14) so that
# new line is removed from 1942C.INSERTMUT line and also the underscore is
# added to the FJ34C05.Y1_FJ56G09.Y1.WU_ZFEST ID in place of "/".
ssh kkstore04
cd /cluster/data/danRer4/bed/ZonLab/rhMap/
grep '>' rhMap100306.fa > rhMap.headers
perl -pi.bak -e 's/>//' rhMap.headers
sort rhMap.headers > rhMap.headers.sort
sort rhMapPrimers100306.txt > rhMapPrimers.sort
wc -l *.sort
# 11514 rhMap.headers.sort
# 13437 rhMapPrimers.sort
comm -12 rhMap.headers.sort rhMapPrimers.sort | wc -l
# 11514 in common
# so all FASTA headers from rhMap100306.fa are in the primers file
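# The same check the other way round (a minimal sketch, not in the original
# log): headers missing from the primers file would show up here, so expect
# zero.
comm -23 rhMap.headers.sort rhMapPrimers.sort | wc -l
# 0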
# Get headers again from rhMap.fa file as the names of the sources have
# been changed. Parse out information from headers to add to an rhMapInfo
# table so that this information can be displayed on the details page for
# the RH map markers.
# Fields: 1 - name, 2 - linkage group (chrom), 3 - position number on the
# RH map for that linkage group, 4 - distance (in cR) from the
# top of the linkage group, 5 - type of marker (SSLP, BAC_END, EST, GENE,
# STS), 9 - source, 10 - institute that mapped the marker,
# 11 - 5' forward primer, 12 - 3' reverse primer.
# Sort headers by linkage group and by position
grep '>' rhMap100306.fa > rhMap.headers2
# then use the rhMap.headers2 file to extract the marker information
# and to reformat the names for the FASTA headers to the format
# >NAME.SOURCE where name is the first field separated by "|" and source
# is the 9th field so that names in the rhMap and rhMapInfo tables are
# the same. The source is used to make the name unique.
cat << '_EOF_' > getRhInfo
#!/usr/bin/awk -f
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
sub(/>/,"",$0);
split(toupper($0), a, "\\|");
print a[1]"."a[9]"\tLG"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[9]"\t"a[10]"\t"a[11]"\t"a[12];
next;
}
'_EOF_'
# << keep emacs coloring happy
chmod +x getRhInfo
getRhInfo rhMap.headers2 > rhMapInfo.tab
# Sort headers by linkage group (LG) and by position
sort -k 2,2 -k 3,3n rhMapInfo.tab > rhMapInfoSorted.tab
wc -l rhMapInfoSorted.tab
# 11514 rhMapInfoSorted.tab
# Need to add ZFIN IDs - data received on 2006-06-23
# rhSeqWithZdbNameToRachel.zip
unzip rhSeqWithZdbNameToRachel.zip
tail +3 rhSeqWithZdbNameToRachel.txt \
| awk 'BEGIN {OFS= "\t"} {print $1, $7}' \
| sort | uniq > rhSeqZfinIds.txt
# translate names to upper case
cat rhSeqZfinIds.txt | tr '[a-z]' '[A-Z]' > rhSeqZfinIds.format.txt
# then map these marker names and ZFIN IDs to markers in
# rhMapInfoSorted.tab. Also remove spaces - some of the primer sequences
# have spaces (hartera, 2007-02-14)
cat << 'EOF' > mapZfinIds.pl
#!/usr/bin/perl -w
use strict;
my ($zf, $rh, %zfinIds);
$zf = $ARGV[0]; # file of ZFIN IDs and marker names
$rh = $ARGV[1]; # rhMapInfo.tab file
open (ZFIN, $zf) || die "Can not open $zf :$!\n";
open (RH, $rh) || die "Can not open $rh : $!\n";
while (<ZFIN>){
my ($line, @fi);
chomp;
$line = $_;
@fi = split(/\t/, $line);
# store ZFIN ID in hash keyed by marker name
$zfinIds{$fi[1]} = $fi[0];
}
close ZFIN;
# read in the markers from rhMapInfo file
while (<RH>){
my ($li, @f, $marker, @m, $mName, $j, $i);
$mName = "";
$zf = "";
chomp;
$li = $_;
@f = split(/\t/, $li);
$marker = $f[0];
# split by "."
@m = split(/\./, $marker);
# remove the extension after the last "."
$mName = $m[0];
if (($mName ne "") && (exists($zfinIds{$mName}))) {
$zf = $zfinIds{$mName};
}
for ($j = 1; $j < $#m; $j++){
$mName = $mName . "." . $m[$j];
}
if (($mName ne "") && (exists($zfinIds{$mName}))) {
$zf = $zfinIds{$mName};
}
print "$f[0]\t$zf";
# print other fields and remove spaces
for ($i = 1; $i <= $#f; $i++){
$f[$i] =~ s/\s//g;
print "\t$f[$i]";
}
if ($#f == 6){
print "\t\t";
}
print "\n";
}
'EOF'
chmod +x mapZfinIds.pl
perl mapZfinIds.pl rhSeqZfinIds.format.txt rhMapInfoSorted.tab \
> rhMapInfoWithZfinIds.tab
# There are 1867 markers with no ZFIN ID
wc -l rhMapInfo*
# 11514 rhMapInfo.tab
# 11514 rhMapInfoSorted.tab
# 11514 rhMapInfoWithZfinIds.tab
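# The 1867 figure above can be re-derived from the output file (a minimal
# sketch, not in the original log) by counting rows where the ZFIN ID
# column (field 2) is empty:
awk 'BEGIN {FS="\t"} $2 == ""' rhMapInfoWithZfinIds.tab | wc -l
# expect 1867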
# When loading, found that 1942.C has only 1 primer. Problem with
# rhMapPrimers100306.txt. There was a new line between the primers
# for this file so remove it there and in rhMap100306.fa and then
# process it again (now this was done at an earlier step, 2007-02-14).
# Create a table with RH map item information including type, source,
# origin and primer sequences.
# already created rhMapInfo.sql, rhMapInfo.c and rhMapInfo.h files
# using autosql - see danRer3.txt. None of the assemblies with RH
# map on the RR have this rhMapInfo table so it can be redefined.
# load these into a table called rhMapInfo2 - this is rhMapInfo
# with an extra column for the ZFIN ID.
# Use autosql to create a .sql file.
ssh hgwdev
# rename the information table and make it zebrafish specific
# (2007-02-08, hartera)
cat << 'EOF' > ~/kent/src/hg/lib/rhMapZfishInfo.as
table rhMapZfishInfo
"Zebrafish Radiation Hybrid map information"
(
string name; "Name of Radiation Hybrid (RH) map marker"
string zfinId; "ZFIN ID for the marker"
string linkageGp; "Linkage group to which the marker was mapped"
uint position; "Position number in RH map for this linkage group"
uint distance; "Distance from the top of linkage group (cR)"
string markerType; "Type of marker"
string source; "Source of marker"
string mapSite; "Institution that mapped the marker"
string leftPrimer; "Forward primer sequence"
string rightPrimer; "Reverse primer sequence"
)
'EOF'
# << happy emacs
# create .sql, .c and .h files using autoSql
cd ~/kent/src/hg/lib
autoSql rhMapZfishInfo.as rhMapZfishInfo
mv rhMapZfishInfo.h ../inc
# edit rhMapZfishInfo.sql and add an index (INDEX(zfinId)).
# commit these files (*.as, *sql, *.c and *.h) to CVS replacing
# the original rhMapInfo* files.
# make changes to hgc so that it prints the ZFIN ID in addition to the
# other rhMapZfishInfo fields.
# reload table with new name (2007-02-08, hartera):
cd /cluster/data/danRer4/bed/ZonLab/rhMap
hgsql -e 'drop table rhMapInfo;' danRer4
# reloaded the rhMapZfishInfo table (2007-02-08, hartera)
hgsql -e 'drop table rhMapZfishInfo;' danRer4
hgLoadSqlTab danRer4 rhMapZfishInfo ~/kent/src/hg/lib/rhMapZfishInfo.sql \
rhMapInfoWithZfinIds.tab
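# If the INDEX(zfinId) edit mentioned above was missed before loading, the
# index could also be added to the loaded table directly (a minimal sketch,
# not from the original log):
hgsql -e 'alter table rhMapZfishInfo add index(zfinId);' danRer4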
# add code to hgc.c to print ZFIN ID, if available, on the details page
# together with the other marker-related information.
# added track to trackDb.ra in trackDb/zebrafish/danRer4 with a URL for
# the ZFIN IDs to link to the relevant page at http://www.zfin.org
# and added an html page for the track.
# Added the rhMapZfishInfo.h file to the makefile in src/hg/lib
# and replaced rhMapInfo with rhMapZfishInfo in src/hg/hgc/hgc.c
# RH MAP STATISTICS
# Get some stats for Yi Zhou at Harvard (2007-03-20 & 2007-03-28)
# Of the 11514 markers with sequence information, 10818 aligned (94%)
# using a filter for 85% sequence identity and all portions of all
# alignments for a sequence must be within 0.5% of the identity of the
# best alignments for each portion of the marker. The query must have at
# least 0.33 of the sequence aligned and at least 16 bases must not be in
# repeat regions.
cd /cluster/data/danRer4/bed/ZonLab/rhMap
mkdir stats
cd stats
hgsql -e 'select count(distinct(qName)) from rhMap;' danRer4
# 10818
hgsql -N -e 'select qName from rhMap;' danRer4 | sort | uniq -c \
| sort -nr > qNames.count
# send this list too
# 1701 markers have 2 or more BLAT alignments that pass the filter.
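# The 1701 figure can be re-derived from qNames.count (a minimal sketch,
# not in the original log); the first column is the number of alignments
# per marker:
awk '$1 >= 2' qNames.count | wc -l
# expect 1701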
hgsql -N -e 'select name, linkageGp from rhMapZfishInfo;' danRer4 \
> markers.linkageGroups
hgsql -N -e 'select qName, tName from rhMap;' danRer4 > rhMap.align.chroms
ssh kkstore04
cd /cluster/data/danRer4/bed/ZonLab/rhMap/stats
sed -e 's/LG/chr/' markers.linkageGroups > markers.rhMap.chroms
# some marker names contain "LG"
awk '{print $1}' markers.linkageGroups | grep "LG"
# there are 18 and all begin with "TLG"
sed -e 's/Tchr/TLG/' markers.rhMap.chroms > markers.rhMap.chroms2
sort markers.rhMap.chroms2 | uniq > markers.rhMap.chroms.sort
wc -l markers.rhMap.chroms*
# 11514 markers.rhMap.chroms
# 11514 markers.rhMap.chroms.sort
# 11514 markers.rhMap.chroms2
# same when uniqued
sort rhMap.align.chroms | uniq > rhMap.align.chroms.sort
wc -l rhMap.align*
# 13025 rhMap.align.chroms
# 11344 rhMap.align.chroms.sort
# Find how well the RH map and Zv6 agree in terms of chromosome
# assignment given that linkage group number is the same as the
# chromosome number.
comm -23 rhMap.align.chroms.sort markers.rhMap.chroms.sort \
> diffChromInGenome
# need to restrict markers.rhMap.chroms.sort to just those markers that
# aligned (i.e. those present in rhMap.align.chroms.sort).
awk '{print $1}' rhMap.align.chroms.sort | sort | uniq > rhMap.align.names
foreach n (`cat rhMap.align.names`)
echo $n
grep -w $n markers.rhMap.chroms.sort >> markers.rhMap.chroms.aligned
end
# 10818 in markers.rhMap.chroms.aligned
# 10818 rhMap.align.names
# then compare this list to the ones that are aligned to the genome
comm -13 rhMap.align.chroms.sort markers.rhMap.chroms.aligned \
> diffChromInRHMap
wc -l diffChromInRHMap
# 1392 diffChromInRHMap
# these are the markers that have a different chromosome (linkage group)
# assigned in the RH map from that found by BLAT alignment of the marker
# sequence to the genome. This list shows the linkage group (chr) from the
# RH map; next, generate a list of where these markers align in the genome.
# Markers not in this list have at least one alignment to the same chrom
# as in the linkage map (they may also align to other chroms).
awk '{print $1}' diffChromInRHMap > diffChromInRHMap.names
foreach n (`cat diffChromInRHMap.names`)
echo $n
grep -w $n rhMap.align.chroms.sort >> rhMap.genomeAlign.diffInRHmap
end
wc -l rhMap.genomeAlign.diffInRHmap
# 1562 rhMap.genomeAlign.diffInRHmap
# This is the list of markers that differ in chrom between the RH map
# and genome alignment with the list of chroms to which they are
# aligned by BLAT in an alignment of the marker sequence to the genome.
# There are more lines in this file because some markers align more than
# once to the genome so they appear more than once in the file.
# Therefore of those markers aligned, 10818, there are 1392 (12.9%)
# that are aligning to a different chromosome.
# Some of these may be aligning to chrUn_random or chrNA_random
grep random rhMap.genomeAlign.diffInRHmap | awk '{print $1}' \
| sort | uniq > diffInRHmap.alignedToRandom
wc -l diffInRHmap.alignedToRandom
# 142 diffInRHmap.alignedToRandom
# Of the markers with different chroms in the genome alignment and the
# linkage map, 142 (1.3% of 10818) are aligning to chrUn_random or
# chrNA_random so the sequence containing these markers has
# not yet been placed on a chromosome.
#########################################################################
## Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
hgsql -h genome-testdb hgcentraltest \
-e "update dbDb set orderKey = 450 where name = 'danRer4';"
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer4
#########################################################################
# BLASTZ/CHAIN/NET oryLat1 (DONE - 2007-01-19,20 - Hiram)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19
cd /cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19
cat << '_EOF_' > DEF
# Zebrafish vs. Medaka
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Zebrafish danRer4, no randoms or Un in this sequence
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
SEQ1_LIMIT=30
# QUERY: Medaka oryLat1 (40M chunks cover the largest chroms in one gulp)
# chrUn in Scaffolds for this alignment run
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=40000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl DEF -verbose=2 \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/danRer4OryLat1 > do.log 2>&1 &
# real 556m6.806s
cat fb.danRer4.chainOryLat1Link.txt
# 209746583 bases of 1626093931 (12.899%) in intersection
cd /cluster/data/danRer4/bed
ln -s blastz.oryLat1.2007-01-19 blastz.oryLat1
## swap to oryLat1 - also in oryLat1.txt
mkdir /cluster/data/oryLat1/bed/blastz.swap.danRer4
cd /cluster/data/oryLat1/bed/blastz.swap.danRer4
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=pk > swap.log 2>&1 &
cat fb.oryLat1.chainDanRer4Link.txt
# 156014546 bases of 700386597 (22.275%) in intersection
cd /cluster/data/oryLat1/bed
ln -s blastz.swap.danRer4 blastz.danRer4
#########################################################################
# BLASTZ/CHAIN/NET fr2 (DONE - 2007-01-29 - Hiram)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/blastz.fr2.2007-01-29
cd /cluster/data/danRer4/bed/blastz.fr2.2007-01-29
cat << '_EOF_' > DEF
# Zebrafish vs. Fugu
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Zebrafish danRer4, no randoms or Un in this sequence
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
SEQ1_LIMIT=30
# QUERY: Fugu fr2
# Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/danRer4/bed/blastz.fr2.2007-01-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl DEF -verbose=2 \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/danRer4Fr2 > do.log 2>&1 &
## recover from pk kluster problems and finish blastz job
time doBlastzChainNet.pl DEF -verbose=2 \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=cat -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/danRer4Fr2 > cat.log 2>&1 &
## recover from kki kluster problems and finish chain job
time doBlastzChainNet.pl DEF -verbose=2 \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=chainMerge -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/danRer4Fr2 > chainMerge.log 2>&1 &
# real 554m13.214s
## swap
mkdir /cluster/data/fr2/bed/blastz.danRer4.swap
cd /cluster/data/fr2/bed/blastz.danRer4.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/danRer4/bed/blastz.fr2.2007-01-29/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=pk > swap.log 2>&1 &
# running 2007-01-30 - 16:35
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/danRer4/bed/blastz.fr2.2007-01-29/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=net -swap -bigClusterHub=pk > net_swap.log 2>&1 &
ssh hgwdev
cd /cluster/data/danRer4/bed/blastz.fr2.2007-01-29
time nice -n +19 featureBits danRer4 chainFr2Link \
> fb.danRer4.chainFr2Link.txt 2>&1
# 138918185 bases of 1626093931 (8.543%) in intersection
time nice -n +19 featureBits fr2 chainDanRer4Link \
> fb.fr2.chainDanRer4Link.txt 2>&1
# 80963231 bases of 393312790 (20.585%) in intersection
# ASZ (3-22-2007) this process failed to create four tables, so I created
# them and left them empty (as discussed with Hiram).
CREATE TABLE `danRer4`.`chrUn_random_chainFr2` (
`bin` smallint( 5 ) unsigned NOT NULL default '0',
`score` double NOT NULL default '0',
`tName` varchar( 255 ) NOT NULL default '',
`tSize` int( 10 ) unsigned NOT NULL default '0',
`tStart` int( 10 ) unsigned NOT NULL default '0',
`tEnd` int( 10 ) unsigned NOT NULL default '0',
`qName` varchar( 255 ) NOT NULL default '',
`qSize` int( 10 ) unsigned NOT NULL default '0',
`qStrand` char( 1 ) NOT NULL default '',
`qStart` int( 10 ) unsigned NOT NULL default '0',
`qEnd` int( 10 ) unsigned NOT NULL default '0',
`id` int( 10 ) unsigned NOT NULL default '0',
KEY `bin` ( `bin` ) ,
KEY `id` ( `id` )
) TYPE = MYISAM ;
CREATE TABLE `danRer4`.`chrUn_random_chainFr2Link` (
`bin` smallint( 5 ) unsigned NOT NULL default '0',
`tName` varchar( 255 ) NOT NULL default '',
`tStart` int( 10 ) unsigned NOT NULL default '0',
`tEnd` int( 10 ) unsigned NOT NULL default '0',
`qStart` int( 10 ) unsigned NOT NULL default '0',
`chainId` int( 10 ) unsigned NOT NULL default '0',
KEY `bin` ( `bin` ) ,
KEY `chainId` ( `chainId` )
) TYPE = MYISAM ;
CREATE TABLE `danRer4`.`chrNA_random_chainFr2` (
`bin` smallint( 5 ) unsigned NOT NULL default '0',
`score` double NOT NULL default '0',
`tName` varchar( 255 ) NOT NULL default '',
`tSize` int( 10 ) unsigned NOT NULL default '0',
`tStart` int( 10 ) unsigned NOT NULL default '0',
`tEnd` int( 10 ) unsigned NOT NULL default '0',
`qName` varchar( 255 ) NOT NULL default '',
`qSize` int( 10 ) unsigned NOT NULL default '0',
`qStrand` char( 1 ) NOT NULL default '',
`qStart` int( 10 ) unsigned NOT NULL default '0',
`qEnd` int( 10 ) unsigned NOT NULL default '0',
`id` int( 10 ) unsigned NOT NULL default '0',
KEY `bin` ( `bin` ) ,
KEY `id` ( `id` )
) TYPE = MYISAM ;
CREATE TABLE `danRer4`.`chrNA_random_chainFr2Link` (
`bin` smallint( 5 ) unsigned NOT NULL default '0',
`tName` varchar( 255 ) NOT NULL default '',
`tStart` int( 10 ) unsigned NOT NULL default '0',
`tEnd` int( 10 ) unsigned NOT NULL default '0',
`qStart` int( 10 ) unsigned NOT NULL default '0',
`chainId` int( 10 ) unsigned NOT NULL default '0',
KEY `bin` ( `bin` ) ,
KEY `chainId` ( `chainId` )
) TYPE = MYISAM ;
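# One way the empty tables could have been created (a minimal sketch, not
# from the original log; the file name emptyFr2ChainTables.sql is
# hypothetical): save the four CREATE TABLE statements above to a file,
# load them, then confirm the tables exist:
hgsql danRer4 < emptyFr2ChainTables.sql
hgsql -e 'show tables like "%chainFr2%";' danRer4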
###########################################################################
# CREATE LIFTOVER FROM danRer4 TO danRer5
# (DONE, 2007-09-21 - 2007-09-22, hartera)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/blat.danRer5
cd /cluster/data/danRer4/bed/blat.danRer5
time nice doSameSpeciesLiftOver.pl danRer4 danRer5 \
-bigClusterHub pk \
-ooc /san/sanvol1/scratch/danRer4/danRer4_11.ooc \
-buildDir=/cluster/data/danRer4/bed/blat.danRer5 >& do.log &
# 0.337u 0.208s 4:58:26.59 0.0% 0+0k 0+0io 28pf+0w
# Remove symbolic link to liftOver chains and copy over the file
rm ../liftOver/danRer4ToDanRer5.over.chain.gz
cp -p danRer4ToDanRer5.over.chain.gz ../liftOver
# a link in /usr/local/apache/htdocs/goldenPath/danRer5/liftOver has
# already been made to this file and md5sum.txt needs to be updated
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer4/liftOver
md5sum *.gz > md5sum.txt
md5sum *.gz > ../../goldenPath/liftOver/md5sum.txt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer4/liftOver
ln -s /cluster/data/danRer5/bed/liftOver/danRer4ToDanRer5.over.chain.gz .
#############################################################################
# CONTRAST GENES (2007-10-02 markd)
# received predictions from Sam Gross <ssgross@stanford.edu>
cd /cluster/data/danRer4/bed/contrastGene/
wget http://www.stanford.edu/~ssgross/contrast.danRer4.bed
# this is a custom track, not a pure BED
tail +2 contrast.danRer4.bed | hgLoadBed -tab danRer4 contrastGene stdin
# verify
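# A minimal verification sketch (not from the original log): confirm the
# table loaded and get its coverage.
hgsql -e 'select count(*) from contrastGene;' danRer4
featureBits danRer4 contrastGene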
# load track db (ra and contrastGene.html are global)
# request push of contrastGene
###########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
danRer4.upstreamGeneTbl = refGene
danRer4.upstreamMaf = multiz7way /hive/data/genomes/danRer4/bed/multiz7way/species.lst