src/hg/makeDb/doc/rn5.txt 5578389fb6a724225e9c67b28ead1712a3745ee5

5578389fb6a724225e9c67b28ead1712a3745ee5
gperez2
  Mon Jan 10 15:00:46 2022 -0800
rn5 vs. rn7 lastz/chain/net run for user, refs #28726

diff --git src/hg/makeDb/doc/rn5.txt src/hg/makeDb/doc/rn5.txt
index 5518f50..26a253e 100644
--- src/hg/makeDb/doc/rn5.txt
+++ src/hg/makeDb/doc/rn5.txt
@@ -1,3008 +1,3111 @@
 # for emacs: -*- mode: sh; -*-
 #
 #	the above keeps emacs happy while working with this text document
 
 # This file describes how we made the browser database on the
 # Rattus norvegicus genome, March 2012 update (Rnor5.0) from Baylor.
 
 #	http://www.ncbi.nlm.nih.gov/bioproject/16219
 #	http://www.ncbi.nlm.nih.gov/genome/73
 #	http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AABR06
 #	Genome Coverage : 3x BAC; 6x WGS ABI Sanger reads
 #	chrMt: NC_001665.2
 
 #	DATE:   08-Mar-2012
 #	ORGANISM:       Rattus norvegicus
 #	TAXID:  10116
 #	ASSEMBLY LONG NAME:     Rnor_5.0
 #	ASSEMBLY SHORT NAME:    Rnor_5.0
 #	ASSEMBLY SUBMITTER:     Rat Genome Sequencing Consortium
 #	ASSEMBLY TYPE:  Haploid
 #	NUMBER OF ASSEMBLY-UNITS:       1
 #	ASSEMBLY ACCESSION:     GCA_000001895.3
 #	FTP-RELEASE DATE: 19-Mar-2012
 
 #########################################################################
 ## Download sequence (DONE - 2012-03-19 - Hiram)
     mkdir /hive/data/genomes/rn5
     mkdir /hive/data/genomes/rn5/genbank
     cd /hive/data/genomes/rn5/genbank
 
     rsync -a -P \
 rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Rattus_norvegicus/Rnor_5.0/ ./
 
     faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz
 # 2902588968 bases (334517722 N's 2568071246 real 2568071246 upper 0 lower)
 #	in 21 sequences in 21 files
 # Total size: mean 138218522.3 sd 67159802.9
 #	min 54450796 (gi|380690185|gb|CM000083.4|)
 #	max 290094216 (gi|380690196|gb|CM000072.4|) median 118718031
 
     faSize Primary_Assembly/unlocalized_scaffolds/FASTA/*.fa.gz
 # 4156200 bases (1012688 N's 3143512 real 3143512 upper 0 lower)
 #	in 1278 sequences in 21 files
 # Total size: mean 3252.1 sd 8429.6
 #	min 500 (gi|380099756|gb|AABR06109458.1|)
 #	max 227955 (gi|380099484|gb|AABR06109730.1|) median 2219
 
     faSize Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz
 # 2937457 bases (1314805 N's 1622652 real 1622652 upper 0 lower)
 #	in 1439 sequences in 1 files
 # Total size: mean 2041.3 sd 5072.3 min 280 (gi|380097677|gb|AABR06111537.1|)
 #	max 73090 (gi|380452989|gb|JH620568.1|) median 723
 
     # and all together:
     faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz \
 Primary_Assembly/unlocalized_scaffolds/FASTA/*.fa.gz \
 Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz
 # 2909682625 bases (336845215 N's 2572837410 real 2572837410 upper 0 lower)
 #	in 2738 sequences in 43 files
 # Total size: mean 1062703.7 sd 13357023.2
 #	min 280 (gi|380097677|gb|AABR06111537.1|)
 #	max 290094216 (gi|380690196|gb|CM000072.4|) median 1018
 
 #########################################################################
 # fixup names for UCSC standards (DONE - 2012-03-19 - Hiram)
     mkdir /hive/data/genomes/rn5/ucsc
     cd /hive/data/genomes/rn5/ucsc
 
     ########################  Assembled Chromosomes
     cat << '_EOF_' > toUcsc.pl
 #!/bin/env perl
 
 use strict;
 use warnings;
 
 my %accToChr;
 
 open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or
         die "can not read Primary_Assembly/assembled_chromosomes/chr2acc";
 while (my $line = <FH>) {
     next if ($line =~ m/^#/);
     chomp $line;
     my ($chrN, $acc) = split('\s+', $line);
     $accToChr{$acc} = $chrN;
 }
 close (FH);
 
 foreach my $acc (keys %accToChr) {
     my $chrN =  $accToChr{$acc};
     print "$acc $accToChr{$acc}\n";
     open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz";
     open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp";
     while (my $line = <FH>) {
         if ($line =~ m/^#/) {
             print UC $line;
         } else {
             $line =~ s/^$acc/chr${chrN}/;
             print UC $line;
         }
     }
     close (FH);
     close (UC);
     open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz";
     open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa";
     while (my $line = <FH>) {
         if ($line =~ m/^>/) {
             printf UC ">chr${chrN}\n";
         } else {
             print UC $line;
         }
     }
     close (FH);
     close (UC);
 }
 '_EOF_'
     # << happy emacs
     chmod +x toUcsc.pl
     time ./toUcsc.pl
     #	real    0m50.678s
     time gzip *.fa *.agp
     #	real    12m46.877s
     faSize chr*.fa.gz
     #	2725521370 bases (77999939 N's 2647521431 real 2647521431 upper 0
     #	lower) in 21 sequences in 21 files
     #	Total size: mean 129786731.9 sd 33408399.1 min 61431566 (chr19)
     #	max 195471971 (chr1) median 124902244
 
     ########################  Unplaced scaffolds
     cat << '_EOF_' > unplaced.pl
 #!/bin/env perl
 
 use strict;
 use warnings;
 
 my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
 my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
 open (FH, "zcat $agpFile|") or die "can not read $agpFile";
 open (UC, ">unplaced.agp") or die "can not write to unplaced.agp";
 while (my $line = <FH>) {
     if ($line =~ m/^#/) {
         print UC $line;
     } else {
         $line =~ s/\.1//;    
         printf UC "chrUn_%s", $line;
     }
 }
 close (FH);
 close (UC);
 
 open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
 open (UC, ">unplaced.fa") or die "can not write to unplaced.fa";
 while (my $line = <FH>) {
     if ($line =~ m/^>/) {
         chomp $line;
         $line =~ s/.*gb\|//;
         $line =~ s/\.1\|.*//;
         printf UC ">chrUn_$line\n";
     } else {
         print UC $line;
     }
 }
 close (FH);
 close (UC);
 '_EOF_'
     # << happy emacs
     chmod +x unplaced.pl
     time ./unplaced.pl
     #	real    0m0.131s
     # make sure none of the names got to be over 31 characers long:
     grep -v "^#" unplaced.agp | cut -f1 | sort | uniq -c | sort -rn
     gzip *.fa *.agp
     # not much in that sequence:
     faSize unplaced.fa.gz
 # 2937457 bases (1314805 N's 1622652 real 1622652 upper 0 lower)
 #	in 1439 sequences in 1 files
 # Total size: mean 2041.3 sd 5072.3 min 280 (chrUn_AABR06111537)
 #	max 73090 (chrUn_JH620568) median 723
 
     ########################  Unlocalized scaffolds
     cat << '_EOF_' > unlocalized.pl
 #!/bin/env perl
 
 use strict;
 use warnings;
 
 my %accToChr;
 my %chrNames;
 
 open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or
         die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf";
 while (my $line = <FH>) {
     next if ($line =~ m/^#/);
     chomp $line;
     my ($chrN, $acc) = split('\s+', $line);
     $accToChr{$acc} = $chrN;
     $chrNames{$chrN} += 1;
 }
 close (FH);
 
 foreach my $chrN (keys %chrNames) {
     my $agpFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz";
     my $fastaFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz";
     open (FH, "zcat $agpFile|") or die "can not read $agpFile";
     open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp";
     while (my $line = <FH>) {
         if ($line =~ m/^#/) {
             print UC $line;
         } else {
             chomp $line;
             my (@a) = split('\t', $line);
             my $acc = $a[0];
             my $accNo1 = $acc;
             $accNo1 =~ s/.1$//;
             die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
             die "ERROR: chrN $chrN not correct for $acc"
                 if ($accToChr{$acc} ne $chrN);
             my $ucscName = "chr${chrN}_${accNo1}_random";
             printf UC "%s", $ucscName;
             for (my $i = 1; $i < scalar(@a); ++$i) {
                 printf UC "\t%s", $a[$i];
             }
             printf UC "\n";
         }
     }
     close (FH);
     close (UC);
     printf "chr%s\n", $chrN;
     open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
     open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa";
     while (my $line = <FH>) {
         if ($line =~ m/^>/) {
             chomp $line;
             my $acc = $line;
             $acc =~ s/.*gb\|//;
             $acc =~ s/\|.*//;
             my $accNo1 = $acc;
             $accNo1 =~ s/.1$//;
             die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
             die "ERROR: chrN $chrN not correct for $acc"
                 if ($accToChr{$acc} ne $chrN);
             my $ucscName = "chr${chrN}_${accNo1}_random";
             printf UC ">$ucscName\n";
         } else {
             print UC $line;
         }
     }
     close (FH);
     close (UC);
 }
 '_EOF_'
     # << happy emacs
     chmod +x unlocalized.pl
     time ./unlocalized.pl
     #	real    0m0.592s
     gzip *.fa *.agp
 
     # verify none of the names are longer than 31 characters:
     zcat *.agp.gz | grep -v "^#" | cut -f1 | sort -u \
 	| awk '{print length($1),$1}' | sort -rn | head
 25 chr20_AABR06110665_random
 25 chr20_AABR06110664_random
 ...
 
     faSize chr*_random.fa.gz
 # 4156200 bases (1012688 N's 3143512 real 3143512 upper 0 lower)
 #	in 1278 sequences in 21 files
 # Total size: mean 3252.1 sd 8429.6 min 500 (chr2_AABR06109458_random)
 #	max 227955 (chr4_AABR06109730_random) median 2219
 
     #	verify all the sequence is still here after all this rigamarole:
     time faSize *.fa.gz
 # 2909682625 bases (336845215 N's 2572837410 real 2572837410 upper 0 lower)
 #	in 2738 sequences in 43 files
 # Total size: mean 1062703.7 sd 13357023.2 min 280 (chrUn_AABR06111537)
 #	max 290094216 (chr1) median 1018
     # verify same numbers as was in the original files measured above
 
 #########################################################################
 # Create .ra file and run makeGenomeDb.pl (DONE - Hiram - 2012-03-19)
     cd /hive/data/genomes/rn5
     cat << '_EOF_' >rn5.config.ra
 # Config parameters for makeGenomeDb.pl:
 db rn5
 clade mammal
 scientificName Rattus norvegicus
 commonName Rat
 assemblyDate Mar. 2012
 assemblyLabel RGSC Rnor_5.0 (GCA_000001895.3)
 assemblyShortLabel RGSC 5.0
 orderKey 1559
 mitoAcc NC_001665
 fastaFiles /hive/data/genomes/rn5/ucsc/*.fa.gz
 agpFiles /hive/data/genomes/rn5/ucsc/*.agp.gz
 # qualFiles none
 dbDbSpeciesDir rat
 ncbiAssemblyId 73
 ncbiAssemblyName Rnor_5.0
 taxId 10116
 '_EOF_'
     # << happy emacs
 
     #	run agp step first to verify fasta and agp files agree
     makeGenomeDb.pl -stop=agp rn5.config.ra > agp.log 2>&1
     # verify end of agp.log indictates:
 # All AGP and FASTA entries agree - both files are valid
     # continue with the build
     time makeGenomeDb.pl -continue=db rn5.config.ra > db.log 2>&1
     #	real    22m39.834s
 
 #########################################################################
 # running repeat masker (DONE - 2012-03-19 - Hiram)
     mkdir /hive/data/genomes/rn5/bed/repeatMasker
     cd /hive/data/genomes/rn5/bed/repeatMasker
     time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
 	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
 	-smallClusterHub=encodek rn5 > do.log 2>&1 &
     # hgwdev rebooted during kluster run, continuing:
     time doRepeatMasker.pl -buildDir=`pwd` -noSplit \
 	-continue=cat -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
 	-smallClusterHub=encodek rn5 > cat.log 2>&1 &
     #	real    55m22.186s
     cat faSize.rmsk.txt
     #	2909698938 bases (336845215 N's 2572853723 real 1473277432 upper
     #	1099576291 lower) in 2739 sequences in 1 files
     #	Total size: mean 1062321.6 sd 13354598.7 min 280 (chrUn_AABR06111537)
     #	max 290094216 (chr1) median 1018
     #	%37.79 masked total, %42.74 masked real
 
     grep -i versi do.log
 # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $
 #    April 26 2011 (open-3-3-0) version of RepeatMasker
 
     featureBits -countGaps rn5 rmsk
     #	1100336249 bases of 2909698938 (37.816%) in intersection
     # why is it different than the faSize above ?
     # because rmsk masks out some N's as well as bases, the faSize count above
     #	separates out the N's from the bases, it doesn't show lower case N's
 
 ##########################################################################
 # running simple repeat (DONE - 2012-03-19 - Hiram)
     mkdir /hive/data/genomes/rn5/bed/simpleRepeat
     cd /hive/data/genomes/rn5/bed/simpleRepeat
     time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \
 	-dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \
 	rn5 > do.log 2>&1 &
     #	real    14m45.644s
 
     cat fb.simpleRepeat 
     #	97893561 bases of 2780239565 (3.521%) in intersection
 
     # add the TRF mask to the rmsk sequence:
     #	it masks more sequence
     cd /hive/data/genomes/rn5
     twoBitMask rn5.rmsk.2bit \
 	-add bed/simpleRepeat/trfMask.bed rn5.2bit
     #	you can safely ignore the warning about fields >= 13
 
     twoBitToFa rn5.2bit stdout | faSize stdin > faSize.rn5.2bit.txt
     cat faSize.rn5.2bit.txt
     #	2909698938 bases (336845215 N's 2572853723 real 1471484052 upper
     #	1101369671 lower) in 2739 sequences in 1 files
     #	Total size: mean 1062321.6 sd 13354598.7 min 280 (chrUn_AABR06111537)
     #	max 290094216 (chr1) median 1018
     #	%37.85 masked total, %42.81 masked real
 
     #	replace the previous symLink which goes to the unmasked 2bit
     rm /gbdb/rn5/rn5.2bit
     ln -s `pwd`/rn5.2bit /gbdb/rn5/rn5.2bit
 
 #########################################################################
 # Verify all gaps are marked, add any N's not in gap as type 'other'
 #	(DONE - 2012-03-19 - Hiram)
     mkdir /hive/data/genomes/rn5/bed/gap
     cd /hive/data/genomes/rn5/bed/gap
     time nice -n +19 findMotif -motif=gattaca -verbose=4 \
 	-strand=+ ../../rn5.unmasked.2bit > findMotif.txt 2>&1
     #	real    0m35.258s
     grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
     time featureBits -countGaps rn5 -not gap -bed=notGap.bed
     #	2573362844 bases of 2909698938 (88.441%) in intersection
     #	real    0m14.151s
 
     time featureBits -countGaps rn5 allGaps.bed notGap.bed -bed=new.gaps.bed
     #	509121 bases of 2909698938 (0.017%) in intersection
     #	real    2m40.070s
 
     #	what is the highest index in the existing gap table:
     hgsql -N -e "select ix from gap;" rn5 | sort -n | tail -1
     #	22612
     cat << '_EOF_' > mkGap.pl
 #!/bin/env perl
 
 use strict;
 use warnings;
 
 my $ix=`hgsql -N -e "select ix from gap;" rn5 | sort -n | tail -1`;
 chomp $ix;
 
 open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed";
 while (my $line = <FH>) {
     my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line);
     ++$ix;
     printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart,
         $chromEnd, $ix, $chromEnd-$chromStart;
 }
 close (FH);
 '_EOF_'
     # << happy emacs
     chmod +x ./mkGap.pl
     ./mkGap.pl > other.bed
     featureBits -countGaps rn5 other.bed
     #	509121 bases of 2909698938 (0.017%) in intersection
     wc -l other.bed
     #	148489
     # verify no mistake here:
     featureBits -countGaps rn5 gap other.bed
     #	0 bases of 2909698938 (0.000%) in intersection
 
     hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
 	-noLoad rn5 otherGap other.bed
     #	starting with this many
     hgsql -e "select count(*) from gap;" rn5
     #	109913
     hgsql rn5 -e 'load data local infile "bed.tab" into table gap;'
     #	result count:
     hgsql -e "select count(*) from gap;" rn5
     #	258402
     calc 109913 \+ 148489
     #	109913 + 148489 = 258402
 
     # verify we aren't adding gaps where gaps already exist
     # this would output errors if that were true:
     gapToLift -minGap=1 rn5 nonBridged.lift -bedFile=nonBridged.bed
     # see example in danRer7.txt when problems arise
 
     # there are no non-bridged gaps here:
     hgsql -N -e "select bridge from gap;" rn5 | sort | uniq -c
     #	8109 no
     #	101804 yes
 
 ##########################################################################
 ## WINDOWMASKER (DONE - 2012-03-19 - Hiram)
     mkdir /hive/data/genomes/rn5/bed/windowMasker
     cd /hive/data/genomes/rn5/bed/windowMasker
     time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
 	-dbHost=hgwdev rn5 > do.log 2>&1 &
     #	real    231m12.464s
 
     # Masking statistics
     twoBitToFa rn5.wmsk.2bit stdout | faSize stdin
     #	2909698938 bases (336845215 N's 2572853723 real 1725214664 upper
     #	847639059 lower) in 2739 sequences in 1 files
     #	Total size: mean 1062321.6 sd 13354598.7 min 280 (chrUn_AABR06111537)
     #	max 290094216 (chr1) median 1018
     #	%29.13 masked total, %32.95 masked real
 
     twoBitToFa rn5.wmsk.sdust.2bit stdout | faSize stdin
     #	2909698938 bases (336845215 N's 2572853723 real 1710265619 upper
     #	862588104 lower) in 2739 sequences in 1 files
     #	Total size: mean 1062321.6 sd 13354598.7 min 280 (chrUn_AABR06111537)
     #	max 290094216 (chr1) median 1018
     #	%29.65 masked total, %33.53 masked real
 
     hgLoadBed rn5 windowmaskerSdust windowmasker.sdust.bed.gz
     #	Loaded 13855996 elements of size 3
 
     featureBits -countGaps rn5 windowmaskerSdust
     #	1199303584 bases of 2909698938 (41.217%) in intersection
 
     #	eliminate the gaps from the masking
     featureBits rn5 -not gap -bed=notGap.bed
     #	2572853723 bases of 2572853723 (100.000%) in intersection
     time nice -n +19 featureBits rn5 windowmaskerSdust notGap.bed \
         -bed=stdout | gzip -c > cleanWMask.bed.gz
     #	862588104 bases of 2572853723 (33.527%) in intersection
     #	real    3m53.016s
 
     #	reload track to get it clean
     hgLoadBed rn5 windowmaskerSdust cleanWMask.bed.gz
     #	Loaded 13904385 elements of size 4
     time featureBits -countGaps rn5 windowmaskerSdust
     #	862588104 bases of 2909698938 (29.645%) in intersection
     #	real    1m26.398s
 
     #	mask with this clean result
     zcat cleanWMask.bed.gz \
 	| twoBitMask ../../rn5.unmasked.2bit stdin \
 	    -type=.bed rn5.cleanWMSdust.2bit
     twoBitToFa rn5.cleanWMSdust.2bit stdout | faSize stdin \
         > rn5.cleanWMSdust.faSize.txt
     cat rn5.cleanWMSdust.faSize.txt
     #	2909698938 bases (336845215 N's 2572853723 real 1710265619 upper
     #	862588104 lower) in 2739 sequences in 1 files
     #	Total size: mean 1062321.6 sd 13354598.7 min 280 (chrUn_AABR06111537)
     #	max 290094216 (chr1) median 1018
     #	%29.65 masked total, %33.53 masked real
 
     # how much does this window masker and repeat masker overlap:
     featureBits -countGaps rn5 rmsk windowmaskerSdust
     #	648600018 bases of 2909698938 (22.291%) in intersection
 
 #########################################################################
 # MASK SEQUENCE WITH WM+TRF
     # not running this since RM + TRF is plenty of masking
 #    cd /hive/data/genomes/rn5
 #    twoBitMask -add bed/windowMasker/rn5.cleanWMSdust.2bit \
 #	bed/simpleRepeat/trfMask.bed rn5.2bit
     #	safe to ignore the warnings about BED file with >=13 fields
 #    twoBitToFa rn5.2bit stdout | faSize stdin > faSize.rn5.txt
 #    cat faSize.rn5.txt
 
     #	create symlink to gbdb
 #    ssh hgwdev
 #    rm /gbdb/rn5/rn5.2bit
 #    ln -s `pwd`/rn5.2bit /gbdb/rn5/rn5.2bit
 
 #########################################################################
 # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2012-03-23 - Hiram)
     ssh encodek
     mkdir /hive/data/genomes/rn5/bed/linSpecRep
     cd /hive/data/genomes/rn5/bed/linSpecRep
 
     # split the RM output by chromosome name into separate files
     mkdir rmsk dateRepeats
     head -3 ../repeatMasker/rn5.sorted.fa.out > rmsk.header.txt
     headRest 3 ../repeatMasker/rn5.sorted.fa.out \
 	| splitFileByColumn -ending=.out -col=5 -head=rmsk.header.txt stdin rmsk
 
     ls -1S rmsk/* > rmOut.list
     wc -l rmOut.list
     #	2382 rmOut.list
 
     cat << '_EOF_' > mkLSR
 #!/bin/csh -fe
 rm -f dateRepeats/$1_homo-sapiens_mus-musculus
 /scratch/data/RepeatMasker110426/DateRepeats \
     $1 -query rat -comp human -comp mouse
 mv $1_homo-sapiens_mus-musculus dateRepeats
 '_EOF_'
     #	<< happy emacs
     chmod +x mkLSR
 
     cat << '_EOF_' > template
 #LOOP
 ./mkLSR $(path1) {check out line+ dateRepeats/$(file1)_homo-sapiens_mus-musculus}
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     gensub2 rmOut.list single template jobList
     para create jobList
     para try ... check ... push ... etc...
     para time
 # Completed: 2382 of 2382 jobs
 # CPU time in finished jobs:      37294s     621.57m    10.36h    0.43d  0.001 y
 # IO & Wait Time:                  6367s     106.11m     1.77h    0.07d  0.000 y
 # Average job time:                  18s       0.31m     0.01h    0.00d
 # Longest finished job:              78s       1.30m     0.02h    0.00d
 # Submission to last job:          2245s      37.42m     0.62h    0.03d
 
     mkdir notInHuman notInMouse
     for F in dateRepeats/chr*.out_homo-sapiens*
     do
 	B=`basename ${F}`
 	B=${B/.out*/}
 	echo $B 
         /cluster/bin/scripts/extractRepeats 1 ${F} > \
 		notInHuman/${B}.out.spec
         /cluster/bin/scripts/extractRepeats 2 ${F} > \
 		notInMouse/${B}.out.spec
     done
 
     #	Verify that these two things are actually different
     #	To check identical
     find ./notInHuman ./notInMouse -name "*.out.spec" | \
 	while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
 	| sort -k1,1n | sort -t"/" -k3,3 > check.same
     # some of them are the same, but not all:
     sed -e 's#./notInHuman/##; s#./notInMouse/##' check.same \
 	| sort | uniq -c | sort -rn | less
     # you will see a count of two at the beginning, but it becomes one soon
 
     #	Copy to data/staging for cluster replication
     mkdir /hive/data/staging/data/rn5
     rsync -a -P ./notInMouse/ /hive/data/staging/data/rn5/notInMouse/
     rsync -a -P ./notInHuman/ /hive/data/staging/data/rn5/notInHuman/
 
     # We also need the nibs for the lastz runs with lineage specific repeats
     mkdir /hive/data/staging/data/rn5/nib
     mkdir /hive/data/genomes/rn5/nib
     cd /hive/data/genomes/rn5
     cut -f1 chrom.sizes | while read C
 do
     twoBitToFa -seq=${C} rn5.2bit stdout | faToNib -softMask stdin nib/${C}.nib
     ls -og nib/$C.nib
 done
     # verify one is properly masked:
     nibFrag -masked nib/chrM.nib 0 `grep -w chrM chrom.sizes | cut -f2` + \
 	stdout | faSize stdin
     #	16313 bases (0 N's 16313 real 15922 upper 391 lower)
     #	in 1 sequences in 1 files
     #	%2.40 masked total, %2.40 masked real
     # compare to:
     twoBitToFa -seq=chrM rn5.2bit stdout | faSize stdin
     #	16313 bases (0 N's 16313 real 15922 upper 391 lower)
     #	in 1 sequences in 1 files
     #	%2.40 masked total, %2.40 masked real
 
     #	Copy to data/genomes staging for cluster replication
     rsync -a -P ./nib/ /hive/data/staging/data/rn5/nib/
 
 #########################################################################
 # cpgIslands - (DONE - 2011-04-20 - Hiram)
     mkdir /hive/data/genomes/rn5/bed/cpgIslands
     cd /hive/data/genomes/rn5/bed/cpgIslands
     time doCpgIslands.pl rn5 > do.log 2>&1
     #   Elapsed time: 11m10s
 
     cat fb.rn5.cpgIslandExt.txt
     #   10377460 bases of 2572853723 (0.403%) in intersection
 
 #########################################################################
 # genscan - (DONE - 2011-04-25 - Hiram)
     mkdir /hive/data/genomes/rn5/bed/genscan
     cd /hive/data/genomes/rn5/bed/genscan
     time doGenscan.pl rn5 > do.log 2>&1
     #   real    104m9.716s
     # a number of jobs did not finish, rerunning with window size 2000000
     #   only chr7 completed at 15000000
     # rerunning with window size of 1000000
     #   real    78m10.904s
     # rerunning with window size of 500000
     # rerunning with window size of 250000
     # rerunning with window size of 100000
     #   real    85m5.776s
     # rerunning with window size of 50000
     #   real    86m57.706s
     #   This is not working.  Need to run these split
     mkdir /hive/data/genomes/rn5/bed/genscan/splitRun
     cd /hive/data/genomes/rn5/bed/genscan/splitRun
     gapToLift rn5 rn5.nonBridged.lift -bedFile=rn5.nonBridged.bed
 
     for C in 2 3 4 5 6 7 11 13 15 19
 do
     echo chr${C}
     cd /hive/data/genomes/rn5/bed/genscan/splitRun
     grep -w "chr${C}" rn5.nonBridged.lift | grep -v random \
         | sed -e "s/chr${C}./chr${C}_/" > chr${C}.nonBridged.lift
     mkdir chr${C}
     faToTwoBit ../hardMaskedFa/000/chr${C}.fa chr${C}/chr${C}.2bit
     ~/kent/src/hg/utils/lft2BitToFa.pl chr${C}/chr${C}.2bit \
         chr${C}.nonBridged.lift > chr${C}/chr${C}.nonBridged.fa
     cd /hive/data/genomes/rn5/bed/genscan/splitRun/chr${C}
     mkdir split${C}
     faSplit sequence chr${C}.nonBridged.fa 100 split${C}/chr${C}_
 done
 
     for C in 2 3 4 5 6 7 11 13 15 19
 do
     grep -w "chr${C}" rn5.nonBridged.lift | grep -v random \
         | sed -e "s/chr${C}./chr${C}_/" > chr${C}.nonBridged.lift
 done
 
     for C in 2 3 4 5 6 7 11 13 15 19
 do
     echo chr${C}
     cd /hive/data/genomes/rn5/bed/genscan/splitRun/chr${C}
     echo '#!/bin/sh' > cmdList.sh
     export NL=-1
     ls split${C} | while read F
   do
   NL=`echo $NL | awk '{print $1+1}'`
   if [ "${NL}" -gt 7 ]; then
     NL=0
     echo "echo waiting before $F" >> cmdList.sh
     echo wait >> cmdList.sh
   fi
   echo "/cluster/bin/x86_64/gsBig split${C}/${F} gtf/${F}.gtf -trans=pep/${F}.pep -subopt=subopt/${F}.bed -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000 &" 
   done >> cmdList.sh
     echo "echo waiting at end" >> cmdList.sh
     echo "wait" >> cmdList.sh
     chmod +x cmdList.sh
     rm -fr gtf pep subopt
     mkdir gtf pep subopt
 done
 
 # running chr15 - real    345m30.138s
     # running them all:
     time ./runAll.sh > runAll.log 2>&1
     #   real    458m23.357s
 
     # collecting the results:
 for C in chr2 chr3 chr4 chr5 chr6 chr7 chr11 chr13 chr15 chr19
 do
     cd /hive/data/genomes/rn5/bed/genscan/splitRun/${C}
     cat gtf/${C}_*.gtf | liftUp -type=.gtf stdout ../${C}.nonBridged.lift error stdin \
         | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.gtf
 cat subopt/${C}_*.bed | liftUp -type=.bed stdout ../${C}.nonBridged.lift error stdin \
         | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.subopt.bed
 cat pep/${C}_*.pep | sed -e "s/${C}_0\([0-4]\)\./${C}.\1/g" > ${C}.pep
 ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed
 ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed
 done
 
     # after verifying the sizes of the files seem same compared to what
     #   happened in the main run:
 for C in chr2 chr3 chr4 chr5 chr6 chr7 chr11 chr13 chr15 chr19
 do
     cd /hive/data/genomes/rn5/bed/genscan/splitRun/${C}
    ls -l ../../gtf/00?/${C}.gtf ../../pep/00?/${C}.pep ../../subopt/00?/${C}.bed
     ls -l ${C}.gtf ${C}.pep ${C}.subopt.bed
     cd /hive/data/genomes/rn5/bed/genscan/splitRun
 done
 
     # this is tricky, it is counting on the file existing, empty or otherwise
 for C in chr2 chr3 chr4 chr5 chr6 chr7 chr11 chr13 chr15 chr19
 do
     cd /hive/data/genomes/rn5/bed/genscan/splitRun/${C}
     D=`ls ../../gtf/00?/${C}.gtf`
     rm -f "${D}"
     cp -p ${C}.gtf "${D}"
     D=`ls ../../pep/00?/${C}.pep`
     rm -f "${D}"
     cp -p ${C}.pep "${D}"
     D=`ls ../../subopt/00?/${C}.bed`
     rm -f "${D}"
     cp -p ${C}.subopt.bed "${D}"
     cd /hive/data/genomes/rn5/bed/genscan/splitRun
 done
 
     # Now, we can continue
     cd /hive/data/genomes/rn5/bed/genscan
     time doGenscan.pl -continue=makeBed -workhorse=hgwdev -dbHost=hgwdev \
         rn5 > makeBed.log 2>&1
     #   real    1m51.711s
 
     cat fb.rn5.genscan.txt
     #   56153914 bases of 2572853723 (2.183%) in intersection
     cat fb.rn5.genscanSubopt.txt
     #   59799363 bases of 2572853723 (2.324%) in intersection
 
 #########################################################################
 # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-04 - Hiram)
     # Use -repMatch=900, based on size -- for human we use 1024
     # use the "real" number from the faSize measurement,
     # hg19 is 2897316137, calculate the ratio factor for 1024:
     calc \( 2572837410 / 2897316137 \) \* 1024
     #	( 2572837410 / 2897316137 ) * 1024 = 909.319309
 
     # round up to 950  (rn4 was 1024)
 
     cd /hive/data/genomes/rn5
     time blat rn5.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=jkStuff/rn5.11.ooc -repMatch=800
     #   Wrote 34513 overused 11-mers to jkStuff/rn5.11.ooc
     #   rn4 had: Wrote 25608 overused 11-mers to /cluster/bluearc/rn4/11.ooc
     #	real     1m13.627s
 
     # there are non-bridged gaps, create lift file needed for genbank
     hgsql -N -e "select bridge from gap;" rn5 | sort | uniq -c
     #   8109 no
     #   250293 yes
 
     cd /hive/data/genomes/rn5/jkStuff
     gapToLift rn5 rn5.nonBridged.lift -bedFile=rn5.nonBridged.bed
     # largest non-bridged contig:
     awk '{print $3-$2,$0}' rn5.nonBridged.bed | sort -nr | head
     #   13151837 chr10  48884958        62036795        chr10.150
 
 #########################################################################
 # AUTO UPDATE GENBANK (DONE - 2012-05-04 - Hiram)
     # examine the file:
     /cluster/data/genbank/data/organism.lst
     # for your species to see what counts it has for:
 # organism       mrnaCnt estCnt  refSeqCnt
 # Rattus norvegicus	123478	1103589	16820
     # to decide which "native" mrna or ests you want to specify in genbank.conf
 
     ssh hgwdev  
     cd $HOME/kent/src/hg/makeDb/genbank
     git pull
     # edit etc/genbank.conf to add:
 # rn5 (rat)
 rn5.serverGenome = /hive/data/genomes/rn5/rn5.2bit
 rn5.clusterGenome = /hive/data/genomes/rn5/rn5.2bit
 rn5.ooc = /hive/data/genomes/rn5/jkStuff/rn5.11.ooc
 rn5.lift = /hive/data/genomes/rn5/jkStuff/rn5.nonBridged.lift
 rn5.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
 rn5.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
 rn5.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
 rn5.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
 rn5.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
 rn5.downloadDir = rn5
 rn5.refseq.mrna.xeno.load  = yes
 rn5.refseq.mrna.xeno.loadDesc = yes
 rn5.perChromTables = no
 rn5.mgc = yes
 # rn5.upstreamGeneTbl = refGene
 # rn5.upstreamMaf = multiz9way /hive/data/genomes/rn5/bed/multiz9way/species.lst
 
     # end of section added to etc/genbank.conf
     git commit -m "adding rn5 rat" etc/genbank.conf
     git push
     make etc-update
 
     ssh hgwdev			# used to do this on "genbank" machine
     screen -S rn5           # long running job managed in screen
     cd /cluster/data/genbank
     time nice -n +19 ./bin/gbAlignStep -initial rn5 &
     #	var/build/logs/2012.05.04-10:28:06.rn5.initalign.log
     #   real    1166m46.194s
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad rn5 &
     #	logFile: var/dbload/hgwdev/logs/2012.05.07-13:39:54.dbload.log
     #   real    63m53.369s
 
     # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram)
     cd ~/kent/src/hg/makeDb/genbank
     git pull
     # add rn5 to:
         etc/align.dbs etc/hgwdev.dbs
     git commit -m "Added rn5." etc/align.dbs etc/hgwdev.dbs
     git push
     make etc-update
 
 #########################################################################
 # constructing downloads
     # first add rn5 to all.joiner and make sure joinerCheck is clean on it
     cd /hive/data/genomes/rn5
     time makeDownloads.pl -workhorse=hgwdev -dbHost=hgwdev rn5 \
         > downloads.log 2>&1
     #   real    24m32.649s
     # need to edit the README.txt files to make sure they are correct.
 
 ##########################################################################
 # create pushQ entry (DONE - 2012-05-23 - Hiram)
     # first make sure all.joiner is up to date and has this new organism
     # a keys check should be clean:
     cd ~/kent/src/hg/makeDb/schema
     joinerCheck -database=rn5 -keys all.joiner
 
     mkdir /hive/data/genomes/rn5/pushQ
     cd /hive/data/genomes/rn5/pushQ
     makePushQSql.pl rn5 > rn5.sql 2> stderr.out
     # check stderr.out for no significant problems, it is common to see:
 # WARNING: hgwdev does not have /gbdb/rn5/wib/gc5Base.wib
 # WARNING: hgwdev does not have /gbdb/rn5/wib/quality.wib
 # WARNING: hgwdev does not have /gbdb/rn5/bbi/quality.bw
 # WARNING: rn5 does not have seq
 # WARNING: rn5 does not have extFile
     # which are no real problem
     # if some tables are not identified:
 # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of
 # supporting and genbank tables) which tracks to assign these tables to:
 #	<some table list ... >
     # put them in manually after loading the pushQ entry
     scp -p rn5.sql hgwbeta:/tmp
     ssh hgwbeta
     cd /tmp
     hgsql qapushq < rn5.sql
 
 #########################################################################
 # fixup search rule for assembly track/gold table (DONE - 2012-06-07 - Hiram)
     hgsql -N -e "select frag from gold;" rn5 | sort | head -1
 AABR06000001.1
     hgsql -N -e "select frag from gold;" rn5 | sort | tail -2
 AABR06112651.1
 NC_001665
 
     # to test the rule:
     hgsql -N -e "select frag from gold;" rn5 | wc -l
     #   112652
     # should find all:
     hgsql -N -e "select frag from gold;" rn5 \
         | egrep -e '[AN][AC][B_][R0]0[0-9]+(\.1)?' | wc -l
     #   112652
     # or exclude all:
     hgsql -N -e "select frag from gold;" rn5 \
         | egrep -v -e '[AN][AC][B_][R0]0[0-9]+(\.1)?' | wc -l
     #   0
 
     # hence, add to trackDb/rat/rn5/trackDb.ra
 searchTable gold
 shortCircuit 1
 termRegex [AN][AC][B_][R0]0[0-9]+(\.1)?
 query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
 searchPriority 8
 
 #########################################################################
 # chain/net hg19 (DONE - 2012-06-28 - Hiram)
     # the original alignment to human:
     cd /hive/data/genomes/hg19/bed/lastzRn5.2012-06-27
     cat fb.hg19.chainRn5Link.txt
     #   917356917 bases of 2897316137 (31.662%) in intersection
 
     #	running the swap - DONE - 2012-06-28
     mkdir /hive/data/genomes/rn5/bed/blastz.hg19.swap
     cd /hive/data/genomes/rn5/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzRn5.2012-06-27/DEF \
 	-swap -noLoadChainSplit \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
     #   real    66m53.095s
     cat fb.rn5.chainHg19Link.txt
     #	933922552 bases of 2572853723 (36.299%) in intersection
 
 ##############################################################################
 # make refSeq amino acid fasta file for use with mm10 UCSC genes
 #
 #  downloaded rn5.refGenePepProtNames.faa.gz using table browser, selecting
 #  refGene, download sequence, and clicking the "protein" radio button.
 #  This generates a file with the protein id's, whereas as we want the NM_*
 #  id's for the blastp process to work
 
 zcat rn5.refGenePepProtNames.faa.gz | awk '/^>/ {print buf; buf=$1 " "} ! /^>/ {buf=buf $1}' | tr -d '>' | sed 's/\.[0-9] / /' | sort > tmp1
 
 echo "select refGene.name,refLink.protAcc from refGene,refLink where refGene.name=refLink.mrnaAcc and refLink.protAcc != ''" | hgsql rn5 | tail -n +2 | awk '{print $2,$1}' | sort > tmp2
 
 join tmp1 tmp2 | awk '{printf ">%s\n%s\n",$3,$2}' > rn5.refGenePep.faa
 rm tmp1 tmp2
 
 #########################################################################
 # LASTZ Macaca rheMac3 (DONE - 2012-07-10 - Chin)
     screen -S rheMac3-rn5
     mkdir /cluster/data/rn5/bed/lastz.rheMac3.2012-07-10
     cd /cluster/data/rn5/bed/lastz.rheMac3.2012-07-10
 
     # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
     #   number of jobs, 50,000 to something under 100,000
 
     cat << '_EOF_' > DEF
 # rat vs. macacque
 BLASTZ_M=254
 
 # TARGET: Rat Rn5
 SEQ1_DIR=/cluster/data/rn5/rn5.2bit
 SEQ1_LEN=/cluster/data/rn5/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY: Macaca Mulatta RheMac3
 SEQ2_DIR=/cluster/data/rheMac3/rheMac3.2bit
 SEQ2_LEN=/cluster/data/rheMac3/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 SEQ2_LIMIT=1000
 
 BASE=/cluster/data/rn5/bed/lastz.rheMac3.2012-07-10
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
       -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
       -chainMinScore=3000 -chainLinearGap=medium \
       -syntenicNet `pwd`/DEF > do.log 2>&1 &
     #   real    1526m4.274s
     cat fb.rn5.chainRheMac3Link.txt
     # 896256761 bases of 2572853723 (34.835%) in intersection
     cd /hive/data/genomes/rn5/bed
     ln -s lastz.rheMac3.2012-07-10 lastz.rheMac3
 
     # then swap
     mkdir /cluster/data/rheMac3/bed/blastz.rn5.swap
     cd /cluster/data/rheMac3/bed/blastz.rn5.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         /cluster/data/rn5/bed/lastz.rheMac3.2012-07-10/DEF \
         -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
         -chainMinScore=3000 -chainLinearGap=medium \
         -swap -syntenicNet > swap.log 2>&1 &
     #   real    69m12.752s
     cat fb.rheMac3.chainRn5Link.txt
     # 858153767 bases of 2639145830 (32.516%) in intersection
     cd /hive/data/genomes/rheMac3/bed
     ln -s blastz.rn5.swap lastz.rn5
 
 ##############################################################################
 # LIFTOVER TO rn4 (DONE - 2012-11-01 - Hiram)
     mkdir /hive/data/genomes/rn5/bed/blat.rn4.2012-11-01
     cd /hive/data/genomes/rn5/bed/blat.rn4.2012-11-01
     # -debug run to create run dir, preview scripts...
     doSameSpeciesLiftOver.pl -debug \
 	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
         -ooc /hive/data/genomes/rn5/jkStuff/rn5.11.ooc rn5 rn4
     # Real run:
     time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
 	-bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
         -ooc /hive/data/genomes/rn5/jkStuff/rn5.11.ooc rn5 rn4 > do.log 2>&1
     #	real    114m49.176s
 
     # test the browser on hgwdev to see if it can convert from rn5 to rn4
 
 #############################################################################
 # lastz zebrafish danRer7 (DONE - 2012-11-13,14 - Hiram)
     # establish a screen to control this job with a name to indicate what it is
     screen -S rn5DanRer7
     mkdir /hive/data/genomes/rn5/bed/lastzDanRer7.2012-11-13
     cd /hive/data/genomes/rn5/bed/lastzDanRer7.2012-11-13
 
     cat << '_EOF_' > DEF
 # Rat vs. zebrafish
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Rat Rn5
 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit
 SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=15
 
 # QUERY: zebrafish danRer7
 SEQ2_DIR=/scratch/data/danRer7/danRer7.2bit
 SEQ2_LEN=/scratch/data/danRer7/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 SEQ2_LIMIT=10
 
 BASE=/hive/data/genomes/rn5/bed/lastzDanRer7.2012-11-13
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
     #	number of jobs, 50,000 to something under 100,000
     # when not present, SEQ2_LIMIT is a default 100
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         `pwd`/DEF \
         -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
         -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
     #	real    1099m42.792s
     cat fb.rn5.chainDanRer7Link.txt
     #	66573209 bases of 2572853723 (2.588%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/rn5/bed
     ln -s lastzDanRer7.2012-11-13 lastz.danRer7
 
     #	and for the swap
     mkdir /hive/data/genomes/danRer7/bed/blastz.rn5.swap
     cd /hive/data/genomes/danRer7/bed/blastz.rn5.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/rn5/bed/lastzDanRer7.2012-11-13/DEF \
         -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
         -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
     #	real    15m5.664s
     cat  fb.danRer7.chainRn5Link.txt
     #	68007020 bases of 1409770109 (4.824%) in intersection
 
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/danRer7/bed
     ln -s blastz.rn5.swap lastz.rn5
 
 #########################################################################
 # lastz frog xenTro3 (DONE - 2012-11-13,14 - Hiram)
     # establish a screen to control this job with a name to indicate what it is
     screen -S rn5XenTro3
     mkdir /hive/data/genomes/rn5/bed/lastzXenTro3.2012-11-13
     cd /hive/data/genomes/rn5/bed/lastzXenTro3.2012-11-13
 
     cat << '_EOF_' > DEF
 # Rat vs. frog
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Rat Rn5
 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit
 SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=30
 
 # QUERY: frog xenTro3
 SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit
 SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 SEQ2_LIMIT=80
 
 BASE=/hive/data/genomes/rn5/bed/lastzXenTro3.2012-11-13
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
     #	number of jobs, 50,000 to something under 100,000
     # when not present, SEQ2_LIMIT is a default 100
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         `pwd`/DEF \
         -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
         -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
     #	real    1307m43.094s
     cat fb.rn5.chainXenTro3Link.txt
     #	82011038 bases of 2572853723 (3.188%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/rn5/bed
     ln -s lastzXenTro3.2012-11-13 lastz.xenTro3
 
     #	and for the swap
     mkdir /hive/data/genomes/xenTro3/bed/blastz.rn5.swap
     cd /hive/data/genomes/xenTro3/bed/blastz.rn5.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/rn5/bed/lastzXenTro3.2012-11-13/DEF \
         -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
         -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
     #	real    25m43.688s
 
     cat  fb.xenTro3.chainRn5Link.txt
     #	87276772 bases of 1358334882 (6.425%) in intersection
 
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/xenTro3/bed
     ln -s blastz.rn5.swap lastz.rn5
 
 #########################################################################
 # lastz turkey melGal1 (DONE - 2012-11-13,14 - Hiram)
     # establish a screen to control this job with a name to indicate what it is
     screen -S rn5MelGal1
     mkdir /hive/data/genomes/rn5/bed/lastzMelGal1.2012-11-13
     cd /hive/data/genomes/rn5/bed/lastzMelGal1.2012-11-13
 
     cat << '_EOF_' > DEF
 # Rat vs. turkey
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Rat Rn5
 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit
 SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=20
 
 # QUERY: turkey melGal1
 SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit
 SEQ2_LEN=/scratch/data/melGal1/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 SEQ2_LIMIT=30
 
 BASE=/hive/data/genomes/rn5/bed/lastzMelGal1.2012-11-13
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
     #	number of jobs, 50,000 to something under 100,000
     # when not present, SEQ2_LIMIT is a default 100
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         `pwd`/DEF \
         -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
         -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
     #	real    1225m37.742s
     cat fb.rn5.chainMelGal1Link.txt
     #	93392251 bases of 2572853723 (3.630%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/rn5/bed
     ln -s lastzMelGal1.2012-11-13 lastz.melGal1
 
     #	and for the swap
     mkdir /hive/data/genomes/melGal1/bed/blastz.rn5.swap
     cd /hive/data/genomes/melGal1/bed/blastz.rn5.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/rn5/bed/lastzMelGal1.2012-11-13/DEF \
         -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
         -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
     #	real    10m0.139s
     cat  fb.melGal1.chainRn5Link.txt
     #	74638274 bases of 935922386 (7.975%) in intersection
 
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/melGal1/bed
     ln -s blastz.rn5.swap lastz.rn5
 
 #########################################################################
 # lastz chicken galGal4 (DONE - 2012-11-29,12-04 - Hiram)
     # establish a screen to control this job with a name to indicate what it is
     screen -S rn5GalGal4
     mkdir /hive/data/genomes/rn5/bed/lastzGalGal4.2012-11-29
     cd /hive/data/genomes/rn5/bed/lastzGalGal4.2012-11-29
 
     cat << '_EOF_' > DEF
 # Rat vs. chicken
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Rat Rn5
 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit
 SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=20
 
 # QUERY: chicken galGal4
 SEQ2_DIR=/hive/data/genomes/galGal4/galGal4.2bit
 SEQ2_LEN=/hive/data/genomes/galGal4/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 SEQ2_LIMIT=60
 
 BASE=/hive/data/genomes/rn5/bed/lastzGalGal4.2012-11-29
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
     #	number of jobs, 50,000 to something under 100,000
     # when not present, SEQ2_LIMIT is a default 100
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         `pwd`/DEF \
         -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
         -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
     #	Elapsed time: 1483m27s
     cat fb.rn5.chainGalGal4Link.txt
     #	97465937 bases of 2572853723 (3.788%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/rn5/bed
     ln -s lastzGalGal4.2012-11-29 lastz.galGal4
 
     #	and for the swap
     mkdir /hive/data/genomes/galGal4/bed/blastz.rn5.swap
     cd /hive/data/genomes/galGal4/bed/blastz.rn5.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/rn5/bed/lastzGalGal4.2012-11-29/DEF \
         -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
         -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
     #	real    11m49.505s
     cat fb.galGal4.chainRn5Link.txt
     #	81184849 bases of 1032854810 (7.860%) in intersection
 
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/galGal4/bed
     ln -s blastz.rn5.swap lastz.rn5
 
 #########################################################################
 # lastz Opossum monDom5 (WORKING - 2012-11-29 - Hiram)
     # establish a screen to control this job with a name to indicate what it is
     screen -S rn5MonDom5
     mkdir /hive/data/genomes/rn5/bed/lastzMonDom5.2012-11-29
     cd /hive/data/genomes/rn5/bed/lastzMonDom5.2012-11-29
 
     cat << '_EOF_' > DEF
 # Rat vs. opossum
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
 BLASTZ_M=254
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Rat Rn5
 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit
 SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=20
 
 # QUERY: Opossum monDom5
 SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit
 SEQ2_LEN=/scratch/data/monDom5/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/rn5/bed/lastzMonDom5.2012-11-29
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
     #	number of jobs, 50,000 to something under 100,000
     #	Can't do this when there are only the single small set of chroms
      time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         `pwd`/DEF \
         -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
         -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
     #	real    3213m53.797s
     #   chaining failed, running last two for chr1 and chr3
     ./chain.csh rn5.2bit:chr1: chain/rn5.2bit:chr1:.chain &
     ./chain.csh rn5.2bit:chr3: chain/rn5.2bit:chr3:.chain
     #   real    269m41.974s
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         -continue=chainMerge `pwd`/DEF \
         -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
         -chainMinScore=5000 -chainLinearGap=loose > chainMerge.log 2>&1 &
 
     cat fb.rn5.chainMonDom5Link.txt
     #	249403097 bases of 2572853723 (9.694%) in intersection
 
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/rn5/bed
     ln -s lastzMonDom5.2012-11-29 lastz.monDom5
 
     #	and for the swap
     mkdir /hive/data/genomes/monDom5/bed/blastz.rn5.swap
     cd /hive/data/genomes/monDom5/bed/blastz.rn5.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/rn5/bed/lastzMonDom5.2012-11-29/DEF \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
 XXX - running - Wed Apr  2 12:44:21 PDT 2014
     #	real    73m49.230s
     cat  fb.monDom5.chainRn5Link.txt
     #	252291401 bases of 3501660299 (7.205%) in intersection
 
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/monDom5/bed
     ln -s blastz.rn5.swap lastz.rn5
 
 #########################################################################
 # LASTZ cow bosTau7 (DONE - 2012-11-29,12-03 - Hiram)
     # establish a screen to control this job with a name to indicate what it is
     screen -S rn5BosTau7
     mkdir /hive/data/genomes/rn5/bed/lastzBosTau7.2012-11-29
     cd /hive/data/genomes/rn5/bed/lastzBosTau7.2012-11-29
 
     cat << '_EOF_' > DEF
 # cow vs mouse
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
 
 # TARGET: Rat Rn5
 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit
 SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=20
 
 # QUERY: cow BosTau7
 SEQ2_DIR=/scratch/data/bosTau7/bosTau7.2bit
 SEQ2_LEN=/scratch/data/bosTau7/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 SEQ2_LIMIT=100
 
 BASE=/hive/data/genomes/rn5/bed/lastzBosTau7.2012-11-29
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
     #	number of jobs, 50,000 to something under 100,000
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
     #   Elapsed time: 2921m25s
     cat fb.rn5.chainBosTau7Link.txt
     #   690036664 bases of 2572853723 (26.820%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/rn5/bed
     ln -s lastzBosTau7.2012-11-29 lastz.bosTau7
 
     mkdir /hive/data/genomes/bosTau7/bed/blastz.rn5.swap
     cd /hive/data/genomes/bosTau7/bed/blastz.rn5.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/rn5/bed/lastzBosTau7.2012-11-29/DEF \
 	-swap -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
     #   real    64m20.430s
     cat fb.bosTau7.chainRn5Link.txt
     #	687933878 bases of 2804673174 (24.528%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/bosTau7/bed
     ln -s blastz.rn5.swap lastz.rn5
 
 ##############################################################################
 # LASTZ dog canFam3 (DONE - 2012-11-29,12-04 - Hiram)
     # establish a screen to control this job with a name to indicate what it is
     screen -S rn5CanFam3
     mkdir /hive/data/genomes/rn5/bed/lastzCanFam3.2012-11-29
     cd /hive/data/genomes/rn5/bed/lastzCanFam3.2012-11-29
 
     cat << '_EOF_' > DEF
 # dog vs rat
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
 
 # TARGET: Rat Rn5
 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit
 SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=20
 
 # QUERY: dog CanFam3
 SEQ2_DIR=/hive/data/genomes/canFam3/canFam3.2bit
 SEQ2_LEN=/hive/data/genomes/canFam3/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 SEQ2_LIMIT=40
 
 BASE=/hive/data/genomes/rn5/bed/lastzCanFam3.2012-11-29
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
     #	number of jobs, 50,000 to something under 100,000
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
     #	real    5963m50.439s
     # there was a hung-up job that prolonged this time
     cat fb.rn5.chainCanFam3Link.txt
     #	768049485 bases of 2572853723 (29.852%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/rn5/bed
     ln -s lastzCanFam3.2012-11-29 lastz.canFam3
 
     mkdir /hive/data/genomes/canFam3/bed/blastz.rn5.swap
     cd /hive/data/genomes/canFam3/bed/blastz.rn5.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/rn5/bed/lastzCanFam3.2012-11-29/DEF \
 	-swap -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
     #	Elapsed time: 67m13s
     cat fb.canFam3.chainRn5Link.txt
     #	734454555 bases of 2392715236 (30.695%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/canFam3/bed
     ln -s blastz.rn5.swap lastz.rn5
 
 ##############################################################################
 # LASTZ panda ailMel1 (DONE - 2012-11-29,12-04 - Hiram)
     # establish a screen to control this job with a name to indicate what it is
     screen -S rn5AilMel1
     mkdir /hive/data/genomes/rn5/bed/lastzAilMel1.2012-11-29
     cd /hive/data/genomes/rn5/bed/lastzAilMel1.2012-11-29
 
     cat << '_EOF_' > DEF
 # panda vs rat
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
 
 # TARGET: Rat Rn5
 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit
 SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=50
 
 # QUERY: panda AilMel1
 SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit
 SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 SEQ2_LIMIT=350
 
 BASE=/hive/data/genomes/rn5/bed/lastzAilMel1.2012-11-29
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
     #	number of jobs, 50,000 to something under 100,000
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
     #	Elapsed time: 2389m42s
     cat fb.rn5.chainAilMel1Link.txt
     #	818618644 bases of 2572853723 (31.818%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/rn5/bed
     ln -s lastzAilMel1.2012-11-29 lastz.ailMel1
 
     cd /hive/data/genomes/rn5/bed/lastzAilMel1.2012-11-29
     time doRecipBest.pl -buildDir=`pwd` rn5 ailMel1 > rbest.log 2>&1
     # real    25m5.378s
 
     mkdir /hive/data/genomes/ailMel1/bed/blastz.rn5.swap
     cd /hive/data/genomes/ailMel1/bed/blastz.rn5.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/rn5/bed/lastzAilMel1.2012-11-29/DEF \
 	-swap -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
     #	Elapsed time: 72m40.305s
     cat fb.ailMel1.chainRn5Link.txt
     #	776437110 bases of 2245312831 (34.580%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/ailMel1/bed
     ln -s blastz.rn5.swap lastz.rn5
 
 ##############################################################################
 # LASTZ Chimp PanTro4 (DONE - 2012-11-29,12-04 - Hiram)
     # establish a screen to control this job with a name to indicate what it is
     screen -S rn5PanTro4
     mkdir /hive/data/genomes/rn5/bed/lastzPanTro4.2012-11-29
     cd /hive/data/genomes/rn5/bed/lastzPanTro4.2012-11-29
 
     cat << '_EOF_' > DEF
 # chimp vs rat
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
 
 # TARGET: Rat Rn5
 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit
 SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=40
 
 # QUERY: Chimp PanTro4
 SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit
 SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=100
 
 BASE=/hive/data/genomes/rn5/bed/lastzPanTro4.2012-11-29
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
     #	real    3194m24.511s
     cat fb.rn5.chainPanTro4Link.txt
     #	916343862 bases of 2572853723 (35.616%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/rn5/bed
     ln -s lastzPanTro4.2012-11-29 lastz.panTro4
 
     mkdir /hive/data/genomes/panTro4/bed/blastz.rn5.swap
     cd /hive/data/genomes/panTro4/bed/blastz.rn5.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/rn5/bed/lastzPanTro4.2012-11-29/DEF \
 	-swap -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
     #	real    85m28.505s
     cat fb.panTro4.chainRn5Link.txt
     #	901495233 bases of 2902338967 (31.061%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/panTro4/bed
     ln -s blastz.rn5.swap lastz.rn5
 
 ##############################################################################
 # LASTZ guinea pig cavPor3 (DONE - 2012-11-29,12-04 - Hiram)
     # establish a screen to control this job with a name to indicate what it is
     screen -S rn5CavPor3
     mkdir /hive/data/genomes/rn5/bed/lastzCavPor3.2012-11-29
     cd /hive/data/genomes/rn5/bed/lastzCavPor3.2012-11-29
 
     cat << '_EOF_' > DEF
 # guinea pig vs rat
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
 
 # TARGET: Rat Rn5
 SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit
 SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=40
 
 # QUERY: guinea pig CavPor3
 SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit
 SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 SEQ2_LIMIT=20
 
 BASE=/hive/data/genomes/rn5/bed/lastzCavPor3.2012-11-29
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
     #	number of jobs, 50,000 to something under 100,000
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
     #	real    2935m17.649s
     cat fb.rn5.chainCavPor3Link.txt
     #	749383689 bases of 2572853723 (29.127%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/rn5/bed
     ln -s lastzCavPor3.2012-11-29 lastz.cavPor3
 
     mkdir /hive/data/genomes/cavPor3/bed/blastz.rn5.swap
     cd /hive/data/genomes/cavPor3/bed/blastz.rn5.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/rn5/bed/lastzCavPor3.2012-11-29/DEF \
 	-swap -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
     #	real    82m22.205s
     cat fb.cavPor3.chainRn5Link.txt
     #	750114059 bases of 2663369733 (28.164%) in intersection
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/cavPor3/bed
     ln -s blastz.rn5.swap lastz.rn5
 
 #########################################################################
 # construct lift file Ensembl names to UCSC names (DONE 2013-02-19 Chin)
     cd /hive/data/genomes/rn5/jkStuff
 
 cat ../chrom.sizes | while read L
 do
 ucName=`echo "${L}" | awk '{print $1}'`
 ucSize=`echo "${L}" | awk '{print $2}'`
 ensName=`echo $L  | sed -e 's/^chrM/MT/; s/^chr[0-9A-Za-z]*_//; s/_random//; s/^chr//; s/^\([JA][HA][B-Z0-9]*\)/\1.1/;' | awk '{print $1}'`
 ensSize=`echo $L | awk '{print $2}'`
 echo -e "0\t$ensName\t$ensSize\t$ucName\t$ucSize"
 done > ensToUcsc.lift
 
 ##############################################################################
 # SWAP Mm9/Mouse lastz (DONE - 2013-03-27 - Hiram)
     # original alignment
     cd /hive/data/genomes/mm9/bed/lastzRn5.2013-03-14
     cat fb.mm9.chainRn5Link.txt
     #   1762022227 bases of 2620346127 (67.244%) in intersection
     # mm9 to rn4 was:
     #   1713323126 bases of 2620346127 (65.385%) in intersection
 
     #	and this swap
     mkdir /hive/data/genomes/rn5/bed/blastz.mm9.swap
     cd /hive/data/genomes/rn5/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         /hive/data/genomes/mm9/bed/lastzRn5.2013-03-14/DEF \
 	-swap -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \
 	-noLoadChainSplit -syntenicNet -workhorse=hgwdev \
 	-smallClusterHub=encodek > swap.log 2>&1 &
     #   real    5m19.183s
     # finish netChains manually due to nib directory spec error, then:
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         /hive/data/genomes/mm9/bed/lastzRn5.2013-03-14/DEF \
 	-swap -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \
 	-continue=load -syntenicNet -workhorse=hgwdev \
 	-smallClusterHub=encodek > load.log 2>&1 &
     # and the synteny step too, same error
     #	real    121m21.029s
     cat fb.rn5.chainMm9Link.txt 
     #   1795345063 bases of 2572853723 (69.780%) in intersection
     # FYI, rn4 was:
     #   1711034941 bases of 2571531505 (66.538%) in intersection
 
     # set sym link to indicate this is the lastz for this genome:
     cd /hive/data/genomes/rn5/bed
     ln -s blastz.mm9.swap lastz.mm9
 
 #########################################################################
 ##  CYTOBAND - ideogram track (DONE - 2013-06-18 - Hiram)
     mkdir /hive/data/genomes/rn5/bed/cytoBand
     cd /hive/data/genomes/rn5/bed/cytoBand
     # fetch the ideogram file:
     rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ideogram_10116_GCF_000000225.3_NA_V1 ./
 
     # Create bed file
     $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \
        ideogram_10116_GCF_000000225.3_NA_V1
 
     ## can now verify before load:
     $HOME/kent/src/utils/ncbi/cytoBandVerify.pl
     #	everything checks out OK on 21 chroms
     # Load the bed file
     hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \
 	rn5 cytoBand cytoBand.bed
     #   Read 403 elements of size 5 from cytoBand.bed
     # Make cytoBandIdeo track for ideogram gif on hgTracks page.
     # The cytoBandIdeo is just a replicate of the cytoBand track.
     hgsql -e "CREATE TABLE cytoBandIdeo SELECT * FROM cytoBandIdeo;" rn5
     hgsql rn5 -e "CREATE TABLE cytoBandIdeo (index(chrom(25),chromStart)) as SELECT * from cytoBand;"
 
     # dump existing table
     hgsql -N -e "SELECT * FROM cytoBandIdeo" rn5 > rn5.cytoBandIdeo
 
     # find chroms already covered
     hgsql -N -e 'SELECT chrom FROM cytoBandIdeo' rn5 \
        | sort -u > rn5.coveredNames
     # make cytoBand records for chroms not already covered
     hgsql -N -e 'SELECT chrom, size FROM chromInfo' rn5 \
       | grep -wvf rn5.coveredNames \
       | awk '{print $1"\t0\t"$2"\t\tgneg"}' > rn5.cytoBandNew
     # check
     wc -l rn5.*
     # combine and sort
     cat rn5.cytoBandNew rn5.cytoBandIdeo | sort -k1,1 -k2,2n \
         > rn5.cytoBandIdeoFull
     # replace exsting table
     hgsql -e "DROP TABLE cytoBandIdeo" rn5
     hgLoadSqlTab rn5 cytoBandIdeo $HOME/kent/src/hg/lib/cytoBandIdeo.sql \
            rn5.cytoBandIdeoFull
 
 ##########################################################################
 # create ucscToINSDC name mapping (DONE - 2013-08-23 - Hiram)
     mkdir /hive/data/genomes/rn5/bed/ucscToINSDC
     cd /hive/data/genomes/rn5/bed/ucscToINSDC
 
     # copying these scripts from the previous load and improving them
     # with each instance
     ./translateNames.sh NC_001665.2
     ./verifyAll.sh
     ./join.sh
 
     sed -e "s/21/25/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
       | hgLoadSqlTab rn5 ucscToINSDC stdin ucscToINSDC.tab
     checkTableCoords rn5 ucscToINSDC
     featureBits -countGaps rn5 ucscToINSDC
     # 2909698938 bases of 2909698938 (100.000%) in intersection
 
     # verify the track link to INSDC functions
 
 ##############################################################################
 # chain/net hg38 (DONE - 2014-03-31 - Pauline)
 
     #	running the swap
     mkdir /hive/data/genomes/rn5/bed/blastz.hg38.swap
     cd /hive/data/genomes/rn5/bed/blastz.hg38.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg38/bed/lastzRn5.2014-02-27/DEF \
 	-swap \
 	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 
     #   real    66m53.095s
     cat fb.rn5.chainHg38Link.txt
     #   934256475 bases of 2572853723 (36.312%) in intersection
 ##############################################################################
 ## 13-Way Multiz (DONE - 2014-04-02 - Pauline)
 
     mkdir /hive/data/genomes/rn5/bed/multiz13way
     cd /hive/data/genomes/rn5/bed/multiz13way
 
     #run tree doctor to make *.nh tree file
     /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/tree_doctor \
      --prune-all-but rn5,danRer7,galGal4,mm10,cavPor3,canFam3,rheMac3,panTro4,hg19,ailMel1,bosTau7,monDom5,melGal1,rn5 \
      ~/kent/src/hg/utils/phyloTrees/120way.nh > 13way.nh
 
     cat 13way.nh 
     (((((((hg19:0.00655,panTro4:0.00684):0.027424,rheMac3:0.037601):0.109934,
     ((mm10:0.084509,rn5:0.091589):0.230915,cavPor3:0.175779):0.041059):0.020593,
     (bosTau7:0.18908,(canFam3:0.052458,ailMel1:0.08):0.080572):0.032898):0.258392,
     monDom5:0.340786):0.181168,(galGal4:0.116254,melGal1:0.150718):0.443188):0.261354,danRer7:1.20107);
 
     #   Use this specification in the phyloGif tool:
     #   http://genome.ucsc.edu/cgi-bin/phyloGif
     #   to obtain a gif image for htdocs/images/phylo/rn5_13way.gif
 
 
     #output distances to text file
     /cluster/bin/phast/all_dists 13way.nh >>& /hive/data/genomes/rn5/bed/multiz13way/13way.distances3.txt
 
     grep -i rn5 13way.distances.txt | sort -k3,3n
 
     #mm10	rn5	0.176098
     #rn5	cavPor3	0.498283
     #hg19	rn5	0.507471
     #panTro4	rn5	0.507761
     #rheMac3	rn5	0.511098
     #rn5	canFam3	0.550084
     #rn5	ailMel1	0.577626
     #rn5	bosTau7	0.606134
     #rn5	monDom5	0.983334
     #rn5	galGal4	1.383158
     #rn5	melGal1	1.417622
     #rn5	danRer7	2.286140
 
     #Hiram used some text and phylo editing magic to convert 13way.nh to
     #reorder.13way.nh to get rn5 appearing at the top so that the phylo gif
     #displays as we want it.
 
     # extract species list from that .nh file
     sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
         reorder.13way.nh | xargs echo | sed 's/ //g; s/,/ /g' \
         | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt
 
     # making common names file
     ~/kent/src/hg/utils/phyloTrees/commonNames.sh /hive/data/genomes/rn5/bed/multiz13way/reorder.13way.nh >>& 13way.commonNames.nh
 
 
     # using this Common name .nh file in the phyloGif tool:
     # ~/kent/src/hg/utils/phyloTrees/13way.commonNames.nh
     #   http://genome.ucsc.edu/cgi-bin/phyloGif
     #   to obtain a png image for src/hg/htdocs/images/phylo/rn5_13way.png
 
     # construct a common name list:
     sed -e 's#  *##; s#[()]##g; s#:.*##;' \
     /hive/data/genomes/rn5/bed/multiz13way/13way.commonNames.nh \
           > rn5.13way.commonName.list
 
     #printing out the tree text in a way that allows easier manipulation of data to get "rat" to top of phylogif image
     asciiTree.pl 13way.commonNames.nh
 
 (((((((Human:0.00655,
       Chimp:0.00684):0.027424,
      Rhesus:0.037601):0.109934,
     ((Mouse:0.084509,
      Rat:0.091589):0.230915,
     Guinea_pig:0.175779):0.041059):0.020593,
    (Cow:0.18908,
    (Dog:0.052458,
 
      (Chicken:0.116254,
      Turkey:0.150718):0.443188):0.261354,
     Zebrafish:1.20107);
 
     #Hiram took tree file edited in treegraph - to get rat at top
 
     # Make 3 lists - to specify which type of alignment to use for each spp - rbest, syntenic nets, and normal mafs
 
     #syntenic nets: mm10, cavPor3, hg38, bosTau7, canFam3, rheMac3, panTro4
     #normal maf (aka nets): monDom5, galGal4, melGal1, danRer7
     #rbest: ailMel1
 
     #   bash shell syntax here ...
     cd /hive/data/genomes/rn5/bed/multiz13way
     export H=/hive/data/genomes/rn5/bed
     mkdir mafLinks
 
     # for syntenic nets:
     cat synNet.list | while read G
     do
         if [ -s ${H}/lastz.${G}/axtChain/rn5.*.synNet.maf.gz ]; then
             ln -s ${H}/lastz.${G}/axtChain/rn5.*.synNet.maf.gz ./mafLinks/$G.maf.gz
         else
             echo "missing lastz.${G}/mafSynNet/"
         fi
     done
 
     # for nets:
     cat netOnly.list | while read G
     do
         if [ -s ${H}/lastz.${G}/mafNet/rn5.${G}.net.maf.gz ]; then
             ln -s ${H}/lastz.$G/mafNet/rn5.${G}.net.maf.gz ./mafLinks/$G.maf.gz
         else
             echo "missing lastz.${G}/mafNet/"
         fi
     done
 
     #for rbest:
     mafRBestNet/rn5.ailMel1.rbest.maf.gz
     bed/lastz.ailMel1/axtChain [file is here]
     ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G
     ln -s /hive/data/genomes/rn5/bed/lastz.ailMel1/mafRBestNet/rn5.ailMel1.rbest.maf.gz ./mafLinks/$G.maf.gz
 
     # create species list and stripped down tree for autoMZ
     sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
         13way.nh > tmp.nh
     echo `cat tmp.nh` > tree-commas.nh
     echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
 
     # split the maf files into a set of hashed named files
     # this hash named split keeps the same chr/contig names in the same
     # named hash file.
     mkdir /hive/data/genomes/rn5/bed/multiz13way/mafSplit
     cd /hive/data/genomes/rn5/bed/multiz13way/mafSplit
     for D in `grep -v rn5 ../species.list.txt`
     do
 	echo "${D}"
 	mkdir $D
 	cd $D
 	echo "mafSplit -byTarget -useHashedName=8 /dev/null . ../../mafLinks/${D}.maf.gz"
 	mafSplit -byTarget -useHashedName=8 /dev/null . ../../mafLinks/${D}.maf.gz
 	cd ..
     done
 
     # construct a list of all possible maf file names.
     # they do not all exist in each of the species directories
     find . -type f | wc -l
     # 844
     find . -type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list
     wc -l maf.list
     # 157 maf.list
 
     mkdir /hive/data/genomes/rn5/bed/multiz13way/splitRun
     cd /hive/data/genomes/rn5/bed/multiz13way/splitRun
     mkdir maf run
     cd run
     mkdir penn
     cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn
     cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn
     cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn
 
     #   set the db and pairs directories here
     cat > autoMultiz.csh << '_EOF_'
     #!/bin/csh -ef
     set db = rn5
     set c = $1
     set result = $2
     set run = `/bin/pwd`
     set tmp = /dev/shm/$db/multiz.$c
     set pairs = /hive/data/genomes/$db/bed/multiz13way/mafSplit
     /bin/rm -fr $tmp
     /bin/mkdir -p $tmp
     /bin/cp -p ../../tree.nh ../../species.list $tmp
     pushd $tmp > /dev/null
     foreach s (`grep -v rn5 species.list.txt`)
 	set in = $pairs/$s/$c.maf
 	set out = $db.$s.sing.maf
 	if (-e $in.gz) then
 	    /bin/zcat $in.gz > $out
 	    if (! -s $out) then
 		echo "##maf version=1 scoring=autoMZ" > $out
 	    endif
 	else if (-e $in) then
 	    /bin/ln -s $in $out
 	else
 	    echo "##maf version=1 scoring=autoMZ" > $out
 	endif
     end
     set path = ($run/penn $path); rehash
     $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \
 	    > /dev/null
     popd > /dev/null
     /bin/rm -f $result
     /bin/cp -p $tmp/$c.maf $result
     /bin/rm -fr $tmp
     /bin/rmdir --ignore-fail-on-non-empty /dev/shm/$db
     '_EOF_'
     # << happy emacs
     chmod +x autoMultiz.csh
 
     cat  << '_EOF_' > template
     #LOOP
     ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/rn5/bed/multiz13way/splitRun/maf/$(root1).maf}
     #ENDLOOP
     '_EOF_'
 
     #cat-ing together all mafs into single file in prep to load
     cd 
     cd /hive/data/genomes/rn5/bed/multiz13way/splitRun/maf
 
     #checking the first line of the first maf:
     head -1 splitRun/maf/001.maf > multiz13way.maf
 
     bash
     for F in splitRun/maf/*.maf
     do
 	egrep -v "^#" ${F}
 	echo "${F}" 1>&2 #sends stuff to stderr (vs stdout where everything else is headed)
     done >> multiz13way.maf
     tail -1 splitRun/maf/001.maf >> multiz13way.maf
 
     # Load into database
     ssh hgwdev
     cd /hive/data/genomes/rn5/bed/multiz13way
     mkdir /gbdb/rn5/multiz13way
     ln -s `pwd`/multiz13way.maf /gbdb/rn5/multiz13way
     cd /dev/shm
     time nice hgLoadMaf rn5 multiz13way
 
     time hgLoadMafSummary -verbose=2 -minSize=30000 \
         -mergeGap=1500 -maxSize=200000 rn5 multiz13waySummary \
         /gbdb/rn5/multiz13way/multiz13way.maf
 
     [hgwdev:/dev/shm] $ wc -l multiz13way*
      14612517 multiz13way.tab
       2492720 multiz13waySummary.tab
 
 
 #######################################################################
 # GAP ANNOTATE MULTIZ13WAY MAF AND LOAD TABLES (DONE - 2014-05-09 - Pauline)
     # mafAddIRows has to be run on single chromosome maf files, it does not
     #   function correctly when more than one reference sequence
     #   are in a single file.
     mkdir -p /hive/data/genomes/rn5/bed/multiz13way/anno/mafSplit
     cd /hive/data/genomes/rn5/bed/multiz13way/anno/mafSplit
     time mafSplit -byTarget -useFullSequenceName \
         /dev/null . ../../multiz13way.maf
 
     ls | wc
     # 721     721   18142
 
     cd /hive/data/genomes/rn5/bed/multiz13way/anno
     # skipped making the N.bed files - all dbs should have one already
 
     for DB in `cat ../species.list.txt`
     do
       echo "${DB} "
       ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
       echo ${DB}.bed  >> nBeds
       ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
       echo ${DB}.len  >> sizes
     done
 
     # make sure they all are successful symLinks:
     ls -ogrtL
 
     cd /hive/data/genomes/rn5/bed/multiz13way/anno
     mkdir result
     cat << '_EOF_' > template
     #LOOP
     mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/rn5/rn5.2bit result/$(file1)
     #ENDLOOP
     '_EOF_'
     # << happy emacs
 
     ls mafSplit/*.maf > maf.list
     gensub2 maf.list single template jobList
 
     # combine into one file
     head -q -n 1 result/chr1.maf > rn5.13way.maf
     for F in result/*.maf
     do
 	grep -h -v "^#" ${F}
     done >> rn5.13way.maf
     #   these maf files do not have the end marker, this does nothing:
     #   tail -q -n 1 result/GL172637.maf >> rn5.13way.maf
     # How about an official end marker:
     echo "##eof maf" >> rn5.13way.maf
 
     # Load into database
     rm /gbdb/rn5/multiz13way/multiz13way.maf   # remove old symlink
     ln -s `pwd`/rn5.13way.maf /gbdb/rn5/multiz13way/multiz13way.maf
     cd /dev/shm 
     time nice -n +19 hgLoadMaf rn5 multiz13way
 
     #real	6m45.606s
     user	5m5.435s
     sys	0m15.640s
 
     time nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \
         -mergeGap=1500 -maxSize=200000 rn5 multiz13waySummary \
         /gbdb/rn5/multiz13way/multiz13way.maf
     #   Created 516059 summary blocks from 5782695 components and 2929343 mafs
     #   from /gbdb/rn5/multiz13way/multiz13way.maf
     #   real    1m6.705s
 
     wc -l multiz13way*.tab
     #   2929343 multiz13way.tab
     #   516059 multiz13waySummary.tab
     #   3445402 total
 
     rm multiz13way*.tab
  
 #########################################################################
 # Phylogenetic tree from 13-way (DONE - 2014-05-09 - Pauline)
 
     mkdir /hive/data/genomes/rn5/bed/multiz13way/4d
     cd /hive/data/genomes/rn5/bed/multiz13way/4d
 
     # the split annotated maf's are in:
     ../anno/result/*.maf
 
     # using ensGene for rn5, only transcribed genes and nothing
     #	from the randoms and other misc.
     hgsql rn5 -Ne \
     "select * from ensGene WHERE cdsEnd > cdsStart;" | cut -f 2-20 \
 	> ensGene.gp
     #	22705 ensGene.gp
 
     genePredSingleCover ensGene.gp stdout | sort > ensGeneNR.gp
     wc -l ensGeneNR.gp
     #	18416 ensGeneNR.gp (old)
     #   22863 ensGeneNR.gp (rn5)
     
     ssh ku
     mkdir /hive/data/genomes/rn5/bed/multiz13way/4d/run
     cd /hive/data/genomes/rn5/bed/multiz13way/4d/run
     mkdir ../mfa
 
     # newer versions of msa_view have a slightly different operation
     # the sed of the gp file inserts the reference species in the chr name
 
     cat << '_EOF_' > 4d.csh
     #!/bin/csh -fe
     set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
     set r = "/hive/data/genomes/rn5/bed/multiz13way"
     set c = $1
     set infile = $r/anno/result/$2
     set outfile = $3
     cd /scratch/tmp
     # 'clean' maf
     perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf    
     awk -v C=$c '$2 == C {print}' $r/4d/ensGeneNR.gp | sed -e "s/\t$c\t/\trn5.$c\t/" > $c.gp
     set NL=`wc -l $c.gp| gawk '{print $1}'`
     if ("$NL" != "0") then
 	$PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss
 	$PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile
     else
 	echo "" > $r/4d/run/$outfile
     endif
     rm -f $c.gp $c.maf $c.ss
     '_EOF_'
     # << happy emacs
     chmod +x 4d.csh
 
     ls -1S /hive/data/genomes/rn5/bed/multiz13way/anno/result/*.maf \
 	| sed -e "s#.*multiz13way/anno/result/##" \
 	> maf.list
 
     cat << '_EOF_' > template
 	#LOOP
 	4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa}
 	#ENDLOOP
 	'_EOF_'
     # << happy emacs
 
     # the tac puts the quick jobs at the front
     gensub2 maf.list single template stdout | tac > jobList
     para create jobList
     para try ... check
     para -maxJob=5 push
     para time
 
     # there are numerous problems in some of the result.  Not a whole lot
     #	of sequence is matching in these distant organisms
     # Completed: 9143 of 9202 jobs
     # Crashed: 30 jobs
     # Other count: 29 jobs
     # CPU time in finished jobs:        711s      11.85m     0.20h    0.01d  0.000 y
     # IO & Wait Time:                 23273s     387.89m     6.46h    0.27d  0.001 y
     # Average job time:                   3s       0.04m     0.00h    0.00d
     # Longest finished job:               7s       0.12m     0.00h    0.00d
     # Submission to last job:          2270s      37.83m     0.63h    0.03d
 
     # combine mfa files
     ssh hgwdev
     cd /hive/data/genomes/rn5/bed/multiz13way/4d
     # remove the broken empty files, size 0 and size 1:
     find ./mfa -type f -size 0 | xargs rm -f
     # most interesting, this did not identify files of size 1:
     #    find ./mfa -type f -size 1
     ls -og mfa | awk '$3 == 1' > empty.list
     sed -e "s#^#mfa/##" empty.list | xargs rm -f
     #want comma-less species.list.txt
     /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
 	--aggregate "`cat ../species.list.txt`" mfa/*.mfa | sed s/"> "/">"/ \
 	    > 4d.all.mfa
     # check they are all in there:
     grep "^>" 4d.all.mfa
 	#>rn5
 	#>mm10
 	#>cavPor3
 	#>hg38
 	#>panTro4
 	#>rheMac3
 	#>bosTau7
 	#>canFam3
 	#>ailMel1
 	#>monDom5
 	#>galGal4
 	#>melGal1
 	#>danRer7
 
     # tree-commas.nh:
     #((rn5,(((rn5,(mm9, rn4)),monDom5), ((melGal1,galGal3), anoCar2))),danRer7)
 
     # use phyloFit to create tree model (output is phyloFit.mod)
     time nice -n +19 \
 	/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
 	    --EM --precision MED --msa-format FASTA --subst-mod REV \
 		--tree ../tree-commas.nh 4d.all.mfa
     #	real    2m10.469s
     mv phyloFit.mod all.mod
 
     grep TREE all.mod
     #TREE: (((((((hg38:0.00668077,panTro4:0.00700149):0.0249002,rheMac3:0.035629):0.0951332,((mm10:0.0835964,rn5:0.087855):0.209399,cavPor3:0.213799):0.044572):0.0151261,(bosTau7:0.166871,(canFam3:0.0861626,ailMel1:0.0807089):0.0633319):0.030696):0.246197,monDom5:0.347347):0.194885,(galGal4:0.0541739,melGal1:0.0658595):0.408012):0.636337,danRer7:0.636337);
 
 	# it actually looks pretty good as a tree estimate ...
 
 
 #########################################################################
 # MULTIZ13way MAF FRAMES (DONE - 2014-05-09 - Pauline)
     ssh hgwdev
     mkdir /hive/data/genomes/rn5/bed/multiz13way/frames
     cd /hive/data/genomes/rn5/bed/multiz13way/frames
     mkdir genes
 
     cat << '_EOF_' > showGenes.csh
     #!/bin/csh 
     foreach db (`cat ../species.list.txt`)
 	echo -n "${db}: "
 	set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
 	foreach table ($tables)
 	    if ($table == "ensGene" || $table == "refGene" || \
 	       $table == "mgcGenes" || $table == "knownGene" || \
 	       $table == "xenoRefGene" ) then
 	       set count = `hgsql $db -N -e "select count(*) from $table"`
 		echo -n "${table}: ${count}, "
 	    endif
 	end
 	set orgName = `hgsql hgcentraltest -N -e \
 		"select scientificName from dbDb where name='$db'"`
 	set orgId = `hgsql rn5 -N -e \
 		"select id from organism where name='$orgName'"`
 	if ($orgId == "") then
 	    echo "Mrnas: 0"
 	else
 	    set count = `hgsql rn5 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
 	    echo "Mrnas: ${count}"
 	endif
     end
     '_EOF_'
 
     # Getting numbers of items in gene tracks for each of the assemblies to pick which is best gene set to use:
 
     rn5: ensGene: 29188, mgcGenes: 6924, refGene: 18572, xenoRefGene: 174272,
     Mrnas: 1247318
     mm10: ensGene: 94647, knownGene: 61642, mgcGenes: 26768, refGene: 33750,
     xenoRefGene: 159981, Mrnas: 370860
     cavPor3: ensGene: 26129, refGene: 482, xenoRefGene: 278026, Mrnas: 1244
     hg38: knownGene: 104178, mgcGenes: 34081, refGene: 54311, xenoRefGene: 171890,
     Mrnas: 2016023
     panTro4: ensGene: 29160, refGene: 2624, xenoRefGene: 278936, Mrnas: 6097
     rheMac3: refGene: 6369, xenoRefGene: 273293, Mrnas: 383362
     bosTau7: refGene: 14571, xenoRefGene: 263920, Mrnas: 34058
     canFam3: ensGene: 29884, refGene: 1579, xenoRefGene: 252177, Mrnas: 4544
     ailMel1: ensGene: 25055, xenoRefGene: 285239, Mrnas: 151
     monDom5: ensGene: 24882, refGene: 486, xenoRefGene: 247493, Mrnas: 2190
     galGal4: ensGene: 17954, refGene: 6811, xenoRefGene: 206366, Mrnas: 36961
     melGal1: ensGene: 17373, xenoRefGene: 240191, Mrnas: 333
     danRer7: ensGene: 56754, mgcGenes: 17994, refGene: 15912, xenoRefGene: 225864,
     Mrnas: 45144
 
     rn5: refGene
     rheMac3: refGene
     bosTau7: refGene
     danRer7: refGene
 
     mm10: knownGene
     hg38: knownGene
 
     cavPor3: ensGene
     panTro4: ensGene
     canFam3: ensGene
     ailMel1: ensGene
     monDom5: ensGene
     galGal4: ensGene
     melGal1: ensGene
 
     #------------------------------------------------------------------------
     # get the genes for all genomes
     # mRNAs with CDS.  single select to get cds+psl, then split that up and
     # create genePred
 
     # using ensGene for: cavPor3 panTro4 canFam3 ailMel1 monDom5 galGal4 melGal1
     for qDB in cavPor3 panTro4 canFam3 ailMel1 monDom5 galGal4 melGal1 
     do
       echo hgsql -N -e \"'select * from 'ensGene\;\" ${qDB}
 	  hgsql -N -e "select * from ensGene" ${qDB} | cut -f 2-11 \
 	  | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz
     done
 
     # Using knownGene for: mm10 hg38
     # genePreds; (must keep only the first 10 columns for knownGene)
     for qDB in mm10 hg38
     do
       echo hgsql -N -e \"'select * from 'knownGene\;\" ${qDB}
       hgsql -N -e "select * from knownGene" ${qDB} | cut -f 1-10 \
       | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz
     done
 
     # Using refGene for: xenTro7 and bosTau7 want the full extended genePred:
     for qDB in rn5 rheMac3 bosTau7 danRer7
     do
     hgsql -N -e "select * from refGene" ${qDB} | cut -f2- \
 	  | genePredSingleCover stdin stdout | gzip -2c \
 	    > genes/$qDB.gp.gz
 	echo "${qDB} done"
     done
 
     # verify counts for genes are reasonable:
 	for T in genes/*.gz
     do
 	echo -n "# $T: "
 	zcat $T | cut -f1 | sort | uniq -c | wc -l
     done
 
     # genes/ailMel1.gp.gz: 19204
     # genes/bosTau7.gp.gz: 12993
     # genes/canFam3.gp.gz: 19507
     # genes/cavPor3.gp.gz: 18631
     # genes/danRer7.gp.gz: 14016
     # genes/galGal4.gp.gz: 15391
     # genes/hg38.gp.gz: 21887
     # genes/melGal1.gp.gz: 14050
     # genes/mm10.gp.gz: 21013
     # genes/monDom5.gp.gz: 21033
     # genes/panTro4.gp.gz: 18657
     # genes/rheMac3.gp.gz: 5614
     # genes/rn5.gp.gz: 16480
 
     ssh hgwdev
     cd /hive/data/genomes/rn5/bed/multiz13way/frames
     time (cat ../anno/rn5.13way.maf \
 	| nice -n +19 genePredToMafFrames rn5 stdin stdout \
 	    `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.list.txt` \
 		| gzip > multiz13way.mafFrames.gz)
     #	real    2m22.546s
 
     # verify there are frames on everything:
     zcat multiz13way.mafFrames.gz | awk '{print $4}' | sort | uniq -c
 
      228464 ailMel1
      142021 bosTau7
      237790 canFam3
      210120 cavPor3
      254471 danRer7
      375209 galGal4
      232596 hg38
      352014 melGal1
      212356 mm10
      396623 monDom5
      213837 panTro4
       47695 rheMac3
      150465 rn5
 
     ssh hgwdev
     cd /hive/data/genomes/rn5/bed/multiz13way/frames
     time hgLoadMafFrames rn5 multiz13wayFrames multiz13way.mafFrames.gz
     # real    0m34.491s
 
 #########################################################################
 # phastCons 13-way (DONE - 2014-05-09 - Pauline)
 
     # split 13way mafs into 10M chunks and generate sufficient statistics 
     # files for # phastCons
     ssh ku 
     mkdir -p /hive/data/genomes/rn5/bed/multiz13way/cons/SS
     cd /hive/data/genomes/rn5/bed/multiz13way/cons/SS
     mkdir result done
 
     cat << '_EOF_' > mkSS.csh
 	#!/bin/csh -ef
 	set c = $1
 	set MAF = /hive/data/genomes/rn5/bed/multiz13way/anno/result/$c.maf
 	set WINDOWS = /hive/data/genomes/rn5/bed/multiz13way/cons/SS/result/$c
 	set WC = `cat $MAF | wc -l`
 	set NL = `grep "^#" $MAF | wc -l`
 	if ( -s $2 ) then
 	    exit 0
 	endif
 	if ( -s $2.running ) then
 	    exit 0
 	endif
 
 	date >> $2.running
 
 	rm -fr $WINDOWS
 	mkdir $WINDOWS
 	pushd $WINDOWS > /dev/null
 	if ( $WC != $NL ) then
 	/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
 	    $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
 	endif
 	popd > /dev/null
 	date >> $2
 	rm -f $2.running
 	'_EOF_'
     # << happy emacs
 
     chmod +x mkSS.csh
 
     cat << '_EOF_' > template
 	#LOOP
 	mkSS.csh $(root1) {check out line+ done/$(root1)}
 	#ENDLOOP
 	'_EOF_'
     # << happy emacs
 
     #	do the easy ones first to see some immediate results
     ls -1S -r ../../anno/result | sed -e "s/.maf//" > maf.list
 
     gensub2 maf.list single template jobList
     para create jobList
     para try ... check ... etc
 
     #Completed: 721 of 721 jobs
     #CPU time in finished jobs:       1900s      31.66m     0.53h    0.02d  0.000 y
     #IO & Wait Time:                  6905s     115.09m     1.92h    0.08d  0.000 y
     #Average job time:                  12s       0.20m     0.00h    0.00d
     #Longest finished job:             228s       3.80m     0.06h    0.00d
     #Submission to last job:           305s       5.08m     0.08h    0.00d
 
     # not all of them produce results
     #	 there isn't much aligned in these MAF files
     find ./result -type f | wc
     #       452     452   21708 
 
     # Run phastCons
     #	This job is I/O intensive in its output files, beware where this
     #	takes place or do not run too many at once.
     ssh ku
     mkdir -p /hive/data/genomes/rn5/bed/multiz13way/cons/run.cons
     cd /hive/data/genomes/rn5/bed/multiz13way/cons/run.cons
 
     #	there are going to be only one phastCons run using
     #	this same script.  It triggers off of the current working directory
     #	$cwd:t which is the "grp" in this script.  It is:
     #	all 
 
     cat << '_EOF_' > doPhast.csh
 	#!/bin/csh -fe
 	set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
 	set c = $1
 	set cX = $1:r
 	set f = $2
 	set len = $3
 	set cov = $4
 	set rho = $5
 	set grp = $cwd:t
 	set cons = /hive/data/genomes/rn5/bed/multiz13way/cons
 	set tmp = $cons/tmp/$f
 	mkdir -p $tmp
 	set ssSrc = $cons
 	set useGrp = "$grp.mod"
 	ln -s $ssSrc/SS/result/$c/$f.ss $tmp
 	ln -s $cons/$grp/$grp.mod $tmp
 	pushd $tmp > /dev/null
 	$PHASTBIN/phastCons $f.ss $useGrp \
 	    --rho $rho --expected-length $len --target-coverage $cov --quiet \
 	    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
 	popd > /dev/null
 	mkdir -p pp/$c bed/$c
 	sleep 4
 	touch pp/$c bed/$c
 	rm -f pp/$c/$f.pp
 	rm -f bed/$c/$f.bed
 	mv $tmp/$f.pp pp/$c
 	mv $tmp/$f.bed bed/$c
 	rm -fr $tmp
     '_EOF_'
     # << happy emacs
     chmod +x doPhast.csh
 
     #	this template will serve for all runs
     #	root1 == chrom name, file1 == ss file name without .ss suffix
 
     cat << '_EOF_' > template
     #LOOP
     ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
     #ENDLOOP
     '_EOF_'
     # << happy emacs
 
     find ../SS -type f | grep ".ss$" | sed -e 's/.ss$//' > ss.list
     wc -l ss.list
     #452 ss.list
 
     # Create parasol batch and run it
     # run for all species
     mkdir /hive/data/genomes/rn5/bed/multiz13way/cons/all
     cd /hive/data/genomes/rn5/bed/multiz13way/cons/all
     #	Using the .mod tree
     cp -p ../../4d/all.mod ./all.mod
 
     gensub2 ../run.cons/ss.list single ../run.cons/template jobList
     para create jobList
     para try ... check ...
     para push
     #Completed: 452 of 452 jobs
     #CPU time in finished jobs:       7248s     120.79m     2.01h    0.08d  0.000 y
     #IO & Wait Time:                  5165s      86.09m     1.43h    0.06d  0.000 y
     #Average job time:                  27s       0.46m     0.01h    0.00d
     #Longest finished job:              49s       0.82m     0.01h    0.00d
     #Submission to last job:           119s       1.98m     0.03h    0.00d
 
     cd /hive/data/genomes/rn5/bed/multiz13way/cons/all
 
     find ./bed -type f | grep ".bed$" | xargs cat | sort -k1,1 -k2,2n \
 	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
 	    > tmpMostConserved.bed
 
     /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed
 
     # load into database
     time nice -n +19 hgLoadBed rn5 phastConsElements13way mostConserved.bed
     # real 16.00
 
 
     # on human we often try for 5% overall cov, and 70% CDS cov 
     # most bets are off here for that goal, these alignments are too few
     #	and too far between
     featureBits rn5 -enrichment ensGene:cds phastConsElements13way
     # ensGene:cds 1.330%, phastConsElements13way 5.786%, both 0.982%, cover
     # 73.81%, enrich 12.76x
 
     featureBits rn5 -enrichment mgcGenes:cds phastConsElements13way
     # mgcGenes:cds 0.307%, phastConsElements13way 5.786%, both 0.244%, cover
     # 79.45%, enrich 13.73x
 
     # Create merged posterier probability file and wiggle track data files
     cd /hive/data/genomes/rn5/bed/multiz13way/cons/all
     mkdir downloads
 
     find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
 	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
 	| gzip -c > downloads/phastCons13way.wigFix.gz
 
     # check integrity of data with wigToBigWig
     time (zcat downloads/phastCons13way.wigFix.gz \
 	| wigToBigWig -verbose=2 stdin /hive/data/genomes/rn5/chrom.sizes \
 	    phastCons13way.bw) > bigWig.log 2>&1 &
     tail bigWig.log
 
     2 levels.  Level sizes are 1 2
     Made 1 primary index nodes out of 477 items
     level 0: size 1, offset 4912418091
     level 1: size 1, offset 4912424239
     2 levels.  Level sizes are 1 1
     # pid=5203: VmPeak:     20899452 kB
     # real    27m37.874s
 
     bigWigInfo phastCons13way.bw
 
     #version: 4
     #isCompressed: yes
     #isSwapped: 0
     #primaryDataSize: 3,587,651,518
     #primaryIndexSize: 75,581,132
     #zoomLevels: 10
     #chromCount: 179
     #basesCovered: 1,901,460,685
     #mean: 0.157458
     #min: 0.000000
     #max: 1.000000
     #std: 0.270193
 
     #	encode those files into wiggle data
     time (zcat downloads/phastCons13way.wigFix.gz \
 	| wigEncode stdin phastCons13way.wig phastCons13way.wib)
     #Converted stdin, upper limit 1.00, lower limit 0.00
     #real    10m0.015s
 
     du -hsc *.wi?
     #   3.6G    phastCons13way.wib
     #   452M    phastCons13way.wig
     #   4.0G    total
 
     # Load gbdb and database with wiggle.
     ln -s `pwd`/phastCons13way.wib /gbdb/rn5/multiz13way/phastCons13way.wib
     time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/rn5/multiz13way \
 	rn5 phastCons13way phastCons13way.wig
     #   real    0m27.750s
 
     # use to set trackDb.ra entries for wiggle min and max
     # and verify table is loaded correctly
 
     wigTableStats.sh rn5 phastCons13way
     # db.table      min max mean count sumData stdDev viewLimits
     # rn5.phastCons13way      0 1 0.157458 1901460685 2.99401e+08 0.270193 viewLimits=0:1
 
     #  Create histogram to get an overview of all the data
     time nice -n +19 hgWiggle -doHistogram -db=rn5 \
 	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
 	    phastCons13way > histogram.data 2>&1
     #   real    1m45.632s
     #	create plot of histogram:
 
     cat << '_EOF_' | gnuplot > histo.png
     set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
     set size 1.4, 0.8
     set key left box
     set grid noxtics
     set grid ytics
     set title " X. tropicalis rn5 Histogram phastCons13way track"
     set xlabel " phastCons13way score"
     set ylabel " Relative Frequency"
     set y2label " Cumulative Relative Frequency (CRF)"
     set y2range [0:1]
     set y2tics
     set yrange [0:0.02]
 
     plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
 	    "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
     '_EOF_'
     #	<< happy emacs
 
     display histo.png &
 
 #############################################################################
 # phyloP for 13-way (DONE - 2014-05-09 - Pauline)
     # run phyloP with score=LRT 
     ssh ku
     mkdir /cluster/data/rn5/bed/multiz13way/consPhyloP
     cd /cluster/data/rn5/bed/multiz13way/consPhyloP
 
     mkdir run.phyloP
     cd run.phyloP
     # Adjust model file base composition background and rate matrix to be
     # representative of the chromosomes in play
     grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
     OLD:#	0.495
     # 0.542
     /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
 	../../cons/all/all.mod 0.542 > all.mod
     # verify, the BACKGROUND should now be paired up:
     grep BACK all.mod 
     OLD:#	BACKGROUND: 0.252500 0.247500 0.247500 0.252500 
 
     #	following the pattern from panTro3 with grp: "all" (no other grp)
     cat << '_EOF_' > doPhyloP.csh
     #!/bin/csh -fe
     set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
     set f = $1
     set file1 = $f:t
     set out = $2
     set cName = $f:t:r
     set n = $f:r:e
     set grp = $cwd:t
     set cons = /hive/data/genomes/rn5/bed/multiz13way/consPhyloP
     set tmp = $cons/tmp/$grp/$f
     rm -fr $tmp
     mkdir -p $tmp
     set ssSrc = "/hive/data/genomes/rn5/bed/multiz13way/cons/SS/result/$f"
     set useGrp = "$grp.mod"
     ln -s $cons/run.phyloP/$grp.mod $tmp
     pushd $tmp > /dev/null
     $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
 	-i SS $useGrp $ssSrc.ss > $file1.wigFix
     popd > /dev/null
     mkdir -p $out:h
     sleep 4
     mv $tmp/$file1.wigFix $out
     rm -fr $tmp
     rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$cName
     rmdir --ignore-fail-on-non-empty $cons/tmp/$grp
     rmdir --ignore-fail-on-non-empty $cons/tmp
     '_EOF_'
     # << happy emacs
 
     # Create list of chunks
     find ../../cons/SS/result -type f | grep ".ss$" \
 	| sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list
     # make sure the list looks good
     wc -l ss.list
     #   452 ss.list 
 
     # Create template file
     #	file1 == $chr/$chunk/file name without .ss suffix
     cat << '_EOF_' > template
     #LOOP
     ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
     #ENDLOOP
     '_EOF_'
     # << happy emacs
 
     ######################   Running all species  #######################
     # setup run for all species
     mkdir /hive/data/genomes/rn5/bed/multiz13way/consPhyloP/all
     cd /hive/data/genomes/rn5/bed/multiz13way/consPhyloP/all
     rm -fr wigFix
     mkdir wigFix
 
     gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
     # the -ram=8g will allow only one job per node to slow this down since
     #	it would run too fast otherwise.  Either run on one of the small
     #	klusters or use the -ram=8g on the para create
     para -ram=8g create jobList
     para try ... check ... etc ...
     para -maxJob=64 push
     para time > run.time
 
     Completed: 452 of 452 jobs
     CPU time in finished jobs:      51556s     859.27m    14.32h    0.60d  0.002 y
     IO & Wait Time:                  3606s      60.10m     1.00h    0.04d  0.000 y
     Average job time:                 122s       2.03m     0.03h    0.00d
     Longest finished job:             303s       5.05m     0.08h    0.00d
     Submission to last job:           740s      12.33m     0.21h    0.01d
 
     # make downloads
     mkdir downloads
 
     time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
 	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
 	| gzip -c > downloads/phyloP13way.wigFix.gz) &
     #   real	25m39.941s
 
     # check integrity of data with wigToBigWig
     time (zcat downloads/phyloP13way.wigFix.gz \
 	| wigToBigWig -verbose=2 stdin /hive/data/genomes/rn5/chrom.sizes \
 	phyloP13way.bw) > bigWig.log 2>&1 &
     egrep "real|VmPeak" bigWig.log
     # pid=78657: VmPeak:    20899744 kB
     # real    28m10.064s
  
     bigWigInfo phyloP13way.bw 
 
     #version: 4
     #isCompressed: yes
     #isSwapped: 0
     #primaryDataSize: 3,232,659,693
     #primaryIndexSize: 75,581,132
     #zoomLevels: 10
     #chromCount: 179
     #basesCovered: 1,901,460,685
     #mean: 0.111680
     #min: -7.060000
     #max: 2.209000
     #std: 0.635959
 
     #	encode those files into wiggle data
     time (zcat downloads/phyloP13way.wigFix.gz \
 	| wigEncode stdin phyloP13way.wig phyloP13way.wib) &
     #   Converted stdin, upper limit 2.21, lower limit -7.06
     #   real    10m35.410s
 
     du -hsc *.wi?
     #   3.6G    phyloP13way.wib
     #   459M    phyloP13way.wig
     #   4.0G    total
 
     # Load gbdb and database with wiggle.
     ln -s `pwd`/phyloP13way.wib /gbdb/rn5/multiz13way/phyloP13way.wib
     nice hgLoadWiggle -pathPrefix=/gbdb/rn5/multiz13way rn5 \
 	phyloP13way phyloP13way.wig
 
     # use to set trackDb.ra entries for wiggle min and max
     # and verify table is loaded correctly
 
     wigTableStats.sh rn5 phyloP13way
     # db.table      min max mean count sumData stdDev viewLimits
     # rn5.phyloP13way -7.06 2.209 0.11168 1901460685 2.12355e+08 0.635959 
     # viewLimits=-3.06811:2.209
 
     #  Create histogram to get an overview of all the data
     time nice -n +19 hgWiggle -doHistogram -db=rn5 \
 	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
 	    phyloP13way > histogram.data 2>&1
     #	real    0m19.396s
 
     # find out the range for the 2:5 graph
     grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin
     # Q1 0.000240
     # median 0.000383
     # Q3 0.000726
     # average 0.001000
     # min 0.000107
     # max 0.041706
     # count 1000
     # total 1.000011
     # standard deviation 0.002765
 
     #	create plot of histogram:
 
     cat << '_EOF_' | gnuplot > histo.png
 set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " X. tropicalis rn5 Histogram phyloP13way track"
 set xlabel " phyloP13way score"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
 set yrange [0:0.007]
 
 plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
         "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
 '_EOF_'
     #	<< happy emacs
 
     display histo.png &
 
 
 #########################################################################
 # Construct download files for 13-way (DONE - 2014-05-09 - Pauline)
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/rn5/multiz13way
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/rn5/multiz13way/maf
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/rn5/multiz13way/alignments
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/rn5/phastCons13way
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/rn5/phastCons13way/rn5.13way.phastCons
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/rn5/phyloP13way
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/rn5/phyloP13way/rn5.13way.phyloP13way
 
     mkdir /hive/data/genomes/rn5/bed/multiz13way/downloads
     cd /hive/data/genomes/rn5/bed/multiz13way/downloads
     mkdir multiz13way phastCons13way phyloP13way
     cd multiz13way
     mkdir maf alignments
     cd maf
     time rsync -a -P ../../../anno/result/ ./
     #   real    322m58.633s
     time gzip *.maf
     #   real    515m4.515s
     time md5sum *.maf.gz > md5sum.txt
     #   real    10m6.351s
     ln -s `pwd`/*.maf.gz `pwd`/md5sum.txt \
         /usr/local/apache/htdocs-hgdownload/goldenPath/rn5/multiz13way/maf
     cd ..
     du -hsc maf ../../anno/result/
     #    68G     maf
     #    722G    ../../anno/result/
     #    789G    total
 
     :
     #    8.5G    maf
     #     45G     ../../anno/result/
     #     54G     total
 
 
     grep TREE ../../4d/all.mod | sed -e 's/TREE: //' \
        | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
           > rn5.13way.nh
 
    ~/kent/src/hg/utils/phyloTrees/commonNames.sh rn5.13way.nh \
        | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
           > rn5.13way.commonNames.nh
    ~/kent/src/hg/utils/phyloTrees/scientificNames.sh rn5.13way.nh \
        | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
           > rn5.13way.scientificNames.nh
     md5sum *.gz *.nh > md5sum.txt
 
     ln -s `pwd`/*.nh `pwd`/*.txt `pwd`/*.maf.gz \
         /usr/local/apache/htdocs-hgdownload/goldenPath/rn5/multiz13way
 
     #####################################################################
     cd /hive/data/genomes/rn5/bed/multiz13way/downloads/phastCons13way
     mkdir rn5.13way.phastCons
     cd rn5.13way.phastCons
     ln -s ../../../cons/all/downloads/chr*.gz .
     time md5sum *.gz > md5sum.txt
     # real    2m9.045s
 
     ln -s `pwd`/*.gz `pwd`/md5sum.txt \
   /usr/local/apache/htdocs-hgdownload/goldenPath/rn5/phastCons13way/rn5.13way.phastCons
     #   real    6m11.158s
     cd ..
     ln -s ../../cons/all/all.mod rn5.13way.phastCons.mod
     ln -s ../../cons/all/phastCons13way.bw rn5.13way.phastCons.bw
     time md5sum *.mod *.bw > md5sum.txt
     #  real    3m4.462s
     # obtain the README.txt from hg19/phastCons100way and update for this
     #   situation, from:
     # /hive/data/genomes/hg19/bed/multiz100way/downloads/phastCons100way/README.txt
     ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
       /usr/local/apache/htdocs-hgdownload/goldenPath/rn5/phastCons13way
 
     #####################################################################
     cd /hive/data/genomes/rn5/bed/multiz13way/downloads/phyloP13way
     mkdir rn5.13way.phyloP13way
     cd rn5.13way.phyloP13way
     ln -s ../../../consPhyloP/all/downloads/*.gz .
     time md5sum *.gz > md5sum.txt &
     # real    3m37.813s
 
     ln -s `pwd`/*.gz `pwd`/md5sum.txt \
       /usr/local/apache/htdocs-hgdownload/goldenPath/rn5/phyloP13way/rn5.13way.phyloP13way
 
     cd ..
     ln -s ../../consPhyloP/run.phyloP/all.mod rn5.13way.phyloP13way.mod
 
     ln -s ../../consPhyloP/all/phyloP13way.bw rn5.13way.phyloP13way.bw
 
     time md5sum *.mod *.bw > md5sum.txt &
     #  real    4m44.724s
 
     # obtain the README.txt from hg19/phyloP100way and update for this
     #   situation, from:
     # /hive/data/genomes/hg19/bed/multiz100way/downloads/phyloP100way/README.txt
     ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
       /usr/local/apache/htdocs-hgdownload/goldenPath/rn5/phyloP13way
 
     ############################################################################
     # creating upstream mafs (DONE - 2014-05-08 - Pauline)
     ssh hgwdev
     mkdir /hive/data/genomes/rn5/goldenPath/multiz13way
     cd /hive/data/genomes/rn5/goldenPath/multiz13way
     # run this bash script
     cat << '_EOF_' > mkUpstream.sh
 #!/bin/sh
 DB=rn5
 GENE=ensGene
 NWAY=multiz13way
 export DB GENE
 
 for S in 1000 2000 5000
 do
     echo "making upstream${S}.${GENE}.maf"
     echo "featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout"
     featureBits ${DB} ${GENE}:upstreamAll:${S} -fa=/dev/null -bed=stdout \
         | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
         | mafFrags ${DB} ${NWAY} stdin stdout \
                 -orgs=/hive/data/genomes/${DB}/bed/${NWAY}/species.list.txt \
         | gzip -c > upstream${S}.${GENE}.maf.gz
     echo "done upstream${S}.${GENE}.maf.gz"
 done
 '_EOF_'
     # << happy emacs
     chmod +x mkUpstream.sh
     time ./mkUpstream.sh
     #   real    57m14.100s
 
     # FIXUP/VERIFY the genbank.conf file to indicate this gene choice for the
     #   upstream automatic generation
     kent/src/hg/makeDb/genbank/etc/genbank.conf
     # +rn5.upstreamGeneTbl = ensGene
     # +rn5.upstreamMaf = multiz13way /hive/data/genomes/rn5/bed/multiz13way/species.list.txt
 
     git commit -m "Adding to the genbank.conf file to indicate this gene choice. refs #8212" genbank.conf
     git push
     make etc-update
 #############################################################################
 # hgPal downloads (DONE - 2014-05-08 - Pauline) 
 #   FASTA from 13way for ensGene
 
     ssh hgwdev
     screen
     bash
     rm -rf /hive/data/genomes/rn5/bed/multiz13way/pal
     mkdir /hive/data/genomes/rn5/bed/multiz13way/pal
     cd /hive/data/genomes/rn5/bed/multiz13way/pal
     for i in `cat ../species.list`; do echo $i; done > order.lst
 
     mz=multiz13way
     gp=ensGene
     db=rn5
     mkdir exonAA exonNuc ppredAA ppredNuc
     for j in `sort -nk 2 /hive/data/genomes/$db/chrom.sizes | awk '{print $1}'`
     do
 	echo "date"
 	echo "mafGene -chrom=$j  $db $mz $gp order.lst stdout | \
 	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
 	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
 	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
 	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
 	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
 	echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
 	    gzip -c > exonAA/$j.exonAA.fa.gz"
     done > $gp.jobs
 
     time sh -x $gp.jobs > $gp.jobs.log 2>&1 &
     sleep 1
     tail -f $gp.jobs.log
     # real    124m33.520s 
 
     mz=multiz13way
     gp=ensGene
     db=rn5
     zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
     zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
     zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
     zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
 
     rm -rf exonAA exonNuc ppredAA ppredNuc
 
     # we're only distributing exons at the moment
     mz=multiz13way
     gp=ensGene
     db=rn5
     pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
     mkdir -p $pd
     md5sum *.fa.gz > md5sum.txt
     ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
     ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
     ln -s `pwd`/md5sum.txt $pd/
 
 #############################################################################
 # wiki page for 13-way (DONE - 2014-06-04 - Hiram)
     mkdir /hive/users/hiram/bigWays/rn5.13way
     cd /hive/users/hiram/bigWays
     echo "rn5" > rn5.13way/ordered.list
     grep -i rn5 /hive/data/genomes/rn5/bed/multiz13way/13way.distances.txt \
         | sort -k3,3n | sed -e 's/rn5//' | awk '{print $1}' \
            >> rn5.13way/ordered.list
     ./sizeStats.sh rn5.13way/ordered.list 
     ./dbDb.sh rn5 13way
     ./sizeStats.pl rn5 13way
     # needed to construct symlinks for defCheck.pl:
     cd /hive/data/genomes/rn5/bed/lastz.hg19
     ln -s /hive/data/genomes/hg19/bed/lastzRn5.2012-06-27/DEF .
     cd /hive/data/genomes/rn5/bed/lastz.mm10
     ln -s /hive/data/genomes/mm10/bed/lastzRn5.2012-03-23/DEF .
 
     ./defCheck.pl rn5 13way
 
     # this constructs the html pages in rn5.13way/:
 # -rw-rw-r-- 1 7328 Jun  4 16:10 Rn5_conservation_alignment.html
 # -rw-rw-r-- 1 9646 Jun  4 16:11 Rn5_Genome_size_statistics.html
 # -rw-rw-r-- 1 5634 Jun  4 16:16 Rn5_conservation_lastz_parameters.html
 
     # add those pages to the genomewiki.
 
 #############################################################################
 # LIFTOVER TO Rn6 (DONE - 2014-07-14 - Hiram )
     mkdir /hive/data/genomes/rn5/bed/blat.rn6.2014-07-14
     cd /hive/data/genomes/rn5/bed/blat.rn6.2014-07-14
     # -debug run to create run dir, preview scripts...
     doSameSpeciesLiftOver.pl -ooc=/hive/data/genomes/rn5/jkStuff/rn5.11.ooc \
         -debug rn5 rn6
     # Real run:
     time doSameSpeciesLiftOver.pl \
       -ooc=/hive/data/genomes/rn5/jkStuff/rn5.11.ooc rn5 rn6 > do.log 2>&1
     # about one hour
 
     # test with sequence
 #############################################################################
 ##############################################################################
 # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd)
 ##############################################################################
 # Crispr track. See ../crisprTrack/README.txt (2016-09-15 max)
 # Command: doCrispr.sh <db> ensGene
 ##############################################################################
+# LASTZ Rat Rn5 vs. Rat Rn7 (DONE - 2022-01-07 - Gerardo)
+
+# should be able to run this from anywhere, this time it was run from:
+    cd kent/src/hg/utils/automation
+
+  time (~/kent/src/hg/utils/automation/pairLastz.sh \
+	rn5 rn7 mammal mammal) \
+	   > rn5_rn7_20220107.log 2>&1 &
+  # check the total time
+grep -w real  rn5_rn7_20220107.log  | tail -1 | sed -e 's/^/    # /;'
+    # real      532m37.936s
+
+  # this  rn5_rn7_20220107.log log file happens to have a copy of the make doc, as well
+  # as the copy of the make doc left in the target assembly directory:
+# /hive/data/genomes/rn5/bed/lastzRn7.2022-01-07/makeDoc.txt
+
+    # this command outputs this makeDoc text:
+
+    cat kent/src/hg/utils/automation/rn5_rn7_20220107.log
+
+##############################################################################
+# LASTZ Rat Rn5 vs. Rat Rn7
+#    (DONE - 2022-01-07 - Gerardo)
+
+    mkdir /hive/data/genomes/rn5/bed/lastzRn7.2022-01-07
+    cd /hive/data/genomes/rn5/bed/lastzRn7.2022-01-07
+
+    printf '# Rat Rn7 vs. Rat Rn5
+BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz
+
+# TARGET: Rat Rn5
+SEQ1_DIR=/hive/data/genomes/rn5/rn5.2bit
+SEQ1_LEN=/hive/data/genomes/rn5/chrom.sizes
+SEQ1_CHUNK=20000000
+SEQ1_LAP=10000
+SEQ1_LIMIT=40
+
+# QUERY: Rat Rn7
+SEQ2_DIR=/hive/data/genomes/rn7/rn7.2bit
+SEQ2_LEN=/hive/data/genomes/rn7/chrom.sizes
+SEQ2_CHUNK=20000000
+SEQ2_LAP=0
+SEQ2_LIMIT=100
+
+BASE=/hive/data/genomes/rn5/bed/lastzRn7.2022-01-07
+TMPDIR=/dev/shm
+
+' > DEF
+
+    time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl  -verbose=2 `pwd`/DEF -syntenicNet \
+        -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
+        -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1
+    grep -w real do.log | sed -e 's/^/    # /;'
+    # real	243m55.340s
+
+    sed -e 's/^/    # /;' fb.rn5.chainRn7Link.txt
+    # 2536856185 bases of 2572853723 (98.601%) in intersection
+    sed -e 's/^/    # /;' fb.rn5.chainSynRn7Link.txt
+    # 2490005622 bases of 2572853723 (96.780%) in intersection
+
+    time (~/kent/src/hg/utils/automation/doRecipBest.pl  -load -workhorse=hgwdev -buildDir=`pwd` \
+       \
+       \
+        rn5 rn7) > rbest.log 2>&1
+
+    grep -w real rbest.log | sed -e 's/^/    # /;'
+    # real	89m48.335s
+
+    sed -e 's/^/    # /;' fb.rn5.chainRBest.Rn7.txt
+    # 2402478429 bases of 2572853723 (93.378%) in intersection
+
+    ### and for the swap
+
+    cd /hive/data/genomes/rn7/bed/blastz.rn5.swap
+
+   time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl  -swap -verbose=2 \
+    /hive/data/genomes/rn5/bed/lastzRn7.2022-01-07/DEF -swapDir=`pwd` \
+  -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
+    -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1
+
+    grep -w real swap.log | sed -e 's/^/    # /;'
+    # real	117m28.787s
+
+    sed -e 's/^/    # /;' fb.rn7.chainRn5Link.txt
+    # 2455376156 bases of 2626580772 (93.482%) in intersection
+    sed -e 's/^/    # /;' fb.rn7.chainSynRn5Link.txt
+    # 2410196801 bases of 2626580772 (91.762%) in intersection
+\    time (~/kent/src/hg/utils/automation/doRecipBest.pl  -load -workhorse=hgwdev -buildDir=`pwd` \
+    \
+    \
+   rn7 rn5) > rbest.log 2>&1
+
+    grep -w real rbest.log | sed -e 's/^/    # /;'
+    # real	81m24.947s
+
+    sed -e 's/^/    # /;' fb.rn7.chainRBest.Rn5.txt
+    # 2400855216 bases of 2626580772 (91.406%) in intersection
+
+##############################################################################
+
+real	532m37.936s
+user	0m1.288s
+sys	0m0.938s