8102c8c2b15180f3e48534af65f117ac467c4505
jcasper
  Thu Apr 14 16:00:15 2022 -0700
Makedoc for initial DECIPHER build on hg38, refs #29130

diff --git src/hg/makeDb/doc/hg38/hg38.txt src/hg/makeDb/doc/hg38/hg38.txt
index 5ad8e9b..7624f37 100644
--- src/hg/makeDb/doc/hg38/hg38.txt
+++ src/hg/makeDb/doc/hg38/hg38.txt
@@ -1,7011 +1,7035 @@
 # for emacs: -*- mode: sh; -*-
 
 # This file describes how we made the browser database on
 # NCBI build 38 (December 2013 freeze) aka:
 #	GRCh38 - Genome Reference Consortium Human Reference 38
 #	Assembly Accession: GCA_000001405.2
 
 #############################################################################
 ## Download sequence - DONE - 2013-12-24
     mkdir /hive/data/genomes/hg38
     mkdir /hive/data/genomes/hg38/genbank
     cd /hive/data/genomes/hg38/genbank
     time rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/ ./
 # sent 19643 bytes  received 4914689807 bytes  4490369.53 bytes/sec
 # total size is 4914019581  speedup is 1.00
 
 # real    18m14.497s
 
 #############################################################################
 ## convert to UCSC names - DONE - 2013-12-24
 #  with this release, NCBI has adopted a naming convention that is similar
 #  to UCSC.  The delivered sequence with these names can be found in:
 #  /hive/data/genomes/hg38/genbank/seqs_for_alignment_pipelines/
 #
 #  The following scripts reproduce this naming scheme from the separate
 #  files in the release
 #
     mkdir /hive/data/genomes/hg38/ucsc
     cat << '_EOF_' > ucscCompositeAgp.pl
 #!/bin/env perl
 
 # ucscCompositeAgp.pl -- convert NCBI assembled-chromosome AGP and FASTA
 # deliveries to UCSC chrN naming.  Reads the chr2acc mapping of chromosome
 # number to GenBank accession, then for each chromosome rewrites the
 # compressed AGP and FASTA files into chrN.agp / chrN.fa in the current
 # directory, substituting chr${chrN} for the accession.
 
 use strict;
 use warnings;
 
 # accession (e.g. CM000663.2) -> chromosome number/name (e.g. 1)
 my %accToChr;
 
 open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or
         die "can not read Primary_Assembly/assembled_chromosomes/chr2acc";
 while (my $line = <FH>) {
     next if ($line =~ m/^#/);	# skip comment/header lines
     chomp $line;
     my ($chrN, $acc) = split('\s+', $line);
     $accToChr{$acc} = $chrN;
 }
 close (FH);
 
 foreach my $acc (keys %accToChr) {
     my $chrN =  $accToChr{$acc};
     print "$acc $accToChr{$acc}\n";	# progress output: accession chrom
     # AGP: replace the leading accession column with the UCSC chr name
     open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz|") or die "can not read chr${chrN}.comp.agp.gz";
     open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp";
     while (my $line = <FH>) {
         if ($line =~ m/^#/) {
             print UC $line;
         } else {
             $line =~ s/^$acc/chr${chrN}/;
             print UC $line;
         }
     }
     close (FH);
     close (UC);
     # FASTA: replace the entire NCBI header line with >chrN
     open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz";
     open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa";
     while (my $line = <FH>) {
         if ($line =~ m/^>/) {
             printf UC ">chr${chrN}\n";
         } else {
             print UC $line;
         }
     }
     close (FH);
     close (UC);
 }
 '_EOF_'
     # << happy emacs
     chmod +x ucscCompositeAgp.pl
 
     cat << '_EOF_' > unlocalized.pl
 #!/bin/env perl
 
 # unlocalized.pl -- convert NCBI unlocalized scaffold AGP and FASTA files
 # to UCSC chr<N>_<acc>_random naming, writing chrN_random.agp and
 # chrN_random.fa per chromosome in the current directory.  Accession dots
 # become 'v' (e.g. KI270711.1 -> KI270711v1).
 
 use strict;
 use warnings;
 
 # accession (dot already converted to v) -> chromosome it belongs to
 my %accToChr;
 # set of chromosome names that have unlocalized scaffolds
 my %chrNames;
 
 open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or
         die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf";
 while (my $line = <FH>) {
     next if ($line =~ m/^#/);
     chomp $line;
     my ($chrN, $acc) = split('\s+', $line);
     $acc =~ s/\./v/;
     $accToChr{$acc} = $chrN;
     $chrNames{$chrN} += 1;
 }
 close (FH);
 
 foreach my $chrN (keys %chrNames) {
     my $agpFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz";
     my $fastaFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz";
     # AGP: replace the object (first) column with the UCSC _random name
     open (FH, "zcat $agpFile|") or die "can not read $agpFile";
     open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp";
     while (my $line = <FH>) {
         if ($line =~ m/^#/) {
             print UC $line;
         } else {
             chomp $line;
             my (@a) = split('\t', $line);
             my $acc = $a[0];
             $acc =~ s/\./v/;
             # sanity check: the scaffold must belong to this chromosome
             die "ERROR: chrN $chrN not correct for $acc"
                 if ($accToChr{$acc} ne $chrN);
             my $ucscName = "chr${chrN}_${acc}_random";
             printf UC "%s", $ucscName;
             for (my $i = 1; $i < scalar(@a); ++$i) {
                 printf UC "\t%s", $a[$i];
             }
             printf UC "\n";
         }
     }
     close (FH);
     close (UC);
     printf "chr%s\n", $chrN;	# progress output
     # FASTA: rewrite each NCBI header to the UCSC _random name
     open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
     open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa";
     while (my $line = <FH>) {
         if ($line =~ m/^>/) {
             chomp $line;
             my $acc = $line;
             $acc =~ s/.*gb\|//;	# strip up through "gb|" leaving the accession
             $acc =~ s/. Homo.*//;	# strip trailing description
             $acc =~ s/\./v/;
             die "ERROR: chrN $chrN not correct for $acc"
                 if ($accToChr{$acc} ne $chrN);
             my $ucscName = "chr${chrN}_${acc}_random";
             printf UC ">$ucscName\n";
         } else {
             print UC $line;
         }
     }
     close (FH);
     close (UC);
 }
 '_EOF_'
     # << happy emacs
     chmod +x unlocalized.pl
 
     cat << '_EOF_' > unplaced.pl
 #!/bin/env perl
 
 # unplaced.pl -- convert NCBI unplaced scaffold AGP and FASTA deliveries
 # to UCSC chrUn_* naming, writing chrUn.agp and chrUn.fa in the current
 # directory.  Accession dots become 'v' (e.g. KI270302.1 -> KI270302v1).
 
 use strict;
 use warnings;
 
 my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
 my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";
 open (FH, "zcat $agpFile|") or die "can not read $agpFile";
 open (UC, ">chrUn.agp") or die "can not write to chrUn.agp";
 while (my $line = <FH>) {
     if ($line =~ m/^#/) {
         print UC $line;
     } else {
         $line =~ s/\./v/;	# accession dot -> v in the object name column
         printf UC "chrUn_%s", $line;
     }
 }
 close (FH);
 close (UC);
 
 # FASTA: reduce each NCBI header to the bare accession, then prefix chrUn_
 open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
 open (UC, ">chrUn.fa") or die "can not write to chrUn.fa";
 while (my $line = <FH>) {
     if ($line =~ m/^>/) {
         chomp $line;
         $line =~ s/.*gb\|//;	# strip up through "gb|" leaving the accession
         $line =~ s/. Homo.*//;	# strip trailing description
         $line =~ s/\./v/;
         printf UC ">chrUn_$line\n";
     } else {
         print UC $line;
     }
 }
 close (FH);
 close (UC);
 '_EOF_'
     # << happy emacs
     chmod +x unplaced.pl
 
     cat << '_EOF_' > altSequence.pl
 #!/usr/bin/env perl
 
 # altSequence.pl -- collect all alternate-haplotype scaffolds into a single
 # chrAlt.agp and chrAlt.fa with UCSC chr<parent>_<acc>_alt names.  For each
 # ALT_* directory under ../genbank/, the alt_scaffold_placement.txt file
 # supplies the parent chromosome used to construct the UCSC name
 # (accession dot converted to 'v').
 
 use strict;
 use warnings;
 use File::Basename;
 
 open (AG, ">chrAlt.agp") or die "can not write to chrAlt.agp";
 open (FA, ">chrAlt.fa") or die "can not write to chrAlt.fa";
 open (FH, "find ../genbank/ALT* -type f | grep alt_scaffold_placement.txt|") or die "can not find alt_scaffold_placement.txt files";
 while (my $file = <FH>) {
   chomp $file;
   my $dirName = dirname($file);
   my $agpFile = "$dirName/AGP/alt.scaf.agp.gz";
   my $fastaFile = "$dirName/FASTA/alt.scaf.fa.gz";
   # key is genbank acc name, value is UCSC chr name
   my %nameDelta;
 #  printf STDERR "# %s\n", $file;
   # pass 1: build the accession -> UCSC name mapping from the placement file
   open (AL, "<$file") or die "can not read $file";
   while (my $line = <AL>) {
      next if ($line =~ m/^#/);
      chomp $line;
      my ($alt_asm_name, $prim_asm_name, $alt_scaf_name, $alt_scaf_acc,
           $parent_type, $parent_name, $parent_acc, $region_name, $ori,
            $alt_scaf_start, $alt_scaf_stop, $parent_start, $parent_stop,
             $alt_start_tail, $alt_stop_tail) = split('\t', $line);
      my $ucscAcc = $alt_scaf_acc;
      $ucscAcc =~ s/\./v/;
      my $ucscName = sprintf("chr%s_%s_alt", $parent_name, $ucscAcc);
      printf "%s %s\n", $alt_scaf_acc, $ucscName;
      # a repeated accession must always map to the same UCSC name
      if (exists ($nameDelta{$alt_scaf_acc})) {
          die "duplicate name incorrect ? $alt_scaf_acc $nameDelta{$alt_scaf_acc} ne $ucscName" if ($nameDelta{$alt_scaf_acc} ne $ucscName);
      } else {
          $nameDelta{$alt_scaf_acc} = $ucscName;
      }
   }
   close (AL);
   # pass 2: rewrite the AGP object (first) column with the UCSC name
   open (AL, "zcat $agpFile|") or die "can not read $agpFile";
   while (my $line = <AL>) {
      if ($line =~ m/^#/) {
        print AG "$line";
      } else {
        my ($acc, $rest) = split('\t', $line, 2);
        die "can not find ucsc name for $acc" if (!exists($nameDelta{$acc}));
        printf AG "%s\t%s", $nameDelta{$acc}, $rest;
      }
   }
   close (AL);
   # pass 3: rewrite FASTA headers to the UCSC name
   open (AL, "zcat $fastaFile|") or die "can not read $fastaFile";
   while (my $line = <AL>) {
      chomp $line;
      if ($line =~ m/^>/) {
        $line =~ s/.*gb.//;	# strip up through "gb|" leaving the accession
        $line =~ s/. Homo.*//;	# strip trailing description
        die "can not find ucsc name for $line" if (!exists($nameDelta{$line}));
        printf FA ">%s\n", $nameDelta{$line};
      } else {
        printf FA "%s\n", $line;
      }
   }
   close (AL);
 }
 close (FH);
 close (AG);
 close (FA);
 '_EOF_'
     # << happy emacs
     chmod +x altSequence.pl
 
     ./ucscCompositeAgp.pl
     ./unlocalized.pl
     ./unplaced.pl
     ./altSequence.pl
 
     # temporarily verify the fasta and AGP are complete and compatible
     faToTwoBit chr*.fa hg38.test.2bit
     cat chr*.agp > hg38.agp
     checkAgpAndFa hg38.agp hg38.test.2bit 2>&1 | tail -1
 # All AGP and FASTA entries agree - both files are valid
 
     rm -f hg38.agp hg38.test.2bit
 
     # comparing faCounts of this 2bit file and the sequences delivered
     # in genbank/seqs_for_alignment_pipelines/
     # result in the exact same sequence
 
 #############################################################################
 ## initial db build - DONE - 2013-12-24 - Hiram
 
     cd /hive/data/genomes/hg38
     cat << '_EOF_' > hg38.config.ra
 # Config parameters for makeGenomeDb.pl:
 db hg38
 scientificName Homo sapiens
 commonName Human
 assemblyDate Dec. 2013
 assemblyLabel GRCh38 Genome Reference Consortium Human Reference 38 (GCA_000001405.2)
 assemblyShortLabel GRCh38
 orderKey 13
 mitoAcc none
 fastaFiles /hive/data/genomes/hg38/ucsc/chr*.fa
 agpFiles /hive/data/genomes/hg38/ucsc/chr*.agp
 # qualFiles /dev/null
 dbDbSpeciesDir human
 photoCreditURL http://www.cbse.ucsc.edu/
 photoCreditName Graphic courtesy of CBSE
 ncbiGenomeId 51
 ncbiAssemblyId 883148
 ncbiAssemblyName GRCh38
 ncbiBioProject 31257
 genBankAccessionID GCA_000001405.2
 taxId   9606
 '_EOF_'
     # << happy emacs
 
     # step wise to first verify AGP and Fasta files
     time makeGenomeDb.pl -stop=agp hg38.config.ra > agp.log 2>&1
 
     # looking good, continue:
     time makeGenomeDb.pl -continue=db hg38.config.ra > db.log 2>&1
 
     # add the files produced by the trackDb build to the source tree
 
     # this path is fixed in the makeGenomeDb.pl for next time
     # honor new convention for bbi location files:
     cd /gbdb/hg38/bbi
     mkdir gc5BaseBw
     mv gc5Base.bw gc5BaseBw
     cd gc5BaseBw
     # before
     hgsql -e 'select * from gc5BaseBw;' hg38
 # +---------------------------+
 # | fileName                  |
 # +---------------------------+
 # | /gbdb/hg38/bbi/gc5Base.bw |
 # +---------------------------+
     # and fixed
     hgBbiDbLink hg38 gc5BaseBw `pwd`/gc5Base.bw
     hgsql -e 'select * from gc5BaseBw;' hg38
 # +-------------------------------------+
 # | fileName                            |
 # +-------------------------------------+
 # | /gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw |
 # +-------------------------------------+
 
 #############################################################################
 ## RepeatMasker with CrossMatch - DONE - 2013-12-24,27 - Hiram
     mkdir /hive/data/genomes/hg38/bed/repeatMaskerCM
     cd /hive/data/genomes/hg38/bed/repeatMaskerCM
     # running this step wise so it can be loaded into its own table
     time doRepeatMasker.pl -stop=mask -bigClusterHub=ku \
        -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1
     # real    3443m13.026s
 # RepeatMasker version June 20 2013 open-4.0.3
 # Search Engine: cross-match version 1.090518
 # RepeatMasker Database: 20130422
 
     # take the install script from this -debug run and alter it to load
     # the table into rmskCM
     time doRepeatMasker.pl -continue=install -stop=install -debug \
        -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38
     cat fb.hg38.rmskCM.txt
     # 1586326530 bases of 3209286105 (49.429%) in intersection
 
     # profile of repeat elements:
 #  1852545 rmskClass/SINE.tab
 #  1570523 rmskClass/LINE.tab
 #   748597 rmskClass/LTR.tab
 #   703682 rmskClass/Simple_repeat.tab
 #   499108 rmskClass/DNA.tab
 #   102856 rmskClass/Low_complexity.tab
 #     7962 rmskClass/Satellite.tab
 #     5750 rmskClass/Retroposon.tab
 #     5667 rmskClass/LTR?.tab
 #     5622 rmskClass/Unknown.tab
 #     4516 rmskClass/snRNA.tab
 #     3294 rmskClass/DNA?.tab
 #     2026 rmskClass/tRNA.tab
 #     1840 rmskClass/rRNA.tab
 #     1784 rmskClass/RC.tab
 #     1672 rmskClass/srpRNA.tab
 #     1420 rmskClass/scRNA.tab
 #      704 rmskClass/RNA.tab
 #      411 rmskClass/RC?.tab
 #       38 rmskClass/SINE?.tab
 
     # using this RM result with trfMask for the final masked sequence
     cd /hive/data/genomes/hg38
     twoBitMask hg38.rmskCM.2bit -add bed/simpleRepeat/trfMask.bed hg38.2bit
     twoBitToFa hg38.2bit stdout | faSize stdin > faSize.hg38.2bit.txt
 # 3209286105 bases (159970322 N's 3049315783 real 1460684798 upper 1588630985 lower) in 455 sequences in 1 files
 # %49.50 masked total, %52.10 masked real
 
     featureBits -countGaps hg38 rmskCM '!rmskHmmer' -bed=crossMatchUnique.bed
     # 24868153 bases of 3209286105 (0.775%) in intersection
     hgLoadBed hg38 crossMatchUnique crossMatchUnique.bed
     # Read 2352219 elements of size 4 from crossMatchUnique.bed
 
 #############################################################################
 ## repeating RepeatMasker Blastn run (DONE - 2014-01-07 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/rmskBlastn
     cd /hive/data/genomes/hg38/bed/rmskBlastn
 
     time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
       -useRMBlastn -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \
         -stop=mask -buildDir=`pwd` hg38 > mask.log
     # real    203m33.670s
 
 # 3209286105 bases (159970322 N's 3049315783 real 1491207906 upper 1558107877 lower) in 455 sequences in 1 files
 # %48.55 masked total, %51.10 masked real
 
     # install step with debug so the script can be altered to load into
     # a specific rmskBlastn table:
 
     $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
       -useRMBlastn -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \
         -continue=install -debug -buildDir=`pwd` hg38
 
 #############################################################################
 ## repeating RepeatMasker cross-match run (DONE - 2014-01-07 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/rmskCM
     cd /hive/data/genomes/hg38/bed/rmskCM
 
     # missed recording stderr ....  forgot the 2>&1
     time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
       -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \
         -stop=mask -buildDir=`pwd` hg38 > mask.log
     # real    1897m33.517s
     # running from Tue Jan  7 16:10:33 PST 2014 thru 08 Jan 23:48
 #  *** All done!  (through the 'mask' step) - Elapsed time: 1897m34s
 #  *** Steps were performed in /hive/data/genomes/hg38/bed/rmskCM
     # running install manually to allow edit of the script to load
     # a specific rmskCm table
     time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
       -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \
         -continue=install -stop=install -buildDir=`pwd` hg38 -debug
 
 #############################################################################
 ## RepeatMasker with RM Blastn - DONE - 2013-12-24,25 - Hiram
     mkdir /hive/data/genomes/hg38/bed/repeatMaskerBlastn
     cd /hive/data/genomes/hg38/bed/repeatMaskerBlastn
     # running this step wise so it can be loaded into its own table
     time doRepeatMasker.pl -stop=mask -useRMBlastn -bigClusterHub=ku \
        -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1
     # real    354m55.842s
 
     # take the install script from this -debug run and alter it to load
     # the table into rmskBlastn
     doRepeatMasker.pl -useRMBlastn -bigClusterHub=ku  -continue=install \
      -stop=install -debug -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38
     # 1560264046 bases of 3209286105 (48.617%) in intersection
     # profile of repeat elements:
 #   1824560 rmskClass/SINE.tab
 #   1552814 rmskClass/LINE.tab
 #    738435 rmskClass/LTR.tab
 #    715998 rmskClass/Simple_repeat.tab
 #    486591 rmskClass/DNA.tab
 #    105026 rmskClass/Low_complexity.tab
 #      7712 rmskClass/Satellite.tab
 #      5638 rmskClass/Retroposon.tab
 #      5276 rmskClass/Unknown.tab
 #      5100 rmskClass/LTR?.tab
 #      4548 rmskClass/snRNA.tab
 #      3033 rmskClass/DNA?.tab
 #      1987 rmskClass/tRNA.tab
 #      1809 rmskClass/rRNA.tab
 #      1710 rmskClass/RC.tab
 #      1633 rmskClass/srpRNA.tab
 #      1428 rmskClass/scRNA.tab
 #       614 rmskClass/RNA.tab
 #       376 rmskClass/RC?.tab
 #        38 rmskClass/SINE?.tab
 #         3 rmskClass/Unspecified.tab
 #   5464329 total
 
 #############################################################################
 ## repeating RepeatMasker run with HMMER - DONE - 2014-01-08 - Hiram
     mkdir /hive/data/genomes/hg38/bed/rmskHmmer
     cd /hive/data/genomes/hg38/bed/rmskHmmer
 
     # trying cpu=4 and ram=32g
     time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
       -stop=mask -useHMMER -bigClusterHub=ku \
        -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1
     # 6 jobs required more than 32 Gb of memory to complete, ran them on
     # hgwdev to complete, then continuing:
     time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
       -continue=cat -stop=mask -useHMMER -bigClusterHub=ku \
        -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > cat.log 2>&1
     #  real    24m5.274s
 # 3209286105 bases (159970322 N's 3049315783 real 1314916231 upper 1734399552 lower) in 455 sequences in 1 files
 # %54.04 masked total, %56.88 masked real
 
     # running install manually to allow edit of the script to load
     # a specific rmskHmmer table
     time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
       -continue=install -debug -useHMMER -bigClusterHub=ku \
        -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38
 
     time ./doLoad_rmskHmmer.bash > load.log 2>&1
     # real    4m47.432s
 
     featureBits -countGaps hg38 rmskHmmer > fb.hg38.rmskHmmer.txt 2>&1
     # 1734398971 bases of 3209286105 (54.043%) in intersection
 
     grep rmskClass hg38.class.profile.txt \
         | sed -e 's#rmskClass/##; s/.tab//;' | sort -rn
     # profile of repeat elements:
 #  1884179 SINE
 #  1702529 LINE
 #   805427 LTR
 #   636906 Simple_repeat
 #   565171 DNA
 #    95480 Low_complexity
 #    11861 Retroposon
 #    10852 Satellite
 #     9181 LTR?
 #     6783 scRNA
 #     4582 DNA?
 #     3914 Unknown
 #     2059 RC
 #     1517 srpRNA
 #     1484 RNA
 #      970 SINE?
 #      806 RC?
 #      464 rRNA
 #  5744165 total
 
     featureBits -countGaps hg38 rmskHmmer '!rmskCM' -bed=hmmerUnique.bed
     # 172940594 bases of 3209286105 (5.389%) in intersection
     hgLoadBed hg38 hmmerUnique hmmerUnique.bed
     # Read 3099505 elements of size 4 from hmmerUnique.bed
 
 #############################################################################
 ## RepeatMasker with HMMER - DONE - 2013-12-24,26 - Hiram
     mkdir /hive/data/genomes/hg38/bed/repeatMaskerHMMER
     cd /hive/data/genomes/hg38/bed/repeatMaskerHMMER
 
     time doRepeatMasker.pl -stop=mask -useHMMER -bigClusterHub=ku \
        -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38 > mask.log 2>&1
     # take the install script from this -debug run and alter it to load
     # the table into rmskHmmer
     doRepeatMasker.pl -continue=install -stop=install -useHMMER \
       -bigClusterHub=ku -workhorse=hgwdev -dbHost=hgwdev \
          -buildDir=`pwd` hg38 > mask.log 2>&1
     # 1702017722 bases of 3209286105 (53.034%) in intersection
     # profile of repeat elements:
 #   1879864 rmskClass/SINE.tab
 #   1678216 rmskClass/LINE.tab
 #    794231 rmskClass/LTR.tab
 #    651561 rmskClass/Simple_repeat.tab
 #    551965 rmskClass/DNA.tab
 #     97186 rmskClass/Low_complexity.tab
 #     10756 rmskClass/Retroposon.tab
 #     10448 rmskClass/Satellite.tab
 #      8393 rmskClass/LTR?.tab
 #      5849 rmskClass/scRNA.tab
 #      4282 rmskClass/Unknown.tab
 #      4276 rmskClass/DNA?.tab
 #      2000 rmskClass/RC.tab
 #      1573 rmskClass/srpRNA.tab
 #      1291 rmskClass/RNA.tab
 #       906 rmskClass/snRNA.tab
 #       747 rmskClass/SINE?.tab
 #       723 rmskClass/RC?.tab
 #       722 rmskClass/rRNA.tab
 #       468 rmskClass/tRNA.tab
 #   5705457 total
 
 #############################################################################
 # rmsk from genbank release (DONE - 2014-12-25 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/repeatMaskerGenbank
     cd /hive/data/genomes/hg38/bed/repeatMaskerGenbank
 
     head -3 ../repeatMaskerBlastn/hg38.fa.out > genbank.rm.out
 find ../../genbank -type f | grep rm.out | grep -v "/placed_scaffolds/" | while read F
 do
   headRest 3 $F
 done | sort -k5,45 -k6,6n >> genbank.rm.out
     grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \
        | awk '{printf "s/%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt
 
     sed -e "`cat accessionToUcsc.sed.txt`" genbank.rm.out > ucscNames.rm.out
 
     head -3 ucscNames.rm.out > hg38.sorted.fa.out
     tail -n +4 ucscNames.rm.out  | sort -k5,5 -k6,6n >> hg38.sorted.fa.out
 
     hgLoadOut -table=rmskGenbank -nosplit hg38 hg38.sorted.fa.out
     hgLoadOut -verbose=2 -tabFile=hg38.rmskGenbank.tab -table=rmskGenbank \
        -nosplit hg38 hg38.sorted.fa.out 2> bad.records.txt
     # fixed up one of the masking scripts from the other runs to construct
     # the bbi files
 
     # 1581568556 bases of 3209286105 (49.281%) in intersection
     # profile of repeat elements:
 #   1849444 rmskClass/SINE.tab
 #   1586141 rmskClass/LINE.tab
 #    759248 rmskClass/LTR.tab
 #    502186 rmskClass/DNA.tab
 #    433789 rmskClass/Simple_repeat.tab
 #    396378 rmskClass/Low_complexity.tab
 #     10198 rmskClass/Satellite.tab
 #      5884 rmskClass/LTR?.tab
 #      4595 rmskClass/snRNA.tab
 #      4163 rmskClass/Retroposon.tab
 #      2802 rmskClass/Unknown.tab
 #      2157 rmskClass/DNA?.tab
 #      2154 rmskClass/tRNA.tab
 #      1915 rmskClass/rRNA.tab
 #      1860 rmskClass/RC.tab
 #      1784 rmskClass/srpRNA.tab
 #      1397 rmskClass/scRNA.tab
 #       822 rmskClass/RNA.tab
 #       488 rmskClass/SINE?.tab
 #       445 rmskClass/RC?.tab
 #   5567850 total
 
 #############################################################################
 ## running TRF simple repeats - DONE - 2013-12-24,29 - Hiram
     # this procedure ran into much trouble on this release.  The new
     # repeat sequences in the centromeres caused trf to run indefinitely.
     # I tried different sizes of chunks, working down to 20 Mbase chunks.
     # Even still, some jobs would not complete.  Those broke down even
     # more, eventually to the smallest bit of 30 Kbase that needed to
     # run all the way down to 3,000 based chunks with 1,000 base overlaps.
 
     # this did not work:
     screen # use screen to manage this day-long job
     mkdir /hive/data/genomes/hg38/bed/simpleRepeat
     cd /hive/data/genomes/hg38/bed/simpleRepeat
     time doSimpleRepeat.pl -bigClusterHub=ku -workhorse=hgwdev \
 	-smallClusterHub=ku -buildDir=`pwd` hg38 > do.log 2>&1
     cd /hive/data/genomes/hg38/bed
     # move it aside:
     mv simpleRepeat simpleRepeat.2013-12-24
 
     # Instead, something like this:
     mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap
     cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap
     mkdir -p noGap
 
     twoBitToFa ../../../hg38.unmasked.2bit stdout \
        | faSplit -lift=noGap.lift gap stdin 5000000 noGap/hg38_
     # make sure nothing has gone missing:
     faCount noGap/*.fa > faCount.txt
     tail -1 faCount.txt
 # total 3068387174 898285419 623727342 626335137 900967885  19071391 30979734
     # compared to the full sequence, same numbers for ACGT:
     twoBitToFa ../../../hg38.unmasked.2bit stdout | faCount stdin
 # total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743
     faToTwoBit noGap/*.fa hg38.nogap.2bit
     twoBitInfo hg38.nogap.2bit stdout | sort -k2,2nr > hg38.nogap.sizes
 
 
     mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M
     cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M
     rm -rf /hive/data/genomes/hg38/TrfPart20M
     /cluster/bin/scripts/simplePartition.pl \
 /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/splitGap/hg38.nogap.2bit \
    20000000 /hive/data/genomes/hg38/TrfPart20M
    rm -f /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/TrfPart20M
    ln -s /hive/data/genomes/hg38/TrfPart20M \
       /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/TrfPart20M
    ssh ku
    cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M
    gensub2 /hive/data/genomes/hg38/TrfPart20M/partitions.lst single gsub jobList
    para create jobList
    para push
    # 20 jobs would not complete:
 # Completed: 143 of 163 jobs
 # Jobs currently running: 20
 # CPU time in finished jobs:      76994s    1283.24m    21.39h    0.89d  0.002 y
 # IO & Wait Time:                  1095s      18.24m     0.30h    0.01d  0.000 y
 # Time in running jobs:         1807279s   30121.32m   502.02h   20.92d  0.057 y
 # Average job time:                 546s       9.10m     0.15h    0.01d
 # Longest running job:            90422s    1507.03m    25.12h    1.05d
 # Longest finished job:           43348s     722.47m    12.04h    0.50d
 # Submission to last job:         43363s     722.72m    12.05h    0.50d
    # determine which are the last jobs as individual bits:
    para status | grep -v done | awk '{print $(NF-1),$NF}' | grep TrfRun \
      > not.done.list
    awk '{print $NF}' not.done.list | sed -e 's/.bed//' | while read F
 do
    cat $F
 done > seq.specs.not.done
 
    mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs
    cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs
    mkdir fasta
    for seqSpec in `cat ../seq.specs.not.done`
 do
   fName=`echo $seqSpec | sed -e 's/.*://'`
   echo $fName
   twoBitToFa $seqSpec fasta/$fName.fa
 done
   ls -1S `pwd`/fasta > part.list
   cat << '_EOF_' > template
 #LOOP
 ./runTrf {check in line+ $(path1)}  {check out line bed/$(root1).bed}
 #ENDLOOP
 '_EOF_'
   # << happy emacs
 
   cat << '_EOF_' > runTrf
 #!/bin/bash
 # runTrf -- run trfBig (tandem repeat finder wrapper) on one fasta chunk,
 # staging input and output through /dev/shm to avoid network-fs overhead.
 #   $1 - input fasta file path
 #   $2 - output bed file path
 set -beEu -o pipefail
 export path1=$1
 export inputFN=`basename $1`
 export outpath=$2
 export outputFN=`basename $2`
 # private working directory in shared memory, named after the output file
 mkdir -p /dev/shm/$outputFN
 cp -p $path1 /dev/shm/$outputFN
 cd /dev/shm/$outputFN
 /cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \
       $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm
 cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/lastJobs
 rm -f $outpath
 # copy the result back to the cluster filesystem and clean up /dev/shm
 cp -p /dev/shm/$outputFN/$outputFN $outpath
 rm -fr /dev/shm/$outputFN/*
 rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN
 '_EOF_'
   # << happy emacs
   chmod +x runTrf
 
   gensub2 part.list single template jobList
   para create jobList
   para push
   # not all of these jobs will finish either:
 # Completed: 85 of 106 jobs
 # Jobs currently running: 21
 # CPU time in finished jobs:      58076s     967.93m    16.13h    0.67d  0.002 y
 # IO & Wait Time:                   828s      13.81m     0.23h    0.01d  0.000 y
 # Time in running jobs:         1988997s   33149.95m   552.50h   23.02d  0.063 y
 # Average job time:                 693s      11.55m     0.19h    0.01d
 # Longest running job:            94730s    1578.83m    26.31h    1.10d
 # Longest finished job:           34216s     570.27m     9.50h    0.40d
 # Submission to last job:         34342s     572.37m     9.54h    0.40d
 
   # can use what we have here:
   liftUp result.bed ../../splitGap/noGap.lift error bed/*.bed
   # find jobs not done
   para status | grep -v done | awk '{print $(NF-1),$NF}' | grep TrfRun \
      > not.done.list
   # splitting up those last jobs:
   mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits
   cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits
   mkdir noGap
   awk '{print $2}' ../lastJobs/not.done.list | while read F
 do
   cp -p $F ./noGap/
 done
 
   # split into 1,000,000 chunks with 10,000 overlap:
   mkdir -p 1M_10K
 
 for F in noGap/*.fa
 do
   B=`basename $F | sed -e 's/.fa//'`
   echo "faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/${B}_"
   faSplit -lift=$B.lift -extra=10000 size $F 1000000 1M_10K/${B}_
 done
 
   ls -1S `pwd`/1M_10K/*.fa > part.list
   cat << '_EOF_' > runTrf
 #!/bin/bash
 # runTrf -- run trfBig (tandem repeat finder wrapper) on one fasta chunk,
 # staging input and output through /dev/shm to avoid network-fs overhead.
 #   $1 - input fasta file path
 #   $2 - output bed file path
 set -beEu -o pipefail
 export path1=$1
 export inputFN=`basename $1`
 export outpath=$2
 export outputFN=`basename $2`
 # private working directory in shared memory, named after the output file
 mkdir -p /dev/shm/$outputFN
 cp -p $path1 /dev/shm/$outputFN
 cd /dev/shm/$outputFN
 /cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \
       $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm
 cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitBits
 rm -f $outpath
 # copy the result back to the cluster filesystem and clean up /dev/shm
 cp -p /dev/shm/$outputFN/$outputFN $outpath
 rm -fr /dev/shm/$outputFN/*
 rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN
 '_EOF_'
   # << happy emacs
 
   cat << '_EOF_' > template
 #LOOP
 ./runTrf {check in line+ $(path1)}  {check out line bed/$(root1).bed}
 #ENDLOOP
 '_EOF_'
   # << happy emacs
 
   gensub2 part.list single template jobList
   para create jobList
   para push
   # not all of these jobs will complete either:
 # Completed: 53 of 96 jobs
 # CPU time in finished jobs:     212403s    3540.05m    59.00h    2.46d  0.007 y
 # IO & Wait Time:                  1851s      30.85m     0.51h    0.02d  0.000 y
 # Average job time:                4043s      67.38m     1.12h    0.05d
 # Longest finished job:           68726s    1145.43m    19.09h    0.80d
 # Submission to last job:         68890s    1148.17m    19.14h    0.80d
   # use what results we have here:
   cat *.lift  | liftUp parts.bed stdin error bed/*.bed
   liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed \
     | sort -u | sort -k1,1 -k2,2n > hg38.result.bed
 
   para status | grep -v -w done | awk '{print $(NF-1)}' > will.not.finish.txt
 
   # split those last bits:
   mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits
   cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits
   mkdir splitBits
   cat ../splitBits/will.not.finish.txt | while read F
 do
   cp -p $F splitBits
 done
 
   #  100K chunks with 10K overlap
   mkdir -p 100K_10K
 
 for F in splitBits/*.fa
 do
   B=`basename $F | sed -e 's/.fa//'`
   echo "faSplit -lift=$B.lift -extra=10000 size $F 100000 100K_10K/${B}_"
   faSplit -lift=$B.lift -extra=10000 size $F 100000 100K_10K/${B}_
 done
 
   cat << '_EOF_' > runTrf
 #!/bin/bash
 # runTrf -- run trfBig (tandem repeat finder wrapper) on one fasta chunk,
 # staging input and output through /dev/shm to avoid network-fs overhead.
 #   $1 - input fasta file path
 #   $2 - output bed file path
 set -beEu -o pipefail
 export path1=$1
 export inputFN=`basename $1`
 export outpath=$2
 export outputFN=`basename $2`
 # private working directory in shared memory, named after the output file
 mkdir -p /dev/shm/$outputFN
 cp -p $path1 /dev/shm/$outputFN
 cd /dev/shm/$outputFN
 /cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \
       $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm
 cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/splitSplitBits
 rm -f $outpath
 # copy the result back to the cluster filesystem and clean up /dev/shm
 cp -p /dev/shm/$outputFN/$outputFN $outpath
 rm -fr /dev/shm/$outputFN/*
 rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN
 '_EOF_'
   # << happy emacs
   chmod +x runTrf
 
   cat << '_EOF_' > template
 #LOOP
 ./runTrf {check in line+ $(path1)}  {check out line bed/$(root1).bed}
 #ENDLOOP
 '_EOF_'
   # << happy emacs
 
   ls -1S `pwd`/100K_10K/*.fa > part.list
   gensub2 part.list single template jobList
   para create jobList
   para push
   # one last bit does not complete:
 # Completed: 420 of 421 jobs
 # CPU time in finished jobs:      19862s     331.04m     5.52h    0.23d  0.001 y
 # IO & Wait Time:                  2360s      39.33m     0.66h    0.03d  0.000 y
 # Average job time:                  53s       0.88m     0.01h    0.00d
 # Longest finished job:             368s       6.13m     0.10h    0.00d
 # Submission to last job:           448s       7.47m     0.12h    0.01d
 
   # can use the results obtained here:
   cat *.lift  | liftUp splitParts.bed stdin error bed/*.bed
   cat ../splitBits/*.lift | liftUp parts.bed  stdin error splitParts.bed
   liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \
     | sort -k1,1 -k2,2n > hg38.result.bed
 
   para status | grep -v -w done | awk '{print $(NF-1)}'
   # last chunk: 100K_10K/hg38_89_2_00.fa
 
   mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K
   cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K
   cp -p ../splitSplitBits/100K_10K/hg38_89_2_00.fa .
 
   # 20K chunks with 10K overlap:
   mkdir -p 20K_10K
 
 for F in hg38_89_2_00.fa
 do
   B=`basename $F | sed -e 's/.fa//'`
  echo "faSplit -lift=$B.lift -extra=10000 size $F 20000 20K_10K/${B}_"
   faSplit -lift=$B.lift -extra=10000 size $F 20000 20K_10K/${B}_
 done
 
   ls -1S `pwd`/20K_10K/*.fa > part.list
   cat << '_EOF_' > runTrf
 #!/bin/bash
 set -beEu -o pipefail
 export path1=$1
 export inputFN=`basename $1`
 export outpath=$2
 export outputFN=`basename $2`
 mkdir -p /dev/shm/$outputFN
 cp -p $path1 /dev/shm/$outputFN
 cd /dev/shm/$outputFN
 /cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \
       $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm
 cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last100K
 rm -f $outpath
 cp -p /dev/shm/$outputFN/$outputFN $outpath
 rm -fr /dev/shm/$outputFN/*
 rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN
 '_EOF_'
   # << happy emacs
  chmod +x runTrf
   cat << '_EOF_' > template
 #LOOP
 ./runTrf {check in line+ $(path1)}  {check out line bed/$(root1).bed}
 #ENDLOOP
 '_EOF_'
   # << happy emacs
 
   gensub2 part.list single template jobList
   para create jobList
   para push
   # one of these jobs will not finish:
 # Completed: 4 of 5 jobs
 # CPU time in finished jobs:         10s       0.17m     0.00h    0.00d  0.000 y
 # IO & Wait Time:                    16s       0.26m     0.00h    0.00d  0.000 y
 # Average job time:                   7s       0.11m     0.00h    0.00d
 # Longest finished job:               8s       0.13m     0.00h    0.00d
 # Submission to last job:            16s       0.27m     0.00h    0.00d
 
   # can use the results we have here:
   cat *.lift  | liftUp 20Kparts.bed stdin error bed/*.bed
   cat ../splitSplitBits/*.lift | liftUp 100Kpart.bed stdin error 20Kparts.bed
   cat ../splitBits/*.lift | liftUp parts.bed  stdin error 100Kpart.bed
   liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \
     | sort -k1,1 -k2,2n > hg38.result.bed
 
   # finally, what turns out to be the last batch:
   mkdir /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K
   cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K
   cp -p ../last100K/20K_10K/hg38_89_2_00_3.fa .
 
   # 2K chunks with 1K overlap
   mkdir -p 2K_1K
 
 for F in hg38_89_2_00_3.fa
 do
   B=`basename $F | sed -e 's/.fa//'`
  echo "faSplit -lift=$B.lift -extra=1000 size $F 2000 2K_1K/${B}_"
   faSplit -lift=$B.lift -extra=1000 size $F 2000 2K_1K/${B}_
 done
 
   ls -1S `pwd`/2K_1K/*.fa > part.list
   cat << '_EOF_' > runTrf
 #!/bin/bash
 set -beEu -o pipefail
 export path1=$1
 export inputFN=`basename $1`
 export outpath=$2
 export outputFN=`basename $2`
 mkdir -p /dev/shm/$outputFN
 cp -p $path1 /dev/shm/$outputFN
 cd /dev/shm/$outputFN
 /cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf \
       $inputFN /dev/null -bedAt=$outputFN -tempDir=/dev/shm
 cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M/last30K
 rm -f $outpath
 cp -p /dev/shm/$outputFN/$outputFN $outpath
 rm -fr /dev/shm/$outputFN/*
 rmdir --ignore-fail-on-non-empty /dev/shm/$outputFN
 '_EOF_'
   # << happy emacs
   chmod +x runTrf
   cat << '_EOF_' > template
 #LOOP
 ./runTrf {check in line+ $(path1)}  {check out line bed/$(root1).bed}
 #ENDLOOP
 '_EOF_'
   # << happy emacs
 
   gensub2 part.list single template jobList
  para create jobList
   para push
 # Completed: 15 of 15 jobs
 # CPU time in finished jobs:          1s       0.02m     0.00h    0.00d  0.000 y
 # IO & Wait Time:                    26s       0.43m     0.01h    0.00d  0.000 y
 # Average job time:                   2s       0.03m     0.00h    0.00d
 # Longest finished job:               4s       0.07m     0.00h    0.00d
 # Submission to last job:            14s       0.23m     0.00h    0.00d
 
   cat *.lift  | liftUp 2Kparts.bed stdin error bed/*.bed
   cat ../last100K/*.lift | liftUp 20Kpart.bed stdin error 2Kparts.bed
   cat ../splitSplitBits/*.lift | liftUp 100Kpart.bed stdin error 20Kpart.bed
   cat ../splitBits/*.lift | liftUp parts.bed  stdin error 100Kpart.bed
   liftUp -type=.bed stdout ../../splitGap/noGap.lift error parts.bed | sort -u \
     | sort -k1,1 -k2,2n > hg38.result.bed
 
   ## To put it all together:
   cd /hive/data/genomes/hg38/bed/simpleRepeat.2013-12-27/run20M
   cat /hive/data/genomes/hg38/TrfPart20M/???/*.bed lastJobs/bed/*.bed \
      splitBits/parts.bed splitSplitBits/parts.bed last100K/parts.bed \
      last30K/parts.bed > beforeLift.simpleRepeat.bed
   liftUp -type=.bed stdout ../splitGap/noGap.lift error \
      beforeLift.simpleRepeat.bed | sort -u \
        | sort -k1,1 -k2,2n > simpleRepeat.bed
 
   awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
 
   hgLoadBed hg38 simpleRepeat simpleRepeat.bed \
         -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
   featureBits hg38 simpleRepeat > fb.simpleRepeat 2>&1
   cat fb.simpleRepeat
 # 146785521 bases of 3049335806 (4.814%) in intersection
 
   cd /hive/data/genomes/hg38/bed
   ln -s simpleRepeat.2013-12-27/run20M simpleRepeat
 
 ############################################################################
 
  # WINDOWMASKER - DONE - 2013-12-24 - Hiram
     mkdir /hive/data/genomes/hg38/bed/windowMasker
     cd /hive/data/genomes/hg38/bed/windowMasker
     time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
 	-dbHost=hgwdev hg38 > do.log 2>&1 &
 
 ############################################################################
 # Verify all gaps are marked - DONE - 2013-12-24 - Hiram
     mkdir /hive/data/genomes/hg38/bed/gap
     cd /hive/data/genomes/hg38/bed/gap
     time nice -n +19 findMotif -motif=gattaca -verbose=4 \
 	-strand=+ ../../hg38.unmasked.2bit > findMotif.txt 2>&1
     #	real    0m28.634s
     grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
     featureBits hg38 -not gap -bed=notGap.bed
     #	3049335806 bases of 3049335806 (100.000%) in intersection
     time featureBits hg38 allGaps.bed notGap.bed -bed=new.gaps.bed
     #   20023 bases of 3049335806 (0.001%) in intersection
     # real    0m20.427s
     # this indicates that 20,023 bases are not marked as N's
     # with this element size profile:
     awk '{print $3-$2}' new.gaps.bed | ave stdin
 # Q1 1.000000
 # median 1.000000
 # Q3 100.000000
 # average 44.894619
 # min 1.000000
 # max 1000.000000
 # count 446
 # total 20023.000000
 # standard deviation 81.743447
 
     # the four largest ones:
 # 1000 chr2         32916625        32917625        chr2.7
 # 1000 chr2         32867130        32868130        chr2.6
 #  348 chr20        36314371        36314719        chr20.36
 #  200 chr12       123443533       123443733        chr12.10
 
 #########################################################################
 ## CYTOBAND - fixing the ideogram track (DONE - 2014-06-11 - Hiram)
     ## the file we used before was broken
     mkdir -p /hive/data/outside/ncbi/ideogram/2014-06
     cd /hive/data/outside/ncbi/ideogram/2014-06
     # fetch all the ideogram files:
     rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ ./
     mkdir /hive/data/genomes/hg38/bed/cytoBandUpdate
     cd /hive/data/genomes/hg38/bed/cytoBandUpdate
 
     # Create bed file
     $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \
 /hive/data/outside/ncbi/ideogram/2014-06/ideogram_9606_GCF_000001305.14_850_V1
 
     # add in the other genome data:
     hgsql -N -e 'select * from cytoBand;' hg38 \
         | egrep "chrU|chrM|_alt|_random" >> cytoBand.bed
 
     $HOME/kent/src/utils/ncbi/cytoBandVerify.pl
     #   everything checks out OK on 455 chroms
 
     # Load the bed file
     hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \
 	hg38 cytoBand cytoBand.bed
     cut -f1 cytoBand.bed | sort -u | awk '{print length($1)}' | sort -rn | head
     #  23
     sed -e 's/12/23/' $HOME/kent/src/hg/lib/cytoBand.sql > cytoBand.sql
     sort -k1,1 -k2,2n cytoBand.bed \
 	| hgLoadSqlTab hg38 cytoBand cytoBand.sql stdin
 
     # Make cytoBandIdeo track for ideogram gif on hgTracks page.
     # cytoBandIdeo is just a replicate of the cytoBand track.
     hgsql -e "drop table cytoBandIdeo;" hg38
     hgsql hg38 -e "create table cytoBandIdeo (index(chrom(23),chromStart)) as select * from cytoBand;"
 
 #########################################################################
 ##  CYTOBAND - ideogram track (DONE - 2014-03-04 - Hiram)
     ssh hgwdev
     mkdir -p /hive/data/outside/ncbi/ideogram/2014-03
     cd /hive/data/outside/ncbi/ideogram/2014-03
 
     # fetch all the ideogram files:
     rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ ./
 
     mkdir /hive/data/genomes/hg38/bed/cytoBand
     cd /hive/data/genomes/hg38/bed/cytoBand
 
     # Create bed file
     $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \
 /hive/data/outside/ncbi/ideogram/2014-03/ideogram_9606_GCF_000001305.14_850_V1
 
     # add in the other genome data:
     hgsql -N -e 'select * from cytoBand;' hg38 > bobTable.bed
 
     egrep "chrU|chrM|_alt|_random" bobTable.bed >> cytoBand.bed
 
     ## can now verify before load:
     $HOME/kent/src/utils/ncbi/cytoBandVerify.pl
     #   everything checks out OK on 455 chroms
 
     # Load the bed file
     hgLoadBed -tab -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \
 	hg38 cytoBand cytoBand.bed
     cut -f1 cytoBand.bed | sort -u | awk '{print length($1)}' | sort -rn | head
     #  23
     sed -e 's/12/23/' $HOME/kent/src/hg/lib/cytoBand.sql > cytoBand.sql
     sort -k1,1 -k2,2n cytoBand.bed \
 	| hgLoadSqlTab hg38 cytoBand cytoBand.sql stdin
 
     # Make cytoBandIdeo track for ideogram gif on hgTracks page.
     # cytoBandIdeo is just a replicate of the cytoBand track.
     hgsql -e "drop table cytoBandIdeo;" hg38
     hgsql hg38 -e "create table cytoBandIdeo (index(chrom(23),chromStart)) as select * from cytoBand;"
 
 ##########################################################################
 # cytoBandIdeo - (DONE - 2013-12-26 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/cytoBand
     cd /hive/data/genomes/hg38/bed/cytoBand
     makeCytoBandIdeo.csh hg38
 
 #making temporary liftover of items from hg19
 liftOver /hive/data/genomes/hg19/bed/ncbiCytoBand/cytobands.bed \
       /hive/data/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \
       cytobands.bed unMapped
 
 liftOver -minBlocks=0.5 /hive/data/genomes/hg19/bed/ncbiCytoBand/cytobands.bed \
       /hive/data/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz \
       cytobands.0.5.bed unMapped0.5
 
 ###############################                    ######################
 # cytoBandIdeo - (reDONE - 2014-02-25 - kuhn)
 
# adding centromeres to generic cytoBandIdeo table as it exists.
 # (lifted track is already gone)
 
 # get the cen values for hg38
 hgsql -Ne "SELECT DISTINCT chrom FROM centromeres" hg38 | sort > hg38.chroms
 rm -f hg38.cens
 foreach chrom (`cat hg38.chroms`)
   set cenStart=""
   set cenEnd=""
   set cenStart=`hgsql -Ne 'SELECT MIN(chromStart) FROM centromeres WHERE chrom = "'$chrom'"' hg38`
   set cenEnd=`hgsql -Ne 'SELECT MAX(chromEnd) FROM centromeres WHERE chrom = "'$chrom'"' hg38`
   echo "$chrom $cenStart $cenEnd" >> hg38.cens
 end
 
 # Modified makeCytoBandIdeo.csh to use this file instead of looking
 #   for centromeres in a gap table.
 # Replaced existing cytoBandIdeo table, which was really only a copy
 #   of chromInfo.
 
 ##########################################################################
 # hg19 <-> hg38 difference tracks (DONE - 2013-12-28 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/liftOverHg38
     cd /hive/data/genomes/hg19/bed/liftOverHg38
 
     #	not needed, but interesting, collect all the fragment
     #	definitions from the gold tables:
     hgsql -N -e "select frag,fragStart,fragEnd,strand from gold;" hg19 \
         | sort > hg19.gold.frags.tab
 
     hgsql -N -e "select frag,fragStart,fragEnd,strand from gold;" hg38 \
         | sort > hg38.gold.frags.tab
 
     # construct common and difference listings
     comm -12 hg19.gold.frags.tab hg38.gold.frags.tab \
 	> identical.hg19.hg38.frags.tab
     comm -23 hg19.gold.frags.tab hg38.gold.frags.tab \
 	> unique.hg19Only.frags.tab
     comm -13 hg19.gold.frags.tab hg38.gold.frags.tab \
 	> unique.hg38Only.frags.tab
 
     # better yet, get full information about each fragment
     hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from gold;" hg19 \
         | sort -k6 > hg19.gold.tab
 
     hgsql -N -e "select chrom,chromStart,chromEnd,ix,type,frag,fragStart,fragEnd,strand from gold;" hg38 \
         | sort -k6 > hg38.gold.tab
 
     # construct a single key for each fragment for joining.
     # the key is frag,fragStart,fragEnd,strand
     awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n",
 	$6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg19.gold.tab | sort \
 	> hg19.fragKey.tab
     awk '{printf "%s,%d,%d,%s\t%s\t%s\t%s\t%d\t%d\t%d\t%s\n",
 	$6,$7,$8,$9,$6,$9,$1,$2,$3,$4,$5}' hg38.gold.tab | sort \
 	> hg38.fragKey.tab
 
     # now, by joining those keys, we can get exact identicals, and
     # the only-in listings as bed files to load as tracks:
     join hg19.fragKey.tab hg38.fragKey.tab \
 	| awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $4,$5,$6,$2,$3,$5,$6}' \
         | sort -k1,1 -k2,2n > hg19.hg38.identical.bed
 
     join hg19.fragKey.tab hg38.fragKey.tab \
 	| awk '{printf "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0,0,128\n", $11,$12,$13,$9,$10,$12,$13}' \
         | sort -k1,1 -k2,2n > hg38.hg19.identical.bed
 
     join -v 1 hg19.fragKey.tab hg38.fragKey.tab \
 	| awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \
         | sort -k1,1 -k2,2n > hg19.only.bed
 
     join -v 2 hg19.fragKey.tab hg38.fragKey.tab \
 	| awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $4,$5,$6,$2,$3}' \
         | sort -k1,1 -k2,2n > hg38.only.bed
 
     hgLoadBed hg19 hg38ContigDiff hg19.only.bed
     hgLoadBed hg38 hg19ContigDiff hg38.only.bed
 
     wc -l hg??.only.bed
     #  6097 hg19.only.bed
     #  23632 hg38.only.bed
 
     # this leaves the outstanding question of "why" they might be in
     #	the only-in listings.  Some contigs may be different versions,
     #   sometimes different sections of the same contig are used,
     #	and contigs are dropped from hg19 to hg38, or new contigs added
     #	to hg38 to fill in gaps from hg19
     # Let's see if we can measure some of this:
     awk '{print $4}' hg19.only.bed | sort -u > hg19.only.ids.list
     awk '{print $4}' hg38.only.bed | sort -u > hg38.only.ids.list
 
    # Looks like 5405 identical contigs with different parts used:
     comm -12 hg19.only.ids.list hg38.only.ids.list > differentPortions.list
     wc -l differentPortions.list
     # 5405
 
     # and perhaps 63 = 5468-5405 of different versions of same contig:
     sed -e "s/\.[0-9]*$//" hg19.only.ids.list | sort -u \
 	> hg19.noVersions.ids.list
     sed -e "s/\.[0-9]*$//" hg38.only.ids.list | sort -u \
 	> hg38.noVersions.ids.list
     comm -12 hg19.noVersions.ids.list hg38.noVersions.ids.list | wc -l
     #	5468
     sed -e "s/\.[0-9]*$//" differentPortions.list | sort -u \
 	> differentPortions.noVersions.list
     comm -12 hg19.noVersions.ids.list hg38.noVersions.ids.list | sort -u \
 	> noVersions.common.list
     # indeed, 63 contigs of different versions:
     comm -23 noVersions.common.list differentPortions.noVersions.list \
 	| sort -u > differentVersions.list
     wc -l differentVersions.list
     #	63
 
     # dividing up these items:
     cat << '_EOF_' > identifyPortions.pl
#!/usr/bin/env perl

# identifyPortions.pl -- classify the "only in hg19" / "only in hg38" gold
# (golden path) fragments into three categories:
#   differentPortions -- same contig accession in both, different pieces used
#   differentVersions -- same contig in both, different version number
#   everything else   -- contigs dropped from hg19, or new in hg38
#
# reads (current directory): differentVersions.list, differentPortions.list,
#   hg19.only.bed, hg38.only.bed
# writes: hg19.differentPortions.bed hg19.differentVersions.bed
#   hg19.dropped.bed hg38.differentPortions.bed hg38.differentVersions.bed
#   hg38.newTo19.bed

use strict;
use warnings;

my %differentVersions;
my %differentPortions;

# versionless accessions present in both assemblies at different versions
open (FH, "<differentVersions.list" ) or
	die "can not read differentVersions.list";
while (my $line = <FH>) {
    chomp $line;
    $differentVersions{$line} = 1;
}
close (FH);

# full accessions used in both assemblies, but with different portions
# (no explicit '<' mode here; perl open defaults to read)
open (FH, "differentPortions.list" ) or
	die "can not read differentPortions.list";
while (my $line = <FH>) {
    chomp $line;
    $differentPortions{$line} = 1;
}
close (FH);

# first pass over hg19.only.bed: emit differentPortions/differentVersions
# items and remember (in %hg19Done) which accessions matched neither
my %hg19Done;
open (DP, ">hg19.differentPortions.bed") or die "can not write to hg19.differentPortions.bed";
open (DV, ">hg19.differentVersions.bed") or die "can not write to hg19.differentVersions.bed";
open (FH, "<hg19.only.bed" ) or die "can not read hg19.only.bed";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
    # assume done while $acc is still complete
    $hg19Done{$acc} = 1;
    if (exists($differentPortions{$acc})) {
	printf DP "%s\n", $line;
    } else {
	my $trimAcc = $acc;
	$trimAcc =~ s/\.[0-9]+$//;	# strip the .version suffix
	if (exists($differentVersions{$trimAcc})) {
	    printf DV "%s\n", $line;
	} else {
            # this one does not match
	    $hg19Done{$acc} = 0;
	}
    }
}
close (FH);
close (DV);
close (DP);
# second pass: accessions that matched neither list were dropped after hg19
open (DR, ">hg19.dropped.bed") or die "can not write to hg19.dropped.bed";
open (FH, "<hg19.only.bed" ) or die "can not read hg19.only.bed";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
    if (0 == $hg19Done{$acc}) {
	printf DR "%s\n", $line;
    }
}
close (FH);
close (DR);

# identical two-pass classification for the hg38-only fragments
my %hg38Done;
open (DP, ">hg38.differentPortions.bed") or die "can not write to hg38.differentPortions.bed";
open (DV, ">hg38.differentVersions.bed") or die "can not write to hg38.differentVersions.bed";
open (FH, "<hg38.only.bed" ) or die "can not read hg38.only.bed";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
    # assume done while $acc is still complete
    $hg38Done{$acc} = 1;
    if (exists($differentPortions{$acc})) {
	printf DP "%s\n", $line;
    } else {
	my $trimAcc = $acc;
	$trimAcc =~ s/\.[0-9]+$//;	# strip the .version suffix
	if (exists($differentVersions{$trimAcc})) {
	    printf DV "%s\n", $line;
	} else {
            # this one does not match
	    $hg38Done{$acc} = 0;
	}
    }
}
close (FH);
close (DV);
close (DP);
# second pass: unmatched hg38 accessions are new relative to hg19
open (DR, ">hg38.newTo19.bed") or die "can not write to hg38.newTo19.bed";
open (FH, "<hg38.only.bed" ) or die "can not read hg38.only.bed";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $start, $end, $acc, $score, $strand) = split('\s+', $line);
    if (0 == $hg38Done{$acc}) {
	printf DR "%s\n", $line;
    }
}
close (FH);
close (DR);
 '_EOF_'
     # << happy emacs
     chmod +x identifyPortions.pl
     ./identifyPortions.pl
     # make sure nothing was lost
     sort hg19.differentVersions.bed hg19.differentPortions.bed \
 	hg19.dropped.bed  | sum
     #	43711   233
     sort hg19.only.bed | sum
     #	43711   233
     sort hg38.differentVersions.bed hg38.differentPortions.bed \
 	hg38.newTo19.bed | sum
     #	00502   911
     sort hg38.only.bed | sum
     #	00502   911
 
     sort -k1,1 -k2,2n hg38.differentVersions.bed hg38.differentPortions.bed \
 	hg38.newTo19.bed > hg38.itemRgb.bed
     sort -k1,1 -k2,2n hg19.differentVersions.bed hg19.differentPortions.bed \
 	hg19.dropped.bed > hg19.itemRgb.bed
 
     hgLoadBed hg19 hg38ContigDiff hg19.itemRgb.bed
     # if you wanted to load the identicals in this track too:
     sort -k1,1 -k2,2n hg38.hg19.identical.bed hg38.itemRgb.bed \
        | hgLoadBed hg38 hg38ContigDiff stdin
     # but we don't, we deliver only the differences
     hgLoadBed hg38 hg38ContigDiff hg38.itemRgb.bed
 
 #########################################################################
 # construct ooc file to be used in blat operations
 #                      DONE - 2012-12-30 - Hiram
 # can be done on unmasked sequence the same result as masked:
     cd /hive/data/genomes/hg38
     time blat hg38.unmasked.2bit /dev/null /dev/null \
        -tileSize=11 -makeOoc=jkStuff/hg38.11.ooc -repMatch=1024
 
     # been confirmed, the 100-base non-bridged gaps are really non-bridged
     gapToLift -minGap=100 -bedFile=jkStuff/nonBridgedGaps.bed hg38 \
 	jkStuff/hg38.nonBridged.lft
 
 ##############################################################################
 # cpgIslands - (DONE - 2014-01-07 - Hiram)
     # run on the Hmmer + trfMask sequence
     mkdir /hive/data/genomes/hg38/bed/cpgIslands
     cd /hive/data/genomes/hg38/bed/cpgIslands
     time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \
       -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
         -workhorse=hgwdev -smallClusterHub=ku hg38 > do.log 2>&1
     # real    3m31.684s
     # wc -l cpgIsland.bed -> 30456 cpgIsland.bed
     cat fb.hg38.cpgIslandExt.txt
     #  23654068 bases of 3049335806 (0.776%) in intersection
 
     # Previously in hg19:
     featureBits -countGaps hg19 cpgIslandExt
     # 21842742 bases of 3137161264 (0.696%) in intersection
 
     # when run on Hmmer and Trf masked sequence:
     # wc -l cpgIsland.bed -> 30416 cpgIsland.bed
     #   23635946 bases of 3049335806 (0.775%) in intersection
 
     # when run on unmasked sequence:
     # wc -l cpgIsland.bed -> 55149 cpgIsland.bed
     # 33637531 bases of 3049335806 (1.103%) in intersection
 ##############################################################################
 # rerun cpgIslands on contig sequence (DONE - 2014-01-07 - Hiram)
     # this is a test of the contig sequence file,
     # should get a very similar answer to the above
     mkdir /hive/data/genomes/hg38/bed/cpgIslandsContigs
     cd /hive/data/genomes/hg38/bed/cpgIslandsContigs
 
     # run stepwise so the lift can be done on the result before loading
     time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \
       -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
        -stop=makeBed -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \
         -workhorse=hgwdev -smallClusterHub=ku hg38 > makeBed.log 2>&1
     # real    9m31.502s
     # fails on the bedToBigBed creation since this isn't the actual
     # hg38 sequence.
     mv cpgIsland.bed cpgIsland.beforeLift.bed
     liftUp -type=.bed stdout ../../jkStuff/hg38.contigs.lift carry \
       cpgIsland.beforeLift.bed | sort -k1,1 -k2,2n > cpgIsland.bed
     bedToBigBed -tab -type=bed4+6 -as=$HOME/kent/src/hg/lib/cpgIslandExt.as \
        cpgIsland.bed ../../chrom.sizes hg38.cpgIslandExt.bb
     zcat ../cpgIslands/cpgIsland.bed.gz | sort -k1,1 -k2,2n > t.bed
     # Surprisingly, a few more are detected, perhaps due to the different
     # masking since this contig run is on the final corrected cross-match rmsk
     # plus TRF, the above was on the corrupted HMMER+TRF mask:
     wc -l cpgIsland.bed t.bed
 #   30477 cpgIsland.bed
 #   30456 t.bed
     # 2,835 different items between the two:
     sort t.bed cpgIsland.bed | uniq -c | awk '$1 < 2' | wc -l
     # 2835
    # 29,049 identical items
     sort t.bed cpgIsland.bed | uniq -c | awk '$1 == 2' | wc -l
     # 29049
     cut -f1-3 cpgIsland.bed | sort > contigs.bed
     cut -f1-3 t.bed | sort > fullSequence.bed
     # 29,339 identical locations:
     comm -12 contigs.bed fullSequence.bed | wc -l
     # 29339
 
     time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \
       -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
        -continue=load -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \
         -workhorse=hgwdev -smallClusterHub=ku hg38 > load.log 2>&1
     # real    0m12.056s
 
     cat fb.hg38.cpgIslandExt.txt
     # 23610399 bases of 3049335806 (0.774%) in intersection
 
 ##############################################################################
 # rerun cpgIslands on contig UNMASKED sequence (DONE - 2014-01-07 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/cpgIslandsContigsUnmasked
     cd /hive/data/genomes/hg38/bed/cpgIslandsContigsUnmasked
 
     twoBitToFa -noMask ../../hg38.contigs.2bit stdout \
       | faToTwoBit stdin hg38.contigsUnmasked.2bit
 
     # verify sequence is OK:
     twoBitToFa hg38.contigsUnmasked.2bit stdout | faSize stdin
 # 3061688741 bases (12372958 N's 3049315783 real 3049315783 upper 0 lower)
 #    in 733 sequences in 1 files
 # %0.00 masked total, %0.00 masked real
     twoBitToFa hg38.contigsUnmasked.2bit stdout | faCount stdin | tail -1
 # total 3061688741 898285419 623727342 626335137 900967885  12372958 30979743
     # ACGT CpG same as original hg38.2bit except for the missing N's:
 # total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743
 
     # run stepwise so the lift can be done on the result before loading
     time $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \
       -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
        -stop=makeBed -maskedSeq=`pwd`/hg38.contigsUnmasked.2bit \
         -workhorse=hgwdev -smallClusterHub=ku hg38 > makeBed.log 2>&1
     # real    11m0.690s
     # as above, failed on the bedToBigBed step since this isn't the full hg38
     # sequence
     mv cpgIsland.bed cpgIsland.beforeLift.bed
     liftUp -type=.bed stdout ../../jkStuff/hg38.contigs.lift carry \
       cpgIsland.beforeLift.bed | sort -k1,1 -k2,2n > cpgIsland.bed
     bedToBigBed -tab -type=bed4+6 -as=$HOME/kent/src/hg/lib/cpgIslandExt.as \
        cpgIsland.bed ../../chrom.sizes hg38.cpgIslandExt.bb
    # a lot more here than for the masked sequence:
     wc -l cpgIsland.bed ../cpgIslandsContigs/cpgIsland.bed
     # 55149 cpgIsland.bed
     # 30477 ../cpgIslandsContigs/cpgIsland.bed
     featureBits -countGaps hg38 cpgIsland.bed
     # 33637531 bases of 3209286105 (1.048%) in intersection
     featureBits -countGaps hg38 ../cpgIslandsContigs/cpgIsland.bed
     # 23610399 bases of 3209286105 (0.736%) in intersection
 
     # debug load step so it can be loaded into a separate table:
     $HOME/kent/src/hg/utils/automation/doCpgIslands.pl \
       -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
        -debug -continue=load -maskedSeq=`pwd`/hg38.contigsUnmasked.2bit \
         -workhorse=hgwdev -smallClusterHub=ku hg38
 
     time ./doLoadCpg.csh > load.log 2>&1
     # real    0m2.179s
     # 33637531 bases of 3049335806 (1.103%) in intersection
 
 #########################################################################
 # construct liftOver to hg19 (DONE - 2013-12-31 - Hiram)
     # it turns out it doesn't matter if the query or target 2bit files
     # are masked.  This procedure can be done on completely unmasked sequences
     # for both, same result masked or not masked
     screen -S hg38	# manage this longish running job in a screen
     mkdir /hive/data/genomes/hg38/bed/blat.hg19.2013-12-31
    cd /hive/data/genomes/hg38/bed/blat.hg19.2013-12-31
     # this was run in manual steps as experiments were done about the masking
     # check it with -debug first to see if it is going to work:
     doSameSpeciesLiftOver.pl -stop=net -buildDir=`pwd` -bigClusterHub=ku \
       -dbHost=hgwdev -workhorse=hgwdev -debug \
         -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc hg38 hg19
     # the debug step doesn't actually construct enough files to run the
     # steps manually.  The chaining has an extra procedure that is performed
     # while not in 'debug' mode
     # the run.blat was operated manually, then chaining:
     time doSameSpeciesLiftOver.pl -continue=chain -stop=net -buildDir=`pwd` \
       -bigClusterHub=ku \
         -dbHost=hgwdev -workhorse=hgwdev \
            -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc \
              hg38 hg19 > chain.log 2>&1
     # real    22m31.635s
     # loading is only a few seconds:
     doSameSpeciesLiftOver.pl -continue=load -buildDir=`pwd` \
      -bigClusterHub=ku \
        -dbHost=hgwdev -workhorse=hgwdev \
           -ooc=/hive/data/genomes/hg38/jkStuff/hg38.11.ooc \
              hg38 hg19 > load.log 2>&1
 
     # verify this file exists:
     #	/gbdb/hg38/liftOver/hg38ToHg19.over.chain.gz
     # and try out the conversion on genome-test from hg38 to hg19
     # same file should exist for downloads:
     #  /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz
 
 ############################################################################
 # marking the PAR regions: (DONE - 2014-01-09 - Hiram)
     # after much experimentation with the AGP files and the given NCBI
     # files in hg38/genbank/Primary_Assembly/pseudoautosomal_region
     # the PAR region definitions can be seen in the par_align.gff file:
 # CM000685.2  10001  2781479  ->  CM000686.2 10001 2781479
 # CM000685.2  155701383  156030895 -> CM000686.2 56887903 57217415
     # equivalent to:
 # chrX  10001  2781479  ->  chrY 10001 2781479
 # chrX  155701383  156030895 -> chrY 56887903 57217415
 
     # subtract one for the chromStart position:
     cat << '_EOF_' > hg38Par.bed4
 chrX 10000      2781479   PAR1
 chrX 155701382  156030895 PAR2
 chrY 10000      2781479   PAR1
 chrY 56887902   57217415  PAR2
 '_EOF_'
     # << happy emacs
 
     hgLoadBed hg38 par hg38Par.bed4
     checkTableCoords  hg38
 
     # hg19 had:
 +-------+------------+-----------+------+
 | chrom | chromStart | chromEnd  | name |
 +-------+------------+-----------+------+
 | chrX  |      60000 |   2699520 | PAR1 |
 | chrX  |  154931043 | 155260560 | PAR2 |
 | chrY  |      10000 |   2649520 | PAR1 |
 | chrY  |   59034049 |  59363566 | PAR2 |
 +-------+------------+-----------+------+
 
    # The AGP files come close to defining the location, but not
     # precisely.  The first region uses different bits of AC006209.25:
 zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\
   | grep AC006209.25
 CM000685.2      2665048 2677319 56      F       AC006209.25     127483  139754 -
 CM000685.2      2677869 2804801 58      F       AC006209.25     1       126933 -
 zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\
   | grep AC006209.25
 CM000686.2      2665048 2677319 56      F       AC006209.25     127483  139754 -
 CM000686.2      2677869 2781479 58      F       AC006209.25     23323   126933 -
 
     # and the second region uses different bits of AJ271735.1:
 zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\
   | grep AJ271735.1 | head -1
 CM000685.2 155676925 155719966 3096  O AJ271735.1     44687    87728   +
 zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\
   | grep AJ271735.1 | head -1
 CM000686.2  56887903  56906486  356  O AJ271735.1     69145    87728   +
 
     # combining all the contig definitions from each will find all the
     # exact identical contig bits:
 zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrY.comp.agp.gz\
   | grep -v "^#" | awk '$5 != "N"' \
     | awk '{printf "%s_%d_%d\t%s\t%d\t%d\n", $6,$7,$8,$1,$2,$3}' \
     | sort > chrY.comp.agp.txt
 zcat ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/chrX.comp.agp.gz\
   | grep -v "^#" | awk '$5 != "N"' \
     | awk '{printf "%s_%d_%d\t%s\t%d\t%d\n", $6,$7,$8,$1,$2,$3}' \
     | sort > chrX.comp.agp.txt
    join -t'^I' chrY.comp.agp.txt chrX.comp.agp.txt | head
 
 CM000685.2  10001   44821   CM000686.2      10001   44821
 ...
 CM000685.2  2677320 2677868 CM000686.2      2677320 2677868
 
 CM000685.2 155719967  155720351       CM000686.2      56906487        56906871
 ...
 CM000685.2 155964490  156030895       CM000686.2      57151010        57217415
 
 ############################################################################
 ## altLocations track (DONE - 2014-01-02 - Hiram)
     # indicate corresponding locations between haplotypes and reference
     mkdir /hive/data/genomes/hg38/bed/altLocations
     cd /hive/data/genomes/hg38/bed/altLocations
 
     find ../../genbank/ALT_* -type f | grep alt_scaffold_placement.txt \
   | while read F
 do
   grep -v "^#" ${F} | sed -e 's/\./v/;' | awk -F'\t' '{printf "chr%s\t%d\t%d\tchr%s_%s_alt\n", $6,$12-1,$13,$6, $4}'
 done | sort -k1,1 -k2,2n > chrToAlt.bed
 
     # note silent hidden <tab> character in the join -t argument
     # explicit as written here
 
 find ../../genbank/ALT_* -type f | grep alt_scaffold_placement.txt \
   | while read F
 do
   grep -v "^#" ${F} | sed -e 's/\./v/;' | awk -F'\t' '{printf "chr%s_%s_alt\tchr%s:%d-%d\n", $6,$4,$6,$12,$13}'
 done | sort > altToChr.tab
 sort ../../chrom.sizes | join -t'^I' - altToChr.tab \
    | awk '{printf "%s\t0\t%d\t%s\n", $1,$2,$3}' > altToChr.bed
 
 
    hgLoadBed hg38 altLocations chrToAlt.bed altToChr.bed
    featureBits -countGaps hg38 altLocations
    # 170113652 bases of 3209286105 (5.301%) in intersection
 
 ############################################################################
 ## genscan (DONE - 2014-01-07 - Hiram)
    mkdir /hive/data/genomes/hg38/bed/genscan
    cd /hive/data/genomes/hg38/bed/genscan
 
    # using the contig sequence
    # running stepwise to allow the lifting of the final result
    time $HOME/kent/src/hg/utils/automation/doGenscan.pl hg38 -buildDir=`pwd` \
      -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \
        -stop=makeBed -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         > do.log 2>&1
    # three jobs did not finish due to almost all N's in the sequence,
    # just a couple of bases in each piece.  Their empty result is good enough.
    time $HOME/kent/src/hg/utils/automation/doGenscan.pl hg38 -buildDir=`pwd` \
      -maskedSeq=/hive/data/genomes/hg38/hg38.contigs.2bit \
        -continue=makeBed -stop=makeBed -bigClusterHub=ku -dbHost=hgwdev \
          -workhorse=hgwdev > makeBed.log 2>&1
    # real    0m48.161s
 
    cd lifted
    mkdir -p gtf subopt nameFixed/gtf nameFixed/pep newNames pep
    for F in ../gtf/000/*.gtf
 do
    B=`basename $F`
    liftUp gtf/${B} ../../../jkStuff/hg38.contigs.lift carry $F
    echo $B
 done
    for F in ../subopt/000/*.bed
 do
    B=`basename $F`
    liftUp subopt/${B} ../../../jkStuff/hg38.contigs.lift carry $F
    echo $B
 done
 
    ls gtf/chr*_[0-9][0-9].gtf \
      | sed -e 's/_[0-9][0-9]//; s#gtf/##; s/.gtf//;' | sort -u | while read C
 do
    cat ../pep/000/${C}_[0-9][0-9].pep > pep/${C}.pep
    cat gtf/${C}_[0-9][0-9].gtf | ./gtfFixId.pl ${C} > nameFixed/gtf/${C}.gtf
    ./pepNameFix.pl ${C} > nameFixed/pep/${C}.pep
 done
 
    cat nameFixed/gtf/*.gtf > ../hg38.genscan.gtf
    ls gtf | egrep -v '^chr[0-9XY][0-9]*_[0-9][0-9].gtf' | while read C
 do
    cat gtf/${C}
 done >> ../hg38.genscan.gtf
 
    cat nameFixed/pep/*.pep > ../hg38.genscan.pep
    ls gtf | egrep -v '^chr[0-9XY][0-9]*_[0-9][0-9].gtf' \
      | sed -e 's/.gtf/.pep/' | while read C
 do
    cat ../pep/000/${C}
 done >> ../hg38.genscan.pep
 
    cd /hive/data/genomes/hg38/bed/genscan
    cat lifted/subopt/*.bed | sort -k1,1 -k2,2n > hg38.genscanSubopt.bed
 
    gtfToGenePred hg38.genscan.gtf hg38.genscan.gp
    genePredCheck -db=hg38 hg38.genscan.gp
    # checked: 44149 failed: 0
    genePredToBed hg38.genscan.gp hg38.genscan.bed
    bedToBigBed hg38.genscan.bed ../../chrom.sizes hg38.genscan.bb
    bedToBigBed hg38.genscanSubopt.bed ../../chrom.sizes hg38.genscanSubopt.bb
    ldHgGene -gtf hg38 genscan hg38.genscan.gtf
 # Read 44149 transcripts in 339212 lines in 1 files
 #  44149 groups 345 seqs 1 sources 1 feature types
 
     cat fb.hg38.genscan.txt
     # 58278346 bases of 3049335806 (1.911%) in intersection
     cat fb.hg38.genscanSubopt.txt
     # 55020514 bases of 3049335806 (1.804%) in intersection
 
     # oddly, we are getting half of what hg19 had ?
     featureBits hg19 genscan
     # 106433874 bases of 2897316137 (3.674%) in intersection
 
     # This is because hg19 was run on soft-masked sequence and not
     # on hard masked sequence
 
 ############################################################################
 ## genscan on unmasked sequence experiment (DONE - 2013-12-03 - Hiram)
    ## instead, working on unmasked sequence:
    mkdir /hive/data/genomes/hg38/bed/genscan/unmaskedRun
    cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun
 
    mkdir liftSpecs
    split -a 3 -d -l 1 ../../../jkStuff/hg38.nonBridged.lift liftSpecs/hg38_
 
    mkdir fasta
 for F in liftSpecs/hg38_*
 do
    L=`cut -f2 $F`
    echo $L
    /cluster/home/hiram/kent/src/hg/utils/lft2BitToFa.pl \
        ../../../hg38.unmasked.2bit $F > fasta/${L}.fa
 done
 
 
    cat << '_EOF_' > template
 #LOOP
 ./runGsBig.bash $(path1) {check out exists gtf/$(root1).gtf} {check out exists pep/$(root1).pep} {check out exists subopt/$(root1).bed}
 #ENDLOOP
 '_EOF_'
   # << happy emacs
    cat << '_EOF_' > runGsBig.bash
 #!/bin/bash
 
 set -beEu -o pipefail
 
 export seqFile=$1
 export resultGtf=$2
 export resultPep=$3
 export resultSubopt=$4
 /cluster/bin/x86_64/gsBig $seqFile $resultGtf -trans=$resultPep -subopt=$resultSubopt -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000
 '_EOF_'
   # << happy emacs
 
   ls -1S `pwd`/fasta/*.fa > part.list
   gensub2 part.list single template jobList
   para create jobList
   para push
   # several jobs crashed:
 # Completed: 726 of 733 jobs
 # Crashed: 7 jobs
 # CPU time in finished jobs:      62501s    1041.68m    17.36h    0.72d  0.002 y
 # IO & Wait Time:                  2563s      42.72m     0.71h    0.03d  0.000 y
 # Average job time:                  90s       1.49m     0.02h    0.00d
 # Longest finished job:            3288s      54.80m     0.91h    0.04d
 # Submission to last job:          3294s      54.90m     0.92h    0.04d
 
   para status | grep -v -w done | awk '{print $(NF-3)}' > crashed.job.list
 
   mkdir /hive/data/genomes/hg38/bed/genscan/unmaskedRun/crashedJobs
   cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun/crashedJobs
   mkdir splitBits
 
   for F in chr2.06 chr1.03 chr3.05 chr12.07 chr10.05 chr17.08 chr11.04
 do
    faSplit -lift=${F}.lift gap ../fasta/${F}.fa 2000000 splitBits/${F}_
 done
 
   ls -1S `pwd`/splitBits/*.fa > part.list
   cat << '_EOF_' > runGsBig.bash
 #!/bin/bash
 
 set -beEu -o pipefail
 
 export seqFile=$1
 export resultGtf=$2
 export resultPep=$3
 export resultSubopt=$4
 /cluster/bin/x86_64/gsBig $seqFile $resultGtf -trans=$resultPep -subopt=$resultSubopt -exe=/scratch/data/genscan/genscan -par=/scratch/data/genscan/HumanIso.smat -tmp=/dev/shm -window=2400000
 '_EOF_'
   # << happy emacs
   chmod +x runGsBig.bash
 
   cat << '_EOF_' > template
 #LOOP
 ./runGsBig.bash $(path1) {check out exists gtf/$(root1).gtf} {check out exists pep/$(root1).pep} {check out exists subopt/$(root1).bed}
 #ENDLOOP
 '_EOF_'
   # << happy emacs
 
   gensub2 part.list single template jobList
   para create jobList
   para push
 # Completed: 331 of 334 jobs
 # Crashed: 3 jobs
 # CPU time in finished jobs:      18097s     301.62m     5.03h    0.21d  0.001 y
 # IO & Wait Time:                  1085s      18.08m     0.30h    0.01d  0.000 y
 # Average job time:                  58s       0.97m     0.02h    0.00d
 # Longest finished job:              79s       1.32m     0.02h    0.00d
 # Submission to last job:           249s       4.15m     0.07h    0.00d
   # the last three completed with -window=1600000
 
   # lifting results:
   cat << '_EOF_' > fixIds.pl
#!/usr/bin/env perl

use strict;
use warnings;

# fixIds.pl - renumber genscan gene ids in a lifted GTF so they are
# unique within one split sequence (e.g. chr1.03), writing chrN.M.gtf.
# Reads the lifted GTF on stdin; the single argument is the split name.

my $argc = scalar(@ARGV);

if ($argc != 1) {
  printf STDERR "usage: cat chrN.M.lifted | ./fixIds.pl chrN.M\n";
  exit 255;
}

my $F=shift;	# split sequence name, e.g. chr1.03
my $C = $F;
$C =~ s/\.[0-9][0-9]//;	# strip the .NN suffix to get the chrom name

my $id = 0;	# running gene number within this split sequence
my $prevId = "";	# gene_id seen on the previous line, to detect a new gene
open (GT, ">${F}.gtf") or die "can not write to ${F}.gtf";
while (my $line=<>) {
   chomp $line;
   my $geneId = $line;
   # isolate the numeric part of the gene_id "chrN<id>" attribute
   $geneId =~ s/^${C}.*gene_id "${C}//;
   $geneId =~ s/";.*//;
   $id += 1 if ( $prevId ne $geneId);	# new gene_id => advance the number
   # NOTE(review): the dots here are unescaped regex metacharacters;
   # they happen to match the literal dots in these genscan ids
   $line =~ s/${C}[0-9]+.[0-9]+/${F}.$id/g;
   printf GT "%s\n", $line;
   $prevId = $geneId;
}
close (GT);
 '_EOF_'
   # << happy emacs
   chmod +x fixIds.pl
   for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05
 do
   echo "${F}" 1>&2
   cut -f2 ${F}.lift | while read P
   do
      liftUp -type=.gtf stdout ${F}.lift error gtf/${P}.gtf
   done > ${F}.lifted.gtf
   cat ${F}.lifted.gtf | ./fixIds.pl ${F}
 done
   # copied these results to ../gtf/ to get into the final result
 # -rw-rw-r-- 1 3349959 Jan  2 15:33 chr1.03.gtf
 # -rw-rw-r-- 1 2439182 Jan  2 15:33 chr10.05.gtf
 # -rw-rw-r-- 1 1068097 Jan  2 15:33 chr11.04.gtf
 # -rw-rw-r-- 1 2392548 Jan  2 15:33 chr12.07.gtf
 # -rw-rw-r-- 1 1831336 Jan  2 15:33 chr17.08.gtf
 # -rw-rw-r-- 1 3539694 Jan  2 15:33 chr2.06.gtf
 # -rw-rw-r-- 1 2309903 Jan  2 15:33 chr3.05.gtf
 
   for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05
 do
   echo "${F}" 1>&2
   cut -f2 ${F}.lift | while read P
   do
      liftUp -type=.bed stdout ${F}.lift error subopt/${P}.bed
   done > ${F}.lifted.subopt.bed
 done
   # copied these results to ../subopt/ to get into the final result
 # -rw-rw-r-- 1 3349959 Jan  2 15:33 chr1.03.gtf
 # -rw-rw-r-- 1 2439182 Jan  2 15:33 chr10.05.gtf
 # -rw-rw-r-- 1 1068097 Jan  2 15:33 chr11.04.gtf
 # -rw-rw-r-- 1 2392548 Jan  2 15:33 chr12.07.gtf
 # -rw-rw-r-- 1 1831336 Jan  2 15:33 chr17.08.gtf
 # -rw-rw-r-- 1 3539694 Jan  2 15:33 chr2.06.gtf
 # -rw-rw-r-- 1 2309903 Jan  2 15:33 chr3.05.gtf
 
 
   cat << '_EOF_' > pepNameFix.pl
#!/usr/bin/env perl

use strict;
use warnings;

# pepNameFix.pl - rename the fasta headers in a genscan peptide file to
# sequential ids <chrN.M>.1, <chrN.M>.2, ... matching the renumbered GTF.
#
# BIG ASSUMPTION ! ! ! - the peptides are in the same order as
# they are in the GTF file ! ! !

my $argc = scalar(@ARGV);

if ($argc != 1) {
  printf STDERR "usage: cat chrN.M.needNameFix.pep | ./pepNameFix.pl chrN.M > chrN.M.pep\n";
  exit 255;
}

my $C=shift;	# split sequence name used as the id prefix

my $id = 1;	# next sequential peptide number

while (my $line = <>) {
  if ($line =~ m/^>/) {
    # replace the original header with the next sequential id
    printf ">%s.%d\n", $C, $id++;
  } else {
    print $line;	# sequence lines pass through unchanged
  }
}
 '_EOF_'
   # << happy emacs
   chmod +x pepNameFix.pl
 
 for F in chr1.03 chr10.05 chr11.04 chr12.07 chr17.08 chr2.06 chr3.05
 do
   echo "${F}" 1>&2
   cut -f2 ${F}.lift | while read P
   do
      cat pep/${P}.pep
   done > ${F}.needNameFix.pep
   cat ${F}.needNameFix.pep | ./pepNameFix.pl ${F} > ${F}.pep
 done
   # copied these results to ../pep/ to get into the final result:
 # -rw-rw-r-- 1 1592655 Jan  2 15:55 chr1.03.pep
 # -rw-rw-r-- 1 1169168 Jan  2 15:55 chr10.05.pep
 # -rw-rw-r-- 1  519106 Jan  2 15:55 chr11.04.pep
 # -rw-rw-r-- 1 1152111 Jan  2 15:55 chr12.07.pep
 # -rw-rw-r-- 1  775052 Jan  2 15:55 chr17.08.pep
 # -rw-rw-r-- 1 1799546 Jan  2 15:55 chr2.06.pep
 # -rw-rw-r-- 1 1248762 Jan  2 15:55 chr3.05.pep
 
   # and then, adding in all the results together
 
   cd /hive/data/genomes/hg38/bed/genscan/unmaskedRun
   cat << '_EOF_' > gtfIdFix.pl
#!/usr/bin/env perl

use strict;
use warnings;

# gtfIdFix.pl - renumber genscan gene ids in a per-chromosome lifted GTF
# to chrN.1, chrN.2, ... and record the old->new name mapping.
# Reads lifted/gtf/chrN.gtf on stdin; writes nameFixed/gtf/chrN.gtf and
# nameFixed/newNames/chrN.tab (old name <tab> new name).

my $argc = scalar(@ARGV);

if ($argc != 1) {
  printf STDERR "usage: cat lifted/gtf/chrN.gtf | ./gtfIdFix.pl chrN\n";
  exit 255;
}

my $C=shift;	# chromosome name, e.g. chr1

my $id = 0;	# running gene number for this chromosome
my $prevId = "";	# previous gene_id, to detect the start of a new gene
open (NM, ">nameFixed/newNames/${C}.tab") or die "can not write to nameFixed/newNames/${C}.tab";
open (GT, ">nameFixed/gtf/${C}.gtf") or die "can not write to nameFixed/gtf/${C}.gtf";
while (my $line=<>) {
   chomp $line;
   my $geneId = $line;
   # isolate the gene_id attribute value on this line
   $geneId =~ s/^${C}.*gene_id "//;
   $geneId =~ s/";.*//;
   if ( $prevId ne $geneId) {
     $id += 1;
     printf NM "%s\t%s.%d\n", $geneId, $C, $id;	# record old -> new name
   }
   # NOTE(review): the dots here are unescaped regex metacharacters;
   # they happen to match the literal dots in these genscan ids
   $line =~ s/${C}.[0-9]+.[0-9]+/${C}.$id/g;
   printf GT "%s\n", $line;
   $prevId = $geneId;
}
close (GT);
close (NM);
 '_EOF_'
   # << happy emacs
   chmod +x gtfIdFix.pl
 
   rm -fr lifted
   rm -fr nameFix
   mkdir -p lifted
   mkdir -p lifted/gtf
   mkdir -p lifted/pep
   mkdir -p lifted/subopt
   mkdir -p nameFix
   mkdir -p nameFix/gtf
   mkdir -p nameFix/newNames
 
   for F in liftSpecs/hg38_*
 do
    L=`cut -f2 $F`
    C=`cut -f4 $F`
    liftUp -type=.gtf stdout ${F} error gtf/${L}.gtf >> lifted/gtf/${C}.gtf
    cat pep/${L}.pep >> lifted/pep/${C}.pep
    liftUp -type=.bed stdout ${F} error subopt/${L}.bed >> lifted/subopt/${C}.bed
 done
 
   for F in lifted/gtf/*.gtf
 do
   C=`basename $F | sed -e 's/.gtf//'`
   cat $F | ./gtfIdFix.pl $C
 done
 
 mkdir -p nameFixed/pep
 
   cat << '_EOF_' > pepNameFix.pl
#!/usr/bin/env perl

use strict;
use warnings;

# pepNameFix.pl (mapping version) - rename the fasta headers in
# lifted/pep/chrN.pep using the old->new id mapping written by
# gtfIdFix.pl, emitting the renamed peptides on stdout.

my $argc = scalar(@ARGV);
if ($argc != 1) {
  printf STDERR "usage: ./pepNameFix.pl chrN > chrN.pep\n";
  exit 255
}

my $C = shift;	# chromosome name, e.g. chr1
my %newName;	# old gene id -> renumbered id

open (FH, "<lifted/pep/$C.pep") or die "can not read <lifted/pep/$C.pep";
open (NM, "<nameFixed/newNames/$C.tab") or die "can not read nameFixed/newNames/$C.tab";
# load the old->new mapping written by gtfIdFix.pl
while (my $line = <NM>) {
  chomp $line;
  my ($needFix, $fixedName) = split('\t', $line);
  $newName{$needFix} = $fixedName;
}
close (NM);

while (my $line = <FH>) {
  if ($line =~m /^>/) {
    chomp $line;
    $line =~ s/^>//;
    # every header must have a mapping, else the GTF and pep are out of sync
    die "can not find name to fix $line" if (!exists($newName{$line}));
    printf ">%s\n", $newName{$line};
  } else {
    print $line;	# sequence lines pass through unchanged
  }
}
close (FH);
 '_EOF_'
   # << happy emacs
   chmod +x pepNameFix.pl
 
   for F in lifted/pep/*.pep
 do
   C=`basename $F | sed -e 's/.pep//'`
   echo $C
   ./pepNameFix.pl $C > nameFixed/pep/$C.pep
 done
 
 #############################################################################
 # Mark the new centromere regions (DONE - 2014-01-09 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/centromere
     cd /hive/data/genomes/hg38/bed/centromere
     grep GJ ../../hg38.agp > hg38.centContigs.agp
 
     awk '{printf "%s\t%d\t%d\t%s\n", $1, $2-1, $3, $6}' hg38.centContigs.agp \
       > hg38.centContigs.bed4
 
     hgLoadBed hg38 centromeres hg38.centContigs.bed4
     checkTableCoords hg38 centromeres
 
 #############################################################################
 ## alternate sequence/haplotype alignments (DONE - 2014-01-23 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/lastzAltSequences
     cd /hive/data/genomes/hg38/bed/lastzAltSequences
 
 rm -fr hg38.haplotypes.lift temp.lift targetFa queryFa
 mkdir targetFa
 mkdir queryFa
 touch temp.lift
 
 cat ../altLocations/chrToAlt.bed | while read L
 do
   chrName=`echo $L | awk '{print $1}'`
   chromSize=`egrep "^$chrName   " ../../chrom.sizes | cut -f2`
   chrStart=`echo $L | awk '{if (($2-10000)>=0) {printf "%d", $2-10000} else {printf "0"}}'`
   chrEnd=`echo $L | awk -v chromSize=$chromSize '{if (($3+10000)<=chromSize) {printf "%d", $3+10000} else {printf "%d", chromSize}}'`
   chrSize=`echo $chrEnd $chrStart | awk '{print $1-$3}'`
   queryName=`echo $L | awk '{print $4}'`
   partName="${chrName}_${chrStart}_${chrEnd}"
   echo $chrName $chrStart $chrEnd $queryName $partName $chromSize
   echo -e "$chrStart\t${partName}\t$chrSize\t$chrName\t$chromSize" >> temp.lift
   twoBitToFa ../../hg38.unmasked.2bit:$chrName:$chrStart-$chrEnd stdout | sed -e "s/^>.*/>$partName/;" > targetFa/$queryName.fa
   twoBitToFa ../../hg38.unmasked.2bit:$queryName queryFa/$queryName.fa
 done
 
 sort -u temp.lift | sort -k4,4 -k1,1n > hg38.haplotypes.lift
 
     # these were run serially on hgwdev, they could be a cluster run:
     ssh ku
     mkdir /hive/data/genomes/hg38/bed/lastzAltSequences/run.blastz
     cd /hive/data/genomes/hg38/bed/lastzAltSequences/run.blastz
     mkdir ../lav ../psl
 
     # construct the jobList
     ls ../targetFa | sed -e 's/.fa//;' | while read partName
 do
    echo "./runJob.sh ${partName}"
 done > jobList
 
     cat << '_EOF_' > runJob
 #!/bin/sh
 
 export partName=$1
 export target="../targetFa/$partName.fa"
 export query="../queryFa/$partName.fa"
 export lav="../lav/$partName.lav"
 export psl="../psl/$partName.psl"
 
 /cluster/bin/penn/lastz-distrib-1.03.46/bin/lastz \
   $target $query \
   Y=15000 T=2 M=254 O=600 H=2000 O=600 E=150 K=10000 L=10000 \
   Q=/scratch/data/blastz/human_chimp.v2.q > $lav
 lavToPsl $lav stdout | liftUp $psl ../hg38.haplotypes.lift error stdin
 '_EOF_'
     # << happy emacs
 
     # these were run serially on hgwdev, they could be a cluster run:
     time ./jobList > do.log
     # real    61m35.898s
 
     # chaining lastz results:
     mkdir -p /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain/run/chain
     cd /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain/run
 
     ls ../../psl/*.psl | while read P
 do
   B=`basename $P | sed -e 's/.psl//'`
   echo $B $P
   ls -og $P ../../targetFa/${B}.fa ../../queryFa/${B}.fa
   /cluster/home/hiram/kent/src/hg/mouseStuff/axtChain/axtChain \
     -psl -scoreScheme=/scratch/data/blastz/human_chimp.v2.q \
     -minScore=1000 -linearGap=medium $P \
     ../../../../hg38.unmasked.2bit \
     ../../../../hg38.unmasked.2bit stdout \
   | chainAntiRepeat ../../../../hg38.unmasked.2bit \
     ../../../../hg38.unmasked.2bit stdin chain/${B}.chain
 done
 
    # real    7m54.677s
 
    cd /hive/data/genomes/hg38/bed/lastzAltSequences/axtChain
    find ./run/chain -name "*.chain" | chainMergeSort -inputList=stdin \
        | nice gzip -c > hg38.haplotypes.all.chain.gz
    chainPreNet  hg38.haplotypes.all.chain.gz ../../../chrom.sizes \
      /hive/data/genomes/hg38/chrom.sizes stdout \
        | chainNet  stdin -minSpace=1 ../../../chrom.sizes \
           ../../../chrom.sizes stdout /dev/null \
              | netSyntenic stdin noClass.net
 
     # Make liftOver chains from chroms to alternates:
     netChainSubset -verbose=0 noClass.net hg38.haplotypes.all.chain.gz stdout \
       | chainStitchId stdin stdout | gzip -c > hg38.haplotypes.over.chain.gz
     # swap the alignments to get the alternates to chrom mappings:
     chainSwap hg38.haplotypes.over.chain.gz stdout \
        | gzip -c > hg38.reference.over.chain.gz
     # and put them all together so mappings go both directions
     chainMergeSort hg38.haplotypes.over.chain.gz hg38.reference.over.chain.gz \
         | gzip -c > hg38.haploReference.over.chain.gz
 
     hgLoadChain -tIndex hg38 chainAltSequence hg38.haploReference.over.chain.gz
     netClass -verbose=0 -noAr noClass.net hg38 hg38 hg38.hg38AltSequence.net
     netFilter -minGap=10 hg38.hg38AltSequence.net \
       | hgLoadNet -verbose=0 hg38 netAltSequence stdin
 
     chainToPsl hg38.haploReference.over.chain.gz ../../../chrom.sizes \
       ../../../chrom.sizes \
         /hive/data/genomes/hg38/hg38.unmasked.2bit  \
           /hive/data/genomes/hg38/hg38.unmasked.2bit  \
              hg38.beforeRecalc.haploReference.over.psl
 
     pslCheck -targetSizes=../../../chrom.sizes \
         -querySizes=../../../chrom.sizes \
     hg38.beforeRecalc.haploReference.over.psl 2>&1 | tail -1
     # checked: 3092 failed: 57 errors: 57
 
     pslRecalcMatch hg38.beforeRecalc.haploReference.over.psl \
     ../../../hg38.unmasked.2bit ../../../hg38.unmasked.2bit  \
         hg38.haploReference.over.psl
 
     pslCheck -targetSizes=../../../chrom.sizes \
       -querySizes=../../../chrom.sizes \
          hg38.haploReference.over.psl 2>&1 | tail -1
     # checked: 3092 failed: 0 errors: 0
 
     hgLoadPsl hg38 -table=altSequenceLiftOver hg38.haploReference.over.psl
 
 #############################################################################
 ## construct non-bridged contig sequence (DONE - 2014-01-10 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/nonBridgedContigs
     cd /hive/data/genomes/hg38/bed/nonBridgedContigs
 
     # only need the actual split chroms in this lift, and the
     # _nn name is a bit more convenient than the .nn:
     gapToLift -minGap=100 hg38 stdout | sed -e 's/\./_/;' \
         | awk '$1 != 0' > hg38.contigs.lift
     # the warnings gapToLift issues are about gaps defined in the table
    # that are abutting each other.  Telomere gaps are next to contig gaps
     # those lifts in the format of a bed file:
     awk '{printf "%s\t%d\t%d\t%s\n", $4, $1, $1+$3, $2}' hg38.contigs.lift \
         > hg38.contigs.bed
     # the negation of that is the gaps between the contigs
     #  fixup the .N to _nn with the awk:
     featureBits -not -countGaps hg38 hg38.contigs.bed -bed=stdout \
 | awk '{split($4,a,"."); printf "%s\t%d\t%d\t%s_%02d\n", $1,$2,$3,a[1],a[2]}' \
              > hg38.gaps.bed
     # 268613637 bases of 3209286105 (8.370%) in intersection
 
    # together, those two should be 100% of the genome exactly:
     featureBits -countGaps -or hg38 hg38.contigs.bed hg38.gaps.bed
     #  3209286105 bases of 3209286105 (100.000%) in intersection
 
     # the list of all those other bits not in the split chroms:
     egrep "_alt|chrUn|chrM|_random" hg38.gaps.bed | cut -f1 \
        | sort > other.bits.list
 
     # extract those chrom pieces and the other bits from the masked sequence:
     (twoBitToFa -bed=hg38.contigs.bed ../../hg38.2bit stdout; \
       twoBitToFa -seqList=other.bits.list ../../hg38.2bit stdout) \
         | faToTwoBit stdin hg38.contigs.2bit
     twoBitInfo hg38.contigs.2bit stdout | sort -k2nr > hg38.contigs.chrom.sizes
     # verify nothing has been lost:
     twoBitToFa ../../hg38.2bit stdout | faCount stdin | tail -1
 # total 3209286105 898285419 623727342 626335137 900967885 159970322 30979743
     twoBitToFa hg38.contigs.2bit stdout | faCount stdin | tail -1
 # total 3061688741 898285419 623727342 626335137 900967885  12372958 30979743
     # the ACGT and CPG counts remain the same, only N's have been lost
 
     # make a copy of this at the top:
     cp -p hg38.contigs.2bit ../..
     cp -p hg38.contigs.lift ../../jkStuff
 
     # load as a track to be able to see where they are:
     egrep "chrUn|chrM|_alt|_random" hg38.contigs.chrom.sizes \
 	| awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $1}' \
 	> fullCoverage.hg38Contigs.bed
     cat hg38.contigs.bed >>  fullCoverage.hg38Contigs.bed
     featureBits -or -countGaps hg38 fullCoverage.hg38Contigs.bed gap
     # 3209286105 bases of 3209286105 (100.000%) in intersection
 
     hgLoadBed hg38 contigAlignmentSegments fullCoverage.hg38Contigs.bed
 
 #############################################################################
 ## analysis of repeat elements from each RM run
 ## (DONE - 2014-01-10 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/repeatElementCount
     cd /hive/data/genomes/hg38/bed/repeatElementCount
     for F in ../rmsk*/hg38.class.profile.txt \
           ../repeatMaskerGenbank/hg38.class.profile.txt
 do
    D=`dirname $F`
    B=`basename $D | sed -e 's/repeatMaskerGenbank/NCBI/; s/rmsk//;'`
    echo "==== $B ===="
    grep rmskClass $F | sed -e 's#rmskClass/##; s/.tab//;' \
      | awk '{printf "%s\t%d\n", $2, $1}' | sort > ${B}.tab
 done
 
    # Hmmer does not have snRNA and tRNA ?
    echo -e "snRNA\t0" >> Hmmer.tab
    echo -e "tRNA\t0" >> Hmmer.tab
    sort Hmmer.tab > t.tab
    mv t.tab Hmmer.tab
 
    echo "#  Repeat Masker item counts" > table.result.txt
    echo "#  class         NCBI cross-match rmblastn HMMER" >> table.result.txt
    join NCBI.tab CM.tab  | join - Blastn.tab  | join - Hmmer.tab \
      | awk '{printf "%-15s\t%7d\t%7d\t%7d\t%7d\n", $1,$2,$3,$4,$5}' \
        | sort -k2,2nr >> table.result.txt
 
    cat table.result.txt
 #  Repeat Masker item counts
 #  class         NCBI cross-match rmblastn HMMER
 SINE            1849444 1852545 1822406 1884179
 LINE            1586141 1570523 1551012 1702529
 LTR              759248  748597  737799  805427
 DNA              502186  499108  485558  565171
 Simple_repeat    433789  703682  716968  636906
 Low_complexity   396378  102856  105181   95480
 Satellite         10198    7962    7703   10852
 LTR?               5884    5667    5068    9181
 snRNA              4595    4516    4548       0
 Retroposon         4163    5750    5630   11861
 Unknown            2802    5622    5263    3914
 DNA?               2157    3294    3018    4582
 tRNA               2154    2026    1983       0
 rRNA               1915    1840    1810     464
 RC                 1860    1784    1706    2059
 srpRNA             1784    1672    1633    1517
 scRNA              1397    1420    1426    6783
 RNA                 822     704     611    1484
 SINE?               488      38      38     970
 RC?                 445     411     374     806
 
 total           5567850 5520017 5459735 5744165
 
 #############################################################################
 ## blat server turned on (DONE - 2014-01-13 - Hiram)
 #	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev
 
     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("hg38", "blat4c", "17780", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("hg38", "blat4c", "17781", "0", "1");' \
 	    hgcentraltest
     #	test it with some sequence
 
 ############################################################################
 ## reset default position to ABO gene (DONE - 2014-01-13 - Hiram)
     ssh hgwdev
     hgsql -e 'update dbDb set defaultPos="chr9:133252000-133280861"
 	where name="hg38";' hgcentraltest
 
 #########################################################################
 ## update grp table with new set of standard rows (DONE - 2014-01-29 - Hiram)
     hgsql -e 'alter table grp rename grpOriginal;' hg38
     hgsql -e 'drop table grp;' hg38
     hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg19.grp" hg38
     hgsql -e 'delete from grp where name="denisova";' hg38
     hgsql -e 'delete from grp where name="pub";' hg38
     hgsql -e 'delete from grp where name="neandertal";' hg38
     hgsql -e 'update grp set defaultIsClosed=0 where name="map";' hg38
 
     hgsql -e 'drop table grpOriginal;' hg38
 
 ############################################################################
 # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2014-01-21 - Hiram)
     ssh ku
     mkdir /hive/data/genomes/hg38/bed/linSpecRep
     cd /hive/data/genomes/hg38/bed/linSpecRep
     #	create individual .out files from the master record in ../repeatMasker
     mkdir splitOut
     cat << '_EOF_' > split.csh
#!/bin/csh -fe
# Extract one chromosome's records from the genome-wide RepeatMasker .out
# file; the first three lines of the .out file are its header.
# usage: split.csh <chrom>
set C = $1
head -3 ../repeatMasker/hg38.sorted.fa.out > splitOut/${C}.out
# the trailing space after ${C} keeps chr1 from also matching chr10, etc.
# (note this matches the name anywhere on the line, not just the chrom column)
grep "${C} " ../repeatMasker/hg38.sorted.fa.out >> splitOut/${C}.out
 '_EOF_'
     # << happy emacs
     chmod +x split.csh
 
     cat << '_EOF_' > template
 #LOOP
 split.csh $(root1) {check out line+ splitOut/$(root1).out}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     # small ones first:
     cut -f1 ../../chrom.sizes | tac > chrom.list
     gensub2 chrom.list single template jobList
     para create jobList
     para try ... check ... push ... etc...
 # Completed: 93 of 93 jobs
 # CPU time in finished jobs:        127s       2.12m     0.04h    0.00d  0.000 y
 # IO & Wait Time:                 17154s     285.90m     4.76h    0.20d  0.001 y
 # Average job time:                 186s       3.10m     0.05h    0.00d
 # Longest finished job:             224s       3.73m     0.06h    0.00d
 # Submission to last job:           280s       4.67m     0.08h    0.00d
 
     #	now, we can date and process each of those .out files
     #	constructing the humanSpecific set of repeats
     #   this means repeats found in human, and not in others
     #   using mouse here for 'others' is good enough, a variety
     #   of other species could be used (rat dog cow) where they all
     #   produce the same result
     mkdir dateRepeats
     cd dateRepeats
     cat << '_EOF_' > mkLSR
 #!/bin/bash
 set -beEu -o pipefail
 rm -f $1.out_mus-musculus
 ln -s ../splitOut/$1.out .
 /scratch/data/RepeatMasker/DateRepeats $1.out -query human -comp mouse
 rm $1.out
 mkdir -p ../humanSpecific
 /cluster/bin/scripts/extractRepeats 1 $1.out_mus-musculus \
 	> ../humanSpecific/$1.out.spec
 '_EOF_'
     #	<< happy emacs
     chmod +x mkLSR
 
     cat << '_EOF_' > template
 #LOOP
 ./mkLSR $(path1) {check out line+ ../humanSpecific/$(path1).out.spec}
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     gensub2 ../chrom.list single template jobList
     para try ... check ... push ... etc...
     para time
 # Completed: 455 of 455 jobs
 # CPU time in finished jobs:      13985s     233.08m     3.88h    0.16d  0.000 y
 # IO & Wait Time:                  1470s      24.50m     0.41h    0.02d  0.000 y
 # Average job time:                  34s       0.57m     0.01h    0.00d
 # Longest finished job:             111s       1.85m     0.03h    0.00d
 # Submission to last job:          1427s      23.78m     0.40h    0.02d
 
 
     # We also need the nibs for blastz runs with lineage specific repeats
     mkdir /hive/data/genomes/hg38/bed/nibs
     cd /hive/data/genomes/hg38/bed/nibs
     cut -f1 ../../chrom.sizes | while read C
 do
     twoBitToFa -seq=${C} ../../hg38.2bit stdout \
 	| faToNib -softMask stdin ${C}.nib
     echo "${C} done"
 done
 
     # verify nothing lost
     cat ../../chrom.sizes \
      | awk '{printf "nibFrag -masked %s.nib 0 %d + stdout\n", $1, $2}' \
         | sh | faSize stdin
 # 3209286105 bases (159970322 N's 3049315783 real 1460684798 upper
 #  1588630985 lower) in 455 sequences in 1 files
 # Total size: mean 7053376.1 sd 31548372.6
 #  min 970 (chrUn_KI270394v1.nib:0-970)
 #  max 248956422 (chr1.nib:0-248956422) median 161218
 # %49.50 masked total, %52.10 masked real
 
     mkdir /hive/data/staging/data/hg38/nib
     rsync -a --progress ./ /hive/data/staging/data/hg38/nib
 
 #############################################################################
 ## GRC Contigs/ctgPos2 track (DONE - 2014-12-25 - Hiram)
     # provide mapping of UCSC chrom names to GRC names
     mkdir /hive/data/genomes/hg38/bed/ctgPos2
     cd /hive/data/genomes/hg38/bed/ctgPos2
     grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \
 	| awk '{printf "s/^%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt
 
     find ../../genbank -type f | grep "/assembled_chromosomes/AGP/" | sed -e 's/.comp//' | while read F
 do
    if [ -s $F ]; then
       zcat $F | grep -v "^#"
    fi
 done | sed -e "`cat accessionToUcsc.sed.txt`" > ucsc.grch38.agp
 
     awk '$5 != "N"' ucsc.grch38.agp \
 | awk '{printf "%s\t%d\t%s\t%d\t%d\t%s\n", $6, $3-$2+1, $1, $2-1, $3, $5}' \
 	| sort -u | sort -k3,3 -k4,4n > ctgPos2.tab
 
 
     export ctgSize=`awk '{print length($1)}' ctgPos2.tab | sort -n | tail -1`
     export chrSize=`awk '{print length($3)}' ctgPos2.tab | sort -n | tail -1`
 
     sed -e "s/20/$ctgSize/; s/16/$chrSize/;" \
 	/cluster/home/hiram/kent/src/hg/lib/ctgPos2.sql > hg38.ctgPos2.sql
 
     hgLoadSqlTab hg38 ctgPos2 hg38.ctgPos2.sql ctgPos2.tab
 
 ############################################################################
 # constructing download files (WORKING - 2014-01-15 - Hiram)
     # add hg38 to all.joiner and verify it is clean:
     joinerCheck -database=hg38 -keys all.joiner
 # Checking keys on database hg38
 #  hg38.ucscToINSDC.chrom - hits 455 of 455 (100.000%) ok
     # and all table coordinates are OK:
     checkTableCoords hg38
 
     cd /hive/data/genomes/hg38
     time $HOME/kent/src/hg/utils/automation/makeDownloads.pl \
       -workhorse=hgwdev hg38
     # makeDownloads.pl has made a preliminary set of files
 
     # need to fixup these names and add chromFa.tar.gz files
     cd /hive/data/genomes/hg38/goldenPath/bigZips
 
     mkdir chroms
     mkdir maskedChroms
 
     faSplit byname hg38.fa.gz chroms/
     faSplit byname hg38.fa.masked.gz maskedChroms/
 
     tar cvzf ./hg38.chromFa.tar.gz ./chroms/
     tar cvzf ./hg38.chromFaMasked.tar.gz ./maskedChroms/
 
     cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips
     ln -s /hive/data/genomes/hg38/goldenPath/bigZips/hg38.chromFa.tar.gz hg38.chromFa.tar.gz
     ln -s /hive/data/genomes/hg38/goldenPath/bigZips/hg38.chromFaMasked.tar.gz hg38.chromFaMasked.tar.gz
 
     #also added entries for above to md5sum.txt and README.txt
 
 ############################################################################
 # LASTZ MOUSE Mm10 (DONE - 2014-01-23,31 - Hiram)
     # can no longer use the lineage specific repeats with the new lastz
     # use a screen to manage this longish job:
     screen -S hg38Mm10
 
     mkdir /hive/data/genomes/hg38/bed/lastzMm10.2014-01-23
     cd /hive/data/genomes/hg38/bed/lastzMm10.2014-01-23
 
     # best to always specify an exact path to lastz so we know which one is used
     # lastz default parameters are human-mouse parameters
 
     cat << '_EOF_' > DEF
 # human vs mouse
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
 
 # TARGET: Human Hg38
 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
 SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
 SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
 SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
 SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
 SEQ1_CHUNK=40000000
 SEQ1_LAP=10000
 
 # QUERY: Mouse Mm10
 SEQ2_DIR=/scratch/data/mm10/mm10.2bit
 SEQ2_LEN=/scratch/data/mm10/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg38/bed/lastzMm10.2014-01-23
 TMPDIR=/dev/shm
 '_EOF_'
     # << happy emacs
 
     time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 \
         -stop=net `pwd`/DEF \
         -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-fileServer=hgwdev \
         -chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1
     #	real    1494m26.135s ---- busy cluster
     time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 \
         -continue=load `pwd`/DEF \
         -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-fileServer=hgwdev \
         -chainMinScore=3000 -chainLinearGap=medium > load.log 2>&1
     #	Elapsed time: 43m11s
     cat fb.hg38.chainMm10Link.txt
     # 964465044 bases of 3049335806 (31.629%) in intersection
 
     #	and the swap
     mkdir /hive/data/genomes/mm10/bed/blastz.hg38.swap
     cd /hive/data/genomes/mm10/bed/blastz.hg38.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg38/bed/lastzMm10.2014-01-23/DEF \
 	-swap -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
     #   real    83m28.397s
 
     cat fb.mm10.chainHg38Link.txt
     #	937030766 bases of 2652783500 (35.323%) in intersection
 
 #########################################################################
 # LASTZ Dog CanFam3 (DONE - 2014-01-26 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26
     cd /hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26
 
     cat << '_EOF_' > DEF
 # human vs dog
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
 
 # TARGET: Human Hg38
 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
 SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
 SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
 SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
 SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY: Dog CanFam3
 SEQ2_DIR=/hive/data/genomes/canFam3/canFam3.2bit
 SEQ2_LEN=/hive/data/genomes/canFam3/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26
 TMPDIR=/dev/shm
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
    screen -S hg38CanFam3
     time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
     # Elapsed time: 1396m22s - busy cluster
     cat fb.hg38.chainCanFam3Link.txt
     #  1523987456 bases of 3049335806 (49.978%) in intersection
 
     #	running the swap
     mkdir /hive/data/genomes/canFam3/bed/blastz.hg38.swap
     cd /hive/data/genomes/canFam3/bed/blastz.hg38.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg38/bed/lastzCanFam3.2014-01-26/DEF \
 	-syntenicNet -swap \
 	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
     #	real    107m57.787s
 
     cat fb.canFam3.chainHg38Link.txt
     #	1437624815 bases of 2392715236 (60.083%) in intersection
 
 #########################################################################
 # LASTZ Macaca Mulatta RheMac3 (DONE - 2014-01-27,02-10 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27
     cd /hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27
 
     # best to always specify an exact path to lastz so we know which one is used
     # lastz default parameters are human-mouse parameters
 
     cat << '_EOF_' > DEF
 # human vs macaca mulatta
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
 # maximum M allowed with lastz is only 254
 BLASTZ_M=254
 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
 BLASTZ_O=600
 BLASTZ_E=150
 # other parameters from panTro2 vs hg18 lastz on advice from Webb
 BLASTZ_K=4500
 BLASTZ_Y=15000
 BLASTZ_T=2
 
 # TARGET: Human Hg38
 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
 SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
 SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
 SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
 SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY: Macaca Mulatta RheMac3
 SEQ2_DIR=/scratch/data/rheMac3/rheMac3.2bit
 SEQ2_LEN=/scratch/data/rheMac3/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_IN_CONTIGS=0
 
 BASE=/hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27
 TMPDIR=/dev/shm
 '_EOF_'
     # << happy emacs
     time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
         `pwd`/DEF \
         -syntenicNet -fileServer=hgwdev \
 	-chainMinScore=5000 -chainLinearGap=medium \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
     #   Elapsed time: 1426m43s - busy cluster
     cat fb.hg38.chainRheMac3Link.txt
     #   2431208700 bases of 3049335806 (79.729%) in intersection
 
     #   running the swap
     mkdir /hive/data/genomes/rheMac3/bed/blastz.hg38.swap
     cd /hive/data/genomes/rheMac3/bed/blastz.hg38.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         /hive/data/genomes/hg38/bed/lastzRheMac3.2014-01-27/DEF \
         -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1
     #    82m32.329s
     cat fb.rheMac3.chainHg38Link.txt
     #   2288533769 bases of 2639145830 (86.715%) in intersection
 
 #########################################################################
 ## construct analysis set (DONE - 2014-01-27 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/analysisSet
     cd /hive/data/genomes/hg38/bed/analysisSet
     mkdir -p splitFa
 
     faToTwoBit \
 ../../genbank/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz \
 	hg38.unmasked.analysisSet.2bit
 
     faCount splitFa/c*.fa > splitFa.faCount.txt
 
     egrep -v "chr[0-9_BGHIJKLXv]*_alt" ../rmskCM/hg38.sorted.fa.out \
 	> hg38.analysisSet.out
 
     twoBitMask hg38.unmasked.analysisSet.2bit hg38.analysisSet.out \
 	hg38.rmsk.analysisSet.2bit
 
     egrep -v "chr[0-9_BGHIJKLXv]*_alt" ../simpleRepeat/trfMask.bed \
 	> trfMask.analysisSet.bed
 
     twoBitMask hg38.rmsk.analysisSet.2bit -add trfMask.analysisSet.bed \
 	hg38.analysisSet.2bit
 
     twoBitToFa hg38.unmasked.analysisSet.2bit stdout | faSize stdin
 # 3099922541 bases (165046090 N's 2934876451 real 2934876451 upper 0 lower)
 #	in 195 sequences in 1 files
 # Total size: mean 15897038.7 sd 46804464.6 min 970 (chrUn_KI270394v1)
 #	max 248956422 (chr1) median 32032
 # %0.00 masked total, %0.00 masked real
 
     twoBitToFa hg38.analysisSet.2bit stdout | faSize stdin
 # 3099922541 bases (165046090 N's 2934876451 real 1409378896 upper 1525497555
 #	lower) in 195 sequences in 1 files
 # Total size: mean 15897038.7 sd 46804464.6 min 970 (chrUn_KI270394v1)
 #	max 248956422 (chr1) median 32032
 # %49.21 masked total, %51.98 masked real
 
     mkdir hg38.analysisSet.chroms
     twoBitToFa hg38.analysisSet.2bit stdout \
 	| faSplit byname stdin hg38.analysisSet.chroms/
 
     tar cvzf ./hg38.analysisSet.chroms.tar.gz ./hg38.analysisSet.chroms
 
     ln -s `pwd`/hg38.analysisSet.2bit \
         /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips
     ln -s `pwd`/hg38.analysisSet.chroms.tar.gz \
         /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips
     # add these md5 sums to md5sum.txt
     md5sum hg38.analysisSet.2bit hg38.analysisSet.chroms.tar.gz >> \
         /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips/md5sum.txt
 
     cp ../../genbank/README_ANALYSIS_SETS README.analysisSet.txt
     # add note at the top of README:
     ######################################################################
     UCSC copy of the file from:
 
     ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/README_ANALYSIS_SETS
 
     ln -s `pwd`/README.analysisSet.txt \
         /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/bigZips
 
 #########################################################################
# the FULL analysis set (DONE - 2014-03-18 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/fullAnalysisSet
     cd /hive/data/genomes/hg38/bed/fullAnalysisSet
 
     mkdir hg38.fullAnalysisSet.chroms
     twoBitToFa ../analysisSet/hg38.analysisSet.2bit stdout \
        | faSplit byname stdin hg38.fullAnalysisSet.chroms/
 
     grep _alt ../../chrom.sizes | cut -f 1 > alt.list
 
     twoBitToFa -seqList=alt.list ../../hg38.2bit stdout \
        | faSplit byname stdin hg38.fullAnalysisSet.chroms/
 
     faCount hg38.fullAnalysisSet.chroms/chr*.fa > faCount.fullAnalysisSet.txt
 
     faToTwoBit hg38.fullAnalysisSet.chroms/chr*.fa hg38.fullAnalysisSet.2bit
     twoBitInfo hg38.fullAnalysisSet.2bit stdout | sort -k2nr > chrom.sizes
 
     tar cvzf ./hg38.fullAnalysisSet.chroms.tar.gz ./hg38.fullAnalysisSet.chroms
 
 #########################################################################
 # LASTZ Self/hg38 (DONE - 2014-01-25,02-10 - Hiram)
     # can no longer use the lineage specific repeats with the new lastz
     # use a screen to manage this longish job:
     screen -S hg38Self
 
     mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25
     cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25
     # construct the non-bridged contigs sequence to use:
     (twoBitToFa ../nonBridgedContigs/hg38.chroms.contigs.2bit stdout;
       twoBitToFa ../../hg38.2bit:chrM stdout) | faToTwoBit stdin hg38.self.2bit
     twoBitInfo hg38.self.2bit stdout | sort -k2nr > hg38.self.chrom.sizes
 
     # best to always specify an exact path to lastz so we know which one is used
     # lastz default parameters are human-mouse parameters
 
     cat << '_EOF_' > DEF
 # human vs human with mouse defaults
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
 
 # TARGET: Human Hg38
 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
 SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
 SEQ1_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit
 SEQ1_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes
 SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY: Human Hg38
 SEQ2_DIR=/hive/data/genomes/hg38/hg38.2bit
 SEQ2_LEN=/hive/data/genomes/hg38/chrom.sizes
 SEQ2_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit
 SEQ2_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes
 SEQ2_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25
 TMPDIR=/dev/shm
'_EOF_'
    # << happy emacs
 
     time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 \
         -stop=net `pwd`/DEF \
         -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-fileServer=hgwdev \
         -chainMinScore=3000 -chainLinearGap=medium > net.log 2>&1
     #  real    1518m15.817s -- problems
     # there was a problem in the 'part014' batch.  running that manually:
     mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob
     cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob
     # make 100 jobs out of the 10 parts:
     mkdir -p psl
     cp ../tParts/part014.lst ./xpart014.lst
     split -l 1 xpart014.lst -d -a 3 part
     for F in part0*
 do
    mv $F $F.lst
 done
 
 for T in part0*.lst
 do
   for Q in part0*.lst
   do
     mkdir -p psl/${T}
     echo /cluster/home/hiram/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf ${T} ${Q} ../../DEF \{check out exists psl/${T}/${T}.${Q}.psl\}
   done
 done > jobList
     para -ram=32g create jobList
     para push
     # one last failing job:
 # Completed: 99 of 100 jobs
 # CPU time in finished jobs:       2836s      47.27m     0.79h    0.03d  0.000 y
 # IO & Wait Time:                   279s       4.65m     0.08h    0.00d  0.000 y
 # Average job time:                  31s       0.52m     0.01h    0.00d
 # Longest finished job:             586s       9.77m     0.16h    0.01d
 # Submission to last job:           620s      10.33m     0.17h    0.01d
 
     mkdir /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010
     cd /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010
     mkdir psl
 
     twoBitToFa /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit:chr16_03:0-1689648 part010.fa
 
     faSplit -lift=split010.lift size part010.fa 169000 split010_
 TOP="/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/run.blastz/lastJob/split010"
 
 for T in split*.fa
 do
   mkdir -p psl/${T}
   echo "${TOP}/${T}" > ${T}.lst
   faToTwoBit  ${T} ${T}.2bit
   for Q in split*.fa
   do
      echo "/cluster/home/hiram/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf ${T}.lst ${Q}.lst DEF {check out exists psl/${T}/${T}.${Q}.psl}"
   done
 done > jobList
      para -ram=32g create jobList
 
 # Completed: 100 of 100 jobs
 # CPU time in finished jobs:     176579s    2942.99m    49.05h    2.04d  0.006 y
 # IO & Wait Time:                  1239s      20.64m     0.34h    0.01d  0.000 y
 # Average job time:                1778s      29.64m     0.49h    0.02d
 # Longest finished job:           29343s     489.05m     8.15h    0.34d
 # Submission to last job:         29348s     489.13m     8.15h    0.34d
 
     catDir psl/* | grep -v "^#" > raw.psl
 
     liftUp -type=.psl stdout split010.lift error raw.psl \
         | liftUp -pslQ -type=.psl chr16_03.psl split010.lift error stdin
 
     # this combination allowed psl headers to sneak in the middle,
     # had to be cleaned:
     catDir psl/* | grep -v "^#" > part014.psl
     cat split010/chr16_03.psl >> part014.psl
     cp -p part014.psl ../../psl/part014.lst/part014.lst_part014.lst.psl
 
     time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 \
         -continue=cat -stop=net `pwd`/DEF \
         -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-fileServer=hgwdev \
         -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1
     # real    43m11.340s
     # failed in chaining, running manually on hgwdev
     time ./bigJobs.sh > bigJobs.log 2>&1
     #  real    468m59.648s
 
     time ./part014.sh > part014.log 2>&1
 
     # real    1319m57.911s
     # -rw-rw-r-- 1 3581498246 Feb  8 14:37 part014.lst.chain
     time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 \
         -continue=chainMerge -stop=net `pwd`/DEF \
         -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-fileServer=hgwdev \
         -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1
 
     time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 \
         -continue=load -stop=load `pwd`/DEF \
         -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-fileServer=hgwdev \
         -chainMinScore=3000 -chainLinearGap=medium > load.log 2>&1
 
     hgLoadChain -normScore -tIndex hg38 chainSelf hg38.hg38.all.chain.gz
     #  Loading 104815249 chains into hg38.chainSelf
 
     cat fb.hg38.chainSelfLink.txt
     #   392419010 bases of 3049335806 (12.869%) in intersection
     cd /hive/data/genomes/hg38/bed
     ln -s lastzSelf.2014-01-25 lastz.self
     ln -s lastzSelf.2014-01-25 lastz.hg38
 
 #########################################################################
 ## 4-Way Multiz for UCSC Genes construction (DONE - 2014-02-11 - Hiram)
     ssh hgwdev
     mkdir /hive/data/genomes/hg38/bed/multiz4way
     cd /hive/data/genomes/hg38/bed/multiz4way
 
     #	extract our 4 organisms from the 44-way on hg18:
     ln -s /hive/data/genomes/hg18/bed/multiz44way/44way.4d.nh ./44way.nh
 
     /cluster/bin/phast/tree_doctor \
 	--prune-all-but hg19,mm10,canFam3,rheMac3 $HOME/kent/src/hg/utils/phyloTrees/120way.nh \
 	| sed -e "s/hg19/hg38/" > 4way.nh
 
     #	this looks like:
     cat 4way.nh
 (((hg38:0.033974,rheMac3:0.037601):0.109934,mm10:0.356483):0.020593,canFam3:0.165928);
 
 
     #	Use this specification in the phyloGif tool:
     #	http://genome.ucsc.edu/cgi-bin/phyloGif
     #	to obtain a gif image for htdocs/images/phylo/hg38_4way.gif
 
     /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt
     #	Use this output to create the table below
    grep -i hg38 4way.distances.txt | sort -k3,3n
 #
 #	If you can fill in all the numbers in this table, you are ready for
 #	the multiple alignment procedure
 #
 #                         featureBits chainLink measures
 #                                        chainHg38Link   chain    linearGap
 #    distance                      on hg38    on other   minScore
 #  1  0.071575 - rhesus rheMac3 (% 79.729) (% 86.715)       5000     medium
 #  2  0.330429 - dog canFam3    (% 49.978) (% 60.083)       3000     medium
 #  3  0.500391 - mouse mm10     (% 31.629) (% 35.323)       3000     medium
 
     #	using the syntenic nets
     cd /cluster/data/hg38/bed/multiz4way
     mkdir mafLinks
     cd mafLinks
     mkdir rheMac3 canFam3 mm10
 
     for D in mm10 canFam3 rheMac3
 do
     ln -s ../../../lastz.${D}/axtChain/hg38.${D}.synNet.maf.gz ./${D}/
 done
 
     mkdir /hive/data/genomes/hg38/bed/multiz4way/mafSplit
     cd /hive/data/genomes/hg38/bed/multiz4way/mafSplit
     for D in mm10 canFam3 rheMac3
 do
     echo "working: ${D}"
     zcat ../mafLinks/${D}/hg38.${D}.synNet.maf.gz > ${D}.maf
     mkdir -p ${D}
     mafSplit -byTarget -useFullSequenceName /dev/null ${D}/${D}_  ${D}.maf
     rm -f ${D}.maf
 done
 
     #	determine what is the newest version of multiz and use that
     cd /hive/data/genomes/hg38/bed/multiz4way
     mkdir penn
     cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn
     cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn
     cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn
 
     # the autoMultiz cluster run
     ssh ku
     cd /hive/data/genomes/hg38/bed/multiz4way
 
     # create species list and stripped down tree for autoMZ
     sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
 	4way.nh > tmp.nh
     echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh
     sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
 
     mkdir run maf
     cd run
 
     #	NOTE: you need to set the db and multiz dirname properly in this script
     cat > autoMultiz << '_EOF_'
 #!/bin/csh -ef
 set db = hg38
 set c = $1
 set maf = $2
 set binDir = /hive/data/genomes/hg38/bed/multiz4way/penn
 set tmp = /dev/shm/$db/multiz.$c
 set pairs = /hive/data/genomes/hg38/bed/multiz4way/mafSplit
 rm -fr $tmp
 mkdir -p $tmp
 cp ../{tree.nh,species.lst} $tmp
 pushd $tmp
 foreach s (`cat species.lst`)
     set in = $pairs/$s/${s}_$c.maf
     set out = $db.$s.sing.maf
     if ($s == $db) then
 	continue
     endif
     if (-e $in.gz) then
 	zcat $in.gz > $out
     else if (-e $in) then
 	cp $in $out
     else
 	echo "##maf version=1 scoring=autoMZ" > $out
     endif
 end
 set path = ($binDir $path); rehash
 $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
 popd
 cp $tmp/$c.maf $maf
 rm -fr $tmp
 '_EOF_'
     # << happy emacs
     chmod +x autoMultiz
 
 cat  << '_EOF_' > template
 #LOOP
 ./autoMultiz $(root1) {check out line+ /hive/data/genomes/hg38/bed/multiz4way/maf/$(root1).maf}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     cut -f1 /cluster/data/hg38/chrom.sizes > chrom.lst
     gensub2 chrom.lst single template jobList
     para create jobList
     # 455 jobs
     para try ... check ... push ... etc ...
 # Completed: 455 of 455 jobs
 # CPU time in finished jobs:      50111s     835.18m    13.92h    0.58d  0.002 y
 # IO & Wait Time:                  5574s      92.91m     1.55h    0.06d  0.000 y
 # Average job time:                 122s       2.04m     0.03h    0.00d
 # Longest finished job:            4717s      78.62m     1.31h    0.05d
 # Submission to last job:          4722s      78.70m     1.31h    0.05d
 
     #	combine results into a single file for loading and gbdb reference
     cd /hive/data/genomes/hg38/bed/multiz4way
     grep "^#" maf/chr19_GL949749v2_alt.maf | grep -v "eof maf" > multiz4way.maf
     grep -h -v "^#" maf/*.maf >> multiz4way.maf
     grep "^#" maf/chr19_GL949749v2_alt.maf | grep "eof maf" >> multiz4way.maf
     #	real    3m27.561s
 
     #	makes a 8.5 Gb file:
     #   -rw-rw-r-- 1 9044143788 Feb 11 12:51 multiz4way.maf
 
     # Load into database
     ssh hgwdev
     cd /hive/data/genomes/hg38/bed/multiz4way
     mkdir /gbdb/hg38/multiz4way
     ln -s /hive/data/genomes/hg38/bed/multiz4way/multiz4way.maf \
 	/gbdb/hg38/multiz4way
     #	the hgLoadMaf generates huge tmp files, locate them in /dev/shm
     cd /dev/shm
     time nice -n +19 hgLoadMaf hg38 multiz4way
     #   Loaded 6141667 mafs in 1 files from /gbdb/hg38/multiz4way
     #   real    2m2.812s
 
     cd /hive/data/genomes/hg38/bed/multiz4way
     time (cat /gbdb/hg38/multiz4way/*.maf \
         | hgLoadMafSummary -verbose=2 -minSize=10000 \
 	-mergeGap=500 -maxSize=50000 hg38 multiz4waySummary stdin)
     # Created 1266559 summary blocks from 11780291 components and 6141667 mafs
     # real    3m0.791s
 # -rw-rw-r-- 1  311246327 Feb 11 12:54 multiz4way.tab
 # -rw-rw-r-- 1   58730176 Feb 11 12:58 multiz4waySummary.tab
     wc -l multiz4way*
     # 6141667 multiz4way.tab
     # 1266559 multiz4waySummary.tab
     # 7408226 total
 
 #########################################################################
 ## RE-load alternate sequence for PSL display (DONE - 2016-01-15 - Hiram)
 ## The procedure below
 ##    "load alternate sequence for PSL display (DONE - #2014-02-24 - Hiram)
 ## produced an illegal psl Table altSeqLiftOverPsl:
     pslCheck -db=hg38 altSeqLiftOverPsl
     checked: 266 failed: 264 errors: 1046
 
 ## Since then, the gff3ToPsl command has been updated to be a bit more
 ##  robust, so, the following sequence produces the new alignment file:
     mkdir -p /hive/data/genomes/hg38/bed/altAlignments/redo2016
     cd /hive/data/genomes/hg38/bed/altAlignments/redo2016
 
 mkdir -p ucscPsl
 
 awk -F'/' '{printf "s/^%s\t/%s\t/g;\n", $3,$2}' ../accessionToUcsc.sed.txt \
     > ucscToNcbi.sed.txt
 
 sed -f ucscToNcbi.sed.txt ../../../chrom.sizes > ncbi.chrom.sizes
 
 paste ncbi.chrom.sizes ../../../chrom.sizes \
   | awk -F'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1,$2,$3,$4}' \
     > ncbiToUcsc.lift
 
 find ../../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \
   | while read gff
 do
   name=`basename $gff | sed -e 's/_.*//;'`
   fasta=`dirname $gff | sed -e 's#alignments#FASTA/alt.scaf.fa.gz#;'`
   size=`faCount $fasta | grep -w total | cut -f2`
   printf "%s\t%d\n" "$name" "$size" > target.sizes
   gff3ToPsl ncbi.chrom.sizes target.sizes $gff $name.psl
   pslCheck ${name}.psl
   liftUp -type=.psl stdout ncbiToUcsc.lift error ${name}.psl \
     | liftUp -type=.psl -pslQ ucscPsl/${name}.psl ncbiToUcsc.lift error stdin
   pslCheck ucscPsl/${name}.psl
 done
 
   pslSort dirs altSeqLiftOverPsl.psl ./tmp ucscPsl
   pslCheck -db=hg38 altSeqLiftOverPsl.psl
 
   hgLoadPsl hg38 altSeqLiftOverPsl.psl
   pslCheck -db=hg38 altSeqLiftOverPsl
   #  checked: 266 failed: 0 errors: 0
 
 #########################################################################
 ## load alternate sequence for PSL display (DONE - 2014-02-24 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/altAlignments/sequence
     cd /hive/data/genomes/hg38/bed/altAlignments/sequence
 
     rm -fr hg38.haplotypes.lift temp.lift targetFa queryFa
     mkdir targetFa
     mkdir queryFa
     touch temp.lift
 
     cat ../../altLocations/chrToAlt.bed | while read L
 do
   chrName=`echo $L | awk '{print $1}'`
   chromSize=`egrep "^$chrName   " ../../../chrom.sizes | cut -f2`
   chrStart=`echo $L | awk '{printf "%d", $2}'`
   chrEnd=`echo $L | awk  '{printf "%d", $3}'`
  chrSize=`echo $chrEnd $chrStart | awk '{print $1-$2}'`
   queryName=`echo $L | awk '{print $4}'`
   partName="${chrName}_${chrStart}_${chrEnd}"
   echo $chrName $chrStart $chrEnd $queryName $partName $chromSize
   echo -e "$chrStart\t${partName}\t$chrSize\t$chrName\t$chromSize" >> temp.lift
   twoBitToFa ../../../hg38.2bit:$chrName:$chrStart-$chrEnd stdout | sed -e "s/^>.*/>$partName/;" > targetFa/$queryName.fa
   twoBitToFa ../../../hg38.2bit:$queryName queryFa/$queryName.fa
 done
 
 sort -u temp.lift | sort -k4,4 -k1,1n > hg38.haplotypes.lift
 
     mkdir /gbdb/hg38/ncbiAltMappings
     cd /hive/data/genomes/hg38/bed/altAlignments/sequence/queryFa
     ln -s `pwd`/*.fa /gbdb/hg38/ncbiAltMappings
     cd /hive/data/genomes/hg38/bed/altAlignments/sequence
     hgLoadSeq -drop -seqTbl=seqNcbiAltSequence -extFileTbl=extNcbiAltSequence \
         hg38 /gbdb/hg38/ncbiAltMappings/*.fa
 
     pslSwap ../altAlignments.psl stdout \
       | pslRecalcMatch stdin ../../../hg38.2bit ../../../hg38.2bit \
         hg38.referenceTarget.psl
 
     # the table name altSeqLiftOverPsl is recognized in hgc to allow display
     # of the details of the alignments
     hgLoadPsl hg38 -table=altSeqLiftOverPsl hg38.referenceTarget.psl
 
 #########################################################################
 ## alternate sequence alignments EXPERIMENT (DONE - 2014-01-17 - Hiram)
     # the lastzAltSequences.2014-01-23 alignment was used for this instead
     # of this procedure
     mkdir /hive/data/genomes/hg38/bed/altAlignments
     cd /hive/data/genomes/hg38/bed/altAlignments
 
     grep -v "^#" ../../genbank/GCA_000001405.15_GRCh38_top-level.acc2name \
 	| awk '{printf "s/%s/%s/g;\n", $1, $3}' > accessionToUcsc.sed.txt
 
     find ../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \
 	| while read F
 do
    cat $F | sed -f accessionToUcsc.sed.txt \
 	| gff3ToPsl ../../chrom.sizes stdin stdout
done > altAlignments.psl
#   NOTE(review): the two continuation lines below were leftovers from an
#   earlier form of this pipeline and are not valid after the loop above:
#	| xargs cat | sed -f accessionToUcsc.sed.txt \
#	| gff3ToPsl ../../chrom.sizes stdin altAlignments.psl
 
     time pslRecalcMatch altAlignments.psl ../../hg38.2bit ../../hg38.2bit \
         altRecalcMatch.psl
     # real    0m51.122s
 
     # just to see what they look like in different formats:
     pslToChain altRecalcMatch.psl altAlignments.chain
     chainToAxt altAlignments.chain ../../hg38.2bit ../../hg38.2bit \
 	altAlignments.axt
     axtToMaf -score altAlignments.axt ../../chrom.sizes ../../chrom.sizes \
         altAlignments.maf
 
     mkdir mafSplits
     mafSplit /dev/null mafSplits/ altAlignments.maf
     # doesn't work:
 # Can't find chrom in MAF component src: chr6_GL000250v2_alt
 
     mkdir splits psl
     find ../../genbank -type f | grep alt_scaffolds | grep "\.gff$" \
         | while read F
 do
    chrAlt=`basename $F | sed -e 's/_.*//' | sed -f accessionToUcsc.sed.txt`
    echo $chrAlt
    cat $F | sed -f accessionToUcsc.sed.txt \
         | gff3ToPsl ../../chrom.sizes stdin splits/${chrAlt}.psl
    pslRecalcMatch splits/${chrAlt}.psl ../../hg38.2bit ../../hg38.2bit \
 	psl/${chrAlt}.psl
 done
 
    mkdir swap
    mkdir swap/psl swap/chain swap/axt swap/maf swap/anno
    for F in psl/*.psl
 do
   B=`basename $F | sed -e 's/.psl//'`
   echo $B
   pslSwap $F stdout | pslRecalcMatch stdin ../../hg38.2bit ../../hg38.2bit \
       swap/psl/${B}.psl
   pslToChain swap/psl/${B}.psl swap/chain/${B}.chain
   chainToAxt swap/chain/${B}.chain ../../hg38.2bit ../../hg38.2bit \
 	swap/axt/${B}.axt
   axtToMaf -score swap/axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \
       | sed -e 's/^s chr\([0-9XYM][0-9]* \)/s ref38.chr\1/; s/^s chr\([0-9XYM][0-9]*_\)/s alt38.chr\1/;' > swap/maf/${B}.maf
   mafAddIRows -nBeds=nBeds swap/maf/${B}.maf ../../hg38.2bit swap/anno/${B}.maf
 done
 # axtToMaf -score swap/axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \
 #      | sed -e 's/^s chr/s hg38.chr/' > swap/maf/${B}.maf
 
    twoBitInfo -nBed ../../hg38.2bit ../../hg38.N.bed
    ln -s  ../../hg38.N.bed hg38.bed
    ln -s ../../hg38.N.bed ref38.bed
    ln -s ../../hg38.N.bed alt38.bed
    echo hg38.bed > nBeds
    echo ref38.bed >> nBeds
    echo alt38.bed >> nBeds
    ln -s  ../../chrom.sizes hg38.len
    ln -s  ../../chrom.sizes ref38.len
    ln -s  ../../chrom.sizes alt38.len
    echo hg38.len > sizes
    echo ref38.len >> sizes
    echo alt38.len >> sizes
 
    mkdir chain axt maf anno
    for F in psl/*.psl
 do
    B=`basename $F | sed -e 's/.psl//'`
    echo $B
    pslToChain $F chain/${B}.chain
    chainToAxt chain/${B}.chain ../../hg38.2bit ../../hg38.2bit axt/${B}.axt
   axtToMaf -score axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \
       | sed -e 's/^s chr\([0-9XYM][0-9]* \)/s ref38.chr\1/; s/^s chr\([0-9XYM][0-9]*_\)/s alt38.chr\1/;' > maf/${B}.maf
    mafAddIRows -nBeds=nBeds maf/${B}.maf ../../hg38.2bit anno/${B}.maf
 done
 
 #   axtToMaf -score axt/${B}.axt ../../chrom.sizes ../../chrom.sizes stdout \
 #      | sed -e 's/^s chr/s hg38.chr/' > maf/${B}.maf
 
 ############################################################################
 # Liftover Gencode V19 from hg19  (DONE braney 2014-02-14)
 
 mkdir /cluster/data/hg38/bed/liftOverGencodeV19
 cd /cluster/data/hg38/bed/liftOverGencodeV19
 
 echo "show tables like 'wgEncodeGencode%19'" | hgsql hg19 | tail -n +2 > all.gencode.tables
 echo " select tableName from trackDb where tableName like 'wgEncodeGencode_%V19';" | hgsql hg19 --skip-column-names > genePred.gencode.tables
 
 # load the non-genepred table as is.   This isn't quite the right thing to do
 # with exon support, but it's good enough for our purposes at the moment
 join -v 1 *.gencode.tables | while read t; do echo "create table $t select * from hg19.$t" | hgsql hg38; echo $t; done
 
# NOTE(review): the creation of genePredExt.gencode.tables is not shown above
# (only genePred.gencode.tables at the hgsql step) -- presumably it was split
# out from all.gencode.tables by table type; confirm before reusing this recipe
for i in `cat genePredExt.gencode.tables`;
 do
     echo "select name,score,name2 from $i" | hgsql hg19 | sort > $i.name2Score.txt;
     genePredToFakePsl hg19 $i $i.psl $i.cds;
     pslMap -chainMapFile -swapMap $i.psl /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz stdout | pslCDnaFilter -uniqueMapped stdin stdout |  sort -k 14,14 -k 16,16n | pslToBed -cds=$i.cds stdin stdout | bedToGenePred stdin stdout | sort |  join /dev/stdin $i.name2Score.txt| tr ' ' '\t' | hgLoadGenePred -genePredExt hg38 $i stdin;
     echo $i;
 done
 
 for i in `cat genePred.gencode.tables`;
 do
     genePredToFakePsl hg19 $i $i.psl $i.cds;
     pslMap -chainMapFile -swapMap $i.psl /gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz stdout | pslCDnaFilter -uniqueMapped stdin stdout |  sort -k 14,14 -k 16,16n | pslToBed -cds=$i.cds stdin stdout | bedToGenePred stdin stdout |  tr ' ' '\t' | hgLoadGenePred hg38 $i stdin;
     echo $i;
 done
 
 #####################################################################
 ## tRNAs track ( 2014-02-18 braney DONE)
 ## this is a preliminary version for UCSC build.  NOT FOR RELEASE!
 ssh hgwdev
 cd /hive/data/genomes/hg38/bed
 mkdir tRNAs
 cd tRNAs
 
 cp  /hive/users/pchan/tRNAs/Eukaryota/hg38/hg38-tRNAs.bed .
 
 hgLoadBed -tab hg38 tRNAs hg38-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql
 
 ## tRNAs track (2015-10-04, Chris FINISHING BUILD FOR RELEASE)
     cd /hive/data/genomes/hg38/bed/tRNAs
    # (fixed: the command as originally recorded had a stray "|" before the
    #  redirect -- "... | > file" -- which would have produced an empty file)
    cat /hive/users/pchan/gtrnadb2/Eukaryota/hg38/hg38-tRNAs.bed | sed 's^</BLOCKQUOTE>^^g' > hg38-tRNAs2.bed
     hgsql hg38 -e 'drop table if exists tRNAs'
     hgLoadBed -tab hg38 tRNAs hg38-tRNAs2.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql
     mkdir gif
     cp -p /hive/users/pchan/gtrnadb2/Eukaryota/hg38/images/* gif
     cd /hive/data/gbdb/hg38
     ln -s /hive/data/genomes/hg38/bed/tRNAs/gif RNA-img
     cd /usr/local/apache/htdocs-ceisenhart/RNA-img
     ln -s /gbdb/hg38/RNA-img hg38
 
 ############################################################################
 # EXONIPHY , lifted from hg19 (DONE - braney 2014-02-19)
 #	needed for ucscGenes building
     # exoniphyHg19.gp is prepared as follows
     mkdir /cluster/data/hg38/bed/exoniphy
     cd /cluster/data/hg38/bed/exoniphy
     hgsql hg19 -e "select * from exoniphy" -N | cut  -f 2-16 > exoniphyHg19.gp
     time nice -n +19 liftOver -genePred exoniphyHg19.gp \
 	/cluster/data/hg19/bed/liftOver/hg19ToHg38.over.chain.gz \
 	    exoniphyHg38.gp unmapped
     # real    0m2.015s
     # user    0m1.894s
     # sys     0m0.076s
 
     wc -l *
     #   186601 exoniphyHg19.gp
     #   186533 exoniphyHg38.gp
     #      136 unmapped
     #   373270 total
 
     cd /cluster/data/hg38/bed/exoniphy
     nice -n +19 hgLoadGenePred -genePredExt hg38 exoniphy exoniphyHg38.gp
     nice -n +19 featureBits hg38 exoniphy
     # 28807039 bases of 3049335806 (0.945%) in intersection
     nice -n +19 featureBits hg19 exoniphy
     # 28661160 bases of 2897316137 (0.989%) in intersection
 
 #########################################################################
 # LASTZ Rat Rn5 (DONE - 2014-02-27 - Hiram)
     #	establish a screen to control this job
     screen -S hg38Rn5
     mkdir /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27
     cd /hive/data/genomes/hg38/bed/lastzRn5.2014-02-27
 
     # XXX don't forget to specify the BLASTZ binary:
     cat << '_EOF_' > DEF
 # human vs rat
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
 
 # TARGET: Human Hg38
 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
 SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
 SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
 SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
 SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY: Rat Rn5
 SEQ2_DIR=/hive/data/genomes/rn5/rn5.2bit
 SEQ2_LEN=/hive/data/genomes/rn5/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=100
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg38/bed/lastzRn5.2014-02-27
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
 
     #   real    658m53.984s
     cat fb.hg38.chainRn5Link.txt
     # 938823407 bases of 3049335806 (30.788%) in intersection
 
     #	running the swap
     mkdir /hive/data/genomes/rn5/bed/blastz.hg38.swap
     cd /hive/data/genomes/rn5/bed/blastz.hg38.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg38/bed/lastzRn5.2014-02-27/DEF \
 	-swap \
 	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
     #   real    66m53.095s
     cat fb.rn5.chainHg38Link.txt
     #   934256475 bases of 2572853723 (36.312%) in intersection
 
     # syntenic net for 14-way use 2014-04-02 - Hiram
     cd /hive/data/genomes/rn5/bed/blastz.hg38.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg38/bed/lastzRn5.2014-02-27/DEF \
 	-continue=syntenicNet -syntenicNet -swap \
 	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-chainMinScore=3000 -chainLinearGap=medium > synNet.log 2>&1
     #  real    16m54.489s
 
 ##############################################################################
 # LASTZ Rat Rn4 (DONE - 2014-02-27 - Hiram)
     #	establish a screen to control this job
     screen -S hg38Rn4
     mkdir /hive/data/genomes/hg38/bed/lastzRn4.2014-02-27
     cd /hive/data/genomes/hg38/bed/lastzRn4.2014-02-27
 
     # XXX don't forget to specify the BLASTZ binary:
     cat << '_EOF_' > DEF
 # human vs rat
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
 
 # TARGET: Human Hg38
 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
 SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
 SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
 SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
 SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY: Rat Rn4
 SEQ2_DIR=/hive/data/genomes/rn4/rn4.2bit
 SEQ2_LEN=/hive/data/genomes/rn4/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=100
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg38/bed/lastzRn4.2014-02-27
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
     #   real    658m53.984s
 
     cat fb.hg38.chainRn4Link.txt
     #   913992768 bases of 3049335806 (29.974%) in intersection
 
     #	running the swap
     mkdir /hive/data/genomes/rn4/bed/blastz.hg38.swap
     cd /hive/data/genomes/rn4/bed/blastz.hg38.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg38/bed/lastzRn4.2014-02-27/DEF \
 	-swap \
 	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
     #   real    73m5.666s
 
     cat fb.rn4.chainHg38Link.txt
     #	889613774 bases of 2571531505 (34.595%) in intersection
 
 ##############################################################################
 # GENEID GENE PREDICTIONS (DONE - 2014-03-07 - Hiram)
     ssh hgwdev
     mkdir /hive/data/genomes/hg38/bed/geneid
     cd /hive/data/genomes/hg38/bed/geneid
     mkdir download
     cd download
     for C in `cut -f1 ../../../chrom.sizes`
     do
 	echo $C
  wget --timestamping \
 http://genome.crg.es/genepredictions/H.sapiens/hg38/geneid_v1.4/${C}.gtf3
     wget --timestamping \
 http://genome.crg.es/genepredictions/H.sapiens/hg38/geneid_v1.4/${C}.prot
     done
 
    cd ..
    # NOTE(review): the files fetched above are named ${C}.gtf3, so this glob
    # should presumably be download/*.gtf3 -- download/*.gtf would not match
    # those files; confirm whether the downloads were renamed before loading
    cat download/*.gtf | ldHgGene -gtf -genePredExt hg38 geneid stdin
     #	Read 33428 transcripts in 277332 lines in 1 files
     #	33428 groups 92 seqs 1 sources 3 feature types
     #	33428 gene predictions
 
 ############################################################################
 # GENEREVIEWS TRACK (DONE 2014-05-17 - Chin)
 # This track depends on some tasks completed for hg19, specifically:
 #
 # $HOME/kent/src/hg/lib/geneReviewsGrshortNBKid.sql
 # $HOME/kent/src/hg/lib/geneReviewsGrshortTitleNBKid.sql
 # $HOME/kent/src/hg/lib/geneReviewsDetail.sql
 # $HOME/kent/src/hg/makeDb/trackDb/human/geneReviews.html
 #
 # Unlike hg19, this hg38 tracks is generated by the automatic geneReviews
 # scripts in
 # /hive/data/outside/otto/geneReviews, specifically buildGeneReviews.sh.
 # Current data are fetched weekly from NCBI
 # ftp://ftp.ncbi.nlm.nih.gov/pub/GeneReviews/
 # to /hive/data/outside/otto/geneReviews/${DATE}.
 
 ###########################################################################
 # Chimp Lastz run (DONE - 2014-05-27 - Hiram)
     screen -S hg38PanTro4      # use a screen to manage this longish running job
     mkdir /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27
     cd /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27
 
     # always set the BLASTZ program so we know what version was used
     cat << '_EOF_' > DEF
 # human vs chimp
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
 BLASTZ_O=600
 BLASTZ_E=150
 # maximum M allowed with lastz is only 254
 BLASTZ_M=254
 
 BLASTZ_T=2
 BLASTZ_Y=15000
 BLASTZ_K=4500
 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
 #    A    C    G    T
 #    90 -330 -236 -356
 #  -330  100 -318 -236
 #  -236 -318  100 -330
 #  -356 -236 -330   90
 
 # TARGET: Human Hg38
 SEQ1_DIR=/scratch/data/hg38/hg38.2bit
 SEQ1_LEN=/scratch/data/hg38/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_IN_CONTIGS=0
 
 # QUERY: Chimp PanTro4
 SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit
 SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 SEQ2_LIMIT=200
 SEQ2_IN_CONTIGS=0
 
 BASE=/hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27
 TMPDIR=/dev/shm
 '_EOF_'
     # << emacs
 
     time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
         -chainMinScore=5000 -chainLinearGap=medium \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -syntenicNet) > do.log 2>&1
     # real    154m12.215s
     cat fb.hg38.chainPanTro4Link.txt
     # 2839294579 bases of 3049335806 (93.112%) in intersection
 
     # filter with doRecipBest.pl
     time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
         hg38 panTro4) > rbest.log 2>&1
     # real    57m55.320s
 
     # running the swap
     mkdir /hive/data/genomes/panTro4/bed/blastz.hg38.swap
     cd /hive/data/genomes/panTro4/bed/blastz.hg38.swap
     time (doBlastzChainNet.pl -verbose=2 \
         -swap /hive/data/genomes/hg38/bed/lastzPanTro4.2014-05-27/DEF \
         -chainMinScore=5000 -chainLinearGap=medium \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -syntenicNet) > swap.log 2>&1
     cat fb.panTro4.chainHg38Link.txt
     # 2776497530 bases of 2902338967 (95.664%) in intersection
     # real    98m23.729s
 
     time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
         panTro4 hg38) > rbest.log 2>&1
     # real    64m33.812s
 
 #############################################################################
 # Opossum Lastz run (DONE - 2014-05-27 - Hiram)
     screen -S hg38MonDom5      # use a screen to manage this longish running job
     mkdir /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27
     cd /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27
 
     # always set the BLASTZ program so we know what version was used
     cat << '_EOF_' > DEF
# human vs opossum
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
 BLASTZ_M=50
 
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 #     A    C    G    T
 #    91  -90  -25 -100
 #   -90  100 -100  -25
 #   -25 -100  100  -90
 #  -100  -25  -90  91
 
 # TARGET: Human Hg38
 SEQ1_DIR=/scratch/data/hg38/hg38.2bit
 SEQ1_LEN=/scratch/data/hg38/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=5
 
 # QUERY: Opossum MonDom5
 SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit
 SEQ2_LEN=/hive/data/genomes/monDom5/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27
 TMPDIR=/dev/shm
 '_EOF_'
     # << emacs
 
     time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
         -chainMinScore=5000 -chainLinearGap=loose \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -syntenicNet) > do.log 2>&1
     # real    670m13.280s
     # one failed chain run for hg19, finished manually on hgwdev, then:
     time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
         -continue=chainMerge -chainMinScore=5000 -chainLinearGap=loose \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -syntenicNet) > chainMerge.log 2>&1
     # real    164m28.822s
 
     cat fb.hg38.chainMonDom5Link.txt
     # 438195373 bases of 3049335806 (14.370%) in intersection
 
     # filter with doRecipBest.pl
     time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \
         -dbHost=hgwdev -workhorse=hgwdev hg38 monDom5) > rbest.log 2>&1
     # real    130m22.825s
 
     # running the swap
     mkdir /hive/data/genomes/monDom5/bed/blastz.hg38.swap
     cd /hive/data/genomes/monDom5/bed/blastz.hg38.swap
     time (doBlastzChainNet.pl -verbose=2 \
         /hive/data/genomes/hg38/bed/lastzMonDom5.2014-05-27/DEF \
         -swap -chainMinScore=5000 -chainLinearGap=loose \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -syntenicNet) > swap.log 2>&1
     # real    102m41.443s
 
     cat fb.monDom5.chainHg38Link.txt
     # 420069915 bases of 3501660299 (11.996%) in intersection
     time (/cluster/bin/scripts/doRecipBest.pl -buildDir=`pwd` \
         -dbHost=hgwdev -workhorse=hgwdev monDom5 hg38) > rbest.log 2>&1
     #  real    90m56.189s
 
# _EOF_   (stray heredoc terminator with no matching "cat <<" -- commented
#          out; kept here only to preserve the historical record)
 #############################################################################
 # LOCUS REFERENCE GENOMIC (LRG) REGIONS AND TRANSCRIPTS (DONE 10/25/19 angie)
 # Redmine #13359, #24285 -- otto-mate To Do #17877
 # previously done 7/7/14, 9/9/16, 5/30/18
     set today = `date +%Y_%m_%d`
     mkdir -p /hive/data/genomes/hg38/bed/lrg/$today
     cd /hive/data/genomes/hg38/bed/lrg/$today
     wget ftp://ftp.ebi.ac.uk/pub/databases/lrgex/LRG_public_xml_files.zip
     unzip LRG_public_xml_files.zip
 
     # Run script to convert LRG*.xml files to BED+ for regions and genePredExt+fa for transcripts:
     # parseLrgXml.pl updated 2020-09-16 to add four new fields to the gp output
     # the four extra fields are identifiers for:
     # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein
 
     ~/kent/src/hg/utils/automation/parseLrgXml.pl GRCh38
     genePredCheck lrgTranscriptsUnmapped.gp
 #Error: lrgTranscriptsUnmapped.gp:765: LRG_7t1 no exonFrame on CDS exon 46
 #checked: 1029 failed: 1
     # If there are complaints e.g. about exonFrame, look for inconsistencies in the
     # affected transcript's coding_region/coordinates vs. exon/intron info in xml.
     # Contact Variation team leader Fiona Cunningham @EBI to resolve in the background
     # (missing exonFrame info doesn't affect our track representation because we end up using
     # psl).  We agreed to disagree about exon 46 of LRG_7t1 because that last coding exon
     # portion is only the stop codon.
 
     # No longer necessary to filter out alt and fix patches since they have been added to hg38.
 
     # and we need the transcript plus gene name later:
     cut -f1,12 lrgTranscriptsUnmapped.gp | sort > transcript.gene.name.txt
 
     # five extra columns have been added to the genePred (2020-10-05 - Hiram)
     # extract them so they can be added to the psl:
     awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s %s %s %s\n", $1,$16,$17,$18,$19, $16,$18,$17,$19}' lrgTranscriptsUnmapped.gp | sort \
        | join -t$'\t' - transcript.gene.name.txt \
          | awk -F$'\t' '{printf "%s\t%s\t%s\t%s\t%s\t%s\t%s %s\n", $1,$2,$3,$4,$5,$7,$6,$7}' > lrgTransExtraFields.tsv
 
     # the five extra fields are identifiers for:
     # NCBI transcript, Ensembl transcript, NCBI protein, Ensembl protein,
     #	Gene name
 
     # Load LRG regions:
     bedToBigBed lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.bb \
       -tab -type=bed12+ -as=$HOME/kent/src/hg/lib/lrg.as -extraIndex=name
     ln -sf `pwd`/lrg.bb /gbdb/hg38/bbi/lrg.bb
     hgBbiDbLink hg38 lrg /gbdb/hg38/bbi/lrg.bb
 
     # Map LRG fixed_annotation transcripts from LRG coords to hg38 coords (HT MarkD):
     lrgToPsl lrg.bed /hive/data/genomes/hg38/chrom.sizes lrg.psl
     pslCheck lrg.psl
 #checked: 919 failed: 0 errors: 0
     awk '{print $10 "\t" $11;}' lrg.psl > lrg.sizes
     genePredToFakePsl -chromSize=lrg.sizes placeholder \
       lrgTranscriptsUnmapped.gp lrgTranscriptsFakePsl.psl lrgTranscripts.cds
     pslMap lrgTranscriptsFakePsl.psl lrg.psl lrgTranscriptsHg38.psl
     mrnaToGene -genePredExt -cdsFile=lrgTranscripts.cds -keepInvalid \
       lrgTranscriptsHg38.psl lrgTranscriptsHg38NoName2.gp
 #Warning: no CDS for LRG_163t1
 #Warning: no CDS for LRG_347t1
     # It's OK if mrnaToGene complains about "no CDS" for a non-coding tx (RefSeq accession NR_*).
     grep -l NR_ LRG_163.xml LRG_347.xml
 #LRG_163.xml
 #LRG_347.xml
 
     cat lrgCdna.tab | sed -e 's/^/>/;' | tr '\t' '\n' > lrgCdna.fa
     # construct bigPsl with five extra fields
     pslToBigPsl -fa=lrgCdna.fa -cds=lrgTranscripts.cds \
 	lrgTranscriptsHg38.psl bigPsl.txt
 
     # add the five extra identifiers to the bigPsl file:
     join -t$'\t' -1 4 \
        -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15\
 ,1.16,1.17,1.18,1.19,1.20,1.21,1.22,1.23,1.24,1.25,2.2,2.3,2.4,2.5,2.6,2.7 \
        <(sort -k4 bigPsl.txt) lrgTransExtraFields.tsv \
          | sort -k1,1 -k2,2n > lrgExtraTranscriptsHg38.bigPsl.bed
 
     bedToBigBed -as=bigPsl+6.as -type=bed12+19 -tab \
        lrgExtraTranscriptsHg38.bigPsl.bed ../../../chrom.sizes lrgBigPsl.bb
     bigBedInfo lrgBigPsl.bb
     rm -f /gbdb/hg38/bbi/lrgBigPsl.bb
     ln -sf `pwd`/lrgBigPsl.bb /gbdb/hg38/bbi
     hgBbiDbLink hg38 lrgBigPsl /gbdb/hg38/bbi/lrgBigPsl.bb
 
 
     # Load PSL, CDS and sequences.
     hgLoadPsl hg38 -table=lrgTranscriptAli lrgTranscriptsHg38.psl
     hgLoadSqlTab hg38 lrgCds ~/kent/src/hg/lib/cdsSpec.sql lrgTranscripts.cds
     hgPepPred hg38 tab lrgCdna lrgCdna.tab
     hgPepPred hg38 tab lrgPep lrgPep.tab
 
 
 #############################################################################
 ## 7-Way Multiz (DONE - 2014-06-02 - Hiram)
     ssh hgwdev
     mkdir /hive/data/genomes/hg38/bed/multiz7way
     cd /hive/data/genomes/hg38/bed/multiz7way
 
     # from the 63-way in the source tree, select out the 7 used here:
     /cluster/bin/phast/tree_doctor \
         --prune-all-but hg19,panTro4,rheMac3,mm10,rn5,canFam3,monDom5 \
         /cluster/home/hiram/kent/src/hg/utils/phyloTrees/130way.nh \
           | sed -e 's/hg19/hg38/' > hg38.7way.nh
 
     #	what that looks like:
     ~/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.nh
 # (((((hg38:0.006550,
 #     panTro4:0.006840):0.027424,
 #    rheMac3:0.037601):0.109934,
 #   (mm10:0.084509,
 #   rn5:0.091589):0.271974):0.020593,
 #  canFam3:0.165928):0.258392,
 # monDom5:0.340786);
 
     # extract species list from that .nh file
     sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
         hg38.7way.nh | xargs echo | sed 's/ //g; s/,/ /g' \
         | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt
 
     # construct db to name translation list:
     cat species.list.txt | while read DB
 do
 hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest
 done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \
         > db.to.name.txt
 
     # construct a common name .nh file:
     /cluster/bin/phast/tree_doctor --rename \
     "`cat db.to.name.txt`" hg38.7way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \
        | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
          > hg38.7way.commonNames.nh
 
     $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.nh > t.nh
     $HOME/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \
        | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
           > hg38.7way.scientificNames.nh
     rm -f t.nh
     cat hg38.7way.scientificNames.nh
 # (((((Homo_sapiens:0.00655,
 #     Pan_troglodytes:0.00684):0.027424,
 #    Macaca_mulatta:0.037601):0.109934,
 #   (Mus_musculus:0.084509,
 #   Rattus_norvegicus:0.091589):0.271974):0.020593,
 #  Canis_lupus_familiaris:0.165928):0.258392,
 # Monodelphis_domestica:0.340786);
 
     ~/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.7way.commonNames.nh
 # (((((Human:0.00655,
 #     Chimp:0.00684):0.027424,
 #    Rhesus:0.037601):0.109934,
 #   (Mouse:0.084509,
 #   Rat:0.091589):0.271974):0.020593,
 #  Dog:0.165928):0.258392,
 # Opossum:0.340786);
 
     #	Use this specification in the phyloGif tool:
     #	http://genome.ucsc.edu/cgi-bin/phyloGif
     #	to obtain a png image for src/hg/htdocs/images/phylo/hg38_7way.png
 
     /cluster/bin/phast/all_dists hg38.7way.nh | grep hg38 \
         | sed -e "s/hg38.//" | sort -k2n > 7way.distances.txt
     #	Use this output to create the table below
    head 7way.distances.txt
    # (output below reconstructed from the hg38.7way.nh branch lengths shown
    #  above; the previously pasted output listed bird genomes -- taeGut1,
    #  galGal4, etc. -- and was stale, copied from a different assembly's doc)
# panTro4 0.013390
# rheMac3 0.071575
# canFam3 0.330429
# mm10    0.500391
# rn5     0.507471
# monDom5 0.763679
 
     cat << '_EOF_' > sizeStats.pl
 #!/usr/bin/env perl
 
 use strict;
 use warnings;
 
 open (FH, "<7way.distances.txt") or
         die "can not read 7way.distances.txt";
 
 my $count = 0;
 while (my $line = <FH>) {
     chomp $line;
     my ($D, $dist) = split('\s+', $line);
     my $chain = "chain" . ucfirst($D);
     my $B="/hive/data/genomes/hg38/bed/lastz.$D/fb.hg38." .
         $chain . "Link.txt";
     my $chainLinkMeasure =
         `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
     chomp $chainLinkMeasure;
     $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
     $chainLinkMeasure =~ s/\%//;
     my $swapFile="/hive/data/genomes/${D}/bed/lastz.hg38/fb.${D}.chainHg38Link.txt";
     my $swapMeasure = "N/A";
     if ( -s $swapFile ) {
 	$swapMeasure =
 	    `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
 	chomp $swapMeasure;
 	$swapMeasure = 0.0 if (length($swapMeasure) < 1);
 	$swapMeasure =~ s/\%//;
     }
     my $orgName=
     `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
     chomp $orgName;
     if (length($orgName) < 1) {
         $orgName="N/A";
     }
     ++$count;
     printf "# %02d  %.4f (%% %06.3f) (%% %06.3f) - %s %s\n", $count, $dist,
         $chainLinkMeasure, $swapMeasure, $orgName, $D;
 }
 close (FH);
 '_EOF_'
     # << happy emacs
     chmod +x ./sizeStats.pl
     ./sizeStats.pl
 #
 
 #	If you can fill in all the numbers in this table, you are ready for
 #	the multiple alignment procedure
 
 #       featureBits chainLink measures
 #               chainLink
 #  N distance  on hg38  on other     other species
 # 01  0.0134 (% 93.112) (% 95.664) - Chimp panTro4
 # 02  0.0716 (% 79.729) (% 86.715) - Rhesus rheMac3
 # 03  0.3304 (% 49.978) (% 60.083) - Dog canFam3
 # 04  0.5004 (% 31.629) (% 35.323) - Mouse mm10
 # 05  0.5075 (% 30.788) (% 36.312) - Rat rn5
 # 06  0.7637 (% 14.370) (% 11.996) - Opossum monDom5
 
 # None of this concern for distances matters in building the first step, the
 # maf files.
 
     # create species list and stripped down tree for autoMZ
     sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
 	hg38.7way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh
 
     sed 's/[()]//g; s/,/ /g' tree.nh > species.list
     #   hg38 panTro4 rheMac3 mm10 rn5 canFam3 monDom5
 
     #	bash shell syntax here ...
     cd /hive/data/genomes/hg38/bed/multiz7way
     export H=/hive/data/genomes/hg38/bed
     mkdir mafLinks
     # want syntenic net for: panTro4 rheMac3 mm10 rn5 canFam3
     # and unfiltered maf net for: monDom5
     for G in panTro4 rheMac3 mm10 rn5 canFam3
     do
       mkdir mafLinks/$G
       echo ln -s ${H}/lastz.$G/axtChain/hg38.${G}.synNet.maf.gz ./mafLinks/$G
       ln -s ${H}/lastz.$G/axtChain/hg38.${G}.synNet.maf.gz ./mafLinks/$G
     done
 
     mkdir mafLinks/monDom5
     echo ln -s ${H}/lastz.monDom5/mafNet/*.maf.gz ./mafLinks/monDom5
     ln -s ${H}/lastz.monDom5/mafNet/*.maf.gz ./mafLinks/monDom5
     # verify the symLinks are good:
     ls -ogrtL mafLinks/*/*
 #-rw-rw-r-- 1  709500062 Jan 25 12:15 mafLinks/mm10/hg38.mm10.synNet.maf.gz
 #-rw-rw-r-- 1 1089643630 Jan 27 19:15 mafLinks/canFam3/hg38.canFam3.synNet.maf.gz
 #-rw-rw-r-- 1 1277455681 Jan 28 21:52 mafLinks/rheMac3/hg38.rheMac3.synNet.maf.gz
 #-rw-rw-r-- 1  687500679 Mar  1 12:27 mafLinks/rn5/hg38.rn5.synNet.maf.gz
 #-rw-rw-r-- 1 1463969868 May 27 11:41 mafLinks/panTro4/hg38.panTro4.synNet.maf.gz
 #-rw-rw-r-- 1  323347908 May 29 12:38 mafLinks/monDom5/hg38.monDom5.net.maf.gz
 
     # split the maf files into a set of hashed named files
     # this hash named split keeps the same chr/contig names in the same
     # named hash file.
     mkdir /hive/data/genomes/hg38/bed/multiz7way/mafSplit
     cd /hive/data/genomes/hg38/bed/multiz7way/mafSplit
     for D in `sed -e "s/hg38 //" ../species.list`
 do
     echo "${D}"
     mkdir $D
     cd $D
     echo "mafSplit -byTarget -useHashedName=8 /dev/null . ../../mafLinks/${D}/*.maf.gz"
     mafSplit -byTarget -useHashedName=8 /dev/null . \
 	../../mafLinks/${D}/*.maf.gz
     cd ..
 done
 
     # construct a list of all possible maf file names.
     # they do not all exist in each of the species directories
     find . -type f | wc -l
     # 641
     find . -type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list
     wc -l maf.list
     # 118 maf.list
 
     mkdir /hive/data/genomes/hg38/bed/multiz7way/splitRun
     cd /hive/data/genomes/hg38/bed/multiz7way/splitRun
     mkdir maf run
     cd run
     mkdir penn
     cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn
     cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn
     cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn
 
     #	set the db and pairs directories here
     cat > autoMultiz.csh << '_EOF_'
 #!/bin/csh -ef
 set db = hg38
 set c = $1
 set result = $2
 set run = `/bin/pwd`
 set tmp = /dev/shm/$db/multiz.$c
 set pairs = /hive/data/genomes/hg38/bed/multiz7way/mafSplit
 /bin/rm -fr $tmp
 /bin/mkdir -p $tmp
 /bin/cp -p ../../tree.nh ../../species.list $tmp
 pushd $tmp > /dev/null
 foreach s (`/bin/sed -e "s/$db //" species.list`)
     set in = $pairs/$s/$c
     set out = $db.$s.sing.maf
     if (-e $in.gz) then
         /bin/zcat $in.gz > $out
         if (! -s $out) then
             echo "##maf version=1 scoring=autoMZ" > $out
         endif
     else if (-e $in) then
         /bin/ln -s $in $out
     else
         echo "##maf version=1 scoring=autoMZ" > $out
     endif
 end
 set path = ($run/penn $path); rehash
 $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \
         > /dev/null
 popd > /dev/null
 /bin/rm -f $result
 /bin/cp -p $tmp/$c $result
 /bin/rm -fr $tmp
 '_EOF_'
 # << happy emacs
     chmod +x autoMultiz.csh
 
     cat  << '_EOF_' > template
 #LOOP
 ./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/hg38/bed/multiz7way/splitRun/maf/$(root1).maf}
 #ENDLOOP
 '_EOF_'
 # << happy emacs
 
     ln -s ../../mafSplit/maf.list maf.list
     ssh ku
     cd /hive/data/genomes/hg38/bed/multiz7way/splitRun/run
     gensub2 maf.list single template stdout > jobList
     para -ram=8g create jobList
 # Completed: 118 of 118 jobs
 # CPU time in finished jobs:     118241s    1970.69m    32.84h    1.37d  0.004 y
 # IO & Wait Time:                   682s      11.36m     0.19h    0.01d  0.000 y
 # Average job time:                1008s      16.80m     0.28h    0.01d
 # Longest finished job:           10068s     167.80m     2.80h    0.12d
 # Submission to last job:         10076s     167.93m     2.80h    0.12d
 
     # combine into one file  (the 1>&2 redirect sends the echo to stderr)
     cd /hive/data/genomes/hg38/bed/multiz7way
     head -1 splitRun/maf/017.maf > multiz7way.maf
     for F in splitRun/maf/*.maf
 do
     echo "${F}" 1>&2
     egrep -v "^#" ${F}
 done >> multiz7way.maf
     tail -1 splitRun/maf/017.maf >> multiz7way.maf
 # -rw-rw-r-- 1 15635828403 Jun  3 11:49 multiz7way.maf
 
     # Load into database
     ssh hgwdev
     cd /hive/data/genomes/hg38/bed/multiz7way
     mkdir /gbdb/hg38/multiz7way
     ln -s `pwd`/multiz7way.maf /gbdb/hg38/multiz7way
     cd /dev/shm
     time nice -n +17 hgLoadMaf hg38 multiz7way
     # Loaded 10270624 mafs in 1 files from /gbdb/hg38/multiz7way
     # real    3m51.265s
 
     time nice -n +17 hgLoadMafSummary -verbose=2 -minSize=30000 \
 	-mergeGap=1500 -maxSize=200000 hg38 multiz7waySummary \
 	/gbdb/hg38/multiz7way/multiz7way.maf
     # Created 1260918 summary blocks from 35384988 components
     # and 10270624 mafs from /gbdb/hg38/multiz7way/multiz7way.maf
     # real    5m39.388s
 
 
     wc -l multiz7way*.tab
     # 10270624 multiz7way.tab
     # 1260918 multiz7waySummary.tab
     # 11531542 total
 
     rm multiz7way*.tab
 
 ##############################################################################
 # GAP ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE - 2014-06-03 - Hiram)
     # mafAddIRows has to be run on single chromosome maf files, it does not
     #	function correctly when more than one reference sequence
     #	are in a single file.  Need to split of the maf file into individual
     #   maf files
     mkdir -p /hive/data/genomes/hg38/bed/multiz7way/anno/mafSplit
     cd /hive/data/genomes/hg38/bed/multiz7way/anno/mafSplit
 
     time mafSplit -outDirDepth=1 -byTarget -useFullSequenceName \
         /dev/null . ../../multiz7way.maf
     #   real    4m8.617s
 
     find . -type f | wc -l
     #   353
 
     # check for N.bed files everywhere:
     cd /hive/data/genomes/hg38/bed/multiz7way/anno
     for DB in `cat ../species.list`
 do
     if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then
         echo "MISS: ${DB}"
 #        cd /hive/data/genomes/${DB}
 #        twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed
     else
         echo "  OK: ${DB}"
     fi
 done
 
     cd /hive/data/genomes/hg38/bed/multiz7way/anno
     for DB in `cat ../species.list`
 do
     echo "${DB} "
     ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
     echo ${DB}.bed  >> nBeds
     ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
     echo ${DB}.len  >> sizes
 done
     # make sure they all are successful symLinks:
     ls -ogrtL
 
     screen -S hg38      # use a screen to control this longish job
     ssh ku
     cd /hive/data/genomes/hg38/bed/multiz7way/anno
     mkdir result
     for D in `ls mafSplit`
 do
     echo mkdir result/${D}
     mkdir result/${D}
 done
     cat << '_EOF_' > template
 #LOOP
 mafAddIRows -nBeds=nBeds mafSplit/$(path1) /hive/data/genomes/hg38/hg38.2bit {check out exists+ result/$(path1)}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     find ./mafSplit -type f | sed -e 's#^./mafSplit/##' > maf.list
     gensub2 maf.list single template jobList
     # limit jobs on a node with the ram=32g requirement because they go fast
     para -ram=32g create jobList
     para try ... check ... push ...
 # Completed: 353 of 353 jobs
 # CPU time in finished jobs:        530s       8.83m     0.15h    0.01d  0.000 y
 # IO & Wait Time:                  1057s      17.62m     0.29h    0.01d  0.000 y
 # Average job time:                   4s       0.07m     0.00h    0.00d
 # Longest finished job:              63s       1.05m     0.02h    0.00d
 # Submission to last job:           220s       3.67m     0.06h    0.00d
 
     # verify all result files have some content, look for 0 size files:
     find ./result -type f -size 0
     # should see none
     # or in this manner:
     find ./result -type f | xargs ls -og | sort -k3nr | tail
 
     # combine into one file  (the 1>&2 redirect sends the echo to stderr)
     head -q -n 1 result/0/chr8.maf > hg38.7way.maf
     find ./result -type f | while read F
 do
     echo "${F}" 1>&2
     grep -h -v "^#" ${F}
 done >> hg38.7way.maf
 
     #	these maf files do not have the end marker, this does nothing:
     #	tail -q -n 1 result/0/chr8.maf >> hg38.7way.maf
     # How about an official end marker:
     echo "##eof maf" >> hg38.7way.maf
     ls -og
 # -rw-rw-r--  1 17795297196 Jun  3 14:01 hg38.7way.maf
 
     du -hsc hg38.7way.maf
     # 17G     hg38.7way.maf
 
     # construct symlinks to get the individual maf files into gbdb:
     rm /gbdb/hg38/multiz7way/multiz7way.maf   # remove previous results
     ln -s `pwd`/hg38.7way.maf /gbdb/hg38/multiz7way/multiz7way.maf
 
     # Load into database
     cd /dev/shm
     time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/hg38/multiz7way \
         hg38 multiz7way
     # Loaded 10359242 mafs in 1 files from /gbdb/hg38/multiz7way
     # real    4m21.862s
 
     time hgLoadMafSummary -verbose=2 -minSize=30000 \
 	-mergeGap=1500 -maxSize=200000 hg38 multiz7waySummary \
         /gbdb/hg38/multiz7way/multiz7way.maf
 #  Created 1260918 summary blocks from 35384988 components
 #  and 10359242 mafs from /gbdb/hg38/multiz7way/multiz7way.maf
 #  real    6m6.583s
 
 # -rw-rw-r-- 1 530538267 Jun  3 14:05 multiz7way.tab
 # -rw-rw-r-- 1  60616616 Jun  3 14:15 multiz7waySummary.tab
 
     rm multiz7way*.tab
 
 ######################################################################
 # MULTIZ7WAY MAF FRAMES (DONE - 2014-06-03 - Hiram)
     ssh hgwdev
     mkdir /hive/data/genomes/hg38/bed/multiz7way/frames
     cd /hive/data/genomes/hg38/bed/multiz7way/frames
 #   survey all the genomes to find out what kinds of gene tracks they have
     cat << '_EOF_' > showGenes.csh
 #!/bin/csh -fe
 foreach db (`cat ../species.list`)
     echo -n "${db}: "
     set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
     foreach table ($tables)
         if ($table == "ensGene" || $table == "refGene" || \
            $table == "mgcGenes" || $table == "knownGene" || \
            $table == "xenoRefGene" ) then
            set count = `hgsql $db -N -e "select count(*) from $table"`
             echo -n "${table}: ${count}, "
         endif
     end
     set orgName = `hgsql hgcentraltest -N -e \
             "select scientificName from dbDb where name='$db'"`
     set orgId = `hgsql hg19 -N -e \
             "select id from organism where name='$orgName'"`
     if ($orgId == "") then
         echo "Mrnas: 0"
     else
         set count = `hgsql hg19 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
         echo "Mrnas: ${count}"
     endif
 end
 '_EOF_'
     # << happy emacs
     chmod +x ./showGenes.csh
     time ./showGenes.csh
 # hg38: knownGene: 104178, mgcGenes: 34081, refGene: 54852, xenoRefGene: 172740, Mrnas: 10723716
 # panTro4: ensGene: 29160, refGene: 2622, xenoRefGene: 280516, Mrnas: 11163
 # rheMac3: refGene: 6369, xenoRefGene: 275096, Mrnas: 443642
 # mm10: ensGene: 94647, knownGene: 61642, mgcGenes: 26768, refGene: 33765, xenoRefGene: 161178, Mrnas: 5224613
 # rn5: ensGene: 29188, mgcGenes: 6924, refGene: 18567, xenoRefGene: 175416, Mrnas: 1247500
 # canFam3: ensGene: 29884, refGene: 1582, xenoRefGene: 253196, Mrnas: 387195
 # monDom5: ensGene: 24882, refGene: 492, xenoRefGene: 248251,  Mrnas: 2461
 
     # from that summary, use these gene sets:
     # refGene - rheMac3
     # ensGene - panTro4 rn5 canFam3 monDom5
     # knownGene - hg38 mm10
 
     mkdir genes
     #   1. knownGene: hg38 mm10
     for DB in hg38 mm10
 do
     hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
       | genePredSingleCover stdin stdout | gzip -2c \
         > genes/${DB}.gp.gz
 done
     #   2. ensGene:
     for DB in panTro4 rn5 canFam3 monDom5
 do
 hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/${DB}.tmp.gz
     mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
     echo "${DB} done"
 done
     #   3. refGene
     for DB in rheMac3
 do
 hgsql -N -e "select * from refGene" ${DB} | cut -f2- \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/${DB}.tmp.gz
     mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
     echo "${DB} done"
 done
 
     # verify counts for genes are reasonable:
     for T in genes/*.gz
 do
     echo -n "# $T: "
     zcat $T | cut -f1 | sort | uniq -c | wc -l
 done
 # genes/canFam3.gp.gz: 19507
 # genes/hg38.gp.gz: 21887
 # genes/mm10.gp.gz: 21013
 # genes/monDom5.gp.gz: 21033
 # genes/panTro4.gp.gz: 18657
 # genes/rheMac3.gp.gz: 5614
 # genes/rn5.gp.gz: 22863
 
     time (cat ../anno/hg38.7way.maf \
 	| nice -n +19 genePredToMafFrames hg38 stdin stdout \
 	    `sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g" ../species.list` \
 		| gzip > multiz7wayFrames.bed.gz)
     #   real    3m44.591s
 
     # verify there are frames on everything, should be 7 species:
     zcat multiz7wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c
 # 265160 canFam3
 # 208941 hg38
 # 253323 mm10
 # 574521 monDom5
 # 200156 panTro4
 #  49802 rheMac3
 # 244731 rn5
 
     #   load the resulting file
     ssh hgwdev
     cd /hive/data/genomes/hg38/bed/multiz7way/frames
     time hgLoadMafFrames hg38 multiz7wayFrames multiz7wayFrames.bed.gz
     #   real    0m19.959s
 
     time featureBits -countGaps hg38 multiz7wayFrames
     #   52686177 bases of 3209286105 (1.642%) in intersection
     #   real    0m12.593s
 
     #   enable the trackDb entries:
 # frames multiz7wayFrames
 # irows on
     #   appears to work OK
 
 #########################################################################
 # Phylogenetic tree from 7-way (DONE - 2014-06-04 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/multiz7way/4d
     cd /hive/data/genomes/hg38/bed/multiz7way/4d
 
     # the annotated maf is:
     ../anno/hg38.7way.maf
 
     # using knownGene for hg38
     hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" hg38 > hg38.knownGene.gp
 
     genePredSingleCover hg38.knownGene.gp stdout | sort > hg38.knownGeneNR.gp
     wc -l hg38.knownGeneNR.gp
     #	21887 hg38.knownGeneNR.gp
 
     mkdir annoSplit
     cd annoSplit
     time mafSplit -verbose=2 -outDirDepth=1 -byTarget -useFullSequenceName \
         /dev/null . ../../anno/hg38.7way.maf
     # real    5m14.770s
 
     find . -type f | wc -l
     #   353
     ssh ku
     mkdir /hive/data/genomes/hg38/bed/multiz7way/4d/run
     cd /hive/data/genomes/hg38/bed/multiz7way/4d/run
     mkdir ../mfa
 
     # newer versions of msa_view have a slightly different operation
     # the sed of the gp file inserts the reference species in the chr name
     cat << '_EOF_' > 4d.csh
 #!/bin/csh -fe
 set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
 set r = "/hive/data/genomes/hg38/bed/multiz7way"
 set c = $1:r
 set infile = $r/4d/annoSplit/$2
 set outDir = $r/4d/mfa/$3:h
 set outfile = $r/4d/mfa/$3
 /bin/mkdir -p $outDir
 cd /scratch/tmp
 /bin/awk -v C=$c '$2 == C {print}' $r/4d/hg38.knownGeneNR.gp | sed -e "s/\t$c\t/\thg38.$c\t/" > $c.gp
 set NL=`wc -l $c.gp| gawk '{print $1}'`
 echo $NL
 if ("$NL" != "0") then
     $PHASTBIN/msa_view --4d --features $c.gp -i MAF $infile -o SS > $c.ss
     $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile
 else
     echo "" > $outfile
 endif
 /bin/rm -f $c.gp $c.ss
 '_EOF_'
     # << happy emacs
     chmod +x 4d.csh
 
     find ../annoSplit -type f | sed -e "s#../annoSplit/##" > maf.list
 
     cat << '_EOF_' > template
 #LOOP
 4d.csh $(file1) $(path1) {check out line+ ../mfa/$(dir1)/$(root1).mfa}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     gensub2 maf.list single template jobList
     para create jobList
     para try ... check
     para time
 # Completed: 353 of 353 jobs
 # CPU time in finished jobs:        836s      13.93m     0.23h    0.01d  0.000 y
 # IO & Wait Time:                  1172s      19.54m     0.33h    0.01d  0.000 y
 # Average job time:                   6s       0.09m     0.00h    0.00d
 # Longest finished job:              72s       1.20m     0.02h    0.00d
 # Submission to last job:            89s       1.48m     0.02h    0.00d
 
     # Not all results have contents, that is OK
 
     # combine mfa files
     ssh hgwdev
     cd /hive/data/genomes/hg38/bed/multiz7way/4d
     # remove the broken empty files, size 0 and size 1:
     find ./mfa -type f -size 0 | xargs rm -f
     # most interesting, this did not identify files of size 1:
 #    find ./mfa -type f -size 1
     find ./mfa -type f | xargs ls -og | awk '$3 == 1' | awk '{print $NF}' \
         > empty.list
     cat empty.list | xargs rm -f
    # want the comma-less (space-separated) species.list for --aggregate
     /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
 	--aggregate "`cat ../species.list`" mfa/*/*.mfa | sed s/"> "/">"/ \
 	    > 4d.all.mfa
     # check they are all in there:
     grep "^>" 4d.all.mfa
     #    >hg38
     #    >panTro4
     #    >rheMac3
     #    >mm10
     #    >rn5
     #    >canFam3
     #    >monDom5
 
     sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
 	../hg38.7way.nh | xargs echo | sed -e 's/ //g' > tree_commas.nh
     # tree_commas.nh looks like:
     #   (((((hg38,panTro4),rheMac3),(mm10,rn5)),canFam3),monDom5)
     # use phyloFit to create tree model (output is phyloFit.mod)
     time nice -n +19 \
 	/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
 	    --EM --precision MED --msa-format FASTA --subst-mod REV \
 		--tree tree_commas.nh 4d.all.mfa
     #   real    0m6.583s
 
 
     mv phyloFit.mod all.mod
 
     grep TREE all.mod
 # TREE: (((((hg38:0.00673596,panTro4:0.00686169):0.0248146,rheMac3:0.0357598):0.0970072,(mm10:0.081661,rn5:0.0874126):0.246527):0.0264964,canFam3:0.156769):0.303241,monDom5:0.303241);
 
     # compare these calculated lengths to the tree extracted from 130way:
     grep TREE all.mod | sed -e 's/TREE: //' \
       | /cluster/bin/phast/all_dists /dev/stdin | grep hg38 | sort -k3n \
         | sed -e "s/hg38.//; s/^/    #  /"
     #  panTro4  0.013598
     #  rheMac3  0.067310
     #  canFam3  0.311823
     #  mm10     0.456746
     #  rn5      0.462497
     #  monDom5  0.761536
 
     # yes, somewhat similar
     /cluster/bin/phast/all_dists ../hg38.7way.nh | grep hg38 \
         | sort -k3n | sed -e "s/hg38.//; s/^/    #  /"
     #  panTro4   0.013390
     #  rheMac3   0.071575
     #  canFam3   0.330429
     #  mm10      0.500391
     #  rn5       0.507471
     #  monDom5   0.763679
 
 #########################################################################
 # phastCons 7-way (DONE - 2014-06-04 - Hiram)
     # split 7way mafs into 10M chunks and generate sufficient statistics
    # files for phastCons
     ssh ku
     mkdir -p /hive/data/genomes/hg38/bed/multiz7way/cons/SS
     cd /hive/data/genomes/hg38/bed/multiz7way/cons/SS
     mkdir result done
 
     cat << '_EOF_' > mkSS.csh
 #!/bin/csh -ef
 set d = $1
 set c = $2
 set doneDir = done/$d
 set MAF = /hive/data/genomes/hg38/bed/multiz7way/anno/result/$d/$c.maf
 set WINDOWS = /hive/data/genomes/hg38/bed/multiz7way/cons/SS/result/$d/$c
 set WC = `cat $MAF | wc -l`
 set NL = `grep "^#" $MAF | wc -l`
 if ( -s $3 ) then
     exit 0
 endif
 if ( -s $3.running ) then
     exit 0
 endif
 
 /bin/mkdir -p $doneDir
 /bin/date >> $3.running
 
 /bin/rm -fr $WINDOWS
 /bin/mkdir -p $WINDOWS
 pushd $WINDOWS > /dev/null
 if ( $WC != $NL ) then
 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
     $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
 endif
 popd > /dev/null
 /bin/date >> $3
 /bin/rm -f $3.running
 '_EOF_'
     # << happy emacs
     chmod +x mkSS.csh
 
     cat << '_EOF_' > template
 #LOOP
 mkSS.csh $(dir1) $(root1) {check out line+ done/$(dir1)/$(root1)}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     #	do the easy ones first to see some immediate results
     find ../../anno/result -type f | sed -e "s#../../anno/result/##" > maf.list
 
     gensub2 maf.list single template jobList
     para -ram=32g create jobList
     para try ... check ... etc
 # Completed: 353 of 353 jobs
 # CPU time in finished jobs:       1216s      20.27m     0.34h    0.01d  0.000 y
 # IO & Wait Time:                  1385s      23.08m     0.38h    0.02d  0.000 y
 # Average job time:                   7s       0.12m     0.00h    0.00d
 # Longest finished job:             111s       1.85m     0.03h    0.00d
 # Submission to last job:           189s       3.15m     0.05h    0.00d
 
     find ./result -type f | wc -l
     #	 641
 
     # Run phastCons
     #	This job is I/O intensive in its output files, beware where this
     #	takes place or do not run too many at once.
     ssh ku
     mkdir -p /hive/data/genomes/hg38/bed/multiz7way/cons/run.cons
     cd /hive/data/genomes/hg38/bed/multiz7way/cons/run.cons
 
     #	This is setup for multiple runs based on subsets, but only running
     #   the 'all' subset here.
     #   It triggers off of the current working directory
     #	$cwd:t which is the "grp" in this script.  Running:
     #	all and vertebrates
 
     cat << '_EOF_' > doPhast.csh
 #!/bin/csh -fe
 set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
 set c = $1
 set d = $2
 set f = $3
 set len = $4
 set cov = $5
 set rho = $6
 set grp = $cwd:t
 set cons = /hive/data/genomes/hg38/bed/multiz7way/cons
 set tmp = $cons/tmp/${d}_${c}
 mkdir -p $tmp
 set ssSrc = $cons/SS/result
 set useGrp = "$grp.mod"
 if (-s $cons/$grp/$grp.non-inf) then
   ln -s $cons/$grp/$grp.mod $tmp
   ln -s $cons/$grp/$grp.non-inf $tmp
   ln -s $ssSrc/$d/$f $tmp
 else
   ln -s $ssSrc/$d/$f $tmp
   ln -s $cons/$grp/$grp.mod $tmp
 endif
 pushd $tmp > /dev/null
 if (-s $grp.non-inf) then
   $PHASTBIN/phastCons $f $useGrp \
     --rho $rho --expected-length $len --target-coverage $cov --quiet \
     --not-informative `cat $grp.non-inf` \
     --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp
 else
   $PHASTBIN/phastCons $f $useGrp \
     --rho $rho --expected-length $len --target-coverage $cov --quiet \
     --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp
 endif
 popd > /dev/null
 mkdir -p pp/$d bed/$d
 sleep 4
 touch pp/$d bed/$d
 rm -f pp/$d/$c.pp
 rm -f bed/$d/$c.bed
 mv $tmp/$c.pp pp/$d
 mv $tmp/$c.bed bed/$d
 rm -fr $tmp
 rmdir --ignore-fail-on-non-empty $cons/tmp/$d:h
 '_EOF_'
     # << happy emacs
     chmod +x doPhast.csh
 
     #	this template will serve for all runs
     #	root1 == chrom name, file1 == ss file name without .ss suffix
     cat << '_EOF_' > template
 #LOOP
 ../run.cons/doPhast.csh $(root1) $(dir1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(root1).pp}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     find ../SS/result -type f | sed -e "s#../SS/result/##" > ss.list
     wc -l ss.list
     #	641 ss.list
 
     # Create parasol batch and run it
     # run for all species
     cd /hive/data/genomes/hg38/bed/multiz7way/cons
     mkdir -p all
     cd all
     #	Using the .mod tree
     cp -p ../../4d/all.mod ./all.mod
 
     gensub2 ../run.cons/ss.list single ../run.cons/template jobList
     para -ram=32g create jobList
     para try ... check ...
     para push
 # Completed: 641 of 641 jobs
 # CPU time in finished jobs:       6557s     109.29m     1.82h    0.08d  0.000 y
 # IO & Wait Time:                  4497s      74.94m     1.25h    0.05d  0.000 y
 # Average job time:                  17s       0.29m     0.00h    0.00d
 # Longest finished job:              33s       0.55m     0.01h    0.00d
 # Submission to last job:           120s       2.00m     0.03h    0.00d
 
     # create Most Conserved track
     cd /hive/data/genomes/hg38/bed/multiz7way/cons/all
     cut -f1 ../../../../chrom.sizes | while read C
 do
     ls -d bed/?/${C} 2> /dev/null | while read D
     do
         echo ${D}/${C}*.bed 1>&2
         cat ${D}/${C}*.bed
     done | sort -k1,1 -k2,2n \
     | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
 done > tmpMostConserved.bed
 
     /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed
     #    -rw-rw-r--  1 42636652 Jun  4 10:45 tmpMostConserved.bed
     #    -rw-rw-r--  1 43721828 Jun  4 10:45 mostConserved.bed
 
     # load into database
     ssh hgwdev
     cd /hive/data/genomes/hg38/bed/multiz7way/cons/all
     time nice -n +19 hgLoadBed hg38 phastConsElements7way mostConserved.bed
     #  Read 1234990 elements of size 5 from mostConserved.bed
     #  real    0m11.390s
 
     # on human we often try for 5% overall cov, and 70% CDS cov
     # most bets are off here for that goal, these alignments are too few
     #	and too far between
     #	--rho 0.3 --expected-length 45 --target-coverage 0.3
     featureBits hg38 -enrichment knownGene:cds phastConsElements7way
     # knownGene:cds 1.266%, phastConsElements7way 4.551%,
     #    both 0.888%, cover 70.16%, enrich 15.42x
 
    # Create merged posterior probability file and wiggle track data files
     cd /hive/data/genomes/hg38/bed/multiz7way/cons/all
     mkdir downloads
 
     # the third sed fixes the chrom names, removing the partition extensions
     time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
 	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
 	| sed -e 's/\.[0-9][0-9]*-[0-9][0-9]* start/ start/' \
         | gzip -c > downloads/phastCons7way.wigFix.gz)
     #   real    37m47.242s
 
     # check integrity of data with wigToBigWig
     time (zcat downloads/phastCons7way.wigFix.gz \
 	| wigToBigWig -verbose=2 stdin /hive/data/genomes/hg38/chrom.sizes \
 	    phastCons7way.bw) > bigWig.log 2>&1 &
     tail bigWig.log
     # pid=34733: VmPeak:    33106324 kB
     #   real    40m53.287s
 
     bigWigInfo phastCons7way.bw
 # version: 4
 # isCompressed: yes
 # isSwapped: 0
 # primaryDataSize: 5,675,802,079
 # primaryIndexSize: 92,579,900
 # zoomLevels: 10
 # chromCount: 353
 # basesCovered: 2,898,191,577
 # mean: 0.168088
 # min: 0.000000
 # max: 1.000000
 # std: 0.233827
 
     #	encode those files into wiggle data
     time (zcat downloads/phastCons7way.wigFix.gz \
 	| wigEncode stdin phastCons7way.wig phastCons7way.wib)
     #   Converted stdin, upper limit 1.00, lower limit 0.00
     #   real    15m28.525s
 
     du -hsc *.wi?
     #  2.7G    phastCons7way.wib
     #  282M    phastCons7way.wig
     #  3.0G    total
 
     # Load gbdb and database with wiggle.
     ln -s `pwd`/phastCons7way.wib /gbdb/hg38/multiz7way/phastCons7way.wib
     time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg38/multiz7way \
 	hg38 phastCons7way phastCons7way.wig
     #   real    0m33.502s
 
     # use to set trackDb.ra entries for wiggle min and max
     # and verify table is loaded correctly
 
     wigTableStats.sh hg38 phastCons7way
 # db.table          min max mean       count sumData      stdDev  viewLimits
 hg38.phastCons7way 0 1 0.168088 2898191577 4.87152e+08 0.233827 viewLimits=0:1
 
     #  Create histogram to get an overview of all the data
     time nice -n +19 hgWiggle -doHistogram -db=hg38 \
 	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
 	    phastCons7way > histogram.data 2>&1
     #	real    2m40.179s
 
     #	create plot of histogram:
 
     cat << '_EOF_' | gnuplot > histo.png
 set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " Human hg38 Histogram phastCons7way track"
 set xlabel " phastCons7way score"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
 set yrange [0:0.02]
 
 plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
         "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
 '_EOF_'
     #	<< happy emacs
 
     display histo.png &
 
 #########################################################################
 # phyloP for 7-way (DONE - 2014-06-04 - Hiram)
     # run phyloP with score=LRT
     ssh ku
     mkdir /cluster/data/hg38/bed/multiz7way/consPhyloP
     cd /cluster/data/hg38/bed/multiz7way/consPhyloP
 
     mkdir run.phyloP
     cd run.phyloP
     # Adjust model file base composition background and rate matrix to be
     # representative of the chromosomes in play
     grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
     #	0.556
     /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
 	../../cons/all/all.mod 0.556 > all.mod
     # verify, the BACKGROUND should now be paired up:
     grep BACK all.mod
     #   BACKGROUND: 0.222000 0.278000 0.278000 0.222000
 
     cat << '_EOF_' > doPhyloP.csh
 #!/bin/csh -fe
 set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
 set f = $1
 set d = $f:h
 set file1 = $f:t
 set out = $2
 set cName = $f:t:r
 set grp = $cwd:t
 set cons = /hive/data/genomes/hg38/bed/multiz7way/consPhyloP
 set tmp = $cons/tmp/$grp/$f
 /bin/rm -fr $tmp
 /bin/mkdir -p $tmp
 set ssSrc = "/hive/data/genomes/hg38/bed/multiz7way/cons/SS/result/$f"
 set useGrp = "$grp.mod"
 /bin/ln -s $cons/run.phyloP/$grp.mod $tmp
 pushd $tmp > /dev/null
 $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
     -i SS $useGrp $ssSrc.ss > $file1.wigFix
 popd > /dev/null
 /bin/mkdir -p $out:h
 sleep 4
 /bin/touch $out:h
 /bin/mv $tmp/$file1.wigFix $out
 /bin/rm -fr $tmp
 /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d
 /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h
 /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp
 /bin/rmdir --ignore-fail-on-non-empty $cons/tmp
 '_EOF_'
     # << happy emacs
 
     # Create list of chunks
     find ../../cons/SS/result -type f | grep ".ss$" \
 	| sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list
     # make sure the list looks good
     wc -l ss.list
     #	641 ss.list
 
     # Create template file
     #	file1 == $chr/$chunk/file name without .ss suffix
     cat << '_EOF_' > template
 #LOOP
 ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     ######################   Running all species  #######################
     # setup run for all species
     mkdir /hive/data/genomes/hg38/bed/multiz7way/consPhyloP/all
     cd /hive/data/genomes/hg38/bed/multiz7way/consPhyloP/all
     rm -fr wigFix
     mkdir wigFix
 
     gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
     # the -ram=8g will allow only one job per node to slow this down since
     #	it would run too fast otherwise.  Either run on one of the small
     #	klusters or use the -ram=8g on the para create
     para -ram=32g create jobList
     para try ... check ... push ... etc ...
     para time > run.time
 # Completed: 641 of 641 jobs
 # CPU time in finished jobs:       4755s      79.24m     1.32h    0.06d  0.000 y
 # IO & Wait Time:                  4343s      72.39m     1.21h    0.05d  0.000 y
 # Average job time:                  14s       0.24m     0.00h    0.00d
 # Longest finished job:              27s       0.45m     0.01h    0.00d
 # Submission to last job:          1152s      19.20m     0.32h    0.01d
 
     # make downloads
     mkdir downloads
 
     time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
 	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
 	| gzip -c > downloads/phyloP7way.wigFix.gz) &
     #   real    29m51.665s
 
     # check integrity of data with wigToBigWig
     time (zcat downloads/phyloP7way.wigFix.gz \
 	| wigToBigWig -verbose=2 stdin /hive/data/genomes/hg38/chrom.sizes \
 	phyloP7way.bw) > bigWig.log 2>&1 &
     egrep "real|VmPeak" bigWig.log
     # pid=76577: VmPeak:    33106320 kB
     #  real    42m53.038s
 
 
     bigWigInfo phyloP7way.bw
 # version: 4
 # isCompressed: yes
 # isSwapped: 0
 # primaryDataSize: 3,759,451,708
 # primaryIndexSize: 92,579,900
 # zoomLevels: 10
 # chromCount: 353
 # basesCovered: 2,898,191,577
 # mean: 0.074472
 # min: -5.220000
 # max: 1.062000
 # std: 0.545945
 
     #	encode those files into wiggle data
     time (zcat downloads/phyloP7way.wigFix.gz \
 	| wigEncode stdin phyloP7way.wig phyloP7way.wib) &
     #   Converted stdin, upper limit 1.06, lower limit -5.22
     #   real    16m11.861s
 
 
     du -hsc *.wi?
     #   47M     phyloP7way.wib
     #   12M     phyloP7way.wig
     #   58M     total
 
     # Load gbdb and database with wiggle.
     ln -s `pwd`/phyloP7way.wib /gbdb/hg38/multiz7way/phyloP7way.wib
     nice hgLoadWiggle -pathPrefix=/gbdb/hg38/multiz7way hg38 \
 	phyloP7way phyloP7way.wig
 
     # use to set trackDb.ra entries for wiggle min and max
     # and verify table is loaded correctly
 
     wigTableStats.sh hg38 phyloP7way
 # db.table      min max mean count sumData
 # hg38.phyloP7way -5.22 1.062 0.0744721 2898191577 2.15834e+08
 #       stdDev viewLimits
 #     0.545945 viewLimits=-2.65525:1.062
 
     #	that range is: 5.22+1.062 = 6.282 for hBinSize=0.006282
 
     #  Create histogram to get an overview of all the data
     time nice -n +19 hgWiggle -doHistogram \
 	-hBinSize=0.006282 -hBinCount=1000 -hMinVal=-5.22 -verbose=2 \
 	    -db=hg38 phyloP7way > histogram.data 2>&1
     #   real    2m55.843s
 
 
     # find out the range for the 2:5 graph
     grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin
 # Q1 0.000001
 # median 0.000060
 # Q3 0.000656
 # average 0.001022
 # min 0.000000
 # max 0.065461
 # count 978
 # total 0.999982
 # standard deviation 0.004157
 
     #	create plot of histogram:
     cat << '_EOF_' | gnuplot > histo.png
 set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " Human hg38 Histogram phyloP7way track"
 set xlabel " phyloP7way score"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
 set yrange [0:0.02]
 
 plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
         "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
 '_EOF_'
     #	<< happy emacs
 
     display histo.png &
 
 #############################################################################
 # construct download files for 7-way (DONE - 2014-06-05 - Hiram)
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons7way
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP7way
     mkdir /hive/data/genomes/hg38/bed/multiz7way/downloads
     cd /hive/data/genomes/hg38/bed/multiz7way/downloads
     mkdir multiz7way phastCons7way phyloP7way
     cd multiz7way
     time cp -p ../../anno/hg38.7way.maf .
     #   real    0m55.984s
     time gzip *.maf
     #   real    46m53.149s
 
     ln -s ../../hg38.7way.nh .
     ln -s ../../hg38.7way.commonNames.nh .
     time md5sum *.nh *.maf.gz > md5sum.txt
     #   real    1m55.317s
     ln -s `pwd`/* \
         /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way
 
     du -hsc *.maf.gz ../../anno/hg38.7way.maf
     #  3.5G    hg38.7way.maf.gz
     #   17G     ../../anno/hg38.7way.maf
 
     #####################################################################
     cd /hive/data/genomes/hg38/bed/multiz7way/downloads/phastCons7way
 
     ln -s ../../cons/all/downloads/phastCons7way.wigFix.gz \
         ./hg38.phastCons7way.wigFix.gz
     ln -s ../../cons/all/phastCons7way.bw ./hg38.phastCons7way.bw
     ln -s ../../cons/all/all.mod ./hg38.phastCons7way.mod
     time md5sum *.gz *.mod *.bw > md5sum.txt
     #   real    0m37.384s
     # obtain the README.txt from petMar2/phastCons7way and update for this
     #   situation
     ln -s `pwd`/*.gz `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
       /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons7way
 
     #####################################################################
     cd /hive/data/genomes/hg38/bed/multiz7way/downloads/phyloP7way
 
     ln -s ../../consPhyloP/all/downloads/phyloP7way.wigFix.gz \
         ./hg38.phyloP7way.wigFix.gz
     ln -s ../../consPhyloP/run.phyloP/all.mod hg38.phyloP7way.mod
     ln -s ../../consPhyloP/all/phyloP7way.bw hg38.phyloP7way.bw
 
     time md5sum *.mod *.bw *.gz > md5sum.txt
     #   real    0m29.431s
 
     # obtain the README.txt from geoFor1/phyloP7way and update for this
     #   situation
     ln -s `pwd`/* \
       /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP7way
 
     ###########################################################################
    ## create upstream knownGene maf files
     cd /hive/data/genomes/hg38/bed/multiz7way/downloads/multiz7way
     # bash script
 #!/bin/sh
 export geneTbl="knownGene"
 for S in 1000 2000 5000
 do
     echo "making upstream${S}.maf"
     featureBits hg38 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \
         | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
         | /cluster/bin/$MACHTYPE/mafFrags hg38 multiz7way \
                 stdin stdout \
                 -orgs=/hive/data/genomes/hg38/bed/multiz7way/species.list \
         | gzip -c > upstream${S}.${geneTbl}.maf.gz
     echo "done upstream${S}.${geneTbl}.maf.gz"
 done
     #   real    60m16.631s
 
     md5sum upstream*.gz >> md5sum.txt
 
     # some other symlinks were already made above
     # obtain the README.txt from geoFor1/multiz7way and update for this
     #   situation
     ln -s `pwd`/upstream*.gz README.txt \
         /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz7way
 
 #############################################################################
 # hgPal downloads (DONE - 2014-06-06 - Hiram)
 #   FASTA from 7-way for knownGene, refGene and knownCanonical
 
     ssh hgwdev
     screen -S hg38HgPal
     mkdir /hive/data/genomes/hg38/bed/multiz7way/pal
     cd /hive/data/genomes/hg38/bed/multiz7way/pal
     cat ../species.list | tr '[ ]' '[\n]' > order.list
 
     export mz=multiz7way
     export gp=knownGene
     export db=hg38
     export I=0
     mkdir exonAA exonNuc
     for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
     do
         I=`echo $I | awk '{print $1+1}'`
 	echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &"
 	echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &"
         if [ $I -gt 6 ]; then
             echo "date"
             echo "wait"
             I=0
         fi
     done > $gp.jobs
     echo "date" >> $gp.jobs
     echo "wait" >> $gp.jobs
 
     time ./$gp.jobs > $gp.jobs.log 2>&1 &
     #   real    28m46.919s
 
     time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
     #   real    0m23.798s
     time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
     #   real    1m28.197s
 
     export mz=multiz7way
     export gp=knownGene
     export db=hg38
     export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
     mkdir -p $pd
     md5sum *.fa.gz > md5sum.txt
     ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
     ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
     ln -s `pwd`/md5sum.txt $pd/
 
     rm -rf exonAA exonNuc
 
     ### need other gene track alignments also
     # running up refGene
     cd /hive/data/genomes/hg38/bed/multiz7way/pal
     export mz=multiz7way
     export gp=refGene
     export db=hg38
     export I=0
     mkdir exonAA exonNuc
     for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
     do
         I=`echo $I | awk '{print $1+1}'`
 	echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &"
 	echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &"
         if [ $I -gt 6 ]; then
             echo "date"
             echo "wait"
             I=0
         fi
     done > $gp.jobs
     echo "date" >> $gp.jobs
     echo "wait" >> $gp.jobs
 
     time sh -x $gp.jobs > $gp.jobs.log 2>&1
     #   real    15m15.424s
 
     export mz=multiz7way
     export gp=refGene
     export db=hg38
     time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
     #   real    0m23.119s
     time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
     #   real    1m15.547s
 
     du -hsc exonAA exonNuc refGene*.fa.gz
     #  59M     exonAA
     #  101M    exonNuc
     #  59M     refGene.multiz7way.exonAA.fa.gz
     #  101M    refGene.multiz7way.exonNuc.fa.gz
     #  317M    total
 
     rm -rf exonAA exonNuc
 
     # we're only distributing exons at the moment
     export mz=multiz7way
     export gp=refGene
     export db=hg38
     export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
     mkdir -p $pd
     md5sum *.fa.gz > md5sum.txt
     ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
     ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
     ln -s `pwd`/md5sum.txt $pd/
 
     ### And knownCanonical
     cd /hive/data/genomes/hg38/bed/multiz7way/pal
     export mz=multiz7way
     export gp=knownCanonical
     export db=hg38
     mkdir exonAA exonNuc ppredAA ppredNuc knownCanonical
 
     cut -f1 ../../../chrom.sizes | while read C
     do
         echo $C
 	hgsql hg38 -N -e "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$C'" > knownCanonical/$C.known.bed
     done
 
     ls knownCanonical/*.known.bed | while read F
     do
       if [ -s $F ]; then
          echo $F | sed -e 's#knownCanonical/##; s/.known.bed//'
       fi
     done | while read C
     do
 	echo "date"
 	echo "mafGene -geneBeds=knownCanonical/$C.known.bed  $db $mz knownGene order.list stdout | \
 	    gzip -c > ppredAA/$C.ppredAA.fa.gz"
 	echo "mafGene -geneBeds=knownCanonical/$C.known.bed -noTrans $db $mz knownGene order.list stdout | \
 	    gzip -c > ppredNuc/$C.ppredNuc.fa.gz"
 	echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons -noTrans $db $mz knownGene order.list stdout | \
 	    gzip -c > exonNuc/$C.exonNuc.fa.gz"
 	echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons $db $mz knownGene order.list stdout | \
 	    gzip -c > exonAA/$C.exonAA.fa.gz"
     done > $gp.$mz.jobs
 
     time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1
     # real    72m58.133s
 
     rm *.known.bed
     mz=multiz7way
     gp=knownCanonical
     db=hg38
     zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz &
     zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz &
     zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz &
     zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
 
     rm -rf exonAA exonNuc ppredAA ppredNuc
 
     mz=multiz7way
     gp=knownCanonical
     db=hg38
     pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
     mkdir -p $pd
     ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
     ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
     cd  $pd
     md5sum *.exon*.fa.gz > md5sum.txt
 
 #############################################################################
 # wiki page for 7-way (DONE - 2014-06-04 - Hiram)
     mkdir /hive/users/hiram/bigWays/hg38.7way
     cd /hive/users/hiram/bigWays
     echo "hg38" > hg38.7way/ordered.list
     awk '{print $1}' /hive/data/genomes/hg38/bed/multiz7way/7way.distances.txt \
        >> hg38.7way/ordered.list
 
     # sizeStats.sh catches up the cached measurements required for data
     # in the tables.  They may already be done.
     ./sizeStats.sh hg38.7way/ordered.list
     # dbDb.sh constructs hg38.7way/Hg38_7-way_conservation_alignment.html
     ./dbDb.sh hg38 7way
     # sizeStats.pl constructs hg38.7way/Hg38_7-way_Genome_size_statistics.html
     ./sizeStats.pl hg38 7way
 
     # defCheck.pl constructs Hg38_7-way_conservation_lastz_parameters.html
     ./defCheck.pl hg38 7way
 
     # this constructs the html pages in hg38.7way/:
 # -rw-rw-r-- 1 4153 Jun  5 11:03 Hg38_7-way_conservation_alignment.html
 # -rw-rw-r-- 1 5833 Jun  5 11:04 Hg38_7-way_Genome_size_statistics.html
 # -rw-rw-r-- 1 3854 Jun  5 11:04 Hg38_7-way_conservation_lastz_parameters.html
 
     # add those pages to the genomewiki.  Their page names are the
     # names of the .html files without the .html:
 #  Hg38_7-way_conservation_alignment
 #  Hg38_7-way_Genome_size_statistics
 #  Hg38_7-way_conservation_lastz_parameters
 
     # when you view the first one you enter, it will have links to the
     # missing two.
 
 #############################################################################
 # GRC Incident database (DONE - 2014-06-14 - Hiram)
     # this procedure is run as a cron job in Hiram's account:
 
     #	33 09 * * * /hive/data/outside/grc/incidentDb/runUpdate.sh makeItSo
 
     # data comes from: ftp://ftp.ncbi.nlm.nih.gov/pub/grc/
     # processed by /hive/data/outside/grc/incidentDb/grcUpdate.sh
 
     # the table in the dataBase is: grcIncidentDb
     # which is the URL to the bb file, a single row:
     # http://genomewiki.ucsc.edu/images/7/7f/Hg38.grcIncidentDb.bb
 
 #############################################################################
 # RepeatMasker Visualization track (DONE - 2014-07-25 - Hiram)
     mkdir /hive/data/genomes/hg38/bed/rmskJoined
     cd /hive/data/genomes/hg38/bed/rmskJoined
 
     ln -s ../repeatMasker/hg38.sorted.fa.out .
     ln -s ../repeatMasker/hg38.fa.align.gz .
 
     # working on fixing this script for the next release of RM
     /scratch/data/RepeatMasker140131/util/nextVerRmToUCSCTables.pl \
             -out hg38.sorted.fa.out -align hg38.fa.align.gz
 
     hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/rmskJoined.sql \
         -renameSqlTable -verbose=4 -tab \
             -type=bed4+9 -as=$HOME/kent/src/hg/lib/rmskJoined.as hg38 \
                 rmskJoinedBaseline hg38.sorted.fa.join.bed \
                     > loadJoined.log 2>&1
 
     hgLoadSqlTab hg38 rmskAlignBaseline \
         /cluster/home/hiram/kent/src/hg/lib/rmskAlign.sql \
             hg38.fa.align.tsv > loadAlign.log 2>&1
 
     hgLoadOutJoined -verbose=2 hg38 hg38.sorted.fa.out > loadOut.log 2>&1
 
     featureBits -countGaps hg38 rmskJoinedBaseline
     #    2716777279 bases of 3209286105 (84.654%) in intersection
 
 ##############################################################################
 # LASTZ Macaca Mulatta RheMac2 (DONE - 2014-07-13 - braney)
     mkdir /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11
     cd /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11
 
     # best to always specify an exact path to lastz so we know which one is used
     # lastz default parameters are human-mouse parameters
 
     cat << '_EOF_' > DEF
 # human vs macaca mulatta
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
 # maximum M allowed with lastz is only 254
 BLASTZ_M=254
 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
 BLASTZ_O=600
 BLASTZ_E=150
 # other parameters from panTro2 vs hg18 lastz on advice from Webb
 BLASTZ_K=4500
 BLASTZ_Y=15000
 BLASTZ_T=2
 
 # TARGET: Human Hg38
 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
 SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
 SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
 SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
 SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY: Macaca Mulatta RheMac2
 SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit
 SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_IN_CONTIGS=0
 
 BASE=/hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11
 TMPDIR=/dev/shm
 '_EOF_'
     # << happy emacs
     time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
         `pwd`/DEF \
         -syntenicNet -fileServer=hgwdev \
 	-chainMinScore=5000 -chainLinearGap=medium \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
     #  Elapsed time: 141m36s
     cat fb.hg38.chainRheMac2Link.txt
     # 2455106923 bases of 3049335806 (80.513%) in intersection
 
     #   running the swap
     mkdir /hive/data/genomes/rheMac2/bed/blastz.hg38.swap
     cd /hive/data/genomes/rheMac2/bed/blastz.hg38.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         /hive/data/genomes/hg38/bed/lastzRheMac2.2014-07-11/DEF \
         -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1
     # 83m26.095s
     cat fb.rheMac2.chainHg38Link.txt
     # 2313950599 bases of 2646704109 (87.428%) in intersection
 #
 
 #########################################################################
 # LASTZ Chlorocebus sabaeus  (DONE - 2014-07-13 - braney)
     mkdir /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11
     cd /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11
 
     # best to always specify an exact path to lastz so we know which one is used
     # lastz default parameters are human-mouse parameters
 
     cat << '_EOF_' > DEF
 # human vs Chlorocebus sabaeus
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
 # maximum M allowed with lastz is only 254
 BLASTZ_M=254
 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
 BLASTZ_O=600
 BLASTZ_E=150
 # other parameters from panTro2 vs hg18 lastz on advice from Webb
 BLASTZ_K=4500
 BLASTZ_Y=15000
 BLASTZ_T=2
 
 
 # TARGET: Human Hg38
 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
 SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
 SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
 SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
 SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY Chlorocebus sabaeus chlSab2
 SEQ2_DIR=/scratch/data/chlSab2/chlSab2.2bit
 SEQ2_LEN=/scratch/data/chlSab2/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_IN_CONTIGS=0
 
 BASE=/hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11
 TMPDIR=/dev/shm
 '_EOF_'
     # << happy emacs
     time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
         `pwd`/DEF \
         -syntenicNet -fileServer=hgwdev \
 	-chainMinScore=5000 -chainLinearGap=medium \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1
     # Elapsed time: 142m4s
     cat fb.hg38.chainChlSab2Link.txt
     # 2573435303 bases of 3049335806 (84.393%) in intersection
 
     #   running the swap
     mkdir /hive/data/genomes/chlSab2/bed/blastz.hg38.swap
     cd /hive/data/genomes/chlSab2/bed/blastz.hg38.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         /hive/data/genomes/hg38/bed/lastzChlSab2.2014-07-11/DEF \
         -swap -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1
     # 88m48.411s
     cat fb.chlSab2.chainHg38Link.txt
     # 2429053010 bases of 2752019208 (88.264%) in intersection
 
 #########################################################################
 # SEGMENTAL DUPLICATIONS (DONE - 2014-08-13 - Hiram)
     # redmine issue: refs #13580
 
     # file received in email from Archana Natarajan Raja (araja at uw.edu)
     mkdir /hive/data/genomes/hg38/bed/genomicSuperDups
     cd /hive/data/genomes/hg38/bed/genomicSuperDups
 # -rw-r--r-- 1 16478617 Aug 11 16:18 GenomicSuperDup.tab
 
     # no longer filtering items smaller than 1,000 bases, see note
     # in redmine issue refs #13580
 # While the size of the 24 alignments are less than 1000 bases , the size of
 # their pairs to which they align are always >1000, you can confirm this by
 # looking at the value in column 22 in your table (alignB -ucsc format), will
 # always be >1000 bp . We are seeing this only now because there are lots of
 # new and resolved duplications added to hg38. Hence , I would recommend not
 # filtering these items and uploading the current set as is.
 
     # there is no chrEBV in the browser:
     grep -v chrEBV GenomicSuperDup.tab | sed -e 's/\t_\t/\t-\t/;' \
       | hgLoadBed hg38 genomicSuperDups stdin \
 	-sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
     #  Read 69894 elements of size 29 from stdin
 
     checkTableCoords  hg38 genomicSuperDups
     # <silence>  (the chrEBV was found with this check)
 
     featureBits -countGaps hg38 genomicSuperDups
     # 175429664 bases of 3209286105 (5.466%) in intersection
 
     featureBits -countGaps hg19 genomicSuperDups
     #  166092393 bases of 3137161264 (5.294%) in intersection
     featureBits -countGaps hg18 genomicSuperDups
     #  159204446 bases of 3107677273 (5.123%) in intersection
 
     featureBits -countGaps mm10 genomicSuperDups
     # 214917441 bases of 2730871774 (7.870%) in intersection
     featureBits -countGaps mm9 genomicSuperDups
     # 208214567 bases of 2725765481 (7.639%) in intersection
 
 ##############################################################################
 # cloneEnds (DONE - 2014-08-14 - Hiram)
 
     mkdir /hive/data/genomes/hg38/bed/cloneEnds
     cd /hive/data/genomes/hg38/bed/cloneEnds
 
     # fetch the NCBI INSDC name correspondence file:
     rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000001405.26.assembly.txt ./
 
     # fetch the clone reports
     mkdir reports
     rsync -a -P \
 rsync://ftp.ncbi.nih.gov/repository/clone/reports/Homo_sapiens/*.GCF_000001405.26.106.*.gff \
        ./reports/
 
     # script to establish refSeq to UCSC chrom names:
 
     cat << '_EOF_' > refSeqNames.pl
 #!/usr/bin/env perl
 
 use strict;
 use warnings;
 
 open (FH, "<GCF_000001405.26.assembly.txt") or die "can not read GCF_000001405.26.assembly.txt";
 while (my $line = <FH>) {
   chomp $line;
   next if ($line =~ m/^#/);
   my @a = split('\t', $line);
   my $chrN = $a[2];
   my $refSeq = $a[6];
   my $contig = $a[4];
   my $type = $a[1];
   next if (!defined $type);
   next if (!defined $refSeq);
   next if (!defined $contig);
   my $suffix = "";
   if ($type eq "alt-scaffold") {
      $suffix = "_alt";
   } elsif ($type eq "unlocalized-scaffold") {
      $suffix = "_random";
   } elsif ($type eq "unplaced-scaffold") {
      $chrN = "Un";
   }
   $chrN = "M" if ($chrN eq "MT");
   if ($a[0] =~ m/_/) {
     $contig =~ s/\./v/;
     printf "%s\tchr%s_%s%s\n", $refSeq, $chrN, $contig, $suffix;
   } else {
     printf "%s\tchr%s\n", $refSeq, $chrN;
   }
 }
 close (FH);
 '_EOF_'
     # << happy emacs
 
     chmod +x refSeqNames.pl
 
     ./refSeqNames.pl > refSeq.ucscName.tab
 
     # establish full library list:
     ls reports/*.GCF_000001405.26.106.*.gff | sed -e 's#reports/##' \
        | cut -d"." -f1 | sort -u > library.list.txt
 
     # a script to scan the GFF files, with the refSeq.ucscName.tab
     # name correspondence to construct bed files
 
    # hg38.pl: convert NCBI clone-end GFF reports into BED; paired bed12
    # items go to stdout, per-end bed6 lines and diagnostics go to stderr
    cat << '_EOF_' > hg38.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $argc = scalar(@ARGV);

# require at least one GFF report file argument
if ($argc < 1) {
  printf STDERR "usage: ./hg38.pl <report.gff> [moreReports.gff]\n";
  exit 255;
}

my %refSeqToUcsc;   # key is refSeq name, value is UCSC chrom name
open (FH, "<refSeq.ucscName.tab") or die "can not read refSeq.ucscName.tab";
while (my $line = <FH>) {
  chomp $line;
  my ($refSeq, $ucsc) = split('\t', $line);
  $refSeqToUcsc{$refSeq} = $ucsc;
}
close (FH);

my %chromSizes;    # key is UCSC chrom name, value is chrom size
open (FH, "</hive/data/genomes/hg38/chrom.sizes") or die "can not read hg38/chrom.sizes";
while (my $line = <FH>) {
  chomp $line;
  my ($chr, $size) = split('\t', $line);
  $chromSizes{$chr} = $size;
}
close (FH);
 
# process each GFF report; the start/end/parent state is per-file
while (my $file = shift) {
my %starts;   # key is parent ID, value is start end coordinates start,end
my %ends;	# key is parent ID, value is end end coordinates start,end
my %parents;	# key is parent ID, value is UCSC chrom of the parent clone_insert
my %endNames;   # key is parent ID, value is the Name of the parent clone_insert

printf STDERR "# processing $file\n";

open (FH, "<$file") or die "can not read $file";
while (my $line = <FH>) {
  chomp $line;
  next if ($line=~ m/^#/);
  my @a = split('\t', $line);
  next if (scalar(@a) < 1);
  # column 1 is the sequence id, e.g. ref|NC_000001.11| -> NC_000001.11
  my $contig = $a[0];
  $contig =~ s/ref.//;
  $contig =~ s/\|//;
  my $ucscChr = $refSeqToUcsc{$contig};
  if (!defined($ucscChr)) {
    printf STDERR "# ERR: contig not in refSeqToUcsc: '$contig'\n";
    next;
  }
  next if (! exists($chromSizes{$ucscChr}));
  my $chromSize = $chromSizes{$ucscChr};
  # GFF coordinates are 1-based; convert start to 0-based for BED and
  # clip anything running past the chromosome end
  my $chromStart = $a[3] - 1;
  my $chromEnd = $a[4];
  if ($chromStart > $chromSize) {
    printf STDERR "# warning chromStart over size $ucscChr $chromStart $chromEnd\n";
    $chromStart = $chromSize-1;
  }
  if ($chromEnd > $chromSize) {
    my $overRun = $chromEnd - $chromSize;
    printf STDERR "# warning chromEnd over size by $overRun -> $ucscChr $chromStart $chromEnd\n";
    $chromEnd = $chromSize;
  }
  # parse the attribute column (9) for the ID, Parent and Name tags
  my $id="notFound";
  my $name="notFound";
  my $parent="notFound";
  my @b = split(';', $a[8]);
  for (my $i = 0; $i < scalar(@b); ++$i) {
     my ($tag, $value) = split('=', $b[$i]);
     if ($tag eq "ID") {
        $id = $value;
        # IDs without a '-' are parent clone_insert records
        if ($id !~ m/-/) {
          if (exists($parents{$id})) {
            printf STDERR "# WARN: duplicate parent: $id";
          } else {
            $parents{$id} = $ucscChr;
          }
        }
     } elsif ($tag eq "Parent") {
        $parent = $value;
     } elsif ($tag eq "Name") {
        $name = $value;
     }
  }
  # record start/end coordinates under the parent ID, verifying the end
  # is on the same chrom as its declared parent
  my $type="notFound";
  my $insertType = $a[2];
  if ($insertType =~ m/clone_insert_start/) {
     $type = "start";
     if ($parent eq "notFound") {
       printf STDERR "# ERR: can not find parent for start $name Ttype $id\n";
     } else {
       if (!exists($parents{$parent})) {
         printf STDERR "# ERR: start found $name  with no parent $parent declared\n";
       } elsif (exists($starts{$parent})) {
         printf STDERR "# ERR: duplicate start for $parent\n";
       } elsif ($ucscChr eq $parents{$parent}) {
         $starts{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd);
       } else {
         printf STDERR "# ERR: start on different chrom $ucscChr than parent $parent $parents{$parent}\n";
       }
     }
  } elsif ($insertType =~ m/clone_insert_end/) {
     $type = "end";
     if ($parent eq "notFound") {
       printf STDERR "# ERR: can not find parent for end $name Ttype $id\n";
     } else {
       if (!exists($parents{$parent})) {
         printf STDERR "# ERR: end found $name  with no parent $parent declared\n";
       } elsif (exists($ends{$parent})) {
         printf STDERR "# ERR: duplicate end for $parent\n";
       } elsif ($ucscChr eq $parents{$parent}) {
         $ends{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd);
       } else {
         printf STDERR "# ERR: end on different chrom $ucscChr than parent $parent $parents{$parent}\n";
       }
     }
  } elsif ($insertType =~ m/clone_insert/) {
     $type = "insert";
     $endNames{$id} = $name;
  }
  # every input item also goes to stderr as a bed6 line (collected by the
  # caller into the .items.bed6 files)
  $name =~ s/gi\|//g;
  $id =~ s/gi\|//g;
  printf STDERR "%s\t%d\t%d\t%s_%s_%s\t0\t%s\n", $ucscChr, $chromStart, $chromEnd, $name, $type, $id, $a[6];
}	# while (my $line = <FH>)

close (FH);
 
# pair each parent's start and end into one bed12 item with two blocks;
# strand is '-' when the end maps before the start
foreach my $parent (keys %parents) {
  if (! exists($starts{$parent}) ) {
    printf STDERR "# ERR: no start for $parent\n";
  } elsif (! exists($ends{$parent}) ) {
    printf STDERR "# ERR: no end for $parent\n";
  } else {
    my $strand = "+";
    my $chrStart = 0;
    my $chrEnd = 0;
    my $blockStart = 0;
    my ($sStart, $sEnd) = split('\t', $starts{$parent});
    my ($eStart, $eEnd) = split('\t', $ends{$parent});
    my $startSize = $sEnd - $sStart;
    my $endSize = $eEnd - $eStart;
    if ($eStart < $sStart) {
      $chrStart = $eStart;
      $chrEnd = $sEnd;
      $blockStart = $sStart - $chrStart;
      $strand = "-";
      $startSize = $eEnd - $eStart;
      $endSize = $sEnd - $sStart;
    } else {
      $chrStart = $sStart;
      $chrEnd = $eEnd;
      $blockStart = $eStart - $chrStart;
    }
    # overlapping ends can not form two bed blocks; skip with a note
    if ($startSize > $blockStart) {
      printf STDERR "# startSize > blockStart $endNames{$parent}\n";
    } else {
      printf "%s\t%d\t%d\t%s\t0\t%s\t%d\t%d\t0\t2\t%d,%d\t0,%d\n", $parents{$parent}, $chrStart, $chrEnd, $endNames{$parent}, $strand, $chrStart, $chrEnd, $startSize, $endSize, $blockStart;
    }
  }
}
}
'_EOF_'
    # << happy emacs

    chmod +x hg38.pl
 
     # process GFF files into bed files into separateLibs/ directory
 for L in `cat library.list.txt`
 do
    export destDir="separateLibs/${L}"
    echo "working: ${L}" 1>&1
    mkdir -p "${destDir}"
    ./hg38.pl reports/${L}.GCF_000001405.26.106.*.gff \
        2> ${destDir}/tmp.bed6 | sort -k1,1 -k2,2n > ${destDir}/hg38.${L}.bed
    sort -k1,1 -k2,2n ${destDir}/tmp.bed6 > ${destDir}/hg38.${L}.items.bed6
 done
 
     # use only those libraries with more than 20,000 clone ends
     wc -l separateLibs/*/*.bed | sort -n | grep -v total | awk '$1 > 20000' \
         | sed -e 's#.*separateLibs/##; s#/.*##' > libs.over20K.list
 
     # note those libraries with less than 20,000 clone ends
     wc -l separateLibs/*/*.bed | grep -v total | awk '$1 < 20000' | sed -e 's#.*separateLibs/##; s#/.*##' > libs.under20K.list
 
     # filter out bad ends, length must be <= median size times three
     cat libs.over20K.list | while read D
 do
    if [ ! -s separateLibs/${D}/lengths.txt ]; then
       awk '{print $3-$2}' separateLibs/${D}/hg38.${D}.bed \
         > separateLibs/${D}/lengths.txt
    fi
    median3X=`ave separateLibs/${D}/lengths.txt | grep median | awk '{printf "%d", $2*3}'`
    awk '($3-$2) < '$median3X'' separateLibs/${D}/hg38.${D}.bed > separateLibs/${D}/hg38.median3X.bed
    awk '($3-$2) >= '$median3X'' separateLibs/${D}/hg38.${D}.bed > separateLibs/${D}/hg38.badMap.bed
    before=`cat separateLibs/${D}/hg38.${D}.bed | wc -l`
    after=`cat separateLibs/${D}/hg38.median3X.bed | wc -l`
    dropped=`echo $before $after | awk '{print $1-$2}'`
    perCent=`echo $dropped $before | awk '{printf "%.2f", 100*'$dropped/$before'}'`
    echo "$D $before - $after = $dropped -> % $perCent dropped"
 done
 
 #  ABC20 24692 - 24474 = 218 -> % 0.88 dropped
 #  RP11 86660 - 85903 = 757 -> % 0.87 dropped
 #  CTD 95853 - 94941 = 912 -> % 0.95 dropped
 #  CH17 105618 - 105060 = 558 -> % 0.53 dropped
 #  ABC21 182154 - 180973 = 1181 -> % 0.65 dropped
 #  ABC22 189939 - 188743 = 1196 -> % 0.63 dropped
 #  COR02 208263 - 206782 = 1481 -> % 0.71 dropped
 #  ABC18 325080 - 322904 = 2176 -> % 0.67 dropped
 #  ABC27 334178 - 331822 = 2356 -> % 0.71 dropped
 #  ABC24 398944 - 395776 = 3168 -> % 0.79 dropped
 #  ABC23 436965 - 433896 = 3069 -> % 0.70 dropped
 #  ABC16 452220 - 449101 = 3119 -> % 0.69 dropped
 #  COR2A 583008 - 578578 = 4430 -> % 0.76 dropped
 #  WI2 587165 - 582843 = 4322 -> % 0.74 dropped
 #  ABC7 649297 - 644071 = 5226 -> % 0.80 dropped
 #  ABC11 729962 - 724864 = 5098 -> % 0.70 dropped
 #  ABC9 755994 - 750648 = 5346 -> % 0.71 dropped
 #  ABC12 777816 - 771827 = 5989 -> % 0.77 dropped
 #  ABC10 787969 - 781331 = 6638 -> % 0.84 dropped
 #  ABC13 810822 - 803589 = 7233 -> % 0.89 dropped
 #  ABC14 845573 - 839126 = 6447 -> % 0.76 dropped
 #  ABC8 1204275 - 1192784 = 11491 -> % 0.95 dropped
 
    # loading the median3X files
 for L in `cat libs.over20K.list`
 do
     echo $L 1>&2
     hgLoadBed -type=bed12 hg38 cloneEnd${L} \
        separateLibs/${L}/hg38.median3X.bed \
         > separateLibs/loadBed.${L}.log 2>&1
 done
 
    # loading the dropped ends:
    mkdir /hive/data/genomes/hg38/bed/cloneEnds/droppedTooBig
    # link them to here
    cat ../libs.over20K.list | while read L
 do
   ln -s ../separateLibs/${L}/hg38.badMap.bed ${L}.badMap.bed
 done
   # then load
   hgLoadBed -type=bed12 hg38 cloneEndbadEnds *.badMap.bed
 
     # construct multiple mapped ends:
 for L in `cat libs.over20K.list`
 do
     cat separateLibs/${L}/hg38.median3X.bed
 done | sort -k4 > allEnds.bed
 
     cut -f4 allEnds.bed | sort | uniq -c | sort -rn > allEnds.names.count.txt
 
     awk '$1 > 1' allEnds.names.count.txt | awk '{print $2}' \
        | sort > multiples.names.txt
 
     join -t'	' -o "2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12" \
        -2 4 multiples.names.txt allEnds.bed | sort -k1,1 -k2,2n \
            > allEnds.multiple.locations.bed
 
     hgLoadBed -type=bed12 hg38 cloneEndmultipleMaps \
         allEnds.multiple.locations.bed > load.multipleMaps.log 2>&1
 
     awk '$6 == "+"' allEnds.bed | sort -k1,1 -k2,2n \
       | bedItemOverlapCount hg38 stdin > allEnds.forward.bedGraph
 
     awk '$6 == "-"' allEnds.bed | sort -k1,1 -k2,2n \
       | bedItemOverlapCount hg38 stdin > allEnds.reverse.bedGraph
 
     bedGraphToBigWig allEnds.forward.bedGraph \
        /hive/data/genomes/hg38/chrom.sizes \
          cloneEndcoverageForward.bw
 
     bedGraphToBigWig allEnds.reverse.bedGraph \
        /hive/data/genomes/hg38/chrom.sizes \
           cloneEndcoverageReverse.bw
 
     mkdir /gbdb/hg38/bbi/cloneEnd
     ln -s `pwd`/cloneEndcoverageForward.bw /gbdb/hg38/bbi/cloneEnd
     ln -s `pwd`/cloneEndcoverageReverse.bw /gbdb/hg38/bbi/cloneEnd
 
     hgBbiDbLink hg38 cloneEndcoverageForward \
         /gbdb/hg38/bbi/cloneEnd/cloneEndcoverageForward.bw
     hgBbiDbLink hg38 cloneEndcoverageReverse \
         /gbdb/hg38/bbi/cloneEnd/cloneEndcoverageReverse.bw
 
     ### Fixup the scores to indicate how many multiple mappings as mentioned
     ### in the hg19 bacEnds description page: one mapping: score = 1000
     ### multiple mappings: score = 1500/count
     ### the sort | uniq -c | awk does this score calculation with the name
     ###   in column 1
     ### The join puts the existing table together with those scores
     ### DONE - 2015-06-18 - Hiram
 
     mkdir /hive/data/genomes/hg38/bed/cloneEnds/addCounts
     cd /hive/data/genomes/hg38/bed/cloneEnds/addCounts
     mkdir score withScore noScore withScore
     for table in cloneEndABC10 cloneEndABC11 cloneEndABC12 cloneEndABC13 \
 cloneEndABC14 cloneEndABC16 cloneEndABC18 cloneEndABC20 cloneEndABC21 \
 cloneEndABC22 cloneEndABC23 cloneEndABC24 cloneEndABC27 cloneEndABC7 \
 cloneEndABC8 cloneEndABC9 cloneEndCH17 cloneEndCOR02 cloneEndCOR2A \
 cloneEndCTD cloneEndRP11 cloneEndWI2 cloneEndbadEnds cloneEndmultipleMaps
 do
   hgsql -N -e "select name from $table;" hg38 | sort | uniq -c |
       awk '{ if (1 == $1) {printf "%s\t1000\n", $2} else {printf "%s\t%d\n", $2, 1500/$1} }' \
          | sort > score/hg38.$table.score.tab
   hgsql -N -e "select * from $table order by name;" hg38 \
       | sort -k5 > noScore/hg38.$table.tab
   join -t'^I' -1 5 noScore/hg38.$table.tab score/hg38.$table.score.tab \
   | awk '{printf "%d\t%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n", $2,$3,$4,$5,$1,$14,$7,$8,$9,$10,$11,$12,$13}' \
     | sort -k2,2 -k3,3n > withScore/hg38.$table.withScore.tab
   hgsql -e "delete from $table;" hg38
   hgsql -e "load data local infile \"withScore/hg38.$table.withScore.tab\" into table $table;" hg38
 done
 
 ##############################################################################
 # SIB Transcriptome (DONE 2014-08-27 Steve)
 
     # Create working directory and download data from where Christian
     # Iseli (christian.iseli@unil.ch) put it, and unpack.
     mkdir -p /hive/data/genomes/hg38/bed/sibTranscriptome
     cd /hive/data/genomes/hg38/bed/sibTranscriptome
     wget --timestamping http://ludwig-sun1.unil.ch/~chris/HTr.gtf.gz
     wget --timestamping http://ludwig-sun1.unil.ch/~chris/txg.tar.gz
 
     tar -zxvf txg.tar.gz
 
     zcat HTr.gtf.gz | ldHgGene hg38 sibGene stdin
     # Reading stdin
     # Read 208508 transcripts in 2824960 lines in 1 files
     # 208508 groups 25 seqs 1 sources 2 feature types
     # 208508 gene predictions
 
     # Do a little data cleanup and transformation and load splice graphs
     # into database.
     sed 's/altGraphX/sibTxGraph/' ~/kent/src/hg/lib/altGraphX.sql > sibTxGraph.sql
     cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb \
       -sqlTable=sibTxGraph.sql hg38 sibTxGraph stdin
     # Reading stdin
     # Read 47817 elements of size 18 from stdin
     # Sorted
     # Creating table definition for sibTxGraph from sql: sibTxGraph.sql
     # Saving bed.tab
     # Loading hg38
 
     # Create sibAltEvents track for analyzed alt-splices.
     # Not on RR for hg18 and hg19, so do not push it out
     cat txg/*.txg | txgAnalyze stdin /cluster/data/hg38/hg38.2bit sibAltEvents.bed
     awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed
     hgLoadBed hg38 sibAltEvents foo.bed
     # Reading foo.bed
     # Read 452436 elements of size 6 from foo.bed
     # Sorted
     # Creating table definition for sibAltEvents, bedSize: 6
     # Saving bed.tab
     # Loading hg38
 
     # push sibGene and sibTxGraph for hg38
 
 ############################################################################
 # Orangutan Lastz run (DONE - 2014-05-27 - Hiram)
     screen -S hg38PonAbe2      # use a screen to manage this longish running job
     mkdir /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02
     cd /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02
 
     # always set the BLASTZ program so we know what version was used
     cat << '_EOF_' > DEF
 # human vs chimp
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz
 BLASTZ_O=600
 BLASTZ_E=150
 # maximum M allowed with lastz is only 254
 BLASTZ_M=254
 
 BLASTZ_T=2
 BLASTZ_Y=15000
 BLASTZ_K=4500
 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
 #    A    C    G    T
 #    90 -330 -236 -356
 #  -330  100 -318 -236
 #  -236 -318  100 -330
 #  -356 -236 -330   90
 
 # TARGET: Human Hg38
 SEQ1_DIR=/scratch/data/hg38/hg38.2bit
 SEQ1_LEN=/scratch/data/hg38/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_IN_CONTIGS=0
 
 # QUERY: Orangutan PonAbe2
 SEQ2_DIR=/hive/data/genomes/ponAbe2/ponAbe2.2bit
 SEQ2_LEN=/hive/data/genomes/ponAbe2/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 SEQ2_LIMIT=100
 SEQ2_IN_CONTIGS=0
 
 BASE=/hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02
 TMPDIR=/dev/shm
 '_EOF_'
 
     time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
         -chainMinScore=5000 -chainLinearGap=medium \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -syntenicNet) > do.log 2>&1
     # real    144m46.575s
     cat fb.hg38.chainPonAbe2Link.txt
     # 2719618310 bases of 3049335806 (89.187%) in intersection
 
     # filter with doRecipBest.pl
     time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
         hg38 ponAbe2) > rbest.log 2>&1
     # real    60m1.060s
     time (doRecipBest.pl -load -continue=load -workhorse=hgwdev \
 	-buildDir=`pwd` hg38 ponAbe2) > loadRBest.log 2>&1 &
     # real    3m35.834s
 
     cat fb.hg38.chainRBestPonAbe2Link.txt
     # 2538296592 bases of 3049335806 (83.241%) in intersection
 
     # running the swap
     mkdir /hive/data/genomes/ponAbe2/bed/blastz.hg38.swap
     cd /hive/data/genomes/ponAbe2/bed/blastz.hg38.swap
     time (doBlastzChainNet.pl -verbose=2 \
         -swap /hive/data/genomes/hg38/bed/lastzPonAbe2.2014-09-02/DEF \
         -chainMinScore=5000 -chainLinearGap=medium \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -syntenicNet) > swap.log 2>&1
     # real    102m27.866s
     cat fb.ponAbe2.chainHg38Link.txt
     #  2773568958 bases of 3093572278 (89.656%) in intersection
 
     time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
         ponAbe2 hg38) > rbest.log 2>&1
     # real    78m47.312s
 
 
 
 
 #############################################################################
 # Add chrX alts to par (DONE 2014-10-14 angie)
 # Thanks to Hiram for pointing out that intersecting chrX positions in
 # altLocations and par shows whether a chrX alt overlaps a PAR.
     cd /hive/data/genomes/hg38/bed/par
     hgsql hg38 -e 'select * from altLocations where chrom = "chrX"'
 #+-----+-------+------------+----------+---------------------+
 #| bin | chrom | chromStart | chromEnd | name                |
 #+-----+-------+------------+----------+---------------------+
 #|  73 | chrX  |     319337 |   601516 | chrX_KI270880v1_alt |
 #|  73 | chrX  |     326487 |   601516 | chrX_KI270913v1_alt |
 #| 149 | chrX  |   79965153 | 80097082 | chrX_KI270881v1_alt |
 #+-----+-------+------------+----------+---------------------+
     hgsql hg38 -e 'select * from par where chrom = "chrX"'
 #+-----+-------+------------+-----------+------+
 #| bin | chrom | chromStart | chromEnd  | name |
 #+-----+-------+------------+-----------+------+
 #|   9 | chrX  |      10000 |   2781479 | PAR1 |
 #| 221 | chrX  |  155701382 | 156030895 | PAR2 |
 #+-----+-------+------------+-----------+------+
     # chrX_KI270880v1_alt and chrX_KI270913v1_alt are entirely contained in PAR1;
     # chrX_KI270881v1_alt is not in either PAR.
     hgsql hg38 -e 'select chrom,size from chromInfo \
                      where chrom in ("chrX_KI270880v1_alt", "chrX_KI270913v1_alt");'
 #+---------------------+--------+
 #| chrom               | size   |
 #+---------------------+--------+
 #| chrX_KI270880v1_alt | 284869 |
 #| chrX_KI270913v1_alt | 274009 |
 #+---------------------+--------+
     # Process that into bed4 with name=PAR1:
     hgsql hg38 -NBe 'select chrom, 0, size, "PAR1" from chromInfo \
                        where chrom in ("chrX_KI270880v1_alt", "chrX_KI270913v1_alt");' \
       >> hg38Par.bed4
     hgLoadBed hg38 par hg38Par.bed4
     checkTableCoords hg38 par
 
 
 #############################################################################
 # LASTZ Cow bosTau8 (DONE - 2014-10-15 - Steve)
    mkdir /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15
     cd /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15
 
     cat << '_EOF_' > DEF
 # human vs cow
 # maximum M allowed with lastz is only 254
 BLASTZ_M=254
 
 # TARGET: Human hg38
 SEQ1_DIR=/scratch/data/hg38/hg38.2bit
 SEQ1_LEN=/scratch/data/hg38/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Cow bosTau8
 SEQ2_DIR=/hive/data/genomes/bosTau8/bosTau8.2bit
 SEQ2_LEN=/hive/data/genomes/bosTau8/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 
 BASE=/hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         `pwd`/DEF \
         -syntenicNet \
         -noLoadChainSplit \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
     # real    602m37.523s
     cat fb.hg38.chainBosTau8Link.txt
     # 1401921010 bases of 3049335806 (45.975%) in intersection
     # Create link
     cd /hive/data/genomes/hg38/bed
     ln -s  lastzBosTau8.2014-10-15 lastz.bosTau8
 
     #   running the swap
     mkdir /hive/data/genomes/bosTau8/bed/blastz.hg38.swap
     cd /hive/data/genomes/bosTau8/bed/blastz.hg38.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
         /hive/data/genomes/hg38/bed/lastzBosTau8.2014-10-15/DEF \
         -swap  -syntenicNet \
         -noLoadChainSplit \
         -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
         -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
     #   real     116m32.121s
     cat fb.bosTau8.chainHg38Link.txt
     #   1336307377 bases of 2649307237 (50.440%) in intersection
     cd /hive/data/genomes/bosTau8/bed
     ln -s blastz.hg38.swap lastz.hg38
 
 ############################################################################
 # NCBI ClinVar (new version -DONE - 2014-11-08 - Max)
 # see hg19.txt
 #########################################################################
 
 ########################################################################
 # CNV Developmental Delay track (2014-11-21 Steve)
 
     mkdir /hive/data/genomes/hg38/bed/cnvDevDelay
     cd /hive/data/genomes/hg38/bed/cnvDevDelay
 
 wget --timestamping 'ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd100_Coe_et_al_2014/gvf/nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz'
 wget --timestamping 'ftp://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/nstd54_Cooper_et_al_2011/gvf/nstd54_Cooper_et_al_2011.GRCh38.remap.all.germline.ucsc.gvf.gz'
 
 cp /kent/src/hg/utils/automation/gvfToBed8Attrs.pl .
 mv gvfToBed8Attrs.pl gvfToBed8AttrsCase.pl
 cp gvfToBed8AttrsCase.pl gvfToBed8AttrsControl100.pl
 cp gvfToBed8AttrsCase.pl gvfToBed8AttrsControl54.pl
 
 # made three local copies of Angie's gvf conversion script - one to include
 # only case individuals from nstd100, one to include only control individuals
 # from nstd100 and one to include only control individuals from nstd54
 
 # had to add an additional elsif statement to the nstd100 scripts to filter
 # based on sample_name field:
 
 #  } elsif ($tag eq "sample_name") {
 #    $sample_name = $val;
 #  }
 
 # added line 33/35 to each file:
 
 # next if ($sample_name eq "Unknown"); # keep only "case" individuals from nstd100
 # next if ($sample_name ne "Unknown"); # keep only "control" individuals from nstd100
 # next if ($phenotype ne "not_reported"); # keep only "control" individuals from nstd54
 
 zcat nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsCase.pl > cnvDevDelayAllCase.bed
 zcat nstd100_Coe_et_al_2014.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsControl100.pl > cnvDevDelayAllControl.bed
 zcat nstd54_Cooper_et_al_2011.GRCh38.remap.all.germline.ucsc.gvf.gz | gvfToBed8AttrsControl54.pl >> cnvDevDelayAllControl.bed
 
 # GRCh38 data from dbVar had different naming scheme for alternate chromosomes
 # (e.g., chr1|NT_187515.1 instead of chr1_KI270762v1_alt), so needed to write
 # a script to substitute the correct UCSC names
 
     cat << '_EOF_' > chromXref.pl
 #!/usr/bin/env perl
 
 use strict;
 use warnings;
 
 sub usage() {
   printf STDERR "usage: ./chromXref.pl <infile> <outfile>\n"
 }
 
 my $argc = scalar(@ARGV);
 
 if ($argc != 2) {
   usage;
   exit 255;
 }
 
 open (file1, "<hg38.xref") or die "cannot read hg38.xref";
 
 my @accArray = ();
 my $i = 0;
 while (my $line = <file1>) {
   chomp($line);
   my ($type, $chr, $acc1, $acc2) = split('\t', $line);
   ($type, undef) = split('-', $type);
   ($acc1, my $version) = split('\.', $acc1);
   if ($type eq "unlocalized") {
     $type = "random";
   }
   my $ucscAcc = "_" . $acc1 . "v" . $version . "_" . $type;
   $accArray[$i][0] = $ucscAcc;
   $accArray[$i][1] = $acc2;
   $i++;
 }
 
 close (file1);
 
 open (file2, "<$ARGV[0]") or die "cannot read $ARGV[0]";
 open (file3, ">$ARGV[1]") or die "cannot read $ARGV[1]";
 local $/;
 my $fileContents = <file2>;
 for ($i = 0; $i < scalar(@accArray); $i++) {
   my $temp1 = $accArray[$i][1];
   my $temp2 = $accArray[$i][0];
   if ($fileContents =~ m/\|$temp1/) {
     $fileContents =~ s/\|$temp1/$temp2/g;
   }
 }
 
 print file3 $fileContents;
 close (file2);
 close (file3);
 '_EOF_'
     # << happy emacs
 
 cp /hive/data/genomes/hg38/genbank/GCF_000001405.26.assembly.txt .
 
 cat GCF_000001405.26.assembly.txt | grep -v '^#\|assembled\|unplaced' | awk '{print $2 "\t" $3 "\t" $5 "\t" $7}' > hg38.xref
 
 chromXref.pl cnvDevDelayAllCase.bed cnvDevDelayAllCaseUcsc.bed
 chromXref.pl cnvDevDelayAllControl.bed cnvDevDelayAllControlUcsc.bed
 
 hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
         -allowStartEqualEnd hg38 cnvDevDelayCase cnvDevDelayAllCaseUcsc.bed
 
 hgLoadBed -tab -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed8Attrs.sql \
         -allowStartEqualEnd hg38 cnvDevDelayControl cnvDevDelayAllControlUcsc.bed
 
     checkTableCoords hg38 cnvDevDelayCase
     checkTableCoords hg38 cnvDevDelayControl
 
 
 #########################################################################
 # RETROFINDER RETROPOSED GENES ucscRetro track VERSION 9
 # (2015-01-12 - 2015-01-20, hartera, DONE)
 ssh hgwdev
 mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112
 cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112
 
 cat << '_EOF_' > DEF
 
 RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 "
 VERSION=9
 RUNDATE="2015-01-12"
 DB=hg38
 SCORETHRESH=550
 GENOMENAME='Homo sapiens'
 GBDB=hg
 DATE=20150112
 RUNDIR=/hive/groups/gencode/pseudogenes/retroFinder/$DB.$DATE
 BINDIR=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/bin
 KENTDIR=/cluster/home/hartera/kent
 KENTBINDIR=/cluster/home/hartera/bin/x86_64
 MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz.$VERSION
 TMPMRNA=$RUNDIR/mrnaBlastz/$DB
 TMPEST=$RUNDIR/est/$DB
 USEALTSEQS=0
 EST=all_est
 SPLICED_EST=intronEst
 SPLIT_EST=0
 SPLIT_SPLICED_EST=0
 LASTZPROG=/cluster/bin/penn/x86_64/lastz
 SCRIPT=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/src/pipeline
 GENOME=/hive/data/genomes
 RETRODIR=$GENOME/$DB/bed/retro
 BASE=$RUNDIR/retro
 OUTDIR=${BASE}/version${VERSION}/${DB}
 RESULT=$OUTDIR/result
 RESULTSPLIT=$OUTDIR/resultSplit
 LOG=$OUTDIR/log
 OUT=$OUTDIR/out
 OVERLAPDIR=$OUTDIR/run.o
 TABLE=ucscRetroInfo$VERSION
 ORTHOTABLE=ucscRetroOrtho$VERSION
 ALIGN=ucscRetroAli$VERSION
 LOCAL=/scratch/data/$DB
 TWOBIT=$GENOME/$DB/$DB.2bit
 RMSK=rmsk
 NET1=netMm10
 NET2=netCanFam3
 NET3=netRheMac3
 # these two nets determine which retros are classified as ancient,
 # use two farthest nets
 ANCIENT1=netMm10
 ANCIENT2=netCanFam3
 GENE1=knownGene
 GENE2=refGene
 GENE3=wgEncodeGencodeCompV19
 CLUSTER=ku
 SPECIES="hg38 mm10"
 ROOTDIR="/cluster/home/hartera/public_html/retro/hg38Jun14"
 WEBROOT=$ROOTDIR/retro.$VERSION
 WEBSERVER=http://hgwdev-hartera.soe.ucsc.edu
 SHUFFLEDIR=shuffle
 SHUFFLEROOT=$WEBROOT/$SHUFFLEDIR
 DUPDIR=dups
 DUPROOT=$WEBROOT/$DUPDIR
 AGEDIR=age
 AGEROOT=$WEBROOT/$AGEDIR
 EXPDIR=exp
 GENEPFAM=knownGene
 PFAM=knownToPfam
 PFAMIDFIELD=name
 PFAMDOMAIN=value
 ALTSPICE=
 #ALTSPLICE=sibTxGraph
 SPLITBYAGE=$SCRIPT/splitRetrosByAge
 PDB=proteins140122
 #ARRAY=gnfAtlas2
 #AFFYPROBE="affyU133A,affyGnf1h"
 #ARRAYMEDIAN=hgFixed.gnfHumanAtlas2Median
 #ARRAYRATIO=hgFixed.gnfHumanAtlas2AllRatio
 #ARRAYABS=hgFixed.gnfHumanAtlas2All
 #ARRAYEXP=hgFixed.gnfHumanAtlas2MedianExps
 #ARRAYEXPALL=hgFixed.gnfHumanAtlas2AllExps
 #ARRAYLOOKUP=knownToGnfAtlas2
 #ARRAYPSLS="/hive/data/genomes/hg19/bed/geneAtlas2/affyU133A.psl /hive/data/genomes/hg19/bed/geneAtlas2/affyGnf1h.psl"
 '_EOF_'
     # << happy emacs
 chmod +x DEF
 
 mkdir -p /hive/data/genomes/hg38/bed/retro
 mkdir -p /hive/data/genomes/hg38/bed/mrnaBlastz.9
 mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/mrnaBlastz
 cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/mrnaBlastz
 cp ../DEF .
 
# Create S1.len file
# from chrom.sizes without random chroms or chrM, there are many alt loci also
# in hg38 that were not in hg19 so 285 chroms total.
 cat /hive/data/genomes/hg38/chrom.sizes | grep -v random \
    | grep -v chrUn | grep -v chrM > S1.len
 cp S1.len /hive/data/genomes/hg38/bed/mrnaBlastz.9
 
 screen
 # Run steps 1 to 5 of RetroFinder pipeline from scripts in CCDS SVN source tree:
 retroFinder/branches/version2/src/pipeline/ucscStep1.sh DEF
 # check cluster jobs on ku
 retroFinder/branches/version2/src/pipeline/ucscStep2.sh DEF
 retroFinder/branches/version2/src/pipeline/ucscStep3.sh DEF
 #check cluster jobs on ku
 retroFinder/branches/version2/src/pipeline/ucscStep4.sh DEF
 #check cluster jobs on ku
     # Load the track
 retroFinder/branches/version2/src/pipeline/ucscStep5.sh DEF
 cd /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/retro/version9/hg38
 retroFinder/branches/version2/src/pipeline/filterMrna.sh
 retroFinder/branches/version2/src/pipeline/filterEst.sh
 # Check cluster jobs on ku
 retroFinder/branches/version2/src/pipeline/analyseExpress.sh
 # Check cluster jobs on ku
 #added ucscRetroAli9 to kent/src/hg/makeDb/human/hg38/trackDb.ra
 # copied
 # /hive/groups/gencode/pseudogenes/retroFinder/hg38.20150112/retro/version9/hg38/trackDb.retro
 # entry to kent/src/hg/makeDb/trackDb/human/hg38/trackDb.ra and edited it to
 # remove the full date and add:
 # dataVersion Jan. 2015
 # Scripts copied ucscRetroAli9.psl, ucscRetroInfo9.bed and ucscRetroCds9.tab
 # to /hive/data/genomes/hg38/bed/retro/
 
 ##########
 # Make dbVar chrom to UCSC chrom lift file
 #  DONE braney 2/12/15
 cd /cluster/data/hg38/jkStuff
 sort /cluster/data/hg38/chrom.sizes > tmpChrom
 grep -v '^#\|assembled' /hive/data/genomes/hg38/genbank/GCF_000001405.26.assembly.txt | awk 'BEGIN {OFS="\t"} {print "chr" $3 "_" $5 "_" $2, "chr" $3 "|"$7}' | sed 's/-scaffold//' | sed 's/unlocalized/random/' | sed 's/_unplaced//' | sed 's/chrna/chrUn/g' | sed 's/\./v/'  | sort | join /dev/stdin tmpChrom | awk 'BEGIN {OFS="\t"} {print 0, $2, $3, $1, $3}'  > dbVar.lift
 awk 'BEGIN {OFS="\t"} {print 0, $1, $2, $1, $2}' /cluster/data/hg38/chrom.sizes >> dbVar.lift
 rm tmpChrom
 
 #########################################################################
 # UCSC to RefSeq name correspondence (DONE - 2015-04-13 - Hiram)
 
     mkdir /hive/data/genomes/hg38/bed/ucscToRefSeq
     cd /hive/data/genomes/hg38/bed/ucscToRefSeq
 
     # columns 5 and 7 are the INSDC and RefSeq names
 
     grep -v "^#" ../../genbank/GCF_000001405.26.assembly.txt \
       | awk -F'\t' '{printf "%s\t%s\n", $5,$7}'  | sort > insdc.refSeq.tab
 
     hgsql -N -e 'select name,chrom,chromStart,chromEnd from ucscToINSDC;' hg38 \
       | sort > insdc.ucsc.tab
 
     join insdc.ucsc.tab insdc.refSeq.tab | tr '[ ]' '[\t]' \
        | cut -f2- > ucsc.refSeq.tab
 
 
     export chrSize=`cut -f1 ucsc.refSeq.tab | awk '{print length($0)}' | sort -n | tail -1`
     sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql
     hgLoadSqlTab hg38 ucscToRefSeq ./ucscToRefSeq.sql ucsc.refSeq.tab
 
     checkTableCoords  hg38 -table=ucscToRefSeq
 
 #########################################################################
 #CREATE MICROSAT TRACK (DONE - 2015-05-22 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/hg38/bed/microsat
     cd /cluster/data/hg38/bed/microsat
 
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
        ../simpleRepeat/simpleRepeat.bed > microsat.bed
 
     hgLoadBed hg38 microsat microsat.bed
 
 #############################################################################
 # ENCODE Regulatory tracks  (Kate & Chris)
 
 # see reg.txt
 #########################################################################
 # GWIPS-viz Ribo-seq - (DONE - 2016-02-05 - Steve)
 # contact Audrey Michel (audreymannion@gmail.com)
 # redmine #16765
 
 obtained bigWig file from shared Google drive
 https://drive.google.com/a/soe.ucsc.edu/folderview?id=0B_xvV_5tXzOGQ1h5NEh4bnhNTDg&usp=sharing_eid
 
 mkdir /hive/data/genomes/hg38/bed/gwipsvizRiboseq
 cp Global_RiboProElong.10_02_2016.bw /hive/data/genomes/hg38/bed/gwipsvizRiboseq/gwipsvizRiboseq.bw
 
 mkdir /gbdb/hg38/bbi/gwipsvizRiboseq
 cd /gbdb/hg38/bbi/gwipsvizRiboseq
 ln -s /hive/data/genomes/hg38/bed/gwipsvizRiboseq/gwipsvizRiboseq.bw gwipsvizRiboseq.bw
 
 hgsql hg38
 create table gwipsvizRiboseq select * from gc5BaseBw;
 update gwipsvizRiboseq set fileName="/gbdb/hg38/bbi/gwipsvizRiboseq/gwipsvizRiboseq.bw" where fileName="/gbdb/hg38/bbi/gc5BaseBw/gc5Base.bw";
 
 #########################################################################
 # COSMIC v81 DONE Chris Eisenhart 2017-05-11
# Make a new COSMIC track for hg38
 mkdir /hive/data/outside/cosmic/hg38/v81
 cd /hive/data/outside/cosmic/hg38/v81
 
 # Get the new data
 sftp ceisenha@ucsc.edu@sftp-cancer.sanger.ac.uk
 # Login to SFTP server then run these commands
 get /files/grch38/cosmic/v81/CosmicMutantExport.tsv.gz
 
 # Remove the 'NS' fields, search for the \t after to exclude the E'NS'ST transcripts.
 zcat CosmicMutantExport.tsv.gz | sed 's/NS\t/\t/g' > cosMut.tsv
 
 # Use a script to convert to bed format.
 cosmicToBed cosMut.tsv cosMut.bed
 # This many lines were skipped, 131597 for not having genomic coordinate
 
 # Sort and convert to big bed using the .as file.
 sort -k1,1 -k2,2n cosMut.bed > sCosMut.bed
 bedToBigBed -type=bed4+31 -as=cosmicNew.as sCosMut.bed /hive/data/genomes/hg38/chrom.sizes cosMutHg38V81.bb -tab -extraIndex=name,cosmLabel
 
 # Link it up so the outside world can see it.
 cd /gbdb/hg38/cosmic/
 ln -s /hive/data/outside/cosmic/hg38/v81/cosMutHg38V81.bb .
 #########################################################################
 # hoffmanMappability hub import (2 super tracks) DONE Chris Eisenhart 2017-05-16
 mkdir /hive/data/outside/hoffmanMappability/hg38
 cd /hive/data/outside/hoffmanMappability/hg38
 wget https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/trackDb.txt
 # Get the trackDb file
 importTrackHub trackDb.txt hofMap.ra /gbdb/hg38/hoffmanMappability/ --trackDbPath=$HOME/kent/src/hg/makeDb/trackDb/human/hg38/ --test
 # Check that the commands are what we want, then run for real
 importTrackHub trackDb.txt hofMap.ra /gbdb/hg38/hoffmanMappability/ --trackDbPath=$HOME/kent/src/hg/makeDb/trackDb/human/hg38/
 # View the .ra file to make sure things are ok, here changed the groups to map,
 # added the alpha tags, and removed the 'show' from 'superTrack on show'
 cp hofMap.ra ~/kent/src/hg/makeDb/trackDb/human/hg38
 # Include hofMap.ra in the trackDb.ra file
 
 # the importTrackHub failed on redirection, fetch all the files manually:
 # 2017-09-15 - Hiram
 
 cd /hive/data/outside/hoffmanMappability/hg38
 
 grep bigDataUrl trackDb.txt | awk '{print $NF}' | sed -e 's#https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/##;' | while read F
 do
   echo $F
   rm -f $F
   wget --timestamping "https://www.pmgenomics.ca/hoffmanlab/proj/bismap/trackhub/hg38/${F}"
 done
     # real    29m40.429s
 
 #########################################################################
 # tcgaExpr super track Chris Eisenhart, DONE, 2017-05-17
 # tcgaTranscExpr
 # TCGA transcript level expression barChart track, from TOIL pipeline recompute (John Vivian)
 # biorxiv.org/content/biorxiv/early/2016/07/07/062497.full.pdf
 mkdir /hive/data/outside/tcgaBarcharts/
 mkdir /hive/data/outside/tcgaBarcharts/transcripts
 cd /hive/data/outside/tcgaBarcharts/transcripts
 
 # Get all the meta data
 cp ~max/projects/cirm/datasetPages/tcgaGtex/tcgaMeta.tab .
 # Cut out the meta data the script wants, sample name and group.
 cut -f 1,5 tcgaMeta.tab | sed 's/ /_/g' > tcgaLargeSamples.tsv
 
 # Get and clean the matrix
 cp ~max/projects/cirm/datasetPages/tcgaGtex/tcga.tpm.tab .
 # Clean up the transcript names (remove the .#)
 cut -f 1 tcga.tpm.tab | cut -f 1 -d "." > tcgaTranscripts.txt
 cut -f 2- tcga.tpm.tab > tcgaTpmValues.tsv
 paste tcgaTranscripts.txt tcgaTpmValues.tsv > tcgaMatrix.tsv
 
 # Build a coordinate map
 hgsql hg38 -e "select * from ensGene" | cut -f 2- | sort > ensGene
 hgsql hg38 -e "select * from ensemblToGeneName" | sort >  ensemblToGeneName
 join ensGene ensemblToGeneName | awk '{print $2"\t"$4"\t"$5"\t"$1"\t0\t"$3"\t"$16}' > coord.bed
 
 # Use the meta data, matrix, and coordinate map to generate a barchart bed
 time expMatrixToBarchartBed tcgaLargeSamples.tsv tcgaMatrix.tsv coord.bed tcgaTransExp.bed --groupOrder tcgaGroupOrder.txt
 
 # NOTE: Use the header line of the bed file to populate the barChartBars field in the trackDb.
 # The order of the labels in the barChartBars field should match the order of the labels in the
 # expScores column in the bed file header.
 
 # Sort and convert into a bigBed file.
 sort -k1,1 -k2,2n tcgaTransExp.bed > sortedTcgaTransExp.bed
 bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartTranscExp.as sortedTcgaTransExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaTransExp.bb
 
 # Link the files into gbdb
 cd /gbdb/hgFixed/human/expMatrix
 ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaLargeSamples.tsv tcgaLargeSamples.tab
 ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaMatrix.tsv tcgaMatrix.tab
 ln -s /hive/data/outside/tcgaBarcharts/transcripts/tcgaTransExp.bb .
 
 ###########3
 # Reload bigBed with a schema that will be shared with genes track, to support
 # configuration as subtracks in a composite
# (2017-08-30 kate)
 cd /hive/data/outside/tcgaBarcharts/transcripts
 bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExpr.as sortedTcgaTransExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaTranscExpr.hg38.bb
 mkdir /gbdb/hg38/tcga
 ln -s `pwd`/tcgaTranscExpr.hg38.bb /gbdb/hg38/tcga/tcgaTranscExpr.bb
 
 # TCGA gene level expression barChart track, from TOIL pipeline recompute (John Vivian)
 # tcgaGeneExpr
 mkdir ../genes
 cd ../genes
 
 # Get the gene matrix.
 cp ~max/projects/cirm/datasetPages/tcgaGtex/tcga.geneTpm.tab .
 
 # Make a coordinate file, the genes in gtexGeneModelV6 have .# versions which are
 # removed with the temp fils.
 hgsql hg38 -e "select * from hg38.gtexGeneModelV6" | awk '{print $3"\t"$5"\t"$6"\t"$2"\t0\t"$4"\t"$2}' > coord6+1.bed.temp
 cut -f 4 coord6+1.bed.temp | cut -f 1 -d "." > foo
 cut -f 1-3 coord6+1.bed.temp > foo2
 paste foo2 foo > foo3
 cut -f 5- coord6+1.bed.temp > foo4
 paste foo3 foo4 > coord6+1.bed
 # This bed file didn't have the right gene names (ENS rather than Hugo), fix it.
 hgsql hg38 -e "select * From knownCanonical" > foo
 wc foo
 cut -f 6 foo | cut -f 1 -d "."
 cut -f 6 foo | cut -f 1 -d "." > foo2
 head foo
 cut -f 1-3 foo > foo3
 paste foo2 foo3 > foo4
 cut -f 4- coord6+1.bed > foo5
 join <(sort foo5) <(sort foo4) | awk '{print $5"\t"$6"\t"$7"\t"$1"\t0\t"$3"\t"$4}' > coord6+1.3.bed
 
 # Generate the bed file, can use the same transcript file
 time expMatrixToBarchartBed ../transcripts/tcgaLargeSamples.tsv tcga.geneTpm.tab coord6+1.3.bed tcgaGeneExp.bed --groupOrder=../transcripts/tcgaGroupOrder.txt
 
 # Convert to big bed
 sort -k1,1 -k2,2n tcgaGeneExp.bed > sortedTcgaGeneExp.bed
 bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExp.as sortedTcgaGeneExp.bed /hive/data/genomes/hg38/chrom.sizes tcgaGeneExp.bb
 
 # Link to gbdb
 cd /gbdb/hgFixed/human/expMatrix
 ln -s /hive/data/outside/tcgaBarcharts/genes/tcgaGeneExp.bb .
 ln -s /hive/data/outside/tcgaBarcharts/genes/tcga.geneTpm.tab tcgaGeneMatrix.tab
 
 ###########3
 # Reload bigBed with a schema that will be shared with transcript track, to support
 # configuration as subtracks in a composite
 # Apparently Chris actually loaded the #3 file (added gene names, adjusted end coord apparently)
# (2017-08-30 kate)
 cd /hive/data/outside/tcgaBarcharts/genes
 bedToBigBed -type=bed6+5 -as=$HOME/kent/src/hg/lib/barChartGeneExpr.as sortedTcgaGeneExp3.bed /hive/data/genomes/hg38/chrom.sizes tcgaGeneExpr.hg38.bb
 mkdir /gbdb/hg38/tcga
 ln -s `pwd`/tcgaGeneExpr.hg38.bb /gbdb/hg38/tcga/tcgaGeneExpr.bb
 
 #########################################################################
 # gtexTransExp Chris Eisenhart, done, 2017-05-23
 # TCGA transcript level RNA-seq, from TOIL pipeline recompute (John Vivian)
 # biorxiv.org/content/biorxiv/early/2016/07/07/062497.full.pdf
 mkdir /hive/data/outside/gtex/barChartTrack
 cd /hive/data/outside/gtex/barChartTrack
 
 # Seems John included some TCGA data (CML) in the GTEx matrix and samples, the cleaning steps remove this.
 # Make a clean sample file
 cat ../johnVivianRecompute/sraToSample.txt | sed 's/ male /\tmale\t/g' | sed 's/ female /\tfemale\t/g' | cut -f 3 | sed 's/ - /-/g' | sed 's/ /_/g' > gtexSampleGroups.txt
 cat ../johnVivianRecompute/sraToSample.txt | cut -f 1 -d " " > gtexSampleNames.txt
paste gtexSampleNames.txt gtexSampleGroups.txt > gtexSamples.tsv
 grep -v '(CML)' gtexSamples.tsv > cleanGtexSamples.tsv
 
 # Make a clean matrix
 cut -f 1 ../johnVivianRecompute/gtex.tpm.tab | cut -f 1 -d "." > gtexTranscripts.txt
 cut -f 2- ../johnVivianRecompute/gtex.tpm.tab > gtexTpmValues.tsv
 paste gtexTranscripts.txt gtexTpmValues.tsv > gtexMatrix.tsv
 rowsToCols gtexMatrix.tsv tspsdGtexMatrix.tsv
 sort tspsdGtexMatrix.tsv > sortedTspsdGtexMatrix.tsv
 grep -v '(CML)' gtexSamples.tsv | cut -f 1 | sed 's/Run_s/#transcript/g' | sort > sortedCleanGtexSamples.tsv
 join sortedCleanGtexSamples.tsv sortedTspsdGtexMatrix.tsv > cleanTspsdGtexMatrix.tsv
rowsToCols cleanTspsdGtexMatrix.tsv cleanGtexMatrix.tsv
 
 # Build a coordinate map
 hgsql hg38 -e "select * from ensGene" | cut -f 2- | sort > ensGene
 hgsql hg38 -e "select * from ensemblToGeneName" | sort >  ensemblToGeneName
 join ensGene ensemblToGeneName | awk '{print $2"\t"$4"\t"$5"\t"$1"\t0\t"$3"\t"$16}' > coord.bed
 # NOTE: CHRISL10-05-2021 - the above ensGene steps weren't actually done or the files were removed,
 # there was a coord.tsv which I used instead so the below re-run could work
 tawk '{print $1,$2,$3,$4,0,$5,$6}' coord.tsv > coord.bed
 # END CHRISL10-05-2021 NOTE)
 
 # Get the gtex ordering
 hgsql hgFixed -e "select * from gtexTissue" | cut -f 3 | sed 's/ - /-/g' | sed 's/ /_/g' | sed '1D' > gtexGroupOrder.txt
 
 # Use the meta data, matrix, and coordinate map to generate a barchart bed
 # NOTE: CHRISL10-05-2021 - re-ran this step to fix float parsing bug:
 time expMatrixToBarchartBed cleanGtexSamples.tsv cleanGtexMatrix.tsv coord.bed gtexTransExp.bed --groupOrderFile gtexGroupOrder.txt
 
 # NOTE: Use the header line of the bed file to populate the barChartBars field in the trackDb.
 # The order of the labels in the barChartBars field should match the order of the labels in the
 # expScores column in the bed file header.
 
 # Sort and convert into a bigBed file.
 sort -k1,1 -k2,2n gtexTransExp.bed > sortedGtexTransExp.bed
 # NOTE: CHRISL10-05-2021 - re-ran bedToBigBed step with correct file names
 bedToBigBed -as=$HOME/kent/src/hg/lib/barChartBed.as -type=bed6+5 sortedGtexTransExp.bed /hive/data/genomes/hg38/chrom.sizes gtexTranscExpr.bb
 
 # Link the files into gbdb
 cd /gbdb/hgFixed/human/expMatrix
 ln -s /hive/data/outside/gtex/barChartTrack/cleanGtexSamples.tsv cleanGtexSamples.tab
 ln -s /hive/data/outside/gtex/barChartTrack/cleanGtexMatrix.tsv cleanGtexMatris.tab
 
# (2017-08-30 kate)
 cd /gbdb/hg38/gtex
 ln -s /hive/data/outside/gtex/barChartTrack/gtexTranscExpr.bb .
 
 #########################################################################
 # LASTZ human/hg38 vs. Zebrafish /danRer11
 #	(DONE - 2017-06-12 - Chris)
 
     mkdir /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12
     cd /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12
 
     printf '# human vs zebrafish danRer11
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
 BLASTZ_M=254
 
 # TARGET: human hg38
 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
 SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
 SEQ1_CTGDIR=/hive/data/genomes/hg38/hg38.contigs.2bit
 SEQ1_CTGLEN=/hive/data/genomes/hg38/hg38.contigs.chrom.sizes
 SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
 SEQ1_CHUNK=40000000
 SEQ1_LIMIT=20
 SEQ1_LAP=10000
 
 # QUERY: zebrafish danRer11
 SEQ2_DIR=/hive/data/genomes/danRer11/danRer11.2bit
 SEQ2_LEN=/hive/data/genomes/danRer11/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=200
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12
 TMPDIR=/dev/shm
 ' > DEF
 
     time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
         -chainMinScore=3000 -chainLinearGap=medium \
           -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
             -noDbNameCheck -syntenicNet) > do.log 2>&1
     # real    3327m39.074s
 
 	cat fb.hg38.chainDanRer11Link.txt
     # 41036733 bases of 3049335806 (1.346%) in intersection
 
 	973293331 bases of 3049335806 (31.918%) in intersection
 
     time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` hg38 danRer11) \
        > rbest.log 2>&1 &
 
     # and for the swap:
     mkdir /hive/data/genomes/danRer11/bed/blastz.hg38.swap
     cd /hive/data/genomes/danRer11/bed/blastz.hg38.swap
 
     time (doBlastzChainNet.pl -verbose=2 \
       /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12/DEF \
         -swap -chainMinScore=3000 -chainLinearGap=medium \
           -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
             -noDbNameCheck -syntenicNet) > swap.log 2>&1
 	#  real	39m24.916s
 
     cat fb.danRer11.chainHg38Link.txt
     # 47869194 bases of 1674677181 (2.858%) in intersection
 
     time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` danRer11 hg38) \
        > rbest.log 2>&1 &
     # real	638m45.337s
 _EOF_
 #########################################################################
 # refSeqFuncElems NCBI refSeq functional elements, REDONE 2017-11-29 Angie
 # previously done 2017-08-01 by Chris E
 
 mkdir /hive/data/genomes/hg38/bed/refSeqFuncElems.2017-11-29
 cd /hive/data/genomes/hg38/bed/refSeqFuncElems.2017-11-29
 
 # NOTE FOR NEXT TIME: instead of using interim GFF, in the future these annotations might be
 # folded into the same main release GFF3 from which the ncbiRefSeq* tables are extracted by
 # doNcbiRefSeq.pl.
 wget ftp://ftp.ncbi.nlm.nih.gov/genomes/H_sapiens/GFF_interim/interim_GRCh38.p11_top_level_2017-06-27.gff3.gz
 
 # Get mapping of RefSeq NC_* chromosome accs (and NT_*, NW_*) to hg38 chrom names
 hgsql hg38 -NBe 'select alias, chrom from chromAlias where source = "refseq" order by alias' \
 > refSeqToChrom.tab
 cut -f 2 refSeqToChrom.tab | sed -e 's/^/^/' > chrom.tab
 
 # Use Terence Murphy's list of feature types (and the multi-type attribute regulatory_class)
 # to identify Functional Elements and swap in hg38 chrom names.
 # Use subColumn -miss so it doesn't quit when it sees a patch contig that doesn't map to an
 # hg38 chrom.  Use grep -f chrom.tab to filter out patch contig annotations.
 zcat interim_GRCh38.p11_top_level_2017-06-27.gff3.gz \
 | grep -P "(\t(CAAT_signal|GC_rich_promoter_region|TATA_box|enhancer|insulator|locus_control_region|mobile_genetic_element|origin_of_replication|promoter|protein_binding_site|recombination_feature|regulatory_region|repeat_region|sequence_feature|sequence_secondary_structure|silencer|stem_loop)\t|regulatory_class=)" \
 | subColumn -miss=/dev/null 1 stdin refSeqToChrom.tab stdout \
 | grep -f chrom.tab > funcElems.gff
 wc -l funcElems.gff
 #5756 funcElems.gff
 
 # Transform GFF to BED+
 ~/kent/src/hg/utils/automation/parseRefSeqFuncElems funcElems.gff /dev/stdout \
 | sort -k1,1 -k2n,2n > refSeqFuncElems.bed
 wc -l refSeqFuncElems.bed
 #5756 refSeqFuncElems.bed
 
 # Make bigBed and link from /gbdb
 bedToBigBed -tab -type=bed9+7 -as=$HOME/kent/src/hg/lib/refSeqFuncElems.as \
   refSeqFuncElems.bed /hive/data/genomes/hg38/chrom.sizes refSeqFuncElems.bb
 rm -f /gbdb/hg38/ncbiRefSeq/refSeqFuncElems.bb
 ln -s `pwd`/refSeqFuncElems.bb /gbdb/hg38/ncbiRefSeq/
 
 ###################################################################
 # cosmicRegions (DONE 2017-08-03 Chris)
# Make a new COSMIC track for hg38 v82
 mkdir /hive/data/outside/cosmic/hg38/v82
 cd /hive/data/outside/cosmic/hg38/v82
 
 # Get the new data
 sftp ceisenha@ucsc.edu@sftp-cancer.sanger.ac.uk
 # Login to SFTP server then run these commands
 get /files/grch38/cosmic/v82/CosmicMutantExport.tsv.gz
 
 # Remove the 'NS' fields, search for the \t after to exclude the E'NS'ST transcripts.
 zcat CosmicMutantExport.tsv.gz | sed 's/NS\t/\t/g' > cosMut.tsv
 
 # Use a script to convert to bed format.
 cosmicToBed cosMut.tsv cosMut.bed
 # This many lines were skipped, 134601 for not having genomic coordinate
 
 # Sort and convert to big bed using the .as file.
 sort -k1,1 -k2,2n cosMut.bed > sCosMut.bed
 bedToBigBed -type=bed8+31 -as=cosmicNew.as sCosMut.bed /hive/data/genomes/hg38/chrom.sizes cosMutHg38V82.bb -tab -extraIndex=name,cosmLabel
 
 
 # Link it up so the outside world can see it.
 cd /gbdb/hg38/cosmic/
 ln -s /hive/data/outside/cosmic/hg38/v82/cosMutHg38V82.bb .
 
 #########################################################################
 # RepeatMasker Visualization track update (DONE - 2018-05-04 - ChrisL)
     screen -S rmskJoined.2018-05-04
     mkdir /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04
     cd /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04
 
     ln -s ../repeatMasker/hg38.sorted.fa.out .
     ln -s ../repeatMasker/hg38.fa.align.gz .
 
     # this script points to the most recent RepeatMasker version:
     time (/scratch/data/RepeatMasker/util/rmToUCSCTables.pl \
         -out hg38.sorted.fa.out -align hg38.fa.align.gz) > do.log 2>&1 &
 
     # no differences, forgot to remake rmsk files
     # so instead remake the rmsk track and try again
     mkdir /hive/data/genomes/hg38/bed/repeatMasker.2018-05-04
     cd /hive/data/genomes/hg38/bed/repeatMasker.2018-05-04
 
     # remake the sorted.fa.out and fa.align.gz, stop after masking
     # so rmsk table isn't overwritten
     time (doRepeatMasker.pl -stop=mask -useHMMER -bigClusterHub=ku \
        -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` hg38) > mask.log 2>&1 &
     # RepeatMasker bug?: Undefined id, line 1440295 of input:
     #    10  26.1  0.0  0.0  chr13     114292339 114292382   (71946) C  L1P4           LINE/L1               (17) 6149   6106
     # RepeatMasker bug?: Undefined id, line 3529762 of input:
     #   992   2.3  0.5  0.0  chr3      180461254 180462048 (17833511) C  L1PA3          LINE/L1                (3) 6152   5354
     # RepeatMasker bug?: Undefined id, line 3529763 of input:
     #  1153   3.2  0.2  0.0  chr3      180462043 180463006 (17832553) +  L1PA3          LINE/L1               4392 5357  (789)
     # RepeatMasker bug?: Undefined id, line 5303571 of input:
     #   220  22.5  0.0 17.7  chr9      105798076 105799127 (32595590) C  SATR2          Satellite              (4)  866      1
     # real    643m17.617s
 
     # get rid of the missing id items:
     grep -v "114292339 114292382\|180461254 180462048\|180462043 180463006\|105798076 105799127" \
         hg38.fa.out > clean.hg38.fa.out
     mv clean.hg38.fa.out hg38.fa.out
 
     # finish the last step of doCat.csh:
     /cluster/bin/scripts/extractNestedRepeats.pl hg38.fa.out | sort -k1,1 -k2,2n > hg38.nestedRepeats.bed
 
     cd /hive/data/genomes/hg38/bed/rmskJoined.2018-05-04
 
     rm hg38.sorted.fa.out
     rm hg38.fa.align.gz
     rm *.tsv
     ln -s ../repeatMasker.2018-05-04/hg38.sorted.fa.out .
     ln -s ../repeatMasker.2018-05-04/hg38.fa.align.gz .
 
     # and then re-run
     time (/scratch/data/RepeatMasker/util/rmToUCSCTables.pl \
         -out hg38.sorted.fa.out -align hg38.fa.align.gz) > rerun.log 2>&1 &
     # real    141m7.268s
 
     # confirm the counts are different from the previous version:
     # wc -l ../rmskJoined/hg38.fa.align.tsv ../rmskJoined/hg38.sorted.fa.join.bed ../rmskJoined/hg38.sorted.fa.out.tsv
    7203858 ../rmskJoined/hg38.fa.align.tsv
    4607727 ../rmskJoined/hg38.sorted.fa.join.bed
    5520118 ../rmskJoined/hg38.sorted.fa.out.tsv
   17331703 total
     # wc -l *.tsv
    7227245 hg38.fa.align.tsv
    4828114 hg38.sorted.fa.join.tsv
    5916189 hg38.sorted.fa.out.tsv
   17971548 total
 
     hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/rmskJoined.sql \
         -renameSqlTable -verbose=4 -tab \
             -type=bed4+9 -as=$HOME/kent/src/hg/lib/rmskJoined.as hg38 \
                 rmskJoinedCurrent hg38.sorted.fa.join.tsv \
                     > loadJoined.log 2>&1
 
     hgLoadSqlTab hg38 rmskAlignCurrent \
         /cluster/home/chmalee/kent/src/hg/lib/rmskAlign.sql \
             hg38.fa.align.tsv > loadAlign.log 2>&1
 
     hgLoadOutJoined -verbose=2 -table=rmskOutCurrent hg38 hg38.sorted.fa.out > loadOut.log 2>&1
 
     featureBits -countGaps hg38 rmskJoinedCurrent
     # 2796899855 bases of 3209286105 (87.150%) in intersection
 #########################################################################
 # Hi-C Visualization based on Krietenstein 2019 (DONE - 2019-10-07 - Jonathan)
 mkdir -p /hive/data/genomes/hg38/bed/hic
 cd /hive/data/genomes/hg38/bed/hic
 
 # Files are located on 4D Nucleome (data.4dnucleome.org).  The URL for the paper on that
 # site is https://data.4dnucleome.org/publications/b13590b2-a341-4e5e-ad5e-72e233b32e9d/.
 # The four file IDs downloaded below are for contact matrix .hic files created for
 # different cell-line/protocol combinations
 wget 'https://data.4dnucleome.org/files-processed/4DNFI2TK7L2F/@@download/4DNFI2TK7L2F.hic' # H1-hESC Micro-C XL
 wget 'https://data.4dnucleome.org/files-processed/4DNFIQYQWPF5/@@download/4DNFIQYQWPF5.hic' # H1-hESC in situ
 wget 'https://data.4dnucleome.org/files-processed/4DNFI18Q799K/@@download/4DNFI18Q799K.hic' # HFFc6 Micro-C XL
 wget 'https://data.4dnucleome.org/files-processed/4DNFIFLJLIS5/@@download/4DNFIFLJLIS5.hic' # HFFc6 in situ
 
 printf "All files were downloaded from the 4D Nucleome Data Portal at data.4dnucleome.org.
 These are processed contact matrices from Krietenstein et al. (2019) Ultrastructural details
 of mammalian chromosome architecture. (https://www.biorxiv.org/content/10.1101/639922v1).
 
 4DNFI2TK7L2F.hic - Micro-C XL data set on H1-hESC
 4DNFIQYQWPF5.hic - in situ Hi-C data set on H1-hESC
 4DNFI18Q799K.hic - Micro-C  XL data set on HFFc6
 4DNFIFLJLIS5.hic - in situ Hi-C data set on HFFc6" > README.txt
 
 mkdir -p /gbdb/hg38/bbi/hic
 cd /gbdb/hg38/bbi/hic
 ln -s /hive/data/genomes/hg38/bed/hic/* .
 
 
 #########################################################################
 # LASTZ Self/hg38 (DONE 2020-02-11 - Angie)
     # RM #24695
     # Re-run with updated process to include pslDropOverlap .
     # Use "contigs" from previous run lastzSelf.2014-01-25/hg38.self.2bit
 
     screen -S hg38Self -t hg38Self
     mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27
     cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27
     cat << _EOF_ > DEF
 # human vs human with mouse defaults
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz
 
 # TARGET: Human hg38
 SEQ1_DIR=/hive/data/genomes/hg38/hg38.2bit
 SEQ1_LEN=/hive/data/genomes/hg38/chrom.sizes
 SEQ1_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit
 SEQ1_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes
 SEQ1_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY: Human hg38
 SEQ2_DIR=/hive/data/genomes/hg38/hg38.2bit
 SEQ2_LEN=/hive/data/genomes/hg38/chrom.sizes
 SEQ2_CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit
 SEQ2_CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.chrom.sizes
 SEQ2_LIFT=/hive/data/genomes/hg38/jkStuff/hg38.contigs.lift
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27
 TMPDIR=/dev/shm
 _EOF_
 
     # NOTE FOR NEXT TIME: use -chainMinScore=10000 (at least), not 3000
 
     ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
         -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \
         -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
         -stop=net >& do.log &
     tail -f do.log
 
 
     # After two days, 4 jobs are running, one of which (part014.lst vs itself) crashed with
     # out-of-mem error.  After 3 days, 3 jobs completed but part014.lst runs lastz out of mem.
     # Split part014.lst up into components, run on hgwdev (more mem).
     mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014
     cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014
     mkdir psl
     cp /dev/null jobList
     for t in $(cat ../tParts/part014.lst); do
       tBase=$(basename $t)
       for q in $(cat ../tParts/part014.lst); do
         qBase=$(basename $q)
         echo "$HOME/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf $t $q ../../DEF {check out exists psl/${tBase}_${qBase}.psl }" >> jobList
       done
     done
     para create jobList
     para try, check, push, etc,
     # 94 of the jobs ran for 12s or less.  The other 6 are chr{X_Y}_00 vs. self & each other,
     # chr13_16 vs self and chr16_03 vs self.  All but chr16_03 vs self completed in < 6 minutes.
 #Completed: 99 of 100 jobs
 #Crashed: 1 jobs
 #CPU time in finished jobs:       1559s      25.98m     0.43h    0.02d  0.000 y
 #IO & Wait Time:                   248s       4.14m     0.07h    0.00d  0.000 y
 #Average job time:                  18s       0.30m     0.01h    0.00d
 #Longest finished job:             321s       5.35m     0.09h    0.00d
 #Submission to last job:         94681s    1578.02m    26.30h    1.10d
 
     # Dang, chr16_03 vs. self still runs out of mem even on hgwdev.
     mkdir /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03
     cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03
     twoBitToFa /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/hg38.self.2bit:chr16_03:0-1689648 \
       chr16_03.fa
     faSplit -lift=chr16_03.lift size chr16_03.fa 169000 chr16_03_split_
     faToTwoBit chr16_03_split_*.fa chr16_03_split.2bit
     twoBitInfo chr16_03_split.2bit stdout | sort -k2nr > chr16_03_split.sizes
     sed -re 's@CTGDIR.*@CTGDIR=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03/chr16_03_split.2bit@;
              s@CTGLEN.*@CTGLEN=/hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz/part014/chr16_03/chr16_03_split.sizes@;' \
       ../../../DEF > DEF.split
     mkdir psl
     cwd=$(pwd)
     while read tBase tSize; do
       while read qBase qSize; do
         echo "$HOME/kent/src/hg/utils/automation/blastz-run-ucsc -outFormat psl -dropSelf $cwd/chr16_03_split.2bit:$tBase:0-$tSize $cwd/chr16_03_split.2bit:$qBase:0-$qSize DEF.split {check out exists psl/${tBase}_${qBase}.psl}"
       done < chr16_03_split.sizes
     done < chr16_03_split.sizes > jobList
     para create jobList
     para try, check, push, etc,
 #Completed: 100 of 100 jobs
 #CPU time in finished jobs:     142614s    2376.89m    39.61h    1.65d  0.005 y
 #IO & Wait Time:                   167s       2.79m     0.05h    0.00d  0.000 y
 #Average job time:                1428s      23.80m     0.40h    0.02d
 #Longest finished job:           22861s     381.02m     6.35h    0.26d
 #Submission to last job:         22874s     381.23m     6.35h    0.26d
     # 6 hours for chr16_03_split_00 vs. itself.  ~4.5h for _09 vs _00.
     cat psl/*.psl \
     | liftUp -nohead -type=.psl stdout \
         chr16_03.lift error stdin \
     | liftUp -nohead -type=.psl -pslQ \
         ../psl/hg38.self.2bit:chr16_03:0-1689648_hg38.self.2bit:chr16_03:0-1689648.psl \
         chr16_03.lift error stdin
 
     cd ..
     cat psl/* > ../../psl/part014.lst/part014.lst_part014.lst.psl
 
     # Make run.time file or doBlastzChainNet.pl won't continue:
     cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/run.blastz
     para time >& run.time
 
     # Resume doBlastzChainNet.pl:
     cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27
     ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
         -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \
         -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
         -continue=cat -stop=net >& do2.log &
     tail -f do2.log
 #Batch failed after 4 tries on chain.csh part016.lst chain/part016.lst.chain
 #Command failed:
 #ssh -x -o 'StrictHostKeyChecking = no' -o 'BatchMode = yes' hgwdev nice /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run/doChainRun.csh
 
     cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run
     para problems
     # mostly these:
 #errAbort re-entered due to out-of-memory condition. Exiting.
     # one job made it through errAbort:
 #needLargeMem: Out of memory - request size 564838920 bytes, errno: 12
     para time
 #Completed: 59 of 68 jobs
 #Crashed: 9 jobs
 #CPU time in finished jobs:      24727s     412.12m     6.87h    0.29d  0.001 y
 #IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
 #Average job time:                 409s       6.82m     0.11h    0.00d
 #Longest finished job:            2350s      39.17m     0.65h    0.03d
 #Submission to last job:          2462s      41.03m     0.68h    0.03d
     para crashed
 #chain.csh part012.lst {check out line+ chain/part012.lst.chain}
 #chain.csh part017.lst {check out line+ chain/part017.lst.chain}
 #chain.csh part016.lst {check out line+ chain/part016.lst.chain}
 #chain.csh part015.lst {check out line+ chain/part015.lst.chain}
 #chain.csh part014.lst {check out line+ chain/part014.lst.chain}
 #chain.csh hg38.self.2bit:chr1_10: {check out line+ chain/hg38.self.2bit:chr1_10:.chain}
 #chain.csh hg38.self.2bit:chr10_05: {check out line+ chain/hg38.self.2bit:chr10_05:.chain}
 #chain.csh hg38.self.2bit:chr7_00: {check out line+ chain/hg38.self.2bit:chr7_00:.chain}
 
     # Run the jobs outside of parasol (~11h):
     csh -efx chain.csh part012.lst chain/part012.lst.chain &
     csh -efx chain.csh part017.lst chain/part017.lst.chain &
     csh -efx chain.csh part016.lst chain/part016.lst.chain &
     csh -efx chain.csh part015.lst chain/part015.lst.chain &
     csh -efx chain.csh part014.lst chain/part014.lst.chain &
     csh -efx chain.csh hg38.self.2bit:chr1_10: chain/hg38.self.2bit:chr1_10:.chain &
     csh -efx chain.csh hg38.self.2bit:chr10_05: chain/hg38.self.2bit:chr10_05:.chain &
     csh -efx chain.csh hg38.self.2bit:chr7_00: chain/hg38.self.2bit:chr7_00:.chain &
     csh -efx chain.csh hg38.self.2bit:chr16_08: chain/hg38.self.2bit:chr16_08:.chain &
 
     # Resume doBlastzChainNet.pl again:
     cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27
     ~/kent/src/hg/utils/automation/doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
         -chainMinScore=3000 -chainLinearGap=medium -syntenicNet \
         -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \
         -continue=chainMerge -stop=net >& do3.log &
     tail -f do3.log
 # *** All done !  Elapsed time: 19m11s
 
     # Load track w/new name chainSelfRedo to compare to existing chainSelf:
     hgLoadChain -normScore -tIndex hg38 chainSelfRedo axtChain/hg38.hg38.all.chain.gz
 
     # No idea why but somehow the liftUp seems not to have worked for part012 and part017,
     # so the all.chain had chr22_31, chr8_01 etc.  :b  run again again.
     cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run
     mv chain/part012.lst.chain{,.bak}
     mv chain/part017.lst.chain{,.bak}
     csh -efx chain.csh part012.lst chain/part012.lst.chain >& part012.log &
     csh -efx chain.csh part017.lst chain/part017.lst.chain >& part017.log &
     # Those completed successfully.  Dunno why the earlier ones didn't get lifted.
     cd ..
     mv hg38.hg38.all{,.oopsPartUnlifted}.chain.gz
     # Reconstruct hg38.hg38.all.chain.gz (the chainMerge step is just this command):
     find /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/run/chain -name "*.chain" \
     | chainMergeSort -inputList=stdin \
     | nice gzip -c \
       > /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/hg38.hg38.all.chain.gz
 
     # NOTE FOR NEXT TIME: this filtering step will be unnecessary when -minScore=10000 is used
     # from the beginning.
     # Filter to minScore of 10000 (too much fluff with -minScore=3000) per Jim (see #24695)
     cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain
     mv hg38.hg38.all.chain.gz hg38.hg38.all.unfiltered.chain.gz
     chainFilter hg38.hg38.all.unfiltered.chain.gz -minScore=10000 \
     | gzip -c > hg38.hg38.all.chain.gz
     hgLoadChain -normScore -tIndex hg38 chainSelfRedo hg38.hg38.all.chain.gz
     checkTableCoords hg38 chainSelfRedo
 
     # Rename to chainSelf and update lastz symlinks and downloads
     hgsql hg38 -e 'drop table chainSelf; drop table chainSelfLink;
                    rename table chainSelfRedo to chainSelf;
                    rename table chainSelfRedoLink to chainSelfLink;'
     cd /hive/data/genomes/hg38/bed
     rm lastz.self lastz.hg38
     ln -s lastzSelf.2020-01-27 lastz.self
     ln -s lastzSelf.2020-01-27 lastz.hg38
     cd /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain
     cp /hive/data/genomes/hg38/bed/lastzSelf.2014-01-25/axtChain/README.txt .
     $EDITOR README.txt
     md5sum hg38.hg38.all.chain.gz > md5sum.txt
     # Make sure that the old download dir has only symlinks, no real files, then remove and rebuild.
     ls -lR /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/
     rm -r /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/
     cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/vsSelf/
     ln -s /hive/data/genomes/hg38/bed/lastzSelf.2020-01-27/axtChain/{README.txt,hg38.hg38.all.chain.gz,md5sum.txt} .
 
 
 #########################################################################
 # NCBI ReMap alignments (DONE 2020-02-11 Angie)
 # RM 24449
     mkdir /hive/data/genomes/hg38/bed/chainHg19ReMap
     cd /hive/data/genomes/hg38/bed/chainHg19ReMap
     wget ftp://ftp.ncbi.nlm.nih.gov/pub/remap/Homo_sapiens/current/GCF_000001405.39_GRCh38.p13/GCF_000001405.25_GRCh37.p13/GCF_000001405.39-GCF_000001405.25.gff
     # We will need to substitute all the RefSeq chrom and contig IDs with our own names.
     # The same alt contig can appear in both assemblies with the same name, so replace
     # hg19 names at the beginning of the line and hg38 names after "Target=".
     hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \
     | sed -re 's/\./\\./;' \
     | awk '{print "s/^" $1 "\\b/" $2 "/;";}' \
       > hg38.hg19.chromAlias.sed
     hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source)' \
     | sed -re 's/\./\\./;' \
     | awk '{print "s/Target=" $1 "\\b/Target=" $2 "/;";}' \
       >> hg38.hg19.chromAlias.sed
 
     # There are some GRCh38.p13 sequences that we have not yet imported into hg38 -- use -dropT.
     sed -f hg38.hg19.chromAlias.sed GCF_000001405.39-GCF_000001405.25.gff \
     | gff3ToPsl -dropT /hive/data/genomes/{hg19,hg38}/chrom.sizes stdin stdout \
     | pslPosTarget stdin stdout \
     | sort -k14,14 -k16n,16n > remap.hg38.hg19.psl
 
     # Convert to chain for browser display.  Some of the remap chains have minScore < 1000 and
     # by default would be dropped by chainScore... use -minScore=0 to prevent that.
     time pslToChain remap.hg38.hg19.psl stdout \
     | chainScore -minScore=0 stdin /hive/data/genomes/{hg38/hg38.2bit,hg19/hg19.2bit} \
         remap.hg38.hg19.chain
 #real    9m31.900s
 #user    9m1.624s
 #sys     0m20.863s
     hgLoadChain hg38 -tIndex chainHg19ReMap remap.hg38.hg19.chain
 #Loading 5315 chains into hg38.chainHg19ReMap
     time axtChain -psl -linearGap=medium -verbose=0 remap.hg38.hg19.psl \
       /hive/data/genomes/hg38/hg38.2bit /hive/data/genomes/hg19/hg19.2bit \
       remap.axtChain.hg38.hg19.chain
 #real    2m26.333s
 #user    2m4.237s
 #sys     0m22.071s
     hgLoadChain hg38 -tIndex chainHg19ReMapAxtChain remap.axtChain.hg38.hg19.chain
 #Loading 2115 chains into hg38.chainHg19ReMapAxtChain
 
 ###################################################
 
 # PanelApp refs #25568
 cd /hive/data/genomes/hg38/bed
 mkdir panelApp
 cd panelApp
 wget https://hgwdev.gi.ucsc.edu/~bnguy/panel/hg38/panel_hg38.bb
 wget https://hgwdev.gi.ucsc.edu/~bnguy/panel/hg38/str_hg38.bb
 mv /cluster/home/bnguy/trackhub/region/region_hg38.bb .
 mv panel_hg38.bb genesPanel.bb
 mv str_hg38.bb STRsPanel.bb
 mv region_hg38.bb CNVregions.bb
 cd /gbdb/hg38
 mkdir panelApp
 cd panelApp
 ln -s /hive/data/genomes/hg38/bed/panelApp/genesPanel.bb
 ln -s /hive/data/genomes/hg38/bed/panelApp/STRsPanel.bb
 ln -s /hive/data/genomes/hg38/bed/panelApp/CNVregions.bb 
 cd ~/kent/src/hg/makeDb/trackDb/human/hg38
 wget https://hgwdev.gi.ucsc.edu/~bnguy/panel/hg38/panelapp.html
 mv panelapp.html panelApp.html
 curl https://hgwdev.gi.ucsc.edu/~bnguy/panel/hg38/trackDb.txt >> trackDb.ra
 vi trackDb.ra
 cd ~/kent/src/hg/makeDb/trackDb
 make alpha DBS=hg38
 
 ######
 #Agilent SNP/CNV arrays 3/11/21
 #Downloaded by web browser
 cd /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto
 fetchChromSizes hg38 > hg38.chrom.sizes
 bedSort hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bed hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bed
 bedToBigBed hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bed hg38.chrom.sizes hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bb
 bedSort hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bed hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bed
 bedToBigBed hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bed hg38.chrom.sizes hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bb
 bedSort hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bed hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bed
 bedToBigBed hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bed hg38.chrom.sizes hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bb
 mkdir -p /gbdb/hg38/snpCnvArrays/agilent
 cd /gbdb/hg38/snpCnvArrays/agilent
 ln -s /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto/hg38.GenetiSure_Cyto_CGH+SNP_Microarray_4x180K_085591_D_BED_20200302.bb
 ln -s /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto/hg38.GenetiSure_Cyto_CGH_Microarray_8x60K_085590_D_BED_20200302.bb
 ln -s /hive/data/genomes/hg38/bed/agilentProbes/genetiSureCyto/hg38.GenetiSure_Cyto_CGH_Microarray_4x180K_085589_D_BED_20200302.bb
 vi ~/kent/src/hg/makeDb/trackDb/human/hg38/trackDb.ra
+
+#########################################################################
+# DECIPHER CNV & SNV - initial build (DONE 2022-04-08 Jonathan)
+# RM 29130
+
+cd /hive/data/outside/otto/decipher
+mkdir 2022-04-05
+cd 2022-04-05
+
+# manually fetch decipher-variants-grch38-2022-04-03.bed from DECIPHER
+../buildDecipher decipher-variants-grch38-2022-04-03.bed
+
+for i in `cat ../decipher.tables`
+        do
+        n=$i"New"
+        o=$i"Old"
+        hgsqlSwapTables hg38 $n $i $o -dropTable3
+        done
+
+mkdir -p /gbdb/hg38/decipher
+cd /gbdb/hg38/decipher
+ln -s /hive/data/outside/otto/decipher/2022-04-05/decipherCnv.bb .
+
+#########################################################################