33401c5cafc069c3f78bcecc3c0d6f2d0b77aeda hiram Wed Aug 30 14:36:08 2023 -0700 phyloP done for both all and primates subset refs #31528 diff --git src/hg/makeDb/doc/hg38/cactus447.txt src/hg/makeDb/doc/hg38/cactus447.txt index a21cfba..704e2d2 100644 --- src/hg/makeDb/doc/hg38/cactus447.txt +++ src/hg/makeDb/doc/hg38/cactus447.txt @@ -1,749 +1,1463 @@ ################################################################################ #### 2023-06-13 - fetch maf.gz - Hiram mkdir -p /hive/data/genomes/hg38/bed/cactus447/singleCover cd /hive/data/genomes/hg38/bed/cactus447/singleCover obtained maf file from Glenn, from the 'courtyard' login: /nanopore/cgl/glenn-scratch/new-maf/447-mammalian-2022v1-single-copy.maf.gz -rw-r--r-- 1 hickey cgl 881994218891 Jun 5 11:06 /nanopore/cgl/glenn-scratch/new-maf/447-mammalian-2022v1-single-copy.maf.gz There was some go around with a different version. This one is 'single coverage' -rw-r--r-- 1 881994218891 Jun 5 11:06 447-mammalian-2022v1-single-copy.maf.gz ################################################################################ #### split this file into individual chromosome files 2023-06-21 - Hiram mkdir /hive/data/genomes/hg38/bed/cactus447/splitSingleCover cd /hive/data/genomes/hg38/bed/cactus447/splitSingleCover time mafSplit -byTarget -useFullSequenceName /dev/null ./ ../singleCover/447-mammalian-2022v1-single-copy.maf.gz Splitting 1 files by target sequence -- ignoring first argument /dev/null real 804m35.124s user 1061m9.580s sys 101m19.894s ################################################################################ ### convert the sequence names to hg38 chrom names 2023-06-12 to 06-14 - Hiram mkdir /hive/data/genomes/hg38/bed/cactus447/ucscNames cd /hive/data/genomes/hg38/bed/cactus447/ucscNames runOne script for processing each file: #!/bin/bash set -beEu -o pipefail export cactusMaf="${1}" export srcMaf="../splitSingleCover/${cactusMaf}" cactusId=${cactusMaf%.maf} export sedCommand=`grep -w "${cactusId}" 
cactus.to.ucsc.sed` export ucscName=`grep -w "${cactusId}" 2022v1.hg38.chrom.equiv.txt | cut -f2` case "${ucscName}" in chrM) if [ ${srcMaf} -nt ${ucscName}.maf ]; then sed -f cactus.to.ucsc.sed ${srcMaf} > ${ucscName}.maf touch -r ${srcMaf} ${ucscName}.maf fi ;; chr[0-9][0-9]) rm -f "${ucscName}.maf" ln ${srcMaf} "${ucscName}.maf" ;; chr[0-9]) rm -f "${ucscName}.maf" ln ${srcMaf} "${ucscName}.maf" ;; G*.maf | K*.maf) if [ ${srcMaf} -nt ${ucscName}.maf ]; then sed -e "${sedCommand}" ${srcMaf} > ${ucscName}.maf touch -r ${srcMaf} ${ucscName}.maf else printf "DONE: ${ucscName}.maf\n" 1>&2 fi ;; esac if [ "${ucscName}.maf" -nt "nameScan/${ucscName}.seqCount.txt" ]; then grep "^s." "${ucscName}.maf" | cut -d' ' -f2 | sort | uniq -c | sort -rn \ > "nameScan/${ucscName}.seqCount.txt" touch -r "${ucscName}.maf" "nameScan/${ucscName}.seqCount.txt" sed -e "s/^ \+//;" "nameScan/${ucscName}.seqCount.txt" | cut -d' ' -f2 \ | cut -d'.' -f1 | sort | uniq -c \ | sort -rn > "nameScan/${ucscName}.species.txt" fi ##################################################### template: #LOOP runOne $(path1) #ENDLOOP using list of maf files: ls ../splitSingleCover | grep maf > cactus.maf.list gensub2 cactus.maf.list single template jobList ### made mistake of running too many of those at the same time ### and their use of large amounts of memory caused hgwdev to ### hang up and needed a reboot to recover. So, run these carefully ### only a couple at a time on hgwdev and on hgcompute-01, the large ### ones get as large as 800 Gb in memory. 
### get the species names from the nameScan: sed -e 's/^ \+//;' nameScan/chr2.species.txt | cut -d' ' -f2 \ | sort > ../species.list.txt ################################################################################ ### get the chrom.sizes out of the maf files 2023-06-19 - Hiram mkdir /hive/data/genomes/hg38/bed/cactus447/chromSizes cd /hive/data/genomes/hg38/bed/cactus447/chromSizes ### runOne script to run each one: #!/bin/bash export inMaf="${1}" export result="${2}" grep "^s " "${inMaf}" | awk '{printf "%s\t%s\n", $2,$6}' | sort -u > "${result}" ### working on the maf list: ls ../ucscNames/chr*.maf > maf.list ### template to construct jobList #LOOP runOne $(path1) {check out line+ result/$(root1).txt} #ENDLOOP gensub2 maf.list single template jobList para -ram=6g create jobList Completed: 101 of 101 jobs CPU time in finished jobs: 90801s 1513.35m 25.22h 1.05d 0.003 y IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y Average job time: 762s 12.69m 0.21h 0.01d Longest finished job: 7952s 132.53m 2.21h 0.09d Submission to last job: 8022s 133.70m 2.23h 0.09d ### sizes.pl script to get the answer out of the result/*.txt files: #!/usr/bin/env perl use strict; use warnings; my %chromSizes; # key is assembly name, value is a pointer to a hash # with key chromName, value size open (my $FH, "-|", "ls result/*.txt") or die "can not read ls result/*.txt"; while (my $file = <$FH>) { chomp $file; printf STDERR "# reading: %s\n", $file; open (my $R, "<", $file) or die "can not read $file"; while (my $line = <$R>) { chomp $line; my @a = split('\.', $line); if (! 
defined($chromSizes{$a[0]})) { my %h; $chromSizes{$a[0]} = \%h; } my $asm = $chromSizes{$a[0]}; if (scalar(@a) == 3) { my ($a2, $size) = split('\s+', $a[2]); my $chr = "$a[1].$a2"; if (defined($asm->{$chr})) { if ($asm->{$chr} != $size) { printf STDERR "ERROR: size mismatch: %d != %d for %s\t%s.%s\n", $asm->{$chr}, $size, $a[0], $a[1], $a[2]; exit 255; } } else { $asm->{$chr} = $size; } # printf "%s\t%s.%s\n", $a[0], $a[1], $a[2]; } elsif (scalar(@a) == 2) { my ($chr, $size) = split('\s+', $a[1]); if (defined($asm->{$chr})) { if ($asm->{$chr} != $size) { printf STDERR "ERROR: size mismatch: %d != %d for %s\t%s\n", $asm->{$chr}, $size, $a[0], $a[1]; exit 255; } } else { $asm->{$chr} = $size; } # printf "%s\t%s\n", $a[0], $a[1]; } else { printf STDERR "ERROR: not 2 or 3: '%s'\n", $line; exit 255; } } close ($R); } close ($FH); foreach my $asm (sort keys %chromSizes) { printf STDERR "# output %s\n", $asm; my $out = "sizes/${asm}.chrom.sizes"; open ($FH, "|-", "sort -k2,2nr > $out") or die "can not write to $out"; my $h = $chromSizes{$asm}; foreach my $chr (sort keys %$h) { # printf "%s\t%s\t%d\n", $asm, $chr, $h->{$chr}; printf $FH "%s\t%d\n", $chr, $h->{$chr}; } close ($FH); } mkdir sizes time (./sizes.pl) > sizes.log 2>&1 real 10m11.808s user 6m40.128s sys 3m37.475s ################################################################################ ### add the iRow annotation 2023-06-19 to 2023-06-26 - Hiram mkdir /hive/data/genomes/hg38/bed/cactus447/iRows cd /hive/data/genomes/hg38/bed/cactus447/iRows # obtained the N.bed files from Glenn ## symLink them into this directory: ls /hive/data/genomes/hg38/bed/cactus447/Nbeds/*.N.bed | grep -v "hg38.N.bed" \ | while read P do B=`basename $P | sed -e 's/N.bed/bed/;'` rm -f ./${P} ln -s "${P}" ./${B} done ### and symlink in the chrom.sizes files as .len files ls /hive/data/genomes/hg38/bed/cactus447/chromSizes/sizes/*.chrom.sizes \ | grep -v hg38.chrom.sizes | while read P do S=`basename ${P} | sed -e 's/chrom.sizes/len/;'` 
printf "%s\n" "${S}" ln -s "${P}" ./${S} done ln -s /hive/data/genomes/hg38/chrom.sizes hg38.len ls *.bed > nBeds ls *.len > sizes ### list of maf file to process: ls ../ucscNames/chr*.maf > maf.list ### template file to construct jobList #LOOP mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/hg38/hg38.2bit result/$(file1) #ENDLOOP gensub2 maf.list single template jobList ### took a week to get these all done. ### convert each file to a bigMaf file: mkdir /hive/data/genomes/hg38/bed/cactus447/iRows/bigMaf cd /hive/data/genomes/hg38/bed/cactus447/iRows/bigMaf ### tried to sort them, but made a mistake on the sort -k argument: for F in ../result/chr*.maf do B=`basename $F | sed -e 's/.maf//;'` mafToBigMaf hg38 "${F}" stdout | $HOME/bin/x86_64/gnusort -k2,2 -S1000G \ --parallel=32 > "${B}.bigMaf" done real 1588m59.209s user 1259m38.364s sys 371m13.448s ### put these all together: ls -S chr*.bigMaf | while read M do cat "${M}" done > ../hg38.cactus447.bigMaf real 322m25.741s user 0m13.577s sys 82m41.509s ### since they were sorted incorrectly above, sort this one huge file: $HOME/bin/x86_64/gnusort -S500G --parallel=64 -k1,1 -k2,2n \ hg38.cactus447.bigMaf > hg38.sorted447way.bigMaf real 752m12.425s user 47m34.825s sys 371m7.394s ################################################################################ ### make the bigBed file from the sorted bigMaf 2023-06-27 bedToBigBed -verbose=2 -itemsPerSlot=4 -type=bed3+1 \ -as=$HOME/kent/src/hg/lib/bigMaf.as -tab iRows/hg38.sorted447way.bigMaf \ /hive/data/genomes/hg38/chrom.sizes hg38.cactus447way.bb # pid=61008: VmPeak: 5682928 kB # real 4157m14.091s # that time is 69 hours ############################################################################## # liftOver files to convert sequence names (DONE - 2023-08-04 - Hiram) # # The sequence names in the assemblies in the alignment do not match # the sequence names in the corresponding genome browsers assemblies mkdir /hive/data/genomes/hg38/bed/cactus447/liftOver cd 
/hive/data/genomes/hg38/bed/cactus447/liftOver mkdir lifts ln -s ../idKeys/447way.equivalent.txt . # using this script to construct liftOver files #!/bin/bash TOP="/hive/data/genomes/hg38/bed/cactus447/liftOver" cd "${TOP}" mkdir -p lifts cat 447way.equivalent.txt | while read L do seqName=`printf "%s" "${L}" | cut -f1` asmId=`printf "%s" "${L}" | cut -f2` seqIdKeys="/hive/data/genomes/hg38/bed/cactus447/idKeys/${seqName}/${seqName}.idKeys.txt" case ${asmId} in GCA_* | GCF_* ) gcX="${asmId:0:3}" d0="${asmId:4:3}" d1="${asmId:7:3}" d2="${asmId:10:3}" ab="/hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmId}" buildDir=`realpath "${ab}"` asmKeys="${buildDir}/trackData/idKeys/${asmId}.idKeys.txt" asmSizes="${buildDir}/${asmId}.chrom.sizes" # asmSizes="${buildDir}/idKeys/chrom.sizes" ls -og "${asmKeys}" ;; [a-zA-Z0-9]*) asmKeys="/hive/data/genomes/${asmId}/bed/idKeys/${asmId}.idKeys.txt" asmSizes="/hive/data/genomes/${asmId}/chrom.sizes" ;; *) printf "ERROR not recognized $asmId\n" 1>&2 exit 255 ;; esac seqKeys="/hive/data/genomes/hg38/bed/cactus447/idKeys/${seqName}/${seqName}.idKeys.txt" seq2Bit="/hive/data/genomes/hg38/bed/cactus447/2bitFiles/${seqName}.2bit" if [ ! -s "${seqKeys}" ]; then printf "can not find seq idKeys\n%s\n" "${seqKeys}" 1>&2 exit 255 fi if [ ! -s "${asmKeys}" ]; then printf "can not find asm idKeys\n%s\n" "${asmKeys}" 1>&2 exit 255 fi printf "%s\t%s\n" "${asmId}" "${seqName}" 1>&2 printf "lifts/${seqName}.${asmId}.lift\n" 1>&2 printf "$seqKeys\n" 1>&2 printf "$asmKeys\n" 1>&2 printf "$asmSizes\n" 1>&2 if [ ! -s "lifts/${seqName}.${asmId}.lift" ]; then join -t$'\t' "${seqKeys}" "${asmKeys}" | sort -k3,3 \ | join -t$'\t' -2 3 <(sort -k1,1 ${asmSizes}) - \ | awk -F$'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1, $2, $4, $2}' \ > lifts/${seqName}.${asmId}.lift fi done # a couple of those ended up with duplicate names in the liftOver # due to duplicate contigs in the assemblies. 
They were edited # manually, one was really bad and needed this perl script to clean it: #!/usr/bin/env perl use strict; use warnings; my %idDone; # key is sequence name, value is string of names used open (my $fh, "<", "lifts/Otolemur_garnettii.otoGar3.lift") or die "can not read lifts/Otolemur_garnettii.otoGar3.lift"; while (my $line = <$fh>) { chomp $line; my @a = split('\t', $line); if (!defined($idDone{$a[1]})) { $idDone{$a[1]} = $a[3]; printf "%s\n", $line; } else { $idDone{$a[1]} .= "," . $a[3]; } } close ($fh); foreach my $seq (sort keys %idDone) { printf STDERR "%s\t%s\n", $seq, $idDone{$seq}; } ############################################################################## # MAF FRAMES (working - 2023-08-01 - Hiram) ssh hgwdev mkdir /hive/data/genomes/hg38/bed/cactus447/frames cd /hive/data/genomes/hg38/bed/cactus447/frames mkdir genes # survey all the genomes to find out what kinds of gene tracks they have printf '#/bin/bash egrep -v "GCA_|GCF_" ../idKeys/447way.equivalent.txt | cut -f2 | while read db do printf "# ${db}:" for table in `hgsql -N -e "show tables;" $db | egrep "Gene|ncbiRefSeq" | egrep -v "Curated|Cds|Link|Other|PepTable|Predicted|Psl|wgEncode|gencode|uuGene|ToGeneName|ccds|encode|Ext|Old|Pep|Select|Bak|Verify|orfeome|Multi|transMap|Pfam|knownGeneMrna|sgpGene"` do count=`hgsql -N -e "select count(*) from $table;" $db` printf " %%s: %%s" "${table}" "${count}" done printf "\\n" done ' > showGenes.sh chmod +x ./showGenes.sh ./showGenes.sh > gene.survey.txt 2>&1 & # most of them have ncbiRefSeq grep ncbiRefSeq gene.survey.txt | cut -d' ' -f2 | sed -e 's/://;' \ | sort | xargs echo | fold -s -w 72 # bosMut1 canFam4 chiLan1 colAng1 dipOrd2 eleEdw1 felCat8 fukDam1 micOch1 # mm10 myoBra1 myoDav1 myoLuc2 odoRosDiv1 orcOrc1 otoGar3 proCoq1 pteAle1 # rheMac10 sorAra2 speTri2 susScr3 tarSyr2 tupChi1 # a couple do not: grep -v ncbiRefSeq gene.survey.txt | cut -d' ' -f2 | sed -e 's/://;' \ | sort | xargs echo | fold -s -w 72 # cavApe1 eidHel1 ptePar1 # 
but they do have ensGene or augustusGene: egrep "cavApe1|eidHel1|ptePar1" gene.survey.txt # cavApe1: augustusGene: 10114 ensGene: 22510 xenoRefGene: 322224 # eidHel1: augustusGene: 49674 # ptePar1: augustusGene: 68613 # 1. ncbiRefSeq - add hg38 here manually since it didn't come in above # for db in hg38 # do # asmName="${db}" for db in bosMut1 canFam4 chiLan1 colAng1 dipOrd2 eleEdw1 felCat8 fukDam1 micOch1 mm10 myoBra1 myoDav1 myoLuc2 odoRosDiv1 orcOrc1 otoGar3 proCoq1 pteAle1 rheMac10 sorAra2 speTri2 susScr3 tarSyr2 tupChi1 do asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1` hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq" ${db} \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/${asmName}.gp.gz echo -n "# ${asmName}: " genePredCheck -db=${db} genes/${asmName}.gp.gz done # hg38: checked: 22882 failed: 0 # Bos_mutus: checked: 20457 failed: 0 # CanFam4: checked: 21143 failed: 0 # Chinchilla_lanigera: checked: 20482 failed: 0 # Colobus_angolensis: checked: 20042 failed: 0 # Dipodomys_ordii: checked: 19738 failed: 0 # Elephantulus_edwardii: checked: 20384 failed: 0 # Felis_catus: checked: 20051 failed: 0 # Fukomys_damarensis: checked: 19811 failed: 0 # Microtus_ochrogaster: checked: 20048 failed: 0 # Mus_musculus: checked: 23375 failed: 0 # Myotis_brandtii: checked: 19949 failed: 0 # Myotis_davidii: checked: 18815 failed: 0 # Myotis_lucifugus: checked: 19895 failed: 0 # Odobenus_rosmarus: checked: 19346 failed: 0 # Orcinus_orca: checked: 19136 failed: 0 # Otolemur_garnettii: checked: 19536 failed: 0 # Propithecus_coquerelli: checked: 19356 failed: 0 # Pteropus_alecto: checked: 18326 failed: 0 # Macaca_mulatta: checked: 21021 failed: 0 # Sorex_araneus: checked: 19160 failed: 0 # Ictidomys_tridecemlineatus: checked: 19892 failed: 0 # Sus_scrofa: checked: 24180 failed: 0 # Carlito_syrichta: checked: 19968 failed: 0 # Tupaia_chinensis: checked: 21047 failed: 0 # 2. 
ensGene for db in cavApe1 do asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1` hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${db} \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${db}.tmp.gz mv /dev/shm/${db}.tmp.gz genes/${asmName}.gp.gz echo -n "# ${asmName}: " genePredCheck -db=${db} genes/${asmName}.gp.gz done # Cavia_aperea: checked: 14182 failed: 0 # 3. augustusGene for db in eidHel1 ptePar1 do asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1` hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from augustusGene" ${db} \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${db}.tmp.gz mv /dev/shm/${db}.tmp.gz genes/${asmName}.gp.gz echo -n "# ${asmName}: " genePredCheck -db=${db} genes/${asmName}.gp.gz done # Eidolon_helvum: checked: 31495 failed: 0 # Pteronotus_parnellii: checked: 47678 failed: 0 # verify counts for genes are reasonable: for T in genes/*.gz do printf "# $T: " zcat $T | cut -f1 | sort -u | wc -l done # genes/Bos_mutus.gp.gz: 20457 # genes/CanFam4.gp.gz: 21143 # genes/Canis_lupus_familiaris.gp.gz: 19812 # genes/Carlito_syrichta.gp.gz: 19968 # genes/Cavia_aperea.gp.gz: 14182 # genes/Cercocebus_atys.gp.gz: 20534 # genes/Chinchilla_lanigera.gp.gz: 20481 # genes/Colobus_angolensis.gp.gz: 20042 # genes/Condylura_cristata.gp.gz: 17842 # genes/Dipodomys_ordii.gp.gz: 19738 ... 
# genes/Pteropus_vampyrus.gp.gz: 19485 # genes/Rattus_norvegicus.gp.gz: 22854 # genes/Rhinopithecus_roxellana.gp.gz: 21027 # genes/Saimiri_boliviensis.gp.gz: 20933 # genes/Sorex_araneus.gp.gz: 19160 # genes/Sus_scrofa.gp.gz: 24043 # genes/Theropithecus_gelada.gp.gz: 20592 # genes/Tupaia_chinensis.gp.gz: 21047 # And, can pick up ncbiRefSeq for GCF equivalent assemblies for those # outside the UCSC database set of genomes grep GCF_ ../idKeys/447way.equivalent.txt | while read dbGCF do asmName=`printf "%s" "${dbGCF}" | cut -f1` asmId=`printf "%s" "${dbGCF}" | cut -f2` gcX="${asmId:0:3}" d0="${asmId:4:3}" d1="${asmId:7:3}" d2="${asmId:10:3}" buildDir="/hive/data/genomes/asmHubs/refseqBuild/$gcX/$d0/$d1/$d2/${asmId}" if [ ! -d "${buildDir}" ]; then printf "not found: %s\n" "${buildDir}" else ncbiRefSeqGp="${buildDir}/trackData/ncbiRefSeq/process/${asmId}.ncbiRefSeq.gp" sizes="${buildDir}/${asmId}.chrom.sizes" if [ -s "${ncbiRefSeqGp}" ]; then genePredSingleCover "${ncbiRefSeqGp}" stdout | gzip -c > genes/${asmName}.gp.gz printf "# %s: " "${asmName}" genePredCheck -chromSizes="${sizes}" genes/${asmName}.gp.gz else printf "MISSING: ${ncbiRefSeqGp}\n" 1>&2 fi fi done # verify counts for genes are reasonable: for T in genes/*.gz do printf "# $T: " zcat $T | cut -f1 | sort -u | wc -l done # genes/Bos_mutus.gp.gz: 20457 # genes/CanFam4.gp.gz: 21143 # genes/Canis_lupus_familiaris.gp.gz: 19812 # genes/Carlito_syrichta.gp.gz: 19968 # genes/Cavia_aperea.gp.gz: 14182 # genes/Cercocebus_atys.gp.gz: 20534 # genes/Chinchilla_lanigera.gp.gz: 20481 # genes/Colobus_angolensis.gp.gz: 20042 # genes/Condylura_cristata.gp.gz: 17842 # genes/Dipodomys_ordii.gp.gz: 19738 ... 
# genes/Pteropus_vampyrus.gp.gz: 19485 # genes/Rattus_norvegicus.gp.gz: 22854 # genes/Rhinopithecus_roxellana.gp.gz: 21027 # genes/Saimiri_boliviensis.gp.gz: 20933 # genes/Sorex_araneus.gp.gz: 19160 # genes/Sus_scrofa.gp.gz: 24043 # genes/Theropithecus_gelada.gp.gz: 20592 # genes/Tupaia_chinensis.gp.gz: 21047 # these gene predictions need lifting to change the sequence names mv genes genes.seqToAsm mkdir genes cd genes.seqToAsm for F in *.gp.gz do seqName=`echo $F | sed -e 's/.gp.gz//;'` liftFile="`ls -d /hive/data/genomes/hg38/bed/cactus447/liftOver/lifts/${seqName}.*.lift 2> /dev/null`" if [ -s "${liftFile}" ]; then printf "%s\n" "${seqName}" 1>&2 liftUp -type=.genepred stdout "${liftFile}" warn "${F}" | gzip -c \ > "../genes/${seqName}.gp.gz" else printf "can not find lift file for $seqName\n" 1>&2 fi done # kluster job to annotate each maf file screen -S hg38 # manage long running procedure with screen ssh ku cd /hive/data/genomes/hg38/bed/cactus447way/frames printf '#!/bin/bash set -beEu -o pipefail export C="${1}" export G="${2}" cat ../iRows/result/${C}.maf | genePredToMafFrames hg38 stdin stdout \ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz ' > runOne chmod +x runOne ls ../iRows/result | grep maf | sed -e "s/.maf//" > chr.list ls genes | sed -e "s/.gp.gz//" > gene.list printf '#LOOP runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz} #ENDLOOP ' > template mkdir parts # run on the ku kluster gensub2 chr.list gene.list template jobList para -ram=128g create jobList para try ... check ... 
push # Completed: 4444 of 4444 jobs # CPU time in finished jobs: 3955725s 65928.75m 1098.81h 45.78d 0.125 y # IO & Wait Time: 1735948s 28932.46m 482.21h 20.09d 0.055 y # Average job time: 1281s 21.35m 0.36h 0.01d # Longest finished job: 34637s 577.28m 9.62h 0.40d # Submission to last job: 55372s 922.87m 15.38h 0.64d # collect all results into one file: cd /hive/data/genomes/hg38/bed/cactus447way/frames find ./parts -type f | while read F; do zcat ${F} | awk '11 == NF' done | sort -k1,1 -k2,2n > cactus447wayFrames.bed bedToBigBed -type=bed3+7 -as=$HOME/kent/src/hg/lib/mafFrames.as \ -tab cactus447wayFrames.bed ../../../chrom.sizes cactus447wayFrames.bb bigBedInfo cactus447wayFrames.bb | sed -e 's/^/# /;' # version: 4 # fieldCount: 11 # hasHeaderExtension: yes # isCompressed: yes # isSwapped: 0 # extraIndexCount: 0 # itemCount: 16,063,019 # primaryDataSize: 136,782,205 # primaryIndexSize: 1,013,720 # zoomLevels: 10 # chromCount: 82 # basesCovered: 65,486,909 # meanDepth (of bases covered): 22.709888 # minDepth: 1.000000 # maxDepth: 44.000000 # std of depth: 18.692097 # -rw-rw-r-- 1 1192312943 Aug 5 15:28 cactus447wayFrames.bed # -rw-rw-r-- 1 166572857 Aug 5 15:29 cactus447wayFrames.bb gzip cactus447wayFrames.bed # verify there are frames on everything expected. Since the # frames on the HL* assemblies did not work due to sequence name # differences, this won't be everything. # (ls genes | wc shows 44 'expected') zcat cactus447wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \ | sed -e 's/^/# /;' > species.check.list wc -l species.check.list # 44 # 391443 Bos_mutus # 414721 CanFam4 # 417196 Canis_lupus_familiaris # 328151 Carlito_syrichta # 255097 Cavia_aperea # 373053 Cercocebus_atys # 398356 Chinchilla_lanigera # 350442 Colobus_angolensis # 346524 Condylura_cristata # 363333 Dipodomys_ordii ... 
# 379732 Pteropus_alecto # 387484 Pteropus_vampyrus # 389056 Rattus_norvegicus # 360929 Rhinopithecus_roxellana # 366019 Saimiri_boliviensis # 336296 Sorex_araneus # 402603 Sus_scrofa # 357418 Theropithecus_gelada # 372558 Tupaia_chinensis # 195495 hg38 # load the resulting file, merely for measurement purposes here ssh hgwdev cd /hive/data/genomes/hg38/bed/cactus447way/frames time hgLoadMafFrames hg38 cactus447wayFrames cactus447wayFrames.bed.gz # real 3m14.979s hgsql -e 'select count(*) from cactus447wayFrames;' hg38 # +----------+ # | count(*) | # +----------+ # | 16063019 | # +----------+ time featureBits -countGaps hg38 cactus447wayFrames # 65486909 bases of 3299210039 (1.985%) in intersection # real 1m50.539s # enable the trackDb entries: # frames https://hgdownload.soe.ucsc.edu/goldenPath/hg38/cactus447way/cactus447wayFrames.bb # irows on # zoom to base level in an exon to see codon displays # appears to work OK # do not need this loaded table: hgsql hg38 -e 'drop table cactus447wayFrames;' ######################################################################### +# Try loading this track data into database to see if download files +# such as upstream gene mafs can be constructed + mkdir -p /gbdb/hg38/cactus447way/maf + cd /hive/data/genomes/hg38/bed/cactus447/iRows/result + ln -s `pwd`/chr*.maf /gbdb/hg38/cactus447way/maf/ + + # this generates an immense cactus447way.tab file in the directory + # where it is running. Best to run this over in scratch. + # This is going to take all day. 
+ cd /dev/shm + time hgLoadMaf -pathPrefix=/gbdb/hg38/cactus447way/maf hg38 cactus447way + +# -rw-rw-r-- 1 55422976 Aug 8 09:22 cactus447way.tab + + hgsql -e 'select count(*) from cactus447way;' hg38 + + # Loading cactus447way into database + # Loaded 108225530 mafs in 101 files from /gbdb/hg38/cactus447way/maf + + # real 440m30.144s + # user 404m46.183s + # sys 19m27.664s + +######################################################################### +# Phylogenetic tree from 447-way (DONE - 2022-08-26 - Hiram) + mkdir /hive/data/genomes/hg38/bed/cactus447/4d + cd /hive/data/genomes/hg38/bed/cactus447/4d + + # the annotated maf's are in: + ../iRows/result/chr*.maf + + # using ncbiRefSeq for hg38, only transcribed genes and nothing + # from the randoms and other misc. + hgsql -Ne "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq where cdsEnd > cdsStart;" hg38 \ + | egrep -E -v "chrM|chrUn|random|_alt|_fix" > ncbiRefSeq.gp + wc -l *.gp + # 130420 ncbiRefSeq.gp + + # verify it is only on the chroms: + cut -f2 ncbiRefSeq.gp | sort | uniq -c | sort -rn | sed -e 's/^/ # /;' + # 12841 chr1 + # 9518 chr2 + # 8490 chr3 + # 7521 chr19 + # 7519 chr11 + # 7329 chr17 + # 7287 chr12 + # 6155 chr6 + # 6151 chr10 + # 5905 chr7 + # 5561 chr5 + # 5504 chr9 + # 5351 chr4 + # 5235 chr16 + # 4964 chr8 + # 4296 chrX + # 4255 chr15 + # 3959 chr14 + # 3089 chr20 + # 2721 chr22 + # 2521 chr18 + # 2471 chr13 + # 1411 chr21 + # 366 chrY + + genePredSingleCover ncbiRefSeq.gp stdout | sort > ncbiRefSeqNR.gp + wc -l ncbiRefSeqNR.gp + # 19542 ncbiRefSeqNR.gp + + ssh ku + mkdir /hive/data/genomes/hg38/bed/cactus447/4d/run + cd /hive/data/genomes/hg38/bed/cactus447/4d/run + mkdir ../mfa + + # newer versions of msa_view have a slightly different operation + # the sed of the gp file inserts the reference species in the chr name + # these processes are going to be copying the whole chromosome maf file + # to /scratch/tmp/ - these are huge files, make 
sure /scratch/tmp/ + # on the kluster nodes is clean enough to manage the largest chromosome + # maf file, for example: +# -rw-rw-r-- 1 661829442743 Jun 20 18:55 chr1.maf +# -rw-rw-r-- 1 682410918389 Jun 21 16:21 chr2.maf + + cat << '_EOF_' > 4d.csh +#!/bin/csh -fe +set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2018-03-29/bin +set r = "/hive/data/genomes/hg38/bed/cactus447" +set c = $1 +set infile = $r/iRows/result/$2 +set outfile = $3 +cd /scratch/tmp +# 'clean' maf, removes all chrom names, leaves only the db name +perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf +awk -v C=$c '$2 == C {print}' $r/4d/ncbiRefSeqNR.gp | sed -e "s/\t$c\t/\thg38.$c\t/" > $c.gp +set NL=`wc -l $c.gp| gawk '{print $1}'` +if ("$NL" != "0") then + $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss + $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile +else + echo "" > $r/4d/run/$outfile +endif +rm -f $c.gp $c.maf $c.ss +'_EOF_' + # << happy emacs + chmod +x 4d.csh + + ls -1S /hive/data/genomes/hg38/bed/cactus447/iRows/result/chr*.maf \ + | sed -e "s#.*cactus447/iRows/result/##" \ + | egrep -E -v "chrM|chrUn|random|_alt|_fix" > maf.list + + printf '#LOOP +4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} +#ENDLOOP +' > template + + gensub2 maf.list single template jobList + para -ram=128g create jobList + para try ... check ... push ... etc... 
+ para time +# CPU time in finished jobs: 290959s 4849.31m 80.82h 3.37d 0.009 y +# IO & Wait Time: 17027s 283.79m 4.73h 0.20d 0.001 y +# Average job time: 15399s 256.65m 4.28h 0.18d +# Longest finished job: 36250s 604.17m 10.07h 0.42d +# Submission to last job: 36265s 604.42m 10.07h 0.42d + + # combine mfa files + ssh hgwdev + cd /hive/data/genomes/hg38/bed/cactus447/4d + # verify no tiny files: + ls -og mfa | sort -k3nr | tail -3 + # -rw-rw-r-- 1 2583636 Aug 8 10:30 chr21.mfa + # -rw-rw-r-- 1 1759815 Aug 8 09:47 chrY.mfa + + #want comma-less species.list + time /cluster/bin/phast.build/cornellCVS/phast.2018-03-29/bin/msa_view \ + --aggregate "`cat ../species.list.txt`" mfa/*.mfa | sed s/"> "/">"/ \ + > 4d.all.mfa + # real 0m9.416s + # -rw-rw-r-- 1 303080280 Aug 8 21:48 4d.all.mfa + + # check they are all in there: + grep "^>" 4d.all.mfa | wc -l + # 447 + + sed 's/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ + ../hg38.447way.nh + + sed 's/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ + ../hg38.447way.nh > tree-commas.nh + + # use phyloFit to create tree model (output is phyloFit.mod) + time /cluster/bin/phast.build/cornellCVS/phast.2018-03-29/bin/phyloFit \ + --EM --precision MED --msa-format FASTA --subst-mod REV \ + --tree tree-commas.nh 4d.all.mfa + # real 880m32.863s + + mv phyloFit.mod all.mod + + grep TREE all.mod +# TREE: (((((((((((((hg38:0.00899231,(Pan_paniscus:0.00233221, +# Pan_troglodytes:0.00214362):0.00450706):0.00200006, +# (Gorilla_gorilla:0.00120794, +# Gorilla_beringei:0.00131286):0.00731999):0.00848699, +# (Pongo_abelii:0.00175881,Pongo_pygmaeus:0.0021602):0.015206):0.00273536, +# (((((Hylobates_lar:0.000904688,Hylobates_pileatus:0.000921279):0.00271202, +# (((Hylobates_abbotti:0.00182513,Hylobates_muelleri:0.00188051):0.000678509, +# Hylobates_agilis:0.00217126):0.00068124, +# ... 
+# (Ceratotherium_simum:0.000693402, +# Ceratotherium_simum_cottoni:0.000665708):0.00612505):0.00973125):0.0392953):0.0100285):0.0323073):0.00299066):0.00383198):0.00906124):0.0213096):0.012194, +# (((Dasypus_novemcinctus:0.0705749,(Tolypeutes_matacus:0.036879, +# Chaetophractus_vellerosus:0.0367115):0.0193664):0.0421356, +# ((Tamandua_tetradactyla:0.0243613, +# Myrmecophaga_tridactyla:0.0216596):0.0822034, +# (Choloepus_didactylus:0.0057887, +# Choloepus_hoffmanni:0.00621468):0.0702582):0.018824):0.0529836, +# (((Trichechus_manatus:0.0621184,(Procavia_capensis:0.0107201, +# Heterohyrax_brucei:0.0105857):0.149548):0.00440411, +# Loxodonta_africana:0.0800445):0.0234193, +# ((((Microgale_talazaci:0.118008,Echinops_telfairi:0.077048):0.158928, +# Chrysochloris_asiatica:0.138211):0.0147081, +# Elephantulus_edwardii:0.20827):0.00712897, +# Orycteropus_afer:0.112233):0.00664842):0.0534765):0.012194); + + # compare these calculated lengths to what we started with + + /cluster/bin/phast/all_dists ../hg38.447way.nh | grep hg38 \ + | sed -e "s/hg38.//;" | sort > original.dists + + grep TREE all.mod | sed -e 's/TREE: //;' \ + | /cluster/bin/phast/all_dists /dev/stdin | grep hg38 \ + | sed -e "s/hg38.//;" | sort > hg38.dists + + # printing out the 'original', the 'new' the 'difference' and + # percent difference/delta + join original.dists hg38.dists | awk '{ + printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%8.6f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' | sort -k4n | cut -f2,3,4,6 | sort -k4,4n +Chiropotes_albinasus 0.044221 0.153018 -71.100786 +Hapalemur_gilberti 0.091144 0.238983 -61.861722 +Hapalemur_meridionalis 0.091511 0.239866 -61.849116 +Hapalemur_occidentalis 0.090759 0.237751 -61.826028 +Hapalemur_griseus 0.091379 0.239368 -61.824889 +Hapalemur_alaotrensis 0.091149 0.238685 -61.812012 +Eulemur_rufus 0.090624 0.236415 -61.667407 +Eulemur_fulvus 0.090508 0.236040 -61.655652 +Lemur_catta 0.090748 0.236490 -61.627130 +... 
+Tolypeutes_matacus 0.518057 0.332742 55.693300 +Chaetophractus_vellerosus 0.520177 0.332575 56.408930 +Catagonus_wagneri 0.578187 0.366272 57.857275 +Megaderma_lyra 0.575967 0.360862 59.608659 +Spermophilus_dauricus 0.530847 0.329338 61.186076 +Choloepus_didactylus 0.537227 0.329232 63.175815 +Vicugna_pacos 0.553177 0.338170 63.579561 +Choloepus_hoffmanni 0.539357 0.329658 63.611076 +Tursiops_truncatus 0.533317 0.325632 63.779051 +Myrmecophaga_tridactyla 0.589117 0.357048 64.996583 + + # also calculated with SSREV model: + mkdir /cluster/data/hg38/bed/cactus447/4dSSREV + cd /cluster/data/hg38/bed/cactus447/4dSSREV + cp -p ../4d/4d.all.mfa . + cp -p ../4d/tree-commas.nh . + time /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/phyloFit \ + --EM --precision MED --msa-format FASTA --subst-mod SSREV \ + --tree tree-commas.nh 4d.all.mfa + # real 1406m51.018s + + mv phyloFit.mod all.mod + grep BACK all.mod + # BACKGROUND: 0.254978 0.245022 0.245022 0.254978 + +######################################################################### +# phyloP for 447-way (DONE - 2017-11-06 - Hiram) +# + # split SS files into 1M chunks, this business needs smaller files + # to complete + + ssh ku + mkdir /hive/data/genomes/hg38/bed/cactus447/consPhyloP + cd /hive/data/genomes/hg38/bed/cactus447/consPhyloP + mkdir ss run.split + cd run.split + + # the annotated maf's are in: + ../iRows/result/chr*.maf + + printf '#!/bin/csh -ef +set c = $1 +set MAF = /hive/data/genomes/hg38/bed/cactus447/iRows/result/$c.maf +set WINDOWS = /hive/data/genomes/hg38/bed/cactus447/consPhyloP/ss/$c +set NL = `grep -m 10 -c -v "^#" $MAF` +if ( -s $2 ) then + exit 0 +endif +if ( -s $2.running ) then + exit 0 +endif + +date >> $2.running + +rm -fr $WINDOWS +mkdir -p $WINDOWS +pushd $WINDOWS > /dev/null +if ( $NL > 0 ) then +/cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/msa_split \ + $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000 +endif +popd > /dev/null +date >> $2 +rm -f 
$2.running +' > doSplit.csh + + chmod +x doSplit.csh + + # do the easy ones first to see some immediate results + ls -1SL -r ../../iRows/result | grep chr | sed -e "s/.maf//;" > maf.list + + # this needs a {check out line+ $(root1.done)} test for verification: + printf '#LOOP +./doSplit.csh $(root1) {check out line+ $(root1).done} +#ENDLOOP +' > template + + gensub2 maf.list single template jobList + # this was tricky, some of the jobs couldn't complete even with + # 128g, had to complete a number of these manually, and they + # take a long time, more than 12 hours for individual chromosomes + para -ram=128g create jobList + para try ... check ... push ... etc... + +# Completed: 79 of 101 jobs +# Crashed: 22 jobs +# CPU time in finished jobs: 8058s 134.29m 2.24h 0.09d 0.000 y +# IO & Wait Time: 380s 6.34m 0.11h 0.00d 0.000 y +# Average job time: 107s 1.78m 0.03h 0.00d +# Longest finished job: 3239s 53.98m 0.90h 0.04d +# Submission to last job: 5350s 89.17m 1.49h 0.06d + + # run phyloP with score=LRT + ssh ku + mkdir /cluster/data/hg38/bed/cactus447/consPhyloP + cd /cluster/data/hg38/bed/cactus447/consPhyloP + + mkdir run.phyloP + cd run.phyloP + # Adjust model file base composition background and rate matrix to be + # representative of the chromosomes in play + grep BACK ../../4dSSREV/all.mod + # BACKGROUND: 0.254978 0.245022 0.245022 0.254978 + # interesting, they are already paired up + + grep BACKGROUND ../../4dSSREV/all.mod | awk '{printf "%0.3f\n", $3 + $4}' + # 0.490 + /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/modFreqs \ + ../../4dSSREV/all.mod 0.490 > all.mod + # verify, the BACKGROUND should now be paired up: + grep BACK all.mod + # BACKGROUND: 0.255000 0.245000 0.245000 0.255000 + + printf '#!/bin/csh -fe +set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin +set f = $1 +set ssFile = $1:t +set out = $2 +set cName = $f:h +set n = $f:r:e +set grp = $cwd:t +set cons = /hive/data/genomes/hg38/bed/cactus447/consPhyloP +set tmp 
= $cons/tmp/$grp/$f +/bin/rm -fr $tmp +/bin/mkdir -p $tmp +set ssSrc = "$cons/ss/$cName/$ssFile" +set useGrp = "$grp.mod" +/bin/ln -s $cons/run.phyloP/$grp.mod $tmp +pushd $tmp > /dev/null +echo source: $ssSrc.ss +$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ + -i SS $useGrp $ssSrc.ss > $ssFile.wigFix +popd > /dev/null +/bin/mkdir -p $out:h +sleep 4 +/bin/touch $out:h +/bin/mv $tmp/$ssFile.wigFix $out +/bin/rm -fr $tmp +/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp +/bin/rmdir --ignore-fail-on-non-empty $cons/tmp +' > doPhyloP.csh + + chmod +x doPhyloP.csh + + # Create list of chunks + find ../ss -type f | sed -e "s/.ss$//; s#../ss/##;" > ss.list + # make sure the list looks good + wc -l ss.list + # 3031 ss.list + + # Create template file + # file1 == $chr/$chunk/file name without .ss suffix + printf '#LOOP +../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} +#ENDLOOP +' > template + + ###################### Running all species ####################### + # setup run for all species + mkdir /hive/data/genomes/hg38/bed/cactus447/consPhyloP/all + cd /hive/data/genomes/hg38/bed/cactus447/consPhyloP/all + mkdir wigFix + + gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList + # beware overloading the cluster with these quick and high I/O jobs + para -ram=64g create jobList + para try ... check ... 
+ para -maxJob=16 push + para time > run.time +Completed: 3031 of 3031 jobs +CPU time in finished jobs: 28585172s 476419.53m 7940.33h 330.85d 0.906 y +IO & Wait Time: 132590s 2209.83m 36.83h 1.53d 0.004 y +Average job time: 9475s 157.91m 2.63h 0.11d +Longest finished job: 13733s 228.88m 3.81h 0.16d +Submission to last job: 38568s 642.80m 10.71h 0.45d + + mkdir downloads + time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` +do + echo "working: $D" 1>&2 + find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ + | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ + | gzip -c > downloads/${D}.phyloP447way.wigFix.gz +done + # real 34m31.522s + + du -hsc downloads + # 5.4G downloads + + # check integrity of data with wigToBigWig + zcat downloads/*.wigFix.gz > all.wigFix + time (wigToBigWig -verbose=2 all.wigFix /hive/data/genomes/hg38/chrom.sizes \ + phyloP447way.bw) > bigWig.log 2>&1 + + egrep "real|VmPeak" bigWig.log + # pid=236037: VmPeak: 33037308 kB + # real 25m42.759s + + bigWigInfo phyloP447way.bw | sed -e 's/^/# /;' +# version: 4 +# isCompressed: yes +# isSwapped: 0 +# primaryDataSize: 7,824,240,203 +# primaryIndexSize: 90,693,508 +# zoomLevels: 10 +# chromCount: 98 +# basesCovered: 2,874,168,445 +# mean: -0.022678 +# min: -20.000000 +# max: 8.123000 +# std: 1.692685 + + # encode those files into wiggle data +# time (zcat downloads/*.wigFix.gz \ + wigEncode all.wigFix phyloP447way.wig phyloP447way.wib +# Converted all.wigFix, upper limit 8.12, lower limit -20.00 +# real 8m8.231s + + +# -rw-rw-r-- 1 2874168445 Aug 30 14:08 phyloP447way.wib +# -rw-rw-r-- 1 298978228 Aug 30 14:08 phyloP447way.wig + + du -hsc *.wi? + # 2.7G phyloP447way.wib + # 286M phyloP447way.wig + + # Load gbdb and database with wiggle. 
+ rm -f /gbdb/hg38/cactus447way/phyloP447way.wib + ln -s `pwd`/phyloP447way.wib /gbdb/hg38/cactus447way/phyloP447way.wib + time hgLoadWiggle -pathPrefix=/gbdb/hg38/cactus447way hg38 \ + phyloP447way phyloP447way.wig + # real 0m27.533s + + # use to set trackDb.ra entries for wiggle min and max + # and verify table is loaded correctly + + wigTableStats.sh hg38 phyloP447way +# db.table min max mean count sumData +hg38.phyloP447way -20 8.123 -0.022678 2874168445 -6.51804e+07 +# 1.69269 viewLimits=-8.4861:8.123 +# stdDev viewLimits + + # that range is: 20+8.123= 28.123 for hBinSize=0.028123 + + # Create histogram to get an overview of all the data + time hgWiggle -doHistogram \ + -hBinSize=0.028123 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ + -db=hg38 phyloP447way > histogram.data 2>&1 + # real 1m39.861s + + + # x-axis range: + grep -v chrom histogram.data | grep "^[0-9]" | ave -col=2 stdin \ + | sed -e 's/^/# /;' +# Q1 -10.086650 +# median -4.012075 +# Q3 2.062495 +# average -4.017054 +# min -20.000000 +# max 8.123000 +# count 864 +# total -3470.734684 +# standard deviation 7.028195 + + # find out the range for the 2:5 graph + grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \ + | sed -e 's/^/# /;' +# Q1 0.000016 +# median 0.000074 +# Q3 0.000431 +# average 0.001157 +# min 0.000000 +# max 0.025240 +# count 864 +# total 1.000001 +# standard deviation 0.003283 + + # create plot of histogram: + printf 'set terminal pngcairo size 1200,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb" +set output "hg38.phyloP447.histo.png" +set size 1.0, 1.0 +set style line 1 lt 2 lc rgb "#ff88ff" lw 2 +set style line 2 lt 2 lc rgb "#66ff66" lw 2 +set style line 3 lt 2 lc rgb "#ffff00" lw 2 +set style line 4 lt 2 lc rgb "#ffffff" lw 2 +set border lc rgb "#ffff00" +set key left box ls 3 +set key tc variable +set grid noxtics +set grid ytics ls 4 +set y2tics border nomirror out tc rgb "#ffffff" +set ytics border nomirror out tc rgb "#ffffff" +set title 
" Human hg38 Histogram phyloP447way track" tc rgb "#ffffff" +set xlabel " hg38.phyloP447way score" tc rgb "#ffffff" +set ylabel " Relative Frequency" tc rgb "#ff88ff" +set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66" +set xrange [-4:2.5] +set yrange [0:0.04] +set y2range [0:1] + +plot "histogram.data" using 2:5 title " RelFreq" with impulses ls 1, \ + "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ls 2 +' | gnuplot + + # verify it looks sane + display hg38.phyloP447.histo.png & + # it now shows the spike at -1.000 for the artifical filling of + # no data available. + + ###################### Running primate species ####################### + # computing 4d-sites phyloFit for primates + mkdir /cluster/data/hg38/bed/cactus447/4d/primates + cd /cluster/data/hg38/bed/cactus447/4d/primates + # the primates.list.txt was obtained from the hg38.447way.nh + # tree by taking the first 243 in that tree, ending in: + head -243 ../../hg38.447way.nh | tail -1 + Galagoides_demidoff:0.00802126):0.0124128):0.0334321):0.0146219):0.105, + + head -243 ../../hg38.447way.nh | sed -e 's/^ \+//;' \ + | tr -d '(' | sed -e 's/:.*//;' | sort > primates.list.txt + + # then only using the mfa sequences for those specific sequences + ls ../mfa/chr*.mfa | while read C +do + B=`basename $C` + faSomeRecords "${C}" primates.list.txt ${B} + printf "%s\n" "${B}" +done + # resulting in: + faSize chr*.mfa +# 151201516 bases (18902 N's 151182614 real 151182614 upper 0 lower) +# in 5832 sequences in 24 files + + # then running phyloFit: + sed 's/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' primates.nh > tree-commas.nh + + # note: using the SSREV model here: + /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/phyloFit \ + --EM --precision MED --msa-format FASTA --subst-mod SSREV \ + --tree tree-commas.nh primates.mfa + # rename the result + mv phyloFit.mod primates.mod + + # adjust the frequencies + cd /cluster/data/hg38/bed/cactus447/consPhyloP/run.phyloP + grep BACK 
../../4d/primates/primates.mod + # BACKGROUND: 0.261935 0.238065 0.238065 0.261935 + # interesting, they are already paired up + + grep BACK ../../4d/primates/primates.mod | awk '{printf "%0.3f\n", $3 + $4}' + # 0.476 + /cluster/bin/phast.build/cornellCVS/phast.2021-06-01/bin/modFreqs \ + ../../4d/primates/primates.mod 0.476 > primates.mod + # verify, the BACKGROUND should now be paired up: + grep BACK all.mod + # BACKGROUND: 0.262000 0.238000 0.238000 0.262000 + + # setup run for primate species + mkdir /hive/data/genomes/hg38/bed/cactus447/consPhyloP/primates + cd /hive/data/genomes/hg38/bed/cactus447/consPhyloP/primates + mkdir wigFix + + gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList + para -ram=6g create jobList + para try ... check ... + para -maxJob=16 push + para time > run.time +Completed: 2438 of 2438 jobs +CPU time in finished jobs: 18916928s 315282.13m 5254.70h 218.95d 0.600 y +IO & Wait Time: 68909s 1148.49m 19.14h 0.80d 0.002 y +Average job time: 7787s 129.79m 2.16h 0.09d +Longest finished job: 12050s 200.83m 3.35h 0.14d +Submission to last job: 26707s 445.12m 7.42h 0.31d + + mkdir downloads + time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` +do + echo "working: $D" 1>&2 + find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ + | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ + | gzip -c > downloads/${D}.phyloP447wayPrimates.wigFix.gz +done + # real 33m18.006s + + du -hsc downloads + # 4.9G downloads + + # check integrity of data with wigToBigWig + zcat downloads/*.wigFix.gz > primates.wigFix + time (wigToBigWig -verbose=2 primates.wigFix \ + /hive/data/genomes/hg38/chrom.sizes \ + phyloP447wayPrimates.bw) > bigWig.log 2>&1 + + egrep "real|VmPeak" bigWig.log + # pid=236355: VmPeak: 33037308 kB + # real 26m17.042s + + bigWigInfo phyloP447wayPrimates.bw | sed -e 's/^/# /;' +# version: 4 +# isCompressed: yes +# isSwapped: 0 +# primaryDataSize: 6,954,238,655 +# primaryIndexSize: 90,693,508 +# 
zoomLevels: 10 +# chromCount: 98 +# basesCovered: 2,874,168,445 +# mean: -0.066955 +# min: -20.000000 +# max: 1.587000 +# std: 1.479093 + + # encode those files into wiggle data +# time (zcat downloads/*.wigFix.gz \ + time wigEncode primates.wigFix phyloP447wayPrimates.wig phyloP447wayPrimates.wib +# Converted primates.wigFix, upper limit 1.59, lower limit -20.00 +# real 9m17.113s +XXX - running - Wed Aug 30 13:59:09 PDT 2023 + +# -rw-rw-r-- 1 2874168445 Aug 30 14:08 phyloP447wayPrimates.wib +# -rw-rw-r-- 1 320006223 Aug 30 14:08 phyloP447wayPrimates.wig + + du -hsc *.wi? + # 2.7G phyloP447way.wib + # 306M phyloP447way.wig + + # Load gbdb and database with wiggle. + rm -f /gbdb/hg38/cactus447way/phyloP447wayPrimates.wib + ln -s `pwd`/phyloP447wayPrimates.wib /gbdb/hg38/cactus447way/phyloP447wayPrimates.wib + time hgLoadWiggle -pathPrefix=/gbdb/hg38/cactus447way hg38 \ + phyloP447wayPrimates phyloP447wayPrimates.wig + # real 0m27.140s + + # use to set trackDb.ra entries for wiggle min and max + # and verify table is loaded correctly + + wigTableStats.sh hg38 phyloP447wayPrimates +# db.table min max mean count sumData +hg38.phyloP447wayPrimates -20 1.587 -0.0669553 2874168445 -1.92441e+08 +# 1.47909 viewLimits=-7.46242:1.587 +# stdDev viewLimits + + # that range is: 20+1.587= 21.587 for hBinSize=0.021587 + + # Create histogram to get an overview of all the data + time hgWiggle -doHistogram \ + -hBinSize=0.021587 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ + -db=hg38 phyloP447wayPrimates > histogram.data 2>&1 + # real 1m30.401s + # x-axis range: + grep -v chrom histogram.data | grep "^[0-9]" | ave -col=2 stdin \ + | sed -e 's/^/# /;' +# Q1 -11.645800 +# median -7.231290 +# Q3 -2.816750 +# average -7.238782 +# min -20.000000 +# max 1.565410 +# count 818 +# total -5921.323540 +# standard deviation 5.110470 + + # find out the range for the 2:5 graph + grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \ + | sed -e 's/^/# /;' +# Q1 0.000007 +# median 0.000032 
+# Q3 0.000340 +# average 0.001237 +# min 0.000001 +# max 0.025761 +# count 818 +# total 1.012033 +# standard deviation 0.003386 + + # create plot of histogram: + printf 'set terminal pngcairo size 1200,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb" +set output "hg38.phyloP447Primates.histo.png" +set size 1.0, 1.0 +set style line 1 lt 2 lc rgb "#ff88ff" lw 2 +set style line 2 lt 2 lc rgb "#66ff66" lw 2 +set style line 3 lt 2 lc rgb "#ffff00" lw 2 +set style line 4 lt 2 lc rgb "#ffffff" lw 2 +set border lc rgb "#ffff00" +set key left box ls 3 +set key tc variable +set grid noxtics +set grid ytics ls 4 +set y2tics border nomirror out tc rgb "#ffffff" +set ytics border nomirror out tc rgb "#ffffff" +set title " Human hg38 Histogram phyloP447wayPrimates track" tc rgb "#ffffff" +set xlabel " hg38.phyloP447wayPrimates score" tc rgb "#ffffff" +set ylabel " Relative Frequency" tc rgb "#ff88ff" +set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66" +set xrange [-3:2] +set yrange [0:0.04] +set y2range [0:1] + +plot "histogram.data" using 2:5 title " RelFreq" with impulses ls 1, \ + "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ls 2 +' | gnuplot + + # verify it looks sane + display hg38.phyloP447Primates.histo.png & + # it now shows the spike at -1.000 for the artificial filling of + # no data available. + +#############################################################################