src/hg/makeDb/doc/hg38/cactus447.txt 5b1483d60866e637703c920a8ff02516b52d53a8

5b1483d60866e637703c920a8ff02516b52d53a8
hiram
  Mon Aug 7 13:25:52 2023 -0700
add the frames procedure refs #31528

diff --git src/hg/makeDb/doc/hg38/cactus447.txt src/hg/makeDb/doc/hg38/cactus447.txt
index 3bddbf2..a21cfba 100644
--- src/hg/makeDb/doc/hg38/cactus447.txt
+++ src/hg/makeDb/doc/hg38/cactus447.txt
@@ -1,305 +1,749 @@
 
 ################################################################################
 #### 2023-06-13 - fetch maf.gz - Hiram
 
 mkdir -p /hive/data/genomes/hg38/bed/cactus447/singleCover
 cd /hive/data/genomes/hg38/bed/cactus447/singleCover
 
 obtained maf file from Glenn, from the 'courtyard' login:
 
 /nanopore/cgl/glenn-scratch/new-maf/447-mammalian-2022v1-single-copy.maf.gz
 
 -rw-r--r-- 1 hickey cgl 881994218891 Jun  5 11:06 /nanopore/cgl/glenn-scratch/new-maf/447-mammalian-2022v1-single-copy.maf.gz
 
 There was some go around with a different version.  This one
 is 'single coverage'
 
 -rw-r--r-- 1 881994218891 Jun  5 11:06 447-mammalian-2022v1-single-copy.maf.gz
 
 ################################################################################
 #### split this file into individual chromosome files 2023-06-21 - Hiram
 
   mkdir  /hive/data/genomes/hg38/bed/cactus447/splitSingleCover
   cd  /hive/data/genomes/hg38/bed/cactus447/splitSingleCover
 
 time mafSplit -byTarget -useFullSequenceName /dev/null ./  ../singleCover/447-mammalian-2022v1-single-copy.maf.gz
 
 Splitting 1 files by target sequence -- ignoring first argument /dev/null
 
 real    804m35.124s
 user    1061m9.580s
 sys     101m19.894s
 
 
 ################################################################################
 ### convert the sequence names to hg38 chrom names 2023-06-12 to 06-14 - Hiram
 
   mkdir  /hive/data/genomes/hg38/bed/cactus447/ucscNames
   cd  /hive/data/genomes/hg38/bed/cactus447/ucscNames
 
    runOne script for processing each file:
 
 #!/bin/bash
 
 set -beEu -o pipefail
 
 export cactusMaf="${1}"
 export srcMaf="../splitSingleCover/${cactusMaf}"
 
 cactusId=${cactusMaf%.maf}
 
 export sedCommand=`grep -w "${cactusId}" cactus.to.ucsc.sed`
 export ucscName=`grep -w "${cactusId}" 2022v1.hg38.chrom.equiv.txt | cut -f2`
 
 case "${ucscName}" in
     chrM)
 if [ ${srcMaf} -nt ${ucscName}.maf ]; then
   sed -f cactus.to.ucsc.sed ${srcMaf} > ${ucscName}.maf
   touch -r ${srcMaf} ${ucscName}.maf
 fi
        ;;
     chr[0-9][0-9])
        rm -f "${ucscName}.maf"
        ln ${srcMaf} "${ucscName}.maf"
        ;;
     chr[0-9])
        rm -f "${ucscName}.maf"
        ln ${srcMaf} "${ucscName}.maf"
        ;;
     G*.maf)
     K*.maf)
 if [ ${srcMaf} -nt ${ucscName}.maf ]; then
   sed -e "${sedCommand}" ${srcMaf} > ${ucscName}.maf
   touch -r ${srcMaf} ${ucscName}.maf
 else
   printf "DONE: ${ucscName}.maf\n" 1>&2
 fi
        ;;
 esac
 
 if [ "${ucscName}.maf" -nt "nameScan/${ucscName}.seqCount.txt" ]; then
   grep "^s." "${ucscName}.maf" | cut -d' ' -f2 | sort | uniq -c | sort -rn \
     > "nameScan/${ucscName}.seqCount.txt"
   touch -r "${ucscName}.maf" "nameScan/${ucscName}.seqCount.txt"
   sed -e "s/^ \+//;" "nameScan/${ucscName}.seqCount.txt" | cut -d' ' -f2 \
     | cut -d'.' -f1 | sort | uniq -c \
        | sort -rn > "nameScan/${ucscName}.species.txt"
 fi
 #####################################################
 
      template:
 #LOOP
 runOne $(path1)
 #ENDLOOP
 
      using list of maf files:
      ls ../splitSingleCover  | grep maf > cactus.maf.list
 
      gensub2 cactus.maf.list single template jobList
 
      ### made mistake of running too many of those at the same time
      ### and their use of large amounts of memory caused hgwdev to
      ### hang up and needed a reboot to recover.  So, run these carefully
      ### only a couple at a time on hgwdev and on hgcompute-01, the large
      ### ones get as large as 800 Gb in memory.
 
      ### get the species names from the nameScan:
 
    sed -e 's/^ \+//;' nameScan/chr2.species.txt | cut -d' ' -f2 \
       | sort > ../species.list.txt
 
 ################################################################################
 ### get the chrom.sizes out of the maf files 2023-06-19 - Hiram
 
    mkdir /hive/data/genomes/hg38/bed/cactus447/chromSizes
    cd /hive/data/genomes/hg38/bed/cactus447/chromSizes
 
    ### runOne script to run each one:
 
 #!/bin/bash
 
 export inMaf="${1}"
 export result="${2}"
 
 grep "^s " "${inMaf}" | awk '{printf "%s\t%s\n", $2,$6}' | sort -u > "${result}
 
     ### working on the maf list:
     ls ../ucscNames/chr*.maf > maf.list
 
      ### template to construct jobList
 #LOOP
 runOne $(path1) {check out line+ result/$(root1).txt}
 #ENDLOOP
 
      gensub2 maf.list single template jobList
 
      para -ram=6g create jobList
 
 Completed: 101 of 101 jobs
 CPU time in finished jobs:      90801s    1513.35m    25.22h    1.05d  0.003 y
 IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
 Average job time:                 762s      12.69m     0.21h    0.01d
 Longest finished job:            7952s     132.53m     2.21h    0.09d
 Submission to last job:          8022s     133.70m     2.23h    0.09d
 
     ### sizes.pl script to get the answer out of the result/*.txt files:
 
 #!/usr/bin/env perl
 
 use strict;
 use warnings;
 
 my %chromSizes; # key is assembly name, value is a pointer to a hash
                 #  with key chromName, value size
 
 open (my $FH, "-|", "ls result/*.txt") or die "can not read ls result/*.txt";
 while (my $file = <$FH>) {
   chomp $file;
   printf STDERR "# reading: %s\n", $file;
   open (my $R, "<", $file) or die "can not read $file";
   while (my $line = <$R>) {
     chomp $line;
     my @a = split('\.', $line);
     if (! defined($chromSizes{$a[0]})) {
        my %h;
        $chromSizes{$a[0]} = \%h;
     }
     my $asm = $chromSizes{$a[0]};
     if (scalar(@a) == 3) {
       my ($a2, $size) = split('\s+', $a[2]);
       my $chr = "$a[1].$a2";
       if (defined($asm->{$chr})) {
          if ($asm->{$chr} != $size) {
            printf STDERR "ERROR: size mismatch: %d != %d for %s\t%s.%s\n", $asm->{$chr}, $size, $a[0], $a[1], $a[2];
            exit 255;
          }
       } else {
          $asm->{$chr} = $size;
       }
 #      printf "%s\t%s.%s\n", $a[0], $a[1], $a[2];
     } elsif (scalar(@a) == 2) {
       my ($chr, $size) = split('\s+', $a[1]);
       if (defined($asm->{$chr})) {
          if ($asm->{$chr} != $size) {
            printf STDERR "ERROR: size mismatch: %d != %d for %s\t%s\n", $asm->{$chr}, $size, $a[0], $a[1];
            exit 255;
          }
       } else {
          $asm->{$chr} = $size;
       }
 #      printf "%s\t%s\n", $a[0], $a[1];
     } else {
       printf STDERR "ERROR: not 2 or 3: '%s'\n", $line;
       exit 255;
     }
   }
   close ($R);
 }
 close ($FH);
 
 foreach my $asm (sort keys %chromSizes) {
    printf STDERR "# output %s\n", $asm;
    my $out = "sizes/${asm}.chrom.sizes";
    open ($FH, "|-", "sort -k2,2nr > $out") or die "can not write to $out";
    my $h = $chromSizes{$asm};
    foreach my $chr (sort keys %$h) {
 #      printf  "%s\t%s\t%d\n", $asm, $chr, $h->{$chr};
       printf $FH "%s\t%d\n", $chr, $h->{$chr};
    }
    close ($FH);
 }
 
     mkdir sizes
 
     time (./sizes.pl) > sizes.log 2>&1
 real    10m11.808s
 user    6m40.128s
 sys     3m37.475s
 
 
 ################################################################################
 ### add the iRow annotation 2023-06-19 to 2023-06-26 - Hiram
 
   mkdir /hive/data/genomes/hg38/bed/cactus447/iRows
   cd /hive/data/genomes/hg38/bed/cactus447/iRows
 
   # obtained the N.bed files from Glenn
   ##  symLink them into this directory:
 
 ls /hive/data/genomes/hg38/bed/cactus447/Nbeds/*.N.bed | grep -v "hg38.N.bed" \
    | while read P
 do
   B=`basename $P | sed -e 's/N.bed/bed/;'`
   rm -f ./${P}
   ln -s "${P}" ./${B}
 done
 
   ### and symlink in the chrom.sizes files as .len files
 
   ls /hive/data/genomes/hg38/bed/cactus447/chromSizes/sizes/*.chrom.sizes \
  | grep -v hg38.chrom.sizes | while read P
 do
   S=`basename ${P} | sed -e 's/chrom.sizes/len/;'`
   printf "%s\n" "${S}"
   ln -s "${P}" ./${S}
 done
 
 ln -s /hive/data/genomes/hg38/chrom.sizes hg38.len
 
    ls *.bed > nBeds
    ls *.len > sizes
 
    ### list of maf file to process:
 
    ls ../ucscNames/chr*.maf > maf.list
 
    ### template file to construct jobList
 #LOOP
 mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/hg38/hg38.2bit result/$(file1)
 #ENDLOOP
 
    gensub2 maf.list single template jobList
 
    ### took a week to get these all done.
    ### convert each file to a bigMaf file:
 
    mkdir /hive/data/genomes/hg38/bed/cactus447/iRows/bigMaf
    cd /hive/data/genomes/hg38/bed/cactus447/iRows/bigMaf
 
    ### tried to sort them, but made a mistake on the sort -k argument:
    for F in ../result/chr*.maf
 do
    B=`basename $F | sed -e 's/.maf//;'`
    mafToBigMaf hg38 "${F}" stdout | $HOME/bin/x86_64/gnusort -k2,2 -S1000G \
         --parallel=32 > "${B}.bigMaf"
 done
 
 real    1588m59.209s
 user    1259m38.364s
 sys     371m13.448s
 
    ### put these all together:
    ls -S chr*.bigMaf | while read M
 do
   cat "${M}"
 done > ../hg38.cactus447.bigMaf
 
 real    322m25.741s
 user    0m13.577s
 sys     82m41.509s
 
    ### since they were sorted incorrectly above, sort this one huge file:
 
    $HOME/bin/x86_64/gnusort -S500G --parallel=64 -k1,1 -k2,2n \
      hg38.cactus447.bigMaf > hg38.sorted447way.bigMaf
 real    752m12.425s
 user    47m34.825s
 sys     371m7.394s
 
 ################################################################################
 ### make the bigBed file from the sorted bigMaf 2023-06-27
 
 bedToBigBed -verbose=2 -itemsPerSlot=4 -type=bed3+1 \
  -as=$HOME/kent/src/hg/lib/bigMaf.as -tab iRows/hg38.sorted447way.bigMaf \
      /hive/data/genomes/hg38/chrom.sizes hg38.cactus447way.bb
-XXX running - Tue Jun 27 14:16:31 PDT 2023
+
+# pid=61008: VmPeak:     5682928 kB
+# real    4157m14.091s
+
+# that time is 69 hours
+
+##############################################################################
+# liftOver files to convert sequence names (DONE - 2023-08-04 - Hiram)
+#
+#   The sequence names in the assemblies in the alignment do not match
+#   the sequence names in the corresponding genome browsers assemblies
+
+    mkdir /hive/data/genomes/hg38/bed/cactus447/liftOver
+    cd /hive/data/genomes/hg38/bed/cactus447/liftOver
+    mkdir lifts
+    ln -s ../idKeys/447way.equivalent.txt .
+
+#  using this script to construct liftOver files
+#!/bin/bash
+
+TOP="/hive/data/genomes/hg38/bed/cactus447/liftOver"
+
+cd "${TOP}"
+mkdir -p lifts
+
+cat 447way.equivalent.txt | while read L
+do
+  seqName=`printf "%s" "${L}" | cut -f1`
+  asmId=`printf "%s" "${L}" | cut -f2`
+  seqIdKeys="/hive/data/genomes/hg38/bed/cactus447/idKeys/${seqName}/${seqName}.idKeys.txt"
+  case ${asmId} in
+      GCA_* | GCF_* )
+        gcX="${asmId:0:3}"
+        d0="${asmId:4:3}"
+        d1="${asmId:7:3}"
+        d2="${asmId:10:3}"
+    ab="/hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmId}"
+        buildDir=`realpath "${ab}"`
+        asmKeys="${buildDir}/trackData/idKeys/${asmId}.idKeys.txt"
+        asmSizes="${buildDir}/${asmId}.chrom.sizes"
+#        asmSizes="${buildDir}/idKeys/chrom.sizes"
+ls -og "${asmKeys}"
+        ;;
+     [a-zA-Z0-9]*)
+        asmKeys="/hive/data/genomes/${asmId}/bed/idKeys/${asmId}.idKeys.txt"
+        asmSizes="/hive/data/genomes/${asmId}/chrom.sizes"
+        ;;
+     *)  printf "ERROR not recognized $asmId\n" 1>&2
+         exit 255
+       ;;
+  esac
+  seqKeys="/hive/data/genomes/hg38/bed/cactus447/idKeys/${seqName}/${seqName}.idKeys.txt"
+  seq2Bit="/hive/data/genomes/hg38/bed/cactus447/2bitFiles/${seqName}.2bit"
+  if [ ! -s  "${seqKeys}" ]; then
+    printf "can not find seq idKeys\n%s\n" "${seqKeys}" 1>&2
+    exit 255
+  fi
+  if [ ! -s  "${asmKeys}" ]; then
+    printf "can not find asm idKeys\n%s\n" "${asmKeys}" 1>&2
+    exit 255
+  fi
+  printf "%s\t%s\n" "${asmId}" "${seqName}" 1>&2
+printf "lifts/${seqName}.${asmId}.lift\n" 1>&2
+printf "$seqKeys\n" 1>&2
+printf "$asmKeys\n" 1>&2
+printf "$asmSizes\n" 1>&2
+  if [ ! -s "lifts/${seqName}.${asmId}.lift" ]; then
+  join -t$'\t' "${seqKeys}" "${asmKeys}" | sort -k3,3 \
+    | join -t$'\t' -2 3 <(sort -k1,1 ${asmSizes}) - \
+      | awk -F$'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1, $2, $4, $2}' \
+         > lifts/${seqName}.${asmId}.lift
+  fi
+done
+
+    # a couple of those ended up with duplicate names in the liftOver
+    # due to duplicate contigs in the assemblies.  They were edited
+    # manually, one was really bad and needed this perl script to clean it:
+
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my %idDone;     # key is sequence name, value is string of names used
+
+open (my $fh, "<", "lifts/Otolemur_garnettii.otoGar3.lift") or die "can not read lifts/Otolemur_garnettii.otoGar3.lift";
+while (my $line = <$fh>) {
+  chomp $line;
+  my @a = split('\t', $line);
+  if (!defined($idDone{$a[1]})) {
+    $idDone{$a[1]} = $a[3];
+    printf "%s\n", $line;
+  } else {
+    $idDone{$a[1]} .= "," . $a[3];
+  }
+}
+close ($fh);
+
+foreach my $seq (sort keys %idDone) {
+  printf STDERR "%s\t%s\n", $seq, $idDone{$seq};
+}
+
+##############################################################################
+# MAF FRAMES (working - 2023-08-01 - Hiram)
+    ssh hgwdev
+    mkdir /hive/data/genomes/hg38/bed/cactus447/frames
+    cd /hive/data/genomes/hg38/bed/cactus447/frames
+
+    mkdir genes
+
+#   survey all the genomes to find out what kinds of gene tracks they have
+
+printf '#/bin/bash
+
+egrep -v "GCA_|GCF_" ../idKeys/447way.equivalent.txt | cut -f2 | while read db
+do
+  printf "# ${db}:"
+  for table in `hgsql -N -e "show tables;" $db | egrep "Gene|ncbiRefSeq" | egrep -v "Curated|Cds|Link|Other|PepTable|Predicted|Psl|wgEncode|gencode|uuGene|ToGeneName|ccds|encode|Ext|Old|Pep|Select|Bak|Verify|orfeome|Multi|transMap|Pfam|knownGeneMrna|sgpGene"`
+  do
+     count=`hgsql -N -e "select count(*) from $table;" $db`
+     printf " %%s: %%s" "${table}" "${count}"
+  done
+  printf "\\n"
+done
+' > showGenes.sh
+
+
+    chmod +x ./showGenes.sh
+    ./showGenes.sh > gene.survey.txt 2>&1 &
+    # most of them have ncbiRefSeq
+    grep ncbiRefSeq gene.survey.txt  | cut -d' ' -f2 | sed -e 's/://;' \
+       | sort | xargs echo | fold -s -w 72
+# bosMut1 canFam4 chiLan1 colAng1 dipOrd2 eleEdw1 felCat8 fukDam1 micOch1
+# mm10 myoBra1 myoDav1 myoLuc2 odoRosDiv1 orcOrc1 otoGar3 proCoq1 pteAle1
+# rheMac10 sorAra2 speTri2 susScr3 tarSyr2 tupChi1
+
+    # a couple do not:
+    grep -v ncbiRefSeq gene.survey.txt  | cut -d' ' -f2 | sed -e 's/://;' \
+       | sort | xargs echo | fold -s -w 72
+# cavApe1 eidHel1 ptePar1
+    # but they do have ensGene or augustusGene:
+    egrep "cavApe1|eidHel1|ptePar1" gene.survey.txt
+# cavApe1: augustusGene: 10114 ensGene: 22510 xenoRefGene: 322224
+# eidHel1: augustusGene: 49674
+# ptePar1: augustusGene: 68613
+
+    #   1. ncbiRefSeq  - add hg38 here manually since it didn't come in above
+#    for db in hg38
+# do
+#    asmName="${db}"
+
+    for db in bosMut1 canFam4 chiLan1 colAng1 dipOrd2 eleEdw1 felCat8 fukDam1 micOch1 mm10 myoBra1 myoDav1 myoLuc2 odoRosDiv1 orcOrc1 otoGar3 proCoq1 pteAle1 rheMac10 sorAra2 speTri2 susScr3 tarSyr2 tupChi1
+do
+    asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1`
+    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq" ${db} \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > genes/${asmName}.gp.gz
+    echo -n "# ${asmName}: "
+    genePredCheck -db=${db} genes/${asmName}.gp.gz
+done
+# hg38: checked: 22882 failed: 0
+# Bos_mutus: checked: 20457 failed: 0
+# CanFam4: checked: 21143 failed: 0
+# Chinchilla_lanigera: checked: 20482 failed: 0
+# Colobus_angolensis: checked: 20042 failed: 0
+# Dipodomys_ordii: checked: 19738 failed: 0
+# Elephantulus_edwardii: checked: 20384 failed: 0
+# Felis_catus: checked: 20051 failed: 0
+# Fukomys_damarensis: checked: 19811 failed: 0
+# Microtus_ochrogaster: checked: 20048 failed: 0
+# Mus_musculus: checked: 23375 failed: 0
+# Myotis_brandtii: checked: 19949 failed: 0
+# Myotis_davidii: checked: 18815 failed: 0
+# Myotis_lucifugus: checked: 19895 failed: 0
+# Odobenus_rosmarus: checked: 19346 failed: 0
+# Orcinus_orca: checked: 19136 failed: 0
+# Otolemur_garnettii: checked: 19536 failed: 0
+# Propithecus_coquerelli: checked: 19356 failed: 0
+# Pteropus_alecto: checked: 18326 failed: 0
+# Macaca_mulatta: checked: 21021 failed: 0
+# Sorex_araneus: checked: 19160 failed: 0
+# Ictidomys_tridecemlineatus: checked: 19892 failed: 0
+# Sus_scrofa: checked: 24180 failed: 0
+# Carlito_syrichta: checked: 19968 failed: 0
+# Tupaia_chinensis: checked: 21047 failed: 0
+
+    #   2. ensGene
+    for db in cavApe1
+do
+asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1`
+hgsql -N -e "select
+name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds
+from ensGene" ${db} \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > /dev/shm/${db}.tmp.gz
+    mv /dev/shm/${db}.tmp.gz genes/${asmName}.gp.gz
+    echo -n "# ${asmName}: "
+    genePredCheck -db=${db} genes/${asmName}.gp.gz
+done
+# Cavia_aperea: checked: 14182 failed: 0
+
+    #   3. augustusGene
+    for db in eidHel1 ptePar1
+do
+asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1`
+hgsql -N -e "select
+name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds
+from augustusGene" ${db} \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > /dev/shm/${db}.tmp.gz
+    mv /dev/shm/${db}.tmp.gz genes/${asmName}.gp.gz
+    echo -n "# ${asmName}: "
+    genePredCheck -db=${db} genes/${asmName}.gp.gz
+done
+# Eidolon_helvum: checked: 31495 failed: 0
+# Pteronotus_parnellii: checked: 47678 failed: 0
+
+    # verify counts for genes are reasonable:
+    for T in genes/*.gz
+do
+    printf "# $T: "
+    zcat $T | cut -f1 | sort -u | wc -l
+done
+# genes/Bos_mutus.gp.gz: 20457
+# genes/CanFam4.gp.gz: 21143
+# genes/Canis_lupus_familiaris.gp.gz: 19812
+# genes/Carlito_syrichta.gp.gz: 19968
+# genes/Cavia_aperea.gp.gz: 14182
+# genes/Cercocebus_atys.gp.gz: 20534
+# genes/Chinchilla_lanigera.gp.gz: 20481
+# genes/Colobus_angolensis.gp.gz: 20042
+# genes/Condylura_cristata.gp.gz: 17842
+# genes/Dipodomys_ordii.gp.gz: 19738
+...
+# genes/Pteropus_vampyrus.gp.gz: 19485
+# genes/Rattus_norvegicus.gp.gz: 22854
+# genes/Rhinopithecus_roxellana.gp.gz: 21027
+# genes/Saimiri_boliviensis.gp.gz: 20933
+# genes/Sorex_araneus.gp.gz: 19160
+# genes/Sus_scrofa.gp.gz: 24043
+# genes/Theropithecus_gelada.gp.gz: 20592
+# genes/Tupaia_chinensis.gp.gz: 21047
+
+# And, can pick up ncbiRefSeq for GCF equivalent assemblies for those
+# outside the UCSC database set of genomes
+
+grep GCF_ ../idKeys/447way.equivalent.txt | while read dbGCF
+do
+  asmName=`printf "%s" "${dbGCF}" | cut -f1`
+  asmId=`printf "%s" "${dbGCF}" | cut -f2`
+  gcX="${asmId:0:3}"
+  d0="${asmId:4:3}"
+  d1="${asmId:7:3}"
+  d2="${asmId:10:3}"
+  buildDir="/hive/data/genomes/asmHubs/refseqBuild/$gcX/$d0/$d1/$d2/${asmId}"
+  if [ ! -d "${buildDir}" ]; then
+     printf "not found: %s\n" "${buildDir}"
+  else
+
+  ncbiRefSeqGp="${buildDir}/trackData/ncbiRefSeq/process/${asmId}.ncbiRefSeq.gp"
+    sizes="${buildDir}/${asmId}.chrom.sizes"
+    if [ -s "${ncbiRefSeqGp}" ]; then
+      genePredSingleCover "${ncbiRefSeqGp}" stdout | gzip -c > genes/${asmName}.gp.gz
+      printf "# %s: " "${asmName}"
+      genePredCheck -chromSizes="${sizes}" genes/${asmName}.gp.gz
+    else
+      printf "MISSING: ${ncbiRefSeqGp}\n" 1>&2
+    fi
+  fi
+
+done
+
+# verify counts for genes are reasonable:
+    for T in genes/*.gz
+do
+    printf "# $T: "
+    zcat $T | cut -f1 | sort -u | wc -l
+done
+# genes/Bos_mutus.gp.gz: 20457
+# genes/CanFam4.gp.gz: 21143
+# genes/Canis_lupus_familiaris.gp.gz: 19812
+# genes/Carlito_syrichta.gp.gz: 19968
+# genes/Cavia_aperea.gp.gz: 14182
+# genes/Cercocebus_atys.gp.gz: 20534
+# genes/Chinchilla_lanigera.gp.gz: 20481
+# genes/Colobus_angolensis.gp.gz: 20042
+# genes/Condylura_cristata.gp.gz: 17842
+# genes/Dipodomys_ordii.gp.gz: 19738
+...
+# genes/Pteropus_vampyrus.gp.gz: 19485
+# genes/Rattus_norvegicus.gp.gz: 22854
+# genes/Rhinopithecus_roxellana.gp.gz: 21027
+# genes/Saimiri_boliviensis.gp.gz: 20933
+# genes/Sorex_araneus.gp.gz: 19160
+# genes/Sus_scrofa.gp.gz: 24043
+# genes/Theropithecus_gelada.gp.gz: 20592
+# genes/Tupaia_chinensis.gp.gz: 21047
+
+    # these gene predictions need lifting to change the sequence names
+    mv genes genes.seqToAsm
+    mkdir genes
+    cd genes.seqToAsm
+
+for F in *.gp.gz
+do
+  seqName=`echo $F | sed -e 's/.gp.gz//;'`
+  liftFile="`ls -d /hive/data/genomes/hg38/bed/cactus447/liftOver/lifts/${seqName}.*.lift 2> /dev/null`"
+  if [ -s "${liftFile}" ]; then
+    printf "%s\n" "${seqName}" 1>&2
+    liftUp -type=.genepred stdout "${liftFile}" warn "${F}" | gzip -c \
+        > "../genes/${seqName}.gp.gz"
+  else
+    printf "can not find lift file for $seqName\n" 1>&2
+  fi
+done
+
+    # kluster job to annotate each maf file
+    screen -S hg38      # manage long running procedure with screen
+    ssh ku
+    cd /hive/data/genomes/hg38/bed/cactus447way/frames
+
+    printf '#!/bin/bash
+set -beEu -o pipefail
+
+export C="${1}"
+export G="${2}"
+
+cat ../iRows/result/${C}.maf | genePredToMafFrames hg38 stdin stdout \
+        ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
+' > runOne
+
+    chmod +x runOne
+
+    ls ../iRows/result | grep maf | sed -e "s/.maf//" > chr.list
+    ls genes | sed -e "s/.gp.gz//" > gene.list
+
+    printf '#LOOP
+runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
+#ENDLOOP
+' > template
+
+    mkdir parts
+    # run on the ku kluster
+    gensub2 chr.list gene.list template jobList
+    para -ram=128g create jobList
+    para try ... check ... push
+# Completed: 4444 of 4444 jobs
+# CPU time in finished jobs:    3955725s   65928.75m  1098.81h   45.78d  0.125 y
+# IO & Wait Time:               1735948s   28932.46m   482.21h   20.09d  0.055 y
+# Average job time:                1281s      21.35m     0.36h    0.01d
+# Longest finished job:           34637s     577.28m     9.62h    0.40d
+# Submission to last job:         55372s     922.87m    15.38h    0.64d
+
+    # collect all results into one file:
+    cd /hive/data/genomes/hg38/bed/cactus447way/frames
+    find ./parts -type f | while read F;
+do
+ zcat ${F} | awk '11 == NF'
+done | sort -k1,1 -k2,2n > cactus447wayFrames.bed
+
+    bedToBigBed -type=bed3+7 -as=$HOME/kent/src/hg/lib/mafFrames.as \
+       -tab cactus447wayFrames.bed ../../../chrom.sizes cactus447wayFrames.bb
+
+    bigBedInfo cactus447wayFrames.bb | sed -e 's/^/# /;'
+
+# version: 4
+# fieldCount: 11
+# hasHeaderExtension: yes
+# isCompressed: yes
+# isSwapped: 0
+# extraIndexCount: 0
+# itemCount: 16,063,019
+# primaryDataSize: 136,782,205
+# primaryIndexSize: 1,013,720
+# zoomLevels: 10
+# chromCount: 82
+# basesCovered: 65,486,909
+# meanDepth (of bases covered): 22.709888
+# minDepth: 1.000000
+# maxDepth: 44.000000
+# std of depth: 18.692097
+
+    # -rw-rw-r-- 1 1192312943 Aug  5 15:28 cactus447wayFrames.bed
+    # -rw-rw-r-- 1  166572857 Aug  5 15:29 cactus447wayFrames.bb
+
+    gzip cactus447wayFrames.bed
+
+    # verify there are frames on everything expected.  Since the
+    #  frames on the HL* asemblies did not work due to sequence name
+    #  differences, this won't be everything.
+    # (ls genes | wc shows 44 'expected')
+    zcat cactus447wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \
+        | sed -e 's/^/# /;' > species.check.list
+    wc -l species.check.list
+    # 44
+
+#  391443 Bos_mutus
+#  414721 CanFam4
+#  417196 Canis_lupus_familiaris
+#  328151 Carlito_syrichta
+#  255097 Cavia_aperea
+#  373053 Cercocebus_atys
+#  398356 Chinchilla_lanigera
+#  350442 Colobus_angolensis
+#  346524 Condylura_cristata
+#  363333 Dipodomys_ordii
+...
+#  379732 Pteropus_alecto
+#  387484 Pteropus_vampyrus
+#  389056 Rattus_norvegicus
+#  360929 Rhinopithecus_roxellana
+#  366019 Saimiri_boliviensis
+#  336296 Sorex_araneus
+#  402603 Sus_scrofa
+#  357418 Theropithecus_gelada
+#  372558 Tupaia_chinensis
+#  195495 hg38
+
+    #   load the resulting file, merely for measurement purposes here
+    ssh hgwdev
+    cd /hive/data/genomes/hg38/bed/cactus447way/frames
+    time hgLoadMafFrames hg38 cactus447wayFrames cactus447wayFrames.bed.gz
+    #   real    3m14.979s
+
+    hgsql -e 'select count(*) from cactus447wayFrames;' hg38
+    # +----------+
+    # | count(*) |
+    # +----------+
+    # | 16063019 |
+    # +----------+
+
+    time featureBits -countGaps hg38 cactus447wayFrames
+    # 65486909 bases of 3299210039 (1.985%) in intersection
+    # real    1m50.539s
+
+    #   enable the trackDb entries:
+# frames https://hgdownload.soe.ucsc.edu/goldenPath/hg38/cactus447way/cactus447wayFrames.bb
+# irows on
+    #   zoom to base level in an exon to see codon displays
+    #	appears to work OK
+
+    # do not need this loaded table:
+    hgsql hg38 -e 'drop table cactus447wayFrames;'
+
+#########################################################################