src/hg/makeDb/doc/hg38/cactus447.txt 5b1483d60866e637703c920a8ff02516b52d53a8

5b1483d60866e637703c920a8ff02516b52d53a8
hiram
  Mon Aug 7 13:25:52 2023 -0700
add the frames procedure refs #31528

diff --git src/hg/makeDb/doc/hg38/cactus447.txt src/hg/makeDb/doc/hg38/cactus447.txt
index 3bddbf2..a21cfba 100644
--- src/hg/makeDb/doc/hg38/cactus447.txt
+++ src/hg/makeDb/doc/hg38/cactus447.txt
@@ -290,16 +290,460 @@
 
    ### since they were sorted incorrectly above, sort this one huge file:
 
    $HOME/bin/x86_64/gnusort -S500G --parallel=64 -k1,1 -k2,2n \
      hg38.cactus447.bigMaf > hg38.sorted447way.bigMaf
 real    752m12.425s
 user    47m34.825s
 sys     371m7.394s
 
 ################################################################################
 ### make the bigBed file from the sorted bigMaf 2023-06-27
 
 bedToBigBed -verbose=2 -itemsPerSlot=4 -type=bed3+1 \
  -as=$HOME/kent/src/hg/lib/bigMaf.as -tab iRows/hg38.sorted447way.bigMaf \
      /hive/data/genomes/hg38/chrom.sizes hg38.cactus447way.bb
-XXX running - Tue Jun 27 14:16:31 PDT 2023
+
+# pid=61008: VmPeak:     5682928 kB
+# real    4157m14.091s
+
+# that time is 69 hours
+
+##############################################################################
+# liftOver files to convert sequence names (DONE - 2023-08-04 - Hiram)
+#
+#   The sequence names in the assemblies in the alignment do not match
+#   the sequence names in the corresponding genome browsers assemblies
+
+    mkdir /hive/data/genomes/hg38/bed/cactus447/liftOver
+    cd /hive/data/genomes/hg38/bed/cactus447/liftOver
+    mkdir lifts
+    ln -s ../idKeys/447way.equivalent.txt .
+
+#  using this script to construct liftOver files
+#!/bin/bash
+
+TOP="/hive/data/genomes/hg38/bed/cactus447/liftOver"
+
+cd "${TOP}"
+mkdir -p lifts
+
+cat 447way.equivalent.txt | while read L
+do
+  seqName=`printf "%s" "${L}" | cut -f1`
+  asmId=`printf "%s" "${L}" | cut -f2`
+  seqIdKeys="/hive/data/genomes/hg38/bed/cactus447/idKeys/${seqName}/${seqName}.idKeys.txt"
+  case ${asmId} in
+      GCA_* | GCF_* )
+        gcX="${asmId:0:3}"
+        d0="${asmId:4:3}"
+        d1="${asmId:7:3}"
+        d2="${asmId:10:3}"
+    ab="/hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmId}"
+        buildDir=`realpath "${ab}"`
+        asmKeys="${buildDir}/trackData/idKeys/${asmId}.idKeys.txt"
+        asmSizes="${buildDir}/${asmId}.chrom.sizes"
+#        asmSizes="${buildDir}/idKeys/chrom.sizes"
+ls -og "${asmKeys}"
+        ;;
+     [a-zA-Z0-9]*)
+        asmKeys="/hive/data/genomes/${asmId}/bed/idKeys/${asmId}.idKeys.txt"
+        asmSizes="/hive/data/genomes/${asmId}/chrom.sizes"
+        ;;
+     *)  printf "ERROR not recognized $asmId\n" 1>&2
+         exit 255
+       ;;
+  esac
+  seqKeys="/hive/data/genomes/hg38/bed/cactus447/idKeys/${seqName}/${seqName}.idKeys.txt"
+  seq2Bit="/hive/data/genomes/hg38/bed/cactus447/2bitFiles/${seqName}.2bit"
+  if [ ! -s  "${seqKeys}" ]; then
+    printf "can not find seq idKeys\n%s\n" "${seqKeys}" 1>&2
+    exit 255
+  fi
+  if [ ! -s  "${asmKeys}" ]; then
+    printf "can not find asm idKeys\n%s\n" "${asmKeys}" 1>&2
+    exit 255
+  fi
+  printf "%s\t%s\n" "${asmId}" "${seqName}" 1>&2
+printf "lifts/${seqName}.${asmId}.lift\n" 1>&2
+printf "$seqKeys\n" 1>&2
+printf "$asmKeys\n" 1>&2
+printf "$asmSizes\n" 1>&2
+  if [ ! -s "lifts/${seqName}.${asmId}.lift" ]; then
+  join -t$'\t' "${seqKeys}" "${asmKeys}" | sort -k3,3 \
+    | join -t$'\t' -2 3 <(sort -k1,1 ${asmSizes}) - \
+      | awk -F$'\t' '{printf "0\t%s\t%d\t%s\t%d\n", $1, $2, $4, $2}' \
+         > lifts/${seqName}.${asmId}.lift
+  fi
+done
+
+    # a couple of those ended up with duplicate names in the liftOver
+    # due to duplicate contigs in the assemblies.  They were edited
+    # manually, one was really bad and needed this perl script to clean it:
+
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+my %idDone;     # key is sequence name, value is string of names used
+
+open (my $fh, "<", "lifts/Otolemur_garnettii.otoGar3.lift") or die "can not read lifts/Otolemur_garnettii.otoGar3.lift";
+while (my $line = <$fh>) {
+  chomp $line;
+  my @a = split('\t', $line);
+  if (!defined($idDone{$a[1]})) {
+    $idDone{$a[1]} = $a[3];
+    printf "%s\n", $line;
+  } else {
+    $idDone{$a[1]} .= "," . $a[3];
+  }
+}
+close ($fh);
+
+foreach my $seq (sort keys %idDone) {
+  printf STDERR "%s\t%s\n", $seq, $idDone{$seq};
+}
+
+##############################################################################
+# MAF FRAMES (working - 2023-08-01 - Hiram)
+    ssh hgwdev
+    mkdir /hive/data/genomes/hg38/bed/cactus447/frames
+    cd /hive/data/genomes/hg38/bed/cactus447/frames
+
+    mkdir genes
+
+#   survey all the genomes to find out what kinds of gene tracks they have
+
+printf '#/bin/bash
+
+egrep -v "GCA_|GCF_" ../idKeys/447way.equivalent.txt | cut -f2 | while read db
+do
+  printf "# ${db}:"
+  for table in `hgsql -N -e "show tables;" $db | egrep "Gene|ncbiRefSeq" | egrep -v "Curated|Cds|Link|Other|PepTable|Predicted|Psl|wgEncode|gencode|uuGene|ToGeneName|ccds|encode|Ext|Old|Pep|Select|Bak|Verify|orfeome|Multi|transMap|Pfam|knownGeneMrna|sgpGene"`
+  do
+     count=`hgsql -N -e "select count(*) from $table;" $db`
+     printf " %%s: %%s" "${table}" "${count}"
+  done
+  printf "\\n"
+done
+' > showGenes.sh
+
+
+    chmod +x ./showGenes.sh
+    ./showGenes.sh > gene.survey.txt 2>&1 &
+    # most of them have ncbiRefSeq
+    grep ncbiRefSeq gene.survey.txt  | cut -d' ' -f2 | sed -e 's/://;' \
+       | sort | xargs echo | fold -s -w 72
+# bosMut1 canFam4 chiLan1 colAng1 dipOrd2 eleEdw1 felCat8 fukDam1 micOch1
+# mm10 myoBra1 myoDav1 myoLuc2 odoRosDiv1 orcOrc1 otoGar3 proCoq1 pteAle1
+# rheMac10 sorAra2 speTri2 susScr3 tarSyr2 tupChi1
+
+    # a couple do not:
+    grep -v ncbiRefSeq gene.survey.txt  | cut -d' ' -f2 | sed -e 's/://;' \
+       | sort | xargs echo | fold -s -w 72
+# cavApe1 eidHel1 ptePar1
+    # but they do have ensGene or augustusGene:
+    egrep "cavApe1|eidHel1|ptePar1" gene.survey.txt
+# cavApe1: augustusGene: 10114 ensGene: 22510 xenoRefGene: 322224
+# eidHel1: augustusGene: 49674
+# ptePar1: augustusGene: 68613
+
+    #   1. ncbiRefSeq  - add hg38 here manually since it didn't come in above
+#    for db in hg38
+# do
+#    asmName="${db}"
+
+    for db in bosMut1 canFam4 chiLan1 colAng1 dipOrd2 eleEdw1 felCat8 fukDam1 micOch1 mm10 myoBra1 myoDav1 myoLuc2 odoRosDiv1 orcOrc1 otoGar3 proCoq1 pteAle1 rheMac10 sorAra2 speTri2 susScr3 tarSyr2 tupChi1
+do
+    asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1`
+    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq" ${db} \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > genes/${asmName}.gp.gz
+    echo -n "# ${asmName}: "
+    genePredCheck -db=${db} genes/${asmName}.gp.gz
+done
+# hg38: checked: 22882 failed: 0
+# Bos_mutus: checked: 20457 failed: 0
+# CanFam4: checked: 21143 failed: 0
+# Chinchilla_lanigera: checked: 20482 failed: 0
+# Colobus_angolensis: checked: 20042 failed: 0
+# Dipodomys_ordii: checked: 19738 failed: 0
+# Elephantulus_edwardii: checked: 20384 failed: 0
+# Felis_catus: checked: 20051 failed: 0
+# Fukomys_damarensis: checked: 19811 failed: 0
+# Microtus_ochrogaster: checked: 20048 failed: 0
+# Mus_musculus: checked: 23375 failed: 0
+# Myotis_brandtii: checked: 19949 failed: 0
+# Myotis_davidii: checked: 18815 failed: 0
+# Myotis_lucifugus: checked: 19895 failed: 0
+# Odobenus_rosmarus: checked: 19346 failed: 0
+# Orcinus_orca: checked: 19136 failed: 0
+# Otolemur_garnettii: checked: 19536 failed: 0
+# Propithecus_coquerelli: checked: 19356 failed: 0
+# Pteropus_alecto: checked: 18326 failed: 0
+# Macaca_mulatta: checked: 21021 failed: 0
+# Sorex_araneus: checked: 19160 failed: 0
+# Ictidomys_tridecemlineatus: checked: 19892 failed: 0
+# Sus_scrofa: checked: 24180 failed: 0
+# Carlito_syrichta: checked: 19968 failed: 0
+# Tupaia_chinensis: checked: 21047 failed: 0
+
+    #   2. ensGene
+    for db in cavApe1
+do
+asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1`
+hgsql -N -e "select
+name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds
+from ensGene" ${db} \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > /dev/shm/${db}.tmp.gz
+    mv /dev/shm/${db}.tmp.gz genes/${asmName}.gp.gz
+    echo -n "# ${asmName}: "
+    genePredCheck -db=${db} genes/${asmName}.gp.gz
+done
+# Cavia_aperea: checked: 14182 failed: 0
+
+    #   3. augustusGene
+    for db in eidHel1 ptePar1
+do
+asmName=`grep -w "${db}" ../idKeys/447way.equivalent.txt | cut -f1`
+hgsql -N -e "select
+name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds
+from augustusGene" ${db} \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > /dev/shm/${db}.tmp.gz
+    mv /dev/shm/${db}.tmp.gz genes/${asmName}.gp.gz
+    echo -n "# ${asmName}: "
+    genePredCheck -db=${db} genes/${asmName}.gp.gz
+done
+# Eidolon_helvum: checked: 31495 failed: 0
+# Pteronotus_parnellii: checked: 47678 failed: 0
+
+    # verify counts for genes are reasonable:
+    for T in genes/*.gz
+do
+    printf "# $T: "
+    zcat $T | cut -f1 | sort -u | wc -l
+done
+# genes/Bos_mutus.gp.gz: 20457
+# genes/CanFam4.gp.gz: 21143
+# genes/Canis_lupus_familiaris.gp.gz: 19812
+# genes/Carlito_syrichta.gp.gz: 19968
+# genes/Cavia_aperea.gp.gz: 14182
+# genes/Cercocebus_atys.gp.gz: 20534
+# genes/Chinchilla_lanigera.gp.gz: 20481
+# genes/Colobus_angolensis.gp.gz: 20042
+# genes/Condylura_cristata.gp.gz: 17842
+# genes/Dipodomys_ordii.gp.gz: 19738
+...
+# genes/Pteropus_vampyrus.gp.gz: 19485
+# genes/Rattus_norvegicus.gp.gz: 22854
+# genes/Rhinopithecus_roxellana.gp.gz: 21027
+# genes/Saimiri_boliviensis.gp.gz: 20933
+# genes/Sorex_araneus.gp.gz: 19160
+# genes/Sus_scrofa.gp.gz: 24043
+# genes/Theropithecus_gelada.gp.gz: 20592
+# genes/Tupaia_chinensis.gp.gz: 21047
+
+# And, can pick up ncbiRefSeq for GCF equivalent assemblies for those
+# outside the UCSC database set of genomes
+
+grep GCF_ ../idKeys/447way.equivalent.txt | while read dbGCF
+do
+  asmName=`printf "%s" "${dbGCF}" | cut -f1`
+  asmId=`printf "%s" "${dbGCF}" | cut -f2`
+  gcX="${asmId:0:3}"
+  d0="${asmId:4:3}"
+  d1="${asmId:7:3}"
+  d2="${asmId:10:3}"
+  buildDir="/hive/data/genomes/asmHubs/refseqBuild/$gcX/$d0/$d1/$d2/${asmId}"
+  if [ ! -d "${buildDir}" ]; then
+     printf "not found: %s\n" "${buildDir}"
+  else
+
+  ncbiRefSeqGp="${buildDir}/trackData/ncbiRefSeq/process/${asmId}.ncbiRefSeq.gp"
+    sizes="${buildDir}/${asmId}.chrom.sizes"
+    if [ -s "${ncbiRefSeqGp}" ]; then
+      genePredSingleCover "${ncbiRefSeqGp}" stdout | gzip -c > genes/${asmName}.gp.gz
+      printf "# %s: " "${asmName}"
+      genePredCheck -chromSizes="${sizes}" genes/${asmName}.gp.gz
+    else
+      printf "MISSING: ${ncbiRefSeqGp}\n" 1>&2
+    fi
+  fi
+
+done
+
+# verify counts for genes are reasonable:
+    for T in genes/*.gz
+do
+    printf "# $T: "
+    zcat $T | cut -f1 | sort -u | wc -l
+done
+# genes/Bos_mutus.gp.gz: 20457
+# genes/CanFam4.gp.gz: 21143
+# genes/Canis_lupus_familiaris.gp.gz: 19812
+# genes/Carlito_syrichta.gp.gz: 19968
+# genes/Cavia_aperea.gp.gz: 14182
+# genes/Cercocebus_atys.gp.gz: 20534
+# genes/Chinchilla_lanigera.gp.gz: 20481
+# genes/Colobus_angolensis.gp.gz: 20042
+# genes/Condylura_cristata.gp.gz: 17842
+# genes/Dipodomys_ordii.gp.gz: 19738
+...
+# genes/Pteropus_vampyrus.gp.gz: 19485
+# genes/Rattus_norvegicus.gp.gz: 22854
+# genes/Rhinopithecus_roxellana.gp.gz: 21027
+# genes/Saimiri_boliviensis.gp.gz: 20933
+# genes/Sorex_araneus.gp.gz: 19160
+# genes/Sus_scrofa.gp.gz: 24043
+# genes/Theropithecus_gelada.gp.gz: 20592
+# genes/Tupaia_chinensis.gp.gz: 21047
+
+    # these gene predictions need lifting to change the sequence names
+    mv genes genes.seqToAsm
+    mkdir genes
+    cd genes.seqToAsm
+
+for F in *.gp.gz
+do
+  seqName=`echo $F | sed -e 's/.gp.gz//;'`
+  liftFile="`ls -d /hive/data/genomes/hg38/bed/cactus447/liftOver/lifts/${seqName}.*.lift 2> /dev/null`"
+  if [ -s "${liftFile}" ]; then
+    printf "%s\n" "${seqName}" 1>&2
+    liftUp -type=.genepred stdout "${liftFile}" warn "${F}" | gzip -c \
+        > "../genes/${seqName}.gp.gz"
+  else
+    printf "can not find lift file for $seqName\n" 1>&2
+  fi
+done
+
+    # kluster job to annotate each maf file
+    screen -S hg38      # manage long running procedure with screen
+    ssh ku
+    cd /hive/data/genomes/hg38/bed/cactus447way/frames
+
+    printf '#!/bin/bash
+set -beEu -o pipefail
+
+export C="${1}"
+export G="${2}"
+
+cat ../iRows/result/${C}.maf | genePredToMafFrames hg38 stdin stdout \
+        ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
+' > runOne
+
+    chmod +x runOne
+
+    ls ../iRows/result | grep maf | sed -e "s/.maf//" > chr.list
+    ls genes | sed -e "s/.gp.gz//" > gene.list
+
+    printf '#LOOP
+runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
+#ENDLOOP
+' > template
+
+    mkdir parts
+    # run on the ku kluster
+    gensub2 chr.list gene.list template jobList
+    para -ram=128g create jobList
+    para try ... check ... push
+# Completed: 4444 of 4444 jobs
+# CPU time in finished jobs:    3955725s   65928.75m  1098.81h   45.78d  0.125 y
+# IO & Wait Time:               1735948s   28932.46m   482.21h   20.09d  0.055 y
+# Average job time:                1281s      21.35m     0.36h    0.01d
+# Longest finished job:           34637s     577.28m     9.62h    0.40d
+# Submission to last job:         55372s     922.87m    15.38h    0.64d
+
+    # collect all results into one file:
+    cd /hive/data/genomes/hg38/bed/cactus447way/frames
+    find ./parts -type f | while read F;
+do
+ zcat ${F} | awk '11 == NF'
+done | sort -k1,1 -k2,2n > cactus447wayFrames.bed
+
+    bedToBigBed -type=bed3+7 -as=$HOME/kent/src/hg/lib/mafFrames.as \
+       -tab cactus447wayFrames.bed ../../../chrom.sizes cactus447wayFrames.bb
+
+    bigBedInfo cactus447wayFrames.bb | sed -e 's/^/# /;'
+
+# version: 4
+# fieldCount: 11
+# hasHeaderExtension: yes
+# isCompressed: yes
+# isSwapped: 0
+# extraIndexCount: 0
+# itemCount: 16,063,019
+# primaryDataSize: 136,782,205
+# primaryIndexSize: 1,013,720
+# zoomLevels: 10
+# chromCount: 82
+# basesCovered: 65,486,909
+# meanDepth (of bases covered): 22.709888
+# minDepth: 1.000000
+# maxDepth: 44.000000
+# std of depth: 18.692097
+
+    # -rw-rw-r-- 1 1192312943 Aug  5 15:28 cactus447wayFrames.bed
+    # -rw-rw-r-- 1  166572857 Aug  5 15:29 cactus447wayFrames.bb
+
+    gzip cactus447wayFrames.bed
+
+    # verify there are frames on everything expected.  Since the
+    #  frames on the HL* asemblies did not work due to sequence name
+    #  differences, this won't be everything.
+    # (ls genes | wc shows 44 'expected')
+    zcat cactus447wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \
+        | sed -e 's/^/# /;' > species.check.list
+    wc -l species.check.list
+    # 44
+
+#  391443 Bos_mutus
+#  414721 CanFam4
+#  417196 Canis_lupus_familiaris
+#  328151 Carlito_syrichta
+#  255097 Cavia_aperea
+#  373053 Cercocebus_atys
+#  398356 Chinchilla_lanigera
+#  350442 Colobus_angolensis
+#  346524 Condylura_cristata
+#  363333 Dipodomys_ordii
+...
+#  379732 Pteropus_alecto
+#  387484 Pteropus_vampyrus
+#  389056 Rattus_norvegicus
+#  360929 Rhinopithecus_roxellana
+#  366019 Saimiri_boliviensis
+#  336296 Sorex_araneus
+#  402603 Sus_scrofa
+#  357418 Theropithecus_gelada
+#  372558 Tupaia_chinensis
+#  195495 hg38
+
+    #   load the resulting file, merely for measurement purposes here
+    ssh hgwdev
+    cd /hive/data/genomes/hg38/bed/cactus447way/frames
+    time hgLoadMafFrames hg38 cactus447wayFrames cactus447wayFrames.bed.gz
+    #   real    3m14.979s
+
+    hgsql -e 'select count(*) from cactus447wayFrames;' hg38
+    # +----------+
+    # | count(*) |
+    # +----------+
+    # | 16063019 |
+    # +----------+
+
+    time featureBits -countGaps hg38 cactus447wayFrames
+    # 65486909 bases of 3299210039 (1.985%) in intersection
+    # real    1m50.539s
+
+    #   enable the trackDb entries:
+# frames https://hgdownload.soe.ucsc.edu/goldenPath/hg38/cactus447way/cactus447wayFrames.bb
+# irows on
+    #   zoom to base level in an exon to see codon displays
+    #	appears to work OK
+
+    # do not need this loaded table:
+    hgsql hg38 -e 'drop table cactus447wayFrames;'
+
+#########################################################################