919608dc52ac2e23dcf184d2fcfb7b6f93103a52
hiram
  Mon Sep 4 14:18:26 2023 -0700
collecting the genes from various browsers no redmine

diff --git src/hg/makeDb/doc/mito/genes.txt src/hg/makeDb/doc/mito/genes.txt
new file mode 100644
index 0000000..05fec9a
--- /dev/null
+++ src/hg/makeDb/doc/mito/genes.txt
@@ -0,0 +1,179 @@
+#### obtain all the genes for the 'mito' browser
+
+### This script will use the mito.chrom.sizes listing which defines
+###  the species used in the mito browser.  It will parse the names
+###  to determine if they are UCSC database browsers or GenArk browsers.
+###  Then, it will check those browser build directories to find the
+###  appropriate gene definitions and gather them all together
+###  in this /hive/data/genomes/mito/trackData/genes/<species>
+###  directory.
+
+#########################################################################
+#!/bin/bash
+
+mkdir -p /hive/data/genomes/mito/trackData/genes
+cd /hive/data/genomes/mito/trackData/genes || exit 1  # ../../mito.chrom.sizes and all outputs are relative to here
+
+for asmChr in `cut -f1 ../../mito.chrom.sizes`
+do
+  case "${asmChr}" in
+      GC*)  # GenArk-style name: locate the assembly hub build from the accession
+        asmName=$(echo "$asmChr" | cut -d'_' -f1-2 | sed -e 's/v/./;')  # fields 1-2, first "v" -> "."
+        chrName=$(echo "$asmChr" | cut -d'_' -f3-)  # everything after the second "_"
+        gcX="${asmName:0:3}"
+        d0="${asmName:4:3}"
+        d1="${asmName:7:3}"
+        d2="${asmName:10:3}"
+        t=$(ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmName}_*)
+        if [ -d "${t}" ]; then  # false when the glob matched nothing or several paths
+          buildDir=$(realpath "${t}")
+          asmId=$(basename "${buildDir}")
+          ncbiRefSeq="${buildDir}/trackData/ncbiRefSeq"
+          if [ -d "${ncbiRefSeq}" ]; then
+            if [ ! -s "${asmName}/${asmChr}.gp.gz" ]; then
+              printf "%s %s %s\n" "${asmId}" "${asmChr}" "${chrName}"
+              mkdir -p "${asmName}"
+              if [ -s "${ncbiRefSeq}/process/${asmId}.other.gp" ] && [ -s "${ncbiRefSeq}/process/${asmId}.ncbiRefSeq.gp" ]; then
+                grep -E -h -w "${chrName}" \
+                  "${ncbiRefSeq}/process/${asmId}.ncbiRefSeq.gp" \
+                  "${ncbiRefSeq}/process/${asmId}.other.gp" \
+                  | sort -u | tawk -v chr="${asmChr}" '{$2=chr; print $0}' \
+                  | gzip -c > "${asmName}/${asmChr}.gp.gz"
+                printf "# DBG process %s both\n" "${asmChr}" 1>&2
+              elif [ -s "${ncbiRefSeq}/process/${asmId}.ncbiRefSeq.gp" ]; then
+                grep -E -h -w "${chrName}" \
+                  "${ncbiRefSeq}/process/${asmId}.ncbiRefSeq.gp" \
+                  | sort -u | tawk -v chr="${asmChr}" '{$2=chr; print $0}' \
+                  | gzip -c > "${asmName}/${asmChr}.gp.gz"
+                printf "# DBG process %s ncbiRefSeq\n" "${asmChr}" 1>&2  # label names the file actually used
+              elif [ -s "${ncbiRefSeq}/process/${asmId}.other.gp" ]; then
+                grep -E -h -w "${chrName}" \
+                  "${ncbiRefSeq}/process/${asmId}.other.gp" \
+                  | sort -u | tawk -v chr="${asmChr}" '{$2=chr; print $0}' \
+                  | gzip -c > "${asmName}/${asmChr}.gp.gz"
+                printf "# DBG process %s other\n" "${asmChr}" 1>&2
+              else
+                printf "# can not find ncbiRefSeq.gp or other.gp\n"
+              fi
+            else
+              printf "DONE\t%s %s %s\n" "${asmId}" "${asmChr}" "${chrName}"
+            fi
+          else
+            printf "%s: no ncbiRefSeq\n" "${asmName}"
+          fi
+        else
+          printf "%s: GCx missing buildDir\n" "${asmName}"
+        fi
+        ;;
+      mm10_chrM)  # mm10: rows of mm10.curated.gp.gz whose 2nd column is chrM, renamed to mm10_chrM
+        outGp="mm10/${asmChr}.gp.gz"
+        if [ -s "${outGp}" ]; then
+          printf "DONE\t%s\n" "${outGp}"
+        else
+          printf "%s\n" "${outGp}"
+          mkdir -p mm10
+          zcat /hive/data/genomes/mm10/bed/ncbiRefSeq.p6.2021-04-22/process/mm10.curated.gp.gz \
+            | awk '$2 == "chrM"' | sed -e 's/chrM/mm10_chrM/;' \
+            | gzip -c > "${outGp}"
+        fi
+        ;;
+      mm39_chrM)  # mm39: rows of mm39.other.gp.gz whose 2nd column is chrM, renamed to mm39_chrM
+        # test the mm39 output itself; "hg39/" is created nowhere here, so testing it never reached DONE
+        if [ ! -s "mm39/${asmChr}.gp.gz" ]; then
+          mkdir -p mm39
+          zcat /hive/data/genomes/mm39/bed/ncbiRefSeq.2023-04-19/process/mm39.other.gp.gz \
+            | awk '$2 == "chrM"' | sed -e 's/chrM/mm39_chrM/;' \
+            | gzip -c > mm39/${asmChr}.gp.gz
+        else
+          printf "DONE\t%s\n" "mm39/${asmChr}.gp.gz"
+        fi
+        ;;
+      hg19a_chrM)  # hg19 ensGene rows on chrM, first column dropped, renamed to hg19a_chrM
+        if [ -s "hg19/${asmChr}.gp.gz" ]; then
+          printf "DONE\t%s\n" "hg19/${asmChr}.gp.gz"
+        else
+          mkdir -p hg19
+          hgsql -N -e 'select * from ensGene where chrom="chrM";' hg19 | cut -f2- \
+            | sed -e 's/chrM/hg19a_chrM/;' | gzip -c > "hg19/${asmChr}.gp.gz"
+        fi
+        ;;
+      hg19b_chrMT)
+        # NOTE(review): rows are taken from the hg38 ncbiRefSeq build, not an hg19 one - confirm this is intended
+        if [ -s "hg19/${asmChr}.gp.gz" ]; then
+          printf "DONE\t%s\n" "hg19/${asmChr}.gp.gz"
+        else
+          mkdir -p hg19
+          zcat /hive/data/genomes/hg38/bed/ncbiRefSeq.p14.2023-03-29/process/hg38.other.gp.gz \
+            | awk '$2 == "chrM"' | sed -e "s/chrM/${asmChr}/;" \
+            | gzip > "hg19/${asmChr}.gp.gz"
+        fi
+        ;;
+      hg38_chrM)  # hg38: rows of hg38.other.gp.gz whose 2nd column is chrM, renamed to hg38_chrM
+        outGp="hg38/${asmChr}.gp.gz"
+        if [ -s "${outGp}" ]; then
+          printf "DONE\t%s\n" "${outGp}"
+        else
+          mkdir -p hg38
+          zcat /hive/data/genomes/hg38/bed/ncbiRefSeq.p14.2023-03-29/process/hg38.other.gp.gz \
+            | awk '$2 == "chrM"' | sed -e 's/chrM/hg38_chrM/;' \
+            | gzip > "${outGp}"
+        fi
+        ;;
+      [a-z]*)  # any other lower-case name: treated as <db>_<chrom> of a UCSC database
+        asmName=$(echo "$asmChr" | cut -d'_' -f1)
+        chr=$(echo "$asmChr" | cut -d'_' -f2-)
+        buildDir="/hive/data/genomes/${asmName}"
+        if [ -d "${buildDir}" ]; then
+          # list the tables once; reused by the debug line and by each priority check
+          allTbls=$(hgsql -N -e 'show tables;' "$asmName")
+          printf "DBG %s:\t" "${asmChr}" 1>&2
+          printf '%s\n' "${allTbls}" | grep -i gene | while read -r tbl
+          do
+            printf " %s" "${tbl}" 1>&2  # stderr, so the DBG line stays in one stream
+          done
+          printf "\n" 1>&2
+          # priority of gene tables
+          for xTbl in ncbiRefSeq ensGene refGene augustusGene
+          do
+            # match against the full table list: a list pre-filtered with 'grep -i gene'
+            # has no "ncbiRefSeq" entry, so the first priority could never be selected
+            if printf '%s\n' "${allTbls}" | grep -q -x -F "${xTbl}"; then
+              mkdir -p "${asmName}"
+              if [ ! -s "$asmName/${asmChr}.gp.gz" ]; then
+                printf "DBG select * from $xTbl where chrom=\"$chr\"; $asmName\n" 1>&2
+                hgsql -N -e "select * from $xTbl where chrom=\"$chr\";" "$asmName" \
+                  | cut -f2- | tawk -v chr="${asmChr}" '{$2=chr; print $0}' \
+                  | gzip -c > "$asmName/${asmChr}.gp.gz"
+                # drop an empty result so the next table in the list gets its turn
+                C=$(zgrep -c . "$asmName/${asmChr}.gp.gz")
+                if [ "${C}" -eq 0 ]; then
+                  rm -f "$asmName/${asmChr}.gp.gz"
+                fi
+              else
+                printf "DONE\t%s\n" "$asmName/${asmChr}.gp.gz"
+              fi
+            fi
+          done
+        else
+          printf "%s: a-z missing\n" "${asmName}"
+        fi
+        ;;
+  esac
+done
+
+exit $?
+      9 refGene
+     23 augustusGene
+      9 ensGene
+      9 ensemblToGeneName
+      3 geneName
+      2 geneid
+      1 mgcGenes
+      5 ncbiRefSeq
+      5 no
+      1 nscanGene
+      1 sgpGene
+      1 transMapAlnUcscGenes
+      1 transMapInfoUcscGenes
+     19 xenoRefGene
+