6619816b39594b4cafca3215c7d8ac4dfd185652
hiram
  Tue May 20 14:50:43 2025 -0700
document procedure for Tiberius gene predictions addition to GenArk hubs refs #34917

diff --git src/hg/makeDb/doc/tiberius.txt src/hg/makeDb/doc/tiberius.txt
new file mode 100644
index 00000000000..bf670c6ca92
--- /dev/null
+++ src/hg/makeDb/doc/tiberius.txt
@@ -0,0 +1,220 @@
+#############################################################################
+#  procedure for adding 'tiberius' gene prediction tracks to genark
+#############################################################################
+
+#############################################################################
+### fetch tiberius predictions - (DONE - Hiram - 2025-02-01)
+#############################################################################
+
+mkdir -p /hive/data/outside/genark/tiberius
+cd /hive/data/outside/genark/tiberius
+
+### credential file obtained from Mario Stanke in email
+
+rsync --password-file=.tiberius.credential -av -P \
+   rsync://hiram@bioinf.uni-greifswald.de/tiberius/ ./2025-02-01/
+
+#############################################################################
+### add html and trackDb definitions as obtained from Mario
+###     (DONE - Hiram - 2025-05-09)
+#############################################################################
+
+cat << '__EOF__' > 2025-02-01/Tiberius.html
+<h2 id="description">UCSC Tiberius Track</h2>
+<p>The protein-coding genes were predicted with Tiberius in <i>ab initio</i> mode. The soft-masked genome was input only. The command was:</p>
+<code>tiberius.py --genome genome.fa --out tiberius.gtf</code>
+<p><a href="https://bioinf.uni-greifswald.de/bioinf/tiberius/genes/tib-tbl.html">Table with predicted coordinates, protein sequences and coding sequences of all mammals.</a></p>
+<p>Download code and see accuracy statistics on the <a href="https://github.com/Gaius-Augustus/Tiberius">Tiberius GitHub page</a>.</p>
+<p>Tiberius is a deep learning model that combines a HMM layer with other sequence-to-sequence models (convolutional neural networks, LSTM).</p>
+<p>Tiberius was trained on 32 mammalian genomes that did not include any <i>Hominidae</i> (see supplements of below preprint).</p>
+<h2 id="credits">Contact</h2>
+<p>Questions should be directed to <a href="mailto:lars.gabriel@uni-greifswald.de">Lars Gabriel</a> or <a href="mailto:mario.stanke@uni-greifswald.de">Mario Stanke</a>.</p>
+<h2 id="reference">Reference</h2>
+<a href="https://academic.oup.com/bioinformatics/article/40/12/btae685/7903281">Tiberius: End-to-End Deep Learning with an HMM for Gene Prediction. Lars Gabriel, Felix Becker, Katharina J. Hoff and Mario Stanke, <i>Bioinformatics</i> 2024;</a>, https://doi.org/10.1093/bioinformatics/btae685
+__EOF__
+
+### the trackDb stanza goes into its own file, this is the file that
+### linkOne.sh links into each assembly as srcTrackDb
+cat << '__EOF__' > 2025-02-01/tiberius.trackDb.txt
+track Tiberius
+bigDataUrl contrib/tiberius/tiberius.bigGenePred.bb
+shortLabel Tiberius genes
+longLabel Tiberius ab initio gene prediction
+type bigGenePred
+visibility pack
+color 0,102,204
+html contrib/tiberius/Tiberius.html
+group genes
+dataVersion Tiberius version 2025-01-07
+baseColorDefault genomicCodons
+__EOF__
+
+#############################################################################
+### identify corresponding assemblies
+#############################################################################
+
+### one accession per bigBed file found under a /bb/ directory:
+### keep the file name only and strip the trailing '.bb' suffix
+find ./2025-02-01 -type f | grep '/bb/' | awk -F$'/' '{print $NF}' \
+   | sed -e 's/\.bb$//;' | sort -u > tiberius.2025-02-01.accession.list
+
+### second space separated field of the master.run.list lines that
+### contain one of those accessions
+grep -F -f tiberius.2025-02-01.accession.list \
+   $HOME/kent/src/hg/makeDb/doc/asmHubs/master.run.list \
+     | cut -d' ' -f2 | sort -u > to.link.list
+
+### looks like 701 of them match:
+
+wc -l tiberius.2025-02-01.accession.list to.link.list
+ 1317 tiberius.2025-02-01.accession.list
+  701 to.link.list
+
+#############################################################################
+### setup the link script to get these files into the genark build hierarchy
+#############################################################################
+
+cat << '__EOF__' > linkOne.sh
+#!/bin/bash
+
+# linkOne.sh - symlink the Tiberius bigGenePred, trackDb and html files
+#   for one assembly into <buildDir>/contrib/tiberius/ and rebuild the
+#   alpha.trackDb.txt and alpha.hub.txt files of that assembly
+# usage: ./linkOne.sh <asmId>
+#   where asmId is accession_assemblyName: GCx_012345678.1_asmName
+# exit status: 0 when the links were made or contrib/tiberius/ already
+#   exists, 255 when the build directory, the build trackDb or the
+#   source .bb file can not be found
+
+set -beEu -o pipefail
+
+export TOP="/hive/data/outside/genark/tiberius"
+
+export asmId="${1}"
+# accession = first two '_' separated fields of the asmId
+export acc="$(echo "${asmId}" | cut -d'_' -f1-2)"
+# GCA or GCF, then the nine accession digits in three groups of three
+export gcX="${asmId:0:3}"
+export d0="${asmId:4:3}"
+export d1="${asmId:7:3}"
+export d2="${asmId:10:3}"
+
+# genbankBuild unless the accession is a GCF one
+export gbkRef="genbankBuild"
+
+if [[ "$gcX" == GCF ]]; then
+    gbkRef="refseqBuild"
+fi
+
+export buildDir="/hive/data/genomes/asmHubs/${gbkRef}/${gcX}/${d0}/${d1}/${d2}/${asmId}"
+if [ ! -d "${buildDir}" ]; then
+  printf "ERROR: can not find build directory:\n%s\n" "${buildDir}" 1>&2
+  exit 255
+fi
+export buildTrackDb="${buildDir}/${asmId}.trackDb.txt"
+if [ ! -s "${buildTrackDb}" ]; then
+  printf "ERROR: can not find build trackDb:\n%s\n" "${buildTrackDb}" 1>&2
+  exit 255
+fi
+
+# an existing contrib/tiberius/ directory means this assembly is done
+export destDir="${buildDir}/contrib/tiberius"
+if [ -d "${destDir}" ]; then
+   printf "DONE: %s\n" "${asmId}"
+   exit 0
+fi
+
+# find the source bigBed before anything is created in the build
+# directory, otherwise a failure here would leave an empty
+# contrib/tiberius/ behind and the next run would report DONE
+shopt -s nullglob
+srcList=( "${TOP}"/2025-02-01/*/bb/"${acc}".bb )
+shopt -u nullglob
+if [ "${#srcList[@]}" -ne 1 ]; then
+  printf "ERROR: can not find source file at\n" 1>&2
+  printf "%s\n" "${TOP}/2025-02-01/*/bb/${acc}.bb" 1>&2
+  exit 255
+fi
+export srcFile="${srcList[0]}"
+
+# tiberiusVer is not referenced below, kept as it was in this script
+export tiberiusVer="/gbdb/genark/contribTracks/tiberius/2025-02-01"
+export destLink="${destDir}/tiberius.bigGenePred.bb"
+export destTrackDb="${destDir}/tiberius.trackDb.txt"
+export destHtml="${destDir}/Tiberius.html"
+
+export srcHtml="${TOP}/2025-02-01/Tiberius.html"
+export srcTrackDb="${TOP}/2025-02-01/tiberius.trackDb.txt"
+
+mkdir -p "${destDir}"
+rm -f "${destLink}" "${destTrackDb}" "${destHtml}"
+printf "ln -s %s %s\n" "${srcFile}" "${destLink}"
+ln -s "${srcFile}" "${destLink}"
+ln -s "${srcHtml}" "${destHtml}"
+ln -s "${srcTrackDb}" "${destTrackDb}"
+
+# alpha.trackDb.txt: the build trackDb followed by every contrib trackDb
+cat "${buildTrackDb}" "${buildDir}"/contrib/*/*.trackDb.txt \
+  > "${buildDir}/alpha.trackDb.txt"
+
+# alpha.hub.txt: hub.txt and genomes.txt rewritten for 'useOneFile on',
+# then a blank line, the build trackDb and every contrib trackDb
+sed -e 's/genomesFile genomes.txt/useOneFile on/; /trackDb trackDb.txt/d; s/^genome /\ngenome /;' \
+  "${buildDir}/${asmId}.hub.txt" \
+  "${buildDir}/${asmId}.genomes.txt" > "${buildDir}/alpha.hub.txt"
+printf "\n" >> "${buildDir}/alpha.hub.txt"
+cat "${buildTrackDb}" >> "${buildDir}/alpha.hub.txt"
+cat "${buildDir}"/contrib/*/*.trackDb.txt >> "${buildDir}/alpha.hub.txt"
+__EOF__
+
+chmod +x linkOne.sh
+
+#############################################################################
+### make all the symlinks
+#############################################################################
+
+### one linkOne.sh run per line of to.link.list, stdout and stderr of
+### every run are collected in link.log; a failing run does not stop
+### the loop
+while read -r S
+do
+  ./linkOne.sh "${S}" < /dev/null 2>&1
+done < to.link.list > link.log
+
+#############################################################################
+### with the links in place, the tracks will get into the genark
+### assemblies with the usual GenArk build procedure
+#############################################################################
+### for example, the 'primates':
+
+### same $HOME/kent source tree as used for master.run.list above
+cd ~/kent/src/hg/makeDb/doc/primatesAsmHub
+
+time (make) > dbg 2>&1
+### verify no errors:
+    egrep -i "fail|error|missing|cannot|clade|class|real" dbg
+### if good, verify on download:
+time (make verifyTestDownload) >> test.down.log 2>&1
+### verify no errors:
+    egrep -i "fail|error|missing|cannot|clade|class|real|check" test.down.log
+time (make sendDownload) >> send.down.log 2>&1
+### verify no errors:
+    egrep -i "fail|error|missing|cannot|clade|class|real" send.down.log
+time (make verifyDownload) >> verify.down.log 2>&1
+### verify no errors, in the log just written by verifyDownload:
+    egrep -i "fail|error|missing|cannot|clade|class|real|check" verify.down.log
+
+### all of the genark 'clades' can be done in one go:
+
+#!/bin/bash
+
+runOne() {
+  clade="${1}"
+  cd "../${clade}AsmHub"
+  printf "%s sleep %d\n" "${clade}" "${rand}"
+  time (make) > dbg 2>&1
+  time (make verifyTestDownload) >> test.down.log 2>&1
+  printf "# from the make in ../${clade}AsmHub\n"
+  egrep -w -i "fail|error|missing|cannot|clade|class|real" dbg | egrep -v "unclassified"
+  grep check test.down.log | tail -3
+  printf "#### done with ${clade}\n"
+}
+
+for C in primates plants invertebrate legacy \
+ birds fish fungi mammals vertebrate viral bacteria
+do
+  runOne "${C}" &
+done
+
+printf "waiting . . .\n"
+wait
+printf ". . . exit\n"
+exit $?
+
+#############################################################################
+### the daily cron jobs will correctly get all the files out to our
+### mirror sites into the /gbdb/genark/ hierarchy
+#############################################################################
+
+### current cron job is in Hiram's hgwdev account, this should be moved
+### to the 'otto' user.
+
+# push out the /gbdb/hubs/GC[AF]/ hierarchy to:
+03 01 * * * /hive/data/inside/genArk/pushRR.sh
+
+### on the Asia node, it is a pull script in the qateam account:
+#  pull down the /gbdb/hubs/GC[AF]/ files from hgwdev daily
+02 16 * * * ~/cronScripts/pullHgwdev.sh
+
+#############################################################################