74e53377949096d1ea6f3d78ec59b1914f1373d8
hiram
Tue May 20 14:58:35 2025 -0700
note about trackDb files and beta and public release control refs #34917
diff --git src/hg/makeDb/doc/tiberius.txt src/hg/makeDb/doc/tiberius.txt
index bf670c6ca92..e36b2884e4e 100644
--- src/hg/makeDb/doc/tiberius.txt
+++ src/hg/makeDb/doc/tiberius.txt
@@ -1,220 +1,244 @@
#############################################################################
# procedure for adding 'tiberius' gene prediction tracks to genark
#############################################################################
#############################################################################
### fetch tiberius predictions - (DONE - Hiram - 2025-02-01)
#############################################################################
### create the working directory for the tiberius data and work from there
mkdir -p /hive/data/outside/genark/tiberius
cd /hive/data/outside/genark/tiberius
### credential file obtained from Mario Stanke in email
### copy the remote tiberius/ rsync area into a local directory named
### for the download date; -P shows progress and keeps partial transfers
rsync --password-file=.tiberius.credential -av -P \
rsync://hiram@bioinf.uni-greifswald.de/tiberius/ ./2025-02-01/
#############################################################################
### add html and trackDb definitions as obtained from Mario
### (DONE - Hiram - 2025-05-09)
#############################################################################
### write the track description page next to the downloaded data; the
### quoted '__EOF__' delimiter keeps the text literal (no shell expansion)
cat << '__EOF__' > 2025-02-01/Tiberius.html
UCSC Tiberius Track
The protein-coding genes were predicted with Tiberius in ab initio mode. The soft-masked genome was input only. The command was:
tiberius.py --genome genome.fa --out tiberius.gtf
Table with predicted coordinates, protein sequences and coding sequences of all mammals.
Download code and see accuracy statistics on the Tiberius GitHub page.
Tiberius is a deep learning model that combines a HMM layer with other sequence-to-sequence models (convolutional neural networks, LSTM).
Tiberius was trained on 32 mammalian genomes that did not include any Hominidae (see supplements of below preprint).
Contact
Questions should be directed to Lars Gabriel or Mario Stanke.
Reference
Tiberius: End-to-End Deep Learning with an HMM for Gene Prediction. Lars Gabriel, Felix Becker, Katharina J. Hoff and Mario Stanke, Bioinformatics 2024;, https://doi.org/10.1093/bioinformatics/btae685
__EOF__
### write the trackDb stanza for the track.  It must go to
### tiberius.trackDb.txt (the file linkOne.sh links as srcTrackDb), not to
### Tiberius.html, which would overwrite the description page written above.
### The bigDataUrl and html paths are relative to the assembly hub directory.
cat << '__EOF__' > 2025-02-01/tiberius.trackDb.txt
track Tiberius
bigDataUrl contrib/tiberius/tiberius.bigGenePred.bb
shortLabel Tiberius genes
longLabel Tiberius ab initio gene prediction
type bigGenePred
visibility pack
color 0,102,204
html contrib/tiberius/Tiberius.html
group genes
dataVersion Tiberius version 2025-01-07
baseColorDefault genomicCodons
__EOF__
+### And these two files are checked into the source tree in:
+
+ makeDb/trackDb/contrib/tiberius/Tiberius.html
+ makeDb/trackDb/contrib/tiberius/tiberius.trackDb.txt
+
#############################################################################
### identify corresponding assemblies
#############################################################################
### accession names are the base names of the files found under the
### */bb/ directories with the trailing .bb suffix removed (the dot is
### escaped and the pattern anchored so only the real suffix is stripped)
find ./2025-02-01 -type f | grep '/bb/' | awk -F'/' '{print $NF}' \
| sed -e 's/\.bb$//;' | sort -u > tiberius.2025-02-01.accession.list
### select the master.run.list lines that mention one of those accessions
### and keep their second space-separated field as the list to link
grep -F -f tiberius.2025-02-01.accession.list \
"$HOME/kent/src/hg/makeDb/doc/asmHubs/master.run.list" \
| cut -d' ' -f2 | sort -u > to.link.list
### looks like 701 of them match:
wc -l tiberius.2025-02-01.accession.list to.link.list
1317 tiberius.2025-02-01.accession.list
701 to.link.list
#############################################################################
### setup the link script to get these files into the genark build hierarchy
#############################################################################
cat << '__EOF__' > linkOne.sh
#!/bin/bash
#
# linkOne.sh - link the Tiberius files for one assembly into its GenArk
#              build directory and rebuild the alpha hub files there
#
# usage: ./linkOne.sh <asmId>
#   asmId - assembly name; its first two '_' separated fields are the
#           accession used to find <TOP>/2025-02-01/*/bb/<accession>.bb
#
# output: "DONE: <asmId>" when contrib/tiberius already exists (exit 0),
#         otherwise the 'ln -s' command used for the bigBed link
# exit:   255 when the build directory, the build trackDb, or exactly
#         one source bigBed file can not be found

set -beEu -o pipefail

TOP="/hive/data/outside/genark/tiberius"
asmId="${1:?usage: linkOne.sh <asmId>}"
acc=$(cut -d'_' -f1-2 <<< "${asmId}")

# the build directory path is made from three character slices of the name
gcX="${asmId:0:3}"
d0="${asmId:4:3}"
d1="${asmId:7:3}"
d2="${asmId:10:3}"
gbkRef="genbankBuild"
if [[ "${gcX}" == GCF ]]; then
  gbkRef="refseqBuild"
fi

buildDir="/hive/data/genomes/asmHubs/${gbkRef}/${gcX}/${d0}/${d1}/${d2}/${asmId}"
if [ ! -d "${buildDir}" ]; then
  printf "ERROR: can not find build directory:\n%s\n" "${buildDir}" 1>&2
  exit 255
fi

buildTrackDb="${buildDir}/${asmId}.trackDb.txt"
if [ ! -s "${buildTrackDb}" ]; then
  printf "ERROR: can not find build trackDb:\n%s\n" "${buildTrackDb}" 1>&2
  exit 255
fi

# an existing contrib/tiberius directory means this assembly is finished
destDir="${buildDir}/contrib/tiberius"
if [ -d "${destDir}" ]; then
  printf "DONE: %s\n" "${asmId}"
  exit 0
fi

# Verify the source before creating destDir: a directory left behind by a
# failed run would otherwise be reported as DONE on the next run.
shopt -s nullglob
srcFiles=( "${TOP}"/2025-02-01/*/bb/"${acc}".bb )
shopt -u nullglob
if [ "${#srcFiles[@]}" -ne 1 ]; then
  printf "ERROR: can not find source file at\n" 1>&2
  printf "%s\n" "${TOP}/2025-02-01/*/bb/${acc}.bb" 1>&2
  exit 255
fi
srcFile="${srcFiles[0]}"

destLink="${destDir}/tiberius.bigGenePred.bb"
destTrackDb="${destDir}/tiberius.trackDb.txt"
destHtml="${destDir}/Tiberius.html"
srcHtml="${TOP}/2025-02-01/Tiberius.html"
srcTrackDb="${TOP}/2025-02-01/tiberius.trackDb.txt"

mkdir -p "${destDir}"
rm -f "${destLink}" "${destTrackDb}" "${destHtml}"
printf "ln -s %s %s\n" "${srcFile}" "${destLink}"
ln -s "${srcFile}" "${destLink}"
ln -s "${srcHtml}" "${destHtml}"
ln -s "${srcTrackDb}" "${destTrackDb}"

# alpha.trackDb.txt: the build trackDb followed by every contrib trackDb
cat "${buildTrackDb}" "${buildDir}"/contrib/*/*.trackDb.txt \
  > "${buildDir}/alpha.trackDb.txt"

# alpha.hub.txt: hub.txt and genomes.txt merged into a single file
# (useOneFile on, the trackDb reference line dropped, a blank line placed
# before the genome stanza), then a blank line and the same trackDb content
sed -e 's/genomesFile genomes.txt/useOneFile on/; /trackDb trackDb.txt/d; s/^genome /\ngenome /;' \
  "${buildDir}/${asmId}.hub.txt" \
  "${buildDir}/${asmId}.genomes.txt" > "${buildDir}/alpha.hub.txt"
printf "\n" >> "${buildDir}/alpha.hub.txt"
cat "${buildTrackDb}" >> "${buildDir}/alpha.hub.txt"
cat "${buildDir}"/contrib/*/*.trackDb.txt >> "${buildDir}/alpha.hub.txt"

exit 0
__EOF__
chmod +x linkOne.sh
#############################################################################
### make all the symlinks
#############################################################################
### run linkOne.sh once for every assembly named in to.link.list; both
### stdout and stderr of every run are collected in link.log
for asm in $(cat to.link.list); do
./linkOne.sh "${asm}"
done > link.log 2>&1
#############################################################################
### with the links in place, the tracks will get into the genark
### assemblies with the usual GenArk build procedure
#############################################################################
### for example, the 'primates':
### work in the clade directory of your own source tree
cd ~/kent/src/hg/makeDb/doc/primatesAsmHub
time (make) > dbg 2>&1
### verify no errors:
grep -E -i "fail|error|missing|cannot|clade|class|real" dbg
### if good, verify on download:
time (make verifyTestDownload) >> test.down.log 2>&1
### verify no errors:
grep -E -i "fail|error|missing|cannot|clade|class|real|check" test.down.log
time (make sendDownload) >> send.down.log 2>&1
### verify no errors:
grep -E -i "fail|error|missing|cannot|clade|class|real" send.down.log
time (make verifyDownload) >> verify.down.log 2>&1
### verify no errors (in the log just written by verifyDownload):
grep -E -i "fail|error|missing|cannot|clade|class|real|check" verify.down.log
### all of the genark 'clades' can be done in one go:
#!/bin/bash
# runOne <clade>
#   Build one clade hub and check its test download.
#   Changes into ../<clade>AsmHub, runs 'make' (output to dbg) and
#   'make verifyTestDownload' (output appended to test.down.log), then
#   prints the suspicious lines from dbg and the last three 'check' lines
#   from test.down.log.
#   Returns 1, without running make, when the directory can not be entered.
runOne() {
  local clade="${1}"
  local hubDir="../${clade}AsmHub"
  # never run make in the wrong directory
  if ! cd "${hubDir}"; then
    printf "ERROR: can not cd to %s\n" "${hubDir}" 1>&2
    return 1
  fi
  printf "%s starting\n" "${clade}"
  time (make) > dbg 2>&1
  time (make verifyTestDownload) >> test.down.log 2>&1
  printf "# from the make in ../%sAsmHub\n" "${clade}"
  grep -E -w -i "fail|error|missing|cannot|clade|class|real" dbg | grep -E -v "unclassified"
  grep check test.down.log | tail -3
  printf "#### done with %s\n" "${clade}"
}
### start every clade build as a background job, then block until all of
### them have finished
cladeList=(primates plants invertebrate legacy
birds fish fungi mammals vertebrate viral bacteria)
for cladeName in "${cladeList[@]}"; do
runOne "${cladeName}" &
done
printf "waiting . . .\n"
wait
printf ". . . exit\n"
exit $?
+#############################################################################
+### beta and public release control
+#############################################################################
+
+### the release of this track is controlled by two files in trackDb:
+
+ makeDb/trackDb/betaGenArk.txt
+ makeDb/trackDb/publicGenArk.txt
+
+### cat publicGenArk.txt betaGenArk.txt
+# the listing in this file triggers the building of the public.hub.txt
+# file in the genark system. Any contrib project listed here will be included
+# contrib track name: tracks found in /contrib//
+tiberius
+# the listing in this file triggers the building of the beta.hub.txt
+# file in the genark system. Any contrib project listed here will be included
+# contrib track name: tracks found in /contrib//
+tiberius
+
#############################################################################
### the daily cron jobs will correctly get all the files out to our
### mirror sites into the /gbdb/genark/ hierarchy
#############################################################################
### current cron job is in Hiram's hgwdev account, this should be moved
### to the 'otto' user.
# push out the /gbdb/hubs/GC[AF]/ hierarchy to:
03 01 * * * /hive/data/inside/genArk/pushRR.sh
### on the Asia node, it is a pull script in the qateam account:
# pull down the /gbdb/hubs/GC[AF]/ files from hgwdev daily
02 16 * * * ~/cronScripts/pullHgwdev.sh
#############################################################################