74e53377949096d1ea6f3d78ec59b1914f1373d8 hiram Tue May 20 14:58:35 2025 -0700 note about trackDb files and beta and public release control refs #34917 diff --git src/hg/makeDb/doc/tiberius.txt src/hg/makeDb/doc/tiberius.txt index bf670c6ca92..e36b2884e4e 100644 --- src/hg/makeDb/doc/tiberius.txt +++ src/hg/makeDb/doc/tiberius.txt @@ -1,220 +1,244 @@ ############################################################################# # procedure for adding 'tiberius' gene prediction tracks to genark ############################################################################# ############################################################################# ### fetch tiberius predictions - (DONE - Hiram - 2025-02-01) ############################################################################# mkdir -p /hive/data/outside/genark/tiberius cd /hive/data/outside/genark/tiberius ### credential file obtained from Mario Stanke in email rsync --password-file=.tiberius.credential -av -P \ rsync://hiram@bioinf.uni-greifswald.de/tiberius/ ./2025-02-01/ ############################################################################# ### add html and trackDb definitions as obtained from Mario ### (DONE - Hiram - 2025-05-09) ############################################################################# cat << '__EOF__' > 2025-02-01/Tiberius.html

UCSC Tiberius Track

The protein-coding genes were predicted with Tiberius in ab initio mode. Only the soft-masked genome was used as input. The command was:

tiberius.py --genome genome.fa --out tiberius.gtf

Table with predicted coordinates, protein sequences and coding sequences of all mammals.

Download code and see accuracy statistics on the Tiberius GitHub page.

Tiberius is a deep learning model that combines an HMM layer with other sequence-to-sequence models (convolutional neural networks, LSTM).

Tiberius was trained on 32 mammalian genomes that did not include any Hominidae (see supplements of below preprint).

Contact

Questions should be directed to Lars Gabriel or Mario Stanke.

Reference

Tiberius: End-to-End Deep Learning with an HMM for Gene Prediction. Lars Gabriel, Felix Becker, Katharina J. Hoff and Mario Stanke, Bioinformatics 2024;, https://doi.org/10.1093/bioinformatics/btae685
__EOF__

### the trackDb stanza goes into its own file, this is the file that
### linkOne.sh (below) links into each assembly as tiberius.trackDb.txt
cat << '__EOF__' > 2025-02-01/tiberius.trackDb.txt
track Tiberius
bigDataUrl contrib/tiberius/tiberius.bigGenePred.bb
shortLabel Tiberius genes
longLabel Tiberius ab initio gene prediction
type bigGenePred
visibility pack
color 0,102,204
html contrib/tiberius/Tiberius.html
group genes
dataVersion Tiberius version 2025-01-07
baseColorDefault genomicCodons
__EOF__

+### And these two files are checked into the source tree in:
+
+  makeDb/trackDb/contrib/tiberius/Tiberius.html
+  makeDb/trackDb/contrib/tiberius/tiberius.trackDb.txt
+
#############################################################################
### identify corresponding assemblies
#############################################################################

### the accession is the name of each bigBed file without its .bb suffix
find ./2025-02-01 -type f | grep '/bb/' | awk -F$'/' '{print $NF}' \
  | sed -e 's/\.bb$//;' | sort -u > tiberius.2025-02-01.accession.list

grep -F -f tiberius.2025-02-01.accession.list \
  $HOME/kent/src/hg/makeDb/doc/asmHubs/master.run.list \
    | cut -d' ' -f2 | sort -u > to.link.list

### looks like 701 of them match:
wc -l tiberius.2025-02-01.accession.list to.link.list
    1317 tiberius.2025-02-01.accession.list
     701 to.link.list

#############################################################################
### setup the link script to get these files into the genark build hierarchy
#############################################################################

cat << '__EOF__' > linkOne.sh
#!/bin/bash
#
# linkOne.sh - symlink the tiberius bigBed, html and trackDb files for one
#   assembly into its GenArk build directory, then rebuild the
#   alpha.trackDb.txt and alpha.hub.txt files there.
#
# usage: ./linkOne.sh <asmId>
#   asmId - assembly name beginning with the accession,
#           GC[AF]_nnnnnnnnn.v_<name>
#
# exit 0 with "DONE: <asmId>" when contrib/tiberius/ already exists
# exit 255 when the build directory, the build trackDb or any of the
#          tiberius source files can not be found

set -beEu -o pipefail

if [ $# -ne 1 ]; then
  printf "usage: linkOne.sh <asmId>\n" 1>&2
  exit 255
fi

export TOP="/hive/data/outside/genark/tiberius"
export asmId="${1}"
# accession is the first two '_' separated fields: GCx_nnnnnnnnn.v
export acc=$(printf "%s\n" "${asmId}" | cut -d'_' -f1-2)
# the build hierarchy is: <GCx>/<nnn>/<nnn>/<nnn>/<asmId>
export gcX="${asmId:0:3}"
export d0="${asmId:4:3}"
export d1="${asmId:7:3}"
export d2="${asmId:10:3}"
export gbkRef="genbankBuild"
if [[ "$gcX" == GCF ]]; then
  gbkRef="refseqBuild"
fi

export buildDir="/hive/data/genomes/asmHubs/${gbkRef}/${gcX}/${d0}/${d1}/${d2}/${asmId}"
if [ ! -d "${buildDir}" ]; then
  printf "ERROR: can not find build directory:\n%s\n" "${buildDir}" 1>&2
  exit 255
fi

export buildTrackDb="${buildDir}/${asmId}.trackDb.txt"
if [ ! -s "${buildTrackDb}" ]; then
  printf "ERROR: can not find build trackDb:\n%s\n" "${buildTrackDb}" 1>&2
  exit 255
fi

# an existing contrib/tiberius/ directory means this assembly is finished
export destDir="${buildDir}/contrib/tiberius"
if [ -d "${destDir}" ]; then
  printf "DONE: %s\n" "${asmId}"
  exit 0
fi

# NOTE(review): tiberiusVer is not referenced anywhere below, kept as found
export tiberiusVer="/gbdb/genark/contribTracks/tiberius/2025-02-01"
export destLink="${destDir}/tiberius.bigGenePred.bb"
export destTrackDb="${destDir}/tiberius.trackDb.txt"
export destHtml="${destDir}/Tiberius.html"
export srcHtml="${TOP}/2025-02-01/Tiberius.html"
export srcTrackDb="${TOP}/2025-02-01/tiberius.trackDb.txt"

# verify every source file BEFORE creating destDir, a failed run must not
# leave an empty destDir behind since that would be reported as DONE on
# the next run
shopt -s nullglob
srcFiles=( "${TOP}"/2025-02-01/*/bb/"${acc}".bb )
shopt -u nullglob
if [ "${#srcFiles[@]}" -ne 1 ]; then
  printf "ERROR: can not find source file at\n" 1>&2
  printf "%s\n" "${TOP}/2025-02-01/*/bb/${acc}.bb" 1>&2
  exit 255
fi
export srcFile="${srcFiles[0]}"

for F in "${srcHtml}" "${srcTrackDb}"
do
  if [ ! -s "${F}" ]; then
    printf "ERROR: can not find source file at\n" 1>&2
    printf "%s\n" "${F}" 1>&2
    exit 255
  fi
done

mkdir -p "${destDir}"
rm -f "${destLink}" "${destTrackDb}" "${destHtml}"
printf "ln -s %s %s\n" "${srcFile}" "${destLink}"
ln -s "${srcFile}" "${destLink}"
ln -s "${srcHtml}" "${destHtml}"
ln -s "${srcTrackDb}" "${destTrackDb}"

# alpha.trackDb.txt: the build trackDb followed by all contrib trackDb files
cat "${buildTrackDb}" "${buildDir}"/contrib/*/*.trackDb.txt \
  > "${buildDir}/alpha.trackDb.txt"

# alpha.hub.txt: hub.txt + genomes.txt + trackDb files in 'useOneFile' format
sed -e 's/genomesFile genomes.txt/useOneFile on/; /trackDb trackDb.txt/d; s/^genome /\ngenome /;' \
  "${buildDir}/${asmId}.hub.txt" \
  "${buildDir}/${asmId}.genomes.txt" > "${buildDir}/alpha.hub.txt"
printf "\n" >> "${buildDir}/alpha.hub.txt"
cat "${buildTrackDb}" >> "${buildDir}/alpha.hub.txt"
cat "${buildDir}"/contrib/*/*.trackDb.txt >> "${buildDir}/alpha.hub.txt"

exit 0
__EOF__

chmod +x linkOne.sh

#############################################################################
### make all the symlinks
#############################################################################

### one assembly name per line in to.link.list, all messages to link.log
while IFS= read -r S
do
  ./linkOne.sh "${S}" 2>&1 < /dev/null
done < to.link.list > link.log

#############################################################################
### with the links in place, the tracks will get into the genark
### assemblies with the usual GenArk build procedure
#############################################################################

### for example, the 'primates':

cd ~/kent/src/hg/makeDb/doc/primatesAsmHub
time (make) > dbg 2>&1
### verify no errors:
grep -E -i "fail|error|missing|cannot|clade|class|real" dbg
### if good, verify on download:
time (make verifyTestDownload) >> test.down.log 2>&1
### verify no errors:
grep -E -i "fail|error|missing|cannot|clade|class|real|check" test.down.log
time (make sendDownload) >> send.down.log 2>&1
### verify no errors:
grep -E -i "fail|error|missing|cannot|clade|class|real" send.down.log
time (make verifyDownload) >> verify.down.log 2>&1
### verify no errors:
grep -E -i "fail|error|missing|cannot|clade|class|real|check" verify.down.log

### all of the genark 'clades' can be done in one go:

#!/bin/bash

# runOne - run the build and the test download verification for one clade
#   $1 - clade name, the work is done in ../<clade>AsmHub
# prints the suspicious lines from the make log and the last three
# 'check' lines from test.down.log
runOne() {
  local clade="${1}"
  # NOTE(review): the original printed ${rand} without ever setting it.
  # A short random pause is set here so the message is true; presumably
  # it was meant to stagger the parallel makes - confirm.
  local rand=$(( RANDOM % 10 ))
  if ! cd "../${clade}AsmHub"; then
    printf "ERROR: can not cd to ../%sAsmHub\n" "${clade}" 1>&2
    return 255
  fi
  printf "%s sleep %d\n" "${clade}" "${rand}"
  sleep "${rand}"
  time (make) > dbg 2>&1
  time (make verifyTestDownload) >> test.down.log 2>&1
  printf "# from the make in ../%sAsmHub\n" "${clade}"
  grep -E -w -i "fail|error|missing|cannot|clade|class|real" dbg \
    | grep -E -v "unclassified"
  grep check test.down.log | tail -3
  printf "#### done with %s\n" "${clade}"
}

# each clade runs in its own background subshell, the cd inside runOne
# does not change the directory of this script
for C in primates plants invertebrate legacy \
  birds fish fungi mammals vertebrate viral bacteria
do
  runOne "${C}" &
done

printf "waiting . . .\n"
wait
printf ". . . exit\n"
exit $?

+############################################################################# +### beta and public release control +############################################################################# + +### the release of this track is controlled by two files in trackDb: + + makeDb/trackDb/betaGenArk.txt + makeDb/trackDb/publicGenArk.txt + +### cat publicGenArk.txt betaGenArk.txt +# the listing in this file triggers the building of the public.hub.txt +# file in the genark system. Any contrib project listed here will be included +# contrib track name: tracks found in <buildDir>/contrib/<name>/ +tiberius +# the listing in this file triggers the building of the beta.hub.txt +# file in the genark system. Any contrib project listed here will be included +# contrib track name: tracks found in <buildDir>/contrib/<name>/ +tiberius + ############################################################################# ### the daily cron jobs will correctly get all the files out to our ### mirror sites into the /gbdb/genark/ hierarchy ############################################################################# ### current cron job is in Hiram's hgwdev account, this should be moved ### to the 'otto' user. # push out the /gbdb/hubs/GC[AF]/ hierarchy to: 03 01 * * * /hive/data/inside/genArk/pushRR.sh ### on the Asia node, it is a pull script in the qateam account: # pull down the /gbdb/hubs/GC[AF]/ files from hgwdev daily 02 16 * * * ~/cronScripts/pullHgwdev.sh #############################################################################