#!/bin/bash

# ottoRequestAlign.sh - look up an ottoRequest row by id and construct
#    the kegAlignLastz.sh command line from genark metadata
#
# usage: ottoRequestAlign.sh <id>
#
# Queries hgcentraltest.ottoRequest for fromDb/toDb.  Each of the two
# identifiers is resolved to an assembly id and a clade:
#   - GenArk accession (GC[AF]_*): looked up in hgcentraltest.genark
#     for asmName and clade
#   - anything else: treated as a UCSC database name and looked up in
#     dbDb.name.clade.tsv, which lives in the same directory as this script
# Prints the resulting kegAlignLastz.sh command to stderr.  This script
# only constructs and prints the command, it does not run it.
#
# All diagnostics go to stderr; every failure exits with status 255.
#
# origin: src/hg/utils/otto/userRequests/ottoRequestAlign.sh
#   commit 584b69eff038b10c2176d2b9299c356af21288cc, refs #31811

set -beEu -o pipefail

if [ $# -ne 1 ]; then
  printf "usage: ottoRequestAlign.sh <id>\n" 1>&2
  printf "    where <id> is a row id from hgcentraltest.ottoRequest\n" 1>&2
  exit 255
fi

export requestId="$1"

# validate id is a positive integer (digits only), this also makes it
# safe to place directly into the SQL statement below
case "${requestId}" in
  ''|*[!0-9]*)
    printf "ERROR: id must be a positive integer, got: '%s'\n" "${requestId}" 1>&2
    exit 255
    ;;
esac

############################################################################
# step 1: look up fromDb and toDb from ottoRequest
############################################################################
# assign first and export afterwards: 'export var=$(cmd)' would discard
# the exit status of hgsql and a database failure would then be reported
# as "no row found"
ottoResult=$(hgsql -N -e \
  "select fromDb,toDb from ottoRequest where id=${requestId};" hgcentraltest) || {
  printf "ERROR: hgsql query of hgcentraltest.ottoRequest failed for id=%s\n" "${requestId}" 1>&2
  exit 255
}
export ottoResult

if [ -z "${ottoResult}" ]; then
  printf "ERROR: no ottoRequest row found for id=%s\n" "${requestId}" 1>&2
  exit 255
fi

# hgsql -N output is one tab separated row: fromDb <tab> toDb
fromDb=$(printf "%s" "${ottoResult}" | cut -f1)
toDb=$(printf "%s" "${ottoResult}" | cut -f2)
export fromDb toDb

if [ -z "${fromDb}" ] || [ -z "${toDb}" ]; then
  printf "ERROR: empty fromDb or toDb for ottoRequest id=%s\n" "${requestId}" 1>&2
  printf "    got: fromDb='%s' toDb='%s'\n" "${fromDb}" "${toDb}" 1>&2
  exit 255
fi

printf "# ottoRequest id=%s: fromDb='%s' toDb='%s'\n" \
  "${requestId}" "${fromDb}" "${toDb}" 1>&2

############################################################################
# genarkLookup - query genark table for accession, asmName, clade
#   arg: gcAccession (e.g. GCF_000002285.3)
#   sets: _acc, _asmName, _clade
#   returns: 0 on success, 1 when the accession is malformed, the query
#            fails, or the accession is not in the table
############################################################################
function genarkLookup() {
  local acc=$1
  local result

  # the accession originates from a request table row and is placed inside
  # a quoted SQL literal, accept nothing but GC[AF]_<digits>.<digits>
  if [[ ! "${acc}" =~ ^GC[AF]_[0-9]+\.[0-9]+$ ]]; then
    printf "ERROR: malformed GenArk accession: '%s'\n" "${acc}" 1>&2
    return 1
  fi

  # explicit status check: 'set -e' is not in effect here because callers
  # invoke this function on the left side of '||'
  if ! result=$(hgsql -N -e \
      "select gcAccession,asmName,clade from genark where gcAccession='${acc}';" \
      hgcentraltest); then
    printf "ERROR: hgsql query of hgcentraltest.genark failed for '%s'\n" "${acc}" 1>&2
    return 1
  fi

  if [ -z "${result}" ]; then
    printf "ERROR: accession '%s' not found in hgcentraltest.genark\n" "${acc}" 1>&2
    return 1
  fi

  # one tab separated row: gcAccession <tab> asmName <tab> clade
  _acc=$(printf "%s" "${result}" | cut -f1)
  _asmName=$(printf "%s" "${result}" | cut -f2)
  _clade=$(printf "%s" "${result}" | cut -f3)
}

############################################################################
# dbDbCladeLookup - look up clade for a UCSC database name
#   from dbDb.name.clade.tsv (in same directory as this script)
#   arg: dbName (e.g. hg38, rn7)
#   sets: _clade
#   returns: 0 on success, 1 when the tsv file is missing/empty or the
#            database name has no entry in it
############################################################################
function dbDbCladeLookup() {
  local dbName=$1
  local scriptDir
  local tsvFile

  if ! scriptDir=$(cd "$(dirname "$0")" && pwd); then
    printf "ERROR: can not determine directory of script: %s\n" "$0" 1>&2
    return 1
  fi
  tsvFile="${scriptDir}/dbDb.name.clade.tsv"

  # -s: the file must exist and must not be empty
  if [ ! -s "${tsvFile}" ]; then
    printf "ERROR: clade lookup file not found: %s\n" "${tsvFile}" 1>&2
    return 1
  fi

  # skip '#' comment lines, column 1 is the db name, column 2 the clade;
  # stop at the first match so a duplicated row can not produce a
  # multiple line clade value
  _clade=$(awk -F'\t' -v db="${dbName}" \
    '!/^#/ && $1==db { print $2; exit }' "${tsvFile}")

  if [ -z "${_clade}" ]; then
    printf "ERROR: UCSC db '%s' not found in %s\n" "${dbName}" "${tsvFile}" 1>&2
    return 1
  fi
}

############################################################################
# cladeMap - convert genark/dbDb plural clade to kegAlignLastz singular form
#   primates -> primate, mammals -> mammal, everything else -> other
#   arg: clade name as found in genark or dbDb.name.clade.tsv
#   output: the mapped clade on stdout, without a trailing newline
############################################################################
function cladeMap() {
  local genarkClade=$1
  case "${genarkClade}" in
    primates) printf "primate" ;;
    mammals)  printf "mammal" ;;
    *)        printf "other" ;;
  esac
}

############################################################################
# resolveDb - resolve one fromDb/toDb value to an assembly id and clade
#   arg: identifier, either a GenArk accession or a UCSC db name
#   sets: _id (accession_asmName for GenArk, the db name itself for UCSC)
#         and _clade
#   returns: 0 on success, 1 when the underlying lookup fails
############################################################################
function resolveDb() {
  local db=$1
  case "${db}" in
    GC[AF]_*)
      genarkLookup "${db}" || return 1
      _id="${_acc}_${_asmName}"
      ;;
    *)
      dbDbCladeLookup "${db}" || return 1
      _id="${db}"
      ;;
  esac
}

############################################################################
# step 2: look up both identifiers -- GenArk accession or UCSC db name
############################################################################
resolveDb "${fromDb}" || exit 255
export fromId="${_id}"
export fromClade="${_clade}"

resolveDb "${toDb}" || exit 255
export toId="${_id}"
export toClade="${_clade}"

printf "# from: %s clade=%s\n" "${fromId}" "${fromClade}" 1>&2
printf "# to:   %s clade=%s\n" "${toId}" "${toClade}" 1>&2

############################################################################
# step 3: map clades and build the command
############################################################################
fromCladeArg=$(cladeMap "${fromClade}")
toCladeArg=$(cladeMap "${toClade}")
export fromCladeArg toCladeArg

export cmd="kegAlignLastz.sh ${fromId} ${toId} ${fromCladeArg} ${toCladeArg}"

printf "# %s\n" "${cmd}" 1>&2