f5cece60b38bd8c2cfd57acf2abb972d66edb6d9 hiram Fri Apr 24 11:43:36 2026 -0700 rename doneStatus to just status and correctly set status codes refs #31811 diff --git src/hg/utils/otto/userRequests/ottoRequestAlign.sh src/hg/utils/otto/userRequests/ottoRequestAlign.sh index 7b9559b659b..18d42c128ea 100755 --- src/hg/utils/otto/userRequests/ottoRequestAlign.sh +++ src/hg/utils/otto/userRequests/ottoRequestAlign.sh @@ -1,79 +1,70 @@ #!/bin/bash # ottoRequestAlign.sh - look up an ottoRequest row by id and construct # the kegAlignLastz.sh command line from genark metadata # # usage: ottoRequestAlign.sh <id> # # Queries hgcentraltest.ottoRequest for fromDb/toDb, then looks up # each accession in hgcentraltest.genark for asmName and clade. # Prints and executes the resulting kegAlignLastz.sh command. set -beEu -o pipefail +############################################################################ +### verify arguments +############################################################################ if [ $# != 1 ]; then printf "usage: ottoRequestAlign.sh <id>\n" 1>&2 printf " where <id> is a row id from hgcentraltest.ottoRequest\n" 1>&2 exit 255 fi export requestId="$1" # validate id is a positive integer case "${requestId}" in ''|*[!0-9]*) printf "ERROR: id must be a positive integer, got: '%s'\n" "${requestId}" 1>&2 exit 255 ;; esac ############################################################################ -# step 1: look up fromDb and toDb from ottoRequest +### function definitions ############################################################################ -export ottoResult=$(hgsql -N -e \ - "select fromDb,toDb from ottoRequest where id=${requestId};" hgcentraltest) - -if [ -z "${ottoResult}" ]; then - printf "ERROR: no ottoRequest row found for id=%s\n" "${requestId}" 1>&2 - exit 255 -fi - -export fromDb=$(printf "%s" "${ottoResult}" | cut -f1) -export toDb=$(printf "%s" "${ottoResult}" | cut -f2) - -if [ -z "${fromDb}" -o -z "${toDb}" ]; then - printf "ERROR: empty fromDb or toDb for ottoRequest id=%s\n" "${requestId}" 1>&2 - printf " got: fromDb='%s' toDb='%s'\n" "${fromDb}" "${toDb}" 1>&2 - exit 255 -fi - -printf "# ottoRequest id=%s: fromDb='%s' toDb='%s'\n" \ - "${requestId}" "${fromDb}" "${toDb}" 1>&2 +### errors - set error status in the table +function setErrorStatus() { + id="${1}" + hgsql -N -e \ + "UPDATE ottoRequest SET status=7 WHERE id=${id};" hgcentraltest +} ############################################################################ # genarkLookup - query genark table for accession, asmName, clade # arg: gcAccession (e.g. GCF_000002285.3) # sets: _acc, _asmName, _clade ############################################################################ function genarkLookup() { local acc=$1 local result=$(hgsql -N -e \ - "select gcAccession,asmName,clade from genark where gcAccession='${acc}';" \ + "SELECT gcAccession,asmName,clade from genark WHERE gcAccession='${acc}';" \ hgcentraltest) if [ -z "${result}" ]; then printf "ERROR: accession '%s' not found in hgcentraltest.genark\n" "${acc}" 1>&2 + setErrorStatus ${requestId} return 1 fi _acc=$(printf "%s" "${result}" | cut -f1) _asmName=$(printf "%s" "${result}" | cut -f2) _clade=$(printf "%s" "${result}" | cut -f3) } ############################################################################ # dbDbCladeLookup - look up clade for a UCSC database name # from dbDb.name.clade.tsv (in same directory as this script) # arg: dbName (e.g. hg38, rn7) # sets: _clade ############################################################################ function dbDbCladeLookup() { local dbName=$1 @@ -91,30 +82,93 @@ } ############################################################################ # cladeMap - convert genark/dbDb plural clade to kegAlignLastz singular form # primates -> primate, mammals -> mammal, everything else -> other ############################################################################ function cladeMap() { local genarkClade=$1 case "${genarkClade}" in primates) printf "primate" ;; mammals) printf "mammal" ;; *) printf "other" ;; esac } +############################################################################ +# twoBitPath - return path to 2bit file +############################################################################ +function twoBitPath() { + local asmName=$1 + case ${asmName} in + GC[AF]_*) + local gcX=$(printf "%s" "${asmName}" | cut -c1-3) + local d0=$(printf "%s" "${asmName}" | cut -c5-7) + local d1=$(printf "%s" "${asmName}" | cut -c8-10) + local d2=$(printf "%s" "${asmName}" | cut -c11-13) + printf "/hive/data/genomes/asmHubs/%s/%s/%s/%s/%s/%s.2bit" \ + "${gcX}" "${d0}" "${d1}" "${d2}" "${asmName}" "${asmName}" + ;; + *) + printf "/hive/data/genomes/%s/%s.2bit" "${asmName}" "${asmName}" + ;; + esac +} +############################################################################ + +############################################################################ +# asmN50 - compute N50 from the twoBit file +############################################################################ +function asmN50() { + local twoBit=$1 + twoBitInfo "${twoBit}" stdout \ + | n50.pl stdin 2>&1 \ + | grep -A1 "^[0-9].*one half size" \ + | tail -1 \ + | awk '{print $NF}' +} +############################################################################ + +############################################################################ +### main() scripting begins here +############################################################################ + +############################################################################ +# step 1: look up fromDb and toDb from ottoRequest +############################################################################ +export ottoResult=$(hgsql -N -e \ + "SELECT fromDb,toDb from ottoRequest WHERE id=${requestId} AND status = 1;" hgcentraltest) + +if [ -z "${ottoResult}" ]; then + printf "ERROR: no ottoRequest row found for id=%s AND status = 1\n" "${requestId}" 1>&2 + hgsql -e "SELECT fromDb,toDb,status from ottoRequest WHERE id=${requestId};" hgcentraltest 1>&2 + exit 255 +fi + +export fromDb=$(printf "%s" "${ottoResult}" | cut -f1) +export toDb=$(printf "%s" "${ottoResult}" | cut -f2) + +if [ -z "${fromDb}" -o -z "${toDb}" ]; then + printf "ERROR: empty fromDb or toDb for ottoRequest id=%s\n" "${requestId}" 1>&2 + printf " got: fromDb='%s' toDb='%s'\n" "${fromDb}" "${toDb}" 1>&2 + setErrorStatus ${requestId} + exit 255 +fi + +printf "# ottoRequest id=%s: fromDb='%s' toDb='%s'\n" \ + "${requestId}" "${fromDb}" "${toDb}" 1>&2 + ############################################################################ # step 2: look up both identifiers -- GenArk accession or UCSC db name ############################################################################ case "${fromDb}" in GC[AF]_*) genarkLookup "${fromDb}" || exit 255 export fromId="${_acc}_${_asmName}" export fromClade="${_clade}" ;; *) dbDbCladeLookup "${fromDb}" || exit 255 export fromId="${fromDb}" export fromClade="${_clade}" ;; esac @@ -124,98 +178,65 @@ genarkLookup "${toDb}" || exit 255 export toId="${_acc}_${_asmName}" export toClade="${_clade}" ;; *) dbDbCladeLookup "${toDb}" || exit 255 export toId="${toDb}" export toClade="${_clade}" ;; esac printf "# from: %s clade=%s\n" "${fromId}" "${fromClade}" 1>&2 printf "# to: %s clade=%s\n" "${toId}" "${toClade}" 1>&2 ############################################################################ -# step 2b: compare N50 -- better N50 assembly becomes the alignment target +# step 3: determine N50 for each to decide target vs. query ############################################################################ -function twoBitPath() { - local asmName=$1 - case ${asmName} in - GC[AF]_*) - local gcX=$(printf "%s" "${asmName}" | cut -c1-3) - local d0=$(printf "%s" "${asmName}" | cut -c5-7) - local d1=$(printf "%s" "${asmName}" | cut -c8-10) - local d2=$(printf "%s" "${asmName}" | cut -c11-13) - printf "/hive/data/genomes/asmHubs/%s/%s/%s/%s/%s/%s.2bit" \ - "${gcX}" "${d0}" "${d1}" "${d2}" "${asmName}" "${asmName}" - ;; - *) - printf "/hive/data/genomes/%s/%s.2bit" "${asmName}" "${asmName}" - ;; - esac -} - -function asmN50() { - local twoBit=$1 - twoBitInfo "${twoBit}" stdout \ - | n50.pl stdin 2>&1 \ - | grep -A1 "^[0-9].*one half size" \ - | tail -1 \ - | awk '{print $NF}' -} export from2bit=$(twoBitPath "${fromDb}") export to2bit=$(twoBitPath "${toDb}") if [ ! -s "${from2bit}" ]; then printf "ERROR: 2bit file not found: %s\n" "${from2bit}" 1>&2 + setErrorStatus ${requestId} exit 255 fi if [ ! -s "${to2bit}" ]; then printf "ERROR: 2bit file not found: %s\n" "${to2bit}" 1>&2 + setErrorStatus ${requestId} exit 255 fi export fromN50=$(asmN50 "${from2bit}") export toN50=$(asmN50 "${to2bit}") printf "# from N50: %s (%s)\n" "${fromN50}" "${fromDb}" 1>&2 printf "# to N50: %s (%s)\n" "${toN50}" "${toDb}" 1>&2 if [ -n "${fromN50}" -a -n "${toN50}" ]; then if [ "${toN50}" -gt "${fromN50}" ]; then printf "# swapping: %s (N50=%s) becomes target over %s (N50=%s)\n" \ "${toId}" "${toN50}" "${fromId}" "${fromN50}" 1>&2 tmpId="${fromId}"; export fromId="${toId}"; export toId="${tmpId}" tmpClade="${fromClade}"; export fromClade="${toClade}"; export toClade="${tmpClade}" fi else printf "WARNING: could not determine N50, keeping original target/query order\n" 1>&2 fi -############################################################################ -# step 3: map clades and build the command -############################################################################ -export fromCladeArg=$(cladeMap "${fromClade}") -export toCladeArg=$(cladeMap "${toClade}") - -export cmd="kegAlignLastz.sh ${fromId} ${toId} ${fromCladeArg} ${toCladeArg}" - -printf "# %s\n" "${cmd}" 1>&2 - ############################################################################ # step 4: compute buildDir and update ottoRequest table ############################################################################ # derive accession ID and Query label the same way kegAlignLastz.sh does export tAccId=$(printf "%s" "${fromId}" | cut -d'_' -f1-2) export qAccId=$(printf "%s" "${toId}" | cut -d'_' -f1-2) export Query="${qAccId^}" export DS=$(date "+%F") # compute buildDir -- UCSC db default, then GenArk override export buildDir="/hive/data/genomes/${fromId}/bed/lastz${Query}.${DS}" export targetExists="/hive/data/genomes/${fromId}/bed" case ${fromId} in @@ -229,17 +250,28 @@ targetExists="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${fromId}/trackData" ;; esac # reuse existing build directory if one is already in progress working=$(ls -d ${targetExists}/lastz${Query}.* 2> /dev/null | wc -l) if [ "${working}" -gt 0 ]; then buildDir=$(ls -d ${targetExists}/lastz${Query}.* | tail -1) printf "# existing buildDir: %s\n" "${buildDir}" 1>&2 fi printf "# buildDir: %s\n" "${buildDir}" 1>&2 # store buildDir in ottoRequest table for workflowMonitor.sh hgsql -N -e \ - "UPDATE ottoRequest SET buildDir='${buildDir}' WHERE id=${requestId};" \ + "UPDATE ottoRequest SET buildDir='${buildDir}', status=2 WHERE id=${requestId};" \ hgcentraltest + +############################################################################ +# step 5: map clades and build the kegAlignLastz.sh command +############################################################################ +export fromCladeArg=$(cladeMap "${fromClade}") +export toCladeArg=$(cladeMap "${toClade}") + +export cmd="kegAlignLastz.sh ${fromId} ${toId} ${fromCladeArg} ${toCladeArg}" + +printf "####### kegAlignLastz.sh script would be:\n" 1>&2 +printf "# %s\n" "${cmd}" 1>&2