f0b8830616972bad195079b7ce775c919ff1e25f hiram Thu Apr 23 15:13:50 2026 -0700 correctly decide which is going to be target and query based on the N50 sizes of the two given assemblies refs #31811 diff --git src/hg/utils/otto/userRequests/ottoRequestAlign.sh src/hg/utils/otto/userRequests/ottoRequestAlign.sh index cc1ec80e3ea..7b9559b659b 100755 --- src/hg/utils/otto/userRequests/ottoRequestAlign.sh +++ src/hg/utils/otto/userRequests/ottoRequestAlign.sh @@ -1,146 +1,245 @@ #!/bin/bash # ottoRequestAlign.sh - look up an ottoRequest row by id and construct # the kegAlignLastz.sh command line from genark metadata # # usage: ottoRequestAlign.sh # # Queries hgcentraltest.ottoRequest for fromDb/toDb, then looks up # each accession in hgcentraltest.genark for asmName and clade. # Prints and executes the resulting kegAlignLastz.sh command. set -beEu -o pipefail if [ $# != 1 ]; then printf "usage: ottoRequestAlign.sh \n" 1>&2 printf " where is a row id from hgcentraltest.ottoRequest\n" 1>&2 exit 255 fi export requestId="$1" # validate id is a positive integer case "${requestId}" in ''|*[!0-9]*) printf "ERROR: id must be a positive integer, got: '%s'\n" "${requestId}" 1>&2 exit 255 ;; esac ############################################################################ # step 1: look up fromDb and toDb from ottoRequest ############################################################################ export ottoResult=$(hgsql -N -e \ "select fromDb,toDb from ottoRequest where id=${requestId};" hgcentraltest) if [ -z "${ottoResult}" ]; then printf "ERROR: no ottoRequest row found for id=%s\n" "${requestId}" 1>&2 exit 255 fi export fromDb=$(printf "%s" "${ottoResult}" | cut -f1) export toDb=$(printf "%s" "${ottoResult}" | cut -f2) if [ -z "${fromDb}" -o -z "${toDb}" ]; then printf "ERROR: empty fromDb or toDb for ottoRequest id=%s\n" "${requestId}" 1>&2 printf " got: fromDb='%s' toDb='%s'\n" "${fromDb}" "${toDb}" 1>&2 exit 255 fi printf "# ottoRequest id=%s: fromDb='%s' toDb='%s'\n" \ "${requestId}" "${fromDb}" "${toDb}" 1>&2 ############################################################################ # genarkLookup - query genark table for accession, asmName, clade # arg: gcAccession (e.g. GCF_000002285.3) # sets: _acc, _asmName, _clade ############################################################################ function genarkLookup() { local acc=$1 local result=$(hgsql -N -e \ "select gcAccession,asmName,clade from genark where gcAccession='${acc}';" \ hgcentraltest) if [ -z "${result}" ]; then printf "ERROR: accession '%s' not found in hgcentraltest.genark\n" "${acc}" 1>&2 return 1 fi _acc=$(printf "%s" "${result}" | cut -f1) _asmName=$(printf "%s" "${result}" | cut -f2) _clade=$(printf "%s" "${result}" | cut -f3) } ############################################################################ # dbDbCladeLookup - look up clade for a UCSC database name # from dbDb.name.clade.tsv (in same directory as this script) # arg: dbName (e.g. hg38, rn7) # sets: _clade ############################################################################ function dbDbCladeLookup() { local dbName=$1 local scriptDir=$(cd "$(dirname "$0")" && pwd) local tsvFile="${scriptDir}/dbDb.name.clade.tsv" if [ ! -s "${tsvFile}" ]; then printf "ERROR: clade lookup file not found: %s\n" "${tsvFile}" 1>&2 return 1 fi _clade=$(grep -v '^#' "${tsvFile}" | awk -F'\t' -v db="${dbName}" '$1==db {print $2}') if [ -z "${_clade}" ]; then printf "ERROR: UCSC db '%s' not found in %s\n" "${dbName}" "${tsvFile}" 1>&2 return 1 fi } ############################################################################ # cladeMap - convert genark/dbDb plural clade to kegAlignLastz singular form # primates -> primate, mammals -> mammal, everything else -> other ############################################################################ function cladeMap() { local genarkClade=$1 case "${genarkClade}" in primates) printf "primate" ;; mammals) printf "mammal" ;; *) printf "other" ;; esac } ############################################################################ # step 2: look up both identifiers -- GenArk accession or UCSC db name ############################################################################ case "${fromDb}" in GC[AF]_*) genarkLookup "${fromDb}" || exit 255 export fromId="${_acc}_${_asmName}" export fromClade="${_clade}" ;; *) dbDbCladeLookup "${fromDb}" || exit 255 export fromId="${fromDb}" export fromClade="${_clade}" ;; esac case "${toDb}" in GC[AF]_*) genarkLookup "${toDb}" || exit 255 export toId="${_acc}_${_asmName}" export toClade="${_clade}" ;; *) dbDbCladeLookup "${toDb}" || exit 255 export toId="${toDb}" export toClade="${_clade}" ;; esac printf "# from: %s clade=%s\n" "${fromId}" "${fromClade}" 1>&2 printf "# to: %s clade=%s\n" "${toId}" "${toClade}" 1>&2 +############################################################################ +# step 2b: compare N50 -- better N50 assembly becomes the alignment target +############################################################################ +function twoBitPath() { + local asmName=$1 + case ${asmName} in + GC[AF]_*) + local gcX=$(printf "%s" "${asmName}" | cut -c1-3) + local d0=$(printf "%s" "${asmName}" | cut -c5-7) + local d1=$(printf "%s" "${asmName}" | cut -c8-10) + local d2=$(printf "%s" "${asmName}" | cut -c11-13) + printf "/hive/data/genomes/asmHubs/%s/%s/%s/%s/%s/%s.2bit" \ + "${gcX}" "${d0}" "${d1}" "${d2}" "${asmName}" "${asmName}" + ;; + *) + printf "/hive/data/genomes/%s/%s.2bit" "${asmName}" "${asmName}" + ;; + esac +} + +function asmN50() { + local twoBit=$1 + twoBitInfo "${twoBit}" stdout \ + | n50.pl stdin 2>&1 \ + | grep -A1 "^[0-9].*one half size" \ + | tail -1 \ + | awk '{print $NF}' +} + +export from2bit=$(twoBitPath "${fromDb}") +export to2bit=$(twoBitPath "${toDb}") + +if [ ! -s "${from2bit}" ]; then + printf "ERROR: 2bit file not found: %s\n" "${from2bit}" 1>&2 + exit 255 +fi + +if [ ! -s "${to2bit}" ]; then + printf "ERROR: 2bit file not found: %s\n" "${to2bit}" 1>&2 + exit 255 +fi + +export fromN50=$(asmN50 "${from2bit}") +export toN50=$(asmN50 "${to2bit}") + +printf "# from N50: %s (%s)\n" "${fromN50}" "${fromDb}" 1>&2 +printf "# to N50: %s (%s)\n" "${toN50}" "${toDb}" 1>&2 + +if [ -n "${fromN50}" -a -n "${toN50}" ]; then + if [ "${toN50}" -gt "${fromN50}" ]; then + printf "# swapping: %s (N50=%s) becomes target over %s (N50=%s)\n" \ + "${toId}" "${toN50}" "${fromId}" "${fromN50}" 1>&2 + tmpId="${fromId}"; export fromId="${toId}"; export toId="${tmpId}" + tmpClade="${fromClade}"; export fromClade="${toClade}"; export toClade="${tmpClade}" + fi +else + printf "WARNING: could not determine N50, keeping original target/query order\n" 1>&2 +fi + ############################################################################ # step 3: map clades and build the command ############################################################################ export fromCladeArg=$(cladeMap "${fromClade}") export toCladeArg=$(cladeMap "${toClade}") export cmd="kegAlignLastz.sh ${fromId} ${toId} ${fromCladeArg} ${toCladeArg}" printf "# %s\n" "${cmd}" 1>&2 + +############################################################################ +# step 4: compute buildDir and update ottoRequest table +############################################################################ + +# derive accession ID and Query label the same way kegAlignLastz.sh does +export tAccId=$(printf "%s" "${fromId}" | cut -d'_' -f1-2) +export qAccId=$(printf "%s" "${toId}" | cut -d'_' -f1-2) +export Query="${qAccId^}" +export DS=$(date "+%F") + +# compute buildDir -- UCSC db default, then GenArk override +export buildDir="/hive/data/genomes/${fromId}/bed/lastz${Query}.${DS}" +export targetExists="/hive/data/genomes/${fromId}/bed" + +case ${fromId} in + GC[AF]_*) + gcX=$(printf "%s" "${fromId}" | cut -c1-3) + d0=$(printf "%s" "${fromId}" | cut -c5-7) + d1=$(printf "%s" "${fromId}" | cut -c8-10) + d2=$(printf "%s" "${fromId}" | cut -c11-13) + tGcPath="${gcX}/${d0}/${d1}/${d2}" + buildDir="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${fromId}/trackData/lastz${Query}.${DS}" + targetExists="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${fromId}/trackData" + ;; +esac + +# reuse existing build directory if one is already in progress +working=$(ls -d ${targetExists}/lastz${Query}.* 2> /dev/null | wc -l) +if [ "${working}" -gt 0 ]; then + buildDir=$(ls -d ${targetExists}/lastz${Query}.* | tail -1) + printf "# existing buildDir: %s\n" "${buildDir}" 1>&2 +fi + +printf "# buildDir: %s\n" "${buildDir}" 1>&2 + +# store buildDir in ottoRequest table for workflowMonitor.sh +hgsql -N -e \ + "UPDATE ottoRequest SET buildDir='${buildDir}' WHERE id=${requestId};" \ + hgcentraltest