src/hg/utils/otto/userRequests/ottoRequestAlign.sh f0b8830616972bad195079b7ce775c919ff1e25f

f0b8830616972bad195079b7ce775c919ff1e25f
hiram
  Thu Apr 23 15:13:50 2026 -0700
correctly decide which is going to be target and query based on the N50 sizes of the two given assemblies refs #31811

diff --git src/hg/utils/otto/userRequests/ottoRequestAlign.sh src/hg/utils/otto/userRequests/ottoRequestAlign.sh
index cc1ec80e3ea..7b9559b659b 100755
--- src/hg/utils/otto/userRequests/ottoRequestAlign.sh
+++ src/hg/utils/otto/userRequests/ottoRequestAlign.sh
@@ -1,146 +1,245 @@
 #!/bin/bash
 
 # ottoRequestAlign.sh - look up an ottoRequest row by id and construct
 #   the kegAlignLastz.sh command line from genark metadata
 #
 # usage: ottoRequestAlign.sh <id>
 #
 # Queries hgcentraltest.ottoRequest for fromDb/toDb, then looks up
 # each accession in hgcentraltest.genark for asmName and clade.
 # Prints and executes the resulting kegAlignLastz.sh command.
 
 set -beEu -o pipefail
 
 if [ $# != 1 ]; then
   printf "usage: ottoRequestAlign.sh <id>\n" 1>&2
   printf "  where <id> is a row id from hgcentraltest.ottoRequest\n" 1>&2
   exit 255
 fi
 
 export requestId="$1"
 
 # validate id is a positive integer
 case "${requestId}" in
   ''|*[!0-9]*)
     printf "ERROR: id must be a positive integer, got: '%s'\n" "${requestId}" 1>&2
     exit 255
     ;;
 esac
 
 ############################################################################
 # step 1: look up fromDb and toDb from ottoRequest
 ############################################################################
 export ottoResult=$(hgsql -N -e \
   "select fromDb,toDb from ottoRequest where id=${requestId};" hgcentraltest)
 
 if [ -z "${ottoResult}" ]; then
   printf "ERROR: no ottoRequest row found for id=%s\n" "${requestId}" 1>&2
   exit 255
 fi
 
 export fromDb=$(printf "%s" "${ottoResult}" | cut -f1)
 export toDb=$(printf "%s" "${ottoResult}" | cut -f2)
 
 if [ -z "${fromDb}" -o -z "${toDb}" ]; then
   printf "ERROR: empty fromDb or toDb for ottoRequest id=%s\n" "${requestId}" 1>&2
   printf "  got: fromDb='%s' toDb='%s'\n" "${fromDb}" "${toDb}" 1>&2
   exit 255
 fi
 
 printf "# ottoRequest id=%s: fromDb='%s' toDb='%s'\n" \
   "${requestId}" "${fromDb}" "${toDb}" 1>&2
 
 ############################################################################
 # genarkLookup - query genark table for accession, asmName, clade
 #   arg: gcAccession (e.g. GCF_000002285.3)
 #   sets: _acc, _asmName, _clade
 ############################################################################
 function genarkLookup() {
   local acc=$1
   local result=$(hgsql -N -e \
     "select gcAccession,asmName,clade from genark where gcAccession='${acc}';" \
     hgcentraltest)
   if [ -z "${result}" ]; then
     printf "ERROR: accession '%s' not found in hgcentraltest.genark\n" "${acc}" 1>&2
     return 1
   fi
   _acc=$(printf "%s" "${result}" | cut -f1)
   _asmName=$(printf "%s" "${result}" | cut -f2)
   _clade=$(printf "%s" "${result}" | cut -f3)
 }
 
 ############################################################################
 # dbDbCladeLookup - look up clade for a UCSC database name
 #   from dbDb.name.clade.tsv (in same directory as this script)
 #   arg: dbName (e.g. hg38, rn7)
 #   sets: _clade
 ############################################################################
 function dbDbCladeLookup() {
   local dbName=$1
   local scriptDir=$(cd "$(dirname "$0")" && pwd)
   local tsvFile="${scriptDir}/dbDb.name.clade.tsv"
   if [ ! -s "${tsvFile}" ]; then
     printf "ERROR: clade lookup file not found: %s\n" "${tsvFile}" 1>&2
     return 1
   fi
   _clade=$(grep -v '^#' "${tsvFile}" | awk -F'\t' -v db="${dbName}" '$1==db {print $2}')
   if [ -z "${_clade}" ]; then
     printf "ERROR: UCSC db '%s' not found in %s\n" "${dbName}" "${tsvFile}" 1>&2
     return 1
   fi
 }
 
 ############################################################################
 # cladeMap - convert genark/dbDb plural clade to kegAlignLastz singular form
 #   primates -> primate, mammals -> mammal, everything else -> other
 ############################################################################
 function cladeMap() {
   local genarkClade=$1
   case "${genarkClade}" in
     primates) printf "primate" ;;
     mammals)  printf "mammal"  ;;
     *)        printf "other"   ;;
   esac
 }
 
 ############################################################################
 # step 2: look up both identifiers -- GenArk accession or UCSC db name
 ############################################################################
 case "${fromDb}" in
   GC[AF]_*)
     genarkLookup "${fromDb}" || exit 255
     export fromId="${_acc}_${_asmName}"
     export fromClade="${_clade}"
     ;;
   *)
     dbDbCladeLookup "${fromDb}" || exit 255
     export fromId="${fromDb}"
     export fromClade="${_clade}"
     ;;
 esac
 
 case "${toDb}" in
   GC[AF]_*)
     genarkLookup "${toDb}" || exit 255
     export toId="${_acc}_${_asmName}"
     export toClade="${_clade}"
     ;;
   *)
     dbDbCladeLookup "${toDb}" || exit 255
     export toId="${toDb}"
     export toClade="${_clade}"
     ;;
 esac
 
 printf "# from: %s  clade=%s\n" "${fromId}" "${fromClade}" 1>&2
 printf "#   to: %s  clade=%s\n" "${toId}" "${toClade}" 1>&2
 
+############################################################################
+# step 2b: compare N50 -- better N50 assembly becomes the alignment target
+############################################################################
+function twoBitPath() {
+  local asmName=$1
+  case ${asmName} in
+    GC[AF]_*)
+      local gcX=$(printf "%s" "${asmName}" | cut -c1-3)
+      local d0=$(printf "%s" "${asmName}" | cut -c5-7)
+      local d1=$(printf "%s" "${asmName}" | cut -c8-10)
+      local d2=$(printf "%s" "${asmName}" | cut -c11-13)
+      printf "/hive/data/genomes/asmHubs/%s/%s/%s/%s/%s/%s.2bit" \
+        "${gcX}" "${d0}" "${d1}" "${d2}" "${asmName}" "${asmName}"
+      ;;
+    *)
+      printf "/hive/data/genomes/%s/%s.2bit" "${asmName}" "${asmName}"
+      ;;
+  esac
+}
+
+function asmN50() {
+  local twoBit=$1
+  twoBitInfo "${twoBit}" stdout \
+    | n50.pl stdin 2>&1 \
+    | grep -A1 "^[0-9].*one half size" \
+    | tail -1 \
+    | awk '{print $NF}'
+}
+
+export from2bit=$(twoBitPath "${fromDb}")
+export to2bit=$(twoBitPath "${toDb}")
+
+if [ ! -s "${from2bit}" ]; then
+  printf "ERROR: 2bit file not found: %s\n" "${from2bit}" 1>&2
+  exit 255
+fi
+
+if [ ! -s "${to2bit}" ]; then
+  printf "ERROR: 2bit file not found: %s\n" "${to2bit}" 1>&2
+  exit 255
+fi
+
+export fromN50=$(asmN50 "${from2bit}")
+export toN50=$(asmN50 "${to2bit}")
+
+printf "# from N50: %s (%s)\n" "${fromN50}" "${fromDb}" 1>&2
+printf "#   to N50: %s (%s)\n" "${toN50}" "${toDb}" 1>&2
+
+if [ -n "${fromN50}" -a -n "${toN50}" ]; then
+  if [ "${toN50}" -gt "${fromN50}" ]; then
+    printf "# swapping: %s (N50=%s) becomes target over %s (N50=%s)\n" \
+      "${toId}" "${toN50}" "${fromId}" "${fromN50}" 1>&2
+    tmpId="${fromId}"; export fromId="${toId}"; export toId="${tmpId}"
+    tmpClade="${fromClade}"; export fromClade="${toClade}"; export toClade="${tmpClade}"
+  fi
+else
+  printf "WARNING: could not determine N50, keeping original target/query order\n" 1>&2
+fi
+
 ############################################################################
 # step 3: map clades and build the command
 ############################################################################
 export fromCladeArg=$(cladeMap "${fromClade}")
 export toCladeArg=$(cladeMap "${toClade}")
 
 export cmd="kegAlignLastz.sh ${fromId} ${toId} ${fromCladeArg} ${toCladeArg}"
 
 printf "# %s\n" "${cmd}" 1>&2
+
+############################################################################
+# step 4: compute buildDir and update ottoRequest table
+############################################################################
+
+# derive accession ID and Query label the same way kegAlignLastz.sh does
+export tAccId=$(printf "%s" "${fromId}" | cut -d'_' -f1-2)
+export qAccId=$(printf "%s" "${toId}" | cut -d'_' -f1-2)
+export Query="${qAccId^}"
+export DS=$(date "+%F")
+
+# compute buildDir -- UCSC db default, then GenArk override
+export buildDir="/hive/data/genomes/${fromId}/bed/lastz${Query}.${DS}"
+export targetExists="/hive/data/genomes/${fromId}/bed"
+
+case ${fromId} in
+  GC[AF]_*)
+    gcX=$(printf "%s" "${fromId}" | cut -c1-3)
+    d0=$(printf "%s" "${fromId}" | cut -c5-7)
+    d1=$(printf "%s" "${fromId}" | cut -c8-10)
+    d2=$(printf "%s" "${fromId}" | cut -c11-13)
+    tGcPath="${gcX}/${d0}/${d1}/${d2}"
+    buildDir="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${fromId}/trackData/lastz${Query}.${DS}"
+    targetExists="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${fromId}/trackData"
+    ;;
+esac
+
+# reuse existing build directory if one is already in progress
+working=$(ls -d ${targetExists}/lastz${Query}.* 2> /dev/null | wc -l)
+if [ "${working}" -gt 0 ]; then
+  buildDir=$(ls -d ${targetExists}/lastz${Query}.* | tail -1)
+  printf "# existing buildDir: %s\n" "${buildDir}" 1>&2
+fi
+
+printf "# buildDir: %s\n" "${buildDir}" 1>&2
+
+# store buildDir in ottoRequest table for workflowMonitor.sh
+hgsql -N -e \
+  "UPDATE ottoRequest SET buildDir='${buildDir}' WHERE id=${requestId};" \
+  hgcentraltest