96adb44e8265446fccbaac39dd13bae7217d4825 hiram Fri Apr 17 15:13:38 2026 -0700 verify given GenArk assemblies actually exist in GenArk before proceeding refs #37380 diff --git src/hg/utils/automation/pairLastz.sh src/hg/utils/automation/pairLastz.sh index b56e6665c19..437efb6e556 100755 --- src/hg/utils/automation/pairLastz.sh +++ src/hg/utils/automation/pairLastz.sh @@ -1,36 +1,46 @@ #!/bin/bash set -beEu -o pipefail export userName="`whoami`" export bigHub="hgwdev" export workHorse="hgwdev" export smallClusterHub="hgwdev" export fileServer="hgwdev" +# parse optional -force flag to skip genark table verification +export forceRun=0 +if [ $# -gt 0 ] && [ "$1" = "-force" ]; then + forceRun=1 + shift +fi + if [ $# != 4 ]; then printf "ERROR: arg count: %d != 4\n" "$#" 1>&2 - printf "usage: pairLastz.sh <target> <query> <tClade> <qClade> + printf "usage: pairLastz.sh [-force] <target> <query> <tClade> <qClade> Where target/query is either a UCSC db name, or is an assembly hub identifier, e.g.: GCA_002844635.1_USMARCv1.0.1 And [tq]Clade is one of: primate|mammal|other +The -force option skips the hgcentraltest.genark table verification + for GenArk assembly identifiers. + Will create directory to work in, for example if, UCSC db: /hive/data/target/bed/lastzQuery.yyyy-mm-dd/ Or, in the assembly hub build directory: /hive/data/genomes/asmHubs/allBuild/GCA/002/844/635/GCA_002844635.1_USMARCv1.0/trackData/lastzQuery.yyyy-mm-dd Will set up a DEF file there, and a run.sh script to run all steps and output makeDoc text to document what happened. AND MORE, it will run the swap operation into the corresponding blastz.target.swap directory in the query genome work space. Email will be sent to: '$userName' upon completion. e.g.: pairLastz.sh rn7 papAnu4 mammal primate\n" 1>&2 @@ -131,44 +141,73 @@ oDate=`egrep -m 1 -i "^#[[:space:]]*Date:" ${asmRpt} | tr -d '\r' | sed -e 's/.*ate: \+//;'` ;; *) oDate="" ;; esac printf "%s" "${oDate}" } # check if this database is actually a database browser or a promoted # hub. It could have a MySQL database, but it won't have a chromInfo function promotedHub() { export db=$1 } +# verifyGenark - verify a GenArk accession exists in hgcentraltest.genark +# returns 0 if found, 1 if not found +function verifyGenark() { + local asmAccession=$1 + local fullName=$2 + local count=$(hgsql -N -e "SELECT COUNT(*) FROM genark WHERE gcAccession='${asmAccession}';" hgcentraltest) + if [ "$count" -eq 0 ]; then + printf "ERROR: assembly '%s' not found in GenArk\n" "$fullName" 1>&2 + return 1 + fi + return 0 +} + ############################################################################## ############################################################################## ### start seconds export startT=`date "+%s"` export target="$1" export query="$2" export tClade="$3" export qClade="$4" export tGcPath=$(gcPath $target) export qGcPath=$(gcPath $query) export tAsmId=$(asmId $target) export qAsmId=$(asmId $query) + +# verify GenArk assemblies exist in hgcentraltest.genark unless -force +if [ "$forceRun" -eq 0 ]; then + export genarkErrors=0 + case $target in + GC[AF]_*) verifyGenark "$tAsmId" "$target" || genarkErrors=$((genarkErrors+1)) ;; + esac + case $query in + GC[AF]_*) verifyGenark "$qAsmId" "$query" || genarkErrors=$((genarkErrors+1)) ;; + esac + if [ "$genarkErrors" -gt 0 ]; then + printf "Use -force to skip this check\n" 1>&2 + exit 255 + fi +fi + printf "# tq: '${target}' '${query}' '${tClade}' '${qClade}'\n" 1>&2 printf "# tq gcPath: '${tGcPath}' '${qGcPath}'\n" 1>&2 printf "# tq asmId: '${tAsmId}' '${qAsmId}'\n" 1>&2 # upper case first character export Target="${tAsmId^}" export Query="${qAsmId^}" export DS=`date "+%F"` # assume UCSC db build export buildDir="/hive/data/genomes/${target}/bed/lastz${Query}.${DS}" export targetExists="/hive/data/genomes/${target}/bed" export symLink="/hive/data/genomes/${target}/bed/lastz.${qAsmId}" export swapDir="/hive/data/genomes/${query}/bed/blastz.${tAsmId}.swap"