7edafbf8e253dec09305b4c540ebff3cd9557837
hiram
  Fri Apr 17 16:30:08 2026 -0700
add the genark verification check and fixup errors in the scripting refs #31811

diff --git src/hg/utils/automation/kegAlignLastz.sh src/hg/utils/automation/kegAlignLastz.sh
index 28875950c8b..d99ca5ca12a 100755
--- src/hg/utils/automation/kegAlignLastz.sh
+++ src/hg/utils/automation/kegAlignLastz.sh
@@ -1,38 +1,48 @@
 #!/bin/bash
 
 set -beEu -o pipefail
 
 export userName="`whoami`"
 
 export bigHub="hgwdev"
 export workHorse="hgwdev"
 export smallClusterHub="hgwdev"
 export fileServer="hgwdev"
 #  a python virtual environment to run this
 export planemoCmd="/hive/users/hiram/galaxy/venv3.12/bin/planemo"
 
+# parse optional -force flag to skip genark table verification
+export forceRun=0
+if [ $# -gt 0 ] && [ "$1" = "-force" ]; then
+  forceRun=1
+  shift
+fi
+
 if [ $# != 4 ]; then
   printf "ERROR: arg count: %d != 4\n" "$#" 1>&2
 
-  printf "usage: kegAlignLastz.sh <target> <query> <tClade> <qClade>
+  printf "usage: kegAlignLastz.sh [-force] <target> <query> <tClade> <qClade>
 
 Where target/query is either a UCSC db name, or is an
    assembly hub identifier, e.g.: GCA_002844635.1_USMARCv1.0.1
 
 And [tq]Clade is one of: primate|mammal|other
 
+The -force option skips the hgcentraltest.genark table verification
+   for GenArk assembly identifiers.
+
 Will create directory to work in, for example if, UCSC db:
   /hive/data/target/bed/lastzQuery.yyyy-mm-dd/
 
 Or, in the assembly hub build directory:
 /hive/data/genomes/asmHubs/allBuild/GCA/002/844/635/GCA_002844635.1_USMARCv1.0/trackData/lastzQuery.yyyy-mm-dd
 
 Will set up a DEF file there, and a kegAlign.sh script to run all steps
   and output makeDoc text to document what happened.
   Also sets up a lastzRun.sh script that could be used to run the UCSC
   lastz procedure.
 
 AND MORE, it will run the swap operation into the corresponding
   blastz.target.swap directory in the query genome work space.
 
 Email will be sent to: '$userName' upon completion.
@@ -146,44 +156,73 @@
   export asmName=$1
   case $asmName in
      GC[AF]_*)
        gcxPath=$(gcPath $asmName)
        asmDir="/hive/data/outside/ncbi/genomes/${gcxPath}/${asmName}"
        asmRpt="${asmDir}/${asmName}_assembly_report.txt"
        oDate=`egrep -m 1 -i "^#[[:space:]]*Date:" ${asmRpt} | tr -d '\r' | sed -e 's/.*ate: \+//;'`
        ;;
      *)
        oDate=""
        ;;
   esac
   printf "%s" "${oDate}"
 }
 
+# verifyGenark - verify a GenArk accession exists in hgcentraltest.genark
+#   returns 0 if found, 1 if not found
+function verifyGenark() {
+  local asmAccession=$1
+  local fullName=$2
+  local count=$(hgsql -N -e "SELECT COUNT(*) FROM genark WHERE gcAccession='${asmAccession}';" hgcentraltest)
+  if [ "$count" -eq 0 ]; then
+    printf "ERROR: assembly '%s' not found in GenArk\n" "$fullName" 1>&2
+    return 1
+  fi
+  return 0
+}
+
 ##############################################################################
 ##############################################################################
 ### start seconds
 export startT=`date "+%s"`
 
 export target="$1"
 export query="$2"
 export tClade="$3"
 export qClade="$4"
 
 export tGcPath=$(gcPath $target)
 export qGcPath=$(gcPath $query)
 export tAccId=$(accId $target)
 export qAccId=$(accId $query)
+
+# verify GenArk assemblies exist in hgcentraltest.genark unless -force
+if [ "$forceRun" -eq 0 ]; then
+  export genarkErrors=0
+  case $target in
+    GC[AF]_*) verifyGenark "$tAccId" "$target" || genarkErrors=$((genarkErrors+1)) ;;
+  esac
+  case $query in
+    GC[AF]_*) verifyGenark "$qAccId" "$query" || genarkErrors=$((genarkErrors+1)) ;;
+  esac
+  if [ "$genarkErrors" -gt 0 ]; then
+    printf "Use -force to skip this check\n" 1>&2
+    exit 255
+  fi
+fi
+
 printf "# tq: '${target}' '${query}' '${tClade}' '${qClade}'\n" 1>&2
 printf "# tq gcPath: '${tGcPath}' '${qGcPath}'\n" 1>&2
 printf "# tq accId: '${tAccId}' '${qAccId}'\n" 1>&2
 
 # upper case first character
 export Target="${tAccId^}"
 export Query="${qAccId^}"
 
 export DS=`date "+%F"`
 
 # assume UCSC db build
 export buildDir="/hive/data/genomes/${target}/bed/lastz${Query}.${DS}"
 export targetExists="/hive/data/genomes/${target}/bed"
 export symLink="/hive/data/genomes/${target}/bed/lastz.${qAccId}"
 export swapDir="/hive/data/genomes/${query}/bed/blastz.${tAccId}.swap"
@@ -506,30 +545,31 @@
 if [ $primaryDone -eq 0 ]; then
   mkdir -p "${buildDir}"
 
 ### setup the DEF file
 printf "%s" "${defString}" > ${buildDir}/DEF
 ### and the yaml file
 printf "%s" "${yamlString}" > ${buildDir}/${tAccId}.${qAccId}.yaml
 
 ### setup the buildDir/lastzRun.sh script
 printf "#!/bin/bash
 
 set -beEu -o pipefail
 
 export targetDb=\"${tAccId}\"
 export queryDb=\"${qAccId}\"
+export TargetDb=\"${Target}\"
 export QueryDb=\"${Query}\"
 
 cd ${buildDir}
 time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\
   $tFullName $qFullName -workhorse=${workHorse} -smallClusterHub=${smallClusterHub} \\
     -bigClusterHub=${bigHub} \\
     -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > do.log 2>&1
 
 grep -w real do.log | sed -e 's/^/    # /;'
 
 sed -e 's/^/    # /;' fb.\${targetDb}.chain\${QueryDb}Link.txt
 sed -e 's/^/    # /;' fb.\${targetDb}.chainSyn\${QueryDb}Link.txt
 
 time (~/kent/src/hg/utils/automation/doRecipBest.pl ${rBestTrackHub} -load -workhorse=${workHorse} -buildDir=\`pwd\` \\
    ${tRbestArgs} \\
@@ -557,37 +597,37 @@
 export QueryDb=\"${Query}\"
 export Target=\"${Target}\"
 export tSizes=\"${targetSizes}\"
 export qSizes=\"${querySizes}\"
 
 ############################################################################
 # chainBigBedFb - convert chain to bigBed and compute featureBits
 # args: db chainName chainGz sizesFile fbFile
 function chainBigBedFb() {
   local db=\$1
   local chainName=\$2
   local chainGz=\$3
   local sizesFile=\$4
   local fbFile=\$5
   chainToBigChain \"\${chainGz}\" \${chainName}.tab \${chainName}Link.tab
-  bedToBigBed -type=bed6+6 -as=~/kent/src/hg/lib/bigChain.as -tab \${chainName}.tab \${sizesFile} \${chainName}.bb
-  bedToBigBed -type=bed4+1 -as=~/kent/src/hg/lib/bigLink.as -tab \${chainName}Link.tab \${sizesFile} \${chainName}Link.bb
+  bedToBigBed -type=bed6+6 -as=\$HOME/kent/src/hg/lib/bigChain.as -tab \${chainName}.tab \${sizesFile} \${chainName}.bb
+  bedToBigBed -type=bed4+1 -as=\$HOME/kent/src/hg/lib/bigLink.as -tab \${chainName}Link.tab \${sizesFile} \${chainName}Link.bb
   rm -f \${chainName}.tab \${chainName}Link.tab chain.tab link.tab
-  local totalBases=\`ave -col=2 \${sizesFile} | grep \"^total\" | awk '{printf \"%d\", \\\$2}'\`
+  local totalBases=\`ave -col=2 \${sizesFile} | grep \"^total\" | awk '{printf \"%%d\", \$2}'\`
   local basesCovered=\`bigBedInfo \${chainName}Link.bb | grep \"basesCovered\" | cut -d' ' -f2 | tr -d ','\`
-  local percentCovered=\`echo \${basesCovered} \${totalBases} | awk '{printf \"%.3f\", 100.0*\\\$1/\\\$2}'\`
-  printf \"%d bases of %d (%s%%%%) in intersection\\n\" \"\${basesCovered}\" \"\${totalBases}\" \"\${percentCovered}\" > \${fbFile}
+  local percentCovered=\`echo \${basesCovered} \${totalBases} | awk '{printf \"%%.3f\", 100.0*\$1/\$2}'\`
+  printf \"%%d bases of %%d (%%s%%%%) in intersection\\\n\" \"\${basesCovered}\" \"\${totalBases}\" \"\${percentCovered}\" > \${fbFile}
 }
 ############################################################################
 
 cd \${buildDir}
 mkdir -p log
 export DS=\`date \"+%%F_%%T_%%s\"\`
 if [ -s successInvocationId.txt ]; then
   DS=\`cut -f1 successInvocationId.txt\`
 fi
 export logDir=\"\${buildDir}/log\"
 export logFile=\"\${logDir}/\${DS}.log\"
 ### only try to run this if there is no indication it was successful
 if [ ! -s \"successInvocationId.txt\" ]; then
   time (\${PM} run \\
     ~/kent/src/hg/utils/automation/kegAlign.json.ga \\
@@ -604,104 +644,104 @@
   printf \"%%s\\tinvocation ID: %%s\\t%%s\\n\" \"\${DS}\" \"\${invocationId}\" \"\${logDir}/runOutput.\${DS}.json\" > successInvocationId.txt
 fi
 ### use that runOutput log file to get the history ID for deletion
 
 ### install allChain into buildDir/axtChain/
 mkdir -p \${buildDir}/axtChain
 if [ ! -s \"\${buildDir}/axtChain/\${targetDb}.\${queryDb}.all.chain.gz\" ]; then
   allChainFile=\`ls result/\${DS}/allChain__*.chain\`
   gzip -c \"\${allChainFile}\" > \${buildDir}/axtChain/\${targetDb}.\${queryDb}.all.chain.gz
 fi
 
 ### convert target allChain to bigBed and measure featureBits
 cd \${buildDir}/axtChain
 if [ ! -s \"\${buildDir}/fb.\${targetDb}.chain\${QueryDb}Link.txt\" ]; then
   allChain=\"\${targetDb}.\${queryDb}.all.chain.gz\"
-  chainBigBedFb \${targetDb} allChain \${allChain} \${tSizes} \${buildDir}/fb.\${targetDb}.chain\${QueryDb}Link.txt
+  chainBigBedFb \${targetDb} chain\${QueryDb} \${allChain} \${tSizes} \${buildDir}/fb.\${targetDb}.chain\${QueryDb}Link.txt
 fi
 cat \"\${buildDir}/fb.\${targetDb}.chain\${QueryDb}Link.txt\" 1>&2
 
 ### install netChainSubset (over.chain) for target
 cd \${buildDir}
 if [ ! -s \"\${buildDir}/axtChain/\${targetDb}.\${queryDb}.over.chain.gz\" ]; then
-  overChainFile=\`ls result/\${DS}/'netChainSubset on dataset 7 and 21__'*.chain\`
+  overChainFile=\`ls result/\${DS}/'netChainSubset on dataset 8 and 22__'*.chain\`
   gzip -c \"\${overChainFile}\" > \${buildDir}/axtChain/\${targetDb}.\${queryDb}.over.chain.gz
 fi
 
 ### convert target over.chain to bigBed and measure featureBits
 cd \${buildDir}/axtChain
-if [ ! -s \"\${buildDir}/fb.\${targetDb}.chainSyn\${QueryDb}Link.txt\" ]; then
+if [ ! -s \"\${buildDir}/fb.\${targetDb}.chainLiftOver\${QueryDb}Link.txt\" ]; then
   overChain=\"\${targetDb}.\${queryDb}.over.chain.gz\"
-  chainBigBedFb \${targetDb} overChain \${overChain} \${tSizes} \${buildDir}/fb.\${targetDb}.chainSyn\${QueryDb}Link.txt
+  chainBigBedFb \${targetDb} chainLiftOver\${QueryDb} \${overChain} \${tSizes} \${buildDir}/fb.\${targetDb}.chainLiftOver\${QueryDb}Link.txt
 fi
-cat \"\${buildDir}/fb.\${targetDb}.chainSyn\${QueryDb}Link.txt\" 1>&2
+cat \"\${buildDir}/fb.\${targetDb}.chainLiftOver\${QueryDb}Link.txt\" 1>&2
 
 ### install allChainSwap into swapDir/axtChain/
 mkdir -p \${swapDir}/axtChain
 cd \${buildDir}
 if [ ! -s \"\${swapDir}/axtChain/\${queryDb}.\${targetDb}.all.chain.gz\" ]; then
   allChainSwapFile=\`ls result/\${DS}/allChainSwap__*.chain\`
   gzip -c \"\${allChainSwapFile}\" > \${swapDir}/axtChain/\${queryDb}.\${targetDb}.all.chain.gz
 fi
 
 ### convert swap allChain to bigBed and measure featureBits
 cd \${swapDir}/axtChain
 if [ ! -s \"\${swapDir}/fb.\${queryDb}.chain\${Target}Link.txt\" ]; then
   allChain=\"\${queryDb}.\${targetDb}.all.chain.gz\"
-  chainBigBedFb \${queryDb} allChain \${allChain} \${qSizes} \${swapDir}/fb.\${queryDb}.chain\${Target}Link.txt
+  chainBigBedFb \${queryDb} chain\${TargetDb} \${allChain} \${qSizes} \${swapDir}/fb.\${queryDb}.chain\${Target}Link.txt
 fi
 cat \"\${swapDir}/fb.\${queryDb}.chain\${Target}Link.txt\" 1>&2
 
 ### install netChainSubset (over.chain) for swap
 cd \${buildDir}
 if [ ! -s \"\${swapDir}/axtChain/\${queryDb}.\${targetDb}.over.chain.gz\" ]; then
-  overChainSwapFile=\`ls result/\${DS}/'netChainSubset on dataset 8 and 35__'*.chain\`
+  overChainSwapFile=\`ls result/\${DS}/'netChainSubset on dataset 9 and 36__'*.chain\`
   gzip -c \"\${overChainSwapFile}\" > \${swapDir}/axtChain/\${queryDb}.\${targetDb}.over.chain.gz
 fi
 
 ### convert swap over.chain to bigBed and measure featureBits
 cd \${swapDir}/axtChain
-if [ ! -s \"\${swapDir}/fb.\${queryDb}.chainSyn\${Target}Link.txt\" ]; then
+if [ ! -s \"\${swapDir}/fb.\${queryDb}.chainLiftOver\${Target}Link.txt\" ]; then
   overChain=\"\${queryDb}.\${targetDb}.over.chain.gz\"
-  chainBigBedFb \${queryDb} overChain \${overChain} \${qSizes} \${swapDir}/fb.\${queryDb}.chainSyn\${Target}Link.txt
+  chainBigBedFb \${queryDb} chainLiftOver\${TargetDb} \${overChain} \${qSizes} \${swapDir}/fb.\${queryDb}.chainLiftOver\${Target}Link.txt
 fi
-cat \"\${swapDir}/fb.\${queryDb}.chainSyn\${Target}Link.txt\" 1>&2
+cat \"\${swapDir}/fb.\${queryDb}.chainLiftOver\${Target}Link.txt\" 1>&2
 
 " > ${buildDir}/kegAlign.sh
 chmod +x ${buildDir}/kegAlign.sh
 
 ### run the primary alignment, galaxy kegAlign style
 printf "running: time (${buildDir}/kegAlign.sh) >> ${buildDir}/kegAlign.log 2>&1\n" 1>&2
 
 # time (${buildDir}/kegAlign.sh) >> ${buildDir}/kegAlign.log 2>&1
 
 ### run the primary alignment, UCSC lastz style
 printf "running: time (${buildDir}/lastzRun.sh) >> ${buildDir}/lastzRun.log 2>&1\n" 1>&2
 
 # time (${buildDir}/lastzRun.sh) >> ${buildDir}/lastzRun.log 2>&1
 
 
 exit $?
 
 ### typical contents of the invocation_download:
 ### ls result/2026-04-02_16:00:02_1775170802
 ### 'Batched LASTZ on dataset 5__70a08f89-a77d-4333-b425-b13959e40d1b.axt'
 ### 'KegAlign on dataset 3 and 4__a32df3e9-13d0-4c5e-b765-df69c657ae89.tgz'
 ### allChainSwap__5e11f323-58fc-43f9-afbd-6cff21f0e7af.chain
 ### allChain__74038a6d-d735-4784-8b17-31a25ddbb036.chain
-###'netChainSubset on dataset 7 and 21__72cbd3a9-4136-4e72-86c9-9da0c230c5d0.chain'
-###'netChainSubset on dataset 8 and 35__9acc344f-19da-4ef9-a578-7282940badc0.chain'
+###'netChainSubset on dataset 8 and 22__72cbd3a9-4136-4e72-86c9-9da0c230c5d0.chain'
+###'netChainSubset on dataset 9 and 36__9acc344f-19da-4ef9-a578-7282940badc0.chain'
 
 # rebuild trackDb if possible here
 if [ -x "${tTdb}" ]; then
    ${tTdb}
 else
    printf "# do not find tTdb '%s'\n" "${tTdb}" 1>&2
 fi
 
 fi      ###     if [ $primaryDone -eq 0 ]; then
 
 exit $?
 
 #### print out the makeDoc.txt to this point into buildDir/makeDoc.txt
 
 printf "##############################################################################