6348b62549ccdad39b50fb4fd493d593b44ab31d
hiram
  Fri Apr 30 13:41:29 2021 -0700
a supervisor script to the doBlastzChainNet script to run everything refs #26682

diff --git src/hg/utils/automation/pairLastz.sh src/hg/utils/automation/pairLastz.sh
new file mode 100755
index 0000000..e9c4ae8
--- /dev/null
+++ src/hg/utils/automation/pairLastz.sh
@@ -0,0 +1,434 @@
+#!/bin/bash
+
+### XXX set -beEu -o pipefail
+
+if [ $# != 4 ]; then
+  printf "usage: pairLastz.sh <target> <query> <tClade> <qClade>
+
+Where target/query is either a UCSC db name, or is an
+   assembly hub identifier, e.g.: GCA_002844635.1_USMARCv1.0.1
+
+And [tq]Clade is one of: primate|mammal|other
+
+Will create directory to work in, for example if, UCSC db:
+  /hive/data/target/bed/lastzQuery.yyyy-mm-dd/
+
+Or, in the assembly hub build directory:
+/hive/data/genomes/asmHubs/allBuild/GCA/002/844/635/GCA_002844635.1_USMARCv1.0/trackData/lastzQuery.yyyy-mm-dd
+
+Will set up a DEF file there, and a run.sh script to run all steps
+  and output makeDoc text to document what happened.
+
+AND MORE, it will run the swap operation into the corresponding
+  blastz.target.swap directory in the query genome work space.
+
+e.g.: pairLastz.sh rn7 papAnu4 -qClade=mammal -tClade=primate\n" 1>&2
+  exit 255
+fi
+
+# asmId - if assembly hub, determine GCx_012345678.9 name
+#         if not, return the asmName (== UCSC database name)
+function asmId() {
+  export asmName=$1
+  export id="${asmName}"
+  case $asmName in
+     GC[AF]_*)
+       id=`echo $asmName | cut -d'_' -f1-2`
+       ;;
+     *)
+       ;;
+  esac
+  printf "%s" "${id}"
+}
+
+# gcPath - if assembly hub, determine GCx/012/345/678 path
+#          if not return empty string "" (== UCSC database name)
+function gcPath() {
+  export asmName=$1
+  export GCxPath=""
+  case $asmName in
+     GC[AF]_*)
+       gcX=`echo $asmName | cut -c1-3`
+       d0=`echo $asmName | cut -c5-7`
+       d1=`echo $asmName | cut -c8-10`
+       d2=`echo $asmName | cut -c11-13`
+       GCxPath="${gcX}/${d0}/${d1}/${d2}"
+       ;;
+     *)
+       ;;
+  esac
+  printf "%s" "${GCxPath}"
+}
+
+# seqCount - determine the sequence count in given genome target
+function seqCount() {
+  export asmName=$1
+  export sizes="/hive/data/genomes/${asmName}/chrom.sizes"
+  case $asmName in
+     GC[AF]_*)
+       gcxPath=$(gcPath $asmName)
+       id=$(asmId $asmName)
+       count=`cat /hive/data/genomes/asmHubs/${gcxPath}/${id}/${id}.chrom.sizes.txt | wc -l`
+       ;;
+     *)
+       count=`cat ${sizes} | wc -l`
+       ;;
+  esac
+  printf "%s" "${count}"
+}
+
+function orgName() {
+  export asmName=$1
+  case $asmName in
+     GC[AF]_*)
+       gcxPath=$(gcPath $asmName)
+       asmDir="/hive/data/outside/ncbi/genomes/${gcxPath}/${asmName}"
+       asmRpt="${asmDir}/${asmName}_assembly_report.txt"
+       oName=`egrep -i "^# organism name:" ${asmRpt} | tr -d '\r' | sed -e 's/.*(//; s/).*//'`
+       ;;
+     *)
+       oName=`hgsql -N -e "select organism from dbDb where name=\"${asmName}\";" hgcentraltest`
+       ;;
+  esac
+  printf "%s" "${oName}"
+}
+
+export target="$1"
+export query="$2"
+export tClade="$3"
+export qClade="$4"
+
+export tGcPath=$(gcPath $target)
+export qGcPath=$(gcPath $query)
+export tAsmId=$(asmId $target)
+export qAsmId=$(asmId $query)
+printf "# tq gcPath: '${tGcPath}' '${qGcPath}'\n" 1>&2
+printf "# tq asmId: '${tAsmId}' '${qAsmId}'\n" 1>&2
+
+# upper case first character
+export Target="${tAsmId^}"
+export Query="${qAsmId^}"
+
+export DS=`date "+%F"`
+
+# assume UCSC db build
+export buildDir="/hive/data/genomes/${target}/bed/lastz${Query}.${DS}"
+export targetExists="/hive/data/genomes/${target}/bed"
+export symLink="/hive/data/genomes/${target}/bed/lastz.${qAsmId}"
+export swapDir="/hive/data/genomes/${query}/bed/blastz.${tAsmId}.swap"
+export queryExists="/hive/data/genomes/${query}/bed"
+export swapLink="/hive/data/genomes/${query}/bed/lastz.${tAsmId}"
+export targetSizes="/hive/data/genomes/${target}/chrom.sizes"
+export querySizes="/hive/data/genomes/${query}/chrom.sizes"
+export target2bit="/hive/data/genomes/${target}/${target}.2bit"
+export query2bit="/hive/data/genomes/${query}/${query}.2bit"
+export trackHub=""
+export tRbestArgs=""
+export qRbestArgs=""
+
+
+#  override those specifications if assembly hub
+case $target in
+     GC[AF]_*) trackHub="-trackHub -noDbNameCheck"
+       buildDir="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData/lastz${Query}.${DS}"
+       symLink="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData/lastz${qAsmId}.${DS}"
+       targetExists="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData"
+       targetSizes="/hive/data/genomes/asmHubs/${tGcPath}/${tAsmId}/${tAsmId}.chrom.sizes.txt"
+       target2bit="/hive/data/genomes/asmHubs/${tGcPath}/${tAsmId}/${tAsmId}.2bit"
+       tRbestArgs="-target2Bit=\"${target2bit}\" \\
+-targetSizes=\"${targetSizes}\""
+       swapRbestArgs="-query2bit=\"${target2bit}\" \\
+-querySizes=\"${targetSizes}\""
+       ;;
+esac
+
+case $query in
+     GC[AF]_*) trackHub="-trackHub -noDbNameCheck"
+       swapDir="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData/blastz.${tAsmId}.swap"
+       swapLink="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData/lastz.${tAsmId}"
+       queryExists="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData"
+       querySizes="/hive/data/genomes/asmHubs/${qGcPath}/${qAsmId}/${qAsmId}.chrom.sizes.txt"
+       query2bit="/hive/data/genomes/asmHubs/${qGcPath}/${qAsmId}/${qAsmId}.2bit"
+       tRbestArgs="-query2Bit=\"${query2bit}\" \\
+-querySizes=\"${querySizes}\""
+       swapRbestArgs="-target2bit=\"${query2bit}\" \\
+-targetSizes=\"${querySizes}\""
+       ;;
+esac
+
+
+if [ ! -d "${targetExists}" ]; then
+  printf "ERROR: can not find ${targetExists}\n" 1>&2
+  exit 255
+fi
+
+if [ ! -d "${queryExists}" ]; then
+  printf "ERROR: can not find ${queryExists}\n" 1>&2
+  exit 255
+fi
+
+if [ ! -s "${target2bit}" ]; then
+  printf "ERROR: can not find ${target2bit}\n" 1>&2
+  exit 255
+fi
+
+if [ ! -s "${query2bit}" ]; then
+  printf "ERROR: can not find ${query2bit}\n" 1>&2
+  exit 255
+fi
+
+
+if [ ! -s "${targetSizes}" ]; then
+  printf "ERROR: can not find ${targetSizes}\n" 1>&2
+  exit 255
+fi
+
+if [ ! -s "${querySizes}" ]; then
+  printf "ERROR: can not find ${querySizes}\n" 1>&2
+  exit 255
+fi
+
+export doneCount=0
+
+if [ -L "${symLink}" ]; then
+  printf "# ${query} -> ${target} already done\n" 1>&2
+  doneCount=`echo $doneCount | awk '{printf "%d", $1+1}'`
+fi
+
+if [ -L "${swapLink}" ]; then
+  printf "# swap ${query} -> ${target} already done\n" 1>&2
+  doneCount=`echo $doneCount | awk '{printf "%d", $1+1}'`
+fi
+
+export working=`ls -d ${targetExists}/lastz${Query}.* 2> /dev/null | wc -l`
+if [ "${working}" -gt 0 ]; then
+  printf "# in progress, ${query} -> ${target}:\n" 1>&2
+  printf "# " 1>&2
+  ls -ogd ${targetExists}/lastz${Query}.* 1>&2
+  if [ "${doneCount}" -ne 2 ]; then
+    exit 0
+  fi
+fi
+
+if [ -d "${swapDir}" ]; then
+  printf "# swap in progress ${query} -> ${target}\n" 1>&2
+  printf "# " 1>&2
+  ls -ogd "${swapDir}" 1>&2
+  if [ "${doneCount}" -ne 2 ]; then
+    exit 0
+  fi
+fi
+
+if [ "${doneCount}" -eq 2 ]; then
+    printf "# all done\n" 1>&2
+    exit 0
+fi
+
+export tOrgName="$(orgName $target)"
+export qOrgName="$(orgName $query)"
+export tSequenceCount="$(seqCount $target)"
+export qSequenceCount="$(seqCount $query)"
+printf "# working: %s\n" "${buildDir}" 1>&2
+printf "# target: $target - $tOrgName - $tClade - $tSequenceCount sequences\n" 1>&2
+printf "#  query: $query - $qOrgName - $qClade - $qSequenceCount sequences\n" 1>&2
+
+export seq1Limit="40"
+if [ "${tSequenceCount}" -gt 50000 ]; then
+  seq1Limit="100"
+fi
+
+export seq2Limit="100"
+if [ "${qSequenceCount}" -gt 50000 ]; then
+  seq2Limit="500"
+fi
+
+export minScore="3000"
+export linearGap="medium"
+
+case $tClade in
+   "primate")
+     case $qClade in
+        "primate")
+           minScore="5000"
+           ;;
+        "mammal")
+           ;;
+        "other")
+           minScore="5000"
+           linearGap="loose"
+           ;;
+     esac
+     ;;
+   "mammal")
+     case $qClade in
+        "primate")
+           ;;
+        "mammal")
+           ;;
+        "other")
+           minScore="5000"
+           linearGap="loose"
+           ;;
+     esac
+     ;;
+   "other")
+      minScore="5000"
+      linearGap="loose"
+esac
+
+mkdir "${buildDir}"
+
+export defString="# ${qOrgName} ${Query} vs. ${tOrgName} ${Target}
+BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz
+
+# TARGET: ${tOrgName} ${Target}
+SEQ1_DIR=${target2bit}
+SEQ1_LEN=${targetSizes}
+SEQ1_CHUNK=20000000
+SEQ1_LAP=10000
+SEQ1_LIMIT=${seq1Limit}
+
+# QUERY: ${qOrgName} ${Query}
+SEQ2_DIR=${query2bit}
+SEQ2_LEN=${querySizes}
+SEQ2_CHUNK=20000000
+SEQ2_LAP=0
+SEQ2_LIMIT=${seq2Limit}
+
+BASE=${buildDir}
+TMPDIR=/dev/shm
+"
+
+### setup the DEF file
+printf "%s" "${defString}" > ${buildDir}/DEF
+
+### setup the buildDir/run.sh script
+printf "#!/bin/bash
+
+set -beEu -o pipefail
+
+export targetDb=\"${tAsmId}\"
+export QueryDb=\"${Query}\"
+
+cd ${buildDir}
+time (doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\
+  -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\
+    -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > do.log 2>&1
+
+grep -w real do.log | sed -e 's/^/    # /;'
+
+sed -e 's/^/    # /;' fb.\${targetDb}.chain\${QueryDb}Link.txt
+sed -e 's/^/    # /;' fb.\${targetDb}.chainSyn\${QueryDb}Link.txt
+
+time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\
+   ${tRbestArgs} \\
+   \${targetDb} \${QueryDb}) > rbest.log 2>&1
+
+grep -w real rbest.log | sed -e 's/^/    # /;'
+
+sed -e 's/^/    #/;' fb.\${targetDb}.chainRBest.\${QueryDb}.txt
+" > ${buildDir}/run.sh
+chmod +x ${buildDir}/run.sh
+
+#### print out the makeDoc.txt to this point into buildDir/makeDoc.txt
+
+printf "##############################################################################
+# LASTZ ${tOrgName} ${Target} vs. $qOrgName ${Query} (DONE - $DS - Hiram)
+    mkdir $buildDir
+    cd $buildDir
+
+    printf '${defString}
+' > DEF
+
+time (doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\
+  -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\
+    -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > do.log 2>&1
+grep -w real do.log | sed -e 's/^/    # /;'
+" > ${buildDir}/makeDoc.txt
+grep -w real $buildDir/do.log | sed -e 's/^/    # /;' >> ${buildDir}/makeDoc.txt
+
+printf "\nsed -e 's/^/    # /;' fb.${tAsmId}.chain${Query}Link.txt\n" >> ${buildDir}/makeDoc.txt
+sed -e 's/^/    # /;' $buildDir/fb.${tAsmId}.chain${Query}Link.txt >> ${buildDir}/makeDoc.txt
+
+printf "sed -e 's/^/    # /;' fb.${tAsmId}.chainSyn${Query}Link.txt\n" >> ${buildDir}/makeDoc.txt
+sed -e 's/^/    # /;' $buildDir/fb.${tAsmId}.chainSyn${Query}Link.txt >> ${buildDir}/makeDoc.txt
+
+printf "\ntime (doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\
+   ${tRbestArgs} \\
+   ${tAsmId} ${Query}) > rbest.log 2>&1
+
+grep -w real rbest.log | sed -e 's/^/    # /;'\n" >> ${buildDir}/makeDoc.txt
+grep -w real $buildDir/rbest.log | sed -e 's/^/    # /;' >> ${buildDir}/makeDoc.txt
+printf "\nsed -e 's/^/    #/;' fb.${tAsmId}.chainRBest.${Query}.txt\n" >> ${buildDir}/makeDoc.txt
+sed -e 's/^/    #/;' ${buildDir}/fb.${tAsmId}.chainRBest.${Query}.txt >> ${buildDir}/makeDoc.txt
+
+printf "\n### and for the swap\n" >> ${buildDir}/makeDoc.txt
+
+cat ${buildDir}/makeDoc.txt
+
+### XXX time (${buildDir}/run.sh) >> ${buildDir}/do.log 2>&1
+
+printf "# swap into: ${swapDir}\n" 1>&2
+
+mkdir ${swapDir}
+
+printf "#!/bin/bash
+
+set -beEu -o pipefail
+
+export targetDb=\"${tAsmId}\"
+export Target=\"${Target}\"
+export queryDb=\"${qAsmId}\"
+
+cd ${swapDir}
+time (doBlastzChainNet.pl ${trackHub}  -swap -verbose=2 \\
+  ${buildDir}/DEF \\
+  -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\
+    -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > swap.log 2>&1
+
+grep -w real do.log | sed -e 's/^/    # /;'
+
+sed -e 's/^/    # /;' fb.\${queryDb}.chain\${Target}Link.txt
+sed -e 's/^/    # /;' fb.\${queryDb}.chainSyn\${Target}Link.txt
+time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\
+   ${swapRbestArgs} \\
+   \${queryDb} \${targetDb}) > rbest.log 2>&1
+
+grep -w real rbest.log | sed -e 's/^/    # /;'
+
+sed -e 's/^/    # /;' fb.\${queryDb}.chainRBest.\${Target}.txt
+" > ${swapDir}/runSwap.sh
+
+chmod +x  ${swapDir}/runSwap.sh
+
+### XXX time (${swapDir}/runSwap.sh) >> ${swapDir}/swap.log 2>&1
+
+### continue the make doc
+printf "\ncd ${swapDir}\n" >> ${buildDir}/makeDoc.txt
+
+printf "\ntime (doBlastzChainNet.pl ${trackHub}  -swap -verbose=2 \\
+  ${buildDir}/DEF \\
+  -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\
+    -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > swap.log 2>&1
+grep -w real swap.log | sed -e 's/^/    # /;'
+" >> ${buildDir}/makeDoc.txt
+grep -w real ${swapDir}/swap.log | sed -e 's/^/    # /;' >> ${buildDir}/makeDoc.txt
+
+printf "\nsed -e 's/^/    # /;' fb.${qAsmId}.chain${Target}Link.txt\n" >> ${buildDir}/makeDoc.txt
+sed -e 's/^/    # /;' ${swapDir}/fb.${tAsmId}.chain${Target}Link.txt >> ${buildDir}/makeDoc.txt
+
+printf "sed -e 's/^/    # /;' fb.${qAsmId}.chainSyn${Target}Link.txt\n" >> ${buildDir}/makeDoc.txt
+sed -e 's/^/    # /;' ${swapDir}/fb.${qAsmId}.chainSyn${Target}Link.txt >> ${buildDir}/makeDoc.txt
+
+printf "\ntime (doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\
+   ${swapRbestArgs} \\
+   ${qAsmId} ${tAsmId}) > rbest.log 2>&1
+
+grep -w real rbest.log | sed -e 's/^/    # /;'\n" >> ${buildDir}/makeDoc.txt
+grep -w real ${swapDir}/rbest.log | sed -e 's/^/    # /;' >> ${buildDir}/makeDoc.txt
+printf "\nsed -e 's/^/    #/;' fb.${qAsmId}.chainRBest.${Target}.txt\n" >> ${buildDir}/makeDoc.txt
+sed -e 's/^/    #/;' ${swapDir}/fb.${qAsmId}.chainRBest.${Target}.txt >> ${buildDir}/makeDoc.txt
+
+### XXX ####
+cat ${buildDir}/makeDoc.txt
+