6348b62549ccdad39b50fb4fd493d593b44ab31d hiram Fri Apr 30 13:41:29 2021 -0700 a supervisor script to the doBlastzChainNet script to run everything refs #26682 diff --git src/hg/utils/automation/pairLastz.sh src/hg/utils/automation/pairLastz.sh new file mode 100755 index 0000000..e9c4ae8 --- /dev/null +++ src/hg/utils/automation/pairLastz.sh @@ -0,0 +1,434 @@ +#!/bin/bash + +### XXX set -beEu -o pipefail + +if [ $# != 4 ]; then + printf "usage: pairLastz.sh <target> <query> <tClade> <qClade> + +Where target/query is either a UCSC db name, or is an + assembly hub identifier, e.g.: GCA_002844635.1_USMARCv1.0.1 + +And [tq]Clade is one of: primate|mammal|other + +Will create directory to work in, for example if, UCSC db: + /hive/data/target/bed/lastzQuery.yyyy-mm-dd/ + +Or, in the assembly hub build directory: +/hive/data/genomes/asmHubs/allBuild/GCA/002/844/635/GCA_002844635.1_USMARCv1.0/trackData/lastzQuery.yyyy-mm-dd + +Will set up a DEF file there, and a run.sh script to run all steps + and output makeDoc text to document what happened. + +AND MORE, it will run the swap operation into the corresponding + blastz.target.swap directory in the query genome work space. + +e.g.: pairLastz.sh rn7 papAnu4 -qClade=mammal -tClade=primate\n" 1>&2 + exit 255 +fi + +# asmId - if assembly hub, determine GCx_012345678.9 name +# if not, return the asmName (== UCSC database name) +function asmId() { + export asmName=$1 + export id="${asmName}" + case $asmName in + GC[AF]_*) + id=`echo $asmName | cut -d'_' -f1-2` + ;; + *) + ;; + esac + printf "%s" "${id}" +} + +# gcPath - if assembly hub, determine GCx/012/345/678 path +# if not return empty string "" (== UCSC database name) +function gcPath() { + export asmName=$1 + export GCxPath="" + case $asmName in + GC[AF]_*) + gcX=`echo $asmName | cut -c1-3` + d0=`echo $asmName | cut -c5-7` + d1=`echo $asmName | cut -c8-10` + d2=`echo $asmName | cut -c11-13` + GCxPath="${gcX}/${d0}/${d1}/${d2}" + ;; + *) + ;; + esac + printf "%s" "${GCxPath}" +} + +# seqCount - determine the sequence count in given genome target +function seqCount() { + export asmName=$1 + export sizes="/hive/data/genomes/${asmName}/chrom.sizes" + case $asmName in + GC[AF]_*) + gcxPath=$(gcPath $asmName) + id=$(asmId $asmName) + count=`cat /hive/data/genomes/asmHubs/${gcxPath}/${id}/${id}.chrom.sizes.txt | wc -l` + ;; + *) + count=`cat ${sizes} | wc -l` + ;; + esac + printf "%s" "${count}" +} + +function orgName() { + export asmName=$1 + case $asmName in + GC[AF]_*) + gcxPath=$(gcPath $asmName) + asmDir="/hive/data/outside/ncbi/genomes/${gcxPath}/${asmName}" + asmRpt="${asmDir}/${asmName}_assembly_report.txt" + oName=`egrep -i "^# organism name:" ${asmRpt} | tr -d '\r' | sed -e 's/.*(//; s/).*//'` + ;; + *) + oName=`hgsql -N -e "select organism from dbDb where name=\"${asmName}\";" hgcentraltest` + ;; + esac + printf "%s" "${oName}" +} + +export target="$1" +export query="$2" +export tClade="$3" +export qClade="$4" + +export tGcPath=$(gcPath $target) +export qGcPath=$(gcPath $query) +export tAsmId=$(asmId $target) +export qAsmId=$(asmId $query) +printf "# tq gcPath: '${tGcPath}' '${qGcPath}'\n" 1>&2 +printf "# tq asmId: '${tAsmId}' '${qAsmId}'\n" 1>&2 + +# upper case first character +export Target="${tAsmId^}" +export Query="${qAsmId^}" + +export DS=`date "+%F"` + +# assume UCSC db build +export buildDir="/hive/data/genomes/${target}/bed/lastz${Query}.${DS}" +export targetExists="/hive/data/genomes/${target}/bed" +export symLink="/hive/data/genomes/${target}/bed/lastz.${qAsmId}" +export swapDir="/hive/data/genomes/${query}/bed/blastz.${tAsmId}.swap" +export queryExists="/hive/data/genomes/${query}/bed" +export swapLink="/hive/data/genomes/${query}/bed/lastz.${tAsmId}" +export targetSizes="/hive/data/genomes/${target}/chrom.sizes" +export querySizes="/hive/data/genomes/${query}/chrom.sizes" +export target2bit="/hive/data/genomes/${target}/${target}.2bit" +export query2bit="/hive/data/genomes/${query}/${query}.2bit" +export trackHub="" +export tRbestArgs="" +export qRbestArgs="" + + +# override those specifications if assembly hub +case $target in + GC[AF]_*) trackHub="-trackHub -noDbNameCheck" + buildDir="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData/lastz${Query}.${DS}" + symLink="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData/lastz${qAsmId}.${DS}" + targetExists="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData" + targetSizes="/hive/data/genomes/asmHubs/${tGcPath}/${tAsmId}/${tAsmId}.chrom.sizes.txt" + target2bit="/hive/data/genomes/asmHubs/${tGcPath}/${tAsmId}/${tAsmId}.2bit" + tRbestArgs="-target2Bit=\"${target2bit}\" \\ +-targetSizes=\"${targetSizes}\"" + swapRbestArgs="-query2bit=\"${target2bit}\" \\ +-querySizes=\"${targetSizes}\"" + ;; +esac + +case $query in + GC[AF]_*) trackHub="-trackHub -noDbNameCheck" + swapDir="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData/blastz.${tAsmId}.swap" + swapLink="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData/lastz.${tAsmId}" + queryExists="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData" + querySizes="/hive/data/genomes/asmHubs/${qGcPath}/${qAsmId}/${qAsmId}.chrom.sizes.txt" + query2bit="/hive/data/genomes/asmHubs/${qGcPath}/${qAsmId}/${qAsmId}.2bit" + tRbestArgs="-query2Bit=\"${query2bit}\" \\ +-querySizes=\"${querySizes}\"" + swapRbestArgs="-target2bit=\"${query2bit}\" \\ +-targetSizes=\"${querySizes}\"" + ;; +esac + + +if [ ! -d "${targetExists}" ]; then + printf "ERROR: can not find ${targetExists}\n" 1>&2 + exit 255 +fi + +if [ ! -d "${queryExists}" ]; then + printf "ERROR: can not find ${queryExists}\n" 1>&2 + exit 255 +fi + +if [ ! -s "${target2bit}" ]; then + printf "ERROR: can not find ${target2bit}\n" 1>&2 + exit 255 +fi + +if [ ! -s "${query2bit}" ]; then + printf "ERROR: can not find ${query2bit}\n" 1>&2 + exit 255 +fi + + +if [ ! -s "${targetSizes}" ]; then + printf "ERROR: can not find ${targetSizes}\n" 1>&2 + exit 255 +fi + +if [ ! -s "${querySizes}" ]; then + printf "ERROR: can not find ${querySizes}\n" 1>&2 + exit 255 +fi + +export doneCount=0 + +if [ -L "${symLink}" ]; then + printf "# ${query} -> ${target} already done\n" 1>&2 + doneCount=`echo $doneCount | awk '{printf "%d", $1+1}'` +fi + +if [ -L "${swapLink}" ]; then + printf "# swap ${query} -> ${target} already done\n" 1>&2 + doneCount=`echo $doneCount | awk '{printf "%d", $1+1}'` +fi + +export working=`ls -d ${targetExists}/lastz${Query}.* 2> /dev/null | wc -l` +if [ "${working}" -gt 0 ]; then + printf "# in progress, ${query} -> ${target}:\n" 1>&2 + printf "# " 1>&2 + ls -ogd ${targetExists}/lastz${Query}.* 1>&2 + if [ "${doneCount}" -ne 2 ]; then + exit 0 + fi +fi + +if [ -d "${swapDir}" ]; then + printf "# swap in progress ${query} -> ${target}\n" 1>&2 + printf "# " 1>&2 + ls -ogd "${swapDir}" 1>&2 + if [ "${doneCount}" -ne 2 ]; then + exit 0 + fi +fi + +if [ "${doneCount}" -eq 2 ]; then + printf "# all done\n" 1>&2 + exit 0 +fi + +export tOrgName="$(orgName $target)" +export qOrgName="$(orgName $query)" +export tSequenceCount="$(seqCount $target)" +export qSequenceCount="$(seqCount $query)" +printf "# working: %s\n" "${buildDir}" 1>&2 +printf "# target: $target - $tOrgName - $tClade - $tSequenceCount sequences\n" 1>&2 +printf "# query: $query - $qOrgName - $qClade - $qSequenceCount sequences\n" 1>&2 + +export seq1Limit="40" +if [ "${tSequenceCount}" -gt 50000 ]; then + seq1Limit="100" +fi + +export seq2Limit="100" +if [ "${qSequenceCount}" -gt 50000 ]; then + seq2Limit="500" +fi + +export minScore="3000" +export linearGap="medium" + +case $tClade in + "primate") + case $qClade in + "primate") + minScore="5000" + ;; + "mammal") + ;; + "other") + minScore="5000" + linearGap="loose" + ;; + esac + ;; + "mammal") + case $qClade in + "primate") + ;; + "mammal") + ;; + "other") + minScore="5000" + linearGap="loose" + ;; + esac + ;; + "other") + minScore="5000" + linearGap="loose" +esac + +mkdir "${buildDir}" + +export defString="# ${qOrgName} ${Query} vs. ${tOrgName} ${Target} +BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz + +# TARGET: ${tOrgName} ${Target} +SEQ1_DIR=${target2bit} +SEQ1_LEN=${targetSizes} +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 +SEQ1_LIMIT=${seq1Limit} + +# QUERY: ${qOrgName} ${Query} +SEQ2_DIR=${query2bit} +SEQ2_LEN=${querySizes} +SEQ2_CHUNK=20000000 +SEQ2_LAP=0 +SEQ2_LIMIT=${seq2Limit} + +BASE=${buildDir} +TMPDIR=/dev/shm +" + +### setup the DEF file +printf "%s" "${defString}" > ${buildDir}/DEF + +### setup the buildDir/run.sh script +printf "#!/bin/bash + +set -beEu -o pipefail + +export targetDb=\"${tAsmId}\" +export QueryDb=\"${Query}\" + +cd ${buildDir} +time (doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\ + -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\ + -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > do.log 2>&1 + +grep -w real do.log | sed -e 's/^/ # /;' + +sed -e 's/^/ # /;' fb.\${targetDb}.chain\${QueryDb}Link.txt +sed -e 's/^/ # /;' fb.\${targetDb}.chainSyn\${QueryDb}Link.txt + +time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\ + ${tRbestArgs} \\ + \${targetDb} \${QueryDb}) > rbest.log 2>&1 + +grep -w real rbest.log | sed -e 's/^/ # /;' + +sed -e 's/^/ #/;' fb.\${targetDb}.chainRBest.\${QueryDb}.txt +" > ${buildDir}/run.sh +chmod +x ${buildDir}/run.sh + +#### print out the makeDoc.txt to this point into buildDir/makeDoc.txt + +printf "############################################################################## +# LASTZ ${tOrgName} ${Target} vs. $qOrgName ${Query} (DONE - $DS - Hiram) + mkdir $buildDir + cd $buildDir + + printf '${defString} +' > DEF + +time (doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\ + -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\ + -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > do.log 2>&1 +grep -w real do.log | sed -e 's/^/ # /;' +" > ${buildDir}/makeDoc.txt +grep -w real $buildDir/do.log | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt + +printf "\nsed -e 's/^/ # /;' fb.${tAsmId}.chain${Query}Link.txt\n" >> ${buildDir}/makeDoc.txt +sed -e 's/^/ # /;' $buildDir/fb.${tAsmId}.chain${Query}Link.txt >> ${buildDir}/makeDoc.txt + +printf "sed -e 's/^/ # /;' fb.${tAsmId}.chainSyn${Query}Link.txt\n" >> ${buildDir}/makeDoc.txt +sed -e 's/^/ # /;' $buildDir/fb.${tAsmId}.chainSyn${Query}Link.txt >> ${buildDir}/makeDoc.txt + +printf "\ntime (doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\ + ${tRbestArgs} \\ + ${tAsmId} ${Query}) > rbest.log 2>&1 + +grep -w real rbest.log | sed -e 's/^/ # /;'\n" >> ${buildDir}/makeDoc.txt +grep -w real $buildDir/rbest.log | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt +printf "\nsed -e 's/^/ #/;' fb.${tAsmId}.chainRBest.${Query}.txt\n" >> ${buildDir}/makeDoc.txt +sed -e 's/^/ #/;' ${buildDir}/fb.${tAsmId}.chainRBest.${Query}.txt >> ${buildDir}/makeDoc.txt + +printf "\n### and for the swap\n" >> ${buildDir}/makeDoc.txt + +cat ${buildDir}/makeDoc.txt + +### XXX time (${buildDir}/run.sh) >> ${buildDir}/do.log 2>&1 + +printf "# swap into: ${swapDir}\n" 1>&2 + +mkdir ${swapDir} + +printf "#!/bin/bash + +set -beEu -o pipefail + +export targetDb=\"${tAsmId}\" +export Target=\"${Target}\" +export queryDb=\"${qAsmId}\" + +cd ${swapDir} +time (doBlastzChainNet.pl ${trackHub} -swap -verbose=2 \\ + ${buildDir}/DEF \\ + -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\ + -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > swap.log 2>&1 + +grep -w real do.log | sed -e 's/^/ # /;' + +sed -e 's/^/ # /;' fb.\${queryDb}.chain\${Target}Link.txt +sed -e 's/^/ # /;' fb.\${queryDb}.chainSyn\${Target}Link.txt +time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\ + ${swapRbestArgs} \\ + \${queryDb} \${targetDb}) > rbest.log 2>&1 + +grep -w real rbest.log | sed -e 's/^/ # /;' + +sed -e 's/^/ # /;' fb.\${queryDb}.chainRBest.\${Target}.txt +" > ${swapDir}/runSwap.sh + +chmod +x ${swapDir}/runSwap.sh + +### XXX time (${swapDir}/runSwap.sh) >> ${swapDir}/swap.log 2>&1 + +### continue the make doc +printf "\ncd ${swapDir}\n" >> ${buildDir}/makeDoc.txt + +printf "\ntime (doBlastzChainNet.pl ${trackHub} -swap -verbose=2 \\ + ${buildDir}/DEF \\ + -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\ + -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > swap.log 2>&1 +grep -w real swap.log | sed -e 's/^/ # /;' +" >> ${buildDir}/makeDoc.txt +grep -w real ${swapDir}/swap.log | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt + +printf "\nsed -e 's/^/ # /;' fb.${qAsmId}.chain${Target}Link.txt\n" >> ${buildDir}/makeDoc.txt +sed -e 's/^/ # /;' ${swapDir}/fb.${tAsmId}.chain${Target}Link.txt >> ${buildDir}/makeDoc.txt + +printf "sed -e 's/^/ # /;' fb.${qAsmId}.chainSyn${Target}Link.txt\n" >> ${buildDir}/makeDoc.txt +sed -e 's/^/ # /;' ${swapDir}/fb.${qAsmId}.chainSyn${Target}Link.txt >> ${buildDir}/makeDoc.txt + +printf "\ntime (doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\ + ${swapRbestArgs} \\ + ${qAsmId} ${tAsmId}) > rbest.log 2>&1 + +grep -w real rbest.log | sed -e 's/^/ # /;'\n" >> ${buildDir}/makeDoc.txt +grep -w real ${swapDir}/rbest.log | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt +printf "\nsed -e 's/^/ #/;' fb.${qAsmId}.chainRBest.${Target}.txt\n" >> ${buildDir}/makeDoc.txt +sed -e 's/^/ #/;' ${swapDir}/fb.${qAsmId}.chainRBest.${Target}.txt >> ${buildDir}/makeDoc.txt + +### XXX #### +cat ${buildDir}/makeDoc.txt +