67834b02649fc82ec728bd72a50309d87128007d hiram Fri Apr 30 21:41:59 2021 -0700 correct featureBits result file name refs #26682 diff --git src/hg/utils/automation/pairLastz.sh src/hg/utils/automation/pairLastz.sh index e9c4ae8..f8abc69 100755 --- src/hg/utils/automation/pairLastz.sh +++ src/hg/utils/automation/pairLastz.sh @@ -1,434 +1,435 @@ #!/bin/bash -### XXX set -beEu -o pipefail +set -beEu -o pipefail if [ $# != 4 ]; then printf "usage: pairLastz.sh <target> <query> <tClade> <qClade> Where target/query is either a UCSC db name, or is an assembly hub identifier, e.g.: GCA_002844635.1_USMARCv1.0.1 And [tq]Clade is one of: primate|mammal|other Will create directory to work in, for example if, UCSC db: /hive/data/target/bed/lastzQuery.yyyy-mm-dd/ Or, in the assembly hub build directory: /hive/data/genomes/asmHubs/allBuild/GCA/002/844/635/GCA_002844635.1_USMARCv1.0/trackData/lastzQuery.yyyy-mm-dd Will set up a DEF file there, and a run.sh script to run all steps and output makeDoc text to document what happened. AND MORE, it will run the swap operation into the corresponding blastz.target.swap directory in the query genome work space. e.g.: pairLastz.sh rn7 papAnu4 -qClade=mammal -tClade=primate\n" 1>&2 exit 255 fi # asmId - if assembly hub, determine GCx_012345678.9 name # if not, return the asmName (== UCSC database name) function asmId() { export asmName=$1 export id="${asmName}" case $asmName in GC[AF]_*) id=`echo $asmName | cut -d'_' -f1-2` ;; *) ;; esac printf "%s" "${id}" } # gcPath - if assembly hub, determine GCx/012/345/678 path # if not return empty string "" (== UCSC database name) function gcPath() { export asmName=$1 export GCxPath="" case $asmName in GC[AF]_*) gcX=`echo $asmName | cut -c1-3` d0=`echo $asmName | cut -c5-7` d1=`echo $asmName | cut -c8-10` d2=`echo $asmName | cut -c11-13` GCxPath="${gcX}/${d0}/${d1}/${d2}" ;; *) ;; esac printf "%s" "${GCxPath}" } # seqCount - determine the sequence count in given genome target function seqCount() { export asmName=$1 export sizes="/hive/data/genomes/${asmName}/chrom.sizes" case $asmName in GC[AF]_*) gcxPath=$(gcPath $asmName) id=$(asmId $asmName) count=`cat /hive/data/genomes/asmHubs/${gcxPath}/${id}/${id}.chrom.sizes.txt | wc -l` ;; *) count=`cat ${sizes} | wc -l` ;; esac printf "%s" "${count}" } function orgName() { export asmName=$1 case $asmName in GC[AF]_*) gcxPath=$(gcPath $asmName) asmDir="/hive/data/outside/ncbi/genomes/${gcxPath}/${asmName}" asmRpt="${asmDir}/${asmName}_assembly_report.txt" oName=`egrep -i "^# organism name:" ${asmRpt} | tr -d '\r' | sed -e 's/.*(//; s/).*//'` ;; *) oName=`hgsql -N -e "select organism from dbDb where name=\"${asmName}\";" hgcentraltest` ;; esac printf "%s" "${oName}" } export target="$1" export query="$2" export tClade="$3" export qClade="$4" export tGcPath=$(gcPath $target) export qGcPath=$(gcPath $query) export tAsmId=$(asmId $target) export qAsmId=$(asmId $query) printf "# tq gcPath: '${tGcPath}' '${qGcPath}'\n" 1>&2 printf "# tq asmId: '${tAsmId}' '${qAsmId}'\n" 1>&2 # upper case first character export Target="${tAsmId^}" export Query="${qAsmId^}" export DS=`date "+%F"` # assume UCSC db build export buildDir="/hive/data/genomes/${target}/bed/lastz${Query}.${DS}" export targetExists="/hive/data/genomes/${target}/bed" export symLink="/hive/data/genomes/${target}/bed/lastz.${qAsmId}" export swapDir="/hive/data/genomes/${query}/bed/blastz.${tAsmId}.swap" export queryExists="/hive/data/genomes/${query}/bed" export swapLink="/hive/data/genomes/${query}/bed/lastz.${tAsmId}" export targetSizes="/hive/data/genomes/${target}/chrom.sizes" export querySizes="/hive/data/genomes/${query}/chrom.sizes" export target2bit="/hive/data/genomes/${target}/${target}.2bit" export query2bit="/hive/data/genomes/${query}/${query}.2bit" export trackHub="" export tRbestArgs="" export qRbestArgs="" # override those specifications if assembly hub case $target in GC[AF]_*) trackHub="-trackHub -noDbNameCheck" buildDir="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData/lastz${Query}.${DS}" symLink="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData/lastz${qAsmId}.${DS}" targetExists="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData" targetSizes="/hive/data/genomes/asmHubs/${tGcPath}/${tAsmId}/${tAsmId}.chrom.sizes.txt" target2bit="/hive/data/genomes/asmHubs/${tGcPath}/${tAsmId}/${tAsmId}.2bit" tRbestArgs="-target2Bit=\"${target2bit}\" \\ -targetSizes=\"${targetSizes}\"" swapRbestArgs="-query2bit=\"${target2bit}\" \\ -querySizes=\"${targetSizes}\"" ;; esac case $query in GC[AF]_*) trackHub="-trackHub -noDbNameCheck" swapDir="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData/blastz.${tAsmId}.swap" swapLink="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData/lastz.${tAsmId}" queryExists="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData" querySizes="/hive/data/genomes/asmHubs/${qGcPath}/${qAsmId}/${qAsmId}.chrom.sizes.txt" query2bit="/hive/data/genomes/asmHubs/${qGcPath}/${qAsmId}/${qAsmId}.2bit" tRbestArgs="-query2Bit=\"${query2bit}\" \\ -querySizes=\"${querySizes}\"" swapRbestArgs="-target2bit=\"${query2bit}\" \\ -targetSizes=\"${querySizes}\"" ;; esac if [ ! -d "${targetExists}" ]; then printf "ERROR: can not find ${targetExists}\n" 1>&2 exit 255 fi if [ ! -d "${queryExists}" ]; then printf "ERROR: can not find ${queryExists}\n" 1>&2 exit 255 fi if [ ! -s "${target2bit}" ]; then printf "ERROR: can not find ${target2bit}\n" 1>&2 exit 255 fi if [ ! -s "${query2bit}" ]; then printf "ERROR: can not find ${query2bit}\n" 1>&2 exit 255 fi if [ ! -s "${targetSizes}" ]; then printf "ERROR: can not find ${targetSizes}\n" 1>&2 exit 255 fi if [ ! -s "${querySizes}" ]; then printf "ERROR: can not find ${querySizes}\n" 1>&2 exit 255 fi export doneCount=0 if [ -L "${symLink}" ]; then printf "# ${query} -> ${target} already done\n" 1>&2 doneCount=`echo $doneCount | awk '{printf "%d", $1+1}'` fi if [ -L "${swapLink}" ]; then printf "# swap ${query} -> ${target} already done\n" 1>&2 doneCount=`echo $doneCount | awk '{printf "%d", $1+1}'` fi export working=`ls -d ${targetExists}/lastz${Query}.* 2> /dev/null | wc -l` if [ "${working}" -gt 0 ]; then printf "# in progress, ${query} -> ${target}:\n" 1>&2 printf "# " 1>&2 ls -ogd ${targetExists}/lastz${Query}.* 1>&2 if [ "${doneCount}" -ne 2 ]; then exit 0 fi fi if [ -d "${swapDir}" ]; then printf "# swap in progress ${query} -> ${target}\n" 1>&2 printf "# " 1>&2 ls -ogd "${swapDir}" 1>&2 if [ "${doneCount}" -ne 2 ]; then exit 0 fi fi if [ "${doneCount}" -eq 2 ]; then printf "# all done\n" 1>&2 exit 0 fi export tOrgName="$(orgName $target)" export qOrgName="$(orgName $query)" export tSequenceCount="$(seqCount $target)" export qSequenceCount="$(seqCount $query)" printf "# working: %s\n" "${buildDir}" 1>&2 printf "# target: $target - $tOrgName - $tClade - $tSequenceCount sequences\n" 1>&2 printf "# query: $query - $qOrgName - $qClade - $qSequenceCount sequences\n" 1>&2 export seq1Limit="40" if [ "${tSequenceCount}" -gt 50000 ]; then seq1Limit="100" fi export seq2Limit="100" if [ "${qSequenceCount}" -gt 50000 ]; then seq2Limit="500" fi export minScore="3000" export linearGap="medium" case $tClade in "primate") case $qClade in "primate") minScore="5000" ;; "mammal") ;; "other") minScore="5000" linearGap="loose" ;; esac ;; "mammal") case $qClade in "primate") ;; "mammal") ;; "other") minScore="5000" linearGap="loose" ;; esac ;; "other") minScore="5000" linearGap="loose" esac mkdir "${buildDir}" export defString="# ${qOrgName} ${Query} vs. ${tOrgName} ${Target} BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz # TARGET: ${tOrgName} ${Target} SEQ1_DIR=${target2bit} SEQ1_LEN=${targetSizes} SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=${seq1Limit} # QUERY: ${qOrgName} ${Query} SEQ2_DIR=${query2bit} SEQ2_LEN=${querySizes} SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=${seq2Limit} BASE=${buildDir} TMPDIR=/dev/shm " ### setup the DEF file printf "%s" "${defString}" > ${buildDir}/DEF ### setup the buildDir/run.sh script printf "#!/bin/bash set -beEu -o pipefail export targetDb=\"${tAsmId}\" +export queryDb=\"${qAsmId}\" export QueryDb=\"${Query}\" cd ${buildDir} -time (doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\ +time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\ -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\ -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > do.log 2>&1 grep -w real do.log | sed -e 's/^/ # /;' sed -e 's/^/ # /;' fb.\${targetDb}.chain\${QueryDb}Link.txt sed -e 's/^/ # /;' fb.\${targetDb}.chainSyn\${QueryDb}Link.txt -time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\ +time (~/kent/src/hg/utils/automation/doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\ ${tRbestArgs} \\ - \${targetDb} \${QueryDb}) > rbest.log 2>&1 + \${targetDb} \${queryDb}) > rbest.log 2>&1 grep -w real rbest.log | sed -e 's/^/ # /;' sed -e 's/^/ #/;' fb.\${targetDb}.chainRBest.\${QueryDb}.txt " > ${buildDir}/run.sh chmod +x ${buildDir}/run.sh +### run the primary alignment +time (${buildDir}/run.sh) >> ${buildDir}/do.log 2>&1 + #### print out the makeDoc.txt to this point into buildDir/makeDoc.txt printf "############################################################################## # LASTZ ${tOrgName} ${Target} vs. $qOrgName ${Query} (DONE - $DS - Hiram) mkdir $buildDir cd $buildDir printf '${defString} ' > DEF -time (doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\ +time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\ -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\ -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > do.log 2>&1 grep -w real do.log | sed -e 's/^/ # /;' " > ${buildDir}/makeDoc.txt grep -w real $buildDir/do.log | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt printf "\nsed -e 's/^/ # /;' fb.${tAsmId}.chain${Query}Link.txt\n" >> ${buildDir}/makeDoc.txt sed -e 's/^/ # /;' $buildDir/fb.${tAsmId}.chain${Query}Link.txt >> ${buildDir}/makeDoc.txt printf "sed -e 's/^/ # /;' fb.${tAsmId}.chainSyn${Query}Link.txt\n" >> ${buildDir}/makeDoc.txt sed -e 's/^/ # /;' $buildDir/fb.${tAsmId}.chainSyn${Query}Link.txt >> ${buildDir}/makeDoc.txt -printf "\ntime (doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\ +printf "\ntime (~/kent/src/hg/utils/automation/doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\ ${tRbestArgs} \\ - ${tAsmId} ${Query}) > rbest.log 2>&1 + ${tAsmId} ${qAsmId}) > rbest.log 2>&1 grep -w real rbest.log | sed -e 's/^/ # /;'\n" >> ${buildDir}/makeDoc.txt grep -w real $buildDir/rbest.log | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt printf "\nsed -e 's/^/ #/;' fb.${tAsmId}.chainRBest.${Query}.txt\n" >> ${buildDir}/makeDoc.txt sed -e 's/^/ #/;' ${buildDir}/fb.${tAsmId}.chainRBest.${Query}.txt >> ${buildDir}/makeDoc.txt printf "\n### and for the swap\n" >> ${buildDir}/makeDoc.txt cat ${buildDir}/makeDoc.txt -### XXX time (${buildDir}/run.sh) >> ${buildDir}/do.log 2>&1 - printf "# swap into: ${swapDir}\n" 1>&2 mkdir ${swapDir} printf "#!/bin/bash set -beEu -o pipefail export targetDb=\"${tAsmId}\" export Target=\"${Target}\" export queryDb=\"${qAsmId}\" cd ${swapDir} -time (doBlastzChainNet.pl ${trackHub} -swap -verbose=2 \\ +time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub} -swap -verbose=2 \\ ${buildDir}/DEF \\ -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\ -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > swap.log 2>&1 grep -w real do.log | sed -e 's/^/ # /;' sed -e 's/^/ # /;' fb.\${queryDb}.chain\${Target}Link.txt sed -e 's/^/ # /;' fb.\${queryDb}.chainSyn\${Target}Link.txt -time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\ +time (~/kent/src/hg/utils/automation/doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\ ${swapRbestArgs} \\ \${queryDb} \${targetDb}) > rbest.log 2>&1 grep -w real rbest.log | sed -e 's/^/ # /;' sed -e 's/^/ # /;' fb.\${queryDb}.chainRBest.\${Target}.txt " > ${swapDir}/runSwap.sh chmod +x ${swapDir}/runSwap.sh -### XXX time (${swapDir}/runSwap.sh) >> ${swapDir}/swap.log 2>&1 +time (${swapDir}/runSwap.sh) >> ${swapDir}/swap.log 2>&1 ### continue the make doc printf "\ncd ${swapDir}\n" >> ${buildDir}/makeDoc.txt -printf "\ntime (doBlastzChainNet.pl ${trackHub} -swap -verbose=2 \\ +printf "\ntime (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub} -swap -verbose=2 \\ ${buildDir}/DEF \\ -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\ -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > swap.log 2>&1 grep -w real swap.log | sed -e 's/^/ # /;' " >> ${buildDir}/makeDoc.txt grep -w real ${swapDir}/swap.log | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt printf "\nsed -e 's/^/ # /;' fb.${qAsmId}.chain${Target}Link.txt\n" >> ${buildDir}/makeDoc.txt sed -e 's/^/ # /;' ${swapDir}/fb.${tAsmId}.chain${Target}Link.txt >> ${buildDir}/makeDoc.txt printf "sed -e 's/^/ # /;' fb.${qAsmId}.chainSyn${Target}Link.txt\n" >> ${buildDir}/makeDoc.txt sed -e 's/^/ # /;' ${swapDir}/fb.${qAsmId}.chainSyn${Target}Link.txt >> ${buildDir}/makeDoc.txt -printf "\ntime (doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\ +printf "\ntime (~/kent/src/hg/utils/automation/doRecipBest.pl -load -workhorse=hgwdev -buildDir=\`pwd\` \\ ${swapRbestArgs} \\ ${qAsmId} ${tAsmId}) > rbest.log 2>&1 grep -w real rbest.log | sed -e 's/^/ # /;'\n" >> ${buildDir}/makeDoc.txt grep -w real ${swapDir}/rbest.log | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt printf "\nsed -e 's/^/ #/;' fb.${qAsmId}.chainRBest.${Target}.txt\n" >> ${buildDir}/makeDoc.txt sed -e 's/^/ #/;' ${swapDir}/fb.${qAsmId}.chainRBest.${Target}.txt >> ${buildDir}/makeDoc.txt ### XXX #### cat ${buildDir}/makeDoc.txt -