2ce16411b0f909e7e1dfd27a01af34316c260f2f hiram Mon Jul 5 11:58:27 2021 -0700 catching up the edge cases for pair wise alignment to assembly hub genome no redmine diff --git src/hg/utils/automation/pairLastz.sh src/hg/utils/automation/pairLastz.sh index fada1d1..47795e8 100755 --- src/hg/utils/automation/pairLastz.sh +++ src/hg/utils/automation/pairLastz.sh @@ -90,74 +90,79 @@ oName=`hgsql -N -e "select organism from dbDb where name=\"${asmName}\";" hgcentraltest` ;; esac printf "%s" "${oName}" } export target="$1" export query="$2" export tClade="$3" export qClade="$4" export tGcPath=$(gcPath $target) export qGcPath=$(gcPath $query) export tAsmId=$(asmId $target) export qAsmId=$(asmId $query) +printf "# tq: '${target}' '${query}' '${tClade}' '${qClade}'\n" 1>&2 printf "# tq gcPath: '${tGcPath}' '${qGcPath}'\n" 1>&2 printf "# tq asmId: '${tAsmId}' '${qAsmId}'\n" 1>&2 # upper case first character export Target="${tAsmId^}" export Query="${qAsmId^}" export DS=`date "+%F"` # assume UCSC db build export buildDir="/hive/data/genomes/${target}/bed/lastz${Query}.${DS}" export targetExists="/hive/data/genomes/${target}/bed" export symLink="/hive/data/genomes/${target}/bed/lastz.${qAsmId}" export swapDir="/hive/data/genomes/${query}/bed/blastz.${tAsmId}.swap" export queryExists="/hive/data/genomes/${query}/bed" export swapLink="/hive/data/genomes/${query}/bed/lastz.${tAsmId}" export targetSizes="/hive/data/genomes/${target}/chrom.sizes" export querySizes="/hive/data/genomes/${query}/chrom.sizes" export target2bit="/hive/data/genomes/${target}/${target}.2bit" export query2bit="/hive/data/genomes/${query}/${query}.2bit" export trackHub="" export rBestTrackHub="" export tRbestArgs="" export qRbestArgs="" export swapRbestArgs="" +export tFullName="" +export qFullName="" # override those specifications if assembly hub case $target in GC[AF]_*) trackHub="-trackHub -noDbNameCheck" + tFullName="-tAsmId $target" rBestTrackHub="-trackHub" buildDir="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData/lastz${Query}.${DS}" - symLink="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData/lastz${qAsmId}.${DS}" + symLink="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData/lastz.${qAsmId}" targetExists="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData" targetSizes="/hive/data/genomes/asmHubs/${tGcPath}/${tAsmId}/${tAsmId}.chrom.sizes.txt" target2bit="/hive/data/genomes/asmHubs/${tGcPath}/${tAsmId}/${tAsmId}.2bit" tRbestArgs="-target2Bit=\"${target2bit}\" \\ -targetSizes=\"${targetSizes}\"" swapRbestArgs="-query2bit=\"${target2bit}\" \\ -querySizes=\"${targetSizes}\"" ;; esac case $query in GC[AF]_*) trackHub="-trackHub -noDbNameCheck" + qFullName="-qAsmId $query" rBestTrackHub="-trackHub" swapDir="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData/blastz.${tAsmId}.swap" swapLink="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData/lastz.${tAsmId}" queryExists="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData" querySizes="/hive/data/genomes/asmHubs/${qGcPath}/${qAsmId}/${qAsmId}.chrom.sizes.txt" query2bit="/hive/data/genomes/asmHubs/${qGcPath}/${qAsmId}/${qAsmId}.2bit" tRbestArgs="-query2Bit=\"${query2bit}\" \\ -querySizes=\"${querySizes}\"" swapRbestArgs="-target2bit=\"${query2bit}\" \\ -targetSizes=\"${querySizes}\"" ;; esac if [ ! -d "${targetExists}" ]; then @@ -180,65 +185,76 @@ exit 255 fi if [ ! -s "${targetSizes}" ]; then printf "ERROR: can not find ${targetSizes}\n" 1>&2 exit 255 fi if [ ! -s "${querySizes}" ]; then printf "ERROR: can not find ${querySizes}\n" 1>&2 exit 255 fi export doneCount=0 +export primaryDone=0 +export swapDone=0 if [ -L "${symLink}" ]; then printf "# ${query} -> ${target} already done\n" 1>&2 doneCount=`echo $doneCount | awk '{printf "%d", $1+1}'` + primaryDone=1 +else + printf "# no symLink: $symLink\n" 1>&2 fi if [ -L "${swapLink}" ]; then printf "# swap ${query} -> ${target} already done\n" 1>&2 doneCount=`echo $doneCount | awk '{printf "%d", $1+1}'` + swapDone=1 fi export working=`ls -d ${targetExists}/lastz${Query}.* 2> /dev/null | wc -l` if [ "${working}" -gt 0 ]; then + if [ "${primaryDone}" -eq 0 ]; then printf "# in progress, ${query} -> ${target}:\n" 1>&2 printf "# " 1>&2 ls -ogd ${targetExists}/lastz${Query}.* 1>&2 - if [ "${doneCount}" -ne 2 ]; then - exit 0 fi + buildDir=`ls -d ${targetExists}/lastz${Query}.*` + printf "# continuing: %s\n" "${buildDir}" 1>&2 +fi +export primaryPartsDone=`ls $buildDir/fb.* 2> /dev/null | wc -l` +if [ "$primaryPartsDone" -gt 0 ]; then + primaryDone="$primaryPartsDone" fi if [ -d "${swapDir}" ]; then printf "# swap in progress ${query} -> ${target}\n" 1>&2 printf "# " 1>&2 ls -ogd "${swapDir}" 1>&2 if [ "${doneCount}" -ne 2 ]; then exit 0 fi fi if [ "${doneCount}" -eq 2 ]; then printf "# all done\n" 1>&2 - exit 0 fi + export tOrgName="$(orgName $target)" export qOrgName="$(orgName $query)" export tSequenceCount="$(seqCount $target)" export qSequenceCount="$(seqCount $query)" printf "# working: %s\n" "${buildDir}" 1>&2 printf "# target: $target - $tOrgName - $tClade - $tSequenceCount sequences\n" 1>&2 printf "# query: $query - $qOrgName - $qClade - $qSequenceCount sequences\n" 1>&2 export seq1Limit="40" if [ "${tSequenceCount}" -gt 50000 ]; then seq1Limit="100" fi export seq2Limit="100" if [ "${qSequenceCount}" -gt 50000 ]; then @@ -267,174 +283,186 @@ "primate") ;; "mammal") ;; "other") minScore="5000" linearGap="loose" ;; esac ;; "other") minScore="5000" linearGap="loose" esac -mkdir "${buildDir}" - export defString="# ${qOrgName} ${Query} vs. ${tOrgName} ${Target} BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz # TARGET: ${tOrgName} ${Target} SEQ1_DIR=${target2bit} SEQ1_LEN=${targetSizes} SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=${seq1Limit} # QUERY: ${qOrgName} ${Query} SEQ2_DIR=${query2bit} SEQ2_LEN=${querySizes} SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=${seq2Limit} BASE=${buildDir} TMPDIR=/dev/shm " +### skip primary alignment if it is already done +### primaryDone == 0 means NOT done yet +if [ $primaryDone -eq 0 ]; then + mkdir "${buildDir}" + ### setup the DEF file printf "%s" "${defString}" > ${buildDir}/DEF ### setup the buildDir/run.sh script printf "#!/bin/bash set -beEu -o pipefail export targetDb=\"${tAsmId}\" export queryDb=\"${qAsmId}\" export QueryDb=\"${Query}\" cd ${buildDir} time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\ - -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\ + $tFullName $qFullName -workhorse=hgwdev -smallClusterHub=hgwdev \\ + -bigClusterHub=ku \\ -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > do.log 2>&1 grep -w real do.log | sed -e 's/^/ # /;' sed -e 's/^/ # /;' fb.\${targetDb}.chain\${QueryDb}Link.txt sed -e 's/^/ # /;' fb.\${targetDb}.chainSyn\${QueryDb}Link.txt time (~/kent/src/hg/utils/automation/doRecipBest.pl ${rBestTrackHub} -load -workhorse=hgwdev -buildDir=\`pwd\` \\ ${tRbestArgs} \\ \${targetDb} \${queryDb}) > rbest.log 2>&1 grep -w real rbest.log | sed -e 's/^/ # /;' sed -e 's/^/ #/;' fb.\${targetDb}.chainRBest.\${QueryDb}.txt " > ${buildDir}/run.sh chmod +x ${buildDir}/run.sh ### run the primary alignment time (${buildDir}/run.sh) >> ${buildDir}/do.log 2>&1 +fi +#### primaryDone > 0 ready for swap + #### print out the makeDoc.txt to this point into buildDir/makeDoc.txt printf "############################################################################## # LASTZ ${tOrgName} ${Target} vs. $qOrgName ${Query} (DONE - $DS - Hiram) mkdir $buildDir cd $buildDir printf '${defString} ' > DEF time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\ - -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\ + ${tFullName} ${qFullName} -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\ -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > do.log 2>&1 grep -w real do.log | sed -e 's/^/ # /;' " > ${buildDir}/makeDoc.txt -grep -w real $buildDir/do.log | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt +(grep -w real $buildDir/do.log || true) | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt printf "\n sed -e 's/^/ # /;' fb.${tAsmId}.chain${Query}Link.txt\n" >> ${buildDir}/makeDoc.txt sed -e 's/^/ # /;' $buildDir/fb.${tAsmId}.chain${Query}Link.txt >> ${buildDir}/makeDoc.txt printf " sed -e 's/^/ # /;' fb.${tAsmId}.chainSyn${Query}Link.txt\n" >> ${buildDir}/makeDoc.txt sed -e 's/^/ # /;' $buildDir/fb.${tAsmId}.chainSyn${Query}Link.txt >> ${buildDir}/makeDoc.txt printf "\n time (~/kent/src/hg/utils/automation/doRecipBest.pl ${rBestTrackHub} -load -workhorse=hgwdev -buildDir=\`pwd\` \\ ${tRbestArgs} \\ ${tAsmId} ${qAsmId}) > rbest.log 2>&1 grep -w real rbest.log | sed -e 's/^/ # /;'\n" >> ${buildDir}/makeDoc.txt -grep -w real $buildDir/rbest.log | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt +(grep -w real $buildDir/rbest.log || true) | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt printf "\n sed -e 's/^/ # /;' fb.${tAsmId}.chainRBest.${Query}.txt\n" >> ${buildDir}/makeDoc.txt -sed -e 's/^/ #/;' ${buildDir}/fb.${tAsmId}.chainRBest.${Query}.txt >> ${buildDir}/makeDoc.txt +(sed -e 's/^/ # /;' ${buildDir}/fb.${tAsmId}.chainRBest.${Query}.txt || true) >> ${buildDir}/makeDoc.txt printf "\n### and for the swap\n" >> ${buildDir}/makeDoc.txt cat ${buildDir}/makeDoc.txt printf "# swap into: ${swapDir}\n" 1>&2 +if [ "$swapDone" -eq 0 ]; then mkdir ${swapDir} printf "#!/bin/bash set -beEu -o pipefail export targetDb=\"${tAsmId}\" export Target=\"${Target}\" +export Qarget=\"${Query}\" export queryDb=\"${qAsmId}\" -cd ${swapDir} time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub} -swap -verbose=2 \\ - ${buildDir}/DEF -swapDir=\`pwd\` \\ + ${tFullName} ${qFullName} ${buildDir}/DEF -swapDir=\`pwd\` \\ -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\ -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > swap.log 2>&1 grep -w real swap.log | sed -e 's/^/ # /;' sed -e 's/^/ # /;' fb.\${queryDb}.chain\${Target}Link.txt sed -e 's/^/ # /;' fb.\${queryDb}.chainSyn\${Target}Link.txt time (~/kent/src/hg/utils/automation/doRecipBest.pl ${rBestTrackHub} -load -workhorse=hgwdev -buildDir=\`pwd\` \\ ${swapRbestArgs} \\ \${queryDb} \${targetDb}) > rbest.log 2>&1 grep -w real rbest.log | sed -e 's/^/ # /;' sed -e 's/^/ # /;' fb.\${queryDb}.chainRBest.\${Target}.txt " > ${swapDir}/runSwap.sh chmod +x ${swapDir}/runSwap.sh +printf "# running ${swapDir}/runSwap.sh\n" 1>&2 + time (${swapDir}/runSwap.sh) >> ${swapDir}/swap.log 2>&1 +fi ### continue the make doc + printf "\ncd ${swapDir}\n" >> ${buildDir}/makeDoc.txt printf "\ntime (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub} -swap -verbose=2 \\ - ${buildDir}/DEF -swapDir=\`pwd\` \\ + ${tFullName} ${qFullName} ${buildDir}/DEF -swapDir=\`pwd\` \\ -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\ -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > swap.log 2>&1 grep -w real swap.log | sed -e 's/^/ # /;' " >> ${buildDir}/makeDoc.txt -grep -w real ${swapDir}/swap.log | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt +(grep -w real ${swapDir}/swap.log || true) | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt printf "\nsed -e 's/^/ # /;' fb.${qAsmId}.chain${Target}Link.txt\n" >> ${buildDir}/makeDoc.txt -sed -e 's/^/ # /;' ${swapDir}/fb.${tAsmId}.chain${Target}Link.txt >> ${buildDir}/makeDoc.txt +sed -e 's/^/ # /;' ${swapDir}/fb.${qAsmId}.chain${Target}Link.txt >> ${buildDir}/makeDoc.txt printf "sed -e 's/^/ # /;' fb.${qAsmId}.chainSyn${Target}Link.txt\n" >> ${buildDir}/makeDoc.txt sed -e 's/^/ # /;' ${swapDir}/fb.${qAsmId}.chainSyn${Target}Link.txt >> ${buildDir}/makeDoc.txt printf "\ntime (~/kent/src/hg/utils/automation/doRecipBest.pl ${rBestTrackHub} -load -workhorse=hgwdev -buildDir=\`pwd\` \\ ${swapRbestArgs} \\ ${qAsmId} ${tAsmId}) > rbest.log 2>&1 grep -w real rbest.log | sed -e 's/^/ # /;'\n" >> ${buildDir}/makeDoc.txt -grep -w real ${swapDir}/rbest.log | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt +(grep -w real ${swapDir}/rbest.log || true) | sed -e 's/^/ # /;' >> ${buildDir}/makeDoc.txt printf "\nsed -e 's/^/ # /;' fb.${qAsmId}.chainRBest.${Target}.txt\n" >> ${buildDir}/makeDoc.txt -sed -e 's/^/ #/;' ${swapDir}/fb.${qAsmId}.chainRBest.${Target}.txt >> ${buildDir}/makeDoc.txt +(sed -e 's/^/ # /;' ${swapDir}/fb.${qAsmId}.chainRBest.${Target}.txt || true) >> ${buildDir}/makeDoc.txt printf "\n##############################################################################\n" >> ${buildDir}/makeDoc.txt ### XXX #### cat ${buildDir}/makeDoc.txt