210af3aaa6d9d894afa8ee28875526cc3428e41e
hiram
  Wed Oct 27 10:30:16 2021 -0700
now sending email upon completion refs #28409

diff --git src/hg/utils/automation/pairLastz.sh src/hg/utils/automation/pairLastz.sh
index 028e6ca..578f530 100755
--- src/hg/utils/automation/pairLastz.sh
+++ src/hg/utils/automation/pairLastz.sh
@@ -1,541 +1,562 @@
 #!/bin/bash
 
 set -beEu -o pipefail
 
+export userName="`whoami`"
+
 if [ $# != 4 ]; then
   printf "usage: pairLastz.sh <target> <query> <tClade> <qClade>
 
 Where target/query is either a UCSC db name, or is an
    assembly hub identifier, e.g.: GCA_002844635.1_USMARCv1.0.1
 
 And [tq]Clade is one of: primate|mammal|other
 
 Will create directory to work in, for example if, UCSC db:
   /hive/data/target/bed/lastzQuery.yyyy-mm-dd/
 
 Or, in the assembly hub build directory:
 /hive/data/genomes/asmHubs/allBuild/GCA/002/844/635/GCA_002844635.1_USMARCv1.0/trackData/lastzQuery.yyyy-mm-dd
 
 Will set up a DEF file there, and a run.sh script to run all steps
   and output makeDoc text to document what happened.
 
 AND MORE, it will run the swap operation into the corresponding
   blastz.target.swap directory in the query genome work space.
 
+Email will be sent to: '$userName' upon completion.
+
 e.g.: pairLastz.sh rn7 papAnu4 mammal primate\n" 1>&2
   exit 255
 fi
 
 # asmId - if assembly hub, determine GCx_012345678.9 name
 #         if not, return the asmName (== UCSC database name)
 function asmId() {
   export asmName=$1
   export id="${asmName}"
   case $asmName in
      GC[AF]_*)
        id=`echo $asmName | cut -d'_' -f1-2`
        ;;
      *)
        ;;
   esac
   printf "%s" "${id}"
 }
 
 # gcPath - if assembly hub, determine GCx/012/345/678 path
 #          if not return empty string "" (== UCSC database name)
 function gcPath() {
   export asmName=$1
   export GCxPath=""
   case $asmName in
      GC[AF]_*)
        gcX=`echo $asmName | cut -c1-3`
        d0=`echo $asmName | cut -c5-7`
        d1=`echo $asmName | cut -c8-10`
        d2=`echo $asmName | cut -c11-13`
        GCxPath="${gcX}/${d0}/${d1}/${d2}"
        ;;
      *)
        ;;
   esac
   printf "%s" "${GCxPath}"
 }
 
 # asmSize - determine size of genome
 function asmSize() {
   export asmName=$1
   export sizes="/hive/data/genomes/${asmName}/chrom.sizes"
   case $asmName in
      GC[AF]_*)
        gcxPath=$(gcPath $asmName)
        id=$(asmId $asmName)
        size=`awk '{sum+=$2}END{print sum}' /hive/data/genomes/asmHubs/${gcxPath}/${id}/${id}.chrom.sizes.txt`
        ;;
      *)
        size=`awk '{sum+=$2}END{print sum}' ${sizes}`
        ;;
   esac
   printf "%s" "${size}"
 }
 
 # seqCount - determine the sequence count in given genome target
 function seqCount() {
   export asmName=$1
   export sizes="/hive/data/genomes/${asmName}/chrom.sizes"
   case $asmName in
      GC[AF]_*)
        gcxPath=$(gcPath $asmName)
        id=$(asmId $asmName)
        count=`wc -l /hive/data/genomes/asmHubs/${gcxPath}/${id}/${id}.chrom.sizes.txt | cut -d' ' -f1`
        ;;
      *)
        count=`wc -l ${sizes} | cut -d' ' -f1`
        ;;
   esac
   printf "%s" "${count}"
 }
 
 function orgName() {
   export asmName=$1
   case $asmName in
      GC[AF]_*)
        gcxPath=$(gcPath $asmName)
        asmDir="/hive/data/outside/ncbi/genomes/${gcxPath}/${asmName}"
        asmRpt="${asmDir}/${asmName}_assembly_report.txt"
        oName=`egrep -i "^# organism name:" ${asmRpt} | tr -d '\r' | sed -e 's/.*(//; s/).*//'`
        ;;
      *)
        oName=`hgsql -N -e "select organism from dbDb where name=\"${asmName}\";" hgcentraltest`
        ;;
   esac
   printf "%s" "${oName}"
 }
 
 export target="$1"
 export query="$2"
 export tClade="$3"
 export qClade="$4"
 
 export tGcPath=$(gcPath $target)
 export qGcPath=$(gcPath $query)
 export tAsmId=$(asmId $target)
 export qAsmId=$(asmId $query)
 printf "# tq: '${target}' '${query}' '${tClade}' '${qClade}'\n" 1>&2
 printf "# tq gcPath: '${tGcPath}' '${qGcPath}'\n" 1>&2
 printf "# tq asmId: '${tAsmId}' '${qAsmId}'\n" 1>&2
 
 # upper case first character
 export Target="${tAsmId^}"
 export Query="${qAsmId^}"
 
 export DS=`date "+%F"`
 
 # assume UCSC db build
 export buildDir="/hive/data/genomes/${target}/bed/lastz${Query}.${DS}"
 export targetExists="/hive/data/genomes/${target}/bed"
 export symLink="/hive/data/genomes/${target}/bed/lastz.${qAsmId}"
 export swapDir="/hive/data/genomes/${query}/bed/blastz.${tAsmId}.swap"
 export queryExists="/hive/data/genomes/${query}/bed"
 export swapLink="/hive/data/genomes/${query}/bed/lastz.${tAsmId}"
 export targetSizes="/hive/data/genomes/${target}/chrom.sizes"
 export querySizes="/hive/data/genomes/${query}/chrom.sizes"
 export target2bit="/hive/data/genomes/${target}/${target}.2bit"
 export query2bit="/hive/data/genomes/${query}/${query}.2bit"
 export trackHub=""
 export rBestTrackHub=""
 export tRbestArgs=""
 export qRbestArgs=""
 export tSwapRbestArgs=""
 export qSwapRbestArgs=""
 export tFullName=""
 export qFullName=""
 
 #  override those specifications if assembly hub
 case $target in
      GC[AF]_*) trackHub="-trackHub -noDbNameCheck"
        tFullName="-tAsmId $target"
        rBestTrackHub="-trackHub"
        buildDir="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData/lastz${Query}.${DS}"
        symLink="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData/lastz.${qAsmId}"
        targetExists="/hive/data/genomes/asmHubs/allBuild/${tGcPath}/${target}/trackData"
        targetSizes="/hive/data/genomes/asmHubs/${tGcPath}/${tAsmId}/${tAsmId}.chrom.sizes.txt"
        target2bit="/hive/data/genomes/asmHubs/${tGcPath}/${tAsmId}/${tAsmId}.2bit"
        tRbestArgs="-target2Bit=\"${target2bit}\" \\
 -targetSizes=\"${targetSizes}\""
        tSwapRbestArgs="-query2bit=\"${target2bit}\" \\
 -querySizes=\"${targetSizes}\""
        ;;
 esac
 
 case $query in
      GC[AF]_*) trackHub="-trackHub -noDbNameCheck"
        qFullName="-qAsmId $query"
        rBestTrackHub="-trackHub"
        swapDir="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData/blastz.${tAsmId}.swap"
        swapLink="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData/lastz.${tAsmId}"
        queryExists="/hive/data/genomes/asmHubs/allBuild/${qGcPath}/${query}/trackData"
        querySizes="/hive/data/genomes/asmHubs/${qGcPath}/${qAsmId}/${qAsmId}.chrom.sizes.txt"
        query2bit="/hive/data/genomes/asmHubs/${qGcPath}/${qAsmId}/${qAsmId}.2bit"
        qRbestArgs="-query2Bit=\"${query2bit}\" \\
 -querySizes=\"${querySizes}\""
        qSwapRbestArgs="-target2bit=\"${query2bit}\" \\
 -targetSizes=\"${querySizes}\""
        ;;
 esac
 
 
 if [ ! -d "${targetExists}" ]; then
   printf "ERROR: can not find ${targetExists}\n" 1>&2
   exit 255
 fi
 
 if [ ! -d "${queryExists}" ]; then
   printf "ERROR: can not find ${queryExists}\n" 1>&2
   exit 255
 fi
 
 if [ ! -s "${target2bit}" ]; then
   printf "ERROR: can not find ${target2bit}\n" 1>&2
   exit 255
 fi
 
 if [ ! -s "${query2bit}" ]; then
   printf "ERROR: can not find ${query2bit}\n" 1>&2
   exit 255
 fi
 
 
 if [ ! -s "${targetSizes}" ]; then
   printf "ERROR: can not find ${targetSizes}\n" 1>&2
   exit 255
 fi
 
 if [ ! -s "${querySizes}" ]; then
   printf "ERROR: can not find ${querySizes}\n" 1>&2
   exit 255
 fi
 
 export doneCount=0
 export primaryDone=0
 export swapDone=0
 
 if [ -L "${symLink}" ]; then
   printf "# ${query} -> ${target} already done\n" 1>&2
   doneCount=`echo $doneCount | awk '{printf "%d", $1+1}'`
   primaryDone=1
 else
   printf "# no symLink: $symLink\n" 1>&2
 fi
 
 if [ -L "${swapLink}" ]; then
   printf "# swap ${query} -> ${target} already done\n" 1>&2
   doneCount=`echo $doneCount | awk '{printf "%d", $1+1}'`
   swapDone=1
 fi
 
 export working=`ls -d ${targetExists}/lastz${Query}.* 2> /dev/null | wc -l`
 if [ "${working}" -gt 0 ]; then
   if [ "${primaryDone}" -eq 0 ]; then
     printf "# in progress, ${query} -> ${target}:\n" 1>&2
     printf "# " 1>&2
     ls -ogd ${targetExists}/lastz${Query}.* 1>&2
   fi
   buildDir=`ls -d ${targetExists}/lastz${Query}.*`
   printf "# continuing: %s\n" "${buildDir}" 1>&2
 fi
 export primaryPartsDone=`ls $buildDir/fb.* 2> /dev/null | wc -l`
 if [ "$primaryPartsDone" -gt 0 ]; then
   primaryDone="$primaryPartsDone"
 fi
 
 if [ -d "${swapDir}" ]; then
   printf "# swap in progress ${query} -> ${target}\n" 1>&2
   printf "# " 1>&2
   ls -ogd "${swapDir}" 1>&2
   if [ "${doneCount}" -ne 2 ]; then
     exit 0
   fi
 fi
 
 if [ "${doneCount}" -eq 2 ]; then
     printf "# all done\n" 1>&2
 fi
 
 
 export tOrgName="$(orgName $target)"
 export qOrgName="$(orgName $query)"
 export tAsmSize="$(asmSize $target)"
 export qAsmSize="$(asmSize $query)"
 export tSequenceCount="$(seqCount $target)"
 export qSequenceCount="$(seqCount $query)"
 printf "# working: %s\n" "${buildDir}" 1>&2
 printf "# target: $target - $tOrgName - $tClade - $tSequenceCount sequences\n" 1>&2
 printf "#  query: $query - $qOrgName - $qClade - $qSequenceCount sequences\n" 1>&2
 LC_NUMERIC=en_US printf "#  sizes: target: %'d - query: %'d\n" "${tAsmSize}" "${qAsmSize}" 1>&2
 
 export seq1Limit="40"
 if [ "${tSequenceCount}" -gt 50000 ]; then
   seq1Limit="100"
 fi
 
 export seq2Limit="100"
 if [ "${qSequenceCount}" -gt 50000 ]; then
   seq2Limit="500"
 fi
 
 export minScore="3000"
 export linearGap="medium"
 
 case $tClade in
    "primate")
      case $qClade in
         "primate")
            minScore="5000"
            ;;
         "mammal")
            ;;
         "other")
            minScore="5000"
            linearGap="loose"
            ;;
      esac
      ;;
    "mammal")
      case $qClade in
         "primate")
            ;;
         "mammal")
            ;;
         "other")
            minScore="5000"
            linearGap="loose"
            ;;
      esac
      ;;
    "other")
       minScore="5000"
       linearGap="loose"
 esac
 
 if [ "$tClade" == "primate" -a "$qClade" == "primate" ]; then
 
 export defString="# ${qOrgName} ${Query} vs. ${tOrgName} ${Target}
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz
 BLASTZ_T=2
 BLASTZ_O=600
 BLASTZ_E=150
 BLASTZ_M=254
 BLASTZ_K=4500
 BLASTZ_Y=15000
 BLASTZ_Q=/hive/data/staging/data/blastz/human_chimp.v2.q
 #       A     C     G     T
 # A    90  -330  -236  -356
 # C  -330   100  -318  -236
 # G  -236  -318   100  -330
 # T  -356  -236  -330    90
 
 # TARGET: ${tOrgName} ${Target}
 SEQ1_DIR=${target2bit}
 SEQ1_LEN=${targetSizes}
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=${seq1Limit}
 
 # QUERY: ${qOrgName} ${Query}
 SEQ2_DIR=${query2bit}
 SEQ2_LEN=${querySizes}
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=${seq2Limit}
 
 BASE=${buildDir}
 TMPDIR=/dev/shm
 "
 else
 
 export defString="# ${qOrgName} ${Query} vs. ${tOrgName} ${Target}
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz
 
 # TARGET: ${tOrgName} ${Target}
 SEQ1_DIR=${target2bit}
 SEQ1_LEN=${targetSizes}
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=${seq1Limit}
 
 # QUERY: ${qOrgName} ${Query}
 SEQ2_DIR=${query2bit}
 SEQ2_LEN=${querySizes}
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=${seq2Limit}
 
 BASE=${buildDir}
 TMPDIR=/dev/shm
 "
 
 fi
 
 ### skip primary alignment if it is already done
 ###  primaryDone == 0 means NOT done yet
 if [ $primaryDone -eq 0 ]; then
   mkdir "${buildDir}"
 
 ### setup the DEF file
 printf "%s" "${defString}" > ${buildDir}/DEF
 
 ### setup the buildDir/run.sh script
 printf "#!/bin/bash
 
 set -beEu -o pipefail
 
 export targetDb=\"${tAsmId}\"
 export queryDb=\"${qAsmId}\"
 export QueryDb=\"${Query}\"
 
 cd ${buildDir}
 time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\
   $tFullName $qFullName -workhorse=hgwdev -smallClusterHub=hgwdev \\
     -bigClusterHub=ku \\
     -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > do.log 2>&1
 
 grep -w real do.log | sed -e 's/^/    # /;'
 
 sed -e 's/^/    # /;' fb.\${targetDb}.chain\${QueryDb}Link.txt
 sed -e 's/^/    # /;' fb.\${targetDb}.chainSyn\${QueryDb}Link.txt
 
 time (~/kent/src/hg/utils/automation/doRecipBest.pl ${rBestTrackHub} -load -workhorse=hgwdev -buildDir=\`pwd\` \\
    ${tRbestArgs} \\
    ${qRbestArgs} \\
    \${targetDb} \${queryDb}) > rbest.log 2>&1
 
 grep -w real rbest.log | sed -e 's/^/    # /;'
 
 sed -e 's/^/    #/;' fb.\${targetDb}.chainRBest.\${QueryDb}.txt
 " > ${buildDir}/run.sh
 chmod +x ${buildDir}/run.sh
 
 ### run the primary alignment
 printf "running: time (${buildDir}/run.sh) >> ${buildDir}/do.log 2>&1\n" 1>&2
 
 time (${buildDir}/run.sh) >> ${buildDir}/do.log 2>&1
 
 fi      ###     if [ $primaryDone -eq 0 ]; then
 
 #### print out the makeDoc.txt to this point into buildDir/makeDoc.txt
 
-export userName="`whoami`"
-
 printf "##############################################################################
-# LASTZ ${tOrgName} ${Target} vs. $qOrgName ${Query} (DONE - $DS - $userName)
+# LASTZ ${tOrgName} ${Target} vs. $qOrgName ${Query}
+#    (DONE - $DS - $userName)
+
     mkdir $buildDir
     cd $buildDir
 
     printf '${defString}
 ' > DEF
 
     time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\
       ${tFullName} ${qFullName} -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\
         -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > do.log 2>&1
     grep -w real do.log | sed -e 's/^/    # /;'
 " > ${buildDir}/makeDoc.txt
 
 (grep -w real $buildDir/do.log || true) | sed -e 's/^/    # /;' | head -1 >> ${buildDir}/makeDoc.txt
 
 printf "\n    sed -e 's/^/    # /;' fb.${tAsmId}.chain${Query}Link.txt\n" >> ${buildDir}/makeDoc.txt
 sed -e 's/^/    # /;' $buildDir/fb.${tAsmId}.chain${Query}Link.txt >> ${buildDir}/makeDoc.txt
 
 printf "    sed -e 's/^/    # /;' fb.${tAsmId}.chainSyn${Query}Link.txt\n" >> ${buildDir}/makeDoc.txt
 sed -e 's/^/    # /;' $buildDir/fb.${tAsmId}.chainSyn${Query}Link.txt >> ${buildDir}/makeDoc.txt
 
 printf "\n    time (~/kent/src/hg/utils/automation/doRecipBest.pl ${rBestTrackHub} -load -workhorse=hgwdev -buildDir=\`pwd\` \\
       ${tRbestArgs} \\
       ${qRbestArgs} \\
         ${tAsmId} ${qAsmId}) > rbest.log 2>&1
 
     grep -w real rbest.log | sed -e 's/^/    # /;'\n" >> ${buildDir}/makeDoc.txt
 
 (grep -w real $buildDir/rbest.log || true) | sed -e 's/^/    # /;' >> ${buildDir}/makeDoc.txt
 
 printf "\n    sed -e 's/^/    # /;' fb.${tAsmId}.chainRBest.${Query}.txt\n" >> ${buildDir}/makeDoc.txt
 (sed -e 's/^/    # /;' ${buildDir}/fb.${tAsmId}.chainRBest.${Query}.txt || true) >> ${buildDir}/makeDoc.txt
 
 printf "\n    ### and for the swap\n" >> ${buildDir}/makeDoc.txt
 
 cat ${buildDir}/makeDoc.txt
 
 printf "# swap into: ${swapDir}\n" 1>&2
 
 if [ "$swapDone" -eq 0 ]; then
 mkdir ${swapDir}
 
 printf "#!/bin/bash
 
 set -beEu -o pipefail
 
 cd $swapDir
 
 export targetDb=\"${tAsmId}\"
 export Target=\"${Target}\"
 export Qarget=\"${Query}\"
 export queryDb=\"${qAsmId}\"
 
 time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub}  -swap -verbose=2 \\
   ${tFullName} ${qFullName} ${buildDir}/DEF -swapDir=\`pwd\` \\
   -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\
     -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > swap.log 2>&1
 
 grep -w real swap.log | sed -e 's/^/    # /;'
 
 sed -e 's/^/    # /;' fb.\${queryDb}.chain\${Target}Link.txt
 sed -e 's/^/    # /;' fb.\${queryDb}.chainSyn\${Target}Link.txt
 time (~/kent/src/hg/utils/automation/doRecipBest.pl ${rBestTrackHub} -load -workhorse=hgwdev -buildDir=\`pwd\` \\
    ${tSwapRbestArgs} \\
    ${qSwapRbestArgs} \\
    \${queryDb} \${targetDb}) > rbest.log 2>&1
 
 grep -w real rbest.log | sed -e 's/^/    # /;'
 
 sed -e 's/^/    # /;' fb.\${queryDb}.chainRBest.\${Target}.txt
 " > ${swapDir}/runSwap.sh
 
 chmod +x  ${swapDir}/runSwap.sh
 
 printf "# running ${swapDir}/runSwap.sh\n" 1>&2
 
 time (${swapDir}/runSwap.sh) >> ${swapDir}/doSwap.log 2>&1
 fi      ### if [ "$swapDone" -eq 0 ]; then
 
 ### continue the make doc
 
 printf "\n    cd ${swapDir}\n" >> ${buildDir}/makeDoc.txt
 
 printf "\n   time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub} -swap -verbose=2 \\
   ${tFullName} ${qFullName} ${buildDir}/DEF -swapDir=\`pwd\` \\
   -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \\
     -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > swap.log 2>&1
 
     grep -w real swap.log | sed -e 's/^/    # /;'
 " >> ${buildDir}/makeDoc.txt
 
 (grep -w real ${swapDir}/swap.log || true) | sed -e 's/^/    # /;' >> ${buildDir}/makeDoc.txt
 
 printf "\n    sed -e 's/^/    # /;' fb.${qAsmId}.chain${Target}Link.txt\n" >> ${buildDir}/makeDoc.txt
 sed -e 's/^/    # /;' ${swapDir}/fb.${qAsmId}.chain${Target}Link.txt >> ${buildDir}/makeDoc.txt
 
 printf "    sed -e 's/^/    # /;' fb.${qAsmId}.chainSyn${Target}Link.txt\n" >> ${buildDir}/makeDoc.txt
 sed -e 's/^/    # /;' ${swapDir}/fb.${qAsmId}.chainSyn${Target}Link.txt >> ${buildDir}/makeDoc.txt
 
 printf "\    time (~/kent/src/hg/utils/automation/doRecipBest.pl ${rBestTrackHub} -load -workhorse=hgwdev -buildDir=\`pwd\` \\
    ${tSwapRbestArgs} \\
    ${qSwapRbestArgs} \\
    ${qAsmId} ${tAsmId}) > rbest.log 2>&1
 
     grep -w real rbest.log | sed -e 's/^/    # /;'\n" >> ${buildDir}/makeDoc.txt
 (grep -w real ${swapDir}/rbest.log || true) | sed -e 's/^/    # /;' >> ${buildDir}/makeDoc.txt
 printf "\n    sed -e 's/^/    # /;' fb.${qAsmId}.chainRBest.${Target}.txt\n" >> ${buildDir}/makeDoc.txt
 (sed -e 's/^/    # /;' ${swapDir}/fb.${qAsmId}.chainRBest.${Target}.txt || true) >> ${buildDir}/makeDoc.txt
 
 printf "\n##############################################################################\n" >> ${buildDir}/makeDoc.txt
 
 ### show completed makeDoc.txt ####
 cat ${buildDir}/makeDoc.txt
+
+export toAddress="$userName"
+export fromAddress="$userName"
+export subject="pair lastz DONE $target $query"
+printf "To: $toAddress
+From: $fromAddress
+Subject: $subject
+
+##################################################################
+" > /tmp/send.txt.$$
+date >> /tmp/send.txt.$$
+printf "##################################################################\n" >> /tmp/send.txt.$$
+cat ${buildDir}/makeDoc.txt >> /tmp/send.txt.$$
+
+cat /tmp/send.txt.$$ | /usr/sbin/sendmail -t -oi
+
+rm -f /tmp/send.txt.$$