c1c72d06c1897453aa237aa30124d66a14464f59
hiram
  Wed Oct 13 10:12:40 2021 -0700
fixup the primate to primate DEF file and begin to add the auto detection of which should be target vs query refs #28318

diff --git src/hg/utils/automation/pairLastz.sh src/hg/utils/automation/pairLastz.sh
index 8539b77..028e6ca 100755
--- src/hg/utils/automation/pairLastz.sh
+++ src/hg/utils/automation/pairLastz.sh
@@ -48,42 +48,59 @@
   export GCxPath=""
   case $asmName in
      GC[AF]_*)
        gcX=`echo $asmName | cut -c1-3`
        d0=`echo $asmName | cut -c5-7`
        d1=`echo $asmName | cut -c8-10`
        d2=`echo $asmName | cut -c11-13`
        GCxPath="${gcX}/${d0}/${d1}/${d2}"
        ;;
      *)
        ;;
   esac
   printf "%s" "${GCxPath}"
 }
 
+# asmSize - determine size of genome
+function asmSize() {
+  export asmName=$1
+  export sizes="/hive/data/genomes/${asmName}/chrom.sizes"
+  case $asmName in
+     GC[AF]_*)
+       gcxPath=$(gcPath $asmName)
+       id=$(asmId $asmName)
+       size=`awk '{sum+=$2}END{print sum}' /hive/data/genomes/asmHubs/${gcxPath}/${id}/${id}.chrom.sizes.txt`
+       ;;
+     *)
+       size=`awk '{sum+=$2}END{print sum}' ${sizes}`
+       ;;
+  esac
+  printf "%s" "${size}"
+}
+
 # seqCount - determine the sequence count in given genome target
 function seqCount() {
   export asmName=$1
   export sizes="/hive/data/genomes/${asmName}/chrom.sizes"
   case $asmName in
      GC[AF]_*)
        gcxPath=$(gcPath $asmName)
        id=$(asmId $asmName)
-       count=`cat /hive/data/genomes/asmHubs/${gcxPath}/${id}/${id}.chrom.sizes.txt | wc -l`
+       count=`wc -l /hive/data/genomes/asmHubs/${gcxPath}/${id}/${id}.chrom.sizes.txt | cut -d' ' -f1`
        ;;
      *)
-       count=`cat ${sizes} | wc -l`
+       count=`wc -l ${sizes} | cut -d' ' -f1`
        ;;
   esac
   printf "%s" "${count}"
 }
 
 function orgName() {
   export asmName=$1
   case $asmName in
      GC[AF]_*)
        gcxPath=$(gcPath $asmName)
        asmDir="/hive/data/outside/ncbi/genomes/${gcxPath}/${asmName}"
        asmRpt="${asmDir}/${asmName}_assembly_report.txt"
        oName=`egrep -i "^# organism name:" ${asmRpt} | tr -d '\r' | sed -e 's/.*(//; s/).*//'`
        ;;
      *)
@@ -234,35 +251,38 @@
   printf "# swap in progress ${query} -> ${target}\n" 1>&2
   printf "# " 1>&2
   ls -ogd "${swapDir}" 1>&2
   if [ "${doneCount}" -ne 2 ]; then
     exit 0
   fi
 fi
 
 if [ "${doneCount}" -eq 2 ]; then
     printf "# all done\n" 1>&2
 fi
 
 
 export tOrgName="$(orgName $target)"
 export qOrgName="$(orgName $query)"
+export tAsmSize="$(asmSize $target)"
+export qAsmSize="$(asmSize $query)"
 export tSequenceCount="$(seqCount $target)"
 export qSequenceCount="$(seqCount $query)"
 printf "# working: %s\n" "${buildDir}" 1>&2
 printf "# target: $target - $tOrgName - $tClade - $tSequenceCount sequences\n" 1>&2
 printf "#  query: $query - $qOrgName - $qClade - $qSequenceCount sequences\n" 1>&2
+LC_NUMERIC=en_US printf "#  sizes: target: %'d - query: %'d\n" "${tAsmSize}" "${qAsmSize}" 1>&2
 
 export seq1Limit="40"
 if [ "${tSequenceCount}" -gt 50000 ]; then
   seq1Limit="100"
 fi
 
 export seq2Limit="100"
 if [ "${qSequenceCount}" -gt 50000 ]; then
   seq2Limit="500"
 fi
 
 export minScore="3000"
 export linearGap="medium"
 
 case $tClade in
@@ -284,50 +304,88 @@
         "primate")
            ;;
         "mammal")
            ;;
         "other")
            minScore="5000"
            linearGap="loose"
            ;;
      esac
      ;;
    "other")
       minScore="5000"
       linearGap="loose"
 esac
 
+if [ "$tClade" == "primate" -a "$qClade" == "primate" ]; then
+
 export defString="# ${qOrgName} ${Query} vs. ${tOrgName} ${Target}
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz
+BLASTZ_T=2
+BLASTZ_O=600
+BLASTZ_E=150
+BLASTZ_M=254
+BLASTZ_K=4500
+BLASTZ_Y=15000
+BLASTZ_Q=/hive/data/staging/data/blastz/human_chimp.v2.q
+#       A     C     G     T
+# A    90  -330  -236  -356
+# C  -330   100  -318  -236
+# G  -236  -318   100  -330
+# T  -356  -236  -330    90
 
 # TARGET: ${tOrgName} ${Target}
 SEQ1_DIR=${target2bit}
 SEQ1_LEN=${targetSizes}
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=${seq1Limit}
 
 # QUERY: ${qOrgName} ${Query}
 SEQ2_DIR=${query2bit}
 SEQ2_LEN=${querySizes}
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=${seq2Limit}
 
 BASE=${buildDir}
 TMPDIR=/dev/shm
 "
+else
+
+export defString="# ${qOrgName} ${Query} vs. ${tOrgName} ${Target}
+BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz
+
+# TARGET: ${tOrgName} ${Target}
+SEQ1_DIR=${target2bit}
+SEQ1_LEN=${targetSizes}
+SEQ1_CHUNK=20000000
+SEQ1_LAP=10000
+SEQ1_LIMIT=${seq1Limit}
+
+# QUERY: ${qOrgName} ${Query}
+SEQ2_DIR=${query2bit}
+SEQ2_LEN=${querySizes}
+SEQ2_CHUNK=20000000
+SEQ2_LAP=0
+SEQ2_LIMIT=${seq2Limit}
+
+BASE=${buildDir}
+TMPDIR=/dev/shm
+"
+
+fi
 
 ### skip primary alignment if it is already done
 ###  primaryDone == 0 means NOT done yet
 if [ $primaryDone -eq 0 ]; then
   mkdir "${buildDir}"
 
 ### setup the DEF file
 printf "%s" "${defString}" > ${buildDir}/DEF
 
 ### setup the buildDir/run.sh script
 printf "#!/bin/bash
 
 set -beEu -o pipefail
 
 export targetDb=\"${tAsmId}\"