src/hg/utils/automation/kegAlignLastz.sh f08a32f711b71809d508587d53612a3d701b8749

f08a32f711b71809d508587d53612a3d701b8749
hiram
  Mon Apr 13 16:23:53 2026 -0700
adding processing of the result into bigBed files and in the usual lastz output locations refs #31811

diff --git src/hg/utils/automation/kegAlignLastz.sh src/hg/utils/automation/kegAlignLastz.sh
index ab66399d438..41e88cf8206 100755
--- src/hg/utils/automation/kegAlignLastz.sh
+++ src/hg/utils/automation/kegAlignLastz.sh
@@ -109,30 +109,44 @@
   export asmName=$1
   case $asmName in
      GC[AF]_*)
        gcxPath=$(gcPath $asmName)
        asmDir="/hive/data/outside/ncbi/genomes/${gcxPath}/${asmName}"
        asmRpt="${asmDir}/${asmName}_assembly_report.txt"
        oName=`egrep -m 1 -i "^# organism name:" ${asmRpt} | tr -d '\r' | sed -e 's/.*(//; s/).*//'`
        ;;
      *)
        oName=`hgsql -N -e "select organism from dbDb where name=\"${asmName}\";" hgcentraltest`
        ;;
   esac
   printf "%s" "${oName}"
 }
 
+function faGzUrl() {
+  export asmName=$1
+  case $asmName in
+     GC[AF]_*)
+       gcxPath=$(gcPath $asmName)
+       id=$(asmId $asmName)
+       printf "https://hgdownload.soe.ucsc.edu/hubs/%s/%s/%s.fa.gz" "${gcxPath}" "${id}" "${id}"
+       ;;
+     *)
+       printf "https://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.fa.gz" "${asmName}" "${asmName}"
+       ;;
+  esac
+}
+
 function orgDate() {
   export asmName=$1
   case $asmName in
      GC[AF]_*)
        gcxPath=$(gcPath $asmName)
        asmDir="/hive/data/outside/ncbi/genomes/${gcxPath}/${asmName}"
        asmRpt="${asmDir}/${asmName}_assembly_report.txt"
        oDate=`egrep -m 1 -i "^#[[:space:]]*Date:" ${asmRpt} | tr -d '\r' | sed -e 's/.*ate: \+//;'`
        ;;
      *)
        oDate=""
        ;;
   esac
   printf "%s" "${oDate}"
 }
@@ -300,30 +314,32 @@
   fi
 fi
 
 if [ "${doneCount}" -eq 2 ]; then
     printf "# all done\n" 1>&2
 fi
 
 export tOrgName="$(orgName $target)"
 export qOrgName="$(orgName $query)"
 export tOrgDate="$(orgDate $target)"
 export qOrgDate="$(orgDate $query)"
 export tAsmSize="$(asmSize $target)"
 export qAsmSize="$(asmSize $query)"
 export tSequenceCount="$(seqCount $target)"
 export qSequenceCount="$(seqCount $query)"
+export tFaGzUrl="$(faGzUrl $target)"
+export qFaGzUrl="$(faGzUrl $query)"
 printf "# working: %s\n" "${buildDir}" 1>&2
 printf "# target: $target - $tOrgName - $tOrgDate - $tClade - $tSequenceCount sequences\n" 1>&2
 printf "#  query: $query - $qOrgName - $qOrgDate - $qClade - $qSequenceCount sequences\n" 1>&2
 LC_NUMERIC=en_US printf "#  sizes: target: %'d - query: %'d\n" "${tAsmSize}" "${qAsmSize}" 1>&2
 
 export seq1Limit="40"
 if [ "${tSequenceCount}" -gt 50000 ]; then
   seq1Limit="100"
 fi
 
 export seq2Limit="100"
 if [ "${qSequenceCount}" -gt 50000 ]; then
   seq2Limit="500"
 fi
 
@@ -354,34 +370,34 @@
            minScore="5000"
            linearGap="loose"
            ;;
      esac
      ;;
    "other")
       minScore="5000"
       linearGap="loose"
 esac
 
 if [ "$tClade" == "primate" -a "$qClade" == "primate" ]; then
 
 export yamlString="# ${target}.${query}.yaml
 TARGET_Sequence:
   class: File
-  path: https://hgdownload.soe.ucsc.edu/hubs/GCA/011/100/615/GCA_011100615.1/GCA_011100615.1.fa.gz
+  path: ${tFaGzUrl}
 QUERY_Sequence:
   class: File
-  path: https://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/mm39.fa.gz
+  path: ${qFaGzUrl}
 # axtChain options
 axtChainMinScore: ${minScore}
 linear_gap_options.linear_gap: ${linearGap}
 # lastz options
 xdropX: 910
 ydropY: 15000
 stepZ: 1
 noTransitionT: false
 strand_selectorB: both
 seeding_options.seed_selector: 12of19
 hspthreshK: 4500
 gappedthreshL: 4500
 innerH: 2000
 "
 
@@ -410,34 +426,34 @@
 # QUERY: ${qOrgName} ${qOrgDate} ${query}
 SEQ2_DIR=${query2bit}
 SEQ2_LEN=${querySizes}
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=${seq2Limit}
 
 BASE=${buildDir}
 TMPDIR=/dev/shm
 "
 else
 
 export yamlString="# ${target}.${query}.yaml
 TARGET_Sequence:
   class: File
-  path: https://hgdownload.soe.ucsc.edu/hubs/GCA/011/100/615/GCA_011100615.1/GCA_011100615.1.fa.gz
+  path: ${tFaGzUrl}
 QUERY_Sequence:
   class: File
-  path: https://hgdownload.soe.ucsc.edu/goldenPath/mm39/bigZips/mm39.fa.gz
+  path: ${qFaGzUrl}
 # axtChain options
 axtChainMinScore: ${minScore}
 linear_gap_options.linear_gap: ${linearGap}
 # lastz options
 xdropX: 910
 ydropY: 9400
 stepZ: 1
 noTransitionT: true
 strand_selectorB: both
 seeding_options.seed_selector: 12of19
 hspthreshK: 3000
 gappedthreshL: 3000
 innerH: 2000
 "
 export defString="# ${qOrgName} ${Query} vs. ${tOrgName} ${Target}
@@ -467,53 +483,138 @@
 ###  primaryDone == 0 means NOT done yet
 if [ $primaryDone -eq 0 ]; then
   mkdir "${buildDir}"
 
 ### setup the DEF file
 printf "%s" "${defString}" > ${buildDir}/DEF
 ### and the yaml file
 printf "%s" "${yamlString}" > ${buildDir}/${tAsmId}.${qAsmId}.yaml
 
 ### setup the buildDir/run.sh script
 printf "#!/bin/bash
 
 set -beEu -o pipefail
 
 export buildDir=\"${buildDir}\"
+export swapDir=\"${swapDir}\"
 export PM=\"/hive/users/hiram/galaxy/venv3.12/bin/planemo\"
 
 export targetDb=\"${tAsmId}\"
 export queryDb=\"${qAsmId}\"
 export QueryDb=\"${Query}\"
 
 cd \${buildDir}
 mkdir -p log
-export DS=\`date \"+\%F_\%T_\%s\"\`
+export DS=\`date \"+\%%F_\%%T_\%%s\"\`
 export logDir=\"\${buildDir}/log\"
 export logFile=\"\${logDir}/\${DS}.log\"
 time (\${PM} run \\
   \"~/kent/src/hg/utils/automation/kegAlign.json.gz\" \\
      \"\${targetDb}.\${queryDb}.yaml\" --profile vgp \\
         --history_name \"\${targetDb}.\${queryDb}.kegAlign\" \\
-           --test_output_json \"\${logDir}/runOutput.\${DS}.json) >> \"\${logFile}\" 2>&1
+           --test_output_json \"\${logDir}/runOutput.\${DS}.json\") >> \"\${logFile}\" 2>&1
 
-export invockId=\`jq '.tests[0].data.invocation_details.details.invocation_id' \${logDir}/runOutput\${DS}.json | tr -d '\"'\`
-printf \"invocation ID: \%s\\n\" \"\${invockId}\" 1>&2
+export invocationId=\`jq '.tests[0].data.invocation_details.details.invocation_id' \"\${logDir}/runOutput.\${DS}.json\" | tr -d '\"'\`
+printf \"invocation ID: %%s\\n\" \"\${invocationId}\" 1>&2
 mkdir -p result/\${DS}
-\${PM} invocation_download \"${invockId}\" --profile vgp \\
+\${PM} invocation_download \"\${invocationId}\" --profile vgp \\
   --output_directory result/\${DS}
 
+### install allChain into buildDir/axtChain/
+mkdir -p \${buildDir}/axtChain
+export allChainFile=\`ls result/\${DS}/allChain__*.chain\`
+gzip -c \"\${allChainFile}\" > \${buildDir}/axtChain/\${targetDb}.\${queryDb}.all.chain.gz
+
+### convert target allChain to bigBed and measure featureBits
+cd \${buildDir}/axtChain
+export tSizes=\"${targetSizes}\"
+export allChain=\"\${targetDb}.\${queryDb}.all.chain.gz\"
+hgLoadChain -test -noBin -tIndex \${targetDb} allChain \"\${allChain}\"
+sed 's/.000000//' chain.tab | awk 'BEGIN {OFS=\"\\t\"} {print \\\$2, \\\$4, \\\$5, \\\$11, 1000, \\\$8, \\\$3, \\\$6, \\\$7, \\\$9, \\\$10, \\\$1}' > allChain.tab
+awk 'BEGIN {OFS=\"\\t\"} {print \\\$1, \\\$2, \\\$3, \\\$5, \\\$4}' link.tab | sort -k1,1 -k2,2n > allChainLink.tab
+bedToBigBed -type=bed6+6 -as=~/kent/src/hg/lib/bigChain.as -tab allChain.tab \${tSizes} allChain.bb
+bedToBigBed -type=bed4+1 -as=~/kent/src/hg/lib/bigLink.as -tab allChainLink.tab \${tSizes} allChainLink.bb
+rm -f allChain.tab allChainLink.tab chain.tab link.tab
+export totalBases=\`ave -col=2 \${tSizes} | grep \"^total\" | awk '{printf \"%d\", \\\$2}'\`
+export basesCovered=\`bigBedInfo allChainLink.bb | grep \"basesCovered\" | cut -d' ' -f2 | tr -d ','\`
+export percentCovered=\`echo \${basesCovered} \${totalBases} | awk '{printf \"%.3f\", 100.0*\\\$1/\\\$2}'\`
+printf \"%d bases of %d (%s%%%%) in intersection\\n\" \"\${basesCovered}\" \"\${totalBases}\" \"\${percentCovered}\" > \${buildDir}/fb.\${targetDb}.chain\${QueryDb}Link.txt
+cat \${buildDir}/fb.\${targetDb}.chain\${QueryDb}Link.txt
+
+### install netChainSubset (over.chain) for target
+cd \${buildDir}
+export overChainFile=\`ls result/\${DS}/'netChainSubset on dataset 7 and 21__'*.chain\`
+gzip -c \"\${overChainFile}\" > \${buildDir}/axtChain/\${targetDb}.\${queryDb}.over.chain.gz
+
+### convert target over.chain to bigBed and measure featureBits
+cd \${buildDir}/axtChain
+export overChain=\"\${targetDb}.\${queryDb}.over.chain.gz\"
+hgLoadChain -test -noBin -tIndex \${targetDb} overChain \"\${overChain}\"
+sed 's/.000000//' chain.tab | awk 'BEGIN {OFS=\"\\t\"} {print \\\$2, \\\$4, \\\$5, \\\$11, 1000, \\\$8, \\\$3, \\\$6, \\\$7, \\\$9, \\\$10, \\\$1}' > overChain.tab
+awk 'BEGIN {OFS=\"\\t\"} {print \\\$1, \\\$2, \\\$3, \\\$5, \\\$4}' link.tab | sort -k1,1 -k2,2n > overChainLink.tab
+bedToBigBed -type=bed6+6 -as=~/kent/src/hg/lib/bigChain.as -tab overChain.tab \${tSizes} overChain.bb
+bedToBigBed -type=bed4+1 -as=~/kent/src/hg/lib/bigLink.as -tab overChainLink.tab \${tSizes} overChainLink.bb
+rm -f overChain.tab overChainLink.tab chain.tab link.tab
+export totalBases=\`ave -col=2 \${tSizes} | grep \"^total\" | awk '{printf \"%d\", \\\$2}'\`
+export basesCovered=\`bigBedInfo overChainLink.bb | grep \"basesCovered\" | cut -d' ' -f2 | tr -d ','\`
+export percentCovered=\`echo \${basesCovered} \${totalBases} | awk '{printf \"%.3f\", 100.0*\\\$1/\\\$2}'\`
+printf \"%d bases of %d (%s%%%%) in intersection\\n\" \"\${basesCovered}\" \"\${totalBases}\" \"\${percentCovered}\" > \${buildDir}/fb.\${targetDb}.chainSyn\${QueryDb}Link.txt
+cat \${buildDir}/fb.\${targetDb}.chainSyn\${QueryDb}Link.txt
+
+### install allChainSwap into swapDir/axtChain/
+mkdir -p \${swapDir}/axtChain
+cd \${buildDir}
+export allChainSwapFile=\`ls result/\${DS}/allChainSwap__*.chain\`
+gzip -c \"\${allChainSwapFile}\" > \${swapDir}/axtChain/\${queryDb}.\${targetDb}.all.chain.gz
+
+### convert swap allChain to bigBed and measure featureBits
+cd \${swapDir}/axtChain
+export qSizes=\"${querySizes}\"
+export allChain=\"\${queryDb}.\${targetDb}.all.chain.gz\"
+hgLoadChain -test -noBin -tIndex \${queryDb} allChain \"\${allChain}\"
+sed 's/.000000//' chain.tab | awk 'BEGIN {OFS=\"\\t\"} {print \\\$2, \\\$4, \\\$5, \\\$11, 1000, \\\$8, \\\$3, \\\$6, \\\$7, \\\$9, \\\$10, \\\$1}' > allChain.tab
+awk 'BEGIN {OFS=\"\\t\"} {print \\\$1, \\\$2, \\\$3, \\\$5, \\\$4}' link.tab | sort -k1,1 -k2,2n > allChainLink.tab
+bedToBigBed -type=bed6+6 -as=~/kent/src/hg/lib/bigChain.as -tab allChain.tab \${qSizes} allChain.bb
+bedToBigBed -type=bed4+1 -as=~/kent/src/hg/lib/bigLink.as -tab allChainLink.tab \${qSizes} allChainLink.bb
+rm -f allChain.tab allChainLink.tab chain.tab link.tab
+export totalBases=\`ave -col=2 \${qSizes} | grep \"^total\" | awk '{printf \"%d\", \\\$2}'\`
+export basesCovered=\`bigBedInfo allChainLink.bb | grep \"basesCovered\" | cut -d' ' -f2 | tr -d ','\`
+export percentCovered=\`echo \${basesCovered} \${totalBases} | awk '{printf \"%.3f\", 100.0*\\\$1/\\\$2}'\`
+export Target=\"${Target}\"
+printf \"%d bases of %d (%s%%%%) in intersection\\n\" \"\${basesCovered}\" \"\${totalBases}\" \"\${percentCovered}\" > \${swapDir}/fb.\${queryDb}.chain\${Target}Link.txt
+cat \${swapDir}/fb.\${queryDb}.chain\${Target}Link.txt
+
+### install netChainSubset (over.chain) for swap
+cd \${buildDir}
+export overChainSwapFile=\`ls result/\${DS}/'netChainSubset on dataset 8 and 35__'*.chain\`
+gzip -c \"\${overChainSwapFile}\" > \${swapDir}/axtChain/\${queryDb}.\${targetDb}.over.chain.gz
+
+### convert swap over.chain to bigBed and measure featureBits
+cd \${swapDir}/axtChain
+export overChain=\"\${queryDb}.\${targetDb}.over.chain.gz\"
+hgLoadChain -test -noBin -tIndex \${queryDb} overChain \"\${overChain}\"
+sed 's/.000000//' chain.tab | awk 'BEGIN {OFS=\"\\t\"} {print \\\$2, \\\$4, \\\$5, \\\$11, 1000, \\\$8, \\\$3, \\\$6, \\\$7, \\\$9, \\\$10, \\\$1}' > overChain.tab
+awk 'BEGIN {OFS=\"\\t\"} {print \\\$1, \\\$2, \\\$3, \\\$5, \\\$4}' link.tab | sort -k1,1 -k2,2n > overChainLink.tab
+bedToBigBed -type=bed6+6 -as=~/kent/src/hg/lib/bigChain.as -tab overChain.tab \${qSizes} overChain.bb
+bedToBigBed -type=bed4+1 -as=~/kent/src/hg/lib/bigLink.as -tab overChainLink.tab \${qSizes} overChainLink.bb
+rm -f overChain.tab overChainLink.tab chain.tab link.tab
+export totalBases=\`ave -col=2 \${qSizes} | grep \"^total\" | awk '{printf \"%d\", \\\$2}'\`
+export basesCovered=\`bigBedInfo overChainLink.bb | grep \"basesCovered\" | cut -d' ' -f2 | tr -d ','\`
+export percentCovered=\`echo \${basesCovered} \${totalBases} | awk '{printf \"%.3f\", 100.0*\\\$1/\\\$2}'\`
+printf \"%d bases of %d (%s%%%%) in intersection\\n\" \"\${basesCovered}\" \"\${totalBases}\" \"\${percentCovered}\" > \${swapDir}/fb.\${queryDb}.chainSyn\${Target}Link.txt
+cat \${swapDir}/fb.\${queryDb}.chainSyn\${Target}Link.txt
+
 " > ${buildDir}/run.sh
 chmod +x ${buildDir}/run.sh
 
 ### run the primary alignment
 printf "running: time (${buildDir}/run.sh) >> ${buildDir}/do.log 2>&1\n" 1>&2
 
 # time (${buildDir}/run.sh) >> ${buildDir}/do.log 2>&1
 
 exit $?
 
 ### typical contents of the invocation_download:
 ### ls result/2026-04-02_16:00:02_1775170802
 ### 'Batched LASTZ on dataset 5__70a08f89-a77d-4333-b425-b13959e40d1b.axt'
 ### 'KegAlign on dataset 3 and 4__a32df3e9-13d0-4c5e-b765-df69c657ae89.tgz'
 ### allChainSwap__5e11f323-58fc-43f9-afbd-6cff21f0e7af.chain