fd4b9f71e184d33ced4948ba1ba8618faed8d576 hiram Thu Apr 16 13:42:54 2026 -0700 make it possible to run either the galaxy kegAlign or the classic UCSC process, function up the bigBed creation and featureBits calculations, make everything in the kegAlign process reentrant so the script can be run repeatedly to fix broken issues refs #31811 diff --git src/hg/utils/automation/kegAlignLastz.sh src/hg/utils/automation/kegAlignLastz.sh index 2553faabe46..fa7af802c7e 100755 --- src/hg/utils/automation/kegAlignLastz.sh +++ src/hg/utils/automation/kegAlignLastz.sh @@ -1,44 +1,48 @@ #!/bin/bash set -beEu -o pipefail export userName="`whoami`" export bigHub="hgwdev" export workHorse="hgwdev" export smallClusterHub="hgwdev" export fileServer="hgwdev" +# a python virtual environment to run this +export planemoCmd="/hive/users/hiram/galaxy/venv3.12/bin/planemo" if [ $# != 4 ]; then printf "ERROR: arg count: %d != 4\n" "$#" 1>&2 printf "usage: kegAlignLastz.sh <target> <query> <tClade> <qClade> Where target/query is either a UCSC db name, or is an assembly hub identifier, e.g.: GCA_002844635.1_USMARCv1.0.1 And [tq]Clade is one of: primate|mammal|other Will create directory to work in, for example if, UCSC db: /hive/data/target/bed/lastzQuery.yyyy-mm-dd/ Or, in the assembly hub build directory: /hive/data/genomes/asmHubs/allBuild/GCA/002/844/635/GCA_002844635.1_USMARCv1.0/trackData/lastzQuery.yyyy-mm-dd -Will set up a DEF file there, and a run.sh script to run all steps +Will set up a DEF file there, and a kegAlign.sh script to run all steps and output makeDoc text to document what happened. + Also sets up a lastzRun.sh script that could be used to run the UCSC + lastz procedure. AND MORE, it will run the swap operation into the corresponding blastz.target.swap directory in the query genome work space. Email will be sent to: '$userName' upon completion. e.g.: kegAlignLastz.sh rn7 papAnu4 mammal primate\n" 1>&2 exit 255 fi # accId - if assembly hub, determine GCx_012345678.9 name # if not, return the asmName (== UCSC database name) function accId() { export asmName=$1 export id="${asmName}" @@ -93,82 +97,79 @@ export asmName=$1 export sizes="/hive/data/genomes/${asmName}/chrom.sizes" case $asmName in GC[AF]_*) gcxPath=$(gcPath $asmName) id=$(accId $asmName) count=`wc -l /hive/data/genomes/asmHubs/${gcxPath}/${id}/${id}.chrom.sizes.txt | cut -d' ' -f1` ;; *) count=`wc -l ${sizes} | cut -d' ' -f1` ;; esac printf "%s" "${count}" } +# obtain the organism name out of the assembly_report.txt file function orgName() { export asmName=$1 case $asmName in GC[AF]_*) gcxPath=$(gcPath $asmName) asmDir="/hive/data/outside/ncbi/genomes/${gcxPath}/${asmName}" asmRpt="${asmDir}/${asmName}_assembly_report.txt" oName=`egrep -m 1 -i "^# organism name:" ${asmRpt} | tr -d '\r' | sed -e 's/.*(//; s/).*//'` ;; *) oName=`hgsql -N -e "select organism from dbDb where name=\"${asmName}\";" hgcentraltest` ;; esac printf "%s" "${oName}" } +# generate URL to the fa.gz files to pass off to galaxy function faGzUrl() { export asmName=$1 case $asmName in GC[AF]_*) gcxPath=$(gcPath $asmName) id=$(accId $asmName) printf "https://hgdownload.soe.ucsc.edu/hubs/%s/%s/%s.fa.gz" "${gcxPath}" "${id}" "${id}" ;; *) printf "https://hgdownload.soe.ucsc.edu/goldenPath/%s/bigZips/%s.fa.gz" "${asmName}" "${asmName}" ;; esac } +# get the assembly date out of the assembly_report.txt file function orgDate() { export asmName=$1 case $asmName in GC[AF]_*) gcxPath=$(gcPath $asmName) asmDir="/hive/data/outside/ncbi/genomes/${gcxPath}/${asmName}" asmRpt="${asmDir}/${asmName}_assembly_report.txt" oDate=`egrep -m 1 -i "^#[[:space:]]*Date:" ${asmRpt} | tr -d '\r' | sed -e 's/.*ate: \+//;'` ;; *) oDate="" ;; esac printf "%s" "${oDate}" } -# check if this database is actually a database browser or a promoted -# hub. It could have a MySQL database, but it won't have a chromInfo -function promotedHub() { - export db=$1 -} - ############################################################################## ############################################################################## ### start seconds export startT=`date "+%s"` export target="$1" export query="$2" export tClade="$3" export qClade="$4" export tGcPath=$(gcPath $target) export qGcPath=$(gcPath $query) export tAccId=$(accId $target) export qAccId=$(accId $query) printf "# tq: '${target}' '${query}' '${tClade}' '${qClade}'\n" 1>&2 @@ -477,151 +478,198 @@ TMPDIR=/dev/shm " fi ### skip primary alignment if it is already done ### primaryDone == 0 means NOT done yet if [ $primaryDone -eq 0 ]; then mkdir "${buildDir}" ### setup the DEF file printf "%s" "${defString}" > ${buildDir}/DEF ### and the yaml file printf "%s" "${yamlString}" > ${buildDir}/${tAccId}.${qAccId}.yaml -### setup the buildDir/run.sh script +### setup the buildDir/lastzRun.sh script +printf "#!/bin/bash + +set -beEu -o pipefail + +export targetDb=\"${tAccId}\" +export queryDb=\"${qAccId}\" +export QueryDb=\"${Query}\" + +cd ${buildDir} +time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl ${trackHub} -verbose=2 \`pwd\`/DEF -syntenicNet \\ + $tFullName $qFullName -workhorse=${workHorse} -smallClusterHub=${smallClusterHub} \\ + -bigClusterHub=${bigHub} \\ + -chainMinScore=${minScore} -chainLinearGap=${linearGap}) > do.log 2>&1 + +grep -w real do.log | sed -e 's/^/ # /;' + +sed -e 's/^/ # /;' fb.\${targetDb}.chain\${QueryDb}Link.txt +sed -e 's/^/ # /;' fb.\${targetDb}.chainSyn\${QueryDb}Link.txt + +time (~/kent/src/hg/utils/automation/doRecipBest.pl ${rBestTrackHub} -load -workhorse=${workHorse} -buildDir=\`pwd\` \\ + ${tRbestArgs} \\ + ${qRbestArgs} \\ + \${targetDb} \${queryDb}) > rbest.log 2>&1 + +grep -w real rbest.log | sed -e 's/^/ # /;' + +sed -e 's/^/ #/;' fb.\${targetDb}.chainRBest.\${QueryDb}.txt +" > ${buildDir}/lastzRun.sh +chmod +x ${buildDir}/lastzRun.sh + +### setup the buildDir/kegAlign.sh script printf "#!/bin/bash +# exit on any failure set -beEu -o pipefail export buildDir=\"${buildDir}\" export swapDir=\"${swapDir}\" -export PM=\"/hive/users/hiram/galaxy/venv3.12/bin/planemo\" +export PM=\"${planemoCmd}\" export targetDb=\"${tAccId}\" export queryDb=\"${qAccId}\" export QueryDb=\"${Query}\" +export Target=\"${Target}\" +export tSizes=\"${targetSizes}\" +export qSizes=\"${querySizes}\" + +############################################################################ +# chainBigBedFb - convert chain to bigBed and compute featureBits +# args: db chainName chainGz sizesFile fbFile +function chainBigBedFb() { + local db=\$1 + local chainName=\$2 + local chainGz=\$3 + local sizesFile=\$4 + local fbFile=\$5 + chainToBigChain \"\${chainGz}\" \${chainName}.tab \${chainName}Link.tab + bedToBigBed -type=bed6+6 -as=~/kent/src/hg/lib/bigChain.as -tab \${chainName}.tab \${sizesFile} \${chainName}.bb + bedToBigBed -type=bed4+1 -as=~/kent/src/hg/lib/bigLink.as -tab \${chainName}Link.tab \${sizesFile} \${chainName}Link.bb + rm -f \${chainName}.tab \${chainName}Link.tab chain.tab link.tab + local totalBases=\`ave -col=2 \${sizesFile} | grep \"^total\" | awk '{printf \"%d\", \\\$2}'\` + local basesCovered=\`bigBedInfo \${chainName}Link.bb | grep \"basesCovered\" | cut -d' ' -f2 | tr -d ','\` + local percentCovered=\`echo \${basesCovered} \${totalBases} | awk '{printf \"%.3f\", 100.0*\\\$1/\\\$2}'\` + printf \"%d bases of %d (%s%%%%) in intersection\\n\" \"\${basesCovered}\" \"\${totalBases}\" \"\${percentCovered}\" > \${fbFile} +} +############################################################################ cd \${buildDir} mkdir -p log export DS=\`date \"+%%F_%%T_%%s\"\` +if [ -s successInvocationId.txt ]; then + DS=\`cut -f1 successInvocationId.txt\` +fi export logDir=\"\${buildDir}/log\" export logFile=\"\${logDir}/\${DS}.log\" +### only try to run this if there is no indication it was successful +if [ ! -s \"successInvocationId.txt\" ]; then time (\${PM} run \\ ~/kent/src/hg/utils/automation/kegAlign.json.ga \\ \"\${targetDb}.\${queryDb}.yaml\" --profile vgp \\ --history_name \"\${targetDb}.\${queryDb}.kegAlign\" \\ --test_output_json \"\${logDir}/runOutput.\${DS}.json\") >> \"\${logFile}\" 2>&1 -export invocationId=\`jq '.tests[0].data.invocation_details.details.invocation_id' \"\${logDir}/runOutput.\${DS}.json\" | tr -d '\"'\` + invocationId=\`jq '.tests[0].data.invocation_details.details.invocation_id' \"\${logDir}/runOutput.\${DS}.json\" | tr -d '\"'\` printf \"invocation ID: %%s\\n\" \"\${invocationId}\" 1>&2 mkdir -p result/\${DS} \${PM} invocation_download \"\${invocationId}\" --profile vgp \\ --output_directory result/\${DS} + # record the invocation ID and associated log file name + printf \"%%s\\tinvocation ID: %%s\\t%%s\\n\" \"\${DS}\" \"\${invocationId}\" \"\${logDir}/runOutput.\${DS}.json\" > successInvocationId.txt +fi +### use that runOutput log file to get the history ID for deletion ### install allChain into buildDir/axtChain/ mkdir -p \${buildDir}/axtChain -export allChainFile=\`ls result/\${DS}/allChain__*.chain\` +if [ ! -s \"\${buildDir}/axtChain/\${targetDb}.\${queryDb}.all.chain.gz\" ]; then + allChainFile=\`ls result/\${DS}/allChain__*.chain\` gzip -c \"\${allChainFile}\" > \${buildDir}/axtChain/\${targetDb}.\${queryDb}.all.chain.gz +fi ### convert target allChain to bigBed and measure featureBits cd \${buildDir}/axtChain -export tSizes=\"${targetSizes}\" -export allChain=\"\${targetDb}.\${queryDb}.all.chain.gz\" -hgLoadChain -test -noBin -tIndex \${targetDb} allChain \"\${allChain}\" -sed 's/.000000//' chain.tab | awk 'BEGIN {OFS=\"\\t\"} {print \\\$2, \\\$4, \\\$5, \\\$11, 1000, \\\$8, \\\$3, \\\$6, \\\$7, \\\$9, \\\$10, \\\$1}' > allChain.tab -awk 'BEGIN {OFS=\"\\t\"} {print \\\$1, \\\$2, \\\$3, \\\$5, \\\$4}' link.tab | sort -k1,1 -k2,2n > allChainLink.tab -bedToBigBed -type=bed6+6 -as=~/kent/src/hg/lib/bigChain.as -tab allChain.tab \${tSizes} allChain.bb -bedToBigBed -type=bed4+1 -as=~/kent/src/hg/lib/bigLink.as -tab allChainLink.tab \${tSizes} allChainLink.bb -rm -f allChain.tab allChainLink.tab chain.tab link.tab -export totalBases=\`ave -col=2 \${tSizes} | grep \"^total\" | awk '{printf \"%d\", \\\$2}'\` -export basesCovered=\`bigBedInfo allChainLink.bb | grep \"basesCovered\" | cut -d' ' -f2 | tr -d ','\` -export percentCovered=\`echo \${basesCovered} \${totalBases} | awk '{printf \"%.3f\", 100.0*\\\$1/\\\$2}'\` -printf \"%d bases of %d (%s%%%%) in intersection\\n\" \"\${basesCovered}\" \"\${totalBases}\" \"\${percentCovered}\" > \${buildDir}/fb.\${targetDb}.chain\${QueryDb}Link.txt -cat \${buildDir}/fb.\${targetDb}.chain\${QueryDb}Link.txt +if [ ! -s \"\${buildDir}/fb.\${targetDb}.chain\${QueryDb}Link.txt\" ]; then + allChain=\"\${targetDb}.\${queryDb}.all.chain.gz\" + chainBigBedFb \${targetDb} allChain \${allChain} \${tSizes} \${buildDir}/fb.\${targetDb}.chain\${QueryDb}Link.txt +fi +cat \"\${buildDir}/fb.\${targetDb}.chain\${QueryDb}Link.txt\" 1>&2 ### install netChainSubset (over.chain) for target cd \${buildDir} -export overChainFile=\`ls result/\${DS}/'netChainSubset on dataset 7 and 21__'*.chain\` +if [ ! -s \"\${buildDir}/axtChain/\${targetDb}.\${queryDb}.over.chain.gz\" ]; then + overChainFile=\`ls result/\${DS}/'netChainSubset on dataset 7 and 21__'*.chain\` gzip -c \"\${overChainFile}\" > \${buildDir}/axtChain/\${targetDb}.\${queryDb}.over.chain.gz +fi ### convert target over.chain to bigBed and measure featureBits cd \${buildDir}/axtChain -export overChain=\"\${targetDb}.\${queryDb}.over.chain.gz\" -hgLoadChain -test -noBin -tIndex \${targetDb} overChain \"\${overChain}\" -sed 's/.000000//' chain.tab | awk 'BEGIN {OFS=\"\\t\"} {print \\\$2, \\\$4, \\\$5, \\\$11, 1000, \\\$8, \\\$3, \\\$6, \\\$7, \\\$9, \\\$10, \\\$1}' > overChain.tab -awk 'BEGIN {OFS=\"\\t\"} {print \\\$1, \\\$2, \\\$3, \\\$5, \\\$4}' link.tab | sort -k1,1 -k2,2n > overChainLink.tab -bedToBigBed -type=bed6+6 -as=~/kent/src/hg/lib/bigChain.as -tab overChain.tab \${tSizes} overChain.bb -bedToBigBed -type=bed4+1 -as=~/kent/src/hg/lib/bigLink.as -tab overChainLink.tab \${tSizes} overChainLink.bb -rm -f overChain.tab overChainLink.tab chain.tab link.tab -export totalBases=\`ave -col=2 \${tSizes} | grep \"^total\" | awk '{printf \"%d\", \\\$2}'\` -export basesCovered=\`bigBedInfo overChainLink.bb | grep \"basesCovered\" | cut -d' ' -f2 | tr -d ','\` -export percentCovered=\`echo \${basesCovered} \${totalBases} | awk '{printf \"%.3f\", 100.0*\\\$1/\\\$2}'\` -printf \"%d bases of %d (%s%%%%) in intersection\\n\" \"\${basesCovered}\" \"\${totalBases}\" \"\${percentCovered}\" > \${buildDir}/fb.\${targetDb}.chainSyn\${QueryDb}Link.txt -cat \${buildDir}/fb.\${targetDb}.chainSyn\${QueryDb}Link.txt +if [ ! -s \"\${buildDir}/fb.\${targetDb}.chainSyn\${QueryDb}Link.txt\" ]; then + overChain=\"\${targetDb}.\${queryDb}.over.chain.gz\" + chainBigBedFb \${targetDb} overChain \${overChain} \${tSizes} \${buildDir}/fb.\${targetDb}.chainSyn\${QueryDb}Link.txt +fi +cat \"\${buildDir}/fb.\${targetDb}.chainSyn\${QueryDb}Link.txt\" 1>&2 ### install allChainSwap into swapDir/axtChain/ mkdir -p \${swapDir}/axtChain cd \${buildDir} -export allChainSwapFile=\`ls result/\${DS}/allChainSwap__*.chain\` +if [ ! -s \"\${swapDir}/axtChain/\${queryDb}.\${targetDb}.all.chain.gz\" ]; then + allChainSwapFile=\`ls result/\${DS}/allChainSwap__*.chain\` gzip -c \"\${allChainSwapFile}\" > \${swapDir}/axtChain/\${queryDb}.\${targetDb}.all.chain.gz +fi ### convert swap allChain to bigBed and measure featureBits cd \${swapDir}/axtChain -export qSizes=\"${querySizes}\" -export allChain=\"\${queryDb}.\${targetDb}.all.chain.gz\" -hgLoadChain -test -noBin -tIndex \${queryDb} allChain \"\${allChain}\" -sed 's/.000000//' chain.tab | awk 'BEGIN {OFS=\"\\t\"} {print \\\$2, \\\$4, \\\$5, \\\$11, 1000, \\\$8, \\\$3, \\\$6, \\\$7, \\\$9, \\\$10, \\\$1}' > allChain.tab -awk 'BEGIN {OFS=\"\\t\"} {print \\\$1, \\\$2, \\\$3, \\\$5, \\\$4}' link.tab | sort -k1,1 -k2,2n > allChainLink.tab -bedToBigBed -type=bed6+6 -as=~/kent/src/hg/lib/bigChain.as -tab allChain.tab \${qSizes} allChain.bb -bedToBigBed -type=bed4+1 -as=~/kent/src/hg/lib/bigLink.as -tab allChainLink.tab \${qSizes} allChainLink.bb -rm -f allChain.tab allChainLink.tab chain.tab link.tab -export totalBases=\`ave -col=2 \${qSizes} | grep \"^total\" | awk '{printf \"%d\", \\\$2}'\` -export basesCovered=\`bigBedInfo allChainLink.bb | grep \"basesCovered\" | cut -d' ' -f2 | tr -d ','\` -export percentCovered=\`echo \${basesCovered} \${totalBases} | awk '{printf \"%.3f\", 100.0*\\\$1/\\\$2}'\` -export Target=\"${Target}\" -printf \"%d bases of %d (%s%%%%) in intersection\\n\" \"\${basesCovered}\" \"\${totalBases}\" \"\${percentCovered}\" > \${swapDir}/fb.\${queryDb}.chain\${Target}Link.txt -cat \${swapDir}/fb.\${queryDb}.chain\${Target}Link.txt +if [ ! -s \"\${swapDir}/fb.\${queryDb}.chain\${Target}Link.txt\" ]; then + allChain=\"\${queryDb}.\${targetDb}.all.chain.gz\" + chainBigBedFb \${queryDb} allChain \${allChain} \${qSizes} \${swapDir}/fb.\${queryDb}.chain\${Target}Link.txt +fi +cat \"\${swapDir}/fb.\${queryDb}.chain\${Target}Link.txt\" 1>&2 ### install netChainSubset (over.chain) for swap cd \${buildDir} -export overChainSwapFile=\`ls result/\${DS}/'netChainSubset on dataset 8 and 35__'*.chain\` +if [ ! -s \"\${swapDir}/axtChain/\${queryDb}.\${targetDb}.over.chain.gz\" ]; then + overChainSwapFile=\`ls result/\${DS}/'netChainSubset on dataset 8 and 35__'*.chain\` gzip -c \"\${overChainSwapFile}\" > \${swapDir}/axtChain/\${queryDb}.\${targetDb}.over.chain.gz +fi ### convert swap over.chain to bigBed and measure featureBits cd \${swapDir}/axtChain -export overChain=\"\${queryDb}.\${targetDb}.over.chain.gz\" -hgLoadChain -test -noBin -tIndex \${queryDb} overChain \"\${overChain}\" -sed 's/.000000//' chain.tab | awk 'BEGIN {OFS=\"\\t\"} {print \\\$2, \\\$4, \\\$5, \\\$11, 1000, \\\$8, \\\$3, \\\$6, \\\$7, \\\$9, \\\$10, \\\$1}' > overChain.tab -awk 'BEGIN {OFS=\"\\t\"} {print \\\$1, \\\$2, \\\$3, \\\$5, \\\$4}' link.tab | sort -k1,1 -k2,2n > overChainLink.tab -bedToBigBed -type=bed6+6 -as=~/kent/src/hg/lib/bigChain.as -tab overChain.tab \${qSizes} overChain.bb -bedToBigBed -type=bed4+1 -as=~/kent/src/hg/lib/bigLink.as -tab overChainLink.tab \${qSizes} overChainLink.bb -rm -f overChain.tab overChainLink.tab chain.tab link.tab -export totalBases=\`ave -col=2 \${qSizes} | grep \"^total\" | awk '{printf \"%d\", \\\$2}'\` -export basesCovered=\`bigBedInfo overChainLink.bb | grep \"basesCovered\" | cut -d' ' -f2 | tr -d ','\` -export percentCovered=\`echo \${basesCovered} \${totalBases} | awk '{printf \"%.3f\", 100.0*\\\$1/\\\$2}'\` -printf \"%d bases of %d (%s%%%%) in intersection\\n\" \"\${basesCovered}\" \"\${totalBases}\" \"\${percentCovered}\" > \${swapDir}/fb.\${queryDb}.chainSyn\${Target}Link.txt -cat \${swapDir}/fb.\${queryDb}.chainSyn\${Target}Link.txt - -" > ${buildDir}/run.sh -chmod +x ${buildDir}/run.sh - -### run the primary alignment -printf "running: time (${buildDir}/run.sh) >> ${buildDir}/do.log 2>&1\n" 1>&2 - -# time (${buildDir}/run.sh) >> ${buildDir}/do.log 2>&1 +if [ ! -s \"\${swapDir}/fb.\${queryDb}.chainSyn\${Target}Link.txt\" ]; then + overChain=\"\${queryDb}.\${targetDb}.over.chain.gz\" + chainBigBedFb \${queryDb} overChain \${overChain} \${qSizes} \${swapDir}/fb.\${queryDb}.chainSyn\${Target}Link.txt +fi +cat \"\${swapDir}/fb.\${queryDb}.chainSyn\${Target}Link.txt\" 1>&2 + +" > ${buildDir}/kegAlign.sh +chmod +x ${buildDir}/kegAlign.sh + +### run the primary alignment, galaxy kegAlign style +printf "running: time (${buildDir}/kegAlign.sh) >> ${buildDir}/kegAlign.log 2>&1\n" 1>&2 + +# time (${buildDir}/kegAlign.sh) >> ${buildDir}/kegAlign.log 2>&1 + +### run the primary alignment, UCSC lastz style +printf "running: time (${buildDir}/lastzRun.sh) >> ${buildDir}/lastzRun.log 2>&1\n" 1>&2 + +# time (${buildDir}/lastzRun.sh) >> ${buildDir}/lastzRun.log 2>&1 + exit $? ### typical contents of the invocation_download: ### ls result/2026-04-02_16:00:02_1775170802 ### 'Batched LASTZ on dataset 5__70a08f89-a77d-4333-b425-b13959e40d1b.axt' ### 'KegAlign on dataset 3 and 4__a32df3e9-13d0-4c5e-b765-df69c657ae89.tgz' ### allChainSwap__5e11f323-58fc-43f9-afbd-6cff21f0e7af.chain ### allChain__74038a6d-d735-4784-8b17-31a25ddbb036.chain ###'netChainSubset on dataset 7 and 21__72cbd3a9-4136-4e72-86c9-9da0c230c5d0.chain' ###'netChainSubset on dataset 8 and 35__9acc344f-19da-4ef9-a578-7282940badc0.chain' # rebuild trackDb if possible here if [ -x "${tTdb}" ]; then ${tTdb}