#!/bin/bash
#
# runBuild - drive one assembly-hub build through doAssemblyHub.pl.
#
# Usage: ./runBuild <asmId> <clade> <sciName>
#   asmId   - assembly identifier; characters 1-3 select GCA/GCF, and the
#             three digit triples that follow select the build directory
#   clade   - selects the subGroup, the augustus species and whether a
#             RepeatMasker taxon lookup is attempted (see the case below)
#   sciName - scientific name; used for the symlink directory and as the
#             argument to findTaxon.pl
#
# Exit status: 255 on bad usage, unrecognized clade or lock failure;
# otherwise any failing command aborts the script (set -e).
#
# Revision note carried over from the commit this text came from:
#   d016d8f62aac199e12ae92583784e426b22a1c14  hiram
#   Fri Aug 1 10:38:15 2025 -0700
#   catch up to current script with automatic repeat masker identification
#   and email complete message   refs #23891
#   file: src/hg/makeDb/doc/asmHubs/runBuild

set -beEu -o pipefail

# Fail with a readable message instead of an "unbound variable" abort.
# Extra arguments are tolerated, as they were before.
if [ $# -lt 3 ]; then
  printf 'usage: %s <asmId> <clade> <sciName>\n' "${0##*/}" 1>&2
  exit 255
fi

export LANG=C
export asmId=$1
export clade=$2
export sciName=$3
### the 'clade' is used to establish specific options for the build

### start seconds
export startT
startT=$(date "+%s")

export bigHub="hgwdev"
export smallHub="hgwdev"
export fileServer="hgwdev"
export workHorse="hgwdev"
# export bigHub="ku"
# export smallHub="ku"

# "xy" is the sentinel for "no taxon lookup was attempted"
export rmskSpecies="xy"
###  This ucscNames decision needs to be an automatic process since
###  some browsers have been built with ucscNames and others have not.
###  This is important for track updates, such as ncbiRefSeq
### export ucscNames="-ucscNames"
export ucscNames=""
export augustusSpecies="-augustusSpecies=human"
export ncbiRmsk="-ncbiRmsk"
export noRmsk=""
export RMSK=""
export subGroup="vertebrate_other"

# sleep 5 to 25 seconds to avoid the ssh overload of a bunch of jobs
# all starting at the same time.
export sleepTime=$((RANDOM % 21 + 5))
sleep "${sleepTime}"

# Helper that maps a scientific name to a RepeatMasker taxon name.
findTaxon="/hive/data/genomes/asmHubs/allBuild/rmCheck/findTaxon.pl"
lookupRmsk="yes"

case "${clade}" in
  primates | mammals | vertebrate_mammalian)
    subGroup="vertebrate_mammalian"
    ;;
  fishes | fish)
    subGroup="vertebrate_other"
    augustusSpecies="-augustusSpecies=zebrafish"
    ;;
  birds)
    subGroup="vertebrate_other"
    augustusSpecies="-augustusSpecies=chicken"
    ;;
  vertebrate | vertebrate_other)
    subGroup="vertebrate_other"
    ;;
  invertebrate)
    subGroup="invertebrate"
    ;;
  invertebrates)
    # NOTE(review): unlike 'invertebrate', this clade never ran the taxon
    # lookup, so it ends up with -noRmsk.  Kept as is - confirm whether
    # that is intended.
    subGroup="invertebrate"
    lookupRmsk="no"
    ;;
  fungi | Scerevisiae)
    subGroup="fungi"
    augustusSpecies="-augustusSpecies=saccharomyces"
    ;;
  plants)
    subGroup="plants"
    augustusSpecies="-augustusSpecies=arabidopsis"
    ;;
  nematode)
    subGroup="invertebrate"
    augustusSpecies="-augustusSpecies=caenorhabditis"
    ;;
  drosophila)
    subGroup="invertebrate"
    augustusSpecies="-augustusSpecies=fly"
    ;;
  Amellifera)
    subGroup="invertebrate"
    augustusSpecies="-augustusSpecies=honeybee1"
    ;;
  Agambiae)
    subGroup="invertebrate"
    augustusSpecies="-augustusSpecies=culex"
    ;;
  viral)
    subGroup="viral"
    rmskSpecies="viruses"
    augustusSpecies="-noAugustus -noXenoRefSeq"
    lookupRmsk="no"
    ;;
  archaea | bacteria)
    subGroup="${clade}"
    noRmsk="-noRmsk"
    augustusSpecies="-noAugustus -noXenoRefSeq"
    lookupRmsk="no"
    ;;
  *)
    printf "ERROR: unrecognized clade: '%s'\n" "${clade}" 1>&2
    exit 255
    ;;
esac

if [ "${lookupRmsk}" = "yes" ]; then
  # A non-zero exit from findTaxon.pl still aborts the script (set -e),
  # exactly as the per-branch backtick assignments did.
  rmskSpecies=$("${findTaxon}" "${sciName}" 2> /dev/null)
fi

# RMSK (exported string) is what gets written to the log; rmskArg is what
# is actually handed to doAssemblyHub.pl.  Keeping the argument separate
# avoids passing literal quote characters and avoids word splitting when
# the taxon name contains a space.
rmskArg=""
if [ "${rmskSpecies}" = "xy" ]; then
  noRmsk="-noRmsk"
elif [ -z "${rmskSpecies}" ]; then
  export RMSK="-runRepeatModeler"
  rmskArg="-runRepeatModeler"
  printf '# rmskSpecies: %s NOT FOUND, running RepeatModeler\n' \
    "${sciName}" 1>&2
else
  printf '# rmskSpecies: %s -> %s\n' "${sciName}" "${rmskSpecies}" 1>&2
  RMSK='-rmskSpecies="'"${rmskSpecies}"'"'
  rmskArg="-rmskSpecies=${rmskSpecies}"
#  RMSK='-runRepeatModeler -rmskSpecies="'"${rmskSpecies}"'"'
fi

# Step range handed to doAssemblyHub.pl.  To resume or stop elsewhere,
# set these to one of the step names listed below (e.g. "sequence",
# "chromAlias", "repeatModeler", "repeatMasker", "ncbiRefSeq").
export stepStart="download"
export stepEnd="trackDb"

# download, sequence, assemblyGap, gatewayPage, cytoBand, gc5Base,
# repeatMasker, simpleRepeat, allGaps, idKeys, windowMasker, addMask,
# gapOverlap, tandemDups, cpgIslands, ncbiGene, ncbiRefSeq, xenoRefGene,
# augustus, trackDb, cleanup

export linkTop="/hive/data/genomes/asmHubs"
export TOP0="/hive/data/genomes/asmHubs"
export TOP="/hive/data/genomes/asmHubs/allBuild"
cd "${TOP}"

# GCA_ assemblies build under genbankBuild, GCF_ under refseqBuild
export gcX="${asmId:0:3}"
export topBuild="genbankBuild"
export genbankRefseq="genbank"
if [ "${gcX}" = "GCF" ]; then
  topBuild="refseqBuild"
  genbankRefseq="refseq"
fi
# the nine characters after "GCx_" taken as three triples
export d0="${asmId:4:3}"
export d1="${asmId:7:3}"
export d2="${asmId:10:3}"

export buildDir="${TOP0}/${topBuild}/${gcX}/${d0}/${d1}/${d2}/${asmId}"
export linkDir="${linkTop}/${genbankRefseq}/${subGroup}/${sciName}"

if [ -d "${buildDir}" ]; then
#  printf "# removing: %s\n" "${buildDir}" 1>&2
#  rm -fr "${buildDir}"
  if [ -s "${buildDir}/${asmId}.trackDb.txt" ]; then
    printf '# Already done %s\n' "${asmId}"
    printf '# Already done %s\n' "${asmId}" 1>&2
##    exit 0    # disabled in the original: the build continues anyway
  else
    printf '# partially done %s\n' "${asmId}"
    printf '# partially done %s\n' "${asmId}" 1>&2
  fi
fi

mkdir -p "${TOP0}/${topBuild}/buildLogs/${subGroup}"
export logFile="${TOP0}/${topBuild}/buildLogs/${subGroup}/${asmId}.log"

mkdir -p "${buildDir}" "${linkDir}"

if [ ! -L "${linkDir}/${asmId}" ]; then
  printf '@ ln -s "%s" "%s"\n' "${buildDir}" "${linkDir}" 1>&2
  ln -s "${buildDir}" "${linkDir}"
fi

ls -d "${buildDir}" "${linkDir}/${asmId}"

printf '========================= %s\n' "$(date '+%F %T')" >> "${logFile}"

# doIdKeys runs in the background alongside the main build
idKeysCmd="/hive/data/genomes/asmHubs/${genbankRefseq}Build/doIdKeys"
printf '%s "%s" &\n' "${idKeysCmd}" "${asmId}" >> "${logFile}"
"${idKeysCmd}" "${asmId}" >> "${logFile}" 2>&1 &

# Record the command line in the log (single-quoted format: $HOME and
# `pwd` are written literally, as before).
# shellcheck disable=SC2016
printf '### $HOME/kent/src/hg/utils/automation/doAssemblyHub.pl -continue="%s" -stop="%s" %s -bigClusterHub=%s -buildDir=`pwd` -fileServer=%s -smallClusterHub=%s %s %s %s %s -workhorse=%s "%s"\n' \
  "${stepStart}" "${stepEnd}" "${RMSK}" "${bigHub}" "${fileServer}" \
  "${smallHub}" "${noRmsk}" "${ncbiRmsk}" "${ucscNames}" \
  "${augustusSpecies}" "${workHorse}" "${asmId}" >> "${logFile}"

cd "${buildDir}"

if [ ! -s "${buildDir}/build.log" ]; then
  # Missing, empty, or a dangling symlink: clear whatever is there so the
  # ln below cannot fail with "File exists".
  rm -f -- "${buildDir}/build.log"
  ln -s "${logFile}" "${buildDir}/build.log"
fi

# Assemble the command as an array so every argument arrives intact.
hubCmd=("${HOME}/kent/src/hg/utils/automation/doAssemblyHub.pl"
  -continue="${stepStart}" -stop="${stepEnd}")
if [ -n "${rmskArg}" ]; then
  hubCmd+=("${rmskArg}")
fi
hubCmd+=(-bigClusterHub="${bigHub}" -buildDir="$(pwd)"
  -fileServer="${fileServer}" -smallClusterHub="${smallHub}")
# These four hold zero or more whitespace separated options, so they are
# expanded unquoted on purpose.
# shellcheck disable=SC2206
hubCmd+=(${noRmsk} ${ncbiRmsk} ${ucscNames} ${augustusSpecies})
hubCmd+=(-workhorse="${workHorse}" "${asmId}")

time ("${hubCmd[@]}") >> "${logFile}" 2>&1

export docDir="$HOME/kent/src/hg/makeDb/doc/asmHubs"
export userName
userName=$(whoami)
export runCmd="./runBuild $asmId $clade $sciName"

export lockFile="/hive/data/genomes/asmHubs/allBuild/docLock"
touch "${lockFile}"

# one at a time only to append to this file
if [ -d "${docDir}" ]; then
  # establish lock
  exec 200< "${lockFile}"
  flock -x 200 || exit 255

  printf '%s\n' "${runCmd}" >> "${docDir}/genArk.done.list"

  # NOTE(review): the lock is held during this sleep, as in the original;
  # the reason is not visible here.
  sleep 10

  # release lock
  exec 200>&-
fi

### end seconds
export endT
endT=$(date "+%s")

### Compose completion message in a private temp file; the trap removes
### it on every exit path.
msgFile=$(mktemp /tmp/send.txt.XXXXXX)
trap 'rm -f -- "${msgFile}"' EXIT

cat > "${msgFile}" << EOF
To: ${userName}
Reply-to: ${userName}
Return-path: ${userName}
Subject: genArk build done: ${asmId}

##################################################################

${runCmd}

 (runBuild)

EOF

### show elapsed time
elapsed=$((endT - startT))
printf '### elapsed time: %02dh %02dm %02ds\n\n' \
  $((elapsed / 3600)) $(((elapsed % 3600) / 60)) $((elapsed % 60)) \
  >> "${msgFile}"

date >> "${msgFile}"
printf '##################################################################\n' \
  >> "${msgFile}"

# sending is currently disabled:
# /usr/sbin/sendmail -t -oi < "${msgFile}"

rm -f -- "${msgFile}"