7f114623aa512095de2a6a83cab942ca3829ea60 hiram Tue Aug 15 10:18:17 2023 -0700 reChain the chains refs #31561 diff --git src/hg/makeDb/doc/hg38/hprcReChain.txt src/hg/makeDb/doc/hg38/hprcReChain.txt new file mode 100644 index 0000000..7f4a633 --- /dev/null +++ src/hg/makeDb/doc/hg38/hprcReChain.txt @@ -0,0 +1,108 @@ +############################################################################## +### reChain the chains obtained from the HPRC project +### done - 2023-08-14 - Hiram + +mkdir /hive/data/genomes/hg38/bed/hprc/reChain +cd /hive/data/genomes/hg38/bed/hprc/reChain + +# avoid the broken one GCA_018504655.1 +ls ../bigChainToChain/hg38.chainHprc*.chain | grep -v GCA_018504655.1 \ + > chain.list + +# using this 'runOne' script for each one: +####################################################################### + +#!/bin/bash + +set -beEu -o pipefail + +export hg38TwoBit="/hive/data/genomes/hg38/hg38.2bit" +export chainFile="${1}" +export inChain="../bigChainToChain/${chainFile}" +export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'` + +# may already be done +if [ -s "${chainFile}" ]; then + exit 0 +fi + +export buildDir="" +export twoBit="" +export asmId="" + +case "${asmName}" in + GCA_*) gcX="${asmName:0:3}" + d0="${asmName:4:3}" + d1="${asmName:7:3}" + d2="${asmName:10:3}" + tB="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmName}_*`" + buildDir="`realpath ${tB}`" + asmId="`basename $buildDir`" + twoBit="${buildDir}/${asmId}.2bit" + ;; + T2T-*) + asmId="hs1" + buildDir="/hive/data/genomes/$asmId" + twoBit="${buildDir}/${asmId}.2bit" + ;; + *) printf "ERROR: unrecognized $asmName\n" 1>&2 + exit 255 + ;; +esac + +printf "%s\n" "${chainFile}" 1>&2 + +time (chainToAxt "${inChain}" "${hg38TwoBit}" \ + "${twoBit}" stdout | axtChain -minScore=0 -linearGap=loose \ + stdin "${hg38TwoBit}" "${twoBit}" "${chainFile}") > err/${chainFile} 2>&1 + +hgLoadChain hg38 measureReChain "${chainFile}" +featureBits hg38 measureReChain > featBits/fb.hg38.${asmName}.txt 2>&1 +featureBits hg38 measureReChainLink > featBits/fb.hg38.${asmName}Link.txt 2>&1 +hgsql -N -e 'select count(*) from measureReChainLink;' hg38 > counts/hg38.${asmName}.linkCount.txt +hgsql -N -e 'select count(*) from measureReChain;' hg38 > counts/hg38.${asmName}.chainCount.txt + +############################################################## + +to generate a jobList + +printf "#LOOP +./runOne $(path1) +#ENDLOOP +" > template + +gensub2 chain.list single template jobList + +# running them all, one at a time + +mkdir featBits counts err + +time (~/kent/src/hg/utils/automation/perlPara.pl 1 jobList) > do.log 2>&1 +# real 236m9.479s + +############################################################################## +# turn those chains into nets (DONE - 2023-08-15 - Hiram) + +mkdir /hive/data/genomes/hg38/bed/hprc/reNet +cd /hive/data/genomes/hg38/bed/hprc/reNet + +ls ../reChain | grep hg38.chainHprc > chain.list + +printf '#LOOP +runOne $(path1) {check out exists+ $(root1).net} +#ENDLOOP +' > template + +gensub2 chain.list single template jobList + +para -ram=4g create jobList +para push +para time +Completed: 88 of 88 jobs +CPU time in finished jobs: 285s 4.74m 0.08h 0.00d 0.000 y +IO & Wait Time: 206s 3.44m 0.06h 0.00d 0.000 y +Average job time: 6s 0.09m 0.00h 0.00d +Longest finished job: 8s 0.13m 0.00h 0.00d +Submission to last job: 8s 0.13m 0.00h 0.00d + +##############################################################################