65a0fd8af813529a001a714b3c550b0063c9de6e
hiram
  Thu Oct 12 13:18:46 2023 -0700
re-chain the reChains and re-net the reNets refs #31561

diff --git src/hg/makeDb/doc/hg38/hprcReChain.txt src/hg/makeDb/doc/hg38/hprcReChain.txt
index 7f4a633..161b34b 100644
--- src/hg/makeDb/doc/hg38/hprcReChain.txt
+++ src/hg/makeDb/doc/hg38/hprcReChain.txt
@@ -1,108 +1,315 @@
 ##############################################################################
 ### reChain the chains obtained from the HPRC project
 ### done - 2023-08-14 - Hiram
 
 mkdir /hive/data/genomes/hg38/bed/hprc/reChain
 cd /hive/data/genomes/hg38/bed/hprc/reChain
 
 # avoid the broken one GCA_018504655.1
 ls ../bigChainToChain/hg38.chainHprc*.chain | grep -v GCA_018504655.1 \
   > chain.list
 
 # using this 'runOne' script for each one:
 #######################################################################
 
 #!/bin/bash
 
 set -beEu -o pipefail
 
 export hg38TwoBit="/hive/data/genomes/hg38/hg38.2bit"
 export chainFile="${1}"
 export inChain="../bigChainToChain/${chainFile}"
 export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'`
 
 # may already be done
 if [ -s "${chainFile}" ]; then
   exit 0
 fi
 
 export buildDir=""
 export twoBit=""
 export asmId=""
 
 case "${asmName}" in
     GCA_*) gcX="${asmName:0:3}"
        d0="${asmName:4:3}"
        d1="${asmName:7:3}"
        d2="${asmName:10:3}"
        tB="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmName}_*`"
        buildDir="`realpath ${tB}`"
        asmId="`basename $buildDir`"
        twoBit="${buildDir}/${asmId}.2bit"
        ;;
      T2T-*)
        asmId="hs1"
        buildDir="/hive/data/genomes/$asmId"
        twoBit="${buildDir}/${asmId}.2bit"
        ;;
      *)  printf "ERROR: unrecognized $asmName\n" 1>&2
        exit 255
        ;;
 esac
 
 printf "%s\n" "${chainFile}" 1>&2
 
 time (chainToAxt "${inChain}" "${hg38TwoBit}" \
    "${twoBit}" stdout | axtChain -minScore=0 -linearGap=loose \
       stdin "${hg38TwoBit}" "${twoBit}" "${chainFile}") > err/${chainFile} 2>&1
 
 hgLoadChain hg38 measureReChain "${chainFile}"
 featureBits hg38 measureReChain > featBits/fb.hg38.${asmName}.txt 2>&1
 featureBits hg38 measureReChainLink > featBits/fb.hg38.${asmName}Link.txt 2>&1
 hgsql -N -e 'select count(*) from measureReChainLink;' hg38 > counts/hg38.${asmName}.linkCount.txt
 hgsql -N -e 'select count(*) from measureReChain;' hg38 > counts/hg38.${asmName}.chainCount.txt
 
 ##############################################################
 
 to generate a jobList
 
 printf "#LOOP
 ./runOne $(path1)
 #ENDLOOP
 " > template
 
 gensub2 chain.list single template jobList
 
 # running them all, one at a time
 
 mkdir featBits counts err
 
 time (~/kent/src/hg/utils/automation/perlPara.pl 1 jobList) > do.log 2>&1 
 # real    236m9.479s
 
 ##############################################################################
 # turn those chains into nets (DONE - 2023-08-15 - Hiram)
 
 mkdir /hive/data/genomes/hg38/bed/hprc/reNet
 cd /hive/data/genomes/hg38/bed/hprc/reNet
 
 ls ../reChain  | grep hg38.chainHprc > chain.list
 
 printf '#LOOP
 runOne $(path1) {check out exists+ $(root1).net}
 #ENDLOOP
 ' > template
 
 gensub2 chain.list single template jobList
 
 para -ram=4g create jobList
 para push
 para time
 Completed: 88 of 88 jobs
 CPU time in finished jobs:        285s       4.74m     0.08h    0.00d  0.000 y
 IO & Wait Time:                   206s       3.44m     0.06h    0.00d  0.000 y
 Average job time:                   6s       0.09m     0.00h    0.00d
 Longest finished job:               8s       0.13m     0.00h    0.00d
 Submission to last job:             8s       0.13m     0.00h    0.00d
 
 ##############################################################################
+# there was some question that arose if the above was correct.
+# The procedure was repeated again (DONE 2023-10-12 - Hiram)
+
+mkdir /hive/data/genomes/hg38/bed/hprc/firstPassReChain
+cd /hive/data/genomes/hg38/bed/hprc/firstPassReChain
+
+# using this runOne script:
+
+####################################################################
+#!/bin/bash
+
+set -beEu -o pipefail
+
+export hg38TwoBit="/hive/data/genomes/hg38/hg38.2bit"
+export chainFile="${1}"
+export inChain="../bigChainToChain/${chainFile}"
+export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'`
+
+# may already be done
+if [ -s "${chainFile}" ]; then
+  exit 0
+fi
+
+export buildDir=""
+export twoBit=""
+export asmId=""
+
+case "${asmName}" in
+    GCA_*) gcX="${asmName:0:3}"
+       d0="${asmName:4:3}"
+       d1="${asmName:7:3}"
+       d2="${asmName:10:3}"
+       tB="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmName}_*`"
+       buildDir="`realpath ${tB}`"
+       asmId="`basename $buildDir`"
+       twoBit="${buildDir}/${asmId}.2bit"
+       ;;
+     T2T-*)
+       asmId="hs1"
+       buildDir="/hive/data/genomes/$asmId"
+       twoBit="${buildDir}/${asmId}.2bit"
+       ;;
+     *)  printf "ERROR: unrecognized $asmName\n" 1>&2
+       exit 255
+       ;;
+esac
+
+printf "%s\n" "${chainFile}" 1>&2
+
+time (chainToAxt "${inChain}" "${hg38TwoBit}" \
+   "${twoBit}" stdout | axtChain -minScore=0 -linearGap=loose \
+      stdin "${hg38TwoBit}" "${twoBit}" "${chainFile}") > err/${chainFile} 2>&1
+
+####################################################################
+
+# reusing the jobList from previous time:
+
+head jobList
+./runOne hg38.chainHprcGCA_018466835.1.chain
+./runOne hg38.chainHprcGCA_018466845.1.chain
+./runOne hg38.chainHprcGCA_018466855.1.chain
+./runOne hg38.chainHprcGCA_018466985.1.chain
+... etc
+./runOne hg38.chainHprcGCA_018506975.1.chain
+./runOne hg38.chainHprcGCA_018852585.1.chain
+./runOne hg38.chainHprcGCA_018852595.1.chain
+./runOne hg38.chainHprcT2T-CHM13v2.0.chain
+
+# running with perlPara.pl:
+
+time (perlPara.pl 7 jobList) > 7.log 2>&1
+# real    26m53.073s
+
+# and loading all those new chain files:
+############################
+
+#!/bin/bash
+
+set -beEu -o pipefail
+
+ls hg38.chainHprc*.chain |  while read chainFile
+do
+export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'`
+
+export buildDir=""
+export twoBit=""
+export asmId=""
+
+case "${asmName}" in
+    GCA_*) asmId=`echo $asmName | sed -e 's/\./v/;'`
+       ;;
+     T2T-*)
+       asmId="Hs1"
+       ;;
+     *)  printf "ERROR: unrecognized $asmName\n" 1>&2
+       exit 255
+       ;;
+esac
+
+printf "hgLoadChain hg38 chainHprc${asmId} ${chainFile}\n" 1>&2
+hgLoadChain hg38 chainHprc${asmId} ${chainFile}
+
+done
+
+############ and then netting thos chains
+
+mkdir /hive/data/genomes/hg38/bed/hprc/firstPassReNet
+cd /hive/data/genomes/hg38/bed/hprc/firstPassReNet
+
+# using this runOne script:
+###########################
+#!/bin/bash
+
+set -beEu -o pipefail
+
+export chain="${1}"
+export seq1Len="/hive/data/genomes/hg38/chrom.sizes"
+B=`echo $chain | sed -e 's/hg38.chainHprc//; s/.chain//;'`
+export seq2Len="xx"
+case "${B}" in
+   T2T-CHM13v2.0)
+     seq2Len="/hive/data/genomes/hs1/chrom.sizes"
+     ;;
+   GCA_*)
+     gcX="${B:0:3}"
+     d0="${B:4:3}"
+     d1="${B:7:3}"
+     d2="${B:10:3}"
+     D="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${B}_*`"
+     if [ -d "${D}" ]; then
+       asmId=`basename $D`
+       seq2Len="${D}/${asmId}.chrom.sizes"
+     else
+       printf "ERROR: can not find: $chain\n" 1>&2
+       exit 255
+     fi
+     ;;
+   *)
+     printf "ERROR: unrecognized: $chain\n" 1>&2
+     exit 255
+     ;;
+esac
+
+chainPreNet ../firstPassReChain/${chain} ${seq1Len} ${seq2Len} stdout \
+  | chainNet stdin -minSpace=1 ${seq1Len} ${seq2Len} stdout /dev/null \
+  | netSyntenic stdin hg38.chainHprc${B}.net
+
+###########################
+
+# using the jobList from before:
+head jobList
+
+runOne hg38.chainHprcGCA_018466835.1.chain {check out exists+ hg38.chainHprcGCA_018466835.1.net}
+runOne hg38.chainHprcGCA_018466845.1.chain {check out exists+ hg38.chainHprcGCA_018466845.1.net}
+runOne hg38.chainHprcGCA_018466855.1.chain {check out exists+ hg38.chainHprcGCA_018466855.1.net}
+...
+runOne hg38.chainHprcGCA_018852585.1.chain {check out exists+ hg38.chainHprcGCA_018852585.1.net}
+runOne hg38.chainHprcGCA_018852595.1.chain {check out exists+ hg38.chainHprcGCA_018852595.1.net}
+runOne hg38.chainHprcT2T-CHM13v2.0.chain {check out exists+ hg38.chainHprcT2T-CHM13v2.0.net}
+
+###########################
+
+## this can be run in parasol, it takes only a few seconds:
+para -ram=4g create jobList
+para push
+para time
+
+Completed: 88 of 88 jobs
+CPU time in finished jobs:        351s       5.85m     0.10h    0.00d  0.000 y
+IO & Wait Time:                   118s       1.96m     0.03h    0.00d  0.000 y
+Average job time:                   5s       0.09m     0.00h    0.00d
+Longest finished job:               8s       0.13m     0.00h    0.00d
+Submission to last job:            26s       0.43m     0.01h    0.00d
+
+###########################
+
+Loading these nets:
+
+#!/bin/bash
+
+set -beEu -o pipefail
+
+ls hg38.chainHprc*.net | while read netFile
+do
+export asmName=`echo $netFile | sed -e 's/hg38.chainHprc//; s/.net//;'`
+
+export buildDir=""
+export twoBit=""
+export asmId=""
+
+case "${asmName}" in
+    GCA_*) asmId=`echo $asmName | sed -e 's/\./v/;'`
+       ;;
+     T2T-*)
+       asmId="Hs1"
+       ;;
+     *)  printf "ERROR: unrecognized $asmName\n" 1>&2
+       exit 255
+       ;;
+esac
+
+printf "hgLoadNet -warn hg38 netHprc${asmId} ${netFile}\n" 1>&2
+hgLoadNet -warn hg38 netHprc${asmId} ${netFile}
+
+done
+
+##############################################################################