2e2674507b0968331df3f081e65b08678f860ca3
braney
  Sat Oct 14 16:45:14 2023 -0700
some cleanup of the HPRC tracks

diff --git src/hg/makeDb/doc/hg38/hprcChain.txt src/hg/makeDb/doc/hg38/hprcChain.txt
index 161b34b..c2d8083 100644
--- src/hg/makeDb/doc/hg38/hprcChain.txt
+++ src/hg/makeDb/doc/hg38/hprcChain.txt
@@ -1,315 +1,137 @@
 ##############################################################################
-### reChain the chains obtained from the HPRC project
+### chain the chains obtained from the HPRC project
 ### done - 2023-08-14 - Hiram
 
-mkdir /hive/data/genomes/hg38/bed/hprc/reChain
-cd /hive/data/genomes/hg38/bed/hprc/reChain
+mkdir /hive/data/genomes/hg38/bed/hprc/chain
+cd /hive/data/genomes/hg38/bed/hprc/chain
 
 # avoid the broken one GCA_018504655.1
 ls ../bigChainToChain/hg38.chainHprc*.chain | grep -v GCA_018504655.1 \
   > chain.list
 
 # using this 'runOne' script for each one:
 #######################################################################
 
 #!/bin/bash
 
 set -beEu -o pipefail
 
 export hg38TwoBit="/hive/data/genomes/hg38/hg38.2bit"
 export chainFile="${1}"
 export inChain="../bigChainToChain/${chainFile}"
 export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'`
 
 # may already be done
 if [ -s "${chainFile}" ]; then
   exit 0
 fi
 
 export buildDir=""
 export twoBit=""
 export asmId=""
 
 case "${asmName}" in
     GCA_*) gcX="${asmName:0:3}"
        d0="${asmName:4:3}"
        d1="${asmName:7:3}"
        d2="${asmName:10:3}"
        tB="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmName}_*`"
        buildDir="`realpath ${tB}`"
        asmId="`basename $buildDir`"
        twoBit="${buildDir}/${asmId}.2bit"
        ;;
      T2T-*)
        asmId="hs1"
        buildDir="/hive/data/genomes/$asmId"
        twoBit="${buildDir}/${asmId}.2bit"
        ;;
      *)  printf "ERROR: unrecognized $asmName\n" 1>&2
        exit 255
        ;;
 esac
 
 printf "%s\n" "${chainFile}" 1>&2
 
 time (chainToAxt "${inChain}" "${hg38TwoBit}" \
    "${twoBit}" stdout | axtChain -minScore=0 -linearGap=loose \
-      stdin "${hg38TwoBit}" "${twoBit}" "${chainFile}") > err/${chainFile} 2>&1
+      stdin "${hg38TwoBit}" "${twoBit}" stdout | chainDupFilter stdin /scratch/tmp/${asmName}.noDup.chain /scratch/tmp/${asmName}.dup.chain ) > err/${chainFile} 2>&1
+time (chainToAxt /scratch/tmp/${asmName}.dup.chain   "${hg38TwoBit}" \
+   "${twoBit}" stdout | axtChain -minScore=0 -linearGap=loose \
+      stdin "${hg38TwoBit}" "${twoBit}" stdout | cat - /scratch/tmp/${asmName}.noDup.chain | chainSort stdin stdout | awk '/chain/ {$13=NR} { print}' > "${chainFile}"  ) > err/${chainFile} 2>&1
+rm -f /scratch/tmp/${asmName}.dup.chain /scratch/tmp/${asmName}.noDup.chain
 
-hgLoadChain hg38 measureReChain "${chainFile}"
-featureBits hg38 measureReChain > featBits/fb.hg38.${asmName}.txt 2>&1
-featureBits hg38 measureReChainLink > featBits/fb.hg38.${asmName}Link.txt 2>&1
-hgsql -N -e 'select count(*) from measureReChainLink;' hg38 > counts/hg38.${asmName}.linkCount.txt
-hgsql -N -e 'select count(*) from measureReChain;' hg38 > counts/hg38.${asmName}.chainCount.txt
 
 ##############################################################
 
 to generate a jobList
 
 printf "#LOOP
 ./runOne $(path1)
 #ENDLOOP
 " > template
 
 gensub2 chain.list single template jobList
 
 # running them all, one at a time
 
 mkdir featBits counts err
 
 time (~/kent/src/hg/utils/automation/perlPara.pl 1 jobList) > do.log 2>&1 
 # real    236m9.479s
 
 ##############################################################################
 # turn those chains into nets (DONE - 2023-08-15 - Hiram)
 
-mkdir /hive/data/genomes/hg38/bed/hprc/reNet
-cd /hive/data/genomes/hg38/bed/hprc/reNet
+mkdir /hive/data/genomes/hg38/bed/hprc/net
+cd /hive/data/genomes/hg38/bed/hprc/net
 
-ls ../reChain  | grep hg38.chainHprc > chain.list
+ls ../chain  | grep hg38.chainHprc > chain.list
 
 printf '#LOOP
 runOne $(path1) {check out exists+ $(root1).net}
 #ENDLOOP
 ' > template
 
 gensub2 chain.list single template jobList
 
 para -ram=4g create jobList
 para push
 para time
 Completed: 88 of 88 jobs
 CPU time in finished jobs:        285s       4.74m     0.08h    0.00d  0.000 y
 IO & Wait Time:                   206s       3.44m     0.06h    0.00d  0.000 y
 Average job time:                   6s       0.09m     0.00h    0.00d
 Longest finished job:               8s       0.13m     0.00h    0.00d
 Submission to last job:             8s       0.13m     0.00h    0.00d
 
-##############################################################################
-# there was some question that arose if the above was correct.
-# The procedure was repeated again (DONE 2023-10-12 - Hiram)
-
-mkdir /hive/data/genomes/hg38/bed/hprc/firstPassReChain
-cd /hive/data/genomes/hg38/bed/hprc/firstPassReChain
-
-# using this runOne script:
-
-####################################################################
-#!/bin/bash
-
-set -beEu -o pipefail
-
-export hg38TwoBit="/hive/data/genomes/hg38/hg38.2bit"
-export chainFile="${1}"
-export inChain="../bigChainToChain/${chainFile}"
-export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'`
-
-# may already be done
-if [ -s "${chainFile}" ]; then
-  exit 0
-fi
-
-export buildDir=""
-export twoBit=""
-export asmId=""
-
-case "${asmName}" in
-    GCA_*) gcX="${asmName:0:3}"
-       d0="${asmName:4:3}"
-       d1="${asmName:7:3}"
-       d2="${asmName:10:3}"
-       tB="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmName}_*`"
-       buildDir="`realpath ${tB}`"
-       asmId="`basename $buildDir`"
-       twoBit="${buildDir}/${asmId}.2bit"
-       ;;
-     T2T-*)
-       asmId="hs1"
-       buildDir="/hive/data/genomes/$asmId"
-       twoBit="${buildDir}/${asmId}.2bit"
-       ;;
-     *)  printf "ERROR: unrecognized $asmName\n" 1>&2
-       exit 255
-       ;;
-esac
-
-printf "%s\n" "${chainFile}" 1>&2
-
-time (chainToAxt "${inChain}" "${hg38TwoBit}" \
-   "${twoBit}" stdout | axtChain -minScore=0 -linearGap=loose \
-      stdin "${hg38TwoBit}" "${twoBit}" "${chainFile}") > err/${chainFile} 2>&1
-
-####################################################################
-
-# reusing the jobList from previous time:
-
-head jobList
-./runOne hg38.chainHprcGCA_018466835.1.chain
-./runOne hg38.chainHprcGCA_018466845.1.chain
-./runOne hg38.chainHprcGCA_018466855.1.chain
-./runOne hg38.chainHprcGCA_018466985.1.chain
-... etc
-./runOne hg38.chainHprcGCA_018506975.1.chain
-./runOne hg38.chainHprcGCA_018852585.1.chain
-./runOne hg38.chainHprcGCA_018852595.1.chain
-./runOne hg38.chainHprcT2T-CHM13v2.0.chain
-
-# running with perlPara.pl:
-
-time (perlPara.pl 7 jobList) > 7.log 2>&1
-# real    26m53.073s
-
-# and loading all those new chain files:
-############################
-
-#!/bin/bash
-
-set -beEu -o pipefail
-
-ls hg38.chainHprc*.chain |  while read chainFile
-do
-export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'`
-
-export buildDir=""
-export twoBit=""
-export asmId=""
-
-case "${asmName}" in
-    GCA_*) asmId=`echo $asmName | sed -e 's/\./v/;'`
-       ;;
-     T2T-*)
-       asmId="Hs1"
-       ;;
-     *)  printf "ERROR: unrecognized $asmName\n" 1>&2
-       exit 255
-       ;;
-esac
-
-printf "hgLoadChain hg38 chainHprc${asmId} ${chainFile}\n" 1>&2
-hgLoadChain hg38 chainHprc${asmId} ${chainFile}
-
-done
-
-############ and then netting thos chains
-
-mkdir /hive/data/genomes/hg38/bed/hprc/firstPassReNet
-cd /hive/data/genomes/hg38/bed/hprc/firstPassReNet
-
-# using this runOne script:
-###########################
-#!/bin/bash
-
-set -beEu -o pipefail
-
-export chain="${1}"
-export seq1Len="/hive/data/genomes/hg38/chrom.sizes"
-B=`echo $chain | sed -e 's/hg38.chainHprc//; s/.chain//;'`
-export seq2Len="xx"
-case "${B}" in
-   T2T-CHM13v2.0)
-     seq2Len="/hive/data/genomes/hs1/chrom.sizes"
-     ;;
-   GCA_*)
-     gcX="${B:0:3}"
-     d0="${B:4:3}"
-     d1="${B:7:3}"
-     d2="${B:10:3}"
-     D="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${B}_*`"
-     if [ -d "${D}" ]; then
-       asmId=`basename $D`
-       seq2Len="${D}/${asmId}.chrom.sizes"
-     else
-       printf "ERROR: can not find: $chain\n" 1>&2
-       exit 255
-     fi
-     ;;
-   *)
-     printf "ERROR: unrecognized: $chain\n" 1>&2
-     exit 255
-     ;;
-esac
-
-chainPreNet ../firstPassReChain/${chain} ${seq1Len} ${seq2Len} stdout \
-  | chainNet stdin -minSpace=1 ${seq1Len} ${seq2Len} stdout /dev/null \
-  | netSyntenic stdin hg38.chainHprc${B}.net
-
-###########################
-
-# using the jobList from before:
-head jobList
-
-runOne hg38.chainHprcGCA_018466835.1.chain {check out exists+ hg38.chainHprcGCA_018466835.1.net}
-runOne hg38.chainHprcGCA_018466845.1.chain {check out exists+ hg38.chainHprcGCA_018466845.1.net}
-runOne hg38.chainHprcGCA_018466855.1.chain {check out exists+ hg38.chainHprcGCA_018466855.1.net}
-...
-runOne hg38.chainHprcGCA_018852585.1.chain {check out exists+ hg38.chainHprcGCA_018852585.1.net}
-runOne hg38.chainHprcGCA_018852595.1.chain {check out exists+ hg38.chainHprcGCA_018852595.1.net}
-runOne hg38.chainHprcT2T-CHM13v2.0.chain {check out exists+ hg38.chainHprcT2T-CHM13v2.0.net}
-
-###########################
-
-## this can be run in parasol, it takes only a few seconds:
-para -ram=4g create jobList
-para push
-para time
-
-Completed: 88 of 88 jobs
-CPU time in finished jobs:        351s       5.85m     0.10h    0.00d  0.000 y
-IO & Wait Time:                   118s       1.96m     0.03h    0.00d  0.000 y
-Average job time:                   5s       0.09m     0.00h    0.00d
-Longest finished job:               8s       0.13m     0.00h    0.00d
-Submission to last job:            26s       0.43m     0.01h    0.00d
-
 ###########################
 
 Loading these nets:
 
 #!/bin/bash
 
 set -beEu -o pipefail
 
 ls hg38.chainHprc*.net | while read netFile
 do
 export asmName=`echo $netFile | sed -e 's/hg38.chainHprc//; s/.net//;'`
 
 export buildDir=""
 export twoBit=""
 export asmId=""
 
 case "${asmName}" in
     GCA_*) asmId=`echo $asmName | sed -e 's/\./v/;'`
        ;;
      T2T-*)
        asmId="Hs1"
        ;;
      *)  printf "ERROR: unrecognized $asmName\n" 1>&2
        exit 255
        ;;
 esac
 
 printf "hgLoadNet -warn hg38 netHprc${asmId} ${netFile}\n" 1>&2
 hgLoadNet -warn hg38 netHprc${asmId} ${netFile}
 
 done
-
-##############################################################################