393037b58276445a2af3ca72e7299eb2b39d5e9d
braney
  Fri Oct 13 16:37:36 2023 -0700
move some doc files around

diff --git src/hg/makeDb/doc/hg38/hprcChain.txt src/hg/makeDb/doc/hg38/hprcChain.txt
index b367bdd..161b34b 100644
--- src/hg/makeDb/doc/hg38/hprcChain.txt
+++ src/hg/makeDb/doc/hg38/hprcChain.txt
@@ -1,66 +1,315 @@
-# make chains out of the PAF files from Erik Garrison
-
-mkdir /cluster/data/hg38/bed/hprcChain
-
-# get PAF files from Erik Garrison <erik.garrison@gmail.com>
-wget "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/scratch/2021_11_16_pggb_wgg.88/untangle/pggb.wgg.88.all.vs.chm13.untangle-m10000-s0-j0.paf.gz"
-wget https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/scratch/2021_11_16_pggb_wgg.88/untangle/pggb.wgg.88.all.vs.grch38.untangle-m10000-s0-j0.paf.gz
-
-
-gunzip *.gz
-paf2chain -i pggb.wgg.88.all.vs.chm13.untangle-m10000-s0-j0.paf > pggb.wgg.88.all.vs.chm13.untangle-m10000-s0-j0.chain
-paf2chain -i pggb.wgg.88.all.vs.grch38.untangle-m10000-s0-j0.paf > pggb.wgg.88.all.vs.grch38.untangle-m10000-s0-j0.chain
-
-cat << __EOF__ > makeBigChain
-source=../../$1
-reference=$2
-other=$3
-twoBitDir=../../2bits
-other2bit=`echo $other | tr '#' '.'`.2bit
-
-mkdir -p $reference/$other
-cd $reference/$other
-echo cd $reference/$other
-awk -v other=$other '/^chain/ {if (index($8,other)) doPrint=1; else doPrint=0;} {if (doPrint) print} ' $source | sed "s/$other#//" | sed "s/$reference#//" > $other.chain
-chainToAxt $other.chain $twoBitDir/$reference.2bit $twoBitDir/$other2bit /tmp/$reference.$other.axt
-axtChain -linearGap=loose /tmp/$reference.$other.axt $twoBitDir/$reference.2bit $twoBitDir/$other2bit $other.re.chain
-rm /tmp/$reference.$other.axt
-hgLoadChain -noBin -test $reference bigChain $other.re.chain
-sed 's/\.000000//' chain.tab | awk 'BEGIN {OFS="\t"} {print $2, $4, $5, $11, 1000, $8, $3, $6, $7, $9, $10, $1}' > $other.re.bigChain
-bedToBigBed -type=bed6+6 -as=$HOME/kent/src/hg/lib/bigChain.as -tab $other.re.bigChain ../../sizes/$reference.chrom.sizes bigChain.bb
-awk 'BEGIN {OFS="\t"} {print $1, $2, $3, $5, $4}' link.tab | sort -k1,1 -k2,2n > bigChain.bigLink
-bedToBigBed -type=bed4+1 -as=$HOME/kent/src/hg/lib/bigLink.as -tab bigChain.bigLink ../../sizes/$reference.chrom.sizes bigChain.link.bb 
-__EOF__
-
-chmod +x makeBigChain
-
-for ass in chm13 grch38
-do
-source=pggb.wgg.88.all.vs.$ass.untangle-m10000-s0-j0.chain 
-#awk '/^chain/ {print $8}' $source | sed 's/#.*//' | sort -u > $ass.other.txt
-awk '/^chain/ {print $8}' $source | tr '#' ' ' | awk '{if (NF == 2) {print $1} else {print $1 "#" $2}}' | sort -u >   $ass.other.txt
-for other in `cat $ass.other.txt`
+##############################################################################
+### reChain the chains obtained from the HPRC project
+### done - 2023-08-14 - Hiram
+
+mkdir /hive/data/genomes/hg38/bed/hprc/reChain
+cd /hive/data/genomes/hg38/bed/hprc/reChain
+
+# avoid the broken one GCA_018504655.1
+ls ../bigChainToChain/hg38.chainHprc*.chain | grep -v GCA_018504655.1 \
+  > chain.list
+
+# using this 'runOne' script for each one:
+#######################################################################
+
+#!/bin/bash
+
+set -beEu -o pipefail
+
+export hg38TwoBit="/hive/data/genomes/hg38/hg38.2bit"
+export chainFile="${1}"
+export inChain="../bigChainToChain/${chainFile}"
+export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'`
+
+# may already be done
+if [ -s "${chainFile}" ]; then
+  exit 0
+fi
+
+export buildDir=""
+export twoBit=""
+export asmId=""
+
+case "${asmName}" in
+    GCA_*) gcX="${asmName:0:3}"
+       d0="${asmName:4:3}"
+       d1="${asmName:7:3}"
+       d2="${asmName:10:3}"
+       tB="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmName}_*`"
+       buildDir="`realpath ${tB}`"
+       asmId="`basename $buildDir`"
+       twoBit="${buildDir}/${asmId}.2bit"
+       ;;
+     T2T-*)
+       asmId="hs1"
+       buildDir="/hive/data/genomes/$asmId"
+       twoBit="${buildDir}/${asmId}.2bit"
+       ;;
+     *)  printf "ERROR: unrecognized $asmName\n" 1>&2
+       exit 255
+       ;;
+esac
+
+printf "%s\n" "${chainFile}" 1>&2
+
+time (chainToAxt "${inChain}" "${hg38TwoBit}" \
+   "${twoBit}" stdout | axtChain -minScore=0 -linearGap=loose \
+      stdin "${hg38TwoBit}" "${twoBit}" "${chainFile}") > err/${chainFile} 2>&1
+
+hgLoadChain hg38 measureReChain "${chainFile}"
+featureBits hg38 measureReChain > featBits/fb.hg38.${asmName}.txt 2>&1
+featureBits hg38 measureReChainLink > featBits/fb.hg38.${asmName}Link.txt 2>&1
+hgsql -N -e 'select count(*) from measureReChainLink;' hg38 > counts/hg38.${asmName}.linkCount.txt
+hgsql -N -e 'select count(*) from measureReChain;' hg38 > counts/hg38.${asmName}.chainCount.txt
+
+##############################################################
+
+to generate a jobList
+
+printf "#LOOP
+./runOne $(path1)
+#ENDLOOP
+" > template
+
+gensub2 chain.list single template jobList
+
+# running them all, one at a time
+
+mkdir featBits counts err
+
+time (~/kent/src/hg/utils/automation/perlPara.pl 1 jobList) > do.log 2>&1 
+# real    236m9.479s
+
+##############################################################################
+# turn those chains into nets (DONE - 2023-08-15 - Hiram)
+
+mkdir /hive/data/genomes/hg38/bed/hprc/reNet
+cd /hive/data/genomes/hg38/bed/hprc/reNet
+
+ls ../reChain  | grep hg38.chainHprc > chain.list
+
+printf '#LOOP
+runOne $(path1) {check out exists+ $(root1).net}
+#ENDLOOP
+' > template
+
+gensub2 chain.list single template jobList
+
+para -ram=4g create jobList
+para push
+para time
+Completed: 88 of 88 jobs
+CPU time in finished jobs:        285s       4.74m     0.08h    0.00d  0.000 y
+IO & Wait Time:                   206s       3.44m     0.06h    0.00d  0.000 y
+Average job time:                   6s       0.09m     0.00h    0.00d
+Longest finished job:               8s       0.13m     0.00h    0.00d
+Submission to last job:             8s       0.13m     0.00h    0.00d
+
+##############################################################################
+# there was some question that arose if the above was correct.
+# The procedure was repeated again (DONE 2023-10-12 - Hiram)
+
+mkdir /hive/data/genomes/hg38/bed/hprc/firstPassReChain
+cd /hive/data/genomes/hg38/bed/hprc/firstPassReChain
+
+# using this runOne script:
+
+####################################################################
+#!/bin/bash
+
+set -beEu -o pipefail
+
+export hg38TwoBit="/hive/data/genomes/hg38/hg38.2bit"
+export chainFile="${1}"
+export inChain="../bigChainToChain/${chainFile}"
+export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'`
+
+# may already be done
+if [ -s "${chainFile}" ]; then
+  exit 0
+fi
+
+export buildDir=""
+export twoBit=""
+export asmId=""
+
+case "${asmName}" in
+    GCA_*) gcX="${asmName:0:3}"
+       d0="${asmName:4:3}"
+       d1="${asmName:7:3}"
+       d2="${asmName:10:3}"
+       tB="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmName}_*`"
+       buildDir="`realpath ${tB}`"
+       asmId="`basename $buildDir`"
+       twoBit="${buildDir}/${asmId}.2bit"
+       ;;
+     T2T-*)
+       asmId="hs1"
+       buildDir="/hive/data/genomes/$asmId"
+       twoBit="${buildDir}/${asmId}.2bit"
+       ;;
+     *)  printf "ERROR: unrecognized $asmName\n" 1>&2
+       exit 255
+       ;;
+esac
+
+printf "%s\n" "${chainFile}" 1>&2
+
+time (chainToAxt "${inChain}" "${hg38TwoBit}" \
+   "${twoBit}" stdout | axtChain -minScore=0 -linearGap=loose \
+      stdin "${hg38TwoBit}" "${twoBit}" "${chainFile}") > err/${chainFile} 2>&1
+
+####################################################################
+
+# reusing the jobList from previous time:
+
+head jobList
+./runOne hg38.chainHprcGCA_018466835.1.chain
+./runOne hg38.chainHprcGCA_018466845.1.chain
+./runOne hg38.chainHprcGCA_018466855.1.chain
+./runOne hg38.chainHprcGCA_018466985.1.chain
+... etc
+./runOne hg38.chainHprcGCA_018506975.1.chain
+./runOne hg38.chainHprcGCA_018852585.1.chain
+./runOne hg38.chainHprcGCA_018852595.1.chain
+./runOne hg38.chainHprcT2T-CHM13v2.0.chain
+
+# running with perlPara.pl:
+
+time (perlPara.pl 7 jobList) > 7.log 2>&1
+# real    26m53.073s
+
+# and loading all those new chain files:
+############################
+
+#!/bin/bash
+
+set -beEu -o pipefail
+
+ls hg38.chainHprc*.chain |  while read chainFile
 do
-echo makeBigChain $source $ass $other
+export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'`
+
+export buildDir=""
+export twoBit=""
+export asmId=""
+
+case "${asmName}" in
+    GCA_*) asmId=`echo $asmName | sed -e 's/\./v/;'`
+       ;;
+     T2T-*)
+       asmId="Hs1"
+       ;;
+     *)  printf "ERROR: unrecognized $asmName\n" 1>&2
+       exit 255
+       ;;
+esac
+
+printf "hgLoadChain hg38 chainHprc${asmId} ${chainFile}\n" 1>&2
+hgLoadChain hg38 chainHprc${asmId} ${chainFile}
+
 done
-done > jobs
 
-para create jobs
+############ and then netting thos chains
+
+mkdir /hive/data/genomes/hg38/bed/hprc/firstPassReNet
+cd /hive/data/genomes/hg38/bed/hprc/firstPassReNet
 
-cd grch38
-for i in *; do  for j in $i/*link.bb; do  ln -s `pwd`/$j /gbdb/hg38/hprcChain/$i.link.bb; done;done
-for i in *; do  for j in $i/*n.bb; do  ln -s `pwd`/$j /gbdb/hg38/hprcChain/$i.bb; done;done
+# using this runOne script:
+###########################
+#!/bin/bash
 
+set -beEu -o pipefail
 
-cat << __EOF__ > trackDb.header.ra
-track hprcChain
-compositeTrack on
-shortLabel HPRC Chains
-longLabel HPRC Chains
-type bed 3
-__EOF__
+export chain="${1}"
+export seq1Len="/hive/data/genomes/hg38/chrom.sizes"
+B=`echo $chain | sed -e 's/hg38.chainHprc//; s/.chain//;'`
+export seq2Len="xx"
+case "${B}" in
+   T2T-CHM13v2.0)
+     seq2Len="/hive/data/genomes/hs1/chrom.sizes"
+     ;;
+   GCA_*)
+     gcX="${B:0:3}"
+     d0="${B:4:3}"
+     d1="${B:7:3}"
+     d2="${B:10:3}"
+     D="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${B}_*`"
+     if [ -d "${D}" ]; then
+       asmId=`basename $D`
+       seq2Len="${D}/${asmId}.chrom.sizes"
+     else
+       printf "ERROR: can not find: $chain\n" 1>&2
+       exit 255
+     fi
+     ;;
+   *)
+     printf "ERROR: unrecognized: $chain\n" 1>&2
+     exit 255
+     ;;
+esac
 
-cp trackDb.header.ra $HOME/kent/src/hg/makeDb/trackDb/human/hg38/hprcChain.ra
+chainPreNet ../firstPassReChain/${chain} ${seq1Len} ${seq2Len} stdout \
+  | chainNet stdin -minSpace=1 ${seq1Len} ${seq2Len} stdout /dev/null \
+  | netSyntenic stdin hg38.chainHprc${B}.net
 
-for i in *; do echo track "$i"Chain; echo "type bigChain"; echo "bigDataUrl /gbdb/hg38/hprcChain/$i.bb";  echo "linkDataUrl /gbdb/hg38/hprcChain/$i.link.bb"; echo shortLabel $i;echo longLabel $i; echo subTrack hprcChain; echo ""; done >> $HOME/kent/src/hg/makeDb/trackDb/human/hg38/hprcChain.ra
+###########################
+
+# using the jobList from before:
+head jobList
+
+runOne hg38.chainHprcGCA_018466835.1.chain {check out exists+ hg38.chainHprcGCA_018466835.1.net}
+runOne hg38.chainHprcGCA_018466845.1.chain {check out exists+ hg38.chainHprcGCA_018466845.1.net}
+runOne hg38.chainHprcGCA_018466855.1.chain {check out exists+ hg38.chainHprcGCA_018466855.1.net}
+...
+runOne hg38.chainHprcGCA_018852585.1.chain {check out exists+ hg38.chainHprcGCA_018852585.1.net}
+runOne hg38.chainHprcGCA_018852595.1.chain {check out exists+ hg38.chainHprcGCA_018852595.1.net}
+runOne hg38.chainHprcT2T-CHM13v2.0.chain {check out exists+ hg38.chainHprcT2T-CHM13v2.0.net}
+
+###########################
+
+## this can be run in parasol, it takes only a few seconds:
+para -ram=4g create jobList
+para push
+para time
+
+Completed: 88 of 88 jobs
+CPU time in finished jobs:        351s       5.85m     0.10h    0.00d  0.000 y
+IO & Wait Time:                   118s       1.96m     0.03h    0.00d  0.000 y
+Average job time:                   5s       0.09m     0.00h    0.00d
+Longest finished job:               8s       0.13m     0.00h    0.00d
+Submission to last job:            26s       0.43m     0.01h    0.00d
+
+###########################
+
+Loading these nets:
+
+#!/bin/bash
+
+set -beEu -o pipefail
+
+ls hg38.chainHprc*.net | while read netFile
+do
+export asmName=`echo $netFile | sed -e 's/hg38.chainHprc//; s/.net//;'`
+
+export buildDir=""
+export twoBit=""
+export asmId=""
+
+case "${asmName}" in
+    GCA_*) asmId=`echo $asmName | sed -e 's/\./v/;'`
+       ;;
+     T2T-*)
+       asmId="Hs1"
+       ;;
+     *)  printf "ERROR: unrecognized $asmName\n" 1>&2
+       exit 255
+       ;;
+esac
+
+printf "hgLoadNet -warn hg38 netHprc${asmId} ${netFile}\n" 1>&2
+hgLoadNet -warn hg38 netHprc${asmId} ${netFile}
+
+done
 
+##############################################################################