393037b58276445a2af3ca72e7299eb2b39d5e9d braney Fri Oct 13 16:37:36 2023 -0700 move some doc files around diff --git src/hg/makeDb/doc/hg38/hprcChain.txt src/hg/makeDb/doc/hg38/hprcChain.txt index b367bdd..161b34b 100644 --- src/hg/makeDb/doc/hg38/hprcChain.txt +++ src/hg/makeDb/doc/hg38/hprcChain.txt @@ -1,66 +1,315 @@ -# make chains out of the PAF files from Erik Garrison - -mkdir /cluster/data/hg38/bed/hprcChain - -# get PAF files from Erik Garrison -wget "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/scratch/2021_11_16_pggb_wgg.88/untangle/pggb.wgg.88.all.vs.chm13.untangle-m10000-s0-j0.paf.gz" -wget https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/scratch/2021_11_16_pggb_wgg.88/untangle/pggb.wgg.88.all.vs.grch38.untangle-m10000-s0-j0.paf.gz - - -gunzip *.gz -paf2chain -i pggb.wgg.88.all.vs.chm13.untangle-m10000-s0-j0.paf > pggb.wgg.88.all.vs.chm13.untangle-m10000-s0-j0.chain -paf2chain -i pggb.wgg.88.all.vs.grch38.untangle-m10000-s0-j0.paf > pggb.wgg.88.all.vs.grch38.untangle-m10000-s0-j0.chain - -cat << __EOF__ > makeBigChain -source=../../$1 -reference=$2 -other=$3 -twoBitDir=../../2bits -other2bit=`echo $other | tr '#' '.'`.2bit - -mkdir -p $reference/$other -cd $reference/$other -echo cd $reference/$other -awk -v other=$other '/^chain/ {if (index($8,other)) doPrint=1; else doPrint=0;} {if (doPrint) print} ' $source | sed "s/$other#//" | sed "s/$reference#//" > $other.chain -chainToAxt $other.chain $twoBitDir/$reference.2bit $twoBitDir/$other2bit /tmp/$reference.$other.axt -axtChain -linearGap=loose /tmp/$reference.$other.axt $twoBitDir/$reference.2bit $twoBitDir/$other2bit $other.re.chain -rm /tmp/$reference.$other.axt -hgLoadChain -noBin -test $reference bigChain $other.re.chain -sed 's/\.000000//' chain.tab | awk 'BEGIN {OFS="\t"} {print $2, $4, $5, $11, 1000, $8, $3, $6, $7, $9, $10, $1}' > $other.re.bigChain -bedToBigBed -type=bed6+6 -as=$HOME/kent/src/hg/lib/bigChain.as -tab $other.re.bigChain ../../sizes/$reference.chrom.sizes bigChain.bb -awk 'BEGIN {OFS="\t"} {print $1, $2, $3, $5, $4}' link.tab | sort -k1,1 -k2,2n > bigChain.bigLink -bedToBigBed -type=bed4+1 -as=$HOME/kent/src/hg/lib/bigLink.as -tab bigChain.bigLink ../../sizes/$reference.chrom.sizes bigChain.link.bb -__EOF__ - -chmod +x makeBigChain - -for ass in chm13 grch38 -do -source=pggb.wgg.88.all.vs.$ass.untangle-m10000-s0-j0.chain -#awk '/^chain/ {print $8}' $source | sed 's/#.*//' | sort -u > $ass.other.txt -awk '/^chain/ {print $8}' $source | tr '#' ' ' | awk '{if (NF == 2) {print $1} else {print $1 "#" $2}}' | sort -u > $ass.other.txt -for other in `cat $ass.other.txt` +############################################################################## +### reChain the chains obtained from the HPRC project +### done - 2023-08-14 - Hiram + +mkdir /hive/data/genomes/hg38/bed/hprc/reChain +cd /hive/data/genomes/hg38/bed/hprc/reChain + +# avoid the broken one GCA_018504655.1 +ls ../bigChainToChain/hg38.chainHprc*.chain | grep -v GCA_018504655.1 \ + > chain.list + +# using this 'runOne' script for each one: +####################################################################### + +#!/bin/bash + +set -beEu -o pipefail + +export hg38TwoBit="/hive/data/genomes/hg38/hg38.2bit" +export chainFile="${1}" +export inChain="../bigChainToChain/${chainFile}" +export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'` + +# may already be done +if [ -s "${chainFile}" ]; then + exit 0 +fi + +export buildDir="" +export twoBit="" +export asmId="" + +case "${asmName}" in + GCA_*) gcX="${asmName:0:3}" + d0="${asmName:4:3}" + d1="${asmName:7:3}" + d2="${asmName:10:3}" + tB="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmName}_*`" + buildDir="`realpath ${tB}`" + asmId="`basename $buildDir`" + twoBit="${buildDir}/${asmId}.2bit" + ;; + T2T-*) + asmId="hs1" + buildDir="/hive/data/genomes/$asmId" + twoBit="${buildDir}/${asmId}.2bit" + ;; + *) printf "ERROR: unrecognized $asmName\n" 1>&2 + exit 255 + ;; +esac + +printf "%s\n" "${chainFile}" 1>&2 + +time (chainToAxt "${inChain}" "${hg38TwoBit}" \ + "${twoBit}" stdout | axtChain -minScore=0 -linearGap=loose \ + stdin "${hg38TwoBit}" "${twoBit}" "${chainFile}") > err/${chainFile} 2>&1 + +hgLoadChain hg38 measureReChain "${chainFile}" +featureBits hg38 measureReChain > featBits/fb.hg38.${asmName}.txt 2>&1 +featureBits hg38 measureReChainLink > featBits/fb.hg38.${asmName}Link.txt 2>&1 +hgsql -N -e 'select count(*) from measureReChainLink;' hg38 > counts/hg38.${asmName}.linkCount.txt +hgsql -N -e 'select count(*) from measureReChain;' hg38 > counts/hg38.${asmName}.chainCount.txt + +############################################################## + +to generate a jobList + +printf "#LOOP +./runOne $(path1) +#ENDLOOP +" > template + +gensub2 chain.list single template jobList + +# running them all, one at a time + +mkdir featBits counts err + +time (~/kent/src/hg/utils/automation/perlPara.pl 1 jobList) > do.log 2>&1 +# real 236m9.479s + +############################################################################## +# turn those chains into nets (DONE - 2023-08-15 - Hiram) + +mkdir /hive/data/genomes/hg38/bed/hprc/reNet +cd /hive/data/genomes/hg38/bed/hprc/reNet + +ls ../reChain | grep hg38.chainHprc > chain.list + +printf '#LOOP +runOne $(path1) {check out exists+ $(root1).net} +#ENDLOOP +' > template + +gensub2 chain.list single template jobList + +para -ram=4g create jobList +para push +para time +Completed: 88 of 88 jobs +CPU time in finished jobs: 285s 4.74m 0.08h 0.00d 0.000 y +IO & Wait Time: 206s 3.44m 0.06h 0.00d 0.000 y +Average job time: 6s 0.09m 0.00h 0.00d +Longest finished job: 8s 0.13m 0.00h 0.00d +Submission to last job: 8s 0.13m 0.00h 0.00d + +############################################################################## +# there was some question that arose if the above was correct. +# The procedure was repeated again (DONE 2023-10-12 - Hiram) + +mkdir /hive/data/genomes/hg38/bed/hprc/firstPassReChain +cd /hive/data/genomes/hg38/bed/hprc/firstPassReChain + +# using this runOne script: + +#################################################################### +#!/bin/bash + +set -beEu -o pipefail + +export hg38TwoBit="/hive/data/genomes/hg38/hg38.2bit" +export chainFile="${1}" +export inChain="../bigChainToChain/${chainFile}" +export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'` + +# may already be done +if [ -s "${chainFile}" ]; then + exit 0 +fi + +export buildDir="" +export twoBit="" +export asmId="" + +case "${asmName}" in + GCA_*) gcX="${asmName:0:3}" + d0="${asmName:4:3}" + d1="${asmName:7:3}" + d2="${asmName:10:3}" + tB="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${asmName}_*`" + buildDir="`realpath ${tB}`" + asmId="`basename $buildDir`" + twoBit="${buildDir}/${asmId}.2bit" + ;; + T2T-*) + asmId="hs1" + buildDir="/hive/data/genomes/$asmId" + twoBit="${buildDir}/${asmId}.2bit" + ;; + *) printf "ERROR: unrecognized $asmName\n" 1>&2 + exit 255 + ;; +esac + +printf "%s\n" "${chainFile}" 1>&2 + +time (chainToAxt "${inChain}" "${hg38TwoBit}" \ + "${twoBit}" stdout | axtChain -minScore=0 -linearGap=loose \ + stdin "${hg38TwoBit}" "${twoBit}" "${chainFile}") > err/${chainFile} 2>&1 + +#################################################################### + +# reusing the jobList from previous time: + +head jobList +./runOne hg38.chainHprcGCA_018466835.1.chain +./runOne hg38.chainHprcGCA_018466845.1.chain +./runOne hg38.chainHprcGCA_018466855.1.chain +./runOne hg38.chainHprcGCA_018466985.1.chain +... etc +./runOne hg38.chainHprcGCA_018506975.1.chain +./runOne hg38.chainHprcGCA_018852585.1.chain +./runOne hg38.chainHprcGCA_018852595.1.chain +./runOne hg38.chainHprcT2T-CHM13v2.0.chain + +# running with perlPara.pl: + +time (perlPara.pl 7 jobList) > 7.log 2>&1 +# real 26m53.073s + +# and loading all those new chain files: +############################ + +#!/bin/bash + +set -beEu -o pipefail + +ls hg38.chainHprc*.chain | while read chainFile do -echo makeBigChain $source $ass $other +export asmName=`echo $chainFile | sed -e 's/hg38.chainHprc//; s/.chain//;'` + +export buildDir="" +export twoBit="" +export asmId="" + +case "${asmName}" in + GCA_*) asmId=`echo $asmName | sed -e 's/\./v/;'` + ;; + T2T-*) + asmId="Hs1" + ;; + *) printf "ERROR: unrecognized $asmName\n" 1>&2 + exit 255 + ;; +esac + +printf "hgLoadChain hg38 chainHprc${asmId} ${chainFile}\n" 1>&2 +hgLoadChain hg38 chainHprc${asmId} ${chainFile} + done -done > jobs -para create jobs +############ and then netting thos chains + +mkdir /hive/data/genomes/hg38/bed/hprc/firstPassReNet +cd /hive/data/genomes/hg38/bed/hprc/firstPassReNet -cd grch38 -for i in *; do for j in $i/*link.bb; do ln -s `pwd`/$j /gbdb/hg38/hprcChain/$i.link.bb; done;done -for i in *; do for j in $i/*n.bb; do ln -s `pwd`/$j /gbdb/hg38/hprcChain/$i.bb; done;done +# using this runOne script: +########################### +#!/bin/bash +set -beEu -o pipefail -cat << __EOF__ > trackDb.header.ra -track hprcChain -compositeTrack on -shortLabel HPRC Chains -longLabel HPRC Chains -type bed 3 -__EOF__ +export chain="${1}" +export seq1Len="/hive/data/genomes/hg38/chrom.sizes" +B=`echo $chain | sed -e 's/hg38.chainHprc//; s/.chain//;'` +export seq2Len="xx" +case "${B}" in + T2T-CHM13v2.0) + seq2Len="/hive/data/genomes/hs1/chrom.sizes" + ;; + GCA_*) + gcX="${B:0:3}" + d0="${B:4:3}" + d1="${B:7:3}" + d2="${B:10:3}" + D="`ls -d /hive/data/genomes/asmHubs/allBuild/${gcX}/${d0}/${d1}/${d2}/${B}_*`" + if [ -d "${D}" ]; then + asmId=`basename $D` + seq2Len="${D}/${asmId}.chrom.sizes" + else + printf "ERROR: can not find: $chain\n" 1>&2 + exit 255 + fi + ;; + *) + printf "ERROR: unrecognized: $chain\n" 1>&2 + exit 255 + ;; +esac -cp trackDb.header.ra $HOME/kent/src/hg/makeDb/trackDb/human/hg38/hprcChain.ra +chainPreNet ../firstPassReChain/${chain} ${seq1Len} ${seq2Len} stdout \ + | chainNet stdin -minSpace=1 ${seq1Len} ${seq2Len} stdout /dev/null \ + | netSyntenic stdin hg38.chainHprc${B}.net -for i in *; do echo track "$i"Chain; echo "type bigChain"; echo "bigDataUrl /gbdb/hg38/hprcChain/$i.bb"; echo "linkDataUrl /gbdb/hg38/hprcChain/$i.link.bb"; echo shortLabel $i;echo longLabel $i; echo subTrack hprcChain; echo ""; done >> $HOME/kent/src/hg/makeDb/trackDb/human/hg38/hprcChain.ra +########################### + +# using the jobList from before: +head jobList + +runOne hg38.chainHprcGCA_018466835.1.chain {check out exists+ hg38.chainHprcGCA_018466835.1.net} +runOne hg38.chainHprcGCA_018466845.1.chain {check out exists+ hg38.chainHprcGCA_018466845.1.net} +runOne hg38.chainHprcGCA_018466855.1.chain {check out exists+ hg38.chainHprcGCA_018466855.1.net} +... +runOne hg38.chainHprcGCA_018852585.1.chain {check out exists+ hg38.chainHprcGCA_018852585.1.net} +runOne hg38.chainHprcGCA_018852595.1.chain {check out exists+ hg38.chainHprcGCA_018852595.1.net} +runOne hg38.chainHprcT2T-CHM13v2.0.chain {check out exists+ hg38.chainHprcT2T-CHM13v2.0.net} + +########################### + +## this can be run in parasol, it takes only a few seconds: +para -ram=4g create jobList +para push +para time + +Completed: 88 of 88 jobs +CPU time in finished jobs: 351s 5.85m 0.10h 0.00d 0.000 y +IO & Wait Time: 118s 1.96m 0.03h 0.00d 0.000 y +Average job time: 5s 0.09m 0.00h 0.00d +Longest finished job: 8s 0.13m 0.00h 0.00d +Submission to last job: 26s 0.43m 0.01h 0.00d + +########################### + +Loading these nets: + +#!/bin/bash + +set -beEu -o pipefail + +ls hg38.chainHprc*.net | while read netFile +do +export asmName=`echo $netFile | sed -e 's/hg38.chainHprc//; s/.net//;'` + +export buildDir="" +export twoBit="" +export asmId="" + +case "${asmName}" in + GCA_*) asmId=`echo $asmName | sed -e 's/\./v/;'` + ;; + T2T-*) + asmId="Hs1" + ;; + *) printf "ERROR: unrecognized $asmName\n" 1>&2 + exit 255 + ;; +esac + +printf "hgLoadNet -warn hg38 netHprc${asmId} ${netFile}\n" 1>&2 +hgLoadNet -warn hg38 netHprc${asmId} ${netFile} + +done +##############################################################################