7b9b6f6f3a35a1b35d8cd40c06b6267623b957cd hiram Fri Jun 30 14:30:05 2023 -0700
    adding coverage/complement tracks from the HPRC chainLinks refs #31561

diff --git src/hg/makeDb/doc/hg38/hprcCoverage.txt src/hg/makeDb/doc/hg38/hprcCoverage.txt
index dd611d0..1891864 100644
--- src/hg/makeDb/doc/hg38/hprcCoverage.txt
+++ src/hg/makeDb/doc/hg38/hprcCoverage.txt
@@ -1,196 +1,278 @@

################## first experiment included below ###########################
### calculate coverage from the HPRC chainLink files - 2023-06-30 - Hiram
### this becomes the track: 'HPRC Coverage 89'

mkdir /hive/data/genomes/hg38/bed/hprc/linkBeds
cd /hive/data/genomes/hg38/bed/hprc/linkBeds

### transform the chainLink.bb files into single cover bed files

### create a parasol jobList from all the chainLink.bb files:

ls ../bigChains/*Link.bb | while read BB
do
  S=`basename "${BB}" | sed -e 's#.*bigChains/##; s#-to-.*##;'`
  printf "runOne ${BB} {check out exists+ singleCover/${S}.bed}\n"
done > jobList

### the runOne script:

###
#!/bin/bash

export bbFile="${1}"
export result="${2}"

/cluster/bin/x86_64/bigBedToBed "${bbFile}" stdout | /cluster/bin/scripts/bedSingleCover.pl stdin > "${result}"
###

chmod +x runOne
mkdir singleCover
para -ram=6g create jobList
para push
para time
Completed: 89 of 89 jobs
CPU time in finished jobs:        730s      12.16m     0.20h    0.01d  0.000 y
IO & Wait Time:                   184s       3.07m     0.05h    0.00d  0.000 y
Average job time:                  10s       0.17m     0.00h    0.00d
Longest finished job:              17s       0.28m     0.00h    0.00d

### combine all those results into one file, then turn it into a bigWig

cat singleCover/*.bed \
  | $HOME/bin/x86_64/gnusort -S500G --parallel=64 -k1,1 -k2,2n > allLinks.bed

bedItemOverlapCount hg38 allLinks.bed > allLinks.bedGraph

### normalize the counts into the range (0:1] given there are 89 samples:

awk '{printf "%s\t%s\t%s\t%.3f\n", $1, $2, $3, $4/89}' allLinks.bedGraph \
  > hprc.coverage.normalized.bedGraph

### identify regions with ZERO coverage

bedInvert.pl /hive/data/genomes/hg38/chrom.sizes \
  hprc.coverage.normalized.bedGraph > hprc.zeroCoverage.bed

### and add that ZERO coverage in with the others

( awk '{printf "%s\t%s\t%s\t0\n", $1, $2, $3}' hprc.zeroCoverage.bed;
  cat hprc.coverage.normalized.bedGraph) | sort -k1,1 -k2,2n \
    > hprc.coverage.89samples.bedGraph

bedGraphToBigWig hprc.coverage.89samples.bedGraph \
  /hive/data/genomes/hg38/chrom.sizes hprc.coverage.89samples.bw

### the track bigWig is referenced from /gbdb/
ln -s `pwd`/hprc.coverage.89samples.bw /gbdb/hg38/hprc/hprcCoverage89.bw

### create a complement: '1.0 - score' to emphasize the low counts:

awk '{printf "%s\t%s\t%s\t%.3f\n", $1, $2, $3, 1.0 - $4}' \
  hprc.coverage.89samples.bedGraph > hprc.coverage.complement89.bedGraph

bedGraphToBigWig hprc.coverage.complement89.bedGraph \
  /hive/data/genomes/hg38/chrom.sizes hprc.coverage.complement89.bw

ln -s `pwd`/hprc.coverage.complement89.bw /gbdb/hg38/hprc/hprcComplement89.bw
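
### a quick sanity check on the result (an illustrative sketch added for this
### doc, not one of the original build commands): after merging in the ZERO
### coverage regions, hprc.coverage.89samples.bedGraph should tile the entire
### genome and every score should stay within [0,1].  Compare the summed item
### sizes against the genome total from chrom.sizes:

awk '{bases += $3-$2; if ($4 < 0 || $4 > 1) bad++}
  END {printf "bases covered: %d\tout-of-range scores: %d\n", bases, bad+0}' \
    hprc.coverage.89samples.bedGraph
ave -col=2 /hive/data/genomes/hg38/chrom.sizes | grep -w total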
###############################################################################
### first experiment with the lastz/chain/net liftOver files
### 2023-04-12 - Hiram
### this became the track 'HPRC Coverage 97' in the Experimental track group

## calculate coverage measurement from liftOver chain files

## collect files together in one location

mkdir /hive/data/genomes/hg38/bed/hprc/liftOver/chains
cd /hive/data/genomes/hg38/bed/hprc/liftOver/chains

cut -f1 ~/kent/src/hg/makeDb/doc/hprcAsmHub/hprc.orderList.tsv \
  | cut -d'_' -f1-2 > accessionId.txt

# need to do the 'realpath' to the /hive/data/ location to be available
# for cluster runs.  /gbdb/hg38 isn't available for cluster reference

for id in `cat accessionId.txt`
do
  p0="/gbdb/hg38/liftOver/hg38To${id}.over.chain.gz"
  ln -s `realpath "${p0}"` ./
done

# and one extra for the T2T
ln -s /hive/data/genomes/hg38/bed/lastzGCA_009914755.4.2022-03-28/axtChain/hg38.GCA_009914755.4.over.chain.gz \
  hg38ToGCA_009914755.4.over.chain.gz

# should be 97:
ls *.over.chain.gz | wc -l
97

## chrom.sizes were previously obtained by Max:

mkdir /hive/data/genomes/hg38/bed/hprc/sizes
cd /hive/data/genomes/hg38/bed/hprc/sizes

for i in `cat ../accs.txt`; do wget `curl -s "https://genome-test.gi.ucsc.edu/list/files?genome=$i&format=text" | grep sizes`; done

## and the T2T
cp -p /hive/data/genomes/asmHubs/genbankBuild/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0.chrom.sizes \
  GCA_009914755.4.chrom.sizes.txt

## and the 2bit files

mkdir /hive/data/genomes/hg38/bed/hprc/2bits
cd /hive/data/genomes/hg38/bed/hprc/2bits

cut -f1 ~/kent/src/hg/makeDb/doc/hprcAsmHub/hprc.orderList.tsv \
  | cut -d'_' -f1-2 > accessionId.txt

for id in `cat accessionId.txt`
do
  export id
  for P in `curl -s -L "https://genome-test.gi.ucsc.edu/list/files?genome=$id;format=text" \
      | grep 2bit | grep -v bpt | sed -e 's#https://hgdownload.soe.ucsc.edu/hubs#/hive/data/genomes/asmHubs##;'`
  do
    rp=`realpath $P`
    echo ln -s \"${rp}\" ./$id.2bit
    ln -s "${rp}" ./$id.2bit
  done
done

## and the T2T 2bit file:
cp -p /hive/data/genomes/asmHubs/genbankBuild/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0.2bit \
  GCA_009914755.4.2bit

## convert the chain files to bed:

mkdir /hive/data/genomes/hg38/bed/hprc/chainToBed
cd /hive/data/genomes/hg38/bed/hprc/chainToBed

## make a cluster jobList
ls ../liftOver/chains | grep chain.gz | while read C
do
  s="${C#hg38To}"
  c="${s%.over.chain.gz}"
  printf "./runOne %s {check out exists+ %s.bed}\n" "${c}" "${c}"
done > jobList

printf '#!/bin/bash
set -beEu -o pipefail
export S="${1}"
export overChain="/hive/data/genomes/hg38/bed/hprc/liftOver/chains/hg38To${S}.over.chain.gz"
export twoBit="/hive/data/genomes/hg38/bed/hprc/2bits/${S}.2bit"
export sizes="/hive/data/genomes/hg38/bed/hprc/sizes/${S}.chrom.sizes.txt"
if [ "${overChain}" -nt "${S}.bed" ]; then
  /cluster/bin/x86_64/chainToPsl $overChain /hive/data/genomes/hg38/chrom.sizes \
    $sizes /hive/data/genomes/hg38/hg38.2bit $twoBit stdout \
      | /cluster/bin/x86_64/pslToBed stdin stdout \
        | /cluster/bin/x86_64/bedToExons stdin ${S}.bed
  touch -r $overChain ${S}.bed
fi
' > runOne

chmod +x runOne

## running on ku
ssh ku
cd /hive/data/genomes/hg38/bed/hprc/chainToBed
para create jobList
para push
para time
Completed: 96 of 96 jobs
CPU time in finished jobs:      54194s     903.24m    15.05h    0.63d  0.002 y
IO & Wait Time:                   231s       3.84m     0.06h    0.00d  0.000 y
Average job time:                 567s       9.45m     0.16h    0.01d
Longest finished job:            1378s      22.97m     0.38h    0.02d
Submission to last job:          2286s      38.10m     0.64h    0.03d

## and the T2T lift.over:
ssh hgwdev
cd /hive/data/genomes/hg38/bed/hprc/chainToBed
./runOne GCA_009914755.4

## put all the bed files into one file
mkdir ../coverage
$HOME/bin/x86_64/gnusort -k1,1 -k2,2n -S100G --parallel=32 *.bed \
  > ../coverage/allOvers.bed

## calculate the coverage numbers
cd /hive/data/genomes/hg38/bed/hprc/coverage
bedItemOverlapCount hg38 allOvers.bed > allOvers.bedGraph
touch -r allOvers.bed allOvers.bedGraph
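
### quick sanity check (an illustrative sketch, not from the original run):
### with 97 input assemblies, the depth in column 4 of allOvers.bedGraph
### should never exceed 97:

awk '$4 > max {max = $4} END {printf "maximum coverage depth: %d\n", max}' \
  allOvers.bedGraph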
## account for those areas not covered at all
bedInvert.pl /hive/data/genomes/hg38/chrom.sizes allOvers.bedGraph \
  > zeroCounts.bed
touch -r allOvers.bedGraph zeroCounts.bed

## add in the zero counts
(awk '{printf "%s\t%d\t%d\t0\n", $1,$2,$3}' zeroCounts.bed;
  cat allOvers.bedGraph) | $HOME/bin/x86_64/gnusort -k1,1 -k2,2n \
    -S100G --parallel=32 > fullGenome.bedGraph

## final answer:
bedGraphToBigWig fullGenome.bedGraph /hive/data/genomes/hg38/chrom.sizes \
  hprcCoverage.bw

## count up percent of genome in each coverage number

cat << '_EOF_' > percentGenome.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $genomeSize = `ave -col=2 /hive/data/genomes/hg38/chrom.sizes | grep -w total`;
chomp $genomeSize;
$genomeSize =~ s/.000000//;
$genomeSize =~ s/total +//;
printf "# hg38 total genome size: %s\n", $genomeSize;

my $countedBases = 0;
my @genomeProfile;	# index N is the coverage depth 0 to 97
			# value is total bases at that coverage

open (FH, "< fullGenome.bedGraph") or die "can not read fullGenome.bedGraph";
while (my $line = <FH>) {
  chomp $line;
  my ($chr, $start, $end, $cover) = split('\s+', $line);
  $countedBases += $end - $start;
  $genomeProfile[$cover] += $end - $start;
}
close (FH);

printf STDERR "# counted bases: %d\n", $countedBases;
for (my $i = 0; $i < scalar(@genomeProfile); ++$i) {
  printf "%d\t%d\t%.4f\n", $i, $genomeProfile[$i],
    100.0 * $genomeProfile[$i] / $genomeSize;
}
_EOF_

chmod +x percentGenome.pl

./percentGenome.pl > profile.txt

## create a bed file ordered by block size:
awk '{printf "%s\t%d\t%d\t%d\t%d\n", $1,$2,$3,$4,$3-$2}' fullGenome.bedGraph \
  | sort -k5,5nr > bySize.bed5

## profile sizes of blocks for under 20 coverage
awk '$4 < 20' bySize.bed5 | ave -col=5 stdin
Q1 1.000000
median 1.000000
Q3 3.000000
average 2358.696411
min 1.000000
max 30180026.000000
count 97609
total 230229998.000000
standard deviation 172056.394886
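
### closing sanity checks (illustrative sketches, not from the original run):
### the percent column of profile.txt should sum to roughly 100, and since
### bySize.bed5 is sorted by block size, the largest low-coverage blocks are
### simply the first lines that pass the depth filter:

awk '!/^#/ {pct += $3} END {printf "total percent of genome: %.4f\n", pct}' profile.txt
awk '$4 < 20' bySize.bed5 | head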