cd2ba925d41f6ed1749ac3e6a3f799b329fe9879 hiram Wed Oct 15 09:32:10 2025 -0700 checking these files from Heng Li refs #36494 diff --git src/hg/makeDb/doc/hs1/LCR.txt src/hg/makeDb/doc/hs1/LCR.txt index 2a7d9a8a720..191263362aa 100644 --- src/hg/makeDb/doc/hs1/LCR.txt +++ src/hg/makeDb/doc/hs1/LCR.txt @@ -1,88 +1,187 @@ # LCR - Low Complexity Regions from Heng Li # 2025-10-15 - Hiram From Heng Li paper: https://arxiv.org/abs/2509.23057 Download files from: https://zenodo.org/records/17204470 Specifically: chm13v2.cen-mask.bed chm13v2.PAR.bed chm13v2.lcr-v4.bed.gz mkdir /hive/data/genomes/hs1/bed/LCR cd /hive/data/genomes/hs1/bed/LCR # the chromEnd coordinates in chm13v2.lcr-v4.bed.gz were all 5 bp beyond # the ends of the chromosomes, fixed the coordinates: # zcat chm13v2.lcr-v4.bed.gz > chm13v2.lcr-v4.bed # Edited the chm13v2.lcr-v4.bed file to fix those coordinates, # discovered by running bedToBigBed on the file # I added a name column to chm13v2.cen-mask.bed to identify # the type of centromere: # chr13 0 17508596 acrocentric p-arm # chr14 0 12708411 acrocentric p-arm # chr15 0 17694466 acrocentric p-arm # chr21 0 11306378 acrocentric p-arm # chr22 0 15711065 acrocentric p-arm And all the others were named: # chr1 121619169 142242033 pericentromeric region # and, gave names to the items in chm13v2.PAR.bed # chrX 0 2394410 chrX PAR1 # chrX 153925834 154259566 chrX PAR2 # chrY 0 2458320 chrY PAR1 # chrY 62122809 62460029 chrY PAR2 # resulting source files here: # -rw-r--r-- 1 118 Oct 14 09:36 chm13v2.PAR.bed # -rw-r--r-- 1 1013 Oct 14 09:47 chm13v2.cen-mask.bed # -rw-rw-r-- 1 3971675 Oct 14 09:57 chm13v2.lcr-v4.bed ## converting to bigBed files: bedToBigBed -tab -type=bed4 chm13v2.PAR.bed ../../chrom.sizes chm13v2.PAR.bb bedToBigBed -tab -type=bed4 chm13v2.cen-mask.bed ../../chrom.sizes \ chm13v2.cen-mask.bb bedToBigBed -type=bed4+1 -as=lcr.as chm13v2.lcr-v4.bed \ ../../chrom.sizes chm13v2.lcr-v4.bb ## and then intersecting with simple repeats ### fix the chrom 
names in simpleRepeat.bed.gz, get a sed file: grep -v "^#" ../chromAlias/GCA_009914755.4_T2T-CHM13v2.0.chromAlias.txt \ | awk -F$'\t' '{printf "s/%s/%s/;\n", $1, $5}' > genbank.ucsc.sed head genbank.ucsc.sed s/CP068254.1/chrM/; s/CP068255.2/chrX/; s/CP068256.2/chr22/; ln -s /hive/data/genomes/asmHubs/genbankBuild/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/trackData/simpleRepeat/simpleRepeat.bed.gz ./ zcat simpleRepeat.bed.gz | sed -f genbank.ucsc.sed | bedSingleCover.pl stdin > trf.singleCover.bed bedToBigBed -tab -type=bed4 trf.singleCover.bed ../../chrom.sizes trf.singleCover.bb bedSingleCover.pl chm13v2.lcr-v4.bed > chm13v2.lcr-v4.singleCover.bed bedIntersect -minCoverage=0.0000000014 trf.singleCover.bed \ chm13v2.lcr-v4.singleCover.bed stdout | sort -k1,1 -k2,2n > lcr.AND.trf.bed bedToBigBed -tab -type=bed4 lcr.AND.trf.bed ../../chrom.sizes lcr.AND.trf.bb +bedInvert.pl ../../chrom.sizes chm13v2.lcr-v4.singleCover.bed \ + > chm13v2.lcr-v4.invert.bed + +bedInvert.pl ../../chrom.sizes trf.singleCover.bed > trf.invert.bed + +bedIntersect -minCoverage=0.0000000014 trf.invert.bed \ + chm13v2.lcr-v4.singleCover.bed stdout | sort -k1,1 -k2,2n \ + > in.lcr.not.trf.bed + +bedIntersect -minCoverage=0.0000000014 chm13v2.lcr-v4.invert.bed \ + trf.singleCover.bed stdout | sort -k1,1 -k2,2n > in.trf.not.lcr.bed + +bedToBigBed -tab -type=bed4 in.trf.not.lcr.bed ../../chrom.sizes inTrfNotLcr.bb + +bedToBigBed -tab -type=bed4 in.lcr.not.trf.bed ../../chrom.sizes inLcrNotTrf.bb + +for F in *.bb +do + printf "bigBedInfo $F:\t" + bigBedInfo $F | grep basesCovered +done +bigBedInfo trf.singleCover.bb: basesCovered: 277,065,041 +bigBedInfo inTrfNotLcr.bb: basesCovered: 215,694,223 +bigBedInfo chm13v2.cen-mask.bb: basesCovered: 202,448,824 +bigBedInfo chm13v2.lcr-v4.bb: basesCovered: 79,604,249 +bigBedInfo lcr.AND.trf.bb: basesCovered: 61,370,818 +bigBedInfo inLcrNotTrf.bb: basesCovered: 18,233,431 +bigBedInfo chm13v2.PAR.bb: basesCovered: 5,523,682 + +########## in 
trackDb/human/hs1 add LCR.ra file to define the tracks: + +track LCRs +superTrack on show +type bed 4 +shortLabel LCRs +longLabel Low complexity regions from Heng Li, longdust measurements +html LCRs +group map + +track hs1LCR +parent LCRs +shortLabel LCR +longLabel low-complexity regions excluding alpha and HSAT2/3 satellites. +type bigBed 5 . +visibility hide +priority 1 +bigDataUrl /gbdb/hs1/LCRs/chm13v2.lcr-v4.bb +html LCRs + +track hs1CenMask +parent LCRs +shortLabel Cent-Sat +longLabel Centromeric satellite repeats +type bigBed 4 . +visibility hide +priority 2 +bigDataUrl /gbdb/hs1/LCRs/chm13v2.cen-mask.bb +html LCRs + +track hs1PAR +parent LCRs +shortLabel PAR region +longLabel the PAR regions on chrX, chrY +type bigBed 4 . +visibility hide +priority 3 +bigDataUrl /gbdb/hs1/LCRs/chm13v2.PAR.bb +html LCRs + +track lcrANDTrf +parent LCRs +shortLabel in LCR AND TRF +longLabel intersection of LCR track and the trf/simpleRepeats track +type bigBed 4 . +visibility hide +priority 4 +bigDataUrl /gbdb/hs1/LCRs/lcr.AND.trf.bb +html LCRs + +track inLcrNotTrf +parent LCRs +shortLabel in LCR not TRF +longLabel areas in the LCR track not in the trf/simpleRepeats track +type bigBed 4 . +visibility hide +priority 5 +bigDataUrl /gbdb/hs1/LCRs/inLcrNotTrf.bb +html LCRs + +track inTrfNotLcr +parent LCRs +shortLabel in TRF not LCR +longLabel areas in the TRF track not in the LCR track +type bigBed 4 . +visibility hide +priority 6 +bigDataUrl /gbdb/hs1/LCRs/inTrfNotLcr.bb +html LCRs +