cd2ba925d41f6ed1749ac3e6a3f799b329fe9879
hiram
  Wed Oct 15 09:32:10 2025 -0700
checking these files from Heng Li refs #36494

diff --git src/hg/makeDb/doc/hs1/LCR.txt src/hg/makeDb/doc/hs1/LCR.txt
index 2a7d9a8a720..191263362aa 100644
--- src/hg/makeDb/doc/hs1/LCR.txt
+++ src/hg/makeDb/doc/hs1/LCR.txt
@@ -1,88 +1,187 @@
 
 # LCR - Low Complexity Regions from Heng Li
 # 2025-10-15 - Hiram
 
 From Heng Li paper:
 
   https://arxiv.org/abs/2509.23057
 
 Download files from:
 
   https://zenodo.org/records/17204470
 
 Specifically:
 
  chm13v2.cen-mask.bed
  chm13v2.PAR.bed
  chm13v2.lcr-v4.bed.gz
 
 mkdir /hive/data/genomes/hs1/bed/LCR
 cd /hive/data/genomes/hs1/bed/LCR
 
 # running bedToBigBed on chm13v2.lcr-v4.bed.gz revealed items whose
 #   chromEnd coordinates were 5 bp beyond the ends of the chromosomes.
 #   To fix: zcat chm13v2.lcr-v4.bed.gz > chm13v2.lcr-v4.bed
 #   then edited the chm13v2.lcr-v4.bed file to correct those chromEnd
 #   coordinates so that bedToBigBed accepts the file
 
 #   I added a name column to chm13v2.cen-mask.bed to identify
 #    the type of centromere:
 # chr13   0       17508596        acrocentric p-arm
 # chr14   0       12708411        acrocentric p-arm
 # chr15   0       17694466        acrocentric p-arm
 # chr21   0       11306378        acrocentric p-arm
 # chr22   0       15711065        acrocentric p-arm
 
 # and all the other items were named "pericentromeric region", for example:
 
 # chr1    121619169       142242033       pericentromeric region
 
 # and, gave names to the items in chm13v2.PAR.bed
 
 # chrX    0       2394410 chrX PAR1
 # chrX    153925834       154259566       chrX PAR2
 # chrY    0       2458320 chrY PAR1
 # chrY    62122809        62460029        chrY PAR2
 
 # resulting source files here:
 # -rw-r--r-- 1      118 Oct 14 09:36 chm13v2.PAR.bed
 # -rw-r--r-- 1     1013 Oct 14 09:47 chm13v2.cen-mask.bed
 # -rw-rw-r-- 1  3971675 Oct 14 09:57 chm13v2.lcr-v4.bed
 
 ## converting to bigBed files:
 
 bedToBigBed -tab -type=bed4 chm13v2.PAR.bed ../../chrom.sizes chm13v2.PAR.bb
 
 bedToBigBed -tab -type=bed4 chm13v2.cen-mask.bed ../../chrom.sizes \
   chm13v2.cen-mask.bb
 
 bedToBigBed -type=bed4+1 -as=lcr.as chm13v2.lcr-v4.bed \
   ../../chrom.sizes chm13v2.lcr-v4.bb
 
 ## and then intersecting with simple repeats
 
 ### fix the chrom names in simpleRepeat.bed.gz, get a sed file:
 
 grep -v "^#" ../chromAlias/GCA_009914755.4_T2T-CHM13v2.0.chromAlias.txt \
   | awk -F$'\t' '{printf "s/%s/%s/;\n", $1, $5}' > genbank.ucsc.sed
 
 head genbank.ucsc.sed
 
 s/CP068254.1/chrM/;
 s/CP068255.2/chrX/;
 s/CP068256.2/chr22/;
 
 
 ln -s /hive/data/genomes/asmHubs/genbankBuild/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/trackData/simpleRepeat/simpleRepeat.bed.gz ./
 
 zcat simpleRepeat.bed.gz | sed -f genbank.ucsc.sed | bedSingleCover.pl stdin > trf.singleCover.bed
 
 bedToBigBed -tab -type=bed4 trf.singleCover.bed ../../chrom.sizes trf.singleCover.bb
 
 bedSingleCover.pl chm13v2.lcr-v4.bed > chm13v2.lcr-v4.singleCover.bed
 
 bedIntersect -minCoverage=0.0000000014 trf.singleCover.bed \
     chm13v2.lcr-v4.singleCover.bed stdout | sort -k1,1 -k2,2n > lcr.AND.trf.bed
 
 bedToBigBed -tab -type=bed4 lcr.AND.trf.bed ../../chrom.sizes lcr.AND.trf.bb
 
+bedInvert.pl ../../chrom.sizes chm13v2.lcr-v4.singleCover.bed \
+    > chm13v2.lcr-v4.invert.bed
+
+bedInvert.pl ../../chrom.sizes trf.singleCover.bed > trf.invert.bed
+
+bedIntersect -minCoverage=0.0000000014 trf.invert.bed \
+    chm13v2.lcr-v4.singleCover.bed stdout | sort -k1,1 -k2,2n \
+       > in.lcr.not.trf.bed
+
+bedIntersect -minCoverage=0.0000000014 chm13v2.lcr-v4.invert.bed \
+   trf.singleCover.bed stdout | sort -k1,1 -k2,2n > in.trf.not.lcr.bed
+
+bedToBigBed -tab -type=bed4 in.trf.not.lcr.bed ../../chrom.sizes inTrfNotLcr.bb
+
+bedToBigBed -tab -type=bed4 in.lcr.not.trf.bed ../../chrom.sizes inLcrNotTrf.bb
+
+for F in *.bb
+do
+  printf "bigBedInfo $F:\t"
+  bigBedInfo $F | grep basesCovered
+done
+bigBedInfo trf.singleCover.bb:  basesCovered: 277,065,041
+bigBedInfo inTrfNotLcr.bb:      basesCovered: 215,694,223
+bigBedInfo chm13v2.cen-mask.bb: basesCovered: 202,448,824
+bigBedInfo chm13v2.lcr-v4.bb:   basesCovered: 79,604,249
+bigBedInfo lcr.AND.trf.bb:      basesCovered: 61,370,818
+bigBedInfo inLcrNotTrf.bb:      basesCovered: 18,233,431
+bigBedInfo chm13v2.PAR.bb:      basesCovered: 5,523,682
+
+########## in trackDb/human/hs1 add LCR.ra file to define the tracks:
+
+track LCRs
+superTrack on show
+type bed 4
+shortLabel LCRs
+longLabel Low complexity regions from Heng Li, longdust measurements
+html LCRs
+group map
+
+track hs1LCR
+parent LCRs
+shortLabel LCR
+longLabel low-complexity regions excluding alpha and HSAT2/3 satellites.
+type bigBed 4 +
+visibility hide
+priority 1
+bigDataUrl /gbdb/hs1/LCRs/chm13v2.lcr-v4.bb
+html LCRs
+
+track hs1CenMask
+parent LCRs
+shortLabel Cent-Sat
+longLabel Centromeric satellite repeats
+type bigBed 4 .
+visibility hide
+priority 2
+bigDataUrl /gbdb/hs1/LCRs/chm13v2.cen-mask.bb
+html LCRs
+
+track hs1PAR
+parent LCRs
+shortLabel PAR region
+longLabel Pseudoautosomal regions (PAR1, PAR2) on chrX and chrY
+type bigBed 4 .
+visibility hide
+priority 3
+bigDataUrl /gbdb/hs1/LCRs/chm13v2.PAR.bb
+html LCRs
+
+track lcrANDTrf
+parent LCRs
+shortLabel in LCR AND TRF
+longLabel intersection of LCR track and the trf/simpleRepeats track
+type bigBed 4 .
+visibility hide
+priority 4
+bigDataUrl /gbdb/hs1/LCRs/lcr.AND.trf.bb
+html LCRs
+
+track inLcrNotTrf
+parent LCRs
+shortLabel in LCR not TRF
+longLabel areas in the LCR track not in the trf/simpleRepeats track
+type bigBed 4 .
+visibility hide
+priority 5
+bigDataUrl /gbdb/hs1/LCRs/inLcrNotTrf.bb
+html LCRs
+
+track inTrfNotLcr
+parent LCRs
+shortLabel in TRF not LCR
+longLabel areas in the trf/simpleRepeats track not in the LCR track
+type bigBed 4 .
+visibility hide
+priority 6
+bigDataUrl /gbdb/hs1/LCRs/inTrfNotLcr.bb
+html LCRs
+