8507a408f2eb8830af68f9bbb94a258861450bcc lrnassar Tue Mar 31 10:03:13 2026 -0700 Adding makedoc for mm10 ENCODE4 cCREs track. refs #37131 diff --git src/hg/makeDb/doc/mm10/encode4.cCREs.txt src/hg/makeDb/doc/mm10/encode4.cCREs.txt new file mode 100644 index 00000000000..3dac38931e4 --- /dev/null +++ src/hg/makeDb/doc/mm10/encode4.cCREs.txt @@ -0,0 +1,206 @@ +# ENCODE4 cCREs (candidate Cis-Regulatory Elements) for mm10 +# Redmine #37131 +# Lou Nassar, 2026-02-20 + +# This track migrates the ENCODE4 mouse cCRE data from an external track hub +# into native UCSC trackDb. It follows the same approach used for hg38 +# (see hg38/encode4.cCREs.txt). + +# The original hub was prepared by Mingshi Gao (Weng lab, UMass Chan Medical School): +# http://users.wenglab.org/gaomingshi/Mouse_ENCODE/hub.txt + +# Hub was previously cloned locally via hubClone: +# /hive/data/outside/encode4/ccre/ENCODE_Mouse_Regulation/ + +# Total data: 91 files (1 registry bigBed + 18 core cCRE bigBeds + 72 signal bigWigs) +# Registry contains 926,843 cCREs across mm10 + +############################################################################## +# Step 1: Copy and rename data files from hub clone +############################################################################## + +# Source hub clone directory: +# /hive/data/outside/encode4/ccre/ENCODE_Mouse_Regulation/ +# Destination: +# /hive/data/outside/encode4/ccre/mouse/ + +mkdir -p /hive/data/outside/encode4/ccre/mouse/coreCollection + +# Registry file (bigBed 9+5, 926,843 cCREs): +cp /hive/data/outside/encode4/ccre/ENCODE_Mouse_Regulation/mm10-cCREs.annotated.bigBed \ + /hive/data/outside/encode4/ccre/mouse/encodeCcreRegistry.bb + +# 18 core cCRE bigBed files (.bigBed -> .bb); relative paths in the cp commands below are run from /hive/data/outside/encode4/ccre/ +# Each file name is 4 ENCFF accessions joined by underscores (DNase_H3K4me3_H3K27ac_CTCF): +cp ENCODE_Mouse_Regulation/ENCFF325FZS_ENCFF396LHC_ENCFF010WPI_ENCFF196GIY.bigBed mouse/coreCollection/ENCFF325FZS_ENCFF396LHC_ENCFF010WPI_ENCFF196GIY.bb # midbrain, postnatal 0d 
+cp ENCODE_Mouse_Regulation/ENCFF784TNW_ENCFF990HLQ_ENCFF258KWF_ENCFF811CFN.bigBed mouse/coreCollection/ENCFF784TNW_ENCFF990HLQ_ENCFF258KWF_ENCFF811CFN.bb # lung, postnatal 0d +cp ENCODE_Mouse_Regulation/ENCFF374LSN_ENCFF887KNZ_ENCFF269FYI_ENCFF644MRR.bigBed mouse/coreCollection/ENCFF374LSN_ENCFF887KNZ_ENCFF269FYI_ENCFF644MRR.bb # kidney, postnatal 0d +cp ENCODE_Mouse_Regulation/ENCFF822FDB_ENCFF615BKA_ENCFF683GNP_ENCFF494FQV.bigBed mouse/coreCollection/ENCFF822FDB_ENCFF615BKA_ENCFF683GNP_ENCFF494FQV.bb # hindbrain, postnatal 0d +cp ENCODE_Mouse_Regulation/ENCFF727CYI_ENCFF050AHA_ENCFF422JMO_ENCFF247XHY.bigBed mouse/coreCollection/ENCFF727CYI_ENCFF050AHA_ENCFF422JMO_ENCFF247XHY.bb # forebrain, postnatal 0d +cp ENCODE_Mouse_Regulation/ENCFF947JCM_ENCFF738LHV_ENCFF306XSO_ENCFF055IZM.bigBed mouse/coreCollection/ENCFF947JCM_ENCFF738LHV_ENCFF306XSO_ENCFF055IZM.bb # heart, postnatal 0d +cp ENCODE_Mouse_Regulation/ENCFF907KGU_ENCFF341GEF_ENCFF544RNA_ENCFF935ZIU.bigBed mouse/coreCollection/ENCFF907KGU_ENCFF341GEF_ENCFF544RNA_ENCFF935ZIU.bb # liver, postnatal 0d +cp ENCODE_Mouse_Regulation/ENCFF616XIK_ENCFF127UZI_ENCFF865CCV_ENCFF355LDM.bigBed mouse/coreCollection/ENCFF616XIK_ENCFF127UZI_ENCFF865CCV_ENCFF355LDM.bb # lung, embryo E14.5 +cp ENCODE_Mouse_Regulation/ENCFF738YPD_ENCFF083LRQ_ENCFF500YJD_ENCFF394DGY.bigBed mouse/coreCollection/ENCFF738YPD_ENCFF083LRQ_ENCFF500YJD_ENCFF394DGY.bb # stomach, postnatal 0d +cp ENCODE_Mouse_Regulation/ENCFF389BAE_ENCFF244PVS_ENCFF622QOD_ENCFF990MGK.bigBed mouse/coreCollection/ENCFF389BAE_ENCFF244PVS_ENCFF622QOD_ENCFF990MGK.bb # liver, male adult 8w +cp ENCODE_Mouse_Regulation/ENCFF744DCK_ENCFF673XLI_ENCFF859YPP_ENCFF196NRY.bigBed mouse/coreCollection/ENCFF744DCK_ENCFF673XLI_ENCFF859YPP_ENCFF196NRY.bb # kidney, male adult 8w +cp ENCODE_Mouse_Regulation/ENCFF686CDW_ENCFF200ISF_ENCFF284KSX_ENCFF239UQO.bigBed mouse/coreCollection/ENCFF686CDW_ENCFF200ISF_ENCFF284KSX_ENCFF239UQO.bb # thymus, male adult 8w +cp 
ENCODE_Mouse_Regulation/ENCFF145VWQ_ENCFF430CHA_ENCFF770LBL_ENCFF244ZJY.bigBed mouse/coreCollection/ENCFF145VWQ_ENCFF430CHA_ENCFF770LBL_ENCFF244ZJY.bb # MEL cell line +cp ENCODE_Mouse_Regulation/ENCFF323LVW_ENCFF186DCG_ENCFF595IQM_ENCFF233QSB.bigBed mouse/coreCollection/ENCFF323LVW_ENCFF186DCG_ENCFF595IQM_ENCFF233QSB.bb # liver, embryo E14.5 +cp ENCODE_Mouse_Regulation/ENCFF228LWM_ENCFF420ORJ_ENCFF666CND_ENCFF499GIZ.bigBed mouse/coreCollection/ENCFF228LWM_ENCFF420ORJ_ENCFF666CND_ENCFF499GIZ.bb # CH12.LX cell line +cp ENCODE_Mouse_Regulation/ENCFF862WQC_ENCFF660WNU_ENCFF266FCV_ENCFF809JIR.bigBed mouse/coreCollection/ENCFF862WQC_ENCFF660WNU_ENCFF266FCV_ENCFF809JIR.bb # heart, male adult 8w +cp ENCODE_Mouse_Regulation/ENCFF514QGG_ENCFF523GNO_ENCFF920QDX_ENCFF803MIB.bigBed mouse/coreCollection/ENCFF514QGG_ENCFF523GNO_ENCFF920QDX_ENCFF803MIB.bb # spleen, male adult 8w +cp ENCODE_Mouse_Regulation/ENCFF265EIS_ENCFF288SAJ_ENCFF645OJK_ENCFF087SVX.bigBed mouse/coreCollection/ENCFF265EIS_ENCFF288SAJ_ENCFF645OJK_ENCFF087SVX.bb # cerebellum, male adult 8w + +# 72 signal bigWig files (.bigWig -> .bw) +# 4 per biosample (DNase, H3K4me3, H3K27ac, CTCF), derived by splitting the +# accession combos above. 
For example, the first biosample (midbrain, postnatal 0d) +# yields: ENCFF325FZS.bw (DNase), ENCFF396LHC.bw (H3K4me3), +# ENCFF010WPI.bw (H3K27ac), ENCFF196GIY.bw (CTCF) +# All 72: +for acc in ENCFF325FZS ENCFF396LHC ENCFF010WPI ENCFF196GIY \ + ENCFF784TNW ENCFF990HLQ ENCFF258KWF ENCFF811CFN \ + ENCFF374LSN ENCFF887KNZ ENCFF269FYI ENCFF644MRR \ + ENCFF822FDB ENCFF615BKA ENCFF683GNP ENCFF494FQV \ + ENCFF727CYI ENCFF050AHA ENCFF422JMO ENCFF247XHY \ + ENCFF947JCM ENCFF738LHV ENCFF306XSO ENCFF055IZM \ + ENCFF907KGU ENCFF341GEF ENCFF544RNA ENCFF935ZIU \ + ENCFF616XIK ENCFF127UZI ENCFF865CCV ENCFF355LDM \ + ENCFF738YPD ENCFF083LRQ ENCFF500YJD ENCFF394DGY \ + ENCFF389BAE ENCFF244PVS ENCFF622QOD ENCFF990MGK \ + ENCFF744DCK ENCFF673XLI ENCFF859YPP ENCFF196NRY \ + ENCFF686CDW ENCFF200ISF ENCFF284KSX ENCFF239UQO \ + ENCFF145VWQ ENCFF430CHA ENCFF770LBL ENCFF244ZJY \ + ENCFF323LVW ENCFF186DCG ENCFF595IQM ENCFF233QSB \ + ENCFF228LWM ENCFF420ORJ ENCFF666CND ENCFF499GIZ \ + ENCFF862WQC ENCFF660WNU ENCFF266FCV ENCFF809JIR \ + ENCFF514QGG ENCFF523GNO ENCFF920QDX ENCFF803MIB \ + ENCFF265EIS ENCFF288SAJ ENCFF645OJK ENCFF087SVX; do + cp ENCODE_Mouse_Regulation/${acc}.bigWig mouse/coreCollection/${acc}.bw +done + +# A reproducibility script that performs all of the above is saved at: +# /hive/data/outside/encode4/ccre/mouse/buildFromHub.py +# Usage: python3 buildFromHub.py [--dry-run] + +############################################################################## +# Step 2: Create /gbdb symlinks +############################################################################## + +mkdir -p /gbdb/mm10/encode4/ccre/coreCollection + +# Registry symlink: +ln -s /hive/data/outside/encode4/ccre/mouse/encodeCcreRegistry.bb \ + /gbdb/mm10/encode4/ccre/encodeCcreRegistry.bb + +# Core collection symlinks (18 bigBed + 72 bigWig = 90 files): +ln -s /hive/data/outside/encode4/ccre/mouse/coreCollection/* \ + /gbdb/mm10/encode4/ccre/coreCollection/ + +# Total: 91 symlinks (1 registry + 90 core 
collection) + +############################################################################## +# Step 3: Create trackDb configuration +############################################################################## + +# trackDb files are in kent/src/hg/makeDb/trackDb/mouse/mm10/: +# +# encode.cCREs.override.ra — main entry point, defines: +# - cCREs superTrack (parent for ENCODE3 + ENCODE4 cCRE tracks) +# - Override to reparent existing ENCODE3 encodeCcreCombined track under cCREs +# - cCREregistry track (ENCODE4 registry, bigBed 9+5, 926,843 cCREs) +# with filterValues for 8 cCRE classes +# - include encode4.ccres.ra +# +# encode4.ccres.ra — composite track definition (1,222 lines), defines: +# - coreCcres composite with 5 views (cCREs, DNase, H3K4me3, H3K27ac, CTCF) +# - 5 subGroups: organ (9 values), biosampleType (2), view (5), +# simpleBiosample (18), dataType (5) +# - 18 cCRE subtracks + 72 signal subtracks = 90 subtracks total +# - 3 biosamples on by default: forebrain postnatal 0d, heart postnatal 0d, +# liver male adult 8w (15 tracks) + +# The trackDb was modeled after the hg38 version, with adaptations for mm10: +# - 18 biosamples (vs 170 in hg38), no donor subGroup needed +# - All SCREEN URLs use assembly=mm10 (hub had GRCh38 bug, fixed) +# - Hub declared core cCRE bigBeds as type bigBed 9+1 but actual files are +# bigBed 9+5 (corrected in trackDb) +# - subGroup tag values cannot contain dots; replaced with underscores +# (CH12.LX -> CH12_LX, 14.5 -> 14_5) + +# Added to mm10/trackDb.ra (after existing ENCODE includes): +# include encode.cCREs.override.ra + +############################################################################## +# Step 4: Create HTML description pages +############################################################################## + +# 3 HTML files in kent/src/hg/makeDb/trackDb/mouse/mm10/: +# +# cCREsSuper.html — Supertrack description (ENCODE3 vs ENCODE4 context, +# 926,843 cCREs, 18 biosamples) +# +# cCREregistry.html — Registry 
description (926,843 elements, V4 methodology +# with rDHS + 7,658 TF rPeak anchoring, 8 classification criteria, data +# access, references) +# +# coreCollection.html — Core collection description (18 biosamples, 4 assays, +# 8 cCRE classes with colors, data access with mm10 example commands) +# +# Adapted from hg38 versions with mm10-specific counts, assembly references, +# and corrected credits per data provider feedback. + +############################################################################## +# Step 5: Load trackDb +############################################################################## + +cd /cluster/home/lrnassar/kent/src/hg/makeDb/trackDb + +# Sandbox (personal testing): +make DBS=mm10 + +# Dev (hgwdev): +make alpha DBS=mm10 + +############################################################################## +# Step 6: Validation +############################################################################## + +# Data integrity checks: +# - Registry: 926,843 cCREs confirmed (all EM10E prefix) +# - All 18 core files: exactly 926,843 items each +# - Core-to-registry ID consistency: 100% (zero mismatches) +# - Zero overlapping elements in registry +# - Element sizes: 150-350 bp (mean 269, median 278) +# - Genome coverage: 249.5 Mb = 8.85% of mm10 +# - All 91 files pass bigBedInfo/bigWigInfo + +# Known cosmetic issues from data provider (not bugs): +# - autoSql table names say "hg38cCRE"/"hg38core_cCRE" for mm10 data +# - Z-score min of -10.00 for H3K4me3/H3K27ac/CTCF is sentinel for missing data +# - Registry class "CA" maps to core class "CA-only"; core adds "Low-DNase" + +############################################################################## +# Track hierarchy summary +############################################################################## + +# cCREs (superTrack, group=regulation) +# ├── encodeCcreCombined (ENCODE3 registry, reparented, snowflake pennant) +# ├── cCREregistry (ENCODE4 registry, bigBed 9+5, 926,843 cCREs) +# └── 
coreCcres (ENCODE4 core collection composite) +# ├── cCREs_view (18 bigBed 9+5 subtracks, one per biosample) +# ├── DNase_view (18 bigWig subtracks) +# ├── H3K4me3_view (18 bigWig subtracks) +# ├── H3K27ac_view (18 bigWig subtracks) +# └── CTCF_view (18 bigWig subtracks) + +# 18 biosamples: +# Cerebellum male adult 8w, CH12.LX, Forebrain postnatal 0d, +# Heart male adult 8w, Heart postnatal 0d, Hindbrain postnatal 0d, +# Kidney male adult 8w, Kidney postnatal 0d, Liver embryo E14.5, +# Liver male adult 8w, Liver postnatal 0d, Lung embryo E14.5, +# Lung postnatal 0d, MEL, Midbrain postnatal 0d, +# Spleen male adult 8w, Stomach postnatal 0d, Thymus male adult 8w + +# gbdb contents (/gbdb/mm10/encode4/ccre/): +# encodeCcreRegistry.bb -- registry (926,843 cCREs) +# coreCollection/ -- 18 .bb + 72 .bw = 90 files