7e9cf9d7dace5cd9a0519bee134364b87b86d832
lrnassar
  Fri Mar 27 12:01:25 2026 -0700
Updating makedocs for hg38 and mm10 ENCODE4 Regulation tracks to reflect all
changes from 2026-03-27: priorities, symlink renaming, color darkening,
colorSettingsUrl, longLabel cleanup, Data Access sections, HTML audit fixes,
code review fixes, and local ENCFF file cleanup (4.3 TB freed). refs #34923

diff --git src/hg/makeDb/doc/hg38/encode4.regulation.txt src/hg/makeDb/doc/hg38/encode4.regulation.txt
index b28ac58efbe..6845c8b31bd 100644
--- src/hg/makeDb/doc/hg38/encode4.regulation.txt
+++ src/hg/makeDb/doc/hg38/encode4.regulation.txt
@@ -1,18 +1,18 @@
 # ENCODE4 Integrated Regulation Track (wgEncodeReg4) for hg38
 # Redmine #34923
-# Lou Nassar, 2026-03-12
+# Lou Nassar, 2026-03-12 (updated 2026-03-27)
 
 # This track converts the ENCODE V4 Regulation hub into a native UCSC browser
 # supertrack containing organ-averaged multiWig signal tracks, TF rPeak clusters,
 # and individual experiment composites (bigComposite/faceted format) for
 # epigenetics, RNA-seq, and TF ChIP-seq.
 
 # The original hub was prepared by Mingshi Gao (Weng lab, UMass Chan Medical School):
 #   https://users.wenglab.org/gaomingshi/ENCODE_Reg/hub.txt
 # It was cloned locally for processing:
 #   /hive/data/outside/encode4/ccre/ENCODE_V4_Regulation/hub.txt (93K lines)
 
 # Scripts are located at:
 #   kent/src/hg/makeDb/scripts/encode4regulation/
 
 ##############################################################################
@@ -28,221 +28,259 @@
 # Total data: ~5.6 TB across ~7,000 files (bigWig + bigBed)
 
 ##############################################################################
 # Step 2: Create gbdb symlinks
 ##############################################################################
 
 # Only organ-averaged multiWig files and TF rPeak files need gbdb symlinks.
 # Individual experiment tracks use S3 URLs directly.
 
 # Organ-averaged multiWig signals (163 organ signals + 102 RNA strand files)
 mkdir -p /gbdb/hg38/encode4/regulation/organAve
 cd /hive/data/outside/encode4/ccre/ENCODE_V4_Regulation
 for f in $(grep -oP '^\S+\.bigWig' hub.txt | grep -v ENCFF | sort -u); do
     [ -f "$f" ] && ln -s $(pwd)/$f /gbdb/hg38/encode4/regulation/organAve/$f
 done
-# Also symlink strand-specific RNA files:
 for f in *.minus.bigWig *.plus.bigWig; do
     [ -f "$f" ] && ln -s $(pwd)/$f /gbdb/hg38/encode4/regulation/organAve/$f
 done
 # Total: 265 symlinks
 
+# Symlinks were renamed to UCSC convention (camelCase, .bw extension):
+#   ave.adipose.H3K27ac.bigWig -> adiposeH3K27ac.bw
+#   adrenal_gland.minus.bigWig -> adrenalGlandMinus.bw
+# The rename_symlinks.py script handles this and updates bigDataUrl in the .ra.
+
 # TF rPeak clusters (from existing ENCODEv4TFrPeaks bed)
 mkdir -p /gbdb/hg38/encode4/regulation/tfRpeak
 ln -s /hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks/no_trim.TF_name.rPeaks.bb \
     /gbdb/hg38/encode4/regulation/tfRpeak/TFrPeakClusters.bb
 ln -s /hive/data/genomes/hg38/bed/ENCODEv4TFrPeaks/no_trim.TF_name.decorator.bb \
     /gbdb/hg38/encode4/regulation/tfRpeak/TFrPeakClustersDecorator.bb
 
 # Final symlink counts:
 #   organAve/  265
 #   tfRpeak/     2
 #   Total:     267
 
 ##############################################################################
 # Step 3: Validate local files against ENCODE portal
 ##############################################################################
 
 # Run the validation script to verify md5sums of local files against the
 # ENCODE REST API and generate S3 URL mapping for bigComposite tracks.
 
 cd /hive/users/lrnassar/claude/RM34923
 python3 kent/src/hg/makeDb/scripts/encode4regulation/validate_encode_urls.py
 
 # This produces:
 #   encode4_url_mapping.tsv  — maps 6,747 accessions to S3 URLs with md5 validation
 #   encode4_validation.log   — detailed log of any mismatches
 # The mapping file is used by the bigComposite conversion script (Step 5).
+# All 6,747 S3 URLs verified accessible via bigWigInfo/bigBedInfo (0 failures).
 
 ##############################################################################
 # Step 4: Generate multiWig trackDb stanzas
 ##############################################################################
 
 # The multiWig organ-averaged tracks (H3K27ac, DNase, ATAC, H3K4me3, CTCF, Txn)
 # are generated from the hub.txt and assembled into the main wgEncodeReg4.ra.
 
 cd /hive/users/lrnassar/claude/RM34923
 python3 kent/src/hg/makeDb/scripts/encode4regulation/generate_multiwig_ra.py \
     > multiwig_output.ra
 
 # The output was manually integrated into wgEncodeReg4.ra along with:
-#   - SuperTrack definition (priority 1.5, group=regulation)
-#   - TF rPeak track (bigBed 12+ with decorator, 912-factor filterValues)
+#   - SuperTrack definition (priority 0.9, group=regulation)
+#   - TF rPeak track (bigBed 12+ with decorator, 911-factor filterValues)
 #   - Include directives for the 3 bigComposite files
 # The main file is hand-maintained:
 #   kent/src/hg/makeDb/trackDb/human/hg38/wgEncodeReg4.ra (~3,327 lines)
 
 # multiWig track details:
-#   wgEncodeReg4MarkH3k27ac — full visibility, 5 organs default ON
-#   wgEncodeReg4Dnase       — hidden
-#   wgEncodeReg4Atac        — hidden
-#   wgEncodeReg4MarkH3k4me3 — hidden
-#   wgEncodeReg4MarkCtcf    — hidden
-#   wgEncodeReg4Txn         — hidden
+#   wgEncodeReg4MarkH3k27ac — full visibility, 5 organs default ON (priority 1.4)
+#   wgEncodeReg4Dnase       — hidden (priority 1.1)
+#   wgEncodeReg4Atac        — hidden (priority 1.2)
+#   wgEncodeReg4MarkH3k4me3 — hidden (priority 1.3)
+#   wgEncodeReg4MarkCtcf    — hidden (priority 1.5)
+#   wgEncodeReg4Txn         — hidden (priority 1.6)
+#   wgEncodeReg4TfPeaks     — hidden (priority 1.7; bigBed 12+, not a multiWig)
 # Each multiWig has both tissue-only and all-biosamples variants as subtracks.
 
 ##############################################################################
 # Step 5: Generate bigComposite (faceted) individual experiment tracks
 ##############################################################################
 
 # The three individual experiment composites use the new bigComposite/faceted
 # format (refs #36320). This was a two-step process:
 #
 # Step 5a: Generate traditional composites from hub
 #   generate_composites.py parses hub.txt and creates compositeTrack-on-style
 #   .ra files with subGroups, views, dimensions, etc.
 #
 # Step 5b: Convert to bigComposite faceted format
 #   convert_to_bigcomposite.py reads those .ra files, strips subGroups/views,
 #   adds metaDataUrl + primaryKey, generates metadata TSVs, and uses S3 URLs
 #   from the validation mapping (Step 3).
 
-# First regenerate the traditional composites (if needed as intermediate):
 cd /hive/users/lrnassar/claude/RM34923
 python3 kent/src/hg/makeDb/scripts/encode4regulation/generate_composites.py
-
-# Then convert to bigComposite format:
 python3 kent/src/hg/makeDb/scripts/encode4regulation/convert_to_bigcomposite.py
 
 # This overwrites the .ra files in place and creates metadata TSVs:
 #   /gbdb/hg38/encode4/regulation/wgEncodeReg4Epigenetics_metadata.tsv
 #   /gbdb/hg38/encode4/regulation/wgEncodeReg4RnaSeq_metadata.tsv
 #   /gbdb/hg38/encode4/regulation/wgEncodeReg4TfChip_metadata.tsv
 #
 # Output .ra files (in kent/src/hg/makeDb/trackDb/human/hg38/):
-#   wgEncodeReg4Epigenetics.ra — 3,199 subtracks, facets: Assay, Organ,
-#                                 Biosample Type, Life Stage
-#   wgEncodeReg4RnaSeq.ra      — 1,046 subtracks, facets: Organ,
-#                                 Biosample Type, Life Stage, Strand
-#   wgEncodeReg4TfChip.ra      — 2,502 subtracks, facets: TF, Organ,
-#                                 Biosample Type, Life Stage
+#   wgEncodeReg4Epigenetics.ra — 3,199 subtracks (priority 2.0)
+#     Facets: Assay, Organ, Biosample Type, Life Stage
+#   wgEncodeReg4RnaSeq.ra      — 1,046 subtracks (priority 2.1)
+#     Facets: Organ, Biosample Type, Life Stage, Strand
+#   wgEncodeReg4TfChip.ra      — 2,502 subtracks (priority 2.2)
+#     Facets: TF, Organ, Biosample Type, Life Stage
 #
-# The "Biosample" column is prefixed with underscore (_Biosample) to hide
-# it from the faceted UI while retaining it as metadata (per Jonathan Casper's
-# recommendation, refs #36320).
+# Faceted UI features:
+#   - _Biosample column hidden from facets via underscore prefix (refs #36320)
+#   - All facet values capitalized (e.g., "Cell line", "Adult")
+#   - longLabels cleaned: underscores replaced with spaces for readability
+#   - colorSettingsUrl for facet color indicators:
+#     Epigenetics: colored by Assay (epi_colors.json)
+#     RnaSeq: colored by Organ (organ_colors.json)
+#     TfChip: no facet colors (uses score-based spectrum coloring)
 #
 # Default-ON subtracks (5 per composite):
 #   Epigenetics: untreated K562, one per assay (DNase, ATAC, H3K4me3, H3K27ac, CTCF)
 #   RNA-seq: K562 +/- strand, GM12878 +/- strand, HepG2 + strand
 #   TF ChIP: K562 CTCF, POLR2A, MYC, MAX, EP300
 #
 # All subtracks use S3 URLs (encode-public.s3.amazonaws.com) for bigDataUrl.
 
+##############################################################################
+# Step 5c: Track color adjustments
+##############################################################################
+
+# Several organ colors from the hub had poor contrast against a white background.
+# All colors with brightness > 160 (on the perceived brightness scale) were
+# darkened to a target brightness of 140 while preserving hue. This affects
+# the multiWig subtracks and the bigComposite subtracks. 22 unique colors
+# were adjusted across all .ra files.
+
 ##############################################################################
 # Step 6: Create HTML description pages
 ##############################################################################
 
-# 11 HTML files were created in kent/src/hg/makeDb/trackDb/human/hg38/:
+# 11 HTML files in kent/src/hg/makeDb/trackDb/human/hg38/:
 #   wgEncodeReg4.html                — SuperTrack overview
 #   wgEncodeReg4MarkH3k27ac.html     — H3K27ac layered signal
 #   wgEncodeReg4Dnase.html           — DNase layered signal
 #   wgEncodeReg4Atac.html            — ATAC layered signal
 #   wgEncodeReg4MarkH3k4me3.html     — H3K4me3 layered signal
 #   wgEncodeReg4MarkCtcf.html        — CTCF layered signal
 #   wgEncodeReg4Txn.html             — Transcription layered signal
 #   wgEncodeReg4TfPeaks.html         — TF rPeaks
 #   wgEncodeReg4Epigenetics.html     — Individual epigenetics composite
 #   wgEncodeReg4RnaSeq.html          — Individual RNA-seq composite
 #   wgEncodeReg4TfChip.html          — Individual TF ChIP composite
+#
+# Each layered track HTML includes an organ/tissue availability table
+# from the original hub. All HTMLs include Data Access sections with
+# bigWigToWig/bigBedToBed examples. Production lab credits match the
+# original hub per assay type.
 
 ##############################################################################
 # Step 7: Add related tracks and trackDb include
 ##############################################################################
 
 # Added to kent/src/hg/makeDb/trackDb/human/hg38/trackDb.ra:
 #   include wgEncodeReg4.ra alpha
 
 # Added reciprocal entries to relatedTracks.ra:
 #   hg38 wgEncodeReg4 wgEncodeReg ENCODE4 update of ENCODE3 Regulation
 #   hg38 wgEncodeReg wgEncodeReg4 ENCODE4 update of ENCODE3 Regulation
 #   hg38 wgEncodeReg4 cCREs Related ENCODE4 cCRE annotations
 #   hg38 cCREs wgEncodeReg4 Related ENCODE4 regulation data
 
 ##############################################################################
 # Step 8: Release tags (ENCODE3 transition)
 ##############################################################################
 
 # On alpha (dev): ENCODE4 visible, ENCODE3 hidden with snowflake
 # On beta/public: ENCODE3 visible as-is, ENCODE4 not visible
 #
 # Approach: Duplicate wgEncodeReg.ra into wgEncodeReg.alpha.ra with:
 #   - superTrack on hide (hidden by default)
 #   - pennantIcon snowflake.png (deprecation notice)
 #   - Inner includes tagged with release alpha
 # The original wgEncodeReg.ra gets inner includes tagged beta,public.
+# Key: the include lines inside these .ra files need release tags too, not only those in trackDb.ra.
 #
 # trackDb.ra includes:
 #   include wgEncodeReg.ra beta,public
 #   include wgEncodeReg.alpha.ra alpha
 #   include wgEncodeReg4.ra alpha
 
 ##############################################################################
 # Step 9: Load trackDb
 ##############################################################################
 
 cd /cluster/home/lrnassar/kent/src/hg/makeDb/trackDb
 
 # Sandbox (personal testing):
 make DBS=hg38
 
 # Dev (hgwdev):
 make alpha DBS=hg38
 
-# 20,658 track descriptions loaded
+##############################################################################
+# Step 10: Cleanup — delete local ENCFF files (now S3-served)
+##############################################################################
+
+# Individual experiment files (ENCFF*.bigWig, ENCFF*.bigBed) and ?proxy=TRUE
+# download artifacts were deleted from the source hub directory, since they
+# are now served via S3 URLs. Only the organ-averaged multiWig files
+# (referenced by gbdb symlinks) and hub.txt/HTML docs were preserved.
+#
+# hg38: Deleted 7,598 files (3.6 TB freed), 280 files remaining (2.8 TB)
+# Script: cleanup_local_files.py
 
 ##############################################################################
 # Track hierarchy summary
 ##############################################################################
 
-# wgEncodeReg4 (superTrack, priority 1.5, group=regulation)
-# ├── wgEncodeReg4MarkH3k27ac (multiWig, full, 5 organs ON)
-# ├── wgEncodeReg4Dnase       (multiWig, hide)
-# ├── wgEncodeReg4Atac        (multiWig, hide)
-# ├── wgEncodeReg4MarkH3k4me3 (multiWig, hide)
-# ├── wgEncodeReg4MarkCtcf    (multiWig, hide)
-# ├── wgEncodeReg4Txn         (multiWig, hide)
-# ├── wgEncodeReg4TfPeaks     (bigBed 12+ with decorator, hide)
-# ├── wgEncodeReg4Epigenetics (bigComposite faceted, 3,199 subtracks, 5 ON)
-# ├── wgEncodeReg4RnaSeq      (bigComposite faceted, 1,046 subtracks, 5 ON)
-# └── wgEncodeReg4TfChip      (bigComposite faceted, 2,502 subtracks, 5 ON)
+# Regulation group order (lower priority sorts first): cCREs (0.8), wgEncodeReg4 (0.9), wgEncodeReg (1)
+#
+# wgEncodeReg4 (superTrack, priority 0.9, group=regulation)
+# ├── wgEncodeReg4MarkH3k27ac (multiWig, full, 5 organs ON)    priority 1.4
+# ├── wgEncodeReg4Dnase       (multiWig, hide)                  priority 1.1
+# ├── wgEncodeReg4Atac        (multiWig, hide)                  priority 1.2
+# ├── wgEncodeReg4MarkH3k4me3 (multiWig, hide)                  priority 1.3
+# ├── wgEncodeReg4MarkCtcf    (multiWig, hide)                  priority 1.5
+# ├── wgEncodeReg4Txn         (multiWig, hide)                  priority 1.6
+# ├── wgEncodeReg4TfPeaks     (bigBed 12+ with decorator, hide) priority 1.7
+# ├── wgEncodeReg4Epigenetics (bigComposite faceted, 3,199)     priority 2.0
+# ├── wgEncodeReg4RnaSeq      (bigComposite faceted, 1,046)     priority 2.1
+# └── wgEncodeReg4TfChip      (bigComposite faceted, 2,502)     priority 2.2
 
 # gbdb contents (/gbdb/hg38/encode4/regulation/):
-#   organAve/                          — 265 multiWig symlinks
-#   tfRpeak/                           — 2 TF rPeak symlinks
+#   organAve/                          — 265 multiWig symlinks (camelCase .bw)
+#   tfRpeak/                           — 2 TF rPeak symlinks (.bb)
 #   wgEncodeReg4Epigenetics_metadata.tsv
 #   wgEncodeReg4RnaSeq_metadata.tsv
 #   wgEncodeReg4TfChip_metadata.tsv
+#   epi_colors.json                    — Assay facet colors for Epigenetics
+#   organ_colors.json                  — Organ facet colors for RNA-seq
 
 ##############################################################################
 # Known upstream hub issues (reported to Weng lab)
 ##############################################################################
 
 # Report at:
 #   https://hgwdev.gi.ucsc.edu/~lrnassar/temp/encode4_regulation_hub_issues.md
 #
 # 1. Duplicate bigDataUrl — tissue-only and all-biosamples variants share
 #    same file for 5 non-RNA assays (83 pairs, 166 subtracks)
 # 2. tp. prefix inconsistency — RNA correctly uses tp. prefix; other assays
 #    don't, causing issue 1
 # 3. "paraythroid" typo — should be "parathyroid"; affects 3 assays,
 #    3 filenames, 21 hub lines
 # 4. Bad track name: ATAC_ENCFF128Muscle (should be ATAC_ENCFF128OID) —