2197cb4ee08a947521c3d4204d2fc37ee4ab18fe
lrnassar
  Mon Nov 3 14:57:41 2025 -0800
ENCODE4 cCREs makedoc, refs #34923

diff --git src/hg/makeDb/doc/hg38/encode4.cCREs.txt src/hg/makeDb/doc/hg38/encode4.cCREs.txt
new file mode 100644
index 00000000000..c75b3b57059
--- /dev/null
+++ src/hg/makeDb/doc/hg38/encode4.cCREs.txt
@@ -0,0 +1,603 @@
+# Original hub was prepared as a hub: https://users.wenglab.org/gaomingshi/ENCODE_Reg/hub.txt
+
+# Cloned it locally to work with: /cluster/home/lrnassar/public_html/track_hubs/ENCODE4/ENCODE_V4_Regulation/hub.txt
+
+# Cloned it with -download into this dir to process the data: /hive/data/outside/encode4/ccre/
+
+# The data were then processed using 3 AI-created scripts:
+# 1. Copy the files from the hubClone dir (/hive/data/outside/encode4/ccre/), renaming them and placing them in the correct dirs
+# 2. Script to process ENCODE hub.txt file and create trackDb RA file with various transformations applied, linking to the gbdb locations
+# 3. Script to restructure trackDb.ra with view containers. Converts fileType subgroup to view subgroup and creates view parent tracks.
+
+# Some small edits were made after the trackDb.ra file was made, such as removing newlines. See the ticket for details.
+
+# Make all symlinks (two commands):
+# ln -s /hive/data/outside/encode4/ccre/human/coreCollection/* /gbdb/hg38/encode4/ccre/coreCollection/
+# ln -s /hive/data/outside/encode4/ccre/human/encode4CcreCombined.bb /gbdb/hg38/encode4/ccre/encode4CcreCombined.bb
+
+# The three python scripts will be pasted below in order:
+
+#!/usr/bin/env python3
+"""
+Script to process ENCODE hub.txt file and copy bigBed/bigWig files
+with appropriate renaming and destination directories.
+"""
+
+import os
+import shutil
+import re
+from pathlib import Path
+
def extract_filename_from_url(url):
    """Return the last path component of *url*, with any query string
    ('?...') or fragment ('#...') removed.

    e.g. 'https://host/a/file.bigWig?proxy=TRUE' -> 'file.bigWig'
    """
    url = url.split('?')[0].split('#')[0]
    return url.rstrip('/').split('/')[-1]


def _copy_one(source_file, dest_file):
    """Copy source_file to dest_file with progress output.

    Returns 1 if the copy happened, 0 if the source was missing (a
    missing source is only a warning, matching the original behavior).
    """
    if not os.path.exists(source_file):
        print(f"Warning: Source file not found: {source_file}")
        return 0
    # Fixed: the original f-strings printed the literal text "(unknown)"
    # instead of the actual source file name.
    print(f"Copying {os.path.basename(source_file)} -> {os.path.basename(dest_file)}")
    shutil.copy2(source_file, dest_file)
    return 1


def process_hub_file(hub_file_path,
                     source_dir="/hive/data/outside/encode4/ccre/ENCODE_V4_Regulation/",
                     dest_dir_human="/hive/data/outside/encode4/ccre/human/",
                     dest_dir_core="/hive/data/outside/encode4/ccre/human/coreCollection/"):
    """Copy the data files referenced by *hub_file_path* into staging dirs.

    Scans the hub file for 'bigDataUrl' lines and applies three rules:
      1. GRCh38-cCREs.annotated.bigBed -> <dest_dir_human>/encode4CcreCombined.bb
      2. any other *.bigBed            -> <dest_dir_core>/<name>.bb
      3. any bigWig URL                -> <dest_dir_core>/<name>.bw

    The directory arguments default to the original hard-coded hive paths
    but may be overridden (e.g. for testing).  Destination directories are
    created if absent.  Returns the number of files actually copied.
    """
    os.makedirs(dest_dir_human, exist_ok=True)
    os.makedirs(dest_dir_core, exist_ok=True)

    files_copied = 0
    with open(hub_file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line.startswith('bigDataUrl'):
                continue
            parts = line.split(None, 1)  # 'bigDataUrl' + the URL
            if len(parts) < 2:
                continue
            url = parts[1]
            filename = extract_filename_from_url(url)

            if filename == "GRCh38-cCREs.annotated.bigBed":
                # Rule 1: the combined cCRE track gets a fixed name.
                files_copied += _copy_one(
                    os.path.join(source_dir, filename),
                    os.path.join(dest_dir_human, "encode4CcreCombined.bb"))
            elif filename.endswith('.bigBed'):
                # Rule 2: per-biosample bigBeds, renamed *.bigBed -> *.bb.
                files_copied += _copy_one(
                    os.path.join(source_dir, filename),
                    os.path.join(dest_dir_core,
                                 filename.replace('.bigBed', '.bb')))
            elif 'bigWig' in url:
                # Rule 3: signal bigWigs, renamed *.bigWig[?...] -> *.bw.
                new_filename = re.sub(r'\.bigWig(\?.*)?$', '.bw', filename)
                files_copied += _copy_one(
                    os.path.join(source_dir, filename),
                    os.path.join(dest_dir_core, new_filename))

    return files_copied
+
def main():
    """Entry point: copy all hub-referenced files into the staging dirs.

    Returns 0 on success, 1 when the hub file is missing or processing
    raised an exception.
    """
    hub_file = "/cluster/home/lrnassar/public_html/track_hubs/ENCODE4/ENCODE_V4_Regulation/hub.txt"

    print("Processing hub.txt file...")
    print(f"Hub file: {hub_file}")
    print("-" * 60)

    # Guard clause: bail out early if the hub file is not there.
    if not os.path.exists(hub_file):
        print(f"Error: Hub file not found at {hub_file}")
        return 1

    try:
        total_copied = process_hub_file(hub_file)
    except Exception as err:
        print(f"Error processing file: {err}")
        import traceback
        traceback.print_exc()
        return 1

    print("-" * 60)
    print(f"\nTotal files copied: {total_copied}")
    return 0


if __name__ == "__main__":
    exit(main())
+
+########################################
+
+#!/usr/bin/env python3
+"""
+Script to process ENCODE hub.txt file and create trackDb RA file
+with various transformations applied.
+"""
+
+import os
+import re
+
def extract_filename_from_url(url):
    """Return the trailing path component of *url*, with any query
    string or fragment stripped off."""
    for marker in ('?', '#'):
        url = url.partition(marker)[0]
    return url.rstrip('/').rpartition('/')[-1]
+
def capitalize_after_equals(text):
    """Return *text* with the first character following every '=' sign
    upper-cased; everything before the first '=' is left untouched."""
    pieces = text.split('=')
    head, tail = pieces[:1], pieces[1:]
    bumped = [p[0].upper() + p[1:] if p else p for p in tail]
    return '='.join(head + bumped)
+
def transform_line(line):
    """Apply the hub.txt -> trackDb RA rewrite rules to one line.

    The rules are order-sensitive: the capitalize-after-equals passes must
    run before the subGroup character cleanup, because the cleanup would
    rewrite the '/' in 'Organ/Tissue' that the second prefix test matches
    on.  Returns the (possibly rewritten) line.
    """

    # Rule: Replace "Core-cCREs" with "coreCcres" everywhere
    line = line.replace('Core-cCREs', 'coreCcres')

    # Rule: subGroup2 biosampleType line - capitalize first, before cleaning
    if line.startswith('subGroup2 biosampleType Biosample_type'):
        line = capitalize_after_equals(line)

    # Rule: subGroup1 organ line - capitalize first, before cleaning
    if line.startswith('subGroup1 organ Organ/Tissue'):
        line = capitalize_after_equals(line)

    # Rule: Clean subGroup lines - replace commas with dashes and replace all
    # non-conforming characters with underscores.
    # Valid characters are: a-z, A-Z, 0-9, _, - (plus whitespace and '=')
    # Special case: μ (mu) becomes 'u'
    # Everything else (dots, parentheses, slashes, other non-ASCII, etc.)
    # becomes an underscore.  This must happen AFTER capitalization.
    if line.startswith('subGroup'):
        line = line.replace(',', '-')  # Replace commas with dashes
        line = line.replace('μ', 'u')  # Replace mu with u
        # Keep ASCII alphanumerics, underscore, dash, whitespace and '=';
        # map every other character to '_'.
        cleaned = []
        for char in line:
            # Check if it's ASCII alphanumeric (a-z, A-Z, 0-9)
            if (char >= 'a' and char <= 'z') or (char >= 'A' and char <= 'Z') or (char >= '0' and char <= '9'):
                cleaned.append(char)
            elif char in ('_', '-', ' ', '\t', '='):
                cleaned.append(char)
            else:
                cleaned.append('_')
        line = ''.join(cleaned)

    # Rule: type bigBed 9+1 -> type bigBed 9 + 2
    # NOTE(review): presumably the staged .bb files carry one more field
    # than the hub declares -- confirm against the bigBed headers.
    if line.strip() == 'type bigBed 9+1':
        return 'type bigBed 9 + 2'

    # Rule: bigDataUrl with bigBed -> rewrite to the local /gbdb path,
    # renaming *.bigBed to *.bb to match the copied files
    if line.startswith('bigDataUrl') and line.endswith('.bigBed'):
        parts = line.split(None, 1)  # Split on whitespace, max 1 split
        if len(parts) == 2:
            url = parts[1]
            filename = extract_filename_from_url(url)
            new_filename = filename.replace('.bigBed', '.bb')
            return f'bigDataUrl /gbdb/hg38/encode4/ccre/coreCollection/{new_filename}'

    # Rule: bigDataUrl with bigWig -> rewrite to the local /gbdb path,
    # stripping '.bigWig' plus any trailing query string and adding '.bw'
    if line.startswith('bigDataUrl') and 'bigWig' in line:
        parts = line.split(None, 1)  # Split on whitespace, max 1 split
        if len(parts) == 2:
            url = parts[1]
            filename = extract_filename_from_url(url)
            # Remove .bigWig and any query parameters, add .bw
            new_filename = re.sub(r'\.bigWig.*$', '.bw', filename)
            return f'bigDataUrl /gbdb/hg38/encode4/ccre/coreCollection/{new_filename}'

    # Rule: visibility squish -> visibility pack
    if line.strip() == 'visibility squish':
        return 'visibility pack'

    # No transformation needed
    return line
+
def process_hub_file(input_file, output_file, skip_lines=32):
    """Transform a hub.txt into trackDb RA text, line by line.

    Skips the first *skip_lines* lines (the hub/genome boilerplate before
    the first track stanza in this particular hub -- previously a
    hard-coded 32), passes each remaining line through transform_line(),
    and writes the result to *output_file*.

    Returns a tuple (lines_written, lines_skipped, lines_transformed).
    """
    lines_written = 0
    lines_skipped = 0
    lines_transformed = 0

    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line_num, line in enumerate(infile, 1):
            # Drop the hub-level header block.
            if line_num <= skip_lines:
                lines_skipped += 1
                continue

            # Strip the trailing newline so transform_line sees clean text.
            original = line.rstrip('\n')
            transformed = transform_line(original)

            # Count lines that a rule actually changed.
            if transformed != original:
                lines_transformed += 1

            outfile.write(transformed + '\n')
            lines_written += 1

    return lines_written, lines_skipped, lines_transformed
+
def main():
    """Entry point: convert the ENCODE hub file into a trackDb RA file.

    Returns 0 on success, 1 when the input is missing or processing fails.
    """
    input_file = "/cluster/home/lrnassar/public_html/track_hubs/ENCODE4/ENCODE_V4_Regulation/hub.txt"
    output_file = "/cluster/home/lrnassar/kent/src/hg/makeDb/trackDb/human/hg38/encode4.ccres.ra"

    print("Processing hub.txt file...")
    print(f"Input file:  {input_file}")
    print(f"Output file: {output_file}")
    print("-" * 60)

    # Guard clause: nothing to do without the input hub.
    if not os.path.exists(input_file):
        print(f"Error: Input file not found at {input_file}")
        return 1

    # Make sure the destination directory exists before writing.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    try:
        written, skipped, transformed = process_hub_file(input_file, output_file)
    except Exception as err:
        print(f"Error processing file: {err}")
        import traceback
        traceback.print_exc()
        return 1

    print("-" * 60)
    print(f"\nLines skipped (first 32): {skipped}")
    print(f"Lines written: {written}")
    print(f"Lines transformed: {transformed}")
    print(f"\nOutput written to: {output_file}")
    return 0


if __name__ == "__main__":
    exit(main())
+
+#####################################
+
+#!/usr/bin/env python3
+"""
+Script to restructure ENCODE hub.txt with view containers.
+Converts fileType subgroup to view subgroup and creates view parent tracks.
+"""
+
+import os
+import re
+from collections import OrderedDict
+
def extract_filename_from_url(url):
    """Strip query/fragment from *url* and return its last path component."""
    base = url.split('?', 1)[0].split('#', 1)[0].rstrip('/')
    return base.rsplit('/', 1)[-1]
+
def capitalize_after_equals(text):
    """Return *text* with the first character after every '=' upper-cased."""
    segments = text.split('=')
    rebuilt = segments[:1]
    for segment in segments[1:]:
        rebuilt.append(segment[0].upper() + segment[1:] if segment else segment)
    return '='.join(rebuilt)
+
def clean_subgroup_line(line):
    """Normalize a subGroup line to trackDb-safe characters.

    Commas become dashes, the micro sign becomes 'u', and every other
    character outside [A-Za-z0-9_- \t=] becomes an underscore.
    """
    line = line.replace(',', '-').replace('μ', 'u')
    # One regex pass instead of the char-by-char loop; same allowed set.
    return re.sub(r'[^A-Za-z0-9_\- \t=]', '_', line)
+
def clean_long_label(label):
    """Return *label* with underscores turned into spaces and the first
    character upper-cased (the rest of the string is left as-is)."""
    spaced = label.replace('_', ' ')
    if not spaced:
        return spaced
    return spaced[0].upper() + spaced[1:]
+
def transform_basic_line(line):
    """Apply the per-line hub -> RA transformations (view-restructuring
    variant of script 2's transform_line).

    Adds longLabel cleanup and the fileType -> view subgroup rename on top
    of the shared rules.  Order matters: capitalization must run before
    clean_subgroup_line, which would otherwise rewrite the '/' the organ
    prefix test matches on.
    """
    # Replace "Core-cCREs" with "coreCcres"
    line = line.replace('Core-cCREs', 'coreCcres')

    # Clean longLabel formatting (underscores -> spaces, capitalize first)
    if line.startswith('longLabel '):
        parts = line.split(None, 1)  # Split into 'longLabel' and the rest
        if len(parts) == 2:
            cleaned_label = clean_long_label(parts[1])
            line = f'longLabel {cleaned_label}'

    # Capitalize biosampleType values
    if line.startswith('subGroup2 biosampleType Biosample_type'):
        line = capitalize_after_equals(line)

    # Capitalize organ values
    if line.startswith('subGroup1 organ Organ/Tissue'):
        line = capitalize_after_equals(line)

    # Clean subGroup lines (commas -> dashes, μ -> u, other chars -> _)
    if line.startswith('subGroup'):
        line = clean_subgroup_line(line)

    # Change subGroup3 fileType to view
    if line.startswith('subGroup3 fileType'):
        # Replace fileType with view and add _view suffix to each value
        line = line.replace('subGroup3 fileType File_Type', 'subGroup3 view Views')
        # Add _view suffix to the tag part (before =)
        # Pattern: word=word -> word_view=word
        line = re.sub(r'(\w+)=(\w+)', r'\1_view=\2', line)
        # Defensive undo in case the regex touched the keyword tokens;
        # for the expected line shape these appear to be no-ops.
        line = line.replace('subGroup3_view', 'subGroup3')
        line = line.replace('view_view', 'view')
        line = line.replace('Views_view', 'Views')

    # Change type bigBed 9+1 to type bigBed 9 + 2
    # NOTE(review): presumably the staged .bb files carry one more field
    # than the hub declares -- confirm against the bigBed headers.
    if line.strip() == 'type bigBed 9+1':
        return 'type bigBed 9 + 2'

    # Transform bigDataUrl with bigBed: point at the local /gbdb symlink
    # location, renaming *.bigBed -> *.bb to match the copied files
    if line.startswith('bigDataUrl') and line.endswith('.bigBed'):
        parts = line.split(None, 1)
        if len(parts) == 2:
            url = parts[1]
            filename = extract_filename_from_url(url)
            new_filename = filename.replace('.bigBed', '.bb')
            return f'bigDataUrl /gbdb/hg38/encode4/ccre/coreCollection/{new_filename}'

    # Transform bigDataUrl with bigWig (strips '.bigWig' and any trailing
    # query string, appends '.bw')
    if line.startswith('bigDataUrl') and 'bigWig' in line:
        parts = line.split(None, 1)
        if len(parts) == 2:
            url = parts[1]
            filename = extract_filename_from_url(url)
            new_filename = re.sub(r'\.bigWig.*$', '.bw', filename)
            return f'bigDataUrl /gbdb/hg38/encode4/ccre/coreCollection/{new_filename}'

    # Change visibility squish to visibility pack
    if line.strip() == 'visibility squish':
        return 'visibility pack'

    return line
+
def parse_track_stanza(lines):
    """Parse a track stanza into an ordered key -> value mapping.

    The first whitespace-separated word of each non-indented line is the
    key; the remainder is the value.  A line starting with a space is
    appended (newline-joined) to the most recently seen key's value.
    Blank lines, and lines with no space at all, are ignored.
    """
    stanza = OrderedDict()
    last_key = None

    for raw in lines:
        text = raw.rstrip('\n')
        if not text.strip():
            continue
        if text.startswith(' '):
            # Continuation line: fold into the previous key's value.
            if last_key:
                stanza[last_key] += '\n' + text
            continue
        if ' ' not in text:
            # Bare keyword with no value: dropped, as before.
            continue
        parts = text.split(None, 1)
        stanza[parts[0]] = parts[1] if len(parts) > 1 else ''
        last_key = parts[0]

    return stanza
+
def extract_filetype_from_subgroups(subgroups_line):
    """Return the value of the first 'fileType=<word>' in *subgroups_line*,
    or None when the line is empty or has no such pair."""
    if subgroups_line:
        hit = re.search(r'fileType=(\w+)', subgroups_line)
        if hit:
            return hit.group(1)
    return None
+
def process_hub_file(input_file, output_file):
    """Process the hub.txt file and restructure with views.

    Reads the whole hub, splits it into 'track ...' stanzas, transforms
    each line via transform_basic_line(), then writes the composite stanza
    followed by one synthetic view parent per fileType value, with that
    view's member tracks reparented underneath it.
    """
    
    with open(input_file, 'r') as f:
        lines = f.readlines()
    
    # Skip first 32 lines entirely (the hub/genome boilerplate before the
    # first track stanza in this particular hub).
    # Process remaining lines into track stanzas.
    current_stanza_lines = []
    stanzas = []
    
    for i in range(32, len(lines)):
        line = lines[i].rstrip('\n')
        
        # A new 'track ' line closes the previous stanza.
        if line.startswith('track ') and current_stanza_lines:
            # Save previous stanza
            stanzas.append(current_stanza_lines)
            current_stanza_lines = [line]
        else:
            current_stanza_lines.append(line)
    
    # Don't forget the last stanza
    if current_stanza_lines:
        stanzas.append(current_stanza_lines)
    
    # Organize tracks by view
    composite_stanza = None
    tracks_by_view = {}  # {view_name: [list of track stanzas]}
    view_types = {}  # {view_name: 'bigBed' or 'bigWig'}
    
    for stanza_lines in stanzas:
        # Anything collected before the first 'track ' line is not a real
        # stanza; drop it here.
        if not stanza_lines or not stanza_lines[0].startswith('track '):
            continue
        
        # Transform all lines in the stanza while noting its metadata.
        transformed_lines = []
        filetype = None
        track_type = None
        is_composite = False
        
        for line in stanza_lines:
            # Skip itemRgb and priority lines (intentionally dropped from
            # the output).
            if line.startswith('itemRgb ') or line.startswith('priority '):
                continue
                
            transformed_line = transform_basic_line(line)
            transformed_lines.append(transformed_line)
            
            # Check if composite
            if 'compositeTrack' in line:
                is_composite = True
            
            # Extract fileType from subGroups
            if line.startswith('subGroups '):
                match = re.search(r'fileType=(\w+)', line)
                if match:
                    filetype = match.group(1)
            
            # Extract type
            if line.startswith('type '):
                if 'bigBed' in line:
                    track_type = 'bigBed'
                elif 'bigWig' in line:
                    track_type = 'bigWig'
        
        if is_composite:
            composite_stanza = transformed_lines
        elif filetype:
            # Add to the appropriate view group; a stanza with neither
            # compositeTrack nor a fileType subgroup is silently dropped.
            if filetype not in tracks_by_view:
                tracks_by_view[filetype] = []
                # view_types records the first track's type per view; it is
                # collected here but never read again below.
                view_types[filetype] = track_type if track_type else 'bigBed'
            tracks_by_view[filetype].append(transformed_lines)
    
    # Write output
    with open(output_file, 'w') as f:
        # Write composite stanza first
        if composite_stanza:
            for line in composite_stanza:
                f.write(line + '\n')
            f.write('\n')
        
        # Write each view and its children, in dict insertion order, i.e.
        # the order the fileTypes were first seen in the hub.
        for view_name in tracks_by_view.keys():
            view_name_with_suffix = f'{view_name}_view'
            
            # Write view parent stanza
            f.write(f'    track {view_name_with_suffix}\n')
            f.write(f'    view {view_name_with_suffix}\n')
            f.write(f'    parent coreCcres\n')
            f.write(f'    shortLabel {view_name}\n')
            
            # The cCREs view shows packed bigBeds; every other view is
            # treated as a dense bigWig signal view.
            if view_name == 'cCREs':
                f.write(f'    visibility pack\n')
                f.write(f'    type bigBed\n')
            else:
                f.write(f'    visibility dense\n')
                f.write(f'    type bigWig\n')
            
            f.write('\n')
            
            # Write all tracks for this view
            for track_lines in tracks_by_view[view_name]:
                for line in track_lines:
                    if line.startswith('parent '):
                        # Change parent to the view
                        f.write(f'parent {view_name_with_suffix}\n')
                    elif line.startswith('subGroups '):
                        # Replace fileType=X with view=X_view
                        modified_line = re.sub(r'fileType=(\w+)', r'view=\1_view', line)
                        f.write(modified_line + '\n')
                    else:
                        f.write(line + '\n')
                
                f.write('\n')
    
    print(f"Created {len(tracks_by_view)} view containers: {', '.join(tracks_by_view.keys())}")
    total_tracks = sum(len(tracks) for tracks in tracks_by_view.values())
    print(f"Organized {total_tracks} tracks into views")
+
def main():
    """Entry point: run the view-restructuring hub -> RA conversion.

    Returns 0 on success, 1 when the input is missing or processing fails.
    """
    input_file = "/cluster/home/lrnassar/public_html/track_hubs/ENCODE4/ENCODE_V4_Regulation/hub.txt"
    output_file = "/cluster/home/lrnassar/kent/src/hg/makeDb/trackDb/human/hg38/encode4.ccres.ra"

    print("Processing hub.txt file with view restructuring...")
    print(f"Input file:  {input_file}")
    print(f"Output file: {output_file}")
    print("-" * 60)

    # Guard clause: nothing to do without the input hub.
    if not os.path.exists(input_file):
        print(f"Error: Input file not found at {input_file}")
        return 1

    # Make sure the destination directory exists before writing.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    try:
        process_hub_file(input_file, output_file)
    except Exception as err:
        print(f"Error processing file: {err}")
        import traceback
        traceback.print_exc()
        return 1

    print("-" * 60)
    print(f"\nOutput written to: {output_file}")
    return 0


if __name__ == "__main__":
    exit(main())