98004ba10f89d74192132ac37d9be95ebfdb3d0e
lrnassar
  Mon Mar 9 16:13:35 2026 -0700
Adding MLQ automate script to the tree, this runs on a qateam crontab and checks for any updated or new MLQs, accepts pending questions that pass a spam check, and creates/updates redmine tickets. It has been running for months on my personal crontab and is now being migrated to qateam. Refs #36801

diff --git src/utils/qa/mlqAutomate.py src/utils/qa/mlqAutomate.py
new file mode 100755
index 00000000000..748fee08c08
--- /dev/null
+++ src/utils/qa/mlqAutomate.py
@@ -0,0 +1,1901 @@
+#!/usr/bin/env python3
+"""
+MLQ Automation Script
+Monitors Gmail for mailing list emails, moderates pending messages,
+and creates/updates Redmine tickets.
+"""
+
+import os
+import sys
+import argparse
+import base64
+import re
+import logging
+import html as html_module
+import time
+from datetime import datetime, timedelta
+from difflib import SequenceMatcher
+import email
+from email import policy
+from email.utils import parseaddr
+from email.mime.text import MIMEText
+from functools import wraps
+import pytz
+import requests
+from google.oauth2.credentials import Credentials
+from google_auth_oauthlib.flow import InstalledAppFlow
+from google.auth.transport.requests import Request
+from googleapiclient.discovery import build
+import anthropic
+
+# Configuration
+CONFIG = {
+    'REDMINE_URL': 'https://redmine.gi.ucsc.edu',
+    'REDMINE_API_KEY': '',
+    'REDMINE_PROJECT': 'maillists',
+    'CALENDAR_ID': 'ucsc.edu_anbl4254jlssgo3gc2l5c8un5c@group.calendar.google.com',
+    'CLAUDE_API_KEY': '',
+
+    # Mailing lists
+    'MODERATED_LISTS': ['genome@soe.ucsc.edu', 'genome-mirror@soe.ucsc.edu'],
+    'UNMODERATED_LISTS': ['genome-www@soe.ucsc.edu'],
+
+    # Name mapping from calendar to Redmine
+    'NAME_MAPPING': {
+        'Jairo': 'Jairo Navarro',
+        'Lou': 'Lou Nassar',
+        'Gerardo': 'Gerardo Perez',
+        'Gera': 'Gerardo Perez',
+        'Clay': 'Clay Fischer',
+        'Matt': 'Matt Speir',
+    },
+
+    # Redmine User IDs
+    'USER_IDS': {
+        'Jairo Navarro': 163,
+        'Lou Nassar': 171,
+        'Gerardo Perez': 179,
+        'Clay Fischer': 161,
+        'Matt Speir': 150,
+    },
+
+    # Redmine field IDs
+    'TRACKER_ID': 7,        # MLQ
+    'PRIORITY_ID': 12,      # Unprioritized
+    'STATUS_ID': 1,         # New
+    'CUSTOM_FIELDS': {
+        'MLQ Category - primary': 28,
+        'Email': 40,
+        'MLM': 9,
+    },
+}
+
+SCOPES = [
+    'https://www.googleapis.com/auth/gmail.readonly',
+    'https://www.googleapis.com/auth/gmail.send',
+    'https://www.googleapis.com/auth/gmail.modify',
+    'https://www.googleapis.com/auth/calendar.readonly',
+]
+
+MLQ_CATEGORIES = [
+    "Other", "Alignments", "BLAT", "Bug Report", "CAPTCHA", "Command-line Utilities",
+    "Conservation", "Custom Track", "Data - Availability (when)", "Data - Interpretation (what)",
+    "Data - Location (where)", "Data Contribution", "Data Integrator", "Data Requests", "dbSNP",
+    "Downloads", "ENCODE", "External Tools", "Feature Request", "GBiB", "GBiC",
+    "Gene Interactions (hgGeneGraph)", "Gene Tracks", "Help Docs (Info)", "Hubs", "IP blocked",
+    "JSON hubApi", "Licenses", "LiftOver", "Login", "Mirror - Asia", "Mirror - Europe",
+    "Mirror Site & Utilities", "Multi-region", "MySQL", "PCR", "Publications & Citing",
+    "Sessions", "Slow Performance", "Table Browser", "Track Collection Builder", "User Accounts",
+    "Variant Annotation Integrator", "Widget"
+]
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PST = pytz.timezone('America/Los_Angeles')
+DRY_RUN = False
+
+# Setup logging
+LOG_FILE = os.environ.get('MLQ_LOG_FILE', os.path.join(SCRIPT_DIR, 'mlq_automate.log'))
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s [%(levelname)s] %(message)s',
+    handlers=[
+        logging.FileHandler(LOG_FILE),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+logger = logging.getLogger(__name__)
+
+# Suppress httpx INFO logs (Anthropic client HTTP request logging)
+logging.getLogger('httpx').setLevel(logging.WARNING)
+
+# Config file path
+MLQ_CONF_PATH = os.path.expanduser('~/.hg.conf')
+
+
+def load_config_file():
+    """
+    Load configuration from ~/.hg.conf file.
+    Uses the standard UCSC kent configuration file.
+    Requires 600 permissions for security.
+    """
+    if not os.path.exists(MLQ_CONF_PATH):
+        logger.error(f"Configuration file not found: {MLQ_CONF_PATH}")
+        logger.error("Please add the following keys to ~/.hg.conf:")
+        logger.error("  redmine.apiKey=YOUR_REDMINE_API_KEY")
+        logger.error("  claude.apiKey=YOUR_CLAUDE_API_KEY")
+        sys.exit(1)
+
+    # Check file permissions (must be 600 for security)
+    file_stat = os.stat(MLQ_CONF_PATH)
+    file_mode = file_stat.st_mode & 0o777
+    if file_mode != 0o600:
+        logger.error(f"Configuration file {MLQ_CONF_PATH} has insecure permissions: {oct(file_mode)}")
+        logger.error("For security, this file must have 600 permissions.")
+        logger.error(f"Run: chmod 600 {MLQ_CONF_PATH}")
+        sys.exit(1)
+
+    # Parse key=value pairs
+    config_values = {}
+    with open(MLQ_CONF_PATH, 'r') as f:
+        for line_num, line in enumerate(f, 1):
+            line = line.strip()
+            # Skip empty lines and comments
+            if not line or line.startswith('#'):
+                continue
+            if '=' not in line:
+                logger.warning(f"Skipping invalid line {line_num} in {MLQ_CONF_PATH}: {line}")
+                continue
+            key, value = line.split('=', 1)
+            config_values[key.strip()] = value.strip()
+
+    # Map config file keys to CONFIG dict
+    key_mapping = {
+        'redmine.apiKey': 'REDMINE_API_KEY',
+        'claude.apiKey': 'CLAUDE_API_KEY',
+    }
+
+    for conf_key, config_key in key_mapping.items():
+        if conf_key in config_values:
+            CONFIG[config_key] = config_values[conf_key]
+
+    # Validate required keys
+    if not CONFIG['REDMINE_API_KEY']:
+        logger.error(f"Missing redmine.apiKey in {MLQ_CONF_PATH}")
+        sys.exit(1)
+    if not CONFIG['CLAUDE_API_KEY']:
+        logger.error(f"Missing claude.apiKey in {MLQ_CONF_PATH}")
+        sys.exit(1)
+
+    logger.info(f"Loaded configuration from {MLQ_CONF_PATH}")
+
+
+def retry(max_attempts=3, delay=2, backoff=2, exceptions=(Exception,)):
+    """Retry decorator with exponential backoff."""
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            attempts = 0
+            current_delay = delay
+            while attempts < max_attempts:
+                try:
+                    return func(*args, **kwargs)
+                except exceptions as e:
+                    attempts += 1
+                    if attempts == max_attempts:
+                        logger.error(f"{func.__name__} failed after {max_attempts} attempts: {e}")
+                        raise
+                    logger.warning(f"{func.__name__} attempt {attempts} failed: {e}. Retrying in {current_delay}s...")
+                    time.sleep(current_delay)
+                    current_delay *= backoff
+        return wrapper
+    return decorator
+
+
+def get_google_credentials():
+    """Get or refresh Google API credentials."""
+    creds = None
+    token_path = os.path.expanduser('~/.gmail_token.json')
+    creds_path = os.path.expanduser('~/.gmail_credentials.json')
+
+    if os.path.exists(token_path):
+        creds = Credentials.from_authorized_user_file(token_path, SCOPES)
+
+    if not creds or not creds.valid:
+        if creds and creds.expired and creds.refresh_token:
+            creds.refresh(Request())
+        else:
+            flow = InstalledAppFlow.from_client_secrets_file(creds_path, SCOPES)
+            creds = flow.run_local_server(port=8080)
+        with open(token_path, 'w') as f:
+            f.write(creds.to_json())
+
+    return creds
+
+
+def get_current_mlm():
+    """Get the current MLM based on PST time rules."""
+    creds = get_google_credentials()
+    service = build('calendar', 'v3', credentials=creds, cache_discovery=False)
+
+    now = datetime.now(PST)
+    target_date = now
+
+    # After 5pm, use next day's MLM
+    if now.hour >= 17:
+        target_date += timedelta(days=1)
+
+    # Weekend handling: Fri 5pm+ through Mon 5pm uses Monday's MLM
+    weekday = target_date.weekday()
+    if weekday == 5:  # Saturday
+        target_date += timedelta(days=2)
+    elif weekday == 6:  # Sunday
+        target_date += timedelta(days=1)
+
+    # Tuesday is always Matt
+    if target_date.weekday() == 1:  # Tuesday
+        return 'Matt Speir'
+
+    start = target_date.replace(hour=0, minute=0, second=0, microsecond=0)
+    end = target_date.replace(hour=23, minute=59, second=59, microsecond=0)
+
+    events = service.events().list(
+        calendarId=CONFIG['CALENDAR_ID'],
+        timeMin=start.isoformat(),
+        timeMax=end.isoformat(),
+        singleEvents=True
+    ).execute().get('items', [])
+
+    for event in events:
+        title = event.get('summary', '')
+        match = re.search(r'MLM(?:\s+Rotating)?:\s*(\w+)', title, re.IGNORECASE)
+        if match:
+            cal_name = match.group(1)
+            return CONFIG['NAME_MAPPING'].get(cal_name, cal_name)
+
+    logger.warning(f"No MLM found for {target_date.date()}")
+    return None
+
+
+@retry(max_attempts=3, delay=2, exceptions=(anthropic.APIError,))
+def analyze_email_with_claude(subject, body, sender):
+    """
+    Use Claude to analyze an email in a single call.
+    Returns dict with: is_spam, category, draft_response
+    """
+    client = anthropic.Anthropic(api_key=CONFIG['CLAUDE_API_KEY'])
+    categories_list = ", ".join(MLQ_CATEGORIES)
+
+    prompt = f"""Analyze this email for the UCSC Genome Browser support team.
+
+From: {sender}
+Subject: {subject}
+Body:
+{body[:3000]}
+
+Provide your analysis in this exact format:
+
+SPAM: [YES or NO]
+CATEGORY: [Pick one from: {categories_list}]
+DRAFT_RESPONSE: [If not spam, write a helpful, professional response under 200 words. If spam, write "N/A"]
+
+Important:
+- Mark as SPAM if it is:
+  - Conference/journal solicitations asking for paper submissions
+  - Promotions for workshops, courses, training programs, or webinars
+  - Marketing or promotional emails advertising services or products
+  - Mass-sent announcements unrelated to genome browser support
+  - Contains sensitive personal medical information (specific names with genetic test results, medical conditions, family medical history, or personal health details) - these are privacy concerns
+- Mark as NOT SPAM if it is a genuine question about using the UCSC Genome Browser (general genetics questions without personal identifying info are OK)
+- For CATEGORY, pick the most specific match. Use "Other" if unsure.
+- For DRAFT_RESPONSE, be helpful and concise. Ask clarifying questions if needed. Point to relevant documentation when appropriate."""
+
+    response = client.messages.create(
+        model="claude-sonnet-4-20250514",
+        max_tokens=800,
+        messages=[{"role": "user", "content": prompt}]
+    )
+
+    result_text = response.content[0].text.strip()
+
+    # Parse the response
+    is_spam = False
+    category = "Other"
+    draft_response = None
+
+    for line in result_text.split('\n'):
+        line = line.strip()
+        if line.upper().startswith('SPAM:'):
+            is_spam = 'YES' in line.upper()
+        elif line.upper().startswith('CATEGORY:'):
+            cat = line.split(':', 1)[1].strip()
+            # Validate category
+            if cat in MLQ_CATEGORIES:
+                category = cat
+            else:
+                for c in MLQ_CATEGORIES:
+                    if c.lower() == cat.lower():
+                        category = c
+                        break
+        elif line.upper().startswith('DRAFT_RESPONSE:'):
+            draft_response = line.split(':', 1)[1].strip()
+            # Capture multi-line response
+            idx = result_text.find('DRAFT_RESPONSE:')
+            if idx != -1:
+                draft_response = result_text[idx + len('DRAFT_RESPONSE:'):].strip()
+                if draft_response.upper() == 'N/A':
+                    draft_response = None
+
+    return {
+        'is_spam': is_spam,
+        'category': category,
+        'draft_response': draft_response
+    }
+
+
+@retry(max_attempts=3, delay=2, exceptions=(anthropic.APIError,))
+def batch_check_spam_with_claude(messages):
+    """
+    Use Claude to determine spam status for multiple emails in one call.
+    Returns dict mapping message index to True (spam) or False (not spam).
+    """
+    if not messages:
+        return {}
+
+    client = anthropic.Anthropic(api_key=CONFIG['CLAUDE_API_KEY'])
+
+    # Build the prompt with all messages
+    emails_text = ""
+    for i, msg in enumerate(messages, 1):
+        emails_text += f"""
+--- EMAIL {i} ---
+From: {msg['original_from']}
+Subject: {msg['original_subject']}
+Body:
+{msg['original_body'][:1500]}
+"""
+
+    prompt = f"""Analyze these emails and determine if each is spam. This is for a genome browser technical support mailing list.
+
+{emails_text}
+
+Mark as SPAM if it is:
+- Conference/journal solicitations asking for paper submissions
+- Promotions for workshops, courses, training programs, or webinars
+- Marketing or promotional emails advertising services or products
+- Phishing or scam attempts
+- Mass-sent unsolicited emails unrelated to genome browser support
+- Announcements about events, courses, or programs (not questions about the browser)
+- Contains sensitive personal medical information (specific names with genetic test results, medical conditions, family medical history, or personal health details) - these are privacy concerns
+
+Mark as NOT SPAM if it is:
+- A genuine question about the UCSC Genome Browser
+- A technical support request
+- A follow-up to an existing conversation
+- Someone asking how to use browser features for their research
+- General questions about genetic data without personal identifying information
+
+Reply with one line per email in this exact format:
+EMAIL 1: SPAM or NOT SPAM
+EMAIL 2: SPAM or NOT SPAM
+(etc.)"""
+
+    response = client.messages.create(
+        model="claude-sonnet-4-20250514",
+        max_tokens=100,
+        messages=[{"role": "user", "content": prompt}]
+    )
+
+    result_text = response.content[0].text.strip().upper()
+    results = {}
+
+    for line in result_text.split('\n'):
+        line = line.strip()
+        match = re.match(r'EMAIL\s*(\d+):\s*(SPAM|NOT SPAM)', line)
+        if match:
+            idx = int(match.group(1)) - 1  # Convert to 0-based index
+            is_spam = match.group(2) == 'SPAM'
+            results[idx] = is_spam
+
+    # Default to not spam for any missing results
+    for i in range(len(messages)):
+        if i not in results:
+            logger.warning(f"No spam result for message {i+1}, defaulting to not spam")
+            results[i] = False
+
+    return results
+
+
+def parse_raw_email_headers(raw_email):
+    """Parse Subject, From, and body from raw email text.
+
+    Properly handles multipart MIME emails by extracting just the text/plain part.
+    """
+    # Parse using Python's email module for proper MIME handling
+    msg = email.message_from_string(raw_email, policy=policy.default)
+
+    # Get headers
+    subject = msg.get('subject', '')
+    # Priority: x-original-sender > reply-to > from
+    sender = (
+        msg.get('x-original-sender') or
+        msg.get('reply-to') or
+        msg.get('from', '')
+    )
+
+    # Extract body - prefer text/plain, fall back to text/html converted to text
+    body = ''
+    html_body = ''
+
+    if msg.is_multipart():
+        for part in msg.walk():
+            content_type = part.get_content_type()
+            if content_type == 'text/plain' and not body:
+                try:
+                    body = part.get_content()
+                except Exception:
+                    # Fallback: try to decode manually
+                    payload = part.get_payload(decode=True)
+                    if payload:
+                        charset = part.get_content_charset() or 'utf-8'
+                        body = payload.decode(charset, errors='ignore')
+            elif content_type == 'text/html' and not html_body:
+                try:
+                    html_body = part.get_content()
+                except Exception:
+                    payload = part.get_payload(decode=True)
+                    if payload:
+                        charset = part.get_content_charset() or 'utf-8'
+                        html_body = payload.decode(charset, errors='ignore')
+    else:
+        # Not multipart - check content type
+        content_type = msg.get_content_type()
+        try:
+            content = msg.get_content()
+        except Exception:
+            payload = msg.get_payload(decode=True)
+            if payload:
+                charset = msg.get_content_charset() or 'utf-8'
+                content = payload.decode(charset, errors='ignore')
+            else:
+                content = msg.get_payload()
+
+        if content_type == 'text/plain':
+            body = content
+        elif content_type == 'text/html':
+            html_body = content
+
+    # If no text/plain, convert HTML to text
+    if not body and html_body:
+        body = html_to_text(html_body)
+
+    return subject, sender, body
+
+
+def get_pending_moderation_emails(group_name):
+    """Get pending moderation notification emails for a group."""
+    creds = get_google_credentials()
+    service = build('gmail', 'v1', credentials=creds, cache_discovery=False)
+
+    # Search for pending moderation emails for this group (exclude trash)
+    query = f'subject:"{group_name} - soe.ucsc.edu admins: Message Pending" -in:trash'
+    results = service.users().messages().list(userId='me', q=query, maxResults=50).execute()
+
+    pending = []
+    for msg_ref in results.get('messages', []):
+        msg = service.users().messages().get(userId='me', id=msg_ref['id'], format='full').execute()
+        headers = {h['name']: h['value'] for h in msg['payload']['headers']}
+
+        # Extract the approval address from the From header
+        from_addr = headers.get('From', '')
+        approve_addr = None
+        if '+msgappr@' in from_addr:
+            match = re.search(r'<([^>]+\+msgappr@[^>]+)>', from_addr)
+            if match:
+                approve_addr = match.group(1)
+
+        # Extract the attached original message
+        original_subject = ''
+        original_from = ''
+        original_body = ''
+        original_attachments = []  # Will store attachment info for later extraction
+
+        for part in msg['payload'].get('parts', []):
+            if part['mimeType'] == 'message/rfc822':
+                # Check if original email is in an attachment (needs separate fetch)
+                if 'attachmentId' in part.get('body', {}):
+                    att_id = part['body']['attachmentId']
+                    att = service.users().messages().attachments().get(
+                        userId='me', messageId=msg_ref['id'], id=att_id
+                    ).execute()
+                    raw_email = base64.urlsafe_b64decode(att['data']).decode('utf-8', errors='ignore')
+                    original_subject, original_from, original_body = parse_raw_email_headers(raw_email)
+                    # Note: Attachments in raw RFC822 would need MIME parsing - skip for now
+                else:
+                    # Fallback: nested parts format (Gmail pre-parsed the RFC822)
+                    # Helper to recursively find content of specific MIME type
+                    def find_content_recursive(p, mime_type):
+                        if p.get('mimeType') == mime_type and p.get('body', {}).get('data'):
+                            return base64.urlsafe_b64decode(p['body']['data']).decode('utf-8', errors='ignore')
+                        for sp in p.get('parts', []):
+                            result = find_content_recursive(sp, mime_type)
+                            if result:
+                                return result
+                        return ''
+
+                    # Helper to recursively find email headers (Subject/From) in nested MIME structure
+                    def find_headers_recursive(p):
+                        if 'headers' in p:
+                            headers = {h['name']: h['value'] for h in p['headers']}
+                            # Only return if we found actual email headers (not just Content-Type)
+                            if 'Subject' in headers or 'From' in headers:
+                                return headers
+                        for sp in p.get('parts', []):
+                            result = find_headers_recursive(sp)
+                            if result:
+                                return result
+                        return {}
+
+                    nested_headers = find_headers_recursive(part)
+                    if nested_headers:
+                        original_subject = nested_headers.get('Subject', original_subject)
+                        original_from = (
+                            nested_headers.get('X-Original-Sender') or
+                            nested_headers.get('Reply-To') or
+                            nested_headers.get('From') or
+                            original_from
+                        )
+
+                    # Try text/plain first, fall back to HTML converted to text
+                    original_body = find_content_recursive(part, 'text/plain')
+                    if not original_body:
+                        html_body = find_content_recursive(part, 'text/html')
+                        if html_body:
+                            original_body = html_to_text(html_body)
+
+                    # Extract attachments from nested RFC822 structure
+                    original_attachments = extract_email_attachments(service, msg_ref['id'], part)
+
+        if approve_addr:
+            # Get the notification's Message-Id and Subject for proper reply threading
+            notification_message_id = headers.get('Message-Id') or headers.get('Message-ID', '')
+            notification_subject = headers.get('Subject', '')
+
+            pending.append({
+                'gmail_id': msg_ref['id'],
+                'approve_addr': approve_addr,
+                'original_subject': original_subject,
+                'original_from': original_from,
+                'original_body': original_body,
+                'original_attachments': original_attachments,
+                'notification_message_id': notification_message_id,
+                'notification_subject': notification_subject,
+            })
+
+    return pending
+
+
+def moderate_message(pending_msg, approve=True):
+    """Approve a pending message by sending email reply."""
+    if DRY_RUN:
+        logger.info(f"  [DRY RUN] Would approve: {pending_msg['original_subject'][:50]}")
+        return True
+
+    creds = get_google_credentials()
+    service = build('gmail', 'v1', credentials=creds, cache_discovery=False)
+
+    to_addr = pending_msg['approve_addr']
+
+    # Create a properly formatted reply with threading headers
+    message = MIMEText('')
+    message['To'] = to_addr
+    message['From'] = 'gbauto@ucsc.edu'
+
+    # Use Re: prefix on original subject to indicate reply
+    orig_subject = pending_msg.get('notification_subject', '')
+    if orig_subject and not orig_subject.startswith('Re:'):
+        message['Subject'] = f'Re: {orig_subject}'
+    else:
+        message['Subject'] = orig_subject or 'Re: Moderation'
+
+    # Add threading headers to make this a proper reply
+    msg_id = pending_msg.get('notification_message_id', '')
+    if msg_id:
+        message['In-Reply-To'] = msg_id
+        message['References'] = msg_id
+
+    encoded = base64.urlsafe_b64encode(message.as_bytes()).decode('utf-8')
+
+    try:
+        service.users().messages().send(
+            userId='me',
+            body={'raw': encoded}
+        ).execute()
+        return True
+    except Exception as e:
+        logger.error(f"  Error approving message: {e}")
+        return False
+
+
+def send_error_notification(subject, body):
+    """Send an error notification email to the QA team.
+
+    Used to alert the team when Redmine API operations fail after retries.
+    """
+    if DRY_RUN:
+        logger.info(f"  [DRY RUN] Would send error notification: {subject}")
+        return True
+
+    try:
+        creds = get_google_credentials()
+        service = build('gmail', 'v1', credentials=creds, cache_discovery=False)
+
+        message = MIMEText(body)
+        message['To'] = 'browserqa-group@ucsc.edu'
+        message['From'] = 'gbauto@ucsc.edu'
+        message['Subject'] = f'[MLQ Automation Error] {subject}'
+
+        encoded = base64.urlsafe_b64encode(message.as_bytes()).decode('utf-8')
+
+        service.users().messages().send(
+            userId='me',
+            body={'raw': encoded}
+        ).execute()
+        logger.info(f"Sent error notification email: {subject}")
+        return True
+    except Exception as e:
+        logger.error(f"Failed to send error notification email: {e}")
+        return False
+
+
+def delete_moderation_email(gmail_id):
+    """Delete/archive the moderation notification after processing."""
+    if DRY_RUN:
+        return
+
+    creds = get_google_credentials()
+    service = build('gmail', 'v1', credentials=creds, cache_discovery=False)
+    try:
+        service.users().messages().trash(userId='me', id=gmail_id).execute()
+    except Exception as e:
+        logger.error(f"  Error trashing notification: {e}")
+
+
+def get_emails_from_gmail(group_email, minutes_ago=60):
+    """Get recent emails sent to a mailing list."""
+    creds = get_google_credentials()
+    service = build('gmail', 'v1', credentials=creds, cache_discovery=False)
+
+    cutoff = datetime.now(PST) - timedelta(minutes=minutes_ago)
+    query = f"to:{group_email} after:{int(cutoff.timestamp())}"
+
+    results = service.users().messages().list(
+        userId='me', q=query, maxResults=50
+    ).execute()
+
+    messages = []
+    for msg_ref in results.get('messages', []):
+        msg = service.users().messages().get(
+            userId='me', id=msg_ref['id'], format='full'
+        ).execute()
+
+        headers = {h['name']: h['value'] for h in msg['payload']['headers']}
+        subject = headers.get('Subject', '(no subject)')
+
+        # Get original sender - Google Groups may rewrite From to the list address
+        # Priority: X-Original-Sender > Reply-To > From
+        from_addr = (
+            headers.get('X-Original-Sender') or
+            headers.get('Reply-To') or
+            headers.get('From', '')
+        )
+
+        # Skip moderation-related emails (notifications and our own replies)
+        if 'Message Pending' in subject:
+            continue
+        if 'Moderation response' in subject:
+            continue
+        if 'gbauto@ucsc.edu' in from_addr:
+            continue
+        if '+msgappr@' in from_addr or '+msgrej@' in from_addr:
+            continue
+
+        # Extract body - prefer text/plain, fall back to text/html converted to text
+        def extract_body_content(payload, target_type):
+            """Recursively find content of target_type in email payload."""
+            if payload.get('mimeType') == target_type:
+                data = payload.get('body', {}).get('data', '')
+                if data:
+                    return base64.urlsafe_b64decode(data).decode('utf-8', errors='ignore')
+            if 'parts' in payload:
+                for part in payload['parts']:
+                    result = extract_body_content(part, target_type)
+                    if result:
+                        return result
+            return ''
+
+        # Try text/plain first, fall back to HTML converted to text
+        body = extract_body_content(msg['payload'], 'text/plain')
+        if not body:
+            html_body = extract_body_content(msg['payload'], 'text/html')
+            if html_body:
+                body = html_to_text(html_body)
+
+        # Extract attachments
+        attachments = extract_email_attachments(service, msg['id'], msg['payload'])
+
+        messages.append({
+            'id': msg['id'],
+            'thread_id': msg['threadId'],
+            'subject': subject,
+            'from': from_addr,
+            'to': headers.get('To', ''),
+            'cc': headers.get('Cc', ''),
+            'date': headers.get('Date', ''),
+            'body': body,
+            'attachments': attachments,
+            'timestamp': int(msg['internalDate']) / 1000,
+        })
+
+    return messages
+
+
+def html_to_text(html):
+    """Convert HTML to plain text by stripping tags and decoding entities.
+
+    Used as a fallback when an email only has HTML content (no text/plain part).
+    """
+    if not html:
+        return ''
+
+    # Remove script and style tags with their content
+    text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
+    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
+
+    # Convert <br> and </p> to newlines
+    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
+    text = re.sub(r'</p>', '\n\n', text, flags=re.IGNORECASE)
+    text = re.sub(r'</div>', '\n', text, flags=re.IGNORECASE)
+    text = re.sub(r'</li>', '\n', text, flags=re.IGNORECASE)
+
+    # Remove all other HTML tags
+    text = re.sub(r'<[^>]+>', '', text)
+
+    # Decode HTML entities
+    text = html_module.unescape(text)
+
+    # Normalize whitespace (collapse multiple spaces/newlines)
+    text = re.sub(r'[ \t]+', ' ', text)  # Collapse horizontal whitespace
+    text = re.sub(r'\n[ \t]+', '\n', text)  # Remove leading spaces on lines
+    text = re.sub(r'[ \t]+\n', '\n', text)  # Remove trailing spaces on lines
+    text = re.sub(r'\n{3,}', '\n\n', text)  # Collapse multiple newlines
+
+    return text.strip()
+
+
+def sanitize_for_redmine(text):
+    """Clean up text for Redmine textile compatibility."""
+    # Remove emojis and other 4-byte UTF-8 characters that cause MySQL utf8 encoding issues
+    # MySQL utf8 is 3-byte max; utf8mb4 supports 4-byte but many Redmine installs use utf8
+    text = re.sub(r'[\U00010000-\U0010FFFF]', '', text)
+
+    # Remove Outlook duplicate URL format: URL<URL> -> URL
+    # Outlook often includes the URL twice: once as display text, once in angle brackets
+    text = re.sub(
+        r'(https?://[^\s<>\[\]]+)<https?://[^\s<>\[\]>]+>',
+        r'\1',
+        text
+    )
+
+    # Note: We do NOT auto-link URLs here. Redmine uses Textile format by default
+    # and has its own auto-linking for bare URLs. Adding Markdown-style [URL](URL)
+    # links causes display issues in Textile mode.
+
+    lines = text.split('\n')
+    cleaned = []
+
+    for line in lines:
+        # Remove leading whitespace that would trigger code blocks (4+ spaces or tabs)
+        stripped = line.lstrip(' \t')
+        leading = len(line) - len(stripped)
+
+        if leading >= 4:
+            # Reduce to 2 spaces to preserve some structure without triggering code block
+            line = '  ' + stripped
+        elif leading > 0 and line.startswith('\t'):
+            # Convert tabs to spaces
+            line = '  ' + stripped
+
+        # Escape # at start of line to prevent header formatting
+        if line.startswith('#'):
+            line = '\\' + line
+
+        # Escape --- or *** that would become horizontal rules
+        if re.match(r'^[-*_]{3,}\s*$', line):
+            line = '\\' + line
+
+        # Escape leading - or * followed by space that would become list items
+        # (only if it doesn't look intentional, i.e., not followed by more text structure)
+        if re.match(r'^[-*]\s+[a-z]', line, re.IGNORECASE):
+            # Looks like prose starting with dash, not a list - escape it
+            if not re.match(r'^[-*]\s+\S+\s*$', line):  # Single word items are likely lists
+                line = '\\' + line
+
+        cleaned.append(line)
+
+    return '\n'.join(cleaned)
+
+
+def strip_quoted_content(body):
+    """Remove quoted reply content and Google Groups footer from email body.
+
+    Handles three reply styles:
+    1. Top-posted: New content at top, quoted content below (most common)
+    2. Inline/interleaved: Short quotes with replies below each (less common)
+    3. Bottom-posted: Quoted content at top, new content below (e.g., some
+       Thunderbird/Linux mail clients)
+
+    For bottom-posted replies, when a quote header ("On ... wrote:") appears
+    within the first 3 lines, the quoted block is skipped and only the new
+    content that follows is kept.
+    """
+    lines = body.split('\n')
+    cleaned = []
+    i = 0
+
+    # Pattern to strip Unicode control characters (LTR/RTL marks, etc.)
+    # These can appear at the start or end of lines in emails with mixed-direction text
+    unicode_control_chars = '[\u200e\u200f\u202a-\u202e\u2066-\u2069]'
+    unicode_control_pattern = re.compile(f'^{unicode_control_chars}+|{unicode_control_chars}+$')
+
+    while i < len(lines):
+        line = lines[i]
+        # Strip Unicode control characters for pattern matching
+        line_stripped = unicode_control_pattern.sub('', line)
+        # Also strip leading quote markers ("> ") to catch quoted "On...wrote:" patterns
+        # e.g., "> On Jan 15, 2026, at 12:01 AM, Name wrote:"
+        line_unquoted = re.sub(r'^>\s*', '', line_stripped)
+
+        # Detect "On <date> <person> wrote:" quote headers
+        is_quote_header = False
+        quote_header_lines = 1  # How many lines this header spans
+
+        if re.match(r'^On .+wrote:\s*$', line_unquoted, re.IGNORECASE):
+            is_quote_header = True
+        elif re.match(r'^On .+, at .+wrote:\s*$', line_unquoted, re.IGNORECASE):
+            is_quote_header = True
+        elif re.match(r'^On .+@.+>\s*$', line_unquoted) or re.match(r'^On .*\d{4}.*[AP]M .+', line_unquoted):
+            if i + 1 < len(lines):
+                next_stripped = unicode_control_pattern.sub('', lines[i + 1])
+                next_unquoted = re.sub(r'^>\s*', '', next_stripped).strip().lower()
+                if next_unquoted.endswith('wrote:'):
+                    is_quote_header = True
+                    quote_header_lines = 2
+        elif line_unquoted.strip().lower().endswith('wrote:') and i > 0:
+            prev = lines[i - 1]
+            prev_stripped = unicode_control_pattern.sub('', prev)
+            prev_unquoted = re.sub(r'^>\s*', '', prev_stripped)
+            if re.match(r'^On .+', prev_unquoted, re.IGNORECASE):
+                if cleaned and cleaned[-1] == prev:
+                    cleaned.pop()
+                is_quote_header = True
+
+        if is_quote_header:
+            # Bottom-posted reply: quote header near the start means new content
+            # is below the quoted block. Skip the quoted block, keep what follows.
+            # Allow at most 1 non-blank line above (e.g., a greeting like "Hi,")
+            non_blank_above = sum(1 for l in cleaned if l.strip())
+            if non_blank_above <= 1:
+                # Skip past the quote header
+                i += quote_header_lines
+                # Skip the quoted lines (starting with ">")
+                while i < len(lines):
+                    l = lines[i]
+                    l_stripped = unicode_control_pattern.sub('', l).strip()
+                    if l_stripped.startswith('>') or l_stripped == '':
+                        i += 1
+                    else:
+                        break
+                # Reset cleaned — anything before the quote header was blank/trivial
+                cleaned = []
+                continue
+            else:
+                # Top-posted reply: we already have real content above, stop here
+                break
+
+        if re.match(r'^-{4,}\s*Original Message\s*-{4,}', line_stripped, re.IGNORECASE):
+            break
+        # Gmail forwarded message format: "---------- Forwarded message ---------"
+        if re.match(r'^-{4,}\s*Forwarded message\s*-{4,}', line_stripped, re.IGNORECASE):
+            break
+        # Outlook single-line forward format
+        if re.match(r'^From:.*Sent:.*To:', line_stripped, re.IGNORECASE):
+            break
+        # Outlook multi-line forward format:
+        # From: Name
+        # Sent: Date
+        # To: Recipients
+        if re.match(r'^From:\s*.+', line_stripped, re.IGNORECASE):
+            # Look ahead for Sent: and To: on subsequent lines
+            if i + 2 < len(lines):
+                next1 = unicode_control_pattern.sub('', lines[i + 1])
+                next2 = unicode_control_pattern.sub('', lines[i + 2])
+                if re.match(r'^Sent:\s*.+', next1, re.IGNORECASE) and re.match(r'^To:\s*.+', next2, re.IGNORECASE):
+                    break
+        # Stop at Google Groups footer
+        if 'You received this message because you are subscribed to the Google Groups' in line:
+            break
+        # Skip Google Groups unsubscribe line
+        if 'To unsubscribe from this group and stop receiving emails from it' in line:
+            i += 1
+            continue
+        if line.strip() == '---' and len(cleaned) > 0:
+            i += 1
+            continue
+
+        # Handle quoted lines (starting with >)
+        # Keep quoted lines for inline reply context - they provide important context
+        # The "On ... wrote:" and other patterns above will stop at the full original message
+        cleaned.append(line)
+        i += 1
+
+    # Remove trailing dashes and whitespace
+    while cleaned and cleaned[-1].strip() in ('', '---', '--'):
+        cleaned.pop()
+
+    return '\n'.join(cleaned).rstrip()
+
+
+# Attachment handling constants
+SIGNATURE_ATTACHMENT_PATTERNS = [
+    r'^logo',
+    r'^signature',
+    r'^banner',
+    r'^icon',
+    r'^footer',
+    r'^divider',
+    # Note: Removed image\d* pattern - Outlook uses this for legitimate inline images.
+    # Size filter (MIN_ATTACHMENT_SIZE) handles small signature icons instead.
+]
+MIN_ATTACHMENT_SIZE = 1024  # Skip attachments smaller than 1KB (likely icons)
+MAX_ATTACHMENT_SIZE = 10 * 1024 * 1024  # 10MB max per attachment
+
+
+def is_signature_attachment(filename, size_bytes):
+    """Check if an attachment is likely a signature/icon that should be skipped."""
+    if not filename:
+        return True
+
+    # Skip very small images (likely icons/spacers)
+    if size_bytes < MIN_ATTACHMENT_SIZE:
+        return True
+
+    # Check filename against signature patterns
+    name_lower = filename.lower().rsplit('.', 1)[0]  # Remove extension
+    for pattern in SIGNATURE_ATTACHMENT_PATTERNS:
+        if re.match(pattern, name_lower, re.IGNORECASE):
+            return True
+
+    return False
+
+
+def extract_email_attachments(gmail_service, message_id, payload):
+    """
+    Extract all attachments from an email.
+    Returns list of dicts with: filename, mimeType, data, size
+    Filters out signature images and oversized files.
+    """
+    attachments = []
+
+    def find_attachments_recursive(part):
+        mime_type = part.get('mimeType', '')
+        filename = part.get('filename', '')
+        body = part.get('body', {})
+        attachment_id = body.get('attachmentId')
+        size = body.get('size', 0)
+
+        # Check if this part is an attachment (has attachmentId or filename with size)
+        if attachment_id and filename:
+            # Skip signature/icon attachments
+            if is_signature_attachment(filename, size):
+                logger.debug(f"Skipping signature attachment: {filename} ({size} bytes)")
+            elif size > MAX_ATTACHMENT_SIZE:
+                logger.warning(f"Skipping oversized attachment: {filename} ({size} bytes)")
+            else:
+                # Download the attachment
+                try:
+                    att_data = gmail_service.users().messages().attachments().get(
+                        userId='me', messageId=message_id, id=attachment_id
+                    ).execute()
+                    file_data = base64.urlsafe_b64decode(att_data['data'])
+                    attachments.append({
+                        'filename': filename,
+                        'mimeType': mime_type,
+                        'data': file_data,
+                        'size': len(file_data)
+                    })
+                except Exception as e:
+                    logger.warning(f"Failed to download attachment {filename}: {e}")
+
+        # Recurse into nested parts
+        for subpart in part.get('parts', []):
+            find_attachments_recursive(subpart)
+
+    find_attachments_recursive(payload)
+    return attachments
+
+
+def upload_attachments_to_redmine(attachments):
+    """
+    Upload attachments to Redmine and return tokens.
+    Returns list of dicts with: filename, token, content_type
+    """
+    if not attachments:
+        return []
+
+    uploaded = []
+    for att in attachments:
+        try:
+            upload_url = f"{CONFIG['REDMINE_URL']}/uploads.json?filename={att['filename']}"
+            headers = {
+                'X-Redmine-API-Key': CONFIG['REDMINE_API_KEY'],
+                'Content-Type': 'application/octet-stream'
+            }
+
+            resp = requests.post(upload_url, data=att['data'], headers=headers, timeout=60)
+
+            if resp.status_code == 201:
+                token = resp.json()['upload']['token']
+                uploaded.append({
+                    'filename': att['filename'],
+                    'token': token,
+                    'content_type': att['mimeType']
+                })
+                logger.debug(f"Uploaded attachment: {att['filename']}")
+            elif resp.status_code == 422:
+                logger.warning(f"Attachment too large for Redmine: {att['filename']}")
+            else:
+                logger.warning(f"Failed to upload {att['filename']}: {resp.status_code}")
+        except Exception as e:
+            logger.warning(f"Error uploading attachment {att['filename']}: {e}")
+
+    return uploaded
+
+
+def replace_inline_images(body, uploaded_attachments):
+    """
+    Replace inline image placeholders with Redmine inline image syntax.
+    Handles:
+    - Gmail format: [image: filename.png]
+    - Outlook CID format: [cid:filename.png@identifier]
+    """
+    if not uploaded_attachments:
+        return body
+
+    # Build a map of filenames (case-insensitive)
+    filename_map = {att['filename'].lower(): att['filename'] for att in uploaded_attachments}
+
+    def replace_gmail_placeholder(match):
+        """Handle Gmail [image: filename] format."""
+        placeholder_name = match.group(1).strip()
+        lookup = placeholder_name.lower()
+        if lookup in filename_map:
+            actual_filename = filename_map[lookup]
+            return f'!{actual_filename}!'
+        return match.group(0)
+
+    def replace_cid_placeholder(match):
+        """Handle Outlook [cid:filename@identifier] format."""
+        cid_content = match.group(1)
+        # Extract filename from cid:filename@identifier or cid:filename
+        if '@' in cid_content:
+            filename_part = cid_content.split('@')[0]
+        else:
+            filename_part = cid_content
+
+        lookup = filename_part.lower()
+        if lookup in filename_map:
+            actual_filename = filename_map[lookup]
+            return f'!{actual_filename}!'
+        return match.group(0)
+
+    # Replace Gmail [image: filename.png] patterns
+    body = re.sub(r'\[image:\s*([^\]]+)\]', replace_gmail_placeholder, body, flags=re.IGNORECASE)
+
+    # Replace Outlook [cid:filename.png@identifier] patterns
+    body = re.sub(r'\[cid:([^\]]+)\]', replace_cid_placeholder, body, flags=re.IGNORECASE)
+
+    return body
+
+
+def extract_email_address(from_header):
+    """Extract just the email address from a From header."""
+    _, email = parseaddr(from_header)
+    return email.lower()
+
+
+def extract_all_email_addresses(header_value):
+    """Extract all email addresses from a header (To, CC can have multiple)."""
+    if not header_value:
+        return []
+    # Handle multiple addresses separated by commas
+    # parseaddr only handles single address, so we split first
+    addresses = []
+    for part in header_value.split(','):
+        _, email = parseaddr(part.strip())
+        if email:
+            addresses.append(email.lower())
+    return addresses
+
+
+def extract_hgusersuggestion_email(subject):
+    """Extract user email from hgUserSuggestion form subjects.
+
+    hgUserSuggestion form submissions have subjects like:
+    'hgUserSuggestion mmcgary44@gmail.com 2026-01-28 05:59:21'
+
+    Returns the extracted email, or None if not an hgUserSuggestion subject.
+    """
+    if not subject:
+        return None
+
+    # Check if this is an hgUserSuggestion subject
+    if 'hgusersuggestion' not in subject.lower():
+        return None
+
+    # Extract email using pattern: hgUserSuggestion <email> <date> <time>
+    # Email pattern should match common email formats
+    match = re.search(r'hgusersuggestion\s+([^\s]+@[^\s]+)\s+\d{4}-\d{2}-\d{2}', subject, re.IGNORECASE)
+    if match:
+        return match.group(1).lower()
+
+    return None
+
+
+def normalize_subject(subject):
+    """Normalize subject for matching by removing common prefixes and list tags.
+
+    Removes these patterns from ANYWHERE in the subject (not just the start)
+    to handle cases like 'TICKET-123 Re: [genome] Original subject'.
+    """
+    if not subject:
+        return ''
+
+    s = subject.strip()
+
+    # Remove mailing list tags from anywhere
+    # Generate patterns from CONFIG to stay in sync with configured lists
+    for addr in CONFIG['MODERATED_LISTS'] + CONFIG['UNMODERATED_LISTS']:
+        list_name = addr.split('@')[0]
+        # Escape the brackets for regex and remove case-insensitively
+        pattern = re.escape(f'[{list_name}]')
+        s = re.sub(pattern, '', s, flags=re.IGNORECASE)
+
+    # Remove [External] tags added by email gateways
+    s = re.sub(r'\[external\]', '', s, flags=re.IGNORECASE)
+    s = re.sub(r'\bexternal:\s*', '', s, flags=re.IGNORECASE)
+
+    # Remove reply/forward prefixes from anywhere
+    # Use word boundary \b to avoid matching inside words (e.g., "Re-install")
+    reply_forward_patterns = [
+        r'\bre:\s*',      # Re: RE:
+        r'\bfwd?:\s*',    # Fwd: FW: Fw:
+        r'\baw:\s*',      # AW: (German "Antwort")
+    ]
+    for pattern in reply_forward_patterns:
+        s = re.sub(pattern, '', s, flags=re.IGNORECASE)
+
+    # Strip leading punctuation and whitespace (e.g., ": Subject" -> "Subject")
+    s = re.sub(r'^[\s:,\-]+', '', s)
+
+    # Collapse multiple whitespace and strip
+    s = ' '.join(s.split())
+
+    return s
+
+
+@retry(max_attempts=3, delay=2, exceptions=(requests.RequestException,))
+def find_existing_ticket(subject, thread_emails):
+    """Find an existing Redmine ticket by subject and email match.
+
+    Requires both:
+    1. Normalized subject match
+    2. At least one email from thread_emails matches the ticket's Email field
+
+    Exception: if any thread participant is a @ucsc.edu address (staff), the
+    email match is skipped and subject match alone is sufficient. Staff replies
+    to mailing list threads typically go to the list, not the original sender,
+    so the original sender's email won't appear in To/CC.
+
+    thread_emails should include all participants (From, To, CC) to handle
+    replies where the original sender appears in To/CC fields.
+    """
+    normalized = normalize_subject(subject)
+
+    url = f"{CONFIG['REDMINE_URL']}/issues.json"
+    params = {
+        'project_id': CONFIG['REDMINE_PROJECT'],
+        'subject': f"~{normalized}",
+        'status_id': '*',
+        'limit': 100,
+    }
+    headers = {'X-Redmine-API-Key': CONFIG['REDMINE_API_KEY']}
+
+    resp = requests.get(url, params=params, headers=headers, timeout=30)
+    resp.raise_for_status()
+    data = resp.json()
+
+    email_list = [e.lower() for e in thread_emails]
+    has_staff_participant = any(e.endswith('@ucsc.edu') for e in email_list)
+
+    for issue in data.get('issues', []):
+        if normalize_subject(issue['subject']).lower() != normalized.lower():
+            continue
+
+        # Staff replies to mailing list threads don't need email match —
+        # subject match is sufficient since staff wouldn't start a new
+        # unrelated thread with the same subject
+        if has_staff_participant:
+            return issue['id']
+
+        # For external senders, require email match to avoid false positives
+        # on generic subjects
+        email_field = next(
+            (f for f in issue.get('custom_fields', [])
+             if f['id'] == CONFIG['CUSTOM_FIELDS']['Email']),
+            None
+        )
+        if email_field and email_field.get('value'):
+            ticket_emails = [e.strip().lower() for e in email_field['value'].split(',')]
+            if any(te in ee or ee in te for te in email_list for ee in ticket_emails):
+                return issue['id']
+
+    return None
+
+
+def normalize_for_comparison(text):
+    """Normalize text for duplicate detection by removing formatting."""
+    # Remove Redmine/markdown formatting characters
+    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)  # Quote markers
+    text = re.sub(r'!\S+!', '', text)  # Redmine inline images: !filename.png!
+    text = re.sub(r'[*_`~]', '', text)  # Bold, italic, code markers (keep ! for punctuation)
+    text = re.sub(r'---\s*(New Email Update|AI Suggested Response).*?---', '', text, flags=re.DOTALL)
+    text = re.sub(r'^From:.*$', '', text, flags=re.MULTILINE)  # Remove From: lines
+    # Collapse whitespace and lowercase
+    return ' '.join(text.split()).lower()
+
+
+def text_similarity(text1, text2):
+    """Calculate similarity ratio between two texts using SequenceMatcher."""
+    return SequenceMatcher(None, text1, text2).ratio()
+
+
+def text_containment(text1, text2, threshold=0.85):
+    """Check if the shorter text is substantially contained within the longer text.
+
+    This handles cases where a draft reply is posted to Redmine, then sent via email
+    with additional content (greeting, signature, etc.). The draft would be contained
+    within the email even though overall similarity is low.
+
+    Returns True if at least `threshold` (default 85%) of the shorter text is found
+    as a contiguous match within the longer text.
+    """
+    if not text1 or not text2:
+        return False
+
+    shorter, longer = (text1, text2) if len(text1) <= len(text2) else (text2, text1)
+
+    if len(shorter) < 20:
+        return False
+
+    # Find the longest contiguous match
+    matcher = SequenceMatcher(None, shorter, longer)
+    match = matcher.find_longest_match(0, len(shorter), 0, len(longer))
+
+    # Calculate what percentage of the shorter text is contained in the longer
+    containment_ratio = match.size / len(shorter)
+    return containment_ratio >= threshold
+
+
+def content_exists_in_ticket(ticket, email_body, similarity_threshold=0.80, containment_threshold=0.85):
+    """Check if email content already exists in ticket description or journals.
+
+    Uses two methods to detect duplicates:
+    1. Overall similarity (default 85% threshold) - catches near-identical content
+    2. Containment check (default 85% threshold) - catches cases where a draft
+       reply was posted to Redmine, then sent via email with additional content
+       (greeting, signature, etc.)
+
+    This prevents the script from reopening tickets when staff post their draft
+    reply to Redmine and then send it via email.
+    """
+    stripped = strip_quoted_content(email_body).strip()
+    if len(stripped) < 20:
+        # Very short content, skip duplicate check
+        return False
+
+    email_normalized = normalize_for_comparison(stripped)
+
+    # Quick check: if email content is very short after normalization, skip
+    if len(email_normalized) < 20:
+        return False
+
+    def is_duplicate(text1, text2, context_name):
+        """Check if two texts are duplicates using similarity or containment."""
+        # Check overall similarity
+        similarity = text_similarity(text1, text2)
+        if similarity >= similarity_threshold:
+            logger.debug(f"Found similar content in {context_name} ({similarity*100:.0f}% similarity)")
+            return True
+
+        # Check containment (shorter text contained in longer)
+        if text_containment(text1, text2, containment_threshold):
+            logger.debug(f"Found contained content in {context_name}")
+            return True
+
+        return False
+
+    # Check ticket description
+    desc = ticket.get('description', '')
+    if desc:
+        desc_normalized = normalize_for_comparison(desc)
+        if desc_normalized and is_duplicate(email_normalized, desc_normalized, "ticket description"):
+            return True
+
+    # Check all journal notes
+    for journal in ticket.get('journals', []):
+        notes = journal.get('notes', '')
+        if notes:
+            notes_normalized = normalize_for_comparison(notes)
+            if notes_normalized and is_duplicate(email_normalized, notes_normalized, f"journal #{journal.get('id')}"):
+                return True
+
+    logger.debug("No similar content found in ticket")
+    return False
+
+
+@retry(max_attempts=3, delay=2, exceptions=(requests.RequestException,))
+def get_ticket_journals(ticket_id):
+    """Get journal (comment) history for a ticket."""
+    url = f"{CONFIG['REDMINE_URL']}/issues/{ticket_id}.json?include=journals"
+    headers = {'X-Redmine-API-Key': CONFIG['REDMINE_API_KEY']}
+
+    resp = requests.get(url, headers=headers, timeout=30)
+    resp.raise_for_status()
+    return resp.json().get('issue', {})
+
+
+def create_ticket(subject, body, sender_emails, mlm_name, category='Other', attachments=None):
+    """Create a new Redmine ticket with optional attachments.
+
+    Includes retry logic for transient server errors (5xx) and network issues.
+    Sends email notification to QA team if all retries fail.
+    """
+    if DRY_RUN:
+        att_info = f" with {len(attachments)} attachment(s)" if attachments else ""
+        logger.info(f"  [DRY RUN] Would create ticket: {subject[:50]}{att_info}")
+        logger.info(f"    Category: {category}, MLM: {mlm_name}")
+        return None
+
+    url = f"{CONFIG['REDMINE_URL']}/issues.json"
+    headers = {
+        'X-Redmine-API-Key': CONFIG['REDMINE_API_KEY'],
+        'Content-Type': 'application/json',
+    }
+
+    # Strip emojis from subject (body should already be sanitized via sanitize_for_redmine)
+    clean_subject = re.sub(r'[\U00010000-\U0010FFFF]', '', subject)
+
+    data = {
+        'issue': {
+            'project_id': CONFIG['REDMINE_PROJECT'],
+            'subject': clean_subject,
+            'description': f"From: {sender_emails[0]}\n\n{body}",
+            'tracker_id': CONFIG['TRACKER_ID'],
+            'priority_id': CONFIG['PRIORITY_ID'],
+            'status_id': CONFIG['STATUS_ID'],
+            'custom_fields': [
+                {'id': CONFIG['CUSTOM_FIELDS']['MLQ Category - primary'], 'value': category},
+                {'id': CONFIG['CUSTOM_FIELDS']['Email'], 'value': ', '.join(sender_emails)},
+                {'id': CONFIG['CUSTOM_FIELDS']['MLM'], 'value': mlm_name},
+            ],
+        }
+    }
+
+    # Add attachments if provided (from upload_attachments_to_redmine)
+    if attachments:
+        data['issue']['uploads'] = [
+            {'token': att['token'], 'filename': att['filename'], 'content_type': att['content_type']}
+            for att in attachments
+        ]
+
+    # Retry logic for transient errors
+    max_attempts = 3
+    retry_delay = 5  # seconds
+    last_error = None
+
+    for attempt in range(1, max_attempts + 1):
+        try:
+            resp = requests.post(url, json=data, headers=headers, timeout=30)
+
+            if resp.status_code == 201:
+                ticket_id = resp.json()['issue']['id']
+                att_info = f" with {len(attachments)} attachment(s)" if attachments else ""
+                logger.info(f"Created ticket #{ticket_id}: {subject[:50]}{att_info}")
+                return ticket_id
+
+            # Log detailed error info
+            logger.error(f"Redmine API error creating ticket (attempt {attempt}/{max_attempts}): "
+                        f"HTTP {resp.status_code} - {resp.text[:500]}")
+
+            # Don't retry client errors (4xx) - something wrong with our request
+            if 400 <= resp.status_code < 500:
+                last_error = f"HTTP {resp.status_code}: {resp.text[:500]}"
+                break
+
+            # Server error (5xx) - retry after delay
+            if resp.status_code >= 500:
+                last_error = f"HTTP {resp.status_code}: {resp.text[:500]}"
+                if attempt < max_attempts:
+                    logger.info(f"  Retrying in {retry_delay} seconds...")
+                    time.sleep(retry_delay)
+                    retry_delay *= 2  # Exponential backoff
+                continue
+
+        except requests.RequestException as e:
+            logger.error(f"Network error creating ticket (attempt {attempt}/{max_attempts}): {e}")
+            last_error = f"Network error: {e}"
+            if attempt < max_attempts:
+                logger.info(f"  Retrying in {retry_delay} seconds...")
+                time.sleep(retry_delay)
+                retry_delay *= 2
+                continue
+
+    # All retries failed - send notification email
+    error_body = f"""The MLQ automation script failed to create a Redmine ticket after {max_attempts} attempts.
+
+Subject: {subject}
+From: {', '.join(sender_emails)}
+Category: {category}
+MLM: {mlm_name}
+
+Error: {last_error}
+
+This email needs to be manually processed or the issue investigated.
+
+Timestamp: {datetime.now(PST).strftime('%Y-%m-%d %H:%M:%S %Z')}
+"""
+    send_error_notification(f"Failed to create ticket: {subject[:50]}", error_body)
+    return None
+
+
+def update_ticket(ticket_id, comment, reopen=False, new_mlm=None, attachments=None):
+    """Add a comment to an existing ticket, optionally reopening it and attaching files.
+
+    Includes retry logic for transient server errors (5xx) and network issues.
+    Sends email notification to QA team if all retries fail.
+    """
+    if DRY_RUN:
+        action = "reopen and update" if reopen else "update"
+        att_info = f" with {len(attachments)} attachment(s)" if attachments else ""
+        logger.info(f"  [DRY RUN] Would {action} ticket #{ticket_id}{att_info}")
+        return True
+
+    url = f"{CONFIG['REDMINE_URL']}/issues/{ticket_id}.json"
+    headers = {
+        'X-Redmine-API-Key': CONFIG['REDMINE_API_KEY'],
+        'Content-Type': 'application/json',
+    }
+
+    # Strip emojis from comment (may not have gone through sanitize_for_redmine)
+    clean_comment = re.sub(r'[\U00010000-\U0010FFFF]', '', comment)
+
+    data = {'issue': {'notes': clean_comment}}
+    if reopen:
+        data['issue']['status_id'] = CONFIG['STATUS_ID']
+        data['issue']['assigned_to_id'] = ''  # Clear assignee
+        if new_mlm:
+            data['issue']['custom_fields'] = [
+                {'id': CONFIG['CUSTOM_FIELDS']['MLM'], 'value': new_mlm}
+            ]
+
+    # Add attachments if provided (from upload_attachments_to_redmine)
+    if attachments:
+        data['issue']['uploads'] = [
+            {'token': att['token'], 'filename': att['filename'], 'content_type': att['content_type']}
+            for att in attachments
+        ]
+
+    # Retry logic for transient errors
+    max_attempts = 3
+    retry_delay = 5  # seconds
+    last_error = None
+
+    for attempt in range(1, max_attempts + 1):
+        try:
+            resp = requests.put(url, json=data, headers=headers, timeout=30)
+
+            if resp.status_code in (200, 204):
+                action = "Reopened and updated" if reopen else "Updated"
+                att_info = f" with {len(attachments)} attachment(s)" if attachments else ""
+                logger.info(f"{action} ticket #{ticket_id}{att_info}")
+                return True
+
+            # Log detailed error info
+            logger.error(f"Redmine API error updating ticket #{ticket_id} (attempt {attempt}/{max_attempts}): "
+                        f"HTTP {resp.status_code} - {resp.text[:500]}")
+
+            # Don't retry client errors (4xx) - something wrong with our request
+            if 400 <= resp.status_code < 500:
+                last_error = f"HTTP {resp.status_code}: {resp.text[:500]}"
+                break
+
+            # Server error (5xx) - retry after delay
+            if resp.status_code >= 500:
+                last_error = f"HTTP {resp.status_code}: {resp.text[:500]}"
+                if attempt < max_attempts:
+                    logger.info(f"  Retrying in {retry_delay} seconds...")
+                    time.sleep(retry_delay)
+                    retry_delay *= 2  # Exponential backoff
+                continue
+
+        except requests.RequestException as e:
+            logger.error(f"Network error updating ticket #{ticket_id} (attempt {attempt}/{max_attempts}): {e}")
+            last_error = f"Network error: {e}"
+            if attempt < max_attempts:
+                logger.info(f"  Retrying in {retry_delay} seconds...")
+                time.sleep(retry_delay)
+                retry_delay *= 2
+                continue
+
+    # All retries failed - send notification email
+    comment_preview = comment[:200] + '...' if len(comment) > 200 else comment
+    error_body = f"""The MLQ automation script failed to update Redmine ticket #{ticket_id} after {max_attempts} attempts.
+
+Ticket: #{ticket_id}
+Action: {'Reopen and update' if reopen else 'Update'}
+Comment preview: {comment_preview}
+
+Error: {last_error}
+
+This update needs to be manually applied or the issue investigated.
+
+Timestamp: {datetime.now(PST).strftime('%Y-%m-%d %H:%M:%S %Z')}
+"""
+    send_error_notification(f"Failed to update ticket #{ticket_id}", error_body)
+    return False
+
+
+def create_tickets_for_approved(approved_messages):
+    """Create or update Redmine tickets for approved moderation messages."""
+    if not approved_messages:
+        return
+
+    mlm_name = get_current_mlm()
+    if not mlm_name:
+        logger.error("Cannot determine MLM, skipping ticket creation for approved messages")
+        return
+
+    mlm_user_id = CONFIG['USER_IDS'].get(mlm_name)
+    if not mlm_user_id:
+        logger.error(f"No Redmine user ID for MLM: {mlm_name}")
+        return
+
+    logger.info(f"Creating tickets for {len(approved_messages)} approved message(s), MLM: {mlm_name}")
+
+    for msg in approved_messages:
+        subject = msg['original_subject']
+        sender = msg['original_from']
+        body = msg['original_body']
+        group_email = msg['group_email']
+        attachments = msg.get('original_attachments', [])
+
+        # Add list prefix to subject if not present (for consistency with delivered emails)
+        group_name = group_email.split('@')[0]
+        prefix = f"[{group_name}]"
+        if not subject.lower().startswith(prefix.lower()):
+            subject = f"{prefix} {subject}"
+
+        # Extract sender email address
+        sender_email = extract_email_address(sender)
+        sender_emails = [sender_email]
+
+        # For hgUserSuggestion forms, the actual user's email is in the subject
+        # Add it to sender_emails so ticket matching works when team replies to user
+        user_email = extract_hgusersuggestion_email(msg['original_subject'])
+        if user_email and user_email not in sender_emails:
+            sender_emails.append(user_email)
+            logger.info(f"  hgUserSuggestion: added user email {user_email} to ticket")
+
+        # Upload attachments to Redmine first (we need filenames for inline replacement)
+        uploaded_attachments = []
+        if attachments:
+            logger.info(f"  Uploading {len(attachments)} attachment(s)")
+            uploaded_attachments = upload_attachments_to_redmine(attachments)
+
+        # Process body: replace inline image placeholders, strip quotes, sanitize
+        # Order matters: replace images BEFORE stripping to keep them in context
+        processed_body = body
+        if uploaded_attachments:
+            processed_body = replace_inline_images(processed_body, uploaded_attachments)
+        processed_body = sanitize_for_redmine(strip_quoted_content(processed_body))
+
+        # Check for existing ticket (handles follow-up messages in threads)
+        existing_ticket = find_existing_ticket(subject, sender_emails)
+
+        if existing_ticket:
+            # Update existing ticket
+            logger.info(f"  Found existing ticket #{existing_ticket} for: {subject[:50]}")
+            ticket = get_ticket_journals(existing_ticket)
+
+            # Check if content already exists to avoid duplicates
+            if content_exists_in_ticket(ticket, body):
+                logger.info(f"  Skipping duplicate content for ticket #{existing_ticket}")
+                continue
+
+            # Skip empty updates (e.g., email was entirely quoted content)
+            if not processed_body.strip() and not uploaded_attachments:
+                logger.info(f"  Skipping empty update for ticket #{existing_ticket}")
+                continue
+
+            ticket_status = ticket.get('status', {}).get('name', '').lower()
+            is_closed = 'closed' in ticket_status or 'resolved' in ticket_status
+
+            comment = f"--- New Email Update ---\n\nFrom: {sender}\n\n{processed_body}"
+            update_ticket(existing_ticket, comment, reopen=is_closed,
+                         new_mlm=mlm_name if is_closed else None,
+                         attachments=uploaded_attachments)
+        else:
+            # Analyze with Claude for category and draft response
+            analysis = analyze_email_with_claude(subject, body, sender)
+
+            logger.info(f"  Category: {analysis['category']}")
+
+            # Create new ticket with attachments
+            ticket_id = create_ticket(
+                subject,
+                processed_body,
+                sender_emails,
+                mlm_name,
+                category=analysis['category'],
+                attachments=uploaded_attachments
+            )
+
+            if ticket_id and analysis['draft_response']:
+                draft_note = f"--- AI Suggested Response (Draft) ---\n\n{analysis['draft_response']}"
+                update_ticket(ticket_id, draft_note)
+
+
+def process_moderated_lists():
+    """Process pending messages in moderated mailing lists."""
+    # Collect all pending messages from all moderated lists
+    all_pending = []
+    for group_email in CONFIG['MODERATED_LISTS']:
+        group_name = group_email.split('@')[0]
+        logger.info(f"Checking pending messages for {group_email}")
+
+        pending = get_pending_moderation_emails(group_name)
+        logger.info(f"  Found {len(pending)} pending message(s)")
+
+        for msg in pending:
+            msg['group_email'] = group_email
+            all_pending.append(msg)
+
+    if not all_pending:
+        return
+
+    # Batch spam check all pending messages in one API call
+    logger.info(f"Batch checking {len(all_pending)} message(s) for spam")
+    spam_results = batch_check_spam_with_claude(all_pending)
+
+    # Process results and collect approved messages
+    approved_messages = []
+    for i, msg in enumerate(all_pending):
+        is_spam = spam_results.get(i, False)
+
+        if is_spam:
+            logger.info(f"  SPAM detected (not approving): {msg['original_subject'][:50]}")
+            delete_moderation_email(msg['gmail_id'])
+        else:
+            logger.info(f"  Approving: {msg['original_subject'][:50]}")
+            if moderate_message(msg, approve=True):
+                delete_moderation_email(msg['gmail_id'])
+                approved_messages.append(msg)
+
+    # Create/update tickets for approved messages immediately
+    create_tickets_for_approved(approved_messages)
+
+
+def process_emails():
+    """Process emails and create/update Redmine tickets."""
+    mlm_name = get_current_mlm()
+    if not mlm_name:
+        logger.error("Cannot determine MLM, skipping email processing")
+        return
+
+    mlm_user_id = CONFIG['USER_IDS'].get(mlm_name)
+    if not mlm_user_id:
+        logger.error(f"No Redmine user ID for MLM: {mlm_name}")
+        return
+
+    logger.info(f"Current MLM: {mlm_name} (ID: {mlm_user_id})")
+
+    all_lists = CONFIG['MODERATED_LISTS'] + CONFIG['UNMODERATED_LISTS']
+
+    # Group emails by thread
+    threads = {}
+    for group_email in all_lists:
+        emails = get_emails_from_gmail(group_email)
+        for email in emails:
+            thread_id = email['thread_id']
+            if thread_id not in threads:
+                threads[thread_id] = {
+                    'subject': email['subject'],
+                    'emails': [],
+                    'group': group_email,
+                }
+            threads[thread_id]['emails'].append(email)
+
+    for thread_id, thread in threads.items():
+        thread['emails'].sort(key=lambda x: x['timestamp'])
+        first_email = thread['emails'][0]
+
+        # Sender emails - just the From addresses (for ticket creation)
+        sender_emails = list(set(extract_email_address(e['from']) for e in thread['emails']))
+
+        # All participant emails (From, To, CC) for matching existing tickets
+        # This handles replies where original sender appears in To/CC
+        # Note: We include mailing list addresses because some tickets (like hgUserSuggestion
+        # form submissions) have the mailing list as their Email field
+        thread_participants = set()
+        for e in thread['emails']:
+            thread_participants.add(extract_email_address(e['from']))
+            thread_participants.update(extract_all_email_addresses(e.get('to', '')))
+            thread_participants.update(extract_all_email_addresses(e.get('cc', '')))
+
+        # Remove empty strings but keep mailing list addresses for matching
+        thread_participants = [e for e in thread_participants if e]
+
+        existing_ticket = find_existing_ticket(thread['subject'], thread_participants)
+
+        if existing_ticket:
+            # Check for new messages to add
+            ticket = get_ticket_journals(existing_ticket)
+            last_update = datetime.fromisoformat(
+                ticket.get('created_on', '2000-01-01T00:00:00Z').replace('Z', '+00:00')
+            )
+
+            ticket_status = ticket.get('status', {}).get('name', '').lower()
+            is_closed = 'closed' in ticket_status or 'resolved' in ticket_status
+
+            for journal in ticket.get('journals', []):
+                notes = journal.get('notes', '')
+                if '--- New Email Update ---' in notes or 'From:' in notes:
+                    journal_time = datetime.fromisoformat(
+                        journal['created_on'].replace('Z', '+00:00')
+                    )
+                    if journal_time > last_update:
+                        last_update = journal_time
+
+            first_update = True
+            for email in thread['emails']:
+                email_time = datetime.fromtimestamp(email['timestamp'], tz=pytz.UTC)
+                if email_time > last_update:
+                    # Check if this content already exists in the ticket
+                    if content_exists_in_ticket(ticket, email['body']):
+                        logger.info(f"  Skipping duplicate content for ticket #{existing_ticket}")
+                        continue
+
+                    # Upload attachments from this email
+                    email_attachments = email.get('attachments', [])
+                    uploaded_attachments = []
+                    if email_attachments:
+                        logger.info(f"  Uploading {len(email_attachments)} attachment(s)")
+                        uploaded_attachments = upload_attachments_to_redmine(email_attachments)
+
+                    # Process body with inline image replacement
+                    processed_body = email['body']
+                    if uploaded_attachments:
+                        processed_body = replace_inline_images(processed_body, uploaded_attachments)
+                    processed_body = sanitize_for_redmine(strip_quoted_content(processed_body))
+
+                    # Skip empty updates (e.g., email was entirely quoted content)
+                    if not processed_body.strip() and not uploaded_attachments:
+                        logger.info(f"  Skipping empty update for ticket #{existing_ticket}")
+                        continue
+
+                    comment = f"--- New Email Update ---\n\nFrom: {email['from']}\n\n{processed_body}"
+                    reopen = is_closed and first_update
+                    update_ticket(existing_ticket, comment, reopen=reopen,
+                                 new_mlm=mlm_name if reopen else None,
+                                 attachments=uploaded_attachments)
+                    first_update = False
+        else:
+            # Analyze email with Claude (single call for spam, category, draft)
+            analysis = analyze_email_with_claude(
+                first_email['subject'],
+                first_email['body'],
+                first_email['from']
+            )
+
+            if analysis['is_spam']:
+                logger.info(f"Skipping spam: {first_email['subject'][:50]}")
+                continue
+
+            logger.info(f"  Category: {analysis['category']}")
+
+            # Upload attachments from the first email
+            first_attachments = first_email.get('attachments', [])
+            uploaded_attachments = []
+            if first_attachments:
+                logger.info(f"  Uploading {len(first_attachments)} attachment(s)")
+                uploaded_attachments = upload_attachments_to_redmine(first_attachments)
+
+            # Process body with inline image replacement
+            processed_body = first_email['body']
+            if uploaded_attachments:
+                processed_body = replace_inline_images(processed_body, uploaded_attachments)
+            processed_body = sanitize_for_redmine(strip_quoted_content(processed_body))
+
+            ticket_id = create_ticket(
+                thread['subject'],
+                processed_body,
+                sender_emails,
+                mlm_name,
+                category=analysis['category'],
+                attachments=uploaded_attachments
+            )
+
+            if ticket_id:
+                if len(thread['emails']) > 1:
+                    for email in thread['emails'][1:]:
+                        # Upload attachments from follow-up emails
+                        email_attachments = email.get('attachments', [])
+                        email_uploaded = []
+                        if email_attachments:
+                            logger.info(f"  Uploading {len(email_attachments)} attachment(s)")
+                            email_uploaded = upload_attachments_to_redmine(email_attachments)
+
+                        email_body = email['body']
+                        if email_uploaded:
+                            email_body = replace_inline_images(email_body, email_uploaded)
+                        email_body = sanitize_for_redmine(strip_quoted_content(email_body))
+
+                        comment = f"Message from: {email['from']}\n\n{email_body}"
+                        update_ticket(ticket_id, comment, attachments=email_uploaded)
+
+                if analysis['draft_response']:
+                    draft_note = f"--- AI Suggested Response (Draft) ---\n\n{analysis['draft_response']}"
+                    update_ticket(ticket_id, draft_note)
+
+
+def main():
+    """Main entry point."""
+    parser = argparse.ArgumentParser(description='MLQ Automation Script')
+    parser.add_argument('--dry-run', action='store_true', help='Run without making changes')
+    args = parser.parse_args()
+
+    global DRY_RUN
+    DRY_RUN = args.dry_run
+
+    if DRY_RUN:
+        logger.info("=== DRY RUN MODE - No changes will be made ===")
+
+    # Load configuration from ~/.hg.conf
+    load_config_file()
+
+    logger.info(f"=== MLQ Automation - {datetime.now(PST).strftime('%Y-%m-%d %H:%M:%S %Z')} ===")
+
+    # Process emails first - this catches direct posts from staff (not moderated)
+    # Must run BEFORE moderation approval to avoid timestamp issues
+    logger.info("--- Processing Emails ---")
+    process_emails()
+
+    # Process moderated list pending messages (approve and create tickets)
+    logger.info("--- Processing Moderated Lists ---")
+    process_moderated_lists()
+
+    logger.info("=== Complete ===")
+
+
+if __name__ == '__main__':
+    main()