src/utils/qa/mlqAutomate.py 47d5af90d317bffde6f0843beabd2d72c88fbfce

47d5af90d317bffde6f0843beabd2d72c88fbfce
lrnassar
  Tue Mar 10 08:18:06 2026 -0700
Adding retry logic around the gmail API call, because it fails periodically (but then works next time it runs). Refs #36801

diff --git src/utils/qa/mlqAutomate.py src/utils/qa/mlqAutomate.py
index 748fee08c08..30d93b7f1bd 100755
--- src/utils/qa/mlqAutomate.py
+++ src/utils/qa/mlqAutomate.py
@@ -1,1901 +1,1903 @@
 #!/usr/bin/env python3
 """
 MLQ Automation Script
 Monitors Gmail for mailing list emails, moderates pending messages,
 and creates/updates Redmine tickets.
 """
 
 import os
 import sys
 import argparse
 import base64
 import re
 import logging
 import html as html_module
 import time
 from datetime import datetime, timedelta
 from difflib import SequenceMatcher
 import email
 from email import policy
 from email.utils import parseaddr
 from email.mime.text import MIMEText
 from functools import wraps
 import pytz
 import requests
 from google.oauth2.credentials import Credentials
 from google_auth_oauthlib.flow import InstalledAppFlow
 from google.auth.transport.requests import Request
 from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
 import anthropic
 
 # Configuration
 # Script-wide settings. 'REDMINE_API_KEY' and 'CLAUDE_API_KEY' are intentionally
 # empty here; load_config_file() fills them in from ~/.hg.conf at runtime and
 # exits if either is still empty afterwards.
 CONFIG = {
     'REDMINE_URL': 'https://redmine.gi.ucsc.edu',
     'REDMINE_API_KEY': '',
     'REDMINE_PROJECT': 'maillists',
     'CALENDAR_ID': 'ucsc.edu_anbl4254jlssgo3gc2l5c8un5c@group.calendar.google.com',
     'CLAUDE_API_KEY': '',
 
     # Mailing lists
     'MODERATED_LISTS': ['genome@soe.ucsc.edu', 'genome-mirror@soe.ucsc.edu'],
     'UNMODERATED_LISTS': ['genome-www@soe.ucsc.edu'],
 
     # Name mapping from calendar to Redmine
     # (get_current_mlm() looks up the first word after "MLM:" in an event title here)
     'NAME_MAPPING': {
         'Jairo': 'Jairo Navarro',
         'Lou': 'Lou Nassar',
         'Gerardo': 'Gerardo Perez',
         'Gera': 'Gerardo Perez',
         'Clay': 'Clay Fischer',
         'Matt': 'Matt Speir',
     },
 
     # Redmine User IDs
     'USER_IDS': {
         'Jairo Navarro': 163,
         'Lou Nassar': 171,
         'Gerardo Perez': 179,
         'Clay Fischer': 161,
         'Matt Speir': 150,
     },
 
     # Redmine field IDs
     'TRACKER_ID': 7,        # MLQ
     'PRIORITY_ID': 12,      # Unprioritized
     'STATUS_ID': 1,         # New
     'CUSTOM_FIELDS': {
         'MLQ Category - primary': 28,
         'Email': 40,
         'MLM': 9,
     },
 }
 
 # Google OAuth scopes passed to the credential loader and the interactive
 # authorization flow in get_google_credentials().
 SCOPES = [
     'https://www.googleapis.com/auth/gmail.readonly',
     'https://www.googleapis.com/auth/gmail.send',
     'https://www.googleapis.com/auth/gmail.modify',
     'https://www.googleapis.com/auth/calendar.readonly',
 ]
 
 # Category names offered to Claude in analyze_email_with_claude(); a reply
 # naming anything outside this list leaves the category at "Other".
 MLQ_CATEGORIES = [
     "Other", "Alignments", "BLAT", "Bug Report", "CAPTCHA", "Command-line Utilities",
     "Conservation", "Custom Track", "Data - Availability (when)", "Data - Interpretation (what)",
     "Data - Location (where)", "Data Contribution", "Data Integrator", "Data Requests", "dbSNP",
     "Downloads", "ENCODE", "External Tools", "Feature Request", "GBiB", "GBiC",
     "Gene Interactions (hgGeneGraph)", "Gene Tracks", "Help Docs (Info)", "Hubs", "IP blocked",
     "JSON hubApi", "Licenses", "LiftOver", "Login", "Mirror - Asia", "Mirror - Europe",
     "Mirror Site & Utilities", "Multi-region", "MySQL", "PCR", "Publications & Citing",
     "Sessions", "Slow Performance", "Table Browser", "Track Collection Builder", "User Accounts",
     "Variant Annotation Integrator", "Widget"
 ]
 
 # Directory containing this script; used as the default location of the log file.
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 # Timezone for all "now" computations (America/Los_Angeles, so it follows DST).
 PST = pytz.timezone('America/Los_Angeles')
 # When True, the send/trash helpers below log what they would do and return
 # without calling the Gmail API.
 DRY_RUN = False
 
 # Setup logging
 # The log path can be overridden with the MLQ_LOG_FILE environment variable.
 # Messages go both to that file and to stdout.
 LOG_FILE = os.environ.get('MLQ_LOG_FILE', os.path.join(SCRIPT_DIR, 'mlq_automate.log'))
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s [%(levelname)s] %(message)s',
     handlers=[
         logging.FileHandler(LOG_FILE),
         logging.StreamHandler(sys.stdout)
     ]
 )
 logger = logging.getLogger(__name__)
 
 # Suppress httpx INFO logs (Anthropic client HTTP request logging)
 logging.getLogger('httpx').setLevel(logging.WARNING)
 
 # Config file path
 MLQ_CONF_PATH = os.path.expanduser('~/.hg.conf')
 
 
 def load_config_file():
     """
     Fill the API-key entries of CONFIG from ~/.hg.conf.
 
     The file is read as ``key=value`` lines; blank lines and lines starting
     with ``#`` are ignored, and lines without ``=`` are skipped with a warning.
     Only ``redmine.apiKey`` and ``claude.apiKey`` are copied into CONFIG.
 
     The process exits with status 1 when the file is missing, when its
     permission bits are anything other than 600, or when either key is
     still empty after loading.
     """
     if not os.path.exists(MLQ_CONF_PATH):
         for message in (
             f"Configuration file not found: {MLQ_CONF_PATH}",
             "Please add the following keys to ~/.hg.conf:",
             "  redmine.apiKey=YOUR_REDMINE_API_KEY",
             "  claude.apiKey=YOUR_CLAUDE_API_KEY",
         ):
             logger.error(message)
         sys.exit(1)
 
     # The file holds secrets, so refuse anything but owner read/write.
     file_mode = os.stat(MLQ_CONF_PATH).st_mode & 0o777
     if file_mode != 0o600:
         logger.error(f"Configuration file {MLQ_CONF_PATH} has insecure permissions: {oct(file_mode)}")
         logger.error("For security, this file must have 600 permissions.")
         logger.error(f"Run: chmod 600 {MLQ_CONF_PATH}")
         sys.exit(1)
 
     # Collect every key=value pair; only the first '=' separates key from value.
     config_values = {}
     with open(MLQ_CONF_PATH, 'r') as conf_file:
         for line_num, raw_line in enumerate(conf_file, 1):
             line = raw_line.strip()
             if not line or line.startswith('#'):
                 continue
             key, separator, value = line.partition('=')
             if not separator:
                 logger.warning(f"Skipping invalid line {line_num} in {MLQ_CONF_PATH}: {line}")
                 continue
             config_values[key.strip()] = value.strip()
 
     # (key in the conf file, key in CONFIG)
     wanted = (
         ('redmine.apiKey', 'REDMINE_API_KEY'),
         ('claude.apiKey', 'CLAUDE_API_KEY'),
     )
 
     for conf_key, config_key in wanted:
         if conf_key in config_values:
             CONFIG[config_key] = config_values[conf_key]
 
     # Both keys are mandatory; stop at the first one that is missing.
     for conf_key, config_key in wanted:
         if not CONFIG[config_key]:
             logger.error(f"Missing {conf_key} in {MLQ_CONF_PATH}")
             sys.exit(1)
 
     logger.info(f"Loaded configuration from {MLQ_CONF_PATH}")
 
 
 def retry(max_attempts=3, delay=2, backoff=2, exceptions=(Exception,)):
     """Return a decorator that re-runs a function when it raises.
 
     Args:
         max_attempts: total number of calls made before giving up.
         delay: seconds to sleep after the first failure.
         backoff: factor the sleep time is multiplied by after each failure.
         exceptions: exception types that trigger a retry; anything else
             propagates immediately.
 
     The final failure is logged at error level and re-raised; earlier
     failures are logged at warning level.
     """
     def decorator(func):
         @wraps(func)
         def wrapper(*args, **kwargs):
             wait = delay
             for attempt in range(1, max_attempts + 1):
                 try:
                     return func(*args, **kwargs)
                 except exceptions as e:
                     if attempt == max_attempts:
                         logger.error(f"{func.__name__} failed after {max_attempts} attempts: {e}")
                         raise
                     logger.warning(f"{func.__name__} attempt {attempt} failed: {e}. Retrying in {wait}s...")
                     time.sleep(wait)
                     wait *= backoff
         return wrapper
     return decorator
 
 
 def get_google_credentials():
     """Get or refresh Google API credentials.
 
     Loads a cached token from ~/.gmail_token.json when it exists. If there is
     no usable token, an expired one is refreshed when it carries a refresh
     token; otherwise the interactive OAuth flow is run on local port 8080
     using the client secrets in ~/.gmail_credentials.json. A new or refreshed
     token is written back to the token file.
 
     Returns:
         The credentials object for the scopes listed in SCOPES.
     """
     token_path = os.path.expanduser('~/.gmail_token.json')
     creds_path = os.path.expanduser('~/.gmail_credentials.json')
 
     creds = None
     if os.path.exists(token_path):
         creds = Credentials.from_authorized_user_file(token_path, SCOPES)
 
     if creds and creds.valid:
         return creds
 
     if creds and creds.expired and creds.refresh_token:
         creds.refresh(Request())
     else:
         flow = InstalledAppFlow.from_client_secrets_file(creds_path, SCOPES)
         creds = flow.run_local_server(port=8080)
 
     # The token grants mailbox access, so keep it owner-only, matching the
     # 600 requirement this script places on ~/.hg.conf. os.open() applies the
     # mode only when it creates the file; chmod covers a pre-existing file.
     fd = os.open(token_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
     with os.fdopen(fd, 'w') as f:
         f.write(creds.to_json())
     os.chmod(token_path, 0o600)
 
     return creds
 
 
 def get_current_mlm():
     """Get the current MLM based on PST time rules.
 
     The target day is today, or tomorrow once it is 17:00 or later; a target
     falling on Saturday or Sunday is moved to the following Monday. Tuesday
     targets always return 'Matt Speir' without consulting the calendar. For
     any other day the calendar CONFIG['CALENDAR_ID'] is searched for an event
     titled "MLM: <name>" or "MLM Rotating: <name>".
 
     Returns:
         The name mapped through CONFIG['NAME_MAPPING'] (or the raw calendar
         name when it has no mapping), or None when no matching event exists.
     """
     now = datetime.now(PST)
     target_date = now
 
     # After 5pm, use next day's MLM
     if now.hour >= 17:
         target_date += timedelta(days=1)
 
     # Weekend handling: Fri 5pm+ through Mon 5pm uses Monday's MLM
     weekday = target_date.weekday()
     if weekday == 5:  # Saturday
         target_date += timedelta(days=2)
     elif weekday == 6:  # Sunday
         target_date += timedelta(days=1)
 
     # Tuesday is always Matt
     if target_date.weekday() == 1:  # Tuesday
         return 'Matt Speir'
 
     # Only authenticate and build the Calendar client when a lookup is
     # actually needed; the Tuesday shortcut above requires neither.
     creds = get_google_credentials()
     service = build('calendar', 'v3', credentials=creds, cache_discovery=False)
 
     start = target_date.replace(hour=0, minute=0, second=0, microsecond=0)
     end = target_date.replace(hour=23, minute=59, second=59, microsecond=0)
 
     events = service.events().list(
         calendarId=CONFIG['CALENDAR_ID'],
         timeMin=start.isoformat(),
         timeMax=end.isoformat(),
         singleEvents=True
     ).execute().get('items', [])
 
     for event in events:
         title = event.get('summary', '')
         match = re.search(r'MLM(?:\s+Rotating)?:\s*(\w+)', title, re.IGNORECASE)
         if match:
             cal_name = match.group(1)
             return CONFIG['NAME_MAPPING'].get(cal_name, cal_name)
 
     logger.warning(f"No MLM found for {target_date.date()}")
     return None
 
 
 @retry(max_attempts=3, delay=2, exceptions=(anthropic.APIError,))
 def analyze_email_with_claude(subject, body, sender):
     """
     Use Claude to analyze an email in a single call.
 
     Args:
         subject: subject line of the email.
         body: plain-text body; only the first 3000 characters are sent.
             None is treated as an empty body.
         sender: sender address as shown to the model.
 
     Returns:
         dict with keys:
           'is_spam' (bool) - True when the SPAM line contains YES,
           'category' (str) - an entry of MLQ_CATEGORIES, "Other" when the
               reply names no known category,
           'draft_response' (str or None) - everything after the
               DRAFT_RESPONSE label, or None when it is "N/A" or absent.
 
     anthropic.APIError is retried by the decorator and re-raised after the
     third failed attempt.
     """
     client = anthropic.Anthropic(api_key=CONFIG['CLAUDE_API_KEY'])
     categories_list = ", ".join(MLQ_CATEGORIES)
     # Tolerate a missing body instead of failing on the slice in the prompt.
     body = body or ''
 
     prompt = f"""Analyze this email for the UCSC Genome Browser support team.
 
 From: {sender}
 Subject: {subject}
 Body:
 {body[:3000]}
 
 Provide your analysis in this exact format:
 
 SPAM: [YES or NO]
 CATEGORY: [Pick one from: {categories_list}]
 DRAFT_RESPONSE: [If not spam, write a helpful, professional response under 200 words. If spam, write "N/A"]
 
 Important:
 - Mark as SPAM if it is:
   - Conference/journal solicitations asking for paper submissions
   - Promotions for workshops, courses, training programs, or webinars
   - Marketing or promotional emails advertising services or products
   - Mass-sent announcements unrelated to genome browser support
   - Contains sensitive personal medical information (specific names with genetic test results, medical conditions, family medical history, or personal health details) - these are privacy concerns
 - Mark as NOT SPAM if it is a genuine question about using the UCSC Genome Browser (general genetics questions without personal identifying info are OK)
 - For CATEGORY, pick the most specific match. Use "Other" if unsure.
 - For DRAFT_RESPONSE, be helpful and concise. Ask clarifying questions if needed. Point to relevant documentation when appropriate."""
 
     response = client.messages.create(
         model="claude-sonnet-4-20250514",
         max_tokens=800,
         messages=[{"role": "user", "content": prompt}]
     )
 
     result_text = response.content[0].text.strip()
 
     # Parse the response
     is_spam = False
     category = "Other"
     draft_response = None
 
     lines = result_text.split('\n')
     for line_no, raw_line in enumerate(lines):
         line = raw_line.strip()
         label = line.upper()
         if label.startswith('SPAM:'):
             is_spam = 'YES' in label
         elif label.startswith('CATEGORY:'):
             cat = line.split(':', 1)[1].strip()
             # Validate category: exact match first, then case-insensitive;
             # an unknown name leaves the current value untouched.
             if cat in MLQ_CATEGORIES:
                 category = cat
             else:
                 category = next(
                     (c for c in MLQ_CATEGORIES if c.lower() == cat.lower()),
                     category,
                 )
         elif label.startswith('DRAFT_RESPONSE:'):
             # The draft is the rest of this line plus every following line.
             # Slicing from the matched line (rather than searching the whole
             # text for the exact-case label) keeps the capture and the N/A
             # check working whatever the label's capitalization.
             first = line.split(':', 1)[1]
             draft = '\n'.join([first] + lines[line_no + 1:]).strip()
             draft_response = None if draft.upper() == 'N/A' else draft
             # Everything after the label is draft text; stop so that a draft
             # line starting with "SPAM:" or "CATEGORY:" cannot overwrite the
             # verdict parsed above.
             break
 
     return {
         'is_spam': is_spam,
         'category': category,
         'draft_response': draft_response
     }
 
 
 @retry(max_attempts=3, delay=2, exceptions=(anthropic.APIError,))
 def batch_check_spam_with_claude(messages):
     """
     Use Claude to determine spam status for multiple emails in one call.
 
     Args:
         messages: list of dicts with 'original_from', 'original_subject' and
             'original_body'; only the first 1500 characters of each body
             are sent.
 
     Returns:
         dict mapping 0-based message index to True (spam) or False (not
         spam). Every index in range(len(messages)) is present; messages the
         reply does not mention default to False with a warning. An empty
         input returns {} without calling the API.
 
     anthropic.APIError is retried by the decorator and re-raised after the
     third failed attempt.
     """
     if not messages:
         return {}
 
     client = anthropic.Anthropic(api_key=CONFIG['CLAUDE_API_KEY'])
 
     # Build the prompt with all messages
     sections = []
     for i, msg in enumerate(messages, 1):
         sections.append(f"""
 --- EMAIL {i} ---
 From: {msg['original_from']}
 Subject: {msg['original_subject']}
 Body:
 {msg['original_body'][:1500]}
 """)
     emails_text = "".join(sections)
 
     prompt = f"""Analyze these emails and determine if each is spam. This is for a genome browser technical support mailing list.
 
 {emails_text}
 
 Mark as SPAM if it is:
 - Conference/journal solicitations asking for paper submissions
 - Promotions for workshops, courses, training programs, or webinars
 - Marketing or promotional emails advertising services or products
 - Phishing or scam attempts
 - Mass-sent unsolicited emails unrelated to genome browser support
 - Announcements about events, courses, or programs (not questions about the browser)
 - Contains sensitive personal medical information (specific names with genetic test results, medical conditions, family medical history, or personal health details) - these are privacy concerns
 
 Mark as NOT SPAM if it is:
 - A genuine question about the UCSC Genome Browser
 - A technical support request
 - A follow-up to an existing conversation
 - Someone asking how to use browser features for their research
 - General questions about genetic data without personal identifying information
 
 Reply with one line per email in this exact format:
 EMAIL 1: SPAM or NOT SPAM
 EMAIL 2: SPAM or NOT SPAM
 (etc.)"""
 
     # One verdict line is needed per email. A fixed 100-token budget cuts the
     # reply short for larger batches, and the cut-off emails would then fall
     # through to the "not spam" default below, so scale the budget with the
     # batch size (never below the previous 100).
     response = client.messages.create(
         model="claude-sonnet-4-20250514",
         max_tokens=max(100, 20 * len(messages)),
         messages=[{"role": "user", "content": prompt}]
     )
 
     result_text = response.content[0].text.strip().upper()
     results = {}
 
     for line in result_text.split('\n'):
         match = re.match(r'EMAIL\s*(\d+):\s*(SPAM|NOT SPAM)', line.strip())
         if not match:
             continue
         idx = int(match.group(1)) - 1  # Convert to 0-based index
         # Ignore numbers that do not correspond to a message we sent.
         if 0 <= idx < len(messages):
             results[idx] = match.group(2) == 'SPAM'
 
     # Default to not spam for any missing results
     for i in range(len(messages)):
         if i not in results:
             logger.warning(f"No spam result for message {i+1}, defaulting to not spam")
             results[i] = False
 
     return results
 
 
 def parse_raw_email_headers(raw_email):
     """Extract (subject, sender, body) from the raw text of an email.
 
     The sender is the first non-empty header among X-Original-Sender,
     Reply-To and From. The body is the first non-empty text/plain part;
     when there is none, the first non-empty text/html part is converted
     with html_to_text(). Missing pieces come back as empty strings.
     """
     parsed = email.message_from_string(raw_email, policy=policy.default)
 
     subject = parsed.get('subject', '')
     sender = parsed.get('x-original-sender') or parsed.get('reply-to')
     if not sender:
         sender = parsed.get('from', '')
 
     unreadable = object()  # marks "neither decoding route produced content"
 
     def read_content(part):
         """Decode a part via get_content(), falling back to a manual decode."""
         try:
             return part.get_content()
         except Exception:
             raw = part.get_payload(decode=True)
             if raw:
                 charset = part.get_content_charset() or 'utf-8'
                 return raw.decode(charset, errors='ignore')
         return unreadable
 
     plain_text = ''
     html_text = ''
 
     if parsed.is_multipart():
         # Keep the first non-empty part of each kind.
         for part in parsed.walk():
             kind = part.get_content_type()
             if kind == 'text/plain' and not plain_text:
                 decoded = read_content(part)
                 if decoded is not unreadable:
                     plain_text = decoded
             elif kind == 'text/html' and not html_text:
                 decoded = read_content(part)
                 if decoded is not unreadable:
                     html_text = decoded
     else:
         kind = parsed.get_content_type()
         content = read_content(parsed)
         if content is unreadable:
             content = parsed.get_payload()
 
         if kind == 'text/plain':
             plain_text = content
         elif kind == 'text/html':
             html_text = content
 
     # HTML is only a fallback for messages without usable plain text.
     if not plain_text and html_text:
         plain_text = html_to_text(html_text)
 
     return subject, sender, plain_text
 
 
+@retry(max_attempts=3, delay=2, exceptions=(HttpError,))
 def get_pending_moderation_emails(group_name):
     """Collect the "Message Pending" moderation notifications for a group.
 
     Searches the mailbox (excluding trash, at most 50 hits) for notifications
     whose subject names the group, and for each one pulls out the approval
     address from the From header and the original message carried as a
     message/rfc822 part.
 
     Returns:
         list of dicts with keys 'gmail_id', 'approve_addr',
         'original_subject', 'original_from', 'original_body',
         'original_attachments', 'notification_message_id' and
         'notification_subject'. Notifications without an approval address
         are left out.
     """
     def decode_data(data):
         """Decode a URL-safe base64 payload to text, dropping undecodable bytes."""
         return base64.urlsafe_b64decode(data).decode('utf-8', errors='ignore')
 
     def first_content(node, mime_type):
         """Depth-first search for the first non-empty body of the given MIME type."""
         if node.get('mimeType') == mime_type and node.get('body', {}).get('data'):
             return decode_data(node['body']['data'])
         for child in node.get('parts', []):
             found = first_content(child, mime_type)
             if found:
                 return found
         return ''
 
     def first_mail_headers(node):
         """Depth-first search for the first header set containing Subject or From."""
         if 'headers' in node:
             found = {h['name']: h['value'] for h in node['headers']}
             # Parts that only carry Content-Type style headers do not count.
             if 'Subject' in found or 'From' in found:
                 return found
         for child in node.get('parts', []):
             found = first_mail_headers(child)
             if found:
                 return found
         return {}
 
     creds = get_google_credentials()
     service = build('gmail', 'v1', credentials=creds, cache_discovery=False)
 
     # Pending moderation notifications for this group, trash excluded.
     query = f'subject:"{group_name} - soe.ucsc.edu admins: Message Pending" -in:trash'
     listing = service.users().messages().list(userId='me', q=query, maxResults=50).execute()
 
     pending = []
     for msg_ref in listing.get('messages', []):
         gmail_id = msg_ref['id']
         msg = service.users().messages().get(userId='me', id=gmail_id, format='full').execute()
         headers = {h['name']: h['value'] for h in msg['payload']['headers']}
 
         # The approval address is the bracketed +msgappr@ address in From.
         approve_addr = None
         from_addr = headers.get('From', '')
         if '+msgappr@' in from_addr:
             addr_match = re.search(r'<([^>]+\+msgappr@[^>]+)>', from_addr)
             if addr_match:
                 approve_addr = addr_match.group(1)
 
         original_subject = ''
         original_from = ''
         original_body = ''
         original_attachments = []  # Filled only for the pre-parsed (nested parts) form
 
         for part in msg['payload'].get('parts', []):
             if part['mimeType'] != 'message/rfc822':
                 continue
 
             if 'attachmentId' in part.get('body', {}):
                 # The original email is stored as an attachment: fetch and parse it.
                 att = service.users().messages().attachments().get(
                     userId='me', messageId=gmail_id, id=part['body']['attachmentId']
                 ).execute()
                 raw_email = decode_data(att['data'])
                 original_subject, original_from, original_body = parse_raw_email_headers(raw_email)
                 # Attachments inside the raw RFC822 text are not extracted here.
                 continue
 
             # Otherwise Gmail has already split the RFC822 message into nested parts.
             nested_headers = first_mail_headers(part)
             if nested_headers:
                 original_subject = nested_headers.get('Subject', original_subject)
                 original_from = (
                     nested_headers.get('X-Original-Sender') or
                     nested_headers.get('Reply-To') or
                     nested_headers.get('From') or
                     original_from
                 )
 
             # Plain text wins; HTML is converted only when no plain text exists.
             original_body = first_content(part, 'text/plain')
             if not original_body:
                 html_body = first_content(part, 'text/html')
                 if html_body:
                     original_body = html_to_text(html_body)
 
             original_attachments = extract_email_attachments(service, gmail_id, part)
 
         if not approve_addr:
             continue
 
         # Message-Id and Subject of the notification are kept for reply threading.
         pending.append({
             'gmail_id': gmail_id,
             'approve_addr': approve_addr,
             'original_subject': original_subject,
             'original_from': original_from,
             'original_body': original_body,
             'original_attachments': original_attachments,
             'notification_message_id': headers.get('Message-Id') or headers.get('Message-ID', ''),
             'notification_subject': headers.get('Subject', ''),
         })
 
     return pending
 
 
 def moderate_message(pending_msg, approve=True):
     """Approve a pending message by replying to its approval address.
 
     Sends an empty-bodied reply from gbauto@ucsc.edu to
     pending_msg['approve_addr'], threaded onto the notification through
     In-Reply-To/References when its Message-Id is known. The ``approve``
     argument is accepted but not consulted by this function.
 
     Returns:
         True when the reply was sent (or in dry-run mode), False when
         sending raised an exception.
     """
     if DRY_RUN:
         logger.info(f"  [DRY RUN] Would approve: {pending_msg['original_subject'][:50]}")
         return True
 
     creds = get_google_credentials()
     service = build('gmail', 'v1', credentials=creds, cache_discovery=False)
 
     reply = MIMEText('')
     reply['To'] = pending_msg['approve_addr']
     reply['From'] = 'gbauto@ucsc.edu'
 
     # Reply subject: prefix "Re:" unless it is already there; fall back to a
     # generic subject when the notification subject is unknown.
     notification_subject = pending_msg.get('notification_subject', '')
     if not notification_subject:
         reply['Subject'] = 'Re: Moderation'
     elif notification_subject.startswith('Re:'):
         reply['Subject'] = notification_subject
     else:
         reply['Subject'] = f'Re: {notification_subject}'
 
     # Threading headers make this a proper reply to the notification.
     parent_id = pending_msg.get('notification_message_id', '')
     if parent_id:
         reply['In-Reply-To'] = parent_id
         reply['References'] = parent_id
 
     raw_message = base64.urlsafe_b64encode(reply.as_bytes()).decode('utf-8')
 
     try:
         service.users().messages().send(
             userId='me',
             body={'raw': raw_message}
         ).execute()
     except Exception as e:
         logger.error(f"  Error approving message: {e}")
         return False
     return True
 
 
 def send_error_notification(subject, body):
     """Email an error report to browserqa-group@ucsc.edu.
 
     The message is sent from gbauto@ucsc.edu with the subject prefixed by
     "[MLQ Automation Error] ". Intended for alerting the team when Redmine
     API operations fail after retries.
 
     Returns:
         True when the mail was sent (or in dry-run mode), False when any
         step, including obtaining credentials, raised an exception.
     """
     if DRY_RUN:
         logger.info(f"  [DRY RUN] Would send error notification: {subject}")
         return True
 
     try:
         gmail = build(
             'gmail', 'v1',
             credentials=get_google_credentials(),
             cache_discovery=False,
         )
 
         notice = MIMEText(body)
         notice['To'] = 'browserqa-group@ucsc.edu'
         notice['From'] = 'gbauto@ucsc.edu'
         notice['Subject'] = f'[MLQ Automation Error] {subject}'
 
         payload = {'raw': base64.urlsafe_b64encode(notice.as_bytes()).decode('utf-8')}
         gmail.users().messages().send(userId='me', body=payload).execute()
     except Exception as e:
         logger.error(f"Failed to send error notification email: {e}")
         return False
 
     logger.info(f"Sent error notification email: {subject}")
     return True
 
 
 def delete_moderation_email(gmail_id):
     """Move a processed moderation notification to the Gmail trash.
 
     Does nothing in dry-run mode. A failure to trash the message is logged
     and otherwise ignored.
     """
     if not DRY_RUN:
         creds = get_google_credentials()
         gmail = build('gmail', 'v1', credentials=creds, cache_discovery=False)
         try:
             gmail.users().messages().trash(userId='me', id=gmail_id).execute()
         except Exception as e:
             logger.error(f"  Error trashing notification: {e}")
 
 
 def get_emails_from_gmail(group_email, minutes_ago=60):
     """Fetch recent emails addressed to a mailing list.
 
     Queries Gmail for messages sent to ``group_email`` within the last
     ``minutes_ago`` minutes (at most 50 hits) and skips moderation traffic:
     "Message Pending" / "Moderation response" subjects, mail from
     gbauto@ucsc.edu, and mail from +msgappr@ / +msgrej@ addresses.
 
     Returns:
         list of dicts with keys 'id', 'thread_id', 'subject', 'from', 'to',
         'cc', 'date', 'body', 'attachments' and 'timestamp' (internalDate
         divided by 1000).
     """
     def find_body(payload, target_type):
         """Recursively find content of target_type in email payload."""
         if payload.get('mimeType') == target_type:
             data = payload.get('body', {}).get('data', '')
             if data:
                 return base64.urlsafe_b64decode(data).decode('utf-8', errors='ignore')
         if 'parts' in payload:
             for child in payload['parts']:
                 found = find_body(child, target_type)
                 if found:
                     return found
         return ''
 
     creds = get_google_credentials()
     service = build('gmail', 'v1', credentials=creds, cache_discovery=False)
 
     cutoff = datetime.now(PST) - timedelta(minutes=minutes_ago)
     query = f"to:{group_email} after:{int(cutoff.timestamp())}"
 
     listing = service.users().messages().list(
         userId='me', q=query, maxResults=50
     ).execute()
 
     messages = []
     for msg_ref in listing.get('messages', []):
         msg = service.users().messages().get(
             userId='me', id=msg_ref['id'], format='full'
         ).execute()
         payload = msg['payload']
 
         headers = {h['name']: h['value'] for h in payload['headers']}
         subject = headers.get('Subject', '(no subject)')
 
         # Google Groups may rewrite From to the list address, so prefer
         # X-Original-Sender, then Reply-To, then From.
         from_addr = (
             headers.get('X-Original-Sender') or
             headers.get('Reply-To') or
             headers.get('From', '')
         )
 
         # Skip moderation notifications and our own replies to them.
         if 'Message Pending' in subject or 'Moderation response' in subject:
             continue
         if ('gbauto@ucsc.edu' in from_addr
                 or '+msgappr@' in from_addr
                 or '+msgrej@' in from_addr):
             continue
 
         # Plain text wins; HTML is converted only when no plain text exists.
         body = find_body(payload, 'text/plain')
         if not body:
             html_body = find_body(payload, 'text/html')
             if html_body:
                 body = html_to_text(html_body)
 
         attachments = extract_email_attachments(service, msg['id'], payload)
 
         messages.append({
             'id': msg['id'],
             'thread_id': msg['threadId'],
             'subject': subject,
             'from': from_addr,
             'to': headers.get('To', ''),
             'cc': headers.get('Cc', ''),
             'date': headers.get('Date', ''),
             'body': body,
             'attachments': attachments,
             'timestamp': int(msg['internalDate']) / 1000,
         })
 
     return messages
 
 
 def html_to_text(html):
     """Reduce an HTML document to plain text.
 
     Script and style elements are dropped with their content, <br>, </p>,
     </div> and </li> become line breaks, every remaining tag is removed and
     HTML entities are decoded. Runs of spaces/tabs are collapsed, spaces at
     line edges are removed and three or more newlines shrink to two.
 
     Returns '' for an empty or missing input.
     """
     if not html:
         return ''
 
     # (pattern, replacement, flags), applied in order before entity decoding.
     tag_rules = (
         (r'<script[^>]*>.*?</script>', '', re.DOTALL | re.IGNORECASE),
         (r'<style[^>]*>.*?</style>', '', re.DOTALL | re.IGNORECASE),
         (r'<br\s*/?>', '\n', re.IGNORECASE),
         (r'</p>', '\n\n', re.IGNORECASE),
         (r'</div>', '\n', re.IGNORECASE),
         (r'</li>', '\n', re.IGNORECASE),
         (r'<[^>]+>', '', 0),
     )
     # (pattern, replacement), applied in order after entity decoding.
     whitespace_rules = (
         (r'[ \t]+', ' '),
         (r'\n[ \t]+', '\n'),
         (r'[ \t]+\n', '\n'),
         (r'\n{3,}', '\n\n'),
     )
 
     text = html
     for pattern, replacement, flags in tag_rules:
         text = re.sub(pattern, replacement, text, flags=flags)
 
     text = html_module.unescape(text)
 
     for pattern, replacement in whitespace_rules:
         text = re.sub(pattern, replacement, text)
 
     return text.strip()
 
 
 def sanitize_for_redmine(text):
     """Make email text safe to store and display in Redmine.
 
     Characters outside the Basic Multilingual Plane (emoji and other
     4-byte UTF-8 characters) are removed, Outlook's doubled "URL<URL>" form
     is reduced to a single URL, and each line is adjusted so that deep
     indentation, leading '#', rule-like lines and prose starting with a
     dash or asterisk are not picked up as markup. URLs are deliberately not
     turned into links here.
     """
     def clean_line(line):
         """Apply the per-line indentation and escaping rules."""
         content = line.lstrip(' \t')
         indent = len(line) - len(content)
 
         # 4+ leading blanks, or any indent that begins with a tab, is cut
         # down to two spaces so it keeps some structure without reading as code.
         if indent >= 4 or (indent > 0 and line.startswith('\t')):
             line = '  ' + content
 
         # A leading '#' would become a header.
         if line.startswith('#'):
             line = '\\' + line
 
         # A line made only of ---, *** or ___ would become a horizontal rule.
         if re.match(r'^[-*_]{3,}\s*$', line):
             line = '\\' + line
 
         # "- some prose" would become a list item; single-word items are
         # assumed to be real lists and left alone.
         starts_like_list = re.match(r'^[-*]\s+[a-z]', line, re.IGNORECASE)
         if starts_like_list and not re.match(r'^[-*]\s+\S+\s*$', line):
             line = '\\' + line
 
         return line
 
     # MySQL utf8 stores at most 3 bytes per character, so drop anything wider.
     text = re.sub(r'[\U00010000-\U0010FFFF]', '', text)
 
     # Outlook writes links as URL<URL>; keep only the first copy.
     text = re.sub(
         r'(https?://[^\s<>\[\]]+)<https?://[^\s<>\[\]>]+>',
         r'\1',
         text
     )
 
     return '\n'.join(clean_line(line) for line in text.split('\n'))
 
 
 def strip_quoted_content(body):
     """Remove quoted reply content and Google Groups footer from email body.
 
     Handles three reply styles:
     1. Top-posted: New content at top, quoted content below (most common)
     2. Inline/interleaved: Short quotes with replies below each (less common)
     3. Bottom-posted: Quoted content at top, new content below (e.g., some
        Thunderbird/Linux mail clients)
 
     A quote header ("On ... wrote:", possibly wrapped over two lines) is
     treated as the start of a bottom-posted reply when at most one non-blank
     line has been kept above it. In that case the header, the following
     ">"-prefixed and blank lines, and anything kept so far are discarded, and
     scanning continues with the new content. Otherwise the header ends the
     message (top-posted reply).
 
     Scanning also stops at "Original Message" / "Forwarded message" dividers,
     Outlook "From:/Sent:/To:" header blocks, and the Google Groups footer.
     Quoted (">") lines that are not part of a skipped block are kept.
 
     Args:
         body: Plain-text email body.
 
     Returns:
         The kept lines joined with newlines, with trailing blank lines,
         '---'/'--' separator lines and trailing whitespace removed.
     """
     lines = body.split('\n')
     cleaned = []
     i = 0
 
     # Pattern to strip Unicode control characters (LTR/RTL marks, etc.)
     # These can appear at the start or end of lines in emails with mixed-direction text
     unicode_control_chars = '[\u200e\u200f\u202a-\u202e\u2066-\u2069]'
     unicode_control_pattern = re.compile(f'^{unicode_control_chars}+|{unicode_control_chars}+$')
 
     while i < len(lines):
         line = lines[i]
         # Strip Unicode control characters for pattern matching
         line_stripped = unicode_control_pattern.sub('', line)
         # Also strip leading quote markers ("> ") to catch quoted "On...wrote:" patterns
         # e.g., "> On Jan 15, 2026, at 12:01 AM, Name wrote:"
         line_unquoted = re.sub(r'^>\s*', '', line_stripped)
 
         # Detect "On <date> <person> wrote:" quote headers
         is_quote_header = False
         quote_header_lines = 1  # How many lines this header spans
 
         # Single-line header: "On <anything> wrote:"
         if re.match(r'^On .+wrote:\s*$', line_unquoted, re.IGNORECASE):
             is_quote_header = True
         # Variant with ", at <time>" in it
         elif re.match(r'^On .+, at .+wrote:\s*$', line_unquoted, re.IGNORECASE):
             is_quote_header = True
         # Header wrapped over two lines: this line looks like the first half
         # (ends with an address in angle brackets, or has a year plus AM/PM);
         # it counts only if the next line ends with "wrote:".
         elif re.match(r'^On .+@.+>\s*$', line_unquoted) or re.match(r'^On .*\d{4}.*[AP]M .+', line_unquoted):
             if i + 1 < len(lines):
                 next_stripped = unicode_control_pattern.sub('', lines[i + 1])
                 next_unquoted = re.sub(r'^>\s*', '', next_stripped).strip().lower()
                 if next_unquoted.endswith('wrote:'):
                     is_quote_header = True
                     quote_header_lines = 2
         # Second half of a wrapped header whose first half was not recognised
         # above: look back one line for "On ..." and, if that line was already
         # kept, remove it again.
         elif line_unquoted.strip().lower().endswith('wrote:') and i > 0:
             prev = lines[i - 1]
             prev_stripped = unicode_control_pattern.sub('', prev)
             prev_unquoted = re.sub(r'^>\s*', '', prev_stripped)
             if re.match(r'^On .+', prev_unquoted, re.IGNORECASE):
                 if cleaned and cleaned[-1] == prev:
                     cleaned.pop()
                 is_quote_header = True
 
         if is_quote_header:
             # Bottom-posted reply: quote header near the start means new content
             # is below the quoted block. Skip the quoted block, keep what follows.
             # Allow at most 1 non-blank line above (e.g., a greeting like "Hi,")
             non_blank_above = sum(1 for l in cleaned if l.strip())
             if non_blank_above <= 1:
                 # Skip past the quote header
                 i += quote_header_lines
                 # Skip the quoted lines (starting with ">") and blank lines
                 while i < len(lines):
                     l = lines[i]
                     l_stripped = unicode_control_pattern.sub('', l).strip()
                     if l_stripped.startswith('>') or l_stripped == '':
                         i += 1
                     else:
                         break
                 # Reset cleaned — anything before the quote header was blank/trivial
                 # (this also drops the single greeting line, if there was one)
                 cleaned = []
                 continue
             else:
                 # Top-posted reply: we already have real content above, stop here
                 break
 
         # Outlook-style divider: "-----Original Message-----"
         if re.match(r'^-{4,}\s*Original Message\s*-{4,}', line_stripped, re.IGNORECASE):
             break
         # Gmail forwarded message format: "---------- Forwarded message ---------"
         if re.match(r'^-{4,}\s*Forwarded message\s*-{4,}', line_stripped, re.IGNORECASE):
             break
         # Outlook single-line forward format
         if re.match(r'^From:.*Sent:.*To:', line_stripped, re.IGNORECASE):
             break
         # Outlook multi-line forward format:
         # From: Name
         # Sent: Date
         # To: Recipients
         if re.match(r'^From:\s*.+', line_stripped, re.IGNORECASE):
             # Look ahead for Sent: and To: on subsequent lines
             if i + 2 < len(lines):
                 next1 = unicode_control_pattern.sub('', lines[i + 1])
                 next2 = unicode_control_pattern.sub('', lines[i + 2])
                 if re.match(r'^Sent:\s*.+', next1, re.IGNORECASE) and re.match(r'^To:\s*.+', next2, re.IGNORECASE):
                     break
         # Stop at Google Groups footer
         if 'You received this message because you are subscribed to the Google Groups' in line:
             break
         # Skip Google Groups unsubscribe line
         if 'To unsubscribe from this group and stop receiving emails from it' in line:
             i += 1
             continue
         # Drop a bare '---' separator line once some content has been kept
         if line.strip() == '---' and len(cleaned) > 0:
             i += 1
             continue
 
         # Handle quoted lines (starting with >)
         # Keep quoted lines for inline reply context - they provide important context
         # The "On ... wrote:" and other patterns above will stop at the full original message
         cleaned.append(line)
         i += 1
 
     # Remove trailing dashes and whitespace
     while cleaned and cleaned[-1].strip() in ('', '---', '--'):
         cleaned.pop()
 
     return '\n'.join(cleaned).rstrip()
 
 
 # Attachment handling constants
 
 # Regexes tried with re.match against the lower-cased filename with its
 # extension removed (see is_signature_attachment); a match means the file is
 # treated as signature artwork and skipped.
 SIGNATURE_ATTACHMENT_PATTERNS = [
     r'^logo',
     r'^signature',
     r'^banner',
     r'^icon',
     r'^footer',
     r'^divider',
     # Note: Removed image\d* pattern - Outlook uses this for legitimate inline images.
     # Size filter (MIN_ATTACHMENT_SIZE) handles small signature icons instead.
 ]
 MIN_ATTACHMENT_SIZE = 1024  # Bytes; attachments smaller than 1KB are skipped (likely icons)
 MAX_ATTACHMENT_SIZE = 10 * 1024 * 1024  # Bytes; 10MB max per attachment
 
 
 def is_signature_attachment(filename, size_bytes):
     """Return True when an attachment should be skipped as signature clutter.
 
     An attachment is skipped when it has no filename, is smaller than
     MIN_ATTACHMENT_SIZE bytes, or its name (extension removed) matches one
     of SIGNATURE_ATTACHMENT_PATTERNS.
     """
     # Nameless parts and tiny files (likely icons/spacers) are always skipped
     if not filename or size_bytes < MIN_ATTACHMENT_SIZE:
         return True
 
     # Patterns are tested against the lower-cased name without its extension
     stem = filename.lower().rsplit('.', 1)[0]
     return any(
         re.match(pattern, stem, re.IGNORECASE)
         for pattern in SIGNATURE_ATTACHMENT_PATTERNS
     )
 
 
 def extract_email_attachments(gmail_service, message_id, payload):
     """
     Collect the downloadable attachments of one Gmail message.
 
     Walks `payload` and all nested 'parts'. A part counts as an attachment
     only when it has both a filename and a body attachmentId. Parts rejected
     by is_signature_attachment() or larger than MAX_ATTACHMENT_SIZE are
     logged and skipped; a failed download is logged and skipped as well.
 
     Returns a list of dicts with: filename, mimeType, data (decoded bytes),
     size (length of the decoded data).
     """
     collected = []
 
     # Explicit stack instead of recursion. Children are pushed in reverse so
     # parts are still visited parent-first, in document order.
     pending = [payload]
     while pending:
         part = pending.pop()
         part_body = part.get('body', {})
         filename = part.get('filename', '')
         attachment_id = part_body.get('attachmentId')
         declared_size = part_body.get('size', 0)
 
         if attachment_id and filename:
             if is_signature_attachment(filename, declared_size):
                 logger.debug(f"Skipping signature attachment: {filename} ({declared_size} bytes)")
             elif declared_size > MAX_ATTACHMENT_SIZE:
                 logger.warning(f"Skipping oversized attachment: {filename} ({declared_size} bytes)")
             else:
                 try:
                     request = gmail_service.users().messages().attachments().get(
                         userId='me', messageId=message_id, id=attachment_id
                     )
                     raw = base64.urlsafe_b64decode(request.execute()['data'])
                     collected.append({
                         'filename': filename,
                         'mimeType': part.get('mimeType', ''),
                         'data': raw,
                         'size': len(raw)
                     })
                 except Exception as e:
                     # Best effort: one bad attachment must not stop the rest
                     logger.warning(f"Failed to download attachment {filename}: {e}")
 
         children = list(part.get('parts', []))
         pending.extend(reversed(children))
 
     return collected
 
 
 def upload_attachments_to_redmine(attachments):
     """
     Upload attachments to Redmine and return tokens.
 
     Each attachment is POSTed to <REDMINE_URL>/uploads.json as
     application/octet-stream. Uploads are best effort: a failed upload is
     logged as a warning and left out of the result.
 
     Args:
         attachments: List of dicts with 'filename', 'data' (bytes) and
             'mimeType' (as built by extract_email_attachments); may be
             empty or None.
 
     Returns list of dicts with: filename, token, content_type
     """
     if not attachments:
         return []
 
     uploaded = []
     for att in attachments:
         try:
             upload_url = f"{CONFIG['REDMINE_URL']}/uploads.json"
             headers = {
                 'X-Redmine-API-Key': CONFIG['REDMINE_API_KEY'],
                 'Content-Type': 'application/octet-stream'
             }
 
             # Send the filename through `params` so requests percent-encodes
             # it. Splicing it into the URL by hand breaks the query string for
             # names containing '&', '#', '?', '+' or '%'.
             resp = requests.post(
                 upload_url,
                 params={'filename': att['filename']},
                 data=att['data'],
                 headers=headers,
                 timeout=60,
             )
 
             if resp.status_code == 201:
                 token = resp.json()['upload']['token']
                 uploaded.append({
                     'filename': att['filename'],
                     'token': token,
                     'content_type': att['mimeType']
                 })
                 logger.debug(f"Uploaded attachment: {att['filename']}")
             elif resp.status_code == 422:
                 logger.warning(f"Attachment too large for Redmine: {att['filename']}")
             else:
                 logger.warning(f"Failed to upload {att['filename']}: {resp.status_code}")
         except Exception as e:
             # Deliberately broad: one failed upload must not abort the batch
             logger.warning(f"Error uploading attachment {att['filename']}: {e}")
 
     return uploaded
 
 
 def replace_inline_images(body, uploaded_attachments):
     """
     Turn inline image placeholders into Redmine inline image syntax.
 
     Recognised placeholders (matched case-insensitively):
     - Gmail format: [image: filename.png]
     - Outlook CID format: [cid:filename.png@identifier]
 
     A placeholder is rewritten to !filename! only when its filename matches
     (ignoring case) one of the uploaded attachments; otherwise it is left
     untouched. With no uploaded attachments the body is returned unchanged.
     """
     if not uploaded_attachments:
         return body
 
     # lower-cased filename -> filename exactly as uploaded
     uploaded_by_key = {}
     for att in uploaded_attachments:
         uploaded_by_key[att['filename'].lower()] = att['filename']
 
     def to_inline(match, name):
         """Return the !filename! form, or the original text if unknown."""
         real_name = uploaded_by_key.get(name.lower())
         if real_name is None:
             return match.group(0)
         return f'!{real_name}!'
 
     def from_gmail(match):
         # Gmail: surrounding whitespace is not part of the filename
         return to_inline(match, match.group(1).strip())
 
     def from_cid(match):
         # Outlook: the filename is everything before the first '@', or the
         # whole content id when there is no '@'
         return to_inline(match, match.group(1).partition('@')[0])
 
     body = re.sub(r'\[image:\s*([^\]]+)\]', from_gmail, body, flags=re.IGNORECASE)
     return re.sub(r'\[cid:([^\]]+)\]', from_cid, body, flags=re.IGNORECASE)
 
 
 def extract_email_address(from_header):
     """Return the bare, lower-cased address from a From header value."""
     address = parseaddr(from_header)[1]
     return address.lower()
 
 
 def extract_all_email_addresses(header_value):
     """Extract all email addresses from a header (To, CC can have multiple).
 
     Args:
         header_value: Raw header value, e.g.
             '"Doe, John" <john@x.com>, jane@y.com'; may be empty or None.
 
     Returns:
         List of lower-cased addresses in header order ([] when none found).
     """
     if not header_value:
         return []
 
     # Function-scope import: the file's top-level imports only bring in parseaddr.
     from email.utils import getaddresses
 
     # getaddresses() understands quoted display names, so a comma inside
     # "Doe, John" no longer splits one recipient into bogus entries.
     parsed = [addr.lower() for _, addr in getaddresses([header_value]) if addr]
     if parsed:
         return parsed
 
     # Fallback: the previous best-effort parse (split on commas, parse each
     # piece), used when the structured parse yields no addresses at all, e.g.
     # a malformed header rejected as a whole.
     addresses = []
     for part in header_value.split(','):
         _, addr = parseaddr(part.strip())
         if addr:
             addresses.append(addr.lower())
     return addresses
 
 
 def extract_hgusersuggestion_email(subject):
     """Pull the user's email out of an hgUserSuggestion form subject.
 
     Form submissions arrive with subjects shaped like:
     'hgUserSuggestion mmcgary44@gmail.com 2026-01-28 05:59:21'
 
     Returns the lower-cased email, or None when the subject is empty, is not
     an hgUserSuggestion subject, or does not follow that shape.
     """
     # Cheap guard before running the regex
     if not subject or 'hgusersuggestion' not in subject.lower():
         return None
 
     # Expected layout: hgUserSuggestion <email> <YYYY-MM-DD> ...
     found = re.search(
         r'hgusersuggestion\s+([^\s]+@[^\s]+)\s+\d{4}-\d{2}-\d{2}',
         subject,
         re.IGNORECASE,
     )
     return found.group(1).lower() if found else None
 
 
 def normalize_subject(subject):
     """Reduce a subject line to its core text for thread matching.
 
     Strips, from ANYWHERE in the subject (not only the start), the configured
     mailing-list tags, [External]/"External:" gateway markers, and
     Re:/Fwd:/FW:/AW: prefixes, so that e.g.
     'TICKET-123 Re: [genome] Original subject' still matches its thread.
     Leading punctuation left behind is dropped and whitespace is collapsed.
 
     Returns '' for an empty or None subject.
     """
     if not subject:
         return ''
 
     result = subject.strip()
 
     # One "[listname]" tag per configured list, generated from CONFIG so the
     # patterns stay in sync with the configured lists
     list_tags = [
         re.escape(f"[{addr.split('@')[0]}]")
         for addr in CONFIG['MODERATED_LISTS'] + CONFIG['UNMODERATED_LISTS']
     ]
 
     # Applied in this order, all case-insensitively. The \b keeps the prefix
     # patterns from matching inside words (e.g. "Re-install").
     removals = list_tags + [
         r'\[external\]',      # [External] tag added by email gateways
         r'\bexternal:\s*',    # External:
         r'\bre:\s*',      # Re: RE:
         r'\bfwd?:\s*',    # Fwd: FW: Fw:
         r'\baw:\s*',      # AW: (German "Antwort")
     ]
     for pattern in removals:
         result = re.sub(pattern, '', result, flags=re.IGNORECASE)
 
     # Drop leftover leading punctuation (e.g., ": Subject" -> "Subject")
     result = re.sub(r'^[\s:,\-]+', '', result)
 
     # Collapse whitespace runs and trim both ends
     return ' '.join(result.split())
 
 
 @retry(max_attempts=3, delay=2, exceptions=(requests.RequestException,))
 def find_existing_ticket(subject, thread_emails):
     """Find an existing Redmine ticket by subject and email match.
 
     Requires both:
     1. Normalized subject match
     2. At least one email from thread_emails matches the ticket's Email field
 
     Exception: if any thread participant is a @ucsc.edu address (staff), the
     email match is skipped and subject match alone is sufficient. Staff replies
     to mailing list threads typically go to the list, not the original sender,
     so the original sender's email won't appear in To/CC.
 
     thread_emails should include all participants (From, To, CC) to handle
     replies where the original sender appears in To/CC fields.
 
     Args:
         subject: Subject of the incoming email (normalized internally).
         thread_emails: Iterable of participant addresses; blank entries are
             ignored.
 
     Returns:
         The id of the first matching issue, or None when nothing matches.
 
     Raises:
         requests.HTTPError (via raise_for_status) on a non-success response.
     """
     normalized = normalize_subject(subject)
 
     url = f"{CONFIG['REDMINE_URL']}/issues.json"
     params = {
         'project_id': CONFIG['REDMINE_PROJECT'],
         'subject': f"~{normalized}",  # '~' is Redmine's "contains" operator
         'status_id': '*',             # any status, open or closed
         'limit': 100,
     }
     headers = {'X-Redmine-API-Key': CONFIG['REDMINE_API_KEY']}
 
     resp = requests.get(url, params=params, headers=headers, timeout=30)
     resp.raise_for_status()
     data = resp.json()
 
     # Blank entries must be dropped: the match below is a substring test and
     # '' is a substring of every string, so one unparseable sender (which
     # extract_email_address returns as '') would match any ticket that shares
     # the subject.
     email_list = [e.lower() for e in thread_emails if e and e.strip()]
     has_staff_participant = any(e.endswith('@ucsc.edu') for e in email_list)
 
     for issue in data.get('issues', []):
         # The server-side filter is only "contains"; require an exact match
         # on the normalized subject here
         if normalize_subject(issue['subject']).lower() != normalized.lower():
             continue
 
         # Staff replies to mailing list threads don't need email match —
         # subject match is sufficient since staff wouldn't start a new
         # unrelated thread with the same subject
         if has_staff_participant:
             return issue['id']
 
         # For external senders, require email match to avoid false positives
         # on generic subjects
         email_field = next(
             (f for f in issue.get('custom_fields', [])
              if f['id'] == CONFIG['CUSTOM_FIELDS']['Email']),
             None
         )
         if email_field and email_field.get('value'):
             # Same reason as above: a trailing or doubled comma in the field
             # would otherwise yield '' and match every participant
             ticket_emails = [
                 e.strip().lower()
                 for e in email_field['value'].split(',')
                 if e.strip()
             ]
             # Substring match in either direction is kept as in the original.
             # NOTE(review): presumably this tolerates "Name <addr>" style
             # field values; confirm before tightening to exact equality.
             if any(te in ee or ee in te for te in email_list for ee in ticket_emails):
                 return issue['id']
 
     return None
 
 
 def normalize_for_comparison(text):
     """Strip formatting from text so duplicates can be compared.
 
     Removes quote markers, Redmine inline images, emphasis/code markers,
     the '--- New Email Update ... ---' / '--- AI Suggested Response ... ---'
     banners and 'From:' lines, then collapses whitespace and lower-cases.
     """
     # (pattern, flags) pairs, applied in order
     removals = (
         (r'^>\s*', re.MULTILINE),   # Quote markers
         (r'!\S+!', 0),              # Redmine inline images: !filename.png!
         (r'[*_`~]', 0),             # Bold, italic, code markers (keep ! for punctuation)
         (r'---\s*(New Email Update|AI Suggested Response).*?---', re.DOTALL),
         (r'^From:.*$', re.MULTILINE),  # From: lines
     )
     for pattern, flags in removals:
         text = re.sub(pattern, '', text, flags=flags)
 
     # Collapse whitespace and lowercase
     return ' '.join(text.split()).lower()
 
 
 def text_similarity(text1, text2):
     """Return the difflib SequenceMatcher ratio (0.0 to 1.0) of two texts."""
     matcher = SequenceMatcher(None, text1, text2)
     return matcher.ratio()
 
 
 def text_containment(text1, text2, threshold=0.85):
     """Check if the shorter text is substantially contained within the longer text.
 
     This handles cases where a draft reply is posted to Redmine, then sent via email
     with additional content (greeting, signature, etc.). The draft would be contained
     within the email even though overall similarity is low.
 
     Args:
         text1, text2: The two texts, in either order.
         threshold: Fraction of the shorter text that must be matched.
 
     Returns True if at least `threshold` (default 85%) of the shorter text is found
     as a contiguous match within the longer text. Returns False when either text
     is empty or the shorter one has fewer than 20 characters.
     """
     if not text1 or not text2:
         return False
 
     shorter, longer = (text1, text2) if len(text1) <= len(text2) else (text2, text1)
 
     # Too little text to give a meaningful containment signal
     if len(shorter) < 20:
         return False
 
     # Find the longest contiguous match.
     # autojunk must be off: with the default heuristic, once the second
     # sequence has 200+ items every character making up more than ~1% of it
     # is ignored when seeding matches. For ordinary prose that is nearly
     # every letter, so a fully contained draft can be missed. The exact
     # search is slower on very long texts.
     matcher = SequenceMatcher(None, shorter, longer, autojunk=False)
     match = matcher.find_longest_match(0, len(shorter), 0, len(longer))
 
     # Calculate what percentage of the shorter text is contained in the longer
     containment_ratio = match.size / len(shorter)
     return containment_ratio >= threshold
 
 
 def content_exists_in_ticket(ticket, email_body, similarity_threshold=0.80, containment_threshold=0.85):
     """Report whether the email's new content is already present in a ticket.
 
     The email body is stripped of quoted content and normalized, then compared
     against the ticket description and every journal note using two checks:
     1. Overall similarity (default 80% threshold) - near-identical content
     2. Containment (default 85% threshold) - a draft reply that was posted to
        Redmine and then emailed with extra content (greeting, signature, etc.)
 
     This keeps the script from reopening tickets when staff post their draft
     reply to Redmine and then send it via email.
 
     Returns False without comparing when the stripped or the normalized email
     text is shorter than 20 characters.
     """
     new_content = strip_quoted_content(email_body).strip()
     if len(new_content) < 20:
         # Very short content, skip duplicate check
         return False
 
     email_normalized = normalize_for_comparison(new_content)
     if len(email_normalized) < 20:
         # Too little left once formatting is removed
         return False
 
     def existing_texts():
         """Yield (label, raw text) for the description, then each journal."""
         yield "ticket description", ticket.get('description', '')
         for journal in ticket.get('journals', []):
             yield f"journal #{journal.get('id')}", journal.get('notes', '')
 
     for label, raw_text in existing_texts():
         if not raw_text:
             continue
         existing_normalized = normalize_for_comparison(raw_text)
         if not existing_normalized:
             continue
 
         # First check: overall similarity
         score = text_similarity(email_normalized, existing_normalized)
         if score >= similarity_threshold:
             logger.debug(f"Found similar content in {label} ({score*100:.0f}% similarity)")
             return True
 
         # Second check: shorter text contained in the longer one
         if text_containment(email_normalized, existing_normalized, containment_threshold):
             logger.debug(f"Found contained content in {label}")
             return True
 
     logger.debug("No similar content found in ticket")
     return False
 
 
 @retry(max_attempts=3, delay=2, exceptions=(requests.RequestException,))
 def get_ticket_journals(ticket_id):
     """Fetch a Redmine issue together with its journal (comment) history.
 
     Returns the 'issue' object of the JSON response ({} when it is absent).
     A non-success HTTP status raises through raise_for_status().
     """
     response = requests.get(
         f"{CONFIG['REDMINE_URL']}/issues/{ticket_id}.json?include=journals",
         headers={'X-Redmine-API-Key': CONFIG['REDMINE_API_KEY']},
         timeout=30,
     )
     response.raise_for_status()
     payload = response.json()
     return payload.get('issue', {})
 
 
 def create_ticket(subject, body, sender_emails, mlm_name, category='Other', attachments=None):
     """Create a new Redmine ticket with optional attachments.
 
     Network errors and non-4xx error responses are retried, up to three
     attempts in total, with exponential backoff (5s, then 10s). Client errors
     (4xx) are not retried. When the ticket cannot be created, an error
     notification is sent through send_error_notification().
 
     Args:
         subject: Ticket subject; characters above U+FFFF are stripped.
         body: Description text (expected to be sanitized already).
         sender_emails: Non-empty list of addresses. The first is shown in the
             description; all are stored in the Email custom field.
         mlm_name: Value for the MLM custom field.
         category: Value for the "MLQ Category - primary" custom field.
         attachments: Optional list of dicts with 'token', 'filename' and
             'content_type' (from upload_attachments_to_redmine).
 
     Returns:
         The new ticket id, or None in dry-run mode or when creation failed.
     """
     if DRY_RUN:
         att_info = f" with {len(attachments)} attachment(s)" if attachments else ""
         logger.info(f"  [DRY RUN] Would create ticket: {subject[:50]}{att_info}")
         logger.info(f"    Category: {category}, MLM: {mlm_name}")
         return None
 
     url = f"{CONFIG['REDMINE_URL']}/issues.json"
     headers = {
         'X-Redmine-API-Key': CONFIG['REDMINE_API_KEY'],
         'Content-Type': 'application/json',
     }
 
     # Strip emojis from subject (body should already be sanitized via sanitize_for_redmine)
     clean_subject = re.sub(r'[\U00010000-\U0010FFFF]', '', subject)
 
     data = {
         'issue': {
             'project_id': CONFIG['REDMINE_PROJECT'],
             'subject': clean_subject,
             'description': f"From: {sender_emails[0]}\n\n{body}",
             'tracker_id': CONFIG['TRACKER_ID'],
             'priority_id': CONFIG['PRIORITY_ID'],
             'status_id': CONFIG['STATUS_ID'],
             'custom_fields': [
                 {'id': CONFIG['CUSTOM_FIELDS']['MLQ Category - primary'], 'value': category},
                 {'id': CONFIG['CUSTOM_FIELDS']['Email'], 'value': ', '.join(sender_emails)},
                 {'id': CONFIG['CUSTOM_FIELDS']['MLM'], 'value': mlm_name},
             ],
         }
     }
 
     # Add attachments if provided (from upload_attachments_to_redmine)
     if attachments:
         data['issue']['uploads'] = [
             {'token': att['token'], 'filename': att['filename'], 'content_type': att['content_type']}
             for att in attachments
         ]
 
     # Retry logic for transient errors
     max_attempts = 3
     retry_delay = 5  # seconds
     last_error = None
     attempts_made = 0  # reported in the failure notification
 
     for attempt in range(1, max_attempts + 1):
         attempts_made = attempt
         try:
             resp = requests.post(url, json=data, headers=headers, timeout=30)
 
             if resp.status_code == 201:
                 ticket_id = resp.json()['issue']['id']
                 att_info = f" with {len(attachments)} attachment(s)" if attachments else ""
                 logger.info(f"Created ticket #{ticket_id}: {subject[:50]}{att_info}")
                 return ticket_id
 
             # Log detailed error info
             logger.error(f"Redmine API error creating ticket (attempt {attempt}/{max_attempts}): "
                         f"HTTP {resp.status_code} - {resp.text[:500]}")
             # Record every failure, including unexpected statuses outside
             # 4xx/5xx, so the notification never reports "Error: None"
             last_error = f"HTTP {resp.status_code}: {resp.text[:500]}"
 
             # Don't retry client errors (4xx) - something wrong with our request
             if 400 <= resp.status_code < 500:
                 break
 
         except requests.RequestException as e:
             logger.error(f"Network error creating ticket (attempt {attempt}/{max_attempts}): {e}")
             last_error = f"Network error: {e}"
 
         # Any other failure is treated as transient: back off, then retry
         if attempt < max_attempts:
             logger.info(f"  Retrying in {retry_delay} seconds...")
             time.sleep(retry_delay)
             retry_delay *= 2  # Exponential backoff
 
     # Ticket was not created - send notification email
     error_body = f"""The MLQ automation script failed to create a Redmine ticket after {attempts_made} attempt(s).
 
 Subject: {subject}
 From: {', '.join(sender_emails)}
 Category: {category}
 MLM: {mlm_name}
 
 Error: {last_error}
 
 This email needs to be manually processed or the issue investigated.
 
 Timestamp: {datetime.now(PST).strftime('%Y-%m-%d %H:%M:%S %Z')}
 """
     send_error_notification(f"Failed to create ticket: {subject[:50]}", error_body)
     return None
 
 
 def update_ticket(ticket_id, comment, reopen=False, new_mlm=None, attachments=None):
     """Add a comment to an existing ticket, optionally reopening it and attaching files.
 
     Network errors and non-4xx error responses are retried, up to three
     attempts in total, with exponential backoff (5s, then 10s). Client errors
     (4xx) are not retried. When the update cannot be applied, an error
     notification is sent through send_error_notification().
 
     Args:
         ticket_id: Id of the Redmine issue to update.
         comment: Note text; characters above U+FFFF are stripped.
         reopen: When True, also set the status to CONFIG['STATUS_ID'] and
             clear the assignee.
         new_mlm: New value for the MLM custom field; only applied together
             with reopen=True.
         attachments: Optional list of dicts with 'token', 'filename' and
             'content_type' (from upload_attachments_to_redmine).
 
     Returns:
         True on success (and always in dry-run mode), False otherwise.
     """
     if DRY_RUN:
         action = "reopen and update" if reopen else "update"
         att_info = f" with {len(attachments)} attachment(s)" if attachments else ""
         logger.info(f"  [DRY RUN] Would {action} ticket #{ticket_id}{att_info}")
         return True
 
     url = f"{CONFIG['REDMINE_URL']}/issues/{ticket_id}.json"
     headers = {
         'X-Redmine-API-Key': CONFIG['REDMINE_API_KEY'],
         'Content-Type': 'application/json',
     }
 
     # Strip emojis from comment (may not have gone through sanitize_for_redmine)
     clean_comment = re.sub(r'[\U00010000-\U0010FFFF]', '', comment)
 
     data = {'issue': {'notes': clean_comment}}
     if reopen:
         data['issue']['status_id'] = CONFIG['STATUS_ID']
         data['issue']['assigned_to_id'] = ''  # Clear assignee
         if new_mlm:
             data['issue']['custom_fields'] = [
                 {'id': CONFIG['CUSTOM_FIELDS']['MLM'], 'value': new_mlm}
             ]
 
     # Add attachments if provided (from upload_attachments_to_redmine)
     if attachments:
         data['issue']['uploads'] = [
             {'token': att['token'], 'filename': att['filename'], 'content_type': att['content_type']}
             for att in attachments
         ]
 
     # Retry logic for transient errors
     max_attempts = 3
     retry_delay = 5  # seconds
     last_error = None
     attempts_made = 0  # reported in the failure notification
 
     for attempt in range(1, max_attempts + 1):
         attempts_made = attempt
         try:
             resp = requests.put(url, json=data, headers=headers, timeout=30)
 
             if resp.status_code in (200, 204):
                 action = "Reopened and updated" if reopen else "Updated"
                 att_info = f" with {len(attachments)} attachment(s)" if attachments else ""
                 logger.info(f"{action} ticket #{ticket_id}{att_info}")
                 return True
 
             # Log detailed error info
             logger.error(f"Redmine API error updating ticket #{ticket_id} (attempt {attempt}/{max_attempts}): "
                         f"HTTP {resp.status_code} - {resp.text[:500]}")
             # Record every failure, including unexpected statuses outside
             # 4xx/5xx, so the notification never reports "Error: None"
             last_error = f"HTTP {resp.status_code}: {resp.text[:500]}"
 
             # Don't retry client errors (4xx) - something wrong with our request
             if 400 <= resp.status_code < 500:
                 break
 
         except requests.RequestException as e:
             logger.error(f"Network error updating ticket #{ticket_id} (attempt {attempt}/{max_attempts}): {e}")
             last_error = f"Network error: {e}"
 
         # Any other failure is treated as transient: back off, then retry
         if attempt < max_attempts:
             logger.info(f"  Retrying in {retry_delay} seconds...")
             time.sleep(retry_delay)
             retry_delay *= 2  # Exponential backoff
 
     # Update was not applied - send notification email
     comment_preview = comment[:200] + '...' if len(comment) > 200 else comment
     error_body = f"""The MLQ automation script failed to update Redmine ticket #{ticket_id} after {attempts_made} attempt(s).
 
 Ticket: #{ticket_id}
 Action: {'Reopen and update' if reopen else 'Update'}
 Comment preview: {comment_preview}
 
 Error: {last_error}
 
 This update needs to be manually applied or the issue investigated.
 
 Timestamp: {datetime.now(PST).strftime('%Y-%m-%d %H:%M:%S %Z')}
 """
     send_error_notification(f"Failed to update ticket #{ticket_id}", error_body)
     return False
 
 
 def create_tickets_for_approved(approved_messages):
     """Create or update Redmine tickets for approved moderation messages."""
     if not approved_messages:
         return
 
     mlm_name = get_current_mlm()
     if not mlm_name:
         logger.error("Cannot determine MLM, skipping ticket creation for approved messages")
         return
 
     mlm_user_id = CONFIG['USER_IDS'].get(mlm_name)
     if not mlm_user_id:
         logger.error(f"No Redmine user ID for MLM: {mlm_name}")
         return
 
     logger.info(f"Creating tickets for {len(approved_messages)} approved message(s), MLM: {mlm_name}")
 
     for msg in approved_messages:
         subject = msg['original_subject']
         sender = msg['original_from']
         body = msg['original_body']
         group_email = msg['group_email']
         attachments = msg.get('original_attachments', [])
 
         # Add list prefix to subject if not present (for consistency with delivered emails)
         group_name = group_email.split('@')[0]
         prefix = f"[{group_name}]"
         if not subject.lower().startswith(prefix.lower()):
             subject = f"{prefix} {subject}"
 
         # Extract sender email address
         sender_email = extract_email_address(sender)
         sender_emails = [sender_email]
 
         # For hgUserSuggestion forms, the actual user's email is in the subject
         # Add it to sender_emails so ticket matching works when team replies to user
         user_email = extract_hgusersuggestion_email(msg['original_subject'])
         if user_email and user_email not in sender_emails:
             sender_emails.append(user_email)
             logger.info(f"  hgUserSuggestion: added user email {user_email} to ticket")
 
         # Upload attachments to Redmine first (we need filenames for inline replacement)
         uploaded_attachments = []
         if attachments:
             logger.info(f"  Uploading {len(attachments)} attachment(s)")
             uploaded_attachments = upload_attachments_to_redmine(attachments)
 
         # Process body: replace inline image placeholders, strip quotes, sanitize
         # Order matters: replace images BEFORE stripping to keep them in context
         processed_body = body
         if uploaded_attachments:
             processed_body = replace_inline_images(processed_body, uploaded_attachments)
         processed_body = sanitize_for_redmine(strip_quoted_content(processed_body))
 
         # Check for existing ticket (handles follow-up messages in threads)
         existing_ticket = find_existing_ticket(subject, sender_emails)
 
         if existing_ticket:
             # Update existing ticket
             logger.info(f"  Found existing ticket #{existing_ticket} for: {subject[:50]}")
             ticket = get_ticket_journals(existing_ticket)
 
             # Check if content already exists to avoid duplicates
             if content_exists_in_ticket(ticket, body):
                 logger.info(f"  Skipping duplicate content for ticket #{existing_ticket}")
                 continue
 
             # Skip empty updates (e.g., email was entirely quoted content)
             if not processed_body.strip() and not uploaded_attachments:
                 logger.info(f"  Skipping empty update for ticket #{existing_ticket}")
                 continue
 
             ticket_status = ticket.get('status', {}).get('name', '').lower()
             is_closed = 'closed' in ticket_status or 'resolved' in ticket_status
 
             comment = f"--- New Email Update ---\n\nFrom: {sender}\n\n{processed_body}"
             update_ticket(existing_ticket, comment, reopen=is_closed,
                          new_mlm=mlm_name if is_closed else None,
                          attachments=uploaded_attachments)
         else:
             # Analyze with Claude for category and draft response
             analysis = analyze_email_with_claude(subject, body, sender)
 
             logger.info(f"  Category: {analysis['category']}")
 
             # Create new ticket with attachments
             ticket_id = create_ticket(
                 subject,
                 processed_body,
                 sender_emails,
                 mlm_name,
                 category=analysis['category'],
                 attachments=uploaded_attachments
             )
 
             if ticket_id and analysis['draft_response']:
                 draft_note = f"--- AI Suggested Response (Draft) ---\n\n{analysis['draft_response']}"
                 update_ticket(ticket_id, draft_note)
 
 
 def process_moderated_lists():
     """Process pending messages in moderated mailing lists."""
     # Collect all pending messages from all moderated lists
     all_pending = []
     for group_email in CONFIG['MODERATED_LISTS']:
         group_name = group_email.split('@')[0]
         logger.info(f"Checking pending messages for {group_email}")
 
         pending = get_pending_moderation_emails(group_name)
         logger.info(f"  Found {len(pending)} pending message(s)")
 
         for msg in pending:
             msg['group_email'] = group_email
             all_pending.append(msg)
 
     if not all_pending:
         return
 
     # Batch spam check all pending messages in one API call
     logger.info(f"Batch checking {len(all_pending)} message(s) for spam")
     spam_results = batch_check_spam_with_claude(all_pending)
 
     # Process results and collect approved messages
     approved_messages = []
     for i, msg in enumerate(all_pending):
         is_spam = spam_results.get(i, False)
 
         if is_spam:
             logger.info(f"  SPAM detected (not approving): {msg['original_subject'][:50]}")
             delete_moderation_email(msg['gmail_id'])
         else:
             logger.info(f"  Approving: {msg['original_subject'][:50]}")
             if moderate_message(msg, approve=True):
                 delete_moderation_email(msg['gmail_id'])
                 approved_messages.append(msg)
 
     # Create/update tickets for approved messages immediately
     create_tickets_for_approved(approved_messages)
 
 
 def process_emails():
     """Process emails and create/update Redmine tickets."""
     mlm_name = get_current_mlm()
     if not mlm_name:
         logger.error("Cannot determine MLM, skipping email processing")
         return
 
     mlm_user_id = CONFIG['USER_IDS'].get(mlm_name)
     if not mlm_user_id:
         logger.error(f"No Redmine user ID for MLM: {mlm_name}")
         return
 
     logger.info(f"Current MLM: {mlm_name} (ID: {mlm_user_id})")
 
     all_lists = CONFIG['MODERATED_LISTS'] + CONFIG['UNMODERATED_LISTS']
 
     # Group emails by thread
     threads = {}
     for group_email in all_lists:
         emails = get_emails_from_gmail(group_email)
         for email in emails:
             thread_id = email['thread_id']
             if thread_id not in threads:
                 threads[thread_id] = {
                     'subject': email['subject'],
                     'emails': [],
                     'group': group_email,
                 }
             threads[thread_id]['emails'].append(email)
 
     for thread_id, thread in threads.items():
         thread['emails'].sort(key=lambda x: x['timestamp'])
         first_email = thread['emails'][0]
 
         # Sender emails - just the From addresses (for ticket creation)
         sender_emails = list(set(extract_email_address(e['from']) for e in thread['emails']))
 
         # All participant emails (From, To, CC) for matching existing tickets
         # This handles replies where original sender appears in To/CC
         # Note: We include mailing list addresses because some tickets (like hgUserSuggestion
         # form submissions) have the mailing list as their Email field
         thread_participants = set()
         for e in thread['emails']:
             thread_participants.add(extract_email_address(e['from']))
             thread_participants.update(extract_all_email_addresses(e.get('to', '')))
             thread_participants.update(extract_all_email_addresses(e.get('cc', '')))
 
         # Remove empty strings but keep mailing list addresses for matching
         thread_participants = [e for e in thread_participants if e]
 
         existing_ticket = find_existing_ticket(thread['subject'], thread_participants)
 
         if existing_ticket:
             # Check for new messages to add
             ticket = get_ticket_journals(existing_ticket)
             last_update = datetime.fromisoformat(
                 ticket.get('created_on', '2000-01-01T00:00:00Z').replace('Z', '+00:00')
             )
 
             ticket_status = ticket.get('status', {}).get('name', '').lower()
             is_closed = 'closed' in ticket_status or 'resolved' in ticket_status
 
             for journal in ticket.get('journals', []):
                 notes = journal.get('notes', '')
                 if '--- New Email Update ---' in notes or 'From:' in notes:
                     journal_time = datetime.fromisoformat(
                         journal['created_on'].replace('Z', '+00:00')
                     )
                     if journal_time > last_update:
                         last_update = journal_time
 
             first_update = True
             for email in thread['emails']:
                 email_time = datetime.fromtimestamp(email['timestamp'], tz=pytz.UTC)
                 if email_time > last_update:
                     # Check if this content already exists in the ticket
                     if content_exists_in_ticket(ticket, email['body']):
                         logger.info(f"  Skipping duplicate content for ticket #{existing_ticket}")
                         continue
 
                     # Upload attachments from this email
                     email_attachments = email.get('attachments', [])
                     uploaded_attachments = []
                     if email_attachments:
                         logger.info(f"  Uploading {len(email_attachments)} attachment(s)")
                         uploaded_attachments = upload_attachments_to_redmine(email_attachments)
 
                     # Process body with inline image replacement
                     processed_body = email['body']
                     if uploaded_attachments:
                         processed_body = replace_inline_images(processed_body, uploaded_attachments)
                     processed_body = sanitize_for_redmine(strip_quoted_content(processed_body))
 
                     # Skip empty updates (e.g., email was entirely quoted content)
                     if not processed_body.strip() and not uploaded_attachments:
                         logger.info(f"  Skipping empty update for ticket #{existing_ticket}")
                         continue
 
                     comment = f"--- New Email Update ---\n\nFrom: {email['from']}\n\n{processed_body}"
                     reopen = is_closed and first_update
                     update_ticket(existing_ticket, comment, reopen=reopen,
                                  new_mlm=mlm_name if reopen else None,
                                  attachments=uploaded_attachments)
                     first_update = False
         else:
             # Analyze email with Claude (single call for spam, category, draft)
             analysis = analyze_email_with_claude(
                 first_email['subject'],
                 first_email['body'],
                 first_email['from']
             )
 
             if analysis['is_spam']:
                 logger.info(f"Skipping spam: {first_email['subject'][:50]}")
                 continue
 
             logger.info(f"  Category: {analysis['category']}")
 
             # Upload attachments from the first email
             first_attachments = first_email.get('attachments', [])
             uploaded_attachments = []
             if first_attachments:
                 logger.info(f"  Uploading {len(first_attachments)} attachment(s)")
                 uploaded_attachments = upload_attachments_to_redmine(first_attachments)
 
             # Process body with inline image replacement
             processed_body = first_email['body']
             if uploaded_attachments:
                 processed_body = replace_inline_images(processed_body, uploaded_attachments)
             processed_body = sanitize_for_redmine(strip_quoted_content(processed_body))
 
             ticket_id = create_ticket(
                 thread['subject'],
                 processed_body,
                 sender_emails,
                 mlm_name,
                 category=analysis['category'],
                 attachments=uploaded_attachments
             )
 
             if ticket_id:
                 if len(thread['emails']) > 1:
                     for email in thread['emails'][1:]:
                         # Upload attachments from follow-up emails
                         email_attachments = email.get('attachments', [])
                         email_uploaded = []
                         if email_attachments:
                             logger.info(f"  Uploading {len(email_attachments)} attachment(s)")
                             email_uploaded = upload_attachments_to_redmine(email_attachments)
 
                         email_body = email['body']
                         if email_uploaded:
                             email_body = replace_inline_images(email_body, email_uploaded)
                         email_body = sanitize_for_redmine(strip_quoted_content(email_body))
 
                         comment = f"Message from: {email['from']}\n\n{email_body}"
                         update_ticket(ticket_id, comment, attachments=email_uploaded)
 
                 if analysis['draft_response']:
                     draft_note = f"--- AI Suggested Response (Draft) ---\n\n{analysis['draft_response']}"
                     update_ticket(ticket_id, draft_note)
 
 
 def main():
     """Main entry point."""
     parser = argparse.ArgumentParser(description='MLQ Automation Script')
     parser.add_argument('--dry-run', action='store_true', help='Run without making changes')
     args = parser.parse_args()
 
     global DRY_RUN
     DRY_RUN = args.dry_run
 
     if DRY_RUN:
         logger.info("=== DRY RUN MODE - No changes will be made ===")
 
     # Load configuration from ~/.hg.conf
     load_config_file()
 
     logger.info(f"=== MLQ Automation - {datetime.now(PST).strftime('%Y-%m-%d %H:%M:%S %Z')} ===")
 
     # Process emails first - this catches direct posts from staff (not moderated)
     # Must run BEFORE moderation approval to avoid timestamp issues
     logger.info("--- Processing Emails ---")
     process_emails()
 
     # Process moderated list pending messages (approve and create tickets)
     logger.info("--- Processing Moderated Lists ---")
     process_moderated_lists()
 
     logger.info("=== Complete ===")
 
 
# Run the automation only when this file is executed as a script, so that
# importing the module does not trigger any processing.
if __name__ == '__main__':
    main()