833be021d3df3caba9a3ef6e2a6ceb9d41e2162b
lrnassar
  Sat Apr 11 12:45:32 2026 -0700
Increase Anthropic API retry resilience in mlqAutomate. Retries now wait longer
(5s/15s/45s/135s instead of 2s/4s/8s) to ride out transient 529 overload errors.
If retries are still exhausted, the script logs a clear error message and skips
the affected emails (or, for pending moderation messages, the whole batch spam
check for that run) instead of crashing, so the next cron run picks them up. refs #36801

diff --git src/utils/qa/mlqAutomate.py src/utils/qa/mlqAutomate.py
index c6763538349..ba8649df17b 100755
--- src/utils/qa/mlqAutomate.py
+++ src/utils/qa/mlqAutomate.py
@@ -245,31 +245,31 @@
         timeMax=end.isoformat(),
         singleEvents=True
     ).execute().get('items', [])
 
     for event in events:
         title = event.get('summary', '')
         match = re.search(r'MLM(?:\s+Rotating)?:\s*(\w+)', title, re.IGNORECASE)
         if match:
             cal_name = match.group(1)
             return CONFIG['NAME_MAPPING'].get(cal_name, cal_name)
 
     logger.warning(f"No MLM found for {target_date.date()}")
     return None
 
 
-@retry(max_attempts=3, delay=2, exceptions=(anthropic.APIError,))
+@retry(max_attempts=5, delay=5, backoff=3, exceptions=(anthropic.APIError,))
 def analyze_email_with_claude(subject, body, sender, group_email=None):
     """
     Use Claude to analyze an email in a single call.
     Returns dict with: is_spam, category, draft_response
     """
     client = anthropic.Anthropic(api_key=CONFIG['CLAUDE_API_KEY'])
     categories_list = ", ".join(MLQ_CATEGORIES)
 
     # The medical info spam rule only applies to public/moderated lists.
     # Unmoderated lists (e.g., genome-www) are only accessible to the Browser team,
     # so personal medical information is not a privacy concern there.
     is_unmoderated = group_email in CONFIG['UNMODERATED_LISTS']
 
     medical_rule = ""
     if not is_unmoderated:
@@ -329,31 +329,31 @@
             draft_response = line.split(':', 1)[1].strip()
             # Capture multi-line response
             idx = result_text.find('DRAFT_RESPONSE:')
             if idx != -1:
                 draft_response = result_text[idx + len('DRAFT_RESPONSE:'):].strip()
                 if draft_response.upper() == 'N/A':
                     draft_response = None
 
     return {
         'is_spam': is_spam,
         'category': category,
         'draft_response': draft_response
     }
 
 
-@retry(max_attempts=3, delay=2, exceptions=(anthropic.APIError,))
+@retry(max_attempts=5, delay=5, backoff=3, exceptions=(anthropic.APIError,))
 def batch_check_spam_with_claude(messages):
     """
     Use Claude to determine spam status for multiple emails in one call.
     Returns dict mapping message index to True (spam) or False (not spam).
     """
     if not messages:
         return {}
 
     client = anthropic.Anthropic(api_key=CONFIG['CLAUDE_API_KEY'])
 
     # Build the prompt with all messages
     emails_text = ""
     for i, msg in enumerate(messages, 1):
         emails_text += f"""
 --- EMAIL {i} ---
@@ -1644,32 +1644,37 @@
 
             # Skip empty updates (e.g., email was entirely quoted content)
             if not processed_body.strip() and not uploaded_attachments:
                 logger.info(f"  Skipping empty update for ticket #{existing_ticket}")
                 continue
 
             ticket_status = ticket.get('status', {}).get('name', '').lower()
             is_closed = 'closed' in ticket_status or 'resolved' in ticket_status
 
             comment = f"--- New Email Update ---\n\nFrom: {sender}\n\n{processed_body}"
             update_ticket(existing_ticket, comment, reopen=is_closed,
                          new_mlm=mlm_name if is_closed else None,
                          attachments=uploaded_attachments)
         else:
             # Analyze with Claude for category and draft response
+            try:
                 analysis = analyze_email_with_claude(subject, body, sender,
                                                     group_email=group_email)
+            except anthropic.APIError as e:
+                logger.error(f"Anthropic API overloaded after retries, skipping email "
+                             f"'{subject[:50]}'. Will be retried next run. Error: {e}")
+                continue
 
             logger.info(f"  Category: {analysis['category']}")
 
             # Create new ticket with attachments
             ticket_id = create_ticket(
                 subject,
                 processed_body,
                 sender_emails,
                 mlm_name,
                 category=analysis['category'],
                 attachments=uploaded_attachments
             )
 
             if ticket_id and analysis['draft_response']:
                 draft_note = f"--- AI Suggested Response (Draft) ---\n\n{analysis['draft_response']}"
@@ -1684,31 +1689,36 @@
         group_name = group_email.split('@')[0]
         logger.info(f"Checking pending messages for {group_email}")
 
         pending = get_pending_moderation_emails(group_name)
         logger.info(f"  Found {len(pending)} pending message(s)")
 
         for msg in pending:
             msg['group_email'] = group_email
             all_pending.append(msg)
 
     if not all_pending:
         return
 
     # Batch spam check all pending messages in one API call
     logger.info(f"Batch checking {len(all_pending)} message(s) for spam")
+    try:
         spam_results = batch_check_spam_with_claude(all_pending)
+    except anthropic.APIError as e:
+        logger.error(f"Anthropic API overloaded after retries, skipping spam check this run. "
+                     f"Pending messages will be retried in the next run. Error: {e}")
+        return
 
     # Process results and collect approved messages
     approved_messages = []
     for i, msg in enumerate(all_pending):
         is_spam = spam_results.get(i, False)
 
         if is_spam:
             logger.info(f"  SPAM detected (not approving): {msg['original_subject'][:50]}")
             delete_moderation_email(msg['gmail_id'])
         else:
             logger.info(f"  Approving: {msg['original_subject'][:50]}")
             if moderate_message(msg, approve=True):
                 delete_moderation_email(msg['gmail_id'])
                 approved_messages.append(msg)
 
@@ -1810,36 +1820,41 @@
                     processed_body = sanitize_for_redmine(strip_quoted_content(processed_body))
 
                     # Skip empty updates (e.g., email was entirely quoted content)
                     if not processed_body.strip() and not uploaded_attachments:
                         logger.info(f"  Skipping empty update for ticket #{existing_ticket}")
                         continue
 
                     comment = f"--- New Email Update ---\n\nFrom: {email['from']}\n\n{processed_body}"
                     reopen = is_closed and first_update
                     update_ticket(existing_ticket, comment, reopen=reopen,
                                  new_mlm=mlm_name if reopen else None,
                                  attachments=uploaded_attachments)
                     first_update = False
         else:
             # Analyze email with Claude (single call for spam, category, draft)
+            try:
                 analysis = analyze_email_with_claude(
                     first_email['subject'],
                     first_email['body'],
                     first_email['from'],
                     group_email=thread['group']
                 )
+            except anthropic.APIError as e:
+                logger.error(f"Anthropic API overloaded after retries, skipping email "
+                             f"'{first_email['subject'][:50]}'. Will be retried next run. Error: {e}")
+                continue
 
             if analysis['is_spam']:
                 logger.info(f"Skipping spam: {first_email['subject'][:50]}")
                 continue
 
             logger.info(f"  Category: {analysis['category']}")
 
             # Upload attachments from the first email
             first_attachments = first_email.get('attachments', [])
             uploaded_attachments = []
             if first_attachments:
                 logger.info(f"  Uploading {len(first_attachments)} attachment(s)")
                 uploaded_attachments = upload_attachments_to_redmine(first_attachments)
 
             # Process body with inline image replacement