953908311427465877fb3d27ae8e9dc2469e71ac
lrnassar
  Wed Dec 3 17:39:14 2025 -0800
Adding a new cron script that will report CAPTCHA bypass usage, refs #36797

diff --git src/utils/qa/reportCaptchaBypass.py src/utils/qa/reportCaptchaBypass.py
new file mode 100644
index 00000000000..0c069282da3
--- /dev/null
+++ src/utils/qa/reportCaptchaBypass.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+# Reports user-agent CAPTCHA exceptions and API-key CAPTCHA bypasses for the RR, Euro, and Dev regions. Refs #36797
+
+import gzip
+import os
+import re
+import subprocess
+import zlib
+from collections import defaultdict
+from datetime import datetime
+from pathlib import Path
+
+def get_current_year_and_month():
+    """Return the year whose log directory should be scanned.
+
+    Despite the name, a single integer (the year) is returned. When the
+    script runs in January the previous calendar year is returned; in every
+    other month the current year is returned.
+    """
+    today = datetime.now()
+    # A January run reports on logs filed under the prior year.
+    return today.year - 1 if today.month == 1 else today.year
+
+def get_last_n_error_logs(directory, n=5):
+    """Return the paths of the last ``n`` error logs in ``directory``.
+
+    Only entries named ``error_log.*.gz`` are considered. They are sorted by
+    path (the original author notes the file name includes the date) and the
+    final ``n`` are returned in ascending order.
+
+    Args:
+        directory: Directory to scan.
+        n: Maximum number of log paths to return.
+
+    Returns:
+        A list of at most ``n`` full paths. Empty when ``directory`` is not an
+        existing directory or when ``n`` is not positive.
+    """
+    # isdir (rather than exists) so that a regular file at this path yields an
+    # empty result instead of a NotADirectoryError from listdir.
+    if n <= 0 or not os.path.isdir(directory):
+        return []
+
+    log_files = sorted(
+        os.path.join(directory, name)
+        for name in os.listdir(directory)
+        if name.startswith('error_log.') and name.endswith('.gz')
+    )
+    # n <= 0 is rejected above because log_files[-0:] would return every file.
+    return log_files[-n:]
+
+def extract_api_keys_from_log(log_file):
+    """Count CAPTCHA-bypass API key occurrences in a gzipped log file.
+
+    Every ``CAPTCHAPASS_APIKEY <key>`` occurrence is counted, including
+    several on one line, where ``<key>`` is the run of non-whitespace
+    characters after the single space.
+
+    Args:
+        log_file: Path to a gzip-compressed log file.
+
+    Returns:
+        A ``defaultdict(int)`` mapping each API key to its occurrence count.
+        If the file cannot be opened or decompressed, an error line is printed
+        and whatever was counted so far (possibly nothing) is returned.
+    """
+    api_key_counts = defaultdict(int)
+    # re.ASCII keeps \S byte-oriented, like the grep -P pattern this replaces.
+    key_pattern = re.compile(r'CAPTCHAPASS_APIKEY (\S+)', re.ASCII)
+
+    try:
+        # Decompress in-process instead of via a shell pipeline, so paths with
+        # spaces or shell metacharacters are safe and read errors are not
+        # hidden. errors='replace' keeps one undecodable byte from aborting
+        # the whole file.
+        with gzip.open(log_file, 'rt', encoding='utf-8', errors='replace') as log:
+            for line in log:
+                for api_key in key_pattern.findall(line):
+                    api_key_counts[api_key] += 1
+    except (OSError, EOFError, zlib.error) as e:
+        # Best effort: one unreadable log must not stop the whole report.
+        print(f"Error processing {log_file}: {e}")
+
+    return api_key_counts
+
+def extract_bot_agents_from_log(log_file):
+    """Count user-agent CAPTCHA exceptions in a gzipped log file.
+
+    A line is considered when it contains ``CAPTCHAPASS`` but not
+    ``CAPTCHAPASS_APIKEY``. The agent is the first non-whitespace token after
+    the word ``matches``, with trailing ``.,;:`` removed. Lines without
+    ``matches`` are ignored.
+
+    Args:
+        log_file: Path to a gzip-compressed log file.
+
+    Returns:
+        A ``defaultdict(int)`` mapping each agent token to its number of
+        matching lines. If the file cannot be opened or decompressed, an error
+        line is printed and whatever was counted so far is returned.
+    """
+    bot_agent_counts = defaultdict(int)
+    agent_pattern = re.compile(r'matches\s+(\S+)')
+
+    try:
+        # Decompress in-process instead of via a shell pipeline, so paths with
+        # spaces or shell metacharacters are safe and read errors are not
+        # hidden.
+        with gzip.open(log_file, 'rt', encoding='utf-8', errors='replace') as log:
+            for line in log:
+                # API-key bypass lines also contain CAPTCHAPASS; they are
+                # tallied by a separate function, so skip them here.
+                if 'CAPTCHAPASS' not in line or 'CAPTCHAPASS_APIKEY' in line:
+                    continue
+                match = agent_pattern.search(line)
+                if match:
+                    # Drop punctuation the log message puts after the name.
+                    agent = match.group(1).rstrip('.,;:')
+                    bot_agent_counts[agent] += 1
+    except (OSError, EOFError, zlib.error) as e:
+        # Best effort: one unreadable log must not stop the whole report.
+        print(f"Error processing {log_file}: {e}")
+
+    return bot_agent_counts
+
+def _query_api_key_mappings(argv, label):
+    """Run an hgsql query command and parse its output into a lookup dict.
+
+    The command is expected to print a header line followed by
+    ``userName<TAB>apiKey`` rows. Rows that do not split into exactly two
+    tab-separated fields are skipped.
+
+    Args:
+        argv: Command and arguments, run directly without a shell.
+        label: Region name used in the error message.
+
+    Returns:
+        A dict mapping apiKey to userName. Empty when the command cannot be
+        started or exits non-zero; in both cases an error line is printed so
+        that a failed lookup is not mistaken for keys missing from the table.
+    """
+    mappings = {}
+
+    try:
+        result = subprocess.run(argv, capture_output=True, text=True, errors='replace')
+    except OSError as e:
+        print(f"Error querying {label} database: {e}")
+        return mappings
+
+    if result.returncode != 0:
+        detail = result.stderr.strip() or f"exit status {result.returncode}"
+        print(f"Error querying {label} database: {detail}")
+        return mappings
+
+    # The first line is the column header.
+    for row in result.stdout.strip().split('\n')[1:]:
+        parts = row.strip().split('\t')
+        if len(parts) == 2:
+            username, api_key = parts
+            mappings[api_key] = username
+
+    return mappings
+
+def get_api_key_mappings_rr():
+    """Return an apiKey -> userName dict from hgcentral on host genome-centdb."""
+    return _query_api_key_mappings(
+        ['hgsql', '-h', 'genome-centdb', '-e',
+         'select userName, apiKey from apiKeys', 'hgcentral'],
+        'RR',
+    )
+
+def get_api_key_mappings_euro():
+    """Return an apiKey -> userName dict from hgcentral, queried over ssh on genome-euro."""
+    return _query_api_key_mappings(
+        ['ssh', 'qateam@genome-euro',
+         "hgsql -e 'select userName, apiKey from apiKeys' hgcentral"],
+        'Euro',
+    )
+
+def get_api_key_mappings_dev():
+    """Return an apiKey -> userName dict from the local hgcentraltest database."""
+    return _query_api_key_mappings(
+        ['hgsql', '-e', 'select userName, apiKey from apiKeys', 'hgcentraltest'],
+        'Dev',
+    )
+
+def process_region(region_name, directories, get_mappings_func):
+    """Tally CAPTCHA-bypass activity for one region and print its report.
+
+    The last five error logs of every directory are scanned, API-key and
+    bot-agent counts are summed across all of them, and two sections are
+    printed to stdout: API key usage (labelled with user names where known)
+    and bot agent usage.
+
+    Args:
+        region_name: Label used in the section titles.
+        directories: Iterable of log directories belonging to the region.
+        get_mappings_func: Zero-argument callable returning a dict that maps
+            apiKey to userName. It is called once, after the logs are read.
+    """
+    banner = "=" * 80
+    rule = "-" * 80
+
+    key_totals = defaultdict(int)
+    agent_totals = defaultdict(int)
+
+    # Sum both kinds of bypass over every selected log file.
+    for log_dir in directories:
+        for path in get_last_n_error_logs(log_dir, n=5):
+            for key, hits in extract_api_keys_from_log(path).items():
+                key_totals[key] += hits
+
+            for name, hits in extract_bot_agents_from_log(path).items():
+                agent_totals[name] += hits
+
+    # Lookup used to label each API key row with its owner.
+    key_owners = get_mappings_func()
+
+    # ---- API key section ----
+    print(banner)
+    print(f"{region_name} API KEY USAGE REPORT")
+    print(banner)
+
+    if not key_totals:
+        print("No API key usage found.")
+    else:
+        print(f"{'Count':<10} {'Username':<30} {'API Key':<44}")
+        print(rule)
+
+        # Busiest keys first; sorted() is stable, so ties keep first-seen order.
+        ranked_keys = sorted(key_totals.items(), key=lambda kv: kv[1], reverse=True)
+        for key, hits in ranked_keys:
+            owner = key_owners.get(key, "UNKNOWN")
+            print(f"{hits:<10} {owner:<30} {key:<44}")
+
+        print(rule)
+        print(f"Total API calls: {sum(key_totals.values())}")
+        print(f"Unique API keys: {len(key_totals)}")
+
+        # List keys seen in the logs that have no row in the lookup.
+        unmapped = [key for key in key_totals if key not in key_owners]
+        if unmapped:
+            print(f"\nWarning: {len(unmapped)} API key(s) not found in database:")
+            for key in unmapped:
+                print(f"  {key} (used {key_totals[key]} times)")
+
+    print("\n")
+
+    # ---- Bot agent section ----
+    print(banner)
+    print(f"{region_name} BOT AGENT USAGE REPORT")
+    print(banner)
+
+    if not agent_totals:
+        print("No bot agent usage found.")
+    else:
+        print(f"{'Count':<10} {'Bot Agent':<70}")
+        print(rule)
+
+        ranked_agents = sorted(agent_totals.items(), key=lambda kv: kv[1], reverse=True)
+        for name, hits in ranked_agents:
+            print(f"{hits:<10} {name:<70}")
+
+        print(rule)
+        print(f"Total bot agent matches: {sum(agent_totals.values())}")
+        print(f"Unique bot agents: {len(agent_totals)}")
+
+    print("\n")
+
+def main():
+    """Print the CAPTCHA-bypass report for the RR, Euro, and Dev regions in turn."""
+    print("API CAPTCHA key and user agent exception usage over the last month.\n")
+
+    # Year component of the wwwstats directories to scan.
+    log_year = get_current_year_and_month()
+
+    # One row per region, in print order:
+    # (report label, log directories, apiKey -> userName lookup).
+    regions = (
+        (
+            "RR",
+            [
+                f"/hive/data/inside/wwwstats/RR/{log_year}/hgw0",
+                f"/hive/data/inside/wwwstats/RR/{log_year}/hgw1",
+                f"/hive/data/inside/wwwstats/RR/{log_year}/hgw2",
+            ],
+            get_api_key_mappings_rr,
+        ),
+        (
+            "EURO",
+            [f"/hive/data/inside/wwwstats/euroNode/{log_year}"],
+            get_api_key_mappings_euro,
+        ),
+        (
+            "DEV",
+            [f"/hive/data/inside/wwwstats/genome-test/{log_year}"],
+            get_api_key_mappings_dev,
+        ),
+    )
+
+    for label, log_dirs, fetch_mappings in regions:
+        process_region(label, log_dirs, fetch_mappings)
+
+# Run the report only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()