bd2a9f1bb1cd2b0f8881a4ab4e5385455e8e69aa
braney
  Thu Apr 23 11:26:06 2026 -0700
add parseQuickLiftLogs to kent/src

diff --git src/hg/logCrawl/parseQuickLiftLogs/parseQuickLiftLogs src/hg/logCrawl/parseQuickLiftLogs/parseQuickLiftLogs
new file mode 100755
index 00000000000..c25b62d1902
--- /dev/null
+++ src/hg/logCrawl/parseQuickLiftLogs/parseQuickLiftLogs
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+"""Parse UCSC RR access logs for quickLift usage.
+
+Given a directory laid out as .../YYYY/hgwN/access_log.YYYYMMDD.gz, scans every
+access_log file and classifies quickLift-related requests. Emits weekly counts,
+source-to-destination assembly pairs, top destination and source assemblies,
+top quickLift hub IDs, and a summary.
+
+Distinguished signals (earlier naive greps conflated these):
+  SUBMIT    - request URL is /cgi-bin/hgConvert?...doQuickLift=on...hglft_doConvert=Submit
+              -> real user-initiated conversion
+  LIVE      - request URL carries quickLift.<hubId>.<db>=<chainId>
+              -> ongoing session with a quickLift hub active
+  DOC       - request URL hits quickLift.html
+  SHADOW    - boolshad.doQuickLift=0 (ignored; fires on every hgConvert page view)
+
+Usage:
+  parseQuickLiftLogs [LOG_DIR]
+  parseQuickLiftLogs /hive/data/inside/wwwstats/RR/2026
+"""
+
+import argparse
+import collections
+import glob
+import gzip
+import os
+import re
+import sys
+
+DEFAULT_LOG_DIR = "/hive/data/inside/wwwstats/RR/2026"
+
+# Apache combined log: IP - - [date] "REQ" status size "REFERER" "UA" time unit
+LINE_RE = re.compile(
+    r'^(\S+) \S+ \S+ \[([^\]]+)\] "([^"]*)" \S+ \S+ "([^"]*)" "[^"]*"'
+)
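+# Illustrative (made-up) example of a matching line (wrapped here for width);
+# only the IP, timestamp, request, and referer groups are used:
+#   1.2.3.4 - - [01/Jan/2026:00:00:00 -0700] "GET /index.html HTTP/1.1" 200 512
+#       "https://genome.ucsc.edu/" "Mozilla/5.0"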
+REQ_RE = re.compile(r'^\S+\s+(\S+)')            # method PATH proto -> PATH
+DB_PARAM_RE = re.compile(r'[?&]db=([^&"\s]+)')
+TO_DB_RE = re.compile(r'[?&]hglft_toDb=([^&"\s]+)')
+SUBMIT_RE = re.compile(
+    r'^/cgi-bin/hgConvert\?[^"\s]*doQuickLift=on[^"\s]*hglft_doConvert=Submit'
+)
+LIVE_RE = re.compile(r'[?&]quickLift\.\d+\.[A-Za-z0-9_]+=')
+DOC_RE = re.compile(r'/quickLift\.html')
+HGSID_RE = re.compile(r'hgsid=([0-9A-Za-z_]+)')
+WEEK_RE = re.compile(r'access_log\.(\d{8})\.gz$')
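+
+# Illustrative (made-up) request paths showing which classification pattern
+# fires; real parameter order and values vary:
+#   SUBMIT: /cgi-bin/hgConvert?hgsid=123_abc&hglft_toDb=hg38&doQuickLift=on&hglft_doConvert=Submit
+#   LIVE:   /cgi-bin/hgTracks?hgsid=123_abc&quickLift.41.hg38=12345
+#   DOC:    any path ending in /quickLift.html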
+
+
+def find_logs(root):
+    # Accept either a year dir (with hgw* children) or a single-host dir.
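+    # e.g. (illustrative paths):
+    #   <root>/hgw1/access_log.20260101.gz   year dir with per-host children
+    #   <root>/access_log.20260101.gz        single-host dir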
+    files = sorted(glob.glob(os.path.join(root, "hgw*", "access_log.*.gz")))
+    if not files:
+        files = sorted(glob.glob(os.path.join(root, "access_log.*.gz")))
+    return files
+
+
+def classify(path, tally):
+    m = WEEK_RE.search(path)
+    week = m.group(1) if m else "unknown"
+    try:
+        fh = gzip.open(path, "rt", errors="replace")
+    except OSError as e:
+        print(f"# skip {path}: {e}", file=sys.stderr)
+        return
+
+    with fh:
+        for line in fh:
+            if "uickLift" not in line:   # cheap filter first (matches Q or q)
+                continue
+            m = LINE_RE.match(line)
+            if not m:
+                continue
+            ip, _ts, request, referer = m.groups()
+            m2 = REQ_RE.match(request)
+            if not m2:
+                continue
+            path_q = m2.group(1)
+
+            # SUBMIT: actual hgConvert submit with doQuickLift=on
+            if SUBMIT_RE.match(path_q):
+                to = TO_DB_RE.search(path_q)
+                to_db = to.group(1) if to else "(unknown)"
+                fr = DB_PARAM_RE.search(referer)
+                from_db = fr.group(1) if fr else "(unknown)"
+                tally["submit"][week] += 1
+                tally["pair"][(from_db, to_db)] += 1
+                tally["dest"][to_db] += 1
+                if fr:
+                    tally["src"][from_db] += 1
+                sid = HGSID_RE.search(path_q)
+                if sid:
+                    tally["submit_sessions"].add(sid.group(1))
+                tally["submit_ips"].add(ip)
+                continue
+
+            # LIVE: hgTracks with an active quickLift hub
+            if LIVE_RE.search(path_q):
+                tally["live"][week] += 1
+                for m3 in re.finditer(r'quickLift\.(\d+)\.', path_q):
+                    tally["hub"][m3.group(1)] += 1
+                continue
+
+            # DOC: quickLift.html
+            if DOC_RE.search(path_q):
+                tally["doc"][week] += 1
+                continue
+
+
+MARKDOWN = False
+
+
+def print_table(title, rows, headers):
+    print(f"\n## {title}\n")
+    if MARKDOWN:
+        print("| " + " | ".join(headers) + " |")
+        aligns = ["---:" if any(isinstance(r[i], int) for r in rows) else "---"
+                  for i in range(len(headers))]
+        print("| " + " | ".join(aligns) + " |")
+        for r in rows:
+            print("| " + " | ".join(str(v) for v in r) + " |")
+        return
+    widths = [max(len(str(r[i])) for r in [headers] + rows) for i in range(len(headers))]
+    def fmt(row):
+        parts = []
+        for i, v in enumerate(row):
+            s = str(v)
+            parts.append(s.rjust(widths[i]) if isinstance(v, int) else s.ljust(widths[i]))
+        return "  ".join(parts)
+    print(fmt(headers))
+    print("  ".join("-" * w for w in widths))
+    for r in rows:
+        print(fmt(r))
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__.splitlines()[0])
+    ap.add_argument("log_dir", nargs="?", default=DEFAULT_LOG_DIR,
+                    help=f"root containing hgw*/access_log.*.gz (default: {DEFAULT_LOG_DIR})")
+    ap.add_argument("--top-pairs", type=int, default=30,
+                    help="max source->dest rows to show (default: 30)")
+    ap.add_argument("--top-dests", type=int, default=15,
+                    help="max destination rows to show (default: 15)")
+    ap.add_argument("--markdown", action="store_true",
+                    help="emit GitHub-flavored pipe tables instead of aligned text")
+    args = ap.parse_args()
+    global MARKDOWN
+    MARKDOWN = args.markdown
+
+    logs = find_logs(args.log_dir)
+    if not logs:
+        sys.exit(f"no access_log.*.gz files under {args.log_dir}")
+
+    tally = {
+        "submit": collections.Counter(),
+        "live":   collections.Counter(),
+        "doc":    collections.Counter(),
+        "pair":   collections.Counter(),
+        "dest":   collections.Counter(),
+        "src":    collections.Counter(),
+        "hub":    collections.Counter(),
+        "submit_sessions": set(),
+        "submit_ips":      set(),
+    }
+
+    for path in logs:
+        print(f"# scanning {path}", file=sys.stderr)
+        classify(path, tally)
+
+    # Weekly table
+    weeks = sorted(set(tally["submit"]) | set(tally["live"]) | set(tally["doc"]))
+    weekly_rows = [
+        [w, tally["submit"].get(w, 0), tally["live"].get(w, 0), tally["doc"].get(w, 0)]
+        for w in weeks
+    ]
+    print_table("Weekly counts", weekly_rows, ["week_ending", "submit", "live_hgTracks", "docs"])
+
+    # Source -> destination
+    pair_rows = [
+        [f, t, c]
+        for (f, t), c in tally["pair"].most_common(args.top_pairs)
+    ]
+    print_table("Top source-to-destination pairs (SUBMIT-only)", pair_rows,
+                ["from", "to", "count"])
+
+    # Top destinations
+    dest_rows = [[d, c] for d, c in tally["dest"].most_common(args.top_dests)]
+    print_table("Top destination assemblies", dest_rows, ["to", "count"])
+
+    # Top sources (only counts submits where source was parseable)
+    src_rows = [[s, c] for s, c in tally["src"].most_common(args.top_dests)]
+    print_table("Top source assemblies", src_rows, ["from", "count"])
+
+    # Hub usage
+    hub_rows = [[h, c] for h, c in tally["hub"].most_common(10)]
+    print_table("Top quickLift hub IDs (live sessions)", hub_rows, ["hubId", "hits"])
+
+    # Summary
+    total_submit = sum(tally["submit"].values())
+    total_live   = sum(tally["live"].values())
+    total_doc    = sum(tally["doc"].values())
+    known_src    = sum(c for (f, _), c in tally["pair"].items() if f != "(unknown)")
+    print("\n## Summary")
+    print(f"log files scanned:            {len(logs)}")
+    print(f"SUBMIT conversions:           {total_submit}")
+    print(f"  with parseable source db:   {known_src} ({100*known_src/total_submit:.0f}%)" if total_submit else "  (none)")
+    print(f"live hgTracks hits:           {total_live}")
+    print(f"quickLift.html hits:          {total_doc}")
+    print(f"unique SUBMIT hgsid values:   {len(tally['submit_sessions'])}")
+    print(f"unique SUBMIT client IPs:     {len(tally['submit_ips'])}")
+
+
+if __name__ == "__main__":
+    main()