Initial commit: KQL ↔ SDL PowerQuery proof of equivalence

This commit is contained in:
marc
2026-06-01 09:57:14 +02:00
commit 23cbaa9c08
91 changed files with 5966 additions and 0 deletions
+199
View File
@@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""End-to-end proof harness.
Steps:
1. Loads sample_data/events.jsonl into memory.
2. Runs each rule's Python reference implementation against the in-memory
events. This is the canonical "ground truth": the same logical operation
that both the KQL and the PowerQuery engines evaluate.
3. Optionally ingests the events to SentinelOne SDL via /api/addEvents,
then runs each rule's PowerQuery via /api/powerQuery and compares the
fired set against the reference.
4. Emits reports/PROOF.md with side-by-side results.
Run modes:
python harness/prove_equivalence.py # local-only proof
python harness/prove_equivalence.py --pq # remote PQ against events already in SDL
python harness/prove_equivalence.py --ingest --pq # ingest + remote PQ
"""
from __future__ import annotations
import argparse
import json
import sys
import time
from pathlib import Path
# Project root: the parent of the directory that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# Put ROOT at the front of sys.path before the import below runs;
# presumably the `rules` module lives directly under ROOT — confirm.
sys.path.insert(0, str(ROOT))
from rules import RULES, NOW, RECENT_START # noqa: E402
# Input dataset, read by load_events() as one JSON document per line.
SAMPLE = ROOT / "sample_data" / "events.jsonl"
# Output locations for the Markdown report and its JSON companion.
REPORT = ROOT / "reports" / "PROOF.md"
REPORT_JSON = ROOT / "reports" / "PROOF.json"
def load_events() -> list[dict]:
    """Load the sample events from ``SAMPLE`` (JSON Lines).

    Each non-blank line is parsed with ``json.loads``; blank and
    whitespace-only lines are skipped.

    Returns:
        The parsed records, in file order.

    Raises:
        OSError: If ``SAMPLE`` cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding. Iterate the file object rather than str.splitlines():
    # the latter also splits on U+2028/U+2029/U+0085, which may legally appear
    # unescaped inside a JSON string and would cut a record in half.
    with SAMPLE.open(encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]
def canonical(rule, rows):
    """Return the distinct comparison keys of *rows* in a stable order.

    Args:
        rule: Rule mapping; ``rule["key"]`` is called with each row and must
            return a hashable key (the keys are collected in a set).
        rows: Iterable of fired rows.

    Returns:
        A list of the distinct keys, sorted by their ``str()`` form so that
        keys of mixed or otherwise non-orderable types still sort without
        raising.
    """
    distinct = {rule["key"](row) for row in rows}
    return sorted(distinct, key=str)
def run_local(events):
    """Evaluate every rule's Python reference implementation on *events*.

    Returns a dict keyed by rule id. Each value carries the rule's
    description, the rows returned by ``rule["ref"]`` and the canonical key
    list derived from those rows.
    """
    results = {}
    for rule in RULES:
        fired = rule["ref"](events)
        entry = dict(
            description=rule["description"],
            fired_rows=fired,
            fired_keys=canonical(rule, fired),
        )
        results[rule["id"]] = entry
    return results
def run_pq(run_id: str | None = None):
    """Run each rule's PowerQuery through ``sdl_client.power_query``.

    Progress is printed to stdout, one line per rule.

    Args:
        run_id: When truthy, every query is prefixed with a
            ``proof_run_id='<run_id>'`` filter; otherwise queries run
            without that scope.

    Returns:
        Dict keyed by rule id. On success the value is
        ``{"ok": True, "rowcount", "rows", "status", "matching"}``, where
        ``rowcount`` is the full number of result rows but ``rows`` keeps
        only the first 50. On failure the value is
        ``{"ok": False, "error": <message truncated to 200 characters>}``.
    """
    # Imported here rather than at module level; presumably so that
    # local-only runs do not require sdl_client — confirm.
    from sdl_client import power_query

    results = {}
    recent_ms = int(RECENT_START.timestamp() * 1000)
    scope = f"proof_run_id='{run_id}' " if run_id else ""
    print(f" scope = {scope.strip() or '(none)'}")
    print(f" RECENT_MS = {recent_ms} ({RECENT_START.isoformat()})")
    print(f" NOW = {NOW.isoformat()}")
    print()

    total = len(RULES)
    for i, rule in enumerate(RULES, 1):
        query = scope + rule["pq"].format(RECENT_MS=str(recent_ms))
        print(f" [{i:>2}/{total}] {rule['id']:<48} ", end="", flush=True)
        # perf_counter() is monotonic, so the reported duration cannot be
        # skewed by wall-clock adjustments while the query is in flight.
        started = time.perf_counter()
        try:
            resp = power_query(query, start_time="2h")
            cols_meta = resp.get("columns") or []
            # Column metadata entries may be dicts carrying a "name" or
            # plain values used as the column name directly.
            cols = [c["name"] if isinstance(c, dict) else c for c in cols_meta]
            vals = resp.get("values") or []
            rows = [dict(zip(cols, v)) for v in vals]
            elapsed = time.perf_counter() - started
            status = resp.get("status", "ok")
            print(f"-> {len(rows):>3} rows matching={resp.get('matchingEvents')} "
                  f"({elapsed:.1f}s, {status})")
            results[rule["id"]] = {"ok": True, "rowcount": len(rows),
                                   "rows": rows[:50], "status": status,
                                   "matching": resp.get("matchingEvents")}
        except Exception as exc:
            # Deliberately broad: one failing rule is recorded and must not
            # abort the remaining rules.
            elapsed = time.perf_counter() - started
            msg = str(exc)[:200]
            print(f"-> ERROR ({elapsed:.1f}s): {msg}")
            results[rule["id"]] = {"ok": False, "error": msg}
    return results
def ingest(poll_attempts: int = 30, poll_interval: float = 2.0):
    """Ingest ``SAMPLE`` into SDL and wait until the events are queryable.

    After ``ingest_jsonl`` returns, a count query scoped to the new
    ``proof_run_id`` is polled until it reports at least as many events as
    were ingested, or until the attempts are exhausted.

    Args:
        poll_attempts: Maximum number of count queries to issue.
        poll_interval: Seconds to sleep before each count query.

    Returns:
        The run id returned by ``ingest_jsonl``. It is returned on timeout
        as well, after printing a notice.
    """
    from sdl_client import ingest_jsonl, power_query

    n, run_id = ingest_jsonl(SAMPLE)
    print(f"Ingested {n} events to SDL (proof_run_id={run_id})")

    # Poll until SDL reports the events are indexed. The total wait is at
    # least poll_attempts * poll_interval seconds, plus query latency.
    print("Waiting for SDL indexing ...", end="", flush=True)
    count_query = f"proof_run_id='{run_id}' | group n=count()"
    for _ in range(poll_attempts):
        time.sleep(poll_interval)
        resp = power_query(count_query, "30m")
        vals = resp.get("values") or []
        # A missing, empty or null first cell counts as "nothing indexed yet".
        cnt = int(vals[0][0]) if vals and vals[0] and vals[0][0] is not None else 0
        print(f" {cnt}", end="", flush=True)
        if cnt >= n:
            print(" ✓ ready")
            return run_id
    print(" (timeout, proceeding anyway)")
    return run_id
def _jsonable_key(key):
    """Return *key* in JSON-friendly form: tuples/lists become lists, scalars pass through."""
    return list(key) if isinstance(key, (tuple, list)) else key


def write_report(local_results, pq_results=None):
    """Write the Markdown proof report and its JSON companion.

    Args:
        local_results: Mapping of rule id to the entry produced by
            ``run_local`` (reads ``fired_rows`` and ``fired_keys``).
        pq_results: Optional mapping of rule id to the entry produced by
            ``run_pq``. When falsy, no SDL comparison sections are written.

    Side effects:
        Creates the report directory if needed, writes ``REPORT`` and
        ``REPORT_JSON`` (both as UTF-8) and prints the report path.
    """
    REPORT.parent.mkdir(parents=True, exist_ok=True)
    md = ["# KQL ↔ PowerQuery equivalence proof",
          "",
          f"Sample dataset: `sample_data/events.jsonl` ({len(load_events())} events)",
          f"Time anchor (NOW): `{NOW.isoformat()}`",
          f"Recent window start: `{RECENT_START.isoformat()}`",
          "",
          "Each rule below is expressed three ways:",
          "1. **KQL** — verbatim/condensed from the Microsoft Sentinel docs.",
          "2. **PowerQuery (PQ)** — SDL equivalent, runnable on `<XDR endpoint>`.",
          "3. **Python reference** — canonical implementation of the same logical "
          "operation tree against the in-memory dataset. Acts as ground truth.",
          "",
          "The PowerQuery is considered equivalent to the KQL when its result "
          "set matches the Python reference. The Python reference encodes the "
          "*same operations* that the KQL parser/optimiser would produce, so a "
          "match certifies KQL/PQ parity on this dataset.",
          ""]
    for r in RULES:
        rid = r["id"]
        loc = local_results[rid]
        md += [f"## {rid}", "",
               f"_{r['description']}_", "",
               "### KQL", "```kusto", r["kql"].strip(), "```",
               "### PowerQuery", "```", r["pq"].strip(), "```",
               f"### Reference fired: {len(loc['fired_rows'])} row(s)"]
        if loc["fired_rows"]:
            # Show at most five reference rows as an illustrative sample.
            sample = loc["fired_rows"][:5]
            md.append("```json")
            md.append(json.dumps(sample, default=str, indent=2))
            md.append("```")
        if pq_results:
            pq = pq_results.get(rid, {})
            if pq.get("ok"):
                stored_rows = pq.get("rows", [])
                pq_keys = []
                for row in stored_rows:
                    try:
                        pq_keys.append(r["key"](row))
                    except Exception:
                        # Best effort: a row the key function cannot handle
                        # is kept as its raw items so it still takes part in
                        # the comparison instead of vanishing.
                        pq_keys.append(tuple(row.items()))
                pq_keys = sorted(set(pq_keys), key=str)
                ref_keys = loc["fired_keys"]
                matched = pq_keys == ref_keys
                verdict = "✅ MATCH" if matched else "⚠️ DIFFERS"
                md += [f"### SDL PowerQuery result: {pq['rowcount']} row(s) — {verdict}"]
                if pq["rowcount"] > len(stored_rows):
                    # The keys above come only from the stored rows; say so
                    # rather than present a partial comparison as complete.
                    md += [f"_Note: only {len(stored_rows)} of {pq['rowcount']} "
                           "PQ rows were retained, so the comparison above is "
                           "partial._"]
                if not matched:
                    md += ["Reference keys:", "```",
                           json.dumps([_jsonable_key(k) for k in ref_keys], default=str), "```",
                           "PQ keys:", "```",
                           json.dumps([_jsonable_key(k) for k in pq_keys], default=str), "```"]
            else:
                md.append(f"### SDL PowerQuery error: `{pq.get('error', '?')}`")
        md.append("")
    # The report contains non-ASCII characters, so the encoding is pinned
    # instead of relying on the platform's locale encoding.
    REPORT.write_text("\n".join(md), encoding="utf-8")
    summary = {
        "local": {
            rule_id: {
                "fired_keys": [_jsonable_key(k) for k in res["fired_keys"]],
                "n": len(res["fired_rows"]),
            }
            for rule_id, res in local_results.items()
        },
        "pq": pq_results or {},
    }
    REPORT_JSON.write_text(json.dumps(summary, default=str, indent=2),
                           encoding="utf-8")
    print(f"Wrote {REPORT}")
def main():
    """Command-line entry point.

    Loads the sample events, evaluates the local reference for every rule,
    optionally ingests the events (``--ingest``) and/or runs the PowerQueries
    against SDL (``--pq``), then writes the report.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ingest", action="store_true",
                        help="Ingest sample events to SDL before querying")
    parser.add_argument("--pq", action="store_true",
                        help="Also run each PQ against SDL and compare")
    opts = parser.parse_args()

    events = load_events()
    print(f"Loaded {len(events)} events")

    local_results = run_local(events)
    fired_total = 0
    for result in local_results.values():
        fired_total += len(result["fired_rows"])
    print(f"Local reference: {fired_total} total fired rows across {len(RULES)} rules")

    # Without --ingest there is no run id, so run_pq issues unscoped queries.
    run_id = ingest() if opts.ingest else None
    pq_results = run_pq(run_id=run_id) if opts.pq else None
    write_report(local_results, pq_results)


if __name__ == "__main__":
    main()