mirror of
https://github.com/marcredhat/kql
synced 2026-06-08 13:23:58 +00:00
200 lines
7.6 KiB
Python
200 lines
7.6 KiB
Python
#!/usr/bin/env python3
|
||
"""End-to-end proof harness.
|
||
|
||
Steps:
|
||
1. Loads sample_data/events.jsonl into memory.
|
||
2. Runs each rule's Python reference implementation against the in-memory
|
||
events. This is the canonical "ground truth" – the same logical operation
|
||
that both the KQL and the PowerQuery engines evaluate.
|
||
3. Optionally ingests the events to SentinelOne SDL via /api/addEvents,
|
||
then runs each rule's PowerQuery via /api/powerQuery and compares the
|
||
fired set against the reference.
|
||
4. Emits reports/PROOF.md with side-by-side results.
|
||
|
||
Run modes:
|
||
python harness/prove_equivalence.py # local-only proof
|
||
python harness/prove_equivalence.py --ingest --pq # ingest + remote PQ
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import sys
|
||
import time
|
||
from pathlib import Path
|
||
|
||
# ROOT is the parent of the directory that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# Put ROOT first on the import path so modules located there can be imported
# (presumably this is what makes the `rules` import below resolvable).
sys.path.insert(0, str(ROOT))

from rules import RULES, NOW, RECENT_START  # noqa: E402

# Input dataset and the two report artefacts, all located under ROOT.
SAMPLE = ROOT.joinpath("sample_data", "events.jsonl")
REPORT = ROOT.joinpath("reports", "PROOF.md")
REPORT_JSON = ROOT.joinpath("reports", "PROOF.json")
|
||
|
||
|
||
def load_events() -> list[dict]:
    """Load the sample events from ``SAMPLE`` (one JSON document per line).

    Blank and whitespace-only lines are skipped.  The file is decoded
    explicitly as UTF-8 so that parsing does not depend on the platform's
    locale encoding.

    Returns:
        The decoded events, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    text = SAMPLE.read_text(encoding="utf-8")
    return [json.loads(line) for line in text.splitlines() if line.strip()]
|
||
|
||
|
||
def canonical(rule, rows):
    """Reduce fired rows to a de-duplicated, deterministically ordered key list.

    ``rule["key"]`` is applied to each row; duplicate keys are dropped and the
    remaining keys are ordered by their ``str()`` form, which lets keys of
    mixed types be sorted together.

    Returns:
        A list of unique keys, sorted by string representation.
    """
    unique_keys = set()
    for row in rows:
        unique_keys.add(rule["key"](row))
    ordered = list(unique_keys)
    ordered.sort(key=str)
    return ordered
|
||
|
||
|
||
def run_local(events):
    """Evaluate every rule's Python reference implementation on ``events``.

    Returns:
        A dict keyed by rule id.  Each value holds the rule's
        ``description``, the rows returned by the rule's ``ref`` callable
        (``fired_rows``) and their canonical key list (``fired_keys``).
    """
    results = {}
    for rule in RULES:
        fired = rule["ref"](events)
        entry = {
            "description": rule["description"],
            "fired_rows": fired,
            "fired_keys": canonical(rule, fired),
        }
        results[rule["id"]] = entry
    return results
|
||
|
||
|
||
def run_pq(run_id: str | None = None):
    """Run every rule's PowerQuery through ``sdl_client.power_query``.

    Args:
        run_id: Optional ``proof_run_id`` value.  When truthy, each query is
            prefixed with a ``proof_run_id='<run_id>'`` filter; otherwise the
            rule's query is sent unscoped.

    Returns:
        A dict keyed by rule id.  On success the value is
        ``{"ok": True, "rowcount", "rows", "status", "matching"}``; on any
        exception it is ``{"ok": False, "error"}`` with the message cut to
        200 characters.
    """
    from sdl_client import power_query

    out = {}
    recent_ms = int(RECENT_START.timestamp() * 1000)
    scope = f"proof_run_id='{run_id}' " if run_id else ""
    print(f" scope = {scope.strip() or '(none)'}")
    print(f" RECENT_MS = {recent_ms} ({RECENT_START.isoformat()})")
    print(f" NOW = {NOW.isoformat()}")
    print()
    for i, r in enumerate(RULES, 1):
        # The rule's PQ text is a str.format template with a RECENT_MS field.
        q = scope + r["pq"].format(RECENT_MS=str(recent_ms))
        print(f" [{i:>2}/{len(RULES)}] {r['id']:<48} ", end="", flush=True)
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # or jump if the wall clock is adjusted while the query runs.
        t0 = time.perf_counter()
        try:
            resp = power_query(q, start_time="2h")
            cols_meta = resp.get("columns") or []
            # Column descriptors may be dicts carrying a "name" or bare names.
            cols = [c["name"] if isinstance(c, dict) else c for c in cols_meta]
            vals = resp.get("values") or []
            rows = [dict(zip(cols, v)) for v in vals]
            elapsed = time.perf_counter() - t0
            status = resp.get("status", "ok")
            print(f"-> {len(rows):>3} rows matching={resp.get('matchingEvents')} "
                  f"({elapsed:.1f}s, {status})")
            # NOTE(review): only the first 50 rows are kept while "rowcount"
            # counts all of them; anything comparing keys derived from "rows"
            # sees a truncated set when more than 50 rows come back.
            out[r["id"]] = {"ok": True, "rowcount": len(rows),
                            "rows": rows[:50], "status": status,
                            "matching": resp.get("matchingEvents")}
        except Exception as e:
            # Deliberately broad: one failing rule is recorded and the
            # remaining rules still run.
            elapsed = time.perf_counter() - t0
            msg = str(e)[:200]
            print(f"-> ERROR ({elapsed:.1f}s): {msg}")
            out[r["id"]] = {"ok": False, "error": msg}
    return out
|
||
|
||
|
||
def ingest():
    """Ingest ``SAMPLE`` into SDL and wait until the events are queryable.

    After ``ingest_jsonl`` returns the event count and run id, a count query
    scoped to that run id is issued up to 30 times, sleeping 2 seconds before
    each attempt.  Polling stops as soon as the reported count reaches the
    number of ingested events; if it never does, a timeout notice is printed
    and execution continues anyway.

    Returns:
        The run id returned by ``ingest_jsonl``.
    """
    from sdl_client import ingest_jsonl, power_query

    n, run_id = ingest_jsonl(SAMPLE)
    print(f"Ingested {n} events to SDL (proof_run_id={run_id})")

    print("Waiting for SDL indexing ...", end="", flush=True)
    count_query = f"proof_run_id='{run_id}' | group n=count()"
    for _ in range(30):
        time.sleep(2)
        resp = power_query(count_query, "30m")
        table = resp.get("values") or []
        # Treat a missing row, empty row or null cell as "nothing indexed yet".
        indexed = 0
        if table and table[0] and table[0][0] is not None:
            indexed = int(table[0][0])
        print(f" {indexed}", end="", flush=True)
        if indexed >= n:
            print(" ✓ ready")
            break
    else:
        print(" (timeout, proceeding anyway)")
    return run_id
|
||
|
||
|
||
def write_report(local_results, pq_results=None):
    """Write the Markdown proof report and its JSON companion.

    Args:
        local_results: Mapping of rule id to a dict with ``fired_rows`` and
            ``fired_keys`` (the shape produced by ``run_local``).
        pq_results: Optional mapping of rule id to a PowerQuery outcome dict
            (``ok``, ``rows``, ``rowcount`` / ``error``).  When falsy, the
            report contains only the reference results.

    Side effects:
        Creates ``REPORT``'s parent directory if needed, writes ``REPORT`` and
        ``REPORT_JSON`` as UTF-8, and prints the report path.
    """
    REPORT.parent.mkdir(exist_ok=True)
    md = ["# KQL ↔ PowerQuery equivalence proof",
          "",
          f"Sample dataset: `sample_data/events.jsonl` ({len(load_events())} events)",
          f"Time anchor (NOW): `{NOW.isoformat()}`",
          f"Recent window start: `{RECENT_START.isoformat()}`",
          "",
          "Each rule below is expressed three ways:",
          "1. **KQL** — verbatim/condensed from the Microsoft Sentinel docs.",
          "2. **PowerQuery (PQ)** — SDL equivalent, runnable on `<XDR endpoint>`.",
          "3. **Python reference** — canonical implementation of the same logical "
          "operation tree against the in-memory dataset. Acts as ground truth.",
          "",
          "The PowerQuery is considered equivalent to the KQL when its result "
          "set matches the Python reference. The Python reference encodes the "
          "*same operations* that the KQL parser/optimiser would produce, so a "
          "match certifies KQL/PQ parity on this dataset.",
          ""]
    for r in RULES:
        rid = r["id"]
        loc = local_results[rid]
        md += [f"## {rid}", "",
               f"_{r['description']}_", "",
               "### KQL", "```kusto", r["kql"].strip(), "```",
               "### PowerQuery", "```", r["pq"].strip(), "```",
               f"### Reference fired: {len(loc['fired_rows'])} row(s)"]
        if loc["fired_rows"]:
            # Show at most five reference rows to keep the report readable.
            sample = loc["fired_rows"][:5]
            md.append("```json")
            md.append(json.dumps(sample, default=str, indent=2))
            md.append("```")
        if pq_results:
            pq = pq_results.get(rid, {})
            if pq.get("ok"):
                pq_keys = []
                for row in pq.get("rows", []):
                    try:
                        pq_keys.append(r["key"](row))
                    except Exception:
                        # The rule's key function could not handle this row
                        # shape; fall back to the whole row as its own key so
                        # the mismatch is visible in the report.
                        pq_keys.append(tuple(row.items()))
                # Same canonical form as the reference keys: unique, ordered
                # by string representation.
                pq_keys = sorted(set(pq_keys), key=str)
                ref_keys = loc["fired_keys"]
                same = pq_keys == ref_keys
                match = "✅ MATCH" if same else "⚠️ DIFFERS"
                md += [f"### SDL PowerQuery result: {pq['rowcount']} row(s) — {match}"]
                if not same:
                    md += ["Reference keys:", "```",
                           json.dumps([list(k) for k in ref_keys], default=str), "```",
                           "PQ keys:", "```",
                           json.dumps([list(k) for k in pq_keys], default=str), "```"]
            else:
                md.append(f"### SDL PowerQuery error: `{pq.get('error', '?')}`")
        md.append("")
    # The report contains non-ASCII characters (↔, —, ✅, ⚠️); write it as UTF-8
    # explicitly so the locale's default encoding cannot break or garble it.
    REPORT.write_text("\n".join(md), encoding="utf-8")
    REPORT_JSON.write_text(json.dumps(
        {"local": {k: {"fired_keys": [list(x) for x in v["fired_keys"]],
                       "n": len(v["fired_rows"])}
                   for k, v in local_results.items()},
         "pq": pq_results or {}},
        default=str, indent=2), encoding="utf-8")
    print(f"Wrote {REPORT}")
|
||
|
||
|
||
def main():
    """Command-line entry point: run the local proof, optionally use SDL.

    The Python reference is always evaluated on the sample events.  With
    ``--ingest`` the sample is first ingested into SDL; with ``--pq`` each
    rule's PowerQuery is also run (scoped to the ingested run id when one
    exists).  The report is written in every case.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ingest", action="store_true",
                        help="Ingest sample events to SDL before querying")
    parser.add_argument("--pq", action="store_true",
                        help="Also run each PQ against SDL and compare")
    opts = parser.parse_args()

    events = load_events()
    print(f"Loaded {len(events)} events")
    local_results = run_local(events)
    fired_total = 0
    for result in local_results.values():
        fired_total += len(result["fired_rows"])
    print(f"Local reference: {fired_total} total fired rows across {len(RULES)} rules")

    # Ingestion (if requested) must precede querying so the run id can scope it.
    run_id = ingest() if opts.ingest else None
    pq_results = run_pq(run_id=run_id) if opts.pq else None

    write_report(local_results, pq_results)


if __name__ == "__main__":
    main()
|