Initial commit: KQL ↔ SDL PowerQuery proof of equivalence

This commit is contained in:
marc
2026-06-01 09:57:14 +02:00
commit 23cbaa9c08
91 changed files with 5966 additions and 0 deletions
+101
View File
@@ -0,0 +1,101 @@
#!/usr/bin/env python3
"""Diagnose why most of our 445 generated events are not queryable in SDL.
Strategy:
1. Count events per event_type in the local JSONL file.
2. Take 5 CommonSecurityLog events straight from the generated JSONL,
   decorate them with a unique probe marker, ingest them as a single batch,
   wait 10 s for indexing, and query for the marker to confirm they are
   queryable.
3. Bulk-ingest the entire JSONL and wait 20 s for indexing.
4. Report per-event-type counts in SDL vs counts in the local file - to
   expose where the loss happens.
"""
from __future__ import annotations
import json
import sys
import time
from collections import Counter
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from harness.sdl_client import add_events, power_query, ingest_jsonl, _clean_attrs # noqa: E402
# Input file read by the steps below, and a marker string that embeds the
# current epoch second so each run of this script gets a different value.
JSONL = ROOT / "sample_data" / "events.jsonl"
MARKER = f"loss-probe-{int(time.time())}"

# ---------------------------------------------------------------------------
# Step 1: per-type counts in the local file
# ---------------------------------------------------------------------------
# Tally how many records of each event_type the local file holds; these
# numbers are the baseline the SDL counts are compared against later.
local_counts = Counter()
# Explicit UTF-8: the default text encoding is platform/locale dependent.
with JSONL.open(encoding="utf-8") as f:
    for line in f:
        # Blank lines (e.g. a trailing newline at end of file) carry no
        # record; json.loads would raise on them.
        if not line.strip():
            continue
        rec = json.loads(line)
        local_counts[rec["event_type"]] += 1

print("=" * 80)
print("Local JSONL event_type counts")
print("=" * 80)
for k, v in sorted(local_counts.items()):
    print(f" {k:30s} {v}")
print(f" {'TOTAL':30s} {sum(local_counts.values())}")
# ---------------------------------------------------------------------------
# Step 2: pick 5 CSL events from disk, mark them, ingest, query
# ---------------------------------------------------------------------------
# Build at most `probe_size` event payloads from the first CommonSecurityLog
# records in the file, each tagged with this run's MARKER so that they can be
# queried back in isolation.
probe_size = 5
csl_events = []
# Explicit UTF-8: the default text encoding is platform/locale dependent.
with JSONL.open(encoding="utf-8") as f:
    for line in f:
        # Blank lines carry no record; json.loads would raise on them.
        if not line.strip():
            continue
        rec = json.loads(line)
        if rec["event_type"] != "CommonSecurityLog":
            continue
        rec["loss_marker"] = MARKER
        ts_ms = int(rec["ts_epoch_ms"])
        # NOTE(review): presumably _clean_attrs keeps the loss_marker key;
        # the probe query below depends on it - confirm in harness.sdl_client.
        cleaned = _clean_attrs(rec)
        # Millisecond timestamp scaled by 1_000_000 and sent as a string.
        csl_events.append({"ts": str(ts_ms * 1_000_000), "sev": 3,
                           "thread": "T1", "attrs": cleaned})
        if len(csl_events) >= probe_size:
            break

print()
print("=" * 80)
# Report the real batch size: the file may hold fewer than probe_size
# CommonSecurityLog records.
print(f"Step 2: ingesting {len(csl_events)} marker-tagged CSL events ({MARKER})")
print("=" * 80)
if not csl_events:
    # Nothing to probe: skip the ingest, the wait and the query instead of
    # sending an empty batch and querying for a marker that was never sent.
    print("no CommonSecurityLog events found in JSONL - skipping probe")
else:
    r = add_events(csl_events)
    print(f"addEvents -> {json.dumps(r)}")
    print("waiting 10 s for indexing ...")
    time.sleep(10)
    # NOTE(review): the query window is "1h" while the events carry the
    # timestamps stored in the file; if those are older than the window the
    # probe may report nothing even though ingestion worked - confirm.
    probe_q = f"loss_marker='{MARKER}' | group n = count() by event_type"
    r = power_query(probe_q, "1h")
    print(f"probe query (1h) -> matching={r.get('matchingEvents')}, rows={r.get('values')}")
# ---------------------------------------------------------------------------
# Step 3: full bulk ingest of the file via the harness helper
# ---------------------------------------------------------------------------
# Banner: blank line, rule, title, rule - emitted with a single print call.
print(
    "",
    "=" * 80,
    "Step 3: full bulk ingest of every event in JSONL",
    "=" * 80,
    sep="\n",
)

# Hand the whole file to the harness helper and echo the number it returns.
sent = ingest_jsonl(JSONL)
print(f"ingest_jsonl reports {sent} events sent")

# Pause before the per-type counts are queried in the next step.
print("waiting 20 s for indexing ...")
time.sleep(20)
# ---------------------------------------------------------------------------
# Step 4: per-event-type count in SDL
# ---------------------------------------------------------------------------
# For every event_type seen in the local file, run one count query against
# SDL and print local count, SDL count and the percentage that is missing.
print("", "=" * 80, "Step 4: SDL counts by event_type", "=" * 80, sep="\n")
print(f"{'event_type':30s} {'local':>8} {'SDL':>8} {'loss%':>8}")
print("-" * 60)

for et in sorted(local_counts):
    # NOTE(review): et is interpolated into the query unescaped; a value
    # containing a single quote would produce a malformed query.
    # NOTE(review): the count is not restricted to this run, so events from
    # the Step 2 probe or from earlier runs inside the 1h window presumably
    # inflate it and can make loss% negative - confirm.
    result = power_query(f"event_type='{et}' | group n = count()", "1h")

    # No rows, or an empty first cell, is treated as a count of zero.
    rows = result.get("values")
    sdl_n = int(rows[0][0] or 0) if rows else 0

    local_n = local_counts[et]
    if local_n:
        loss = 100 * (local_n - sdl_n) / local_n
    else:
        loss = 0
    print(f"{et:30s} {local_n:>8} {sdl_n:>8} {loss:>7.0f}%")