mirror of
https://github.com/marcredhat/kql
synced 2026-06-08 21:27:09 +00:00
Initial commit: KQL ↔ SDL PowerQuery proof of equivalence
This commit is contained in:
@@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env python3
"""Diagnose why most of our 445 generated events are not queryable in SDL.

Strategy:
  1. Take 5 CommonSecurityLog events straight from the generated JSONL,
     decorate them with a unique probe marker, and ingest as a single batch.
  2. Wait 10 s for indexing.
  3. Query for the marker to confirm they are queryable.
  4. Then bulk-ingest the entire JSONL, wait 20 s for indexing, and report
     per-event-type counts in SDL vs counts in the local file - to expose
     where the loss happens.

Before any of that, the script prints the per-event-type counts found in the
local file. The "Step N" banners it prints number the phases differently:
Step 2 is the marker probe (items 1-3 above), Step 3 the bulk ingest and
Step 4 the SDL-vs-local comparison.
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
# Repository root: this script lives one directory below it. The root is put
# at the front of sys.path so the `harness` package imports when the script
# is run directly.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

from harness.sdl_client import (  # noqa: E402
    _clean_attrs,
    add_events,
    ingest_jsonl,
    power_query,
)

# Generated sample events; the script parses one JSON object per line.
JSONL = ROOT.joinpath("sample_data", "events.jsonl")
# Per-run tag (epoch seconds) written onto the probe events so the probe
# query can filter on it.
MARKER = f"loss-probe-{int(time.time())}"
|
||||
|
||||
# ---------------------------------------------------------------------------
# Step 1: per-type counts in the local file
# ---------------------------------------------------------------------------
# Tally how many records of each event_type the local JSONL holds; these are
# the baseline numbers the SDL counts are compared against at the end of the
# script.
#
# The file is decoded as UTF-8 explicitly (open() would otherwise use the
# locale's preferred encoding), and whitespace-only lines are skipped because
# json.loads() raises on them. A record without an "event_type" key still
# raises KeyError, as before.
with JSONL.open(encoding="utf-8") as f:
    local_counts = Counter(
        json.loads(line)["event_type"] for line in f if line.strip()
    )

print("=" * 80)
print("Local JSONL event_type counts")
print("=" * 80)
for k, v in sorted(local_counts.items()):
    print(f" {k:30s} {v}")
print(f" {'TOTAL':30s} {sum(local_counts.values())}")
|
||||
|
||||
# ---------------------------------------------------------------------------
# Step 2: pick 5 CSL events from disk, mark them, ingest, query
# ---------------------------------------------------------------------------
# Build a small probe batch from the first CommonSecurityLog records in the
# file, tag each with MARKER, ingest the batch, then query for the marker to
# see whether the events come back.
PROBE_SIZE = 5  # number of CommonSecurityLog events in the probe batch

csl_events = []
with JSONL.open(encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # json.loads() raises on blank lines
        rec = json.loads(line)
        if rec["event_type"] != "CommonSecurityLog":
            continue
        # Tag the record before cleaning so the marker is part of the attrs
        # handed to _clean_attrs().
        rec["loss_marker"] = MARKER
        ts_ms = int(rec["ts_epoch_ms"])
        cleaned = _clean_attrs(rec)
        csl_events.append({
            # ts_epoch_ms * 1_000_000 -> nanoseconds, sent as a string.
            "ts": str(ts_ms * 1_000_000),
            "sev": 3,
            "thread": "T1",
            "attrs": cleaned,
        })
        if len(csl_events) >= PROBE_SIZE:
            break

print()
print("=" * 80)
# Report the number actually collected rather than assuming a full batch.
print(f"Step 2: ingesting {len(csl_events)} marker-tagged CSL events ({MARKER})")
print("=" * 80)
if len(csl_events) < PROBE_SIZE:
    print(f"warning: only {len(csl_events)} of {PROBE_SIZE} "
          f"CommonSecurityLog events found in {JSONL}")

if csl_events:
    r = add_events(csl_events)
    print(f"addEvents -> {json.dumps(r)}")
    print("waiting 10 s for indexing ...")
    time.sleep(10)

    # NOTE(review): the events keep their on-disk timestamps while the query
    # is issued with "1h" (presumably a look-back window). If ts_epoch_ms is
    # older than that, the probe would come back empty even though ingestion
    # worked -- confirm against the data generator.
    probe_q = f"loss_marker='{MARKER}' | group n = count() by event_type"
    r = power_query(probe_q, "1h")
    print(f"probe query (1h) -> matching={r.get('matchingEvents')}, rows={r.get('values')}")
else:
    # Nothing to send: an empty batch and a 10 s wait would tell us nothing.
    print("no CommonSecurityLog events to probe with; skipping probe ingest and query")
|
||||
|
||||
# ---------------------------------------------------------------------------
# Step 3: full bulk ingest of the file via the harness helper
# ---------------------------------------------------------------------------
# Hand the whole JSONL file to ingest_jsonl(), print the count it returns,
# then pause before the per-type queries that follow.
print(
    "",
    "=" * 80,
    "Step 3: full bulk ingest of every event in JSONL",
    "=" * 80,
    sep="\n",
)
sent = ingest_jsonl(JSONL)
print(
    f"ingest_jsonl reports {sent} events sent",
    "waiting 20 s for indexing ...",
    sep="\n",
)
time.sleep(20)
|
||||
|
||||
# ---------------------------------------------------------------------------
# Step 4: per-event-type count in SDL
# ---------------------------------------------------------------------------
# For every event_type seen in the local file, ask SDL for its event count
# and print the local count, the SDL count and the percentage missing.
# NOTE(review): the query is not restricted to this run, so the marker-tagged
# probe events ingested earlier in this script (also CommonSecurityLog) and
# events from other recent runs presumably count as well, which would
# understate the loss -- confirm.
print("", "=" * 80, "Step 4: SDL counts by event_type", "=" * 80, sep="\n")
print(f"{'event_type':30s} {'local':>8} {'SDL':>8} {'loss%':>8}")
print("-" * 60)
for et in sorted(local_counts):
    r = power_query(f"event_type='{et}' | group n = count()", "1h")
    rows = r.get("values")
    # A missing/empty result set, or a falsy count cell, is reported as zero.
    sdl_n = int(rows[0][0] or 0) if rows else 0
    local_n = local_counts[et]
    if local_n:
        loss = 100 * (local_n - sdl_n) / local_n
    else:
        loss = 0
    print(f"{et:30s} {local_n:>8} {sdl_n:>8} {loss:>7.0f}%")
|
||||
Reference in New Issue
Block a user