Files
marcredhat-siem-toolkit-pat…/probe_wel_schema.py
T
marc 4df8e844e5 Sigma -> SentinelOne PowerQuery pipeline
End-to-end workflow that turns SigmaHQ rules into SDL Scheduled
custom-detection rules:

1. SIEM-toolkit provides the coverage map to find what's thin --
   MITRE ATT&CK heatmap across all detection library rules, rule
   firing status (active vs never-fired).
2. Pick Sigma rules (https://github.com/SigmaHQ/sigma) that target
   those tactics.
3. Convert the Sigma rules to PowerQuery with
   pysigma-backend-sentinelone-pq.
4. Smoke-test against your tenant's /api/powerQuery, deploy via
   /web/api/v2.1/cloud-detection/rules as Scheduled PQ rules in Draft.
5. Re-running on a different tenant is just re-pointing the
   credentials -- the converted .pq bodies travel as-is.

Files:
  README_sigma_pipeline.md       full workflow doc
  recommend_sigma_imports.py     coverage-map reader -> rule shortlist
  probe_wel_schema.py            WEL parser field discovery
  convert_test_deploy_sigma.py   pick + convert + 3 variants + deploy
  fixup_rules_6_7.py             OriginalFileName pre-processor
  run_sigma_on_tenant.py         redeploy already-converted bodies
  verify_rule_exists_via_put.py  PUT-existence test (RBAC workaround)
  verify_deployed_sigma_rules.py RBAC visibility diagnostic
  tenant_config.example.json     credentials template (the real tenant_config.json is gitignored)

Each converted rule emits three PowerQuery variants:
  <stem>.pq          faithful (S1 DV schema)
  <stem>.relaxed.pq  drops endpoint.os + event.type clauses
  <stem>.wel.pq      rewritten onto microsoft_windows_eventlog-latest

All scripts read credentials from tenant_config.json (or the
SIEM_TOOLKIT_CONFIG env var), discover the target site_id at runtime,
and persist deployed rule IDs to deployed_rule_ids.json so the verify
scripts work without hardcoded IDs.
2026-05-28 12:29:37 +02:00

99 lines
3.9 KiB
Python

#!/usr/bin/env python3
"""
probe_wel_schema.py
Probe the tenant's Singularity Data Lake to discover what fields the
`microsoft_windows_eventlog-latest` parser emits. Output guides the WEL
mapping pipeline in convert_test_deploy_sigma.py.
Runs a series of read-only PowerQuery probes for the last 24 h. No state
changes -- safe to re-run.
"""
from __future__ import annotations
import json
import os
import pathlib
import time
import urllib.request
import urllib.error
# Directory containing this script; used to find the default config file.
HERE = pathlib.Path(__file__).resolve().parent

# The SIEM_TOOLKIT_CONFIG environment variable, when set, overrides the
# default tenant_config.json that sits next to this script.
_CFG_PATH = os.environ.get("SIEM_TOOLKIT_CONFIG",
                           str(HERE / "tenant_config.json"))

# Read the config through a context manager so the file handle is closed
# deterministically, and decode as UTF-8 rather than the locale default.
with open(_CFG_PATH, encoding="utf-8") as _cfg_file:
    CFG = json.load(_cfg_file)

# Base URL (trailing slashes removed) and token used by every probe request.
BASE = CFG["SDL_XDR_URL"].rstrip("/")
TOK = CFG["SDL_LOG_READ_KEY"]
def pq(query: str, hours: int = 24) -> tuple[str, list, list[str]]:
    """Run one PowerQuery and summarise the outcome without raising.

    POSTs ``query`` to ``{BASE}/api/powerQuery`` with a time window that
    ends now and starts ``hours`` hours earlier (both sent as epoch
    milliseconds, encoded as strings).

    Args:
        query: PowerQuery text to execute.
        hours: Length of the look-back window in hours.

    Returns:
        A ``(status, rows, columns)`` tuple:

        * success -- ``"OK"``, the response's ``"values"`` list (or ``[]``),
          and the ``"name"`` of each entry in ``"columns"``.
        * HTTP error -- ``"HTTP<code>"``, a one-element list holding the
          first 250 characters of the error body, and ``[]``.
        * any other failure while sending or parsing -- the exception class
          name, a one-element list holding its message, and ``[]``.
    """
    end = int(time.time() * 1000)
    start = end - hours * 3600 * 1000
    payload = json.dumps({"token": TOK, "query": query,
                          "startTime": str(start),
                          "endTime": str(end)}).encode()
    req = urllib.request.Request(
        f"{BASE}/api/powerQuery",
        data=payload,
        headers={"Content-Type": "application/json"}, method="POST")
    try:
        # Context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(req, timeout=60) as resp:
            d = json.loads(resp.read())
        return ("OK", d.get("values") or [],
                [c.get("name") for c in (d.get("columns") or [])])
    except urllib.error.HTTPError as e:
        try:
            # errors="replace": a non-UTF-8 error body must not raise from
            # inside this handler and defeat the best-effort contract.
            detail = e.read().decode("utf-8", errors="replace")
        finally:
            e.close()
        return (f"HTTP{e.code}", [detail[:250]], [])
    except Exception as e:
        # Deliberate catch-all: a failed probe is reported, never fatal.
        return (type(e).__name__, [str(e)], [])
# Filter clause that opens every probe query below.
_WEL_PARSER = "parser.name='microsoft_windows_eventlog-latest'"

# (label, PowerQuery) pairs. Distribution probes come first, then row samples
# for EventID 4688 and EventID 1, then alternate field-name spellings, then
# plain counts and one raw-message sample.
PROBES: list[tuple[str, str]] = [
    (
        "WEL distribution by EventID",
        f"{_WEL_PARSER} | group n=count() by EventID | sort -n | limit 20",
    ),
    (
        "WEL channel / provider distribution",
        f"{_WEL_PARSER} | group n=count() by Channel | sort -n | limit 15",
    ),
    (
        "WEL ProviderName distribution",
        f"{_WEL_PARSER} | group n=count() by ProviderName | sort -n | limit 15",
    ),
    (
        "WEL EID=4688 row sample (Security: process creation)",
        f"{_WEL_PARSER} EventID=4688 | columns CommandLine, NewProcessName, "
        "ParentProcessName, SubjectUserName, ProcessId | limit 3",
    ),
    (
        "WEL EID=1 row sample (Sysmon: process creation)",
        f"{_WEL_PARSER} EventID=1 | columns CommandLine, Image, ParentImage, "
        "User, ProcessGuid | limit 3",
    ),
    (
        "Probe alternate camelCase fields on the WEL parser",
        f"{_WEL_PARSER} | columns commandLine, image, parentImage, eventId "
        "| limit 3",
    ),
    (
        "Probe nested process.* fields on the WEL parser",
        f"{_WEL_PARSER} | columns process.cmdLine, process.image.path, "
        "process.parentImage.path, event.id | limit 3",
    ),
    (
        "EID=4688 count alone (volume sanity)",
        f"{_WEL_PARSER} EventID=4688 | group n=count() | limit 1",
    ),
    (
        "EID=1 count alone",
        f"{_WEL_PARSER} EventID=1 | group n=count() | limit 1",
    ),
    (
        "Any cmdline-bearing record sample (raw)",
        f"{_WEL_PARSER} | columns rawMessage | limit 1",
    ),
]
def main() -> int:
    """Execute every entry in PROBES and print a truncated report to stdout.

    For each probe this prints its label, the query (newlines flattened to
    spaces, clipped to 160 characters), the status and column names returned
    by ``pq``, and up to ten result rows, each clipped to 240 characters.

    Returns:
        Always 0; the ``__main__`` guard passes it to ``SystemExit``.
    """

    def clip(text: str, limit: int) -> str:
        # Keep at most `limit` characters; mark any cut with an ellipsis.
        suffix = "..." if len(text) > limit else ""
        return f"{text[:limit]}{suffix}"

    banner = "=" * 78
    print(f"\n{banner}\n WEL parser schema probe -- last 24 h\n "
          f"endpoint: {BASE}/api/powerQuery\n{banner}")

    for title, probe_query in PROBES:
        # Run the probe before printing anything for it, as the report lines
        # need its status and columns.
        outcome, records, column_names = pq(probe_query)
        flattened = probe_query.replace("\n", " ")
        print(f"\n--- {title} ---")
        print(f" query : {clip(flattened, 160)}")
        print(f" status: {outcome} cols: {column_names}")
        for record in records[:10]:
            print(f" {clip(str(record), 240)}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())