v0.1 Mick Marc merged

This commit is contained in:
marc
2026-05-20 23:44:53 +02:00
commit 79efb6bf7d
50 changed files with 5190 additions and 0 deletions
+100
View File
@@ -0,0 +1,100 @@
#!/usr/bin/env python3
"""Probe the SDL tenant to understand why Avelios Medical field-population shows 0%."""
import json, time, urllib.request, urllib.error
import os
def _load_sdl_cfg():
    """Locate and parse the SDL JSON config; exit with status 2 if none exists.

    Candidates are tried in order: the path in ``$SDL_CONFIG``,
    ``sdl_config.json`` next to this script, then ``sdl_config.json`` one
    directory above it. The first candidate that is a regular file wins.

    Returns:
        The decoded JSON document.

    Raises:
        SystemExit: with code 2 when no candidate file exists.
        json.JSONDecodeError: when the selected file is not valid JSON.
    """
    # Imported locally so the helper does not rely on the module's imports.
    import json
    import os
    import sys

    here = os.path.dirname(os.path.abspath(__file__))
    candidates = [
        os.environ.get("SDL_CONFIG"),
        os.path.join(here, "sdl_config.json"),
        os.path.join(here, "..", "sdl_config.json"),
    ]
    for path in candidates:
        # isfile rather than exists: a directory (e.g. $SDL_CONFIG pointing at
        # a folder) is skipped instead of crashing open().
        if path and os.path.isfile(path):
            # JSON is UTF-8; do not depend on the locale's default encoding.
            with open(path, encoding="utf-8") as fh:
                return json.load(fh)
    sys.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    sys.exit(2)
# Tenant settings are read once at import time via _load_sdl_cfg().
CFG = _load_sdl_cfg()
# Trailing slash removed so API paths can be appended with a plain '+'.
BASE = CFG['base_url'].rstrip('/')
KEY = CFG['log_read_key']
# Query window in epoch milliseconds, fixed when the script starts.
END_MS = int(time.time() * 1000)
START_MS = END_MS - 24 * 3600 * 1000 # last 24h
def pq(query: str, max_count: int = 10) -> dict:
    """POST one PowerQuery to ``BASE + '/api/powerQuery'`` for the module window.

    Args:
        query: PowerQuery text, sent verbatim.
        max_count: value sent as ``maxCount`` in the request body.

    Returns:
        The decoded JSON response, or ``{"_err": <message>}`` when the request
        or decoding fails, so callers can keep probing after a failed query.
    """
    payload = json.dumps({
        "token": KEY, "query": query,
        "startTime": START_MS, "endTime": END_MS,
        "maxCount": max_count,
    }).encode()
    req = urllib.request.Request(BASE + '/api/powerQuery', data=payload,
                                 headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as e:
        # errors="replace": an undecodable error body must not raise from
        # inside this handler and hide the HTTP status.
        detail = e.read().decode(errors="replace")[:200]
        return {"_err": f"HTTP {e.code}: {detail}"}
    except Exception as e:
        return {"_err": str(e)[:200]}
def show(label, d):
    """Print a one-line summary of a query response and up to 8 of its rows.

    Args:
        label: text identifying the query in the output.
        d: response dict; either ``{"_err": ...}`` or a dict that may carry
            ``columns`` plus ``values`` or ``matches``.
    """
    if "_err" in d:
        print(f"[ERR] {label}: {d['_err']}")
        return
    # `or []` everywhere: a key present with JSON null must behave like a
    # missing key instead of crashing the iteration or len().
    cols = [c['name'] if isinstance(c, dict) else c for c in d.get('columns') or []]
    vals = d.get('values') or d.get('matches') or []
    print(f"[OK ] {label} cols={cols} rows={len(vals)}")
    for v in vals[:8]:
        print(f" {v}")
def _count_cell(resp):
    """Return the single count cell of a ``group n=count()`` response.

    When the response has no 'values' key (for example the ``_err`` dict
    returned by ``pq``) the response itself is returned so it gets printed.
    """
    if 'values' not in resp:
        return resp
    return (resp['values'] or [[None]])[0][0]


# 1. Distinct dataSource.name values containing 'velio'
print("=" * 70)
print("1. Source-name spellings containing 'velio'")
print("=" * 70)
# Filter in the query itself: grouping every source and printing the first
# few rows would only list the busiest sources, not the 'velio' spellings.
show("by dataSource.name",
     pq("| filter dataSource.name contains 'velio' "
        "| group n=count() by dataSource.name | sort -n | limit 50", max_count=50))

# 2. Try a few candidate names
print()
print("=" * 70)
print("2. Try filtering by candidate names")
print("=" * 70)
for cand in ["Avelios Medical", "Avelios-Medical", "Avelios-Medical-OCSF",
             "avelios", "Avelios"]:
    d = pq(f"| filter dataSource.name == '{cand}' | group n=count()", max_count=1)
    print(f" {cand!r:<35} -> {_count_cell(d)}")
for cand in ["Avelios Medical", "Avelios-Medical-OCSF", "avelios"]:
    d = pq(f"| filter dataSource.name contains '{cand}' | group n=count()", max_count=1)
    print(f" contains {cand!r:<25} -> {_count_cell(d)}")

# 3. Sample one raw event to see what column names actually come back
print()
print("=" * 70)
print("3. Sample one event — what keys/columns are returned?")
print("=" * 70)
d = pq("| filter dataSource.name contains 'velio' | limit 1", max_count=1)
if "_err" in d:
    print(" ", d["_err"])
else:
    print(" columns:", [c['name'] if isinstance(c, dict) else c for c in d.get('columns', [])][:30])
    print(" first row sample:", str((d.get('values') or [None])[0])[:400])

# 4. If we got columns, check which OCSF fields exist
print()
print("=" * 70)
print("4. Field presence in last 24h for Avelios (using columns command)")
print("=" * 70)
d = pq("| filter dataSource.name contains 'velio' | "
       "columns dataSource.name, metadata.product.name, metadata.event_code, "
       "actor.user.name, src_endpoint.ip, dst_endpoint.ip | limit 5",
       max_count=5)
show("columns view", d)
+89
View File
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""Inspect Avelios Medical events: one query, full row dump, then field stats from Python."""
import json, time, urllib.request, collections
import os
def _load_sdl_cfg():
    """Locate and parse the SDL JSON config; exit with status 2 if none exists.

    Candidates are tried in order: the path in ``$SDL_CONFIG``,
    ``sdl_config.json`` next to this script, then ``sdl_config.json`` one
    directory above it. The first candidate that is a regular file wins.

    Returns:
        The decoded JSON document.

    Raises:
        SystemExit: with code 2 when no candidate file exists.
        json.JSONDecodeError: when the selected file is not valid JSON.
    """
    # Imported locally so the helper does not rely on the module's imports.
    import json
    import os
    import sys

    here = os.path.dirname(os.path.abspath(__file__))
    candidates = [
        os.environ.get("SDL_CONFIG"),
        os.path.join(here, "sdl_config.json"),
        os.path.join(here, "..", "sdl_config.json"),
    ]
    for path in candidates:
        # isfile rather than exists: a directory (e.g. $SDL_CONFIG pointing at
        # a folder) is skipped instead of crashing open().
        if path and os.path.isfile(path):
            # JSON is UTF-8; do not depend on the locale's default encoding.
            with open(path, encoding="utf-8") as fh:
                return json.load(fh)
    sys.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    sys.exit(2)
# Tenant settings are read once at import time via _load_sdl_cfg().
CFG = _load_sdl_cfg()
# Base URL without trailing slash, and the key sent as "token" with each query.
BASE, KEY = CFG['base_url'].rstrip('/'), CFG['log_read_key']
# Query window in epoch milliseconds, fixed when the script starts.
NOW = int(time.time() * 1000)
START = NOW - 72 * 3600 * 1000 # last 3 days
def pq(query, mc=200):
    """POST one PowerQuery for the START..NOW window and return the decoded JSON.

    Args:
        query: PowerQuery text, sent verbatim.
        mc: value sent as ``maxCount`` in the request body.

    Raises:
        urllib.error.URLError: (including HTTPError) when the request fails;
            errors are deliberately not swallowed here.
        json.JSONDecodeError: when the response body is not valid JSON.
    """
    body = json.dumps({"token": KEY, "query": query,
                       "startTime": START, "endTime": NOW,
                       "maxCount": mc}).encode()
    req = urllib.request.Request(BASE + '/api/powerQuery', data=body,
                                 headers={"Content-Type": "application/json"})
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(req, timeout=60) as resp:
        return json.loads(resp.read())
def _tally_fields(obj, freq, prefix=''):
    """Count each leaf key of nested dict *obj* into *freq* under its dotted path."""
    for k, v in obj.items():
        key = f"{prefix}.{k}" if prefix else k
        if isinstance(v, dict):
            _tally_fields(v, freq, key)
        else:
            freq[key] += 1


print("Fetching Avelios Medical sample (max 200, last 72h) ...")
d = pq("| filter dataSource.name == 'Avelios Medical' | limit 200")
# `or []`: a key present with JSON null behaves like a missing key.
cols = [c['name'] if isinstance(c, dict) else c for c in d.get('columns') or []]
vals = d.get('values') or []
print(f"Columns returned ({len(cols)}): {cols}")
print(f"Rows: {len(vals)}")
print()

# Tally non-null rate per returned column
counts = {c: 0 for c in cols}
for row in vals:
    for c, v in zip(cols, row):
        if v not in (None, '', 'null'):
            counts[c] += 1
print("=== Column populated-rate (out of returned columns) ===")
for c in cols:
    n = counts[c]
    # max(1, ...) avoids dividing by zero when no rows came back.
    pct = round(100 * n / max(1, len(vals)), 1)
    print(f" {c:<35} {n:>4} / {len(vals)} {pct:>5}%")
print()
print("=== First 2 events (pretty) ===")
for row in vals[:2]:
    print(json.dumps(dict(zip(cols, row)), indent=2, default=str)[:1500])
    print("---")
print()
print("=== Distinct fields IN the message body (if JSON) ===")
# If the events carry a structured body, peek inside it
field_freq = collections.Counter()
for row in vals:
    rd = dict(zip(cols, row))
    msg = rd.get('message') or rd.get('body') or rd.get('attributes')
    if isinstance(msg, str):
        try:
            msg = json.loads(msg)
        except ValueError:
            # Not JSON (JSONDecodeError is a ValueError): nothing to inspect.
            continue
    if isinstance(msg, dict):
        _tally_fields(msg, field_freq)
for k, c in field_freq.most_common(40):
    print(f" {k:<45} in {c:>3} events")
+53
View File
@@ -0,0 +1,53 @@
#!/usr/bin/env python3
"""Search wider windows for Avelios Medical events."""
import json, time, urllib.request
import os
def _load_sdl_cfg():
    """Locate and parse the SDL JSON config; exit with status 2 if none exists.

    Candidates are tried in order: the path in ``$SDL_CONFIG``,
    ``sdl_config.json`` next to this script, then ``sdl_config.json`` one
    directory above it. The first candidate that is a regular file wins.

    Returns:
        The decoded JSON document.

    Raises:
        SystemExit: with code 2 when no candidate file exists.
        json.JSONDecodeError: when the selected file is not valid JSON.
    """
    # Imported locally so the helper does not rely on the module's imports.
    import json
    import os
    import sys

    here = os.path.dirname(os.path.abspath(__file__))
    candidates = [
        os.environ.get("SDL_CONFIG"),
        os.path.join(here, "sdl_config.json"),
        os.path.join(here, "..", "sdl_config.json"),
    ]
    for path in candidates:
        # isfile rather than exists: a directory (e.g. $SDL_CONFIG pointing at
        # a folder) is skipped instead of crashing open().
        if path and os.path.isfile(path):
            # JSON is UTF-8; do not depend on the locale's default encoding.
            with open(path, encoding="utf-8") as fh:
                return json.load(fh)
    sys.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    sys.exit(2)
# Tenant settings are read once at import time via _load_sdl_cfg().
CFG = _load_sdl_cfg()
# Base URL without trailing slash, and the key sent as "token" with each query.
BASE, KEY = CFG['base_url'].rstrip('/'), CFG['log_read_key']
# Script start time in epoch milliseconds; every search window ends here.
NOW = int(time.time() * 1000)
def pq(query, start_ms, end_ms, mc=5):
    """POST one PowerQuery for an explicit time window.

    Args:
        query: PowerQuery text, sent verbatim.
        start_ms: window start, sent as ``startTime``.
        end_ms: window end, sent as ``endTime``.
        mc: value sent as ``maxCount`` in the request body.

    Returns:
        The decoded JSON response, or ``{"_err": <message>}`` (message capped
        at 200 characters) when the request or decoding fails.
    """
    body = json.dumps({"token": KEY, "query": query,
                       "startTime": start_ms, "endTime": end_ms,
                       "maxCount": mc}).encode()
    req = urllib.request.Request(BASE + '/api/powerQuery', data=body,
                                 headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(req, timeout=60) as resp:
            return json.loads(resp.read())
    except Exception as e:
        return {"_err": str(e)[:200]}
# Widen the search window step by step and report any source whose name
# contains 'velio', followed by the ten busiest sources in that window.
for days in (1, 3, 7):
    start = NOW - days * 24 * 3600 * 1000
    print(f"\n=== last {days}d ===")
    d = pq("| group n=count() by dataSource.name | sort -n | limit 30", start, NOW, mc=30)
    if "_err" in d:
        print(d["_err"])
        continue
    # Normalise once so both loops tolerate a missing or null 'values'.
    rows = d.get("values") or []
    for row in rows:
        name = row[0] if row else None
        # Only strings can be matched; null or numeric group keys are skipped.
        if isinstance(name, str) and "velio" in name.lower():
            print(f" HIT: {row}")
    # show top 10 in this window
    for row in rows[:10]:
        print(f" {row}")
+77
View File
@@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""Probe what PowerQuery syntax this SDL tenant accepts."""
import json, time, urllib.request, urllib.error, sys
import os
def _load_sdl_cfg():
    """Locate and parse the SDL JSON config; exit with status 2 if none exists.

    Candidates are tried in order: the path in ``$SDL_CONFIG``,
    ``sdl_config.json`` next to this script, then ``sdl_config.json`` one
    directory above it. The first candidate that is a regular file wins.

    Returns:
        The decoded JSON document.

    Raises:
        SystemExit: with code 2 when no candidate file exists.
        json.JSONDecodeError: when the selected file is not valid JSON.
    """
    # Imported locally so the helper does not rely on the module's imports.
    import json
    import os
    import sys

    here = os.path.dirname(os.path.abspath(__file__))
    candidates = [
        os.environ.get("SDL_CONFIG"),
        os.path.join(here, "sdl_config.json"),
        os.path.join(here, "..", "sdl_config.json"),
    ]
    for path in candidates:
        # isfile rather than exists: a directory (e.g. $SDL_CONFIG pointing at
        # a folder) is skipped instead of crashing open().
        if path and os.path.isfile(path):
            # JSON is UTF-8; do not depend on the locale's default encoding.
            with open(path, encoding="utf-8") as fh:
                return json.load(fh)
    sys.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    sys.exit(2)
# Tenant settings are read once at import time via _load_sdl_cfg().
CFG = _load_sdl_cfg()
# Full PowerQuery endpoint; trailing slash on base_url is removed first.
URL = CFG['base_url'].rstrip('/') + '/api/powerQuery'
# Query window in epoch milliseconds, fixed when the script starts.
END_MS = int(time.time() * 1000)
START_MS = END_MS - 3600 * 1000 # last hour
def run(label: str, query: str):
    """Send one query variant to the PowerQuery endpoint and print the outcome.

    Prints a single ``[OK ]`` line (plus a sample of the first row when rows
    came back) or a single ``[ERR]`` line. Never raises for request failures,
    so every syntax variant in a batch is attempted.

    Args:
        label: short description shown in the output line.
        query: PowerQuery text, sent verbatim.
    """
    payload = json.dumps({
        "token": CFG['log_read_key'],
        "query": query,
        "startTime": START_MS,
        "endTime": END_MS,
        "maxCount": 5,
    }).encode()
    req = urllib.request.Request(URL, data=payload, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            d = json.loads(resp.read())
        st = d.get('status', '?')
        cols = d.get('columns') or []
        vals = d.get('values') or d.get('matches') or []
        print(f"[OK ] {label:<40} status={st} cols={len(cols)} rows={len(vals)}")
        if vals:
            print(f" sample={str(vals[0])[:160]}")
    except urllib.error.HTTPError as e:
        # errors="replace": an undecodable error body must not raise from
        # inside this handler and abort the remaining cases.
        err_text = e.read().decode(errors="replace")
        try:
            # Prefer the server's 'message' field when the body is a JSON object.
            msg = json.loads(err_text).get('message', err_text)[:200]
        except Exception:
            msg = err_text[:200]
        print(f"[ERR] {label:<40} HTTP {e.code}: {msg}")
    except Exception as e:
        print(f"[ERR] {label:<40} {type(e).__name__}: {str(e)[:160]}")
# (label, query) pairs; each one exercises a different PowerQuery spelling.
CASES = [
    ("leading-pipe single-stage", "| group total=count()"),
    ("no-pipe single-stage", "group total=count()"),
    ("leading-pipe multi-stage", "| group events=count() by dataSource.name | sort -events | limit 5"),
    ("no-pipe multi-stage", "group events=count() by dataSource.name | sort -events | limit 5"),
    ("no-pipe trim sort", "group events=count() by dataSource.name | limit 5"),
    ("filter then group", "dataSource.name=='SentinelOne' | group events=count()"),
    ("filter (modern keyword)", "filter dataSource.name=='SentinelOne' | group events=count()"),
    ("dataset-style with sort", "group events=count() by dataSource.name | sort events desc | limit 5"),
    ("count() as alias", "| count() as events"),
    ("group by event.type", "group events=count() by event.type | limit 5"),
]

# Header (endpoint, window, blank line), then one result line per case.
print(f"URL: {URL}", f"Window: last 1h ({START_MS}..{END_MS} ms)", "", sep="\n")
for case in CASES:
    run(*case)
+63
View File
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""Probe /api/ingest/simulate-filter using small 1-day windows + long client
timeouts to avoid urllib aborting before the SDL query returns.
Run one case at a time and print elapsed time so we can tell whether failures
are HTTP errors or slow tenant queries.
"""
from __future__ import annotations
import json
import sys
import time
import urllib.request
import urllib.error
# Local backend endpoint that every case is POSTed to.
URL = "http://localhost:8001/api/ingest/simulate-filter"
TIMEOUT = 600 # seconds — generous; SDL queries on large tenants can take >60s
# Smallest windows first so cheap calls succeed before we try the expensive ones.
# Each entry is (label, JSON request body).
CASES = [
    ("empty body, 1d", {"days": 1}),
    ("bogus source, 1d", {"source": "definitely-no-such-source", "days": 1}),
    ("source only, 1d", {"source": "Avelios Medical", "days": 1}),
    ("source only, 7d", {"source": "Avelios Medical", "days": 7}),
    ("event_type only, 1d", {"event_type": "login", "days": 1}),
    ("source + event_type, 7d", {"source": "Avelios Medical", "event_type": "login", "days": 7}),
]
def _describe_error(exc: BaseException) -> str:
    """Format *exc* as ``TypeName: message``, using 'no detail' for an empty message."""
    # str(exc) is what can be empty (e.g. a bare TimeoutError); the exception
    # object itself is always truthy, so it cannot drive the fallback.
    return f"{type(exc).__name__}: {str(exc) or 'no detail'}"


def hit(body: dict) -> tuple[int, str, float]:
    """POST *body* as JSON to ``URL`` and time the round trip.

    Args:
        body: JSON-serialisable request payload.

    Returns:
        ``(status, payload, elapsed_seconds)``. ``status`` is the HTTP status
        code (also for HTTP error responses), or -1 for any other failure, in
        which case ``payload`` describes the exception.
    """
    data = json.dumps(body).encode()
    req = urllib.request.Request(
        URL,
        data=data,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    t0 = time.monotonic()
    try:
        with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
            return r.status, r.read().decode(), time.monotonic() - t0
    except urllib.error.HTTPError as e:
        # errors="replace": keep the status even if the error body is not UTF-8.
        return e.code, e.read().decode(errors="replace"), time.monotonic() - t0
    except Exception as e:
        return -1, _describe_error(e), time.monotonic() - t0
# Allow narrowing via CLI: `python3 probe_simulate_filter.py 2 3` runs cases 2 & 3.
indices = [int(x) for x in sys.argv[1:]] if len(sys.argv) > 1 else range(len(CASES))
for i in indices:
    # Skip anything outside 0..len-1. Negative numbers are rejected too,
    # otherwise CASES[-1] would silently run the last case.
    if not 0 <= i < len(CASES):
        continue
    label, body = CASES[i]
    print("=" * 78)
    print(f"[{i}] {label:<32} body={body}")
    # Flush so the case header is visible while a slow request is in flight.
    sys.stdout.flush()
    status, payload, elapsed = hit(body)
    print(f" HTTP {status} elapsed={elapsed:.1f}s")
    try:
        parsed = json.loads(payload)
        print(" " + json.dumps(parsed, indent=2).replace("\n", "\n "))
    except Exception:
        # Not JSON (or not printable as JSON): show a bounded raw excerpt.
        print(f" raw: {payload[:800]}")
+80
View File
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""Trigger /api/quality/sync-from-sdl and pretty-print the result.
Then re-list /api/quality/parsers to confirm the new files appear in the
Parser Test Runner dropdown.
"""
from __future__ import annotations
import json
import sys
import time
import urllib.request
import urllib.error
# Base URL of the local backend; request paths are appended to it.
BACKEND = "http://localhost:8001"
# Per-request timeout passed to urlopen (seconds).
TIMEOUT = 300
def _describe_error(exc: BaseException) -> str:
    """Format *exc* as ``TypeName: message``, using 'no detail' for an empty message."""
    # The exception object is always truthy; only its text can be empty.
    return f"{type(exc).__name__}: {str(exc) or 'no detail'}"


def _parse_body(text: str) -> dict | str:
    """Return *text* decoded as JSON, or *text* unchanged when it is not JSON."""
    try:
        return json.loads(text)
    except ValueError:
        return text


def call(method: str, path: str) -> tuple[int, dict | str, float]:
    """Send a body-less request to ``BACKEND + path`` and time it.

    Args:
        method: HTTP method name, e.g. ``"GET"`` or ``"POST"``.
        path: URL path appended to ``BACKEND``.

    Returns:
        ``(status, body, elapsed_seconds)``. ``body`` is the decoded JSON when
        the response is JSON and the raw text otherwise, for success and HTTP
        error responses alike. ``status`` is -1 for any non-HTTP failure, in
        which case ``body`` describes the exception.
    """
    req = urllib.request.Request(BACKEND + path, method=method)
    t0 = time.monotonic()
    try:
        with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
            # A non-JSON success body keeps its real status instead of being
            # reported as a -1 failure.
            return r.status, _parse_body(r.read().decode()), time.monotonic() - t0
    except urllib.error.HTTPError as e:
        return e.code, _parse_body(e.read().decode(errors="replace")), time.monotonic() - t0
    except Exception as e:
        return -1, _describe_error(e), time.monotonic() - t0
# Step 1: trigger the sync and summarise what the backend reports.
print("=" * 72)
print("POST /api/quality/sync-from-sdl")
print("=" * 72)
status, body, elapsed = call("POST", "/api/quality/sync-from-sdl")
print(f"HTTP {status} elapsed={elapsed:.1f}s")
if isinstance(body, dict):
    if "detail" in body:
        print(f" ERROR: {body['detail']}")
    else:
        print(f" downloaded: {body.get('downloaded')}")
        print(f" errors: {len(body.get('errors') or [])}")
        print(f" directory: {body.get('directory')}")
        names = body.get("parsers") or []
        print(f"\n sample of parser filenames (first 25):")
        for n in names[:25]:
            print(f" {n}")
        if len(names) > 25:
            print(f" ... and {len(names) - 25} more")
        # Highlight anything that looks like a customer/custom parser
        custom = [n for n in names if "avelios" in n.lower() or "ocsf" in n.lower()]
        if custom:
            print("\n matched custom-parser patterns (avelios / ocsf):")
            for n in custom:
                print(f"{n}")
        errs = body.get("errors") or []
        if errs:
            print(f"\n errors (first 5 of {len(errs)}):")
            for e in errs[:5]:
                print(f" - {e}")
else:
    print(f" raw: {str(body)[:600]}")

# Step 2: list the parsers the backend now exposes.
print()
print("=" * 72)
print("GET /api/quality/parsers (Parser Test Runner dropdown source)")
print("=" * 72)
status, body, elapsed = call("GET", "/api/quality/parsers")
print(f"HTTP {status} elapsed={elapsed:.1f}s")
if isinstance(body, dict):
    if "detail" in body:
        # Same error shape as the POST branch: report it instead of printing
        # an empty listing.
        print(f" ERROR: {body['detail']}")
    else:
        print(f" count: {body.get('count')}")
        print(f" parsers:")
        for n in (body.get("parsers") or [])[:50]:
            print(f" {n}")
        if (body.get("count") or 0) > 50:
            print(f" ... and {body['count'] - 50} more")
else:
    print(f" raw: {str(body)[:400]}")
+7
View File
@@ -0,0 +1,7 @@
{
"_comment": "Copy to sdl_config.json (or set $SDL_CONFIG to its path). Only the keys you need are required. log_read_key for queries; config_read_key for listFiles/getFile (parser sync).",
"base_url": "https://xdr.us1.sentinelone.net",
"log_read_key": "REPLACE_WITH_LOG_READ_KEY",
"config_read_key": "REPLACE_WITH_CONFIG_READ_KEY",
"console_api_token": "REPLACE_WITH_CONSOLE_API_TOKEN_OR_LEAVE_BLANK"
}
+100
View File
@@ -0,0 +1,100 @@
#!/usr/bin/env python3
"""
Pull every parser under /logParsers/ from the SDL tenant and drop it into
./parsers/ so the SIEM-Toolkit Parser Test Runner can list it.
Auth: config_read_key (falling back to console_api_token) from the SDL config found via $SDL_CONFIG or sdl_config.json
"""
from __future__ import annotations
import json
import os
import sys
import urllib.request
import urllib.parse
import urllib.error
def _load_sdl_cfg():
    """Locate and parse the SDL JSON config; exit with status 2 if none exists.

    Candidates are tried in order: the path in ``$SDL_CONFIG``,
    ``sdl_config.json`` next to this script, then ``sdl_config.json`` one
    directory above it. The first candidate that is a regular file wins.

    Returns:
        The decoded JSON document.

    Raises:
        SystemExit: with code 2 when no candidate file exists.
        json.JSONDecodeError: when the selected file is not valid JSON.
    """
    # Imported locally so the helper does not rely on the module's imports.
    import json
    import os
    import sys

    here = os.path.dirname(os.path.abspath(__file__))
    candidates = [
        os.environ.get("SDL_CONFIG"),
        os.path.join(here, "sdl_config.json"),
        os.path.join(here, "..", "sdl_config.json"),
    ]
    for path in candidates:
        # isfile rather than exists: a directory (e.g. $SDL_CONFIG pointing at
        # a folder) is skipped instead of crashing open().
        if path and os.path.isfile(path):
            # JSON is UTF-8; do not depend on the locale's default encoding.
            with open(path, encoding="utf-8") as fh:
                return json.load(fh)
    sys.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    sys.exit(2)
# NOTE(review): not referenced anywhere in the code visible here; the config
# is located and loaded by _load_sdl_cfg() inside main().
SDL_CFG_PATH = os.environ.get('SDL_CONFIG') # placeholder; cfg loaded below
# Download target: $PARSERS_DIR if set, otherwise ../parsers relative to this script.
DEST = os.environ.get('PARSERS_DIR', os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'parsers'))
def call(base_url: str, token: str, path: str, params: dict) -> dict:
    """Send *params* plus the token as a JSON POST and return the decoded reply.

    The token travels twice: as a ``Bearer`` Authorization header and as the
    ``token`` field of the JSON body.

    Raises:
        RuntimeError: on an HTTP error status; the message carries the status
            code, the path and up to 300 characters of the error body.
    """
    endpoint = "{}{}".format(base_url.rstrip('/'), path)
    payload = dict(params)
    payload["token"] = token
    request = urllib.request.Request(
        endpoint,
        data=json.dumps(payload).encode(),
        headers={
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            raw = response.read()
    except urllib.error.HTTPError as e:
        detail = e.read().decode(errors="replace")[:300]
        raise RuntimeError(f"HTTP {e.code} {path}: {detail}")
    return json.loads(raw)
def _local_name(remote_path: str) -> str:
    """Return the filename to use under DEST for *remote_path*.

    Only the last path segment is kept. An empty segment, ``.`` or ``..``
    cannot name a regular file inside DEST, so those map to ``_unnamed``.
    """
    name = remote_path.rsplit("/", 1)[-1]
    if name in ("", ".", ".."):
        return "_unnamed"
    return name


def main() -> int:
    """Download every file listed under /logParsers/ into DEST.

    Existing local files with the same name are overwritten. Per-file
    failures (fetch, missing content, write) are collected and reported at
    the end without aborting the run.

    Returns:
        0 after the download loop completes, 1 when the file listing itself
        fails, 2 when the config holds no usable token.
    """
    cfg = _load_sdl_cfg()
    base = cfg["base_url"]
    # config_read_key first (per docs), fall back to console_api_token
    token = cfg.get("config_read_key") or cfg.get("console_api_token")
    if not token:
        print("No config_read_key or console_api_token in the SDL config "
              "(sdl_config.json / $SDL_CONFIG)", file=sys.stderr)
        return 2
    print(f"Listing /logParsers/ from {base} ...")
    try:
        res = call(base, token, "/api/listFiles", {"pathPrefix": "/logParsers/"})
    except Exception as e:
        # Report a readable message instead of a traceback; the exit status
        # stays non-zero as it was for the uncaught exception.
        print(f"Listing failed: {e}", file=sys.stderr)
        return 1
    paths = res.get("paths") or []
    print(f"Found {len(paths)} files under /logParsers/")
    os.makedirs(DEST, exist_ok=True)
    fetched, failed = 0, []
    for p in paths:
        name = _local_name(p)
        try:
            r = call(base, token, "/api/getFile", {"path": p})
        except Exception as e:
            failed.append((p, str(e)))
            continue
        content = r.get("content")
        if content is None:
            failed.append((p, "no content"))
            continue
        out = os.path.join(DEST, name)
        try:
            # Always overwrite so the local copy matches the tenant.
            with open(out, "w", encoding="utf-8") as fh:
                fh.write(content)
        except OSError as e:
            # One unwritable file must not abort the remaining downloads.
            failed.append((p, f"write failed: {e}"))
            continue
        ver = r.get("version", "?")
        print(f" + {name:<60} v{ver} ({len(content)} bytes)")
        fetched += 1
    print()
    print(f"Done: fetched={fetched}, failed={len(failed)}")
    if failed:
        for p, err in failed[:10]:
            print(f" ! {p}: {err}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
+27
View File
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
"""Verify the Parser Test Runner accepts multi-line NDJSON for JSON-mode parsers."""
import json, urllib.request
import os
# Three sample events, one JSON document per line (NDJSON).
LINES = [
    '{"timestamp":"2026-05-14T00:00:41.969Z","event_type":"DATA_IMPORT_COMPLETED","event_category":"data_transfer","severity":"INFO","outcome":"success","source":{"application":"Avelios Medical"}}',
    '{"timestamp":"2026-05-14T00:07:41.969Z","event_type":"PERFORMANCE_DEGRADATION","event_category":"system","severity":"MEDIUM","outcome":"success","source":{"application":"Avelios Medical"}}',
    '{"timestamp":"2026-05-14T00:24:41.969Z","event_type":"MALWARE_DETECTED","event_category":"security","severity":"CRITICAL","outcome":"detected","source":{"application":"Avelios Medical"},"details":{"malware_name":"Trojan.GenericKD"}}',
]
# All lines are submitted together as one newline-joined "log_line".
body = json.dumps({"parser_name": "Avelios-Medical-OCSF", "log_line": "\n".join(LINES)}).encode()
req = urllib.request.Request(
    "http://localhost:8001/api/quality/test-parser",
    data=body, headers={"Content-Type": "application/json"})
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(req, timeout=30) as resp:
    r = json.loads(resp.read())
print(f"matched = {r.get('matched')}")
print(f"mode = {r.get('mode')}")
print(f"payloads = {r.get('payload_count')} (showing {r.get('showing_payload')})")
print(f"extracted = {r.get('extracted_count')}")
print(f"derived = {r.get('derived_count')}")
print(f"parse_errors = {r.get('parse_errors')}")
print()
print("rewrites applied (first payload):")
# `or []`: a null "rewrites_applied" is treated like an absent one.
for rw in (r.get("rewrites_applied") or [])[:10]:
    print(f" {rw['input']:<18} -> {rw['output']:<28} = {rw['result']!r}")
+22
View File
@@ -0,0 +1,22 @@
#!/usr/bin/env python3
import json, urllib.request
import os
# One sample event submitted to the parser test endpoint.
log = '{"timestamp": "2026-05-14T00:24:41.969Z", "event_id": "d5c76dd2-5320-4b32-bd27-09acedfb5fdb", "event_type": "MALWARE_DETECTED", "event_category": "security", "severity": "CRITICAL", "source": {"application": "Avelios Medical", "module": "SecurityMonitor"}, "outcome": "detected", "details": {"malware_name": "Trojan.GenericKD"}}'
body = json.dumps({"parser_name": "Avelios-Medical-OCSF", "log_line": log}).encode()
req = urllib.request.Request(
    "http://localhost:8001/api/quality/test-parser",
    data=body, headers={"Content-Type": "application/json"})
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(req, timeout=30) as resp:
    r = json.loads(resp.read())
print(f"matched={r.get('matched')} mode={r.get('mode')} "
      f"extracted={r.get('extracted_count')} derived={r.get('derived_count')}")
print()
print("json-extract fields (first 12):")
# `or []`: a null list in the response is treated like an absent one.
json_fields = [x for x in (r.get("fields") or []) if x.get("source") == "json-extract"]
for f in json_fields[:12]:
    print(f" {f['field']:<32} = {str(f['value'])[:50]}")
print()
print("rewrites applied:")
for rw in (r.get("rewrites_applied") or [])[:12]:
    print(f" {rw['input']:<18} -> {rw['output']:<28} = {rw['result']!r}")