Add helper scripts: SDL parser sync, PQ probes, test-parser smoke tests

2026-06-08 12:33:51 +00:00 · 2026-05-20 19:41:00 +02:00
parent 8dbd38f3bb
commit d8d62478c0
8 changed files with 475 additions and 0 deletions
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+"""Probe the SDL tenant to understand why Avelios Medical field-population shows 0%."""
+import json, time, urllib.request, urllib.error
+import os
+
+def _load_sdl_cfg():
+    """Locate and parse the SDL JSON config, exiting with status 2 if absent.
+
+    Candidate paths are tried in order: the ``$SDL_CONFIG`` environment
+    variable, ``sdl_config.json`` next to this script, then
+    ``sdl_config.json`` in the parent directory.
+
+    Returns:
+        The parsed JSON content of the first candidate that is a regular file.
+
+    Raises:
+        SystemExit: with code 2 when no candidate file exists.
+    """
+    # Imported locally so the helper does not rely on the module's import list.
+    import json
+    import os
+    import sys
+
+    here = os.path.dirname(os.path.abspath(__file__))
+    candidates = [
+        os.environ.get("SDL_CONFIG"),
+        os.path.join(here, "sdl_config.json"),
+        os.path.join(here, "..", "sdl_config.json"),
+    ]
+    for path in candidates:
+        # isfile (not exists): a directory at a candidate path is skipped
+        # instead of crashing inside open().
+        if path and os.path.isfile(path):
+            # Read as UTF-8 explicitly rather than the locale's default encoding.
+            with open(path, encoding="utf-8") as fh:
+                return json.load(fh)
+    sys.stderr.write(
+        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
+        "(see sdl_config.example.json)\n")
+    sys.exit(2)
+
+
+# Config is loaded at import time; _load_sdl_cfg() exits the process when no file is found.
+CFG = _load_sdl_cfg()
+# Base URL with any trailing slash removed so endpoint paths can be appended directly.
+BASE = CFG['base_url'].rstrip('/')
+# Key sent as the "token" field of every query body built by pq().
+KEY  = CFG['log_read_key']
+# Query window in epoch milliseconds, fixed once at start-up.
+END_MS   = int(time.time() * 1000)
+START_MS = END_MS - 24 * 3600 * 1000   # last 24h
+
+
+def pq(query: str, max_count: int = 10) -> dict:
+    """POST a PowerQuery to ``/api/powerQuery`` over the module's time window.
+
+    Args:
+        query: PowerQuery text sent as the ``query`` field.
+        max_count: Value sent as ``maxCount``.
+
+    Returns:
+        The decoded JSON response, or ``{"_err": <message>}`` when the request
+        fails for any reason (the probe keeps running after a failed query).
+    """
+    body = json.dumps({
+        "token": KEY, "query": query,
+        "startTime": START_MS, "endTime": END_MS,
+        "maxCount": max_count,
+    }).encode()
+    req = urllib.request.Request(BASE + '/api/powerQuery', data=body,
+                                 headers={"Content-Type": "application/json"})
+    try:
+        # Context manager closes the HTTP response instead of leaking it.
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            return json.loads(resp.read())
+    except urllib.error.HTTPError as e:
+        # errors="replace": an undecodable error body must not raise from
+        # inside this handler and abort the script.
+        detail = e.read().decode(errors="replace")[:200]
+        return {"_err": f"HTTP {e.code}: {detail}"}
+    except Exception as e:
+        # Deliberately broad: any transport or JSON failure becomes an error row.
+        return {"_err": str(e)[:200]}
+
+
+def show(label, d):
+    """Print a one-line summary of a query result followed by up to 8 rows.
+
+    Args:
+        label: Text identifying the query in the output.
+        d: Result dict; either ``{"_err": ...}`` or a response carrying
+            ``columns`` and ``values`` (or ``matches``).
+    """
+    if "_err" in d:
+        print(f"[ERR] {label}: {d['_err']}")
+        return
+    # `or []` also covers keys that are present but null in the response.
+    cols = [c['name'] if isinstance(c, dict) else c for c in d.get('columns') or []]
+    vals = d.get('values') or d.get('matches') or []
+    print(f"[OK ] {label}  cols={cols}  rows={len(vals)}")
+    for v in vals[:8]:
+        print(f"     {v}")
+
+
+def _count_or_error(result):
+    """Return the first cell of the first row, or the raw dict when 'values' is absent."""
+    if 'values' not in result:
+        return result
+    return (result.get('values') or [[None]])[0][0]
+
+
+# 1. Distinct dataSource.name values containing 'velio'
+print("=" * 70)
+print("1. Source-name spellings containing 'velio'")
+print("=" * 70)
+by_source = pq("| group n=count() by dataSource.name | sort -n | limit 50", max_count=50)
+if "_err" not in by_source:
+    # The query returns the top sources of any name; keep only rows that
+    # mention 'velio' (case-insensitive) so the output matches the heading.
+    hits = [row for row in (by_source.get('values') or [])
+            if isinstance(row, (list, tuple))
+            and any(isinstance(cell, str) and 'velio' in cell.lower() for cell in row)]
+    by_source = {**by_source, "values": hits}
+show("by dataSource.name", by_source)
+
+# 2. Try a few candidate names
+print()
+print("=" * 70)
+print("2. Try filtering by candidate names")
+print("=" * 70)
+for cand in ["Avelios Medical", "Avelios-Medical", "Avelios-Medical-OCSF",
+             "avelios", "Avelios"]:
+    n = _count_or_error(
+        pq(f"| filter dataSource.name == '{cand}' | group n=count()", max_count=1))
+    print(f"  {cand!r:<35}  -> {n}")
+for cand in ["Avelios Medical", "Avelios-Medical-OCSF", "avelios"]:
+    n = _count_or_error(
+        pq(f"| filter dataSource.name contains '{cand}' | group n=count()", max_count=1))
+    print(f"  contains {cand!r:<25}  -> {n}")
+
+# 3. Sample one raw event to see what column names actually come back
+print()
+print("=" * 70)
+print("3. Sample one event — what keys/columns are returned?")
+print("=" * 70)
+d = pq("| filter dataSource.name contains 'velio' | limit 1", max_count=1)
+if "_err" in d:
+    print("  ", d["_err"])
+else:
+    # `or []` tolerates a null 'columns' field in the response.
+    sample_cols = [c['name'] if isinstance(c, dict) else c for c in d.get('columns') or []]
+    print("  columns:", sample_cols[:30])
+    print("  first row sample:", str((d.get('values') or [None])[0])[:400])
+
+# 4. If we got columns, check which OCSF fields exist
+print()
+print("=" * 70)
+print("4. Field presence in last 24h for Avelios (using columns command)")
+print("=" * 70)
+d = pq("| filter dataSource.name contains 'velio' | "
+       "columns dataSource.name, metadata.product.name, metadata.event_code, "
+       "actor.user.name, src_endpoint.ip, dst_endpoint.ip | limit 5",
+       max_count=5)
+show("columns view", d)
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""Inspect Avelios Medical events: one query, full row dump, then field stats from Python."""
+import json, time, urllib.request, collections
+import os
+
+def _load_sdl_cfg():
+    """Locate and parse the SDL JSON config, exiting with status 2 if absent.
+
+    Candidate paths are tried in order: the ``$SDL_CONFIG`` environment
+    variable, ``sdl_config.json`` next to this script, then
+    ``sdl_config.json`` in the parent directory.
+
+    Returns:
+        The parsed JSON content of the first candidate that is a regular file.
+
+    Raises:
+        SystemExit: with code 2 when no candidate file exists.
+    """
+    # Imported locally so the helper does not rely on the module's import list.
+    import json
+    import os
+    import sys
+
+    here = os.path.dirname(os.path.abspath(__file__))
+    candidates = [
+        os.environ.get("SDL_CONFIG"),
+        os.path.join(here, "sdl_config.json"),
+        os.path.join(here, "..", "sdl_config.json"),
+    ]
+    for path in candidates:
+        # isfile (not exists): a directory at a candidate path is skipped
+        # instead of crashing inside open().
+        if path and os.path.isfile(path):
+            # Read as UTF-8 explicitly rather than the locale's default encoding.
+            with open(path, encoding="utf-8") as fh:
+                return json.load(fh)
+    sys.stderr.write(
+        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
+        "(see sdl_config.example.json)\n")
+    sys.exit(2)
+
+
+# Config is loaded at import time; _load_sdl_cfg() exits the process when no file is found.
+CFG = _load_sdl_cfg()
+# Base URL (trailing slash removed) and the key sent as "token" in query bodies.
+BASE, KEY = CFG['base_url'].rstrip('/'), CFG['log_read_key']
+# Query window in epoch milliseconds, fixed once at start-up.
+NOW = int(time.time() * 1000)
+START = NOW - 72 * 3600 * 1000          # last 3 days
+
+
+def pq(query, mc=200):
+    """POST a PowerQuery to ``/api/powerQuery`` over the module's 72h window.
+
+    Args:
+        query: PowerQuery text sent as the ``query`` field.
+        mc: Value sent as ``maxCount``.
+
+    Returns:
+        The decoded JSON response.
+
+    Raises:
+        Whatever ``urlopen`` or ``json.loads`` raise; errors are not caught here.
+    """
+    body = json.dumps({"token": KEY, "query": query,
+                       "startTime": START, "endTime": NOW,
+                       "maxCount": mc}).encode()
+    req = urllib.request.Request(BASE + '/api/powerQuery', data=body,
+                                 headers={"Content-Type": "application/json"})
+    # Context manager closes the HTTP response instead of leaking it.
+    with urllib.request.urlopen(req, timeout=60) as resp:
+        return json.loads(resp.read())
+
+
+def _walk(obj, freq, prefix=''):
+    """Count every non-dict leaf of a nested dict in *freq* under its dotted key path."""
+    for k, v in obj.items():
+        key = f"{prefix}.{k}" if prefix else k
+        if isinstance(v, dict):
+            _walk(v, freq, key)
+        else:
+            freq[key] += 1
+
+
+print("Fetching Avelios Medical sample (max 200, last 72h) ...")
+d = pq("| filter dataSource.name == 'Avelios Medical' | limit 200")
+# `or []` tolerates null 'columns' / 'values' fields in the response.
+cols = [c['name'] if isinstance(c, dict) else c for c in d.get('columns') or []]
+vals = d.get('values') or []
+print(f"Columns returned ({len(cols)}): {cols}")
+print(f"Rows: {len(vals)}")
+print()
+
+# Tally non-null rate per returned column
+counts = {c: 0 for c in cols}
+for row in vals:
+    for c, v in zip(cols, row):
+        if v not in (None, '', 'null'):
+            counts[c] += 1
+print("=== Column populated-rate (out of returned columns) ===")
+for c in cols:
+    n = counts[c]
+    # max(1, ...) avoids a division by zero when no rows came back.
+    pct = round(100 * n / max(1, len(vals)), 1)
+    print(f"  {c:<35} {n:>4} / {len(vals)}   {pct:>5}%")
+
+print()
+print("=== First 2 events (pretty) ===")
+for row in vals[:2]:
+    print(json.dumps(dict(zip(cols, row)), indent=2, default=str)[:1500])
+    print("---")
+
+print()
+print("=== Distinct fields IN the message body (if JSON) ===")
+# If the events carry a structured body, peek inside it
+field_freq = collections.Counter()
+for row in vals:
+    rd = dict(zip(cols, row))
+    msg = rd.get('message') or rd.get('body') or rd.get('attributes')
+    if isinstance(msg, str):
+        try:
+            j = json.loads(msg)
+        except Exception:
+            # Not JSON: this event has no structured body to inspect.
+            continue
+    else:
+        j = msg
+    if isinstance(j, dict):
+        # Helper is defined once above rather than re-created for every row.
+        _walk(j, field_freq)
+for k, c in field_freq.most_common(40):
+    print(f"  {k:<45} in {c:>3} events")
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""Search wider windows for Avelios Medical events."""
+import json, time, urllib.request
+import os
+
+def _load_sdl_cfg():
+    """Locate and parse the SDL JSON config, exiting with status 2 if absent.
+
+    Candidate paths are tried in order: the ``$SDL_CONFIG`` environment
+    variable, ``sdl_config.json`` next to this script, then
+    ``sdl_config.json`` in the parent directory.
+
+    Returns:
+        The parsed JSON content of the first candidate that is a regular file.
+
+    Raises:
+        SystemExit: with code 2 when no candidate file exists.
+    """
+    # Imported locally so the helper does not rely on the module's import list.
+    import json
+    import os
+    import sys
+
+    here = os.path.dirname(os.path.abspath(__file__))
+    candidates = [
+        os.environ.get("SDL_CONFIG"),
+        os.path.join(here, "sdl_config.json"),
+        os.path.join(here, "..", "sdl_config.json"),
+    ]
+    for path in candidates:
+        # isfile (not exists): a directory at a candidate path is skipped
+        # instead of crashing inside open().
+        if path and os.path.isfile(path):
+            # Read as UTF-8 explicitly rather than the locale's default encoding.
+            with open(path, encoding="utf-8") as fh:
+                return json.load(fh)
+    sys.stderr.write(
+        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
+        "(see sdl_config.example.json)\n")
+    sys.exit(2)
+
+
+# Config is loaded at import time; _load_sdl_cfg() exits the process when no file is found.
+CFG = _load_sdl_cfg()
+# Base URL (trailing slash removed) and the key sent as "token" in query bodies.
+BASE, KEY = CFG['base_url'].rstrip('/'), CFG['log_read_key']
+# End of every query window, in epoch milliseconds, fixed once at start-up.
+NOW = int(time.time() * 1000)
+
+
+def pq(query, start_ms, end_ms, mc=5):
+    """POST a PowerQuery to ``/api/powerQuery`` for an explicit time window.
+
+    Args:
+        query: PowerQuery text sent as the ``query`` field.
+        start_ms: Value sent as ``startTime``.
+        end_ms: Value sent as ``endTime``.
+        mc: Value sent as ``maxCount``.
+
+    Returns:
+        The decoded JSON response, or ``{"_err": <message>}`` when the request
+        or decoding fails.
+    """
+    body = json.dumps({"token": KEY, "query": query,
+                       "startTime": start_ms, "endTime": end_ms,
+                       "maxCount": mc}).encode()
+    req = urllib.request.Request(BASE + '/api/powerQuery', data=body,
+                                 headers={"Content-Type": "application/json"})
+    try:
+        # Context manager closes the HTTP response instead of leaking it.
+        with urllib.request.urlopen(req, timeout=60) as resp:
+            return json.loads(resp.read())
+    except Exception as e:
+        # Deliberately broad: one failed window must not stop the other windows.
+        return {"_err": str(e)[:200]}
+
+
+# Scan progressively wider windows for any source whose name mentions 'velio'.
+for days in (1, 3, 7):
+    start = NOW - days * 24 * 3600 * 1000
+    print(f"\n=== last {days}d ===")
+    d = pq("| group n=count() by dataSource.name | sort -n | limit 30", start, NOW, mc=30)
+    if "_err" in d:
+        print(d["_err"])
+        continue
+    # One null-safe row list shared by both loops below.
+    rows = d.get("values") or []
+    for row in rows:
+        name = row[0] if row else None
+        # isinstance guard: a non-string first cell must not crash .lower().
+        if isinstance(name, str) and "velio" in name.lower():
+            print(f"  HIT: {row}")
+    # show top 10 in this window
+    for row in rows[:10]:
+        print(f"  {row}")
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""Probe what PowerQuery syntax this SDL tenant accepts."""
+import json, time, urllib.request, urllib.error, sys
+import os
+
+def _load_sdl_cfg():
+    """Locate and parse the SDL JSON config, exiting with status 2 if absent.
+
+    Candidate paths are tried in order: the ``$SDL_CONFIG`` environment
+    variable, ``sdl_config.json`` next to this script, then
+    ``sdl_config.json`` in the parent directory.
+
+    Returns:
+        The parsed JSON content of the first candidate that is a regular file.
+
+    Raises:
+        SystemExit: with code 2 when no candidate file exists.
+    """
+    # Imported locally so the helper does not rely on the module's import list.
+    import json
+    import os
+    import sys
+
+    here = os.path.dirname(os.path.abspath(__file__))
+    candidates = [
+        os.environ.get("SDL_CONFIG"),
+        os.path.join(here, "sdl_config.json"),
+        os.path.join(here, "..", "sdl_config.json"),
+    ]
+    for path in candidates:
+        # isfile (not exists): a directory at a candidate path is skipped
+        # instead of crashing inside open().
+        if path and os.path.isfile(path):
+            # Read as UTF-8 explicitly rather than the locale's default encoding.
+            with open(path, encoding="utf-8") as fh:
+                return json.load(fh)
+    sys.stderr.write(
+        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
+        "(see sdl_config.example.json)\n")
+    sys.exit(2)
+
+
+# Config is loaded at import time; _load_sdl_cfg() exits the process when no file is found.
+CFG = _load_sdl_cfg()
+# Full PowerQuery endpoint URL used by run().
+URL = CFG['base_url'].rstrip('/') + '/api/powerQuery'
+# Query window in epoch milliseconds, fixed once at start-up.
+END_MS = int(time.time() * 1000)
+START_MS = END_MS - 3600 * 1000  # last hour
+
+
+def run(label: str, query: str):
+    """Send one PowerQuery and print a single [OK ]/[ERR] line describing the outcome.
+
+    Args:
+        label: Short description printed next to the result.
+        query: PowerQuery text to submit.
+
+    Nothing is returned and nothing is raised for request failures: HTTP
+    errors and all other exceptions are reported on stdout.
+    """
+    payload = json.dumps({
+        "token":     CFG['log_read_key'],
+        "query":     query,
+        "startTime": START_MS,
+        "endTime":   END_MS,
+        "maxCount":  5,
+    }).encode()
+    req = urllib.request.Request(URL, data=payload, headers={"Content-Type": "application/json"})
+    try:
+        # Context manager closes the HTTP response instead of leaking it.
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            d = json.loads(resp.read())
+        st = d.get('status', '?')
+        cols = d.get('columns') or []
+        vals = d.get('values') or d.get('matches') or []
+        print(f"[OK ] {label:<40} status={st} cols={len(cols)} rows={len(vals)}")
+        if vals:
+            print(f"      sample={str(vals[0])[:160]}")
+    except urllib.error.HTTPError as e:
+        # errors="replace": an undecodable error body must not raise from
+        # inside this handler.
+        raw = e.read().decode(errors="replace")
+        try:
+            msg = json.loads(raw).get('message', raw)[:200]
+        except (ValueError, AttributeError, TypeError):
+            # Body is not JSON, not an object, or 'message' is not sliceable.
+            msg = raw[:200]
+        print(f"[ERR] {label:<40} HTTP {e.code}: {msg}")
+    except Exception as e:
+        # Deliberately broad: each probe case reports its failure and the run continues.
+        print(f"[ERR] {label:<40} {type(e).__name__}: {str(e)[:160]}")
+
+
+# (label, query) pairs: each one exercises a different PowerQuery spelling.
+CASES = [
+    ("leading-pipe single-stage", "| group total=count()"),
+    ("no-pipe single-stage", "group total=count()"),
+    ("leading-pipe multi-stage",
+     "| group events=count() by dataSource.name | sort -events | limit 5"),
+    ("no-pipe multi-stage",
+     "group events=count() by dataSource.name | sort -events | limit 5"),
+    ("no-pipe trim sort", "group events=count() by dataSource.name | limit 5"),
+    ("filter then group", "dataSource.name=='SentinelOne' | group events=count()"),
+    ("filter (modern keyword)",
+     "filter dataSource.name=='SentinelOne' | group events=count()"),
+    ("dataset-style with sort",
+     "group events=count() by dataSource.name | sort events desc | limit 5"),
+    ("count() as alias", "| count() as events"),
+    ("group by event.type", "group events=count() by event.type | limit 5"),
+]
+
+# Header describing the endpoint and window, then one result line per case.
+print(f"URL: {URL}")
+print(f"Window: last 1h ({START_MS}..{END_MS} ms)")
+print()
+for case in CASES:
+    run(*case)
@@ -0,0 +1,7 @@
+{
+  "_comment": "Copy to sdl_config.json (or set $SDL_CONFIG to its path). Only the keys you need are required. log_read_key for queries; config_read_key for listFiles/getFile (parser sync).",
+  "base_url": "https://xdr.us1.sentinelone.net",
+  "log_read_key":     "REPLACE_WITH_LOG_READ_KEY",
+  "config_read_key":  "REPLACE_WITH_CONFIG_READ_KEY",
+  "console_api_token": "REPLACE_WITH_CONSOLE_API_TOKEN_OR_LEAVE_BLANK"
+}
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+"""
+Pull every parser under /logParsers/ from the SDL tenant and drop it into
+./parsers/ so the SIEM-Toolkit Parser Test Runner can list it.
+
+Auth: config_read_key (falling back to console_api_token) from sdl_config.json or the file named by $SDL_CONFIG
+"""
+from __future__ import annotations
+import json
+import os
+import sys
+import urllib.request
+import urllib.parse
+import urllib.error
+
+def _load_sdl_cfg():
+    """Locate and parse the SDL JSON config, exiting with status 2 if absent.
+
+    Candidate paths are tried in order: the ``$SDL_CONFIG`` environment
+    variable, ``sdl_config.json`` next to this script, then
+    ``sdl_config.json`` in the parent directory.
+
+    Returns:
+        The parsed JSON content of the first candidate that is a regular file.
+
+    Raises:
+        SystemExit: with code 2 when no candidate file exists.
+    """
+    # Imported locally so the helper does not rely on the module's import list.
+    import json
+    import os
+    import sys
+
+    here = os.path.dirname(os.path.abspath(__file__))
+    candidates = [
+        os.environ.get("SDL_CONFIG"),
+        os.path.join(here, "sdl_config.json"),
+        os.path.join(here, "..", "sdl_config.json"),
+    ]
+    for path in candidates:
+        # isfile (not exists): a directory at a candidate path is skipped
+        # instead of crashing inside open().
+        if path and os.path.isfile(path):
+            # Read as UTF-8 explicitly rather than the locale's default encoding.
+            with open(path, encoding="utf-8") as fh:
+                return json.load(fh)
+    sys.stderr.write(
+        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
+        "(see sdl_config.example.json)\n")
+    sys.exit(2)
+
+
+# NOTE(review): SDL_CFG_PATH is not read anywhere in the visible code; _load_sdl_cfg()
+# consults $SDL_CONFIG itself. Confirm nothing else uses it before removing.
+SDL_CFG_PATH = os.environ.get('SDL_CONFIG')  # placeholder; cfg loaded below
+# Output directory: $PARSERS_DIR when set, otherwise ../parsers relative to this script.
+DEST = os.environ.get('PARSERS_DIR', os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'parsers'))
+def call(base_url: str, token: str, path: str, params: dict) -> dict:
+    """POST with JSON body — works for both listFiles and getFile on SDL.
+
+    Args:
+        base_url: Tenant base URL; a trailing slash is ignored.
+        token: Key sent both as a Bearer header and as the ``token`` body field.
+        path: API path appended to ``base_url`` (e.g. ``/api/listFiles``).
+        params: Extra fields merged into the JSON body.
+
+    Returns:
+        The decoded JSON response.
+
+    Raises:
+        RuntimeError: on an HTTP error status, carrying the status code, the
+            path and up to 300 characters of the response body.
+    """
+    url = f"{base_url.rstrip('/')}{path}"
+    payload = json.dumps({**params, "token": token}).encode()
+    req = urllib.request.Request(url, data=payload, headers={
+        "Authorization": f"Bearer {token}",
+        "Content-Type":  "application/json",
+    })
+    try:
+        with urllib.request.urlopen(req, timeout=30) as r:
+            return json.loads(r.read())
+    except urllib.error.HTTPError as e:
+        detail = e.read().decode(errors="replace")[:300]
+        # Chain explicitly so the original HTTPError stays attached as the cause.
+        raise RuntimeError(f"HTTP {e.code} {path}: {detail}") from e
+
+
+def main() -> int:
+    """Download every file listed under /logParsers/ into DEST.
+
+    Existing files with the same name are overwritten. Per-file failures are
+    collected and reported at the end rather than aborting the sync.
+
+    Returns:
+        2 when the config holds no usable token, otherwise 0.
+    """
+    cfg = _load_sdl_cfg()
+    base = cfg["base_url"]
+    # config_read_key first (per docs), fall back to console_api_token
+    token = cfg.get("config_read_key") or cfg.get("console_api_token")
+    if not token:
+        print("No config_read_key or console_api_token in sdl_config.json", file=sys.stderr)
+        return 2
+
+    print(f"Listing /logParsers/ from {base} ...")
+    res = call(base, token, "/api/listFiles", {"pathPrefix": "/logParsers/"})
+    paths = res.get("paths") or []
+    print(f"Found {len(paths)} files under /logParsers/")
+
+    os.makedirs(DEST, exist_ok=True)
+    fetched, failed = 0, []
+
+    for p in paths:
+        # Keep only the final path component so a remote path can never point
+        # outside DEST; basename() also drops platform-specific separators.
+        # NOTE(review): two remote paths with the same final component
+        # overwrite each other.
+        name = os.path.basename(p.rsplit("/", 1)[-1])
+        if name in ("", ".", ".."):
+            name = "_unnamed"
+        try:
+            r = call(base, token, "/api/getFile", {"path": p})
+        except Exception as e:
+            failed.append((p, str(e)))
+            continue
+
+        content = r.get("content")
+        if content is None:
+            failed.append((p, "no content"))
+            continue
+        if not isinstance(content, str):
+            failed.append((p, f"unexpected content type {type(content).__name__}"))
+            continue
+
+        out = os.path.join(DEST, name)
+        try:
+            with open(out, "w", encoding="utf-8") as fh:
+                fh.write(content)
+        except OSError as e:
+            # One unwritable file must not abort the rest of the sync.
+            failed.append((p, f"write failed: {e}"))
+            continue
+        ver = r.get("version", "?")
+        print(f"  + {name:<60} v{ver}  ({len(content)} bytes)")
+        fetched += 1
+
+    print()
+    print(f"Done: fetched={fetched}, failed={len(failed)}")
+    # Only the first 10 failures are listed to keep the summary short.
+    for p, err in failed[:10]:
+        print(f"  ! {p}: {err}")
+    if len(failed) > 10:
+        print(f"  ... and {len(failed) - 10} more")
+    return 0
+
+
+# Run only when executed as a script; main()'s return value becomes the exit status.
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+"""Verify the Parser Test Runner accepts multi-line NDJSON for JSON-mode parsers."""
+import json, urllib.request
+import os
+
+# Three sample events, joined with newlines below into one multi-line NDJSON payload.
+LINES = [
+    '{"timestamp":"2026-05-14T00:00:41.969Z","event_type":"DATA_IMPORT_COMPLETED","event_category":"data_transfer","severity":"INFO","outcome":"success","source":{"application":"Avelios Medical"}}',
+    '{"timestamp":"2026-05-14T00:07:41.969Z","event_type":"PERFORMANCE_DEGRADATION","event_category":"system","severity":"MEDIUM","outcome":"success","source":{"application":"Avelios Medical"}}',
+    '{"timestamp":"2026-05-14T00:24:41.969Z","event_type":"MALWARE_DETECTED","event_category":"security","severity":"CRITICAL","outcome":"detected","source":{"application":"Avelios Medical"},"details":{"malware_name":"Trojan.GenericKD"}}',
+]
+
+body = json.dumps({"parser_name": "Avelios-Medical-OCSF", "log_line": "\n".join(LINES)}).encode()
+req = urllib.request.Request(
+    "http://localhost:8001/api/quality/test-parser",
+    data=body, headers={"Content-Type": "application/json"})
+# Context manager closes the HTTP response instead of leaking it.
+with urllib.request.urlopen(req, timeout=30) as resp:
+    r = json.loads(resp.read())
+
+print(f"matched      = {r.get('matched')}")
+print(f"mode         = {r.get('mode')}")
+print(f"payloads     = {r.get('payload_count')}  (showing {r.get('showing_payload')})")
+print(f"extracted    = {r.get('extracted_count')}")
+print(f"derived      = {r.get('derived_count')}")
+print(f"parse_errors = {r.get('parse_errors')}")
+print()
+print("rewrites applied (first payload):")
+# `or []` tolerates a null 'rewrites_applied' field in the response.
+for rw in (r.get("rewrites_applied") or [])[:10]:
+    print(f"  {rw['input']:<18} -> {rw['output']:<28} = {rw['result']!r}")
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+import json, urllib.request
+import os
+
+# One sample event sent to the local test-parser endpoint as a single log line.
+log = '{"timestamp": "2026-05-14T00:24:41.969Z", "event_id": "d5c76dd2-5320-4b32-bd27-09acedfb5fdb", "event_type": "MALWARE_DETECTED", "event_category": "security", "severity": "CRITICAL", "source": {"application": "Avelios Medical", "module": "SecurityMonitor"}, "outcome": "detected", "details": {"malware_name": "Trojan.GenericKD"}}'
+
+body = json.dumps({"parser_name": "Avelios-Medical-OCSF", "log_line": log}).encode()
+req = urllib.request.Request(
+    "http://localhost:8001/api/quality/test-parser",
+    data=body, headers={"Content-Type": "application/json"})
+# Context manager closes the HTTP response instead of leaking it.
+with urllib.request.urlopen(req, timeout=30) as resp:
+    r = json.loads(resp.read())
+
+print(f"matched={r.get('matched')}  mode={r.get('mode')}  "
+      f"extracted={r.get('extracted_count')}  derived={r.get('derived_count')}")
+print()
+print("json-extract fields (first 12):")
+# `or []` tolerates null 'fields' / 'rewrites_applied' entries in the response.
+json_fields = [x for x in (r.get("fields") or []) if x.get("source") == "json-extract"]
+for f in json_fields[:12]:
+    print(f"  {f['field']:<32} = {str(f['value'])[:50]}")
+print()
+print("rewrites applied:")
+for rw in (r.get("rewrites_applied") or [])[:12]:
+    print(f"  {rw['input']:<18} -> {rw['output']:<28} = {rw['result']!r}")