From 8dbd38f3bb9bec115477730e3f23f7058b27876e Mon Sep 17 00:00:00 2001 From: marc Date: Wed, 20 May 2026 19:40:24 +0200 Subject: [PATCH 1/2] Fix Parser Test Runner JSON mode, Filter Simulator PQ syntax, dropdown source - backend/routers/quality.py * Add GET /api/quality/parsers (lists actual files in /app/parsers) * Support SDL JSON auto-extract parsers ($=json{parse=json}$) * Apply parser rewrite blocks with correct $0/$N backref translation * Accept single JSON / JSON array / NDJSON in test-parser body * Flatten JSON inside 'message' for Field Population coverage - backend/routers/ingest.py * Rewrite simulate-filter PowerQuery to valid SDL syntax * Correct field name: src.name -> dataSource.name - frontend/index.html * Parser dropdown loads from /api/quality/parsers * Add 'Last 7d' lookback option * Render JSON-mode test results with badges + payload counter --- backend/routers/ingest.py | 11 ++- backend/routers/quality.py | 165 ++++++++++++++++++++++++++++++++++++- frontend/index.html | 63 +++++++++++--- 3 files changed, 219 insertions(+), 20 deletions(-) diff --git a/backend/routers/ingest.py b/backend/routers/ingest.py index 5b03f9a..a665731 100644 --- a/backend/routers/ingest.py +++ b/backend/routers/ingest.py @@ -92,12 +92,15 @@ async def simulate_filter(rule: FilterRule): clauses = [] if rule.source: - clauses.append(f'src.name = "{rule.source}"') + clauses.append(f"dataSource.name=='{rule.source}'") if rule.event_type: - clauses.append(f'event.type = "{rule.event_type}"') + clauses.append(f"event.type=='{rule.event_type}'") - filter_expr = " AND ".join(clauses) if clauses else "true" - query = f"| filter {filter_expr} | count() as events" + if clauses: + filter_expr = " and ".join(clauses) + query = f"| filter {filter_expr} | group events=count()" + else: + query = "| group events=count()" try: result = await s1_client.run_powerquery(query, from_dt, to_dt) diff --git a/backend/routers/quality.py b/backend/routers/quality.py index 7b266b7..3e3f8ae 100644 --- a/backend/routers/quality.py +++ b/backend/routers/quality.py @@ -2,11 +2,26 @@ from fastapi import APIRouter, HTTPException from pydantic import BaseModel from datetime import datetime, timedelta from services import s1_client +import os import re router = APIRouter() +@router.get("/parsers") +def list_parser_files(): + """List parser filenames available under /app/parsers/ for the Test Runner.""" + parsers_dir = "/app/parsers" + try: + names = sorted( + e.name for e in os.scandir(parsers_dir) + if e.is_file() and not e.name.startswith(".") + ) + except FileNotFoundError: + names = [] + return {"parsers": names, "count": len(names)} + + def _date_range_hours(hours: int) -> tuple[str, str]: now = datetime.utcnow() return ( @@ -52,11 +67,41 @@ class TestParserRequest(BaseModel): # Helpers # --------------------------------------------------------------------------- +def _flatten_dict(d: dict, prefix: str = "", out: dict | None = None) -> dict: + """Recursively flatten a nested dict into dotted keys.""" + if out is None: + out = {} + if not isinstance(d, dict): + return out + for k, v in d.items(): + key = f"{prefix}.{k}" if prefix else k + if isinstance(v, dict): + _flatten_dict(v, key, out) + else: + out[key] = v + return out + + def _flatten_event(event: dict) -> dict: - """Return a flat field→value dict from a PowerQuery result row.""" - if isinstance(event, dict): - return {k: v for k, v in event.items()} - return {} + """Return a flat field→value dict from a PowerQuery result row. + + If the row only carries a JSON-stringified payload in `message` (i.e. the + parser wasn't applied at query time), parse and flatten it inline so the + UI can measure field population accurately. The original raw `message` + is preserved under its own key. + """ + if not isinstance(event, dict): + return {} + flat = dict(event) + msg = flat.get("message") + if isinstance(msg, str) and msg.startswith("{") and msg.endswith("}"): + try: + parsed = __import__("json").loads(msg) + if isinstance(parsed, dict): + flat.update(_flatten_dict(parsed)) + except Exception: + pass + return flat def _extract_format_strings(content: str) -> list[str]: @@ -204,6 +249,117 @@ async def test_parser(req: TestParserRequest): format_strings = _extract_format_strings(content) + # ── JSON auto-extract path ────────────────────────────────────────────── + # SDL parsers that use `$=json{parse=json}$` (or any format containing + # `parse=json`) auto-extract every top-level JSON key as an attribute. + # The regex-based path can't model that — handle it explicitly so users + # can test JSON-shaped logs against JSON-mode parsers. + log_input = req.log_line.strip() + is_json_mode = any("parse=json" in f for f in format_strings) or log_input.startswith("{") + if is_json_mode: + import json as _json + # Support multi-line input (one JSON object per line, or a JSON array) + lines = [ln for ln in (l.strip() for l in log_input.splitlines()) if ln] + payloads: list[dict] = [] + parse_errors: list[str] = [] + # Single line: try direct parse; if it's a JSON array, expand. + if len(lines) == 1: + try: + obj = _json.loads(lines[0]) + except Exception as e: + return { + "parser_name": req.parser_name, + "matched": False, + "message": f"Parser expects JSON but log line could not be parsed as JSON: {e}", + "fields": [], + } + if isinstance(obj, list): + payloads = [x for x in obj if isinstance(x, dict)] + elif isinstance(obj, dict): + payloads = [obj] + else: + return { + "parser_name": req.parser_name, + "matched": False, + "message": "Parser expects a JSON object (got scalar).", + "fields": [], + } + else: + # Multi-line: one JSON object per line (NDJSON) + for i, ln in enumerate(lines, 1): + try: + obj = _json.loads(ln) + if isinstance(obj, dict): + payloads.append(obj) + else: + parse_errors.append(f"line {i}: not a JSON object") + except Exception as e: + parse_errors.append(f"line {i}: {e}") + + if not payloads: + return { + "parser_name": req.parser_name, + "matched": False, + "message": "No valid JSON objects found. " + " | ".join(parse_errors[:3]), + "fields": [], + } + + # Use the first payload for the detail table; report totals. + payload = payloads[0] + extracted = _flatten_dict(payload) + # Apply lightweight rewrites if present (input/output/match/replace blocks). + # We only handle simple literal/regex matches with $0 or string replacements; + # this is best-effort, intended for quick visual verification. + rewrites_applied = [] + rewrite_re = re.compile( + r'\{\s*input:\s*"([^"]+)"\s*,\s*output:\s*"([^"]+)"\s*,\s*match:\s*"((?:[^"\\]|\\.)*)"\s*,\s*replace:\s*"((?:[^"\\]|\\.)*)"\s*\}', + re.DOTALL, + ) + derived: dict[str, str] = {} + for m in rewrite_re.finditer(content): + in_field, out_field, match_pat, replace_val = m.group(1), m.group(2), m.group(3), m.group(4) + src_val = extracted.get(in_field) + if src_val is None: + continue + try: + m2 = re.search(match_pat, str(src_val)) + except re.error: + continue + if not m2: + continue + # SDL uses $0 for whole match, $1.. for groups. Translate to Python + # \g<0>, \g<1>, ... so re.sub doesn't read \0 as a null byte. + def _to_py_backref(s: str) -> str: + return re.sub(r"\$(\d+)", lambda mm: f"\\g<{mm.group(1)}>", s) + try: + val = re.sub(match_pat, _to_py_backref(replace_val), str(src_val), count=1) + except re.error: + val = replace_val + derived[out_field] = val + rewrites_applied.append({ + "input": in_field, "input_value": src_val, + "output": out_field, "matched_on": match_pat, "result": val, + }) + + fields = ( + [{"field": k, "value": v, "source": "json-extract"} for k, v in sorted(extracted.items())] + + [{"field": k, "value": v, "source": "rewrite"} for k, v in sorted(derived.items())] + ) + return { + "parser_name": req.parser_name, + "matched": True, + "mode": "json", + "format_matched": "$=json{parse=json}$", + "fields": fields, + "rewrites_applied": rewrites_applied, + "extracted_count": len(extracted), + "derived_count": len(derived), + "payload_count": len(payloads), + "parse_errors": parse_errors, + "showing_payload": 1, + } + + # ── Regex format-string path (original) ───────────────────────────────── for fmt in format_strings: try: compiled, py_to_sdl = _sdl_format_to_regex(fmt) @@ -221,6 +377,7 @@ async def test_parser(req: TestParserRequest): return { "parser_name": req.parser_name, "matched": True, + "mode": "regex", "format_matched": fmt, "fields": fields, } diff --git a/frontend/index.html b/frontend/index.html index 6a4723e..f4c5ebb 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -696,6 +696,7 @@ function renderQuality() { + @@ -911,12 +913,21 @@ async function qtLoadParsers() { if (qsSel) qsSel.innerHTML = sourcePlaceholder + sourceOptions if (qpSel) qpSel.innerHTML = sourcePlaceholder + sourceOptions - // Populate parser dropdown + // Populate parser dropdown from /app/parsers/ directory (not from coverage map) const qtSel = document.getElementById('qt-parser') if (qtSel) { - parserNames.forEach(n => { - const o = document.createElement('option'); o.value = n; o.textContent = n; qtSel.appendChild(o) - }) + try { + const p = await apiGet('/api/quality/parsers') + qtSel.innerHTML = '' + ;(p.parsers || []).forEach(n => { + const o = document.createElement('option'); o.value = n; o.textContent = n; qtSel.appendChild(o) + }) + if (!p.parsers || p.parsers.length === 0) { + qtSel.innerHTML = '' + } + } catch (err) { + qtSel.innerHTML = '' + } } } catch(e) { // If no sources synced yet, fall back to empty state with hint @@ -940,26 +951,54 @@ async function qtTest() { if (!r.matched) { document.getElementById('qt-result').innerHTML = `
- ⚠ No format pattern matched this log line. -

The parser's format strings didn't produce a match. Check that the log sample matches the expected format, or that the parser has SDL format strings (some parsers use grok/dottedJson which aren't tested here).

+ ⚠ ${esc(r.message || 'No format pattern matched this log line.')} +

The parser's format strings didn't produce a match. Check that the log sample matches the expected format, or that the parser uses grok/dottedJson which aren't tested here.

` return } - const rows = r.fields.map(f => ` + const extracts = (r.fields || []).filter(f => f.source !== 'rewrite') + const rewrites = (r.fields || []).filter(f => f.source === 'rewrite') + const rowsExtract = extracts.map(f => ` ${esc(f.field)} ${esc(String(f.value))} `).join('') + const rowsRewrite = rewrites.map(f => ` + ${esc(f.field)} + ${esc(String(f.value))} + `).join('') + const modeBadge = r.mode === 'json' + ? 'JSON auto-extract' + : 'regex format' + const counts = r.mode === 'json' + ? `${r.extracted_count} extracted · ${r.derived_count} rewritten` + + (r.payload_count > 1 ? ` · showing payload ${r.showing_payload}/${r.payload_count}` : '') + + `` : '' + const parseWarn = (r.parse_errors && r.parse_errors.length) + ? `
+ ${r.parse_errors.length} line(s) skipped: ${r.parse_errors.slice(0,3).map(esc).join(' | ')}${r.parse_errors.length>3?' …':''} +
` : '' document.getElementById('qt-result').innerHTML = `
- Matched format: ${esc(r.format_matched)} + Matched format: ${esc(r.format_matched)} ${modeBadge} +
${counts}
+ ${parseWarn}
+ + + + + + ${rowsExtract} +
Extracted FieldValue
+ ${rewrites.length ? ` +

Derived (rewrites applied — ${rewrites.length})

- - + + - ${rows} -
FieldExtracted ValueOutput FieldValue
` + ${rowsRewrite} + ` : ''}` } catch(e) { document.getElementById('qt-result').innerHTML = errBox(e.message) } finally { setBtn('btn-qt', false, 'Test') } From d8d62478c05e06a1b3c31d64976cc64c0e488a2a Mon Sep 17 00:00:00 2001 From: marc Date: Wed, 20 May 2026 19:41:00 +0200 Subject: [PATCH 2/2] Add helper scripts: SDL parser sync, PQ probes, test-parser smoke tests --- tools/probe_avelios.py | 100 ++++++++++++++++++++++++++++++++++ tools/probe_avelios_fields.py | 89 ++++++++++++++++++++++++++++++ tools/probe_avelios_wide.py | 53 ++++++++++++++++++ tools/probe_pq_syntax.py | 77 ++++++++++++++++++++++++++ tools/sdl_config.example.json | 7 +++ tools/sync_sdl_parsers.py | 100 ++++++++++++++++++++++++++++++++++ tools/test_avelios_multi.py | 27 +++++++++ tools/test_avelios_parser.py | 22 ++++++++ 8 files changed, 475 insertions(+) create mode 100644 tools/probe_avelios.py create mode 100644 tools/probe_avelios_fields.py create mode 100644 tools/probe_avelios_wide.py create mode 100644 tools/probe_pq_syntax.py create mode 100644 tools/sdl_config.example.json create mode 100644 tools/sync_sdl_parsers.py create mode 100644 tools/test_avelios_multi.py create mode 100644 tools/test_avelios_parser.py diff --git a/tools/probe_avelios.py b/tools/probe_avelios.py new file mode 100644 index 0000000..a1b3e5c --- /dev/null +++ b/tools/probe_avelios.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +"""Probe the SDL tenant to understand why Avelios Medical field-population shows 0%.""" +import json, time, urllib.request, urllib.error +import os + +def _load_sdl_cfg(): + import json as _j, os as _o, sys as _s + here = _o.path.dirname(_o.path.abspath(__file__)) + candidates = [ + _o.environ.get("SDL_CONFIG"), + _o.path.join(here, "sdl_config.json"), + _o.path.join(here, "..", "sdl_config.json"), + ] + for p in candidates: + if p and _o.path.exists(p): + with open(p) as fh: + return _j.load(fh) + _s.stderr.write( + "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json " + "(see sdl_config.example.json)\n") + _s.exit(2) + + +CFG = _load_sdl_cfg() +BASE = CFG['base_url'].rstrip('/') +KEY = CFG['log_read_key'] +END_MS = int(time.time() * 1000) +START_MS = END_MS - 24 * 3600 * 1000 # last 24h + + +def pq(query: str, max_count: int = 10) -> dict: + body = json.dumps({ + "token": KEY, "query": query, + "startTime": START_MS, "endTime": END_MS, + "maxCount": max_count, + }).encode() + req = urllib.request.Request(BASE + '/api/powerQuery', data=body, + headers={"Content-Type": "application/json"}) + try: + return json.loads(urllib.request.urlopen(req, timeout=30).read()) + except urllib.error.HTTPError as e: + return {"_err": f"HTTP {e.code}: {e.read().decode()[:200]}"} + except Exception as e: + return {"_err": str(e)[:200]} + + +def show(label, d): + if "_err" in d: + print(f"[ERR] {label}: {d['_err']}"); return + cols = [c['name'] if isinstance(c, dict) else c for c in d.get('columns', [])] + vals = d.get('values', []) or d.get('matches', []) + print(f"[OK ] {label} cols={cols} rows={len(vals)}") + for v in vals[:8]: + print(f" {v}") + + +# 1. Distinct dataSource.name values containing 'velio' +print("=" * 70) +print("1. Source-name spellings containing 'velio'") +print("=" * 70) +show("by dataSource.name", + pq("| group n=count() by dataSource.name | sort -n | limit 50", max_count=50)) + +# 2. Try a few candidate names +print() +print("=" * 70) +print("2. Try filtering by candidate names") +print("=" * 70) +for cand in ["Avelios Medical", "Avelios-Medical", "Avelios-Medical-OCSF", + "avelios", "Avelios"]: + d = pq(f"| filter dataSource.name == '{cand}' | group n=count()", max_count=1) + n = (d.get('values') or [[None]])[0][0] if 'values' in d else d + print(f" {cand!r:<35} -> {n}") +for cand in ["Avelios Medical", "Avelios-Medical-OCSF", "avelios"]: + d = pq(f"| filter dataSource.name contains '{cand}' | group n=count()", max_count=1) + n = (d.get('values') or [[None]])[0][0] if 'values' in d else d + print(f" contains {cand!r:<25} -> {n}") + +# 3. Sample one raw event to see what column names actually come back +print() +print("=" * 70) +print("3. Sample one event — what keys/columns are returned?") +print("=" * 70) +d = pq("| filter dataSource.name contains 'velio' | limit 1", max_count=1) +if "_err" in d: + print(" ", d["_err"]) +else: + print(" columns:", [c['name'] if isinstance(c, dict) else c for c in d.get('columns', [])][:30]) + print(" first row sample:", str((d.get('values') or [None])[0])[:400]) + +# 4. If we got columns, check which OCSF fields exist +print() +print("=" * 70) +print("4. Field presence in last 24h for Avelios (using columns command)") +print("=" * 70) +d = pq("| filter dataSource.name contains 'velio' | " + "columns dataSource.name, metadata.product.name, metadata.event_code, " + "actor.user.name, src_endpoint.ip, dst_endpoint.ip | limit 5", + max_count=5) +show("columns view", d) diff --git a/tools/probe_avelios_fields.py b/tools/probe_avelios_fields.py new file mode 100644 index 0000000..8e8d22d --- /dev/null +++ b/tools/probe_avelios_fields.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +"""Inspect Avelios Medical events: one query, full row dump, then field stats from Python.""" +import json, time, urllib.request, collections +import os + +def _load_sdl_cfg(): + import json as _j, os as _o, sys as _s + here = _o.path.dirname(_o.path.abspath(__file__)) + candidates = [ + _o.environ.get("SDL_CONFIG"), + _o.path.join(here, "sdl_config.json"), + _o.path.join(here, "..", "sdl_config.json"), + ] + for p in candidates: + if p and _o.path.exists(p): + with open(p) as fh: + return _j.load(fh) + _s.stderr.write( + "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json " + "(see sdl_config.example.json)\n") + _s.exit(2) + + +CFG = _load_sdl_cfg() +BASE, KEY = CFG['base_url'].rstrip('/'), CFG['log_read_key'] +NOW = int(time.time() * 1000) +START = NOW - 72 * 3600 * 1000 # last 3 days + + +def pq(query, mc=200): + body = json.dumps({"token": KEY, "query": query, + "startTime": START, "endTime": NOW, + "maxCount": mc}).encode() + req = urllib.request.Request(BASE + '/api/powerQuery', data=body, + headers={"Content-Type": "application/json"}) + return json.loads(urllib.request.urlopen(req, timeout=60).read()) + + +print("Fetching Avelios Medical sample (max 200, last 72h) ...") +d = pq("| filter dataSource.name == 'Avelios Medical' | limit 200") +cols = [c['name'] if isinstance(c, dict) else c for c in d.get('columns', [])] +vals = d.get('values', []) or [] +print(f"Columns returned ({len(cols)}): {cols}") +print(f"Rows: {len(vals)}") +print() + +# Tally non-null rate per returned column +counts = {c: 0 for c in cols} +for row in vals: + for c, v in zip(cols, row): + if v not in (None, '', 'null'): + counts[c] += 1 +print("=== Column populated-rate (out of returned columns) ===") +for c in cols: + n = counts[c] + pct = round(100 * n / max(1, len(vals)), 1) + print(f" {c:<35} {n:>4} / {len(vals)} {pct:>5}%") + +print() +print("=== First 2 events (pretty) ===") +for row in vals[:2]: + print(json.dumps(dict(zip(cols, row)), indent=2, default=str)[:1500]) + print("---") + +print() +print("=== Distinct fields IN the message body (if JSON) ===") +# If the events carry a structured body, peek inside it +field_freq = collections.Counter() +for row in vals: + rd = dict(zip(cols, row)) + msg = rd.get('message') or rd.get('body') or rd.get('attributes') + if isinstance(msg, str): + try: + j = json.loads(msg) + except Exception: + continue + else: + j = msg + if isinstance(j, dict): + def walk(obj, prefix=''): + for k, v in obj.items(): + key = f"{prefix}.{k}" if prefix else k + if isinstance(v, dict): + walk(v, key) + else: + field_freq[key] += 1 + walk(j) +for k, c in field_freq.most_common(40): + print(f" {k:<45} in {c:>3} events") diff --git a/tools/probe_avelios_wide.py b/tools/probe_avelios_wide.py new file mode 100644 index 0000000..86bf857 --- /dev/null +++ b/tools/probe_avelios_wide.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +"""Search wider windows for Avelios Medical events.""" +import json, time, urllib.request +import os + +def _load_sdl_cfg(): + import json as _j, os as _o, sys as _s + here = _o.path.dirname(_o.path.abspath(__file__)) + candidates = [ + _o.environ.get("SDL_CONFIG"), + _o.path.join(here, "sdl_config.json"), + _o.path.join(here, "..", "sdl_config.json"), + ] + for p in candidates: + if p and _o.path.exists(p): + with open(p) as fh: + return _j.load(fh) + _s.stderr.write( + "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json " + "(see sdl_config.example.json)\n") + _s.exit(2) + + +CFG = _load_sdl_cfg() +BASE, KEY = CFG['base_url'].rstrip('/'), CFG['log_read_key'] +NOW = int(time.time() * 1000) + + +def pq(query, start_ms, end_ms, mc=5): + body = json.dumps({"token": KEY, "query": query, + "startTime": start_ms, "endTime": end_ms, + "maxCount": mc}).encode() + req = urllib.request.Request(BASE + '/api/powerQuery', data=body, + headers={"Content-Type": "application/json"}) + try: + return json.loads(urllib.request.urlopen(req, timeout=60).read()) + except Exception as e: + return {"_err": str(e)[:200]} + + +for days in (1, 3, 7): + start = NOW - days * 24 * 3600 * 1000 + print(f"\n=== last {days}d ===") + d = pq("| group n=count() by dataSource.name | sort -n | limit 30", start, NOW, mc=30) + if "_err" in d: + print(d["_err"]); continue + for row in d.get("values", []): + name = row[0] + if name and "velio" in name.lower(): + print(f" HIT: {row}") + # show top 10 in this window + for row in (d.get("values", []) or [])[:10]: + print(f" {row}") diff --git a/tools/probe_pq_syntax.py b/tools/probe_pq_syntax.py new file mode 100644 index 0000000..128b40d --- /dev/null +++ b/tools/probe_pq_syntax.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +"""Probe what PowerQuery syntax this SDL tenant accepts.""" +import json, time, urllib.request, urllib.error, sys +import os + +def _load_sdl_cfg(): + import json as _j, os as _o, sys as _s + here = _o.path.dirname(_o.path.abspath(__file__)) + candidates = [ + _o.environ.get("SDL_CONFIG"), + _o.path.join(here, "sdl_config.json"), + _o.path.join(here, "..", "sdl_config.json"), + ] + for p in candidates: + if p and _o.path.exists(p): + with open(p) as fh: + return _j.load(fh) + _s.stderr.write( + "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json " + "(see sdl_config.example.json)\n") + _s.exit(2) + + +CFG = _load_sdl_cfg() +URL = CFG['base_url'].rstrip('/') + '/api/powerQuery' +END_MS = int(time.time() * 1000) +START_MS = END_MS - 3600 * 1000 # last hour + + +def run(label: str, query: str): + body = json.dumps({ + "token": CFG['log_read_key'], + "query": query, + "startTime": START_MS, + "endTime": END_MS, + "maxCount": 5, + }).encode() + req = urllib.request.Request(URL, data=body, headers={"Content-Type": "application/json"}) + try: + resp = urllib.request.urlopen(req, timeout=30).read() + d = json.loads(resp) + st = d.get('status', '?') + cols = d.get('columns') or [] + vals = d.get('values') or d.get('matches') or [] + print(f"[OK ] {label:<40} status={st} cols={len(cols)} rows={len(vals)}") + if vals: + print(f" sample={str(vals[0])[:160]}") + except urllib.error.HTTPError as e: + body = e.read().decode() + try: + j = json.loads(body) + msg = j.get('message', body)[:200] + except Exception: + msg = body[:200] + print(f"[ERR] {label:<40} HTTP {e.code}: {msg}") + except Exception as e: + print(f"[ERR] {label:<40} {type(e).__name__}: {str(e)[:160]}") + + +CASES = [ + ("leading-pipe single-stage", "| group total=count()"), + ("no-pipe single-stage", "group total=count()"), + ("leading-pipe multi-stage", "| group events=count() by dataSource.name | sort -events | limit 5"), + ("no-pipe multi-stage", "group events=count() by dataSource.name | sort -events | limit 5"), + ("no-pipe trim sort", "group events=count() by dataSource.name | limit 5"), + ("filter then group", "dataSource.name=='SentinelOne' | group events=count()"), + ("filter (modern keyword)", "filter dataSource.name=='SentinelOne' | group events=count()"), + ("dataset-style with sort", "group events=count() by dataSource.name | sort events desc | limit 5"), + ("count() as alias", "| count() as events"), + ("group by event.type", "group events=count() by event.type | limit 5"), +] + +print(f"URL: {URL}") +print(f"Window: last 1h ({START_MS}..{END_MS} ms)") +print() +for label, q in CASES: + run(label, q) diff --git a/tools/sdl_config.example.json b/tools/sdl_config.example.json new file mode 100644 index 0000000..0c96307 --- /dev/null +++ b/tools/sdl_config.example.json @@ -0,0 +1,7 @@ +{ + "_comment": "Copy to sdl_config.json (or set $SDL_CONFIG to its path). Only the keys you need are required. log_read_key for queries; config_read_key for listFiles/getFile (parser sync).", + "base_url": "https://xdr.us1.sentinelone.net", + "log_read_key": "REPLACE_WITH_LOG_READ_KEY", + "config_read_key": "REPLACE_WITH_CONFIG_READ_KEY", + "console_api_token": "REPLACE_WITH_CONSOLE_API_TOKEN_OR_LEAVE_BLANK" +} diff --git a/tools/sync_sdl_parsers.py b/tools/sync_sdl_parsers.py new file mode 100644 index 0000000..675745a --- /dev/null +++ b/tools/sync_sdl_parsers.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +""" +Pull every parser under /logParsers/ from the SDL tenant and drop it into +./parsers/ so the SIEM-Toolkit Parser Test Runner can list it. + +Auth: config_read_key from sentinelone-sdl-api/config.json +""" +from __future__ import annotations +import json +import os +import sys +import urllib.request +import urllib.parse +import urllib.error + +def _load_sdl_cfg(): + import json as _j, os as _o, sys as _s + here = _o.path.dirname(_o.path.abspath(__file__)) + candidates = [ + _o.environ.get("SDL_CONFIG"), + _o.path.join(here, "sdl_config.json"), + _o.path.join(here, "..", "sdl_config.json"), + ] + for p in candidates: + if p and _o.path.exists(p): + with open(p) as fh: + return _j.load(fh) + _s.stderr.write( + "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json " + "(see sdl_config.example.json)\n") + _s.exit(2) + + +SDL_CFG_PATH = os.environ.get('SDL_CONFIG') # placeholder; cfg loaded below +DEST = os.environ.get('PARSERS_DIR', os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'parsers')) +def call(base_url: str, token: str, path: str, params: dict) -> dict: + """POST with JSON body — works for both listFiles and getFile on SDL.""" + url = f"{base_url.rstrip('/')}{path}" + body = json.dumps({**params, "token": token}).encode() + req = urllib.request.Request(url, data=body, headers={ + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + }) + try: + with urllib.request.urlopen(req, timeout=30) as r: + return json.loads(r.read()) + except urllib.error.HTTPError as e: + body = e.read().decode(errors="replace")[:300] + raise RuntimeError(f"HTTP {e.code} {path}: {body}") + + +def main() -> int: + cfg = _load_sdl_cfg() + base = cfg["base_url"] + # config_read_key first (per docs), fall back to console_api_token + token = cfg.get("config_read_key") or cfg.get("console_api_token") + if not token: + print("No config_read_key or console_api_token in config.json", file=sys.stderr) + return 2 + + print(f"Listing /logParsers/ from {base} ...") + res = call(base, token, "/api/listFiles", {"pathPrefix": "/logParsers/"}) + paths = res.get("paths", []) + print(f"Found {len(paths)} files under /logParsers/") + + os.makedirs(DEST, exist_ok=True) + fetched, skipped, failed = 0, 0, [] + + for p in paths: + # Strip leading /logParsers/, sanitize for filesystem + name = p.rsplit("/", 1)[-1] or "_unnamed" + # Avoid colliding with existing sample files? Always overwrite to keep fresh. + try: + r = call(base, token, "/api/getFile", {"path": p}) + except Exception as e: + failed.append((p, str(e))) + continue + + content = r.get("content") + if content is None: + failed.append((p, "no content")) + continue + + out = os.path.join(DEST, name) + with open(out, "w", encoding="utf-8") as fh: + fh.write(content) + ver = r.get("version", "?") + print(f" + {name:<60} v{ver} ({len(content)} bytes)") + fetched += 1 + + print() + print(f"Done: fetched={fetched}, failed={len(failed)}") + if failed: + for p, err in failed[:10]: + print(f" ! {p}: {err}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tools/test_avelios_multi.py b/tools/test_avelios_multi.py new file mode 100644 index 0000000..c3f2ffd --- /dev/null +++ b/tools/test_avelios_multi.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +"""Verify the Parser Test Runner accepts multi-line NDJSON for JSON-mode parsers.""" +import json, urllib.request +import os + +LINES = [ + '{"timestamp":"2026-05-14T00:00:41.969Z","event_type":"DATA_IMPORT_COMPLETED","event_category":"data_transfer","severity":"INFO","outcome":"success","source":{"application":"Avelios Medical"}}', + '{"timestamp":"2026-05-14T00:07:41.969Z","event_type":"PERFORMANCE_DEGRADATION","event_category":"system","severity":"MEDIUM","outcome":"success","source":{"application":"Avelios Medical"}}', + '{"timestamp":"2026-05-14T00:24:41.969Z","event_type":"MALWARE_DETECTED","event_category":"security","severity":"CRITICAL","outcome":"detected","source":{"application":"Avelios Medical"},"details":{"malware_name":"Trojan.GenericKD"}}', +] + +body = json.dumps({"parser_name": "Avelios-Medical-OCSF", "log_line": "\n".join(LINES)}).encode() +req = urllib.request.Request( + "http://localhost:8001/api/quality/test-parser", + data=body, headers={"Content-Type": "application/json"}) +r = json.loads(urllib.request.urlopen(req, timeout=30).read()) + +print(f"matched = {r.get('matched')}") +print(f"mode = {r.get('mode')}") +print(f"payloads = {r.get('payload_count')} (showing {r.get('showing_payload')})") +print(f"extracted = {r.get('extracted_count')}") +print(f"derived = {r.get('derived_count')}") +print(f"parse_errors = {r.get('parse_errors')}") +print() +print("rewrites applied (first payload):") +for rw in r.get("rewrites_applied", [])[:10]: + print(f" {rw['input']:<18} -> {rw['output']:<28} = {rw['result']!r}") diff --git a/tools/test_avelios_parser.py b/tools/test_avelios_parser.py new file mode 100644 index 0000000..48afcc2 --- /dev/null +++ b/tools/test_avelios_parser.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +import json, urllib.request +import os + +log = '{"timestamp": "2026-05-14T00:24:41.969Z", "event_id": "d5c76dd2-5320-4b32-bd27-09acedfb5fdb", "event_type": "MALWARE_DETECTED", "event_category": "security", "severity": "CRITICAL", "source": {"application": "Avelios Medical", "module": "SecurityMonitor"}, "outcome": "detected", "details": {"malware_name": "Trojan.GenericKD"}}' + +body = json.dumps({"parser_name": "Avelios-Medical-OCSF", "log_line": log}).encode() +req = urllib.request.Request( + "http://localhost:8001/api/quality/test-parser", + data=body, headers={"Content-Type": "application/json"}) +r = json.loads(urllib.request.urlopen(req, timeout=30).read()) + +print(f"matched={r.get('matched')} mode={r.get('mode')} " + f"extracted={r.get('extracted_count')} derived={r.get('derived_count')}") +print() +print("json-extract fields (first 12):") +for f in [x for x in r.get("fields", []) if x.get("source") == "json-extract"][:12]: + print(f" {f['field']:<32} = {str(f['value'])[:50]}") +print() +print("rewrites applied:") +for rw in r.get("rewrites_applied", [])[:12]: + print(f" {rw['input']:<18} -> {rw['output']:<28} = {rw['result']!r}")