Merge pull request #1 from marcredhat/fix/json-parser-and-pq-syntax

Fix Parser Test Runner JSON mode, Filter Simulator PQ syntax, and parser dropdown
2026-06-11 05:41:19 +00:00 · 2026-05-20 15:25:39 -04:00
parent 6cd9da82da d8d62478c0
commit 0013adbe7e
11 changed files with 694 additions and 20 deletions
@@ -92,12 +92,15 @@ async def simulate_filter(rule: FilterRule):
    clauses = []
    if rule.source:
-        clauses.append(f'src.name = "{rule.source}"')
+        clauses.append(f"dataSource.name=='{rule.source}'")
    if rule.event_type:
-        clauses.append(f'event.type = "{rule.event_type}"')
+        clauses.append(f"event.type=='{rule.event_type}'")
-    filter_expr = " AND ".join(clauses) if clauses else "true"
+    if clauses:
-    query = f"| filter {filter_expr} | count() as events"
+        filter_expr = " and ".join(clauses)
        query = f"| filter {filter_expr} | group events=count()"
    else:
        query = "| group events=count()"
    try:
        result = await s1_client.run_powerquery(query, from_dt, to_dt)
@@ -2,11 +2,26 @@ from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel
 from datetime import datetime, timedelta
 from services import s1_client
 import os
 import re
 router = APIRouter()
@router.get("/parsers")
 def list_parser_files():
    """List parser filenames available under /app/parsers/ for the Test Runner.

    Returns a dict with ``parsers`` (the sorted names of the regular,
    non-hidden files directly inside the directory) and ``count`` (how many
    names were found). If the directory is missing, or the path is not a
    directory, the listing is empty rather than an error.
    """
    parsers_dir = "/app/parsers"
    try:
        # The context manager closes the scandir handle deterministically
        # instead of leaving it to garbage collection.
        with os.scandir(parsers_dir) as entries:
            names = sorted(
                e.name for e in entries
                if e.is_file() and not e.name.startswith(".")
            )
    except (FileNotFoundError, NotADirectoryError):
        # No usable parser directory: report "no parsers" instead of failing.
        names = []
    return {"parsers": names, "count": len(names)}
 def _date_range_hours(hours: int) -> tuple[str, str]:
    now = datetime.utcnow()
    return (
@@ -52,11 +67,41 @@ class TestParserRequest(BaseModel):
 # Helpers
 # ---------------------------------------------------------------------------
 def _flatten_dict(d: dict, prefix: str = "", out: dict | None = None) -> dict:
    """Recursively flatten a nested dict into dotted keys."""
    if out is None:
        out = {}
    if not isinstance(d, dict):
        return out
    for k, v in d.items():
        key = f"{prefix}.{k}" if prefix else k
        if isinstance(v, dict):
            _flatten_dict(v, key, out)
        else:
            out[key] = v
    return out
 def _flatten_event(event: dict) -> dict:
-    """Return a flat field→value dict from a PowerQuery result row."""
+    """Return a flat field→value dict from a PowerQuery result row.
-    if isinstance(event, dict):
+
-        return {k: v for k, v in event.items()}
+    If the row only carries a JSON-stringified payload in `message` (i.e. the
-    return {}
+    parser wasn't applied at query time), parse and flatten it inline so the
    UI can measure field population accurately. The original raw `message`
    is preserved under its own key.
    """
    if not isinstance(event, dict):
        return {}
    flat = dict(event)
    msg = flat.get("message")
    if isinstance(msg, str) and msg.startswith("{") and msg.endswith("}"):
        try:
            parsed = __import__("json").loads(msg)
            if isinstance(parsed, dict):
                flat.update(_flatten_dict(parsed))
        except Exception:
            pass
    return flat
 def _extract_format_strings(content: str) -> list[str]:
@@ -204,6 +249,117 @@ async def test_parser(req: TestParserRequest):
    format_strings = _extract_format_strings(content)
    # ── JSON auto-extract path ──────────────────────────────────────────────
    # SDL parsers that use `$=json{parse=json}$` (or any format containing
    # `parse=json`) auto-extract every top-level JSON key as an attribute.
    # The regex-based path can't model that — handle it explicitly so users
    # can test JSON-shaped logs against JSON-mode parsers.
    log_input = req.log_line.strip()
    is_json_mode = any("parse=json" in f for f in format_strings) or log_input.startswith("{")
    if is_json_mode:
        import json as _json
        # Support multi-line input (one JSON object per line, or a JSON array)
        lines = [ln for ln in (l.strip() for l in log_input.splitlines()) if ln]
        payloads: list[dict] = []
        parse_errors: list[str] = []
        # Single line: try direct parse; if it's a JSON array, expand.
        if len(lines) == 1:
            try:
                obj = _json.loads(lines[0])
            except Exception as e:
                return {
                    "parser_name": req.parser_name,
                    "matched": False,
                    "message": f"Parser expects JSON but log line could not be parsed as JSON: {e}",
                    "fields": [],
                }
            if isinstance(obj, list):
                payloads = [x for x in obj if isinstance(x, dict)]
            elif isinstance(obj, dict):
                payloads = [obj]
            else:
                return {
                    "parser_name": req.parser_name,
                    "matched": False,
                    "message": "Parser expects a JSON object (got scalar).",
                    "fields": [],
                }
        else:
            # Multi-line: one JSON object per line (NDJSON)
            for i, ln in enumerate(lines, 1):
                try:
                    obj = _json.loads(ln)
                    if isinstance(obj, dict):
                        payloads.append(obj)
                    else:
                        parse_errors.append(f"line {i}: not a JSON object")
                except Exception as e:
                    parse_errors.append(f"line {i}: {e}")
        if not payloads:
            return {
                "parser_name": req.parser_name,
                "matched": False,
                "message": "No valid JSON objects found. " + " | ".join(parse_errors[:3]),
                "fields": [],
            }
        # Use the first payload for the detail table; report totals.
        payload = payloads[0]
        extracted = _flatten_dict(payload)
        # Apply lightweight rewrites if present (input/output/match/replace blocks).
        # We only handle simple literal/regex matches with $0 or string replacements;
        # this is best-effort, intended for quick visual verification.
        rewrites_applied = []
        rewrite_re = re.compile(
            r'\{\s*input:\s*"([^"]+)"\s*,\s*output:\s*"([^"]+)"\s*,\s*match:\s*"((?:[^"\\]|\\.)*)"\s*,\s*replace:\s*"((?:[^"\\]|\\.)*)"\s*\}',
            re.DOTALL,
        )
        derived: dict[str, str] = {}
        for m in rewrite_re.finditer(content):
            in_field, out_field, match_pat, replace_val = m.group(1), m.group(2), m.group(3), m.group(4)
            src_val = extracted.get(in_field)
            if src_val is None:
                continue
            try:
                m2 = re.search(match_pat, str(src_val))
            except re.error:
                continue
            if not m2:
                continue
            # SDL uses $0 for whole match, $1.. for groups. Translate to Python
            # \g<0>, \g<1>, ... so re.sub doesn't read \0 as a null byte.
            def _to_py_backref(s: str) -> str:
                return re.sub(r"\$(\d+)", lambda mm: f"\\g<{mm.group(1)}>", s)
            try:
                val = re.sub(match_pat, _to_py_backref(replace_val), str(src_val), count=1)
            except re.error:
                val = replace_val
            derived[out_field] = val
            rewrites_applied.append({
                "input": in_field, "input_value": src_val,
                "output": out_field, "matched_on": match_pat, "result": val,
            })
        fields = (
            [{"field": k, "value": v, "source": "json-extract"} for k, v in sorted(extracted.items())]
            + [{"field": k, "value": v, "source": "rewrite"}     for k, v in sorted(derived.items())]
        )
        return {
            "parser_name": req.parser_name,
            "matched": True,
            "mode": "json",
            "format_matched": "$=json{parse=json}$",
            "fields": fields,
            "rewrites_applied": rewrites_applied,
            "extracted_count": len(extracted),
            "derived_count": len(derived),
            "payload_count": len(payloads),
            "parse_errors": parse_errors,
            "showing_payload": 1,
        }
    # ── Regex format-string path (original) ─────────────────────────────────
    for fmt in format_strings:
        try:
            compiled, py_to_sdl = _sdl_format_to_regex(fmt)
@@ -221,6 +377,7 @@ async def test_parser(req: TestParserRequest):
            return {
                "parser_name": req.parser_name,
                "matched": True,
                "mode": "regex",
                "format_matched": fmt,
                "fields": fields,
            }
@@ -825,6 +825,7 @@ function renderQuality() {
          <option value="6">Last 6h</option>
          <option value="24" selected>Last 24h</option>
          <option value="72">Last 3d</option>
          <option value="168">Last 7d</option>
        </select>
        <select id="qs-limit" class="bg-gray-800 border border-gray-700 rounded-lg px-3 py-2 text-sm text-gray-300 focus:outline-none focus:border-purple-600">
          <option value="10" selected>10 events</option>
@@ -850,6 +851,7 @@ function renderQuality() {
          <option value="6">Last 6h</option>
          <option value="24" selected>Last 24h</option>
          <option value="72">Last 3d</option>
          <option value="168">Last 7d</option>
        </select>
        <button onclick="qpAnalyze()" id="btn-qp"
          class="px-4 py-2 text-sm bg-purple-700 hover:bg-purple-600 rounded-lg text-white transition-colors">Analyze</button>
@@ -1050,12 +1052,21 @@ async function qtLoadParsers() {
    if (qsSel) qsSel.innerHTML = sourcePlaceholder + sourceOptions
    if (qpSel) qpSel.innerHTML = sourcePlaceholder + sourceOptions
-    // Populate parser dropdown
+    // Populate parser dropdown from /app/parsers/ directory (not from coverage map)
    const qtSel = document.getElementById('qt-parser')
    if (qtSel) {
-      parserNames.forEach(n => {
+      try {
-        const o = document.createElement('option'); o.value = n; o.textContent = n; qtSel.appendChild(o)
+        const p = await apiGet('/api/quality/parsers')
-      })
+        qtSel.innerHTML = '<option value="">— select parser —</option>'
        ;(p.parsers || []).forEach(n => {
          const o = document.createElement('option'); o.value = n; o.textContent = n; qtSel.appendChild(o)
        })
        if (!p.parsers || p.parsers.length === 0) {
          qtSel.innerHTML = '<option value="">— no parser files in /app/parsers — drop JSON files there or click "Load SDL Parsers" —</option>'
        }
      } catch (err) {
        qtSel.innerHTML = '<option value="">— could not load parsers: ' + esc(err.message || err) + ' —</option>'
      }
    }
  } catch(e) {
    // If no sources synced yet, fall back to empty state with hint
@@ -1079,26 +1090,54 @@ async function qtTest() {
    if (!r.matched) {
      document.getElementById('qt-result').innerHTML = `
        <div class="p-3 bg-amber-900/30 border border-amber-700/50 rounded-lg text-sm text-amber-300">
-          ⚠ No format pattern matched this log line.
+          ⚠ ${esc(r.message || 'No format pattern matched this log line.')}
-          <p class="text-xs text-amber-500 mt-1">The parser's format strings didn't produce a match. Check that the log sample matches the expected format, or that the parser has SDL format strings (some parsers use grok/dottedJson which aren't tested here).</p>
+          <p class="text-xs text-amber-500 mt-1">The parser's format strings didn't produce a match. Check that the log sample matches the expected format, or that the parser uses grok/dottedJson which aren't tested here.</p>
        </div>`
      return
    }
-    const rows = r.fields.map(f => `<tr class="border-b border-gray-800/40">
+    const extracts = (r.fields || []).filter(f => f.source !== 'rewrite')
    const rewrites = (r.fields || []).filter(f => f.source === 'rewrite')
    const rowsExtract = extracts.map(f => `<tr class="border-b border-gray-800/40">
      <td class="py-1.5 pr-4 font-mono text-xs text-purple-300">${esc(f.field)}</td>
      <td class="py-1.5 font-mono text-xs text-gray-200">${esc(String(f.value))}</td>
    </tr>`).join('')
    const rowsRewrite = rewrites.map(f => `<tr class="border-b border-gray-800/40">
      <td class="py-1.5 pr-4 font-mono text-xs text-emerald-300">${esc(f.field)}</td>
      <td class="py-1.5 font-mono text-xs text-gray-200">${esc(String(f.value))}</td>
    </tr>`).join('')
    const modeBadge = r.mode === 'json'
      ? '<span class="px-2 py-0.5 ml-2 text-xs rounded bg-purple-900/60 border border-purple-700 text-purple-300">JSON auto-extract</span>'
      : '<span class="px-2 py-0.5 ml-2 text-xs rounded bg-blue-900/60 border border-blue-700 text-blue-300">regex format</span>'
    const counts = r.mode === 'json'
      ? `<span class="text-gray-500">${r.extracted_count} extracted · ${r.derived_count} rewritten` +
        (r.payload_count > 1 ? ` · showing payload ${r.showing_payload}/${r.payload_count}` : '') +
        `</span>` : ''
    const parseWarn = (r.parse_errors && r.parse_errors.length)
      ? `<div class="mt-2 p-2 bg-amber-900/30 border border-amber-700/50 rounded text-xs text-amber-300">
           ${r.parse_errors.length} line(s) skipped: ${r.parse_errors.slice(0,3).map(esc).join(' | ')}${r.parse_errors.length>3?' …':''}
         </div>` : ''
    document.getElementById('qt-result').innerHTML = `
      <div class="mb-3 p-2 bg-gray-800/60 rounded text-xs text-gray-500 font-mono break-all">
-        <span class="text-gray-600">Matched format: </span>${esc(r.format_matched)}
+        <span class="text-gray-600">Matched format: </span>${esc(r.format_matched)} ${modeBadge}
        <div class="mt-1">${counts}</div>
        ${parseWarn}
      </div>
      <table class="w-full mb-4">
        <thead><tr class="text-left text-gray-500 border-b border-gray-800">
          <th class="pb-2 pr-4 text-xs font-medium">Extracted Field</th>
          <th class="pb-2 text-xs font-medium">Value</th>
        </tr></thead>
        <tbody>${rowsExtract}</tbody>
      </table>
      ${rewrites.length ? `
      <h4 class="text-xs font-semibold text-emerald-300 mb-2">Derived (rewrites applied — ${rewrites.length})</h4>
      <table class="w-full">
        <thead><tr class="text-left text-gray-500 border-b border-gray-800">
-          <th class="pb-2 pr-4 text-xs font-medium">Field</th>
+          <th class="pb-2 pr-4 text-xs font-medium">Output Field</th>
-          <th class="pb-2 text-xs font-medium">Extracted Value</th>
+          <th class="pb-2 text-xs font-medium">Value</th>
        </tr></thead>
-        <tbody>${rows}</tbody>
+        <tbody>${rowsRewrite}</tbody>
-      </table>`
+      </table>` : ''}`
  } catch(e) {
    document.getElementById('qt-result').innerHTML = errBox(e.message)
  } finally { setBtn('btn-qt', false, 'Test') }
@@ -0,0 +1,100 @@
 #!/usr/bin/env python3
 """Probe the SDL tenant to understand why Avelios Medical field-population shows 0%."""
 import json, time, urllib.request, urllib.error
 import os
 def _load_sdl_cfg():
    import json as _j, os as _o, sys as _s
    here = _o.path.dirname(_o.path.abspath(__file__))
    candidates = [
        _o.environ.get("SDL_CONFIG"),
        _o.path.join(here, "sdl_config.json"),
        _o.path.join(here, "..", "sdl_config.json"),
    ]
    for p in candidates:
        if p and _o.path.exists(p):
            with open(p) as fh:
                return _j.load(fh)
    _s.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    _s.exit(2)
 # Settings are loaded at import time; _load_sdl_cfg() exits the process with
 # status 2 when no config file is found.
 CFG = _load_sdl_cfg()
 # Tenant base URL with any trailing slash removed.
 BASE = CFG['base_url'].rstrip('/')
 # Key sent as the "token" field of every query body.
 KEY  = CFG['log_read_key']
 # Query window in epoch milliseconds, ending now.
 END_MS   = int(time.time() * 1000)
 START_MS = END_MS - 24 * 3600 * 1000   # last 24h
 def pq(query: str, max_count: int = 10) -> dict:
    """Run one PowerQuery over the START_MS..END_MS window.

    ``query`` is the PowerQuery text and ``max_count`` is sent as the
    request's ``maxCount``. Returns the decoded JSON reply. Failures do not
    raise: they come back as ``{"_err": "<message>"}`` with the message cut
    to 200 characters.
    """
    body = json.dumps({
        "token": KEY, "query": query,
        "startTime": START_MS, "endTime": END_MS,
        "maxCount": max_count,
    }).encode()
    req = urllib.request.Request(BASE + '/api/powerQuery', data=body,
                                 headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the response even if decoding fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as e:
        # errors="replace": a non-UTF-8 error page must not raise from inside
        # this handler and hide the HTTP status.
        detail = e.read().decode(errors="replace")[:200]
        return {"_err": f"HTTP {e.code}: {detail}"}
    except Exception as e:
        return {"_err": str(e)[:200]}
 def show(label, d):
    """Print a short summary of a query reply.

    An error reply (one containing ``_err``) prints a single ``[ERR]`` line.
    Otherwise prints an ``[OK ]`` line with the column names and row count,
    followed by up to eight rows. Rows come from ``values``, falling back to
    ``matches``.
    """
    if "_err" in d:
        print(f"[ERR] {label}: {d['_err']}")
        return
    # "or []" so a null/absent field is handled like an empty one instead of
    # failing on iteration or len().
    cols = [c['name'] if isinstance(c, dict) else c for c in d.get('columns') or []]
    vals = d.get('values') or d.get('matches') or []
    print(f"[OK ] {label}  cols={cols}  rows={len(vals)}")
    for v in vals[:8]:
        print(f"     {v}")
 _RULE = "=" * 70


 def _heading(title, blank_before=True):
    """Print *title* between two 70-character rules, optionally after a blank line."""
    if blank_before:
        print()
    print(_RULE)
    print(title)
    print(_RULE)


 def _first_cell_or_reply(reply):
    """Return row 0 / column 0 of *reply*; a reply without 'values' is returned whole."""
    if 'values' not in reply:
        return reply
    rows = reply.get('values') or [[None]]
    return rows[0][0]


 # 1. Event counts per dataSource.name (top 50). The query does not filter on
 #    'velio'; matching spellings have to be picked out of the listing.
 _heading("1. Source-name spellings containing 'velio'", blank_before=False)
 show("by dataSource.name",
      pq("| group n=count() by dataSource.name | sort -n | limit 50", max_count=50))

 # 2. Count events for a handful of candidate names: exact match first, then
 #    substring match.
 _heading("2. Try filtering by candidate names")
 for cand in ("Avelios Medical", "Avelios-Medical", "Avelios-Medical-OCSF",
              "avelios", "Avelios"):
    d = pq(f"| filter dataSource.name == '{cand}' | group n=count()", max_count=1)
    print(f"  {cand!r:<35}  -> {_first_cell_or_reply(d)}")
 for cand in ("Avelios Medical", "Avelios-Medical-OCSF", "avelios"):
    d = pq(f"| filter dataSource.name contains '{cand}' | group n=count()", max_count=1)
    print(f"  contains {cand!r:<25}  -> {_first_cell_or_reply(d)}")

 # 3. Fetch a single matching event and show which columns the reply carries.
 _heading("3. Sample one event — what keys/columns are returned?")
 d = pq("| filter dataSource.name contains 'velio' | limit 1", max_count=1)
 if "_err" not in d:
    col_names = [c['name'] if isinstance(c, dict) else c for c in d.get('columns', [])]
    print("  columns:", col_names[:30])
    first_row = (d.get('values') or [None])[0]
    print("  first row sample:", str(first_row)[:400])
 else:
    print("  ", d["_err"])

 # 4. Request a fixed set of OCSF columns to see which ones come back.
 _heading("4. Field presence in last 24h for Avelios (using columns command)")
 d = pq("| filter dataSource.name contains 'velio' | "
        "columns dataSource.name, metadata.product.name, metadata.event_code, "
        "actor.user.name, src_endpoint.ip, dst_endpoint.ip | limit 5",
        max_count=5)
 show("columns view", d)
@@ -0,0 +1,89 @@
 #!/usr/bin/env python3
 """Inspect Avelios Medical events: one query, full row dump, then field stats from Python."""
 import json, time, urllib.request, collections
 import os
 def _load_sdl_cfg():
    import json as _j, os as _o, sys as _s
    here = _o.path.dirname(_o.path.abspath(__file__))
    candidates = [
        _o.environ.get("SDL_CONFIG"),
        _o.path.join(here, "sdl_config.json"),
        _o.path.join(here, "..", "sdl_config.json"),
    ]
    for p in candidates:
        if p and _o.path.exists(p):
            with open(p) as fh:
                return _j.load(fh)
    _s.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    _s.exit(2)
 # Settings are loaded at import time; _load_sdl_cfg() exits the process with
 # status 2 when no config file is found.
 CFG = _load_sdl_cfg()
 # Tenant base URL (trailing slash removed) and the key sent as the query "token".
 BASE, KEY = CFG['base_url'].rstrip('/'), CFG['log_read_key']
 # Query window in epoch milliseconds, ending now.
 NOW = int(time.time() * 1000)
 START = NOW - 72 * 3600 * 1000          # last 3 days
 def pq(query, mc=200):
    """Run one PowerQuery over the START..NOW window and return the decoded reply.

    ``mc`` is sent as the request's ``maxCount``. Nothing is caught here:
    network, HTTP and JSON decoding errors propagate to the caller.
    """
    body = json.dumps({"token": KEY, "query": query,
                       "startTime": START, "endTime": NOW,
                       "maxCount": mc}).encode()
    req = urllib.request.Request(BASE + '/api/powerQuery', data=body,
                                 headers={"Content-Type": "application/json"})
    # The context manager closes the response even if decoding fails.
    with urllib.request.urlopen(req, timeout=60) as resp:
        return json.loads(resp.read())
 def _tally_leaf_fields(obj, tally, prefix=''):
    """Add one to *tally* for every non-dict leaf of *obj*, keyed by its dotted path."""
    for k, v in obj.items():
        dotted = f"{prefix}.{k}" if prefix else k
        if isinstance(v, dict):
            _tally_leaf_fields(v, tally, dotted)
        else:
            tally[dotted] += 1


 print("Fetching Avelios Medical sample (max 200, last 72h) ...")
 d = pq("| filter dataSource.name == 'Avelios Medical' | limit 200")
 cols = [c['name'] if isinstance(c, dict) else c for c in d.get('columns', [])]
 vals = d.get('values', []) or []
 print(f"Columns returned ({len(cols)}): {cols}")
 print(f"Rows: {len(vals)}")
 print()

 # How many rows carry a non-empty value in each returned column.
 _EMPTY = (None, '', 'null')
 counts = dict.fromkeys(cols, 0)
 for row in vals:
    for c, v in zip(cols, row):
        if v in _EMPTY:
            continue
        counts[c] += 1

 print("=== Column populated-rate (out of returned columns) ===")
 # max(1, ...) keeps the percentage defined when no rows came back.
 denominator = max(1, len(vals))
 for c in cols:
    n = counts[c]
    pct = round(100 * n / denominator, 1)
    print(f"  {c:<35} {n:>4} / {len(vals)}   {pct:>5}%")

 print()
 print("=== First 2 events (pretty) ===")
 for row in vals[:2]:
    event = dict(zip(cols, row))
    print(json.dumps(event, indent=2, default=str)[:1500])
    print("---")

 print()
 print("=== Distinct fields IN the message body (if JSON) ===")
 # Look inside the first non-empty of message / body / attributes: a string is
 # parsed as JSON (rows that fail to parse are skipped), anything else is
 # used as-is; only dict payloads are tallied.
 field_freq = collections.Counter()
 for row in vals:
    rd = dict(zip(cols, row))
    msg = rd.get('message') or rd.get('body') or rd.get('attributes')
    if not isinstance(msg, str):
        j = msg
    else:
        try:
            j = json.loads(msg)
        except Exception:
            continue
    if isinstance(j, dict):
        _tally_leaf_fields(j, field_freq)

 for k, c in field_freq.most_common(40):
    print(f"  {k:<45} in {c:>3} events")
@@ -0,0 +1,53 @@
 #!/usr/bin/env python3
 """Search wider windows for Avelios Medical events."""
 import json, time, urllib.request
 import os
 def _load_sdl_cfg():
    import json as _j, os as _o, sys as _s
    here = _o.path.dirname(_o.path.abspath(__file__))
    candidates = [
        _o.environ.get("SDL_CONFIG"),
        _o.path.join(here, "sdl_config.json"),
        _o.path.join(here, "..", "sdl_config.json"),
    ]
    for p in candidates:
        if p and _o.path.exists(p):
            with open(p) as fh:
                return _j.load(fh)
    _s.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    _s.exit(2)
 # Settings are loaded at import time; _load_sdl_cfg() exits the process with
 # status 2 when no config file is found.
 CFG = _load_sdl_cfg()
 # Tenant base URL (trailing slash removed) and the key sent as the query "token".
 BASE, KEY = CFG['base_url'].rstrip('/'), CFG['log_read_key']
 # Current time in epoch milliseconds; used as the end of every query window.
 NOW = int(time.time() * 1000)
 def pq(query, start_ms, end_ms, mc=5):
    """Run one PowerQuery over the given window and return the decoded reply.

    ``start_ms`` / ``end_ms`` are sent as ``startTime`` / ``endTime`` and
    ``mc`` as ``maxCount``. Failures do not raise: they come back as
    ``{"_err": "<message>"}`` with the message cut to 200 characters.
    """
    body = json.dumps({"token": KEY, "query": query,
                       "startTime": start_ms, "endTime": end_ms,
                       "maxCount": mc}).encode()
    req = urllib.request.Request(BASE + '/api/powerQuery', data=body,
                                 headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the response even if decoding fails.
        with urllib.request.urlopen(req, timeout=60) as resp:
            return json.loads(resp.read())
    except Exception as e:
        return {"_err": str(e)[:200]}
 # For each look-back window: list the top sources by event count, flag any
 # whose name contains "velio" (case-insensitive), then print the first ten.
 for days in (1, 3, 7):
    start = NOW - days * 24 * 3600 * 1000
    print(f"\n=== last {days}d ===")
    d = pq("| group n=count() by dataSource.name | sort -n | limit 30", start, NOW, mc=30)
    if "_err" in d:
        print(d["_err"])
        continue
    # Normalize once so both loops tolerate an absent or null "values".
    rows = d.get("values") or []
    for row in rows:
        name = row[0] if row else None
        # Only string names can be matched; anything else is not a hit.
        if isinstance(name, str) and "velio" in name.lower():
            print(f"  HIT: {row}")
    # show top 10 in this window
    for row in rows[:10]:
        print(f"  {row}")
@@ -0,0 +1,77 @@
 #!/usr/bin/env python3
 """Probe what PowerQuery syntax this SDL tenant accepts."""
 import json, time, urllib.request, urllib.error, sys
 import os
 def _load_sdl_cfg():
    import json as _j, os as _o, sys as _s
    here = _o.path.dirname(_o.path.abspath(__file__))
    candidates = [
        _o.environ.get("SDL_CONFIG"),
        _o.path.join(here, "sdl_config.json"),
        _o.path.join(here, "..", "sdl_config.json"),
    ]
    for p in candidates:
        if p and _o.path.exists(p):
            with open(p) as fh:
                return _j.load(fh)
    _s.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    _s.exit(2)
 # Settings are loaded at import time; _load_sdl_cfg() exits the process with
 # status 2 when no config file is found.
 CFG = _load_sdl_cfg()
 # Full PowerQuery endpoint: base URL (trailing slash removed) + API path.
 URL = CFG['base_url'].rstrip('/') + '/api/powerQuery'
 # Query window in epoch milliseconds, ending now.
 END_MS = int(time.time() * 1000)
 START_MS = END_MS - 3600 * 1000  # last hour
 def run(label: str, query: str):
    """Send one PowerQuery and print a single verdict line for it.

    ``label`` names the case in the output; ``query`` is sent as-is for the
    START_MS..END_MS window with ``maxCount`` 5. On success prints an
    ``[OK ]`` line (status, column count, row count) plus a sample of the
    first row when there is one. On failure prints an ``[ERR]`` line; for an
    HTTP error that is the status code and the reply's ``message`` field,
    or the raw body when the reply is not usable JSON. Nothing is returned
    or raised.
    """
    payload = json.dumps({
        "token":     CFG['log_read_key'],
        "query":     query,
        "startTime": START_MS,
        "endTime":   END_MS,
        "maxCount":  5,
    }).encode()
    req = urllib.request.Request(URL, data=payload, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the response even if decoding fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            d = json.loads(resp.read())
        st = d.get('status', '?')
        cols = d.get('columns') or []
        vals = d.get('values') or d.get('matches') or []
        print(f"[OK ] {label:<40} status={st} cols={len(cols)} rows={len(vals)}")
        if vals:
            print(f"      sample={str(vals[0])[:160]}")
    except urllib.error.HTTPError as e:
        # errors="replace": a non-UTF-8 error page must not raise from inside
        # this handler and hide the HTTP status.
        err_text = e.read().decode(errors="replace")
        try:
            msg = json.loads(err_text).get('message', err_text)[:200]
        except (ValueError, AttributeError, TypeError):
            # Not JSON, not an object, or a non-string message: show the raw body.
            msg = err_text[:200]
        print(f"[ERR] {label:<40} HTTP {e.code}: {msg}")
    except Exception as e:
        print(f"[ERR] {label:<40} {type(e).__name__}: {str(e)[:160]}")
 # (label, query) pairs. Each query is one syntax variant to try against the
 # tenant; run() prints whether the request succeeded or what error came back.
 CASES = [
    ("leading-pipe single-stage",  "| group total=count()"),
    ("no-pipe single-stage",       "group total=count()"),
    ("leading-pipe multi-stage",   "| group events=count() by dataSource.name | sort -events | limit 5"),
    ("no-pipe multi-stage",        "group events=count() by dataSource.name | sort -events | limit 5"),
    ("no-pipe trim sort",          "group events=count() by dataSource.name | limit 5"),
    ("filter then group",          "dataSource.name=='SentinelOne' | group events=count()"),
    ("filter (modern keyword)",    "filter dataSource.name=='SentinelOne' | group events=count()"),
    ("dataset-style with sort",    "group events=count() by dataSource.name | sort events desc | limit 5"),
    ("count() as alias",           "| count() as events"),
    ("group by event.type",        "group events=count() by event.type | limit 5"),
 ]
 # Print the endpoint and window once, then run every case in order.
 print(f"URL: {URL}")
 print(f"Window: last 1h ({START_MS}..{END_MS} ms)")
 print()
 for label, q in CASES:
    run(label, q)
@@ -0,0 +1,7 @@
 {
  "_comment": "Copy to sdl_config.json (or set $SDL_CONFIG to its path). Only the keys you need are required. log_read_key for queries; config_read_key for listFiles/getFile (parser sync).",
  "base_url": "https://xdr.us1.sentinelone.net",
  "log_read_key":     "REPLACE_WITH_LOG_READ_KEY",
  "config_read_key":  "REPLACE_WITH_CONFIG_READ_KEY",
  "console_api_token": "REPLACE_WITH_CONSOLE_API_TOKEN_OR_LEAVE_BLANK"
 }
@@ -0,0 +1,100 @@
 #!/usr/bin/env python3
 """
 Pull every parser under /logParsers/ from the SDL tenant and drop it into
 ./parsers/ so the SIEM-Toolkit Parser Test Runner can list it.
 Auth: config_read_key (falling back to console_api_token) from sdl_config.json, or from the file named by $SDL_CONFIG
 """
 from __future__ import annotations
 import json
 import os
 import sys
 import urllib.request
 import urllib.parse
 import urllib.error
 def _load_sdl_cfg():
    import json as _j, os as _o, sys as _s
    here = _o.path.dirname(_o.path.abspath(__file__))
    candidates = [
        _o.environ.get("SDL_CONFIG"),
        _o.path.join(here, "sdl_config.json"),
        _o.path.join(here, "..", "sdl_config.json"),
    ]
    for p in candidates:
        if p and _o.path.exists(p):
            with open(p) as fh:
                return _j.load(fh)
    _s.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    _s.exit(2)
 # NOTE(review): SDL_CFG_PATH is not read anywhere in the visible part of this
 # script; the config is located by _load_sdl_cfg() — confirm before removing.
 SDL_CFG_PATH = os.environ.get('SDL_CONFIG')  # placeholder; cfg loaded below
 # Output directory: $PARSERS_DIR when set, otherwise ../parsers relative to this script.
 DEST = os.environ.get('PARSERS_DIR', os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'parsers'))
 def call(base_url: str, token: str, path: str, params: dict) -> dict:
    """POST with JSON body — works for both listFiles and getFile on SDL.

    The token is sent both as a Bearer ``Authorization`` header and as a
    ``token`` field merged into ``params``. Returns the decoded JSON reply.

    Raises:
        RuntimeError: on an HTTP error status; the message carries the
            status code, the path and up to 300 characters of the reply
            body, and the original ``HTTPError`` is chained as the cause.
        Other urllib / JSON errors propagate unchanged.
    """
    url = f"{base_url.rstrip('/')}{path}"
    payload = json.dumps({**params, "token": token}).encode()
    req = urllib.request.Request(url, data=payload, headers={
        "Authorization": f"Bearer {token}",
        "Content-Type":  "application/json",
    })
    try:
        with urllib.request.urlopen(req, timeout=30) as r:
            return json.loads(r.read())
    except urllib.error.HTTPError as e:
        detail = e.read().decode(errors="replace")[:300]
        # "from e" marks the HTTPError as the direct cause in tracebacks.
        raise RuntimeError(f"HTTP {e.code} {path}: {detail}") from e
 def main() -> int:
    """Download every file listed under /logParsers/ into DEST.

    Reads the connection settings with ``_load_sdl_cfg()``, lists the remote
    paths with ``/api/listFiles`` and fetches each one with ``/api/getFile``.
    Each file is written to DEST under the last component of its remote
    path, overwriting any existing file of that name. A file that cannot be
    fetched, has unusable content, has an unsafe name or cannot be written
    is recorded as failed and the run continues with the next one.

    Returns 2 when the config holds no usable token, otherwise 0 (also when
    individual files failed; those are listed in the summary). An error
    from the initial listFiles call is not caught and propagates.
    """
    cfg = _load_sdl_cfg()
    base = cfg["base_url"]
    # config_read_key first (per docs), fall back to console_api_token
    token = cfg.get("config_read_key") or cfg.get("console_api_token")
    if not token:
        print("No config_read_key or console_api_token in sdl_config.json", file=sys.stderr)
        return 2

    print(f"Listing /logParsers/ from {base} ...")
    res = call(base, token, "/api/listFiles", {"pathPrefix": "/logParsers/"})
    paths = res.get("paths", [])
    print(f"Found {len(paths)} files under /logParsers/")

    os.makedirs(DEST, exist_ok=True)
    fetched, failed = 0, []
    for p in paths:
        # Keep only the last path component as the local file name.
        name = p.rsplit("/", 1)[-1] or "_unnamed"
        # The name comes from the remote listing: refuse anything that would
        # not resolve to a plain file directly inside DEST.
        if name in (".", "..") or os.path.basename(name) != name:
            failed.append((p, f"unsafe file name {name!r}"))
            continue
        try:
            r = call(base, token, "/api/getFile", {"path": p})
        except Exception as e:
            failed.append((p, str(e)))
            continue
        content = r.get("content")
        if content is None:
            failed.append((p, "no content"))
            continue
        if not isinstance(content, str):
            failed.append((p, f"unexpected content type {type(content).__name__}"))
            continue
        out = os.path.join(DEST, name)
        try:
            # Always overwrite so the local copy stays fresh.
            with open(out, "w", encoding="utf-8") as fh:
                fh.write(content)
        except OSError as e:
            # One unwritable file must not abort the rest of the sync.
            failed.append((p, f"write failed: {e}"))
            continue
        ver = r.get("version", "?")
        print(f"  + {name:<60} v{ver}  ({len(content)} bytes)")
        fetched += 1

    print()
    print(f"Done: fetched={fetched}, failed={len(failed)}")
    # Show at most the first ten failures.
    for p, err in failed[:10]:
        print(f"  ! {p}: {err}")
    return 0
 if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,27 @@
 #!/usr/bin/env python3
 """Verify the Parser Test Runner accepts multi-line NDJSON for JSON-mode parsers."""
 import json, urllib.request
 import os
 # Three sample events, one JSON object per line, posted together as NDJSON.
 LINES = [
    '{"timestamp":"2026-05-14T00:00:41.969Z","event_type":"DATA_IMPORT_COMPLETED","event_category":"data_transfer","severity":"INFO","outcome":"success","source":{"application":"Avelios Medical"}}',
    '{"timestamp":"2026-05-14T00:07:41.969Z","event_type":"PERFORMANCE_DEGRADATION","event_category":"system","severity":"MEDIUM","outcome":"success","source":{"application":"Avelios Medical"}}',
    '{"timestamp":"2026-05-14T00:24:41.969Z","event_type":"MALWARE_DETECTED","event_category":"security","severity":"CRITICAL","outcome":"detected","source":{"application":"Avelios Medical"},"details":{"malware_name":"Trojan.GenericKD"}}',
 ]
 body = json.dumps({"parser_name": "Avelios-Medical-OCSF", "log_line": "\n".join(LINES)}).encode()
 req = urllib.request.Request(
    "http://localhost:8001/api/quality/test-parser",
    data=body, headers={"Content-Type": "application/json"})
 # The context manager closes the response once the reply has been read.
 with urllib.request.urlopen(req, timeout=30) as resp:
    r = json.loads(resp.read())
 # Summary of the reply, then the rewrites reported for the first payload.
 print(f"matched      = {r.get('matched')}")
 print(f"mode         = {r.get('mode')}")
 print(f"payloads     = {r.get('payload_count')}  (showing {r.get('showing_payload')})")
 print(f"extracted    = {r.get('extracted_count')}")
 print(f"derived      = {r.get('derived_count')}")
 print(f"parse_errors = {r.get('parse_errors')}")
 print()
 print("rewrites applied (first payload):")
 for rw in r.get("rewrites_applied", [])[:10]:
    print(f"  {rw['input']:<18} -> {rw['output']:<28} = {rw['result']!r}")
@@ -0,0 +1,22 @@
 #!/usr/bin/env python3
 import json, urllib.request
 import os
 # One sample event posted to the Parser Test Runner as a single JSON log line.
 log = '{"timestamp": "2026-05-14T00:24:41.969Z", "event_id": "d5c76dd2-5320-4b32-bd27-09acedfb5fdb", "event_type": "MALWARE_DETECTED", "event_category": "security", "severity": "CRITICAL", "source": {"application": "Avelios Medical", "module": "SecurityMonitor"}, "outcome": "detected", "details": {"malware_name": "Trojan.GenericKD"}}'
 body = json.dumps({"parser_name": "Avelios-Medical-OCSF", "log_line": log}).encode()
 req = urllib.request.Request(
    "http://localhost:8001/api/quality/test-parser",
    data=body, headers={"Content-Type": "application/json"})
 # The context manager closes the response once the reply has been read.
 with urllib.request.urlopen(req, timeout=30) as resp:
    r = json.loads(resp.read())
 print(f"matched={r.get('matched')}  mode={r.get('mode')}  "
       f"extracted={r.get('extracted_count')}  derived={r.get('derived_count')}")
 print()
 # Up to twelve auto-extracted fields, values cut to 50 characters.
 print("json-extract fields (first 12):")
 for f in [x for x in r.get("fields", []) if x.get("source") == "json-extract"][:12]:
    print(f"  {f['field']:<32} = {str(f['value'])[:50]}")
 print()
 print("rewrites applied:")
 for rw in r.get("rewrites_applied", [])[:12]:
    print(f"  {rw['input']:<18} -> {rw['output']:<28} = {rw['result']!r}")