Fix Parser Test Runner JSON mode, Filter Simulator PQ syntax, dropdown source

- backend/routers/quality.py
 * Add GET /api/quality/parsers (lists actual files in /app/parsers)
 * Support SDL JSON auto-extract parsers ($=json{parse=json}$)
 * Apply parser rewrite blocks with correct $0/$N backref translation
 * Accept single JSON / JSON array / NDJSON in test-parser body
 * Flatten JSON inside 'message' for Field Population coverage
- backend/routers/ingest.py
 * Rewrite simulate-filter PowerQuery to valid SDL syntax
 * Correct field name: src.name -> dataSource.name
- frontend/index.html
 * Parser dropdown loads from /api/quality/parsers
 * Add 'Last 7d' lookback option
 * Render JSON-mode test results with badges + payload counter
This commit is contained in:
marc
2026-05-20 19:40:24 +02:00
parent 6e137438b1
commit 8dbd38f3bb
3 changed files with 219 additions and 20 deletions
+161 -4
View File
@@ -2,11 +2,26 @@ from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from datetime import datetime, timedelta
from services import s1_client
import os
import re
router = APIRouter()
@router.get("/parsers")
def list_parser_files():
    """List parser filenames available under /app/parsers/ for the Test Runner.

    Returns:
        A dict with ``parsers`` (the sorted names of the regular, non-hidden
        files directly inside the directory) and ``count`` (how many names
        were found). A missing directory, or a path that exists but is not a
        directory, yields an empty listing rather than an error.
    """
    parsers_dir = "/app/parsers"
    try:
        # The context manager releases the directory handle even if
        # is_file() raises part-way through the iteration.
        with os.scandir(parsers_dir) as entries:
            names = sorted(
                entry.name
                for entry in entries
                if entry.is_file() and not entry.name.startswith(".")
            )
    except (FileNotFoundError, NotADirectoryError):
        names = []
    return {"parsers": names, "count": len(names)}
def _date_range_hours(hours: int) -> tuple[str, str]:
now = datetime.utcnow()
return (
@@ -52,11 +67,41 @@ class TestParserRequest(BaseModel):
# Helpers
# ---------------------------------------------------------------------------
def _flatten_dict(d: dict, prefix: str = "", out: dict | None = None) -> dict:
"""Recursively flatten a nested dict into dotted keys."""
if out is None:
out = {}
if not isinstance(d, dict):
return out
for k, v in d.items():
key = f"{prefix}.{k}" if prefix else k
if isinstance(v, dict):
_flatten_dict(v, key, out)
else:
out[key] = v
return out
def _flatten_event(event: dict) -> dict:
"""Return a flat field→value dict from a PowerQuery result row."""
if isinstance(event, dict):
return {k: v for k, v in event.items()}
return {}
"""Return a flat field→value dict from a PowerQuery result row.
If the row only carries a JSON-stringified payload in `message` (i.e. the
parser wasn't applied at query time), parse and flatten it inline so the
UI can measure field population accurately. The original raw `message`
is preserved under its own key.
"""
if not isinstance(event, dict):
return {}
flat = dict(event)
msg = flat.get("message")
if isinstance(msg, str) and msg.startswith("{") and msg.endswith("}"):
try:
parsed = __import__("json").loads(msg)
if isinstance(parsed, dict):
flat.update(_flatten_dict(parsed))
except Exception:
pass
return flat
def _extract_format_strings(content: str) -> list[str]:
@@ -204,6 +249,117 @@ async def test_parser(req: TestParserRequest):
format_strings = _extract_format_strings(content)
# ── JSON auto-extract path ──────────────────────────────────────────────
# SDL parsers that use `$=json{parse=json}$` (or any format containing
# `parse=json`) auto-extract every top-level JSON key as an attribute.
# The regex-based path can't model that — handle it explicitly so users
# can test JSON-shaped logs against JSON-mode parsers.
log_input = req.log_line.strip()
is_json_mode = any("parse=json" in f for f in format_strings) or log_input.startswith("{")
if is_json_mode:
import json as _json
# Support multi-line input (one JSON object per line, or a JSON array)
lines = [ln for ln in (l.strip() for l in log_input.splitlines()) if ln]
payloads: list[dict] = []
parse_errors: list[str] = []
# Single line: try direct parse; if it's a JSON array, expand.
if len(lines) == 1:
try:
obj = _json.loads(lines[0])
except Exception as e:
return {
"parser_name": req.parser_name,
"matched": False,
"message": f"Parser expects JSON but log line could not be parsed as JSON: {e}",
"fields": [],
}
if isinstance(obj, list):
payloads = [x for x in obj if isinstance(x, dict)]
elif isinstance(obj, dict):
payloads = [obj]
else:
return {
"parser_name": req.parser_name,
"matched": False,
"message": "Parser expects a JSON object (got scalar).",
"fields": [],
}
else:
# Multi-line: one JSON object per line (NDJSON)
for i, ln in enumerate(lines, 1):
try:
obj = _json.loads(ln)
if isinstance(obj, dict):
payloads.append(obj)
else:
parse_errors.append(f"line {i}: not a JSON object")
except Exception as e:
parse_errors.append(f"line {i}: {e}")
if not payloads:
return {
"parser_name": req.parser_name,
"matched": False,
"message": "No valid JSON objects found. " + " | ".join(parse_errors[:3]),
"fields": [],
}
# Use the first payload for the detail table; report totals.
payload = payloads[0]
extracted = _flatten_dict(payload)
# Apply lightweight rewrites if present (input/output/match/replace blocks).
# We only handle simple literal/regex matches with $0 or string replacements;
# this is best-effort, intended for quick visual verification.
rewrites_applied = []
rewrite_re = re.compile(
r'\{\s*input:\s*"([^"]+)"\s*,\s*output:\s*"([^"]+)"\s*,\s*match:\s*"((?:[^"\\]|\\.)*)"\s*,\s*replace:\s*"((?:[^"\\]|\\.)*)"\s*\}',
re.DOTALL,
)
derived: dict[str, str] = {}
for m in rewrite_re.finditer(content):
in_field, out_field, match_pat, replace_val = m.group(1), m.group(2), m.group(3), m.group(4)
src_val = extracted.get(in_field)
if src_val is None:
continue
try:
m2 = re.search(match_pat, str(src_val))
except re.error:
continue
if not m2:
continue
# SDL uses $0 for whole match, $1.. for groups. Translate to Python
# \g<0>, \g<1>, ... so re.sub doesn't read \0 as a null byte.
def _to_py_backref(s: str) -> str:
return re.sub(r"\$(\d+)", lambda mm: f"\\g<{mm.group(1)}>", s)
try:
val = re.sub(match_pat, _to_py_backref(replace_val), str(src_val), count=1)
except re.error:
val = replace_val
derived[out_field] = val
rewrites_applied.append({
"input": in_field, "input_value": src_val,
"output": out_field, "matched_on": match_pat, "result": val,
})
fields = (
[{"field": k, "value": v, "source": "json-extract"} for k, v in sorted(extracted.items())]
+ [{"field": k, "value": v, "source": "rewrite"} for k, v in sorted(derived.items())]
)
return {
"parser_name": req.parser_name,
"matched": True,
"mode": "json",
"format_matched": "$=json{parse=json}$",
"fields": fields,
"rewrites_applied": rewrites_applied,
"extracted_count": len(extracted),
"derived_count": len(derived),
"payload_count": len(payloads),
"parse_errors": parse_errors,
"showing_payload": 1,
}
# ── Regex format-string path (original) ─────────────────────────────────
for fmt in format_strings:
try:
compiled, py_to_sdl = _sdl_format_to_regex(fmt)
@@ -221,6 +377,7 @@ async def test_parser(req: TestParserRequest):
return {
"parser_name": req.parser_name,
"matched": True,
"mode": "regex",
"format_matched": fmt,
"fields": fields,
}