Sync upstream features; preserve fork KV scanner, parsers, verifier

Brought in 35 upstream commits (MITRE heatmap, health score, dependency map, PowerQuery playground, onboarding tracker, product grouping, modern UI redesign). Preserved fork additions: backend/routers/quality.py KV scanner, pattern refs, JS keys, JSON mode, /parsers + /sync-from-sdl endpoints parsers/ 96 OCSF + tenant parsers tools/stormshield-verify/ end-to-end ingest regression test .gitignore un-ignored parsers/* CHANGES.md, PATCHES.md
2026-06-08 12:33:51 +00:00 · 2026-05-22 18:19:52 +02:00
parent a7ebcac9a6
commit 7c1687efce
102 changed files with 13912 additions and 178 deletions
@@ -90,28 +90,21 @@ async def simulate_filter(rule: FilterRule):
    """Estimate how many events and GB would be eliminated by an exclusion filter."""
    from_dt, to_dt = _date_range(rule.days)

-    # Build Scalyr filter expression clauses (uses = not ==, SDL syntax)
    clauses = []
    if rule.source:
-        clauses.append(f"dataSource.name = '{rule.source}'")
+        clauses.append(f"dataSource.name=='{rule.source}'")
    if rule.event_type:
-        clauses.append(f"event.type = '{rule.event_type}'")
+        clauses.append(f"event.type=='{rule.event_type}'")

    if clauses:
-        filter_expr = " ".join(clauses)
-        query = f"{filter_expr} | group events=count()"
+        filter_expr = " and ".join(clauses)
+        query = f"| filter {filter_expr} | group events=count()"
    else:
-        query = "dataSource.name != '' | group events=count()"
+        query = "| group events=count()"

    try:
        result = await s1_client.run_powerquery(query, from_dt, to_dt)
-        err = result.get("error") if isinstance(result, dict) else None
-        if err:
-            raise HTTPException(502, f"PowerQuery error: {err}")
-        rows = result.get("events") or []
-        events = rows[0].get("events", 0) if rows else 0
-    except HTTPException:
-        raise
+        events = (result.get("events") or [{}])[0].get("events", 0) if isinstance(result.get("events"), list) else 0
    except Exception as e:
        raise HTTPException(502, f"PowerQuery error: {e}")

@@ -8,13 +8,15 @@ import re
 router = APIRouter()


+PARSERS_DIR = "/app/parsers"
+
+
@router.get("/parsers")
 def list_parser_files():
    """List parser filenames available under /app/parsers/ for the Test Runner."""
-    parsers_dir = "/app/parsers"
    try:
        names = sorted(
-            e.name for e in os.scandir(parsers_dir)
+            e.name for e in os.scandir(PARSERS_DIR)
            if e.is_file() and not e.name.startswith(".")
        )
    except FileNotFoundError:
@@ -22,6 +24,54 @@ def list_parser_files():
    return {"parsers": names, "count": len(names)}


+@router.post("/sync-from-sdl")
+async def sync_parsers_from_sdl():
+    """Download every parser file under /logParsers/ on the SDL tenant into
+    /app/parsers/. After this call returns, the Parser Test Runner dropdown
+    will include all tenant parsers (including custom ones).
+
+    Requires SDL_CONFIG_READ_KEY in .env (Configuration Read scope on the
+    Data Lake API key).
+    """
+    if not s1_client.SDL_CONFIG_READ_KEY:
+        raise HTTPException(
+            400,
+            "SDL_CONFIG_READ_KEY is not set in .env. Generate a Data Lake API key "
+            "with 'Configuration Read' scope in the S1 console and add it to .env."
+        )
+
+    try:
+        names = await s1_client.list_sdl_parsers()
+    except Exception as e:
+        raise HTTPException(502, f"SDL listFiles failed: {e}")
+
+    os.makedirs(PARSERS_DIR, exist_ok=True)
+    downloaded: list[str] = []
+    errors: list[dict] = []
+
+    for name in names:
+        # The path on SDL is /logParsers/<name>; we write to /app/parsers/<sanitized-name>.
+        safe_name = name.replace("/", "_")
+        try:
+            resp = await s1_client.get_sdl_parser(name)
+            content = resp.get("content")
+            if content is None:
+                errors.append({"parser": name, "error": "no content field in response"})
+                continue
+            with open(os.path.join(PARSERS_DIR, safe_name), "w", encoding="utf-8") as fh:
+                fh.write(content)
+            downloaded.append(safe_name)
+        except Exception as e:
+            errors.append({"parser": name, "error": str(e) or e.__class__.__name__})
+
+    return {
+        "downloaded": len(downloaded),
+        "parsers": downloaded,
+        "errors": errors,
+        "directory": PARSERS_DIR,
+    }
+
+
 def _date_range_hours(hours: int) -> tuple[str, str]:
    now = datetime.utcnow()
    return (
@@ -38,7 +88,6 @@ class SampleEventsRequest(BaseModel):
    source: str
    limit: int = 20
    hours: int = 1
-    filter_mode: str = "broad"  # reserved for future use


 class FieldPopulationRequest(BaseModel):
@@ -108,41 +157,22 @@ def _flatten_event(event: dict) -> dict:
 def _extract_format_strings(content: str) -> list[str]:
    """
    Extract SDL format string values from augmented-JSON parser content.
-    Handles both:
-      - quoted keys:   "format": "..."   (valid JSON)
-      - unquoted keys:  format: "..."    (SDL augmented-JSON)
-    Skips commented-out lines (// ...).
+    Matches:  format: "..."   or   "format": "..."   (SDL parser files are
+    JS-style JSON: keys may or may not be quoted). Supports escaped quotes.
    """
-    pattern = re.compile(r'(?<!//)\"?format\"?\s*:\s*"((?:[^"\\]|\\.)*)"')
-    results = []
-    for line in content.splitlines():
-        stripped = line.strip()
-        if stripped.startswith("//"):
-            continue
-        results.extend(pattern.findall(line))
-    return results
+    pattern = re.compile(r'(?:"format"|format)\s*:\s*"((?:[^"\\]|\\.)*)"')
+    return pattern.findall(content)


 def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
    """
    Convert an SDL format string to a compiled Python regex.

-    SDL format strings may start with '.*,' to absorb a syslog header.  When
-    used with re.search that prefix is redundant AND harmful (it forces a comma
-    before the first named field, which won't exist when the log starts with
-    the field directly).  We strip the leading '.*,' so re.search can anchor
-    to the first real field at any position in the line.
+    Returns (compiled_pattern, py_group_to_sdl_field) mapping so callers can
+    translate group names back to the original SDL field names.

-    Internal '.*' wildcards (field separators for skipped fields) are kept as
-    non-greedy '.*?' so they don't consume adjacent named-field values.
-
-    Returns (compiled_pattern, py_group_to_sdl_field).
    Raises re.error if the resulting pattern cannot be compiled.
    """
-    # Strip leading/trailing .* wildcards — re.search handles positioning
-    fmt = re.sub(r'^(\.\*,?)+', '', fmt)
-    fmt = re.sub(r'(,?\.\*)+$', '', fmt)
-
    # Split on $...$ tokens
    token_pattern = re.compile(r'\$([^$]+)\$')
    parts = token_pattern.split(fmt)
@@ -152,25 +182,19 @@ def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
    py_group_to_sdl: dict[str, str] = {}
    seen_groups: dict[str, int] = {}

-    def _escape_literal(s: str) -> str:
-        """Escape literal text but keep internal .* as non-greedy wildcards."""
-        segments = re.split(r'(\.\*)', s)
-        return ''.join(r'.*?' if seg == '.*' else re.escape(seg) for seg in segments)
-
    for i, part in enumerate(parts):
        if i % 2 == 0:
-            # Literal text (possibly containing .* wildcards)
-            regex_parts.append(_escape_literal(part))
+            # Literal text
+            regex_parts.append(re.escape(part))
        else:
            # Token: either "field.name=PATTERN" or just "field.name"
            if '=' in part:
                field_name, pattern = part.split('=', 1)
            else:
                field_name = part
-                # Default: match any non-comma chars (SDL CSV fields)
-                pattern = r'[^,]*'
+                pattern = r'[^\s]+'

-            # Build a valid Python named-group identifier
+            # Build a valid Python group name
            safe = re.sub(r'[.\-]', '_', field_name)
            if safe in seen_groups:
                seen_groups[safe] += 1
@@ -181,53 +205,95 @@ def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
            py_group_to_sdl[safe] = field_name
            regex_parts.append(f'(?P<{safe}>{pattern})')

-    compiled = re.compile(''.join(regex_parts), re.IGNORECASE | re.DOTALL)
+    compiled = re.compile(''.join(regex_parts), re.IGNORECASE)
    return compiled, py_group_to_sdl


+# ---------------------------------------------------------------------------
+# SDL parser helpers: pattern refs, key=value scanner, rewrites
+# ---------------------------------------------------------------------------
+
+def _extract_patterns_block(content: str) -> dict[str, str]:
+    """Extract the top-level `patterns: { name: "regex", ... }` block."""
+    m = re.search(r'patterns\s*:\s*\{', content)
+    if not m:
+        return {}
+    depth, i = 1, m.end()
+    while i < len(content) and depth > 0:
+        c = content[i]
+        if c == '{':
+            depth += 1
+        elif c == '}':
+            depth -= 1
+        i += 1
+    block = content[m.end():i - 1]
+    return dict(re.findall(r'([A-Za-z_]\w*)\s*:\s*"((?:[^"\\]|\\.)*)"', block))
+
+
+def _resolve_pattern_refs(fmt: str, patterns: dict[str, str]) -> str:
+    """Replace $var=PatternName$ with $var=<resolved>$ when PatternName is defined."""
+    if not patterns:
+        return fmt
+
+    def sub(m: re.Match) -> str:
+        token = m.group(1)
+        if '=' in token:
+            name, pat = token.split('=', 1)
+            if pat in patterns:
+                return f"${name}={patterns[pat]}$"
+        return m.group(0)
+    return re.sub(r'\$([^$]+)\$', sub, fmt)
+
+
+_KV_TOKEN_RE = re.compile(r'\$_\$=\$([^$]+)\._\$')
+_KV_SCAN_RE = re.compile(r'([A-Za-z_][\w.-]*)=(?:"((?:[^"\\]|\\.)*)"|([^\s"]+))')
+
+
+def _is_kv_format(fmt: str) -> bool:
+    """SDL key=value scanner idiom: $_$=$<prefix>._$."""
+    return bool(_KV_TOKEN_RE.search(fmt))
+
+
+def _scan_kv(line: str, fmt: str) -> dict[str, str]:
+    """Extract key=value pairs (supports quoted values) and prefix the keys."""
+    m = _KV_TOKEN_RE.search(fmt)
+    prefix = m.group(1) if m else "unmapped"
+    out: dict[str, str] = {}
+    for km in _KV_SCAN_RE.finditer(line):
+        k = km.group(1)
+        v = km.group(2) if km.group(2) is not None else km.group(3)
+        out[f"{prefix}.{k}"] = v
+    return out
+
+
+_REWRITE_RE = re.compile(
+    # JS-style or strict JSON: keys may or may not be quoted, in any order with
+    # commas between. We assume the canonical SDL ordering input/output/match/replace.
+    r'\{\s*(?:"input"|input)\s*:\s*"([^"]+)"\s*,'
+    r'\s*(?:"output"|output)\s*:\s*"([^"]+)"\s*,'
+    r'\s*(?:"match"|match)\s*:\s*"((?:[^"\\]|\\.)*)"\s*,'
+    r'\s*(?:"replace"|replace)\s*:\s*"((?:[^"\\]|\\.)*)"',
+    re.DOTALL,
+)
+
+
+def _extract_rewrites(content: str) -> list[dict]:
+    return [
+        {"input": m.group(1), "output": m.group(2),
+         "match": m.group(3), "replace": m.group(4)}
+        for m in _REWRITE_RE.finditer(content)
+    ]
+
+
+def _to_py_backref(s: str) -> str:
+    """Translate SDL $0/$N backrefs to Python \\g<0>/\\g<N>."""
+    return re.sub(r"\$(\d+)", lambda mm: f"\\g<{mm.group(1)}>", s)
+
+
 # ---------------------------------------------------------------------------
 # Endpoints
 # ---------------------------------------------------------------------------

-@router.post("/sample-unlabelled")
-async def sample_unlabelled(req: SampleEventsRequest):
-    """Return a sample of events that have no dataSource.name — these need parsers.
-    Also runs a count query so the caller can update the banner with the real total.
-    """
-    import asyncio
-    from routers import coverage as _coverage
-
-    filter_expr = "!(dataSource.name = *) !(source = 'scalyr')"
-    from_dt, to_dt = _date_range_hours(req.hours)
-
-    sample_result, count_result = await asyncio.gather(
-        s1_client.run_powerquery(f"{filter_expr} | limit {req.limit}", from_dt, to_dt),
-        s1_client.run_powerquery(f"{filter_expr} | group events=count()", from_dt, to_dt, max_count=50_000_000),
-    )
-
-    rows = sample_result if isinstance(sample_result, list) else (sample_result.get("rows") or sample_result.get("events") or [])
-
-    events = [_flatten_event(row) for row in rows]
-    non_empty_keys: set = set()
-    for ev in events:
-        for k, v in ev.items():
-            if v is not None and v != "" and v != "null":
-                non_empty_keys.add(k)
-    events = [{k: v for k, v in ev.items() if k in non_empty_keys} for ev in events]
-
-    count_rows = count_result.get("events", []) if isinstance(count_result, dict) else []
-    total = count_rows[0].get("events", 0) if count_rows else 0
-    _coverage._unlabelled_event_count = total
-
-    return {
-        "events": events,
-        "count": len(events),
-        "total": total,
-        "hours": req.hours,
-        "columns_seen": sorted(non_empty_keys),
-    }
-
-
@router.post("/sample-events")
 async def sample_events(req: SampleEventsRequest):
    """Return a sample of raw events from a given data source."""
@@ -262,39 +328,21 @@ async def field_population(req: FieldPopulationRequest):
    events = [_flatten_event(row) for row in rows]

    if not events:
-        return {
-            "source": req.source,
-            "total_sampled": 0,
-            "hours": req.hours,
-            "fields": [],
-            "fields_seen_in_sample": [],
-            "message": f"No events found for source '{req.source}' in the last {req.hours} hours.",
-        }
+        raise HTTPException(status_code=404, detail=f"No events found for source '{req.source}' in the last {req.hours} hours.")

    total = len(events)
-    _empty_scalars = {None, "", "null"}
-
-    def _is_empty(val):
-        """Return True if the value counts as unpopulated."""
-        if val is None:
-            return True
-        if isinstance(val, list):
-            return len(val) == 0
-        if isinstance(val, dict):
-            return len(val) == 0
-        return val in _empty_scalars
+    _empty = {None, "", "null"}

    # Collect all field names seen across the sample (useful for surfacing what IS there)
    all_seen_fields = sorted({k for ev in events for k in ev})

-    all_seen_fields_set = set(all_seen_fields)
-
    field_stats = []
    for field in req.fields:
-        # Skip fields that don't appear anywhere in the sample
-        if field not in all_seen_fields_set:
-            continue
-        populated = sum(1 for ev in events if not _is_empty(ev.get(field)))
+        # dataSource.name is always 100% — we filtered by it; Scalyr just doesn't echo it back
+        if field == "dataSource.name":
+            populated = total
+        else:
+            populated = sum(1 for ev in events if ev.get(field) not in _empty)
        rate = round((populated / total) * 100, 1)
        field_stats.append({
            "field": field,
@@ -303,8 +351,8 @@ async def field_population(req: FieldPopulationRequest):
            "rate": rate,
        })

-    # Sort descending by rate (best coverage first)
-    field_stats.sort(key=lambda x: x["rate"], reverse=True)
+    # Sort ascending by rate (worst coverage first)
+    field_stats.sort(key=lambda x: x["rate"])

    return {
        "source": req.source,
@@ -339,10 +387,7 @@ async def test_parser(req: TestParserRequest):
    # The regex-based path can't model that — handle it explicitly so users
    # can test JSON-shaped logs against JSON-mode parsers.
    log_input = req.log_line.strip()
-    # Only enter JSON mode if the log content actually looks like JSON.
-    # Don't force it based on the parser type alone — a JSON-capable parser
-    # should still fall through to regex matching for non-JSON inputs.
-    is_json_mode = log_input.startswith("{") or log_input.startswith("[")
+    is_json_mode = any("parse=json" in f for f in format_strings) or log_input.startswith("{")
    if is_json_mode:
        import json as _json
        # Support multi-line input (one JSON object per line, or a JSON array)
@@ -394,24 +439,18 @@ async def test_parser(req: TestParserRequest):
        # Use the first payload for the detail table; report totals.
        payload = payloads[0]
        extracted = _flatten_dict(payload)
-        # SDL's parse=json puts all keys into unmapped.* namespace first, then
-        # rewrites map unmapped.X -> ocsf.field.  Mirror that so rewrites fire.
-        unmapped_aliases = {f"unmapped.{k}": v for k, v in extracted.items()}
-        extracted_with_unmapped = {**extracted, **unmapped_aliases}
-
        # Apply lightweight rewrites if present (input/output/match/replace blocks).
        # We only handle simple literal/regex matches with $0 or string replacements;
        # this is best-effort, intended for quick visual verification.
        rewrites_applied = []
-        # Handle both quoted keys ("input":) and unquoted keys (input:)
        rewrite_re = re.compile(
-            r'\{\s*"?input"?\s*:\s*"([^"]+)"\s*,\s*"?output"?\s*:\s*"([^"]+)"\s*,\s*"?match"?\s*:\s*"((?:[^"\\]|\\.)*)"\s*,\s*"?replace"?\s*:\s*"((?:[^"\\]|\\.)*)"\s*\}',
+            r'\{\s*input:\s*"([^"]+)"\s*,\s*output:\s*"([^"]+)"\s*,\s*match:\s*"((?:[^"\\]|\\.)*)"\s*,\s*replace:\s*"((?:[^"\\]|\\.)*)"\s*\}',
            re.DOTALL,
        )
        derived: dict[str, str] = {}
        for m in rewrite_re.finditer(content):
            in_field, out_field, match_pat, replace_val = m.group(1), m.group(2), m.group(3), m.group(4)
-            src_val = extracted_with_unmapped.get(in_field)
+            src_val = extracted.get(in_field)
            if src_val is None:
                continue
            try:
@@ -420,10 +459,7 @@ async def test_parser(req: TestParserRequest):
                continue
            if not m2:
                continue
-            # SDL uses $0 for whole match, $1.. for groups. Translate to Python
-            # \g<0>, \g<1>, ... so re.sub doesn't read \0 as a null byte.
-            def _to_py_backref(s: str) -> str:
-                return re.sub(r"\$(\d+)", lambda mm: f"\\g<{mm.group(1)}>", s)
+            # SDL uses $0/$N backrefs; module-level _to_py_backref translates them.
            try:
                val = re.sub(match_pat, _to_py_backref(replace_val), str(src_val), count=1)
            except re.error:
@@ -452,57 +488,82 @@ async def test_parser(req: TestParserRequest):
            "showing_payload": 1,
        }

-    # ── Regex format-string path ─────────────────────────────────────────────
-    def _try_prefix_match(compiled: re.Pattern, py_to_sdl: dict, log_line: str):
-        """
-        Try the full pattern; if it doesn't match, progressively shorten from
-        the right (group by group) until we get a match.  This handles logs
-        that don't include all the trailing optional fields the parser defines.
-        Returns (match, truncated) or (None, False).
-        """
-        m = compiled.search(log_line)
-        if m:
-            return m, False
-
-        # Shorten pattern by removing trailing named groups one at a time
-        p = compiled.pattern
-        # Find all (?P<name>...) group end positions (right to left)
-        group_ends = [m2.end() for m2 in re.finditer(r'\(\?P<[^>]+>[^)]*\)', p)]
-        for end in reversed(group_ends[1:]):   # keep at least 1 group
-            try:
-                shorter = re.compile(p[:end], re.IGNORECASE | re.DOTALL)
-                m2 = shorter.search(log_line)
-                if m2:
-                    return m2, True
-            except re.error:
-                continue
-        return None, False
+    # ── Regex / KV / pattern-ref path ───────────────────────────────────────
+    # Accumulate fields across all matching formats so that a parser like
+    # Stormshield (one format for the timestamp + a KV scanner for the rest +
+    # a third format to drive rewrites) returns a complete picture.
+    patterns_block = _extract_patterns_block(content)
+    extracted_fields: dict[str, str] = {}
+    formats_matched: list[str] = []

    for fmt in format_strings:
+        resolved = _resolve_pattern_refs(fmt, patterns_block)
+
+        # SDL key=value scanner idiom (handles `$_$=$prefix._$` w/ repeat:true)
+        if _is_kv_format(resolved):
+            kv = _scan_kv(req.log_line, resolved)
+            if kv:
+                extracted_fields.update(kv)
+                formats_matched.append(fmt)
+            continue
+
        try:
-            compiled, py_to_sdl = _sdl_format_to_regex(fmt)
+            compiled, py_to_sdl = _sdl_format_to_regex(resolved)
        except re.error:
            continue

-        match, truncated = _try_prefix_match(compiled, py_to_sdl, req.log_line)
+        match = compiled.search(req.log_line)
        if match:
-            fields = [
-                {"field": py_to_sdl.get(group, group), "value": value}
-                for group, value in match.groupdict().items()
-                if value is not None and value != ""
-            ]
-            return {
-                "parser_name": req.parser_name,
-                "matched": True,
-                "mode": "regex",
-                "format_matched": fmt[:120] + ("…" if len(fmt) > 120 else ""),
-                "fields": fields,
-                "note": "Partial match — log has fewer fields than the full parser format" if truncated else None,
-            }
+            for group, value in match.groupdict().items():
+                if value is None:
+                    continue
+                extracted_fields[py_to_sdl.get(group, group)] = value
+            formats_matched.append(fmt)

+    if not extracted_fields:
+        return {
+            "parser_name": req.parser_name,
+            "matched": False,
+            "message": (
+                "No format pattern matched. This parser may use SDL features "
+                "the test runner doesn't model (e.g. dottedJson, grok, multi-line). "
+                "Fields can still be parsed correctly at ingest time."
+            ),
+            "fields": [],
+        }
+
+    # Apply rewrites declared anywhere in the parser file.
+    derived: dict[str, str] = {}
+    rewrites_applied = []
+    for rw in _extract_rewrites(content):
+        src_val = extracted_fields.get(rw["input"])
+        if src_val is None:
+            continue
+        try:
+            if not re.search(rw["match"], str(src_val)):
+                continue
+            val = re.sub(rw["match"], _to_py_backref(rw["replace"]), str(src_val), count=1)
+        except re.error:
+            continue
+        derived[rw["output"]] = val
+        rewrites_applied.append({
+            "input": rw["input"], "input_value": src_val,
+            "output": rw["output"], "matched_on": rw["match"], "result": val,
+        })
+
+    fields = (
+        [{"field": k, "value": v, "source": "extract"}
+         for k, v in sorted(extracted_fields.items())]
+        + [{"field": k, "value": v, "source": "rewrite"}
+           for k, v in sorted(derived.items())]
+    )
    return {
        "parser_name": req.parser_name,
-        "matched": False,
-        "message": "No format pattern matched. Check that the log includes the log-type keyword (e.g. TRAFFIC, THREAT) and enough comma-separated fields.",
-        "fields": [],
+        "matched": True,
+        "mode": "regex",
+        "format_matched": " + ".join(formats_matched) or "(none)",
+        "fields": fields,
+        "rewrites_applied": rewrites_applied,
+        "extracted_count": len(extracted_fields),
+        "derived_count": len(derived),
    }