Add unlabelled event detection, stub parser quality, Sync All, and modern UI redesign

Key changes: - Unlabelled event banner: shows count only after Sample Events is clicked; uses broad SDL filter expression; time window synced to sync-days dropdown - Parser Quality: new "Attributes Missing" subsection listing all parsers without dataSource.name regardless of event volume - Coverage map: filter buttons (All / Complete Parser / Attributes Missing); stat card renamed to "Incomplete Parser"; stub count excluded from sync when no active sources - Sync All button: runs SDL parser sync → library sync → live sources sync in sequence - Reset now clears ActiveSource table and resets unlabelled count cache - run_powerquery: configurable max_count param (default 1000, 50M for count queries) - _DS_NAME_RE: supports both quoted and unquoted dataSource.name keys in parser files - Full modern UI redesign: slate palette, gradient cards, ring borders, pill nav, colored stat accents - Updated 7 tracked parser files synced from SDL Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-10 21:31:19 +00:00 · 2026-05-22 10:00:21 -04:00
parent 0013adbe7e
commit c5a4f796a0
15 changed files with 3498 additions and 469 deletions
@@ -38,6 +38,7 @@ class SampleEventsRequest(BaseModel):
    source: str
    limit: int = 20
    hours: int = 1
+    filter_mode: str = "broad"  # reserved for future use


 class FieldPopulationRequest(BaseModel):
@@ -107,21 +108,41 @@ def _flatten_event(event: dict) -> dict:
 def _extract_format_strings(content: str) -> list[str]:
    """
    Extract SDL format string values from augmented-JSON parser content.
-    Matches:  "format": "..."  (double-quoted value, supports escaped quotes).
+    Handles both:
+      - quoted keys:   "format": "..."   (valid JSON)
+      - unquoted keys:  format: "..."    (SDL augmented-JSON)
+    Skips commented-out lines (// ...).
    """
-    pattern = re.compile(r'"format"\s*:\s*"((?:[^"\\]|\\.)*)"')
-    return pattern.findall(content)
+    pattern = re.compile(r'(?<!//)\"?format\"?\s*:\s*"((?:[^"\\]|\\.)*)"')
+    results = []
+    for line in content.splitlines():
+        stripped = line.strip()
+        if stripped.startswith("//"):
+            continue
+        results.extend(pattern.findall(line))
+    return results


 def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
    """
    Convert an SDL format string to a compiled Python regex.

-    Returns (compiled_pattern, py_group_to_sdl_field) mapping so callers can
-    translate group names back to the original SDL field names.
+    SDL format strings may start with '.*,' to absorb a syslog header.  When
+    used with re.search that prefix is redundant AND harmful (it forces a comma
+    before the first named field, which won't exist when the log starts with
+    the field directly).  We strip the leading '.*,' so re.search can anchor
+    to the first real field at any position in the line.

+    Internal '.*' wildcards (field separators for skipped fields) are kept as
+    non-greedy '.*?' so they don't consume adjacent named-field values.
+
+    Returns (compiled_pattern, py_group_to_sdl_field).
    Raises re.error if the resulting pattern cannot be compiled.
    """
+    # Strip leading/trailing .* wildcards — re.search handles positioning
+    fmt = re.sub(r'^(\.\*,?)+', '', fmt)
+    fmt = re.sub(r'(,?\.\*)+$', '', fmt)
+
    # Split on $...$ tokens
    token_pattern = re.compile(r'\$([^$]+)\$')
    parts = token_pattern.split(fmt)
@@ -131,19 +152,25 @@ def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
    py_group_to_sdl: dict[str, str] = {}
    seen_groups: dict[str, int] = {}

+    def _escape_literal(s: str) -> str:
+        """Escape literal text but keep internal .* as non-greedy wildcards."""
+        segments = re.split(r'(\.\*)', s)
+        return ''.join(r'.*?' if seg == '.*' else re.escape(seg) for seg in segments)
+
    for i, part in enumerate(parts):
        if i % 2 == 0:
-            # Literal text
-            regex_parts.append(re.escape(part))
+            # Literal text (possibly containing .* wildcards)
+            regex_parts.append(_escape_literal(part))
        else:
            # Token: either "field.name=PATTERN" or just "field.name"
            if '=' in part:
                field_name, pattern = part.split('=', 1)
            else:
                field_name = part
-                pattern = r'[^\s]+'
+                # Default: match any non-comma chars (SDL CSV fields)
+                pattern = r'[^,]*'

-            # Build a valid Python group name
+            # Build a valid Python named-group identifier
            safe = re.sub(r'[.\-]', '_', field_name)
            if safe in seen_groups:
                seen_groups[safe] += 1
@@ -154,7 +181,7 @@ def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
            py_group_to_sdl[safe] = field_name
            regex_parts.append(f'(?P<{safe}>{pattern})')

-    compiled = re.compile(''.join(regex_parts), re.IGNORECASE)
+    compiled = re.compile(''.join(regex_parts), re.IGNORECASE | re.DOTALL)
    return compiled, py_group_to_sdl


@@ -162,6 +189,45 @@ def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
 # Endpoints
 # ---------------------------------------------------------------------------

+@router.post("/sample-unlabelled")
+async def sample_unlabelled(req: SampleEventsRequest):
+    """Return a sample of events that have no dataSource.name — these need parsers.
+    Also runs a count query so the caller can update the banner with the real total.
+    """
+    import asyncio
+    from routers import coverage as _coverage
+
+    filter_expr = "!(dataSource.name = *) !(source = 'scalyr')"
+    from_dt, to_dt = _date_range_hours(req.hours)
+
+    sample_result, count_result = await asyncio.gather(
+        s1_client.run_powerquery(f"{filter_expr} | limit {req.limit}", from_dt, to_dt),
+        s1_client.run_powerquery(f"{filter_expr} | group events=count()", from_dt, to_dt, max_count=50_000_000),
+    )
+
+    rows = sample_result if isinstance(sample_result, list) else (sample_result.get("rows") or sample_result.get("events") or [])
+
+    events = [_flatten_event(row) for row in rows]
+    non_empty_keys: set = set()
+    for ev in events:
+        for k, v in ev.items():
+            if v is not None and v != "" and v != "null":
+                non_empty_keys.add(k)
+    events = [{k: v for k, v in ev.items() if k in non_empty_keys} for ev in events]
+
+    count_rows = count_result.get("events", []) if isinstance(count_result, dict) else []
+    total = count_rows[0].get("events", 0) if count_rows else 0
+    _coverage._unlabelled_event_count = total
+
+    return {
+        "events": events,
+        "count": len(events),
+        "total": total,
+        "hours": req.hours,
+        "columns_seen": sorted(non_empty_keys),
+    }
+
+
@router.post("/sample-events")
 async def sample_events(req: SampleEventsRequest):
    """Return a sample of raw events from a given data source."""
@@ -196,21 +262,39 @@ async def field_population(req: FieldPopulationRequest):
    events = [_flatten_event(row) for row in rows]

    if not events:
-        raise HTTPException(status_code=404, detail=f"No events found for source '{req.source}' in the last {req.hours} hours.")
+        return {
+            "source": req.source,
+            "total_sampled": 0,
+            "hours": req.hours,
+            "fields": [],
+            "fields_seen_in_sample": [],
+            "message": f"No events found for source '{req.source}' in the last {req.hours} hours.",
+        }

    total = len(events)
-    _empty = {None, "", "null"}
+    _empty_scalars = {None, "", "null"}
+
+    def _is_empty(val):
+        """Return True if the value counts as unpopulated."""
+        if val is None:
+            return True
+        if isinstance(val, list):
+            return len(val) == 0
+        if isinstance(val, dict):
+            return len(val) == 0
+        return val in _empty_scalars

    # Collect all field names seen across the sample (useful for surfacing what IS there)
    all_seen_fields = sorted({k for ev in events for k in ev})

+    all_seen_fields_set = set(all_seen_fields)
+
    field_stats = []
    for field in req.fields:
-        # dataSource.name is always 100% — we filtered by it; Scalyr just doesn't echo it back
-        if field == "dataSource.name":
-            populated = total
-        else:
-            populated = sum(1 for ev in events if ev.get(field) not in _empty)
+        # Skip fields that don't appear anywhere in the sample
+        if field not in all_seen_fields_set:
+            continue
+        populated = sum(1 for ev in events if not _is_empty(ev.get(field)))
        rate = round((populated / total) * 100, 1)
        field_stats.append({
            "field": field,
@@ -219,8 +303,8 @@ async def field_population(req: FieldPopulationRequest):
            "rate": rate,
        })

-    # Sort ascending by rate (worst coverage first)
-    field_stats.sort(key=lambda x: x["rate"])
+    # Sort descending by rate (best coverage first)
+    field_stats.sort(key=lambda x: x["rate"], reverse=True)

    return {
        "source": req.source,
@@ -255,7 +339,10 @@ async def test_parser(req: TestParserRequest):
    # The regex-based path can't model that — handle it explicitly so users
    # can test JSON-shaped logs against JSON-mode parsers.
    log_input = req.log_line.strip()
-    is_json_mode = any("parse=json" in f for f in format_strings) or log_input.startswith("{")
+    # Only enter JSON mode if the log content actually looks like JSON.
+    # Don't force it based on the parser type alone — a JSON-capable parser
+    # should still fall through to regex matching for non-JSON inputs.
+    is_json_mode = log_input.startswith("{") or log_input.startswith("[")
    if is_json_mode:
        import json as _json
        # Support multi-line input (one JSON object per line, or a JSON array)
@@ -307,18 +394,24 @@ async def test_parser(req: TestParserRequest):
        # Use the first payload for the detail table; report totals.
        payload = payloads[0]
        extracted = _flatten_dict(payload)
+        # SDL's parse=json puts all keys into unmapped.* namespace first, then
+        # rewrites map unmapped.X -> ocsf.field.  Mirror that so rewrites fire.
+        unmapped_aliases = {f"unmapped.{k}": v for k, v in extracted.items()}
+        extracted_with_unmapped = {**extracted, **unmapped_aliases}
+
        # Apply lightweight rewrites if present (input/output/match/replace blocks).
        # We only handle simple literal/regex matches with $0 or string replacements;
        # this is best-effort, intended for quick visual verification.
        rewrites_applied = []
+        # Handle both quoted keys ("input":) and unquoted keys (input:)
        rewrite_re = re.compile(
-            r'\{\s*input:\s*"([^"]+)"\s*,\s*output:\s*"([^"]+)"\s*,\s*match:\s*"((?:[^"\\]|\\.)*)"\s*,\s*replace:\s*"((?:[^"\\]|\\.)*)"\s*\}',
+            r'\{\s*"?input"?\s*:\s*"([^"]+)"\s*,\s*"?output"?\s*:\s*"([^"]+)"\s*,\s*"?match"?\s*:\s*"((?:[^"\\]|\\.)*)"\s*,\s*"?replace"?\s*:\s*"((?:[^"\\]|\\.)*)"\s*\}',
            re.DOTALL,
        )
        derived: dict[str, str] = {}
        for m in rewrite_re.finditer(content):
            in_field, out_field, match_pat, replace_val = m.group(1), m.group(2), m.group(3), m.group(4)
-            src_val = extracted.get(in_field)
+            src_val = extracted_with_unmapped.get(in_field)
            if src_val is None:
                continue
            try:
@@ -359,32 +452,57 @@ async def test_parser(req: TestParserRequest):
            "showing_payload": 1,
        }

-    # ── Regex format-string path (original) ─────────────────────────────────
+    # ── Regex format-string path ─────────────────────────────────────────────
+    def _try_prefix_match(compiled: re.Pattern, py_to_sdl: dict, log_line: str):
+        """
+        Try the full pattern; if it doesn't match, progressively shorten from
+        the right (group by group) until we get a match.  This handles logs
+        that don't include all the trailing optional fields the parser defines.
+        Returns (match, truncated) or (None, False).
+        """
+        m = compiled.search(log_line)
+        if m:
+            return m, False
+
+        # Shorten pattern by removing trailing named groups one at a time
+        p = compiled.pattern
+        # Find all (?P<name>...) group end positions (right to left)
+        group_ends = [m2.end() for m2 in re.finditer(r'\(\?P<[^>]+>[^)]*\)', p)]
+        for end in reversed(group_ends[1:]):   # keep at least 1 group
+            try:
+                shorter = re.compile(p[:end], re.IGNORECASE | re.DOTALL)
+                m2 = shorter.search(log_line)
+                if m2:
+                    return m2, True
+            except re.error:
+                continue
+        return None, False
+
    for fmt in format_strings:
        try:
            compiled, py_to_sdl = _sdl_format_to_regex(fmt)
        except re.error:
-            # Skip unparseable format strings
            continue

-        match = compiled.search(req.log_line)
+        match, truncated = _try_prefix_match(compiled, py_to_sdl, req.log_line)
        if match:
            fields = [
                {"field": py_to_sdl.get(group, group), "value": value}
                for group, value in match.groupdict().items()
-                if value is not None
+                if value is not None and value != ""
            ]
            return {
                "parser_name": req.parser_name,
                "matched": True,
                "mode": "regex",
-                "format_matched": fmt,
+                "format_matched": fmt[:120] + ("…" if len(fmt) > 120 else ""),
                "fields": fields,
+                "note": "Partial match — log has fewer fields than the full parser format" if truncated else None,
            }

    return {
        "parser_name": req.parser_name,
        "matched": False,
-        "message": "No format pattern matched",
+        "message": "No format pattern matched. Check that the log includes the log-type keyword (e.g. TRAFFIC, THREAT) and enough comma-separated fields.",
        "fields": [],
    }