From 999c0f7b83c08e15a208e26d12315b67f9e3a6ec Mon Sep 17 00:00:00 2001 From: Mick <119439091+mickbrowns1@users.noreply.github.com> Date: Tue, 19 May 2026 12:53:48 -0400 Subject: [PATCH] Add Parser Quality page: Live Event Sampler, Field Population Rate, Parser Test Runner - New /api/quality router with three endpoints: sample-events: pull raw events from a source via PowerQuery field-population: measure % of events with each SDL field populated; surfaces dataSource.name correctly (100% when filtered by it) and returns fields_seen_in_sample so you can see what IS being extracted test-parser: converts SDL \$field=pattern\$ format strings to Python named-group regex and tests against a pasted raw log line - New "Parser Quality" nav item and page with all three tools - Home page card added for Parser Quality - Field population UI shows per-field colour-coded progress bars plus a chip list of fields actually present in the sample Co-Authored-By: Claude Sonnet 4.6 --- backend/main.py | 3 +- backend/routers/quality.py | 233 +++++++++++++++++++++++++++++++++++++ frontend/index.html | 222 +++++++++++++++++++++++++++++++++++ 3 files changed, 457 insertions(+), 1 deletion(-) create mode 100644 backend/routers/quality.py diff --git a/backend/main.py b/backend/main.py index 5e2d532..a17f3eb 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1,7 +1,7 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from db import engine, Base -from routers import coverage, ingest, settings +from routers import coverage, ingest, settings, quality Base.metadata.create_all(bind=engine) @@ -18,6 +18,7 @@ app.add_middleware( app.include_router(coverage.router, prefix="/api/coverage", tags=["Coverage"]) app.include_router(ingest.router, prefix="/api/ingest", tags=["Ingest"]) app.include_router(settings.router, prefix="/api/settings", tags=["Settings"]) +app.include_router(quality.router, prefix="/api/quality", tags=["Quality"]) @app.get("/health") diff --git 
a/backend/routers/quality.py b/backend/routers/quality.py
new file mode 100644
index 0000000..7b266b7
--- /dev/null
+++ b/backend/routers/quality.py
@@ -0,0 +1,314 @@
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+import re
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel, Field
+
+from services import s1_client
+
+router = APIRouter()
+
+# Directory holding parser files; test-parser only reads files below it.
+PARSER_DIR = Path("/app/parsers")
+
+# Maximum number of events sampled by the field-population endpoint.
+POPULATION_SAMPLE_LIMIT = 500
+
+# Timestamp layout used for the PowerQuery time range.
+_TS_FORMAT = "%Y-%m-%dT%H:%M:%S.000Z"
+
+# Characters that could close or escape the quoted source literal in a query.
+_UNSAFE_SOURCE_CHARS = re.compile(r'["\\\r\n]')
+
+# SDL field token: $field$ or $field=PATTERN$.
+_TOKEN_RE = re.compile(r'\$([^$]+)\$')
+
+# "format": "..." entries (double-quoted value, escaped quotes allowed).
+_FORMAT_RE = re.compile(r'"format"\s*:\s*"((?:[^"\\]|\\.)*)"')
+
+
+def _date_range_hours(hours: int) -> tuple[str, str]:
+    """Return (start, end) UTC timestamps spanning the last ``hours`` hours."""
+    # datetime.utcnow() is deprecated; an aware UTC "now" formats identically.
+    now = datetime.now(timezone.utc)
+    start = now - timedelta(hours=hours)
+    return start.strftime(_TS_FORMAT), now.strftime(_TS_FORMAT)
+
+
+# ---------------------------------------------------------------------------
+# Models
+# ---------------------------------------------------------------------------
+
+class SampleEventsRequest(BaseModel):
+    """Body of POST /sample-events."""
+
+    source: str
+    limit: int = Field(20, ge=1)  # interpolated into the query's limit clause
+    hours: int = Field(1, ge=1)  # look-back window; must be positive
+
+
+class FieldPopulationRequest(BaseModel):
+    """Body of POST /field-population."""
+
+    source: str
+    hours: int = Field(24, ge=1)
+    fields: list[str] = [
+        "src.ip",
+        "src.port",
+        "dst.ip",
+        "dst.port",
+        "user.name",
+        "event.type",
+        "src.process.name",
+        "src.process.cmdline",
+        "tgt.file.path",
+        "network.direction",
+        "dataSource.name",
+    ]
+
+
+class TestParserRequest(BaseModel):
+    """Body of POST /test-parser."""
+
+    parser_name: str
+    log_line: str
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _flatten_event(event: dict) -> dict:
+    """Return a flat field→value dict from a PowerQuery result row."""
+    return dict(event) if isinstance(event, dict) else {}
+
+
+def _rows_from_result(result) -> list:
+    """Pull the row list out of a PowerQuery result (list, or dict with rows/events)."""
+    if isinstance(result, list):
+        return result
+    if isinstance(result, dict):
+        return result.get("rows") or result.get("events") or []
+    # Anything else (e.g. None) carries no rows; avoid an AttributeError.
+    return []
+
+
+def _source_query(source: str, limit: int) -> str:
+    """
+    Build the PowerQuery that samples up to ``limit`` events from ``source``.
+
+    Raises HTTPException(400) when the name is blank or contains a quote,
+    backslash or line break, since those could alter the query's structure.
+    """
+    if not source.strip() or _UNSAFE_SOURCE_CHARS.search(source):
+        raise HTTPException(status_code=400, detail="Invalid source name.")
+    return f'| filter dataSource.name = "{source}" | limit {limit}'
+
+
+def _is_populated(value) -> bool:
+    """Return False for None, an empty string or the literal string 'null'."""
+    # Compared with == rather than set membership so unhashable values
+    # (lists, dicts) count as populated instead of raising TypeError.
+    return value is not None and value != "" and value != "null"
+
+
+def _extract_format_strings(content: str) -> list[str]:
+    """
+    Extract SDL format string values from augmented-JSON parser content.
+    Matches: "format": "..." (double-quoted value, supports escaped quotes).
+    Values are returned exactly as written in the file (escapes not decoded).
+    """
+    return _FORMAT_RE.findall(content)
+
+
+def _py_group_name(field_name: str, taken: dict[str, str]) -> str:
+    """Derive a unique, valid Python regex group name from an SDL field name."""
+    base = re.sub(r'\W', '_', field_name)
+    if base[0].isdigit():
+        # Group names must be identifiers, which cannot start with a digit.
+        base = f'_{base}'
+    candidate, suffix = base, 0
+    while candidate in taken:
+        suffix += 1
+        candidate = f'{base}_{suffix}'
+    return candidate
+
+
+def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
+    """
+    Convert an SDL format string to a compiled Python regex.
+
+    Returns (compiled_pattern, py_group_to_sdl_field) mapping so callers can
+    translate group names back to the original SDL field names.
+
+    Raises re.error if the resulting pattern cannot be compiled.
+    """
+    # Splitting on $...$ tokens alternates: literal, token, literal, token, ...
+    parts = _TOKEN_RE.split(fmt)
+
+    regex_parts: list[str] = []
+    py_group_to_sdl: dict[str, str] = {}
+
+    for index, part in enumerate(parts):
+        if index % 2 == 0:
+            # Literal text is matched verbatim.
+            regex_parts.append(re.escape(part))
+            continue
+
+        # Token: either "field.name=PATTERN" or just "field.name"
+        field_name, has_pattern, pattern = part.partition('=')
+        if not has_pattern:
+            pattern = r'[^\s]+'
+
+        if not field_name:
+            # "$=PATTERN$" names no field: match it without capturing, rather
+            # than emitting an invalid empty group name.
+            regex_parts.append(f'(?:{pattern})')
+            continue
+
+        group = _py_group_name(field_name, py_group_to_sdl)
+        py_group_to_sdl[group] = field_name
+        regex_parts.append(f'(?P<{group}>{pattern})')
+
+    compiled = re.compile(''.join(regex_parts), re.IGNORECASE)
+    return compiled, py_group_to_sdl
+
+
+def _resolve_parser_path(parser_name: str) -> Path:
+    """
+    Map a client-supplied parser name to a path below PARSER_DIR.
+
+    Raises HTTPException(400) for empty or absolute names and for names with
+    ".." components, which would otherwise let a request read arbitrary files.
+    """
+    relative = Path(parser_name)
+    if not relative.parts or relative.is_absolute() or ".." in relative.parts:
+        raise HTTPException(status_code=400, detail="Invalid parser name.")
+    return PARSER_DIR / relative
+
+
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+
+@router.post("/sample-events")
+async def sample_events(req: SampleEventsRequest):
+    """Return a sample of raw events from a given data source."""
+    query = _source_query(req.source, req.limit)
+    from_dt, to_dt = _date_range_hours(req.hours)
+
+    result = await s1_client.run_powerquery(query, from_dt, to_dt)
+    events = [_flatten_event(row) for row in _rows_from_result(result)]
+
+    return {
+        "source": req.source,
+        "events": events,
+        "count": len(events),
+        "hours": req.hours,
+    }
+
+
+@router.post("/field-population")
+async def field_population(req: FieldPopulationRequest):
+    """
+    Analyse how consistently each requested field is populated across a sample
+    of events from a data source.
+
+    Raises HTTPException(404) when the source returned no events in the window.
+    """
+    query = _source_query(req.source, POPULATION_SAMPLE_LIMIT)
+    from_dt, to_dt = _date_range_hours(req.hours)
+
+    result = await s1_client.run_powerquery(query, from_dt, to_dt)
+    events = [_flatten_event(row) for row in _rows_from_result(result)]
+
+    if not events:
+        raise HTTPException(status_code=404, detail=f"No events found for source '{req.source}' in the last {req.hours} hours.")
+
+    total = len(events)
+
+    # Collect all field names seen across the sample (useful for surfacing what IS there)
+    all_seen_fields = sorted({k for ev in events for k in ev})
+
+    field_stats = []
+    for field in req.fields:
+        # dataSource.name is always 100% — we filtered by it; Scalyr just doesn't echo it back
+        if field == "dataSource.name":
+            populated = total
+        else:
+            populated = sum(1 for ev in events if _is_populated(ev.get(field)))
+        field_stats.append({
+            "field": field,
+            "populated": populated,
+            "total": total,
+            "rate": round((populated / total) * 100, 1),
+        })
+
+    # Sort ascending by rate (worst coverage first)
+    field_stats.sort(key=lambda stat: stat["rate"])
+
+    return {
+        "source": req.source,
+        "total_sampled": total,
+        "hours": req.hours,
+        "fields": field_stats,
+        "fields_seen_in_sample": all_seen_fields,
+    }
+
+
+@router.post("/test-parser")
+async def test_parser(req: TestParserRequest):
+    """
+    Test a parser against a raw log line by extracting and matching SDL format
+    strings found in the parser file.
+
+    Raises HTTPException 400 (bad name), 404 (missing file) or 500 (unreadable).
+    """
+    parser_path = _resolve_parser_path(req.parser_name)
+
+    try:
+        content = parser_path.read_text(encoding="utf-8")
+    except FileNotFoundError:
+        raise HTTPException(status_code=404, detail=f"Parser file not found: {req.parser_name}") from None
+    except (OSError, UnicodeDecodeError) as exc:
+        raise HTTPException(status_code=500, detail=f"Could not read parser file: {exc}") from exc
+    except ValueError:
+        # Raised for names the OS cannot represent, e.g. an embedded NUL byte.
+        raise HTTPException(status_code=400, detail="Invalid parser name.") from None
+
+    format_strings = _extract_format_strings(content)
+    skipped = 0
+
+    for fmt in format_strings:
+        try:
+            compiled, py_to_sdl = _sdl_format_to_regex(fmt)
+        except re.error:
+            # Best effort: skip format strings that do not compile, but count them.
+            skipped += 1
+            continue
+
+        match = compiled.search(req.log_line)
+        if match is None:
+            continue
+
+        fields = [
+            {"field": py_to_sdl.get(group, group), "value": value}
+            for group, value in match.groupdict().items()
+            if value is not None
+        ]
+        return {
+            "parser_name": req.parser_name,
+            "matched": True,
+            "format_matched": fmt,
+            "fields": fields,
+        }
+
+    return {
+        "parser_name": req.parser_name,
+        "matched": False,
+        "message": "No format pattern matched",
+        "fields": [],
+        "formats_found": len(format_strings),
+        "formats_skipped": skipped,
+    }
diff --git a/frontend/index.html b/frontend/index.html
index 8b35b57..ae80c32 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -20,6 +20,7 @@ Overview Parser Coverage Ingest Dashboard + Parser Quality Onboarding
@@ -122,6 +123,7 @@ function renderHome() {
${homeCard('#/coverage','Parser Coverage Map','Cross-reference SDL parser fields against STAR and Sigma rule fields. Surface parsed-but-unused fields as reduction candidates.','Open Coverage Map','from-purple-700 to-purple-900')} ${homeCard('#/ingest','Ingest Dashboard','Visualize event volume by source and type. Project monthly GB costs and simulate exclusion filters before applying them.','Open Dashboard','from-blue-700 to-blue-900')} + ${homeCard('#/quality','Parser Quality','Sample live events to see which fields landed. Measure field population rates and test parser patterns against raw log lines.','Open Quality Tools','from-amber-700 to-amber-900')} ${homeCard('#/onboarding','Onboarding Accelerator','Step-by-step guide for onboarding a new log source using Claude Code directly — no API key required.','View Guide','from-emerald-700 to-emerald-900')}
`) @@ -637,6 +639,225 @@ async function saveSettings() { } finally { setBtn('st-save', false, 'Save to .env') } } +// ── Parser Quality ──────────────────────────────────────────────────────── + +function renderQuality() { + set(`
+
+

Parser Quality

+

Inspect live events · measure field coverage · test parser patterns

+
+ + +
+

Live Event Sampler

+

Pull recent raw events from a source and see exactly which fields landed — and which are missing.

+
+ + + + +
+
+
+ + +
+

Field Population Rate

+

Sample up to 500 events and measure what % have each key field populated. Low rates flag parser extraction failures.

+
+ + + +
+
+ + +
+
+
+ + +
+

Parser Test Runner

+

Paste a raw log line and pick a loaded parser — see which fields the format patterns would extract without deploying anything.

+
+ + +
+ +
+
+
`) + qtLoadParsers() +} + +// ── Live Event Sampler ───────────────────────────────────────────────────── + +async function qsSample() { + const source = document.getElementById('qs-source').value.trim() + if (!source) { document.getElementById('qs-result').innerHTML = errBox('Enter a source name.'); return } + setBtn('btn-qs', true) + document.getElementById('qs-result').innerHTML = '

Querying data lake…

' + try { + const r = await apiPost('/api/quality/sample-events', { + source, + limit: +document.getElementById('qs-limit').value, + hours: +document.getElementById('qs-hours').value, + }) + if (!r.events?.length) { + document.getElementById('qs-result').innerHTML = '

No events found for this source in the selected window.

' + return + } + // Collect all field names across events + const allFields = [...new Set(r.events.flatMap(e => Object.keys(e)))].sort() + const rows = r.events.map(ev => { + const cells = allFields.map(f => { + const v = ev[f] + const empty = v === null || v === undefined || v === '' || v === 'null' + return `${empty ? '∅' : esc(String(v).slice(0,40))}` + }).join('') + return `${cells}` + }).join('') + const headers = allFields.map(f => `${esc(f)}`).join('') + document.getElementById('qs-result').innerHTML = ` +

${r.count} events · ${r.hours}h window · ${allFields.length} fields seen

+
+ + ${headers} + ${rows} +
+
` + } catch(e) { + document.getElementById('qs-result').innerHTML = errBox(e.message) + } finally { setBtn('btn-qs', false, 'Sample') } +} + +// ── Field Population Rate ────────────────────────────────────────────────── + +async function qpAnalyze() { + const source = document.getElementById('qp-source').value.trim() + if (!source) { document.getElementById('qp-result').innerHTML = errBox('Enter a source name.'); return } + setBtn('btn-qp', true) + document.getElementById('qp-result').innerHTML = '

Sampling events…

' + try { + const fieldsRaw = document.getElementById('qp-fields').value + const fields = fieldsRaw.split(',').map(f => f.trim()).filter(Boolean) + const r = await apiPost('/api/quality/field-population', { + source, hours: +document.getElementById('qp-hours').value, fields + }) + const rows = r.fields.map(f => { + const pct = f.rate + const color = pct >= 80 ? 'bg-emerald-500' : pct >= 40 ? 'bg-amber-500' : 'bg-red-500' + const textColor = pct >= 80 ? 'text-emerald-400' : pct >= 40 ? 'text-amber-400' : 'text-red-400' + return ` + ${esc(f.field)} + ${pct}% + +
+
+
+ + ${f.populated.toLocaleString()} / ${f.total.toLocaleString()} + ` + }).join('') + document.getElementById('qp-result').innerHTML = ` +

${r.total_sampled} events sampled · ${r.hours}h window — sorted by worst coverage first

+ + + + + + + + ${rows} +
FieldRateCoverageEvents
+ ${r.fields_seen_in_sample?.length ? ` +
+

Fields actually present in sample (${r.fields_seen_in_sample.length} total)

+
${r.fields_seen_in_sample.map(f => + `${esc(f)}`).join('')} +
+
` : ''}` + } catch(e) { + document.getElementById('qp-result').innerHTML = errBox(e.message) + } finally { setBtn('btn-qp', false, 'Analyze') } +} + +// ── Parser Test Runner ───────────────────────────────────────────────────── + +async function qtLoadParsers() { + try { + const r = await apiGet('/api/coverage/map') + const names = [...new Set((r.sources || []).map(s => s.parser).filter(Boolean))].sort() + const sel = document.getElementById('qt-parser') + if (!sel) return + names.forEach(n => { + const o = document.createElement('option'); o.value = n; o.textContent = n; sel.appendChild(o) + }) + } catch {} +} + +async function qtTest() { + const parser = document.getElementById('qt-parser').value + const log = document.getElementById('qt-log').value.trim() + if (!parser) { document.getElementById('qt-result').innerHTML = errBox('Select a parser.'); return } + if (!log) { document.getElementById('qt-result').innerHTML = errBox('Paste a log line.'); return } + setBtn('btn-qt', true) + document.getElementById('qt-result').innerHTML = '

Testing…

' + try { + const r = await apiPost('/api/quality/test-parser', { parser_name: parser, log_line: log }) + if (!r.matched) { + document.getElementById('qt-result').innerHTML = ` +
+ ⚠ No format pattern matched this log line. +

The parser's format strings didn't produce a match. Check that the log sample matches the expected format, or that the parser has SDL format strings (some parsers use grok/dottedJson which aren't tested here).

+
` + return + } + const rows = r.fields.map(f => ` + ${esc(f.field)} + ${esc(String(f.value))} + `).join('') + document.getElementById('qt-result').innerHTML = ` +
+ Matched format: ${esc(r.format_matched)} +
+ + + + + + ${rows} +
FieldExtracted Value
` + } catch(e) { + document.getElementById('qt-result').innerHTML = errBox(e.message) + } finally { setBtn('btn-qt', false, 'Test') } +} + // ── Router ──────────────────────────────────────────────────────────────── function set(html) { document.getElementById('main').innerHTML = html } @@ -652,6 +873,7 @@ function route() { const h = location.hash || '#/' if (h === '#/coverage') { updateNav('coverage'); renderCoverage() } else if (h === '#/ingest') { updateNav('ingest'); renderIngest() } + else if (h === '#/quality') { updateNav('quality'); renderQuality() } else if (h === '#/onboarding') { updateNav('onboarding'); renderOnboarding() } else if (h === '#/settings') { updateNav('settings'); renderSettings() } else { updateNav('home'); renderHome() }