Add Parser Quality page: Live Event Sampler, Field Population Rate, Parser Test Runner

- New /api/quality router with three endpoints:
  sample-events: pull raw events from a source via PowerQuery
  field-population: measure % of events with each SDL field populated;
    surfaces dataSource.name correctly (100% when filtered by it) and
    returns fields_seen_in_sample so you can see what IS being extracted
  test-parser: converts SDL $field=pattern$ format strings to Python
    named-group regex and tests against a pasted raw log line
- New "Parser Quality" nav item and page with all three tools
- Home page card added for Parser Quality
- Field population UI shows per-field colour-coded progress bars plus
  a chip list of fields actually present in the sample

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mick
2026-05-19 12:53:48 -04:00
parent 058b1e7cf1
commit 999c0f7b83
3 changed files with 457 additions and 1 deletions
+2 -1
View File
@@ -1,7 +1,7 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from db import engine, Base
from routers import coverage, ingest, settings
from routers import coverage, ingest, settings, quality
Base.metadata.create_all(bind=engine)
@@ -18,6 +18,7 @@ app.add_middleware(
app.include_router(coverage.router, prefix="/api/coverage", tags=["Coverage"])
app.include_router(ingest.router, prefix="/api/ingest", tags=["Ingest"])
app.include_router(settings.router, prefix="/api/settings", tags=["Settings"])
app.include_router(quality.router, prefix="/api/quality", tags=["Quality"])
@app.get("/health")
+233
View File
@@ -0,0 +1,233 @@
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from datetime import datetime, timedelta
from services import s1_client
import re
router = APIRouter()
def _date_range_hours(hours: int) -> tuple[str, str]:
    """Return the ``(start, end)`` timestamps of a trailing UTC window.

    Args:
        hours: Length of the look-back window, in hours.

    Returns:
        A ``(start, end)`` pair of strings formatted as
        ``YYYY-MM-DDTHH:MM:SS.000Z``. ``end`` is the current UTC time and
        ``start`` is ``hours`` earlier. Sub-second precision is not kept;
        the millisecond part is always the literal ``000``.
    """
    # Function-scope import: the module-level import only brings in
    # ``datetime`` and ``timedelta``.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in
    # UTC produces the same wall-clock fields for strftime.
    now = datetime.now(timezone.utc)
    stamp_format = "%Y-%m-%dT%H:%M:%S.000Z"
    return (
        (now - timedelta(hours=hours)).strftime(stamp_format),
        now.strftime(stamp_format),
    )
# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------
class SampleEventsRequest(BaseModel):
    """Request body accepted by the ``/sample-events`` endpoint."""

    # Value compared against ``dataSource.name`` in the generated query.
    source: str
    # Maximum number of events requested from the query.
    limit: int = 20
    # Size of the trailing time window, in hours.
    hours: int = 1
class FieldPopulationRequest(BaseModel):
    """Request body accepted by the ``/field-population`` endpoint."""

    # Value compared against ``dataSource.name`` in the generated query.
    source: str
    # Size of the trailing time window, in hours.
    hours: int = 24
    # Field names whose population rate is reported; callers may override
    # this default set.
    fields: list[str] = [
        "src.ip",
        "src.port",
        "dst.ip",
        "dst.port",
        "user.name",
        "event.type",
        "src.process.name",
        "src.process.cmdline",
        "tgt.file.path",
        "network.direction",
        "dataSource.name",
    ]
class TestParserRequest(BaseModel):
    """Request body accepted by the ``/test-parser`` endpoint."""

    # File name of the parser, looked up under the parsers directory.
    parser_name: str
    # Raw log line the parser's format strings are matched against.
    log_line: str
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _flatten_event(event: dict) -> dict:
    """Copy a PowerQuery result row into a fresh field-to-value dict.

    Anything that is not a ``dict`` yields an empty dict. The copy is
    shallow: nested values are not expanded.
    """
    if not isinstance(event, dict):
        return {}
    return dict(event)
def _extract_format_strings(content: str) -> list[str]:
    """Collect every ``"format": "..."`` value found in parser content.

    Only a double-quoted ``"format"`` key followed by a double-quoted value
    is recognised. Backslash escapes inside the value (including ``\\"``)
    are tolerated while scanning and are returned exactly as written, that
    is, still escaped.
    """
    format_value = r'"format"\s*:\s*"((?:[^"\\]|\\.)*)"'
    return [hit.group(1) for hit in re.finditer(format_value, content)]
def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
    """Translate an SDL format string into a compiled Python regex.

    ``$field$`` and ``$field=PATTERN$`` tokens become named groups; text
    between tokens is matched literally. A token without a pattern matches
    a run of non-whitespace characters.

    Returns:
        ``(compiled, group_to_field)`` where ``group_to_field`` maps each
        Python group name back to the SDL field name it was derived from.

    Raises:
        re.error: if the assembled pattern does not compile.
    """
    # Splitting on a pattern with one capture group interleaves the pieces:
    # even slots are literal text, odd slots are the token bodies.
    pieces = re.split(r'\$([^$]+)\$', fmt)
    literals = pieces[0::2]
    tokens = pieces[1::2]

    group_to_field: dict[str, str] = {}
    repeat_counts: dict[str, int] = {}
    fragments: list[str] = [re.escape(literals[0])]

    for token, trailing_literal in zip(tokens, literals[1:]):
        field_name, has_pattern, pattern = token.partition('=')
        if not has_pattern:
            pattern = r'[^\s]+'

        # Dots and hyphens are not allowed in Python group names.
        base = re.sub(r'[.\-]', '_', field_name)
        if base in repeat_counts:
            # Repeated name: add a numeric suffix to keep groups unique.
            repeat_counts[base] += 1
            group = f"{base}_{repeat_counts[base]}"
        else:
            repeat_counts[base] = 0
            group = base

        group_to_field[group] = field_name
        fragments.append(f'(?P<{group}>{pattern})')
        fragments.append(re.escape(trailing_literal))

    return re.compile(''.join(fragments), re.IGNORECASE), group_to_field
# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------
@router.post("/sample-events")
async def sample_events(req: SampleEventsRequest):
    """Return a sample of raw events from a given data source.

    Builds a PowerQuery that filters on ``dataSource.name`` and limits the
    row count, runs it over the trailing ``req.hours`` window, and returns
    the rows as flat dicts.

    Returns:
        A dict with ``source``, ``events`` (list of field-to-value dicts),
        ``count`` and ``hours``.
    """
    # req.source is caller-supplied and lands inside a double-quoted string
    # literal; escape backslashes and quotes so it cannot close the literal
    # and append extra query stages.
    # NOTE(review): assumes PowerQuery string literals honour backslash
    # escapes - confirm against the query-language reference.
    escaped_source = req.source.replace("\\", "\\\\").replace('"', '\\"')
    query = f'| filter dataSource.name = "{escaped_source}" | limit {req.limit}'
    from_dt, to_dt = _date_range_hours(req.hours)
    result = await s1_client.run_powerquery(query, from_dt, to_dt)

    # The client may hand back a bare list of rows or a mapping wrapping
    # them; a None result is treated as "no rows" instead of crashing.
    if isinstance(result, list):
        rows = result
    elif result is None:
        rows = []
    else:
        rows = result.get("rows") or result.get("events") or []

    events = [_flatten_event(row) for row in rows]
    return {
        "source": req.source,
        "events": events,
        "count": len(events),
        "hours": req.hours,
    }
@router.post("/field-population")
async def field_population(req: FieldPopulationRequest):
    """Measure how consistently each requested field is populated.

    Samples up to 500 events from the data source over the trailing
    ``req.hours`` window and, for every name in ``req.fields``, counts the
    events whose value is not ``None``, ``""`` or the string ``"null"``.

    Returns:
        A dict with ``source``, ``total_sampled``, ``hours``, ``fields``
        (per-field ``populated`` / ``total`` / ``rate`` entries, lowest rate
        first) and ``fields_seen_in_sample`` (sorted names of every key
        present in at least one sampled event).

    Raises:
        HTTPException: 404 when the query returns no events.
    """
    # req.source is caller-supplied and lands inside a double-quoted string
    # literal; escape backslashes and quotes so it cannot close the literal
    # and append extra query stages.
    # NOTE(review): assumes PowerQuery string literals honour backslash
    # escapes - confirm against the query-language reference.
    escaped_source = req.source.replace("\\", "\\\\").replace('"', '\\"')
    query = f'| filter dataSource.name = "{escaped_source}" | limit 500'
    from_dt, to_dt = _date_range_hours(req.hours)
    result = await s1_client.run_powerquery(query, from_dt, to_dt)

    # The client may hand back a bare list of rows or a mapping wrapping
    # them; a None result is treated as "no rows" instead of crashing.
    if isinstance(result, list):
        rows = result
    elif result is None:
        rows = []
    else:
        rows = result.get("rows") or result.get("events") or []

    events = [_flatten_event(row) for row in rows]
    if not events:
        raise HTTPException(status_code=404, detail=f"No events found for source '{req.source}' in the last {req.hours} hours.")

    total = len(events)
    # A tuple, not a set: membership on a tuple compares with ``==`` and so
    # also works for unhashable values (lists, dicts), which would raise
    # TypeError against a set.
    empty_values = (None, "", "null")

    # Collect all field names seen across the sample (useful for surfacing what IS there)
    all_seen_fields = sorted({k for ev in events for k in ev})

    field_stats = []
    for field in req.fields:
        # dataSource.name is always 100% — we filtered by it; Scalyr just doesn't echo it back
        if field == "dataSource.name":
            populated = total
        else:
            populated = sum(1 for ev in events if ev.get(field) not in empty_values)
        field_stats.append({
            "field": field,
            "populated": populated,
            "total": total,
            "rate": round((populated / total) * 100, 1),
        })

    # Sort ascending by rate (worst coverage first)
    field_stats.sort(key=lambda stat: stat["rate"])

    return {
        "source": req.source,
        "total_sampled": total,
        "hours": req.hours,
        "fields": field_stats,
        "fields_seen_in_sample": all_seen_fields,
    }
@router.post("/test-parser")
async def test_parser(req: TestParserRequest):
    """Test a parser against a raw log line.

    Reads the named parser file from ``/app/parsers``, extracts its SDL
    format strings, converts each to a regex and returns the fields captured
    by the first format that matches ``req.log_line``.

    Returns:
        On a match: ``parser_name``, ``matched=True``, ``format_matched`` and
        ``fields`` (list of ``{"field", "value"}`` entries for groups that
        captured something). Otherwise ``matched=False`` with an empty
        ``fields`` list and a ``message``.

    Raises:
        HTTPException: 400 when ``parser_name`` resolves outside the parsers
            directory, 404 when the file does not exist, 500 on any other
            read error.
    """
    # Function-scope import so this block does not depend on the module's
    # import section.
    from pathlib import Path

    # parser_name comes straight from the request body. Resolve it and make
    # sure the result is still inside the parsers directory, otherwise a
    # name such as "../../etc/passwd" would read arbitrary files.
    parsers_dir = Path("/app/parsers").resolve()
    try:
        parser_path = (parsers_dir / req.parser_name).resolve()
    except (ValueError, RuntimeError) as exc:
        # e.g. embedded NUL byte, or a symlink loop on older Pythons
        raise HTTPException(status_code=400, detail="Invalid parser name") from exc
    if parsers_dir not in parser_path.parents:
        raise HTTPException(status_code=400, detail="Invalid parser name")

    try:
        with open(parser_path, "r", encoding="utf-8") as fh:
            content = fh.read()
    except FileNotFoundError as exc:
        raise HTTPException(status_code=404, detail=f"Parser file not found: {req.parser_name}") from exc
    except OSError as exc:
        raise HTTPException(status_code=500, detail=f"Could not read parser file: {exc}") from exc

    for fmt in _extract_format_strings(content):
        try:
            compiled, py_to_sdl = _sdl_format_to_regex(fmt)
        except re.error:
            # Skip unparseable format strings
            continue

        match = compiled.search(req.log_line)
        if match is None:
            continue

        # Report groups under their original SDL field names; groups that
        # did not take part in the match are left out.
        fields = [
            {"field": py_to_sdl.get(group, group), "value": value}
            for group, value in match.groupdict().items()
            if value is not None
        ]
        return {
            "parser_name": req.parser_name,
            "matched": True,
            "format_matched": fmt,
            "fields": fields,
        }

    return {
        "parser_name": req.parser_name,
        "matched": False,
        "message": "No format pattern matched",
        "fields": [],
    }
+222
View File
@@ -20,6 +20,7 @@
<a href="#/" data-page="home" class="nav-link flex items-center px-3 py-2 rounded-lg text-sm cursor-pointer">Overview</a>
<a href="#/coverage" data-page="coverage" class="nav-link flex items-center px-3 py-2 rounded-lg text-sm cursor-pointer">Parser Coverage</a>
<a href="#/ingest" data-page="ingest" class="nav-link flex items-center px-3 py-2 rounded-lg text-sm cursor-pointer">Ingest Dashboard</a>
<a href="#/quality" data-page="quality" class="nav-link flex items-center px-3 py-2 rounded-lg text-sm cursor-pointer">Parser Quality</a>
<a href="#/onboarding" data-page="onboarding" class="nav-link flex items-center px-3 py-2 rounded-lg text-sm cursor-pointer">Onboarding</a>
</nav>
<div class="p-3 border-t border-gray-800">
@@ -122,6 +123,7 @@ function renderHome() {
<div class="grid grid-cols-1 md:grid-cols-3 gap-5">
${homeCard('#/coverage','Parser Coverage Map','Cross-reference SDL parser fields against STAR and Sigma rule fields. Surface parsed-but-unused fields as reduction candidates.','Open Coverage Map','from-purple-700 to-purple-900')}
${homeCard('#/ingest','Ingest Dashboard','Visualize event volume by source and type. Project monthly GB costs and simulate exclusion filters before applying them.','Open Dashboard','from-blue-700 to-blue-900')}
${homeCard('#/quality','Parser Quality','Sample live events to see which fields landed. Measure field population rates and test parser patterns against raw log lines.','Open Quality Tools','from-amber-700 to-amber-900')}
${homeCard('#/onboarding','Onboarding Accelerator','Step-by-step guide for onboarding a new log source using Claude Code directly — no API key required.','View Guide','from-emerald-700 to-emerald-900')}
</div>
</div>`)
@@ -637,6 +639,225 @@ async function saveSettings() {
} finally { setBtn('st-save', false, 'Save to .env') }
}
// ── Parser Quality ────────────────────────────────────────────────────────
/**
 * Render the Parser Quality page.
 *
 * Passes one HTML template to set() containing three cards:
 *  - Live Event Sampler: inputs #qs-source, #qs-hours, #qs-limit; the
 *    Sample button calls qsSample(); output goes to #qs-result.
 *  - Field Population Rate: inputs #qp-source, #qp-hours, #qp-fields; the
 *    Analyze button calls qpAnalyze(); output goes to #qp-result.
 *  - Parser Test Runner: select #qt-parser and textarea #qt-log; the Test
 *    button calls qtTest(); output goes to #qt-result.
 *
 * After the markup is in place, qtLoadParsers() is called to fill the
 * parser dropdown, which initially only holds the placeholder option.
 */
function renderQuality() {
set(`<div class="p-8 max-w-5xl space-y-6">
<div>
<h1 class="text-xl font-bold text-white">Parser Quality</h1>
<p class="text-sm text-gray-400 mt-1">Inspect live events · measure field coverage · test parser patterns</p>
</div>
<!-- Live Event Sampler -->
<div class="bg-gray-900 border border-gray-800 rounded-xl p-5">
<h2 class="text-sm font-semibold text-white mb-1">Live Event Sampler</h2>
<p class="text-xs text-gray-500 mb-4">Pull recent raw events from a source and see exactly which fields landed — and which are missing.</p>
<div class="flex gap-3 flex-wrap mb-4">
<input id="qs-source" placeholder="dataSource.name — e.g. Palo Alto Networks Firewall"
class="flex-1 min-w-60 bg-gray-800 border border-gray-700 rounded-lg px-3 py-2 text-sm text-gray-200 placeholder-gray-600 focus:outline-none focus:border-purple-600">
<select id="qs-hours" class="bg-gray-800 border border-gray-700 rounded-lg px-3 py-2 text-sm text-gray-300 focus:outline-none focus:border-purple-600">
<option value="1">Last 1h</option>
<option value="6">Last 6h</option>
<option value="24" selected>Last 24h</option>
<option value="72">Last 3d</option>
</select>
<select id="qs-limit" class="bg-gray-800 border border-gray-700 rounded-lg px-3 py-2 text-sm text-gray-300 focus:outline-none focus:border-purple-600">
<option value="10">10 events</option>
<option value="20" selected>20 events</option>
<option value="50">50 events</option>
</select>
<button onclick="qsSample()" id="btn-qs"
class="px-4 py-2 text-sm bg-purple-700 hover:bg-purple-600 rounded-lg text-white transition-colors">Sample</button>
</div>
<div id="qs-result"></div>
</div>
<!-- Field Population Rate -->
<div class="bg-gray-900 border border-gray-800 rounded-xl p-5">
<h2 class="text-sm font-semibold text-white mb-1">Field Population Rate</h2>
<p class="text-xs text-gray-500 mb-4">Sample up to 500 events and measure what % have each key field populated. Low rates flag parser extraction failures.</p>
<div class="flex gap-3 flex-wrap mb-3">
<input id="qp-source" placeholder="dataSource.name"
class="flex-1 min-w-60 bg-gray-800 border border-gray-700 rounded-lg px-3 py-2 text-sm text-gray-200 placeholder-gray-600 focus:outline-none focus:border-purple-600">
<select id="qp-hours" class="bg-gray-800 border border-gray-700 rounded-lg px-3 py-2 text-sm text-gray-300 focus:outline-none focus:border-purple-600">
<option value="1">Last 1h</option>
<option value="6">Last 6h</option>
<option value="24" selected>Last 24h</option>
<option value="72">Last 3d</option>
</select>
<button onclick="qpAnalyze()" id="btn-qp"
class="px-4 py-2 text-sm bg-purple-700 hover:bg-purple-600 rounded-lg text-white transition-colors">Analyze</button>
</div>
<div class="mb-3">
<label class="text-xs text-gray-500 block mb-1">Fields to check <span class="text-gray-600">(comma-separated)</span></label>
<input id="qp-fields" value="src.ip,dst.ip,user.name,event.type,src.process.name,src.process.cmdline,tgt.file.path,network.direction"
class="w-full bg-gray-800 border border-gray-700 rounded-lg px-3 py-2 text-xs text-gray-300 font-mono focus:outline-none focus:border-purple-600">
</div>
<div id="qp-result"></div>
</div>
<!-- Parser Test Runner -->
<div class="bg-gray-900 border border-gray-800 rounded-xl p-5">
<h2 class="text-sm font-semibold text-white mb-1">Parser Test Runner</h2>
<p class="text-xs text-gray-500 mb-4">Paste a raw log line and pick a loaded parser — see which fields the format patterns would extract without deploying anything.</p>
<div class="flex gap-3 flex-wrap mb-3">
<select id="qt-parser" class="flex-1 bg-gray-800 border border-gray-700 rounded-lg px-3 py-2 text-sm text-gray-300 focus:outline-none focus:border-purple-600">
<option value="">— select parser —</option>
</select>
<button onclick="qtTest()" id="btn-qt"
class="px-4 py-2 text-sm bg-purple-700 hover:bg-purple-600 rounded-lg text-white transition-colors">Test</button>
</div>
<textarea id="qt-log" rows="3" placeholder="Paste raw log line here…"
class="w-full bg-gray-800 border border-gray-700 rounded-lg px-3 py-2 text-xs text-gray-300 font-mono placeholder-gray-600 focus:outline-none focus:border-purple-600 mb-3"></textarea>
<div id="qt-result"></div>
</div>
</div>`)
// The #qt-parser select only exists once the markup above is rendered.
qtLoadParsers()
}
// ── Live Event Sampler ─────────────────────────────────────────────────────
/**
 * Live Event Sampler handler (wired to the Sample button).
 *
 * Reads #qs-source, #qs-hours and #qs-limit, POSTs them to
 * /api/quality/sample-events and renders the returned events into
 * #qs-result as a table with one column per field seen in any event.
 * Values that are null, undefined, '' or the string 'null' are shown as ∅.
 */
async function qsSample() {
  // Resolve the output node once. Re-querying it after the await returns
  // null if the page was re-rendered in the meantime, which made the catch
  // handler itself throw; writing to the captured (possibly detached) node
  // is harmless.
  const out = document.getElementById('qs-result')
  const source = document.getElementById('qs-source').value.trim()
  if (!source) { out.innerHTML = errBox('Enter a source name.'); return }
  setBtn('btn-qs', true)
  out.innerHTML = '<p class="text-gray-500 text-sm animate-pulse">Querying data lake…</p>'
  try {
    const r = await apiPost('/api/quality/sample-events', {
      source,
      limit: +document.getElementById('qs-limit').value,
      hours: +document.getElementById('qs-hours').value,
    })
    if (!r.events?.length) {
      out.innerHTML = '<p class="text-gray-500 text-sm">No events found for this source in the selected window.</p>'
      return
    }
    // Collect all field names across events
    const allFields = [...new Set(r.events.flatMap(e => Object.keys(e)))].sort()
    const rows = r.events.map(ev => {
      const cells = allFields.map(f => {
        const v = ev[f]
        const empty = v === null || v === undefined || v === '' || v === 'null'
        // Full value goes into the tooltip; the cell shows at most 40 chars.
        return `<td class="py-1.5 px-2 text-xs font-mono max-w-32 truncate ${empty ? 'text-gray-700 italic' : 'text-gray-300'}" title="${esc(String(v??''))}">${empty ? '∅' : esc(String(v).slice(0,40))}</td>`
      }).join('')
      return `<tr class="border-b border-gray-800/40 hover:bg-gray-800/20">${cells}</tr>`
    }).join('')
    const headers = allFields.map(f => `<th class="pb-2 px-2 text-left font-medium whitespace-nowrap text-xs">${esc(f)}</th>`).join('')
    out.innerHTML = `
<p class="text-xs text-gray-500 mb-2">${r.count} events · ${r.hours}h window · ${allFields.length} fields seen</p>
<div class="overflow-x-auto max-h-72 overflow-y-auto rounded border border-gray-800">
<table class="text-xs min-w-full">
<thead class="sticky top-0 bg-gray-900 text-gray-500 border-b border-gray-800"><tr>${headers}</tr></thead>
<tbody>${rows}</tbody>
</table>
</div>`
  } catch(e) {
    out.innerHTML = errBox(e.message)
  } finally { setBtn('btn-qs', false, 'Sample') }
}
// ── Field Population Rate ──────────────────────────────────────────────────
/**
 * Field Population Rate handler (wired to the Analyze button).
 *
 * Reads #qp-source, #qp-hours and the comma-separated #qp-fields list,
 * POSTs them to /api/quality/field-population and renders into #qp-result:
 * one row per field with its rate, a colour-coded bar (>= 80 green,
 * >= 40 amber, otherwise red) and populated/total counts, followed by a
 * chip list of every field name the response reports as seen in the sample.
 */
async function qpAnalyze() {
  // Resolve the output node once. Re-querying it after the await returns
  // null if the page was re-rendered in the meantime, which made the catch
  // handler itself throw; writing to the captured (possibly detached) node
  // is harmless.
  const out = document.getElementById('qp-result')
  const source = document.getElementById('qp-source').value.trim()
  if (!source) { out.innerHTML = errBox('Enter a source name.'); return }
  setBtn('btn-qp', true)
  out.innerHTML = '<p class="text-gray-500 text-sm animate-pulse">Sampling events…</p>'
  try {
    const fieldsRaw = document.getElementById('qp-fields').value
    // Drop blanks so "a,,b" or a trailing comma does not send empty names.
    const fields = fieldsRaw.split(',').map(f => f.trim()).filter(Boolean)
    const r = await apiPost('/api/quality/field-population', {
      source, hours: +document.getElementById('qp-hours').value, fields
    })
    const rows = r.fields.map(f => {
      const pct = f.rate
      const color = pct >= 80 ? 'bg-emerald-500' : pct >= 40 ? 'bg-amber-500' : 'bg-red-500'
      const textColor = pct >= 80 ? 'text-emerald-400' : pct >= 40 ? 'text-amber-400' : 'text-red-400'
      return `<tr class="border-b border-gray-800/40">
<td class="py-2 pr-4 font-mono text-xs text-gray-200">${esc(f.field)}</td>
<td class="py-2 pr-4 text-xs ${textColor} font-semibold w-16">${pct}%</td>
<td class="py-2 pr-4 w-48">
<div class="h-2 bg-gray-800 rounded-full overflow-hidden">
<div class="h-full ${color} rounded-full transition-all" style="width:${pct}%"></div>
</div>
</td>
<td class="py-2 text-xs text-gray-600">${f.populated.toLocaleString()} / ${f.total.toLocaleString()}</td>
</tr>`
    }).join('')
    out.innerHTML = `
<p class="text-xs text-gray-500 mb-3">${r.total_sampled} events sampled · ${r.hours}h window — sorted by worst coverage first</p>
<table class="w-full mb-4">
<thead><tr class="text-left text-gray-500 border-b border-gray-800">
<th class="pb-2 pr-4 text-xs font-medium">Field</th>
<th class="pb-2 pr-4 text-xs font-medium">Rate</th>
<th class="pb-2 pr-4 text-xs font-medium">Coverage</th>
<th class="pb-2 text-xs font-medium">Events</th>
</tr></thead>
<tbody>${rows}</tbody>
</table>
${r.fields_seen_in_sample?.length ? `
<div class="border-t border-gray-800 pt-3">
<p class="text-xs text-gray-500 mb-1">Fields actually present in sample <span class="text-gray-600">(${r.fields_seen_in_sample.length} total)</span></p>
<div class="flex flex-wrap gap-1">${r.fields_seen_in_sample.map(f =>
`<span class="px-2 py-0.5 bg-gray-800 rounded text-xs font-mono text-gray-400">${esc(f)}</span>`).join('')}
</div>
</div>` : ''}`
  } catch(e) {
    out.innerHTML = errBox(e.message)
  } finally { setBtn('btn-qp', false, 'Analyze') }
}
// ── Parser Test Runner ─────────────────────────────────────────────────────
/**
 * Fill the #qt-parser dropdown with the distinct, sorted parser names found
 * in the `sources` of the /api/coverage/map response. Best effort: any
 * failure is swallowed and the dropdown keeps whatever options it has.
 */
async function qtLoadParsers() {
  try {
    const coverage = await apiGet('/api/coverage/map')

    // De-duplicate while skipping sources without a parser name.
    const unique = new Set()
    for (const entry of coverage.sources || []) {
      if (entry.parser) unique.add(entry.parser)
    }
    const parserNames = Array.from(unique).sort()

    const dropdown = document.getElementById('qt-parser')
    if (!dropdown) return

    for (const parserName of parserNames) {
      const option = document.createElement('option')
      option.value = parserName
      option.textContent = parserName
      dropdown.appendChild(option)
    }
  } catch {}
}
async function qtTest() {
const parser = document.getElementById('qt-parser').value
const log = document.getElementById('qt-log').value.trim()
if (!parser) { document.getElementById('qt-result').innerHTML = errBox('Select a parser.'); return }
if (!log) { document.getElementById('qt-result').innerHTML = errBox('Paste a log line.'); return }
setBtn('btn-qt', true)
document.getElementById('qt-result').innerHTML = '<p class="text-gray-500 text-sm animate-pulse">Testing…</p>'
try {
const r = await apiPost('/api/quality/test-parser', { parser_name: parser, log_line: log })
if (!r.matched) {
document.getElementById('qt-result').innerHTML = `
<div class="p-3 bg-amber-900/30 border border-amber-700/50 rounded-lg text-sm text-amber-300">
⚠ No format pattern matched this log line.
<p class="text-xs text-amber-500 mt-1">The parser's format strings didn't produce a match. Check that the log sample matches the expected format, or that the parser has SDL format strings (some parsers use grok/dottedJson which aren't tested here).</p>
</div>`
return
}
const rows = r.fields.map(f => `<tr class="border-b border-gray-800/40">
<td class="py-1.5 pr-4 font-mono text-xs text-purple-300">${esc(f.field)}</td>
<td class="py-1.5 font-mono text-xs text-gray-200">${esc(String(f.value))}</td>
</tr>`).join('')
document.getElementById('qt-result').innerHTML = `
<div class="mb-3 p-2 bg-gray-800/60 rounded text-xs text-gray-500 font-mono break-all">
<span class="text-gray-600">Matched format: </span>${esc(r.format_matched)}
</div>
<table class="w-full">
<thead><tr class="text-left text-gray-500 border-b border-gray-800">
<th class="pb-2 pr-4 text-xs font-medium">Field</th>
<th class="pb-2 text-xs font-medium">Extracted Value</th>
</tr></thead>
<tbody>${rows}</tbody>
</table>`
} catch(e) {
document.getElementById('qt-result').innerHTML = errBox(e.message)
} finally { setBtn('btn-qt', false, 'Test') }
}
// ── Router ────────────────────────────────────────────────────────────────
/** Replace the contents of the #main element with the given HTML string. */
function set(html) {
  const mainEl = document.getElementById('main')
  mainEl.innerHTML = html
}
@@ -652,6 +873,7 @@ function route() {
const h = location.hash || '#/'
if (h === '#/coverage') { updateNav('coverage'); renderCoverage() }
else if (h === '#/ingest') { updateNav('ingest'); renderIngest() }
else if (h === '#/quality') { updateNav('quality'); renderQuality() }
else if (h === '#/onboarding') { updateNav('onboarding'); renderOnboarding() }
else if (h === '#/settings') { updateNav('settings'); renderSettings() }
else { updateNav('home'); renderHome() }