marcredhat-siem-toolkit-pat…/backend/routers/quality.py

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from datetime import datetime, timedelta
from services import s1_client
import re

router = APIRouter()


def _date_range_hours(hours: int) -> tuple[str, str]:
    now = datetime.utcnow()
    return (
        (now - timedelta(hours=hours)).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
        now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
    )


# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------

class SampleEventsRequest(BaseModel):
    source: str
    limit: int = 20
    hours: int = 1


class FieldPopulationRequest(BaseModel):
    source: str
    hours: int = 24
    fields: list[str] = [
        "src.ip",
        "src.port",
        "dst.ip",
        "dst.port",
        "user.name",
        "event.type",
        "src.process.name",
        "src.process.cmdline",
        "tgt.file.path",
        "network.direction",
        "dataSource.name",
    ]


class TestParserRequest(BaseModel):
    parser_name: str
    log_line: str


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _flatten_event(event: dict) -> dict:
    """Return a flat field→value dict from a PowerQuery result row."""
    if isinstance(event, dict):
        return {k: v for k, v in event.items()}
    return {}


def _extract_format_strings(content: str) -> list[str]:
    """
    Extract SDL format string values from augmented-JSON parser content.
    Matches:  "format": "..."  (double-quoted value, supports escaped quotes).
    """
    pattern = re.compile(r'"format"\s*:\s*"((?:[^"\\]|\\.)*)"')
    return pattern.findall(content)


def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
    """
    Convert an SDL format string to a compiled Python regex.

    Returns (compiled_pattern, py_group_to_sdl_field) mapping so callers can
    translate group names back to the original SDL field names.

    Raises re.error if the resulting pattern cannot be compiled.
    """
    # Split on $...$ tokens
    token_pattern = re.compile(r'\$([^$]+)\$')
    parts = token_pattern.split(fmt)
    # parts alternates: literal, token, literal, token, ...

    regex_parts: list[str] = []
    py_group_to_sdl: dict[str, str] = {}
    seen_groups: dict[str, int] = {}

    for i, part in enumerate(parts):
        if i % 2 == 0:
            # Literal text
            regex_parts.append(re.escape(part))
        else:
            # Token: either "field.name=PATTERN" or just "field.name"
            if '=' in part:
                field_name, pattern = part.split('=', 1)
            else:
                field_name = part
                pattern = r'[^\s]+'

            # Build a valid Python group name
            safe = re.sub(r'[.\-]', '_', field_name)
            if safe in seen_groups:
                seen_groups[safe] += 1
                safe = f"{safe}_{seen_groups[safe]}"
            else:
                seen_groups[safe] = 0

            py_group_to_sdl[safe] = field_name
            regex_parts.append(f'(?P<{safe}>{pattern})')

    compiled = re.compile(''.join(regex_parts), re.IGNORECASE)
    return compiled, py_group_to_sdl


# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------

@router.post("/sample-events")
async def sample_events(req: SampleEventsRequest):
    """Return a sample of raw events from a given data source."""
    query = f'| filter dataSource.name = "{req.source}" | limit {req.limit}'
    from_dt, to_dt = _date_range_hours(req.hours)

    result = await s1_client.run_powerquery(query, from_dt, to_dt)

    rows = result if isinstance(result, list) else (result.get("rows") or result.get("events") or [])
    events = [_flatten_event(row) for row in rows]

    return {
        "source": req.source,
        "events": events,
        "count": len(events),
        "hours": req.hours,
    }


@router.post("/field-population")
async def field_population(req: FieldPopulationRequest):
    """
    Analyse how consistently each requested field is populated across a sample
    of events from a data source.
    """
    query = f'| filter dataSource.name = "{req.source}" | limit 500'
    from_dt, to_dt = _date_range_hours(req.hours)

    result = await s1_client.run_powerquery(query, from_dt, to_dt)

    rows = result if isinstance(result, list) else (result.get("rows") or result.get("events") or [])
    events = [_flatten_event(row) for row in rows]

    if not events:
        raise HTTPException(status_code=404, detail=f"No events found for source '{req.source}' in the last {req.hours} hours.")

    total = len(events)
    _empty = {None, "", "null"}

    # Collect all field names seen across the sample (useful for surfacing what IS there)
    all_seen_fields = sorted({k for ev in events for k in ev})

    field_stats = []
    for field in req.fields:
        # dataSource.name is always 100% — we filtered by it; Scalyr just doesn't echo it back
        if field == "dataSource.name":
            populated = total
        else:
            populated = sum(1 for ev in events if ev.get(field) not in _empty)
        rate = round((populated / total) * 100, 1)
        field_stats.append({
            "field": field,
            "populated": populated,
            "total": total,
            "rate": rate,
        })

    # Sort ascending by rate (worst coverage first)
    field_stats.sort(key=lambda x: x["rate"])

    return {
        "source": req.source,
        "total_sampled": total,
        "hours": req.hours,
        "fields": field_stats,
        "fields_seen_in_sample": all_seen_fields,
    }


@router.post("/test-parser")
async def test_parser(req: TestParserRequest):
    """
    Test a parser against a raw log line by extracting and matching SDL format
    strings found in the parser file.
    """
    parser_path = f"/app/parsers/{req.parser_name}"

    try:
        with open(parser_path, "r", encoding="utf-8") as fh:
            content = fh.read()
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail=f"Parser file not found: {req.parser_name}")
    except OSError as exc:
        raise HTTPException(status_code=500, detail=f"Could not read parser file: {exc}")

    format_strings = _extract_format_strings(content)

    for fmt in format_strings:
        try:
            compiled, py_to_sdl = _sdl_format_to_regex(fmt)
        except re.error:
            # Skip unparseable format strings
            continue

        match = compiled.search(req.log_line)
        if match:
            fields = [
                {"field": py_to_sdl.get(group, group), "value": value}
                for group, value in match.groupdict().items()
                if value is not None
            ]
            return {
                "parser_name": req.parser_name,
                "matched": True,
                "format_matched": fmt,
                "fields": fields,
            }

    return {
        "parser_name": req.parser_name,
        "matched": False,
        "message": "No format pattern matched",
        "fields": [],
    }