marcredhat-kql/harness/export_rules.py

#!/usr/bin/env python3
"""Export each rule's KQL and PowerQuery to disk.

The exported `.pq` files are:
  * SELF-CONTAINED and RUNNABLE — every template placeholder
    (`{RECENT_MS}`) is substituted with a concrete value from the
    current time anchor, so you can paste straight into SDL.
  * PRETTY-PRINTED — one pipeline stage per line with continuation
    indents, matching the style in pmoses-s1/claude-skills.
  * HEADER-DECORATED — a `//`-comment block names the rule, describes
    intent, lists field references, and tells the reader what
    `startTime` to use when running the query.
  * VALIDATED — before writing, every `.pq` body is scanned for known
    anti-patterns from the SentinelOne PowerQuery skill's pitfalls
    list (unsubstituted `{PLACEHOLDER}` braces, deprecated `first()`/
    `last()`/`percentile()`, leading `*` filter, missing leading pipe
    before `join`/`union`, etc.). A rule that trips a check is not
    written and the script exits non-zero, so the published repo never
    contains broken queries.
"""
from __future__ import annotations

import json
import re
import sys
from pathlib import Path

# ROOT is the parent of the directory that contains this file (after
# resolving symlinks); output folders below are created relative to it.
ROOT = Path(__file__).resolve().parents[1]
# Put ROOT first on the import path so the `rules` import below resolves
# (presumably rules.py / a rules package lives directly under ROOT — confirm).
sys.path.insert(0, str(ROOT))
from rules import RULES, NOW, RECENT_START, BASELINE_START  # noqa: E402


# ---------------------------------------------------------------------------
# Pretty-printer: turn a single-line PQ string into multi-line idiomatic form.
# ---------------------------------------------------------------------------
def pretty(pq: str) -> str:
    """Reflow a PowerQuery string so each pipeline stage is on its own line.

    All whitespace runs are collapsed to single spaces, then the text is cut
    at every ` | ` separator.  The first piece (the initial filter) is
    emitted as-is; every later piece is emitted as `| <stage>`.  A stage of
    the form `group <aggs> by <fields>` is expanded further: one aggregate
    per line (continuation lines indented eight spaces, comma-terminated
    except the last) followed by an indented `by <fields>` line.
    """
    flat = re.sub(r"\s+", " ", pq).strip()

    # Everything before the first " | " is the initial filter; the rest are
    # pipeline stages.
    first, *rest = flat.split(" | ")
    first = first.strip()

    out: list[str] = []
    if first:
        out.append(first)

    group_re = re.compile(
        r"^group\s+(.+?)\s+by\s+(.+)$", flags=re.IGNORECASE | re.DOTALL)

    for raw_stage in rest:
        stage = raw_stage.strip()
        found = group_re.match(stage)
        if found is None:
            # Ordinary stage: a single line.
            out.append("| " + stage)
            continue

        agg_text, by_text = found.groups()
        # Only commas outside parentheses separate aggregates.
        pieces = [p.strip() for p in _split_top_level_commas(agg_text)]
        *leading, final = pieces
        # Every aggregate except the last carries a trailing comma.
        rendered = [p + "," for p in leading] + [final]

        out.append("| group " + rendered[0])
        out.extend("        " + item for item in rendered[1:])
        out.append("    by " + by_text.strip())

    return "\n".join(out)


def _split_top_level_commas(s: str) -> list[str]:
    out: list[str] = []
    depth, cur = 0, []
    for ch in s:
        if ch == "(":
            depth += 1; cur.append(ch)
        elif ch == ")":
            depth -= 1; cur.append(ch)
        elif ch == "," and depth == 0:
            out.append("".join(cur)); cur = []
        else:
            cur.append(ch)
    if cur:
        out.append("".join(cur))
    return out


# ---------------------------------------------------------------------------
# Anti-pattern scanner — refuses to write a file containing known landmines.
# ---------------------------------------------------------------------------
# Table of (regex, message) pairs consumed by scan(): a message is reported
# when its regex is found anywhere in the scanned text (re.search).
# Patterns starting with (?m) anchor ^/$ per line, so they only fire on
# text at the start of a line.
PITFALLS: list[tuple[str, str]] = [
    # A `{NAME}` made only of letters/underscores, i.e. a template
    # placeholder that was never substituted.
    (r"\{[A-Za-z_]+\}",
     "Unsubstituted template placeholder (e.g. {RECENT_MS}). "
     "Substitute before writing."),
    # Calls to first(...), matched as a whole word followed by `(`.
    (r"\bfirst\s*\(",
     "first(x) is unreliable — use min_by(x, ts_epoch_ms)."),
    # Calls to last(...).
    (r"\blast\s*\(",
     "last(x) is unreliable — use max_by(x, ts_epoch_ms)."),
    # Calls to percentile(...).
    (r"\bpercentile\s*\(",
     "percentile(x, N) is not a real function — use p50/p95/p99."),
    # Calls to group_unique_values(...).
    (r"\bgroup_unique_values\s*\(",
     "group_unique_values does not exist — use array_agg_distinct(x, N)."),
    # A line that is just `*`, or `*` immediately followed by a pipe.
    (r"(?m)^\s*\*\s*(\||$)",
     "Bare `*` as initial filter returns 500 — use `| limit 5` or "
     "`field = *`."),
    # A line that begins with the word join/union, i.e. with no `|` in
    # front of it.
    (r"(?m)^\s*(join|union)\b",
     "join/union must start with a leading `|`."),
    # A shortcut field at the very start of a line.
    # NOTE(review): occurrences later in a line (e.g. after `| filter`) are
    # not matched by this pattern — confirm that is intended.
    (r"(?m)^\s*#(cmdline|name|hash|ip|storylineid|username|dns)\b",
     "Shortcut fields (#cmdline, …) are unreliable across tenants — "
     "use the explicit field name."),
]


def scan(text: str) -> list[str]:
    """Return the message of every PITFALLS entry whose regex occurs in *text*.

    Messages come back in table order; an empty list means no known
    anti-pattern was found.
    """
    hits: list[str] = []
    for pattern, message in PITFALLS:
        if re.search(pattern, text) is not None:
            hits.append(message)
    return hits


# ---------------------------------------------------------------------------
# Header builder
# ---------------------------------------------------------------------------
def header(rule: dict, recent_iso: str, now_iso: str) -> str:
    """Build the `//`-comment block placed at the top of an exported `.pq`.

    Args:
        rule: Rule mapping; the keys ``id``, ``description`` and ``pq`` are
            read.  ``pq`` is the raw query template, placeholders included.
        recent_iso: Recent-window cutoff, already formatted for display.
        now_iso: Export time anchor, already formatted for display.

    Returns:
        The header text, every line starting with `//`, ending in a newline.
    """
    keywords = {"and", "or", "not", "true", "false",
                "filter", "group", "by", "let", "columns",
                "sort", "limit", "join", "union", "in",
                "contains", "matches"}

    # The template still contains `{PLACEHOLDER}` tokens (e.g. {RECENT_MS});
    # blank them out so they are not reported as referenced fields.
    query = re.sub(r"\{[A-Za-z_][A-Za-z0-9_]*\}", " ", rule["pq"])

    # Heuristic: any capitalised identifier that is not a query keyword.
    field_refs = sorted({
        name for name in re.findall(r"\b[A-Z][A-Za-z0-9_]+\b", query)
        if name.lower() not in keywords
    })

    # Comment every line of the description: an un-prefixed continuation
    # line would otherwise land in the .pq file as query text.
    description_lines = str(rule["description"]).splitlines() or [""]

    lines = [
        f"// Rule: {rule['id']}",
        *(f"// {line}" for line in description_lines),
        "//",
        f"// Source KQL: see ../kql/{rule['id']}.kql",
        "//",
        "// HOW TO RUN",
        "//   curl POST {sdl}/api/powerQuery with this body, OR paste in",
        "//   the SDL console. Set startTime = '2h' (or wider) so the API",
        "//   scans the freshly-ingested epochs that contain the events.",
        "//",
        f"// Time anchor at export: NOW = {now_iso}",
        f"// Recent-window cutoff:  {recent_iso}",
        "//   (`ts_epoch_ms` below is that cutoff expressed in ms.",
        "//   Re-run harness/export_rules.py to refresh after regenerating",
        "//   sample_data/events.jsonl.)",
        "//",
        # Only the first ten names are listed; an ellipsis marks truncation.
        "// Fields referenced: " + ", ".join(field_refs[:10])
        + ("…" if len(field_refs) > 10 else ""),
        "//",
        "// EDITING NOTE",
        "//   Every line that starts with `|` is a pipeline stage. Each `|`",
        "//   is REQUIRED. If you delete one (e.g. while changing a literal",
        "//   on the same line as a stage), SDL re-parses the keyword that",
        "//   follows as a search term and rejects the query with errors",
        "//   like `'estimate_distinct' is a grouping function`.",
    ]
    return "\n".join(lines) + "\n"


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """Export every rule in RULES to `pq/<id>.pq` and `kql/<id>.kql`.

    For each rule the `{RECENT_MS}` placeholder is substituted, the query is
    pretty-printed and scanned for anti-patterns.  A rule that trips a check
    is skipped (neither of its files is written) and reported at the end, at
    which point the process exits with status 1.  Otherwise a summary line is
    printed.

    Output directories are created if missing, and files are written as
    UTF-8 so the result does not depend on the platform's locale encoding.
    """
    # RECENT_START / NOW are expected to be datetime-like (timestamp(),
    # isoformat()); they come from the `rules` module.
    recent_ms = int(RECENT_START.timestamp() * 1000)
    recent_iso = RECENT_START.isoformat()
    now_iso = NOW.isoformat()

    pq_dir = ROOT / "pq"
    kql_dir = ROOT / "kql"
    # A fresh checkout may not have the output folders yet.
    for out_dir in (pq_dir, kql_dir):
        out_dir.mkdir(parents=True, exist_ok=True)

    failures: list[tuple[str, list[str]]] = []
    for r in RULES:
        # 1. substitute placeholders
        body = r["pq"].replace("{RECENT_MS}", str(recent_ms))
        # 2. pretty-print
        body = pretty(body)
        # 3. scan — a failing rule is recorded and nothing is written for it
        bad = scan(body)
        if bad:
            failures.append((r["id"], bad))
            continue
        # 4. write (explicit UTF-8: the header can contain non-ASCII text)
        text = header(r, recent_iso, now_iso) + "\n" + body + "\n"
        (pq_dir / f"{r['id']}.pq").write_text(text, encoding="utf-8")

        # Mirror the .kql (verbatim, no substitution)
        (kql_dir / f"{r['id']}.kql").write_text(
            r["kql"].strip() + "\n", encoding="utf-8")

    if failures:
        print("✗ Export failed — anti-patterns detected:")
        for rid, msgs in failures:
            print(f"  {rid}")
            for m in msgs:
                print(f"    - {m}")
        sys.exit(1)

    print(f"✓ Exported {len(RULES)} rules to kql/ and pq/")
    print(f"  (RECENT_MS = {recent_ms} = {recent_iso})")


# Script entry point: run the export only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()