v0.1 Mick Marc merged

2026-06-10 13:21:17 +00:00 · 2026-05-20 23:44:53 +02:00
commit 79efb6bf7d
50 changed files with 5190 additions and 0 deletions
@@ -0,0 +1,648 @@
+import json
+import os
+from fastapi import APIRouter, UploadFile, File, Depends, HTTPException
+from pydantic import BaseModel
+from sqlalchemy.orm import Session
+from datetime import datetime
+from db import get_db, ParsedRule, ParserField, ActiveSource
+from services import s1_client, rule_parser
+
+DETECTIONS_FILE = os.environ.get("DETECTIONS_FILE", "/app/data/detections.json")
+
+router = APIRouter()
+
+
+def _star_query_texts(rule: dict) -> list[str]:
+    """
+    Extract all PowerQuery/filter strings from a STAR rule.
+    Handles simple rules (s1ql) and correlation rules (subQueries[].subQuery).
+    """
+    texts = []
+
+    # Simple rules
+    for field in ("s1ql", "queryLang", "query", "powerQuery"):
+        v = rule.get(field)
+        # queryLang "2.0" is a version string, not a query — skip short strings
+        if v and isinstance(v, str) and len(v) > 5:
+            texts.append(v)
+
+    # Correlation rules: subQueries[].subQuery
+    cp = rule.get("correlationParams") or {}
+    for sq in cp.get("subQueries", []):
+        v = sq.get("subQuery")
+        if v and isinstance(v, str):
+            texts.append(v)
+    # Also handle older conditions[] format
+    for cond in cp.get("conditions", []):
+        for key in ("filter", "query", "subQuery"):
+            v = cond.get(key)
+            if v and isinstance(v, str):
+                texts.append(v)
+
+    return texts
+
+
+@router.post("/load-star-rules")
+async def load_star_rules(db: Session = Depends(get_db)):
+    """Fetch all STAR rules from the Management Console API and index their fields."""
+    try:
+        rules = await s1_client.get_star_rules()
+    except Exception as e:
+        raise HTTPException(502, f"S1 API error: {type(e).__name__}: {e}")
+
+    # Replace all existing STAR rules cleanly to avoid duplicate key errors
+    db.query(ParsedRule).filter_by(rule_type="star").delete()
+    db.flush()
+
+    loaded = []
+    for rule in rules:
+        all_fields: set = set()
+        for qt in _star_query_texts(rule):
+            all_fields |= rule_parser.extract_star_fields(qt)
+        fields = list(all_fields)
+        record = ParsedRule(
+            rule_id=str(rule.get("id", "")),
+            name=rule.get("name", "unnamed"),
+            rule_type="star",
+            fields_used=fields,
+            raw=json.dumps(rule),
+        )
+        db.add(record)
+        loaded.append({"id": record.rule_id, "name": record.name, "fields": fields})
+
+    db.commit()
+    return {"loaded": len(loaded), "rules": loaded}
+
+
+_EXCLUDED_PATHS = ("/rules/silent/", "/rules/dev/")
+
+
+def _import_from_api_rules(db, rules: list) -> int:
+    """
+    Import platform rules fetched directly from the S1 API into the database.
+    Each rule has a 'sources' list — the authoritative dataSource.name values.
+    """
+    db.query(ParsedRule).filter_by(rule_type="library").delete()
+    db.commit()
+
+    loaded = 0
+    seen_ids: set = set()
+    for rule in rules:
+        rule_id = str(rule.get("id", f"lib_{loaded}"))
+        if rule_id in seen_ids:
+            continue
+        seen_ids.add(rule_id)
+
+        sources = rule.get("sources") or []
+        db.add(ParsedRule(
+            rule_id=rule_id,
+            name=rule.get("name", "unnamed"),
+            rule_type="library",
+            fields_used=[],          # API rules don't expose field-level info
+            raw=json.dumps({"data_sources": sources}),
+        ))
+        loaded += 1
+        if loaded % 500 == 0:
+            db.flush()
+
+    db.commit()
+    return loaded
+
+
+def _import_detections(db, detections_file: str) -> int:
+    """
+    Import library detection rules from extracted.json into the database.
+    Replaces any existing library rules. Returns the count of rules loaded.
+    """
+    with open(detections_file, "r", encoding="utf-8") as fh:
+        data = json.load(fh)
+
+    results = data.get("results", [])
+    results = [r for r in results if not any(r.get("file", "").startswith(p) for p in _EXCLUDED_PATHS)]
+
+    db.query(ParsedRule).filter_by(rule_type="library").delete()
+    db.commit()
+
+    loaded = 0
+    seen_ids: set = set()
+    for rule in results:
+        all_fields: set = set()
+        data_sources: list[str] = []
+        for q in rule.get("queries", []):
+            all_fields.update(q.get("keys", []))
+            ds_vals = q.get("pairs", {}).get("dataSource.name", [])
+            for v in ds_vals:
+                if isinstance(v, str):
+                    data_sources.append(v)
+                elif isinstance(v, list):
+                    data_sources.extend(str(x) for x in v)
+
+        rule_id = str(rule.get("id", f"lib_{loaded}"))
+        if rule_id in seen_ids:
+            continue
+        seen_ids.add(rule_id)
+
+        db.add(ParsedRule(
+            rule_id=rule_id,
+            name=rule.get("name", "unnamed"),
+            rule_type="library",
+            fields_used=list(all_fields),
+            raw=json.dumps({"data_sources": list(set(data_sources))}),
+        ))
+        loaded += 1
+        if loaded % 500 == 0:
+            db.flush()
+
+    db.commit()
+    return loaded
+
+
+@router.post("/load-detections")
+async def load_detections(db: Session = Depends(get_db)):
+    """
+    Reload detection library rules.
+    Tries the live S1 API first (platform-rules endpoint); falls back to extracted.json.
+    """
+    # Prefer the live API — gives accurate 'sources' and is always up to date
+    try:
+        rules = await s1_client.get_platform_rules()
+        if rules:
+            loaded = _import_from_api_rules(db, rules)
+            return {"loaded": loaded, "source": "api"}
+    except Exception:
+        pass
+
+    # Fall back to local extracted.json
+    if not os.path.exists(DETECTIONS_FILE):
+        raise HTTPException(
+            404,
+            "S1 API unavailable and no detections file found — "
+            "ensure the data/ volume is mounted with detections.json"
+        )
+    try:
+        loaded = _import_detections(db, DETECTIONS_FILE)
+    except Exception as e:
+        raise HTTPException(500, f"Failed to import detections: {e}")
+    return {"loaded": loaded, "source": "file"}
+
+
+@router.post("/upload-sigma")
+async def upload_sigma(files: list[UploadFile] = File(...), db: Session = Depends(get_db)):
+    """Upload one or more Sigma YAML files and index their fields."""
+    loaded = []
+    for file in files:
+        content = (await file.read()).decode("utf-8", errors="replace")
+        fields = list(rule_parser.extract_sigma_fields(content))
+        record = ParsedRule(
+            rule_id=f"sigma_{file.filename}",
+            name=file.filename or "unnamed",
+            rule_type="sigma",
+            fields_used=fields,
+            raw=content,
+        )
+        db.merge(record)
+        loaded.append({"name": file.filename, "fields": fields})
+
+    db.commit()
+    return {"loaded": len(loaded), "rules": loaded}
+
+
+@router.post("/load-parsers-from-sdl")
+async def load_parsers_from_sdl(db: Session = Depends(get_db)):
+    """
+    Load SDL parsers from the local /app/parsers directory (mounted from ./parsers/).
+    Files are placed there by the MCP-based loader or by manual copy.
+    Falls back to a clear error if the directory is empty.
+    """
+    import os
+    parsers_dir = "/app/parsers"
+
+    try:
+        entries = [
+            e for e in os.scandir(parsers_dir)
+            if e.is_file() and not e.name.startswith(".")
+        ]
+    except FileNotFoundError:
+        raise HTTPException(503, "parsers/ directory not found — check Docker volume mount")
+
+    if not entries:
+        raise HTTPException(
+            422,
+            "No parser files found in parsers/ directory. "
+            "Use 'Load SDL Parsers via MCP' in Claude Code to populate it, "
+            "or upload a parser file manually."
+        )
+
+    loaded = []
+    errors = []
+    for entry in entries:
+        try:
+            with open(entry.path, "r", encoding="utf-8", errors="replace") as fh:
+                content = fh.read()
+
+            fields: set = set()
+            try:
+                import json as _json
+                parser_data = _json.loads(content)
+                fields = rule_parser.extract_parser_fields(parser_data)
+            except Exception:
+                pass
+            fields |= rule_parser.extract_parser_fields_from_content(content)
+
+            name = entry.name
+            db.query(ParserField).filter_by(parser_name=name).delete()
+            for f in fields:
+                db.add(ParserField(parser_name=name, field_name=f, field_type="string"))
+            loaded.append({"parser": name, "fields": list(fields), "field_count": len(fields)})
+        except Exception as e:
+            errors.append({"parser": entry.name, "error": str(e)})
+
+    db.commit()
+    return {"loaded": len(loaded), "parsers": loaded, "errors": errors}
+
+
+@router.post("/upload-parser")
+async def upload_parser(file: UploadFile = File(...), db: Session = Depends(get_db)):
+    """Upload an SDL parser JSON file and index its output fields."""
+    raw_bytes = await file.read()
+    content_str = raw_bytes.decode("utf-8", errors="replace")
+
+    # Try structured JSON extraction first, fall back to content-string extraction
+    fields: set = set()
+    try:
+        parser_data = json.loads(content_str)
+        fields = rule_parser.extract_parser_fields(parser_data)
+    except json.JSONDecodeError:
+        pass
+
+    # Always also run content-string extraction (catches $field$ SDL format strings)
+    fields |= rule_parser.extract_parser_fields_from_content(content_str)
+
+    db.query(ParserField).filter_by(parser_name=file.filename).delete()
+    for f in fields:
+        db.add(ParserField(parser_name=file.filename, field_name=f, field_type="string"))
+
+    db.commit()
+    return {"parser": file.filename, "fields": list(fields)}
+
+
+class ParserContentPayload(BaseModel):
+    parser_name: str
+    content: str  # raw SDL parser file content as string
+
+
+@router.post("/load-parser-content")
+async def load_parser_content(payload: ParserContentPayload, db: Session = Depends(get_db)):
+    """
+    Accept raw SDL parser content (as a string) and index its output fields.
+    Used by MCP-based loader scripts since the SDL HTTP API endpoint is not
+    accessible from inside Docker with standard API token auth.
+    """
+    fields: set = set()
+
+    # Try JSON parsing first (structured attributes/fields/mappings)
+    try:
+        parser_data = json.loads(payload.content)
+        fields = rule_parser.extract_parser_fields(parser_data)
+    except (json.JSONDecodeError, Exception):
+        pass
+
+    # Always run SDL format-string extraction ($field.name$ patterns)
+    fields |= rule_parser.extract_parser_fields_from_content(payload.content)
+
+    if not fields:
+        raise HTTPException(422, "No fields could be extracted from the parser content")
+
+    db.query(ParserField).filter_by(parser_name=payload.parser_name).delete()
+    for f in fields:
+        db.add(ParserField(parser_name=payload.parser_name, field_name=f, field_type="string"))
+
+    db.commit()
+    return {"parser": payload.parser_name, "fields": list(fields), "field_count": len(fields)}
+
+
+# Native SentinelOne platform sources — parsed by the system, not by SDL parsers.
+# Excluded from the coverage map as they do not require custom parser coverage.
+_S1_NATIVE_SOURCES = {
+    "SentinelOne", "asset", "alert", "vulnerability",
+    "ActivityFeed", "indicator", "misconfiguration",
+    "SentinelOne Ranger AD",
+}
+
+
+@router.post("/sync-sources")
+async def sync_sources(days: int = 7, db: Session = Depends(get_db)):
+    """Pull active dataSource.names from the SDL and store them.
+    Also detects whether a parser is already producing structured fields
+    for each source by checking if event.type is populated in the data lake.
+    Native S1 platform sources are excluded as they do not require SDL parsers.
+    """
+    import asyncio
+    from datetime import datetime, timedelta
+    now = datetime.utcnow()
+    from_dt = (now - timedelta(days=days)).strftime("%Y-%m-%dT%H:%M:%S.000Z")
+    to_dt = now.strftime("%Y-%m-%dT%H:%M:%S.000Z")
+
+    try:
+        volume_result, parsed_result = await asyncio.gather(
+            s1_client.run_powerquery(
+                "| group events=count() by dataSource.name | sort -events | limit 200",
+                from_dt, to_dt
+            ),
+            s1_client.run_powerquery(
+                "| filter event.type != '' | group parsed=count() by dataSource.name | limit 200",
+                from_dt, to_dt
+            ),
+        )
+    except Exception as e:
+        raise HTTPException(502, f"PowerQuery error: {e}")
+
+    # Build lookup: source_name → count of parsed events seen
+    parsed_by_source: dict[str, int] = {}
+    for row in parsed_result.get("events", []):
+        name = row.get("dataSource.name")
+        if name:
+            parsed_by_source[name] = row.get("parsed", 0)
+
+    rows = volume_result.get("events", [])
+    db.query(ActiveSource).delete()
+    synced_at = datetime.utcnow()
+    seen = 0
+    for row in rows:
+        name = row.get("dataSource.name")
+        if name and name not in _S1_NATIVE_SOURCES:
+            db.add(ActiveSource(
+                source_name=name,
+                event_count=row.get("events", 0),
+                synced_at=synced_at,
+                parser_detected=parsed_by_source.get(name, 0),
+            ))
+            seen += 1
+    db.commit()
+    return {"synced": seen, "sources": [r["dataSource.name"] for r in rows if r.get("dataSource.name") and r["dataSource.name"] not in _S1_NATIVE_SOURCES]}
+
+
+def _build_parser_ds_index() -> dict[str, dict]:
+    """
+    Read all parser files from /app/parsers/ and build an index:
+      dataSource.name (exact, from parser attributes) → {parser_name, format_type}
+
+    Format type is "grok", "dottedJson", or "custom".
+    Sources with grok/dottedJson parsers are flagged as needing a proper parser.
+    """
+    import os, re
+    parsers_dir = "/app/parsers"
+    _DS_NAME_RE = re.compile(r'"dataSource\.name"\s*:\s*"([^"]+)"')
+    _FORMAT_TYPE_RE = re.compile(r'"type"\s*:\s*"([^"]+)"')
+
+    index: dict[str, dict] = {}
+    try:
+        entries = [e for e in os.scandir(parsers_dir) if e.is_file() and not e.name.startswith(".")]
+    except FileNotFoundError:
+        return index
+
+    for entry in entries:
+        try:
+            with open(entry.path, "r", encoding="utf-8", errors="replace") as fh:
+                content = fh.read()
+        except Exception:
+            continue
+
+        # Extract dataSource.name (may appear multiple times — take first)
+        ds_match = _DS_NAME_RE.search(content)
+        if not ds_match:
+            continue
+        ds_name = ds_match.group(1).strip()
+
+        # Determine format type — look for grok/dottedJson/custom in "type" values
+        format_types = {m.group(1).lower() for m in _FORMAT_TYPE_RE.finditer(content)}
+        if "grok" in format_types:
+            fmt = "grok"
+        elif "dottedjson" in format_types:
+            fmt = "dottedJson"
+        else:
+            fmt = "custom"
+
+        index[ds_name] = {"parser_name": entry.name, "format_type": fmt}
+
+    return index
+
+
+@router.get("/map")
+def get_coverage_map(db: Session = Depends(get_db)):
+    """
+    Source-centric coverage map.
+    For each active dataSource.name in the SDL:
+      - covered       = a custom parser is loaded for it (dataSource.name matches)
+      - parser_needed = no parser, OR parser uses grok/dottedJson format
+    Also surfaces which STAR rules reference each source.
+    """
+    active_sources = db.query(ActiveSource).order_by(ActiveSource.event_count.desc()).all()
+    parser_fields_rows = db.query(ParserField).all()
+    rules = db.query(ParsedRule).all()
+
+    # parser_name → set of field names (for field count display)
+    parser_index: dict[str, set] = {}
+    for pf in parser_fields_rows:
+        parser_index.setdefault(pf.parser_name, set()).add(pf.field_name)
+
+    # Build dataSource.name → {parser_name, format_type} index from parser files
+    ds_index = _build_parser_ds_index()
+
+    def _normalize(s: str) -> str:
+        return s.lower().replace(" ", "").replace("-", "").replace("_", "").replace(".", "")
+
+    def _find_parser_info(source_name: str) -> dict | None:
+        """
+        Match priority:
+          1. Exact dataSource.name match
+          2. Normalized substring: active source name ↔ parser dataSource.name
+          3. Normalized substring: active source name ↔ parser filename
+             (catches cases where the parser file has a wrong dataSource.name)
+        """
+        # 1. Exact match on dataSource.name
+        if source_name in ds_index:
+            return ds_index[source_name]
+        sn = _normalize(source_name)
+        # 2. Normalized ds_name substring
+        for ds_name, info in ds_index.items():
+            if _normalize(ds_name) in sn or sn in _normalize(ds_name):
+                return info
+        # 3. Normalized filename substring
+        for info in ds_index.values():
+            if _normalize(info["parser_name"]) in sn or sn in _normalize(info["parser_name"]):
+                return info
+        return None
+
+    # Fields each rule needs: rule.name → set of field names
+    rule_fields_index: dict[str, set] = {
+        rule.name: set(rule.fields_used or []) for rule in rules
+    }
+
+    # Build rule index: source_name → rules that reference it
+    rule_by_source: dict[str, list] = {}
+    for rule in rules:
+        try:
+            raw_data = json.loads(rule.raw) if rule.raw else {}
+        except Exception:
+            raw_data = {}
+
+        if rule.rule_type == "library":
+            # Library rules store pre-extracted data_sources list in raw
+            data_sources = raw_data.get("data_sources", [])
+        else:
+            query_texts = _star_query_texts(raw_data)
+            data_sources = rule_parser.extract_data_sources(query_texts)
+
+        for ds in data_sources:
+            rule_by_source.setdefault(ds, []).append({"rule": rule.name, "type": rule.rule_type})
+
+    # Fields to ignore when computing "missing" — these are metadata/schema fields
+    # always present in events regardless of the parser
+    _SCHEMA_FIELDS = {
+        "dataSource.name", "dataSource.vendor", "dataSource.category",
+        "event.type", "timestamp", "src.endpoint.ip", "src.endpoint.name",
+        # Endpoint agent fields — populated by the SentinelOne agent, not by SDL parsers
+        "cmdScript.content", "endpoint.os", "endpoint.name", "endpoint.uid",
+    }
+
+    sources_out = []
+    covered_count = 0
+    needed_count = 0
+
+    for src in active_sources:
+        parser_info = _find_parser_info(src.source_name)
+        parser_in_data = (src.parser_detected or 0) > 0
+
+        if parser_info and parser_info["format_type"] == "custom":
+            status = "covered"
+            matched_parser = parser_info["parser_name"]
+            format_type = "custom"
+        elif parser_info and parser_info["format_type"] in ("grok", "dottedJson") and not parser_in_data:
+            # Known parser but primitive format and no evidence of parsing in data
+            status = "parser_needed"
+            matched_parser = parser_info["parser_name"]
+            format_type = parser_info["format_type"]
+        elif parser_in_data:
+            # Parsed fields detected in the data lake — a parser is running
+            status = "covered"
+            matched_parser = parser_info["parser_name"] if parser_info else "detected in data"
+            format_type = parser_info["format_type"] if parser_info else "unknown"
+        else:
+            status = "parser_needed"
+            matched_parser = None
+            format_type = None
+
+        if status == "covered":
+            covered_count += 1
+        else:
+            needed_count += 1
+
+        rules_for_src: list = [r for r in rule_by_source.get(src.source_name, []) if r["type"] == "library"]
+
+        # Close-match suggestions — shown when there are no library rules for this source.
+        close_matches: list = []
+        if not rules_for_src:
+            import re as _re
+
+            def _word_tokens(s: str) -> set:
+                """Split on non-alphanumeric boundaries, lowercase, drop single chars."""
+                return {t for t in _re.split(r"[^a-z0-9]+", s.lower()) if len(t) >= 2}
+
+            def _is_close(a: str, b: str) -> bool:
+                na, nb = _normalize(a), _normalize(b)
+                # 1. Simple substring match
+                if na in nb or nb in na:
+                    return True
+                # 2. Token-level: handles "Microsoft 365 Collaboration" vs "Microsoft O365"
+                #    — "365" is inside "o365", and they share "microsoft"
+                ta, tb = _word_tokens(a), _word_tokens(b)
+                shared_exact = ta & tb
+                if not shared_exact:
+                    return False  # Must share at least one word exactly
+                # Check that a DISTINCTIVE (non-shared) token from one name
+                # appears as a substring inside a token from the other.
+                # This avoids matching "Azure AD" to "Azure Platform" on "azure" alone.
+                unique_a = ta - shared_exact
+                unique_b = tb - shared_exact
+                return any(
+                    ua in ub or ub in ua
+                    for ua in unique_a for ub in unique_b
+                    if len(ua) >= 2 and len(ub) >= 2
+                )
+
+            sn = _normalize(src.source_name)
+            for lib_ds, lib_rules in rule_by_source.items():
+                lib_only = [r for r in lib_rules if r["type"] == "library"]
+                if not lib_only:
+                    continue
+                if _is_close(src.source_name, lib_ds):
+                    close_matches.append({
+                        "library_name": lib_ds,
+                        "rule_count": len(lib_only),
+                    })
+            close_matches.sort(key=lambda x: x["rule_count"], reverse=True)
+            close_matches = close_matches[:3]
+
+        # Count how many rules reference each field (frequency)
+        field_freq: dict[str, int] = {}
+        for r in rules_for_src:
+            for f in rule_fields_index.get(r["rule"], set()):
+                field_freq[f] = field_freq.get(f, 0) + 1
+
+        # Fields the parser provides
+        parser_provides = parser_index.get(matched_parser, set()) if matched_parser and matched_parser != "detected in data" else set()
+
+        # Minimum number of rules that must reference a field before we flag it.
+        # Scales with rule count so single-rule oddities don't dominate.
+        rule_count = len(rules_for_src)
+        min_rules = max(2, round(rule_count * 0.05)) if rule_count >= 10 else 2
+
+        # Missing = dotted-path fields needed by >= min_rules rules,
+        # not in schema constants, not provided by the parser.
+        missing_fields = sorted(
+            f for f, count in field_freq.items()
+            if count >= min_rules
+            and "." in f
+            and f not in _SCHEMA_FIELDS
+            and f not in parser_provides
+        )
+
+        sources_out.append({
+            "source_name": src.source_name,
+            "event_count": src.event_count,
+            "status": status,
+            "parser": matched_parser,
+            "format_type": format_type,
+            "parser_fields": len(parser_provides),
+            "parser_detected": src.parser_detected or 0,
+            "rules": rules_for_src,
+            "rule_count": len(rules_for_src),
+            "close_matches": close_matches,
+            "missing_fields": missing_fields,
+            "missing_fields_count": len(missing_fields),
+            "synced_at": src.synced_at.isoformat() if src.synced_at else None,
+        })
+
+    synced_at = active_sources[0].synced_at.isoformat() if active_sources else None
+
+    return {
+        "summary": {
+            "active_sources": len(active_sources),
+            "covered": covered_count,
+            "parser_needed": needed_count,
+            "parsers_loaded": len(parser_index),
+            "rules_loaded": len(rules),
+        },
+        "sources": sources_out,
+        "synced_at": synced_at,
+        "has_sources": len(active_sources) > 0,
+    }
+
+
+@router.delete("/reset")
+def reset_data(db: Session = Depends(get_db)):
+    db.query(ParsedRule).delete()
+    db.query(ParserField).delete()
+    db.commit()
+    return {"cleared": True}
@@ -0,0 +1,122 @@
+from datetime import datetime, timedelta
+from fastapi import APIRouter, Query, HTTPException
+from pydantic import BaseModel
+from services import s1_client
+
+router = APIRouter()
+
+
+def _date_range(days: int) -> tuple[str, str]:
+    now = datetime.utcnow()
+    return (
+        (now - timedelta(days=days)).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+        now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+    )
+
+
+def _date_range_hours(hours: int) -> tuple[str, str]:
+    now = datetime.utcnow()
+    return (
+        (now - timedelta(hours=hours)).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+        now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+    )
+
+
+@router.get("/top-sources")
+async def get_top_sources(
+    days: int = Query(None, ge=1, le=90),
+    hours: int = Query(None, ge=1, le=24),
+):
+    """Top log sources by event count over the given period."""
+    if hours is not None:
+        from_dt, to_dt = _date_range_hours(hours)
+        period_label = f"{hours}h"
+    else:
+        from_dt, to_dt = _date_range(days or 7)
+        period_label = f"{days or 7}d"
+    query = "| group events=count() by dataSource.name | sort -events | limit 25"
+    try:
+        result = await s1_client.run_powerquery(query, from_dt, to_dt)
+    except Exception as e:
+        raise HTTPException(502, f"PowerQuery error: {e}")
+    return {"period": period_label, "data": result.get("events", [])}
+
+
+@router.get("/by-event-type")
+async def get_by_event_type(days: int = Query(7, ge=1, le=90)):
+    """Event counts grouped by source and event type."""
+    from_dt, to_dt = _date_range(days)
+    query = "| group events=count() by dataSource.name, event.type | sort -events | limit 100"
+    try:
+        result = await s1_client.run_powerquery(query, from_dt, to_dt)
+    except Exception as e:
+        raise HTTPException(502, f"PowerQuery error: {e}")
+    return {"period_days": days, "data": result.get("events", [])}
+
+
+@router.get("/daily-volume")
+async def get_daily_volume(days: int = Query(5, ge=1, le=7)):
+    """Total event count per day — queries run in parallel."""
+    import asyncio
+
+    now = datetime.utcnow()
+    points = min(days, 7)
+
+    async def _fetch_day(i: int) -> dict:
+        day_from = (now - timedelta(days=i + 1)).strftime("%Y-%m-%dT00:00:00.000Z")
+        day_to   = (now - timedelta(days=i)).strftime("%Y-%m-%dT00:00:00.000Z")
+        label    = (now - timedelta(days=i + 1)).strftime("%Y-%m-%d")
+        try:
+            result = await s1_client.run_powerquery("| group total=count()", day_from, day_to)
+            events_list = result.get("events", []) if isinstance(result, dict) else []
+            count = events_list[0].get("total", 0) if events_list else 0
+        except Exception:
+            count = 0
+        return {"date": label, "events": count}
+
+    results = await asyncio.gather(*[_fetch_day(i) for i in range(points)])
+    return list(reversed(results))
+
+
+class FilterRule(BaseModel):
+    source: str = ""
+    event_type: str = ""
+    days: int = 7
+    gb_per_million_events: float = 0.5
+
+
+@router.post("/simulate-filter")
+async def simulate_filter(rule: FilterRule):
+    """Estimate how many events and GB would be eliminated by an exclusion filter."""
+    from_dt, to_dt = _date_range(rule.days)
+
+    clauses = []
+    if rule.source:
+        clauses.append(f"dataSource.name=='{rule.source}'")
+    if rule.event_type:
+        clauses.append(f"event.type=='{rule.event_type}'")
+
+    if clauses:
+        filter_expr = " and ".join(clauses)
+        query = f"| filter {filter_expr} | group events=count()"
+    else:
+        query = "| group events=count()"
+
+    try:
+        result = await s1_client.run_powerquery(query, from_dt, to_dt)
+        events = (result.get("events") or [{}])[0].get("events", 0) if isinstance(result.get("events"), list) else 0
+    except Exception as e:
+        raise HTTPException(502, f"PowerQuery error: {e}")
+
+    estimated_gb = round(events / 1_000_000 * rule.gb_per_million_events, 3)
+    monthly_events = int(events / rule.days * 30)
+    monthly_gb = round(monthly_events / 1_000_000 * rule.gb_per_million_events, 2)
+
+    return {
+        "period_days": rule.days,
+        "matched_events": events,
+        "estimated_gb_period": estimated_gb,
+        "projected_monthly_events": monthly_events,
+        "projected_monthly_gb": monthly_gb,
+        "filter": {"source": rule.source, "event_type": rule.event_type},
+    }
@@ -0,0 +1,440 @@
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+from datetime import datetime, timedelta
+from services import s1_client
+import os
+import re
+
+router = APIRouter()
+
+
+PARSERS_DIR = "/app/parsers"
+
+
+@router.get("/parsers")
+def list_parser_files():
+    """List parser filenames available under /app/parsers/ for the Test Runner."""
+    try:
+        names = sorted(
+            e.name for e in os.scandir(PARSERS_DIR)
+            if e.is_file() and not e.name.startswith(".")
+        )
+    except FileNotFoundError:
+        names = []
+    return {"parsers": names, "count": len(names)}
+
+
+@router.post("/sync-from-sdl")
+async def sync_parsers_from_sdl():
+    """Download every parser file under /logParsers/ on the SDL tenant into
+    /app/parsers/. After this call returns, the Parser Test Runner dropdown
+    will include all tenant parsers (including custom ones).
+
+    Requires SDL_CONFIG_READ_KEY in .env (Configuration Read scope on the
+    Data Lake API key).
+    """
+    if not s1_client.SDL_CONFIG_READ_KEY:
+        raise HTTPException(
+            400,
+            "SDL_CONFIG_READ_KEY is not set in .env. Generate a Data Lake API key "
+            "with 'Configuration Read' scope in the S1 console and add it to .env."
+        )
+
+    try:
+        names = await s1_client.list_sdl_parsers()
+    except Exception as e:
+        raise HTTPException(502, f"SDL listFiles failed: {e}")
+
+    os.makedirs(PARSERS_DIR, exist_ok=True)
+    downloaded: list[str] = []
+    errors: list[dict] = []
+
+    for name in names:
+        # The path on SDL is /logParsers/<name>; we write to /app/parsers/<sanitized-name>.
+        safe_name = name.replace("/", "_")
+        try:
+            resp = await s1_client.get_sdl_parser(name)
+            content = resp.get("content")
+            if content is None:
+                errors.append({"parser": name, "error": "no content field in response"})
+                continue
+            with open(os.path.join(PARSERS_DIR, safe_name), "w", encoding="utf-8") as fh:
+                fh.write(content)
+            downloaded.append(safe_name)
+        except Exception as e:
+            errors.append({"parser": name, "error": str(e) or e.__class__.__name__})
+
+    return {
+        "downloaded": len(downloaded),
+        "parsers": downloaded,
+        "errors": errors,
+        "directory": PARSERS_DIR,
+    }
+
+
+def _date_range_hours(hours: int) -> tuple[str, str]:
+    now = datetime.utcnow()
+    return (
+        (now - timedelta(hours=hours)).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+        now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
+    )
+
+
+# ---------------------------------------------------------------------------
+# Models
+# ---------------------------------------------------------------------------
+
+class SampleEventsRequest(BaseModel):
+    source: str
+    limit: int = 20
+    hours: int = 1
+
+
+class FieldPopulationRequest(BaseModel):
+    source: str
+    hours: int = 24
+    fields: list[str] = [
+        "src.ip",
+        "src.port",
+        "dst.ip",
+        "dst.port",
+        "user.name",
+        "event.type",
+        "src.process.name",
+        "src.process.cmdline",
+        "tgt.file.path",
+        "network.direction",
+        "dataSource.name",
+    ]
+
+
+class TestParserRequest(BaseModel):
+    parser_name: str
+    log_line: str
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _flatten_dict(d: dict, prefix: str = "", out: dict | None = None) -> dict:
+    """Recursively flatten a nested dict into dotted keys."""
+    if out is None:
+        out = {}
+    if not isinstance(d, dict):
+        return out
+    for k, v in d.items():
+        key = f"{prefix}.{k}" if prefix else k
+        if isinstance(v, dict):
+            _flatten_dict(v, key, out)
+        else:
+            out[key] = v
+    return out
+
+
+def _flatten_event(event: dict) -> dict:
+    """Return a flat field→value dict from a PowerQuery result row.
+
+    If the row only carries a JSON-stringified payload in `message` (i.e. the
+    parser wasn't applied at query time), parse and flatten it inline so the
+    UI can measure field population accurately. The original raw `message`
+    is preserved under its own key.
+    """
+    if not isinstance(event, dict):
+        return {}
+    flat = dict(event)
+    msg = flat.get("message")
+    if isinstance(msg, str) and msg.startswith("{") and msg.endswith("}"):
+        try:
+            parsed = __import__("json").loads(msg)
+            if isinstance(parsed, dict):
+                flat.update(_flatten_dict(parsed))
+        except Exception:
+            pass
+    return flat
+
+
+def _extract_format_strings(content: str) -> list[str]:
+    """
+    Extract SDL format string values from augmented-JSON parser content.
+    Matches:  "format": "..."  (double-quoted value, supports escaped quotes).
+    """
+    pattern = re.compile(r'"format"\s*:\s*"((?:[^"\\]|\\.)*)"')
+    return pattern.findall(content)
+
+
+def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
+    """
+    Convert an SDL format string to a compiled Python regex.
+
+    Returns (compiled_pattern, py_group_to_sdl_field) mapping so callers can
+    translate group names back to the original SDL field names.
+
+    Raises re.error if the resulting pattern cannot be compiled.
+    """
+    # Split on $...$ tokens
+    token_pattern = re.compile(r'\$([^$]+)\$')
+    parts = token_pattern.split(fmt)
+    # parts alternates: literal, token, literal, token, ...
+
+    regex_parts: list[str] = []
+    py_group_to_sdl: dict[str, str] = {}
+    seen_groups: dict[str, int] = {}
+
+    for i, part in enumerate(parts):
+        if i % 2 == 0:
+            # Literal text
+            regex_parts.append(re.escape(part))
+        else:
+            # Token: either "field.name=PATTERN" or just "field.name"
+            if '=' in part:
+                field_name, pattern = part.split('=', 1)
+            else:
+                field_name = part
+                pattern = r'[^\s]+'
+
+            # Build a valid Python group name
+            safe = re.sub(r'[.\-]', '_', field_name)
+            if safe in seen_groups:
+                seen_groups[safe] += 1
+                safe = f"{safe}_{seen_groups[safe]}"
+            else:
+                seen_groups[safe] = 0
+
+            py_group_to_sdl[safe] = field_name
+            regex_parts.append(f'(?P<{safe}>{pattern})')
+
+    compiled = re.compile(''.join(regex_parts), re.IGNORECASE)
+    return compiled, py_group_to_sdl
+
+
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+
+@router.post("/sample-events")
+async def sample_events(req: SampleEventsRequest):
+    """Return a sample of raw events from a given data source."""
+    query = f'| filter dataSource.name = "{req.source}" | limit {req.limit}'
+    from_dt, to_dt = _date_range_hours(req.hours)
+
+    result = await s1_client.run_powerquery(query, from_dt, to_dt)
+
+    rows = result if isinstance(result, list) else (result.get("rows") or result.get("events") or [])
+    events = [_flatten_event(row) for row in rows]
+
+    return {
+        "source": req.source,
+        "events": events,
+        "count": len(events),
+        "hours": req.hours,
+    }
+
+
+@router.post("/field-population")
+async def field_population(req: FieldPopulationRequest):
+    """
+    Analyse how consistently each requested field is populated across a sample
+    of events from a data source.
+    """
+    query = f'| filter dataSource.name = "{req.source}" | limit 500'
+    from_dt, to_dt = _date_range_hours(req.hours)
+
+    result = await s1_client.run_powerquery(query, from_dt, to_dt)
+
+    rows = result if isinstance(result, list) else (result.get("rows") or result.get("events") or [])
+    events = [_flatten_event(row) for row in rows]
+
+    if not events:
+        raise HTTPException(status_code=404, detail=f"No events found for source '{req.source}' in the last {req.hours} hours.")
+
+    total = len(events)
+    _empty = {None, "", "null"}
+
+    # Collect all field names seen across the sample (useful for surfacing what IS there)
+    all_seen_fields = sorted({k for ev in events for k in ev})
+
+    field_stats = []
+    for field in req.fields:
+        # dataSource.name is always 100% — we filtered by it; Scalyr just doesn't echo it back
+        if field == "dataSource.name":
+            populated = total
+        else:
+            populated = sum(1 for ev in events if ev.get(field) not in _empty)
+        rate = round((populated / total) * 100, 1)
+        field_stats.append({
+            "field": field,
+            "populated": populated,
+            "total": total,
+            "rate": rate,
+        })
+
+    # Sort ascending by rate (worst coverage first)
+    field_stats.sort(key=lambda x: x["rate"])
+
+    return {
+        "source": req.source,
+        "total_sampled": total,
+        "hours": req.hours,
+        "fields": field_stats,
+        "fields_seen_in_sample": all_seen_fields,
+    }
+
+
+@router.post("/test-parser")
+async def test_parser(req: TestParserRequest):
+    """
+    Test a parser against a raw log line by extracting and matching SDL format
+    strings found in the parser file.
+    """
+    parser_path = f"/app/parsers/{req.parser_name}"
+
+    try:
+        with open(parser_path, "r", encoding="utf-8") as fh:
+            content = fh.read()
+    except FileNotFoundError:
+        raise HTTPException(status_code=404, detail=f"Parser file not found: {req.parser_name}")
+    except OSError as exc:
+        raise HTTPException(status_code=500, detail=f"Could not read parser file: {exc}")
+
+    format_strings = _extract_format_strings(content)
+
+    # ── JSON auto-extract path ──────────────────────────────────────────────
+    # SDL parsers that use `$=json{parse=json}$` (or any format containing
+    # `parse=json`) auto-extract every top-level JSON key as an attribute.
+    # The regex-based path can't model that — handle it explicitly so users
+    # can test JSON-shaped logs against JSON-mode parsers.
+    log_input = req.log_line.strip()
+    is_json_mode = any("parse=json" in f for f in format_strings) or log_input.startswith("{")
+    if is_json_mode:
+        import json as _json
+        # Support multi-line input (one JSON object per line, or a JSON array)
+        lines = [ln for ln in (l.strip() for l in log_input.splitlines()) if ln]
+        payloads: list[dict] = []
+        parse_errors: list[str] = []
+        # Single line: try direct parse; if it's a JSON array, expand.
+        if len(lines) == 1:
+            try:
+                obj = _json.loads(lines[0])
+            except Exception as e:
+                return {
+                    "parser_name": req.parser_name,
+                    "matched": False,
+                    "message": f"Parser expects JSON but log line could not be parsed as JSON: {e}",
+                    "fields": [],
+                }
+            if isinstance(obj, list):
+                payloads = [x for x in obj if isinstance(x, dict)]
+            elif isinstance(obj, dict):
+                payloads = [obj]
+            else:
+                return {
+                    "parser_name": req.parser_name,
+                    "matched": False,
+                    "message": "Parser expects a JSON object (got scalar).",
+                    "fields": [],
+                }
+        else:
+            # Multi-line: one JSON object per line (NDJSON)
+            for i, ln in enumerate(lines, 1):
+                try:
+                    obj = _json.loads(ln)
+                    if isinstance(obj, dict):
+                        payloads.append(obj)
+                    else:
+                        parse_errors.append(f"line {i}: not a JSON object")
+                except Exception as e:
+                    parse_errors.append(f"line {i}: {e}")
+
+        if not payloads:
+            return {
+                "parser_name": req.parser_name,
+                "matched": False,
+                "message": "No valid JSON objects found. " + " | ".join(parse_errors[:3]),
+                "fields": [],
+            }
+
+        # Use the first payload for the detail table; report totals.
+        payload = payloads[0]
+        extracted = _flatten_dict(payload)
+        # Apply lightweight rewrites if present (input/output/match/replace blocks).
+        # We only handle simple literal/regex matches with $0 or string replacements;
+        # this is best-effort, intended for quick visual verification.
+        rewrites_applied = []
+        rewrite_re = re.compile(
+            r'\{\s*input:\s*"([^"]+)"\s*,\s*output:\s*"([^"]+)"\s*,\s*match:\s*"((?:[^"\\]|\\.)*)"\s*,\s*replace:\s*"((?:[^"\\]|\\.)*)"\s*\}',
+            re.DOTALL,
+        )
+        derived: dict[str, str] = {}
+        for m in rewrite_re.finditer(content):
+            in_field, out_field, match_pat, replace_val = m.group(1), m.group(2), m.group(3), m.group(4)
+            src_val = extracted.get(in_field)
+            if src_val is None:
+                continue
+            try:
+                m2 = re.search(match_pat, str(src_val))
+            except re.error:
+                continue
+            if not m2:
+                continue
+            # SDL uses $0 for whole match, $1.. for groups. Translate to Python
+            # \g<0>, \g<1>, ... so re.sub doesn't read \0 as a null byte.
+            def _to_py_backref(s: str) -> str:
+                return re.sub(r"\$(\d+)", lambda mm: f"\\g<{mm.group(1)}>", s)
+            try:
+                val = re.sub(match_pat, _to_py_backref(replace_val), str(src_val), count=1)
+            except re.error:
+                val = replace_val
+            derived[out_field] = val
+            rewrites_applied.append({
+                "input": in_field, "input_value": src_val,
+                "output": out_field, "matched_on": match_pat, "result": val,
+            })
+
+        fields = (
+            [{"field": k, "value": v, "source": "json-extract"} for k, v in sorted(extracted.items())]
+            + [{"field": k, "value": v, "source": "rewrite"}     for k, v in sorted(derived.items())]
+        )
+        return {
+            "parser_name": req.parser_name,
+            "matched": True,
+            "mode": "json",
+            "format_matched": "$=json{parse=json}$",
+            "fields": fields,
+            "rewrites_applied": rewrites_applied,
+            "extracted_count": len(extracted),
+            "derived_count": len(derived),
+            "payload_count": len(payloads),
+            "parse_errors": parse_errors,
+            "showing_payload": 1,
+        }
+
+    # ── Regex format-string path (original) ─────────────────────────────────
+    for fmt in format_strings:
+        try:
+            compiled, py_to_sdl = _sdl_format_to_regex(fmt)
+        except re.error:
+            # Skip unparseable format strings
+            continue
+
+        match = compiled.search(req.log_line)
+        if match:
+            fields = [
+                {"field": py_to_sdl.get(group, group), "value": value}
+                for group, value in match.groupdict().items()
+                if value is not None
+            ]
+            return {
+                "parser_name": req.parser_name,
+                "matched": True,
+                "mode": "regex",
+                "format_matched": fmt,
+                "fields": fields,
+            }
+
+    return {
+        "parser_name": req.parser_name,
+        "matched": False,
+        "message": "No format pattern matched",
+        "fields": [],
+    }
@@ -0,0 +1,105 @@
+import os
+import re
+from pathlib import Path
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+
+router = APIRouter()
+
+ENV_FILE = Path(os.environ.get("ENV_FILE_PATH", "/app/.env"))
+
+# Fields we expose in the UI — order matters for display
+FIELDS = [
+    {"key": "S1_BASE_URL",          "label": "Console URL",                   "secret": False, "placeholder": "https://demo.sentinelone.net"},
+    {"key": "S1_API_TOKEN",         "label": "Console API Token",             "secret": True,  "placeholder": "eyJ..."},
+    {"key": "SDL_XDR_URL",          "label": "SDL XDR URL",                   "secret": False, "placeholder": "https://xdr.us1.sentinelone.net"},
+    {"key": "SDL_LOG_READ_KEY",     "label": "SDL Log Read Key",              "secret": True,  "placeholder": "1DnK0Y4e..."},
+    {"key": "ANTHROPIC_API_KEY",    "label": "Anthropic API Key",             "secret": True,  "placeholder": "sk-ant-..."},
+]
+
+FIELD_KEYS = {f["key"] for f in FIELDS}
+
+
+def _read_env() -> dict[str, str]:
+    """Read .env file into a dict."""
+    vals: dict[str, str] = {}
+    if ENV_FILE.exists():
+        for line in ENV_FILE.read_text().splitlines():
+            line = line.strip()
+            if line and not line.startswith("#") and "=" in line:
+                k, _, v = line.partition("=")
+                vals[k.strip()] = v.strip()
+    return vals
+
+
+def _write_env(updates: dict[str, str]) -> None:
+    """Write updates into .env, preserving comments and unknown keys."""
+    existing_lines: list[str] = []
+    if ENV_FILE.exists():
+        existing_lines = ENV_FILE.read_text().splitlines()
+
+    written: set[str] = set()
+    new_lines: list[str] = []
+
+    for line in existing_lines:
+        stripped = line.strip()
+        if stripped and not stripped.startswith("#") and "=" in stripped:
+            k, _, _ = stripped.partition("=")
+            k = k.strip()
+            if k in updates:
+                new_lines.append(f"{k}={updates[k]}")
+                written.add(k)
+                continue
+        new_lines.append(line)
+
+    # Append any new keys not already in the file
+    for k, v in updates.items():
+        if k not in written:
+            new_lines.append(f"{k}={v}")
+
+    ENV_FILE.write_text("\n".join(new_lines) + "\n")
+
+
+@router.get("/config")
+async def get_config():
+    """Return current config values. Secrets are masked."""
+    env_vals = _read_env()
+    result = []
+    for f in FIELDS:
+        key = f["key"]
+        # Prefer live env var, fall back to .env file value
+        raw = os.environ.get(key, env_vals.get(key, ""))
+        if f["secret"] and raw:
+            # Show first 6 + last 4 chars, mask middle
+            masked = raw[:6] + "•" * max(4, len(raw) - 10) + raw[-4:] if len(raw) > 10 else "••••••••"
+        else:
+            masked = raw
+        result.append({
+            "key": key,
+            "label": f["label"],
+            "secret": f["secret"],
+            "placeholder": f["placeholder"],
+            "value": masked,
+            "set": bool(raw),
+        })
+    env_file_exists = ENV_FILE.exists()
+    return {"fields": result, "env_file_exists": env_file_exists, "env_file_path": str(ENV_FILE)}
+
+
+class ConfigUpdate(BaseModel):
+    updates: dict[str, str]
+
+
+@router.post("/config")
+async def save_config(body: ConfigUpdate):
+    """Save config values to .env file. Only known keys accepted."""
+    bad = [k for k in body.updates if k not in FIELD_KEYS]
+    if bad:
+        raise HTTPException(400, f"Unknown keys: {bad}")
+    if not ENV_FILE.parent.exists():
+        raise HTTPException(503, f"Cannot write to {ENV_FILE} — check Docker volume mount")
+    try:
+        _write_env(body.updates)
+    except Exception as e:
+        raise HTTPException(500, f"Failed to write .env: {e}")
+    return {"saved": list(body.updates.keys()), "restart_required": True}