v0.1 Mick Marc merged

This commit is contained in:
marc
2026-05-20 23:44:53 +02:00
commit 79efb6bf7d
50 changed files with 5190 additions and 0 deletions
View File
+648
View File
@@ -0,0 +1,648 @@
import json
import os
from fastapi import APIRouter, UploadFile, File, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy.orm import Session
from datetime import datetime
from db import get_db, ParsedRule, ParserField, ActiveSource
from services import s1_client, rule_parser
DETECTIONS_FILE = os.environ.get("DETECTIONS_FILE", "/app/data/detections.json")
router = APIRouter()
def _star_query_texts(rule: dict) -> list[str]:
"""
Extract all PowerQuery/filter strings from a STAR rule.
Handles simple rules (s1ql) and correlation rules (subQueries[].subQuery).
"""
texts = []
# Simple rules
for field in ("s1ql", "queryLang", "query", "powerQuery"):
v = rule.get(field)
# queryLang "2.0" is a version string, not a query — skip short strings
if v and isinstance(v, str) and len(v) > 5:
texts.append(v)
# Correlation rules: subQueries[].subQuery
cp = rule.get("correlationParams") or {}
for sq in cp.get("subQueries", []):
v = sq.get("subQuery")
if v and isinstance(v, str):
texts.append(v)
# Also handle older conditions[] format
for cond in cp.get("conditions", []):
for key in ("filter", "query", "subQuery"):
v = cond.get(key)
if v and isinstance(v, str):
texts.append(v)
return texts
@router.post("/load-star-rules")
async def load_star_rules(db: Session = Depends(get_db)):
"""Fetch all STAR rules from the Management Console API and index their fields."""
try:
rules = await s1_client.get_star_rules()
except Exception as e:
raise HTTPException(502, f"S1 API error: {type(e).__name__}: {e}")
# Replace all existing STAR rules cleanly to avoid duplicate key errors
db.query(ParsedRule).filter_by(rule_type="star").delete()
db.flush()
loaded = []
for rule in rules:
all_fields: set = set()
for qt in _star_query_texts(rule):
all_fields |= rule_parser.extract_star_fields(qt)
fields = list(all_fields)
record = ParsedRule(
rule_id=str(rule.get("id", "")),
name=rule.get("name", "unnamed"),
rule_type="star",
fields_used=fields,
raw=json.dumps(rule),
)
db.add(record)
loaded.append({"id": record.rule_id, "name": record.name, "fields": fields})
db.commit()
return {"loaded": len(loaded), "rules": loaded}
_EXCLUDED_PATHS = ("/rules/silent/", "/rules/dev/")
def _import_from_api_rules(db, rules: list) -> int:
"""
Import platform rules fetched directly from the S1 API into the database.
Each rule has a 'sources' list — the authoritative dataSource.name values.
"""
db.query(ParsedRule).filter_by(rule_type="library").delete()
db.commit()
loaded = 0
seen_ids: set = set()
for rule in rules:
rule_id = str(rule.get("id", f"lib_{loaded}"))
if rule_id in seen_ids:
continue
seen_ids.add(rule_id)
sources = rule.get("sources") or []
db.add(ParsedRule(
rule_id=rule_id,
name=rule.get("name", "unnamed"),
rule_type="library",
fields_used=[], # API rules don't expose field-level info
raw=json.dumps({"data_sources": sources}),
))
loaded += 1
if loaded % 500 == 0:
db.flush()
db.commit()
return loaded
def _import_detections(db, detections_file: str) -> int:
"""
Import library detection rules from extracted.json into the database.
Replaces any existing library rules. Returns the count of rules loaded.
"""
with open(detections_file, "r", encoding="utf-8") as fh:
data = json.load(fh)
results = data.get("results", [])
results = [r for r in results if not any(r.get("file", "").startswith(p) for p in _EXCLUDED_PATHS)]
db.query(ParsedRule).filter_by(rule_type="library").delete()
db.commit()
loaded = 0
seen_ids: set = set()
for rule in results:
all_fields: set = set()
data_sources: list[str] = []
for q in rule.get("queries", []):
all_fields.update(q.get("keys", []))
ds_vals = q.get("pairs", {}).get("dataSource.name", [])
for v in ds_vals:
if isinstance(v, str):
data_sources.append(v)
elif isinstance(v, list):
data_sources.extend(str(x) for x in v)
rule_id = str(rule.get("id", f"lib_{loaded}"))
if rule_id in seen_ids:
continue
seen_ids.add(rule_id)
db.add(ParsedRule(
rule_id=rule_id,
name=rule.get("name", "unnamed"),
rule_type="library",
fields_used=list(all_fields),
raw=json.dumps({"data_sources": list(set(data_sources))}),
))
loaded += 1
if loaded % 500 == 0:
db.flush()
db.commit()
return loaded
@router.post("/load-detections")
async def load_detections(db: Session = Depends(get_db)):
"""
Reload detection library rules.
Tries the live S1 API first (platform-rules endpoint); falls back to extracted.json.
"""
# Prefer the live API — gives accurate 'sources' and is always up to date
try:
rules = await s1_client.get_platform_rules()
if rules:
loaded = _import_from_api_rules(db, rules)
return {"loaded": loaded, "source": "api"}
except Exception:
pass
# Fall back to local extracted.json
if not os.path.exists(DETECTIONS_FILE):
raise HTTPException(
404,
"S1 API unavailable and no detections file found — "
"ensure the data/ volume is mounted with detections.json"
)
try:
loaded = _import_detections(db, DETECTIONS_FILE)
except Exception as e:
raise HTTPException(500, f"Failed to import detections: {e}")
return {"loaded": loaded, "source": "file"}
@router.post("/upload-sigma")
async def upload_sigma(files: list[UploadFile] = File(...), db: Session = Depends(get_db)):
"""Upload one or more Sigma YAML files and index their fields."""
loaded = []
for file in files:
content = (await file.read()).decode("utf-8", errors="replace")
fields = list(rule_parser.extract_sigma_fields(content))
record = ParsedRule(
rule_id=f"sigma_{file.filename}",
name=file.filename or "unnamed",
rule_type="sigma",
fields_used=fields,
raw=content,
)
db.merge(record)
loaded.append({"name": file.filename, "fields": fields})
db.commit()
return {"loaded": len(loaded), "rules": loaded}
@router.post("/load-parsers-from-sdl")
async def load_parsers_from_sdl(db: Session = Depends(get_db)):
"""
Load SDL parsers from the local /app/parsers directory (mounted from ./parsers/).
Files are placed there by the MCP-based loader or by manual copy.
Falls back to a clear error if the directory is empty.
"""
import os
parsers_dir = "/app/parsers"
try:
entries = [
e for e in os.scandir(parsers_dir)
if e.is_file() and not e.name.startswith(".")
]
except FileNotFoundError:
raise HTTPException(503, "parsers/ directory not found — check Docker volume mount")
if not entries:
raise HTTPException(
422,
"No parser files found in parsers/ directory. "
"Use 'Load SDL Parsers via MCP' in Claude Code to populate it, "
"or upload a parser file manually."
)
loaded = []
errors = []
for entry in entries:
try:
with open(entry.path, "r", encoding="utf-8", errors="replace") as fh:
content = fh.read()
fields: set = set()
try:
import json as _json
parser_data = _json.loads(content)
fields = rule_parser.extract_parser_fields(parser_data)
except Exception:
pass
fields |= rule_parser.extract_parser_fields_from_content(content)
name = entry.name
db.query(ParserField).filter_by(parser_name=name).delete()
for f in fields:
db.add(ParserField(parser_name=name, field_name=f, field_type="string"))
loaded.append({"parser": name, "fields": list(fields), "field_count": len(fields)})
except Exception as e:
errors.append({"parser": entry.name, "error": str(e)})
db.commit()
return {"loaded": len(loaded), "parsers": loaded, "errors": errors}
@router.post("/upload-parser")
async def upload_parser(file: UploadFile = File(...), db: Session = Depends(get_db)):
"""Upload an SDL parser JSON file and index its output fields."""
raw_bytes = await file.read()
content_str = raw_bytes.decode("utf-8", errors="replace")
# Try structured JSON extraction first, fall back to content-string extraction
fields: set = set()
try:
parser_data = json.loads(content_str)
fields = rule_parser.extract_parser_fields(parser_data)
except json.JSONDecodeError:
pass
# Always also run content-string extraction (catches $field$ SDL format strings)
fields |= rule_parser.extract_parser_fields_from_content(content_str)
db.query(ParserField).filter_by(parser_name=file.filename).delete()
for f in fields:
db.add(ParserField(parser_name=file.filename, field_name=f, field_type="string"))
db.commit()
return {"parser": file.filename, "fields": list(fields)}
class ParserContentPayload(BaseModel):
parser_name: str
content: str # raw SDL parser file content as string
@router.post("/load-parser-content")
async def load_parser_content(payload: ParserContentPayload, db: Session = Depends(get_db)):
"""
Accept raw SDL parser content (as a string) and index its output fields.
Used by MCP-based loader scripts since the SDL HTTP API endpoint is not
accessible from inside Docker with standard API token auth.
"""
fields: set = set()
# Try JSON parsing first (structured attributes/fields/mappings)
try:
parser_data = json.loads(payload.content)
fields = rule_parser.extract_parser_fields(parser_data)
except (json.JSONDecodeError, Exception):
pass
# Always run SDL format-string extraction ($field.name$ patterns)
fields |= rule_parser.extract_parser_fields_from_content(payload.content)
if not fields:
raise HTTPException(422, "No fields could be extracted from the parser content")
db.query(ParserField).filter_by(parser_name=payload.parser_name).delete()
for f in fields:
db.add(ParserField(parser_name=payload.parser_name, field_name=f, field_type="string"))
db.commit()
return {"parser": payload.parser_name, "fields": list(fields), "field_count": len(fields)}
# Native SentinelOne platform sources — parsed by the system, not by SDL parsers.
# Excluded from the coverage map as they do not require custom parser coverage.
_S1_NATIVE_SOURCES = {
"SentinelOne", "asset", "alert", "vulnerability",
"ActivityFeed", "indicator", "misconfiguration",
"SentinelOne Ranger AD",
}
@router.post("/sync-sources")
async def sync_sources(days: int = 7, db: Session = Depends(get_db)):
"""Pull active dataSource.names from the SDL and store them.
Also detects whether a parser is already producing structured fields
for each source by checking if event.type is populated in the data lake.
Native S1 platform sources are excluded as they do not require SDL parsers.
"""
import asyncio
from datetime import datetime, timedelta
now = datetime.utcnow()
from_dt = (now - timedelta(days=days)).strftime("%Y-%m-%dT%H:%M:%S.000Z")
to_dt = now.strftime("%Y-%m-%dT%H:%M:%S.000Z")
try:
volume_result, parsed_result = await asyncio.gather(
s1_client.run_powerquery(
"| group events=count() by dataSource.name | sort -events | limit 200",
from_dt, to_dt
),
s1_client.run_powerquery(
"| filter event.type != '' | group parsed=count() by dataSource.name | limit 200",
from_dt, to_dt
),
)
except Exception as e:
raise HTTPException(502, f"PowerQuery error: {e}")
# Build lookup: source_name → count of parsed events seen
parsed_by_source: dict[str, int] = {}
for row in parsed_result.get("events", []):
name = row.get("dataSource.name")
if name:
parsed_by_source[name] = row.get("parsed", 0)
rows = volume_result.get("events", [])
db.query(ActiveSource).delete()
synced_at = datetime.utcnow()
seen = 0
for row in rows:
name = row.get("dataSource.name")
if name and name not in _S1_NATIVE_SOURCES:
db.add(ActiveSource(
source_name=name,
event_count=row.get("events", 0),
synced_at=synced_at,
parser_detected=parsed_by_source.get(name, 0),
))
seen += 1
db.commit()
return {"synced": seen, "sources": [r["dataSource.name"] for r in rows if r.get("dataSource.name") and r["dataSource.name"] not in _S1_NATIVE_SOURCES]}
def _build_parser_ds_index() -> dict[str, dict]:
"""
Read all parser files from /app/parsers/ and build an index:
dataSource.name (exact, from parser attributes) → {parser_name, format_type}
Format type is "grok", "dottedJson", or "custom".
Sources with grok/dottedJson parsers are flagged as needing a proper parser.
"""
import os, re
parsers_dir = "/app/parsers"
_DS_NAME_RE = re.compile(r'"dataSource\.name"\s*:\s*"([^"]+)"')
_FORMAT_TYPE_RE = re.compile(r'"type"\s*:\s*"([^"]+)"')
index: dict[str, dict] = {}
try:
entries = [e for e in os.scandir(parsers_dir) if e.is_file() and not e.name.startswith(".")]
except FileNotFoundError:
return index
for entry in entries:
try:
with open(entry.path, "r", encoding="utf-8", errors="replace") as fh:
content = fh.read()
except Exception:
continue
# Extract dataSource.name (may appear multiple times — take first)
ds_match = _DS_NAME_RE.search(content)
if not ds_match:
continue
ds_name = ds_match.group(1).strip()
# Determine format type — look for grok/dottedJson/custom in "type" values
format_types = {m.group(1).lower() for m in _FORMAT_TYPE_RE.finditer(content)}
if "grok" in format_types:
fmt = "grok"
elif "dottedjson" in format_types:
fmt = "dottedJson"
else:
fmt = "custom"
index[ds_name] = {"parser_name": entry.name, "format_type": fmt}
return index
@router.get("/map")
def get_coverage_map(db: Session = Depends(get_db)):
"""
Source-centric coverage map.
For each active dataSource.name in the SDL:
- covered = a custom parser is loaded for it (dataSource.name matches)
- parser_needed = no parser, OR parser uses grok/dottedJson format
Also surfaces which STAR rules reference each source.
"""
active_sources = db.query(ActiveSource).order_by(ActiveSource.event_count.desc()).all()
parser_fields_rows = db.query(ParserField).all()
rules = db.query(ParsedRule).all()
# parser_name → set of field names (for field count display)
parser_index: dict[str, set] = {}
for pf in parser_fields_rows:
parser_index.setdefault(pf.parser_name, set()).add(pf.field_name)
# Build dataSource.name → {parser_name, format_type} index from parser files
ds_index = _build_parser_ds_index()
def _normalize(s: str) -> str:
return s.lower().replace(" ", "").replace("-", "").replace("_", "").replace(".", "")
def _find_parser_info(source_name: str) -> dict | None:
"""
Match priority:
1. Exact dataSource.name match
2. Normalized substring: active source name ↔ parser dataSource.name
3. Normalized substring: active source name ↔ parser filename
(catches cases where the parser file has a wrong dataSource.name)
"""
# 1. Exact match on dataSource.name
if source_name in ds_index:
return ds_index[source_name]
sn = _normalize(source_name)
# 2. Normalized ds_name substring
for ds_name, info in ds_index.items():
if _normalize(ds_name) in sn or sn in _normalize(ds_name):
return info
# 3. Normalized filename substring
for info in ds_index.values():
if _normalize(info["parser_name"]) in sn or sn in _normalize(info["parser_name"]):
return info
return None
# Fields each rule needs: rule.name → set of field names
rule_fields_index: dict[str, set] = {
rule.name: set(rule.fields_used or []) for rule in rules
}
# Build rule index: source_name → rules that reference it
rule_by_source: dict[str, list] = {}
for rule in rules:
try:
raw_data = json.loads(rule.raw) if rule.raw else {}
except Exception:
raw_data = {}
if rule.rule_type == "library":
# Library rules store pre-extracted data_sources list in raw
data_sources = raw_data.get("data_sources", [])
else:
query_texts = _star_query_texts(raw_data)
data_sources = rule_parser.extract_data_sources(query_texts)
for ds in data_sources:
rule_by_source.setdefault(ds, []).append({"rule": rule.name, "type": rule.rule_type})
# Fields to ignore when computing "missing" — these are metadata/schema fields
# always present in events regardless of the parser
_SCHEMA_FIELDS = {
"dataSource.name", "dataSource.vendor", "dataSource.category",
"event.type", "timestamp", "src.endpoint.ip", "src.endpoint.name",
# Endpoint agent fields — populated by the SentinelOne agent, not by SDL parsers
"cmdScript.content", "endpoint.os", "endpoint.name", "endpoint.uid",
}
sources_out = []
covered_count = 0
needed_count = 0
for src in active_sources:
parser_info = _find_parser_info(src.source_name)
parser_in_data = (src.parser_detected or 0) > 0
if parser_info and parser_info["format_type"] == "custom":
status = "covered"
matched_parser = parser_info["parser_name"]
format_type = "custom"
elif parser_info and parser_info["format_type"] in ("grok", "dottedJson") and not parser_in_data:
# Known parser but primitive format and no evidence of parsing in data
status = "parser_needed"
matched_parser = parser_info["parser_name"]
format_type = parser_info["format_type"]
elif parser_in_data:
# Parsed fields detected in the data lake — a parser is running
status = "covered"
matched_parser = parser_info["parser_name"] if parser_info else "detected in data"
format_type = parser_info["format_type"] if parser_info else "unknown"
else:
status = "parser_needed"
matched_parser = None
format_type = None
if status == "covered":
covered_count += 1
else:
needed_count += 1
rules_for_src: list = [r for r in rule_by_source.get(src.source_name, []) if r["type"] == "library"]
# Close-match suggestions — shown when there are no library rules for this source.
close_matches: list = []
if not rules_for_src:
import re as _re
def _word_tokens(s: str) -> set:
"""Split on non-alphanumeric boundaries, lowercase, drop single chars."""
return {t for t in _re.split(r"[^a-z0-9]+", s.lower()) if len(t) >= 2}
def _is_close(a: str, b: str) -> bool:
na, nb = _normalize(a), _normalize(b)
# 1. Simple substring match
if na in nb or nb in na:
return True
# 2. Token-level: handles "Microsoft 365 Collaboration" vs "Microsoft O365"
# — "365" is inside "o365", and they share "microsoft"
ta, tb = _word_tokens(a), _word_tokens(b)
shared_exact = ta & tb
if not shared_exact:
return False # Must share at least one word exactly
# Check that a DISTINCTIVE (non-shared) token from one name
# appears as a substring inside a token from the other.
# This avoids matching "Azure AD" to "Azure Platform" on "azure" alone.
unique_a = ta - shared_exact
unique_b = tb - shared_exact
return any(
ua in ub or ub in ua
for ua in unique_a for ub in unique_b
if len(ua) >= 2 and len(ub) >= 2
)
sn = _normalize(src.source_name)
for lib_ds, lib_rules in rule_by_source.items():
lib_only = [r for r in lib_rules if r["type"] == "library"]
if not lib_only:
continue
if _is_close(src.source_name, lib_ds):
close_matches.append({
"library_name": lib_ds,
"rule_count": len(lib_only),
})
close_matches.sort(key=lambda x: x["rule_count"], reverse=True)
close_matches = close_matches[:3]
# Count how many rules reference each field (frequency)
field_freq: dict[str, int] = {}
for r in rules_for_src:
for f in rule_fields_index.get(r["rule"], set()):
field_freq[f] = field_freq.get(f, 0) + 1
# Fields the parser provides
parser_provides = parser_index.get(matched_parser, set()) if matched_parser and matched_parser != "detected in data" else set()
# Minimum number of rules that must reference a field before we flag it.
# Scales with rule count so single-rule oddities don't dominate.
rule_count = len(rules_for_src)
min_rules = max(2, round(rule_count * 0.05)) if rule_count >= 10 else 2
# Missing = dotted-path fields needed by >= min_rules rules,
# not in schema constants, not provided by the parser.
missing_fields = sorted(
f for f, count in field_freq.items()
if count >= min_rules
and "." in f
and f not in _SCHEMA_FIELDS
and f not in parser_provides
)
sources_out.append({
"source_name": src.source_name,
"event_count": src.event_count,
"status": status,
"parser": matched_parser,
"format_type": format_type,
"parser_fields": len(parser_provides),
"parser_detected": src.parser_detected or 0,
"rules": rules_for_src,
"rule_count": len(rules_for_src),
"close_matches": close_matches,
"missing_fields": missing_fields,
"missing_fields_count": len(missing_fields),
"synced_at": src.synced_at.isoformat() if src.synced_at else None,
})
synced_at = active_sources[0].synced_at.isoformat() if active_sources else None
return {
"summary": {
"active_sources": len(active_sources),
"covered": covered_count,
"parser_needed": needed_count,
"parsers_loaded": len(parser_index),
"rules_loaded": len(rules),
},
"sources": sources_out,
"synced_at": synced_at,
"has_sources": len(active_sources) > 0,
}
@router.delete("/reset")
def reset_data(db: Session = Depends(get_db)):
db.query(ParsedRule).delete()
db.query(ParserField).delete()
db.commit()
return {"cleared": True}
+122
View File
@@ -0,0 +1,122 @@
from datetime import datetime, timedelta
from fastapi import APIRouter, Query, HTTPException
from pydantic import BaseModel
from services import s1_client
router = APIRouter()
def _date_range(days: int) -> tuple[str, str]:
now = datetime.utcnow()
return (
(now - timedelta(days=days)).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
)
def _date_range_hours(hours: int) -> tuple[str, str]:
now = datetime.utcnow()
return (
(now - timedelta(hours=hours)).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
)
@router.get("/top-sources")
async def get_top_sources(
days: int = Query(None, ge=1, le=90),
hours: int = Query(None, ge=1, le=24),
):
"""Top log sources by event count over the given period."""
if hours is not None:
from_dt, to_dt = _date_range_hours(hours)
period_label = f"{hours}h"
else:
from_dt, to_dt = _date_range(days or 7)
period_label = f"{days or 7}d"
query = "| group events=count() by dataSource.name | sort -events | limit 25"
try:
result = await s1_client.run_powerquery(query, from_dt, to_dt)
except Exception as e:
raise HTTPException(502, f"PowerQuery error: {e}")
return {"period": period_label, "data": result.get("events", [])}
@router.get("/by-event-type")
async def get_by_event_type(days: int = Query(7, ge=1, le=90)):
"""Event counts grouped by source and event type."""
from_dt, to_dt = _date_range(days)
query = "| group events=count() by dataSource.name, event.type | sort -events | limit 100"
try:
result = await s1_client.run_powerquery(query, from_dt, to_dt)
except Exception as e:
raise HTTPException(502, f"PowerQuery error: {e}")
return {"period_days": days, "data": result.get("events", [])}
@router.get("/daily-volume")
async def get_daily_volume(days: int = Query(5, ge=1, le=7)):
"""Total event count per day — queries run in parallel."""
import asyncio
now = datetime.utcnow()
points = min(days, 7)
async def _fetch_day(i: int) -> dict:
day_from = (now - timedelta(days=i + 1)).strftime("%Y-%m-%dT00:00:00.000Z")
day_to = (now - timedelta(days=i)).strftime("%Y-%m-%dT00:00:00.000Z")
label = (now - timedelta(days=i + 1)).strftime("%Y-%m-%d")
try:
result = await s1_client.run_powerquery("| group total=count()", day_from, day_to)
events_list = result.get("events", []) if isinstance(result, dict) else []
count = events_list[0].get("total", 0) if events_list else 0
except Exception:
count = 0
return {"date": label, "events": count}
results = await asyncio.gather(*[_fetch_day(i) for i in range(points)])
return list(reversed(results))
class FilterRule(BaseModel):
source: str = ""
event_type: str = ""
days: int = 7
gb_per_million_events: float = 0.5
@router.post("/simulate-filter")
async def simulate_filter(rule: FilterRule):
"""Estimate how many events and GB would be eliminated by an exclusion filter."""
from_dt, to_dt = _date_range(rule.days)
clauses = []
if rule.source:
clauses.append(f"dataSource.name=='{rule.source}'")
if rule.event_type:
clauses.append(f"event.type=='{rule.event_type}'")
if clauses:
filter_expr = " and ".join(clauses)
query = f"| filter {filter_expr} | group events=count()"
else:
query = "| group events=count()"
try:
result = await s1_client.run_powerquery(query, from_dt, to_dt)
events = (result.get("events") or [{}])[0].get("events", 0) if isinstance(result.get("events"), list) else 0
except Exception as e:
raise HTTPException(502, f"PowerQuery error: {e}")
estimated_gb = round(events / 1_000_000 * rule.gb_per_million_events, 3)
monthly_events = int(events / rule.days * 30)
monthly_gb = round(monthly_events / 1_000_000 * rule.gb_per_million_events, 2)
return {
"period_days": rule.days,
"matched_events": events,
"estimated_gb_period": estimated_gb,
"projected_monthly_events": monthly_events,
"projected_monthly_gb": monthly_gb,
"filter": {"source": rule.source, "event_type": rule.event_type},
}
+440
View File
@@ -0,0 +1,440 @@
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from datetime import datetime, timedelta
from services import s1_client
import os
import re
router = APIRouter()
PARSERS_DIR = "/app/parsers"
@router.get("/parsers")
def list_parser_files():
"""List parser filenames available under /app/parsers/ for the Test Runner."""
try:
names = sorted(
e.name for e in os.scandir(PARSERS_DIR)
if e.is_file() and not e.name.startswith(".")
)
except FileNotFoundError:
names = []
return {"parsers": names, "count": len(names)}
@router.post("/sync-from-sdl")
async def sync_parsers_from_sdl():
"""Download every parser file under /logParsers/ on the SDL tenant into
/app/parsers/. After this call returns, the Parser Test Runner dropdown
will include all tenant parsers (including custom ones).
Requires SDL_CONFIG_READ_KEY in .env (Configuration Read scope on the
Data Lake API key).
"""
if not s1_client.SDL_CONFIG_READ_KEY:
raise HTTPException(
400,
"SDL_CONFIG_READ_KEY is not set in .env. Generate a Data Lake API key "
"with 'Configuration Read' scope in the S1 console and add it to .env."
)
try:
names = await s1_client.list_sdl_parsers()
except Exception as e:
raise HTTPException(502, f"SDL listFiles failed: {e}")
os.makedirs(PARSERS_DIR, exist_ok=True)
downloaded: list[str] = []
errors: list[dict] = []
for name in names:
# The path on SDL is /logParsers/<name>; we write to /app/parsers/<sanitized-name>.
safe_name = name.replace("/", "_")
try:
resp = await s1_client.get_sdl_parser(name)
content = resp.get("content")
if content is None:
errors.append({"parser": name, "error": "no content field in response"})
continue
with open(os.path.join(PARSERS_DIR, safe_name), "w", encoding="utf-8") as fh:
fh.write(content)
downloaded.append(safe_name)
except Exception as e:
errors.append({"parser": name, "error": str(e) or e.__class__.__name__})
return {
"downloaded": len(downloaded),
"parsers": downloaded,
"errors": errors,
"directory": PARSERS_DIR,
}
def _date_range_hours(hours: int) -> tuple[str, str]:
now = datetime.utcnow()
return (
(now - timedelta(hours=hours)).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
)
# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------
class SampleEventsRequest(BaseModel):
source: str
limit: int = 20
hours: int = 1
class FieldPopulationRequest(BaseModel):
source: str
hours: int = 24
fields: list[str] = [
"src.ip",
"src.port",
"dst.ip",
"dst.port",
"user.name",
"event.type",
"src.process.name",
"src.process.cmdline",
"tgt.file.path",
"network.direction",
"dataSource.name",
]
class TestParserRequest(BaseModel):
parser_name: str
log_line: str
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _flatten_dict(d: dict, prefix: str = "", out: dict | None = None) -> dict:
"""Recursively flatten a nested dict into dotted keys."""
if out is None:
out = {}
if not isinstance(d, dict):
return out
for k, v in d.items():
key = f"{prefix}.{k}" if prefix else k
if isinstance(v, dict):
_flatten_dict(v, key, out)
else:
out[key] = v
return out
def _flatten_event(event: dict) -> dict:
"""Return a flat field→value dict from a PowerQuery result row.
If the row only carries a JSON-stringified payload in `message` (i.e. the
parser wasn't applied at query time), parse and flatten it inline so the
UI can measure field population accurately. The original raw `message`
is preserved under its own key.
"""
if not isinstance(event, dict):
return {}
flat = dict(event)
msg = flat.get("message")
if isinstance(msg, str) and msg.startswith("{") and msg.endswith("}"):
try:
parsed = __import__("json").loads(msg)
if isinstance(parsed, dict):
flat.update(_flatten_dict(parsed))
except Exception:
pass
return flat
def _extract_format_strings(content: str) -> list[str]:
"""
Extract SDL format string values from augmented-JSON parser content.
Matches: "format": "..." (double-quoted value, supports escaped quotes).
"""
pattern = re.compile(r'"format"\s*:\s*"((?:[^"\\]|\\.)*)"')
return pattern.findall(content)
def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
"""
Convert an SDL format string to a compiled Python regex.
Returns (compiled_pattern, py_group_to_sdl_field) mapping so callers can
translate group names back to the original SDL field names.
Raises re.error if the resulting pattern cannot be compiled.
"""
# Split on $...$ tokens
token_pattern = re.compile(r'\$([^$]+)\$')
parts = token_pattern.split(fmt)
# parts alternates: literal, token, literal, token, ...
regex_parts: list[str] = []
py_group_to_sdl: dict[str, str] = {}
seen_groups: dict[str, int] = {}
for i, part in enumerate(parts):
if i % 2 == 0:
# Literal text
regex_parts.append(re.escape(part))
else:
# Token: either "field.name=PATTERN" or just "field.name"
if '=' in part:
field_name, pattern = part.split('=', 1)
else:
field_name = part
pattern = r'[^\s]+'
# Build a valid Python group name
safe = re.sub(r'[.\-]', '_', field_name)
if safe in seen_groups:
seen_groups[safe] += 1
safe = f"{safe}_{seen_groups[safe]}"
else:
seen_groups[safe] = 0
py_group_to_sdl[safe] = field_name
regex_parts.append(f'(?P<{safe}>{pattern})')
compiled = re.compile(''.join(regex_parts), re.IGNORECASE)
return compiled, py_group_to_sdl
# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------
@router.post("/sample-events")
async def sample_events(req: SampleEventsRequest):
"""Return a sample of raw events from a given data source."""
query = f'| filter dataSource.name = "{req.source}" | limit {req.limit}'
from_dt, to_dt = _date_range_hours(req.hours)
result = await s1_client.run_powerquery(query, from_dt, to_dt)
rows = result if isinstance(result, list) else (result.get("rows") or result.get("events") or [])
events = [_flatten_event(row) for row in rows]
return {
"source": req.source,
"events": events,
"count": len(events),
"hours": req.hours,
}
@router.post("/field-population")
async def field_population(req: FieldPopulationRequest):
"""
Analyse how consistently each requested field is populated across a sample
of events from a data source.
"""
query = f'| filter dataSource.name = "{req.source}" | limit 500'
from_dt, to_dt = _date_range_hours(req.hours)
result = await s1_client.run_powerquery(query, from_dt, to_dt)
rows = result if isinstance(result, list) else (result.get("rows") or result.get("events") or [])
events = [_flatten_event(row) for row in rows]
if not events:
raise HTTPException(status_code=404, detail=f"No events found for source '{req.source}' in the last {req.hours} hours.")
total = len(events)
_empty = {None, "", "null"}
# Collect all field names seen across the sample (useful for surfacing what IS there)
all_seen_fields = sorted({k for ev in events for k in ev})
field_stats = []
for field in req.fields:
# dataSource.name is always 100% — we filtered by it; Scalyr just doesn't echo it back
if field == "dataSource.name":
populated = total
else:
populated = sum(1 for ev in events if ev.get(field) not in _empty)
rate = round((populated / total) * 100, 1)
field_stats.append({
"field": field,
"populated": populated,
"total": total,
"rate": rate,
})
# Sort ascending by rate (worst coverage first)
field_stats.sort(key=lambda x: x["rate"])
return {
"source": req.source,
"total_sampled": total,
"hours": req.hours,
"fields": field_stats,
"fields_seen_in_sample": all_seen_fields,
}
@router.post("/test-parser")
async def test_parser(req: TestParserRequest):
"""
Test a parser against a raw log line by extracting and matching SDL format
strings found in the parser file.
"""
parser_path = f"/app/parsers/{req.parser_name}"
try:
with open(parser_path, "r", encoding="utf-8") as fh:
content = fh.read()
except FileNotFoundError:
raise HTTPException(status_code=404, detail=f"Parser file not found: {req.parser_name}")
except OSError as exc:
raise HTTPException(status_code=500, detail=f"Could not read parser file: {exc}")
format_strings = _extract_format_strings(content)
# ── JSON auto-extract path ──────────────────────────────────────────────
# SDL parsers that use `$=json{parse=json}$` (or any format containing
# `parse=json`) auto-extract every top-level JSON key as an attribute.
# The regex-based path can't model that — handle it explicitly so users
# can test JSON-shaped logs against JSON-mode parsers.
log_input = req.log_line.strip()
is_json_mode = any("parse=json" in f for f in format_strings) or log_input.startswith("{")
if is_json_mode:
import json as _json
# Support multi-line input (one JSON object per line, or a JSON array)
lines = [ln for ln in (l.strip() for l in log_input.splitlines()) if ln]
payloads: list[dict] = []
parse_errors: list[str] = []
# Single line: try direct parse; if it's a JSON array, expand.
if len(lines) == 1:
try:
obj = _json.loads(lines[0])
except Exception as e:
return {
"parser_name": req.parser_name,
"matched": False,
"message": f"Parser expects JSON but log line could not be parsed as JSON: {e}",
"fields": [],
}
if isinstance(obj, list):
payloads = [x for x in obj if isinstance(x, dict)]
elif isinstance(obj, dict):
payloads = [obj]
else:
return {
"parser_name": req.parser_name,
"matched": False,
"message": "Parser expects a JSON object (got scalar).",
"fields": [],
}
else:
# Multi-line: one JSON object per line (NDJSON)
for i, ln in enumerate(lines, 1):
try:
obj = _json.loads(ln)
if isinstance(obj, dict):
payloads.append(obj)
else:
parse_errors.append(f"line {i}: not a JSON object")
except Exception as e:
parse_errors.append(f"line {i}: {e}")
if not payloads:
return {
"parser_name": req.parser_name,
"matched": False,
"message": "No valid JSON objects found. " + " | ".join(parse_errors[:3]),
"fields": [],
}
# Use the first payload for the detail table; report totals.
payload = payloads[0]
extracted = _flatten_dict(payload)
# Apply lightweight rewrites if present (input/output/match/replace blocks).
# We only handle simple literal/regex matches with $0 or string replacements;
# this is best-effort, intended for quick visual verification.
rewrites_applied = []
rewrite_re = re.compile(
r'\{\s*input:\s*"([^"]+)"\s*,\s*output:\s*"([^"]+)"\s*,\s*match:\s*"((?:[^"\\]|\\.)*)"\s*,\s*replace:\s*"((?:[^"\\]|\\.)*)"\s*\}',
re.DOTALL,
)
derived: dict[str, str] = {}
for m in rewrite_re.finditer(content):
in_field, out_field, match_pat, replace_val = m.group(1), m.group(2), m.group(3), m.group(4)
src_val = extracted.get(in_field)
if src_val is None:
continue
try:
m2 = re.search(match_pat, str(src_val))
except re.error:
continue
if not m2:
continue
# SDL uses $0 for whole match, $1.. for groups. Translate to Python
# \g<0>, \g<1>, ... so re.sub doesn't read \0 as a null byte.
def _to_py_backref(s: str) -> str:
return re.sub(r"\$(\d+)", lambda mm: f"\\g<{mm.group(1)}>", s)
try:
val = re.sub(match_pat, _to_py_backref(replace_val), str(src_val), count=1)
except re.error:
val = replace_val
derived[out_field] = val
rewrites_applied.append({
"input": in_field, "input_value": src_val,
"output": out_field, "matched_on": match_pat, "result": val,
})
fields = (
[{"field": k, "value": v, "source": "json-extract"} for k, v in sorted(extracted.items())]
+ [{"field": k, "value": v, "source": "rewrite"} for k, v in sorted(derived.items())]
)
return {
"parser_name": req.parser_name,
"matched": True,
"mode": "json",
"format_matched": "$=json{parse=json}$",
"fields": fields,
"rewrites_applied": rewrites_applied,
"extracted_count": len(extracted),
"derived_count": len(derived),
"payload_count": len(payloads),
"parse_errors": parse_errors,
"showing_payload": 1,
}
# ── Regex format-string path (original) ─────────────────────────────────
for fmt in format_strings:
try:
compiled, py_to_sdl = _sdl_format_to_regex(fmt)
except re.error:
# Skip unparseable format strings
continue
match = compiled.search(req.log_line)
if match:
fields = [
{"field": py_to_sdl.get(group, group), "value": value}
for group, value in match.groupdict().items()
if value is not None
]
return {
"parser_name": req.parser_name,
"matched": True,
"mode": "regex",
"format_matched": fmt,
"fields": fields,
}
return {
"parser_name": req.parser_name,
"matched": False,
"message": "No format pattern matched",
"fields": [],
}
+105
View File
@@ -0,0 +1,105 @@
import os
import re
from pathlib import Path
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
router = APIRouter()
ENV_FILE = Path(os.environ.get("ENV_FILE_PATH", "/app/.env"))
# Fields we expose in the UI — order matters for display
FIELDS = [
{"key": "S1_BASE_URL", "label": "Console URL", "secret": False, "placeholder": "https://demo.sentinelone.net"},
{"key": "S1_API_TOKEN", "label": "Console API Token", "secret": True, "placeholder": "eyJ..."},
{"key": "SDL_XDR_URL", "label": "SDL XDR URL", "secret": False, "placeholder": "https://xdr.us1.sentinelone.net"},
{"key": "SDL_LOG_READ_KEY", "label": "SDL Log Read Key", "secret": True, "placeholder": "1DnK0Y4e..."},
{"key": "ANTHROPIC_API_KEY", "label": "Anthropic API Key", "secret": True, "placeholder": "sk-ant-..."},
]
FIELD_KEYS = {f["key"] for f in FIELDS}
def _read_env() -> dict[str, str]:
"""Read .env file into a dict."""
vals: dict[str, str] = {}
if ENV_FILE.exists():
for line in ENV_FILE.read_text().splitlines():
line = line.strip()
if line and not line.startswith("#") and "=" in line:
k, _, v = line.partition("=")
vals[k.strip()] = v.strip()
return vals
def _write_env(updates: dict[str, str]) -> None:
"""Write updates into .env, preserving comments and unknown keys."""
existing_lines: list[str] = []
if ENV_FILE.exists():
existing_lines = ENV_FILE.read_text().splitlines()
written: set[str] = set()
new_lines: list[str] = []
for line in existing_lines:
stripped = line.strip()
if stripped and not stripped.startswith("#") and "=" in stripped:
k, _, _ = stripped.partition("=")
k = k.strip()
if k in updates:
new_lines.append(f"{k}={updates[k]}")
written.add(k)
continue
new_lines.append(line)
# Append any new keys not already in the file
for k, v in updates.items():
if k not in written:
new_lines.append(f"{k}={v}")
ENV_FILE.write_text("\n".join(new_lines) + "\n")
@router.get("/config")
async def get_config():
"""Return current config values. Secrets are masked."""
env_vals = _read_env()
result = []
for f in FIELDS:
key = f["key"]
# Prefer live env var, fall back to .env file value
raw = os.environ.get(key, env_vals.get(key, ""))
if f["secret"] and raw:
# Show first 6 + last 4 chars, mask middle
masked = raw[:6] + "" * max(4, len(raw) - 10) + raw[-4:] if len(raw) > 10 else "••••••••"
else:
masked = raw
result.append({
"key": key,
"label": f["label"],
"secret": f["secret"],
"placeholder": f["placeholder"],
"value": masked,
"set": bool(raw),
})
env_file_exists = ENV_FILE.exists()
return {"fields": result, "env_file_exists": env_file_exists, "env_file_path": str(ENV_FILE)}
class ConfigUpdate(BaseModel):
updates: dict[str, str]
@router.post("/config")
async def save_config(body: ConfigUpdate):
"""Save config values to .env file. Only known keys accepted."""
bad = [k for k in body.updates if k not in FIELD_KEYS]
if bad:
raise HTTPException(400, f"Unknown keys: {bad}")
if not ENV_FILE.parent.exists():
raise HTTPException(503, f"Cannot write to {ENV_FILE} — check Docker volume mount")
try:
_write_env(body.updates)
except Exception as e:
raise HTTPException(500, f"Failed to write .env: {e}")
return {"saved": list(body.updates.keys()), "restart_required": True}