mirror of
https://github.com/marcredhat/SIEM-toolkit-patched
synced 2026-06-10 13:21:17 +00:00
v0.1 Mick Marc merged
This commit is contained in:
@@ -0,0 +1,648 @@
|
||||
import json
|
||||
import os
|
||||
from fastapi import APIRouter, UploadFile, File, Depends, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.orm import Session
|
||||
from datetime import datetime
|
||||
from db import get_db, ParsedRule, ParserField, ActiveSource
|
||||
from services import s1_client, rule_parser
|
||||
|
||||
DETECTIONS_FILE = os.environ.get("DETECTIONS_FILE", "/app/data/detections.json")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _star_query_texts(rule: dict) -> list[str]:
|
||||
"""
|
||||
Extract all PowerQuery/filter strings from a STAR rule.
|
||||
Handles simple rules (s1ql) and correlation rules (subQueries[].subQuery).
|
||||
"""
|
||||
texts = []
|
||||
|
||||
# Simple rules
|
||||
for field in ("s1ql", "queryLang", "query", "powerQuery"):
|
||||
v = rule.get(field)
|
||||
# queryLang "2.0" is a version string, not a query — skip short strings
|
||||
if v and isinstance(v, str) and len(v) > 5:
|
||||
texts.append(v)
|
||||
|
||||
# Correlation rules: subQueries[].subQuery
|
||||
cp = rule.get("correlationParams") or {}
|
||||
for sq in cp.get("subQueries", []):
|
||||
v = sq.get("subQuery")
|
||||
if v and isinstance(v, str):
|
||||
texts.append(v)
|
||||
# Also handle older conditions[] format
|
||||
for cond in cp.get("conditions", []):
|
||||
for key in ("filter", "query", "subQuery"):
|
||||
v = cond.get(key)
|
||||
if v and isinstance(v, str):
|
||||
texts.append(v)
|
||||
|
||||
return texts
|
||||
|
||||
|
||||
@router.post("/load-star-rules")
|
||||
async def load_star_rules(db: Session = Depends(get_db)):
|
||||
"""Fetch all STAR rules from the Management Console API and index their fields."""
|
||||
try:
|
||||
rules = await s1_client.get_star_rules()
|
||||
except Exception as e:
|
||||
raise HTTPException(502, f"S1 API error: {type(e).__name__}: {e}")
|
||||
|
||||
# Replace all existing STAR rules cleanly to avoid duplicate key errors
|
||||
db.query(ParsedRule).filter_by(rule_type="star").delete()
|
||||
db.flush()
|
||||
|
||||
loaded = []
|
||||
for rule in rules:
|
||||
all_fields: set = set()
|
||||
for qt in _star_query_texts(rule):
|
||||
all_fields |= rule_parser.extract_star_fields(qt)
|
||||
fields = list(all_fields)
|
||||
record = ParsedRule(
|
||||
rule_id=str(rule.get("id", "")),
|
||||
name=rule.get("name", "unnamed"),
|
||||
rule_type="star",
|
||||
fields_used=fields,
|
||||
raw=json.dumps(rule),
|
||||
)
|
||||
db.add(record)
|
||||
loaded.append({"id": record.rule_id, "name": record.name, "fields": fields})
|
||||
|
||||
db.commit()
|
||||
return {"loaded": len(loaded), "rules": loaded}
|
||||
|
||||
|
||||
_EXCLUDED_PATHS = ("/rules/silent/", "/rules/dev/")
|
||||
|
||||
|
||||
def _import_from_api_rules(db, rules: list) -> int:
|
||||
"""
|
||||
Import platform rules fetched directly from the S1 API into the database.
|
||||
Each rule has a 'sources' list — the authoritative dataSource.name values.
|
||||
"""
|
||||
db.query(ParsedRule).filter_by(rule_type="library").delete()
|
||||
db.commit()
|
||||
|
||||
loaded = 0
|
||||
seen_ids: set = set()
|
||||
for rule in rules:
|
||||
rule_id = str(rule.get("id", f"lib_{loaded}"))
|
||||
if rule_id in seen_ids:
|
||||
continue
|
||||
seen_ids.add(rule_id)
|
||||
|
||||
sources = rule.get("sources") or []
|
||||
db.add(ParsedRule(
|
||||
rule_id=rule_id,
|
||||
name=rule.get("name", "unnamed"),
|
||||
rule_type="library",
|
||||
fields_used=[], # API rules don't expose field-level info
|
||||
raw=json.dumps({"data_sources": sources}),
|
||||
))
|
||||
loaded += 1
|
||||
if loaded % 500 == 0:
|
||||
db.flush()
|
||||
|
||||
db.commit()
|
||||
return loaded
|
||||
|
||||
|
||||
def _import_detections(db, detections_file: str) -> int:
|
||||
"""
|
||||
Import library detection rules from extracted.json into the database.
|
||||
Replaces any existing library rules. Returns the count of rules loaded.
|
||||
"""
|
||||
with open(detections_file, "r", encoding="utf-8") as fh:
|
||||
data = json.load(fh)
|
||||
|
||||
results = data.get("results", [])
|
||||
results = [r for r in results if not any(r.get("file", "").startswith(p) for p in _EXCLUDED_PATHS)]
|
||||
|
||||
db.query(ParsedRule).filter_by(rule_type="library").delete()
|
||||
db.commit()
|
||||
|
||||
loaded = 0
|
||||
seen_ids: set = set()
|
||||
for rule in results:
|
||||
all_fields: set = set()
|
||||
data_sources: list[str] = []
|
||||
for q in rule.get("queries", []):
|
||||
all_fields.update(q.get("keys", []))
|
||||
ds_vals = q.get("pairs", {}).get("dataSource.name", [])
|
||||
for v in ds_vals:
|
||||
if isinstance(v, str):
|
||||
data_sources.append(v)
|
||||
elif isinstance(v, list):
|
||||
data_sources.extend(str(x) for x in v)
|
||||
|
||||
rule_id = str(rule.get("id", f"lib_{loaded}"))
|
||||
if rule_id in seen_ids:
|
||||
continue
|
||||
seen_ids.add(rule_id)
|
||||
|
||||
db.add(ParsedRule(
|
||||
rule_id=rule_id,
|
||||
name=rule.get("name", "unnamed"),
|
||||
rule_type="library",
|
||||
fields_used=list(all_fields),
|
||||
raw=json.dumps({"data_sources": list(set(data_sources))}),
|
||||
))
|
||||
loaded += 1
|
||||
if loaded % 500 == 0:
|
||||
db.flush()
|
||||
|
||||
db.commit()
|
||||
return loaded
|
||||
|
||||
|
||||
@router.post("/load-detections")
|
||||
async def load_detections(db: Session = Depends(get_db)):
|
||||
"""
|
||||
Reload detection library rules.
|
||||
Tries the live S1 API first (platform-rules endpoint); falls back to extracted.json.
|
||||
"""
|
||||
# Prefer the live API — gives accurate 'sources' and is always up to date
|
||||
try:
|
||||
rules = await s1_client.get_platform_rules()
|
||||
if rules:
|
||||
loaded = _import_from_api_rules(db, rules)
|
||||
return {"loaded": loaded, "source": "api"}
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fall back to local extracted.json
|
||||
if not os.path.exists(DETECTIONS_FILE):
|
||||
raise HTTPException(
|
||||
404,
|
||||
"S1 API unavailable and no detections file found — "
|
||||
"ensure the data/ volume is mounted with detections.json"
|
||||
)
|
||||
try:
|
||||
loaded = _import_detections(db, DETECTIONS_FILE)
|
||||
except Exception as e:
|
||||
raise HTTPException(500, f"Failed to import detections: {e}")
|
||||
return {"loaded": loaded, "source": "file"}
|
||||
|
||||
|
||||
@router.post("/upload-sigma")
|
||||
async def upload_sigma(files: list[UploadFile] = File(...), db: Session = Depends(get_db)):
|
||||
"""Upload one or more Sigma YAML files and index their fields."""
|
||||
loaded = []
|
||||
for file in files:
|
||||
content = (await file.read()).decode("utf-8", errors="replace")
|
||||
fields = list(rule_parser.extract_sigma_fields(content))
|
||||
record = ParsedRule(
|
||||
rule_id=f"sigma_{file.filename}",
|
||||
name=file.filename or "unnamed",
|
||||
rule_type="sigma",
|
||||
fields_used=fields,
|
||||
raw=content,
|
||||
)
|
||||
db.merge(record)
|
||||
loaded.append({"name": file.filename, "fields": fields})
|
||||
|
||||
db.commit()
|
||||
return {"loaded": len(loaded), "rules": loaded}
|
||||
|
||||
|
||||
@router.post("/load-parsers-from-sdl")
|
||||
async def load_parsers_from_sdl(db: Session = Depends(get_db)):
|
||||
"""
|
||||
Load SDL parsers from the local /app/parsers directory (mounted from ./parsers/).
|
||||
Files are placed there by the MCP-based loader or by manual copy.
|
||||
Falls back to a clear error if the directory is empty.
|
||||
"""
|
||||
import os
|
||||
parsers_dir = "/app/parsers"
|
||||
|
||||
try:
|
||||
entries = [
|
||||
e for e in os.scandir(parsers_dir)
|
||||
if e.is_file() and not e.name.startswith(".")
|
||||
]
|
||||
except FileNotFoundError:
|
||||
raise HTTPException(503, "parsers/ directory not found — check Docker volume mount")
|
||||
|
||||
if not entries:
|
||||
raise HTTPException(
|
||||
422,
|
||||
"No parser files found in parsers/ directory. "
|
||||
"Use 'Load SDL Parsers via MCP' in Claude Code to populate it, "
|
||||
"or upload a parser file manually."
|
||||
)
|
||||
|
||||
loaded = []
|
||||
errors = []
|
||||
for entry in entries:
|
||||
try:
|
||||
with open(entry.path, "r", encoding="utf-8", errors="replace") as fh:
|
||||
content = fh.read()
|
||||
|
||||
fields: set = set()
|
||||
try:
|
||||
import json as _json
|
||||
parser_data = _json.loads(content)
|
||||
fields = rule_parser.extract_parser_fields(parser_data)
|
||||
except Exception:
|
||||
pass
|
||||
fields |= rule_parser.extract_parser_fields_from_content(content)
|
||||
|
||||
name = entry.name
|
||||
db.query(ParserField).filter_by(parser_name=name).delete()
|
||||
for f in fields:
|
||||
db.add(ParserField(parser_name=name, field_name=f, field_type="string"))
|
||||
loaded.append({"parser": name, "fields": list(fields), "field_count": len(fields)})
|
||||
except Exception as e:
|
||||
errors.append({"parser": entry.name, "error": str(e)})
|
||||
|
||||
db.commit()
|
||||
return {"loaded": len(loaded), "parsers": loaded, "errors": errors}
|
||||
|
||||
|
||||
@router.post("/upload-parser")
|
||||
async def upload_parser(file: UploadFile = File(...), db: Session = Depends(get_db)):
|
||||
"""Upload an SDL parser JSON file and index its output fields."""
|
||||
raw_bytes = await file.read()
|
||||
content_str = raw_bytes.decode("utf-8", errors="replace")
|
||||
|
||||
# Try structured JSON extraction first, fall back to content-string extraction
|
||||
fields: set = set()
|
||||
try:
|
||||
parser_data = json.loads(content_str)
|
||||
fields = rule_parser.extract_parser_fields(parser_data)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Always also run content-string extraction (catches $field$ SDL format strings)
|
||||
fields |= rule_parser.extract_parser_fields_from_content(content_str)
|
||||
|
||||
db.query(ParserField).filter_by(parser_name=file.filename).delete()
|
||||
for f in fields:
|
||||
db.add(ParserField(parser_name=file.filename, field_name=f, field_type="string"))
|
||||
|
||||
db.commit()
|
||||
return {"parser": file.filename, "fields": list(fields)}
|
||||
|
||||
|
||||
class ParserContentPayload(BaseModel):
|
||||
parser_name: str
|
||||
content: str # raw SDL parser file content as string
|
||||
|
||||
|
||||
@router.post("/load-parser-content")
|
||||
async def load_parser_content(payload: ParserContentPayload, db: Session = Depends(get_db)):
|
||||
"""
|
||||
Accept raw SDL parser content (as a string) and index its output fields.
|
||||
Used by MCP-based loader scripts since the SDL HTTP API endpoint is not
|
||||
accessible from inside Docker with standard API token auth.
|
||||
"""
|
||||
fields: set = set()
|
||||
|
||||
# Try JSON parsing first (structured attributes/fields/mappings)
|
||||
try:
|
||||
parser_data = json.loads(payload.content)
|
||||
fields = rule_parser.extract_parser_fields(parser_data)
|
||||
except (json.JSONDecodeError, Exception):
|
||||
pass
|
||||
|
||||
# Always run SDL format-string extraction ($field.name$ patterns)
|
||||
fields |= rule_parser.extract_parser_fields_from_content(payload.content)
|
||||
|
||||
if not fields:
|
||||
raise HTTPException(422, "No fields could be extracted from the parser content")
|
||||
|
||||
db.query(ParserField).filter_by(parser_name=payload.parser_name).delete()
|
||||
for f in fields:
|
||||
db.add(ParserField(parser_name=payload.parser_name, field_name=f, field_type="string"))
|
||||
|
||||
db.commit()
|
||||
return {"parser": payload.parser_name, "fields": list(fields), "field_count": len(fields)}
|
||||
|
||||
|
||||
# Native SentinelOne platform sources — parsed by the system, not by SDL parsers.
|
||||
# Excluded from the coverage map as they do not require custom parser coverage.
|
||||
_S1_NATIVE_SOURCES = {
|
||||
"SentinelOne", "asset", "alert", "vulnerability",
|
||||
"ActivityFeed", "indicator", "misconfiguration",
|
||||
"SentinelOne Ranger AD",
|
||||
}
|
||||
|
||||
|
||||
@router.post("/sync-sources")
|
||||
async def sync_sources(days: int = 7, db: Session = Depends(get_db)):
|
||||
"""Pull active dataSource.names from the SDL and store them.
|
||||
Also detects whether a parser is already producing structured fields
|
||||
for each source by checking if event.type is populated in the data lake.
|
||||
Native S1 platform sources are excluded as they do not require SDL parsers.
|
||||
"""
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta
|
||||
now = datetime.utcnow()
|
||||
from_dt = (now - timedelta(days=days)).strftime("%Y-%m-%dT%H:%M:%S.000Z")
|
||||
to_dt = now.strftime("%Y-%m-%dT%H:%M:%S.000Z")
|
||||
|
||||
try:
|
||||
volume_result, parsed_result = await asyncio.gather(
|
||||
s1_client.run_powerquery(
|
||||
"| group events=count() by dataSource.name | sort -events | limit 200",
|
||||
from_dt, to_dt
|
||||
),
|
||||
s1_client.run_powerquery(
|
||||
"| filter event.type != '' | group parsed=count() by dataSource.name | limit 200",
|
||||
from_dt, to_dt
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
raise HTTPException(502, f"PowerQuery error: {e}")
|
||||
|
||||
# Build lookup: source_name → count of parsed events seen
|
||||
parsed_by_source: dict[str, int] = {}
|
||||
for row in parsed_result.get("events", []):
|
||||
name = row.get("dataSource.name")
|
||||
if name:
|
||||
parsed_by_source[name] = row.get("parsed", 0)
|
||||
|
||||
rows = volume_result.get("events", [])
|
||||
db.query(ActiveSource).delete()
|
||||
synced_at = datetime.utcnow()
|
||||
seen = 0
|
||||
for row in rows:
|
||||
name = row.get("dataSource.name")
|
||||
if name and name not in _S1_NATIVE_SOURCES:
|
||||
db.add(ActiveSource(
|
||||
source_name=name,
|
||||
event_count=row.get("events", 0),
|
||||
synced_at=synced_at,
|
||||
parser_detected=parsed_by_source.get(name, 0),
|
||||
))
|
||||
seen += 1
|
||||
db.commit()
|
||||
return {"synced": seen, "sources": [r["dataSource.name"] for r in rows if r.get("dataSource.name") and r["dataSource.name"] not in _S1_NATIVE_SOURCES]}
|
||||
|
||||
|
||||
def _build_parser_ds_index() -> dict[str, dict]:
|
||||
"""
|
||||
Read all parser files from /app/parsers/ and build an index:
|
||||
dataSource.name (exact, from parser attributes) → {parser_name, format_type}
|
||||
|
||||
Format type is "grok", "dottedJson", or "custom".
|
||||
Sources with grok/dottedJson parsers are flagged as needing a proper parser.
|
||||
"""
|
||||
import os, re
|
||||
parsers_dir = "/app/parsers"
|
||||
_DS_NAME_RE = re.compile(r'"dataSource\.name"\s*:\s*"([^"]+)"')
|
||||
_FORMAT_TYPE_RE = re.compile(r'"type"\s*:\s*"([^"]+)"')
|
||||
|
||||
index: dict[str, dict] = {}
|
||||
try:
|
||||
entries = [e for e in os.scandir(parsers_dir) if e.is_file() and not e.name.startswith(".")]
|
||||
except FileNotFoundError:
|
||||
return index
|
||||
|
||||
for entry in entries:
|
||||
try:
|
||||
with open(entry.path, "r", encoding="utf-8", errors="replace") as fh:
|
||||
content = fh.read()
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Extract dataSource.name (may appear multiple times — take first)
|
||||
ds_match = _DS_NAME_RE.search(content)
|
||||
if not ds_match:
|
||||
continue
|
||||
ds_name = ds_match.group(1).strip()
|
||||
|
||||
# Determine format type — look for grok/dottedJson/custom in "type" values
|
||||
format_types = {m.group(1).lower() for m in _FORMAT_TYPE_RE.finditer(content)}
|
||||
if "grok" in format_types:
|
||||
fmt = "grok"
|
||||
elif "dottedjson" in format_types:
|
||||
fmt = "dottedJson"
|
||||
else:
|
||||
fmt = "custom"
|
||||
|
||||
index[ds_name] = {"parser_name": entry.name, "format_type": fmt}
|
||||
|
||||
return index
|
||||
|
||||
|
||||
@router.get("/map")
|
||||
def get_coverage_map(db: Session = Depends(get_db)):
|
||||
"""
|
||||
Source-centric coverage map.
|
||||
For each active dataSource.name in the SDL:
|
||||
- covered = a custom parser is loaded for it (dataSource.name matches)
|
||||
- parser_needed = no parser, OR parser uses grok/dottedJson format
|
||||
Also surfaces which STAR rules reference each source.
|
||||
"""
|
||||
active_sources = db.query(ActiveSource).order_by(ActiveSource.event_count.desc()).all()
|
||||
parser_fields_rows = db.query(ParserField).all()
|
||||
rules = db.query(ParsedRule).all()
|
||||
|
||||
# parser_name → set of field names (for field count display)
|
||||
parser_index: dict[str, set] = {}
|
||||
for pf in parser_fields_rows:
|
||||
parser_index.setdefault(pf.parser_name, set()).add(pf.field_name)
|
||||
|
||||
# Build dataSource.name → {parser_name, format_type} index from parser files
|
||||
ds_index = _build_parser_ds_index()
|
||||
|
||||
def _normalize(s: str) -> str:
|
||||
return s.lower().replace(" ", "").replace("-", "").replace("_", "").replace(".", "")
|
||||
|
||||
def _find_parser_info(source_name: str) -> dict | None:
|
||||
"""
|
||||
Match priority:
|
||||
1. Exact dataSource.name match
|
||||
2. Normalized substring: active source name ↔ parser dataSource.name
|
||||
3. Normalized substring: active source name ↔ parser filename
|
||||
(catches cases where the parser file has a wrong dataSource.name)
|
||||
"""
|
||||
# 1. Exact match on dataSource.name
|
||||
if source_name in ds_index:
|
||||
return ds_index[source_name]
|
||||
sn = _normalize(source_name)
|
||||
# 2. Normalized ds_name substring
|
||||
for ds_name, info in ds_index.items():
|
||||
if _normalize(ds_name) in sn or sn in _normalize(ds_name):
|
||||
return info
|
||||
# 3. Normalized filename substring
|
||||
for info in ds_index.values():
|
||||
if _normalize(info["parser_name"]) in sn or sn in _normalize(info["parser_name"]):
|
||||
return info
|
||||
return None
|
||||
|
||||
# Fields each rule needs: rule.name → set of field names
|
||||
rule_fields_index: dict[str, set] = {
|
||||
rule.name: set(rule.fields_used or []) for rule in rules
|
||||
}
|
||||
|
||||
# Build rule index: source_name → rules that reference it
|
||||
rule_by_source: dict[str, list] = {}
|
||||
for rule in rules:
|
||||
try:
|
||||
raw_data = json.loads(rule.raw) if rule.raw else {}
|
||||
except Exception:
|
||||
raw_data = {}
|
||||
|
||||
if rule.rule_type == "library":
|
||||
# Library rules store pre-extracted data_sources list in raw
|
||||
data_sources = raw_data.get("data_sources", [])
|
||||
else:
|
||||
query_texts = _star_query_texts(raw_data)
|
||||
data_sources = rule_parser.extract_data_sources(query_texts)
|
||||
|
||||
for ds in data_sources:
|
||||
rule_by_source.setdefault(ds, []).append({"rule": rule.name, "type": rule.rule_type})
|
||||
|
||||
# Fields to ignore when computing "missing" — these are metadata/schema fields
|
||||
# always present in events regardless of the parser
|
||||
_SCHEMA_FIELDS = {
|
||||
"dataSource.name", "dataSource.vendor", "dataSource.category",
|
||||
"event.type", "timestamp", "src.endpoint.ip", "src.endpoint.name",
|
||||
# Endpoint agent fields — populated by the SentinelOne agent, not by SDL parsers
|
||||
"cmdScript.content", "endpoint.os", "endpoint.name", "endpoint.uid",
|
||||
}
|
||||
|
||||
sources_out = []
|
||||
covered_count = 0
|
||||
needed_count = 0
|
||||
|
||||
for src in active_sources:
|
||||
parser_info = _find_parser_info(src.source_name)
|
||||
parser_in_data = (src.parser_detected or 0) > 0
|
||||
|
||||
if parser_info and parser_info["format_type"] == "custom":
|
||||
status = "covered"
|
||||
matched_parser = parser_info["parser_name"]
|
||||
format_type = "custom"
|
||||
elif parser_info and parser_info["format_type"] in ("grok", "dottedJson") and not parser_in_data:
|
||||
# Known parser but primitive format and no evidence of parsing in data
|
||||
status = "parser_needed"
|
||||
matched_parser = parser_info["parser_name"]
|
||||
format_type = parser_info["format_type"]
|
||||
elif parser_in_data:
|
||||
# Parsed fields detected in the data lake — a parser is running
|
||||
status = "covered"
|
||||
matched_parser = parser_info["parser_name"] if parser_info else "detected in data"
|
||||
format_type = parser_info["format_type"] if parser_info else "unknown"
|
||||
else:
|
||||
status = "parser_needed"
|
||||
matched_parser = None
|
||||
format_type = None
|
||||
|
||||
if status == "covered":
|
||||
covered_count += 1
|
||||
else:
|
||||
needed_count += 1
|
||||
|
||||
rules_for_src: list = [r for r in rule_by_source.get(src.source_name, []) if r["type"] == "library"]
|
||||
|
||||
# Close-match suggestions — shown when there are no library rules for this source.
|
||||
close_matches: list = []
|
||||
if not rules_for_src:
|
||||
import re as _re
|
||||
|
||||
def _word_tokens(s: str) -> set:
|
||||
"""Split on non-alphanumeric boundaries, lowercase, drop single chars."""
|
||||
return {t for t in _re.split(r"[^a-z0-9]+", s.lower()) if len(t) >= 2}
|
||||
|
||||
def _is_close(a: str, b: str) -> bool:
|
||||
na, nb = _normalize(a), _normalize(b)
|
||||
# 1. Simple substring match
|
||||
if na in nb or nb in na:
|
||||
return True
|
||||
# 2. Token-level: handles "Microsoft 365 Collaboration" vs "Microsoft O365"
|
||||
# — "365" is inside "o365", and they share "microsoft"
|
||||
ta, tb = _word_tokens(a), _word_tokens(b)
|
||||
shared_exact = ta & tb
|
||||
if not shared_exact:
|
||||
return False # Must share at least one word exactly
|
||||
# Check that a DISTINCTIVE (non-shared) token from one name
|
||||
# appears as a substring inside a token from the other.
|
||||
# This avoids matching "Azure AD" to "Azure Platform" on "azure" alone.
|
||||
unique_a = ta - shared_exact
|
||||
unique_b = tb - shared_exact
|
||||
return any(
|
||||
ua in ub or ub in ua
|
||||
for ua in unique_a for ub in unique_b
|
||||
if len(ua) >= 2 and len(ub) >= 2
|
||||
)
|
||||
|
||||
sn = _normalize(src.source_name)
|
||||
for lib_ds, lib_rules in rule_by_source.items():
|
||||
lib_only = [r for r in lib_rules if r["type"] == "library"]
|
||||
if not lib_only:
|
||||
continue
|
||||
if _is_close(src.source_name, lib_ds):
|
||||
close_matches.append({
|
||||
"library_name": lib_ds,
|
||||
"rule_count": len(lib_only),
|
||||
})
|
||||
close_matches.sort(key=lambda x: x["rule_count"], reverse=True)
|
||||
close_matches = close_matches[:3]
|
||||
|
||||
# Count how many rules reference each field (frequency)
|
||||
field_freq: dict[str, int] = {}
|
||||
for r in rules_for_src:
|
||||
for f in rule_fields_index.get(r["rule"], set()):
|
||||
field_freq[f] = field_freq.get(f, 0) + 1
|
||||
|
||||
# Fields the parser provides
|
||||
parser_provides = parser_index.get(matched_parser, set()) if matched_parser and matched_parser != "detected in data" else set()
|
||||
|
||||
# Minimum number of rules that must reference a field before we flag it.
|
||||
# Scales with rule count so single-rule oddities don't dominate.
|
||||
rule_count = len(rules_for_src)
|
||||
min_rules = max(2, round(rule_count * 0.05)) if rule_count >= 10 else 2
|
||||
|
||||
# Missing = dotted-path fields needed by >= min_rules rules,
|
||||
# not in schema constants, not provided by the parser.
|
||||
missing_fields = sorted(
|
||||
f for f, count in field_freq.items()
|
||||
if count >= min_rules
|
||||
and "." in f
|
||||
and f not in _SCHEMA_FIELDS
|
||||
and f not in parser_provides
|
||||
)
|
||||
|
||||
sources_out.append({
|
||||
"source_name": src.source_name,
|
||||
"event_count": src.event_count,
|
||||
"status": status,
|
||||
"parser": matched_parser,
|
||||
"format_type": format_type,
|
||||
"parser_fields": len(parser_provides),
|
||||
"parser_detected": src.parser_detected or 0,
|
||||
"rules": rules_for_src,
|
||||
"rule_count": len(rules_for_src),
|
||||
"close_matches": close_matches,
|
||||
"missing_fields": missing_fields,
|
||||
"missing_fields_count": len(missing_fields),
|
||||
"synced_at": src.synced_at.isoformat() if src.synced_at else None,
|
||||
})
|
||||
|
||||
synced_at = active_sources[0].synced_at.isoformat() if active_sources else None
|
||||
|
||||
return {
|
||||
"summary": {
|
||||
"active_sources": len(active_sources),
|
||||
"covered": covered_count,
|
||||
"parser_needed": needed_count,
|
||||
"parsers_loaded": len(parser_index),
|
||||
"rules_loaded": len(rules),
|
||||
},
|
||||
"sources": sources_out,
|
||||
"synced_at": synced_at,
|
||||
"has_sources": len(active_sources) > 0,
|
||||
}
|
||||
|
||||
|
||||
@router.delete("/reset")
|
||||
def reset_data(db: Session = Depends(get_db)):
|
||||
db.query(ParsedRule).delete()
|
||||
db.query(ParserField).delete()
|
||||
db.commit()
|
||||
return {"cleared": True}
|
||||
@@ -0,0 +1,122 @@
|
||||
from datetime import datetime, timedelta
|
||||
from fastapi import APIRouter, Query, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from services import s1_client
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _date_range(days: int) -> tuple[str, str]:
|
||||
now = datetime.utcnow()
|
||||
return (
|
||||
(now - timedelta(days=days)).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
|
||||
now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
|
||||
)
|
||||
|
||||
|
||||
def _date_range_hours(hours: int) -> tuple[str, str]:
|
||||
now = datetime.utcnow()
|
||||
return (
|
||||
(now - timedelta(hours=hours)).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
|
||||
now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
|
||||
)
|
||||
|
||||
|
||||
@router.get("/top-sources")
|
||||
async def get_top_sources(
|
||||
days: int = Query(None, ge=1, le=90),
|
||||
hours: int = Query(None, ge=1, le=24),
|
||||
):
|
||||
"""Top log sources by event count over the given period."""
|
||||
if hours is not None:
|
||||
from_dt, to_dt = _date_range_hours(hours)
|
||||
period_label = f"{hours}h"
|
||||
else:
|
||||
from_dt, to_dt = _date_range(days or 7)
|
||||
period_label = f"{days or 7}d"
|
||||
query = "| group events=count() by dataSource.name | sort -events | limit 25"
|
||||
try:
|
||||
result = await s1_client.run_powerquery(query, from_dt, to_dt)
|
||||
except Exception as e:
|
||||
raise HTTPException(502, f"PowerQuery error: {e}")
|
||||
return {"period": period_label, "data": result.get("events", [])}
|
||||
|
||||
|
||||
@router.get("/by-event-type")
|
||||
async def get_by_event_type(days: int = Query(7, ge=1, le=90)):
|
||||
"""Event counts grouped by source and event type."""
|
||||
from_dt, to_dt = _date_range(days)
|
||||
query = "| group events=count() by dataSource.name, event.type | sort -events | limit 100"
|
||||
try:
|
||||
result = await s1_client.run_powerquery(query, from_dt, to_dt)
|
||||
except Exception as e:
|
||||
raise HTTPException(502, f"PowerQuery error: {e}")
|
||||
return {"period_days": days, "data": result.get("events", [])}
|
||||
|
||||
|
||||
@router.get("/daily-volume")
|
||||
async def get_daily_volume(days: int = Query(5, ge=1, le=7)):
|
||||
"""Total event count per day — queries run in parallel."""
|
||||
import asyncio
|
||||
|
||||
now = datetime.utcnow()
|
||||
points = min(days, 7)
|
||||
|
||||
async def _fetch_day(i: int) -> dict:
|
||||
day_from = (now - timedelta(days=i + 1)).strftime("%Y-%m-%dT00:00:00.000Z")
|
||||
day_to = (now - timedelta(days=i)).strftime("%Y-%m-%dT00:00:00.000Z")
|
||||
label = (now - timedelta(days=i + 1)).strftime("%Y-%m-%d")
|
||||
try:
|
||||
result = await s1_client.run_powerquery("| group total=count()", day_from, day_to)
|
||||
events_list = result.get("events", []) if isinstance(result, dict) else []
|
||||
count = events_list[0].get("total", 0) if events_list else 0
|
||||
except Exception:
|
||||
count = 0
|
||||
return {"date": label, "events": count}
|
||||
|
||||
results = await asyncio.gather(*[_fetch_day(i) for i in range(points)])
|
||||
return list(reversed(results))
|
||||
|
||||
|
||||
class FilterRule(BaseModel):
|
||||
source: str = ""
|
||||
event_type: str = ""
|
||||
days: int = 7
|
||||
gb_per_million_events: float = 0.5
|
||||
|
||||
|
||||
@router.post("/simulate-filter")
|
||||
async def simulate_filter(rule: FilterRule):
|
||||
"""Estimate how many events and GB would be eliminated by an exclusion filter."""
|
||||
from_dt, to_dt = _date_range(rule.days)
|
||||
|
||||
clauses = []
|
||||
if rule.source:
|
||||
clauses.append(f"dataSource.name=='{rule.source}'")
|
||||
if rule.event_type:
|
||||
clauses.append(f"event.type=='{rule.event_type}'")
|
||||
|
||||
if clauses:
|
||||
filter_expr = " and ".join(clauses)
|
||||
query = f"| filter {filter_expr} | group events=count()"
|
||||
else:
|
||||
query = "| group events=count()"
|
||||
|
||||
try:
|
||||
result = await s1_client.run_powerquery(query, from_dt, to_dt)
|
||||
events = (result.get("events") or [{}])[0].get("events", 0) if isinstance(result.get("events"), list) else 0
|
||||
except Exception as e:
|
||||
raise HTTPException(502, f"PowerQuery error: {e}")
|
||||
|
||||
estimated_gb = round(events / 1_000_000 * rule.gb_per_million_events, 3)
|
||||
monthly_events = int(events / rule.days * 30)
|
||||
monthly_gb = round(monthly_events / 1_000_000 * rule.gb_per_million_events, 2)
|
||||
|
||||
return {
|
||||
"period_days": rule.days,
|
||||
"matched_events": events,
|
||||
"estimated_gb_period": estimated_gb,
|
||||
"projected_monthly_events": monthly_events,
|
||||
"projected_monthly_gb": monthly_gb,
|
||||
"filter": {"source": rule.source, "event_type": rule.event_type},
|
||||
}
|
||||
@@ -0,0 +1,440 @@
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from datetime import datetime, timedelta
|
||||
from services import s1_client
|
||||
import os
|
||||
import re
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
PARSERS_DIR = "/app/parsers"
|
||||
|
||||
|
||||
@router.get("/parsers")
|
||||
def list_parser_files():
|
||||
"""List parser filenames available under /app/parsers/ for the Test Runner."""
|
||||
try:
|
||||
names = sorted(
|
||||
e.name for e in os.scandir(PARSERS_DIR)
|
||||
if e.is_file() and not e.name.startswith(".")
|
||||
)
|
||||
except FileNotFoundError:
|
||||
names = []
|
||||
return {"parsers": names, "count": len(names)}
|
||||
|
||||
|
||||
@router.post("/sync-from-sdl")
|
||||
async def sync_parsers_from_sdl():
|
||||
"""Download every parser file under /logParsers/ on the SDL tenant into
|
||||
/app/parsers/. After this call returns, the Parser Test Runner dropdown
|
||||
will include all tenant parsers (including custom ones).
|
||||
|
||||
Requires SDL_CONFIG_READ_KEY in .env (Configuration Read scope on the
|
||||
Data Lake API key).
|
||||
"""
|
||||
if not s1_client.SDL_CONFIG_READ_KEY:
|
||||
raise HTTPException(
|
||||
400,
|
||||
"SDL_CONFIG_READ_KEY is not set in .env. Generate a Data Lake API key "
|
||||
"with 'Configuration Read' scope in the S1 console and add it to .env."
|
||||
)
|
||||
|
||||
try:
|
||||
names = await s1_client.list_sdl_parsers()
|
||||
except Exception as e:
|
||||
raise HTTPException(502, f"SDL listFiles failed: {e}")
|
||||
|
||||
os.makedirs(PARSERS_DIR, exist_ok=True)
|
||||
downloaded: list[str] = []
|
||||
errors: list[dict] = []
|
||||
|
||||
for name in names:
|
||||
# The path on SDL is /logParsers/<name>; we write to /app/parsers/<sanitized-name>.
|
||||
safe_name = name.replace("/", "_")
|
||||
try:
|
||||
resp = await s1_client.get_sdl_parser(name)
|
||||
content = resp.get("content")
|
||||
if content is None:
|
||||
errors.append({"parser": name, "error": "no content field in response"})
|
||||
continue
|
||||
with open(os.path.join(PARSERS_DIR, safe_name), "w", encoding="utf-8") as fh:
|
||||
fh.write(content)
|
||||
downloaded.append(safe_name)
|
||||
except Exception as e:
|
||||
errors.append({"parser": name, "error": str(e) or e.__class__.__name__})
|
||||
|
||||
return {
|
||||
"downloaded": len(downloaded),
|
||||
"parsers": downloaded,
|
||||
"errors": errors,
|
||||
"directory": PARSERS_DIR,
|
||||
}
|
||||
|
||||
|
||||
def _date_range_hours(hours: int) -> tuple[str, str]:
|
||||
now = datetime.utcnow()
|
||||
return (
|
||||
(now - timedelta(hours=hours)).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
|
||||
now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Models
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class SampleEventsRequest(BaseModel):
|
||||
source: str
|
||||
limit: int = 20
|
||||
hours: int = 1
|
||||
|
||||
|
||||
class FieldPopulationRequest(BaseModel):
|
||||
source: str
|
||||
hours: int = 24
|
||||
fields: list[str] = [
|
||||
"src.ip",
|
||||
"src.port",
|
||||
"dst.ip",
|
||||
"dst.port",
|
||||
"user.name",
|
||||
"event.type",
|
||||
"src.process.name",
|
||||
"src.process.cmdline",
|
||||
"tgt.file.path",
|
||||
"network.direction",
|
||||
"dataSource.name",
|
||||
]
|
||||
|
||||
|
||||
class TestParserRequest(BaseModel):
|
||||
parser_name: str
|
||||
log_line: str
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _flatten_dict(d: dict, prefix: str = "", out: dict | None = None) -> dict:
|
||||
"""Recursively flatten a nested dict into dotted keys."""
|
||||
if out is None:
|
||||
out = {}
|
||||
if not isinstance(d, dict):
|
||||
return out
|
||||
for k, v in d.items():
|
||||
key = f"{prefix}.{k}" if prefix else k
|
||||
if isinstance(v, dict):
|
||||
_flatten_dict(v, key, out)
|
||||
else:
|
||||
out[key] = v
|
||||
return out
|
||||
|
||||
|
||||
def _flatten_event(event: dict) -> dict:
|
||||
"""Return a flat field→value dict from a PowerQuery result row.
|
||||
|
||||
If the row only carries a JSON-stringified payload in `message` (i.e. the
|
||||
parser wasn't applied at query time), parse and flatten it inline so the
|
||||
UI can measure field population accurately. The original raw `message`
|
||||
is preserved under its own key.
|
||||
"""
|
||||
if not isinstance(event, dict):
|
||||
return {}
|
||||
flat = dict(event)
|
||||
msg = flat.get("message")
|
||||
if isinstance(msg, str) and msg.startswith("{") and msg.endswith("}"):
|
||||
try:
|
||||
parsed = __import__("json").loads(msg)
|
||||
if isinstance(parsed, dict):
|
||||
flat.update(_flatten_dict(parsed))
|
||||
except Exception:
|
||||
pass
|
||||
return flat
|
||||
|
||||
|
||||
def _extract_format_strings(content: str) -> list[str]:
|
||||
"""
|
||||
Extract SDL format string values from augmented-JSON parser content.
|
||||
Matches: "format": "..." (double-quoted value, supports escaped quotes).
|
||||
"""
|
||||
pattern = re.compile(r'"format"\s*:\s*"((?:[^"\\]|\\.)*)"')
|
||||
return pattern.findall(content)
|
||||
|
||||
|
||||
def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
|
||||
"""
|
||||
Convert an SDL format string to a compiled Python regex.
|
||||
|
||||
Returns (compiled_pattern, py_group_to_sdl_field) mapping so callers can
|
||||
translate group names back to the original SDL field names.
|
||||
|
||||
Raises re.error if the resulting pattern cannot be compiled.
|
||||
"""
|
||||
# Split on $...$ tokens
|
||||
token_pattern = re.compile(r'\$([^$]+)\$')
|
||||
parts = token_pattern.split(fmt)
|
||||
# parts alternates: literal, token, literal, token, ...
|
||||
|
||||
regex_parts: list[str] = []
|
||||
py_group_to_sdl: dict[str, str] = {}
|
||||
seen_groups: dict[str, int] = {}
|
||||
|
||||
for i, part in enumerate(parts):
|
||||
if i % 2 == 0:
|
||||
# Literal text
|
||||
regex_parts.append(re.escape(part))
|
||||
else:
|
||||
# Token: either "field.name=PATTERN" or just "field.name"
|
||||
if '=' in part:
|
||||
field_name, pattern = part.split('=', 1)
|
||||
else:
|
||||
field_name = part
|
||||
pattern = r'[^\s]+'
|
||||
|
||||
# Build a valid Python group name
|
||||
safe = re.sub(r'[.\-]', '_', field_name)
|
||||
if safe in seen_groups:
|
||||
seen_groups[safe] += 1
|
||||
safe = f"{safe}_{seen_groups[safe]}"
|
||||
else:
|
||||
seen_groups[safe] = 0
|
||||
|
||||
py_group_to_sdl[safe] = field_name
|
||||
regex_parts.append(f'(?P<{safe}>{pattern})')
|
||||
|
||||
compiled = re.compile(''.join(regex_parts), re.IGNORECASE)
|
||||
return compiled, py_group_to_sdl
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Endpoints
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.post("/sample-events")
|
||||
async def sample_events(req: SampleEventsRequest):
|
||||
"""Return a sample of raw events from a given data source."""
|
||||
query = f'| filter dataSource.name = "{req.source}" | limit {req.limit}'
|
||||
from_dt, to_dt = _date_range_hours(req.hours)
|
||||
|
||||
result = await s1_client.run_powerquery(query, from_dt, to_dt)
|
||||
|
||||
rows = result if isinstance(result, list) else (result.get("rows") or result.get("events") or [])
|
||||
events = [_flatten_event(row) for row in rows]
|
||||
|
||||
return {
|
||||
"source": req.source,
|
||||
"events": events,
|
||||
"count": len(events),
|
||||
"hours": req.hours,
|
||||
}
|
||||
|
||||
|
||||
@router.post("/field-population")
|
||||
async def field_population(req: FieldPopulationRequest):
|
||||
"""
|
||||
Analyse how consistently each requested field is populated across a sample
|
||||
of events from a data source.
|
||||
"""
|
||||
query = f'| filter dataSource.name = "{req.source}" | limit 500'
|
||||
from_dt, to_dt = _date_range_hours(req.hours)
|
||||
|
||||
result = await s1_client.run_powerquery(query, from_dt, to_dt)
|
||||
|
||||
rows = result if isinstance(result, list) else (result.get("rows") or result.get("events") or [])
|
||||
events = [_flatten_event(row) for row in rows]
|
||||
|
||||
if not events:
|
||||
raise HTTPException(status_code=404, detail=f"No events found for source '{req.source}' in the last {req.hours} hours.")
|
||||
|
||||
total = len(events)
|
||||
_empty = {None, "", "null"}
|
||||
|
||||
# Collect all field names seen across the sample (useful for surfacing what IS there)
|
||||
all_seen_fields = sorted({k for ev in events for k in ev})
|
||||
|
||||
field_stats = []
|
||||
for field in req.fields:
|
||||
# dataSource.name is always 100% — we filtered by it; Scalyr just doesn't echo it back
|
||||
if field == "dataSource.name":
|
||||
populated = total
|
||||
else:
|
||||
populated = sum(1 for ev in events if ev.get(field) not in _empty)
|
||||
rate = round((populated / total) * 100, 1)
|
||||
field_stats.append({
|
||||
"field": field,
|
||||
"populated": populated,
|
||||
"total": total,
|
||||
"rate": rate,
|
||||
})
|
||||
|
||||
# Sort ascending by rate (worst coverage first)
|
||||
field_stats.sort(key=lambda x: x["rate"])
|
||||
|
||||
return {
|
||||
"source": req.source,
|
||||
"total_sampled": total,
|
||||
"hours": req.hours,
|
||||
"fields": field_stats,
|
||||
"fields_seen_in_sample": all_seen_fields,
|
||||
}
|
||||
|
||||
|
||||
@router.post("/test-parser")
|
||||
async def test_parser(req: TestParserRequest):
|
||||
"""
|
||||
Test a parser against a raw log line by extracting and matching SDL format
|
||||
strings found in the parser file.
|
||||
"""
|
||||
parser_path = f"/app/parsers/{req.parser_name}"
|
||||
|
||||
try:
|
||||
with open(parser_path, "r", encoding="utf-8") as fh:
|
||||
content = fh.read()
|
||||
except FileNotFoundError:
|
||||
raise HTTPException(status_code=404, detail=f"Parser file not found: {req.parser_name}")
|
||||
except OSError as exc:
|
||||
raise HTTPException(status_code=500, detail=f"Could not read parser file: {exc}")
|
||||
|
||||
format_strings = _extract_format_strings(content)
|
||||
|
||||
# ── JSON auto-extract path ──────────────────────────────────────────────
|
||||
# SDL parsers that use `$=json{parse=json}$` (or any format containing
|
||||
# `parse=json`) auto-extract every top-level JSON key as an attribute.
|
||||
# The regex-based path can't model that — handle it explicitly so users
|
||||
# can test JSON-shaped logs against JSON-mode parsers.
|
||||
log_input = req.log_line.strip()
|
||||
is_json_mode = any("parse=json" in f for f in format_strings) or log_input.startswith("{")
|
||||
if is_json_mode:
|
||||
import json as _json
|
||||
# Support multi-line input (one JSON object per line, or a JSON array)
|
||||
lines = [ln for ln in (l.strip() for l in log_input.splitlines()) if ln]
|
||||
payloads: list[dict] = []
|
||||
parse_errors: list[str] = []
|
||||
# Single line: try direct parse; if it's a JSON array, expand.
|
||||
if len(lines) == 1:
|
||||
try:
|
||||
obj = _json.loads(lines[0])
|
||||
except Exception as e:
|
||||
return {
|
||||
"parser_name": req.parser_name,
|
||||
"matched": False,
|
||||
"message": f"Parser expects JSON but log line could not be parsed as JSON: {e}",
|
||||
"fields": [],
|
||||
}
|
||||
if isinstance(obj, list):
|
||||
payloads = [x for x in obj if isinstance(x, dict)]
|
||||
elif isinstance(obj, dict):
|
||||
payloads = [obj]
|
||||
else:
|
||||
return {
|
||||
"parser_name": req.parser_name,
|
||||
"matched": False,
|
||||
"message": "Parser expects a JSON object (got scalar).",
|
||||
"fields": [],
|
||||
}
|
||||
else:
|
||||
# Multi-line: one JSON object per line (NDJSON)
|
||||
for i, ln in enumerate(lines, 1):
|
||||
try:
|
||||
obj = _json.loads(ln)
|
||||
if isinstance(obj, dict):
|
||||
payloads.append(obj)
|
||||
else:
|
||||
parse_errors.append(f"line {i}: not a JSON object")
|
||||
except Exception as e:
|
||||
parse_errors.append(f"line {i}: {e}")
|
||||
|
||||
if not payloads:
|
||||
return {
|
||||
"parser_name": req.parser_name,
|
||||
"matched": False,
|
||||
"message": "No valid JSON objects found. " + " | ".join(parse_errors[:3]),
|
||||
"fields": [],
|
||||
}
|
||||
|
||||
# Use the first payload for the detail table; report totals.
|
||||
payload = payloads[0]
|
||||
extracted = _flatten_dict(payload)
|
||||
# Apply lightweight rewrites if present (input/output/match/replace blocks).
|
||||
# We only handle simple literal/regex matches with $0 or string replacements;
|
||||
# this is best-effort, intended for quick visual verification.
|
||||
rewrites_applied = []
|
||||
rewrite_re = re.compile(
|
||||
r'\{\s*input:\s*"([^"]+)"\s*,\s*output:\s*"([^"]+)"\s*,\s*match:\s*"((?:[^"\\]|\\.)*)"\s*,\s*replace:\s*"((?:[^"\\]|\\.)*)"\s*\}',
|
||||
re.DOTALL,
|
||||
)
|
||||
derived: dict[str, str] = {}
|
||||
for m in rewrite_re.finditer(content):
|
||||
in_field, out_field, match_pat, replace_val = m.group(1), m.group(2), m.group(3), m.group(4)
|
||||
src_val = extracted.get(in_field)
|
||||
if src_val is None:
|
||||
continue
|
||||
try:
|
||||
m2 = re.search(match_pat, str(src_val))
|
||||
except re.error:
|
||||
continue
|
||||
if not m2:
|
||||
continue
|
||||
# SDL uses $0 for whole match, $1.. for groups. Translate to Python
|
||||
# \g<0>, \g<1>, ... so re.sub doesn't read \0 as a null byte.
|
||||
def _to_py_backref(s: str) -> str:
|
||||
return re.sub(r"\$(\d+)", lambda mm: f"\\g<{mm.group(1)}>", s)
|
||||
try:
|
||||
val = re.sub(match_pat, _to_py_backref(replace_val), str(src_val), count=1)
|
||||
except re.error:
|
||||
val = replace_val
|
||||
derived[out_field] = val
|
||||
rewrites_applied.append({
|
||||
"input": in_field, "input_value": src_val,
|
||||
"output": out_field, "matched_on": match_pat, "result": val,
|
||||
})
|
||||
|
||||
fields = (
|
||||
[{"field": k, "value": v, "source": "json-extract"} for k, v in sorted(extracted.items())]
|
||||
+ [{"field": k, "value": v, "source": "rewrite"} for k, v in sorted(derived.items())]
|
||||
)
|
||||
return {
|
||||
"parser_name": req.parser_name,
|
||||
"matched": True,
|
||||
"mode": "json",
|
||||
"format_matched": "$=json{parse=json}$",
|
||||
"fields": fields,
|
||||
"rewrites_applied": rewrites_applied,
|
||||
"extracted_count": len(extracted),
|
||||
"derived_count": len(derived),
|
||||
"payload_count": len(payloads),
|
||||
"parse_errors": parse_errors,
|
||||
"showing_payload": 1,
|
||||
}
|
||||
|
||||
# ── Regex format-string path (original) ─────────────────────────────────
|
||||
for fmt in format_strings:
|
||||
try:
|
||||
compiled, py_to_sdl = _sdl_format_to_regex(fmt)
|
||||
except re.error:
|
||||
# Skip unparseable format strings
|
||||
continue
|
||||
|
||||
match = compiled.search(req.log_line)
|
||||
if match:
|
||||
fields = [
|
||||
{"field": py_to_sdl.get(group, group), "value": value}
|
||||
for group, value in match.groupdict().items()
|
||||
if value is not None
|
||||
]
|
||||
return {
|
||||
"parser_name": req.parser_name,
|
||||
"matched": True,
|
||||
"mode": "regex",
|
||||
"format_matched": fmt,
|
||||
"fields": fields,
|
||||
}
|
||||
|
||||
return {
|
||||
"parser_name": req.parser_name,
|
||||
"matched": False,
|
||||
"message": "No format pattern matched",
|
||||
"fields": [],
|
||||
}
|
||||
@@ -0,0 +1,105 @@
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
ENV_FILE = Path(os.environ.get("ENV_FILE_PATH", "/app/.env"))
|
||||
|
||||
# Fields we expose in the UI — order matters for display
|
||||
FIELDS = [
|
||||
{"key": "S1_BASE_URL", "label": "Console URL", "secret": False, "placeholder": "https://demo.sentinelone.net"},
|
||||
{"key": "S1_API_TOKEN", "label": "Console API Token", "secret": True, "placeholder": "eyJ..."},
|
||||
{"key": "SDL_XDR_URL", "label": "SDL XDR URL", "secret": False, "placeholder": "https://xdr.us1.sentinelone.net"},
|
||||
{"key": "SDL_LOG_READ_KEY", "label": "SDL Log Read Key", "secret": True, "placeholder": "1DnK0Y4e..."},
|
||||
{"key": "ANTHROPIC_API_KEY", "label": "Anthropic API Key", "secret": True, "placeholder": "sk-ant-..."},
|
||||
]
|
||||
|
||||
FIELD_KEYS = {f["key"] for f in FIELDS}
|
||||
|
||||
|
||||
def _read_env() -> dict[str, str]:
|
||||
"""Read .env file into a dict."""
|
||||
vals: dict[str, str] = {}
|
||||
if ENV_FILE.exists():
|
||||
for line in ENV_FILE.read_text().splitlines():
|
||||
line = line.strip()
|
||||
if line and not line.startswith("#") and "=" in line:
|
||||
k, _, v = line.partition("=")
|
||||
vals[k.strip()] = v.strip()
|
||||
return vals
|
||||
|
||||
|
||||
def _write_env(updates: dict[str, str]) -> None:
|
||||
"""Write updates into .env, preserving comments and unknown keys."""
|
||||
existing_lines: list[str] = []
|
||||
if ENV_FILE.exists():
|
||||
existing_lines = ENV_FILE.read_text().splitlines()
|
||||
|
||||
written: set[str] = set()
|
||||
new_lines: list[str] = []
|
||||
|
||||
for line in existing_lines:
|
||||
stripped = line.strip()
|
||||
if stripped and not stripped.startswith("#") and "=" in stripped:
|
||||
k, _, _ = stripped.partition("=")
|
||||
k = k.strip()
|
||||
if k in updates:
|
||||
new_lines.append(f"{k}={updates[k]}")
|
||||
written.add(k)
|
||||
continue
|
||||
new_lines.append(line)
|
||||
|
||||
# Append any new keys not already in the file
|
||||
for k, v in updates.items():
|
||||
if k not in written:
|
||||
new_lines.append(f"{k}={v}")
|
||||
|
||||
ENV_FILE.write_text("\n".join(new_lines) + "\n")
|
||||
|
||||
|
||||
@router.get("/config")
|
||||
async def get_config():
|
||||
"""Return current config values. Secrets are masked."""
|
||||
env_vals = _read_env()
|
||||
result = []
|
||||
for f in FIELDS:
|
||||
key = f["key"]
|
||||
# Prefer live env var, fall back to .env file value
|
||||
raw = os.environ.get(key, env_vals.get(key, ""))
|
||||
if f["secret"] and raw:
|
||||
# Show first 6 + last 4 chars, mask middle
|
||||
masked = raw[:6] + "•" * max(4, len(raw) - 10) + raw[-4:] if len(raw) > 10 else "••••••••"
|
||||
else:
|
||||
masked = raw
|
||||
result.append({
|
||||
"key": key,
|
||||
"label": f["label"],
|
||||
"secret": f["secret"],
|
||||
"placeholder": f["placeholder"],
|
||||
"value": masked,
|
||||
"set": bool(raw),
|
||||
})
|
||||
env_file_exists = ENV_FILE.exists()
|
||||
return {"fields": result, "env_file_exists": env_file_exists, "env_file_path": str(ENV_FILE)}
|
||||
|
||||
|
||||
class ConfigUpdate(BaseModel):
|
||||
updates: dict[str, str]
|
||||
|
||||
|
||||
@router.post("/config")
|
||||
async def save_config(body: ConfigUpdate):
|
||||
"""Save config values to .env file. Only known keys accepted."""
|
||||
bad = [k for k in body.updates if k not in FIELD_KEYS]
|
||||
if bad:
|
||||
raise HTTPException(400, f"Unknown keys: {bad}")
|
||||
if not ENV_FILE.parent.exists():
|
||||
raise HTTPException(503, f"Cannot write to {ENV_FILE} — check Docker volume mount")
|
||||
try:
|
||||
_write_env(body.updates)
|
||||
except Exception as e:
|
||||
raise HTTPException(500, f"Failed to write .env: {e}")
|
||||
return {"saved": list(body.updates.keys()), "restart_required": True}
|
||||
Reference in New Issue
Block a user