commit 79efb6bf7da0ed308600e164bc18c937c9b5e765 Author: marc Date: Wed May 20 23:44:53 2026 +0200 v0.1 Mick Marc merged diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..c5de2c8 --- /dev/null +++ b/.env.example @@ -0,0 +1,29 @@ +# ───────────────────────────────────────────────────────────────────────────── +# SIEM Toolkit — Environment Configuration +# ───────────────────────────────────────────────────────────────────────────── +# 1. Copy this file: cp .env.example .env +# 2. Fill in values below (see comments for where to find each one) +# 3. Start the app: docker-compose up -d --build +# ───────────────────────────────────────────────────────────────────────────── + +# SentinelOne Management Console +# ─ URL: your console (e.g. https://demo.sentinelone.net) +# ─ Token: Settings → Users → Service Users → generate API token +S1_BASE_URL=https://demo.sentinelone.net +S1_API_TOKEN= + +# Singularity Data Lake (SDL) — PowerQuery credentials +# ─ Console: Settings → Integrations → Data Lake API Keys +# ─ XDR URL: shown on the API Keys page (e.g. https://xdr.us1.sentinelone.net) +# ─ Log Read Key: copy the "Log Read" key from that page +SDL_XDR_URL=https://xdr.us1.sentinelone.net +SDL_LOG_READ_KEY= + +# Anthropic (for Onboarding Accelerator AI features) +# ─ https://console.anthropic.com/settings/api-keys +ANTHROPIC_API_KEY= + +# SDL Configuration Read key — used by /api/quality/sync-from-sdl to +# download parser files from /logParsers/ on the SDL tenant. +# Generate in S1 console: Settings -> Integrations -> Data Lake API Keys (Configuration Read scope). 
+SDL_CONFIG_READ_KEY= diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..aba9209 --- /dev/null +++ b/.gitignore @@ -0,0 +1,15 @@ +.env +mcp_config.txt +__pycache__/ +*.pyc +node_modules/ +.next/ +frontend/out/ +pgdata/ +parsers/*.json +data/ + +# Tenant-synced parsers (downloaded via /api/quality/sync-from-sdl) - +# do not commit; each tenant generates its own set +parsers/* +!parsers/.gitkeep diff --git a/CHANGES.md b/CHANGES.md new file mode 100644 index 0000000..869cfb2 --- /dev/null +++ b/CHANGES.md @@ -0,0 +1,104 @@ +# Changes vs upstream `mickbrowns1/SIEM-Toolkit` + +All edits are confined to a handful of files; everything else is untouched. + +## `backend/services/s1_client.py` + +### PowerQuery client +- All raised exceptions now include the request body / status / query so the + UI never shows a blank `"PowerQuery error: "`. +- Non-JSON responses (HTML 5xx gateway pages) surface as a readable error + string instead of crashing on `resp.json()`. + +### Detection library: site-scope fallback (`get_platform_rules`) +- Upstream hardcoded **account scope** which 403s with site-scoped API + tokens. Added `get_scope_for_platform_rules()` that probes `/accounts` + first, then `/sites`, returning whichever scope the token can access. +- `get_account_id()` now also reads `accountId` from the `/sites` payload as + a fallback for site-scoped tokens. + +### SDL parser sync helpers +- `list_sdl_parsers()` — rewritten to use the real **SDL Configuration File + API** (`POST /api/listFiles` with `pathPrefix=/logParsers/`). Previously + it hit a 404 path on the mgmt console. +- `get_sdl_parser()` — rewritten to `POST /api/getFile` with `{path}`. +- New `_sdl_config_headers()` helper that uses `SDL_CONFIG_READ_KEY` (a + separate scope from `SDL_LOG_READ_KEY`). 
+ +## `backend/routers/ingest.py` + +- `/api/ingest/simulate-filter`: + * Rebuilt the query into valid SDL syntax — was generating + `| group events=count()` (dangling pipe) for empty bodies; now uses a + proper base expression and falls back to `dataSource.name!=''` baseline. + * Field name corrected from `src.name` → `dataSource.name`. + * Surfaces both `result["error"]` and exception text so blank + `"PowerQuery error: "` messages are gone. + +## `backend/routers/quality.py` + +- `GET /api/quality/parsers`: lists actual parser filenames in + `/app/parsers/` (drives the Test Runner dropdown). +- **New `POST /api/quality/sync-from-sdl`**: downloads every parser file + under `/logParsers/` on the SDL tenant into `/app/parsers/`. After this + call returns, the Parser Test Runner dropdown automatically reflects all + tenant parsers (including custom OCSF parsers like + `Avelios-Medical-OCSF`). Requires `SDL_CONFIG_READ_KEY` in `.env`. +- `_flatten_event`: when a PowerQuery row only carries a JSON-stringified + payload in `message` (i.e. the parser isn't applied at query time), parse + and flatten that JSON inline so the Field Population tool can measure real + coverage. +- `POST /api/quality/test-parser`: + * Detects SDL JSON-mode parsers (`$=json{parse=json}$`) and parses log + lines as JSON. + * Applies parser `rewrites: [{input,output,match,replace}]` blocks with + correct `$0/$N` backreference translation (`$0` was being mangled to a + null byte). + * Accepts single JSON object, JSON array, or NDJSON multi-line input. + * Returns mode badge data + per-payload counters for the UI. + +## `frontend/index.html` + +- Parser Test Runner dropdown now loads from `/api/quality/parsers` instead + of filtering the coverage map (which only has `detected in data` + placeholders). +- Field Population and Sample Events: added **Last 7d** lookback option. 
+- Parser Test Runner UI: mode badge (`JSON auto-extract` vs `regex format`), + payload counter for multi-line input, separate tables for extracted vs + derived/rewritten fields. + +## `docker-compose.yml` + +- Pass `SDL_CONFIG_READ_KEY` through to the backend container. + +## `.env.example` / `.gitignore` + +- Document the new `SDL_CONFIG_READ_KEY` variable. +- Broaden `.gitignore` so `parsers/*` (tenant-specific synced content) is + not committed. + +## New helper scripts (`tools/`) + +- `sync_sdl_parsers.py` — pull all `/logParsers/*` from the tenant. +- `probe_pq_syntax.py` — probe which PowerQuery syntaxes the tenant accepts. +- `probe_avelios{,_wide,_fields}.py` — inspect a source's event presence, + columns, and embedded JSON fields. +- `test_avelios_parser.py`, `test_avelios_multi.py` — smoke-test the patched + `/api/quality/test-parser` endpoint with single-line and multi-line input. +- `probe_simulate_filter.py` — smoke-test the patched + `/api/ingest/simulate-filter` endpoint with progressively larger windows. +- `probe_sync_from_sdl.py` — call `/api/quality/sync-from-sdl` and verify + that `/api/quality/parsers` then reflects the downloaded parsers. +- `sdl_config.example.json` — template config (the toolkit's `.env` is + separate from the SDL config used by these helper scripts). + +## New `.env` knobs + +```bash +# PowerQuery transport tuning (both optional; defaults work for most tenants) +SDL_PQ_TIMEOUT=600 # PowerQuery read timeout in seconds (default 600) +SDL_PQ_TIMEOUT_RETRIES=1 # extra retries on ReadTimeout (default 1) + +# Required for /api/quality/sync-from-sdl +SDL_CONFIG_READ_KEY=... 
# Data Lake API key with Configuration Read scope +``` diff --git a/PATCHES.md b/PATCHES.md new file mode 100644 index 0000000..31d3d15 --- /dev/null +++ b/PATCHES.md @@ -0,0 +1,168 @@ +# SIEM-Toolkit Patches & Helper Scripts + +A drop-in patch set that fixes several issues in the upstream +[`mickbrowns1/SIEM-Toolkit`](https://github.com/mickbrowns1/SIEM-Toolkit) and +adds helper scripts for syncing parsers from a SentinelOne SDL tenant and +probing PowerQuery / event data. + +## What's fixed in the upstream code + +| File | Fix | +|---|---| +| `backend/routers/ingest.py` | **Filter Simulator** PowerQuery rewritten — replaced legacy `count() as events` and `src.name` field with valid SDL `\| filter dataSource.name=='X' \| group events=count()` | +| `backend/routers/quality.py` | New `GET /api/quality/parsers` endpoint lists actual parser files; `_flatten_event` now JSON-parses nested `message` payloads so the **Field Population** tool reports real coverage (was always 0% for sources where the parser isn't applied at query time) | +| `backend/routers/quality.py` (Parser Test Runner) | Detects SDL JSON auto-extract format `$=json{parse=json}$` and parses log lines as JSON; applies parser `rewrites` (`input/output/match/replace` blocks) with correct `$0`/`$N` backreference handling; accepts **single JSON / JSON array / NDJSON** input | +| `frontend/index.html` | Parser dropdown now loads from `/api/quality/parsers` (was filtering `coverage/map` which only has `detected in data` placeholders); added **Last 7d** lookback to both Field Population and Sample Events; Test Runner UI now shows mode badge (`JSON auto-extract` vs `regex format`), payload count for multi-line input, and separate tables for extracted vs derived/rewritten fields | + +## What's NOT fixed in the upstream code (configuration) + +The repo's `docker-compose.yml` interpolates `S1_BASE_URL` etc. from +`.env` at compose-up time. 
**A `docker compose restart` does NOT pick up +`.env` changes** — always use `docker compose up -d --force-recreate backend`. + +`S1_BASE_URL` must be the **per-tenant management console subdomain** (e.g. +`usea1-XXXX.sentinelone.net`), not the regional SDL/XDR endpoint. If you +only know the XDR URL, you can probe candidates with curl: + +```bash +TOKEN=$(jq -r .api_token < ~/.../mgmt-config.json) +for H in usea1-yourtenant usea1-purple usea1-partners; do + printf "%-45s %s\n" "$H" \ + "$(curl -s -o /dev/null -w '%{http_code}' \ + "https://$H.sentinelone.net/web/api/v2.1/cloud-detection/rules?limit=1" \ + -H "Authorization: ApiToken $TOKEN")" +done +# 200 = correct host +``` + +## Contents + +``` +. +├── README.md (this file) +├── env.example template for the toolkit's .env +├── sdl_config.example.json template for helper scripts' SDL config +├── patched-files/ +│ ├── backend/routers/ +│ │ ├── ingest.py <- copy over upstream +│ │ └── quality.py <- copy over upstream +│ └── frontend/ +│ └── index.html <- copy over upstream +└── scripts/ + ├── sync_sdl_parsers.py pull all /logParsers/* from the tenant into ./parsers/ + ├── probe_pq_syntax.py test what PowerQuery dialect the tenant accepts + ├── probe_avelios.py sample probe: find a source's events + columns + ├── probe_avelios_wide.py same, sweeping 1d/3d/7d + ├── probe_avelios_fields.py parse JSON `message` payloads & count fields + ├── test_avelios_parser.py hit /api/quality/test-parser with one JSON line + └── test_avelios_multi.py same, with multi-line NDJSON +``` + +## Applying the patches + +1. Clone the upstream repo: + ```bash + git clone https://github.com/mickbrowns1/SIEM-Toolkit.git + cd SIEM-Toolkit + ``` +2. 
Overlay the patched files: + ```bash + PATCH=/path/to/this/dir + cp "$PATCH"/patched-files/backend/routers/quality.py backend/routers/quality.py + cp "$PATCH"/patched-files/backend/routers/ingest.py backend/routers/ingest.py + cp "$PATCH"/patched-files/frontend/index.html frontend/index.html + ``` +3. Configure: + ```bash + cp "$PATCH"/env.example .env + $EDITOR .env # fill in your real values + ``` +4. Start the stack: + ```bash + docker compose up -d --build + open http://localhost:3001 + ``` + +## Helper-script setup + +The helper scripts read a small JSON config (separate from the toolkit's `.env`) +containing your SDL log-read / config-read keys: + +```bash +cp sdl_config.example.json scripts/sdl_config.json +$EDITOR scripts/sdl_config.json +# or set the env var +export SDL_CONFIG=/somewhere/sdl_config.json +``` + +## Helper-script usage + +### Sync parsers from the SDL tenant into the toolkit's `parsers/` dir + +```bash +PARSERS_DIR=/path/to/SIEM-Toolkit/parsers \ + python3 scripts/sync_sdl_parsers.py +``` + +`PARSERS_DIR` defaults to `../parsers` relative to the script. + +### Probe PowerQuery syntax compatibility on your tenant + +```bash +python3 scripts/probe_pq_syntax.py +``` + +Output tells you which command shapes (`| group ...`, `filter ...`, `count() as`, etc.) +work on the active deployment. + +### Inspect what a given source's events actually look like + +```bash +python3 scripts/probe_avelios.py # finds a source's name + 1-line sample +python3 scripts/probe_avelios_wide.py # sweeps 1d/3d/7d top sources +python3 scripts/probe_avelios_fields.py # if `message` is JSON, flatten & count fields +``` + +The scripts are named `*_avelios` for the original use case but work for **any +source** — open the file and change the `dataSource.name` filter. 
+ +### Smoke-test the patched Parser Test Runner endpoint + +```bash +python3 scripts/test_avelios_parser.py # single-line JSON +python3 scripts/test_avelios_multi.py # multi-line NDJSON +``` + +These hit `http://localhost:8001/api/quality/test-parser` directly so you can +verify the backend without using the UI. + +## Common pitfalls + +- **Parser dropdown is empty** → run `sync_sdl_parsers.py`. The upstream "Load + SDL Parsers" button only indexes whatever already exists in `parsers/`. +- **Field Population shows 0% everywhere** → the source's parser isn't being + applied at query time, so PowerQuery returns just `timestamp`+`message`. + This patch's `_flatten_event` parses JSON inside `message`. Also try widening + the window (the new **Last 7d** option) — some sources are low-volume. +- **PowerQuery 400 "Unknown command [count]"** → fixed in `ingest.py`. If you + hit it elsewhere, the rule is: SDL PowerQuery requires `| group events=count()`, + never `| count() as events`, and `count()` must be inside a `group`. +- **STAR rules → 302 to /404** → `S1_BASE_URL` is pointed at the SDL/XDR URL + instead of the management-console subdomain. 
+ +## Verification + +After applying patches and recreating containers: + +```bash +curl http://localhost:8001/health +curl http://localhost:8001/api/quality/parsers | python3 -m json.tool # count > 0 +curl 'http://localhost:8001/api/ingest/top-sources?hours=24' # real numbers +curl -X POST http://localhost:8001/api/coverage/load-star-rules # not 502 +``` + +In the UI: +- **Coverage Map**: shows `parsers_loaded` and `rules_loaded` > 0 +- **Ingest → Filter Simulator**: returns matched events + projected GB/month +- **Parser Quality → Parser Test Runner**: dropdown lists all parsers +- **Parser Quality → Field Population**: real coverage rates (not all 0%) diff --git a/README.md b/README.md new file mode 100644 index 0000000..c769b3e --- /dev/null +++ b/README.md @@ -0,0 +1,276 @@ +# SIEM Toolkit — SentinelOne AI-SIEM + +> *Inspired by Pineapple Boy!* 🍍 + +A self-hosted troubleshooting and visibility tool for SentinelOne AI-SIEM SecOps engineers. Runs as a Docker Compose stack against your SentinelOne demo or production tenant and provides real-time insight into parser coverage, ingest volume, and data quality — all without leaving a single interface. + +--- + +## What's Inside + +| Page | Purpose | +|---|---| +| **Overview** | Live health stats — coverage percentage, active sources, top uncovered sources by volume | +| **Parser Coverage Map** | Which active data sources have a parser? Which don't? 
| +| **Ingest Dashboard** | Event volume, top sources, cost projection, filter simulator | +| **Parser Quality** | Live event sampler, field population rate, parser test runner | +| **Onboarding Accelerator** | Prompt template for onboarding new log sources with Claude Code | +| **Settings** | Manage your `.env` credentials directly from the interface | + +--- + +## Architecture + +``` +browser → nginx (port 3001) → single-page HTML/JS application + ↓ API calls + FastAPI backend (port 8001) + ↓ + ┌───────────────────────────┐ + │ PostgreSQL (SQLAlchemy) │ parser fields, active sources + └───────────────────────────┘ + ↓ + ┌───────────────────────────┐ + │ SentinelOne APIs │ + │ • Management API │ demo.sentinelone.net + │ • Scalyr XDR PowerQuery │ xdr.us1.sentinelone.net + └───────────────────────────┘ +``` + +All services run via Docker Compose. The `parsers/` directory is volume-mounted into the backend so SDL parser files may be loaded without rebuilding the image. + +--- + +## Setup + +### 1. Clone and Configure + +```bash +git clone https://github.com/mickbrowns1/SIEM-Toolkit.git +cd SIEM-Toolkit +cp .env.example .env +``` + +Edit `.env` with your credentials: + +```env +S1_BASE_URL=https://demo.sentinelone.net # Your console URL +S1_API_TOKEN=eyJ... # Service user API token (account scope or higher) +SDL_XDR_URL=https://xdr.us1.sentinelone.net # Scalyr XDR endpoint +SDL_LOG_READ_KEY=1j2IU0S... # Data Lake read key +ANTHROPIC_API_KEY= # Optional — not currently used +``` + +**S1_API_TOKEN** — generate at *Settings → Users → Service Users* in the console. The service user should be provisioned at **account scope** or higher. +**SDL_LOG_READ_KEY** — found at *Settings → Integrations → Data Lake API Keys*. + +### 2. Add the Detection Library (strongly recommended) + +The Detection Fields Missing column and per-source detection counts on the Coverage Map require a local detections export. 
This is generated from the [detection-validator](https://github.com/mickbrowns1/detection-validator) repository. + +```bash +# Clone the detection-validator repo alongside this one +git clone https://github.com/mickbrowns1/detection-validator.git +cd detection-validator + +# Follow its README to generate the export, then copy the output here: +mkdir -p ../SIEM-Toolkit/data +cp data/data/detections/extracted.json ../SIEM-Toolkit/data/detections.json + +cd ../SIEM-Toolkit +``` + +The `data/` directory is gitignored and never committed. Once the stack is running, click **Load Detections** on the Coverage Map to import the rules into the database. + +### 3. Add Parser Files (optional but strongly recommended) + +Place your SDL parser JSON files into the `parsers/` directory. The backend reads them directly at query time — no rebuild is necessary. + +```bash +cp ~/my-parsers/*.json parsers/ +``` + +### 4. Start the Stack + +```bash +docker-compose up -d --build +``` + +Open **http://localhost:3001** in your browser and you're off. + +--- + +## Features + +### Overview Dashboard + +The landing page gives you an at-a-glance health summary drawn live from the database: + +- **Parser Coverage %** — proportion of active sources with a confirmed parser +- **Active Sources** — total number of `dataSource.name` values seen in the last 7 days +- **Covered / Need Parser** — counts for each status + +If any sources are uncovered, the **Top Sources Needing a Parser** table lists the highest-volume offenders. Click any source name to jump directly to the Parser Quality page with that source pre-selected. + +--- + +### Parser Coverage Map + +Answers the question: *does each active data source have a parser running?* + +**How it works:** + +1. **Sync Live Sources** — executes a PowerQuery against your data lake to retrieve every `dataSource.name` seen in the last 7 days, along with event counts. +2. 
**Load SDL Parsers** — reads parser files from `parsers/`, extracts the `dataSource.name` attribute from each, and stores the field list in the database. + +**Matching logic (three-tier):** +1. Exact `dataSource.name` match between the active source and the parser attribute +2. Normalised substring match (ignores spaces, dashes, and case) between the active source name and the parser's `dataSource.name` +3. Normalised substring match against the parser filename — catches files where the `dataSource.name` attribute is incorrect or missing + +**Parser detection from data:** During sync, a parallel PowerQuery checks whether each source has events with `event.type` populated in the data lake. If so, a parser is confirmed as running — the source is marked **Covered** even without a local parser file. This handles built-in and cloud-managed parsers that are not present in your `parsers/` folder. + +**Status values:** +- 🟢 **Covered** — custom parser confirmed (local file or detected via parsed events in the data lake) +- 🔴 **Parser Needed** — no parser found, or only a grok/dottedJson format (which typically indicates an incomplete parser) + +**Filters:** Use the filter pills to focus on Custom Parser only, Default Parser Only (data lake detected), or No Parser. + +**Deep link:** Click any source name in the table to open it directly in Parser Quality with all dropdowns pre-populated. + +**Expected results:** After syncing sources and loading parsers, sources with active SDL parsers will appear as Covered. Sources sending raw, unparsed data — where only `message` and `timestamp` appear in the data lake — will appear as Parser Needed. + +--- + +### Ingest Dashboard + +Answers the question: *where is my event volume coming from, and what would happen if I filtered some of it?* + +**Time range:** 1h (default), 3d, 5d, 7d + +**Daily Event Volume** — bar chart of total events per day. In 1h mode, this switches to a by-source breakdown of the current hour's activity. 
+ +**Top Sources** — a table of the 25 highest-volume `dataSource.name` values with event count and estimated GB (calculated at 0.5 GB per million events). + +**Filter Simulator** — enter a source name and an optional event type, then press Simulate. The backend runs a live PowerQuery counting matching events and projects: +- Matched events in the selected period +- Estimated GB that would be saved +- Projected monthly events and GB if the filter were applied permanently + +This is entirely read-only — no filter is created or applied. Use the results to inform an exclusion rule you apply manually in the console. + +**Expected results:** Top sources should reflect what you see in the SentinelOne console PowerQuery tool. The filter simulator provides a reasonable GB estimate assuming uniform event size across the source. + +--- + +### Parser Quality + +Three tools for diagnosing parser extraction failures. + +#### Live Event Sampler + +Pulls raw events from a selected source directly from the data lake and renders every field that came back. The `message` column is pinned to the right of the table, with a **⎘ copy** button on each row for convenient extraction of raw log lines. + +- **Empty fields** are displayed as `∅` in grey — immediately highlighting fields the parser is failing to populate +- **Healthy source:** many fields populated (`src.ip`, `user.name`, `event.type`, etc.), with `message` present as the raw log backup +- **Unhealthy source:** only `timestamp` and `message` populated — the parser is not extracting anything of value + +#### Field Population Rate + +Samples up to 500 events from a source and measures what percentage of them have each field populated. Results are sorted worst-first so the most pressing gaps are immediately visible. + +When you select a source, the tool automatically discovers which fields exist in that source's events and pre-fills the field list — merged with SDL schema defaults. 
The list is fully editable before running the analysis. + +**Colour coding:** +- 🟢 ≥ 80% — healthy extraction +- 🟡 40–79% — partial extraction; check your regex patterns +- 🔴 < 40% — field is rarely populated; the parser is likely not matching this log format variant + +**Healthy parser:** Key fields such as `src.ip`, `event.type`, and `user.name` should sit between 70–100%. Niche fields like `src.process.cmdline` or `tgt.file.path` will naturally be lower, as not every event type produces them. + +**Broken parser:** All SDL fields at 0%, with only `timestamp` and `message` visible in the "fields seen in sample" chip list at the bottom of the results. + +#### Parser Test Runner + +Paste a raw log line, select a loaded parser, and press Test. The backend extracts SDL `$field=pattern$` format strings from the parser file, converts them to Python named-group regular expressions, and tries each against your log line. + +- **Matched:** displays the format string that matched and every field extracted with its value +- **No match:** none of the parser's format strings apply to this log line — the log may contain a format variant the parser does not yet cover + +> **Note:** Only parsers using SDL custom format strings are supported by the test runner. Grok and dottedJson parsers are not currently testable here. + +--- + +### Onboarding Accelerator + +A prompt template for using Claude Code to onboard a new log source. Copy the template, paste a sample of raw log lines, and Claude Code will generate: + +- An SDL parser skeleton in augmented-JSON format +- Field mappings to the SDL common schema +- Parser test assertions + +No Anthropic API key is required — this uses Claude Code directly from your terminal. + +--- + +### Settings + +Read and write your `.env` credentials from the interface. Secret fields (API tokens, keys) are masked by default with a show/hide toggle. 
Changes are written to the mounted `.env` file and take effect after restarting the backend: + +```bash +docker-compose up -d --build backend +``` + +--- + +## Rebuilding + +```bash +# Full rebuild +docker-compose up -d --build + +# Backend only (after Python changes) +docker-compose up -d --build backend + +# Frontend only (after HTML/JS changes) +docker-compose up -d --build frontend + +# Reset the database +curl -X DELETE http://localhost:8001/api/coverage/reset +``` + +--- + +## Project Layout + +``` +. +├── backend/ +│ ├── main.py # FastAPI application, router registration +│ ├── db.py # SQLAlchemy models +│ ├── routers/ +│ │ ├── coverage.py # Parser coverage map endpoints +│ │ ├── ingest.py # Ingest dashboard + filter simulator +│ │ ├── quality.py # Parser quality tools +│ │ └── settings.py # .env read/write +│ └── services/ +│ ├── s1_client.py # SentinelOne + Scalyr API client +│ └── rule_parser.py # SDL format string field extraction +├── frontend/ +│ └── index.html # Single-page application (Tailwind, vanilla JS) +├── parsers/ # SDL parser files (volume-mounted) +├── db/ +│ └── init.sql # Postgres initialisation (tables created by SQLAlchemy) +├── docker-compose.yml +├── .env.example +└── README.md +``` + +--- + +## Notes + +- The backend queries your **demo tenant** (`demo.sentinelone.net`) — not usea1-purple or any other tenant. Ensure your `S1_BASE_URL` and `SDL_LOG_READ_KEY` are pointed at the same tenant. +- Parser files in `parsers/` are read at query time, not on startup — add or update files at any point without rebuilding the image. +- The filter simulator is entirely read-only and makes no changes whatsoever to your tenant configuration. +- The service user API token must be at **account scope** or higher. Site-scoped tokens will have limited visibility into rules and may see reduced source counts. 
diff --git a/backend/Dockerfile b/backend/Dockerfile new file mode 100644 index 0000000..061e6ff --- /dev/null +++ b/backend/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3.12-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] diff --git a/backend/db.py b/backend/db.py new file mode 100644 index 0000000..aaa93fa --- /dev/null +++ b/backend/db.py @@ -0,0 +1,55 @@ +import os +from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime, Text +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.orm import declarative_base, sessionmaker +from datetime import datetime + +DATABASE_URL = os.environ.get("DATABASE_URL", "postgresql://siem:siem@db:5432/siem") + +engine = create_engine(DATABASE_URL) +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) +Base = declarative_base() + + +class ParsedRule(Base): + __tablename__ = "parsed_rules" + id = Column(Integer, primary_key=True) + rule_id = Column(String, unique=True, index=True) + name = Column(String) + rule_type = Column(String) # 'star' or 'sigma' + fields_used = Column(JSONB) + raw = Column(Text) + cached_at = Column(DateTime, default=datetime.utcnow) + + +class ParserField(Base): + __tablename__ = "parser_fields" + id = Column(Integer, primary_key=True) + parser_name = Column(String, index=True) + field_name = Column(String) + field_type = Column(String) + + +class ActiveSource(Base): + __tablename__ = "active_sources" + id = Column(Integer, primary_key=True) + source_name = Column(String, unique=True, index=True) + event_count = Column(Integer, default=0) + synced_at = Column(DateTime, default=datetime.utcnow) + parser_detected = Column(Integer, default=0) # >0 means parsed events seen in data lake + + +class IngestSnapshot(Base): + __tablename__ = "ingest_snapshots" + id = Column(Integer, primary_key=True) + period_days = 
Column(Integer) + data = Column(JSONB) + recorded_at = Column(DateTime, default=datetime.utcnow) + + +def get_db(): + db = SessionLocal() + try: + yield db + finally: + db.close() diff --git a/backend/main.py b/backend/main.py new file mode 100644 index 0000000..b0b67de --- /dev/null +++ b/backend/main.py @@ -0,0 +1,68 @@ +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from db import engine, Base, get_db, ParsedRule +from routers import coverage, ingest, settings, quality + +Base.metadata.create_all(bind=engine) + +# Runtime migration: add columns that didn't exist in earlier schema versions +from sqlalchemy import text +with engine.connect() as _conn: + _conn.execute(text( + "ALTER TABLE active_sources ADD COLUMN IF NOT EXISTS parser_detected INTEGER DEFAULT 0" + )) + _conn.commit() + +app = FastAPI(title="SIEM Toolkit", version="1.0.0") + + +@app.on_event("startup") +async def auto_load_detections(): + """ + Auto-load detection library rules on startup. + Tries the live S1 API first (accurate 'sources' field); falls back to extracted.json. + Skips if rules are already loaded — use the 'Sync Library' button to force a refresh. 
+ """ + import os + from sqlalchemy.orm import Session + from services import s1_client + + db: Session = next(get_db()) + try: + existing = db.query(ParsedRule).filter_by(rule_type="library").count() + if existing > 0: + return # Already loaded — skip until user manually refreshes + + # Try live API first + try: + rules = await s1_client.get_platform_rules() + if rules: + coverage._import_from_api_rules(db, rules) + return + except Exception: + pass + + # Fall back to local file + detections_file = os.environ.get("DETECTIONS_FILE", "/app/data/detections.json") + if os.path.exists(detections_file): + coverage._import_detections(db, detections_file) + finally: + db.close() + +app.add_middleware( + CORSMiddleware, + allow_origins=["http://localhost:3001"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +app.include_router(coverage.router, prefix="/api/coverage", tags=["Coverage"]) +app.include_router(ingest.router, prefix="/api/ingest", tags=["Ingest"]) +app.include_router(settings.router, prefix="/api/settings", tags=["Settings"]) +app.include_router(quality.router, prefix="/api/quality", tags=["Quality"]) + + +@app.get("/health") +def health(): + return {"status": "ok"} diff --git a/backend/requirements.txt b/backend/requirements.txt new file mode 100644 index 0000000..d242925 --- /dev/null +++ b/backend/requirements.txt @@ -0,0 +1,9 @@ +fastapi==0.115.0 +uvicorn[standard]==0.30.0 +httpx==0.27.2 +psycopg2-binary==2.9.9 +sqlalchemy==2.0.36 +pydantic==2.9.2 +pydantic-settings==2.6.1 +pyyaml==6.0.2 +python-multipart==0.0.12 diff --git a/backend/routers/__init__.py b/backend/routers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/routers/coverage.py b/backend/routers/coverage.py new file mode 100644 index 0000000..1e5516d --- /dev/null +++ b/backend/routers/coverage.py @@ -0,0 +1,648 @@ +import json +import os +from fastapi import APIRouter, UploadFile, File, Depends, HTTPException +from pydantic import 
BaseModel +from sqlalchemy.orm import Session +from datetime import datetime +from db import get_db, ParsedRule, ParserField, ActiveSource +from services import s1_client, rule_parser + +DETECTIONS_FILE = os.environ.get("DETECTIONS_FILE", "/app/data/detections.json") + +router = APIRouter() + + +def _star_query_texts(rule: dict) -> list[str]: + """ + Extract all PowerQuery/filter strings from a STAR rule. + Handles simple rules (s1ql) and correlation rules (subQueries[].subQuery). + """ + texts = [] + + # Simple rules + for field in ("s1ql", "queryLang", "query", "powerQuery"): + v = rule.get(field) + # queryLang "2.0" is a version string, not a query — skip short strings + if v and isinstance(v, str) and len(v) > 5: + texts.append(v) + + # Correlation rules: subQueries[].subQuery + cp = rule.get("correlationParams") or {} + for sq in cp.get("subQueries", []): + v = sq.get("subQuery") + if v and isinstance(v, str): + texts.append(v) + # Also handle older conditions[] format + for cond in cp.get("conditions", []): + for key in ("filter", "query", "subQuery"): + v = cond.get(key) + if v and isinstance(v, str): + texts.append(v) + + return texts + + +@router.post("/load-star-rules") +async def load_star_rules(db: Session = Depends(get_db)): + """Fetch all STAR rules from the Management Console API and index their fields.""" + try: + rules = await s1_client.get_star_rules() + except Exception as e: + raise HTTPException(502, f"S1 API error: {type(e).__name__}: {e}") + + # Replace all existing STAR rules cleanly to avoid duplicate key errors + db.query(ParsedRule).filter_by(rule_type="star").delete() + db.flush() + + loaded = [] + for rule in rules: + all_fields: set = set() + for qt in _star_query_texts(rule): + all_fields |= rule_parser.extract_star_fields(qt) + fields = list(all_fields) + record = ParsedRule( + rule_id=str(rule.get("id", "")), + name=rule.get("name", "unnamed"), + rule_type="star", + fields_used=fields, + raw=json.dumps(rule), + ) + db.add(record) + 
def _import_from_api_rules(db, rules: list) -> int:
    """
    Import platform rules fetched directly from the S1 API into the database.

    Each rule has a 'sources' list — the authoritative dataSource.name values.
    Existing library rules are replaced. The delete and the inserts run in a
    single transaction, so a failure part-way through rolls back and leaves
    the previous library intact instead of an empty one.

    Returns the number of rules stored (duplicates by id are skipped).
    Re-raises whatever the session or serialisation raised, after rollback.
    """
    try:
        db.query(ParsedRule).filter_by(rule_type="library").delete()

        loaded = 0
        seen_ids: set = set()
        for rule in rules:
            raw_id = rule.get("id")
            # A missing OR null id gets a positional fallback; str(None) would
            # otherwise make every id-less rule collide on "None".
            rule_id = f"lib_{loaded}" if raw_id in (None, "") else str(raw_id)
            if rule_id in seen_ids:
                continue
            seen_ids.add(rule_id)

            sources = rule.get("sources") or []
            db.add(ParsedRule(
                rule_id=rule_id,
                name=rule.get("name", "unnamed"),
                rule_type="library",
                fields_used=[],  # API rules don't expose field-level info
                raw=json.dumps({"data_sources": sources}),
            ))
            loaded += 1
            # Flush in batches to keep the pending-object list small.
            if loaded % 500 == 0:
                db.flush()

        db.commit()
    except Exception:
        db.rollback()
        raise
    return loaded
@router.post("/load-detections")
async def load_detections(db: Session = Depends(get_db)):
    """
    Reload detection library rules.

    Tries the live S1 API first (platform-rules endpoint); falls back to the
    local detections file at DETECTIONS_FILE.

    Returns ``{"loaded": <count>, "source": "api" | "file"}``.
    Raises HTTP 404 when the API path yields nothing and no file exists, and
    HTTP 500 when the file import fails.
    """
    api_error: Exception | None = None

    # Prefer the live API — gives accurate 'sources' and is always up to date
    try:
        rules = await s1_client.get_platform_rules()
        if rules:
            loaded = _import_from_api_rules(db, rules)
            return {"loaded": loaded, "source": "api"}
    except Exception as e:
        # Best-effort: fall through to the file import. Roll back first so a
        # failed API import cannot leave the session unusable for the
        # fallback, and keep the error so it can be reported.
        db.rollback()
        api_error = e

    # Fall back to local extracted.json
    if not os.path.exists(DETECTIONS_FILE):
        detail = (
            "S1 API unavailable and no detections file found — "
            "ensure the data/ volume is mounted with detections.json"
        )
        if api_error is not None:
            detail += f" (API error: {type(api_error).__name__}: {api_error})"
        raise HTTPException(404, detail)
    try:
        loaded = _import_detections(db, DETECTIONS_FILE)
    except Exception as e:
        db.rollback()
        raise HTTPException(500, f"Failed to import detections: {e}")
    return {"loaded": loaded, "source": "file"}
@router.post("/upload-parser")
async def upload_parser(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """
    Upload an SDL parser JSON file and index its output fields.

    Any fields previously stored under the same parser name are replaced.
    Returns ``{"parser": <name>, "fields": [...]}``.
    """
    # An upload may arrive without a filename; use the same fallback as
    # upload_sigma so rows are never stored under a null parser name.
    parser_name = file.filename or "unnamed"

    raw_bytes = await file.read()
    content_str = raw_bytes.decode("utf-8", errors="replace")

    # Try structured JSON extraction first, fall back to content-string extraction.
    # Structured extraction is best-effort: invalid JSON or an unexpected
    # document shape must not fail the upload.
    fields: set = set()
    try:
        parser_data = json.loads(content_str)
        fields = rule_parser.extract_parser_fields(parser_data)
    except (ValueError, TypeError, AttributeError, KeyError):
        fields = set()

    # Always also run content-string extraction (catches $field$ SDL format strings)
    fields |= rule_parser.extract_parser_fields_from_content(content_str)

    db.query(ParserField).filter_by(parser_name=parser_name).delete()
    for f in fields:
        db.add(ParserField(parser_name=parser_name, field_name=f, field_type="string"))

    db.commit()
    return {"parser": parser_name, "fields": list(fields)}
def _build_parser_ds_index(parsers_dir: str = "/app/parsers") -> dict[str, dict]:
    """
    Read all parser files from ``parsers_dir`` and build an index:
      dataSource.name (exact, from parser attributes) → {parser_name, format_type}

    Format type is "grok", "dottedJson", or "custom".
    Sources with grok/dottedJson parsers are flagged as needing a proper parser.

    Files are visited in filename order, so when several parsers declare the
    same dataSource.name the alphabetically last one wins on every run.
    Hidden files, unreadable files and files without a dataSource.name are
    skipped. A missing or unreadable directory yields an empty index.
    """
    import os
    import re

    ds_name_re = re.compile(r'"dataSource\.name"\s*:\s*"([^"]+)"')
    format_type_re = re.compile(r'"type"\s*:\s*"([^"]+)"')

    index: dict[str, dict] = {}
    try:
        with os.scandir(parsers_dir) as scan:
            entries = sorted(
                (e for e in scan if e.is_file() and not e.name.startswith(".")),
                key=lambda e: e.name,
            )
    except OSError:
        # Missing mount, not a directory, or no permission: nothing to index.
        return index

    for entry in entries:
        try:
            with open(entry.path, "r", encoding="utf-8", errors="replace") as fh:
                content = fh.read()
        except OSError:
            continue

        # Extract dataSource.name (may appear multiple times — take first)
        ds_match = ds_name_re.search(content)
        if not ds_match:
            continue
        ds_name = ds_match.group(1).strip()
        if not ds_name:
            continue

        # Determine format type — look for grok/dottedJson/custom in "type" values
        format_types = {m.group(1).lower() for m in format_type_re.finditer(content)}
        if "grok" in format_types:
            fmt = "grok"
        elif "dottedjson" in format_types:
            fmt = "dottedJson"
        else:
            fmt = "custom"

        index[ds_name] = {"parser_name": entry.name, "format_type": fmt}

    return index
+ """ + active_sources = db.query(ActiveSource).order_by(ActiveSource.event_count.desc()).all() + parser_fields_rows = db.query(ParserField).all() + rules = db.query(ParsedRule).all() + + # parser_name → set of field names (for field count display) + parser_index: dict[str, set] = {} + for pf in parser_fields_rows: + parser_index.setdefault(pf.parser_name, set()).add(pf.field_name) + + # Build dataSource.name → {parser_name, format_type} index from parser files + ds_index = _build_parser_ds_index() + + def _normalize(s: str) -> str: + return s.lower().replace(" ", "").replace("-", "").replace("_", "").replace(".", "") + + def _find_parser_info(source_name: str) -> dict | None: + """ + Match priority: + 1. Exact dataSource.name match + 2. Normalized substring: active source name ↔ parser dataSource.name + 3. Normalized substring: active source name ↔ parser filename + (catches cases where the parser file has a wrong dataSource.name) + """ + # 1. Exact match on dataSource.name + if source_name in ds_index: + return ds_index[source_name] + sn = _normalize(source_name) + # 2. Normalized ds_name substring + for ds_name, info in ds_index.items(): + if _normalize(ds_name) in sn or sn in _normalize(ds_name): + return info + # 3. 
Normalized filename substring + for info in ds_index.values(): + if _normalize(info["parser_name"]) in sn or sn in _normalize(info["parser_name"]): + return info + return None + + # Fields each rule needs: rule.name → set of field names + rule_fields_index: dict[str, set] = { + rule.name: set(rule.fields_used or []) for rule in rules + } + + # Build rule index: source_name → rules that reference it + rule_by_source: dict[str, list] = {} + for rule in rules: + try: + raw_data = json.loads(rule.raw) if rule.raw else {} + except Exception: + raw_data = {} + + if rule.rule_type == "library": + # Library rules store pre-extracted data_sources list in raw + data_sources = raw_data.get("data_sources", []) + else: + query_texts = _star_query_texts(raw_data) + data_sources = rule_parser.extract_data_sources(query_texts) + + for ds in data_sources: + rule_by_source.setdefault(ds, []).append({"rule": rule.name, "type": rule.rule_type}) + + # Fields to ignore when computing "missing" — these are metadata/schema fields + # always present in events regardless of the parser + _SCHEMA_FIELDS = { + "dataSource.name", "dataSource.vendor", "dataSource.category", + "event.type", "timestamp", "src.endpoint.ip", "src.endpoint.name", + # Endpoint agent fields — populated by the SentinelOne agent, not by SDL parsers + "cmdScript.content", "endpoint.os", "endpoint.name", "endpoint.uid", + } + + sources_out = [] + covered_count = 0 + needed_count = 0 + + for src in active_sources: + parser_info = _find_parser_info(src.source_name) + parser_in_data = (src.parser_detected or 0) > 0 + + if parser_info and parser_info["format_type"] == "custom": + status = "covered" + matched_parser = parser_info["parser_name"] + format_type = "custom" + elif parser_info and parser_info["format_type"] in ("grok", "dottedJson") and not parser_in_data: + # Known parser but primitive format and no evidence of parsing in data + status = "parser_needed" + matched_parser = parser_info["parser_name"] + format_type = 
parser_info["format_type"] + elif parser_in_data: + # Parsed fields detected in the data lake — a parser is running + status = "covered" + matched_parser = parser_info["parser_name"] if parser_info else "detected in data" + format_type = parser_info["format_type"] if parser_info else "unknown" + else: + status = "parser_needed" + matched_parser = None + format_type = None + + if status == "covered": + covered_count += 1 + else: + needed_count += 1 + + rules_for_src: list = [r for r in rule_by_source.get(src.source_name, []) if r["type"] == "library"] + + # Close-match suggestions — shown when there are no library rules for this source. + close_matches: list = [] + if not rules_for_src: + import re as _re + + def _word_tokens(s: str) -> set: + """Split on non-alphanumeric boundaries, lowercase, drop single chars.""" + return {t for t in _re.split(r"[^a-z0-9]+", s.lower()) if len(t) >= 2} + + def _is_close(a: str, b: str) -> bool: + na, nb = _normalize(a), _normalize(b) + # 1. Simple substring match + if na in nb or nb in na: + return True + # 2. Token-level: handles "Microsoft 365 Collaboration" vs "Microsoft O365" + # — "365" is inside "o365", and they share "microsoft" + ta, tb = _word_tokens(a), _word_tokens(b) + shared_exact = ta & tb + if not shared_exact: + return False # Must share at least one word exactly + # Check that a DISTINCTIVE (non-shared) token from one name + # appears as a substring inside a token from the other. + # This avoids matching "Azure AD" to "Azure Platform" on "azure" alone. 
+ unique_a = ta - shared_exact + unique_b = tb - shared_exact + return any( + ua in ub or ub in ua + for ua in unique_a for ub in unique_b + if len(ua) >= 2 and len(ub) >= 2 + ) + + sn = _normalize(src.source_name) + for lib_ds, lib_rules in rule_by_source.items(): + lib_only = [r for r in lib_rules if r["type"] == "library"] + if not lib_only: + continue + if _is_close(src.source_name, lib_ds): + close_matches.append({ + "library_name": lib_ds, + "rule_count": len(lib_only), + }) + close_matches.sort(key=lambda x: x["rule_count"], reverse=True) + close_matches = close_matches[:3] + + # Count how many rules reference each field (frequency) + field_freq: dict[str, int] = {} + for r in rules_for_src: + for f in rule_fields_index.get(r["rule"], set()): + field_freq[f] = field_freq.get(f, 0) + 1 + + # Fields the parser provides + parser_provides = parser_index.get(matched_parser, set()) if matched_parser and matched_parser != "detected in data" else set() + + # Minimum number of rules that must reference a field before we flag it. + # Scales with rule count so single-rule oddities don't dominate. + rule_count = len(rules_for_src) + min_rules = max(2, round(rule_count * 0.05)) if rule_count >= 10 else 2 + + # Missing = dotted-path fields needed by >= min_rules rules, + # not in schema constants, not provided by the parser. + missing_fields = sorted( + f for f, count in field_freq.items() + if count >= min_rules + and "." 
def _date_range(days: int) -> tuple[str, str]:
    """Return ``(from, to)`` UTC timestamps covering the last ``days`` days.

    Both values are formatted as ``YYYY-MM-DDTHH:MM:SS.000Z`` and derived from
    a single "now", so the window is exactly ``days`` days long.
    """
    # Local import: the module only imports datetime and timedelta.
    from datetime import timezone

    # datetime.utcnow() is deprecated; an aware UTC "now" formats identically.
    now = datetime.now(timezone.utc)
    stamp = "%Y-%m-%dT%H:%M:%S.000Z"
    return (
        (now - timedelta(days=days)).strftime(stamp),
        now.strftime(stamp),
    )
@router.get("/by-event-type")
async def get_by_event_type(days: int = Query(7, ge=1, le=90)):
    """Return event counts per (dataSource.name, event.type) pair.

    Runs one PowerQuery over the trailing ``days`` days. A failure of the
    query call is reported to the client as HTTP 502.
    """
    window_start, window_end = _date_range(days)
    try:
        response = await s1_client.run_powerquery(
            "| group events=count() by dataSource.name, event.type | sort -events | limit 100",
            window_start,
            window_end,
        )
    except Exception as e:
        raise HTTPException(502, f"PowerQuery error: {e}")
    rows = response.get("events", [])
    return {"period_days": days, "data": rows}
@router.post("/simulate-filter")
async def simulate_filter(rule: FilterRule):
    """Estimate how many events and GB would be eliminated by an exclusion filter.

    Counts the events matching the optional ``source`` / ``event_type``
    filter over the last ``rule.days`` days and projects that to a 30-day
    month using ``rule.gb_per_million_events``.

    Raises HTTP 422 when ``days`` is below 1 and HTTP 502 when the
    PowerQuery call or the result extraction fails.
    """
    # days is a divisor in the monthly projection and a look-back window, so
    # zero or negative values are meaningless.
    if rule.days < 1:
        raise HTTPException(422, "days must be at least 1")

    def _quoted(value: str) -> str:
        """Single-quote *value*, backslash-escaping quotes and backslashes."""
        # NOTE(review): assumes PowerQuery honours backslash escapes inside
        # quoted strings — confirm against the query language reference.
        escaped = value.replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    from_dt, to_dt = _date_range(rule.days)

    clauses = []
    if rule.source:
        clauses.append(f"dataSource.name=={_quoted(rule.source)}")
    if rule.event_type:
        clauses.append(f"event.type=={_quoted(rule.event_type)}")

    if clauses:
        filter_expr = " and ".join(clauses)
        query = f"| filter {filter_expr} | group events=count()"
    else:
        query = "| group events=count()"

    try:
        result = await s1_client.run_powerquery(query, from_dt, to_dt)
        rows = result.get("events")
        first_row = rows[0] if isinstance(rows, list) and rows else {}
        # A null count is treated as zero so the arithmetic below cannot fail.
        events = first_row.get("events", 0) or 0
    except Exception as e:
        raise HTTPException(502, f"PowerQuery error: {e}")

    estimated_gb = round(events / 1_000_000 * rule.gb_per_million_events, 3)
    monthly_events = int(events / rule.days * 30)
    monthly_gb = round(monthly_events / 1_000_000 * rule.gb_per_million_events, 2)

    return {
        "period_days": rule.days,
        "matched_events": events,
        "estimated_gb_period": estimated_gb,
        "projected_monthly_events": monthly_events,
        "projected_monthly_gb": monthly_gb,
        "filter": {"source": rule.source, "event_type": rule.event_type},
    }
e in os.scandir(PARSERS_DIR) + if e.is_file() and not e.name.startswith(".") + ) + except FileNotFoundError: + names = [] + return {"parsers": names, "count": len(names)} + + +@router.post("/sync-from-sdl") +async def sync_parsers_from_sdl(): + """Download every parser file under /logParsers/ on the SDL tenant into + /app/parsers/. After this call returns, the Parser Test Runner dropdown + will include all tenant parsers (including custom ones). + + Requires SDL_CONFIG_READ_KEY in .env (Configuration Read scope on the + Data Lake API key). + """ + if not s1_client.SDL_CONFIG_READ_KEY: + raise HTTPException( + 400, + "SDL_CONFIG_READ_KEY is not set in .env. Generate a Data Lake API key " + "with 'Configuration Read' scope in the S1 console and add it to .env." + ) + + try: + names = await s1_client.list_sdl_parsers() + except Exception as e: + raise HTTPException(502, f"SDL listFiles failed: {e}") + + os.makedirs(PARSERS_DIR, exist_ok=True) + downloaded: list[str] = [] + errors: list[dict] = [] + + for name in names: + # The path on SDL is /logParsers/; we write to /app/parsers/. 
+ safe_name = name.replace("/", "_") + try: + resp = await s1_client.get_sdl_parser(name) + content = resp.get("content") + if content is None: + errors.append({"parser": name, "error": "no content field in response"}) + continue + with open(os.path.join(PARSERS_DIR, safe_name), "w", encoding="utf-8") as fh: + fh.write(content) + downloaded.append(safe_name) + except Exception as e: + errors.append({"parser": name, "error": str(e) or e.__class__.__name__}) + + return { + "downloaded": len(downloaded), + "parsers": downloaded, + "errors": errors, + "directory": PARSERS_DIR, + } + + +def _date_range_hours(hours: int) -> tuple[str, str]: + now = datetime.utcnow() + return ( + (now - timedelta(hours=hours)).strftime("%Y-%m-%dT%H:%M:%S.000Z"), + now.strftime("%Y-%m-%dT%H:%M:%S.000Z"), + ) + + +# --------------------------------------------------------------------------- +# Models +# --------------------------------------------------------------------------- + +class SampleEventsRequest(BaseModel): + source: str + limit: int = 20 + hours: int = 1 + + +class FieldPopulationRequest(BaseModel): + source: str + hours: int = 24 + fields: list[str] = [ + "src.ip", + "src.port", + "dst.ip", + "dst.port", + "user.name", + "event.type", + "src.process.name", + "src.process.cmdline", + "tgt.file.path", + "network.direction", + "dataSource.name", + ] + + +class TestParserRequest(BaseModel): + parser_name: str + log_line: str + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _flatten_dict(d: dict, prefix: str = "", out: dict | None = None) -> dict: + """Recursively flatten a nested dict into dotted keys.""" + if out is None: + out = {} + if not isinstance(d, dict): + return out + for k, v in d.items(): + key = f"{prefix}.{k}" if prefix else k + if isinstance(v, dict): + _flatten_dict(v, key, out) + else: + out[key] = v + return out + + +def 
def _extract_format_strings(content: str) -> list[str]:
    """
    Extract SDL format string values from augmented-JSON parser content.

    Matches ``"format": "..."`` as well as the unquoted-key form
    ``format: "..."``; the value must be double-quoted and may contain
    escaped quotes.

    The returned strings are decoded: string escapes such as ``\\"`` and
    ``\\\\`` are turned back into the characters they stand for, so callers
    receive the format text rather than its quoted source form. Values with
    escapes that are not valid JSON fall back to unescaping only quotes,
    backslashes and slashes.
    """
    # Local import: this module does not import json at the top level.
    import json

    # The look-behind stops keys that merely end in "format" (e.g.
    # "logformat", "x.format") from matching.
    pattern = re.compile(r'(?<![\w.$])"?format"?\s*:\s*"((?:[^"\\]|\\.)*)"')

    def _decode(raw: str) -> str:
        try:
            # strict=False tolerates literal control characters in the value.
            decoded = json.loads(f'"{raw}"', strict=False)
        except ValueError:
            decoded = None
        if isinstance(decoded, str):
            return decoded
        return re.sub(r'\\(["\\/])', r"\1", raw)

    return [_decode(raw) for raw in pattern.findall(content)]
@router.post("/sample-events")
async def sample_events(req: SampleEventsRequest):
    """Return a sample of raw events from a given data source.

    Each row is passed through ``_flatten_event`` before being returned.

    Raises HTTP 422 when ``limit`` or ``hours`` is below 1 and HTTP 502 when
    the PowerQuery call fails.
    """
    if req.limit < 1 or req.hours < 1:
        raise HTTPException(status_code=422, detail="limit and hours must both be at least 1")

    # The source name is embedded in a double-quoted literal; escape it so a
    # quote or backslash in the name cannot break out of the string.
    # NOTE(review): assumes PowerQuery honours backslash escapes inside quoted
    # strings — confirm against the query language reference.
    source_literal = req.source.replace("\\", "\\\\").replace('"', '\\"')
    query = f'| filter dataSource.name = "{source_literal}" | limit {req.limit}'
    from_dt, to_dt = _date_range_hours(req.hours)

    try:
        result = await s1_client.run_powerquery(query, from_dt, to_dt)
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"PowerQuery error: {e}")

    # The client may hand back a bare list or a dict carrying rows/events.
    if isinstance(result, list):
        rows = result
    else:
        rows = result.get("rows") or result.get("events") or []
    events = [_flatten_event(row) for row in rows]

    return {
        "source": req.source,
        "events": events,
        "count": len(events),
        "hours": req.hours,
    }
@router.post("/test-parser")
async def test_parser(req: TestParserRequest):
    """
    Test a parser against a raw log line by extracting and matching SDL format
    strings found in the parser file.

    Two modes:
      * JSON mode — used when a format string contains ``parse=json`` or the
        input starts with ``{``. The first JSON object is flattened to dotted
        keys and simple ``{input, output, match, replace}`` rewrite blocks
        from the parser file are applied on a best-effort basis.
      * Regex mode — each format string is converted to a regex and the first
        one that matches the log line wins.

    Raises HTTP 400 for a parser name that is not a plain filename, HTTP 404
    when the file does not exist and HTTP 500 when it cannot be read.
    """
    # parser_name comes from the client and is joined onto the parsers
    # directory: accept a bare filename only, so "../" or an absolute path
    # cannot read files outside that directory.
    name = req.parser_name
    if (
        not name
        or name in (".", "..")
        or "\x00" in name
        or "/" in name
        or "\\" in name
    ):
        raise HTTPException(status_code=400, detail="Invalid parser name")
    parser_path = os.path.join(PARSERS_DIR, name)

    try:
        # errors="replace" matches the other parser readers in this project
        # and keeps a stray non-UTF-8 byte from raising an unhandled error.
        with open(parser_path, "r", encoding="utf-8", errors="replace") as fh:
            content = fh.read()
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail=f"Parser file not found: {req.parser_name}")
    except OSError as exc:
        raise HTTPException(status_code=500, detail=f"Could not read parser file: {exc}")

    format_strings = _extract_format_strings(content)

    # ── JSON auto-extract path ──────────────────────────────────────────────
    # SDL parsers that use `$=json{parse=json}$` (or any format containing
    # `parse=json`) auto-extract every top-level JSON key as an attribute.
    # The regex-based path can't model that — handle it explicitly so users
    # can test JSON-shaped logs against JSON-mode parsers.
    log_input = req.log_line.strip()
    is_json_mode = any("parse=json" in f for f in format_strings) or log_input.startswith("{")
    if is_json_mode:
        import json as _json

        lines = [ln for ln in (raw.strip() for raw in log_input.splitlines()) if ln]
        payloads: list[dict] = []
        parse_errors: list[str] = []

        # Try the whole input as one document first. This covers the
        # single-line case and also a pretty-printed object or array spread
        # over several lines, which line-by-line parsing would reject.
        whole_error: Exception | None = None
        try:
            obj = _json.loads(log_input)
        except (ValueError, RecursionError) as e:
            obj = None
            whole_error = e

        if whole_error is None:
            if isinstance(obj, list):
                payloads = [x for x in obj if isinstance(x, dict)]
            elif isinstance(obj, dict):
                payloads = [obj]
            else:
                return {
                    "parser_name": req.parser_name,
                    "matched": False,
                    "message": "Parser expects a JSON object (got scalar).",
                    "fields": [],
                }
        elif len(lines) == 1:
            return {
                "parser_name": req.parser_name,
                "matched": False,
                "message": f"Parser expects JSON but log line could not be parsed as JSON: {whole_error}",
                "fields": [],
            }
        else:
            # Multi-line: one JSON object per line (NDJSON)
            for i, ln in enumerate(lines, 1):
                try:
                    obj = _json.loads(ln)
                except (ValueError, RecursionError) as e:
                    parse_errors.append(f"line {i}: {e}")
                    continue
                if isinstance(obj, dict):
                    payloads.append(obj)
                else:
                    parse_errors.append(f"line {i}: not a JSON object")

        if not payloads:
            return {
                "parser_name": req.parser_name,
                "matched": False,
                "message": "No valid JSON objects found. " + " | ".join(parse_errors[:3]),
                "fields": [],
            }

        # Use the first payload for the detail table; report totals.
        payload = payloads[0]
        extracted = _flatten_dict(payload)

        # Apply lightweight rewrites if present (input/output/match/replace blocks).
        # We only handle simple literal/regex matches with $0 or string replacements;
        # this is best-effort, intended for quick visual verification.
        rewrite_re = re.compile(
            r'\{\s*input:\s*"([^"]+)"\s*,\s*output:\s*"([^"]+)"\s*,\s*match:\s*"((?:[^"\\]|\\.)*)"\s*,\s*replace:\s*"((?:[^"\\]|\\.)*)"\s*\}',
            re.DOTALL,
        )

        def _to_py_backref(s: str) -> str:
            # SDL uses $0 for whole match, $1.. for groups. Translate to Python
            # \g<0>, \g<1>, ... so re.sub doesn't read \0 as a null byte.
            return re.sub(r"\$(\d+)", lambda mm: f"\\g<{mm.group(1)}>", s)

        rewrites_applied = []
        derived: dict[str, str] = {}
        for m in rewrite_re.finditer(content):
            in_field, out_field, match_pat, replace_val = m.group(1), m.group(2), m.group(3), m.group(4)
            src_val = extracted.get(in_field)
            if src_val is None:
                continue
            try:
                hit = re.search(match_pat, str(src_val))
            except re.error:
                continue
            if not hit:
                continue
            try:
                val = re.sub(match_pat, _to_py_backref(replace_val), str(src_val), count=1)
            except re.error:
                # Unusable replacement template: show the raw replacement text.
                val = replace_val
            derived[out_field] = val
            rewrites_applied.append({
                "input": in_field, "input_value": src_val,
                "output": out_field, "matched_on": match_pat, "result": val,
            })

        fields = (
            [{"field": k, "value": v, "source": "json-extract"} for k, v in sorted(extracted.items())]
            + [{"field": k, "value": v, "source": "rewrite"} for k, v in sorted(derived.items())]
        )
        return {
            "parser_name": req.parser_name,
            "matched": True,
            "mode": "json",
            "format_matched": "$=json{parse=json}$",
            "fields": fields,
            "rewrites_applied": rewrites_applied,
            "extracted_count": len(extracted),
            "derived_count": len(derived),
            "payload_count": len(payloads),
            "parse_errors": parse_errors,
            "showing_payload": 1,
        }

    # ── Regex format-string path (original) ─────────────────────────────────
    for fmt in format_strings:
        try:
            compiled, py_to_sdl = _sdl_format_to_regex(fmt)
        except re.error:
            # Skip unparseable format strings
            continue

        match = compiled.search(req.log_line)
        if match:
            fields = [
                {"field": py_to_sdl.get(group, group), "value": value}
                for group, value in match.groupdict().items()
                if value is not None
            ]
            return {
                "parser_name": req.parser_name,
                "matched": True,
                "mode": "regex",
                "format_matched": fmt,
                "fields": fields,
            }

    return {
        "parser_name": req.parser_name,
        "matched": False,
        "message": "No format pattern matched",
        "fields": [],
    }
router = APIRouter()

ENV_FILE = Path(os.environ.get("ENV_FILE_PATH", "/app/.env"))

# Fields we expose in the UI — order matters for display
FIELDS = [
    {"key": "S1_BASE_URL", "label": "Console URL", "secret": False, "placeholder": "https://demo.sentinelone.net"},
    {"key": "S1_API_TOKEN", "label": "Console API Token", "secret": True, "placeholder": "eyJ..."},
    {"key": "SDL_XDR_URL", "label": "SDL XDR URL", "secret": False, "placeholder": "https://xdr.us1.sentinelone.net"},
    {"key": "SDL_LOG_READ_KEY", "label": "SDL Log Read Key", "secret": True, "placeholder": "1DnK0Y4e..."},
    # Documented in .env.example and read by the SDL parser sync; without this
    # entry save_config() rejects the key as unknown.
    {"key": "SDL_CONFIG_READ_KEY", "label": "SDL Config Read Key", "secret": True, "placeholder": "1DnK0Y4e..."},
    {"key": "ANTHROPIC_API_KEY", "label": "Anthropic API Key", "secret": True, "placeholder": "sk-ant-..."},
]

FIELD_KEYS = {f["key"] for f in FIELDS}

# Shown instead of a partially revealed value when a secret is too short to
# reveal 10 characters of it safely.
_FULL_MASK = "••••••••"
_PARTIAL_MASK_MIN_LEN = 20


def _is_assignment(stripped_line: str) -> bool:
    """True for a non-empty, non-comment .env line that contains ``=``."""
    return bool(stripped_line) and not stripped_line.startswith("#") and "=" in stripped_line


def _read_env() -> dict[str, str]:
    """Read .env file into a dict (``KEY=VALUE`` lines; comments and blanks ignored)."""
    vals: dict[str, str] = {}
    if ENV_FILE.exists():
        for line in ENV_FILE.read_text(encoding="utf-8").splitlines():
            line = line.strip()
            if _is_assignment(line):
                k, _, v = line.partition("=")
                vals[k.strip()] = v.strip()
    return vals


def _write_env(updates: dict[str, str]) -> None:
    """Write updates into .env, preserving comments and unknown keys.

    Existing ``KEY=...`` lines are rewritten in place; keys not yet present are
    appended at the end.

    Raises:
        ValueError: a value contains a line break (it would inject extra
            ``KEY=VALUE`` lines into the file).
        OSError: the file cannot be read or written.
    """
    for key, value in updates.items():
        if "\n" in value or "\r" in value:
            raise ValueError(f"Value for {key} must be a single line")

    existing_lines: list[str] = []
    if ENV_FILE.exists():
        existing_lines = ENV_FILE.read_text(encoding="utf-8").splitlines()

    written: set[str] = set()
    new_lines: list[str] = []

    for line in existing_lines:
        stripped = line.strip()
        if _is_assignment(stripped):
            k = stripped.partition("=")[0].strip()
            if k in updates:
                new_lines.append(f"{k}={updates[k]}")
                written.add(k)
                continue
        new_lines.append(line)

    # Append any new keys not already in the file
    new_lines.extend(f"{k}={v}" for k, v in updates.items() if k not in written)

    # Written in place (no temp-file rename): docker-compose bind-mounts .env as a
    # single file, and a rename would replace the mount target rather than update it.
    ENV_FILE.write_text("\n".join(new_lines) + "\n", encoding="utf-8")


def _mask_secret(raw: str) -> str:
    """Mask a secret for display: first 6 + last 4 characters, middle replaced by bullets.

    Short secrets are masked entirely, so the 10 revealed characters never make
    up most of the value.
    """
    if len(raw) < _PARTIAL_MASK_MIN_LEN:
        return _FULL_MASK
    return raw[:6] + "•" * (len(raw) - 10) + raw[-4:]


@router.get("/config")
async def get_config():
    """Return current config values. Secrets are masked."""
    env_vals = _read_env()
    result = []
    for f in FIELDS:
        key = f["key"]
        # Prefer live env var, fall back to .env file value
        raw = os.environ.get(key, env_vals.get(key, ""))
        result.append({
            "key": key,
            "label": f["label"],
            "secret": f["secret"],
            "placeholder": f["placeholder"],
            "value": _mask_secret(raw) if f["secret"] and raw else raw,
            "set": bool(raw),
        })
    return {"fields": result, "env_file_exists": ENV_FILE.exists(), "env_file_path": str(ENV_FILE)}


class ConfigUpdate(BaseModel):
    # Mapping of env key -> new value; keys must be listed in FIELDS.
    updates: dict[str, str]


@router.post("/config")
async def save_config(body: ConfigUpdate):
    """Save config values to .env file. Only known keys accepted.

    Raises:
        HTTPException 400: unknown key, or a value containing a line break.
        HTTPException 503: the directory holding the .env file does not exist.
        HTTPException 500: the file could not be written.
    """
    bad = [k for k in body.updates if k not in FIELD_KEYS]
    if bad:
        raise HTTPException(400, f"Unknown keys: {bad}")
    multiline = [k for k, v in body.updates.items() if "\n" in v or "\r" in v]
    if multiline:
        raise HTTPException(400, f"Values must be single-line: {multiline}")
    if not ENV_FILE.parent.exists():
        raise HTTPException(503, f"Cannot write to {ENV_FILE} — check Docker volume mount")
    try:
        _write_env(body.updates)
    except (OSError, ValueError) as e:
        raise HTTPException(500, f"Failed to write .env: {e}") from e
    return {"saved": list(body.updates.keys()), "restart_required": True}
# Matches `dataSource.name = 'x'`, `dataSource.name == "x"` and
# `dataSource.name in ('x', "y")`. Group 1 is either one quoted value or the
# whole parenthesised list (split further in extract_data_sources).
_DS_PATTERN = re.compile(
    r"dataSource\.name\s*(?:==?|\bin\b)\s*(\([^)]*\)|'[^']*'|\"[^\"]*\")",
    re.IGNORECASE,
)
# One quoted item inside an `in (...)` list.
_DS_QUOTED_VALUE = re.compile(r"'([^']*)'|\"([^\"]*)\"")


# STAR PowerQuery operators that follow a field name
_STAR_OPS = [
    "ContainsCIS", "NotContainsCIS", "Contains", "NotContains",
    "StartsWith", "EndsWith", "In", "NotIn",
    "IsEmpty", "IsNotEmpty", "Matches", "NotMatches",
    "GreaterOrEqual", "LessOrEqual", "GreaterThan", "LessThan",
    "Between", "=", "!=",
]
_STAR_KEYWORD = {"and", "or", "not", "true", "false", "null"}

_WORD_OPS = [op for op in _STAR_OPS if op[0].isalpha()]
_SYMBOL_OPS = sorted((op for op in _STAR_OPS if not op[0].isalpha()), key=len, reverse=True)

# Group 1: field followed by a word operator. Whitespace is REQUIRED between
#          them, otherwise any word ending in "in"/"contains"/... (e.g. "login")
#          would be split into a bogus field plus operator.
# Group 2: field followed by a symbolic operator (`=`/`!=`), with or without
#          spaces (the no-space form is used in subQuery strings).
_OP_PATTERN = re.compile(
    r"([\w.]+)\s+(?:" + "|".join(re.escape(op) for op in _WORD_OPS) + r")\b"
    + r"|([\w.]+)\s*(?:" + "|".join(re.escape(op) for op in _SYMBOL_OPS) + r")",
    re.IGNORECASE,
)


def extract_star_fields(query: str) -> Set[str]:
    """Extract field names referenced in a STAR PowerQuery/subQuery string.

    A field is any `[\\w.]+` token directly followed by one of ``_STAR_OPS``.
    Boolean keywords and tokens starting with a digit are dropped. The scan is
    purely textual, so tokens inside string literals are not excluded.
    """
    fields: Set[str] = set()
    for match in _OP_PATTERN.finditer(query):
        field = match.group(1) or match.group(2)
        if field and field.lower() not in _STAR_KEYWORD and not field[0].isdigit():
            fields.add(field)
    return fields


# Top-level keys of a Sigma `detection` block that are not search identifiers.
_SIGMA_NON_SELECTION_KEYS = {"condition", "timeframe"}


def _sigma_detection_fields(detection) -> Set[str]:
    """Collect field names used inside the selections of a Sigma `detection` block.

    The top-level keys of *detection* are selection names (``selection``,
    ``filter``, ``keywords`` ...), not event fields, so only the mappings
    beneath them are walked. Pipe modifiers are stripped
    (``CommandLine|contains`` -> ``CommandLine``).
    """
    fields: Set[str] = set()

    def _walk(node):
        if isinstance(node, dict):
            for key, val in node.items():
                if isinstance(key, str):
                    clean = key.split("|", 1)[0].strip()
                    if clean:
                        fields.add(clean)
                _walk(val)
        elif isinstance(node, list):
            for item in node:
                _walk(item)

    if isinstance(detection, dict):
        for name, selection in detection.items():
            if name not in _SIGMA_NON_SELECTION_KEYS:
                _walk(selection)
    return fields


def extract_sigma_fields(sigma_content: str) -> Set[str]:
    """Extract field names from a Sigma rule YAML.

    Returns an empty set when the content cannot be parsed or is not a mapping.
    """
    try:
        rule = yaml.safe_load(sigma_content)
    except Exception:
        # Best-effort on user-uploaded files: anything unparseable yields no fields.
        return set()
    if not isinstance(rule, dict):
        return set()
    return _sigma_detection_fields(rule.get("detection"))


def extract_data_sources(texts: List[str]) -> List[str]:
    """Extract dataSource.name values from a list of query strings.

    Handles single comparisons and ``in (...)`` lists; every list member is
    returned. The result is de-duplicated and sorted.
    """
    sources: Set[str] = set()
    for text in texts:
        for match in _DS_PATTERN.finditer(text):
            operand = match.group(1)
            if operand.startswith("("):
                inner = operand[1:-1]
                quoted = [a or b for a, b in _DS_QUOTED_VALUE.findall(inner)]
                # Unquoted lists such as `(foo, bar)` fall back to a comma split.
                candidates = quoted or inner.split(",")
            else:
                candidates = [operand[1:-1]]  # drop the surrounding quotes
            sources.update(name for name in (c.strip() for c in candidates) if name)
    return sorted(sources)


_SDL_FIELD_PAT = re.compile(r'\$([a-zA-Z][a-zA-Z0-9._]*)(?:=[^$]*)?\$')
_SDL_ATTR_KEY_PAT = re.compile(r'"([a-zA-Z][a-zA-Z0-9._]+)"\s*:')
# Matches both quoted and unquoted output/to keys in rewrites:
#   output: "user.name"   OR   "output": "user.name"
#   "to": "src_endpoint.ip"
# NOTE(review): a quoted "replace" key is matched as well — confirm that its
# value is meant to be treated as an output field.
_SDL_REWRITE_OUT_PAT = re.compile(
    r'(?:"output"|output|"to"|"replace")\s*:\s*"([a-zA-Z][a-zA-Z0-9._]+)"'
)

# Single-word SDL builtins that appear as quoted keys but are not fields.
_CONTENT_SKIP_KEYS = frozenset({
    "id", "format", "halt", "input", "output", "match", "replace",
    "timezone", "attribute", "attributes", "patterns", "formats",
    "rewrites", "type", "version",
})


def extract_parser_fields_from_content(content: str) -> Set[str]:
    """
    Extract output field names from SDL augmented-JSON parser content string.
    Handles:
      - $field.name$ and $field.name=pattern$ from format strings
      - "output": "field.name" and output: "field.name" from rewrites
      - quoted attribute keys from attributes{} blocks
    """
    fields: Set[str] = set()

    # Fields from format strings: $field.name$ or $field.name=pattern_var$
    for match in _SDL_FIELD_PAT.finditer(content):
        field = match.group(1)
        # Heuristic: skip pattern variable names (no dot, short, all lowercase)
        if "." in field or field[0].isupper() or len(field) > 6:
            fields.add(field)

    # Rewrite output targets: only dotted names count as fields. (The pattern
    # requires a leading letter, so "$0"/numeric values never reach this point.)
    for match in _SDL_REWRITE_OUT_PAT.finditer(content):
        val = match.group(1)
        if "." in val:
            fields.add(val)

    # Quoted attribute keys (skip single-word SDL builtins)
    for match in _SDL_ATTR_KEY_PAT.finditer(content):
        key = match.group(1)
        if key not in _CONTENT_SKIP_KEYS and ("." in key or len(key) > 8):
            fields.add(key)

    return fields


_SKIP_FIELD_NAMES = {
    "id", "format", "halt", "input", "output", "match", "replace",
    "timezone", "attribute", "attributes", "patterns", "formats",
    "rewrites", "type", "version", "source", "dataset", "predicate",
    "transformations", "mappings", "observables", "fields", "constant",
    "copy", "from", "to", "value", "field", "name",
}


def _extract_rewrite_fields(rewrites) -> Set[str]:
    """Extract dotted 'output' (or 'to') field names from a rewrites list."""
    fields: Set[str] = set()
    if not isinstance(rewrites, list):
        return fields
    for rw in rewrites:
        if not isinstance(rw, dict):
            continue
        # Standard SDL rewrite: {"input": "...", "output": "field.name"}
        out = rw.get("output") or rw.get("to")
        if out and isinstance(out, str) and "." in out and out not in _SKIP_FIELD_NAMES:
            fields.add(out)
    return fields


def _walk_mappings(node) -> Set[str]:
    """Recursively extract copy.to and constant.field from SDL mappings blocks."""
    fields: Set[str] = set()
    if isinstance(node, dict):
        # transformations copy: {"copy": {"from": "...", "to": "field.name"}}
        copy_spec = node.get("copy")
        if isinstance(copy_spec, dict):
            to = copy_spec.get("to")
            if to and isinstance(to, str) and "." in to:
                fields.add(to)
        # transformations constant: {"constant": {"value": ..., "field": "field.name"}}
        const_spec = node.get("constant")
        if isinstance(const_spec, dict):
            target = const_spec.get("field")
            if target and isinstance(target, str) and "." in target:
                fields.add(target)
        for child in node.values():
            fields |= _walk_mappings(child)
    elif isinstance(node, list):
        for item in node:
            fields |= _walk_mappings(item)
    return fields


def _as_list(value) -> list:
    """Return *value* if it is a list, otherwise an empty list (tolerates null/typos)."""
    return value if isinstance(value, list) else []


def extract_parser_fields(parser_json: dict) -> Set[str]:
    """
    Extract output field names from an SDL parser JSON dict.
    Handles: attributes lists, fields lists, mappings targets,
    rewrites[].output, rewrites[].to, copy.to, constant.field.

    Sections that are missing, null or of an unexpected type are ignored
    instead of raising.
    """
    fields: Set[str] = set()
    if not isinstance(parser_json, dict):
        return fields

    # Legacy: attributes as list of {name: ...}
    for attr in _as_list(parser_json.get("attributes")):
        if isinstance(attr, dict) and isinstance(attr.get("name"), str):
            fields.add(attr["name"])

    # Legacy: fields list (plain strings or {name: ...})
    for field in _as_list(parser_json.get("fields")):
        if isinstance(field, str):
            fields.add(field)
        elif isinstance(field, dict) and isinstance(field.get("name"), str):
            fields.add(field["name"])

    mappings = parser_json.get("mappings")
    if isinstance(mappings, dict):
        # SDL mappings block (nested transformations with copy.to / constant.field)
        fields |= _walk_mappings(mappings)
    else:
        # Legacy: flat mappings list with "target"
        for mapping in _as_list(mappings):
            if isinstance(mapping, dict) and isinstance(mapping.get("target"), str):
                fields.add(mapping["target"])

    # SDL rewrites[].output in top-level formats[]
    for fmt in _as_list(parser_json.get("formats")):
        if isinstance(fmt, dict):
            fields |= _extract_rewrite_fields(fmt.get("rewrites", []))

    # observables.fields[].name — only dotted names are treated as fields
    observables = parser_json.get("observables")
    if isinstance(observables, dict):
        for obs in _as_list(observables.get("fields")):
            if isinstance(obs, dict):
                name = obs.get("name")
                if isinstance(name, str) and "." in name:
                    fields.add(name)

    return fields
in n: + fields.add(n) + + return fields diff --git a/backend/services/s1_client.py b/backend/services/s1_client.py new file mode 100644 index 0000000..ac0e5e7 --- /dev/null +++ b/backend/services/s1_client.py @@ -0,0 +1,344 @@ +import os +import asyncio +import httpx +from datetime import datetime, timezone + +BASE_URL = os.environ.get("S1_BASE_URL", "https://demo.sentinelone.net").rstrip("/") +TOKEN = os.environ.get("S1_API_TOKEN", "") + +# Scalyr/XDR PowerQuery credentials — from SDL_XDR_URL + SDL_LOG_READ_KEY +# in the SentinelOne console: Settings → Integrations → Data Lake API Keys +SDL_XDR_URL = os.environ.get("SDL_XDR_URL", "https://xdr.us1.sentinelone.net").rstrip("/") +SDL_LOG_READ_KEY = os.environ.get("SDL_LOG_READ_KEY", "") + +# SDL Configuration Read Key — used to list/fetch parser files under /logParsers/ +# (separate from SDL_LOG_READ_KEY which is for querying events only). +# Find it in the S1 console: Settings → Integrations → Data Lake API Keys → Configuration Read. +SDL_CONFIG_READ_KEY = os.environ.get("SDL_CONFIG_READ_KEY", "") + +# Management Console API uses ApiToken auth +HEADERS = { + "Authorization": f"ApiToken {TOKEN}", + "Content-Type": "application/json", +} + + +def _iso_to_epoch_ms(iso_str: str) -> int: + """Convert ISO-8601 UTC string to epoch milliseconds for Scalyr API.""" + dt = datetime.fromisoformat(iso_str.replace("Z", "+00:00")) + return int(dt.timestamp() * 1000) + + +async def get_star_rules(page_size: int = 100) -> list: + """Fetch custom STAR rules from /cloud-detection/rules, paginating via cursor.""" + all_rules = [] + cursor = None + async with httpx.AsyncClient(timeout=30) as client: + while True: + params = {"limit": page_size} + if cursor: + params["cursor"] = cursor + resp = await client.get( + f"{BASE_URL}/web/api/v2.1/cloud-detection/rules", + headers=HEADERS, + params=params, + ) + resp.raise_for_status() + body = resp.json() + all_rules.extend(body.get("data", [])) + cursor = body.get("pagination", 
{}).get("nextCursor") + if not cursor: + break + return all_rules + + +async def get_library_rules(page_size: int = 100) -> list: + """ + Fetch Detection Library (OOTB/Platform) rules from /web/api/v2.1/detection-library/rules. + Requires an account-level or higher API token — site-scoped tokens will receive a 400. + Returns an empty list gracefully if the token lacks sufficient scope. + """ + all_rules = [] + cursor = None + async with httpx.AsyncClient(timeout=60) as client: + while True: + params: dict = {"limit": page_size} + if cursor: + params["cursor"] = cursor + resp = await client.get( + f"{BASE_URL}/web/api/v2.1/detection-library/rules", + headers=HEADERS, + params=params, + ) + # 400 typically means site-scoped token — return empty rather than crash + if resp.status_code == 400: + return [] + resp.raise_for_status() + body = resp.json() + batch = body.get("data", []) + all_rules.extend(batch) + cursor = body.get("pagination", {}).get("nextCursor") + if not cursor: + break + + results = [] + for rule in all_rules: + results.append({ + "id": str(rule.get("id", "")), + "name": rule.get("name", "unnamed"), + "s1ql": rule.get("s1ql") or rule.get("query", ""), + "queryType": rule.get("queryType", "events"), + "severity": rule.get("severity", ""), + "description": rule.get("description", ""), + "gdlRuleId": rule.get("id", ""), + "creator": "SentinelOne", + "expirationMode": rule.get("expirationMode", "Permanent"), + }) + return results + + +async def run_powerquery(query: str, from_date: str, to_date: str) -> dict: + """ + Run a PowerQuery against the Singularity Data Lake via the Scalyr XDR API. + Uses SDL_XDR_URL + SDL_LOG_READ_KEY (Scalyr readlog token). + The Scalyr PowerQuery API is synchronous — results return in one request. 
+ """ + if not SDL_LOG_READ_KEY: + return {"events": [], "error": "SDL_LOG_READ_KEY not configured — add it to .env"} + + start_ms = _iso_to_epoch_ms(from_date) + end_ms = _iso_to_epoch_ms(to_date) + + payload = { + "token": SDL_LOG_READ_KEY, + "query": query, + "startTime": start_ms, + "endTime": end_ms, + "maxCount": 1000, + } + + async with httpx.AsyncClient(timeout=120) as client: + for attempt in range(3): + try: + resp = await client.post( + f"{SDL_XDR_URL}/api/powerQuery", + json=payload, + ) + resp.raise_for_status() + break + except httpx.HTTPStatusError as e: + if e.response.status_code == 429 and attempt < 2: + await asyncio.sleep(10 * (attempt + 1)) + continue + raise RuntimeError( + f"HTTP {e.response.status_code} from {e.request.url}: {e.response.text[:500]}" + ) from e + + data = resp.json() + status = data.get("status", "") + + if status != "success": + # Return full response as error detail for debugging + return {"events": [], "error": f"PowerQuery status={status}: {str(data)[:400]}"} + + # Scalyr PowerQuery returns: {"status":"success","columns":[{"name":"..."},...], "values":[[...],...],...} + raw_cols = data.get("columns", []) + values = data.get("values", []) + + if raw_cols and values: + # columns may be list of strings or list of {"name":...} dicts + col_names = [ + c["name"] if isinstance(c, dict) else c + for c in raw_cols + ] + rows = [dict(zip(col_names, row)) for row in values] + return {"events": rows} + + # Fallback: return raw matches array + matches = data.get("matches", []) + return {"events": matches} + + +def _sdl_config_headers() -> dict: + """Auth headers for the SDL Configuration File API (uses POST /api/listFiles, + POST /api/getFile, etc.). 
Falls back to SDL_LOG_READ_KEY if no dedicated + Configuration Read key is set — that won't work for all endpoints, but lets + callers fail with a meaningful 401 instead of crashing.""" + key = SDL_CONFIG_READ_KEY or SDL_LOG_READ_KEY + return { + "Authorization": f"Bearer {key}", + "Content-Type": "application/json", + } + + +async def list_sdl_parsers() -> list[str]: + """List parser paths under /logParsers/ via the SDL Configuration File API. + + Requires SDL_CONFIG_READ_KEY (or higher) in .env. The endpoint is + POST /api/listFiles with {"pathPrefix": "/logParsers/"}. + Returns names without the /logParsers/ prefix, suitable for use as + filenames in the local parsers/ directory. + """ + async with httpx.AsyncClient(timeout=30) as client: + resp = await client.post( + f"{SDL_XDR_URL}/api/listFiles", + headers=_sdl_config_headers(), + json={"pathPrefix": "/logParsers/"}, + ) + resp.raise_for_status() + data = resp.json() + paths = data.get("paths") or data.get("files") or [] + # Normalize: strip leading /logParsers/ and ignore anything that isn't there + names: list[str] = [] + for p in paths: + if isinstance(p, dict): + p = p.get("path") or p.get("name") or "" + if isinstance(p, str) and p.startswith("/logParsers/"): + names.append(p[len("/logParsers/"):]) + return names + + +async def list_sdl_parsers_legacy() -> list[str]: + """[Deprecated] Legacy management-console path — kept for reference but unused.""" + async with httpx.AsyncClient(timeout=30) as client: + resp = await client.get( + f"{BASE_URL}/api/v1/files/logParsers", + headers=HEADERS, + ) + resp.raise_for_status() + data = resp.json() + # Response is a list of file objects or a dict with 'files' key + if isinstance(data, list): + return [f.get("name") or f.get("path", "") for f in data if isinstance(f, dict)] + return [f.get("name") or f.get("path", "") for f in data.get("files", [])] + + +async def get_sdl_parser(filename: str) -> dict: + """Fetch a single SDL parser file by name via POST 
/api/getFile. + + Returns the raw SDL response dict, e.g. + {"status": "success", "path": "/logParsers/Foo", "content": "...", "version": 3, ...} + """ + path = filename if filename.startswith("/logParsers/") else f"/logParsers/{filename}" + async with httpx.AsyncClient(timeout=30) as client: + resp = await client.post( + f"{SDL_XDR_URL}/api/getFile", + headers=_sdl_config_headers(), + json={"path": path}, + ) + resp.raise_for_status() + return resp.json() + + +async def get_account_id() -> str | None: + """Return the first account ID visible to the current token. + + Tries /accounts first (works for account-scoped or higher tokens). If that + returns 403 (site-scoped token), falls back to /sites and reads accountId + from the first site. + """ + async with httpx.AsyncClient(timeout=15) as client: + # Path 1: account-scoped token + resp = await client.get( + f"{BASE_URL}/web/api/v2.1/accounts", + headers=HEADERS, + params={"limit": 1}, + ) + if resp.status_code == 200: + accounts = resp.json().get("data", []) + if accounts: + return str(accounts[0]["id"]) + # Path 2: site-scoped token — accountId is embedded in sites payload + if resp.status_code in (401, 403): + sresp = await client.get( + f"{BASE_URL}/web/api/v2.1/sites", + headers=HEADERS, + params={"limit": 1}, + ) + if sresp.status_code == 200: + data = sresp.json().get("data", {}) + sites = data.get("sites") if isinstance(data, dict) else data + if sites: + return str(sites[0].get("accountId") or "") or None + return None + + +async def get_scope_for_platform_rules() -> tuple[str, str] | None: + """Pick the best scope for /detection-library/platform-rules. + + Returns (scopeLevel, scopeId). Tries account first, then site — site-scoped + tokens cannot list accounts but CAN query platform-rules with site scope. 
+ """ + async with httpx.AsyncClient(timeout=15) as client: + # Prefer account scope (broadest) + a = await client.get( + f"{BASE_URL}/web/api/v2.1/accounts", + headers=HEADERS, + params={"limit": 1}, + ) + if a.status_code == 200: + accounts = a.json().get("data", []) + if accounts: + return ("account", str(accounts[0]["id"])) + # Fall back to site scope (site-scoped tokens land here) + s = await client.get( + f"{BASE_URL}/web/api/v2.1/sites", + headers=HEADERS, + params={"limit": 1}, + ) + if s.status_code == 200: + data = s.json().get("data", {}) + sites = data.get("sites") if isinstance(data, dict) else data + if sites: + sid = sites[0].get("id") + if sid: + return ("site", str(sid)) + return None + + +async def get_platform_rules(page_size: int = 1000) -> list: + """ + Fetch all Detection Library platform rules from /detection-library/platform-rules. + Requires scopeLevel + scopeId. Tries account scope first, then site scope so + site-scoped tokens also work. + """ + scope = await get_scope_for_platform_rules() + if not scope: + return [] + scope_level, scope_id = scope + + all_rules: list = [] + cursor: str = "" + async with httpx.AsyncClient(timeout=60) as client: + while True: + params: dict = { + "scopeLevel": scope_level, + "scopeId": scope_id, + "limit": page_size, + "cursor": cursor, + } + resp = await client.get( + f"{BASE_URL}/web/api/v2.1/detection-library/platform-rules", + headers=HEADERS, + params=params, + ) + if resp.status_code == 400: + return [] + resp.raise_for_status() + body = resp.json() + all_rules.extend(body.get("data", [])) + cursor = body.get("pagination", {}).get("nextCursor") or "" + if not cursor: + break + return all_rules + + +async def get_sites() -> list: + async with httpx.AsyncClient(timeout=30) as client: + resp = await client.get( + f"{BASE_URL}/web/api/v2.1/sites", + headers=HEADERS, + params={"limit": 100}, + ) + resp.raise_for_status() + return resp.json().get("data", {}).get("sites", []) diff --git a/build.sh 
b/build.sh new file mode 100755 index 0000000..b9b2cd6 --- /dev/null +++ b/build.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +echo "==> Starting Docker containers..." +docker-compose up --build "$@" diff --git a/db/init.sql b/db/init.sql new file mode 100644 index 0000000..e377c1c --- /dev/null +++ b/db/init.sql @@ -0,0 +1,3 @@ +-- Tables are created by SQLAlchemy on startup. +-- This file exists for the postgres healthcheck mount. +SELECT 1; diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..ba9bd04 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,46 @@ +services: + frontend: + build: ./frontend + ports: + - "3001:3000" + depends_on: + - backend + + backend: + build: ./backend + ports: + - "8001:8000" + environment: + - S1_API_TOKEN=${S1_API_TOKEN} + - S1_BASE_URL=${S1_BASE_URL} + - SDL_XDR_URL=${SDL_XDR_URL} + - SDL_LOG_READ_KEY=${SDL_LOG_READ_KEY} + - SDL_CONFIG_READ_KEY=${SDL_CONFIG_READ_KEY} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} + - DATABASE_URL=postgresql://siem:siem@db:5432/siem + - DETECTIONS_FILE=/app/data/detections.json + depends_on: + db: + condition: service_healthy + volumes: + - ./parsers:/app/parsers + - ./.env:/app/.env + - ./data:/app/data:ro + + db: + image: postgres:16-alpine + environment: + - POSTGRES_DB=siem + - POSTGRES_USER=siem + - POSTGRES_PASSWORD=siem + volumes: + - pgdata:/var/lib/postgresql/data + - ./db/init.sql:/docker-entrypoint-initdb.d/init.sql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U siem"] + interval: 5s + timeout: 5s + retries: 5 + +volumes: + pgdata: diff --git a/frontend/.dockerignore b/frontend/.dockerignore new file mode 100644 index 0000000..b90a368 --- /dev/null +++ b/frontend/.dockerignore @@ -0,0 +1,2 @@ +node_modules +.next diff --git a/frontend/Dockerfile b/frontend/Dockerfile new file mode 100644 index 0000000..ef916a9 --- /dev/null +++ b/frontend/Dockerfile @@ -0,0 +1,4 @@ +FROM nginx:alpine +COPY index.html /usr/share/nginx/html/index.html +COPY nginx.conf 
/etc/nginx/conf.d/default.conf +EXPOSE 3000 diff --git a/frontend/index.html b/frontend/index.html new file mode 100644 index 0000000..b5b1f46 --- /dev/null +++ b/frontend/index.html @@ -0,0 +1,1171 @@ + + + + + + SIEM Toolkit + + + + + + +
+ + + + diff --git a/frontend/next.config.js b/frontend/next.config.js new file mode 100644 index 0000000..5c113ca --- /dev/null +++ b/frontend/next.config.js @@ -0,0 +1,6 @@ +/** @type {import('next').NextConfig} */ +const nextConfig = { + output: 'export', + trailingSlash: true, +} +module.exports = nextConfig diff --git a/frontend/nginx.conf b/frontend/nginx.conf new file mode 100644 index 0000000..dfaedad --- /dev/null +++ b/frontend/nginx.conf @@ -0,0 +1,9 @@ +server { + listen 3000; + root /usr/share/nginx/html; + index index.html; + + location / { + try_files $uri $uri/ $uri.html /index.html; + } +} diff --git a/frontend/package.json b/frontend/package.json new file mode 100644 index 0000000..ced1333 --- /dev/null +++ b/frontend/package.json @@ -0,0 +1,29 @@ +{ + "name": "siem-toolkit", + "version": "1.0.0", + "private": true, + "scripts": { + "dev": "next dev", + "build": "next build", + "start": "next start" + }, + "dependencies": { + "next": "14.2.5", + "react": "18.3.1", + "react-dom": "18.3.1", + "recharts": "2.12.7", + "@tanstack/react-query": "5.56.2", + "axios": "1.7.7", + "clsx": "2.1.1", + "lucide-react": "0.441.0" + }, + "devDependencies": { + "@types/node": "22.5.4", + "@types/react": "18.3.5", + "@types/react-dom": "18.3.0", + "autoprefixer": "10.4.20", + "postcss": "8.4.45", + "tailwindcss": "3.4.11", + "typescript": "5.6.2" + } +} diff --git a/frontend/postcss.config.js b/frontend/postcss.config.js new file mode 100644 index 0000000..95aa892 --- /dev/null +++ b/frontend/postcss.config.js @@ -0,0 +1,3 @@ +module.exports = { + plugins: { tailwindcss: {}, autoprefixer: {} }, +} diff --git a/frontend/src/app/coverage/page.tsx b/frontend/src/app/coverage/page.tsx new file mode 100644 index 0000000..ef77052 --- /dev/null +++ b/frontend/src/app/coverage/page.tsx @@ -0,0 +1,232 @@ +'use client' + +import { useState, useRef } from 'react' +import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query' +import { api } from '@/lib/api' 
+import clsx from 'clsx' + +type FieldDetail = { + in_parser: boolean + parser_name: string | null + rule_count: number + rules: { rule: string; type: string }[] + status: 'covered' | 'unused' | 'missing_parser' +} + +type CoverageMap = { + summary: { + total_parser_fields: number + total_rule_fields: number + covered: number + parsed_but_unused: number + rules_missing_parser: number + } + fields: Record +} + +const STATUS_STYLE = { + covered: 'bg-emerald-900/50 text-emerald-300 border-emerald-700', + unused: 'bg-yellow-900/50 text-yellow-300 border-yellow-700', + missing_parser: 'bg-red-900/50 text-red-300 border-red-700', +} + +const STATUS_LABEL = { + covered: 'Covered', + unused: 'Unused (reduce candidate)', + missing_parser: 'Missing parser', +} + +export default function CoveragePage() { + const qc = useQueryClient() + const sigmaRef = useRef(null) + const parserRef = useRef(null) + const [filter, setFilter] = useState<'all' | 'covered' | 'unused' | 'missing_parser'>('all') + const [err, setErr] = useState('') + + const { data, isLoading } = useQuery({ + queryKey: ['coverage-map'], + queryFn: () => api.get('/api/coverage/map'), + }) + + const loadStar = useMutation({ + mutationFn: () => api.post('/api/coverage/load-star-rules', {}), + onSuccess: () => qc.invalidateQueries({ queryKey: ['coverage-map'] }), + onError: (e: Error) => setErr(e.message), + }) + + const uploadSigma = useMutation({ + mutationFn: async (files: FileList) => { + const form = new FormData() + Array.from(files).forEach((f) => form.append('files', f)) + return api.postForm('/api/coverage/upload-sigma', form) + }, + onSuccess: () => qc.invalidateQueries({ queryKey: ['coverage-map'] }), + onError: (e: Error) => setErr(e.message), + }) + + const uploadParser = useMutation({ + mutationFn: async (file: File) => { + const form = new FormData() + form.append('file', file) + return api.postForm('/api/coverage/upload-parser', form) + }, + onSuccess: () => qc.invalidateQueries({ queryKey: 
['coverage-map'] }), + onError: (e: Error) => setErr(e.message), + }) + + const reset = useMutation({ + mutationFn: () => api.get('/api/coverage/reset'), + onSuccess: () => qc.invalidateQueries({ queryKey: ['coverage-map'] }), + }) + + const fields = data + ? Object.entries(data.fields).filter( + ([, d]) => filter === 'all' || d.status === filter + ) + : [] + + const busy = loadStar.isPending || uploadSigma.isPending || uploadParser.isPending + + return ( +
+
+
+

Parser Coverage Map

+

+ Cross-reference SDL parser fields against STAR / Sigma rule fields +

+
+
+ + + + +
+
+ + e.target.files && uploadSigma.mutate(e.target.files)} + /> + e.target.files?.[0] && uploadParser.mutate(e.target.files[0])} + /> + + {err && ( +
+ {err} +
+ )} + + {data && ( +
+ {[ + { label: 'Parser Fields', value: data.summary.total_parser_fields, color: 'text-gray-200' }, + { label: 'Rule Fields', value: data.summary.total_rule_fields, color: 'text-gray-200' }, + { label: 'Covered', value: data.summary.covered, color: 'text-emerald-400' }, + { label: 'Parsed Unused', value: data.summary.parsed_but_unused, color: 'text-yellow-400' }, + { label: 'Missing Parser', value: data.summary.rules_missing_parser, color: 'text-red-400' }, + ].map(({ label, value, color }) => ( +
+
{value}
+
{label}
+
+ ))} +
+ )} + +
+ {(['all', 'covered', 'unused', 'missing_parser'] as const).map((f) => ( + + ))} +
+ + {isLoading ? ( +
Loading…
+ ) : fields.length === 0 ? ( +
+ {data ? 'No fields match this filter.' : 'Load STAR rules or upload parsers to begin.'} +
+ ) : ( +
+ + + + + + + + + + + {fields.map(([field, detail]) => ( + + + + + + + ))} + +
FieldStatusParserRules using it
{field} + + {STATUS_LABEL[detail.status]} + + {detail.parser_name ?? '—'} + {detail.rule_count > 0 + ? detail.rules.map((r) => r.rule).join(', ') + : '—'} +
+
+ )} +
+ ) +} diff --git a/frontend/src/app/globals.css b/frontend/src/app/globals.css new file mode 100644 index 0000000..b5c61c9 --- /dev/null +++ b/frontend/src/app/globals.css @@ -0,0 +1,3 @@ +@tailwind base; +@tailwind components; +@tailwind utilities; diff --git a/frontend/src/app/ingest/page.tsx b/frontend/src/app/ingest/page.tsx new file mode 100644 index 0000000..64c0850 --- /dev/null +++ b/frontend/src/app/ingest/page.tsx @@ -0,0 +1,169 @@ +'use client' + +import { useState } from 'react' +import { useQuery, useMutation } from '@tanstack/react-query' +import { + BarChart, Bar, XAxis, YAxis, Tooltip, ResponsiveContainer, CartesianGrid, +} from 'recharts' +import { api } from '@/lib/api' + +type SourceRow = { 'src.name': string; events: number } +type DayRow = { date: string; events: number } + +export default function IngestPage() { + const [days, setDays] = useState(7) + const [simSource, setSimSource] = useState('') + const [simEventType, setSimEventType] = useState('') + const [simResult, setSimResult] = useState | null>(null) + const [simErr, setSimErr] = useState('') + + const sources = useQuery<{ data: SourceRow[] }>({ + queryKey: ['top-sources', days], + queryFn: () => api.get(`/api/ingest/top-sources?days=${days}`), + }) + + const daily = useQuery({ + queryKey: ['daily-volume', days], + queryFn: () => api.get(`/api/ingest/daily-volume?days=${days}`), + }) + + const simulate = useMutation({ + mutationFn: () => + api.post>('/api/ingest/simulate-filter', { + source: simSource, + event_type: simEventType, + days, + gb_per_million_events: 0.5, + }), + onSuccess: (data) => { setSimResult(data); setSimErr('') }, + onError: (e: Error) => setSimErr(e.message), + }) + + const chartData = (sources.data?.data ?? []).slice(0, 15).map((r) => ({ + name: r['src.name'] ?? 'unknown', + events: r.events ?? 0, + })) + + return ( +
+
+
+

Ingest Dashboard

+

Event volume · cost projection · filter simulator

+
+
+ {[7, 14, 30].map((d) => ( + + ))} +
+
+ + {/* Daily volume chart */} +
+

Daily Event Volume

+ {daily.isLoading ? ( +
Loading…
+ ) : ( + + + + + + + + + + )} +
+ + {/* Top sources table */} +
+

Top Sources — last {days}d

+ {sources.isLoading ? ( +
Loading…
+ ) : sources.isError ? ( +
{String(sources.error)}
+ ) : ( + + + + + + + + + + {chartData.map((row) => ( + + + + + + ))} + +
SourceEventsEst. GB
{row.name}{row.events.toLocaleString()} + {(row.events / 1_000_000 * 0.5).toFixed(3)} +
+ )} +
+ + {/* Filter simulator */} +
+

Filter Simulator

+

+ Estimate events and GB eliminated by dropping a source + event type combination. +

+
+ setSimSource(e.target.value)} + placeholder="Source name (optional)" + className="flex-1 min-w-48 bg-gray-800 border border-gray-700 rounded-lg px-3 py-2 text-sm text-gray-200 placeholder-gray-600 focus:outline-none focus:border-purple-600" + /> + setSimEventType(e.target.value)} + placeholder="Event type (optional)" + className="flex-1 min-w-48 bg-gray-800 border border-gray-700 rounded-lg px-3 py-2 text-sm text-gray-200 placeholder-gray-600 focus:outline-none focus:border-purple-600" + /> + +
+ {simErr &&

{simErr}

} + {simResult && ( +
+ {[ + { label: 'Matched Events', value: String(simResult.matched_events ?? 0) }, + { label: `Est. GB (${days}d)`, value: String(simResult.estimated_gb_period ?? 0) }, + { label: 'Projected Monthly Events', value: String(simResult.projected_monthly_events ?? 0) }, + { label: 'Projected Monthly GB', value: String(simResult.projected_monthly_gb ?? 0) }, + ].map(({ label, value }) => ( +
+
{value}
+
{label}
+
+ ))} +
+ )} +
+
+ ) +} diff --git a/frontend/src/app/layout.tsx b/frontend/src/app/layout.tsx new file mode 100644 index 0000000..f0525b9 --- /dev/null +++ b/frontend/src/app/layout.tsx @@ -0,0 +1,22 @@ +import type { Metadata } from 'next' +import './globals.css' +import Sidebar from '@/components/Sidebar' +import QueryProvider from '@/components/QueryProvider' + +export const metadata: Metadata = { + title: 'SIEM Toolkit', + description: 'SentinelOne AI-SIEM Engineering Toolkit', +} + +export default function RootLayout({ children }: { children: React.ReactNode }) { + return ( + + + + +
{children}
+
+ + + ) +} diff --git a/frontend/src/app/onboarding/_CopyButton.tsx b/frontend/src/app/onboarding/_CopyButton.tsx new file mode 100644 index 0000000..617dc54 --- /dev/null +++ b/frontend/src/app/onboarding/_CopyButton.tsx @@ -0,0 +1,21 @@ +'use client' + +import { useState } from 'react' +import { Copy, Check } from 'lucide-react' + +export default function CopyButton({ text }: { text: string }) { + const [copied, setCopied] = useState(false) + return ( + + ) +} diff --git a/frontend/src/app/onboarding/page.tsx b/frontend/src/app/onboarding/page.tsx new file mode 100644 index 0000000..6079526 --- /dev/null +++ b/frontend/src/app/onboarding/page.tsx @@ -0,0 +1,78 @@ +import { Zap, MessageSquare, FileText, Code2 } from 'lucide-react' + +const STEPS = [ + { + icon: FileText, + title: '1. Grab a log sample', + desc: 'Copy 10–50 representative lines from the new log source. Include edge cases — errors, different event types, varying field presence.', + }, + { + icon: MessageSquare, + title: '2. Paste into Claude Code', + desc: 'Open Claude Code and say: "Onboard this log source for SentinelOne SDL" then paste the sample. Mention the source type if known (e.g. "Palo Alto firewall").', + }, + { + icon: Code2, + title: '3. Get your artefacts', + desc: 'Claude returns an SDL parser (augmented-JSON), field mappings to the SDL schema, starter STAR detection rules, and parser test assertions.', + }, + { + icon: Zap, + title: '4. Deploy', + desc: 'Drop the parser JSON into your /logParsers/ path. Paste the STAR rules into the AI-SIEM rule editor. Run the test assertions to validate extraction.', + }, +] + +const PROMPT = `Onboard this log source for SentinelOne SDL. Please generate: +1. An SDL parser skeleton in augmented-JSON format (/logParsers/ format) +2. Field mappings from raw fields to the SDL common schema +3. 2–3 starter STAR detection rules for common threats from this source type +4. 
5 parser test assertions (input line → expected field → expected value) + +Log source: [describe source, e.g. "Palo Alto PAN-OS firewall"] + +Raw log sample: +[paste your log lines here]` + +export default function OnboardingPage() { + return ( +
+
+

Onboarding Accelerator

+

+ Use Claude Code directly — no API key required +

+
+ +
+ {STEPS.map(({ icon: Icon, title, desc }) => ( +
+
+ +
+
+
{title}
+
{desc}
+
+
+ ))} +
+ +
+
+ Copy this prompt template + +
+
{PROMPT}
+
+
+ ) +} + +function CopyButton({ text }: { text: string }) { + 'use client' + return <_CopyButton text={text} /> +} + +// Split to keep the page a server component with one small client island +import _CopyButton from './_CopyButton' diff --git a/frontend/src/app/page.tsx b/frontend/src/app/page.tsx new file mode 100644 index 0000000..dde6866 --- /dev/null +++ b/frontend/src/app/page.tsx @@ -0,0 +1,59 @@ +import { Shield, BarChart2, Zap } from 'lucide-react' +import Link from 'next/link' + +const CARDS = [ + { + href: '/coverage', + icon: Shield, + title: 'Parser Coverage Map', + desc: 'Cross-reference SDL parser output fields against STAR and Sigma rule fields. Surface parsed-but-unused fields as reduction candidates.', + cta: 'Open Coverage Map', + color: 'from-purple-700 to-purple-900', + }, + { + href: '/ingest', + icon: BarChart2, + title: 'Ingest Dashboard', + desc: 'Visualize event volume by source and type. Project monthly GB costs and simulate the impact of exclusion filters before applying them.', + cta: 'Open Dashboard', + color: 'from-blue-700 to-blue-900', + }, + { + href: '/onboarding', + icon: Zap, + title: 'Onboarding Accelerator', + desc: 'Step-by-step guide for onboarding a new log source using Claude Code directly — no API key required.', + cta: 'View Onboarding Guide', + color: 'from-emerald-700 to-emerald-900', + }, +] + +export default function Home() { + return ( +
+
+

SIEM Engineering Toolkit

+

SentinelOne AI-SIEM · demo.sentinelone.net

+
+
+ {CARDS.map(({ href, icon: Icon, title, desc, cta, color }) => ( +
+
+ +
+
+

{title}

+

{desc}

+
+ + {cta} → + +
+ ))} +
+
+ ) +} diff --git a/frontend/src/components/QueryProvider.tsx b/frontend/src/components/QueryProvider.tsx new file mode 100644 index 0000000..b744f07 --- /dev/null +++ b/frontend/src/components/QueryProvider.tsx @@ -0,0 +1,9 @@ +'use client' + +import { QueryClient, QueryClientProvider } from '@tanstack/react-query' +import { useState } from 'react' + +export default function QueryProvider({ children }: { children: React.ReactNode }) { + const [client] = useState(() => new QueryClient({ defaultOptions: { queries: { retry: 1 } } })) + return {children} +} diff --git a/frontend/src/components/Sidebar.tsx b/frontend/src/components/Sidebar.tsx new file mode 100644 index 0000000..edbbc57 --- /dev/null +++ b/frontend/src/components/Sidebar.tsx @@ -0,0 +1,45 @@ +'use client' + +import Link from 'next/link' +import { usePathname } from 'next/navigation' +import { Shield, BarChart2, Zap, Home } from 'lucide-react' +import clsx from 'clsx' + +const NAV = [ + { href: '/', label: 'Overview', icon: Home }, + { href: '/coverage', label: 'Parser Coverage', icon: Shield }, + { href: '/ingest', label: 'Ingest Dashboard', icon: BarChart2 }, + { href: '/onboarding', label: 'Onboarding', icon: Zap }, +] + +export default function Sidebar() { + const path = usePathname() + return ( + + ) +} diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts new file mode 100644 index 0000000..924ee41 --- /dev/null +++ b/frontend/src/lib/api.ts @@ -0,0 +1,22 @@ +const BASE = process.env.NEXT_PUBLIC_API_URL ?? 
'http://localhost:8000' + +export async function apiFetch(path: string, init?: RequestInit): Promise { + const res = await fetch(`${BASE}${path}`, init) + if (!res.ok) { + const text = await res.text() + throw new Error(`${res.status}: ${text}`) + } + return res.json() as Promise +} + +export const api = { + get: (path: string) => apiFetch(path), + post: (path: string, body: unknown) => + apiFetch(path, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(body), + }), + postForm: (path: string, form: FormData) => + apiFetch(path, { method: 'POST', body: form }), +} diff --git a/frontend/tailwind.config.js b/frontend/tailwind.config.js new file mode 100644 index 0000000..e4fa2e3 --- /dev/null +++ b/frontend/tailwind.config.js @@ -0,0 +1,12 @@ +/** @type {import('tailwindcss').Config} */ +module.exports = { + content: ['./src/**/*.{ts,tsx}'], + theme: { + extend: { + colors: { + brand: '#7c3aed', + }, + }, + }, + plugins: [], +} diff --git a/frontend/tsconfig.json b/frontend/tsconfig.json new file mode 100644 index 0000000..f0de5f0 --- /dev/null +++ b/frontend/tsconfig.json @@ -0,0 +1,21 @@ +{ + "compilerOptions": { + "target": "es2017", + "lib": ["dom", "dom.iterable", "esnext"], + "allowJs": true, + "skipLibCheck": true, + "strict": true, + "noEmit": true, + "esModuleInterop": true, + "module": "esnext", + "moduleResolution": "bundler", + "resolveJsonModule": true, + "isolatedModules": true, + "jsx": "preserve", + "incremental": true, + "plugins": [{ "name": "next" }], + "paths": { "@/*": ["./src/*"] } + }, + "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], + "exclude": ["node_modules"] +} diff --git a/parsers/.gitkeep b/parsers/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tools/probe_avelios.py b/tools/probe_avelios.py new file mode 100644 index 0000000..a1b3e5c --- /dev/null +++ b/tools/probe_avelios.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +"""Probe the SDL tenant to 
understand why Avelios Medical field-population shows 0%.""" +import json, time, urllib.request, urllib.error +import os + +def _load_sdl_cfg(): + import json as _j, os as _o, sys as _s + here = _o.path.dirname(_o.path.abspath(__file__)) + candidates = [ + _o.environ.get("SDL_CONFIG"), + _o.path.join(here, "sdl_config.json"), + _o.path.join(here, "..", "sdl_config.json"), + ] + for p in candidates: + if p and _o.path.exists(p): + with open(p) as fh: + return _j.load(fh) + _s.stderr.write( + "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json " + "(see sdl_config.example.json)\n") + _s.exit(2) + + +CFG = _load_sdl_cfg() +BASE = CFG['base_url'].rstrip('/') +KEY = CFG['log_read_key'] +END_MS = int(time.time() * 1000) +START_MS = END_MS - 24 * 3600 * 1000 # last 24h + + +def pq(query: str, max_count: int = 10) -> dict: + body = json.dumps({ + "token": KEY, "query": query, + "startTime": START_MS, "endTime": END_MS, + "maxCount": max_count, + }).encode() + req = urllib.request.Request(BASE + '/api/powerQuery', data=body, + headers={"Content-Type": "application/json"}) + try: + return json.loads(urllib.request.urlopen(req, timeout=30).read()) + except urllib.error.HTTPError as e: + return {"_err": f"HTTP {e.code}: {e.read().decode()[:200]}"} + except Exception as e: + return {"_err": str(e)[:200]} + + +def show(label, d): + if "_err" in d: + print(f"[ERR] {label}: {d['_err']}"); return + cols = [c['name'] if isinstance(c, dict) else c for c in d.get('columns', [])] + vals = d.get('values', []) or d.get('matches', []) + print(f"[OK ] {label} cols={cols} rows={len(vals)}") + for v in vals[:8]: + print(f" {v}") + + +# 1. Distinct dataSource.name values containing 'velio' +print("=" * 70) +print("1. Source-name spellings containing 'velio'") +print("=" * 70) +show("by dataSource.name", + pq("| group n=count() by dataSource.name | sort -n | limit 50", max_count=50)) + +# 2. Try a few candidate names +print() +print("=" * 70) +print("2. 
Try filtering by candidate names") +print("=" * 70) +for cand in ["Avelios Medical", "Avelios-Medical", "Avelios-Medical-OCSF", + "avelios", "Avelios"]: + d = pq(f"| filter dataSource.name == '{cand}' | group n=count()", max_count=1) + n = (d.get('values') or [[None]])[0][0] if 'values' in d else d + print(f" {cand!r:<35} -> {n}") +for cand in ["Avelios Medical", "Avelios-Medical-OCSF", "avelios"]: + d = pq(f"| filter dataSource.name contains '{cand}' | group n=count()", max_count=1) + n = (d.get('values') or [[None]])[0][0] if 'values' in d else d + print(f" contains {cand!r:<25} -> {n}") + +# 3. Sample one raw event to see what column names actually come back +print() +print("=" * 70) +print("3. Sample one event — what keys/columns are returned?") +print("=" * 70) +d = pq("| filter dataSource.name contains 'velio' | limit 1", max_count=1) +if "_err" in d: + print(" ", d["_err"]) +else: + print(" columns:", [c['name'] if isinstance(c, dict) else c for c in d.get('columns', [])][:30]) + print(" first row sample:", str((d.get('values') or [None])[0])[:400]) + +# 4. If we got columns, check which OCSF fields exist +print() +print("=" * 70) +print("4. 
Field presence in last 24h for Avelios (using columns command)") +print("=" * 70) +d = pq("| filter dataSource.name contains 'velio' | " + "columns dataSource.name, metadata.product.name, metadata.event_code, " + "actor.user.name, src_endpoint.ip, dst_endpoint.ip | limit 5", + max_count=5) +show("columns view", d) diff --git a/tools/probe_avelios_fields.py b/tools/probe_avelios_fields.py new file mode 100644 index 0000000..8e8d22d --- /dev/null +++ b/tools/probe_avelios_fields.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +"""Inspect Avelios Medical events: one query, full row dump, then field stats from Python.""" +import json, time, urllib.request, collections +import os + +def _load_sdl_cfg(): + import json as _j, os as _o, sys as _s + here = _o.path.dirname(_o.path.abspath(__file__)) + candidates = [ + _o.environ.get("SDL_CONFIG"), + _o.path.join(here, "sdl_config.json"), + _o.path.join(here, "..", "sdl_config.json"), + ] + for p in candidates: + if p and _o.path.exists(p): + with open(p) as fh: + return _j.load(fh) + _s.stderr.write( + "ERROR: no SDL config found. 
Set $SDL_CONFIG or create sdl_config.json " + "(see sdl_config.example.json)\n") + _s.exit(2) + + +CFG = _load_sdl_cfg() +BASE, KEY = CFG['base_url'].rstrip('/'), CFG['log_read_key'] +NOW = int(time.time() * 1000) +START = NOW - 72 * 3600 * 1000 # last 3 days + + +def pq(query, mc=200): + body = json.dumps({"token": KEY, "query": query, + "startTime": START, "endTime": NOW, + "maxCount": mc}).encode() + req = urllib.request.Request(BASE + '/api/powerQuery', data=body, + headers={"Content-Type": "application/json"}) + return json.loads(urllib.request.urlopen(req, timeout=60).read()) + + +print("Fetching Avelios Medical sample (max 200, last 72h) ...") +d = pq("| filter dataSource.name == 'Avelios Medical' | limit 200") +cols = [c['name'] if isinstance(c, dict) else c for c in d.get('columns', [])] +vals = d.get('values', []) or [] +print(f"Columns returned ({len(cols)}): {cols}") +print(f"Rows: {len(vals)}") +print() + +# Tally non-null rate per returned column +counts = {c: 0 for c in cols} +for row in vals: + for c, v in zip(cols, row): + if v not in (None, '', 'null'): + counts[c] += 1 +print("=== Column populated-rate (out of returned columns) ===") +for c in cols: + n = counts[c] + pct = round(100 * n / max(1, len(vals)), 1) + print(f" {c:<35} {n:>4} / {len(vals)} {pct:>5}%") + +print() +print("=== First 2 events (pretty) ===") +for row in vals[:2]: + print(json.dumps(dict(zip(cols, row)), indent=2, default=str)[:1500]) + print("---") + +print() +print("=== Distinct fields IN the message body (if JSON) ===") +# If the events carry a structured body, peek inside it +field_freq = collections.Counter() +for row in vals: + rd = dict(zip(cols, row)) + msg = rd.get('message') or rd.get('body') or rd.get('attributes') + if isinstance(msg, str): + try: + j = json.loads(msg) + except Exception: + continue + else: + j = msg + if isinstance(j, dict): + def walk(obj, prefix=''): + for k, v in obj.items(): + key = f"{prefix}.{k}" if prefix else k + if isinstance(v, dict): + 
walk(v, key) + else: + field_freq[key] += 1 + walk(j) +for k, c in field_freq.most_common(40): + print(f" {k:<45} in {c:>3} events") diff --git a/tools/probe_avelios_wide.py b/tools/probe_avelios_wide.py new file mode 100644 index 0000000..86bf857 --- /dev/null +++ b/tools/probe_avelios_wide.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +"""Search wider windows for Avelios Medical events.""" +import json, time, urllib.request +import os + +def _load_sdl_cfg(): + import json as _j, os as _o, sys as _s + here = _o.path.dirname(_o.path.abspath(__file__)) + candidates = [ + _o.environ.get("SDL_CONFIG"), + _o.path.join(here, "sdl_config.json"), + _o.path.join(here, "..", "sdl_config.json"), + ] + for p in candidates: + if p and _o.path.exists(p): + with open(p) as fh: + return _j.load(fh) + _s.stderr.write( + "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json " + "(see sdl_config.example.json)\n") + _s.exit(2) + + +CFG = _load_sdl_cfg() +BASE, KEY = CFG['base_url'].rstrip('/'), CFG['log_read_key'] +NOW = int(time.time() * 1000) + + +def pq(query, start_ms, end_ms, mc=5): + body = json.dumps({"token": KEY, "query": query, + "startTime": start_ms, "endTime": end_ms, + "maxCount": mc}).encode() + req = urllib.request.Request(BASE + '/api/powerQuery', data=body, + headers={"Content-Type": "application/json"}) + try: + return json.loads(urllib.request.urlopen(req, timeout=60).read()) + except Exception as e: + return {"_err": str(e)[:200]} + + +for days in (1, 3, 7): + start = NOW - days * 24 * 3600 * 1000 + print(f"\n=== last {days}d ===") + d = pq("| group n=count() by dataSource.name | sort -n | limit 30", start, NOW, mc=30) + if "_err" in d: + print(d["_err"]); continue + for row in d.get("values", []): + name = row[0] + if name and "velio" in name.lower(): + print(f" HIT: {row}") + # show top 10 in this window + for row in (d.get("values", []) or [])[:10]: + print(f" {row}") diff --git a/tools/probe_pq_syntax.py b/tools/probe_pq_syntax.py new file mode 
100644 index 0000000..128b40d --- /dev/null +++ b/tools/probe_pq_syntax.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +"""Probe what PowerQuery syntax this SDL tenant accepts.""" +import json, time, urllib.request, urllib.error, sys +import os + +def _load_sdl_cfg(): + import json as _j, os as _o, sys as _s + here = _o.path.dirname(_o.path.abspath(__file__)) + candidates = [ + _o.environ.get("SDL_CONFIG"), + _o.path.join(here, "sdl_config.json"), + _o.path.join(here, "..", "sdl_config.json"), + ] + for p in candidates: + if p and _o.path.exists(p): + with open(p) as fh: + return _j.load(fh) + _s.stderr.write( + "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json " + "(see sdl_config.example.json)\n") + _s.exit(2) + + +CFG = _load_sdl_cfg() +URL = CFG['base_url'].rstrip('/') + '/api/powerQuery' +END_MS = int(time.time() * 1000) +START_MS = END_MS - 3600 * 1000 # last hour + + +def run(label: str, query: str): + body = json.dumps({ + "token": CFG['log_read_key'], + "query": query, + "startTime": START_MS, + "endTime": END_MS, + "maxCount": 5, + }).encode() + req = urllib.request.Request(URL, data=body, headers={"Content-Type": "application/json"}) + try: + resp = urllib.request.urlopen(req, timeout=30).read() + d = json.loads(resp) + st = d.get('status', '?') + cols = d.get('columns') or [] + vals = d.get('values') or d.get('matches') or [] + print(f"[OK ] {label:<40} status={st} cols={len(cols)} rows={len(vals)}") + if vals: + print(f" sample={str(vals[0])[:160]}") + except urllib.error.HTTPError as e: + body = e.read().decode() + try: + j = json.loads(body) + msg = j.get('message', body)[:200] + except Exception: + msg = body[:200] + print(f"[ERR] {label:<40} HTTP {e.code}: {msg}") + except Exception as e: + print(f"[ERR] {label:<40} {type(e).__name__}: {str(e)[:160]}") + + +CASES = [ + ("leading-pipe single-stage", "| group total=count()"), + ("no-pipe single-stage", "group total=count()"), + ("leading-pipe multi-stage", "| group events=count() by 
dataSource.name | sort -events | limit 5"), + ("no-pipe multi-stage", "group events=count() by dataSource.name | sort -events | limit 5"), + ("no-pipe trim sort", "group events=count() by dataSource.name | limit 5"), + ("filter then group", "dataSource.name=='SentinelOne' | group events=count()"), + ("filter (modern keyword)", "filter dataSource.name=='SentinelOne' | group events=count()"), + ("dataset-style with sort", "group events=count() by dataSource.name | sort events desc | limit 5"), + ("count() as alias", "| count() as events"), + ("group by event.type", "group events=count() by event.type | limit 5"), +] + +print(f"URL: {URL}") +print(f"Window: last 1h ({START_MS}..{END_MS} ms)") +print() +for label, q in CASES: + run(label, q) diff --git a/tools/probe_simulate_filter.py b/tools/probe_simulate_filter.py new file mode 100644 index 0000000..827577b --- /dev/null +++ b/tools/probe_simulate_filter.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +"""Probe /api/ingest/simulate-filter using small 1-day windows + long client +timeouts to avoid urllib aborting before the SDL query returns. + +Run one case at a time and print elapsed time so we can tell whether failures +are HTTP errors or slow tenant queries. +""" +from __future__ import annotations +import json +import sys +import time +import urllib.request +import urllib.error + +URL = "http://localhost:8001/api/ingest/simulate-filter" +TIMEOUT = 600 # seconds — generous; SDL queries on large tenants can take >60s + +# Smallest windows first so cheap calls succeed before we try the expensive ones. 
+CASES = [ + ("empty body, 1d", {"days": 1}), + ("bogus source, 1d", {"source": "definitely-no-such-source", "days": 1}), + ("source only, 1d", {"source": "Avelios Medical", "days": 1}), + ("source only, 7d", {"source": "Avelios Medical", "days": 7}), + ("event_type only, 1d", {"event_type": "login", "days": 1}), + ("source + event_type, 7d", {"source": "Avelios Medical", "event_type": "login", "days": 7}), +] + + +def hit(body: dict) -> tuple[int, str, float]: + data = json.dumps(body).encode() + req = urllib.request.Request( + URL, + data=data, + headers={"Content-Type": "application/json"}, + method="POST", + ) + t0 = time.monotonic() + try: + with urllib.request.urlopen(req, timeout=TIMEOUT) as r: + return r.status, r.read().decode(), time.monotonic() - t0 + except urllib.error.HTTPError as e: + return e.code, e.read().decode(), time.monotonic() - t0 + except Exception as e: + return -1, f"{type(e).__name__}: {e or 'no detail'}", time.monotonic() - t0 + + +# Allow narrowing via CLI: `python3 probe_simulate_filter.py 2 3` runs cases 2 & 3. +indices = [int(x) for x in sys.argv[1:]] if len(sys.argv) > 1 else range(len(CASES)) + +for i in indices: + if i >= len(CASES): + continue + label, body = CASES[i] + print("=" * 78) + print(f"[{i}] {label:<32} body={body}") + sys.stdout.flush() + status, payload, elapsed = hit(body) + print(f" HTTP {status} elapsed={elapsed:.1f}s") + try: + parsed = json.loads(payload) + print(" " + json.dumps(parsed, indent=2).replace("\n", "\n ")) + except Exception: + print(f" raw: {payload[:800]}") diff --git a/tools/probe_sync_from_sdl.py b/tools/probe_sync_from_sdl.py new file mode 100644 index 0000000..f19d02d --- /dev/null +++ b/tools/probe_sync_from_sdl.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +"""Trigger /api/quality/sync-from-sdl and pretty-print the result. + +Then re-list /api/quality/parsers to confirm the new files appear in the +Parser Test Runner dropdown. 
+""" +from __future__ import annotations +import json +import sys +import time +import urllib.request +import urllib.error + +BACKEND = "http://localhost:8001" +TIMEOUT = 300 + + +def call(method: str, path: str) -> tuple[int, dict | str, float]: + req = urllib.request.Request(BACKEND + path, method=method) + t0 = time.monotonic() + try: + with urllib.request.urlopen(req, timeout=TIMEOUT) as r: + return r.status, json.loads(r.read().decode()), time.monotonic() - t0 + except urllib.error.HTTPError as e: + body = e.read().decode() + try: + return e.code, json.loads(body), time.monotonic() - t0 + except Exception: + return e.code, body, time.monotonic() - t0 + except Exception as e: + return -1, f"{type(e).__name__}: {e or 'no detail'}", time.monotonic() - t0 + + +print("=" * 72) +print("POST /api/quality/sync-from-sdl") +print("=" * 72) +status, body, elapsed = call("POST", "/api/quality/sync-from-sdl") +print(f"HTTP {status} elapsed={elapsed:.1f}s") +if isinstance(body, dict): + if "detail" in body: + print(f" ERROR: {body['detail']}") + else: + print(f" downloaded: {body.get('downloaded')}") + print(f" errors: {len(body.get('errors') or [])}") + print(f" directory: {body.get('directory')}") + names = body.get("parsers") or [] + print(f"\n sample of parser filenames (first 25):") + for n in names[:25]: + print(f" {n}") + if len(names) > 25: + print(f" ... 
and {len(names) - 25} more") + # Highlight anything that looks like a customer/custom parser + custom = [n for n in names if "avelios" in n.lower() or "ocsf" in n.lower()] + if custom: + print("\n matched custom-parser patterns (avelios / ocsf):") + for n in custom: + print(f" ✓ {n}") + errs = body.get("errors") or [] + if errs: + print(f"\n errors (first 5 of {len(errs)}):") + for e in errs[:5]: + print(f" - {e}") +else: + print(f" raw: {str(body)[:600]}") + +print() +print("=" * 72) +print("GET /api/quality/parsers (Parser Test Runner dropdown source)") +print("=" * 72) +status, body, elapsed = call("GET", "/api/quality/parsers") +print(f"HTTP {status} elapsed={elapsed:.1f}s") +if isinstance(body, dict): + print(f" count: {body.get('count')}") + print(f" parsers:") + for n in (body.get("parsers") or [])[:50]: + print(f" {n}") + if (body.get("count") or 0) > 50: + print(f" ... and {body['count'] - 50} more") +else: + print(f" raw: {str(body)[:400]}") diff --git a/tools/sdl_config.example.json b/tools/sdl_config.example.json new file mode 100644 index 0000000..0c96307 --- /dev/null +++ b/tools/sdl_config.example.json @@ -0,0 +1,7 @@ +{ + "_comment": "Copy to sdl_config.json (or set $SDL_CONFIG to its path). Only the keys you need are required. log_read_key for queries; config_read_key for listFiles/getFile (parser sync).", + "base_url": "https://xdr.us1.sentinelone.net", + "log_read_key": "REPLACE_WITH_LOG_READ_KEY", + "config_read_key": "REPLACE_WITH_CONFIG_READ_KEY", + "console_api_token": "REPLACE_WITH_CONSOLE_API_TOKEN_OR_LEAVE_BLANK" +} diff --git a/tools/sync_sdl_parsers.py b/tools/sync_sdl_parsers.py new file mode 100644 index 0000000..675745a --- /dev/null +++ b/tools/sync_sdl_parsers.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +""" +Pull every parser under /logParsers/ from the SDL tenant and drop it into +./parsers/ so the SIEM-Toolkit Parser Test Runner can list it. 
# Auth: config_read_key (falling back to console_api_token) read from
# sdl_config.json -- see _load_sdl_cfg() for the lookup order.
import json
import os
import sys
import urllib.error
import urllib.parse
import urllib.request


def _load_sdl_cfg():
    """Load the SDL tool configuration from the first candidate file that exists.

    Candidates are tried in order: the path in ``$SDL_CONFIG``, then
    ``sdl_config.json`` next to this script, then ``sdl_config.json`` one
    directory above it.

    Returns:
        The parsed JSON object as a ``dict``.

    Exits the process with status 2 (message on stderr) when no candidate
    exists, or when the chosen file cannot be read, is not valid JSON, or does
    not contain a JSON object.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    candidates = [
        os.environ.get("SDL_CONFIG"),
        os.path.join(here, "sdl_config.json"),
        os.path.join(here, "..", "sdl_config.json"),
    ]
    for p in candidates:
        if not p or not os.path.exists(p):
            continue
        try:
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(p, encoding="utf-8") as fh:
                cfg = json.load(fh)
        except (OSError, ValueError) as e:
            sys.stderr.write(f"ERROR: cannot read SDL config {p}: {e}\n")
            sys.exit(2)
        if not isinstance(cfg, dict):
            sys.stderr.write(f"ERROR: SDL config {p} must contain a JSON object\n")
            sys.exit(2)
        return cfg
    sys.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    sys.exit(2)


SDL_CFG_PATH = os.environ.get('SDL_CONFIG')  # placeholder; cfg loaded below
DEST = os.environ.get('PARSERS_DIR', os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'parsers'))


def call(base_url: str, token: str, path: str, params: dict) -> dict:
    """POST a JSON body to an SDL endpoint and return the decoded JSON object.

    The token is sent both in the JSON body and as a Bearer header, exactly as
    the original implementation did.

    Args:
        base_url: Tenant base URL; a trailing slash is ignored.
        token: API key to authenticate with.
        path: Endpoint path appended to ``base_url`` (e.g. ``/api/listFiles``).
        params: Request parameters merged into the JSON body.

    Returns:
        The response body decoded as a JSON object.

    Raises:
        RuntimeError: on an HTTP error status, a transport-level failure
            (unreachable host, timeout, malformed URL), or a response that is
            not a JSON object. The message carries the status/path or URL and
            the first 300 characters of the response where available.
    """
    url = f"{base_url.rstrip('/')}{path}"
    payload = json.dumps({**params, "token": token}).encode()
    try:
        req = urllib.request.Request(url, data=payload, headers={
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        })
        with urllib.request.urlopen(req, timeout=30) as r:
            raw = r.read()
    except urllib.error.HTTPError as e:
        detail = e.read().decode(errors="replace")[:300]
        raise RuntimeError(f"HTTP {e.code} {path}: {detail}") from e
    except (OSError, ValueError) as e:
        # URLError and socket timeouts are OSError subclasses; Request()
        # raises ValueError for a URL without a scheme.
        raise RuntimeError(f"Request to {url} failed: {e}") from e

    try:
        result = json.loads(raw)
    except ValueError as e:
        # e.g. an HTML gateway page returned with a 200 status.
        snippet = raw.decode(errors="replace")[:300]
        raise RuntimeError(f"Non-JSON response from {path}: {snippet}") from e
    if not isinstance(result, dict):
        raise RuntimeError(
            f"Unexpected {type(result).__name__} response from {path}")
    return result


def _local_name(remote_path: str) -> str:
    """Map a remote SDL file path to a safe file name inside the parsers dir."""
    # Keep only the last "/"-separated segment and neutralise backslashes so
    # the result can never address a different directory on any platform.
    name = remote_path.rsplit("/", 1)[-1].replace("\\", "_")
    if name in ("", ".", ".."):
        return "_unnamed"
    return name


def main() -> int:
    """Download every file under /logParsers/ into ``DEST``.

    Existing local files are always overwritten so the directory stays fresh.
    A failure on one file is recorded and the sync continues with the next.

    Returns:
        0 when every listed file was written, 1 when the listing or at least
        one download/write failed, 2 when the configuration is incomplete.
    """
    cfg = _load_sdl_cfg()
    base = cfg.get("base_url")
    if not isinstance(base, str) or not base:
        print("No base_url in sdl_config.json", file=sys.stderr)
        return 2
    # config_read_key first (per docs), fall back to console_api_token
    token = cfg.get("config_read_key") or cfg.get("console_api_token")
    if not token:
        print("No config_read_key or console_api_token in sdl_config.json", file=sys.stderr)
        return 2

    print(f"Listing /logParsers/ from {base} ...")
    try:
        res = call(base, token, "/api/listFiles", {"pathPrefix": "/logParsers/"})
    except RuntimeError as e:
        print(f"Listing failed: {e}", file=sys.stderr)
        return 1
    paths = res.get("paths") or []
    print(f"Found {len(paths)} files under /logParsers/")

    os.makedirs(DEST, exist_ok=True)
    fetched = 0
    failed = []

    for p in paths:
        try:
            name = _local_name(p)
            r = call(base, token, "/api/getFile", {"path": p})
        except Exception as e:  # best-effort per file: record and move on
            failed.append((p, str(e)))
            continue

        content = r.get("content")
        if content is None:
            failed.append((p, "no content"))
            continue
        if not isinstance(content, str):
            failed.append((p, f"unexpected content type {type(content).__name__}"))
            continue

        out = os.path.join(DEST, name)
        try:
            with open(out, "w", encoding="utf-8") as fh:
                fh.write(content)
        except (OSError, ValueError) as e:
            failed.append((p, f"write failed: {e}"))
            continue
        ver = r.get("version", "?")
        print(f" + {name:<60} v{ver} ({len(content)} bytes)")
        fetched += 1

    print()
    print(f"Done: fetched={fetched}, failed={len(failed)}")
    for p, err in failed[:10]:
        print(f" ! {p}: {err}")
    if len(failed) > 10:
        print(f" ... and {len(failed) - 10} more")
    # Report partial failure to the caller instead of always claiming success.
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())
#!/usr/bin/env python3
"""Send one sample Avelios event to the Parser Test Runner and print a summary."""
import json
import os
import urllib.request

DEFAULT_BASE_URL = "http://localhost:8001"

log = '{"timestamp": "2026-05-14T00:24:41.969Z", "event_id": "d5c76dd2-5320-4b32-bd27-09acedfb5fdb", "event_type": "MALWARE_DETECTED", "event_category": "security", "severity": "CRITICAL", "source": {"application": "Avelios Medical", "module": "SecurityMonitor"}, "outcome": "detected", "details": {"malware_name": "Trojan.GenericKD"}}'


def format_report(r: dict) -> list:
    """Render a test-parser response as the list of lines the script prints.

    Args:
        r: Decoded JSON response of ``/api/quality/test-parser``.

    Returns:
        Output lines in order: a one-line summary, up to 12 fields whose
        ``source`` is ``"json-extract"``, then up to 12 applied rewrites.
        Missing keys are rendered as ``None`` instead of raising.
    """
    lines = [
        f"matched={r.get('matched')} mode={r.get('mode')} "
        f"extracted={r.get('extracted_count')} derived={r.get('derived_count')}",
        "",
        "json-extract fields (first 12):",
    ]
    extracted = [x for x in (r.get("fields") or []) if x.get("source") == "json-extract"]
    for f in extracted[:12]:
        # str() keeps the width spec valid when a value is not a string.
        lines.append(f" {str(f.get('field')):<32} = {str(f.get('value'))[:50]}")
    lines.append("")
    lines.append("rewrites applied:")
    for rw in (r.get("rewrites_applied") or [])[:12]:
        lines.append(
            f" {str(rw.get('input')):<18} -> {str(rw.get('output')):<28} = {rw.get('result')!r}")
    return lines


def main(base_url: str = DEFAULT_BASE_URL) -> int:
    """POST the sample log line to the test-parser endpoint and print the report.

    Args:
        base_url: Root URL of the toolkit backend; a trailing slash is ignored.

    Returns:
        0 on success. Exits with a one-line message on stderr when the
        request fails or the response is not valid JSON.
    """
    body = json.dumps({"parser_name": "Avelios-Medical-OCSF", "log_line": log}).encode()
    req = urllib.request.Request(
        f"{base_url.rstrip('/')}/api/quality/test-parser",
        data=body, headers={"Content-Type": "application/json"})
    try:
        # Context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            r = json.loads(resp.read())
    except (OSError, ValueError) as e:
        # URLError/HTTPError/timeouts are OSError; bad JSON is ValueError.
        raise SystemExit(f"test-parser request failed: {e}") from e

    for line in format_report(r):
        print(line)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())