mirror of
https://github.com/marcredhat/SIEM-toolkit-patched
synced 2026-06-11 05:41:19 +00:00
v0.1 Mick Marc merged
This commit is contained in:
@@ -0,0 +1,29 @@
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# SIEM Toolkit — Environment Configuration
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# 1. Copy this file: cp .env.example .env
|
||||
# 2. Fill in values below (see comments for where to find each one)
|
||||
# 3. Start the app: docker-compose up -d --build
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# SentinelOne Management Console
|
||||
# ─ URL: your console (e.g. https://demo.sentinelone.net)
|
||||
# ─ Token: Settings → Users → Service Users → generate API token
|
||||
S1_BASE_URL=https://demo.sentinelone.net
|
||||
S1_API_TOKEN=
|
||||
|
||||
# Singularity Data Lake (SDL) — PowerQuery credentials
|
||||
# ─ Console: Settings → Integrations → Data Lake API Keys
|
||||
# ─ XDR URL: shown on the API Keys page (e.g. https://xdr.us1.sentinelone.net)
|
||||
# ─ Log Read Key: copy the "Log Read" key from that page
|
||||
SDL_XDR_URL=https://xdr.us1.sentinelone.net
|
||||
SDL_LOG_READ_KEY=
|
||||
|
||||
# Anthropic (optional; not currently used. The Onboarding Accelerator runs through Claude Code and needs no API key)
|
||||
# ─ https://console.anthropic.com/settings/api-keys
|
||||
ANTHROPIC_API_KEY=
|
||||
|
||||
# SDL Configuration Read key — used by /api/quality/sync-from-sdl to
|
||||
# download parser files from /logParsers/ on the SDL tenant.
|
||||
# Generate in S1 console: Settings -> Integrations -> Data Lake API Keys (Configuration Read scope).
|
||||
SDL_CONFIG_READ_KEY=
|
||||
+15
@@ -0,0 +1,15 @@
|
||||
.env
|
||||
mcp_config.txt
|
||||
__pycache__/
|
||||
*.pyc
|
||||
node_modules/
|
||||
.next/
|
||||
frontend/out/
|
||||
pgdata/
|
||||
parsers/*.json
|
||||
data/
|
||||
|
||||
# Tenant-synced parsers (downloaded via /api/quality/sync-from-sdl) -
|
||||
# do not commit; each tenant generates its own set
|
||||
parsers/*
|
||||
!parsers/.gitkeep
|
||||
+104
@@ -0,0 +1,104 @@
|
||||
# Changes vs upstream `mickbrowns1/SIEM-Toolkit`
|
||||
|
||||
All edits are confined to a handful of files; everything else is untouched.
|
||||
|
||||
## `backend/services/s1_client.py`
|
||||
|
||||
### PowerQuery client
|
||||
- All raised exceptions now include the request body / status / query so the
|
||||
UI never shows a blank `"PowerQuery error: "`.
|
||||
- Non-JSON responses (HTML 5xx gateway pages) surface as a readable error
|
||||
string instead of crashing on `resp.json()`.
|
||||
|
||||
### Detection library: site-scope fallback (`get_platform_rules`)
|
||||
- Upstream hardcoded **account scope** which 403s with site-scoped API
|
||||
tokens. Added `get_scope_for_platform_rules()` that probes `/accounts`
|
||||
first, then `/sites`, returning whichever scope the token can access.
|
||||
- `get_account_id()` now also reads `accountId` from the `/sites` payload as
|
||||
a fallback for site-scoped tokens.
|
||||
|
||||
### SDL parser sync helpers
|
||||
- `list_sdl_parsers()` — rewritten to use the real **SDL Configuration File
|
||||
API** (`POST /api/listFiles` with `pathPrefix=/logParsers/`). Previously
|
||||
it hit a 404 path on the mgmt console.
|
||||
- `get_sdl_parser()` — rewritten to `POST /api/getFile` with `{path}`.
|
||||
- New `_sdl_config_headers()` helper that uses `SDL_CONFIG_READ_KEY` (a
|
||||
separate scope from `SDL_LOG_READ_KEY`).
|
||||
|
||||
## `backend/routers/ingest.py`
|
||||
|
||||
- `/api/ingest/simulate-filter`:
|
||||
* Rebuilt the query into valid SDL syntax — was generating
|
||||
`| group events=count()` (dangling pipe) for empty bodies; now uses a
|
||||
proper base expression and falls back to `dataSource.name!=''` baseline.
|
||||
* Field name corrected from `src.name` → `dataSource.name`.
|
||||
* Surfaces both `result["error"]` and exception text so blank
|
||||
`"PowerQuery error: "` messages are gone.
|
||||
|
||||
## `backend/routers/quality.py`
|
||||
|
||||
- `GET /api/quality/parsers`: lists actual parser filenames in
|
||||
`/app/parsers/` (drives the Test Runner dropdown).
|
||||
- **New `POST /api/quality/sync-from-sdl`**: downloads every parser file
|
||||
under `/logParsers/` on the SDL tenant into `/app/parsers/`. After this
|
||||
call returns, the Parser Test Runner dropdown automatically reflects all
|
||||
tenant parsers (including custom OCSF parsers like
|
||||
`Avelios-Medical-OCSF`). Requires `SDL_CONFIG_READ_KEY` in `.env`.
|
||||
- `_flatten_event`: when a PowerQuery row only carries a JSON-stringified
|
||||
payload in `message` (i.e. the parser isn't applied at query time), parse
|
||||
and flatten that JSON inline so the Field Population tool can measure real
|
||||
coverage.
|
||||
- `POST /api/quality/test-parser`:
|
||||
* Detects SDL JSON-mode parsers (`$=json{parse=json}$`) and parses log
|
||||
lines as JSON.
|
||||
* Applies parser `rewrites: [{input,output,match,replace}]` blocks with
|
||||
correct `$0/$N` backreference translation (`$0` was being mangled to a
|
||||
null byte).
|
||||
* Accepts single JSON object, JSON array, or NDJSON multi-line input.
|
||||
* Returns mode badge data + per-payload counters for the UI.
|
||||
|
||||
## `frontend/index.html`
|
||||
|
||||
- Parser Test Runner dropdown now loads from `/api/quality/parsers` instead
|
||||
of filtering the coverage map (which only has `detected in data`
|
||||
placeholders).
|
||||
- Field Population and Sample Events: added **Last 7d** lookback option.
|
||||
- Parser Test Runner UI: mode badge (`JSON auto-extract` vs `regex format`),
|
||||
payload counter for multi-line input, separate tables for extracted vs
|
||||
derived/rewritten fields.
|
||||
|
||||
## `docker-compose.yml`
|
||||
|
||||
- Pass `SDL_CONFIG_READ_KEY` through to the backend container.
|
||||
|
||||
## `.env.example` / `.gitignore`
|
||||
|
||||
- Document the new `SDL_CONFIG_READ_KEY` variable.
|
||||
- Broaden `.gitignore` so `parsers/*` (tenant-specific synced content) is
|
||||
not committed.
|
||||
|
||||
## New helper scripts (`tools/`)
|
||||
|
||||
- `sync_sdl_parsers.py` — pull all `/logParsers/*` from the tenant.
|
||||
- `probe_pq_syntax.py` — probe which PowerQuery syntaxes the tenant accepts.
|
||||
- `probe_avelios{,_wide,_fields}.py` — inspect a source's event presence,
|
||||
columns, and embedded JSON fields.
|
||||
- `test_avelios_parser.py`, `test_avelios_multi.py` — smoke-test the patched
|
||||
`/api/quality/test-parser` endpoint with single-line and multi-line input.
|
||||
- `probe_simulate_filter.py` — smoke-test the patched
|
||||
`/api/ingest/simulate-filter` endpoint with progressively larger windows.
|
||||
- `probe_sync_from_sdl.py` — call `/api/quality/sync-from-sdl` and verify
|
||||
that `/api/quality/parsers` then reflects the downloaded parsers.
|
||||
- `sdl_config.example.json` — template config (the toolkit's `.env` is
|
||||
separate from the SDL config used by these helper scripts).
|
||||
|
||||
## New `.env` knobs
|
||||
|
||||
```bash
|
||||
# PowerQuery transport tuning (both optional; defaults work for most tenants)
|
||||
SDL_PQ_TIMEOUT=600 # PowerQuery read timeout in seconds (default 600)
|
||||
SDL_PQ_TIMEOUT_RETRIES=1 # extra retries on ReadTimeout (default 1)
|
||||
|
||||
# Required for /api/quality/sync-from-sdl
|
||||
SDL_CONFIG_READ_KEY=... # Data Lake API key with Configuration Read scope
|
||||
```
|
||||
+168
@@ -0,0 +1,168 @@
|
||||
# SIEM-Toolkit Patches & Helper Scripts
|
||||
|
||||
A drop-in patch set that fixes several issues in the upstream
|
||||
[`mickbrowns1/SIEM-Toolkit`](https://github.com/mickbrowns1/SIEM-Toolkit) and
|
||||
adds helper scripts for syncing parsers from a SentinelOne SDL tenant and
|
||||
probing PowerQuery / event data.
|
||||
|
||||
## What's fixed in the upstream code
|
||||
|
||||
| File | Fix |
|
||||
|---|---|
|
||||
| `backend/routers/ingest.py` | **Filter Simulator** PowerQuery rewritten — replaced legacy `count() as events` and `src.name` field with valid SDL `\| filter dataSource.name=='X' \| group events=count()` |
|
||||
| `backend/routers/quality.py` | New `GET /api/quality/parsers` endpoint lists actual parser files; `_flatten_event` now JSON-parses nested `message` payloads so the **Field Population** tool reports real coverage (was always 0% for sources where the parser isn't applied at query time) |
|
||||
| `backend/routers/quality.py` (Parser Test Runner) | Detects SDL JSON auto-extract format `$=json{parse=json}$` and parses log lines as JSON; applies parser `rewrites` (`input/output/match/replace` blocks) with correct `$0`/`$N` backreference handling; accepts **single JSON / JSON array / NDJSON** input |
|
||||
| `frontend/index.html` | Parser dropdown now loads from `/api/quality/parsers` (was filtering `coverage/map` which only has `detected in data` placeholders); added **Last 7d** lookback to both Field Population and Sample Events; Test Runner UI now shows mode badge (`JSON auto-extract` vs `regex format`), payload count for multi-line input, and separate tables for extracted vs derived/rewritten fields |
|
||||
|
||||
## What's NOT fixed in the upstream code (configuration)
|
||||
|
||||
The repo's `docker-compose.yml` interpolates `S1_BASE_URL` etc. from
|
||||
`.env` at compose-up time. **A `docker compose restart` does NOT pick up
|
||||
`.env` changes** — always use `docker compose up -d --force-recreate backend`.
|
||||
|
||||
`S1_BASE_URL` must be the **per-tenant management console subdomain** (e.g.
|
||||
`usea1-XXXX.sentinelone.net`), not the regional SDL/XDR endpoint. If you
|
||||
only know the XDR URL, you can probe candidates with curl:
|
||||
|
||||
```bash
|
||||
TOKEN=$(jq -r .api_token < ~/.../mgmt-config.json)
|
||||
for H in usea1-yourtenant usea1-purple usea1-partners; do
|
||||
  printf "%-45s %s\n" "$H" \
    "$(curl -s -o /dev/null -w '%{http_code}' \
        "https://$H.sentinelone.net/web/api/v2.1/cloud-detection/rules?limit=1" \
        -H "Authorization: ApiToken $TOKEN")"
|
||||
done
|
||||
# 200 = correct host
|
||||
```
|
||||
|
||||
## Contents
|
||||
|
||||
```
|
||||
.
|
||||
├── README.md (this file)
|
||||
├── env.example template for the toolkit's .env
|
||||
├── sdl_config.example.json template for helper scripts' SDL config
|
||||
├── patched-files/
|
||||
│ ├── backend/routers/
|
||||
│ │ ├── ingest.py <- copy over upstream
|
||||
│ │ └── quality.py <- copy over upstream
|
||||
│ └── frontend/
|
||||
│ └── index.html <- copy over upstream
|
||||
└── scripts/
|
||||
├── sync_sdl_parsers.py pull all /logParsers/* from the tenant into ./parsers/
|
||||
├── probe_pq_syntax.py test what PowerQuery dialect the tenant accepts
|
||||
├── probe_avelios.py sample probe: find a source's events + columns
|
||||
├── probe_avelios_wide.py same, sweeping 1d/3d/7d
|
||||
├── probe_avelios_fields.py parse JSON `message` payloads & count fields
|
||||
├── test_avelios_parser.py hit /api/quality/test-parser with one JSON line
|
||||
└── test_avelios_multi.py same, with multi-line NDJSON
|
||||
```
|
||||
|
||||
## Applying the patches
|
||||
|
||||
1. Clone the upstream repo:
|
||||
```bash
|
||||
git clone https://github.com/mickbrowns1/SIEM-Toolkit.git
|
||||
cd SIEM-Toolkit
|
||||
```
|
||||
2. Overlay the patched files:
|
||||
```bash
|
||||
PATCH=/path/to/this/dir
|
||||
cp "$PATCH"/patched-files/backend/routers/quality.py backend/routers/quality.py
|
||||
cp "$PATCH"/patched-files/backend/routers/ingest.py backend/routers/ingest.py
|
||||
cp "$PATCH"/patched-files/frontend/index.html frontend/index.html
|
||||
```
|
||||
3. Configure:
|
||||
```bash
|
||||
cp "$PATCH"/env.example .env
|
||||
$EDITOR .env # fill in your real values
|
||||
```
|
||||
4. Start the stack:
|
||||
```bash
|
||||
docker compose up -d --build
|
||||
open http://localhost:3001
|
||||
```
|
||||
|
||||
## Helper-script setup
|
||||
|
||||
The helper scripts read a small JSON config (separate from the toolkit's `.env`)
|
||||
containing your SDL log-read / config-read keys:
|
||||
|
||||
```bash
|
||||
cp sdl_config.example.json scripts/sdl_config.json
|
||||
$EDITOR scripts/sdl_config.json
|
||||
# or set the env var
|
||||
export SDL_CONFIG=/somewhere/sdl_config.json
|
||||
```
|
||||
|
||||
## Helper-script usage
|
||||
|
||||
### Sync parsers from the SDL tenant into the toolkit's `parsers/` dir
|
||||
|
||||
```bash
|
||||
PARSERS_DIR=/path/to/SIEM-Toolkit/parsers \
  python3 scripts/sync_sdl_parsers.py
|
||||
```
|
||||
|
||||
`PARSERS_DIR` defaults to `../parsers` relative to the script.
|
||||
|
||||
### Probe PowerQuery syntax compatibility on your tenant
|
||||
|
||||
```bash
|
||||
python3 scripts/probe_pq_syntax.py
|
||||
```
|
||||
|
||||
Output tells you which command shapes (`| group ...`, `filter ...`, `count() as`, etc.)
|
||||
work on the active deployment.
|
||||
|
||||
### Inspect what a given source's events actually look like
|
||||
|
||||
```bash
|
||||
python3 scripts/probe_avelios.py # finds a source's name + 1-line sample
|
||||
python3 scripts/probe_avelios_wide.py # sweeps 1d/3d/7d top sources
|
||||
python3 scripts/probe_avelios_fields.py # if `message` is JSON, flatten & count fields
|
||||
```
|
||||
|
||||
The scripts are named `*_avelios` for the original use case but work for **any
|
||||
source** — open the file and change the `dataSource.name` filter.
|
||||
|
||||
### Smoke-test the patched Parser Test Runner endpoint
|
||||
|
||||
```bash
|
||||
python3 scripts/test_avelios_parser.py # single-line JSON
|
||||
python3 scripts/test_avelios_multi.py # multi-line NDJSON
|
||||
```
|
||||
|
||||
These hit `http://localhost:8001/api/quality/test-parser` directly so you can
|
||||
verify the backend without using the UI.
|
||||
|
||||
## Common pitfalls
|
||||
|
||||
- **Parser dropdown is empty** → run `sync_sdl_parsers.py`. The upstream "Load
|
||||
SDL Parsers" button only indexes whatever already exists in `parsers/`.
|
||||
- **Field Population shows 0% everywhere** → the source's parser isn't being
|
||||
applied at query time, so PowerQuery returns just `timestamp`+`message`.
|
||||
This patch's `_flatten_event` parses JSON inside `message`. Also try widening
|
||||
the window (the new **Last 7d** option) — some sources are low-volume.
|
||||
- **PowerQuery 400 "Unknown command [count]"** → fixed in `ingest.py`. If you
|
||||
hit it elsewhere, the rule is: SDL PowerQuery requires `| group events=count()`,
never `| count() as events`, and `count()` must be inside a `group`.
|
||||
- **STAR rules → 302 to /404** → `S1_BASE_URL` is pointed at the SDL/XDR URL
|
||||
instead of the management-console subdomain.
|
||||
|
||||
## Verification
|
||||
|
||||
After applying patches and recreating containers:
|
||||
|
||||
```bash
|
||||
curl http://localhost:8001/health
|
||||
curl http://localhost:8001/api/quality/parsers | python3 -m json.tool # count > 0
|
||||
curl 'http://localhost:8001/api/ingest/top-sources?hours=24' # real numbers
|
||||
curl -X POST http://localhost:8001/api/coverage/load-star-rules # not 502
|
||||
```
|
||||
|
||||
In the UI:
|
||||
- **Coverage Map**: shows `parsers_loaded` and `rules_loaded` > 0
|
||||
- **Ingest → Filter Simulator**: returns matched events + projected GB/month
|
||||
- **Parser Quality → Parser Test Runner**: dropdown lists all parsers
|
||||
- **Parser Quality → Field Population**: real coverage rates (not all 0%)
|
||||
@@ -0,0 +1,276 @@
|
||||
# SIEM Toolkit — SentinelOne AI-SIEM
|
||||
|
||||
> *Inspired by Pineapple Boy!* 🍍
|
||||
|
||||
A self-hosted troubleshooting and visibility tool for SentinelOne AI-SIEM SecOps engineers. Runs as a Docker Compose stack against your SentinelOne demo or production tenant and provides real-time insight into parser coverage, ingest volume, and data quality — all without leaving a single interface.
|
||||
|
||||
---
|
||||
|
||||
## What's Inside
|
||||
|
||||
| Page | Purpose |
|
||||
|---|---|
|
||||
| **Overview** | Live health stats — coverage percentage, active sources, top uncovered sources by volume |
|
||||
| **Parser Coverage Map** | Which active data sources have a parser? Which don't? |
|
||||
| **Ingest Dashboard** | Event volume, top sources, cost projection, filter simulator |
|
||||
| **Parser Quality** | Live event sampler, field population rate, parser test runner |
|
||||
| **Onboarding Accelerator** | Prompt template for onboarding new log sources with Claude Code |
|
||||
| **Settings** | Manage your `.env` credentials directly from the interface |
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
browser → nginx (port 3001) → single-page HTML/JS application
|
||||
↓ API calls
|
||||
FastAPI backend (port 8001)
|
||||
↓
|
||||
┌───────────────────────────┐
|
||||
│ PostgreSQL (SQLAlchemy) │ parser fields, active sources
|
||||
└───────────────────────────┘
|
||||
↓
|
||||
┌───────────────────────────┐
|
||||
│ SentinelOne APIs │
|
||||
│ • Management API │ demo.sentinelone.net
|
||||
│ • Scalyr XDR PowerQuery │ xdr.us1.sentinelone.net
|
||||
└───────────────────────────┘
|
||||
```
|
||||
|
||||
All services run via Docker Compose. The `parsers/` directory is volume-mounted into the backend so SDL parser files may be loaded without rebuilding the image.
|
||||
|
||||
---
|
||||
|
||||
## Setup
|
||||
|
||||
### 1. Clone and Configure
|
||||
|
||||
```bash
|
||||
git clone https://github.com/mickbrowns1/SIEM-Toolkit.git
|
||||
cd SIEM-Toolkit
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
Edit `.env` with your credentials:
|
||||
|
||||
```env
|
||||
S1_BASE_URL=https://demo.sentinelone.net # Your console URL
|
||||
S1_API_TOKEN=eyJ... # Service user API token (account scope or higher)
|
||||
SDL_XDR_URL=https://xdr.us1.sentinelone.net # Scalyr XDR endpoint
|
||||
SDL_LOG_READ_KEY=1j2IU0S... # Data Lake read key
|
||||
ANTHROPIC_API_KEY= # Optional — not currently used
|
||||
```
|
||||
|
||||
**S1_API_TOKEN** — generate at *Settings → Users → Service Users* in the console. The service user should be provisioned at **account scope** or higher.
|
||||
**SDL_LOG_READ_KEY** — found at *Settings → Integrations → Data Lake API Keys*.
|
||||
|
||||
### 2. Add the Detection Library (strongly recommended)
|
||||
|
||||
The Detection Fields Missing column and per-source detection counts on the Coverage Map require a local detections export. This is generated from the [detection-validator](https://github.com/mickbrowns1/detection-validator) repository.
|
||||
|
||||
```bash
|
||||
# Clone the detection-validator repo alongside this one
|
||||
git clone https://github.com/mickbrowns1/detection-validator.git
|
||||
cd detection-validator
|
||||
|
||||
# Follow its README to generate the export, then copy the output here:
|
||||
mkdir -p ../SIEM-Toolkit/data
|
||||
cp data/data/detections/extracted.json ../SIEM-Toolkit/data/detections.json
|
||||
|
||||
cd ../SIEM-Toolkit
|
||||
```
|
||||
|
||||
The `data/` directory is gitignored and never committed. Once the stack is running, click **Load Detections** on the Coverage Map to import the rules into the database.
|
||||
|
||||
### 3. Add Parser Files (optional but strongly recommended)
|
||||
|
||||
Place your SDL parser JSON files into the `parsers/` directory. The backend reads them directly at query time — no rebuild is necessary.
|
||||
|
||||
```bash
|
||||
cp ~/my-parsers/*.json parsers/
|
||||
```
|
||||
|
||||
### 4. Start the Stack
|
||||
|
||||
```bash
|
||||
docker-compose up -d --build
|
||||
```
|
||||
|
||||
Open **http://localhost:3001** in your browser and you're off.
|
||||
|
||||
---
|
||||
|
||||
## Features
|
||||
|
||||
### Overview Dashboard
|
||||
|
||||
The landing page gives you an at-a-glance health summary drawn live from the database:
|
||||
|
||||
- **Parser Coverage %** — proportion of active sources with a confirmed parser
|
||||
- **Active Sources** — total number of `dataSource.name` values seen in the last 7 days
|
||||
- **Covered / Need Parser** — counts for each status
|
||||
|
||||
If any sources are uncovered, the **Top Sources Needing a Parser** table lists the highest-volume offenders. Click any source name to jump directly to the Parser Quality page with that source pre-selected.
|
||||
|
||||
---
|
||||
|
||||
### Parser Coverage Map
|
||||
|
||||
Answers the question: *does each active data source have a parser running?*
|
||||
|
||||
**How it works:**
|
||||
|
||||
1. **Sync Live Sources** — executes a PowerQuery against your data lake to retrieve every `dataSource.name` seen in the last 7 days, along with event counts.
|
||||
2. **Load SDL Parsers** — reads parser files from `parsers/`, extracts the `dataSource.name` attribute from each, and stores the field list in the database.
|
||||
|
||||
**Matching logic (three-tier):**
|
||||
1. Exact `dataSource.name` match between the active source and the parser attribute
|
||||
2. Normalised substring match (ignores spaces, dashes, and case) between the active source name and the parser's `dataSource.name`
|
||||
3. Normalised substring match against the parser filename — catches files where the `dataSource.name` attribute is incorrect or missing
|
||||
|
||||
**Parser detection from data:** During sync, a parallel PowerQuery checks whether each source has events with `event.type` populated in the data lake. If so, a parser is confirmed as running — the source is marked **Covered** even without a local parser file. This handles built-in and cloud-managed parsers that are not present in your `parsers/` folder.
|
||||
|
||||
**Status values:**
|
||||
- 🟢 **Covered** — custom parser confirmed (local file or detected via parsed events in the data lake)
|
||||
- 🔴 **Parser Needed** — no parser found, or only a grok/dottedJson format (which typically indicates an incomplete parser)
|
||||
|
||||
**Filters:** Use the filter pills to focus on Custom Parser only, Default Parser Only (data lake detected), or No Parser.
|
||||
|
||||
**Deep link:** Click any source name in the table to open it directly in Parser Quality with all dropdowns pre-populated.
|
||||
|
||||
**Expected results:** After syncing sources and loading parsers, sources with active SDL parsers will appear as Covered. Sources sending raw, unparsed data — where only `message` and `timestamp` appear in the data lake — will appear as Parser Needed.
|
||||
|
||||
---
|
||||
|
||||
### Ingest Dashboard
|
||||
|
||||
Answers the question: *where is my event volume coming from, and what would happen if I filtered some of it?*
|
||||
|
||||
**Time range:** 1h (default), 3d, 5d, 7d
|
||||
|
||||
**Daily Event Volume** — bar chart of total events per day. In 1h mode, this switches to a by-source breakdown of the current hour's activity.
|
||||
|
||||
**Top Sources** — a table of the 25 highest-volume `dataSource.name` values with event count and estimated GB (calculated at 0.5 GB per million events).
|
||||
|
||||
**Filter Simulator** — enter a source name and an optional event type, then press Simulate. The backend runs a live PowerQuery counting matching events and projects:
|
||||
- Matched events in the selected period
|
||||
- Estimated GB that would be saved
|
||||
- Projected monthly events and GB if the filter were applied permanently
|
||||
|
||||
This is entirely read-only — no filter is created or applied. Use the results to inform an exclusion rule you apply manually in the console.
|
||||
|
||||
**Expected results:** Top sources should reflect what you see in the SentinelOne console PowerQuery tool. The filter simulator provides a reasonable GB estimate assuming uniform event size across the source.
|
||||
|
||||
---
|
||||
|
||||
### Parser Quality
|
||||
|
||||
Three tools for diagnosing parser extraction failures.
|
||||
|
||||
#### Live Event Sampler
|
||||
|
||||
Pulls raw events from a selected source directly from the data lake and renders every field that came back. The `message` column is pinned to the right of the table, with a **⎘ copy** button on each row for convenient extraction of raw log lines.
|
||||
|
||||
- **Empty fields** are displayed as `∅` in grey — immediately highlighting fields the parser is failing to populate
|
||||
- **Healthy source:** many fields populated (`src.ip`, `user.name`, `event.type`, etc.), with `message` present as the raw log backup
|
||||
- **Unhealthy source:** only `timestamp` and `message` populated — the parser is not extracting anything of value
|
||||
|
||||
#### Field Population Rate
|
||||
|
||||
Samples up to 500 events from a source and measures what percentage of them have each field populated. Results are sorted worst-first so the most pressing gaps are immediately visible.
|
||||
|
||||
When you select a source, the tool automatically discovers which fields exist in that source's events and pre-fills the field list — merged with SDL schema defaults. The list is fully editable before running the analysis.
|
||||
|
||||
**Colour coding:**
|
||||
- 🟢 ≥ 80% — healthy extraction
|
||||
- 🟡 40–79% — partial extraction; check your regex patterns
|
||||
- 🔴 < 40% — field is rarely populated; the parser is likely not matching this log format variant
|
||||
|
||||
**Healthy parser:** Key fields such as `src.ip`, `event.type`, and `user.name` should sit between 70–100%. Niche fields like `src.process.cmdline` or `tgt.file.path` will naturally be lower, as not every event type produces them.
|
||||
|
||||
**Broken parser:** All SDL fields at 0%, with only `timestamp` and `message` visible in the "fields seen in sample" chip list at the bottom of the results.
|
||||
|
||||
#### Parser Test Runner
|
||||
|
||||
Paste a raw log line, select a loaded parser, and press Test. The backend extracts SDL `$field=pattern$` format strings from the parser file, converts them to Python named-group regular expressions, and tries each against your log line.
|
||||
|
||||
- **Matched:** displays the format string that matched and every field extracted with its value
|
||||
- **No match:** none of the parser's format strings apply to this log line — the log may contain a format variant the parser does not yet cover
|
||||
|
||||
> **Note:** Only parsers using SDL custom format strings are supported by the test runner. Grok and dottedJson parsers are not currently testable here.
|
||||
|
||||
---
|
||||
|
||||
### Onboarding Accelerator
|
||||
|
||||
A prompt template for using Claude Code to onboard a new log source. Copy the template, paste a sample of raw log lines, and Claude Code will generate:
|
||||
|
||||
- An SDL parser skeleton in augmented-JSON format
|
||||
- Field mappings to the SDL common schema
|
||||
- Parser test assertions
|
||||
|
||||
No Anthropic API key is required — this uses Claude Code directly from your terminal.
|
||||
|
||||
---
|
||||
|
||||
### Settings
|
||||
|
||||
Read and write your `.env` credentials from the interface. Secret fields (API tokens, keys) are masked by default with a show/hide toggle. Changes are written to the mounted `.env` file and take effect after restarting the backend:
|
||||
|
||||
```bash
|
||||
docker-compose up -d --build backend
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Rebuilding
|
||||
|
||||
```bash
|
||||
# Full rebuild
|
||||
docker-compose up -d --build
|
||||
|
||||
# Backend only (after Python changes)
|
||||
docker-compose up -d --build backend
|
||||
|
||||
# Frontend only (after HTML/JS changes)
|
||||
docker-compose up -d --build frontend
|
||||
|
||||
# Reset the database
|
||||
curl -X DELETE http://localhost:8001/api/coverage/reset
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Project Layout
|
||||
|
||||
```
|
||||
.
|
||||
├── backend/
|
||||
│ ├── main.py # FastAPI application, router registration
|
||||
│ ├── db.py # SQLAlchemy models
|
||||
│ ├── routers/
|
||||
│ │ ├── coverage.py # Parser coverage map endpoints
|
||||
│ │ ├── ingest.py # Ingest dashboard + filter simulator
|
||||
│ │ ├── quality.py # Parser quality tools
|
||||
│ │ └── settings.py # .env read/write
|
||||
│ └── services/
|
||||
│ ├── s1_client.py # SentinelOne + Scalyr API client
|
||||
│ └── rule_parser.py # SDL format string field extraction
|
||||
├── frontend/
|
||||
│ └── index.html # Single-page application (Tailwind, vanilla JS)
|
||||
├── parsers/ # SDL parser files (volume-mounted)
|
||||
├── db/
|
||||
│ └── init.sql # Postgres initialisation (tables created by SQLAlchemy)
|
||||
├── docker-compose.yml
|
||||
├── .env.example
|
||||
└── README.md
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Notes
|
||||
|
||||
- The backend queries your **demo tenant** (`demo.sentinelone.net`) — not usea1-purple or any other tenant. Ensure your `S1_BASE_URL` and `SDL_LOG_READ_KEY` are pointed at the same tenant.
|
||||
- Parser files in `parsers/` are read at query time, not on startup — add or update files at any point without rebuilding the image.
|
||||
- The filter simulator is entirely read-only and makes no changes whatsoever to your tenant configuration.
|
||||
- The service user API token must be at **account scope** or higher. Site-scoped tokens will have limited visibility into rules and may see reduced source counts.
|
||||
@@ -0,0 +1,10 @@
|
||||
# Backend image, built on the python:3.12-slim base image.
FROM python:3.12-slim
|
||||
|
||||
# Relative paths in the instructions below resolve against /app.
WORKDIR /app
|
||||
|
||||
# requirements.txt is copied and installed before the rest of the source
# (presumably so the pip layer stays cached when only code changes).
COPY requirements.txt .
|
||||
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy the remaining build context into /app.
COPY . .
|
||||
|
||||
# Serve main:app with uvicorn on all interfaces, container port 8000.
# NOTE(review): --reload restarts the server on source changes and is a
# development-oriented flag; confirm it is wanted for non-dev deployments.
# NOTE(review): the README refers to port 8001, presumably a host mapping in
# docker-compose.yml; verify.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
|
||||
@@ -0,0 +1,55 @@
|
||||
import os
from datetime import datetime, timezone

from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime, Text
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import declarative_base, sessionmaker
|
||||
|
||||
# Connection string; the DATABASE_URL environment variable overrides the
# built-in default.
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://siem:siem@db:5432/siem")

# One engine for the whole process; every session is bound to it.
engine = create_engine(DATABASE_URL)

# Session factory: sessions neither autoflush nor autocommit.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)

# Declarative base class shared by all ORM models in this module.
Base = declarative_base()
|
||||
|
||||
|
||||
class ParsedRule(Base):
    """ORM model for the ``parsed_rules`` table.

    Each row holds one rule, identified by a unique ``rule_id``.
    """

    __tablename__ = "parsed_rules"

    id = Column(Integer, primary_key=True)
    rule_id = Column(String, unique=True, index=True)
    name = Column(String)
    # e.g. 'star' or 'sigma'; the startup loader in main.py also queries
    # for rows with rule_type 'library'.
    rule_type = Column(String)
    # JSONB payload; presumably the event fields the rule references
    # (TODO confirm against the importer).
    fields_used = Column(JSONB)
    raw = Column(Text)
    # datetime.utcnow() is deprecated since Python 3.12. This default yields
    # the same naive UTC timestamp, so stored values keep their meaning.
    cached_at = Column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )
|
||||
|
||||
|
||||
class ParserField(Base):
    """ORM model for the ``parser_fields`` table.

    Each row pairs a parser name with one field name and that field's type.
    """

    __tablename__ = "parser_fields"

    id = Column(Integer, primary_key=True)

    # Indexed, so all fields of one parser can be looked up by name.
    parser_name = Column(String, index=True)

    field_name = Column(String)
    field_type = Column(String)
|
||||
|
||||
|
||||
class ActiveSource(Base):
    """ORM model for the ``active_sources`` table.

    Each row holds one source, identified by a unique ``source_name``.
    """

    __tablename__ = "active_sources"

    id = Column(Integer, primary_key=True)
    source_name = Column(String, unique=True, index=True)
    event_count = Column(Integer, default=0)
    # datetime.utcnow() is deprecated since Python 3.12. This default yields
    # the same naive UTC timestamp, so stored values keep their meaning.
    synced_at = Column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )
    parser_detected = Column(Integer, default=0)  # >0 means parsed events seen in data lake
|
||||
|
||||
|
||||
class IngestSnapshot(Base):
    """ORM model for the ``ingest_snapshots`` table.

    Each row stores a JSONB ``data`` payload together with the ``period_days``
    value it was recorded for.
    """

    __tablename__ = "ingest_snapshots"

    id = Column(Integer, primary_key=True)
    period_days = Column(Integer)
    data = Column(JSONB)
    # datetime.utcnow() is deprecated since Python 3.12. This default yields
    # the same naive UTC timestamp, so stored values keep their meaning.
    recorded_at = Column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
    )
|
||||
|
||||
|
||||
def get_db():
    """Yield a new database session and close it afterwards.

    This is a generator: it creates a session from ``SessionLocal``, yields
    it once, and closes it in a ``finally`` clause when the generator is
    finalised, whether or not the consumer raised.

    Yields:
        A session created by ``SessionLocal``.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
||||
@@ -0,0 +1,68 @@
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from db import engine, Base, get_db, ParsedRule
|
||||
from routers import coverage, ingest, settings, quality
|
||||
|
||||
# Create any tables declared on Base that do not exist yet.
Base.metadata.create_all(bind=engine)

# Runtime migration: add columns that didn't exist in earlier schema versions
from sqlalchemy import text

# IF NOT EXISTS makes the statement safe to run on every start.
_ADD_PARSER_DETECTED_SQL = "ALTER TABLE active_sources ADD COLUMN IF NOT EXISTS parser_detected INTEGER DEFAULT 0"

with engine.connect() as _conn:
    _conn.execute(text(_ADD_PARSER_DETECTED_SQL))
    _conn.commit()

app = FastAPI(title="SIEM Toolkit", version="1.0.0")
|
||||
|
||||
|
||||
@app.on_event("startup")
async def auto_load_detections():
    """
    Auto-load detection library rules on startup.

    Order of attempts:
      1. Skip entirely if any 'library' rules are already stored — use the
         'Sync Library' button to force a refresh.
      2. Live S1 API (accurate 'sources' field).
      3. Local detections file (DETECTIONS_FILE, default
         /app/data/detections.json), if it exists.

    Import failures are logged and rolled back; they never abort startup.
    """
    import logging
    import os
    from sqlalchemy.orm import Session
    from services import s1_client

    log = logging.getLogger(__name__)

    # Drive the dependency generator by hand; closing it below runs its own
    # cleanup, which closes the session.
    db_gen = get_db()
    db: Session = next(db_gen)
    try:
        if db.query(ParsedRule).filter_by(rule_type="library").count() > 0:
            return  # Already loaded — skip until user manually refreshes

        # Try live API first
        try:
            rules = await s1_client.get_platform_rules()
            if rules:
                coverage._import_from_api_rules(db, rules)
                return
        except Exception:
            log.warning("Live detection-library import failed; trying local file", exc_info=True)
            # A failed flush/commit leaves the session unusable until rolled back.
            db.rollback()

        # Fall back to local file
        detections_file = os.environ.get("DETECTIONS_FILE", "/app/data/detections.json")
        if os.path.exists(detections_file):
            try:
                coverage._import_detections(db, detections_file)
            except Exception:
                log.exception("Could not import detections from %s", detections_file)
                db.rollback()
    finally:
        db_gen.close()
|
||||
|
||||
# Allow the browser front end (served from localhost:3001) to call the API
# with credentials, using any method and any header.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3001"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount each feature router under its own /api/<area> prefix.
app.include_router(coverage.router, prefix="/api/coverage", tags=["Coverage"])
app.include_router(ingest.router, prefix="/api/ingest", tags=["Ingest"])
app.include_router(settings.router, prefix="/api/settings", tags=["Settings"])
app.include_router(quality.router, prefix="/api/quality", tags=["Quality"])
|
||||
|
||||
|
||||
@app.get("/health")
def health():
    """Liveness probe: unconditionally report that the service is up."""
    payload = {"status": "ok"}
    return payload
|
||||
@@ -0,0 +1,9 @@
|
||||
fastapi==0.115.0
|
||||
uvicorn[standard]==0.30.0
|
||||
httpx==0.27.2
|
||||
psycopg2-binary==2.9.9
|
||||
sqlalchemy==2.0.36
|
||||
pydantic==2.9.2
|
||||
pydantic-settings==2.6.1
|
||||
pyyaml==6.0.2
|
||||
python-multipart==0.0.12
|
||||
@@ -0,0 +1,648 @@
|
||||
import json
|
||||
import os
|
||||
from fastapi import APIRouter, UploadFile, File, Depends, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.orm import Session
|
||||
from datetime import datetime
|
||||
from db import get_db, ParsedRule, ParserField, ActiveSource
|
||||
from services import s1_client, rule_parser
|
||||
|
||||
# Location of the local detections export used when the live API is not
# available; overridable through the DETECTIONS_FILE environment variable.
DETECTIONS_FILE = os.environ.get("DETECTIONS_FILE", "/app/data/detections.json")

# Router holding every coverage endpoint defined in this module.
router = APIRouter()
|
||||
|
||||
|
||||
def _star_query_texts(rule: dict) -> list[str]:
|
||||
"""
|
||||
Extract all PowerQuery/filter strings from a STAR rule.
|
||||
Handles simple rules (s1ql) and correlation rules (subQueries[].subQuery).
|
||||
"""
|
||||
texts = []
|
||||
|
||||
# Simple rules
|
||||
for field in ("s1ql", "queryLang", "query", "powerQuery"):
|
||||
v = rule.get(field)
|
||||
# queryLang "2.0" is a version string, not a query — skip short strings
|
||||
if v and isinstance(v, str) and len(v) > 5:
|
||||
texts.append(v)
|
||||
|
||||
# Correlation rules: subQueries[].subQuery
|
||||
cp = rule.get("correlationParams") or {}
|
||||
for sq in cp.get("subQueries", []):
|
||||
v = sq.get("subQuery")
|
||||
if v and isinstance(v, str):
|
||||
texts.append(v)
|
||||
# Also handle older conditions[] format
|
||||
for cond in cp.get("conditions", []):
|
||||
for key in ("filter", "query", "subQuery"):
|
||||
v = cond.get(key)
|
||||
if v and isinstance(v, str):
|
||||
texts.append(v)
|
||||
|
||||
return texts
|
||||
|
||||
|
||||
@router.post("/load-star-rules")
async def load_star_rules(db: Session = Depends(get_db)):
    """Fetch all STAR rules from the Management Console API and index their fields.

    All stored 'star' rules are replaced in a single transaction. Rules that
    repeat an id already seen in the response are skipped, and rules without
    an id get a positional fallback id, because ``rule_id`` is unique.

    Raises:
        HTTPException(502): the Management Console API call failed.
    """
    try:
        rules = await s1_client.get_star_rules()
    except Exception as e:
        raise HTTPException(502, f"S1 API error: {type(e).__name__}: {e}") from e

    # Replace all existing STAR rules cleanly to avoid duplicate key errors.
    # Nothing is committed until the end, so a failure keeps the old set.
    db.query(ParsedRule).filter_by(rule_type="star").delete()
    db.flush()

    loaded = []
    seen_ids: set = set()
    for rule in rules or []:
        raw_id = rule.get("id")
        rule_id = str(raw_id) if raw_id not in (None, "") else f"star_{len(loaded)}"
        if rule_id in seen_ids:
            continue
        seen_ids.add(rule_id)

        all_fields: set = set()
        for qt in _star_query_texts(rule):
            all_fields |= rule_parser.extract_star_fields(qt)
        fields = list(all_fields)

        name = rule.get("name", "unnamed")
        db.add(ParsedRule(
            rule_id=rule_id,
            name=name,
            rule_type="star",
            fields_used=fields,
            raw=json.dumps(rule),
        ))
        loaded.append({"id": rule_id, "name": name, "fields": fields})

    db.commit()
    return {"loaded": len(loaded), "rules": loaded}
|
||||
|
||||
|
||||
# Library detections whose "file" path starts with one of these prefixes are
# left out when the detections file is imported.
_EXCLUDED_PATHS = (
    "/rules/silent/",
    "/rules/dev/",
)
|
||||
|
||||
|
||||
def _import_from_api_rules(db, rules: list) -> int:
    """
    Import platform rules fetched directly from the S1 API into the database.

    Each rule has a 'sources' list — the authoritative dataSource.name values.
    Existing 'library' rules are replaced atomically: the delete and the
    inserts share one transaction, so a failed import leaves the previous
    library untouched.

    Args:
        db: open SQLAlchemy session.
        rules: rule dicts as returned by the API.

    Returns:
        Number of rules stored (rules repeating an id are skipped).

    Raises:
        Whatever the database layer raises; the session is rolled back first
        so the caller can keep using it.
    """
    loaded = 0
    seen_ids: set = set()
    try:
        db.query(ParsedRule).filter_by(rule_type="library").delete()
        db.flush()

        for rule in rules:
            raw_id = rule.get("id")
            # A missing or null id gets a positional fallback.
            rule_id = str(raw_id) if raw_id is not None else f"lib_{loaded}"
            if rule_id in seen_ids:
                continue
            seen_ids.add(rule_id)

            sources = rule.get("sources") or []
            db.add(ParsedRule(
                rule_id=rule_id,
                name=rule.get("name", "unnamed"),
                rule_type="library",
                fields_used=[],  # API rules don't expose field-level info
                raw=json.dumps({"data_sources": sources}),
            ))
            loaded += 1
            if loaded % 500 == 0:
                db.flush()  # send inserts in batches; still uncommitted

        db.commit()
    except Exception:
        db.rollback()
        raise
    return loaded
|
||||
|
||||
|
||||
def _import_detections(db, detections_file: str) -> int:
    """
    Import library detection rules from the detections JSON file into the database.

    The file must hold a JSON object with a "results" list. Entries whose
    "file" path starts with one of _EXCLUDED_PATHS are ignored. Existing
    'library' rules are replaced atomically (delete and inserts share one
    transaction).

    Args:
        db: open SQLAlchemy session.
        detections_file: path of the JSON file to read.

    Returns:
        The count of rules loaded (entries repeating an id are skipped).

    Raises:
        OSError / json.JSONDecodeError: the file cannot be read or parsed.
        ValueError: the top-level JSON value is not an object.
        Database errors are re-raised after the session has been rolled back.
    """
    with open(detections_file, "r", encoding="utf-8") as fh:
        data = json.load(fh)
    if not isinstance(data, dict):
        raise ValueError("detections file must contain a JSON object with a 'results' list")

    results = [
        r for r in (data.get("results") or [])
        # str.startswith accepts a tuple of prefixes; tolerate "file": null.
        if not (r.get("file") or "").startswith(_EXCLUDED_PATHS)
    ]

    loaded = 0
    seen_ids: set = set()
    try:
        db.query(ParsedRule).filter_by(rule_type="library").delete()
        db.flush()

        for rule in results:
            all_fields: set = set()
            data_sources: list[str] = []
            for q in rule.get("queries") or []:
                all_fields.update(q.get("keys") or [])
                ds_vals = (q.get("pairs") or {}).get("dataSource.name") or []
                if isinstance(ds_vals, str):
                    # A bare string would otherwise be iterated per character.
                    ds_vals = [ds_vals]
                for v in ds_vals:
                    if isinstance(v, str):
                        data_sources.append(v)
                    elif isinstance(v, list):
                        data_sources.extend(str(x) for x in v)

            raw_id = rule.get("id")
            rule_id = str(raw_id) if raw_id is not None else f"lib_{loaded}"
            if rule_id in seen_ids:
                continue
            seen_ids.add(rule_id)

            db.add(ParsedRule(
                rule_id=rule_id,
                name=rule.get("name", "unnamed"),
                rule_type="library",
                fields_used=list(all_fields),
                # sorted() keeps the stored list stable between imports
                raw=json.dumps({"data_sources": sorted(set(data_sources))}),
            ))
            loaded += 1
            if loaded % 500 == 0:
                db.flush()  # send inserts in batches; still uncommitted

        db.commit()
    except Exception:
        db.rollback()
        raise
    return loaded
|
||||
|
||||
|
||||
@router.post("/load-detections")
async def load_detections(db: Session = Depends(get_db)):
    """
    Reload detection library rules.

    Tries the live S1 API first (platform-rules endpoint); falls back to the
    local detections file (DETECTIONS_FILE).

    Returns:
        {"loaded": <count>, "source": "api" | "file"}

    Raises:
        HTTPException(404): the API path did not succeed and no detections file exists.
        HTTPException(500): the detections file exists but could not be imported.
    """
    import logging

    # Prefer the live API — gives accurate 'sources' and is always up to date
    try:
        rules = await s1_client.get_platform_rules()
        if rules:
            loaded = _import_from_api_rules(db, rules)
            return {"loaded": loaded, "source": "api"}
    except Exception:
        logging.getLogger(__name__).warning(
            "Live platform-rules import failed; falling back to %s",
            DETECTIONS_FILE,
            exc_info=True,
        )
        # Without a rollback a failed flush/commit would poison the fallback.
        db.rollback()

    # Fall back to the local detections file
    if not os.path.exists(DETECTIONS_FILE):
        raise HTTPException(
            404,
            "S1 API unavailable and no detections file found — "
            "ensure the data/ volume is mounted with detections.json"
        )
    try:
        loaded = _import_detections(db, DETECTIONS_FILE)
    except Exception as e:
        db.rollback()
        raise HTTPException(500, f"Failed to import detections: {e}") from e
    return {"loaded": loaded, "source": "file"}
|
||||
|
||||
|
||||
@router.post("/upload-sigma")
async def upload_sigma(files: list[UploadFile] = File(...), db: Session = Depends(get_db)):
    """Upload one or more Sigma YAML files and index their fields.

    Each file is stored under rule_id ``sigma_<filename>``. Uploading a file
    name again updates the existing row: rows are looked up by ``rule_id``
    because ``Session.merge`` matches on the primary key only and would
    insert a duplicate, violating the unique constraint.

    Returns:
        {"loaded": <count>, "rules": [{"name": ..., "fields": [...]}, ...]}
    """
    loaded = []
    # Records touched in this request, so a file name repeated within one
    # upload reuses the pending row (the session does not autoflush).
    records: dict = {}
    for file in files:
        content = (await file.read()).decode("utf-8", errors="replace")
        fields = list(rule_parser.extract_sigma_fields(content))
        rule_id = f"sigma_{file.filename}"

        record = records.get(rule_id)
        if record is None:
            record = db.query(ParsedRule).filter_by(rule_id=rule_id).first()
        if record is None:
            record = ParsedRule(rule_id=rule_id)
            db.add(record)
        records[rule_id] = record

        record.name = file.filename or "unnamed"
        record.rule_type = "sigma"
        record.fields_used = fields
        record.raw = content
        loaded.append({"name": file.filename, "fields": fields})

    db.commit()
    return {"loaded": len(loaded), "rules": loaded}
|
||||
|
||||
|
||||
@router.post("/load-parsers-from-sdl")
async def load_parsers_from_sdl(db: Session = Depends(get_db)):
    """
    Index every parser file found in /app/parsers.

    For each non-hidden regular file the content is read, fields are
    extracted (structured JSON extraction when the file parses as JSON, plus
    content-string extraction in every case) and the ParserField rows for
    that file name are replaced. A failure on one file is recorded under
    "errors" and does not stop the others.

    Raises:
        HTTPException(503): the directory does not exist.
        HTTPException(422): the directory contains no parser files.
    """
    parsers_dir = "/app/parsers"

    try:
        entries = []
        for candidate in os.scandir(parsers_dir):
            if candidate.is_file() and not candidate.name.startswith("."):
                entries.append(candidate)
    except FileNotFoundError:
        raise HTTPException(503, "parsers/ directory not found — check Docker volume mount")

    if len(entries) == 0:
        raise HTTPException(
            422,
            "No parser files found in parsers/ directory. "
            "Use 'Load SDL Parsers via MCP' in Claude Code to populate it, "
            "or upload a parser file manually."
        )

    loaded = []
    errors = []
    for entry in entries:
        try:
            with open(entry.path, "r", encoding="utf-8", errors="replace") as handle:
                text_content = handle.read()

            # Structured extraction is best-effort: any failure leaves it empty.
            fields: set = set()
            try:
                fields = rule_parser.extract_parser_fields(json.loads(text_content))
            except Exception:
                pass
            fields |= rule_parser.extract_parser_fields_from_content(text_content)

            parser_name = entry.name
            db.query(ParserField).filter_by(parser_name=parser_name).delete()
            for field_name in fields:
                db.add(ParserField(parser_name=parser_name, field_name=field_name, field_type="string"))
            loaded.append({"parser": parser_name, "fields": list(fields), "field_count": len(fields)})
        except Exception as exc:
            errors.append({"parser": entry.name, "error": str(exc)})

    db.commit()
    return {"loaded": len(loaded), "parsers": loaded, "errors": errors}
|
||||
|
||||
|
||||
@router.post("/upload-parser")
async def upload_parser(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """Index the output fields of one uploaded SDL parser file.

    The ParserField rows stored under the uploaded file name are replaced by
    the fields extracted from the new content.

    Returns:
        {"parser": <file name>, "fields": [<field name>, ...]}
    """
    text_content = (await file.read()).decode("utf-8", errors="replace")
    parser_name = file.filename

    # Structured extraction applies only when the content is valid JSON.
    fields: set = set()
    try:
        parsed = json.loads(text_content)
    except json.JSONDecodeError:
        pass
    else:
        fields = rule_parser.extract_parser_fields(parsed)

    # Always also run content-string extraction (catches $field$ SDL format strings)
    fields |= rule_parser.extract_parser_fields_from_content(text_content)

    db.query(ParserField).filter_by(parser_name=parser_name).delete()
    for field_name in fields:
        db.add(ParserField(parser_name=parser_name, field_name=field_name, field_type="string"))
    db.commit()

    return {"parser": parser_name, "fields": list(fields)}
|
||||
|
||||
|
||||
# Request body for POST /load-parser-content.
class ParserContentPayload(BaseModel):
    # Name the extracted fields are stored under.
    parser_name: str
    # raw SDL parser file content as string
    content: str
|
||||
|
||||
|
||||
@router.post("/load-parser-content")
async def load_parser_content(payload: ParserContentPayload, db: Session = Depends(get_db)):
    """
    Index the output fields of an SDL parser supplied as a raw string.

    Used by MCP-based loader scripts since the SDL HTTP API endpoint is not
    accessible from inside Docker with standard API token auth.

    Raises:
        HTTPException(422): neither extraction method produced any field.
    """
    raw_text = payload.content
    target = payload.parser_name

    # Structured extraction (attributes/fields/mappings) is best-effort.
    fields: set = set()
    try:
        fields = rule_parser.extract_parser_fields(json.loads(raw_text))
    except Exception:
        pass

    # Always run SDL format-string extraction ($field.name$ patterns)
    fields |= rule_parser.extract_parser_fields_from_content(raw_text)

    if len(fields) == 0:
        raise HTTPException(422, "No fields could be extracted from the parser content")

    db.query(ParserField).filter_by(parser_name=target).delete()
    for field_name in fields:
        db.add(ParserField(parser_name=target, field_name=field_name, field_type="string"))
    db.commit()

    return {"parser": target, "fields": list(fields), "field_count": len(fields)}
|
||||
|
||||
|
||||
# Native SentinelOne platform sources — parsed by the system, not by SDL parsers.
|
||||
# Excluded from the coverage map as they do not require custom parser coverage.
|
||||
_S1_NATIVE_SOURCES = {
|
||||
"SentinelOne", "asset", "alert", "vulnerability",
|
||||
"ActivityFeed", "indicator", "misconfiguration",
|
||||
"SentinelOne Ranger AD",
|
||||
}
|
||||
|
||||
|
||||
@router.post("/sync-sources")
async def sync_sources(days: int = 7, db: Session = Depends(get_db)):
    """Pull active dataSource.names from the SDL and store them.

    Also detects whether a parser is already producing structured fields
    for each source by checking if event.type is populated in the data lake.
    Native S1 platform sources are excluded as they do not require SDL parsers.

    The stored ActiveSource rows are fully replaced by the result.

    Args:
        days: look-back window in days; must be at least 1.

    Returns:
        {"synced": <count>, "sources": [<dataSource.name>, ...]}

    Raises:
        HTTPException(422): ``days`` is smaller than 1.
        HTTPException(502): a PowerQuery call failed.
    """
    import asyncio
    from datetime import datetime, timedelta

    if days < 1:
        raise HTTPException(422, "days must be at least 1")

    stamp = "%Y-%m-%dT%H:%M:%S.000Z"
    now = datetime.utcnow()
    from_dt = (now - timedelta(days=days)).strftime(stamp)
    to_dt = now.strftime(stamp)

    try:
        volume_result, parsed_result = await asyncio.gather(
            s1_client.run_powerquery(
                "| group events=count() by dataSource.name | sort -events | limit 200",
                from_dt, to_dt
            ),
            s1_client.run_powerquery(
                "| filter event.type != '' | group parsed=count() by dataSource.name | limit 200",
                from_dt, to_dt
            ),
        )
    except Exception as e:
        raise HTTPException(502, f"PowerQuery error: {e}") from e

    def _rows(result) -> list:
        """Return the 'events' rows of a PowerQuery result, or [] if absent."""
        if not isinstance(result, dict):
            return []
        return result.get("events") or []

    # Build lookup: source_name → count of parsed events seen
    parsed_by_source: dict[str, int] = {}
    for row in _rows(parsed_result):
        name = row.get("dataSource.name")
        if name:
            parsed_by_source[name] = row.get("parsed") or 0

    db.query(ActiveSource).delete()
    synced_at = datetime.utcnow()
    stored: list[str] = []
    seen: set = set()
    for row in _rows(volume_result):
        name = row.get("dataSource.name")
        if not name or name in _S1_NATIVE_SOURCES:
            continue
        if name in seen:
            # source_name is unique; a repeated row would abort the commit
            continue
        seen.add(name)
        db.add(ActiveSource(
            source_name=name,
            event_count=row.get("events") or 0,
            synced_at=synced_at,
            parser_detected=parsed_by_source.get(name, 0),
        ))
        stored.append(name)

    db.commit()
    return {"synced": len(stored), "sources": stored}
|
||||
|
||||
|
||||
def _build_parser_ds_index() -> dict[str, dict]:
|
||||
"""
|
||||
Read all parser files from /app/parsers/ and build an index:
|
||||
dataSource.name (exact, from parser attributes) → {parser_name, format_type}
|
||||
|
||||
Format type is "grok", "dottedJson", or "custom".
|
||||
Sources with grok/dottedJson parsers are flagged as needing a proper parser.
|
||||
"""
|
||||
import os, re
|
||||
parsers_dir = "/app/parsers"
|
||||
_DS_NAME_RE = re.compile(r'"dataSource\.name"\s*:\s*"([^"]+)"')
|
||||
_FORMAT_TYPE_RE = re.compile(r'"type"\s*:\s*"([^"]+)"')
|
||||
|
||||
index: dict[str, dict] = {}
|
||||
try:
|
||||
entries = [e for e in os.scandir(parsers_dir) if e.is_file() and not e.name.startswith(".")]
|
||||
except FileNotFoundError:
|
||||
return index
|
||||
|
||||
for entry in entries:
|
||||
try:
|
||||
with open(entry.path, "r", encoding="utf-8", errors="replace") as fh:
|
||||
content = fh.read()
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Extract dataSource.name (may appear multiple times — take first)
|
||||
ds_match = _DS_NAME_RE.search(content)
|
||||
if not ds_match:
|
||||
continue
|
||||
ds_name = ds_match.group(1).strip()
|
||||
|
||||
# Determine format type — look for grok/dottedJson/custom in "type" values
|
||||
format_types = {m.group(1).lower() for m in _FORMAT_TYPE_RE.finditer(content)}
|
||||
if "grok" in format_types:
|
||||
fmt = "grok"
|
||||
elif "dottedjson" in format_types:
|
||||
fmt = "dottedJson"
|
||||
else:
|
||||
fmt = "custom"
|
||||
|
||||
index[ds_name] = {"parser_name": entry.name, "format_type": fmt}
|
||||
|
||||
return index
|
||||
|
||||
|
||||
@router.get("/map")
def get_coverage_map(db: Session = Depends(get_db)):
    """
    Source-centric coverage map.

    For each active dataSource.name in the SDL:
    - covered       = a custom-format parser file matches it, or parsed events
                      were detected in the data lake
    - parser_needed = no parser, or only a grok/dottedJson parser with no
                      evidence of parsing in the data

    Each source also lists the library rules that reference it, up to three
    "close match" library source names when none reference it exactly, and
    the dotted fields those rules need that the matched parser does not provide.

    Returns:
        {"summary": {...}, "sources": [...], "synced_at": <iso or None>,
         "has_sources": <bool>}
    """
    import re as _re  # only used by the close-match tokenizer below

    active_sources = db.query(ActiveSource).order_by(ActiveSource.event_count.desc()).all()
    parser_fields_rows = db.query(ParserField).all()
    rules = db.query(ParsedRule).all()

    # parser_name → set of field names (for field count display)
    parser_index: dict[str, set] = {}
    for pf in parser_fields_rows:
        parser_index.setdefault(pf.parser_name, set()).add(pf.field_name)

    # Build dataSource.name → {parser_name, format_type} index from parser files
    ds_index = _build_parser_ds_index()

    def _normalize(s: str) -> str:
        return s.lower().replace(" ", "").replace("-", "").replace("_", "").replace(".", "")

    def _overlaps(a: str, b: str) -> bool:
        """True when one normalized name contains the other.

        Both must be non-empty: an empty string is a substring of everything
        and would otherwise match every candidate.
        """
        return bool(a) and bool(b) and (a in b or b in a)

    # Normalize the parser-side names once instead of once per active source.
    normalized_parsers = [
        (_normalize(ds_name), _normalize(info["parser_name"]), info)
        for ds_name, info in ds_index.items()
    ]

    def _find_parser_info(source_name: str) -> dict | None:
        """
        Match priority:
        1. Exact dataSource.name match
        2. Normalized substring: active source name ↔ parser dataSource.name
        3. Normalized substring: active source name ↔ parser filename
           (catches cases where the parser file has a wrong dataSource.name)
        """
        if source_name in ds_index:
            return ds_index[source_name]
        sn = _normalize(source_name)
        for norm_ds, _, info in normalized_parsers:
            if _overlaps(norm_ds, sn):
                return info
        for _, norm_file, info in normalized_parsers:
            if _overlaps(norm_file, sn):
                return info
        return None

    def _word_tokens(s: str) -> set:
        """Split on non-alphanumeric boundaries, lowercase, drop single chars."""
        return {t for t in _re.split(r"[^a-z0-9]+", s.lower()) if len(t) >= 2}

    def _is_close(a: str, b: str) -> bool:
        """Heuristic: do two source names plausibly denote the same product?"""
        # 1. Simple substring match
        if _overlaps(_normalize(a), _normalize(b)):
            return True
        # 2. Token-level: handles "Microsoft 365 Collaboration" vs "Microsoft O365"
        #    — "365" is inside "o365", and they share "microsoft"
        ta, tb = _word_tokens(a), _word_tokens(b)
        shared_exact = ta & tb
        if not shared_exact:
            return False  # Must share at least one word exactly
        # A DISTINCTIVE (non-shared) token from one name must appear inside a
        # token from the other. This avoids matching "Azure AD" to
        # "Azure Platform" on "azure" alone.
        unique_a = ta - shared_exact
        unique_b = tb - shared_exact
        return any(ua in ub or ub in ua for ua in unique_a for ub in unique_b)

    # Fields each rule needs: rule.name → set of field names
    rule_fields_index: dict[str, set] = {
        rule.name: set(rule.fields_used or []) for rule in rules
    }

    # Build rule index: source_name → rules that reference it
    rule_by_source: dict[str, list] = {}
    for rule in rules:
        try:
            raw_data = json.loads(rule.raw) if rule.raw else {}
        except Exception:
            raw_data = {}
        if not isinstance(raw_data, dict):
            # e.g. text that happens to be a JSON scalar or list
            raw_data = {}

        if rule.rule_type == "library":
            # Library rules store pre-extracted data_sources list in raw
            data_sources = raw_data.get("data_sources") or []
        else:
            query_texts = _star_query_texts(raw_data)
            data_sources = rule_parser.extract_data_sources(query_texts)

        for ds in data_sources:
            rule_by_source.setdefault(ds, []).append({"rule": rule.name, "type": rule.rule_type})

    # Library-only view, computed once; sources without library rules are left out.
    library_by_source: dict[str, list] = {}
    for ds, refs in rule_by_source.items():
        lib_only = [r for r in refs if r["type"] == "library"]
        if lib_only:
            library_by_source[ds] = lib_only

    # Fields to ignore when computing "missing" — these are metadata/schema fields
    # always present in events regardless of the parser
    _SCHEMA_FIELDS = {
        "dataSource.name", "dataSource.vendor", "dataSource.category",
        "event.type", "timestamp", "src.endpoint.ip", "src.endpoint.name",
        # Endpoint agent fields — populated by the SentinelOne agent, not by SDL parsers
        "cmdScript.content", "endpoint.os", "endpoint.name", "endpoint.uid",
    }

    sources_out = []
    covered_count = 0
    needed_count = 0

    for src in active_sources:
        parser_info = _find_parser_info(src.source_name)
        parser_in_data = (src.parser_detected or 0) > 0
        known_format = parser_info["format_type"] if parser_info else None

        if known_format == "custom":
            status = "covered"
            matched_parser = parser_info["parser_name"]
            format_type = "custom"
        elif known_format in ("grok", "dottedJson") and not parser_in_data:
            # Known parser but primitive format and no evidence of parsing in data
            status = "parser_needed"
            matched_parser = parser_info["parser_name"]
            format_type = known_format
        elif parser_in_data:
            # Parsed fields detected in the data lake — a parser is running
            status = "covered"
            matched_parser = parser_info["parser_name"] if parser_info else "detected in data"
            format_type = known_format if parser_info else "unknown"
        else:
            status = "parser_needed"
            matched_parser = None
            format_type = None

        if status == "covered":
            covered_count += 1
        else:
            needed_count += 1

        rules_for_src: list = list(library_by_source.get(src.source_name, []))

        # Close-match suggestions — shown when there are no library rules for this source.
        close_matches: list = []
        if not rules_for_src:
            close_matches = [
                {"library_name": lib_ds, "rule_count": len(lib_rules)}
                for lib_ds, lib_rules in library_by_source.items()
                if _is_close(src.source_name, lib_ds)
            ]
            close_matches.sort(key=lambda x: x["rule_count"], reverse=True)
            close_matches = close_matches[:3]

        # Count how many rules reference each field (frequency)
        field_freq: dict[str, int] = {}
        for r in rules_for_src:
            for f in rule_fields_index.get(r["rule"], set()):
                field_freq[f] = field_freq.get(f, 0) + 1

        # Fields the parser provides
        if matched_parser and matched_parser != "detected in data":
            parser_provides = parser_index.get(matched_parser, set())
        else:
            parser_provides = set()

        # Minimum number of rules that must reference a field before we flag it.
        # Scales with rule count so single-rule oddities don't dominate.
        rule_count = len(rules_for_src)
        min_rules = max(2, round(rule_count * 0.05)) if rule_count >= 10 else 2

        # Missing = dotted-path fields needed by >= min_rules rules,
        # not in schema constants, not provided by the parser.
        missing_fields = sorted(
            f for f, count in field_freq.items()
            if count >= min_rules
            and "." in f
            and f not in _SCHEMA_FIELDS
            and f not in parser_provides
        )

        sources_out.append({
            "source_name": src.source_name,
            "event_count": src.event_count,
            "status": status,
            "parser": matched_parser,
            "format_type": format_type,
            "parser_fields": len(parser_provides),
            "parser_detected": src.parser_detected or 0,
            "rules": rules_for_src,
            "rule_count": rule_count,
            "close_matches": close_matches,
            "missing_fields": missing_fields,
            "missing_fields_count": len(missing_fields),
            "synced_at": src.synced_at.isoformat() if src.synced_at else None,
        })

    # Timestamp of the highest-volume source; None when no sources are stored
    # or that row carries no timestamp.
    synced_at = None
    if active_sources and active_sources[0].synced_at:
        synced_at = active_sources[0].synced_at.isoformat()

    return {
        "summary": {
            "active_sources": len(active_sources),
            "covered": covered_count,
            "parser_needed": needed_count,
            "parsers_loaded": len(parser_index),
            "rules_loaded": len(rules),
        },
        "sources": sources_out,
        "synced_at": synced_at,
        "has_sources": len(active_sources) > 0,
    }
|
||||
|
||||
|
||||
@router.delete("/reset")
def reset_data(db: Session = Depends(get_db)):
    """Delete every stored rule and every stored parser field.

    Active sources are not touched by this endpoint.
    """
    for model in (ParsedRule, ParserField):
        db.query(model).delete()
    db.commit()
    return {"cleared": True}
|
||||
@@ -0,0 +1,122 @@
|
||||
from datetime import datetime, timedelta
|
||||
from fastapi import APIRouter, Query, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from services import s1_client
|
||||
|
||||
# Router holding every ingest endpoint defined in this module.
router = APIRouter()
|
||||
|
||||
|
||||
def _date_range(days: int) -> tuple[str, str]:
|
||||
now = datetime.utcnow()
|
||||
return (
|
||||
(now - timedelta(days=days)).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
|
||||
now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
|
||||
)
|
||||
|
||||
|
||||
def _date_range_hours(hours: int) -> tuple[str, str]:
|
||||
now = datetime.utcnow()
|
||||
return (
|
||||
(now - timedelta(hours=hours)).strftime("%Y-%m-%dT%H:%M:%S.000Z"),
|
||||
now.strftime("%Y-%m-%dT%H:%M:%S.000Z"),
|
||||
)
|
||||
|
||||
|
||||
@router.get("/top-sources")
async def get_top_sources(
    days: int = Query(None, ge=1, le=90),
    hours: int = Query(None, ge=1, le=24),
):
    """Return the 25 busiest log sources by event count.

    ``hours`` takes precedence when given; otherwise the window is ``days``
    days, defaulting to 7.

    Raises:
        HTTPException(502): the PowerQuery call failed.
    """
    if hours is None:
        window_days = days or 7
        from_dt, to_dt = _date_range(window_days)
        period_label = f"{window_days}d"
    else:
        from_dt, to_dt = _date_range_hours(hours)
        period_label = f"{hours}h"

    try:
        result = await s1_client.run_powerquery(
            "| group events=count() by dataSource.name | sort -events | limit 25",
            from_dt,
            to_dt,
        )
    except Exception as e:
        raise HTTPException(502, f"PowerQuery error: {e}")

    return {"period": period_label, "data": result.get("events", [])}
|
||||
|
||||
|
||||
@router.get("/by-event-type")
async def get_by_event_type(days: int = Query(7, ge=1, le=90)):
    """Return the top 100 (source, event type) pairs by event count.

    Raises:
        HTTPException(502): the PowerQuery call failed.
    """
    window_start, window_end = _date_range(days)
    try:
        result = await s1_client.run_powerquery(
            "| group events=count() by dataSource.name, event.type | sort -events | limit 100",
            window_start,
            window_end,
        )
    except Exception as e:
        raise HTTPException(502, f"PowerQuery error: {e}")
    rows = result.get("events", [])
    return {"period_days": days, "data": rows}
|
||||
|
||||
|
||||
@router.get("/daily-volume")
async def get_daily_volume(days: int = Query(5, ge=1, le=7)):
    """Return total event counts for each of the last ``days`` calendar days.

    One PowerQuery per day is issued, all concurrently. A day whose query
    fails is reported with 0 events rather than failing the whole request.
    The result is ordered oldest day first.
    """
    import asyncio

    reference = datetime.utcnow()
    midnight = "%Y-%m-%dT00:00:00.000Z"

    async def _count_for(offset: int) -> dict:
        # offset 0 is the day that ended at the most recent UTC midnight.
        start = reference - timedelta(days=offset + 1)
        end = reference - timedelta(days=offset)
        window_from = start.strftime(midnight)
        window_to = end.strftime(midnight)
        day_label = start.strftime("%Y-%m-%d")
        try:
            response = await s1_client.run_powerquery("| group total=count()", window_from, window_to)
            rows = response.get("events", []) if isinstance(response, dict) else []
            total = rows[0].get("total", 0) if rows else 0
        except Exception:
            total = 0
        return {"date": day_label, "events": total}

    day_count = min(days, 7)
    newest_first = await asyncio.gather(*(_count_for(offset) for offset in range(day_count)))
    return list(newest_first[::-1])
|
||||
|
||||
|
||||
# Request body for POST /simulate-filter.
class FilterRule(BaseModel):
    # dataSource.name to match; empty means "any source".
    source: str = ""
    # event.type to match; empty means "any event type".
    event_type: str = ""
    # Look-back window, in days, used to count matching events.
    days: int = 7
    # Conversion factor used to turn event counts into an estimated GB volume.
    gb_per_million_events: float = 0.5
|
||||
|
||||
|
||||
@router.post("/simulate-filter")
async def simulate_filter(rule: FilterRule):
    """Estimate how many events and GB would be eliminated by an exclusion filter.

    Counts the events matching ``rule.source`` and/or ``rule.event_type``
    over the last ``rule.days`` days (all events when both are empty),
    converts the count to GB with ``rule.gb_per_million_events`` and
    projects it to a 30-day month.

    Raises:
        HTTPException(422): ``rule.days`` is smaller than 1 (the projection
            divides by it).
        HTTPException(502): the PowerQuery call failed.
    """
    if rule.days < 1:
        raise HTTPException(422, "days must be at least 1")

    def _quoted(value: str) -> str:
        """Wrap *value* in single quotes for use as a query string literal.

        Backslashes and single quotes are backslash-escaped so request text
        cannot terminate the literal and alter the query.
        NOTE(review): assumes PowerQuery honours backslash escapes inside
        single-quoted strings — confirm against the query language reference.
        """
        escaped = value.replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    from_dt, to_dt = _date_range(rule.days)

    clauses = []
    if rule.source:
        clauses.append(f"dataSource.name=={_quoted(rule.source)}")
    if rule.event_type:
        clauses.append(f"event.type=={_quoted(rule.event_type)}")

    if clauses:
        filter_expr = " and ".join(clauses)
        query = f"| filter {filter_expr} | group events=count()"
    else:
        query = "| group events=count()"

    try:
        result = await s1_client.run_powerquery(query, from_dt, to_dt)
    except Exception as e:
        raise HTTPException(502, f"PowerQuery error: {e}") from e

    # The count is the "events" column of the first result row; anything
    # missing or null counts as zero.
    rows = result.get("events") if isinstance(result, dict) else None
    events = 0
    if isinstance(rows, list) and rows and isinstance(rows[0], dict):
        events = rows[0].get("events") or 0

    estimated_gb = round(events / 1_000_000 * rule.gb_per_million_events, 3)
    monthly_events = int(events / rule.days * 30)
    monthly_gb = round(monthly_events / 1_000_000 * rule.gb_per_million_events, 2)

    return {
        "period_days": rule.days,
        "matched_events": events,
        "estimated_gb_period": estimated_gb,
        "projected_monthly_events": monthly_events,
        "projected_monthly_gb": monthly_gb,
        "filter": {"source": rule.source, "event_type": rule.event_type},
    }
|
||||
@@ -0,0 +1,440 @@
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from datetime import datetime, timedelta
|
||||
from services import s1_client
|
||||
import os
|
||||
import re
|
||||
|
||||
# Router holding every quality endpoint defined in this module.
router = APIRouter()

# Directory the endpoints below list parser files from and write them into.
PARSERS_DIR = "/app/parsers"
|
||||
|
||||
|
||||
@router.get("/parsers")
def list_parser_files():
    """Return the sorted names of the non-hidden files in PARSERS_DIR.

    A missing directory is reported as an empty list rather than an error.
    """
    try:
        visible = [
            entry.name
            for entry in os.scandir(PARSERS_DIR)
            if entry.is_file() and not entry.name.startswith(".")
        ]
    except FileNotFoundError:
        visible = []
    visible.sort()
    return {"parsers": visible, "count": len(visible)}
|
||||
|
||||
|
||||
@router.post("/sync-from-sdl")
async def sync_parsers_from_sdl():
    """Download every parser file under /logParsers/ on the SDL tenant into
    /app/parsers/. After this call returns, the Parser Test Runner dropdown
    will include all tenant parsers (including custom ones).

    Requires SDL_CONFIG_READ_KEY in .env (Configuration Read scope on the
    Data Lake API key).

    Failures are collected per parser in ``errors`` and do not abort the sync.
    A parser is written to disk only after its content has been validated, so
    a failed download never leaves an empty or truncated file behind.

    Raises:
        HTTPException(400): SDL_CONFIG_READ_KEY is not configured.
        HTTPException(502): the parser listing call failed.
    """
    if not s1_client.SDL_CONFIG_READ_KEY:
        raise HTTPException(
            400,
            "SDL_CONFIG_READ_KEY is not set in .env. Generate a Data Lake API key "
            "with 'Configuration Read' scope in the S1 console and add it to .env."
        )

    try:
        names = await s1_client.list_sdl_parsers()
    except Exception as e:
        raise HTTPException(502, f"SDL listFiles failed: {e}") from e

    os.makedirs(PARSERS_DIR, exist_ok=True)
    downloaded: list[str] = []
    errors: list[dict] = []

    for name in names:
        try:
            # The path on SDL is /logParsers/<name>; we write to
            # /app/parsers/<sanitized-name>. Replacing "/" keeps the target
            # inside PARSERS_DIR. Done inside the try so that a malformed
            # entry is reported instead of aborting the whole sync.
            safe_name = name.replace("/", "_")
            resp = await s1_client.get_sdl_parser(name)
            content = resp.get("content")
            if content is None:
                errors.append({"parser": name, "error": "no content field in response"})
                continue
            if not isinstance(content, str):
                # Check before open(): opening with "w" truncates the target.
                errors.append({
                    "parser": name,
                    "error": f"content field is {type(content).__name__}, expected str",
                })
                continue
            with open(os.path.join(PARSERS_DIR, safe_name), "w", encoding="utf-8") as fh:
                fh.write(content)
            downloaded.append(safe_name)
        except Exception as e:
            # Best effort: record the failure and continue with the next parser.
            errors.append({"parser": name, "error": str(e) or e.__class__.__name__})

    return {
        "downloaded": len(downloaded),
        "parsers": downloaded,
        "errors": errors,
        "directory": PARSERS_DIR,
    }
|
||||
|
||||
|
||||
def _date_range_hours(hours: int) -> tuple[str, str]:
    """Return ``(from, to)`` UTC timestamps covering the last *hours* hours.

    Both values are formatted as ``YYYY-MM-DDTHH:MM:SS.000Z`` and derive from
    a single clock reading, so they are exactly *hours* apart.
    """
    # Function-scope import: the module only imports datetime and timedelta.
    from datetime import timezone

    # datetime.utcnow() is deprecated; an aware "now" formats identically.
    now = datetime.now(timezone.utc)
    fmt = "%Y-%m-%dT%H:%M:%S.000Z"
    return (
        (now - timedelta(hours=hours)).strftime(fmt),
        now.strftime(fmt),
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Models
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class SampleEventsRequest(BaseModel):
    """Request body for ``POST /sample-events``."""

    # Value matched against ``dataSource.name``.
    source: str
    # Row cap placed in the query's ``| limit`` stage.
    limit: int = 20
    # Look-back window, in hours, ending now.
    hours: int = 1
|
||||
|
||||
|
||||
class FieldPopulationRequest(BaseModel):
    """Request body for ``POST /field-population``."""

    # Value matched against ``dataSource.name``.
    source: str
    # Look-back window, in hours, ending now.
    hours: int = 24
    # Fields whose population rate is measured across the sampled events.
    fields: list[str] = [
        "src.ip",
        "src.port",
        "dst.ip",
        "dst.port",
        "user.name",
        "event.type",
        "src.process.name",
        "src.process.cmdline",
        "tgt.file.path",
        "network.direction",
        "dataSource.name",
    ]
|
||||
|
||||
|
||||
class TestParserRequest(BaseModel):
    """Request body for ``POST /test-parser``."""

    # File name of the parser under the parsers directory.
    parser_name: str
    # Raw log text to run the parser's format strings against.
    log_line: str
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _flatten_dict(d: dict, prefix: str = "", out: dict | None = None) -> dict:
|
||||
"""Recursively flatten a nested dict into dotted keys."""
|
||||
if out is None:
|
||||
out = {}
|
||||
if not isinstance(d, dict):
|
||||
return out
|
||||
for k, v in d.items():
|
||||
key = f"{prefix}.{k}" if prefix else k
|
||||
if isinstance(v, dict):
|
||||
_flatten_dict(v, key, out)
|
||||
else:
|
||||
out[key] = v
|
||||
return out
|
||||
|
||||
|
||||
def _flatten_event(event: dict) -> dict:
    """Return a flat field→value dict from a PowerQuery result row.

    If the row carries a JSON-stringified object in `message` (i.e. the
    parser wasn't applied at query time), parse and flatten it inline so the
    UI can measure field population accurately. The original raw `message`
    is always preserved under its own key, even when the payload itself
    contains a top-level `message` field.

    A non-dict *event* yields an empty dict. A `message` that is not valid
    JSON leaves the row unchanged.
    """
    if not isinstance(event, dict):
        return {}

    # Function-scope import: this module has no file-level json import.
    import json

    flat = dict(event)
    msg = flat.get("message")
    if isinstance(msg, str):
        candidate = msg.strip()  # tolerate surrounding whitespace or newlines
        if candidate.startswith("{") and candidate.endswith("}"):
            try:
                parsed = json.loads(candidate)
            except (ValueError, RecursionError):
                # Best effort: looks like JSON but is not; keep the row as is.
                parsed = None
            if isinstance(parsed, dict):
                flat.update(_flatten_dict(parsed))
                # The payload may define its own "message"; keep the raw text.
                flat["message"] = msg
    return flat
|
||||
|
||||
|
||||
def _extract_format_strings(content: str) -> list[str]:
    """
    Collect every double-quoted value assigned to a quoted "format" key in
    augmented-JSON parser content, in order of appearance.

    Values are returned exactly as written in the file: escape sequences such
    as \\" are kept, not decoded.
    """
    return [
        found.group(1)
        for found in re.finditer(r'"format"\s*:\s*"((?:[^"\\]|\\.)*)"', content)
    ]
|
||||
|
||||
|
||||
def _sdl_format_to_regex(fmt: str) -> tuple[re.Pattern, dict[str, str]]:
    """
    Convert an SDL format string to a compiled Python regex.

    Text outside ``$...$`` tokens is matched literally. A token is either
    ``field.name`` (matches a run of non-whitespace) or ``field.name=PATTERN``
    (PATTERN is used as a regular expression). A token with an empty field
    name, ``$=PATTERN$``, must match but captures nothing.

    Returns (compiled_pattern, py_group_to_sdl_field) mapping so callers can
    translate group names back to the original SDL field names. Group names
    are derived from field names by replacing every character that is not
    valid in a Python group name with ``_``. Repeated or colliding names get
    a numeric suffix.

    Raises re.error if the resulting pattern cannot be compiled.
    """
    # With one capture group, re.split alternates: literal, token, literal, ...
    parts = re.split(r'\$([^$]+)\$', fmt)

    regex_parts: list[str] = []
    py_group_to_sdl: dict[str, str] = {}
    seen_groups: dict[str, int] = {}

    for i, part in enumerate(parts):
        if i % 2 == 0:
            # Literal text
            regex_parts.append(re.escape(part))
            continue

        # Token: either "field.name=PATTERN" or just "field.name"
        if '=' in part:
            field_name, pattern = part.split('=', 1)
        else:
            field_name, pattern = part, r'[^\s]+'

        if not field_name:
            # Anonymous token: match without capturing. "(?P<>...)" would be
            # an invalid group name and make the whole format unusable.
            regex_parts.append(f'(?:{pattern})')
            continue

        # Build a valid Python group name (ASCII letters, digits, underscore;
        # must not start with a digit).
        base = re.sub(r'[^0-9A-Za-z_]', '_', field_name)
        if base[0].isdigit():
            base = f'_{base}'

        # First use keeps the bare name; later uses get _1, _2, ... Candidates
        # already taken by another field (e.g. a literal "x_1") are skipped.
        seen_groups.setdefault(base, 0)
        group = base
        while group in py_group_to_sdl:
            seen_groups[base] += 1
            group = f'{base}_{seen_groups[base]}'

        py_group_to_sdl[group] = field_name
        regex_parts.append(f'(?P<{group}>{pattern})')

    compiled = re.compile(''.join(regex_parts), re.IGNORECASE)
    return compiled, py_group_to_sdl
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Endpoints
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.post("/sample-events")
async def sample_events(req: SampleEventsRequest):
    """Return a sample of raw events from a given data source.

    Queries the last ``req.hours`` hours for rows whose ``dataSource.name``
    equals ``req.source``, capped at ``req.limit`` rows. Each row is
    flattened with ``_flatten_event``.

    Raises:
        HTTPException(502): the PowerQuery call failed or reported an error.
    """
    # req.source is untrusted: escape backslashes and double quotes so it
    # cannot terminate the string literal and inject query syntax.
    source = req.source.replace("\\", "\\\\").replace('"', '\\"')
    query = f'| filter dataSource.name = "{source}" | limit {req.limit}'
    from_dt, to_dt = _date_range_hours(req.hours)

    try:
        result = await s1_client.run_powerquery(query, from_dt, to_dt)
    except Exception as e:
        raise HTTPException(502, f"PowerQuery error: {e}") from e

    # run_powerquery reports failures as an "error" key next to an empty
    # event list; surface that instead of an empty but successful sample.
    if isinstance(result, dict) and result.get("error"):
        raise HTTPException(502, f"PowerQuery error: {result['error']}")

    rows = result if isinstance(result, list) else (result.get("rows") or result.get("events") or [])
    events = [_flatten_event(row) for row in rows]

    return {
        "source": req.source,
        "events": events,
        "count": len(events),
        "hours": req.hours,
    }
|
||||
|
||||
|
||||
@router.post("/field-population")
async def field_population(req: FieldPopulationRequest):
    """
    Analyse how consistently each requested field is populated across a sample
    of events from a data source.

    Up to 500 events from the last ``req.hours`` hours are sampled. A field
    counts as populated when its value is not None, "" or "null". Results are
    sorted by ascending rate, so the worst coverage comes first.

    Raises:
        HTTPException(502): the PowerQuery call failed or reported an error.
        HTTPException(404): the query succeeded but returned no events.
    """
    # req.source is untrusted: escape backslashes and double quotes so it
    # cannot terminate the string literal and inject query syntax.
    source = req.source.replace("\\", "\\\\").replace('"', '\\"')
    query = f'| filter dataSource.name = "{source}" | limit 500'
    from_dt, to_dt = _date_range_hours(req.hours)

    try:
        result = await s1_client.run_powerquery(query, from_dt, to_dt)
    except Exception as e:
        raise HTTPException(502, f"PowerQuery error: {e}") from e

    # A failed query must not be reported as "no events found" (404).
    if isinstance(result, dict) and result.get("error"):
        raise HTTPException(502, f"PowerQuery error: {result['error']}")

    rows = result if isinstance(result, list) else (result.get("rows") or result.get("events") or [])
    events = [_flatten_event(row) for row in rows]

    if not events:
        raise HTTPException(status_code=404, detail=f"No events found for source '{req.source}' in the last {req.hours} hours.")

    total = len(events)
    # A tuple, not a set: membership is tested by equality, so unhashable
    # values (lists from JSON payloads) do not raise TypeError.
    _empty = (None, "", "null")

    # Collect all field names seen across the sample (useful for surfacing what IS there)
    all_seen_fields = sorted({k for ev in events for k in ev})

    field_stats = []
    for field in req.fields:
        # dataSource.name is always 100% — we filtered by it; Scalyr just doesn't echo it back
        if field == "dataSource.name":
            populated = total
        else:
            populated = sum(1 for ev in events if ev.get(field) not in _empty)
        rate = round((populated / total) * 100, 1)
        field_stats.append({
            "field": field,
            "populated": populated,
            "total": total,
            "rate": rate,
        })

    # Sort ascending by rate (worst coverage first)
    field_stats.sort(key=lambda x: x["rate"])

    return {
        "source": req.source,
        "total_sampled": total,
        "hours": req.hours,
        "fields": field_stats,
        "fields_seen_in_sample": all_seen_fields,
    }
|
||||
|
||||
|
||||
@router.post("/test-parser")
async def test_parser(req: TestParserRequest):
    """
    Test a parser against a raw log line by extracting and matching SDL format
    strings found in the parser file.

    Two modes:
      * JSON mode, when a format string contains ``parse=json`` or the input
        starts with ``{``. The input may be one JSON object, a JSON array,
        pretty-printed JSON spanning several lines, or NDJSON (one object per
        line). The first object is flattened into dotted field names and
        simple ``{input, output, match, replace}`` rewrites are applied.
      * Regex mode otherwise. Each format string is converted to a regex and
        the first one that matches the log line wins.

    Raises:
        HTTPException(400): ``parser_name`` is not a bare file name.
        HTTPException(404): the parser file does not exist.
        HTTPException(500): the parser file could not be read.
    """
    # Function-scope import: this module has no file-level json import.
    import json as _json

    # parser_name comes from the request body. Accept only a bare file name so
    # that "../" components cannot reach files outside PARSERS_DIR.
    parser_name = req.parser_name
    if (
        not parser_name
        or parser_name in (".", "..")
        or "\x00" in parser_name
        or os.path.basename(parser_name) != parser_name
    ):
        raise HTTPException(status_code=400, detail=f"Invalid parser name: {parser_name!r}")
    parser_path = os.path.join(PARSERS_DIR, parser_name)

    try:
        with open(parser_path, "r", encoding="utf-8") as fh:
            content = fh.read()
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail=f"Parser file not found: {req.parser_name}")
    except OSError as exc:
        raise HTTPException(status_code=500, detail=f"Could not read parser file: {exc}")

    format_strings = _extract_format_strings(content)

    # ── JSON auto-extract path ──────────────────────────────────────────────
    # SDL parsers that use `$=json{parse=json}$` (or any format containing
    # `parse=json`) auto-extract every top-level JSON key as an attribute.
    # The regex-based path can't model that — handle it explicitly so users
    # can test JSON-shaped logs against JSON-mode parsers.
    log_input = req.log_line.strip()
    is_json_mode = any("parse=json" in f for f in format_strings) or log_input.startswith("{")
    if is_json_mode:
        lines = [ln for ln in (l.strip() for l in log_input.splitlines()) if ln]
        payloads: list[dict] = []
        parse_errors: list[str] = []

        # Try the whole input first: this covers a single object, a JSON
        # array, and pretty-printed JSON that spans several lines.
        try:
            whole = _json.loads(log_input)
            whole_error = None
        except (ValueError, RecursionError) as e:
            whole, whole_error = None, e

        if whole_error is None:
            if isinstance(whole, list):
                payloads = [x for x in whole if isinstance(x, dict)]
            elif isinstance(whole, dict):
                payloads = [whole]
            else:
                return {
                    "parser_name": req.parser_name,
                    "matched": False,
                    "message": "Parser expects a JSON object (got scalar).",
                    "fields": [],
                }
        elif len(lines) == 1:
            return {
                "parser_name": req.parser_name,
                "matched": False,
                "message": f"Parser expects JSON but log line could not be parsed as JSON: {whole_error}",
                "fields": [],
            }
        else:
            # Multi-line: one JSON object per line (NDJSON)
            for i, ln in enumerate(lines, 1):
                try:
                    obj = _json.loads(ln)
                except (ValueError, RecursionError) as e:
                    parse_errors.append(f"line {i}: {e}")
                    continue
                if isinstance(obj, dict):
                    payloads.append(obj)
                else:
                    parse_errors.append(f"line {i}: not a JSON object")

        if not payloads:
            return {
                "parser_name": req.parser_name,
                "matched": False,
                "message": "No valid JSON objects found. " + " | ".join(parse_errors[:3]),
                "fields": [],
            }

        # Use the first payload for the detail table; report totals.
        payload = payloads[0]
        extracted = _flatten_dict(payload)

        # Apply lightweight rewrites if present (input/output/match/replace blocks).
        # We only handle simple literal/regex matches with $0 or string replacements;
        # this is best-effort, intended for quick visual verification.
        rewrites_applied = []
        rewrite_re = re.compile(
            r'\{\s*input:\s*"([^"]+)"\s*,\s*output:\s*"([^"]+)"\s*,\s*match:\s*"((?:[^"\\]|\\.)*)"\s*,\s*replace:\s*"((?:[^"\\]|\\.)*)"\s*\}',
            re.DOTALL,
        )

        # SDL uses $0 for whole match, $1.. for groups. Translate to Python
        # \g<0>, \g<1>, ... so re.sub doesn't read \0 as a null byte.
        def _to_py_backref(s: str) -> str:
            return re.sub(r"\$(\d+)", lambda mm: f"\\g<{mm.group(1)}>", s)

        derived: dict[str, str] = {}
        for m in rewrite_re.finditer(content):
            in_field, out_field, match_pat, replace_val = m.group(1), m.group(2), m.group(3), m.group(4)
            src_val = extracted.get(in_field)
            if src_val is None:
                continue
            try:
                m2 = re.search(match_pat, str(src_val))
            except re.error:
                # The match pattern is not a valid Python regex; skip the rewrite.
                continue
            if not m2:
                continue
            try:
                val = re.sub(match_pat, _to_py_backref(replace_val), str(src_val), count=1)
            except re.error:
                # The replacement template is unusable; show the literal value.
                val = replace_val
            derived[out_field] = val
            rewrites_applied.append({
                "input": in_field, "input_value": src_val,
                "output": out_field, "matched_on": match_pat, "result": val,
            })

        fields = (
            [{"field": k, "value": v, "source": "json-extract"} for k, v in sorted(extracted.items())]
            + [{"field": k, "value": v, "source": "rewrite"} for k, v in sorted(derived.items())]
        )
        return {
            "parser_name": req.parser_name,
            "matched": True,
            "mode": "json",
            "format_matched": "$=json{parse=json}$",
            "fields": fields,
            "rewrites_applied": rewrites_applied,
            "extracted_count": len(extracted),
            "derived_count": len(derived),
            "payload_count": len(payloads),
            "parse_errors": parse_errors,
            "showing_payload": 1,
        }

    # ── Regex format-string path (original) ─────────────────────────────────
    for fmt in format_strings:
        try:
            compiled, py_to_sdl = _sdl_format_to_regex(fmt)
        except re.error:
            # Skip unparseable format strings
            continue

        match = compiled.search(req.log_line)
        if match:
            fields = [
                {"field": py_to_sdl.get(group, group), "value": value}
                for group, value in match.groupdict().items()
                if value is not None
            ]
            return {
                "parser_name": req.parser_name,
                "matched": True,
                "mode": "regex",
                "format_matched": fmt,
                "fields": fields,
            }

    return {
        "parser_name": req.parser_name,
        "matched": False,
        "message": "No format pattern matched",
        "fields": [],
    }
|
||||
@@ -0,0 +1,105 @@
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Location of the .env file; overridable through ENV_FILE_PATH.
ENV_FILE = Path(os.environ.get("ENV_FILE_PATH", "/app/.env"))

# Fields we expose in the UI — order matters for display.
# Each entry: env var name, display label, whether the value is masked when
# read back, and the placeholder shown for an empty input.
FIELDS = [
    {"key": "S1_BASE_URL", "label": "Console URL", "secret": False, "placeholder": "https://demo.sentinelone.net"},
    {"key": "S1_API_TOKEN", "label": "Console API Token", "secret": True, "placeholder": "eyJ..."},
    {"key": "SDL_XDR_URL", "label": "SDL XDR URL", "secret": False, "placeholder": "https://xdr.us1.sentinelone.net"},
    {"key": "SDL_LOG_READ_KEY", "label": "SDL Log Read Key", "secret": True, "placeholder": "1DnK0Y4e..."},
    # Needed by the parser sync endpoint; without this entry the key could
    # not be saved here and was rejected as unknown.
    {"key": "SDL_CONFIG_READ_KEY", "label": "SDL Configuration Read Key", "secret": True, "placeholder": "Configuration Read key"},
    {"key": "ANTHROPIC_API_KEY", "label": "Anthropic API Key", "secret": True, "placeholder": "sk-ant-..."},
]

# Allow-list of keys that may be written through the config endpoint.
FIELD_KEYS = {f["key"] for f in FIELDS}
|
||||
|
||||
|
||||
def _read_env() -> dict[str, str]:
    """Read the .env file into a ``{key: value}`` dict.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored.
    Keys and values are stripped of surrounding whitespace and a line is
    split at its first ``=``. A missing file yields an empty dict.
    """
    try:
        # The file is read as UTF-8 explicitly: its comments may contain
        # non-ASCII characters and the locale default is not guaranteed.
        text = ENV_FILE.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        return {}

    vals: dict[str, str] = {}
    for line in text.splitlines():
        line = line.strip()
        if line and not line.startswith("#") and "=" in line:
            k, _, v = line.partition("=")
            vals[k.strip()] = v.strip()
    return vals
|
||||
|
||||
|
||||
def _write_env(updates: dict[str, str]) -> None:
    """Write updates into .env, preserving comments and unknown keys.

    Existing ``KEY=value`` lines whose key is in *updates* are rewritten in
    place. Keys not yet present are appended at the end. The file is read and
    written as UTF-8.

    Raises:
        ValueError: a key or value contains a line break, which would add
            extra lines (and therefore extra variables) to the file.
    """
    for k, v in updates.items():
        for text in (k, v):
            # splitlines() recognises every line boundary the reader splits on.
            if text.splitlines() not in ([], [text]):
                raise ValueError(f"{k}: keys and values must be single-line")

    try:
        existing_lines = ENV_FILE.read_text(encoding="utf-8").splitlines()
    except (FileNotFoundError, NotADirectoryError):
        existing_lines = []

    written: set[str] = set()
    new_lines: list[str] = []

    for line in existing_lines:
        stripped = line.strip()
        if stripped and not stripped.startswith("#") and "=" in stripped:
            k = stripped.partition("=")[0].strip()
            if k in updates:
                new_lines.append(f"{k}={updates[k]}")
                written.add(k)
                continue
        # Comments, blank lines and keys we are not updating are kept verbatim.
        new_lines.append(line)

    # Append any new keys not already in the file
    for k, v in updates.items():
        if k not in written:
            new_lines.append(f"{k}={v}")

    ENV_FILE.write_text("\n".join(new_lines) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
def _mask_secret(raw: str) -> str:
    """Mask a secret for display.

    Only values longer than 20 characters show their first 6 and last 4
    characters; shorter ones are fully masked, so a short secret is never
    mostly revealed.
    """
    if len(raw) > 20:
        return raw[:6] + "•" * (len(raw) - 10) + raw[-4:]
    return "••••••••"


@router.get("/config")
async def get_config():
    """Return current config values. Secrets are masked.

    For every entry in FIELDS the live environment variable is preferred. When
    it is unset or empty, the value from the .env file is used instead, so a
    value saved since startup is still reported.
    """
    env_vals = _read_env()
    result = []
    for f in FIELDS:
        key = f["key"]
        # Prefer live env var, fall back to .env file value. `or` rather than
        # a .get() default: an empty env var must not hide the file value.
        raw = os.environ.get(key) or env_vals.get(key, "")
        masked = _mask_secret(raw) if f["secret"] and raw else raw
        result.append({
            "key": key,
            "label": f["label"],
            "secret": f["secret"],
            "placeholder": f["placeholder"],
            "value": masked,
            "set": bool(raw),
        })
    env_file_exists = ENV_FILE.exists()
    return {"fields": result, "env_file_exists": env_file_exists, "env_file_path": str(ENV_FILE)}
|
||||
|
||||
|
||||
class ConfigUpdate(BaseModel):
    """Request body for ``POST /config``."""

    # Environment variable name -> new value to store in the .env file.
    updates: dict[str, str]
|
||||
|
||||
|
||||
@router.post("/config")
async def save_config(body: ConfigUpdate):
    """Save config values to .env file. Only known keys accepted.

    Raises:
        HTTPException(400): an unknown key was supplied, or a value contains
            a line break.
        HTTPException(503): the directory of the .env file does not exist.
        HTTPException(500): writing the file failed.
    """
    bad = [k for k in body.updates if k not in FIELD_KEYS]
    if bad:
        raise HTTPException(400, f"Unknown keys: {bad}")

    # A value containing a line break would add extra lines to .env and so
    # define variables that are not on the allow-list.
    multiline = [k for k, v in body.updates.items() if v.splitlines() not in ([], [v])]
    if multiline:
        raise HTTPException(400, f"Values must be single-line: {multiline}")

    if not ENV_FILE.parent.exists():
        raise HTTPException(503, f"Cannot write to {ENV_FILE} — check Docker volume mount")
    try:
        _write_env(body.updates)
    except Exception as e:
        raise HTTPException(500, f"Failed to write .env: {e}") from e
    return {"saved": list(body.updates.keys()), "restart_required": True}
|
||||
@@ -0,0 +1,209 @@
|
||||
import re
|
||||
import json
|
||||
import yaml
|
||||
from typing import Set, List
|
||||
|
||||
# Captures one value compared against dataSource.name (group 1).
# NOTE(review): the operator part is the character class [=in]+, and the
# value must directly follow a single opening "(", "'" or '"'. A quoted list
# such as in ('a', 'b') therefore does not match at all.
_DS_PATTERN = re.compile(
    r"dataSource\.name\s*[=in]+\s*[\('\"]([^'\"),]+)['\")]",
    re.IGNORECASE,
)


# STAR PowerQuery operators that follow a field name
_STAR_OPS = [
    "ContainsCIS", "NotContainsCIS", "Contains", "NotContains",
    "StartsWith", "EndsWith", "In", "NotIn",
    "IsEmpty", "IsNotEmpty", "Matches", "NotMatches",
    "GreaterOrEqual", "LessOrEqual", "GreaterThan", "LessThan",
    "Between", "=", "!=",
]
# Words that can precede an operator without being field names.
_STAR_KEYWORD = {"and", "or", "not", "true", "false", "null"}

# Operators spelled as words. They must be separated from the field name by
# whitespace; otherwise any word ending in an operator name is split in two
# ("admin" -> field "adm" + operator "in").
_WORD_OPS = [op for op in _STAR_OPS if op[0].isalpha()]

# Group 1: field followed by a word operator. Group 2: field followed by
# "=" or "!=" (whitespace optional, covering the no-space form used in
# subQuery strings).
_OP_PATTERN = re.compile(
    r"([\w.]+)\s+(?:" + "|".join(re.escape(op) for op in _WORD_OPS) + r")\b"
    r"|([\w.]+)\s*!?=",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
def extract_star_fields(query: str) -> Set[str]:
    """Return the field names that precede an operator in a STAR query string.

    Logical keywords and literals (and, or, not, true, false, null) and names
    starting with a digit are discarded.
    """
    candidates = (hit.group(1) or hit.group(2) for hit in _OP_PATTERN.finditer(query))
    return {
        name
        for name in candidates
        if name and not name[0].isdigit() and name.lower() not in _STAR_KEYWORD
    }
|
||||
|
||||
|
||||
def extract_sigma_fields(sigma_content: str) -> Set[str]:
    """Extract field names from a Sigma rule YAML.

    The top-level keys of ``detection`` are search identifiers (for example
    ``selection`` or ``filter``), not fields, so only the content below them
    is inspected. ``condition`` and ``timeframe`` are skipped. Value
    modifiers are stripped from keys (``CommandLine|contains`` becomes
    ``CommandLine``).

    Returns an empty set when the content cannot be parsed or has no
    ``detection`` mapping.
    """
    try:
        rule = yaml.safe_load(sigma_content)
    except Exception:
        # Best effort: an unparseable rule contributes no fields.
        return set()

    fields: Set[str] = set()
    detection = rule.get("detection") if isinstance(rule, dict) else None
    if not isinstance(detection, dict):
        return fields

    def _walk(node):
        if isinstance(node, dict):
            for key, val in node.items():
                if key == "condition":
                    continue
                if isinstance(key, str):  # YAML keys may be ints or bools
                    # Strip pipe modifiers: CommandLine|contains → CommandLine
                    clean = key.split("|")[0]
                    if clean and clean not in ("keywords",):
                        fields.add(clean)
                _walk(val)
        elif isinstance(node, list):
            for item in node:
                _walk(item)

    for identifier, search in detection.items():
        if identifier in ("condition", "timeframe"):
            continue
        # The identifier itself is a label chosen by the rule author.
        _walk(search)
    return fields
|
||||
|
||||
|
||||
def extract_data_sources(texts: List[str]) -> List[str]:
    """Extract dataSource.name values from a list of query strings.

    Recognised forms (case-insensitive, single or double quotes):
        dataSource.name = 'Okta'      dataSource.name == "Okta"
        dataSource.name in ('Okta', "Azure AD")
        dataSource.name in (Okta)

    Every member of an ``in (...)`` list is returned. Negated comparisons
    (``!=``) are ignored. Non-string entries in *texts* are skipped.

    Returns:
        The distinct values, sorted.
    """
    # Operator, then either a parenthesised list or one quoted string.
    clause_pat = re.compile(
        r"dataSource\.name\s*(?:={1,2}|in\b(?::\w+)?)\s*"
        r"(\((?:[^()'\"]|'[^']*'|\"[^\"]*\")*\)|'[^']*'|\"[^\"]*\")",
        re.IGNORECASE,
    )
    # One value inside the operand: quoted (may contain commas) or bare.
    value_pat = re.compile(r"'([^']*)'|\"([^\"]*)\"|([^\s,()'\"]+)")

    sources: Set[str] = set()
    for text in texts:
        if not isinstance(text, str):
            continue
        for clause in clause_pat.finditer(text):
            for item in value_pat.finditer(clause.group(1)):
                value = next(g for g in item.groups() if g is not None).strip()
                if value:
                    sources.add(value)
    return sorted(sources)
|
||||
|
||||
|
||||
# $field.name$ or $field.name=pattern$ tokens in format strings; group 1 is
# the field name (must start with a letter).
_SDL_FIELD_PAT = re.compile(r'\$([a-zA-Z][a-zA-Z0-9._]*)(?:=[^$]*)?\$')
# Any double-quoted key followed by a colon; group 1 is the key (two or more
# characters, starting with a letter).
_SDL_ATTR_KEY_PAT = re.compile(r'"([a-zA-Z][a-zA-Z0-9._]+)"\s*:')
# Matches both quoted and unquoted output/to keys in rewrites:
#   output: "user.name"  OR  "output": "user.name"
#   "to": "src_endpoint.ip"
# "to" and "replace" are matched only in their quoted form.
_SDL_REWRITE_OUT_PAT = re.compile(
    r'(?:"output"|output|"to"|"replace")\s*:\s*"([a-zA-Z][a-zA-Z0-9._]+)"'
)
|
||||
|
||||
|
||||
def extract_parser_fields_from_content(content: str) -> Set[str]:
    """
    Collect output field names from the raw text of an SDL augmented-JSON
    parser, using three independent regex scans:

      - format-string tokens ($field.name$ / $field.name=pattern$), kept when
        the name has a dot, starts with an upper-case letter, or is longer
        than 6 characters (shorter plain names are taken to be pattern
        variables);
      - rewrite targets ("output": "..." / output: "..." and the like), kept
        when the value has a dot;
      - quoted keys, kept when they are not SDL builtins and either have a
        dot or are longer than 8 characters.
    """
    ignored_rewrite_values = {"$0", "1", "2", "3", "4", "99"}
    builtin_keys = {"id", "format", "halt", "input", "output", "match", "replace",
                    "timezone", "attribute", "attributes", "patterns", "formats",
                    "rewrites", "type", "version"}

    token_names = (hit.group(1) for hit in _SDL_FIELD_PAT.finditer(content))
    format_fields = {
        name for name in token_names
        if "." in name or name[0].isupper() or len(name) > 6
    }

    rewrite_values = (hit.group(1) for hit in _SDL_REWRITE_OUT_PAT.finditer(content))
    rewrite_fields = {
        value for value in rewrite_values
        if value not in ignored_rewrite_values and "." in value
    }

    quoted_keys = (hit.group(1) for hit in _SDL_ATTR_KEY_PAT.finditer(content))
    attribute_fields = {
        key for key in quoted_keys
        if key not in builtin_keys and ("." in key or len(key) > 8)
    }

    return format_fields | rewrite_fields | attribute_fields
|
||||
|
||||
|
||||
# Parser-structure keywords that must never be reported as output field names.
_SKIP_FIELD_NAMES = {
    "id", "format", "halt", "input", "output", "match", "replace",
    "timezone", "attribute", "attributes", "patterns", "formats",
    "rewrites", "type", "version", "source", "dataset", "predicate",
    "transformations", "mappings", "observables", "fields", "constant",
    "copy", "from", "to", "value", "field", "name",
}
|
||||
|
||||
|
||||
def _extract_rewrite_fields(rewrites) -> Set[str]:
    """Collect the dotted target names of a rewrites list.

    For each dict entry the target is its "output" value, or "to" when
    "output" is missing or empty. Anything that is not a list yields an
    empty set. Non-dict entries and non-string or undotted targets are
    ignored.
    """
    if not isinstance(rewrites, list):
        return set()

    targets: Set[str] = set()
    for entry in rewrites:
        if isinstance(entry, dict):
            # Standard SDL rewrite: {"input": "...", "output": "field.name"}
            target = entry.get("output") or entry.get("to")
            if isinstance(target, str) and "." in target and target not in _SKIP_FIELD_NAMES:
                targets.add(target)
    return targets
|
||||
|
||||
|
||||
def _walk_mappings(node) -> Set[str]:
    """Collect dotted target fields from a nested SDL mappings structure.

    Walks dicts and lists recursively. In every dict it looks for
    {"copy": {..., "to": "field.name"}} and
    {"constant": {..., "field": "field.name"}} and keeps the targets that
    are strings containing a dot.
    """
    found: Set[str] = set()

    if isinstance(node, list):
        for child in node:
            found.update(_walk_mappings(child))
        return found

    if not isinstance(node, dict):
        return found

    # (operation key, attribute that names the target field)
    for operation, target_key in (("copy", "to"), ("constant", "field")):
        spec = node.get(operation)
        if isinstance(spec, dict):
            target = spec.get(target_key)
            if isinstance(target, str) and "." in target:
                found.add(target)

    for child in node.values():
        found.update(_walk_mappings(child))
    return found
|
||||
|
||||
|
||||
def extract_parser_fields(parser_json: dict) -> Set[str]:
    """
    Extract output field names from an SDL parser JSON dict.

    Handles: attributes lists, fields lists, mappings targets,
    rewrites[].output, rewrites[].to, copy.to, constant.field.

    Sections that are missing, ``null`` or of an unexpected type are skipped
    instead of raising, and only string names are collected. A non-dict
    *parser_json* yields an empty set.
    """
    fields: Set[str] = set()
    if not isinstance(parser_json, dict):
        return fields

    def _list_section(key: str) -> list:
        # A section counts only when it really is a list; null or a dict
        # stored under the same key is treated as absent.
        value = parser_json.get(key)
        return value if isinstance(value, list) else []

    def _name_of(entry):
        # Return entry["name"] when entry is a dict with a string name.
        name = entry.get("name") if isinstance(entry, dict) else None
        return name if isinstance(name, str) else None

    # Legacy: attributes as list of {name: ...}
    for attr in _list_section("attributes"):
        name = _name_of(attr)
        if name is not None:
            fields.add(name)

    # Legacy: fields list
    for field in _list_section("fields"):
        if isinstance(field, str):
            fields.add(field)
        else:
            name = _name_of(field)
            if name is not None:
                fields.add(name)

    # Legacy: flat mappings list with "target"
    for mapping in _list_section("mappings"):
        if isinstance(mapping, dict) and isinstance(mapping.get("target"), str):
            fields.add(mapping["target"])

    # SDL rewrites[].output in top-level formats[]
    for fmt in _list_section("formats"):
        if isinstance(fmt, dict):
            fields |= _extract_rewrite_fields(fmt.get("rewrites", []))

    # SDL mappings block (nested transformations with copy.to / constant.field)
    mappings_block = parser_json.get("mappings")
    if isinstance(mappings_block, dict):
        fields |= _walk_mappings(mappings_block)

    # observables.fields[].name (dotted names only)
    observables = parser_json.get("observables")
    observable_fields = observables.get("fields") if isinstance(observables, dict) else None
    if isinstance(observable_fields, list):
        for obs in observable_fields:
            name = _name_of(obs)
            if name is not None and "." in name:
                fields.add(name)

    return fields
|
||||
@@ -0,0 +1,344 @@
|
||||
import os
|
||||
import asyncio
|
||||
import httpx
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# All settings below are read from the process environment once, when this
# module is imported.
BASE_URL = os.environ.get("S1_BASE_URL", "https://demo.sentinelone.net").rstrip("/")
TOKEN = os.environ.get("S1_API_TOKEN", "")

# Scalyr/XDR PowerQuery credentials — from SDL_XDR_URL + SDL_LOG_READ_KEY
# in the SentinelOne console: Settings → Integrations → Data Lake API Keys
SDL_XDR_URL = os.environ.get("SDL_XDR_URL", "https://xdr.us1.sentinelone.net").rstrip("/")
SDL_LOG_READ_KEY = os.environ.get("SDL_LOG_READ_KEY", "")

# SDL Configuration Read Key — used to list/fetch parser files under /logParsers/
# (separate from SDL_LOG_READ_KEY which is for querying events only).
# Find it in the S1 console: Settings → Integrations → Data Lake API Keys → Configuration Read.
SDL_CONFIG_READ_KEY = os.environ.get("SDL_CONFIG_READ_KEY", "")

# Management Console API uses ApiToken auth
HEADERS = {
    "Authorization": f"ApiToken {TOKEN}",
    "Content-Type": "application/json",
}
|
||||
|
||||
|
||||
def _iso_to_epoch_ms(iso_str: str) -> int:
    """Convert ISO-8601 UTC string to epoch milliseconds for Scalyr API.

    A trailing ``Z`` is accepted. A timestamp without any UTC offset is
    interpreted as UTC rather than as local time. Sub-millisecond digits are
    truncated.
    """
    dt = datetime.fromisoformat(iso_str.replace("Z", "+00:00"))
    if dt.tzinfo is None:
        # .timestamp() would read a naive value as local time.
        dt = dt.replace(tzinfo=timezone.utc)
    # Integer arithmetic on the timedelta avoids float rounding.
    delta = dt - datetime(1970, 1, 1, tzinfo=timezone.utc)
    return delta.days * 86_400_000 + delta.seconds * 1000 + delta.microseconds // 1000
|
||||
|
||||
|
||||
async def get_star_rules(page_size: int = 100) -> list:
    """Return all custom STAR rules from /cloud-detection/rules.

    Pages of *page_size* rules are requested until the response no longer
    carries a ``pagination.nextCursor``. A non-2xx response raises via
    ``raise_for_status``.
    """
    endpoint = f"{BASE_URL}/web/api/v2.1/cloud-detection/rules"
    collected = []
    next_cursor = None
    async with httpx.AsyncClient(timeout=30) as client:
        while True:
            query = {"limit": page_size}
            if next_cursor:
                query["cursor"] = next_cursor
            response = await client.get(endpoint, headers=HEADERS, params=query)
            response.raise_for_status()
            page = response.json()
            collected.extend(page.get("data", []))
            next_cursor = page.get("pagination", {}).get("nextCursor")
            if not next_cursor:
                return collected
|
||||
|
||||
|
||||
async def get_library_rules(page_size: int = 100) -> list:
    """
    Return the Detection Library (OOTB/Platform) rules from
    /web/api/v2.1/detection-library/rules, normalised to a fixed set of keys.

    Requires an account-level or higher API token — site-scoped tokens will
    receive a 400, in which case an empty list is returned instead of raising.
    Other non-2xx responses raise via ``raise_for_status``.
    """
    endpoint = f"{BASE_URL}/web/api/v2.1/detection-library/rules"
    raw_rules = []
    next_cursor = None
    async with httpx.AsyncClient(timeout=60) as client:
        while True:
            query: dict = {"limit": page_size}
            if next_cursor:
                query["cursor"] = next_cursor
            response = await client.get(endpoint, headers=HEADERS, params=query)
            # 400 typically means site-scoped token — return empty rather than crash
            if response.status_code == 400:
                return []
            response.raise_for_status()
            page = response.json()
            raw_rules.extend(page.get("data", []))
            next_cursor = page.get("pagination", {}).get("nextCursor")
            if not next_cursor:
                break

    return [
        {
            "id": str(rule.get("id", "")),
            "name": rule.get("name", "unnamed"),
            "s1ql": rule.get("s1ql") or rule.get("query", ""),
            "queryType": rule.get("queryType", "events"),
            "severity": rule.get("severity", ""),
            "description": rule.get("description", ""),
            "gdlRuleId": rule.get("id", ""),
            "creator": "SentinelOne",
            "expirationMode": rule.get("expirationMode", "Permanent"),
        }
        for rule in raw_rules
    ]
|
||||
|
||||
|
||||
async def run_powerquery(query: str, from_date: str, to_date: str) -> dict:
    """
    Execute a PowerQuery on the Singularity Data Lake through the Scalyr XDR
    API (SDL_XDR_URL + SDL_LOG_READ_KEY) and return ``{"events": [...]}``.

    The request is sent up to three times: an HTTP 429 on the first or second
    attempt is retried after 10 s and 20 s respectively. Any other HTTP error
    status, or a 429 on the last attempt, raises RuntimeError. A missing key
    or a non-"success" status is reported as ``{"events": [], "error": ...}``
    instead of raising.
    """
    if not SDL_LOG_READ_KEY:
        return {"events": [], "error": "SDL_LOG_READ_KEY not configured — add it to .env"}

    request_body = {
        "token": SDL_LOG_READ_KEY,
        "query": query,
        "startTime": _iso_to_epoch_ms(from_date),
        "endTime": _iso_to_epoch_ms(to_date),
        "maxCount": 1000,
    }

    last_attempt = 2
    attempt = 0
    async with httpx.AsyncClient(timeout=120) as client:
        while True:
            try:
                resp = await client.post(
                    f"{SDL_XDR_URL}/api/powerQuery",
                    json=request_body,
                )
                resp.raise_for_status()
            except httpx.HTTPStatusError as e:
                rate_limited = e.response.status_code == 429
                if not (rate_limited and attempt < last_attempt):
                    raise RuntimeError(
                        f"HTTP {e.response.status_code} from {e.request.url}: {e.response.text[:500]}"
                    ) from e
                # Linear back-off before the next try.
                await asyncio.sleep(10 * (attempt + 1))
                attempt += 1
            else:
                break

    data = resp.json()
    status = data.get("status", "")
    if status != "success":
        # Return full response as error detail for debugging
        return {"events": [], "error": f"PowerQuery status={status}: {str(data)[:400]}"}

    # Scalyr PowerQuery returns: {"status":"success","columns":[{"name":"..."},...], "values":[[...],...],...}
    raw_cols = data.get("columns", [])
    values = data.get("values", [])
    if not (raw_cols and values):
        # Fallback: return raw matches array
        return {"events": data.get("matches", [])}

    # columns may be list of strings or list of {"name":...} dicts
    col_names = [col["name"] if isinstance(col, dict) else col for col in raw_cols]
    return {"events": [dict(zip(col_names, row)) for row in values]}
|
||||
|
||||
|
||||
def _sdl_config_headers() -> dict:
    """Build the request headers for the SDL Configuration File API.

    The bearer token is SDL_CONFIG_READ_KEY when it is set; otherwise
    SDL_LOG_READ_KEY is used as a stand-in. The stand-in is not expected to be
    accepted by every endpoint, but it means the request is still sent and
    the caller sees the server's rejection rather than a local failure.
    """
    token = SDL_LOG_READ_KEY
    if SDL_CONFIG_READ_KEY:
        token = SDL_CONFIG_READ_KEY

    headers = {"Authorization": f"Bearer {token}"}
    headers["Content-Type"] = "application/json"
    return headers
|
||||
|
||||
|
||||
async def list_sdl_parsers() -> list[str]:
    """Return the parser names stored under /logParsers/ on the SDL tenant.

    Issues POST <SDL_XDR_URL>/api/listFiles with {"pathPrefix": "/logParsers/"}
    using the headers from ``_sdl_config_headers`` (SDL_CONFIG_READ_KEY or
    higher is required in .env).

    Each returned name has the leading /logParsers/ removed so it can be used
    directly as a filename in the local parsers/ directory. Entries that are
    not under /logParsers/ are dropped.

    Raises:
        httpx.HTTPStatusError: The API answered with an error status.
    """
    async with httpx.AsyncClient(timeout=30) as client:
        resp = await client.post(
            f"{SDL_XDR_URL}/api/listFiles",
            headers=_sdl_config_headers(),
            json={"pathPrefix": "/logParsers/"},
        )
        resp.raise_for_status()
        payload = resp.json()

    # The listing is read from "paths", falling back to "files".
    entries = payload.get("paths") or payload.get("files") or []

    def _entry_path(entry):
        # Dict entries carry the path under "path" (or "name").
        if isinstance(entry, dict):
            return entry.get("path") or entry.get("name") or ""
        return entry

    prefix_len = len("/logParsers/")
    return [
        candidate[prefix_len:]
        for candidate in map(_entry_path, entries)
        if isinstance(candidate, str) and candidate.startswith("/logParsers/")
    ]
|
||||
|
||||
|
||||
async def list_sdl_parsers_legacy() -> list[str]:
    """[Deprecated] Legacy management-console path — kept for reference but unused.

    Issues GET <BASE_URL>/api/v1/files/logParsers and returns, for every file
    object in the response, its "name" (or "path" when the name is empty).

    Raises:
        httpx.HTTPStatusError: The API answered with an error status.
    """
    async with httpx.AsyncClient(timeout=30) as client:
        resp = await client.get(
            f"{BASE_URL}/api/v1/files/logParsers",
            headers=HEADERS,
        )
        resp.raise_for_status()
        data = resp.json()

    # Response is a list of file objects or a dict with 'files' key
    if isinstance(data, list):
        files = data
    elif isinstance(data, dict):
        files = data.get("files") or []
    else:
        files = []

    # Non-dict entries are skipped in both response shapes (previously only
    # the list shape was filtered, so the dict shape could raise).
    return [f.get("name") or f.get("path", "") for f in files if isinstance(f, dict)]
|
||||
|
||||
|
||||
async def get_sdl_parser(filename: str) -> dict:
    """Download one SDL parser file through POST /api/getFile.

    Args:
        filename: Parser name, with or without the leading /logParsers/.

    Returns:
        The decoded SDL response as-is, e.g.
        {"status": "success", "path": "/logParsers/Foo", "content": "...", "version": 3, ...}

    Raises:
        httpx.HTTPStatusError: The API answered with an error status.
    """
    # Accept both bare names and fully-qualified paths.
    if filename.startswith("/logParsers/"):
        path = filename
    else:
        path = f"/logParsers/{filename}"

    request_body = {"path": path}
    async with httpx.AsyncClient(timeout=30) as client:
        resp = await client.post(
            f"{SDL_XDR_URL}/api/getFile",
            headers=_sdl_config_headers(),
            json=request_body,
        )
        resp.raise_for_status()
        return resp.json()
|
||||
|
||||
|
||||
async def get_account_id() -> str | None:
    """Look up the first account ID the current token can see.

    /accounts is queried first. When it answers 401 or 403 (the token cannot
    list accounts, e.g. a site-scoped token), /sites is queried instead and
    the accountId of the first site is used.

    Returns:
        The account ID as a string, or None when neither lookup yields one.
    """
    async with httpx.AsyncClient(timeout=15) as client:
        # First choice: the accounts listing.
        accounts_resp = await client.get(
            f"{BASE_URL}/web/api/v2.1/accounts",
            headers=HEADERS,
            params={"limit": 1},
        )
        status = accounts_resp.status_code
        if status == 200:
            found = accounts_resp.json().get("data", [])
            if found:
                return str(found[0]["id"])

        # Only an auth rejection triggers the sites fallback.
        if status not in (401, 403):
            return None

        sites_resp = await client.get(
            f"{BASE_URL}/web/api/v2.1/sites",
            headers=HEADERS,
            params={"limit": 1},
        )
        if sites_resp.status_code != 200:
            return None

        # "data" is either {"sites": [...]} or the list itself.
        payload = sites_resp.json().get("data", {})
        if isinstance(payload, dict):
            site_rows = payload.get("sites")
        else:
            site_rows = payload
        if not site_rows:
            return None

        account_id = site_rows[0].get("accountId")
        return str(account_id or "") or None
|
||||
|
||||
|
||||
async def get_scope_for_platform_rules() -> tuple[str, str] | None:
    """Choose the scope to use when calling /detection-library/platform-rules.

    Account scope is preferred. If /accounts yields no account (for example
    because the token is site-scoped and cannot list accounts), the first site
    from /sites is used instead.

    Returns:
        ("account", <id>) or ("site", <id>), or None when no scope is found.
    """
    async with httpx.AsyncClient(timeout=15) as client:
        # Broadest scope first.
        accounts_resp = await client.get(
            f"{BASE_URL}/web/api/v2.1/accounts",
            headers=HEADERS,
            params={"limit": 1},
        )
        if accounts_resp.status_code == 200:
            account_rows = accounts_resp.json().get("data", [])
            if account_rows:
                return ("account", str(account_rows[0]["id"]))

        # No account available: try the site listing.
        sites_resp = await client.get(
            f"{BASE_URL}/web/api/v2.1/sites",
            headers=HEADERS,
            params={"limit": 1},
        )
        if sites_resp.status_code != 200:
            return None

        # "data" is either {"sites": [...]} or the list itself.
        payload = sites_resp.json().get("data", {})
        if isinstance(payload, dict):
            site_rows = payload.get("sites")
        else:
            site_rows = payload
        if not site_rows:
            return None

        site_id = site_rows[0].get("id")
        if not site_id:
            return None
        return ("site", str(site_id))
|
||||
|
||||
|
||||
async def get_platform_rules(page_size: int = 1000) -> list:
    """Fetch all Detection Library platform rules from /detection-library/platform-rules.

    Requires scopeLevel + scopeId. Tries account scope first, then site scope
    (via ``get_scope_for_platform_rules``) so site-scoped tokens also work.

    Args:
        page_size: Value sent as ``limit`` on every page request.

    Returns:
        The concatenated ``data`` arrays of all pages. An empty list when no
        scope could be determined or when the API answers 400.

    Raises:
        httpx.HTTPStatusError: The API answered with an error status other
            than 400.
    """
    scope = await get_scope_for_platform_rules()
    if not scope:
        return []
    scope_level, scope_id = scope

    all_rules: list = []
    cursor: str = ""
    async with httpx.AsyncClient(timeout=60) as client:
        while True:
            params: dict = {
                "scopeLevel": scope_level,
                "scopeId": scope_id,
                "limit": page_size,
            }
            # Send the cursor only once the API has handed one out. An empty
            # "cursor=" on the first page could be rejected, and that 400
            # would be swallowed below.
            if cursor:
                params["cursor"] = cursor
            resp = await client.get(
                f"{BASE_URL}/web/api/v2.1/detection-library/platform-rules",
                headers=HEADERS,
                params=params,
            )
            if resp.status_code == 400:
                return []
            resp.raise_for_status()
            body = resp.json()
            all_rules.extend(body.get("data", []))
            cursor = body.get("pagination", {}).get("nextCursor") or ""
            if not cursor:
                break
    return all_rules
|
||||
|
||||
|
||||
async def get_sites() -> list:
    """Return the sites visible to the current token.

    Issues a single GET /web/api/v2.1/sites with ``limit=100``; no further
    pages are requested, so at most one page of sites is returned.

    Returns:
        The list of site objects (empty when the response carries none).

    Raises:
        httpx.HTTPStatusError: The API answered with an error status.
    """
    async with httpx.AsyncClient(timeout=30) as client:
        resp = await client.get(
            f"{BASE_URL}/web/api/v2.1/sites",
            headers=HEADERS,
            params={"limit": 100},
        )
        resp.raise_for_status()
        data = resp.json().get("data", {})

    # "data" is either {"sites": [...]} or the list itself. The other /sites
    # callers in this module accept both shapes, so do the same here.
    sites = data.get("sites") if isinstance(data, dict) else data
    return sites or []
|
||||
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash
# Build the images and bring the stack up with docker-compose.
# Any arguments given to this script are forwarded unchanged to
# `docker-compose up` (for example `-d`).
set -e  # stop at the first failing command
echo "==> Starting Docker containers..."
docker-compose up --build "$@"
|
||||
@@ -0,0 +1,3 @@
|
||||
-- Tables are created by SQLAlchemy on startup.
|
||||
-- This file exists for the postgres healthcheck mount.
|
||||
SELECT 1;
|
||||
@@ -0,0 +1,46 @@
|
||||
services:
  # UI container built from ./frontend; published on host port 3001.
  frontend:
    build: ./frontend
    ports:
      - "3001:3000"
    depends_on:
      - backend

  # API container built from ./backend; published on host port 8001.
  backend:
    build: ./backend
    ports:
      - "8001:8000"
    environment:
      # ${...} values are substituted by compose when the stack is started.
      - S1_API_TOKEN=${S1_API_TOKEN}
      - S1_BASE_URL=${S1_BASE_URL}
      - SDL_XDR_URL=${SDL_XDR_URL}
      - SDL_LOG_READ_KEY=${SDL_LOG_READ_KEY}
      - SDL_CONFIG_READ_KEY=${SDL_CONFIG_READ_KEY}
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
      # Credentials and database name match the POSTGRES_* values of the db service.
      - DATABASE_URL=postgresql://siem:siem@db:5432/siem
      - DETECTIONS_FILE=/app/data/detections.json
    depends_on:
      # Wait until the db healthcheck passes before starting the backend.
      db:
        condition: service_healthy
    volumes:
      - ./parsers:/app/parsers
      - ./.env:/app/.env
      # Mounted read-only; DETECTIONS_FILE above points into this directory.
      - ./data:/app/data:ro

  db:
    image: postgres:16-alpine
    environment:
      - POSTGRES_DB=siem
      - POSTGRES_USER=siem
      - POSTGRES_PASSWORD=siem
    volumes:
      # Named volume keeps the database across container re-creation.
      - pgdata:/var/lib/postgresql/data
      - ./db/init.sql:/docker-entrypoint-initdb.d/init.sql
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U siem"]
      interval: 5s
      timeout: 5s
      retries: 5

volumes:
  pgdata:
|
||||
@@ -0,0 +1,2 @@
|
||||
node_modules
|
||||
.next
|
||||
@@ -0,0 +1,4 @@
|
||||
# Static frontend image: nginx serving a single prebuilt index.html.
FROM nginx:alpine
COPY index.html /usr/share/nginx/html/index.html
# Replaces the image's default server config with the project's own.
COPY nginx.conf /etc/nginx/conf.d/default.conf
# NOTE(review): presumably matches the `listen` port in nginx.conf — confirm.
EXPOSE 3000
|
||||
+1171
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,6 @@
|
||||
/** @type {import('next').NextConfig} */
const nextConfig = {
  // Produce a static export instead of a server build.
  output: 'export',
  // Emit routes with a trailing slash.
  trailingSlash: true,
}
module.exports = nextConfig
|
||||
@@ -0,0 +1,9 @@
|
||||
# Serves the static files under /usr/share/nginx/html on port 3000.
server {
    listen 3000;
    root /usr/share/nginx/html;
    index index.html;

    location / {
        # Try the exact file, then a directory, then "<uri>.html";
        # anything else falls back to /index.html.
        try_files $uri $uri/ $uri.html /index.html;
    }
}
|
||||
@@ -0,0 +1,29 @@
|
||||
{
|
||||
"name": "siem-toolkit",
|
||||
"version": "1.0.0",
|
||||
"private": true,
|
||||
"scripts": {
|
||||
"dev": "next dev",
|
||||
"build": "next build",
|
||||
"start": "next start"
|
||||
},
|
||||
"dependencies": {
|
||||
"next": "14.2.5",
|
||||
"react": "18.3.1",
|
||||
"react-dom": "18.3.1",
|
||||
"recharts": "2.12.7",
|
||||
"@tanstack/react-query": "5.56.2",
|
||||
"axios": "1.7.7",
|
||||
"clsx": "2.1.1",
|
||||
"lucide-react": "0.441.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "22.5.4",
|
||||
"@types/react": "18.3.5",
|
||||
"@types/react-dom": "18.3.0",
|
||||
"autoprefixer": "10.4.20",
|
||||
"postcss": "8.4.45",
|
||||
"tailwindcss": "3.4.11",
|
||||
"typescript": "5.6.2"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
// PostCSS pipeline: Tailwind first, then Autoprefixer, both with default options.
module.exports = {
  plugins: { tailwindcss: {}, autoprefixer: {} },
}
|
||||
@@ -0,0 +1,232 @@
|
||||
'use client'
|
||||
|
||||
import { useState, useRef } from 'react'
|
||||
import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'
|
||||
import { api } from '@/lib/api'
|
||||
import clsx from 'clsx'
|
||||
|
||||
// One entry of CoverageMap.fields, keyed there by field name.
type FieldDetail = {
  in_parser: boolean
  parser_name: string | null
  rule_count: number
  rules: { rule: string; type: string }[]
  // Drives the badge colour (STATUS_STYLE) and text (STATUS_LABEL) in the table.
  status: 'covered' | 'unused' | 'missing_parser'
}

// Shape of the /api/coverage/map response as consumed by CoveragePage.
type CoverageMap = {
  // Totals shown in the five summary tiles.
  summary: {
    total_parser_fields: number
    total_rule_fields: number
    covered: number
    parsed_but_unused: number
    rules_missing_parser: number
  }
  fields: Record<string, FieldDetail>
}

// Tailwind classes for the status badge, one entry per FieldDetail.status value.
const STATUS_STYLE = {
  covered: 'bg-emerald-900/50 text-emerald-300 border-emerald-700',
  unused: 'bg-yellow-900/50 text-yellow-300 border-yellow-700',
  missing_parser: 'bg-red-900/50 text-red-300 border-red-700',
}

// Human-readable label per status; also used for the filter pills.
const STATUS_LABEL = {
  covered: 'Covered',
  unused: 'Unused (reduce candidate)',
  missing_parser: 'Missing parser',
}
|
||||
|
||||
/**
 * Parser Coverage Map page.
 *
 * Loads the coverage map from /api/coverage/map and renders summary tiles,
 * status filter pills and a per-field table. The header buttons load STAR
 * rules, upload Sigma rules / a parser file, or reset the data; every
 * successful action refetches the coverage map.
 */
export default function CoveragePage() {
  const qc = useQueryClient()
  const sigmaRef = useRef<HTMLInputElement>(null)
  const parserRef = useRef<HTMLInputElement>(null)
  const [filter, setFilter] = useState<'all' | 'covered' | 'unused' | 'missing_parser'>('all')
  const [err, setErr] = useState('')

  const { data, isLoading, isError, error } = useQuery<CoverageMap>({
    queryKey: ['coverage-map'],
    queryFn: () => api.get('/api/coverage/map'),
  })

  // Shared by all mutations: drop any stale error banner and refetch the map.
  const handleSuccess = () => {
    setErr('')
    return qc.invalidateQueries({ queryKey: ['coverage-map'] })
  }
  const handleError = (e: Error) => setErr(e.message)

  const loadStar = useMutation({
    mutationFn: () => api.post('/api/coverage/load-star-rules', {}),
    onSuccess: handleSuccess,
    onError: handleError,
  })

  const uploadSigma = useMutation({
    mutationFn: async (files: FileList) => {
      const form = new FormData()
      Array.from(files).forEach((f) => form.append('files', f))
      return api.postForm('/api/coverage/upload-sigma', form)
    },
    onSuccess: handleSuccess,
    onError: handleError,
  })

  const uploadParser = useMutation({
    mutationFn: async (file: File) => {
      const form = new FormData()
      form.append('file', file)
      return api.postForm('/api/coverage/upload-parser', form)
    },
    onSuccess: handleSuccess,
    onError: handleError,
  })

  // NOTE(review): the reset is issued as a GET — confirm this matches the backend route.
  const reset = useMutation({
    mutationFn: () => api.get('/api/coverage/reset'),
    onSuccess: handleSuccess,
    // Previously a failed reset was silent; surface it like the other actions.
    onError: handleError,
  })

  const fields = data
    ? Object.entries(data.fields).filter(
        ([, d]) => filter === 'all' || d.status === filter
      )
    : []

  // A running reset also blocks the buttons, so actions cannot overlap it.
  const busy =
    loadStar.isPending || uploadSigma.isPending || uploadParser.isPending || reset.isPending

  return (
    <div className="p-8 max-w-6xl">
      <div className="flex items-start justify-between mb-6">
        <div>
          <h1 className="text-xl font-bold text-white">Parser Coverage Map</h1>
          <p className="text-sm text-gray-400 mt-1">
            Cross-reference SDL parser fields against STAR / Sigma rule fields
          </p>
        </div>
        <div className="flex gap-2 flex-wrap justify-end">
          <button
            onClick={() => loadStar.mutate()}
            disabled={busy}
            className="px-3 py-1.5 text-sm bg-purple-700 hover:bg-purple-600 disabled:opacity-50 rounded-lg text-white"
          >
            {loadStar.isPending ? 'Loading…' : 'Load STAR Rules'}
          </button>
          <button
            onClick={() => sigmaRef.current?.click()}
            disabled={busy}
            className="px-3 py-1.5 text-sm bg-gray-700 hover:bg-gray-600 disabled:opacity-50 rounded-lg text-white"
          >
            Upload Sigma Rules
          </button>
          <button
            onClick={() => parserRef.current?.click()}
            disabled={busy}
            className="px-3 py-1.5 text-sm bg-gray-700 hover:bg-gray-600 disabled:opacity-50 rounded-lg text-white"
          >
            Upload Parser
          </button>
          <button
            onClick={() => reset.mutate()}
            disabled={busy}
            className="px-3 py-1.5 text-sm bg-red-900/60 hover:bg-red-800 disabled:opacity-50 rounded-lg text-red-300"
          >
            Reset
          </button>
        </div>
      </div>

      {/* Hidden file pickers, opened by the two upload buttons above */}
      <input
        ref={sigmaRef}
        type="file"
        accept=".yml,.yaml"
        multiple
        className="hidden"
        onChange={(e) => e.target.files && uploadSigma.mutate(e.target.files)}
      />
      <input
        ref={parserRef}
        type="file"
        accept=".json"
        className="hidden"
        onChange={(e) => e.target.files?.[0] && uploadParser.mutate(e.target.files[0])}
      />

      {err && (
        <div className="mb-4 p-3 bg-red-900/40 border border-red-700 rounded-lg text-sm text-red-300">
          {err}
        </div>
      )}

      {data && (
        <div className="grid grid-cols-5 gap-3 mb-6">
          {[
            { label: 'Parser Fields', value: data.summary.total_parser_fields, color: 'text-gray-200' },
            { label: 'Rule Fields', value: data.summary.total_rule_fields, color: 'text-gray-200' },
            { label: 'Covered', value: data.summary.covered, color: 'text-emerald-400' },
            { label: 'Parsed Unused', value: data.summary.parsed_but_unused, color: 'text-yellow-400' },
            { label: 'Missing Parser', value: data.summary.rules_missing_parser, color: 'text-red-400' },
          ].map(({ label, value, color }) => (
            <div key={label} className="bg-gray-900 border border-gray-800 rounded-lg p-4 text-center">
              <div className={`text-2xl font-bold ${color}`}>{value}</div>
              <div className="text-xs text-gray-500 mt-1">{label}</div>
            </div>
          ))}
        </div>
      )}

      <div className="flex gap-2 mb-4">
        {(['all', 'covered', 'unused', 'missing_parser'] as const).map((f) => (
          <button
            key={f}
            onClick={() => setFilter(f)}
            className={clsx(
              'px-3 py-1 text-xs rounded-full border transition-colors',
              filter === f
                ? 'bg-purple-700 border-purple-600 text-white'
                : 'border-gray-700 text-gray-400 hover:border-gray-500'
            )}
          >
            {f === 'all' ? 'All' : STATUS_LABEL[f]}
          </button>
        ))}
      </div>

      {isLoading ? (
        <div className="text-gray-500 text-sm">Loading…</div>
      ) : isError ? (
        // A failed fetch used to fall through to the "upload to begin" hint.
        <div className="text-red-400 text-sm">{String(error)}</div>
      ) : fields.length === 0 ? (
        <div className="text-gray-600 text-sm">
          {data ? 'No fields match this filter.' : 'Load STAR rules or upload parsers to begin.'}
        </div>
      ) : (
        <div className="overflow-x-auto">
          <table className="w-full text-sm">
            <thead>
              <tr className="text-left text-gray-500 border-b border-gray-800">
                <th className="pb-2 pr-4 font-medium">Field</th>
                <th className="pb-2 pr-4 font-medium">Status</th>
                <th className="pb-2 pr-4 font-medium">Parser</th>
                <th className="pb-2 font-medium">Rules using it</th>
              </tr>
            </thead>
            <tbody>
              {fields.map(([field, detail]) => (
                <tr key={field} className="border-b border-gray-800/50 hover:bg-gray-900/30">
                  <td className="py-2 pr-4 font-mono text-xs text-gray-200">{field}</td>
                  <td className="py-2 pr-4">
                    <span
                      className={clsx(
                        'px-2 py-0.5 rounded text-xs border',
                        STATUS_STYLE[detail.status]
                      )}
                    >
                      {STATUS_LABEL[detail.status]}
                    </span>
                  </td>
                  <td className="py-2 pr-4 text-xs text-gray-400">{detail.parser_name ?? '—'}</td>
                  <td className="py-2 text-xs text-gray-400">
                    {detail.rule_count > 0
                      ? detail.rules.map((r) => r.rule).join(', ')
                      : '—'}
                  </td>
                </tr>
              ))}
            </tbody>
          </table>
        </div>
      )}
    </div>
  )
}
|
||||
@@ -0,0 +1,3 @@
|
||||
@tailwind base;
|
||||
@tailwind components;
|
||||
@tailwind utilities;
|
||||
@@ -0,0 +1,169 @@
|
||||
'use client'
|
||||
|
||||
import { useState } from 'react'
|
||||
import { useQuery, useMutation } from '@tanstack/react-query'
|
||||
import {
|
||||
BarChart, Bar, XAxis, YAxis, Tooltip, ResponsiveContainer, CartesianGrid,
|
||||
} from 'recharts'
|
||||
import { api } from '@/lib/api'
|
||||
|
||||
// One row of the top-sources response; the source name arrives under the literal key 'src.name'.
type SourceRow = { 'src.name': string; events: number }
// One bar of the daily-volume chart.
type DayRow = { date: string; events: number }
|
||||
|
||||
// Conversion factor used for every GB estimate on this page. The same value
// is sent to the simulator endpoint so both estimates agree.
const GB_PER_MILLION_EVENTS = 0.5

/**
 * Ingest Dashboard page.
 *
 * Shows daily event volume and the top sources for the selected window
 * (7 / 14 / 30 days), plus a filter simulator that asks the backend how many
 * events and GB a source / event-type combination accounts for.
 */
export default function IngestPage() {
  const [days, setDays] = useState(7)
  const [simSource, setSimSource] = useState('')
  const [simEventType, setSimEventType] = useState('')
  const [simResult, setSimResult] = useState<Record<string, unknown> | null>(null)
  const [simErr, setSimErr] = useState('')

  const sources = useQuery<{ data: SourceRow[] }>({
    queryKey: ['top-sources', days],
    queryFn: () => api.get(`/api/ingest/top-sources?days=${days}`),
  })

  const daily = useQuery<DayRow[]>({
    queryKey: ['daily-volume', days],
    queryFn: () => api.get(`/api/ingest/daily-volume?days=${days}`),
  })

  const simulate = useMutation({
    mutationFn: () =>
      api.post<Record<string, unknown>>('/api/ingest/simulate-filter', {
        source: simSource,
        event_type: simEventType,
        days,
        gb_per_million_events: GB_PER_MILLION_EVENTS,
      }),
    onSuccess: (data) => { setSimResult(data); setSimErr('') },
    onError: (e: Error) => setSimErr(e.message),
  })

  // Only the first 15 sources are listed.
  const chartData = (sources.data?.data ?? []).slice(0, 15).map((r) => ({
    name: r['src.name'] ?? 'unknown',
    events: r.events ?? 0,
  }))

  return (
    <div className="p-8 max-w-6xl">
      <div className="flex items-center justify-between mb-6">
        <div>
          <h1 className="text-xl font-bold text-white">Ingest Dashboard</h1>
          <p className="text-sm text-gray-400 mt-1">Event volume · cost projection · filter simulator</p>
        </div>
        <div className="flex gap-2">
          {[7, 14, 30].map((d) => (
            <button
              key={d}
              onClick={() => setDays(d)}
              className={`px-3 py-1.5 text-xs rounded-lg border transition-colors ${
                days === d
                  ? 'bg-purple-700 border-purple-600 text-white'
                  : 'border-gray-700 text-gray-400 hover:border-gray-500'
              }`}
            >
              {d}d
            </button>
          ))}
        </div>
      </div>

      {/* Daily volume chart */}
      <div className="bg-gray-900 border border-gray-800 rounded-xl p-5 mb-5">
        <h2 className="text-sm font-medium text-gray-300 mb-4">Daily Event Volume</h2>
        {daily.isLoading ? (
          <div className="text-gray-600 text-sm h-32 flex items-center">Loading…</div>
        ) : daily.isError ? (
          // A failed fetch used to render as an empty chart.
          <div className="text-red-400 text-sm">{String(daily.error)}</div>
        ) : (
          <ResponsiveContainer width="100%" height={160}>
            <BarChart data={daily.data ?? []}>
              <CartesianGrid strokeDasharray="3 3" stroke="#1f2937" />
              <XAxis dataKey="date" tick={{ fontSize: 10, fill: '#6b7280' }} />
              <YAxis tick={{ fontSize: 10, fill: '#6b7280' }} />
              <Tooltip
                contentStyle={{ background: '#111827', border: '1px solid #374151', fontSize: 12 }}
                labelStyle={{ color: '#d1d5db' }}
              />
              <Bar dataKey="events" fill="#7c3aed" radius={[3, 3, 0, 0]} />
            </BarChart>
          </ResponsiveContainer>
        )}
      </div>

      {/* Top sources table */}
      <div className="bg-gray-900 border border-gray-800 rounded-xl p-5 mb-5">
        <h2 className="text-sm font-medium text-gray-300 mb-4">Top Sources — last {days}d</h2>
        {sources.isLoading ? (
          <div className="text-gray-600 text-sm">Loading…</div>
        ) : sources.isError ? (
          <div className="text-red-400 text-sm">{String(sources.error)}</div>
        ) : (
          <table className="w-full text-sm">
            <thead>
              <tr className="text-left text-gray-500 border-b border-gray-800">
                <th className="pb-2 font-medium">Source</th>
                <th className="pb-2 font-medium text-right">Events</th>
                <th className="pb-2 font-medium text-right">Est. GB</th>
              </tr>
            </thead>
            <tbody>
              {chartData.map((row, i) => (
                // Index is part of the key: several rows can share the
                // 'unknown' fallback name, which would collide as keys.
                <tr key={`${row.name}-${i}`} className="border-b border-gray-800/50">
                  <td className="py-2 font-mono text-xs text-gray-200">{row.name}</td>
                  <td className="py-2 text-right text-gray-300">{row.events.toLocaleString()}</td>
                  <td className="py-2 text-right text-gray-400">
                    {(row.events / 1_000_000 * GB_PER_MILLION_EVENTS).toFixed(3)}
                  </td>
                </tr>
              ))}
            </tbody>
          </table>
        )}
      </div>

      {/* Filter simulator */}
      <div className="bg-gray-900 border border-gray-800 rounded-xl p-5">
        <h2 className="text-sm font-medium text-gray-300 mb-4">Filter Simulator</h2>
        <p className="text-xs text-gray-500 mb-4">
          Estimate events and GB eliminated by dropping a source + event type combination.
        </p>
        <div className="flex gap-3 flex-wrap mb-4">
          <input
            value={simSource}
            onChange={(e) => setSimSource(e.target.value)}
            placeholder="Source name (optional)"
            className="flex-1 min-w-48 bg-gray-800 border border-gray-700 rounded-lg px-3 py-2 text-sm text-gray-200 placeholder-gray-600 focus:outline-none focus:border-purple-600"
          />
          <input
            value={simEventType}
            onChange={(e) => setSimEventType(e.target.value)}
            placeholder="Event type (optional)"
            className="flex-1 min-w-48 bg-gray-800 border border-gray-700 rounded-lg px-3 py-2 text-sm text-gray-200 placeholder-gray-600 focus:outline-none focus:border-purple-600"
          />
          {/* At least one of the two inputs must be filled in */}
          <button
            onClick={() => simulate.mutate()}
            disabled={simulate.isPending || (!simSource && !simEventType)}
            className="px-4 py-2 text-sm bg-purple-700 hover:bg-purple-600 disabled:opacity-50 rounded-lg text-white"
          >
            {simulate.isPending ? 'Running…' : 'Simulate'}
          </button>
        </div>
        {simErr && <p className="text-red-400 text-sm">{simErr}</p>}
        {simResult && (
          <div className="grid grid-cols-2 md:grid-cols-4 gap-3">
            {[
              { label: 'Matched Events', value: String(simResult.matched_events ?? 0) },
              { label: `Est. GB (${days}d)`, value: String(simResult.estimated_gb_period ?? 0) },
              { label: 'Projected Monthly Events', value: String(simResult.projected_monthly_events ?? 0) },
              { label: 'Projected Monthly GB', value: String(simResult.projected_monthly_gb ?? 0) },
            ].map(({ label, value }) => (
              <div key={label} className="bg-gray-800 rounded-lg p-3 text-center">
                <div className="text-lg font-bold text-purple-300">{value}</div>
                <div className="text-xs text-gray-500 mt-1">{label}</div>
              </div>
            ))}
          </div>
        )}
      </div>
    </div>
  )
}
|
||||
@@ -0,0 +1,22 @@
|
||||
import type { Metadata } from 'next'
|
||||
import './globals.css'
|
||||
import Sidebar from '@/components/Sidebar'
|
||||
import QueryProvider from '@/components/QueryProvider'
|
||||
|
||||
// Document metadata (title / description) exported from the root layout.
export const metadata: Metadata = {
  title: 'SIEM Toolkit',
  description: 'SentinelOne AI-SIEM Engineering Toolkit',
}
|
||||
|
||||
/**
 * Root layout: wraps every page in the query provider and places the
 * sidebar next to a scrollable main area that holds the page content.
 */
export default function RootLayout({ children }: { children: React.ReactNode }) {
  return (
    <html lang="en">
      <body className="font-sans bg-gray-950 text-gray-100 h-screen flex overflow-hidden">
        <QueryProvider>
          <Sidebar />
          {/* Only the main area scrolls; the body itself hides overflow */}
          <main className="flex-1 overflow-auto">{children}</main>
        </QueryProvider>
      </body>
    </html>
  )
}
|
||||
@@ -0,0 +1,21 @@
|
||||
'use client'
|
||||
|
||||
import { useState } from 'react'
|
||||
import { Copy, Check } from 'lucide-react'
|
||||
|
||||
/**
 * Small "Copy" button that writes `text` to the clipboard and shows a
 * "Copied" confirmation for 1.5 seconds once the write has succeeded.
 */
export default function CopyButton({ text }: { text: string }) {
  const [copied, setCopied] = useState(false)

  const handleCopy = async () => {
    try {
      // navigator.clipboard is only available in secure contexts; when it is
      // missing, report that instead of throwing on the property access.
      if (!navigator.clipboard) {
        throw new Error('Clipboard API unavailable')
      }
      await navigator.clipboard.writeText(text)
      // Confirm only after the write actually resolved.
      setCopied(true)
      setTimeout(() => setCopied(false), 1500)
    } catch (e) {
      console.error('Copy failed', e)
    }
  }

  return (
    <button
      type="button"
      onClick={handleCopy}
      className="flex items-center gap-1.5 px-2 py-1 text-xs text-gray-400 hover:text-gray-200 transition-colors"
    >
      {copied ? <Check size={12} className="text-emerald-400" /> : <Copy size={12} />}
      {copied ? 'Copied' : 'Copy'}
    </button>
  )
}
|
||||
@@ -0,0 +1,78 @@
|
||||
import { Zap, MessageSquare, FileText, Code2 } from 'lucide-react'
// The copy button is the page's only client component. It declares
// 'use client' in its own file, so this page needs no directive of its own.
import CopyButton from './_CopyButton'

// The four steps rendered as cards, in display order.
const STEPS = [
  {
    icon: FileText,
    title: '1. Grab a log sample',
    desc: 'Copy 10–50 representative lines from the new log source. Include edge cases — errors, different event types, varying field presence.',
  },
  {
    icon: MessageSquare,
    title: '2. Paste into Claude Code',
    desc: 'Open Claude Code and say: "Onboard this log source for SentinelOne SDL" then paste the sample. Mention the source type if known (e.g. "Palo Alto firewall").',
  },
  {
    icon: Code2,
    title: '3. Get your artefacts',
    desc: 'Claude returns an SDL parser (augmented-JSON), field mappings to the SDL schema, starter STAR detection rules, and parser test assertions.',
  },
  {
    icon: Zap,
    title: '4. Deploy',
    desc: 'Drop the parser JSON into your /logParsers/ path. Paste the STAR rules into the AI-SIEM rule editor. Run the test assertions to validate extraction.',
  },
]

// Prompt template shown in the code box and copied by the copy button.
const PROMPT = `Onboard this log source for SentinelOne SDL. Please generate:
1. An SDL parser skeleton in augmented-JSON format (/logParsers/ format)
2. Field mappings from raw fields to the SDL common schema
3. 2–3 starter STAR detection rules for common threats from this source type
4. 5 parser test assertions (input line → expected field → expected value)

Log source: [describe source, e.g. "Palo Alto PAN-OS firewall"]

Raw log sample:
[paste your log lines here]`

/**
 * Onboarding Accelerator page: a static step-by-step guide followed by a
 * copyable prompt template.
 */
export default function OnboardingPage() {
  return (
    <div className="p-8 max-w-3xl">
      <div className="mb-8">
        <h1 className="text-xl font-bold text-white">Onboarding Accelerator</h1>
        <p className="text-sm text-gray-400 mt-1">
          Use Claude Code directly — no API key required
        </p>
      </div>

      <div className="space-y-4 mb-8">
        {STEPS.map(({ icon: Icon, title, desc }) => (
          <div key={title} className="flex gap-4 bg-gray-900 border border-gray-800 rounded-xl p-4">
            <div className="w-8 h-8 shrink-0 rounded-lg bg-purple-900/60 flex items-center justify-center mt-0.5">
              <Icon size={15} className="text-purple-300" />
            </div>
            <div>
              <div className="text-sm font-medium text-white">{title}</div>
              <div className="text-sm text-gray-400 mt-1">{desc}</div>
            </div>
          </div>
        ))}
      </div>

      <div className="bg-gray-900 border border-gray-800 rounded-xl overflow-hidden">
        <div className="px-4 py-2 border-b border-gray-800 flex items-center justify-between">
          <span className="text-xs font-medium text-gray-400">Copy this prompt template</span>
          <CopyButton text={PROMPT} />
        </div>
        <pre className="p-4 text-xs text-gray-300 font-mono leading-relaxed whitespace-pre-wrap">{PROMPT}</pre>
      </div>
    </div>
  )
}
|
||||
@@ -0,0 +1,59 @@
|
||||
import { Shield, BarChart2, Zap } from 'lucide-react'
|
||||
import Link from 'next/link'
|
||||
|
||||
// Landing-page cards: one per toolkit feature. `color` holds the Tailwind
// gradient stop classes applied to the icon tile.
const CARDS = [
  {
    href: '/coverage',
    icon: Shield,
    title: 'Parser Coverage Map',
    desc: 'Cross-reference SDL parser output fields against STAR and Sigma rule fields. Surface parsed-but-unused fields as reduction candidates.',
    cta: 'Open Coverage Map',
    color: 'from-purple-700 to-purple-900',
  },
  {
    href: '/ingest',
    icon: BarChart2,
    title: 'Ingest Dashboard',
    desc: 'Visualize event volume by source and type. Project monthly GB costs and simulate the impact of exclusion filters before applying them.',
    cta: 'Open Dashboard',
    color: 'from-blue-700 to-blue-900',
  },
  {
    href: '/onboarding',
    icon: Zap,
    title: 'Onboarding Accelerator',
    desc: 'Step-by-step guide for onboarding a new log source using Claude Code directly — no API key required.',
    cta: 'View Onboarding Guide',
    color: 'from-emerald-700 to-emerald-900',
  },
]

/** Overview page: title block plus a responsive grid with one card per CARDS entry. */
export default function Home() {
  const cardTiles = CARDS.map((card) => {
    const CardIcon = card.icon
    return (
      <div key={card.href} className="bg-gray-900 border border-gray-800 rounded-xl p-6 flex flex-col gap-4">
        <div className={`w-10 h-10 rounded-lg bg-gradient-to-br ${card.color} flex items-center justify-center`}>
          <CardIcon size={20} className="text-white" />
        </div>
        <div>
          <h2 className="font-semibold text-white">{card.title}</h2>
          <p className="text-sm text-gray-400 mt-1 leading-relaxed">{card.desc}</p>
        </div>
        <Link href={card.href} className="mt-auto text-sm text-purple-400 hover:text-purple-300 font-medium">
          {card.cta} →
        </Link>
      </div>
    )
  })

  return (
    <div className="p-8 max-w-5xl">
      <div className="mb-8">
        <h1 className="text-2xl font-bold text-white">SIEM Engineering Toolkit</h1>
        <p className="text-gray-400 mt-1">SentinelOne AI-SIEM · demo.sentinelone.net</p>
      </div>
      <div className="grid grid-cols-1 md:grid-cols-3 gap-5">{cardTiles}</div>
    </div>
  )
}
|
||||
@@ -0,0 +1,9 @@
|
||||
'use client'
|
||||
|
||||
import { QueryClient, QueryClientProvider } from '@tanstack/react-query'
|
||||
import { useState } from 'react'
|
||||
|
||||
/**
 * Wraps `children` in a react-query provider. The QueryClient is created via a
 * lazy `useState` initializer, so one client instance is kept for the lifetime
 * of this component rather than being rebuilt on every render.
 */
export default function QueryProvider({ children }: { children: React.ReactNode }) {
  const createClient = () =>
    new QueryClient({
      defaultOptions: {
        queries: { retry: 1 },
      },
    })
  const [queryClient] = useState(createClient)

  return <QueryClientProvider client={queryClient}>{children}</QueryClientProvider>
}
|
||||
@@ -0,0 +1,45 @@
|
||||
'use client'
|
||||
|
||||
import Link from 'next/link'
|
||||
import { usePathname } from 'next/navigation'
|
||||
import { Shield, BarChart2, Zap, Home } from 'lucide-react'
|
||||
import clsx from 'clsx'
|
||||
|
||||
// Sidebar navigation entries, in display order.
const NAV = [
  { href: '/', label: 'Overview', icon: Home },
  { href: '/coverage', label: 'Parser Coverage', icon: Shield },
  { href: '/ingest', label: 'Ingest Dashboard', icon: BarChart2 },
  { href: '/onboarding', label: 'Onboarding', icon: Zap },
]

/**
 * Fixed-width navigation sidebar. The entry whose `href` equals the current
 * pathname exactly is rendered with the highlighted style.
 */
export default function Sidebar() {
  const currentPath = usePathname()

  const navLinks = NAV.map((entry) => {
    const EntryIcon = entry.icon
    const isActive = currentPath === entry.href
    const stateClasses = isActive
      ? 'bg-purple-700 text-white'
      : 'text-gray-400 hover:bg-gray-800 hover:text-gray-100'
    return (
      <Link
        key={entry.href}
        href={entry.href}
        className={clsx(
          'flex items-center gap-3 px-3 py-2 rounded-lg text-sm transition-colors',
          stateClasses
        )}
      >
        <EntryIcon size={15} />
        {entry.label}
      </Link>
    )
  })

  return (
    <aside className="w-56 shrink-0 bg-gray-900 border-r border-gray-800 flex flex-col">
      <div className="p-4 border-b border-gray-800">
        <div className="flex items-center gap-2">
          <div className="w-6 h-6 rounded bg-purple-600 flex items-center justify-center text-xs font-bold">S1</div>
          <span className="font-semibold text-sm text-white">SIEM Toolkit</span>
        </div>
        <p className="text-xs text-gray-500 mt-1">demo.sentinelone.net</p>
      </div>
      <nav className="flex-1 p-3 space-y-1">{navLinks}</nav>
    </aside>
  )
}
|
||||
@@ -0,0 +1,22 @@
|
||||
// Backend origin; overridable at build time via NEXT_PUBLIC_API_URL.
const BASE = process.env.NEXT_PUBLIC_API_URL ?? 'http://localhost:8000'

/**
 * Fetch `BASE + path` and return the decoded JSON body.
 *
 * @param path  Request path appended verbatim to BASE (include the leading slash).
 * @param init  Optional fetch options (method, headers, body, ...).
 * @returns     The parsed JSON body typed as `T`; `undefined` (cast to `T`) for
 *              204/205 responses, which have no body to parse.
 * @throws      Error with message `"<status>: <response text>"` on any non-2xx response.
 */
export async function apiFetch<T = unknown>(path: string, init?: RequestInit): Promise<T> {
  const res = await fetch(`${BASE}${path}`, init)
  if (!res.ok) {
    const text = await res.text()
    throw new Error(`${res.status}: ${text}`)
  }
  // No Content / Reset Content carry an empty body; res.json() would reject
  // with a SyntaxError and turn a successful call into a failure.
  if (res.status === 204 || res.status === 205) {
    return undefined as T
  }
  return res.json() as Promise<T>
}
|
||||
|
||||
/** Convenience wrappers around apiFetch for the request shapes the UI uses. */
export const api = {
  /** GET `path` and decode the JSON response. */
  get<T>(path: string): Promise<T> {
    return apiFetch<T>(path)
  },

  /** POST `body` serialized as JSON. */
  post<T>(path: string, body: unknown): Promise<T> {
    const init: RequestInit = {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(body),
    }
    return apiFetch<T>(path, init)
  },

  /** POST a FormData payload; no Content-Type header is set here. */
  postForm<T>(path: string, form: FormData): Promise<T> {
    const init: RequestInit = { method: 'POST', body: form }
    return apiFetch<T>(path, init)
  },
}
|
||||
@@ -0,0 +1,12 @@
|
||||
/** @type {import('tailwindcss').Config} */
|
||||
module.exports = {
  // Glob of source files listed under Tailwind's `content` option.
  content: ['./src/**/*.{ts,tsx}'],
  theme: {
    extend: {
      colors: {
        // Project accent color, registered under the name `brand`.
        brand: '#7c3aed',
      },
    },
  },
  plugins: [],
}
|
||||
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "es2017",
|
||||
"lib": ["dom", "dom.iterable", "esnext"],
|
||||
"allowJs": true,
|
||||
"skipLibCheck": true,
|
||||
"strict": true,
|
||||
"noEmit": true,
|
||||
"esModuleInterop": true,
|
||||
"module": "esnext",
|
||||
"moduleResolution": "bundler",
|
||||
"resolveJsonModule": true,
|
||||
"isolatedModules": true,
|
||||
"jsx": "preserve",
|
||||
"incremental": true,
|
||||
"plugins": [{ "name": "next" }],
|
||||
"paths": { "@/*": ["./src/*"] }
|
||||
},
|
||||
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
|
||||
"exclude": ["node_modules"]
|
||||
}
|
||||
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Probe the SDL tenant to understand why Avelios Medical field-population shows 0%."""
|
||||
import json, time, urllib.request, urllib.error
|
||||
import os
|
||||
|
||||
def _load_sdl_cfg():
    """Load the SDL configuration from the first config file that exists.

    Candidates, in priority order: the path in ``$SDL_CONFIG``, then
    ``sdl_config.json`` next to this script, then ``sdl_config.json`` one
    directory above it.

    Returns:
        The parsed JSON content of the first candidate that is a regular file.

    Exits the process with status 2, after writing an error to stderr, when
    no candidate is found.
    """
    import json as _j
    import os as _o
    import sys as _s

    here = _o.path.dirname(_o.path.abspath(__file__))
    candidates = [
        _o.environ.get("SDL_CONFIG"),
        _o.path.join(here, "sdl_config.json"),
        _o.path.join(here, "..", "sdl_config.json"),
    ]
    for p in candidates:
        # isfile (not exists): a directory at a candidate path is skipped
        # instead of crashing open() with IsADirectoryError.
        if p and _o.path.isfile(p):
            # JSON is UTF-8; do not depend on the locale's default encoding.
            with open(p, encoding="utf-8") as fh:
                return _j.load(fh)
    _s.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    _s.exit(2)
|
||||
|
||||
|
||||
CFG = _load_sdl_cfg()
BASE = CFG['base_url'].rstrip('/')
KEY = CFG['log_read_key']
END_MS = int(time.time() * 1000)
START_MS = END_MS - 24 * 3600 * 1000  # last 24h


def pq(query: str, max_count: int = 10) -> dict:
    """POST one PowerQuery to ``BASE + '/api/powerQuery'`` over the last-24h window.

    Args:
        query: PowerQuery text sent as the ``query`` field.
        max_count: Value sent as ``maxCount``.

    Returns:
        The decoded JSON response, or ``{"_err": message}`` (message capped at
        200 characters) when the request or decoding fails.
    """
    body = json.dumps({
        "token": KEY, "query": query,
        "startTime": START_MS, "endTime": END_MS,
        "maxCount": max_count,
    }).encode()
    req = urllib.request.Request(BASE + '/api/powerQuery', data=body,
                                 headers={"Content-Type": "application/json"})
    try:
        # Context manager closes the HTTP response even if JSON decoding fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as e:
        # errors='replace': a non-UTF-8 error body must not raise out of this handler.
        return {"_err": f"HTTP {e.code}: {e.read().decode(errors='replace')[:200]}"}
    except Exception as e:
        return {"_err": str(e)[:200]}
|
||||
|
||||
|
||||
def show(label, d):
    """Print a one-line summary of a query result followed by up to eight rows.

    An error result (one carrying ``_err``) is reported on a single ``[ERR]``
    line. Otherwise column names are taken from ``columns`` (dict entries
    contribute their ``name``), and rows come from ``values``, falling back to
    ``matches`` when ``values`` is empty or missing.
    """
    if "_err" in d:
        print(f"[ERR] {label}: {d['_err']}")
        return

    names = []
    for col in d.get('columns', []):
        names.append(col['name'] if isinstance(col, dict) else col)

    rows = d.get('values', [])
    if not rows:
        rows = d.get('matches', [])

    print(f"[OK ] {label} cols={names} rows={len(rows)}")
    for row in rows[:8]:
        print(f" {row}")
|
||||
|
||||
|
||||
# Rule line framing each numbered section heading.
_RULE = "=" * 70


def _section(title, leading_blank=True):
    """Print a section heading framed by rule lines."""
    if leading_blank:
        print()
    print(_RULE)
    print(title)
    print(_RULE)


def _count_of(result):
    """Return the first cell of a count query, or the raw result when it has no 'values'."""
    if 'values' not in result:
        return result
    rows = result.get('values') or [[None]]
    return rows[0][0]


# 1. Distinct dataSource.name values containing 'velio'
_section("1. Source-name spellings containing 'velio'", leading_blank=False)
show("by dataSource.name",
     pq("| group n=count() by dataSource.name | sort -n | limit 50", max_count=50))

# 2. Try a few candidate names
_section("2. Try filtering by candidate names")
for cand in ["Avelios Medical", "Avelios-Medical", "Avelios-Medical-OCSF",
             "avelios", "Avelios"]:
    hits = _count_of(pq(f"| filter dataSource.name == '{cand}' | group n=count()", max_count=1))
    print(f" {cand!r:<35} -> {hits}")
for cand in ["Avelios Medical", "Avelios-Medical-OCSF", "avelios"]:
    hits = _count_of(pq(f"| filter dataSource.name contains '{cand}' | group n=count()", max_count=1))
    print(f" contains {cand!r:<25} -> {hits}")

# 3. Sample one raw event to see what column names actually come back
_section("3. Sample one event — what keys/columns are returned?")
sample = pq("| filter dataSource.name contains 'velio' | limit 1", max_count=1)
if "_err" not in sample:
    column_names = [c['name'] if isinstance(c, dict) else c for c in sample.get('columns', [])]
    print(" columns:", column_names[:30])
    print(" first row sample:", str((sample.get('values') or [None])[0])[:400])
else:
    print(" ", sample["_err"])

# 4. If we got columns, check which OCSF fields exist
_section("4. Field presence in last 24h for Avelios (using columns command)")
show("columns view",
     pq("| filter dataSource.name contains 'velio' | "
        "columns dataSource.name, metadata.product.name, metadata.event_code, "
        "actor.user.name, src_endpoint.ip, dst_endpoint.ip | limit 5",
        max_count=5))
|
||||
@@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Inspect Avelios Medical events: one query, full row dump, then field stats from Python."""
|
||||
import json, time, urllib.request, collections
|
||||
import os
|
||||
|
||||
def _load_sdl_cfg():
    """Load the SDL configuration from the first config file that exists.

    Candidates, in priority order: the path in ``$SDL_CONFIG``, then
    ``sdl_config.json`` next to this script, then ``sdl_config.json`` one
    directory above it.

    Returns:
        The parsed JSON content of the first candidate that is a regular file.

    Exits the process with status 2, after writing an error to stderr, when
    no candidate is found.
    """
    import json as _j
    import os as _o
    import sys as _s

    here = _o.path.dirname(_o.path.abspath(__file__))
    candidates = [
        _o.environ.get("SDL_CONFIG"),
        _o.path.join(here, "sdl_config.json"),
        _o.path.join(here, "..", "sdl_config.json"),
    ]
    for p in candidates:
        # isfile (not exists): a directory at a candidate path is skipped
        # instead of crashing open() with IsADirectoryError.
        if p and _o.path.isfile(p):
            # JSON is UTF-8; do not depend on the locale's default encoding.
            with open(p, encoding="utf-8") as fh:
                return _j.load(fh)
    _s.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    _s.exit(2)
|
||||
|
||||
|
||||
CFG = _load_sdl_cfg()
BASE, KEY = CFG['base_url'].rstrip('/'), CFG['log_read_key']
NOW = int(time.time() * 1000)
START = NOW - 72 * 3600 * 1000  # last 3 days


def pq(query, mc=200):
    """POST one PowerQuery over the last-72h window and return the decoded JSON.

    Args:
        query: PowerQuery text sent as the ``query`` field.
        mc: Value sent as ``maxCount``.

    Errors from the request or from JSON decoding propagate to the caller.
    """
    body = json.dumps({"token": KEY, "query": query,
                       "startTime": START, "endTime": NOW,
                       "maxCount": mc}).encode()
    req = urllib.request.Request(BASE + '/api/powerQuery', data=body,
                                 headers={"Content-Type": "application/json"})
    # Context manager closes the HTTP response even if JSON decoding fails.
    with urllib.request.urlopen(req, timeout=60) as resp:
        return json.loads(resp.read())
|
||||
|
||||
|
||||
def _tally_fields(obj, counter, prefix=''):
    """Count every leaf key of a nested dict in ``counter`` under its dotted path."""
    for k, v in obj.items():
        dotted = f"{prefix}.{k}" if prefix else k
        if isinstance(v, dict):
            _tally_fields(v, counter, dotted)
        else:
            counter[dotted] += 1


print("Fetching Avelios Medical sample (max 200, last 72h) ...")
d = pq("| filter dataSource.name == 'Avelios Medical' | limit 200")
cols = [c['name'] if isinstance(c, dict) else c for c in d.get('columns', [])]
vals = d.get('values', []) or []
print(f"Columns returned ({len(cols)}): {cols}")
print(f"Rows: {len(vals)}")
print()

# Tally non-null rate per returned column
counts = dict.fromkeys(cols, 0)
for record in vals:
    for col_name, cell in zip(cols, record):
        if cell in (None, '', 'null'):
            continue
        counts[col_name] += 1

print("=== Column populated-rate (out of returned columns) ===")
total_rows = len(vals)
for col_name in cols:
    filled = counts[col_name]
    pct = round(100 * filled / max(1, total_rows), 1)
    print(f" {col_name:<35} {filled:>4} / {total_rows} {pct:>5}%")

print()
print("=== First 2 events (pretty) ===")
for record in vals[:2]:
    event = dict(zip(cols, record))
    print(json.dumps(event, indent=2, default=str)[:1500])
    print("---")

print()
print("=== Distinct fields IN the message body (if JSON) ===")
# If the events carry a structured body, peek inside it
field_freq = collections.Counter()
for record in vals:
    event = dict(zip(cols, record))
    payload = event.get('message') or event.get('body') or event.get('attributes')
    if isinstance(payload, str):
        # String bodies count only when they parse as JSON.
        try:
            payload = json.loads(payload)
        except Exception:
            continue
    if isinstance(payload, dict):
        _tally_fields(payload, field_freq)

for field_name, seen in field_freq.most_common(40):
    print(f" {field_name:<45} in {seen:>3} events")
|
||||
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Search wider windows for Avelios Medical events."""
|
||||
import json, time, urllib.request
|
||||
import os
|
||||
|
||||
def _load_sdl_cfg():
    """Load the SDL configuration from the first config file that exists.

    Candidates, in priority order: the path in ``$SDL_CONFIG``, then
    ``sdl_config.json`` next to this script, then ``sdl_config.json`` one
    directory above it.

    Returns:
        The parsed JSON content of the first candidate that is a regular file.

    Exits the process with status 2, after writing an error to stderr, when
    no candidate is found.
    """
    import json as _j
    import os as _o
    import sys as _s

    here = _o.path.dirname(_o.path.abspath(__file__))
    candidates = [
        _o.environ.get("SDL_CONFIG"),
        _o.path.join(here, "sdl_config.json"),
        _o.path.join(here, "..", "sdl_config.json"),
    ]
    for p in candidates:
        # isfile (not exists): a directory at a candidate path is skipped
        # instead of crashing open() with IsADirectoryError.
        if p and _o.path.isfile(p):
            # JSON is UTF-8; do not depend on the locale's default encoding.
            with open(p, encoding="utf-8") as fh:
                return _j.load(fh)
    _s.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    _s.exit(2)
|
||||
|
||||
|
||||
CFG = _load_sdl_cfg()
BASE, KEY = CFG['base_url'].rstrip('/'), CFG['log_read_key']
NOW = int(time.time() * 1000)


def pq(query, start_ms, end_ms, mc=5):
    """POST one PowerQuery for an explicit time window.

    Args:
        query: PowerQuery text sent as the ``query`` field.
        start_ms: Value sent as ``startTime`` (callers pass epoch milliseconds).
        end_ms: Value sent as ``endTime``.
        mc: Value sent as ``maxCount``.

    Returns:
        The decoded JSON response, or ``{"_err": message}`` (message capped at
        200 characters) when the request or decoding fails.
    """
    body = json.dumps({"token": KEY, "query": query,
                       "startTime": start_ms, "endTime": end_ms,
                       "maxCount": mc}).encode()
    req = urllib.request.Request(BASE + '/api/powerQuery', data=body,
                                 headers={"Content-Type": "application/json"})
    try:
        # Context manager closes the HTTP response even if JSON decoding fails.
        with urllib.request.urlopen(req, timeout=60) as resp:
            return json.loads(resp.read())
    except Exception as e:
        return {"_err": str(e)[:200]}
|
||||
|
||||
|
||||
# Widen the look-back window step by step and report any source whose name
# contains 'velio', followed by the ten busiest sources in that window.
for days in (1, 3, 7):
    start = NOW - days * 24 * 3600 * 1000
    print(f"\n=== last {days}d ===")
    d = pq("| group n=count() by dataSource.name | sort -n | limit 30", start, NOW, mc=30)
    if "_err" in d:
        print(d["_err"])
        continue
    # Normalize once: a response with "values": null must not break either loop
    # (previously only the second loop guarded against it).
    rows = d.get("values") or []
    for row in rows:
        name = row[0]
        if name and "velio" in name.lower():
            print(f" HIT: {row}")
    # show top 10 in this window
    for row in rows[:10]:
        print(f" {row}")
|
||||
@@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Probe what PowerQuery syntax this SDL tenant accepts."""
|
||||
import json, time, urllib.request, urllib.error, sys
|
||||
import os
|
||||
|
||||
def _load_sdl_cfg():
    """Load the SDL configuration from the first config file that exists.

    Candidates, in priority order: the path in ``$SDL_CONFIG``, then
    ``sdl_config.json`` next to this script, then ``sdl_config.json`` one
    directory above it.

    Returns:
        The parsed JSON content of the first candidate that is a regular file.

    Exits the process with status 2, after writing an error to stderr, when
    no candidate is found.
    """
    import json as _j
    import os as _o
    import sys as _s

    here = _o.path.dirname(_o.path.abspath(__file__))
    candidates = [
        _o.environ.get("SDL_CONFIG"),
        _o.path.join(here, "sdl_config.json"),
        _o.path.join(here, "..", "sdl_config.json"),
    ]
    for p in candidates:
        # isfile (not exists): a directory at a candidate path is skipped
        # instead of crashing open() with IsADirectoryError.
        if p and _o.path.isfile(p):
            # JSON is UTF-8; do not depend on the locale's default encoding.
            with open(p, encoding="utf-8") as fh:
                return _j.load(fh)
    _s.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    _s.exit(2)
|
||||
|
||||
|
||||
CFG = _load_sdl_cfg()
URL = CFG['base_url'].rstrip('/') + '/api/powerQuery'
END_MS = int(time.time() * 1000)
START_MS = END_MS - 3600 * 1000  # last hour


def run(label: str, query: str):
    """Send one PowerQuery for the last hour and print a one-line verdict.

    Prints ``[OK ]`` with status, column count, row count and a sample row on
    success, or ``[ERR]`` with the HTTP status / exception text on failure.
    Nothing is returned; failures never propagate.

    Args:
        label: Short case name shown in the output.
        query: PowerQuery text to submit.
    """
    payload = json.dumps({
        "token": CFG['log_read_key'],
        "query": query,
        "startTime": START_MS,
        "endTime": END_MS,
        "maxCount": 5,
    }).encode()
    req = urllib.request.Request(URL, data=payload, headers={"Content-Type": "application/json"})
    try:
        # Context manager closes the HTTP response as soon as it has been read.
        with urllib.request.urlopen(req, timeout=30) as resp:
            d = json.loads(resp.read())
        st = d.get('status', '?')
        cols = d.get('columns') or []
        vals = d.get('values') or d.get('matches') or []
        print(f"[OK ] {label:<40} status={st} cols={len(cols)} rows={len(vals)}")
        if vals:
            print(f" sample={str(vals[0])[:160]}")
    except urllib.error.HTTPError as e:
        # errors="replace": a non-UTF-8 error body must not raise out of this handler.
        raw = e.read().decode(errors="replace")
        try:
            # Prefer the server's "message" field when the error body is JSON.
            msg = json.loads(raw).get('message', raw)[:200]
        except Exception:
            msg = raw[:200]
        print(f"[ERR] {label:<40} HTTP {e.code}: {msg}")
    except Exception as e:
        print(f"[ERR] {label:<40} {type(e).__name__}: {str(e)[:160]}")
|
||||
|
||||
|
||||
# (label, query) pairs: each one probes a different PowerQuery spelling.
CASES = [
    ("leading-pipe single-stage", "| group total=count()"),
    ("no-pipe single-stage", "group total=count()"),
    ("leading-pipe multi-stage", "| group events=count() by dataSource.name | sort -events | limit 5"),
    ("no-pipe multi-stage", "group events=count() by dataSource.name | sort -events | limit 5"),
    ("no-pipe trim sort", "group events=count() by dataSource.name | limit 5"),
    ("filter then group", "dataSource.name=='SentinelOne' | group events=count()"),
    ("filter (modern keyword)", "filter dataSource.name=='SentinelOne' | group events=count()"),
    ("dataset-style with sort", "group events=count() by dataSource.name | sort events desc | limit 5"),
    ("count() as alias", "| count() as events"),
    ("group by event.type", "group events=count() by event.type | limit 5"),
]

print(f"URL: {URL}")
print(f"Window: last 1h ({START_MS}..{END_MS} ms)")
print()
# Run the cases in listed order; run() prints one verdict line per case.
for case in CASES:
    run(*case)
|
||||
@@ -0,0 +1,63 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Probe /api/ingest/simulate-filter using small 1-day windows + long client
|
||||
timeouts to avoid urllib aborting before the SDL query returns.
|
||||
|
||||
Run one case at a time and print elapsed time so we can tell whether failures
|
||||
are HTTP errors or slow tenant queries.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
URL = "http://localhost:8001/api/ingest/simulate-filter"
TIMEOUT = 600  # seconds — generous; SDL queries on large tenants can take >60s

# Smallest windows first so cheap calls succeed before we try the expensive ones.
CASES = [
    ("empty body, 1d", {"days": 1}),
    ("bogus source, 1d", {"source": "definitely-no-such-source", "days": 1}),
    ("source only, 1d", {"source": "Avelios Medical", "days": 1}),
    ("source only, 7d", {"source": "Avelios Medical", "days": 7}),
    ("event_type only, 1d", {"event_type": "login", "days": 1}),
    ("source + event_type, 7d", {"source": "Avelios Medical", "event_type": "login", "days": 7}),
]


def _error_detail(exc: BaseException) -> str:
    """Format an exception as ``TypeName: message``, with a placeholder for empty messages."""
    # str(exc), not exc: exception objects are always truthy, so `exc or ...`
    # never reached the placeholder and empty messages printed as "TypeName: ".
    return f"{type(exc).__name__}: {str(exc) or 'no detail'}"


def hit(body: dict) -> tuple[int, str, float]:
    """POST ``body`` as JSON to URL and time the round trip.

    Returns:
        ``(status, payload, elapsed_seconds)``. For HTTP error responses the
        status is the error code and the payload is the error body. For any
        other failure (timeout, connection refused, ...) the status is ``-1``
        and the payload describes the exception.
    """
    data = json.dumps(body).encode()
    req = urllib.request.Request(
        URL,
        data=data,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    t0 = time.monotonic()
    try:
        with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
            return r.status, r.read().decode(), time.monotonic() - t0
    except urllib.error.HTTPError as e:
        return e.code, e.read().decode(), time.monotonic() - t0
    except Exception as e:
        return -1, _error_detail(e), time.monotonic() - t0
|
||||
|
||||
|
||||
# Allow narrowing via CLI: `python3 probe_simulate_filter.py 2 3` runs cases 2 & 3.
if len(sys.argv) > 1:
    indices = [int(arg) for arg in sys.argv[1:]]
else:
    indices = range(len(CASES))

for i in indices:
    # Indices past the end of CASES are ignored.
    if i >= len(CASES):
        continue
    label, body = CASES[i]
    print("=" * 78)
    print(f"[{i}] {label:<32} body={body}")
    # Flush so the header is visible while a slow request is still in flight.
    sys.stdout.flush()
    status, payload, elapsed = hit(body)
    print(f" HTTP {status} elapsed={elapsed:.1f}s")
    try:
        parsed = json.loads(payload)
    except Exception:
        print(f" raw: {payload[:800]}")
    else:
        pretty = json.dumps(parsed, indent=2)
        print(" " + pretty.replace("\n", "\n "))
|
||||
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Trigger /api/quality/sync-from-sdl and pretty-print the result.
|
||||
|
||||
Then re-list /api/quality/parsers to confirm the new files appear in the
|
||||
Parser Test Runner dropdown.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
BACKEND = "http://localhost:8001"
TIMEOUT = 300


def _error_detail(exc: BaseException) -> str:
    """Format an exception as ``TypeName: message``, with a placeholder for empty messages."""
    # str(exc), not exc: exception objects are always truthy, so `exc or ...`
    # never reached the placeholder and empty messages printed as "TypeName: ".
    return f"{type(exc).__name__}: {str(exc) or 'no detail'}"


def call(method: str, path: str) -> tuple[int, dict | str, float]:
    """Send a body-less ``method`` request to ``BACKEND + path`` and time it.

    Returns:
        ``(status, body, elapsed_seconds)``. On success the body is the decoded
        JSON response. On an HTTP error the status is the error code and the
        body is the decoded JSON error, or the raw text when it is not JSON.
        On any other failure the status is ``-1`` and the body describes the
        exception.
    """
    req = urllib.request.Request(BACKEND + path, method=method)
    t0 = time.monotonic()
    try:
        with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
            return r.status, json.loads(r.read().decode()), time.monotonic() - t0
    except urllib.error.HTTPError as e:
        body = e.read().decode()
        try:
            return e.code, json.loads(body), time.monotonic() - t0
        except Exception:
            return e.code, body, time.monotonic() - t0
    except Exception as e:
        return -1, _error_detail(e), time.monotonic() - t0
|
||||
|
||||
|
||||
def _banner(title, leading_blank=False):
    """Print a heading framed by 72-character rule lines."""
    if leading_blank:
        print()
    rule = "=" * 72
    print(rule)
    print(title)
    print(rule)


# Step 1: trigger the sync and summarize what the backend reports.
_banner("POST /api/quality/sync-from-sdl")
status, body, elapsed = call("POST", "/api/quality/sync-from-sdl")
print(f"HTTP {status} elapsed={elapsed:.1f}s")
if not isinstance(body, dict):
    print(f" raw: {str(body)[:600]}")
elif "detail" in body:
    print(f" ERROR: {body['detail']}")
else:
    print(f" downloaded: {body.get('downloaded')}")
    print(f" errors: {len(body.get('errors') or [])}")
    print(f" directory: {body.get('directory')}")
    names = body.get("parsers") or []
    print("\n sample of parser filenames (first 25):")
    for parser_name in names[:25]:
        print(f" {parser_name}")
    extra = len(names) - 25
    if extra > 0:
        print(f" ... and {extra} more")
    # Highlight anything that looks like a customer/custom parser
    custom = [nm for nm in names if "avelios" in nm.lower() or "ocsf" in nm.lower()]
    if custom:
        print("\n matched custom-parser patterns (avelios / ocsf):")
        for parser_name in custom:
            print(f" ✓ {parser_name}")
    errs = body.get("errors") or []
    if errs:
        print(f"\n errors (first 5 of {len(errs)}):")
        for err in errs[:5]:
            print(f" - {err}")

# Step 2: list the parsers the backend now serves.
_banner("GET /api/quality/parsers (Parser Test Runner dropdown source)", leading_blank=True)
status, body, elapsed = call("GET", "/api/quality/parsers")
print(f"HTTP {status} elapsed={elapsed:.1f}s")
if not isinstance(body, dict):
    print(f" raw: {str(body)[:400]}")
else:
    print(f" count: {body.get('count')}")
    print(" parsers:")
    for parser_name in (body.get("parsers") or [])[:50]:
        print(f" {parser_name}")
    if (body.get("count") or 0) > 50:
        print(f" ... and {body['count'] - 50} more")
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"_comment": "Copy to sdl_config.json (or set $SDL_CONFIG to its path). Only the keys you need are required. log_read_key for queries; config_read_key for listFiles/getFile (parser sync).",
|
||||
"base_url": "https://xdr.us1.sentinelone.net",
|
||||
"log_read_key": "REPLACE_WITH_LOG_READ_KEY",
|
||||
"config_read_key": "REPLACE_WITH_CONFIG_READ_KEY",
|
||||
"console_api_token": "REPLACE_WITH_CONSOLE_API_TOKEN_OR_LEAVE_BLANK"
|
||||
}
|
||||
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Pull every parser under /logParsers/ from the SDL tenant and drop it into
|
||||
./parsers/ so the SIEM-Toolkit Parser Test Runner can list it.
|
||||
|
||||
Auth: config_read_key (falling back to console_api_token) from the SDL config
file: $SDL_CONFIG, or sdl_config.json next to / one level above this script.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
import urllib.error
|
||||
|
||||
def _load_sdl_cfg():
    """Load the SDL configuration from the first config file that exists.

    Candidates, in priority order: the path in ``$SDL_CONFIG``, then
    ``sdl_config.json`` next to this script, then ``sdl_config.json`` one
    directory above it.

    Returns:
        The parsed JSON content of the first candidate that is a regular file.

    Exits the process with status 2, after writing an error to stderr, when
    no candidate is found.
    """
    import json as _j
    import os as _o
    import sys as _s

    here = _o.path.dirname(_o.path.abspath(__file__))
    candidates = [
        _o.environ.get("SDL_CONFIG"),
        _o.path.join(here, "sdl_config.json"),
        _o.path.join(here, "..", "sdl_config.json"),
    ]
    for p in candidates:
        # isfile (not exists): a directory at a candidate path is skipped
        # instead of crashing open() with IsADirectoryError.
        if p and _o.path.isfile(p):
            # JSON is UTF-8; do not depend on the locale's default encoding.
            with open(p, encoding="utf-8") as fh:
                return _j.load(fh)
    _s.stderr.write(
        "ERROR: no SDL config found. Set $SDL_CONFIG or create sdl_config.json "
        "(see sdl_config.example.json)\n")
    _s.exit(2)
|
||||
|
||||
|
||||
SDL_CFG_PATH = os.environ.get('SDL_CONFIG')  # placeholder; cfg loaded below
DEST = os.environ.get('PARSERS_DIR', os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'parsers'))


def call(base_url: str, token: str, path: str, params: dict) -> dict:
    """POST with JSON body — works for both listFiles and getFile on SDL.

    The token is sent both in the JSON body (``token`` field) and as a Bearer
    ``Authorization`` header.

    Args:
        base_url: Tenant base URL; a trailing slash is stripped.
        token: API key used for authentication.
        path: Endpoint path appended to ``base_url``.
        params: Extra fields merged into the JSON body.

    Returns:
        The decoded JSON response.

    Raises:
        RuntimeError: On an HTTP error response; the message carries the status
            code, the path and up to 300 characters of the error body, and the
            original HTTPError is attached as the cause.
    """
    url = f"{base_url.rstrip('/')}{path}"
    payload = json.dumps({**params, "token": token}).encode()
    req = urllib.request.Request(url, data=payload, headers={
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    })
    try:
        with urllib.request.urlopen(req, timeout=30) as r:
            return json.loads(r.read())
    except urllib.error.HTTPError as e:
        detail = e.read().decode(errors="replace")[:300]
        # Chain the cause so the original HTTPError stays in the traceback.
        raise RuntimeError(f"HTTP {e.code} {path}: {detail}") from e
|
||||
|
||||
|
||||
def main() -> int:
    """Download every file listed under /logParsers/ into the DEST directory.

    Each file is fetched with ``/api/getFile`` and written to ``DEST`` under
    the last segment of its SDL path, overwriting any existing file. Files that
    cannot be fetched, have no content, have an unsafe name, or cannot be
    written are recorded and reported at the end rather than aborting the run.

    Returns:
        ``2`` when the config holds no usable key, otherwise ``0``.
    """
    cfg = _load_sdl_cfg()
    base = cfg["base_url"]
    # config_read_key first (per docs), fall back to console_api_token
    token = cfg.get("config_read_key") or cfg.get("console_api_token")
    if not token:
        # The config comes from $SDL_CONFIG / sdl_config.json, not config.json.
        print("No config_read_key or console_api_token in the SDL config (sdl_config.json)",
              file=sys.stderr)
        return 2

    print(f"Listing /logParsers/ from {base} ...")
    res = call(base, token, "/api/listFiles", {"pathPrefix": "/logParsers/"})
    paths = res.get("paths", [])
    print(f"Found {len(paths)} files under /logParsers/")

    os.makedirs(DEST, exist_ok=True)
    fetched, failed = 0, []

    for p in paths:
        # Keep only the last path segment as the local file name.
        name = p.rsplit("/", 1)[-1] or "_unnamed"
        # Refuse names that would resolve outside DEST ("..", ".", or anything
        # the local OS still treats as containing a path separator).
        if name in (".", "..") or os.path.basename(name) != name:
            failed.append((p, f"unsafe file name {name!r}"))
            continue
        # Existing files are always overwritten to keep the local copy fresh.
        try:
            r = call(base, token, "/api/getFile", {"path": p})
        except Exception as e:
            failed.append((p, str(e)))
            continue

        content = r.get("content")
        if content is None:
            failed.append((p, "no content"))
            continue

        out = os.path.join(DEST, name)
        try:
            with open(out, "w", encoding="utf-8") as fh:
                fh.write(content)
        except OSError as e:
            # One unwritable file must not abort the rest of the sync.
            failed.append((p, f"write failed: {e}"))
            continue
        ver = r.get("version", "?")
        print(f" + {name:<60} v{ver} ({len(content)} bytes)")
        fetched += 1

    print()
    print(f"Done: fetched={fetched}, failed={len(failed)}")
    if failed:
        for p, err in failed[:10]:
            print(f" ! {p}: {err}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Verify the Parser Test Runner accepts multi-line NDJSON for JSON-mode parsers."""
|
||||
import json, urllib.request
|
||||
import os
|
||||
|
||||
# Three NDJSON events submitted together as one multi-line "log_line".
LINES = [
    '{"timestamp":"2026-05-14T00:00:41.969Z","event_type":"DATA_IMPORT_COMPLETED","event_category":"data_transfer","severity":"INFO","outcome":"success","source":{"application":"Avelios Medical"}}',
    '{"timestamp":"2026-05-14T00:07:41.969Z","event_type":"PERFORMANCE_DEGRADATION","event_category":"system","severity":"MEDIUM","outcome":"success","source":{"application":"Avelios Medical"}}',
    '{"timestamp":"2026-05-14T00:24:41.969Z","event_type":"MALWARE_DETECTED","event_category":"security","severity":"CRITICAL","outcome":"detected","source":{"application":"Avelios Medical"},"details":{"malware_name":"Trojan.GenericKD"}}',
]

body = json.dumps({"parser_name": "Avelios-Medical-OCSF", "log_line": "\n".join(LINES)}).encode()
req = urllib.request.Request(
    "http://localhost:8001/api/quality/test-parser",
    data=body, headers={"Content-Type": "application/json"})
# Context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(req, timeout=30) as resp:
    r = json.loads(resp.read())

print(f"matched = {r.get('matched')}")
print(f"mode = {r.get('mode')}")
print(f"payloads = {r.get('payload_count')} (showing {r.get('showing_payload')})")
print(f"extracted = {r.get('extracted_count')}")
print(f"derived = {r.get('derived_count')}")
print(f"parse_errors = {r.get('parse_errors')}")
print()
print("rewrites applied (first payload):")
for rw in r.get("rewrites_applied", [])[:10]:
    print(f" {rw['input']:<18} -> {rw['output']:<28} = {rw['result']!r}")
|
||||
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env python3
|
||||
import json, urllib.request
|
||||
import os
|
||||
|
||||
# One sample event submitted to the parser test endpoint.
log = '{"timestamp": "2026-05-14T00:24:41.969Z", "event_id": "d5c76dd2-5320-4b32-bd27-09acedfb5fdb", "event_type": "MALWARE_DETECTED", "event_category": "security", "severity": "CRITICAL", "source": {"application": "Avelios Medical", "module": "SecurityMonitor"}, "outcome": "detected", "details": {"malware_name": "Trojan.GenericKD"}}'

body = json.dumps({"parser_name": "Avelios-Medical-OCSF", "log_line": log}).encode()
req = urllib.request.Request(
    "http://localhost:8001/api/quality/test-parser",
    data=body, headers={"Content-Type": "application/json"})
# Context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(req, timeout=30) as resp:
    r = json.loads(resp.read())

print(f"matched={r.get('matched')} mode={r.get('mode')} "
      f"extracted={r.get('extracted_count')} derived={r.get('derived_count')}")
print()
print("json-extract fields (first 12):")
# Only fields whose "source" is json-extract are listed.
json_fields = [x for x in r.get("fields", []) if x.get("source") == "json-extract"]
for f in json_fields[:12]:
    print(f" {f['field']:<32} = {str(f['value'])[:50]}")
print()
print("rewrites applied:")
for rw in r.get("rewrites_applied", [])[:12]:
    print(f" {rw['input']:<18} -> {rw['output']:<28} = {rw['result']!r}")
|
||||
Reference in New Issue
Block a user