Sync upstream features; preserve fork KV scanner, parsers, verifier

Brought in 35 upstream commits (MITRE heatmap, health score, dependency map,
PowerQuery playground, onboarding tracker, product grouping, modern UI redesign).

Preserved fork additions:
  backend/routers/quality.py  KV scanner, pattern refs, JS keys, JSON mode,
                              /parsers + /sync-from-sdl endpoints
  parsers/                    96 OCSF + tenant parsers
  tools/stormshield-verify/   end-to-end ingest regression test
  .gitignore                  un-ignored parsers/*
  CHANGES.md, PATCHES.md
This commit is contained in:
marc
2026-05-22 18:19:52 +02:00
parent a7ebcac9a6
commit 7c1687efce
102 changed files with 13912 additions and 178 deletions
+63
View File
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""Probe /api/ingest/simulate-filter using small 1-day windows + long client
timeouts to avoid urllib aborting before the SDL query returns.
Run one case at a time and print elapsed time so we can tell whether failures
are HTTP errors or slow tenant queries.
"""
from __future__ import annotations
import json
import sys
import time
import urllib.request
import urllib.error
# Endpoint under test; the backend is expected on localhost port 8001.
URL = "http://localhost:8001/api/ingest/simulate-filter"
TIMEOUT = 600 # seconds — generous; SDL queries on large tenants can take >60s
# Smallest windows first so cheap calls succeed before we try the expensive ones.
# Each entry is (label, request body). The body is sent as JSON, and the CLI
# selects entries by their zero-based position in this list.
CASES = [
    ("empty body, 1d", {"days": 1}),
    ("bogus source, 1d", {"source": "definitely-no-such-source", "days": 1}),
    ("source only, 1d", {"source": "Avelios Medical", "days": 1}),
    ("source only, 7d", {"source": "Avelios Medical", "days": 7}),
    ("event_type only, 1d", {"event_type": "login", "days": 1}),
    ("source + event_type, 7d", {"source": "Avelios Medical", "event_type": "login", "days": 7}),
]
def hit(body: dict) -> tuple[int, str, float]:
    """POST ``body`` as JSON to ``URL`` and time the round trip.

    Args:
        body: JSON-serialisable request payload.

    Returns:
        ``(status, text, elapsed_seconds)``. ``status`` is the HTTP status
        code, or ``-1`` when no HTTP response was obtained (connection
        refused, client-side timeout, ...); in that case ``text`` names the
        exception instead of carrying a response body.
    """
    data = json.dumps(body).encode()
    req = urllib.request.Request(
        URL,
        data=data,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    t0 = time.monotonic()
    try:
        with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
            # Decode leniently: a non-UTF-8 body must not be misreported as a
            # transport failure by the catch-all handler below.
            return r.status, r.read().decode("utf-8", "replace"), time.monotonic() - t0
    except urllib.error.HTTPError as e:
        return e.code, e.read().decode("utf-8", "replace"), time.monotonic() - t0
    except Exception as e:
        # Built-in exception instances are always truthy, so the fallback has
        # to test the rendered message, not the exception object.
        return -1, f"{type(e).__name__}: {str(e) or 'no detail'}", time.monotonic() - t0
# Allow narrowing via CLI: `python3 probe_simulate_filter.py 2 3` runs cases 2 & 3.
indices = [int(x) for x in sys.argv[1:]] if len(sys.argv) > 1 else range(len(CASES))
for i in indices:
    if not 0 <= i < len(CASES):
        # Report the bad index instead of skipping silently (or, for negative
        # values, wrapping around to the end of CASES), so a typo on the
        # command line does not look like a case that produced no output.
        print(
            f"[{i}] skipped: no such case (valid indices are 0-{len(CASES) - 1})",
            file=sys.stderr,
        )
        continue
    label, body = CASES[i]
    print("=" * 78)
    # Flush before the request so the header is visible while a slow query runs.
    print(f"[{i}] {label:<32} body={body}", flush=True)
    status, payload, elapsed = hit(body)
    print(f" HTTP {status} elapsed={elapsed:.1f}s")
    try:
        parsed = json.loads(payload)
    except ValueError:
        # Not JSON (plain-text error page, transport failure text): show a prefix.
        print(f" raw: {payload[:800]}")
    else:
        print(" " + json.dumps(parsed, indent=2).replace("\n", "\n "))
+80
View File
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""Trigger /api/quality/sync-from-sdl and pretty-print the result.
Then re-list /api/quality/parsers to confirm the new files appear in the
Parser Test Runner dropdown.
"""
from __future__ import annotations
import json
import sys
import time
import urllib.request
import urllib.error
# Base URL of the backend; request paths are appended to it verbatim.
BACKEND = "http://localhost:8001"
# Per-request timeout handed to urlopen, in seconds.
TIMEOUT = 300
def call(method: str, path: str) -> tuple[int, dict | str, float]:
    """Send a body-less request to ``BACKEND + path`` and time it.

    Args:
        method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
        path: Path appended to ``BACKEND``.

    Returns:
        ``(status, payload, elapsed_seconds)``. ``payload`` is the decoded
        JSON document when the response body parses as JSON, otherwise the
        raw response text. ``status`` is ``-1`` when no HTTP response was
        obtained; ``payload`` then names the exception.
    """
    req = urllib.request.Request(BACKEND + path, method=method)
    t0 = time.monotonic()
    try:
        with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
            status, raw = r.status, r.read()
    except urllib.error.HTTPError as e:
        status, raw = e.code, e.read()
    except Exception as e:
        # Built-in exception instances are always truthy, so the fallback has
        # to test the rendered message, not the exception object.
        return -1, f"{type(e).__name__}: {str(e) or 'no detail'}", time.monotonic() - t0
    elapsed = time.monotonic() - t0
    text = raw.decode("utf-8", "replace")
    # Parse outside the transport try-block so a non-JSON body keeps its real
    # HTTP status instead of being reported as a -1 transport failure.
    try:
        return status, json.loads(text), elapsed
    except ValueError:
        return status, text, elapsed
# ── Step 1: trigger the sync and summarise what was downloaded ──────────
print("=" * 72)
print("POST /api/quality/sync-from-sdl")
print("=" * 72)
status, body, elapsed = call("POST", "/api/quality/sync-from-sdl")
print(f"HTTP {status} elapsed={elapsed:.1f}s")
if isinstance(body, dict):
    if "detail" in body:
        print(f" ERROR: {body['detail']}")
    else:
        print(f" downloaded: {body.get('downloaded')}")
        print(f" errors: {len(body.get('errors') or [])}")
        print(f" directory: {body.get('directory')}")
        names = body.get("parsers") or []
        print("\n sample of parser filenames (first 25):")
        for n in names[:25]:
            print(f" {n}")
        if len(names) > 25:
            print(f" ... and {len(names) - 25} more")
        # Highlight anything that looks like a customer/custom parser
        custom = [n for n in names if "avelios" in n.lower() or "ocsf" in n.lower()]
        if custom:
            print("\n matched custom-parser patterns (avelios / ocsf):")
            for n in custom:
                print(f"{n}")
        errs = body.get("errors") or []
        if errs:
            print(f"\n errors (first 5 of {len(errs)}):")
            for e in errs[:5]:
                print(f" - {e}")
else:
    # Non-JSON body or transport failure text: show a prefix only.
    print(f" raw: {str(body)[:600]}")
print()
# ── Step 2: re-list the parsers the backend now exposes ─────────────────
print("=" * 72)
print("GET /api/quality/parsers (Parser Test Runner dropdown source)")
print("=" * 72)
status, body, elapsed = call("GET", "/api/quality/parsers")
print(f"HTTP {status} elapsed={elapsed:.1f}s")
if isinstance(body, dict):
    if "detail" in body:
        # Same error shape as the POST step; report it rather than printing
        # an empty listing with "count: None".
        print(f" ERROR: {body['detail']}")
    else:
        print(f" count: {body.get('count')}")
        print(" parsers:")
        for n in (body.get("parsers") or [])[:50]:
            print(f" {n}")
        if (body.get("count") or 0) > 50:
            print(f" ... and {body['count'] - 50} more")
else:
    print(f" raw: {str(body)[:400]}")
+59
View File
@@ -0,0 +1,59 @@
# Stormshield ingest verifier
End-to-end regression test for the SDL Stormshield parser. Sends raw syslog
events to `/api/uploadLogs`, waits for ingest, and confirms the OCSF rewrites
(`src_endpoint.ip`, `dst_endpoint.ip`, `actor.user.name`, ...) populated by
the parser at ingest time.
## Setup
```bash
cp config.example.json config.json
chmod 600 config.json
# Fill in log_write_key, log_read_key — both are SDL Data Lake API keys.
# Generate them in the S1 console: Singularity Data Lake -> API Keys.
```
`config.json` is gitignored. Never commit real tokens.
## Run
```bash
# Single-event upload + 150s polling verifier (prints which OCSF fields landed)
python3 test.py
# Burst of 4 varied events with current timestamps (different users, IPs, actions)
python3 send_burst.py
# One-shot regression: burst + 40s wait + query last 15 min
bash run_and_verify.sh
```
## How to find the events afterwards
In the SDL console search field (and in PowerQuery), the attribute that holds
the parser name is **`parser`**, not `parser.name`:
```
parser="stormshield" | sort -timestamp | limit 10
```
## Behaviour quirks worth knowing
1. **`server-host` HTTP header is overwritten** to the literal string
`uploadLogs` on this tenant. Don't try to filter by `serverHost` for
precise event matching; use `parser='stormshield'` instead.
2. **`parser.name` is always None** on `uploadLogs`-ingested events.
Use the bare `parser` attribute.
3. **Embedded `time="..."`** in the syslog body is taken as the event's
canonical timestamp via `$timestamp=tsPattern$`. The scripts rewrite
this to "now" so events appear under recent activity in the console.
4. **Ingest latency** is 5-60s. `test.py` polls for up to 150s.
## Files
- `test.py` — single upload + polling verifier
- `send_burst.py` — N varied events with current timestamps
- `verify_query.py` — query last 15 min of stormshield events
- `run_and_verify.sh` — burst + sleep + verify (regression test)
- `config.example.json` — template, copy to `config.json`
@@ -0,0 +1,12 @@
{
"_comment": "Copy to config.json (gitignored) and fill in your SDL keys. Generate them in the SentinelOne console under Singularity Data Lake -> API Keys. log_write_key needs 'Log Write Access'. log_read_key needs 'Log Read Access'. config_read_key needs 'Configuration Read'. config_write_key needs 'Configuration Write'. console_api_token is a regular console user/service-user API token; it works for query and config methods but NOT for uploadLogs (uploadLogs requires a real Log Write key).",
"base_url": "https://xdr.us1.sentinelone.net/",
"log_write_key": "REPLACE_WITH_LOG_WRITE_KEY",
"log_read_key": "REPLACE_WITH_LOG_READ_KEY",
"config_read_key": "REPLACE_WITH_CONFIG_READ_KEY",
"config_write_key": "REPLACE_WITH_CONFIG_WRITE_KEY",
"console_api_token": "REPLACE_WITH_CONSOLE_API_TOKEN_OR_LEAVE_EMPTY",
"s1_scope": "",
"verify_tls": true,
"timeout_seconds": 30
}
+11
View File
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# One-shot regression: send a burst of events, wait for ingest, then query SDL.
set -euo pipefail
# Run from the directory that holds this script so ./config.json and the
# Python scripts next to it resolve regardless of the caller's working
# directory or where the repository is checked out.
cd "$(dirname "${BASH_SOURCE[0]}")"
echo "============ STEP 1: send burst ============"
python3 send_burst.py
echo
echo "============ STEP 2: wait 40s for ingest ============"
sleep 40
echo
echo "============ STEP 3: query SDL ============"
python3 verify_query.py
+89
View File
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""Send N Stormshield events with current timestamps, varied src IPs/users,
so they appear as a recognizable cluster in the SDL console under
parser="stormshield"."""
import json, time, uuid, urllib.request, urllib.error
from datetime import datetime, timezone, timedelta
# Load the tenant settings once at import time. The context manager closes
# the file handle deterministically instead of leaving it to the garbage
# collector.
with open("./config.json", encoding="utf-8") as _cfg_file:
    CFG = json.load(_cfg_file)
BASE = CFG["base_url"].rstrip("/")
WRITE_KEY = CFG["log_write_key"]
PARSER = "stormshield"
# A handful of plausible variations
USERS = ["aimee.ndzodo", "luc.martin", "claire.dubois", "fatima.khelifi"]
SRCS = ["10.200.0.82", "10.200.0.91", "10.200.1.14", "10.200.2.55"]
# Each destination is (dst ip, dst port, proto name, dst port name).
DSTS = [("192.168.10.7","53","dns_udp","53"),
        ("192.168.10.7","53","dns_udp","53"),
        ("8.8.8.8","53","dns_udp","53"),
        ("1.1.1.1","443","https","443")]
ACTIONS = ["pass", "pass", "pass", "block"]
def _local_now():
    """Return the current local time as an aware datetime with whole seconds."""
    # astimezone() on a naive "now" attaches the host's current UTC offset.
    return datetime.now().astimezone().replace(microsecond=0)
def _ts(now):
    """Format ``now`` for the syslog header and for the embedded ``time=`` field.

    Args:
        now: Event time as a ``datetime``. A naive value is interpreted as
            local time.

    Returns:
        ``(syslog, time_)`` where ``syslog`` is ISO-8601 with a
        colon-separated UTC offset (``2026-05-22T16:32:00+02:00``) and
        ``time_`` is the wall-clock text ``YYYY-MM-DD HH:MM:SS``.
    """
    if now.utcoffset() is None:
        # Without an offset "%z" renders as an empty string, and splicing a
        # colon in by position would corrupt the seconds field ("16:32::00").
        now = now.astimezone()
    syslog = now.isoformat(timespec="seconds")
    time_ = now.strftime("%Y-%m-%d %H:%M:%S")
    return syslog, time_
def build_line(i):
    """Build the ``i``-th synthetic Stormshield syslog line of a burst.

    The event time is "now" plus ``i`` seconds; user, source, destination and
    action are picked round-robin from the module-level tables.

    Args:
        i: Zero-based position of the event in the burst.

    Returns:
        One syslog line (no trailing newline) ready to upload.
    """
    now = _local_now() + timedelta(seconds=i)
    syslog, time_ = _ts(now)
    start = (now - timedelta(seconds=120)).strftime("%Y-%m-%d %H:%M:%S")
    u = USERS[i % len(USERS)]
    src = SRCS[i % len(SRCS)]
    dst, dport, dpname, dportname = DSTS[i % len(DSTS)]
    act = ACTIONS[i % len(ACTIONS)]
    sport = 50000 + i * 137
    # time= and startime= are rendered in the host's local zone, so the tz=
    # field has to carry that same offset rather than a fixed +0200.
    tz = now.strftime("%z")
    return (
        f'<14>1 {syslog} stormshield-v.univ-evry.fr asqd - - - '
        f'?id=firewall time="{time_}" fw="stormshield-v.univ-evry.fr" '
        f'tz={tz} startime="{start}" pri=5 confid=01 slotlevel=2 ruleid={34+i} '
        f'rulename="17209b9db27_{i+1}" user="{u}" domain="ueve.local" '
        f'srcif="sslvpn0" srcifname="sslvpn" ipproto=udp dstif="Ethernet1" dstifname="in" '
        f'proto={dpname} src={src} srcport={sport} srcportname=ephemeral_fw_udp '
        f'dst={dst} dstport={dport} dstportname={dportname} dstname=resolver.example.com '
        f'modsrc={src} modsrcport={sport} origdst={dst} origdstport={dport} '
        f'ipv=4 sent={80+i*8} rcvd={196+i*16} duration=0.0{i} action={act} logtype="connection"'
    )
def send_one(body, idx):
    """Upload one raw log line to ``/api/uploadLogs`` and print the outcome.

    Args:
        body: The log line to send as the request body.
        idx: Position of the event in the burst; used only to label output.

    Returns:
        The HTTP status code, or ``-1`` when no HTTP response was obtained
        (connection failure, timeout).
    """
    nonce = str(uuid.uuid4())
    req = urllib.request.Request(
        f"{BASE}/api/uploadLogs",
        method="POST",
        data=body.encode(),
        headers={
            "Authorization": f"Bearer {WRITE_KEY}",
            "Content-Type": "text/plain",
            "parser": PARSER,
            "Nonce": nonce,
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as r:
            print(f"[{idx}] HTTP {r.status} nonce={nonce[:8]}… body=`{body[:90]}...`")
            return r.status
    except urllib.error.HTTPError as e:
        print(f"[{idx}] HTTP {e.code} {e.read().decode('utf-8', 'replace')[:120]}")
        return e.code
    except OSError as e:
        # URLError and socket timeouts are OSError subclasses. Report the
        # failure and let the caller continue with the rest of the burst
        # instead of aborting with a traceback.
        print(f"[{idx}] upload failed: {type(e).__name__}: {e}")
        return -1
def main():
    """Send a fixed burst of four events, one per second, then print search hints."""
    burst_size = 4
    print(f"Sending {burst_size} Stormshield events to {BASE} ...")
    for seq in range(burst_size):
        line = build_line(seq)
        send_one(line, seq)
        time.sleep(1)
    # Closing hints: where to look for the events and an equivalent query.
    print("\nDone. Wait ~30-60s, then in https://demo.sentinelone.net search:")
    print(' parser="stormshield"')
    print("or run:")
    print(" parser='stormshield' | sort -timestamp | limit 10")


if __name__ == "__main__":
    main()
+181
View File
@@ -0,0 +1,181 @@
#!/usr/bin/env python3
"""
End-to-end test that the Stormshield parser is actually applied at ingest by
SentinelOne SDL.
1. POSTs a raw Stormshield syslog line to /api/uploadLogs with `parser: stormshield`.
2. Polls SDL with PowerQuery to find the event we just ingested.
3. Inspects which OCSF fields are populated to confirm SDL parsed it correctly.
Requires: log_write_key + log_read_key in ./config.json (see config.example.json)
"""
from __future__ import annotations
import json, time, uuid, urllib.request, urllib.error, sys, os
CFG_PATH = "./config.json"
PARSER = "stormshield"
# Per-run tag, sent as the server-host header and echoed in the step-1 output.
SERVER_HOST = "siemtoolkit-test-%d" % int(time.time())

# Every timestamp in the event is "now". The parser takes the embedded
# `time="..."` value as the canonical event time ($timestamp=tsPattern$), so
# that field is rewritten too, not only the syslog header; otherwise the event
# would not show under recent activity in https://demo.sentinelone.net.
from datetime import datetime, timezone, timedelta
import time as _time

_local_tz = datetime.now(timezone.utc).astimezone().tzinfo
_now = datetime.now(_local_tz).replace(microsecond=0)
_start = _now - timedelta(minutes=2)

# Header timestamp: strftime gives "+0200"; the header uses "+02:00".
_header_raw = _now.strftime("%Y-%m-%dT%H:%M:%S%z")
SYSLOG_TS = f"{_header_raw[:-2]}:{_header_raw[-2:]}"
TIME_TS = _now.strftime("%Y-%m-%d %H:%M:%S")
START_TS = _start.strftime("%Y-%m-%d %H:%M:%S")
TZ_OFFSET = _now.strftime("%z")  # compact "+0200" form, used as-is

LOG_LINE = "".join([
    f'<14>1 {SYSLOG_TS} stormshield-v.univ-evry.fr asqd - - - ',
    f'?id=firewall time="{TIME_TS}" fw="stormshield-v.univ-evry.fr" ',
    f'tz={TZ_OFFSET} startime="{START_TS}" pri=5 confid=01 slotlevel=2 ruleid=34 ',
    'rulename="17209b9db27_4" user="aimee.ndzodo" domain="ueve.local" ',
    'srcif="sslvpn0" srcifname="sslvpn" ipproto=udp dstif="Ethernet1" dstifname="in" ',
    'proto=dns_udp src=10.200.0.82 srcport=56637 srcportname=ephemeral_fw_udp ',
    'dst=192.168.10.7 dstport=53 dstportname=dns_udp dstname=hyperion.univ-evry.fr ',
    'modsrc=10.200.0.82 modsrcport=56637 origdst=192.168.10.7 origdstport=53 ',
    'ipv=4 sent=80 rcvd=196 duration=0.00 action=pass logtype="connection"',
])
def _http(method, url, *, headers=None, data=None, timeout=60):
    """Issue one request and return ``(status, text)``.

    HTTP error statuses are returned like any other response rather than
    raised. The body is decoded as UTF-8 with undecodable bytes replaced.
    Failures that produce no HTTP response (connection errors, timeouts)
    propagate to the caller.
    """
    request = urllib.request.Request(
        url,
        data=data,
        headers={} if not headers else headers,
        method=method,
    )
    try:
        response = urllib.request.urlopen(request, timeout=timeout)
    except urllib.error.HTTPError as err:
        return err.code, err.read().decode("utf-8", "replace")
    with response:
        text = response.read().decode("utf-8", "replace")
        return response.status, text
def main():
    """Upload one event, poll PowerQuery until it is visible, then score the OCSF fields.

    Reads ``base_url``, ``log_write_key`` and ``log_read_key`` from
    ``CFG_PATH``.

    Exits via ``sys.exit`` with a message when the upload is rejected
    (status >= 400), with status 2 when no event becomes visible within the
    150 s polling window, and with status 3 when no returned row carries all
    expected OCSF values.
    """
    with open(CFG_PATH) as f:
        cfg = json.load(f)
    base = cfg["base_url"].rstrip("/")
    write_key = cfg["log_write_key"]
    read_key = cfg["log_read_key"]

    nonce = str(uuid.uuid4())
    headers = {
        "Authorization": f"Bearer {write_key}",
        "Content-Type": "text/plain",
        "parser": PARSER,
        "server-host": SERVER_HOST,
        "Nonce": nonce,
    }

    # ── STEP 1: upload the event ────────────────────────────────────────
    print("=" * 70)
    print("STEP 1 — POST /api/uploadLogs")
    print("=" * 70)
    print(f" url = {base}/api/uploadLogs")
    print(f" parser = {PARSER}")
    print(f" server_host = {SERVER_HOST}")
    print(f" nonce = {nonce}")
    print(f" body bytes = {len(LOG_LINE)}")
    print(f" embedded ts = time=\"{TIME_TS}\" (parser uses this as event time)")
    print(f" log line = {LOG_LINE[:140]}...")
    status, body = _http("POST", f"{base}/api/uploadLogs",
                         headers=headers, data=LOG_LINE.encode())
    print(f" -> HTTP {status}")
    print(f" -> {body[:300]}")
    if status >= 400:
        sys.exit(f"uploadLogs failed: {status}")

    # ── STEP 2: poll for the event ──────────────────────────────────────
    # SDL ingest is typically visible in ~5-30s but can take up to 2 min.
    # Note: `server-host` HTTP header is overwritten to "uploadLogs" by SDL,
    # and `parser.name` is None on uploadLogs-ingested events, so the filter
    # is `parser='stormshield' and dataSource.name='Stormshield'`. The query
    # is NOT constrained by the nonce, so rows from other recent uploads can
    # come back alongside ours.
    query = (
        f"parser='{PARSER}' and dataSource.name='Stormshield' "
        "| columns timestamp, dataSource.name, parser, "
        "src_endpoint.ip, src_endpoint.port, dst_endpoint.ip, dst_endpoint.port, "
        "actor.user.name, unmapped.action, unmapped.proto, unmapped.fw, "
        "unmapped.rulename, unmapped.duration, message "
        "| sort -timestamp | limit 5"
    )
    print("=" * 70)
    print("STEP 2 — poll /api/powerQuery (up to 150s)")
    print("=" * 70)
    print(f" query = {query}\n")

    matches: list = []
    columns: list = []
    deadline = time.time() + 150
    waited = 0
    while time.time() < deadline:
        time.sleep(10)
        waited += 10
        end_ms = int(time.time() * 1000)
        start_ms = end_ms - 15 * 60 * 1000
        pq_body = {"query": query, "startTime": str(start_ms), "endTime": str(end_ms)}
        status, body = _http(
            "POST",
            f"{base}/api/powerQuery",
            headers={"Authorization": f"Bearer {read_key}",
                     "Content-Type": "application/json"},
            data=json.dumps(pq_body).encode(),
        )
        if status != 200:
            print(f" t+{waited:3d}s: HTTP {status} {body[:200]}")
            continue
        try:
            result = json.loads(body)
        except ValueError:
            # A 200 with a non-JSON body should not abort the polling loop.
            print(f" t+{waited:3d}s: HTTP 200 but body is not JSON: {body[:200]}")
            continue
        columns = result.get("columns") or []
        values = result.get("values") or []
        n = result.get("matchingEvents", len(values))
        print(f" t+{waited:3d}s: matchingEvents={n}")
        if values:
            matches = list(values)
            break

    if not matches:
        print("\n No events found after 150s. Either ingest is slow today, "
              "or the upload was rejected silently. Inspect upload response above.")
        sys.exit(2)

    # The response uses a columns/values layout. Discover column order.
    col_names = [c.get("name") if isinstance(c, dict) else str(c) for c in columns]
    print(f"\ncolumns: {col_names}")
    print(f"matches: {len(matches)}")

    # ── STEP 3: check the OCSF fields ───────────────────────────────────
    print("\n" + "=" * 70)
    print("STEP 3 — parse results, check OCSF fields are populated")
    print("=" * 70)
    EXPECTED = {
        "src_endpoint.ip": "10.200.0.82",
        "src_endpoint.port": "56637",
        "dst_endpoint.ip": "192.168.10.7",
        "dst_endpoint.port": "53",
        "actor.user.name": "aimee.ndzodo",
    }
    # Score every returned row and keep the best one. The expected values are
    # specific to LOG_LINE, so a fully matching row is an event produced by
    # this script, while rows from other uploads simply score lower.
    hits = 0
    for i, vals in enumerate(matches, 1):
        row = dict(zip(col_names, vals)) if isinstance(vals, list) else vals
        print(f"\n--- match {i} ---")
        for k in col_names:
            v = row.get(k)
            mark = ""
            if k in EXPECTED and str(v) != EXPECTED[k]:
                mark = f" ❌ (expected {EXPECTED[k]!r})"
            print(f" {k:25s} = {v!r}{mark}")
        row_hits = sum(1 for k, want in EXPECTED.items() if str(row.get(k)) == want)
        hits = max(hits, row_hits)

    # Summary
    print(f"\n OCSF rewrites populated: {hits}/{len(EXPECTED)}")
    if hits == len(EXPECTED):
        print(" → SDL parser applied the rewrites correctly. ✅")
    else:
        print(" → Some rewrites missing — the SDL parser may not have run.")
        # Non-zero status so shells and CI can tell the check failed.
        sys.exit(3)


if __name__ == "__main__":
    main()
+69
View File
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""Query SDL to verify recent Stormshield events landed and were parsed."""
import json, time, urllib.request, sys
# Load the tenant settings; the context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open("./config.json", encoding="utf-8") as _cfg_file:
    CFG = json.load(_cfg_file)
BASE = CFG["base_url"].rstrip("/")
READ_KEY = CFG["log_read_key"]

# Query window: the last 15 minutes, as epoch milliseconds.
now_ms = int(time.time() * 1000)
start_ms = now_ms - 15 * 60 * 1000 # last 15 minutes
QUERY = (
    "parser='stormshield' "
    "| columns timestamp, dataSource.name, parser, "
    "src_endpoint.ip, src_endpoint.port, dst_endpoint.ip, dst_endpoint.port, "
    "actor.user.name, unmapped.action, unmapped.proto, unmapped.fw, unmapped.rulename "
    "| sort -timestamp | limit 10"
)
body = json.dumps({
    "query": QUERY,
    "startTime": str(start_ms),
    "endTime": str(now_ms),
}).encode()
req = urllib.request.Request(
    f"{BASE}/api/powerQuery",
    method="POST",
    data=body,
    headers={
        "Authorization": f"Bearer {READ_KEY}",
        "Content-Type": "application/json",
    },
)
# An HTTP error status raises here and ends the script with a traceback.
with urllib.request.urlopen(req, timeout=60) as r:
    resp = json.loads(r.read())

# Column entries may be objects with a "name" key or bare names; accept both.
cols = [c.get("name") if isinstance(c, dict) else str(c) for c in resp.get("columns") or []]
values = resp.get("values") or []
total = resp.get("matchingEvents", len(values))
print(f"query = {QUERY}")
print("window = last 15 min")
print(f"matchingEvents = {total}")
print(f"cols = {cols}")
print()
if not values:
    print("No events visible yet. SDL ingest can take 30-90s; re-run verify_query.py in a minute.")
    sys.exit(1)

print(f"{'timestamp(ns)':>20} {'src':<16} {'sport':<6} -> {'dst':<16} {'dport':<6} {'user':<20} {'action':<8} {'proto':<8}")
print("-" * 110)
for row in values:
    d = dict(zip(cols, row))
    print(
        # str() first: a None value rejects the ">20" format spec.
        f"{str(d.get('timestamp','')):>20} "
        f"{str(d.get('src_endpoint.ip','')):<16} "
        f"{str(d.get('src_endpoint.port','')):<6} -> "
        f"{str(d.get('dst_endpoint.ip','')):<16} "
        f"{str(d.get('dst_endpoint.port','')):<6} "
        f"{str(d.get('actor.user.name','')):<20} "
        f"{str(d.get('unmapped.action','')):<8} "
        f"{str(d.get('unmapped.proto','')):<8}"
    )
print()
print("✅ Events are visible in the SDL data lake under parser='stormshield'")
print(" Search in https://demo.sentinelone.net with: parser=\"stormshield\"")