From 2c40bf81eef5ac1da9ca9bcfc5448e22389fd9a4 Mon Sep 17 00:00:00 2001 From: Mick <119439091+mickbrowns1@users.noreply.github.com> Date: Fri, 22 May 2026 10:11:42 -0400 Subject: [PATCH] Cherry-pick improvements from PR #2 (marcredhat) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - s1_client: configurable PowerQuery timeout via SDL_PQ_TIMEOUT env var (default 600s, was hardcoded 120s) with separate connect/read timeouts via httpx.Timeout; retry on ReadTimeout via SDL_PQ_TIMEOUT_RETRIES; better error messages include query snippet and parse non-JSON responses - ingest: fix simulate-filter SDL syntax (== → =, drop leading | on base expression, surface PowerQuery error field, cleaner empty-filter fallback) - docker-compose: pass SDL_PQ_TIMEOUT and SDL_PQ_TIMEOUT_RETRIES through to backend container with sensible defaults Not taken from PR #2: - .gitignore parsers/* change — would untrack the 7 committed parser files - s1_client/quality/coverage changes already present in main from prior work Co-Authored-By: Claude Sonnet 4.6 --- backend/routers/ingest.py | 19 +++++++++++++------ backend/services/s1_client.py | 34 ++++++++++++++++++++++++++++------ docker-compose.yml | 2 ++ 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/backend/routers/ingest.py b/backend/routers/ingest.py index a665731..146e5f3 100644 --- a/backend/routers/ingest.py +++ b/backend/routers/ingest.py @@ -90,21 +90,28 @@ async def simulate_filter(rule: FilterRule): """Estimate how many events and GB would be eliminated by an exclusion filter.""" from_dt, to_dt = _date_range(rule.days) + # Build Scalyr filter expression clauses (uses = not ==, SDL syntax) clauses = [] if rule.source: - clauses.append(f"dataSource.name=='{rule.source}'") + clauses.append(f"dataSource.name = '{rule.source}'") if rule.event_type: - clauses.append(f"event.type=='{rule.event_type}'") + clauses.append(f"event.type = '{rule.event_type}'") if clauses: - filter_expr = " and ".join(clauses) - query = f"| filter {filter_expr} | group events=count()" + filter_expr = " ".join(clauses) + query = f"{filter_expr} | group events=count()" else: - query = "| group events=count()" + query = "dataSource.name != '' | group events=count()" try: result = await s1_client.run_powerquery(query, from_dt, to_dt) - events = (result.get("events") or [{}])[0].get("events", 0) if isinstance(result.get("events"), list) else 0 + err = result.get("error") if isinstance(result, dict) else None + if err: + raise HTTPException(502, f"PowerQuery error: {err}") + rows = result.get("events") or [] + events = rows[0].get("events", 0) if rows else 0 + except HTTPException: + raise except Exception as e: raise HTTPException(502, f"PowerQuery error: {e}") diff --git a/backend/services/s1_client.py b/backend/services/s1_client.py index 9b7497b..f8cf1fa 100644 --- a/backend/services/s1_client.py +++ b/backend/services/s1_client.py @@ -6,6 +6,12 @@ from datetime import datetime, timezone BASE_URL = os.environ.get("S1_BASE_URL", "https://demo.sentinelone.net").rstrip("/") TOKEN = os.environ.get("S1_API_TOKEN", "") +# Configurable PowerQuery timeout — SDL queries on large tenants can exceed 2 min. +# Set SDL_PQ_TIMEOUT in .env (seconds). Default: 600. +SDL_PQ_TIMEOUT = int(os.environ.get("SDL_PQ_TIMEOUT", "600")) +# How many times to retry on ReadTimeout before giving up. Default: 1 (one retry). +SDL_PQ_TIMEOUT_RETRIES = int(os.environ.get("SDL_PQ_TIMEOUT_RETRIES", "1")) + # Scalyr/XDR PowerQuery credentials — from SDL_XDR_URL + SDL_LOG_READ_KEY # in the SentinelOne console: Settings → Integrations → Data Lake API Keys SDL_XDR_URL = os.environ.get("SDL_XDR_URL", "https://xdr.us1.sentinelone.net").rstrip("/") @@ -117,8 +123,12 @@ async def run_powerquery(query: str, from_date: str, to_date: str, max_count: in "maxCount": max_count, } - async with httpx.AsyncClient(timeout=120) as client: - for attempt in range(3): + # Use a generous read timeout for PowerQuery — large SDL scans can be slow. + pq_timeout = httpx.Timeout(connect=15.0, read=SDL_PQ_TIMEOUT, write=30.0, pool=15.0) + max_attempts = 2 + SDL_PQ_TIMEOUT_RETRIES # base 2 (rate-limit) + timeout retries + + async with httpx.AsyncClient(timeout=pq_timeout) as client: + for attempt in range(max_attempts): try: resp = await client.post( f"{SDL_XDR_URL}/api/powerQuery", @@ -126,12 +136,24 @@ async def run_powerquery(query: str, from_date: str, to_date: str, max_count: in ) resp.raise_for_status() break - except httpx.HTTPStatusError as e: - if e.response.status_code == 429 and attempt < 2: - await asyncio.sleep(10 * (attempt + 1)) + except httpx.ReadTimeout: + if attempt < max_attempts - 1: + await asyncio.sleep(5) continue raise RuntimeError( - f"HTTP {e.response.status_code} from {e.request.url}: {e.response.text[:500]}" + f"PowerQuery timed out after {SDL_PQ_TIMEOUT}s " + f"(increase SDL_PQ_TIMEOUT in .env). Query: {query[:200]}" + ) + except httpx.HTTPStatusError as e: + if e.response.status_code == 429 and attempt < max_attempts - 1: + await asyncio.sleep(10 * (attempt + 1)) + continue + try: + detail = e.response.json() + except Exception: + detail = e.response.text[:500] + raise RuntimeError( + f"HTTP {e.response.status_code} from {e.request.url}: {detail}" ) from e data = resp.json() diff --git a/docker-compose.yml b/docker-compose.yml index ba9bd04..383a635 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,6 +16,8 @@ services: - SDL_XDR_URL=${SDL_XDR_URL} - SDL_LOG_READ_KEY=${SDL_LOG_READ_KEY} - SDL_CONFIG_READ_KEY=${SDL_CONFIG_READ_KEY} + - SDL_PQ_TIMEOUT=${SDL_PQ_TIMEOUT:-600} + - SDL_PQ_TIMEOUT_RETRIES=${SDL_PQ_TIMEOUT_RETRIES:-1} - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} - DATABASE_URL=postgresql://siem:siem@db:5432/siem - DETECTIONS_FILE=/app/data/detections.json