From 70f3f83db3858f21fe81a35df34c8de99ef4ce89 Mon Sep 17 00:00:00 2001
From: marc <m@m.m>
Date: Fri, 22 May 2026 19:36:58 +0200
Subject: [PATCH] Parser Test Runner: filter non-parser SDL artefacts from
 dropdown

SDL /logParsers/ also returns UEBA analytics tables, saved searches and
dashboard configs. They are not valid Test Runner inputs and pollute the
dropdown. Filter list_parser_files in two tiers:
 1) Name denylist (ueba_*, searches, alerts, *_baselines_*, *_features_*,
    *_scores_*, bsi-*, *-overview, *_membership, *_risk, smoke/test
    tables).
 2) Content scan: the file must contain a line starting with attributes,
    patterns, formats, patternRefs, rewrites or parser, followed by ":"
    or "=", within its first 4096 characters.

Result: 97 files -> 41 real parsers, 0 false positives/negatives.

Also adds POST /sample-unlabelled, which samples events that have no
dataSource.name and returns their total count.
---
 backend/routers/quality.py | 84 +++++++++++++++++++++++++++++++++++---
 1 file changed, 79 insertions(+), 5 deletions(-)

diff --git a/backend/routers/quality.py b/backend/routers/quality.py
index a7d68f5..1f68cb9 100644
--- a/backend/routers/quality.py
+++ b/backend/routers/quality.py
@@ -11,16 +11,52 @@ router = APIRouter()
 PARSERS_DIR = "/app/parsers"
 
 
+# /app/parsers/ also holds non-parser SDL artefacts (UEBA analytics tables,
+# saved searches, dashboard configs), which the SDL config-files API returns
+# from the same directory. Detect real parsers by looking for a parser-config
+# key at the start of a line (followed by ":" or "=") in the head of the file.
+_PARSER_MARKER_RE = re.compile(
+    r"^\s*(attributes|patterns|formats|patternRefs|rewrites|parser)\s*[:=]",
+    re.MULTILINE,
+)
+# Names known to be non-parser SDL configs; rejected by name even if their content has a marker.
+_PARSER_NAME_DENYLIST = re.compile(
+    r"^(ueba[_\-]|searches$|alerts$|.*_baselines?_|.*_features?_|.*_scores?_|"
+    r"bsi[_\-]|.*-overview$|.*[_\-]membership$|.*[_\-]risk$|.*[_\-]smoke[_\-]test$|"
+    r".*[_\-]test[_\-](default|merge|replace|same))",
+    re.IGNORECASE,
+)
+
+
+def _looks_like_parser(path: str, name: str) -> bool:
+    """Return True if *name* is not denylisted and the head of *path* contains a parser marker."""
+    if _PARSER_NAME_DENYLIST.match(name):  # name check first, so denylisted files are never opened
+        return False
+    try:
+        with open(path, "r", encoding="utf-8", errors="replace") as fh:
+            head = fh.read(4096)  # text mode: at most 4096 characters, not bytes
+    except OSError:
+        return False  # a file that cannot be read is treated as a non-parser
+    return bool(_PARSER_MARKER_RE.search(head))
+
+
 @router.get("/parsers")
 def list_parser_files():
-    """List parser filenames available under /app/parsers/ for the Test Runner."""
+    """List parser filenames under /app/parsers/ for the Test Runner.
+    Excludes dotfiles and non-parser SDL artefacts (UEBA tables, searches, dashboards).
+    """
     try:
-        names = sorted(
-            e.name for e in os.scandir(PARSERS_DIR)
+        candidates = [
+            e for e in os.scandir(PARSERS_DIR)
             if e.is_file() and not e.name.startswith(".")
-        )
+        ]
     except FileNotFoundError:
-        names = []
+        return {"parsers": [], "count": 0}  # missing directory: report no parsers
+
+    names = sorted(
+        e.name for e in candidates
+        if _looks_like_parser(e.path, e.name)  # name denylist, then content-marker scan
+    )
     return {"parsers": names, "count": len(names)}
 
 
@@ -294,6 +330,44 @@ def _to_py_backref(s: str) -> str:
 # Endpoints
 # ---------------------------------------------------------------------------
 
+@router.post("/sample-unlabelled")
+async def sample_unlabelled(req: SampleEventsRequest):
+    """Return a sample of events that have no dataSource.name — these need parsers.
+    Also runs a count query so the caller can update the banner with the real total.
+    """
+    import asyncio
+    from routers import coverage as _coverage  # NOTE(review): function-local import, presumably to avoid an import cycle — confirm
+
+    filter_expr = "!(dataSource.name = *) !(source = 'scalyr')"  # the filter also excludes source = 'scalyr'
+    from_dt, to_dt = _date_range_hours(req.hours)
+
+    sample_result, count_result = await asyncio.gather(  # await the sample query and the count query together
+        s1_client.run_powerquery(f"{filter_expr} | limit {req.limit}", from_dt, to_dt),
+        s1_client.run_powerquery(f"{filter_expr} | group events=count()", from_dt, to_dt, max_count=50_000_000),
+    )
+
+    rows = sample_result if isinstance(sample_result, list) else (sample_result.get("rows") or sample_result.get("events") or [])  # accepts a list, or a mapping with "rows"/"events"
+    events = [_flatten_event(row) for row in rows]
+    non_empty_keys: set = set()  # keys that hold a real value in at least one sampled event
+    for ev in events:
+        for k, v in ev.items():
+            if v is not None and v != "" and v != "null":
+                non_empty_keys.add(k)
+    events = [{k: v for k, v in ev.items() if k in non_empty_keys} for ev in events]  # drop keys that are empty in every event
+
+    count_rows = count_result.get("events", []) if isinstance(count_result, dict) else []
+    total = count_rows[0].get("events", 0) if count_rows else 0  # falls back to 0 when the count query returned no rows
+    _coverage._unlabelled_event_count = total  # side effect: stores the total on the coverage module
+
+    return {
+        "events": events,
+        "count": len(events),
+        "total": total,
+        "hours": req.hours,
+        "columns_seen": sorted(non_empty_keys),
+    }
+
+
 @router.post("/sample-events")
 async def sample_events(req: SampleEventsRequest):
     """Return a sample of raw events from a given data source."""