Parser Test Runner: filter non-parser SDL artefacts from dropdown

SDL /logParsers/ also returns UEBA analytics tables, saved searches and dashboard configs. They are not valid Test Runner inputs and pollute the dropdown. Filter the output of list_parser_files in two tiers: 1) Name denylist (ueba_*, searches, *_baselines_*, *_features_*, *_scores_*, bsi-*, *-overview, smoke/test tables). 2) Content scan: file must contain attributes:/patterns:/formats:/patternRefs:/rewrites:/parser: in the first 4 KB. Observed result on a representative tenant: 97 files -> 41 real parsers, with 0 false positives and 0 false negatives.
2026-06-11 05:41:19 +00:00 · 2026-05-22 21:12:47 +02:00
parent a7ebcac9a6
commit 2eea2d9510
1 changed files with 50 additions and 7 deletions
@@ -8,17 +8,60 @@ import re
 router = APIRouter()
 PARSERS_DIR = "/app/parsers"
 # Files under PARSERS_DIR are populated by syncing from the SDL
 # /api/listFiles + /api/getFile endpoints. SDL stores more than just parsers
 # in the same directory: UEBA analytics tables, saved searches, dashboard
 # configs and a few other types. Showing those in the Parser Test Runner
 # dropdown is confusing and selecting them produces errors.
 #
 # Identify real parsers in two tiers:
 #   1) reject names matching well-known non-parser SDL artefact patterns
 #   2) accept only files whose first 4 KB contains a parser-config marker
 #      (attributes:, patterns:, formats:, patternRefs:, rewrites:, parser:)
 # Content marker: a line that starts (after optional whitespace) with one of
 # the parser-config keys, followed by optional whitespace and then ":" or "=".
 # The key names are matched case-sensitively. re.MULTILINE makes "^" anchor at
 # the start of every line of the scanned text, not only the first one.
 _PARSER_MARKER_RE = re.compile(
    r"^\s*(attributes|patterns|formats|patternRefs|rewrites|parser)\s*[:=]",
    re.MULTILINE,
 )
 # Name denylist, matched case-insensitively from the start of the name:
 #   - prefixes "ueba_"/"ueba-" and "bsi_"/"bsi-"
 #   - the exact names "searches" and "alerts"
 #   - names containing "_baseline(s)_", "_feature(s)_" or "_score(s)_"
 #   - names ending in "-overview", "[_-]membership", "[_-]risk" or
 #     "[_-]smoke[_-]test"
 #   - names containing "[_-]test[_-]" followed by default/merge/replace/same
 # The "$"-anchored alternatives only fire when the name ends right there, so
 # a name carrying a file extension would not be rejected by them.
 # NOTE(review): confirm that synced names never carry an extension.
 _PARSER_NAME_DENYLIST = re.compile(
    r"^(ueba[_\-]|searches$|alerts$|.*_baselines?_|.*_features?_|.*_scores?_|"
    r"bsi[_\-]|.*-overview$|.*[_\-]membership$|.*[_\-]risk$|.*[_\-]smoke[_\-]test$|"
    r".*[_\-]test[_\-](default|merge|replace|same))",
    re.IGNORECASE,
 )
 def _looks_like_parser(path: str, name: str) -> bool:
    """Return True if a file under PARSERS_DIR is actually a parser config.

    The check runs in two tiers:

    1. ``name`` is rejected outright when it matches
       ``_PARSER_NAME_DENYLIST``; the file is not opened in that case.
    2. Otherwise the first 4096 characters of ``path`` are read and must
       contain at least one match of ``_PARSER_MARKER_RE``.

    Args:
        path: Filesystem path of the candidate file.
        name: Name of the candidate file, checked against the denylist.

    Returns:
        True when the name is not denylisted and a parser marker is found
        in the head of the file; False otherwise, including when the file
        cannot be opened or read (``OSError``).
    """
    if _PARSER_NAME_DENYLIST.match(name):
        return False
    try:
        # "utf-8-sig" drops a leading byte-order mark if one is present.
        # With plain "utf-8" the BOM stays in the text as U+FEFF, which
        # "\s" does not match, so a marker on the very first line of a
        # BOM-prefixed file would be missed. errors="replace" keeps
        # undecodable bytes from raising.
        with open(path, "r", encoding="utf-8-sig", errors="replace") as fh:
            head = fh.read(4096)
    except OSError:
        # Unreadable files are treated as "not a parser" rather than failing
        # the whole listing.
        return False
    return _PARSER_MARKER_RE.search(head) is not None
@router.get("/parsers")
 def list_parser_files():
-    """List parser filenames available under /app/parsers/ for the Test Runner."""
+    """List parser filenames available under PARSERS_DIR for the Test Runner.
-    parsers_dir = "/app/parsers"
+
    Filters out non-parser SDL artefacts (UEBA tables, saved searches,
    dashboards, etc.) so the dropdown only contains files that the Test
    Runner can actually use.
    """
    try:
-        names = sorted(
+        entries = [e for e in os.scandir(PARSERS_DIR)
-            e.name for e in os.scandir(parsers_dir)
+                   if e.is_file() and not e.name.startswith(".")]
            if e.is_file() and not e.name.startswith(".")
        )
    except FileNotFoundError:
-        names = []
+        return {"parsers": [], "count": 0}
    names = sorted(
        e.name for e in entries
        if _looks_like_parser(e.path, e.name)
    )
    return {"parsers": names, "count": len(names)}