mirror of
https://github.com/marcredhat/kql
synced 2026-06-08 21:27:09 +00:00
35 lines
1.2 KiB
Python
35 lines
1.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Count duplicate timestamps within the generated JSONL.
|
|
|
|
SDL appears to dedupe addEvents by (session, ts) - events sharing a ts
|
|
within the same session are silently dropped. If our generator emits many
|
|
events at colliding ts_epoch_ms values, only one of each cluster survives.
|
|
"""
|
|
import json
|
|
from collections import Counter, defaultdict
|
|
from pathlib import Path
|
|
|
|
JSONL = Path(__file__).resolve().parents[1] / "sample_data" / "events.jsonl"
|
|
|
|
per_type_total = Counter()
|
|
per_type_unique = defaultdict(set)
|
|
per_type_max_collision = defaultdict(int)
|
|
with JSONL.open() as f:
|
|
for line in f:
|
|
r = json.loads(line)
|
|
et = r["event_type"]
|
|
ts = r["ts_epoch_ms"]
|
|
per_type_total[et] += 1
|
|
per_type_unique[et].add(ts)
|
|
|
|
print(f"{'event_type':30s} {'events':>8} {'uniq_ts':>8} {'collision_loss%':>16}")
|
|
print("-" * 70)
|
|
for et in sorted(per_type_total):
|
|
n = per_type_total[et]
|
|
u = len(per_type_unique[et])
|
|
loss = 100 * (n - u) / n if n else 0
|
|
print(f"{et:30s} {n:>8} {u:>8} {loss:>15.1f}%")
|
|
print("-" * 70)
|
|
print(f"{'TOTAL':30s} {sum(per_type_total.values()):>8} "
|
|
f"{sum(len(s) for s in per_type_unique.values()):>8}")
|