Files
marcredhat-kql/harness/check_ts_collisions.py
T

35 lines
1.2 KiB
Python

#!/usr/bin/env python3
"""Count duplicate timestamps within the generated JSONL.
SDL appears to dedupe addEvents by (session, ts) - events sharing a ts
within the same session are silently dropped. If our generator emits many
events at colliding ts_epoch_ms values, only one of each cluster survives.
"""
import json
from collections import Counter, defaultdict
from pathlib import Path
# Default input: <repo root>/sample_data/events.jsonl, resolved relative to
# this script so it works regardless of the current working directory.
JSONL = Path(__file__).resolve().parents[1] / "sample_data" / "events.jsonl"


def count_timestamps(lines):
    """Tally events and distinct timestamps per event type.

    Args:
        lines: Iterable of JSONL text lines. Each non-blank line must be a
            JSON object with "event_type" and "ts_epoch_ms" keys.

    Returns:
        A ``(totals, unique_ts)`` pair: ``totals`` is a Counter mapping
        event type to the number of events seen, and ``unique_ts`` maps
        event type to the set of distinct ``ts_epoch_ms`` values seen.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "event_type" or "ts_epoch_ms".
    """
    totals = Counter()
    unique_ts = defaultdict(set)
    for line in lines:
        # Tolerate blank lines (e.g. stray newlines between records) instead
        # of aborting the whole report on a JSON decode error.
        if not line.strip():
            continue
        record = json.loads(line)
        event_type = record["event_type"]
        totals[event_type] += 1
        # NOTE(review): the module docstring describes dedupe by
        # (session, ts), but this tally groups by event_type only - confirm
        # that event_type is the intended grouping key.
        unique_ts[event_type].add(record["ts_epoch_ms"])
    return totals, unique_ts


def format_report(totals, unique_ts):
    """Render the collision table as a list of output lines.

    Args:
        totals: Mapping of event type to event count.
        unique_ts: Mapping of event type to its set of distinct timestamps.

    Returns:
        List of strings: header, rule, one row per event type (sorted by
        name), rule, and a TOTAL row.
    """
    rule = "-" * 70
    report = [
        f"{'event_type':30s} {'events':>8} {'uniq_ts':>8} {'collision_loss%':>16}",
        rule,
    ]
    for event_type in sorted(totals):
        n = totals[event_type]
        u = len(unique_ts[event_type])
        # Share of events that would be dropped if only one event per
        # distinct timestamp survives.
        loss = 100 * (n - u) / n if n else 0
        report.append(f"{event_type:30s} {n:>8} {u:>8} {loss:>15.1f}%")
    report.append(rule)
    # The unique column of the TOTAL row is the sum of per-type unique
    # counts; the same timestamp under two event types is counted twice.
    report.append(
        f"{'TOTAL':30s} {sum(totals.values()):>8} "
        f"{sum(len(s) for s in unique_ts.values()):>8}"
    )
    return report


def main(path=JSONL):
    """Read the JSONL file at *path* and print the collision report."""
    # Explicit encoding so decoding does not depend on the platform locale.
    with path.open(encoding="utf-8") as f:
        totals, unique_ts = count_timestamps(f)
    print("\n".join(format_report(totals, unique_ts)))


if __name__ == "__main__":
    main()