# marcredhat-kql/harness/check_ts_collisions.py

#!/usr/bin/env python3
"""Count duplicate timestamps per event type within the generated JSONL.

SDL appears to dedupe addEvents by (session, ts) - events sharing a ts
within the same session are silently dropped. If our generator emits many
events at colliding ts_epoch_ms values, only one of each cluster survives.
"""
import json
from collections import Counter, defaultdict
from pathlib import Path

# Input file: <repo root>/sample_data/events.jsonl, resolved relative to this
# script so the check works regardless of the current working directory.
JSONL = Path(__file__).resolve().parents[1] / "sample_data" / "events.jsonl"


def count_collisions(lines):
    """Tally events and distinct timestamps per event type.

    Args:
        lines: Iterable of JSONL text lines. Every non-blank line must be a
            JSON object with "event_type" and "ts_epoch_ms" keys.

    Returns:
        A ``(totals, unique_ts)`` pair: ``totals`` is a Counter mapping each
        event type to its number of events, and ``unique_ts`` maps each event
        type to the set of distinct ``ts_epoch_ms`` values seen for it.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "event_type" or "ts_epoch_ms".
    """
    totals = Counter()
    unique_ts = defaultdict(set)
    for line in lines:
        # Blank lines (e.g. an extra trailing newline) are not records;
        # json.loads would raise on them, so skip them.
        if not line.strip():
            continue
        record = json.loads(line)
        event_type = record["event_type"]
        totals[event_type] += 1
        unique_ts[event_type].add(record["ts_epoch_ms"])
    return totals, unique_ts


def format_report(totals, unique_ts):
    """Render the per-event-type collision table as a list of text lines.

    Args:
        totals: Mapping of event type to event count.
        unique_ts: Mapping of event type to the set of distinct timestamps.

    Returns:
        The report lines in print order: header, rule, one row per event
        type (sorted by name), rule, and a TOTAL row.
    """
    rule = "-" * 70
    rows = [
        f"{'event_type':30s} {'events':>8} {'uniq_ts':>8} {'collision_loss%':>16}",
        rule,
    ]
    for event_type in sorted(totals):
        n = totals[event_type]
        u = len(unique_ts[event_type])
        # Share of this type's events that would be dropped if only one
        # event per distinct timestamp survived.
        loss = 100 * (n - u) / n if n else 0
        rows.append(f"{event_type:30s} {n:>8} {u:>8} {loss:>15.1f}%")
    rows.append(rule)
    # NOTE(review): the unique total is the sum of per-type unique counts, so
    # a timestamp shared by two event types is counted once for each type.
    rows.append(
        f"{'TOTAL':30s} {sum(totals.values()):>8} "
        f"{sum(len(s) for s in unique_ts.values()):>8}"
    )
    return rows


def main():
    """Read the JSONL file and print the collision report to stdout."""
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with JSONL.open(encoding="utf-8") as f:
        totals, unique_ts = count_collisions(f)
    for row in format_report(totals, unique_ts):
        print(row)


if __name__ == "__main__":
    main()