mcu(health): commit rate-limit window before early returns (AUDIT-CAL follow-up)

checkSystemHealth() had three watchdog blocks with the identical
"last_X_check not updated on error path" bug — same root cause as
AUDIT-CAL (BMP180 fix in commit 95aed35), distinct sites:

  AD9523 clock check   (5 s)  main.cpp:693-705
  ADAR1000 comm check  (2 s)  main.cpp:729-749
  IMU comm check       (10 s) main.cpp:752-760

Pre-fix, each block placed `last_X_check = HAL_GetTick();` below the
early-return path, so once the underlying check (STATUS0/1 RESET,
SCRATCHPAD verify fail, GY85_Update false) started failing, the
rate-limit window never engaged. Every subsequent iteration of the
main while(1) loop re-fired the corresponding ERROR_*. With
error_count > 10 latching system_emergency_state per MCU-N1, the
radar would trip into SAFE-MODE within ~10 main-loop iterations of
the first transient — far short of the intended ~100-150 s grace
window meant for operator intervention or attemptErrorRecovery
to succeed. ADAR1000 comm-failure also re-ran the 16 ms blocking
SPI verify (4 devices × 4 ms HAL_Delay) per iteration → chirp jitter.

Fix at all three sites: move the timestamp update INTO the if-block
and BEFORE any sub-check call. Mirrors the AUDIT-CAL post-fix
BMP180 block at main.cpp:771-780. ADAR1000 overtemp check stays
per-loop (unchanged) — over-temperature must remain responsive.

Test: tests/test_audit_imu_watchdog_cadence.c (6 tests, 6/6 PASS)
exercises the post-fix predicate against simulated HAL_GetTick()
ticks and a controllable GY85_Update() mock; counter-test runs the
pre-fix predicate to demonstrate the regression. Test uses IMU as
representative; AD9523 (5 s) and ADAR1000 (2 s) sites have identical
control flow.

Verification: full MCU host suite 34/34 PASS (was 33/33; +1 new test,
0 regressions).
This commit is contained in:
Jason
2026-04-29 20:57:50 +05:45
parent 1f307f77a9
commit 1b1b5f4fb2
3 changed files with 276 additions and 5 deletions
@@ -74,6 +74,7 @@ TESTS_STANDALONE := test_bug12_pa_cal_loop_inverted \
test_mcu_a4_ocxo_warm_restart \
test_audit_c17_bmp180_sentinel_and_cast \
test_audit_cal_bmp180_begin \
test_audit_imu_watchdog_cadence \
test_gap3_iwdg_config \
test_gap3_temperature_max \
test_gap3_idq_periodic_reread \
@@ -191,6 +192,9 @@ test_audit_c17_bmp180_sentinel_and_cast: test_audit_c17_bmp180_sentinel_and_cast
test_audit_cal_bmp180_begin: test_audit_cal_bmp180_begin.c
$(CC) $(CFLAGS) $< -o $@
test_audit_imu_watchdog_cadence: test_audit_imu_watchdog_cadence.c
$(CC) $(CFLAGS) $< -o $@
# Gap-3 safety tests -- mock-only (needs spy log for GPIO sequence)
test_gap3_emergency_stop_rails: test_gap3_emergency_stop_rails.c $(MOCK_OBJS)
$(CC) $(CFLAGS) $(INCLUDES) $< $(MOCK_OBJS) -o $@
@@ -0,0 +1,244 @@
/*******************************************************************************
* test_audit_imu_watchdog_cadence.c
*
* AUDIT-CAL follow-up: identical "last_X_check not updated on error path" bug
* existed at THREE sites in checkSystemHealth() (pre commit):
* - AD9523 clock check (main.cpp:693-705, 5 s rate-limit broken)
* - ADAR1000 comm check (main.cpp:729-749, 2 s rate-limit broken)
* - IMU comm check (main.cpp:752-760, 10 s rate-limit broken)
*
* All three shared one fix pattern: commit `last_X_check = HAL_GetTick();`
* BEFORE any early return inside the watchdog block, so a transient sub-check
* failure cannot bypass the rate-limit window. This test uses the IMU branch
* as the representative predicate (10 s window is the longest; pre-fix demo
* is most dramatic) the AD9523 (5 s) and ADAR1000 (2 s) sites have
* identical control flow and are covered by code review against the test's
* extracted predicate.
*
* The pre-fix watchdog block was:
*
* static uint32_t last_imu_check = 0;
* if (HAL_GetTick() - last_imu_check > 10000) {
* if (!GY85_Update(&imu)) {
* current_error = ERROR_IMU_COMM;
* return current_error; // <-- skips timestamp update
* }
* last_imu_check = HAL_GetTick(); // <-- only on success path
* }
*
* Once GY85_Update started returning false (transient I2C glitch, cable pull,
* sensor brown-out), the 10 s rate-limit never engaged. Every iteration of
* the main while(1) loop fired ERROR_IMU_COMM. Combined with the MCU-N1
* `error_count > 10 -> system_emergency_state=true` latch, the radar would
* trip into SAFE-MODE within ~10 main-loop iterations of the first IMU
* failure far short of the intended ~100 s (10 errors x 10 s) grace window
* meant to give an operator time to intervene or `attemptErrorRecovery` to
* succeed.
*
* Same bug class as AUDIT-CAL (BMP180 watchdog at main.cpp:771-780); same
* fix pattern: move `last_imu_check = HAL_GetTick();` to BEFORE the early
* return, so the rate-limit window commits on every fired-watchdog call,
* regardless of whether the underlying check passed.
*
* Test strategy:
* - Extract the post-fix watchdog predicate into a pure function.
* - Drive it with a simulated HAL_GetTick() and a controllable
* GY85_Update() mock; assert error count tracks the 10 s rate-limit and
* never tracks the main-loop iteration count.
* - Add a counter-test using the pre-fix predicate to demonstrate the
* regression we are guarding against.
******************************************************************************/
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
/* ---- Mock state ---- */
static int gy85_returns_ok = 1; /* 1=PASS, 0=FAIL */
static int gy85_call_count = 0; /* incremented each invocation */
static int mock_GY85_Update(void)
{
gy85_call_count++;
return gy85_returns_ok;
}
/* ============================================================================
* Post-fix predicate (matches the new main.cpp:760-769 block).
* Returns 1 iff this call raises ERROR_IMU_COMM.
* Updates last_imu_check BEFORE the early return on GY85 failure, so a
* flapping IMU never bypasses the rate-limit.
* ============================================================================ */
static uint32_t last_imu_check_postfix = 0;
static int imu_watchdog_postfix(uint32_t now_tick)
{
if (now_tick - last_imu_check_postfix > 10000) {
last_imu_check_postfix = now_tick; /* commit BEFORE check */
if (!mock_GY85_Update()) {
return 1; /* ERROR_IMU_COMM */
}
}
return 0;
}
/* ============================================================================
* Pre-fix predicate (matches the old main.cpp:752-760 block, kept here as a
* counter-test). Updates last_imu_check ONLY on the success path.
* ============================================================================ */
static uint32_t last_imu_check_prefix = 0;
static int imu_watchdog_prefix(uint32_t now_tick)
{
if (now_tick - last_imu_check_prefix > 10000) {
if (!mock_GY85_Update()) {
return 1; /* ERROR_IMU_COMM */
}
last_imu_check_prefix = now_tick; /* only on success */
}
return 0;
}
/* ---- Test bookkeeping ---- */
static void reset_state(void)
{
last_imu_check_postfix = 0;
last_imu_check_prefix = 0;
gy85_call_count = 0;
gy85_returns_ok = 1;
}
int main(void)
{
printf("=== AUDIT-CAL follow-up: IMU watchdog rate-limit on error path ===\n");
/* ----------------------------------------------------------------
* T1: Healthy IMU only one GY85_Update per 10 s window.
* Drive 10 s of 10 ms-spaced main-loop ticks (1000 calls).
* Expect: 1 invocation per 10 s window (so ~1-2 across the run).
* ---------------------------------------------------------------- */
printf(" T1 healthy IMU — calls rate-limited to ~1 per 10 s... ");
reset_state();
int errors = 0;
for (int i = 0; i <= 1000; i++) {
errors += imu_watchdog_postfix(i * 10); /* 0..10000 ms */
}
/* First call (t=0): now - last(0) = 0, so the > 10000 test is FALSE,
no GY85 call. The window first opens at t > 10000 (ie i = 1001+). */
assert(errors == 0);
assert(gy85_call_count == 0);
/* Push past the threshold once. */
errors += imu_watchdog_postfix(10001);
assert(errors == 0); /* IMU OK -> no error */
assert(gy85_call_count == 1); /* one update */
printf("PASS (gy85 calls=%d, errors=%d)\n", gy85_call_count, errors);
/* ----------------------------------------------------------------
* T2: First failure path commits the rate-limit window.
* Set GY85_Update to fail. Walk the simulated tick across one window
* boundary; expect exactly 1 error AND last_imu_check_postfix updated.
* ---------------------------------------------------------------- */
printf(" T2 single failure commits window... ");
reset_state();
gy85_returns_ok = 0;
/* Cross threshold once. */
int e = imu_watchdog_postfix(10001);
assert(e == 1);
assert(last_imu_check_postfix == 10001u); /* ts updated despite error */
assert(gy85_call_count == 1);
/* Immediately ask again — must NOT re-fire (rate-limit holds). */
e = imu_watchdog_postfix(10002);
assert(e == 0);
assert(gy85_call_count == 1);
printf("PASS\n");
/* ----------------------------------------------------------------
* T3: Continuous failure across many main-loop iterations does NOT
* exceed 1 error per 10 s window.
* Drive 60 s of 10 ms-spaced calls with GY85 always failing.
* Expect: ceil(60 / 10) = 6 errors max (one per crossed window).
* ---------------------------------------------------------------- */
printf(" T3 flapping IMU — errors capped at 1 per 10 s window over 60 s... ");
reset_state();
gy85_returns_ok = 0;
int total_errors = 0;
int total_calls = 0;
for (int i = 0; i <= 6000; i++) { /* 0..60000 ms, step 10 ms */
if (imu_watchdog_postfix(i * 10)) total_errors++;
total_calls++;
}
/* Threshold is strict > 10000, so windows fire at t=10001, 20001, ...
Across [0, 60000] inclusive: t exceeds 10000 starting i=1001 -> first
fire; ts becomes 10010 (not 10001 since we stepped by 10 ms). Next
window opens at ts + 10000 -> t > 20010 -> first fire at 20020.
Continuing pattern: 30030, 40040, 50050, 60060 (out of range).
Across [0, 60000]: exactly 5 fires. */
assert(total_calls == 6001);
assert(total_errors == 5);
/* Critical invariant: errors >>>> main-loop iterations means broken. */
assert(total_errors < total_calls);
/* MCU-N1 latch (error_count > 10) requires ~110 s of post-fix flapping;
under pre-fix it would have tripped within ~10 main-loop iterations. */
printf("PASS (calls=%d errors=%d)\n", total_calls, total_errors);
/* ----------------------------------------------------------------
* T4: Counter-test pre-fix predicate fires error every main-loop
* iteration past the first 10 s, demonstrating the bug we just fixed.
* ---------------------------------------------------------------- */
printf(" T4 counter-test: pre-fix predicate fires every loop... ");
reset_state();
gy85_returns_ok = 0;
int prefix_errors = 0;
int prefix_calls = 0;
/* Prime: cross the threshold (any t > 10000), then fail thereafter. */
for (int i = 1001; i <= 1100; i++) { /* 100 main-loop iterations */
if (imu_watchdog_prefix(i * 10)) prefix_errors++;
prefix_calls++;
}
/* Pre-fix bug: each iteration past t=10000 returns an error, because
last_imu_check_prefix never updates on the failure path. */
assert(prefix_calls == 100);
assert(prefix_errors == 100); /* ALL fail = bug demo */
printf("PASS (prefix would have %d errors in %d iterations)\n",
prefix_errors, prefix_calls);
/* ----------------------------------------------------------------
* T5: Recovery when GY85 starts passing again, errors stop and the
* window cadence resumes normally.
* ---------------------------------------------------------------- */
printf(" T5 recovery — IMU comes back, errors stop... ");
reset_state();
gy85_returns_ok = 0;
/* One failure across first window. */
assert(imu_watchdog_postfix(10001) == 1);
/* Now IMU recovers. Cross next window boundary. */
gy85_returns_ok = 1;
assert(imu_watchdog_postfix(20002) == 0); /* window crossed; check OK */
assert(imu_watchdog_postfix(20003) == 0); /* same window; not re-tested */
/* Cross another window boundary while healthy. */
assert(imu_watchdog_postfix(30004) == 0);
printf("PASS\n");
/* ----------------------------------------------------------------
* T6: HAL_GetTick() 32-bit wrap. Same modulo-arithmetic guarantee as
* test_gap3_health_watchdog_cold_start.c T8.
* ---------------------------------------------------------------- */
printf(" T6 HAL_GetTick wrap (0xFFFFFF00 -> 0x00000064)... ");
reset_state();
gy85_returns_ok = 1;
/* Seed: prime the watchdog at t=0xFFFFFF00 by crossing a window from 0. */
last_imu_check_postfix = 0xFFFFFF00u;
/* Now ask at 0x00000064 (just after wrap). True elapsed = 0x164 = 356 ms,
which is BELOW the 10 s threshold, so the watchdog should NOT trigger
a GY85 call. */
int err = imu_watchdog_postfix(0x00000064u);
assert(err == 0);
assert(gy85_call_count == 0); /* no GY85 call */
/* Now jump >10 s past the wrap: 0x00000064 + 10001 = 0x00002775. */
err = imu_watchdog_postfix(0x00002775u);
assert(err == 0); /* IMU OK */
assert(gy85_call_count == 1); /* one GY85 call */
printf("PASS\n");
printf("\n=== AUDIT-CAL follow-up: ALL TESTS PASSED ===\n\n");
return 0;
}